Source code for shillelagh.adapters.api.gsheets.parsing.number

"""
Parse and format Google Sheet number formats.

https://developers.google.com/sheets/api/guides/formats#number_format_tokens
"""

# pylint: disable=c-extension-no-member, broad-exception-raised
import math
import operator
import re
from collections.abc import Iterator
from itertools import zip_longest
from typing import Any, Union, cast

from shillelagh.adapters.api.gsheets.parsing.base import (
    LITERAL,
    InvalidValue,
    Token,
    tokenize,
)



[docs]
def adjust_value(value: Union[int, float], tokens: list[Token]) -> Union[int, float]:
    """
    Adjust value applying percent, scientific notation, and comma multiplier.
    """
    for token in tokens:
        if token.__class__.__name__ == "PERCENT":
            value *= 100
        elif token.__class__.__name__ == "COMMA":
            value /= 1000 ** len(token.token)
        elif token.__class__.__name__ == "E":
            while value > 10:
                value /= 10
            while value < 1:
                value *= 10

    return value




[docs]
class DIGITS(Token):
    """
    One or more digits in the number.

    Each token renders digits differently:

    - 0: if the digit is an insignificant 0, it is rendered as 0.
    - #: if the digit is an insignificant 0, it is not rendered.
    - ?: if the digit is an insignificant 0, it is rendered as a space.

    If there's a comma between tokens it renders the entire number with grouping
    separators (grouping by the thousands)
    """

    regex = r"(0|#|\?)+(,(0|#|\?){3})?"


[docs]
    def format(  # pylint: disable=too-many-branches
        self,
        value: Union[int, float],
        tokens: list[Token],
    ) -> str:
        value = adjust_value(value, tokens)
        number = str(value)

        formatted: list[str] = []
        if is_fractional(self, tokens):
            number = number.split(".")[1] if "." in number else "0"
            for token, digit in zip_longest(self.token, number):
                if token is None:
                    break
                if token == "0":
                    formatted.append(digit or "0")
                elif token == "#":
                    formatted.append(digit or "")
                elif token == "?":
                    formatted.append(digit or " ")
                else:
                    raise Exception(f"Invalid token: {token}")
            return "".join(formatted)

        number = number.split(".", maxsplit=1)[0]
        has_comma = "," in self.token
        token = self.token.replace(",", "")

        for i, (token, digit) in enumerate(zip_longest(token[::-1], number[::-1])):
            if token is None:
                formatted.append(digit)
            elif token == "0":
                formatted.append(digit or "0")
            elif token == "#":
                formatted.append(digit or "")
            elif token == "?":
                formatted.append(digit or " ")
            else:
                raise Exception(f"Invalid token: {token}")

            # group by thousands
            if has_comma and (i + 1) % 3 == 0:
                formatted.append(",")

        return "".join(formatted[::-1]).strip(",")



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        match = re.match(r"\s*(\+|-)?\d+(,\d+)*", value)
        if not match:
            raise InvalidValue(value)

        digits = match.group().replace(",", "")
        if is_fractional(self, tokens):
            return (
                {
                    "operation": lambda number: math.copysign(
                        abs(number) + get_fraction(digits),
                        number,
                    ),
                },
                value[len(match.group()) :],
            )

        return (
            {"operation": lambda number: number + int(digits)},
            value[len(match.group()) :],
        )





[docs]
class PERIOD(Token):
    """
    The decimal point in the number.

    The first period represents the decimal point in the number. Subsequent
    periods are rendered as literals. If you include a decimal point in the
    format, it will always be rendered, even for whole numbers.
    """

    regex = r"\."


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        return "."



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        match = re.match(r"\.", value)
        if not match:
            raise InvalidValue(value)

        return {}, value[1:]





[docs]
class PERCENT(Token):
    """
    Literal percent sign.

    Appears as a literal but also causes existing numbers to be multiplied
    by 100 before being rendered, in order to make percentages more readable.
    """

    regex = "%"


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        return "%"



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        match = re.match("%", value)
        if not match:
            raise InvalidValue(value)

        return {"operation": lambda number: number / 100.0}, value[1:]





[docs]
class COMMA(Token):
    """
    Multiplier.

    When appearing after the digit characters the comma scales the digits by one thousand
    per comma.
    """

    regex = ",+"


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        return ""



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        size = len(self.token)
        return {"operation": lambda number: number * 1000**size}, value





[docs]
class E(Token):  # pylint: disable=invalid-name
    """
    Scientific format.

    Renders the number in scientific format, with the formatting to the left of the E
    used for the non-exponent portion and the formatting to the right of the E used for
    the exponent portion. E+ will show a + sign for positive exponents. E- will only
    show a sign for negative exponents. If lowercase is used, the output e is
    lowercased as well.
    """

    regex = r"(E|e)(-|\+)((0|#|\?)+)"


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        exponent = 0

        if value >= 1:
            sign = "+" if "+" in self.token else ""
            while value > 10:
                value /= 10
                exponent += 1
        else:
            sign = "-"
            while value < 1:
                value *= 10
                exponent += 1

        match = re.match(self.regex, self.token)
        if not match:
            # should never happen
            raise Exception("You are likely to be eaten by a grue.")
        cased_e = match.group(1)

        # process the exponent according to the pattern
        pattern = match.group(3)
        formatted_exponent = format_number_pattern(exponent, pattern)

        return f"{cased_e}{sign}{formatted_exponent}"



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        match = re.match(r"(E|e)((\+|-)?\d+)", value)
        if not match:
            raise InvalidValue(value)

        exponent = int(match.group(2))
        size = len(match.group())

        return {"operation": lambda number: number * 10**exponent}, value[size:]





[docs]
class FRACTION(Token):
    """
    Fractional format.

    If it appears between two digit characters (0, # or ?), then it treats those digit
    groups as a fractional format. For example, the number format 0 #/# renders the
    number 23.25 as 23 1/4. The denominator can also be a literal integer, in which case
    it will enforce that integer as the denominator. The number format 0 #/8 displays
    the number 23.25 as 23 2/8. The fraction part is not rendered at all if the
    numerator would become 0. The number 23.1 with the number format 0 #/3 renders as
    just 23 (because the 0.1 rounded to 0/3). / is not compatible with scientific format
    or a format with a decimal point in it.
    """

    regex = r"(?:((?:0|#|\?)+))/(?:((?:0|#|\?)+)|(\d+))"


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        fractional_part = float(value - math.floor(value))
        numerator: Union[int, float]
        numerator, denominator = fractional_part.as_integer_ratio()

        # force denominator?
        match = re.match(self.regex, self.token)
        if not match:
            # should never happen
            raise Exception("You are likely to be eaten by a grue.")
        groups = match.groups()
        if groups[2] is not None:
            numerator *= int(groups[2]) / denominator
            formatted_denominator = groups[2]
        else:
            # the denominator needs to be formatted as a fraction, with spaces and zeros
            # right padded
            pattern = "." + groups[1]
            number = get_fraction(str(denominator))
            formatted_denominator = format_number_pattern(number, pattern).lstrip(".")

        if math.floor(numerator) == 0:
            return ""

        formatted_numerator = format_number_pattern(numerator, groups[0])

        return f"{formatted_numerator}/{formatted_denominator}"



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        match = re.match(r"\s*(\d+)/(\d+)", value)
        if not match:
            # fractions with numerator 0 are not shown
            return {}, value

        numerator = int(match.group(1))
        denominator = int(match.group(2))
        size = len(match.group())

        return {"operation": lambda number: number + (numerator / denominator)}, value[
            size:
        ]





[docs]
class STAR(Token):
    """
    This is included for compatibility with Excel number formats. It is currently ignored.
    """

    regex = r"\*"


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        return ""



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        return {}, value





[docs]
class UNDERSCORE(Token):
    """
    Skips the next character and renders a space.

    This is used to line up number formats where the negative value is surrounded
    by parenthesis.
    """

    regex = "_."


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        return " "



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        if value[:1] != " ":
            raise InvalidValue(value[:1])

        return {}, value[1:]





[docs]
class AT(Token):
    """
    Inserts the raw text for the cell, if the cell has text input.

    Not compatible with any of the other special characters and won’t display for
    numeric values (which are displayed as general format).
    """

    regex = "@"


[docs]
    def format(self, value: Union[int, float, str], tokens: list[Token]) -> str:
        return str(value)



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        i = tokens.index(self)

        if i == len(tokens) - 1:
            # last token, consume everything
            return {"operation": lambda number: value}, ""

        # LL(1) to see how much can be consumed
        sibling = tokens[i + 1]
        j = 0
        for j in range(len(value)):
            try:
                sibling.parse(value[j:], tokens)
                break
            except InvalidValue:
                continue
        else:
            raise InvalidValue(value)

        return {"operation": lambda number: value[:j]}, value[j:]





[docs]
class COLOR(Token):
    """
    Causes any value that is rendered by this sub-format to appear with the given text color.

    Valid values for Color are Black, Blue, Cyan, Green, Magenta, Red, White or Yellow.
    Valid values for the "#" in Color# are 0 - 56 (this color palette shows a list of the
    colors that correspond to each number). Number format colors will override any
    user-entered colors on the cell, but will not override colors set by conditional
    formatting.
    """

    regex = r"\[(Black|Blue|Cyan|Green|Magenta|Red|White|Yellow|Color\d{1,2})\]"


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        return ""



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        return {}, value





[docs]
class CONDITION(Token):
    """
    Replaces the default comparison section with another conditional expression.

    Replaces the default positive, negative, or zero comparison section with another
    conditional expression. For example, [<100]”Low”;[>1000]”High”;000 will render the
    word “Low” for values below 100, “High” for values above 1000 and a three digit
    number (with leading 0s) for anything in between. Conditions can only be applied to
    the first two sub-formats and if a number matches more than one, it will use the
    first one it matches. If there is a third format, it will be used for "everything
    else", otherwise if a number doesn’t match either format, it will be rendered as all
    "#"s filling up the cell width. The fourth format is always used for text, if it
    exists.
    """

    regex = r"\[(>|>=|<|<=|=)\d*(\.\d*)?\]"


[docs]
    def format(self, value: Union[int, float], tokens: list[Token]) -> str:
        return ""



[docs]
    def parse(self, value: str, tokens: list[Token]) -> tuple[dict[str, Any], str]:
        return {}, value





[docs]
def get_fraction(number: str) -> float:
    """
    Return the fraction associated with a fractional part.

        >>> get_fraction("9")
        0.9
        >>> get_fraction("123")
        0.123
        >>> get_fraction("1")
        0.1
        >>> get_fraction("001")
        0.001

    """
    if int(number) < 0:
        raise Exception("Number should be a positive integer")

    return cast(float, int(number) / (10 ** len(number)))




[docs]
def parse_number_pattern(value: str, pattern: str) -> float:
    """
    Parse a value using a given pattern.
    """
    formats = pattern.split(";")

    i = -1
    for i, format_ in enumerate(formats):
        try:
            number = parse_number_format(value, format_)
            break
        except InvalidValue:
            # weird edge case
            if value.startswith("-"):
                try:
                    number = -parse_number_pattern(value[1:], format_)
                    break
                except InvalidValue:
                    pass
    else:
        try:
            return int(value)
        except ValueError:
            try:
                return float(value)
            except ValueError as ex:
                raise InvalidValue(
                    f"Unable to parse value {value} with pattern {pattern}",
                ) from ex

    # is negative?
    if i == 1 and (len(formats) > 2 or (len(formats) == 2 and "@" not in formats[1])):
        return -number

    return number




[docs]
def fix_periods(tokens: Iterator[Token]) -> Iterator[Token]:
    """
    Convert periods into literals after the first one.
    """
    seen_period = False
    for token in tokens:
        if token.__class__.__name__ == "PERIOD":
            yield LITERAL(".") if seen_period else token
            seen_period = True
        else:
            yield token




[docs]
def parse_number_format(value: str, format_: str) -> float:
    """
    Parse a value using a given format pattern.
    """
    classes = [
        FRACTION,  # should come first
        DIGITS,
        COMMA,
        PERIOD,
        PERCENT,
        E,
        STAR,
        UNDERSCORE,
        AT,
        COLOR,
        CONDITION,
        LITERAL,
    ]

    number = 0

    tokens = list(fix_periods(tokenize(format_, classes)))
    for token in tokens:
        consumed, value = token.parse(value, tokens)
        if "operation" in consumed:
            number = consumed["operation"](number)

    return number




[docs]
def has_condition(pattern: str) -> bool:
    """
    Return true if the pattern has condition instructions.
    """
    return bool(re.match(r"\[(>|>=|<|<=|=)\d*(\.\d*)?\]", pattern))




[docs]
def condition_matches(value: Union[int, float], format_: str) -> bool:
    """
    Return true if the value matches the condition in the pattern.
    """
    match = re.search(r"\[(>|>=|<|<=|=)(\d*(?:\.\d*)?)\]", format_)
    if not match:
        return True

    operators = {
        ">": operator.gt,
        ">=": operator.ge,
        "<": operator.lt,
        "<=": operator.le,
        "=": operator.eq,
    }
    comparison, threshold = match.groups()
    op = operators[comparison]  # pylint: disable=invalid-name
    return bool(op(float(value), float(threshold)))




[docs]
def format_number_pattern(  # pylint: disable=too-many-branches
    value: Union[int, float, str],
    pattern: str,
) -> str:
    """
    Format a number to a given pattern.
    """
    formats = pattern.split(";")

    if not any(format_ for format_ in formats):
        raise Exception("Empty pattern!")

    if isinstance(value, str):
        for format_ in formats:
            if "@" in format_:
                break
        else:
            raise Exception("No text format found for string value")

    elif has_condition(pattern):
        for format_ in formats:
            if condition_matches(value, format_):
                break
        else:
            format_ = '"########"'

    else:
        # remove any text format
        formats = [format_ for format_ in formats if "@" not in format_]

        if value > 0:
            format_ = formats[0]
        elif value == 0:
            format_ = formats[2] if len(formats) == 3 else formats[0]
        else:
            if len(formats) >= 2:
                format_ = formats[1]
                value *= -1
            else:
                format_ = formats[0]

    classes = [
        FRACTION,  # should come first
        DIGITS,
        COMMA,
        PERIOD,
        PERCENT,
        E,
        STAR,
        UNDERSCORE,
        AT,
        COLOR,
        CONDITION,
        LITERAL,
    ]

    parts = []

    tokens = list(fix_periods(tokenize(format_, classes)))
    for token in tokens:
        parts.append(token.format(value, tokens))

    return "".join(parts)




[docs]
def is_fractional(token: Token, tokens: list[Token]) -> bool:
    """
    Return true if the token is after the period.
    """
    seen_self = False
    for sibling in tokens:
        if sibling is token:
            seen_self = True
        elif isinstance(sibling, PERIOD) and not seen_self:
            return True
    return False
Source code for shillelagh.adapters.api.gsheets.parsing.number

Shillelagh

Navigation

Related Topics