"""
Parse and format Google Sheet date/time patterns.
https://developers.google.com/sheets/api/guides/formats?hl=en#date_and_time_format_patterns
"""
# pylint: disable=invalid-name, fixme, broad-exception-raised
import calendar
import re
from collections import defaultdict
from datetime import date, datetime, time, timedelta
from enum import Enum
from typing import Any, Dict, List, Tuple, Type, TypeVar, Union
from shillelagh.adapters.api.gsheets.parsing.base import LITERAL, Token, tokenize
DateTime = TypeVar("DateTime", datetime, date, time, timedelta)
[docs]
class Meridiem(Enum):
"""
Represent ante or post meridiem.
"""
AM = "AM"
PM = "PM"
[docs]
class H(Token):
"""
Hour of the day.
Switches between 12 and 24 hour format depending on whether an am/pm
indicator is present in the string.
"""
regex = "h(?!h)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
match = re.match(r"\d{1,2}", value)
if not match:
raise Exception(f"Cannot parse value: {value}")
size = len(match.group()) if 0 <= int(match.group()) < 24 else 1
return {"hour": int(value[:size])}, value[size:]
[docs]
class HHPlus(H):
"""
Same as previous, but with a leading 0 for 1-9.
"""
regex = "hh+"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
return {"hour": int(value[:2])}, value[2:]
[docs]
class M(Token):
"""
If the previous non-literal token was hours or the subsequent one is
seconds, then it represents minutes in the hour (no leading 0).
Otherwise, it represents the month of the year as a number (no leading 0).
"""
regex = "m(?!m)"
def _is_minute(self, tokens: List[Token]) -> bool:
"""
Return true if the token represents minutes, false if months.
"""
is_minute = False
i = -1
for i, token in enumerate(tokens):
if token is self:
break
else:
raise Exception("Token is not present in list of tokens")
for token in reversed(tokens[:i]):
if token.__class__.__name__ == "LITERAL":
continue
if token.__class__.__name__ in {"H", "HHPlus"}:
is_minute = True
break
for token in tokens[i + 1 :]:
if token.__class__.__name__ == "LITERAL":
continue
if token.__class__.__name__ in {"S", "SS"}:
is_minute = True
break
return is_minute
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
match = re.match(r"\d{1,2}", value)
if not match:
raise Exception(f"Cannot parse value: {value}")
size = len(match.group()) if 1 <= int(match.group()) <= 24 else 1
if self._is_minute(tokens):
return {"minute": int(value[:size])}, value[size:]
return {"month": int(value[:size])}, value[size:]
[docs]
class MM(M):
"""
As above, but with a leading 0 for both cases.
"""
regex = "mm(?!m)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
if self._is_minute(tokens):
return {"minute": int(value[:2])}, value[2:]
return {"month": int(value[:2])}, value[2:]
[docs]
class MMM(Token):
"""
Three letter month abbreviation (e.g., "Feb").
"""
regex = "mmm(?!m)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
month = datetime.strptime(value[:3], "%b").month
return {"month": month}, value[3:]
[docs]
class MMMM(MMM):
"""
Full month name. mmmmmm+ also matches this.
"""
regex = "(mmmm(?!m))|(m{6,})"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
token = re.split(r"\b", value, 2)[1]
size = len(token)
month = datetime.strptime(value[:size], "%B").month
return {"month": month}, value[size:]
[docs]
class MMMMM(MMM):
"""
First letter of the month (e.g., "J" for June).
"""
regex = "mmmmm"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
letter = value[0]
mapping = defaultdict(list)
for i in range(1, 13):
mapping[calendar.month_name[i][0]].append(i)
if len(mapping[letter]) == 0:
raise Exception(f"Unable to find month letter: {letter}")
if len(mapping[letter]) > 1:
raise Exception(f"Unable to parse month letter unambiguously: {letter}")
return {"month": mapping[letter][0]}, value[1:]
[docs]
class S(Token):
"""
Seconds in the minute without a leading 0.
"""
regex = "s(?!s)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
match = re.match(r"\d{1,2}", value)
if not match:
raise Exception(f"Cannot parse value: {value}")
# leap seconds can be 60 or even 61
size = len(match.group()) if 0 <= int(match.group()) <= 61 else 1
return {"second": int(value[:size])}, value[size:]
[docs]
class SS(S):
"""
Seconds in the minute with a leading 0.
"""
regex = "ss"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
return {"second": int(value[:2])}, value[2:]
[docs]
class DurationToken(Token): # pylint: disable=abstract-method
"""
A token for durations.
Durations are special because often only the first token is annotated. For example:
- [h]:mm:ss
- [ss].000
But apparently it is valid to annotate subsequent tokens:
- [hh]:[mm]:[ss].000
Who knows?
Because of this, their regexes are dynamic, and depend on the token history.
"""
is_duration = True
regexes: Tuple[str, str]
[docs]
@classmethod
def match(
cls,
pattern: str,
history: List[Token],
) -> bool:
if any(isinstance(token, DurationToken) for token in history):
regex = cls.regexes[1]
else:
regex = cls.regexes[0]
return bool(re.match(regex, pattern))
[docs]
@classmethod
def consume(
cls,
pattern: str,
history: List[Token],
) -> Tuple[Token, str]:
if any(isinstance(token, DurationToken) for token in history):
regex = cls.regexes[1]
else:
regex = cls.regexes[0]
match = re.match(regex, pattern)
if not match:
# pylint: disable=broad-exception-raised
raise Exception("Token could not find match")
token = match.group()
return cls(token), pattern[len(token) :]
[docs]
class HPlusDuration(DurationToken):
"""
Number of elapsed hours in a time duration. Number of letters indicates
minimum number of digits (adds leading 0s).
"""
regexes = (r"\[h+\]", r"(h+)|(\[h+\])")
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
match = re.match(r"\d+", value)
if not match:
raise Exception(f"Cannot parse value: {value}")
size = len(match.group())
return {"hours": int(value[:size])}, value[size:]
[docs]
class MPlusDuration(DurationToken):
"""
Number of elapsed minutes in a time duration. Number of letters indicates
minimum number of digits (adds leading 0s).
"""
regexes = (r"\[m+\]", r"(m+)|(\[m+\])")
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
match = re.match(r"\d+", value)
if not match:
raise Exception(f"Cannot parse value: {value}")
size = len(match.group())
return {"minutes": int(value[:size])}, value[size:]
[docs]
class SPlusDuration(DurationToken):
"""
Number of elapsed seconds in a time duration. Number of letters indicates
minimum number of digits (adds leading 0s).
"""
regexes = (r"\[s+\]", r"(s+)|(\[s+\])")
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
match = re.match(r"\d+", value)
if not match:
raise Exception(f"Cannot parse value: {value}")
size = len(match.group())
return {"seconds": int(value[:size])}, value[size:]
[docs]
class D(Token):
"""
Day of the month, no leading 0 for numbers less than 10.
"""
regex = "d(?!d)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
match = re.match(r"\d{1,2}", value)
if not match:
raise Exception(f"Cannot parse value: {value}")
size = len(match.group()) if 1 <= int(match.group()) <= 31 else 1
return {"day": int(value[:size])}, value[size:]
[docs]
class DD(D):
"""
Day of the month, with a leading 0 for numbers less than 10.
"""
regex = "dd(?!d)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
return {"day": int(value[:2])}, value[2:]
[docs]
class DDD(D):
"""
Day of the week, three letter abbreviation (e.g., "Mon").
"""
regex = "ddd(?!d)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
return {"weekday": datetime.strptime(value[:3], "%a").weekday()}, value[3:]
[docs]
class DDDDPlus(D):
"""
Day of the week, full name.
"""
regex = "d{4,}"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
token = re.split(r"\b", value, 2)[1]
size = len(token)
return {"weekday": datetime.strptime(value[:size], "%A").weekday()}, value[
size:
]
[docs]
class YY(Token):
"""
2-digit year.
"""
regex = "y{1,2}(?!y)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
# assume 00 == 2000
year = int(value[:2]) + 2000
return {"year": year}, value[2:]
[docs]
class YYYY(Token):
"""
4-digit year.
"""
regex = "y{3,}"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
return {"year": int(value[:4])}, value[4:]
[docs]
class ZERO(Token):
"""
Tenths of seconds. You can increase the precision to 2 digits
with 00 or 3 digits (milliseconds) with 000.
"""
regex = "0{1,3}(?!0)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
size = len(self.token)
# adjust precision
token = value[:size]
token += "0" * (6 - size)
microsecond = int(token)
return {"microsecond": microsecond}, value[size:]
[docs]
class AP(Token):
"""
Displays "a" for AM, and "p" for PM. Also changes hours to 12-hour
format. If the token letter is capitalized, the output is as well.
"""
regex = "(a/p)|(A/P)"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
letter = value[:1]
meridiem = Meridiem.PM if letter.upper() == "P" else Meridiem.AM
return {"meridiem": meridiem}, value[1:]
[docs]
class AMPM(AP):
"""
As above, but displays "AM" or "PM" instead and is always capitalized.
"""
regex = "am/pm"
[docs]
def parse(self, value: str, tokens: List[Token]) -> Tuple[Dict[str, Any], str]:
letter = value[:2]
meridiem = Meridiem.PM if letter.upper() == "PM" else Meridiem.AM
return {"meridiem": meridiem}, value[2:]
[docs]
def infer_column_type(pattern: str) -> str:
"""
Infer the correct date-related type.
GSheets returns ``datetime`` as the type for timestamps, but also for time of day and
durations. We need to parse the pattern to figure out the exact type.
"""
classes = [
# durations should come first because they need to be modified
# after the first capture
HPlusDuration,
MPlusDuration,
SPlusDuration,
# then the rest
H,
HHPlus,
M,
MM,
MMM,
MMMM,
MMMMM,
S,
SS,
D,
DD,
DDD,
DDDDPlus,
YY,
YYYY,
AP,
AMPM,
ZERO,
LITERAL,
]
tokens = list(tokenize(pattern, classes))
if any(isinstance(token, DurationToken) for token in tokens):
return "duration"
datetime_tokens = (D, DD, DDD, DDDDPlus, YY, YYYY)
if any(isinstance(token, datetime_tokens) for token in tokens):
return "datetime"
return "timeofday"
[docs]
def parse_date_time_pattern(
value: str,
pattern: str,
class_: Type[DateTime],
) -> DateTime:
"""
Parse a value using a given pattern.
See https://developers.google.com/sheets/api/guides/formats?hl=en.
"""
classes = [
# durations should come first because they need to be modified
# after the first capture
HPlusDuration,
MPlusDuration,
SPlusDuration,
# then the rest
H,
HHPlus,
M,
MM,
MMM,
MMMM,
MMMMM,
S,
SS,
D,
DD,
DDD,
DDDDPlus,
YY,
YYYY,
AP,
AMPM,
ZERO,
LITERAL,
]
kwargs: Dict[str, Any] = {}
tokens = list(tokenize(pattern, classes))
for token in tokens:
consumed, value = token.parse(value, tokens)
kwargs.update(**consumed)
# add PM offset
if "hour" in kwargs:
meridiem = kwargs.pop("meridiem", None)
if meridiem == Meridiem.PM and kwargs["hour"] != 12:
kwargs["hour"] += 12
elif meridiem == Meridiem.AM and kwargs["hour"] == 12:
kwargs["hour"] -= 12
# we can't really do anything with ``weekday``
if "weekday" in kwargs:
del kwargs["weekday"]
if "microsecond" in kwargs and class_ is timedelta:
kwargs["microseconds"] = kwargs.pop("microsecond")
try:
return class_(**kwargs)
except TypeError as ex:
raise Exception("Unsupported format") from ex