Expand DD/MM/YYYY-style dates before generic number expansion
Browse files
app.py
CHANGED
|
@@ -117,6 +117,7 @@ _MIXED_EN_SEGMENT_RE = re.compile(
|
|
| 117 |
r"|\d+[A-Za-z]+"
|
| 118 |
r"|[A-Za-z]+(?:[.'’\-][A-Za-z0-9]+)*"
|
| 119 |
)
|
|
|
|
| 120 |
_PERCENT_WORDS = {
|
| 121 |
"he": "אחוז",
|
| 122 |
"en": "percent",
|
|
@@ -804,6 +805,22 @@ def expand_ratios(text: str, lang: str = "en") -> str:
|
|
| 804 |
return re.sub(r"(?<!\d)(\d+)\s*:\s*(\d+)(?!\d)", rf"\1 {word} \2", text)
|
| 805 |
|
| 806 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
def normalize_common_text(text: str) -> str:
|
| 808 |
text = strip_hebrew_nikud(text)
|
| 809 |
text = re.sub(
|
|
@@ -819,6 +836,7 @@ def prepare_text_for_synthesis(text: str, lang: str) -> str:
|
|
| 819 |
text = normalize_common_text(text)
|
| 820 |
text = strip_hebrew_abbreviation_quotes(text, lang)
|
| 821 |
text = expand_hebrew_lamed_before_latin(text, lang)
|
|
|
|
| 822 |
text = expand_percent_symbols(text, lang=lang)
|
| 823 |
text = expand_ratios(text, lang=lang)
|
| 824 |
text = expand_numbers(text, lang=lang)
|
|
|
|
| 117 |
r"|\d+[A-Za-z]+"
|
| 118 |
r"|[A-Za-z]+(?:[.'’\-][A-Za-z0-9]+)*"
|
| 119 |
)
|
| 120 |
+
_DATE_RE = re.compile(r"(?<!\d)([0-3]?\d)[/.]([01]?\d)[/.](\d{2}|\d{4})(?!\d)")
|
| 121 |
_PERCENT_WORDS = {
|
| 122 |
"he": "אחוז",
|
| 123 |
"en": "percent",
|
|
|
|
| 805 |
return re.sub(r"(?<!\d)(\d+)\s*:\s*(\d+)(?!\d)", rf"\1 {word} \2", text)
|
| 806 |
|
| 807 |
|
| 808 |
+
def expand_dates(text: str, lang: str = "en") -> str:
    """Normalize numeric day/month/year dates before generic number expansion.

    Every match of the module-level ``_DATE_RE`` is inspected and rewritten as
    ``"<day> <month> <year>"`` (plain integers, so leading zeros are dropped).
    A match is left untouched when:

    * the day is outside 1-31 or the month is outside 1-12, or
    * the two separators differ (e.g. ``1/2.50``), since a mixed ``/`` and
      ``.`` is far more likely a fraction next to a decimal than a date.

    Two-digit years are widened with a fixed pivot: 00-69 become 2000-2069
    and 70-99 become 1970-1999. Four-digit years are kept as written.

    Args:
        text: Input text that may contain numeric dates.
        lang: Language code. Not used by this function; it is accepted so the
            caller can pass ``lang=`` the same way it does to the other
            expansion helpers.

    Returns:
        ``text`` with every accepted date replaced by its spaced form.
    """

    def repl(m: re.Match[str]) -> str:
        day = int(m.group(1))
        month = int(m.group(2))
        raw_year = m.group(3)
        if not (1 <= day <= 31 and 1 <= month <= 12):
            return m.group(0)
        # In _DATE_RE each of the first two groups is followed directly by a
        # single separator character, so the characters at end(1) and end(2)
        # are the two separators. Require them to agree.
        if m.string[m.end(1)] != m.string[m.end(2)]:
            return m.group(0)
        year = int(raw_year)
        if len(raw_year) == 2:
            # Pivot at 70: "69" -> 2069, "70" -> 1970.
            year += 2000 if year < 70 else 1900
        return f"{day} {month} {year}"

    return _DATE_RE.sub(repl, text)
|
| 822 |
+
|
| 823 |
+
|
| 824 |
def normalize_common_text(text: str) -> str:
|
| 825 |
text = strip_hebrew_nikud(text)
|
| 826 |
text = re.sub(
|
|
|
|
| 836 |
text = normalize_common_text(text)
|
| 837 |
text = strip_hebrew_abbreviation_quotes(text, lang)
|
| 838 |
text = expand_hebrew_lamed_before_latin(text, lang)
|
| 839 |
+
text = expand_dates(text, lang=lang)
|
| 840 |
text = expand_percent_symbols(text, lang=lang)
|
| 841 |
text = expand_ratios(text, lang=lang)
|
| 842 |
text = expand_numbers(text, lang=lang)
|