notmax123 committed on
Commit
9eebcc5
·
1 Parent(s): 0ce399f

Expand DD/MM/YYYY-style dates before generic number expansion

Browse files
Files changed (1) hide show
  1. app.py +18 -0
app.py CHANGED
@@ -117,6 +117,7 @@ _MIXED_EN_SEGMENT_RE = re.compile(
117
  r"|\d+[A-Za-z]+"
118
  r"|[A-Za-z]+(?:[.'’\-][A-Za-z0-9]+)*"
119
  )
 
120
  _PERCENT_WORDS = {
121
  "he": "אחוז",
122
  "en": "percent",
@@ -804,6 +805,22 @@ def expand_ratios(text: str, lang: str = "en") -> str:
804
  return re.sub(r"(?<!\d)(\d+)\s*:\s*(\d+)(?!\d)", rf"\1 {word} \2", text)
805
 
806
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
  def normalize_common_text(text: str) -> str:
808
  text = strip_hebrew_nikud(text)
809
  text = re.sub(
@@ -819,6 +836,7 @@ def prepare_text_for_synthesis(text: str, lang: str) -> str:
819
  text = normalize_common_text(text)
820
  text = strip_hebrew_abbreviation_quotes(text, lang)
821
  text = expand_hebrew_lamed_before_latin(text, lang)
 
822
  text = expand_percent_symbols(text, lang=lang)
823
  text = expand_ratios(text, lang=lang)
824
  text = expand_numbers(text, lang=lang)
 
117
  r"|\d+[A-Za-z]+"
118
  r"|[A-Za-z]+(?:[.'’\-][A-Za-z0-9]+)*"
119
  )
120
+ _DATE_RE = re.compile(r"(?<!\d)([0-3]?\d)[/.]([01]?\d)[/.](\d{2}|\d{4})(?!\d)")
121
  _PERCENT_WORDS = {
122
  "he": "אחוז",
123
  "en": "percent",
 
805
  return re.sub(r"(?<!\d)(\d+)\s*:\s*(\d+)(?!\d)", rf"\1 {word} \2", text)
806
 
807
 
808
def expand_dates(text: str, lang: str = "en") -> str:
    """Normalize numeric day/month/year dates before generic number expansion.

    Each ``_DATE_RE`` match is rewritten as ``"<day> <month> <year>"`` with
    leading zeros dropped. Two-digit years are widened around a pivot of 70:
    ``00``-``69`` become 2000-2069 and ``70``-``99`` become 1970-1999.

    A match is left untouched when:

    * the day is outside 1-31 or the month is outside 1-12, or
    * the two separators differ (e.g. ``1/2.50``), because such text is far
      more likely to be a fraction or a decimal than a date.

    Args:
        text: Input text to scan for numeric dates.
        lang: Language code. Accepted so the signature matches the other
            ``expand_*`` helpers; it is not used by this function.

    Returns:
        ``text`` with every recognised date replaced by its normalized form.
    """

    def repl(m: re.Match[str]) -> str:
        # The pattern is <day><sep><month><sep><year> with single-character
        # separators, so each separator sits right after groups 1 and 2.
        if m.string[m.end(1)] != m.string[m.end(2)]:
            return m.group(0)

        day = int(m.group(1))
        month = int(m.group(2))
        if not (1 <= day <= 31 and 1 <= month <= 12):
            return m.group(0)

        raw_year = m.group(3)
        year = int(raw_year)
        if len(raw_year) == 2:
            # Two-digit year: pivot at 70 to choose the century.
            year += 2000 if year < 70 else 1900
        return f"{day} {month} {year}"

    return _DATE_RE.sub(repl, text)
822
+
823
+
824
  def normalize_common_text(text: str) -> str:
825
  text = strip_hebrew_nikud(text)
826
  text = re.sub(
 
836
  text = normalize_common_text(text)
837
  text = strip_hebrew_abbreviation_quotes(text, lang)
838
  text = expand_hebrew_lamed_before_latin(text, lang)
839
+ text = expand_dates(text, lang=lang)
840
  text = expand_percent_symbols(text, lang=lang)
841
  text = expand_ratios(text, lang=lang)
842
  text = expand_numbers(text, lang=lang)