DialectAnalysis / dialect_analysis /normalization.py
thomascerniglia's picture
Upload 8 files
d0326ea verified
from __future__ import annotations
import re
import unicodedata
from typing import List
# Extra punctuation that commonly appears in Greek texts. `normalize_text`
# maps every character in this string to a space before its generic
# punctuation pass. Per the original note, the set covers the ano teleia, the
# Greek question mark, dashes, and quotation marks.
# NOTE(review): the doubled "·" and ";" presumably are distinct code points
# that merely render alike (e.g. U+00B7 vs U+0387, U+003B vs U+037E) -- verify
# the actual code points before editing this literal.
_EXTRA_PUNCT = "··;;—–…«»‹›“”‘’"
def strip_greek_diacritics(text: str) -> str:
    """Return *text* without diacritics, spelling iota subscript as a full iota.

    The text is canonically decomposed (NFD) so every diacritic becomes its
    own combining code point. Then:

    - COMBINING GREEK YPOGEGRAMMENI (U+0345) is replaced by the letter 'ι';
    - every other combining mark (accents, breathings, etc.) is dropped;
    - all remaining characters are kept unchanged.

    The result is recomposed to NFC before being returned.
    """
    iota_subscript = "\u0345"
    kept = [
        "ι" if ch == iota_subscript else ch
        for ch in unicodedata.normalize("NFD", text)
        # U+0345 is itself a combining mark, so it must be let through
        # explicitly before the generic "drop combining marks" filter.
        if ch == iota_subscript or not unicodedata.combining(ch)
    ]
    return unicodedata.normalize("NFC", "".join(kept))
def sigma_normalize(token: str) -> str:
    """Return *token* with every final sigma (ς) rewritten as medial sigma (σ).

    Folding the two lowercase sigma forms together lets tokens be matched
    regardless of which form they were written with.
    """
    final_to_medial = str.maketrans("ς", "σ")
    return token.translate(final_to_medial)
def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
    """Normalize input Greek text.

    Steps, in order:

    - Compose to Unicode NFC, so accents stored as separate combining marks
      are handled the same way as precomposed letters.
    - Lowercase.
    - Replace punctuation and symbols with spaces (combining marks are kept).
    - Collapse each run of whitespace to a single space and trim the ends.
    - Optionally strip diacritics.

    Keep diacritics by default so feature extraction can detect iota-subscript
    endings like -ᾳ.

    Args:
        text: Raw input text, in any Unicode normalization form.
        strip_diacritics: When true, the cleaned text is passed through
            ``strip_greek_diacritics`` before being returned.

    Returns:
        The normalized text, with words separated by single spaces.
    """
    # `\w` does not match combining marks. Without composing first, decomposed
    # (NFD) input would have every accent, breathing and iota subscript
    # replaced by a space below, splitting words apart.
    lowered = unicodedata.normalize("NFC", text).lower()

    # Known Greek punctuation acts as a word separator.
    cleaned = lowered.translate(str.maketrans({ch: " " for ch in _EXTRA_PUNCT}))

    # Replace remaining punctuation/symbols with spaces. Word characters and
    # whitespace never match the pattern; combining marks that NFC could not
    # fold into a precomposed letter do match, so they are passed through
    # unchanged instead of being treated as punctuation.
    cleaned = re.sub(
        r"[^\w\s]",
        lambda m: m.group() if unicodedata.combining(m.group()) else " ",
        cleaned,
    )

    # Collapsing whitespace also takes care of tabs and newlines.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    if strip_diacritics:
        cleaned = strip_greek_diacritics(cleaned)
    return cleaned