from __future__ import annotations

import re
import unicodedata
from typing import List

# A small punctuation set that commonly appears in Greek texts:
# middle dot / ano teleia, semicolon / Greek question mark, dashes,
# ellipsis and assorted quotation marks.
_EXTRA_PUNCT = "··;;—–…«»‹›“”‘’"

# Tabs, newlines and the Greek-specific punctuation all become plain spaces
# in a single translate() pass.
_TO_SPACE_TABLE = str.maketrans({ch: " " for ch in "\n\t" + _EXTRA_PUNCT})

# Anything that is neither a word character nor whitespace.  Combining marks
# also match this pattern, so they are filtered back in by
# ``_blank_unless_combining`` below.
_NON_WORD_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def _blank_unless_combining(match: "re.Match[str]") -> str:
    """Return the matched char if it is a combining mark, else a space."""
    ch = match.group()
    return ch if unicodedata.combining(ch) else " "


def strip_greek_diacritics(text: str) -> str:
    """Strip diacritics while preserving iota subscript as an explicit iota.

    The text is decomposed (NFD) so every diacritic becomes a separate
    combining character, then:

    - combining GREEK YPOGEGRAMMENI (U+0345) is replaced by a full 'ι';
    - every other combining mark (accents, breathings, etc.) is dropped.

    The result is recomposed to NFC before being returned.
    """
    pieces: List[str] = [
        "ι" if ch == "\u0345" else ch
        for ch in unicodedata.normalize("NFD", text)
        # U+0345 is itself a combining mark, so it must be let through
        # explicitly before the generic "drop combining marks" filter.
        if ch == "\u0345" or not unicodedata.combining(ch)
    ]
    return unicodedata.normalize("NFC", "".join(pieces))


def sigma_normalize(token: str) -> str:
    """Replace final sigma 'ς' with medial 'σ' so both forms match."""
    return token.replace("ς", "σ")


def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
    """Normalize input Greek text.

    Steps, in order:

    - lowercase, then compose to NFC;
    - replace punctuation/symbols (and tabs/newlines) with spaces;
    - collapse whitespace runs to a single space and trim the ends;
    - optionally strip diacritics via ``strip_greek_diacritics``.

    Diacritics are kept by default so feature extraction can detect
    iota-subscript endings like -ᾳ.

    Args:
        text: Raw input text, in any Unicode normalization form.
        strip_diacritics: When true, remove accents/breathings and turn
            iota subscript into an explicit 'ι'.

    Returns:
        The normalized, single-space-separated text in NFC form.
    """
    # Compose first: in decomposed (NFD) input every accent is a separate
    # combining character, which ``\w`` does not match.  Without this step
    # the punctuation pass would turn accents into spaces and split words.
    lowered = unicodedata.normalize("NFC", text.lower())
    cleaned = lowered.translate(_TO_SPACE_TABLE)
    # Remove remaining punctuation/symbols while keeping word chars, spaces
    # and any combining marks that have no precomposed form.
    cleaned = _NON_WORD_RE.sub(_blank_unless_combining, cleaned)
    cleaned = _WHITESPACE_RUN_RE.sub(" ", cleaned).strip()
    if strip_diacritics:
        cleaned = strip_greek_diacritics(cleaned)
    return cleaned