Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import re | |
| import unicodedata | |
| from typing import List | |
# Punctuation that commonly appears in Greek texts. normalize_text() maps every
# character listed here to a space before its generic punctuation filter runs.
# NOTE(review): the two look-alike dots and the two look-alike semicolons are
# presumably distinct code points (generic vs. Greek-specific) -- confirm the
# literal survived copy/paste intact.
_EXTRA_PUNCT = "··;;—–…«»‹›“”‘’" # ano teleia, Greek question mark, dashes, quotes
def strip_greek_diacritics(text: str) -> str:
    """Return *text* without combining marks, spelling out iota subscripts.

    The input is first decomposed (NFD) so that each diacritic becomes its
    own combining code point. Every code point is then handled as follows:

    - COMBINING GREEK YPOGEGRAMMENI (U+0345) is replaced by a plain 'ι'.
    - Any other code point with a non-zero canonical combining class
      (accents, breathings, ...) is dropped.
    - Everything else is kept unchanged.

    The surviving characters are recomposed to NFC before being returned.
    """
    iota_subscript = "\u0345"

    def _mapped(code_point: str) -> str:
        # The iota subscript must be tested first: it is itself a combining
        # character and would otherwise be discarded with the other marks.
        if code_point == iota_subscript:
            return "ι"
        return "" if unicodedata.combining(code_point) else code_point

    stripped = "".join(
        _mapped(code_point) for code_point in unicodedata.normalize("NFD", text)
    )
    return unicodedata.normalize("NFC", stripped)
def sigma_normalize(token: str) -> str:
    """Return *token* with every final sigma 'ς' rewritten as 'σ'.

    Folding the two lowercase sigma forms together lets tokens be matched
    regardless of which form appears in them.
    """
    final_to_medial = {ord("ς"): "σ"}
    return token.translate(final_to_medial)
def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
    """Normalize input Greek text.

    Steps, in order:

    1. Lowercase the text.
    2. Turn newlines, tabs and every character of ``_EXTRA_PUNCT`` into spaces.
    3. Replace every remaining character that is not a word character,
       whitespace or a combining mark with a space (the underscore counts
       as a word character and is therefore kept).
    4. Collapse whitespace runs into single spaces and trim both ends.
    5. If ``strip_diacritics`` is true, pass the result through
       ``strip_greek_diacritics``.

    Diacritics are kept by default so feature extraction can detect
    iota-subscript endings like -ᾳ. This holds for decomposed input too:
    combining marks are preserved by step 3 instead of being treated as
    punctuation, so a word written as base letter + combining accent is not
    split at the accent.

    Args:
        text: The raw text to normalize.
        strip_diacritics: When true, remove diacritics from the result
            (iota subscripts become an explicit 'ι').

    Returns:
        The normalized, single-space-separated text.
    """
    lowered = text.lower()

    # One translation table for every character that is unconditionally a
    # separator: tabs/newlines plus the Greek-specific punctuation set.
    to_space = {"\n": " ", "\t": " "}
    to_space.update({ch: " " for ch in _EXTRA_PUNCT})
    cleaned = lowered.translate(str.maketrans(to_space))

    def _blank_unless_mark(match: "re.Match[str]") -> str:
        # ``\w`` only covers alphanumerics and "_"; combining marks (Unicode
        # general category M*) fall outside it, so without this check every
        # decomposed accent, breathing or iota subscript would be replaced
        # by a space and break the word in two.
        ch = match.group()
        return ch if unicodedata.category(ch).startswith("M") else " "

    # Remove remaining punctuation/symbols while keeping word chars, spaces
    # and combining marks.
    cleaned = re.sub(r"[^\w\s]", _blank_unless_mark, cleaned, flags=re.UNICODE)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    if strip_diacritics:
        cleaned = strip_greek_diacritics(cleaned)
    return cleaned