Spaces:
Sleeping
Sleeping
File size: 1,864 Bytes
from __future__ import annotations
import re
import unicodedata
from typing import List
# A small punctuation set that commonly appears in Greek texts.
# Written with explicit escapes: several of these characters are visually
# indistinguishable from each other or from ASCII, and U+0387 / U+037E are
# canonically equivalent to U+00B7 / U+003B, so an NFC pass over this file
# (by an editor or web tooling) would silently collapse them if typed literally.
_EXTRA_PUNCT = (
    "\u00b7\u0387"  # middle dot, Greek ano teleia
    ";\u037e"  # semicolon, Greek question mark
    "\u2014\u2013\u2026"  # em dash, en dash, horizontal ellipsis
    "\u00ab\u00bb\u2039\u203a"  # double and single guillemets
    "\u201c\u201d\u2018\u2019"  # curly double and single quotes
)
def strip_greek_diacritics(text: str) -> str:
    """Remove combining marks from *text*, spelling out iota subscripts.

    The text is first decomposed (NFD) so that each diacritic becomes its own
    combining code point. Every COMBINING GREEK YPOGEGRAMMENI (U+0345) is then
    replaced by a plain 'ι', every other character with a non-zero canonical
    combining class (accents, breathings, etc.) is dropped, and the result is
    recomposed (NFC).
    """
    iota_subscript = "\u0345"
    kept = [
        "ι" if char == iota_subscript else char
        for char in unicodedata.normalize("NFD", text)
        # U+0345 is itself a combining character, so it must be let through
        # explicitly before the generic combining-mark filter applies.
        if char == iota_subscript or not unicodedata.combining(char)
    ]
    return unicodedata.normalize("NFC", "".join(kept))
def sigma_normalize(token: str) -> str:
    """Return *token* with every final-form sigma (ς) rewritten as σ.

    Lets word-final and medial spellings of sigma compare equal when matching.
    """
    final_to_medial = str.maketrans("ς", "σ")
    return token.translate(final_to_medial)
def _blank_unless_mark(match: "re.Match[str]") -> str:
    """Keep a matched combining mark as-is; turn any other match into a space."""
    char = match.group()
    return char if unicodedata.category(char).startswith("M") else " "


def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
    """Normalize input Greek text.

    - Lowercase
    - Replace punctuation and symbols with spaces
    - Collapse whitespace runs (including tabs/newlines) to single spaces
      and trim both ends
    - Optionally strip diacritics

    Keep diacritics by default so feature extraction can detect iota-subscript
    endings like -ᾳ.

    Combining marks are preserved during punctuation removal. Python's ``\\w``
    does not match them, so without this exception a decomposed (NFD) word
    would be split into pieces at every accent or breathing, and the marks
    would already be gone before ``strip_diacritics`` could handle them
    (losing, for example, the iota subscript).

    Args:
        text: Raw input text.
        strip_diacritics: When true, pass the cleaned text through
            ``strip_greek_diacritics`` before returning it.

    Returns:
        The normalized text: lowercase, space-separated, no leading or
        trailing whitespace.
    """
    lowered = text.lower()
    cleaned = lowered.translate(str.maketrans({ch: " " for ch in _EXTRA_PUNCT}))
    # Remove remaining punctuation/symbols while keeping word chars, spaces,
    # and combining marks attached to the preceding letter.
    cleaned = re.sub(r"[^\w\s]", _blank_unless_mark, cleaned)
    # Tabs and newlines are whitespace, so this single collapse handles them.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    if strip_diacritics:
        cleaned = strip_greek_diacritics(cleaned)
    return cleaned
|