Spaces:

thomascerniglia
/

DialectAnalysis

Sleeping

File size: 1,864 Bytes

d0326ea

from __future__ import annotations

import re
import unicodedata
from typing import List

# A small punctuation set that commonly appears in Greek texts.
# normalize_text() maps every character of this string to a space before it
# runs its generic punctuation-removal pass.
# NOTE(review): the trailing comment names the ano teleia and the Greek
# question mark, which render like the middle dot and the ASCII semicolon.
# Confirm the look-alike pairs in the literal are still distinct code points;
# NFC-normalizing this file would fold each pair into a single character.
_EXTRA_PUNCT = "··;;—–…«»‹›“”‘’"  # ano teleia, Greek question mark, dashes, quotes


def strip_greek_diacritics(text: str) -> str:
    """Return *text* without combining diacritics, spelling iota subscript out.

    The text is decomposed (NFD) so every accent, breathing, etc. becomes a
    separate combining code point. Then:

    - COMBINING GREEK YPOGEGRAMMENI (U+0345) is replaced by a plain 'ι';
    - every other character with a non-zero canonical combining class is
      dropped;
    - everything else is kept as is.

    The surviving characters are recomposed (NFC) before being returned.
    """
    iota_subscript = "\u0345"
    kept: List[str] = [
        "ι" if code_point == iota_subscript else code_point
        for code_point in unicodedata.normalize("NFD", text)
        # The subscript test comes first: U+0345 is itself a combining mark
        # and would otherwise be filtered out with the rest.
        if code_point == iota_subscript or not unicodedata.combining(code_point)
    ]
    return unicodedata.normalize("NFC", "".join(kept))


def sigma_normalize(token: str) -> str:
    """Return *token* with every final-form sigma (ς) rewritten as σ.

    Collapsing the two lowercase sigma shapes lets tokens be matched without
    regard to where the sigma sits in the word.
    """
    final_to_medial = {ord("ς"): "σ"}
    return token.translate(final_to_medial)


def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
    """Normalize input Greek text.

    - Lowercase
    - Remove punctuation
    - Optionally strip diacritics

    Keep diacritics by default so feature extraction can detect iota-subscript
    endings like -ᾳ. Combining marks (Unicode general category M*) are never
    treated as punctuation, so decomposed (NFD) input keeps its accents,
    breathings and iota subscripts attached to their base letters instead of
    being broken into fragments at every mark.

    Args:
        text: Raw input text.
        strip_diacritics: When true, the cleaned text is passed through
            ``strip_greek_diacritics`` before being returned.

    Returns:
        The lowercased text with punctuation and symbols replaced by spaces,
        whitespace runs collapsed to a single space, and leading/trailing
        whitespace removed.
    """

    def _blank_unless_mark(match: "re.Match[str]") -> str:
        # ``\w`` only covers str.isalnum() characters and "_", so combining
        # marks land in the "not a word character" class. Keep them; replace
        # genuine punctuation/symbols with a space.
        char = match.group()
        return char if unicodedata.category(char).startswith("M") else " "

    lowered = text.lower()

    # Replace tabs/newlines and the Greek-specific punctuation with spaces.
    to_space = str.maketrans({ch: " " for ch in "\n\t" + _EXTRA_PUNCT})
    cleaned = lowered.translate(to_space)

    # Remove remaining punctuation/symbols while keeping word chars, spaces
    # and combining marks.
    cleaned = re.sub(r"[^\w\s]", _blank_unless_mark, cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    if strip_diacritics:
        cleaned = strip_greek_diacritics(cleaned)

    return cleaned