Spaces:

thomascerniglia
/

DialectAnalysis

Sleeping

File size: 13,757 Bytes

d0326ea

from __future__ import annotations

import unicodedata
from collections import Counter
from typing import Any, Dict, List, Mapping, Tuple

from .normalization import sigma_normalize, strip_greek_diacritics


# Feature inventories used by extract_features() and compute_rates().
# Unless noted otherwise, entries are written the way tokens look after
# diacritics are stripped and sigma is normalized (final ς -> σ).

# Common particles; counted when a normalized token equals one of them exactly.
PARTICLES: Tuple[str, ...] = ("μεν", "δε", "γαρ", "τε", "δη", "ουν")
# Word-final endings; counted by suffix match on the normalized token.
ENDINGS_PLAIN: Tuple[str, ...] = ("οι", "αι", "ηι", "οισι")

# Infinitive endings (high-signal morphology when present).
# These are matched on diacritic-stripped, sigma-normalized tokens.
INFINITIVE_ENDINGS_PLAIN: Tuple[str, ...] = (
    "ειν",    # common Attic/Ionic/Koine infinitive
    "μεναι",  # Aeolic-style infinitive
    "μεν",    # Doric/Aeolic-style infinitive
)

# A few additional, high-signal Homeric / epic-Ionic patterns (MVP).
# Matched on diacritic-stripped tokens.
EPIC_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οιο",   # e.g., Ἠελίοιο
    "φι",    # e.g., -φι instrumental
    "εσσι",  # -εσσι(ν)
    # Epic/Ionic genitive (sigma-normalized): -ηος (e.g., Ἀχιλῆος -> αχιληοσ)
    "ηοσ",
    # Epic patronymic genitive (e.g., Πηληϊάδεω, Ἀτρεΐδεω)
    "αδεω",
    "ιδεω",
)

# Dative plural patterns (useful for Ionic/Epic vs Attic/Koine tendencies).
# Matched on diacritic-stripped tokens.
# NOTE(review): "οις" and "αις" are spelled with a final sigma (ς), whereas the
# other sigma-bearing constants in this module use the normalized σ form
# (e.g. "ηοσ", "εισ"). Code that compares these against sigma-normalized
# tokens has to normalize the pattern as well, otherwise they cannot match.
DATIVE_PLURAL_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οισι",
    "ηισι",
    "αισι",
    "οις",
    "αις",
)

# Epic particles (very small MVP subset; diacritics stripped and sigma-normalized).
# Counted on exact token equality.
EPIC_PARTICLES_PLAIN: Tuple[str, ...] = (
    "κε",
    "κεν",
    # Very common Homeric particle (often written ἄρ/ἄρ᾽)
    "αρ",
    # Homeric/epic pronoun form
    "μιν",
)

# A few very common Homeric-vocabulary tokens (NOT dialect-specific in isolation).
# We only treat these as weak epic-Ionic evidence when multiple hits occur.
# Counted on exact token equality.
EPIC_WORDS_PLAIN: Tuple[str, ...] = (
    "εννεπε",
    "αειδε",
    "μουσα",
    "μηνιν",
    "θεα",
)

# Very small lexicalized Attic-vs-Ionic spelling cues (MVP).
# These are substring-based to catch inflectional variants.
ATTIC_TT_STEMS: Tuple[str, ...] = (
    "θαλαττ",  # θάλαττα
    "γλωττ",   # γλῶττα
    "πραττ",   # πράττω
    "ταττ",    # τάττω
)

# -σσ- counterparts of ATTIC_TT_STEMS, in the same order; also substring-based.
IONIC_SS_STEMS: Tuple[str, ...] = (
    "θαλασσ",  # θάλασσα
    "γλωσσ",   # γλῶσσα
    "πρασσ",   # πράσσω
    "τασσ",    # τάσσω
)

# Preposition preference (edition-dependent but often helpful): εἰς vs ἐς.
# Counted on exact token equality.
PREPOSITIONS_PLAIN: Tuple[str, ...] = (
    # NOTE: these are *sigma-normalized* (final ς -> σ)
    "εισ",
    "εσ",
)

# Koine-leaning function words (very small MVP set; genre-sensitive).
# These should be low-weight, positive-only cues.
# Counted on exact token equality.
KOINE_FUNCTION_WORDS_PLAIN: Tuple[str, ...] = (
    "ινα",
    "οτι",
    # NOTE: sigma-normalized
    "καθωσ",
    # NT-style narrative formula is common in Koine
    "εγενετο",
)

# Literary/poetic morphology cues.
# - Doric 1pl active ending often appears as -μες (vs -μεν).
# - Aeolic pronoun forms like ἄμμι/ὔμμι are strong when they occur.
# Unlike the tuples above, these are counter *key names*, not Greek patterns;
# the matching rules themselves live in extract_features().
POETIC_MORPH_CUES: Tuple[str, ...] = (
    "verb_1pl_mes",
    "aeolic_ammi",
    "aeolic_ummi",
)


def _ends_with_iota_subscript_cluster(token: str, base_letter: str) -> bool:
    """True if token ends with base_letter + iota-subscript (any accents allowed)."""

    if not token:
        return False

    decomposed = unicodedata.normalize("NFD", token)
    i = len(decomposed) - 1
    saw_ypogegrammeni = False
    while i >= 0 and unicodedata.combining(decomposed[i]):
        if decomposed[i] == "\u0345":
            saw_ypogegrammeni = True
        i -= 1

    if i < 0:
        return False

    base = decomposed[i]
    return base == base_letter and saw_ypogegrammeni


def _starts_with_rough_breathing_alpha(token: str) -> bool:
    """True if token begins with lowercase alpha carrying a rough breathing."""

    decomposed = unicodedata.normalize("NFD", token)
    if not decomposed or decomposed[0] != "α":
        return False

    # Only the combining marks attached to the first letter are relevant.
    for ch in decomposed[1:]:
        if not unicodedata.combining(ch):
            break
        # COMBINING REVERSED COMMA ABOVE (rough breathing)
        if ch == "\u0314":
            return True
    return False


def extract_features(tokens: List[str]) -> Dict[str, Any]:
    """Extract interpretable linguistic feature counts from tokens.

    Each non-empty token is reduced to a "plain" form (diacritics stripped,
    sigma normalized) and compared against the module-level inventories.
    The iota-subscript and rough-breathing checks use the original token,
    since those marks are lost in the plain form.

    Args:
        tokens: Word tokens of the text. Empty strings are skipped for
            feature matching but still contribute to ``token_count``.

    Returns:
        A dict with ``token_count`` plus one sub-dict of raw counts per
        feature family: ``particles``, ``endings``, ``infinitives``,
        ``epic_endings``, ``dative_plural_endings``, ``epic_particles``,
        ``epic_words``, ``prepositions``, ``koine_words``, ``lexical_cues``,
        ``patterns``, ``orthography``, ``script``, ``doric_cues`` and
        ``poetic_morph``. Every known key is present, with 0 when unseen.
    """

    token_count = len(tokens)
    particles = Counter({p: 0 for p in PARTICLES})
    endings = Counter({e: 0 for e in (*ENDINGS_PLAIN, "ᾳ")})
    infinitives = Counter({e: 0 for e in INFINITIVE_ENDINGS_PLAIN})

    epic_endings = Counter({e: 0 for e in EPIC_ENDINGS_PLAIN})

    dative_plural_endings = Counter({e: 0 for e in DATIVE_PLURAL_ENDINGS_PLAIN})
    epic_particles = Counter({p: 0 for p in EPIC_PARTICLES_PLAIN})

    epic_words = Counter({w: 0 for w in EPIC_WORDS_PLAIN})

    prepositions = Counter({p: 0 for p in PREPOSITIONS_PLAIN})
    koine_words = Counter({w: 0 for w in KOINE_FUNCTION_WORDS_PLAIN})

    lexical_cues = Counter(
        {
            "attic_tt": 0,
            "ionic_ss": 0,
        }
    )

    # Mild Doric cue: initial rough-breathed alpha (e.g., ἁ as article in Doric).
    doric_ha_initial = 0

    poetic_morph = Counter({k: 0 for k in POETIC_MORPH_CUES})

    # Orthographic patterns
    tt_count = 0
    ss_count = 0

    alpha_endings = 0
    eta_endings = 0

    # Script evidence: helps detect non-Greek input or encoding issues.
    greek_alpha_chars = 0
    alpha_chars = 0

    # Tokens are compared in sigma-normalized form, so suffix patterns written
    # with a final sigma ("οις", "αις", "ας", "ης") must be normalized the same
    # way or they can never match. The original spelling is kept as the
    # counter key so the shape of the returned dict is unchanged.
    dative_plural_patterns = tuple(
        (ending, sigma_normalize(ending)) for ending in DATIVE_PLURAL_ENDINGS_PLAIN
    )
    alpha_suffixes = tuple(sigma_normalize(s) for s in ("α", "ας", "αν"))
    eta_suffixes = tuple(sigma_normalize(s) for s in ("η", "ης", "ην"))

    for tok in tokens:
        if not tok:
            continue

        # Count alphabetic characters, and how many of them fall in the
        # Greek and Coptic (U+0370-03FF) or Greek Extended (U+1F00-1FFF) blocks.
        for ch in tok:
            if not ch.isalpha():
                continue
            alpha_chars += 1
            code = ord(ch)
            if (0x0370 <= code <= 0x03FF) or (0x1F00 <= code <= 0x1FFF):
                greek_alpha_chars += 1

        plain = sigma_normalize(strip_greek_diacritics(tok))

        # Doric 1pl -μες (sigma-normalized: -μεσ).
        # Guard against counting very short tokens.
        if len(plain) >= 5 and plain.endswith("μεσ"):
            poetic_morph["verb_1pl_mes"] += 1

        # Aeolic pronoun forms (very high signal).
        if plain == "αμμι":
            poetic_morph["aeolic_ammi"] += 1
        if plain == "υμμι":
            poetic_morph["aeolic_ummi"] += 1

        # Doric cue: token begins with alpha + rough breathing.
        # This is intentionally weak; lots of words can have rough breathing.
        if _starts_with_rough_breathing_alpha(tok):
            doric_ha_initial += 1

        # Count orthographic patterns (occurrences, not just token presence)
        tt_count += plain.count("ττ")
        ss_count += plain.count("σσ")

        # Whole-token lookups.
        if plain in particles:
            particles[plain] += 1

        if plain in epic_particles:
            epic_particles[plain] += 1

        if plain in epic_words:
            epic_words[plain] += 1

        if plain in prepositions:
            prepositions[plain] += 1

        if plain in koine_words:
            koine_words[plain] += 1

        # Lexicalized Attic/Ionic cues (at most one hit per token per cue).
        if any(stem in plain for stem in ATTIC_TT_STEMS):
            lexical_cues["attic_tt"] += 1
        if any(stem in plain for stem in IONIC_SS_STEMS):
            lexical_cues["ionic_ss"] += 1

        for ending in ENDINGS_PLAIN:
            if plain.endswith(ending):
                endings[ending] += 1

        # Infinitive endings (prefer longer endings first to avoid double-counting)
        # Guard against short function words like the particle "μεν".
        if len(plain) >= 5:
            if plain.endswith("μεναι"):
                infinitives["μεναι"] += 1
            elif plain.endswith("ειν"):
                infinitives["ειν"] += 1
            elif plain.endswith("μεν"):
                infinitives["μεν"] += 1

        for ending in EPIC_ENDINGS_PLAIN:
            if plain.endswith(ending):
                epic_endings[ending] += 1

        for ending, normalized_ending in dative_plural_patterns:
            if plain.endswith(normalized_ending):
                dative_plural_endings[ending] += 1

        # The iota subscript is a diacritic, so it must be read off the raw token.
        if _ends_with_iota_subscript_cluster(tok, "α"):
            endings["ᾳ"] += 1

        if plain.endswith(alpha_suffixes):
            alpha_endings += 1
        if plain.endswith(eta_suffixes):
            eta_endings += 1

    return {
        "token_count": token_count,
        "particles": dict(particles),
        "endings": dict(endings),
        "infinitives": dict(infinitives),
        "epic_endings": dict(epic_endings),
        "dative_plural_endings": dict(dative_plural_endings),
        "epic_particles": dict(epic_particles),
        "epic_words": dict(epic_words),
        "prepositions": dict(prepositions),
        "koine_words": dict(koine_words),
        "lexical_cues": dict(lexical_cues),
        "patterns": {
            "tt": tt_count,
            "ss": ss_count,
        },
        "orthography": {
            "alpha_endings": alpha_endings,
            "eta_endings": eta_endings,
        },
        "script": {
            "greek_alpha_chars": greek_alpha_chars,
            "alpha_chars": alpha_chars,
        },
        "doric_cues": {
            "ha_initial": doric_ha_initial,
        },
        "poetic_morph": dict(poetic_morph),
    }


def rate_per_100(count: int, token_count: int) -> float:
    """Return how many occurrences there are per 100 tokens.

    A zero or negative token_count yields 0.0 rather than dividing by zero.
    """

    return 0.0 if token_count <= 0 else (count / token_count) * 100.0


def compute_rates(feature_dict: Mapping[str, Any]) -> Dict[str, Any]:
    """Turn raw feature counts into per-100-token rates.

    Args:
        feature_dict: Counts keyed by feature family, with the total under
            ``token_count``. Missing or falsy families and counts are read
            as empty / zero.

    Returns:
        One ``<family>_per_100`` dict per feature family, the scalar
        ``alpha_endings_per_100`` and ``eta_endings_per_100`` rates, and
        ``marked_endings_per_100`` (the sum of the οισι, ηι and ᾳ ending rates).
    """

    total_tokens = int(feature_dict.get("token_count", 0) or 0)

    def counts_for(family: str) -> Mapping[str, int]:
        # A missing or falsy family is read as an empty mapping.
        return feature_dict.get(family, {}) or {}

    def per_100(counts: Mapping[str, int], key: str) -> float:
        # A missing or falsy count is read as zero.
        return rate_per_100(int(counts.get(key, 0) or 0), total_tokens)

    def rates_for(family: str, keys: Tuple[str, ...]) -> Dict[str, float]:
        counts = counts_for(family)
        return {key: per_100(counts, key) for key in keys}

    ending_rates = rates_for("endings", (*ENDINGS_PLAIN, "ᾳ"))
    orthography = counts_for("orthography")

    # Added left to right on purpose, so the float result stays reproducible.
    marked_rate = (
        ending_rates.get("οισι", 0.0)
        + ending_rates.get("ηι", 0.0)
        + ending_rates.get("ᾳ", 0.0)
    )

    return {
        "particles_per_100": rates_for("particles", PARTICLES),
        "endings_per_100": ending_rates,
        "infinitives_per_100": rates_for("infinitives", INFINITIVE_ENDINGS_PLAIN),
        "patterns_per_100": rates_for("patterns", ("tt", "ss")),
        "epic_particles_per_100": rates_for("epic_particles", EPIC_PARTICLES_PLAIN),
        "epic_endings_per_100": rates_for("epic_endings", EPIC_ENDINGS_PLAIN),
        "dative_plural_endings_per_100": rates_for(
            "dative_plural_endings", DATIVE_PLURAL_ENDINGS_PLAIN
        ),
        "prepositions_per_100": rates_for("prepositions", PREPOSITIONS_PLAIN),
        "koine_words_per_100": rates_for("koine_words", KOINE_FUNCTION_WORDS_PLAIN),
        "lexical_cues_per_100": rates_for("lexical_cues", ("attic_tt", "ionic_ss")),
        "doric_cues_per_100": rates_for("doric_cues", ("ha_initial",)),
        "poetic_morph_per_100": rates_for("poetic_morph", POETIC_MORPH_CUES),
        "alpha_endings_per_100": per_100(orthography, "alpha_endings"),
        "eta_endings_per_100": per_100(orthography, "eta_endings"),
        "marked_endings_per_100": marked_rate,
    }