from __future__ import annotations

import unicodedata
from collections import Counter
from typing import Any, Dict, List, Mapping, Tuple

from .normalization import sigma_normalize, strip_greek_diacritics

PARTICLES: Tuple[str, ...] = ("μεν", "δε", "γαρ", "τε", "δη", "ουν")
ENDINGS_PLAIN: Tuple[str, ...] = ("οι", "αι", "ηι", "οισι")

# Infinitive endings (high-signal morphology when present).
# These are matched on diacritic-stripped, sigma-normalized tokens.
INFINITIVE_ENDINGS_PLAIN: Tuple[str, ...] = (
    "ειν",  # common Attic/Ionic/Koine infinitive
    "μεναι",  # Aeolic-style infinitive
    "μεν",  # Doric/Aeolic-style infinitive
)

# A few additional, high-signal Homeric / epic-Ionic patterns (MVP).
# Matched on diacritic-stripped tokens.
EPIC_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οιο",  # e.g., Ἠελίοιο
    "φι",  # e.g., -φι instrumental
    "εσσι",  # -εσσι(ν)
    # Epic/Ionic genitive (sigma-normalized): -ηος (e.g., Ἀχιλῆος -> αχιληοσ)
    "ηοσ",
    # Epic patronymic genitive (e.g., Πηληϊάδεω, Ἀτρεΐδεω)
    "αδεω",
    "ιδεω",
)

# Dative plural patterns (useful for Ionic/Epic vs Attic/Koine tendencies).
# Matched on diacritic-stripped tokens.
# NOTE: "οις"/"αις" are spelled with final sigma (ς) because these strings are
# also the keys of the emitted feature dictionaries. Tokens are sigma-normalized
# before matching, so extract_features compares against both sigma spellings
# (see _DATIVE_PLURAL_MATCH_FORMS below).
DATIVE_PLURAL_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οισι",
    "ηισι",
    "αισι",
    "οις",
    "αις",
)

# Epic particles (very small MVP subset; diacritics stripped and sigma-normalized).
EPIC_PARTICLES_PLAIN: Tuple[str, ...] = (
    "κε",
    "κεν",
    # Very common Homeric particle (often written ἄρ/ἄρ᾽)
    "αρ",
    # Homeric/epic pronoun form
    "μιν",
)

# A few very common Homeric-vocabulary tokens (NOT dialect-specific in isolation).
# We only treat these as weak epic-Ionic evidence when multiple hits occur.
EPIC_WORDS_PLAIN: Tuple[str, ...] = (
    "εννεπε",
    "αειδε",
    "μουσα",
    "μηνιν",
    "θεα",
)

# Very small lexicalized Attic-vs-Ionic spelling cues (MVP).
# These are substring-based to catch inflectional variants.
ATTIC_TT_STEMS: Tuple[str, ...] = (
    "θαλαττ",  # θάλαττα
    "γλωττ",  # γλῶττα
    "πραττ",  # πράττω
    "ταττ",  # τάττω
)
IONIC_SS_STEMS: Tuple[str, ...] = (
    "θαλασσ",  # θάλασσα
    "γλωσσ",  # γλῶσσα
    "πρασσ",  # πράσσω
    "τασσ",  # τάσσω
)

# Preposition preference (edition-dependent but often helpful): εἰς vs ἐς.
PREPOSITIONS_PLAIN: Tuple[str, ...] = (
    # NOTE: these are *sigma-normalized* (final ς -> σ)
    "εισ",
    "εσ",
)

# Koine-leaning function words (very small MVP set; genre-sensitive).
# These should be low-weight, positive-only cues.
KOINE_FUNCTION_WORDS_PLAIN: Tuple[str, ...] = (
    "ινα",
    "οτι",
    # NOTE: sigma-normalized
    "καθωσ",
    # NT-style narrative formula is common in Koine
    "εγενετο",
)

# Literary/poetic morphology cues.
# - Doric 1pl active ending often appears as -μες (vs -μεν).
# - Aeolic pronoun forms like ἄμμι/ὔμμι are strong when they occur.
POETIC_MORPH_CUES: Tuple[str, ...] = (
    "verb_1pl_mes",
    "aeolic_ammi",
    "aeolic_ummi",
)


def _sigma_variants(ending: str) -> Tuple[str, ...]:
    """Return ``ending`` plus its spelling with every ς replaced by σ.

    The result is a tuple suitable for ``str.endswith``; it has one element
    when the ending contains no final-sigma character.
    """
    normalized = ending.replace("ς", "σ")
    if normalized == ending:
        return (ending,)
    return (ending, normalized)


# (output key, suffixes to test) pairs for the dative plural endings. Tokens are
# passed through sigma_normalize before matching, so an ending spelled with ς
# must also be tested in its σ spelling or it can never match.
_DATIVE_PLURAL_MATCH_FORMS: Tuple[Tuple[str, Tuple[str, ...]], ...] = tuple(
    (ending, _sigma_variants(ending)) for ending in DATIVE_PLURAL_ENDINGS_PLAIN
)

# Suffixes counted as alpha-/eta-type endings; both sigma spellings are listed
# for the same reason as above.
_ALPHA_ENDING_FORMS: Tuple[str, ...] = ("α", "ας", "ασ", "αν")
_ETA_ENDING_FORMS: Tuple[str, ...] = ("η", "ης", "ησ", "ην")


def _ends_with_iota_subscript_cluster(token: str, base_letter: str) -> bool:
    """True if token ends with base_letter + iota-subscript (any accents allowed).

    The token is NFD-decomposed and its trailing run of combining marks is
    scanned for U+0345 (COMBINING GREEK YPOGEGRAMMENI); the character before
    that run must equal ``base_letter``.
    """
    if not token:
        return False

    decomposed = unicodedata.normalize("NFD", token)
    i = len(decomposed) - 1
    saw_ypogegrammeni = False
    # Walk backwards over the trailing combining marks.
    while i >= 0 and unicodedata.combining(decomposed[i]):
        if decomposed[i] == "\u0345":
            saw_ypogegrammeni = True
        i -= 1

    if i < 0:
        # The token consisted only of combining marks.
        return False

    return decomposed[i] == base_letter and saw_ypogegrammeni


def _starts_with_rough_alpha(token: str) -> bool:
    """True if token begins with lowercase alpha carrying a rough breathing.

    Only the combining marks directly attached to the first character (after
    NFD decomposition) are inspected.
    """
    nfd = unicodedata.normalize("NFD", token)
    if not nfd or nfd[0] != "α":
        return False
    for mark in nfd[1:]:
        if not unicodedata.combining(mark):
            break
        # COMBINING REVERSED COMMA ABOVE (rough breathing)
        if mark == "\u0314":
            return True
    return False


def extract_features(tokens: List[str]) -> Dict[str, Any]:
    """Extract interpretable linguistic feature counts from tokens.

    Each non-empty token is inspected both in its raw form (script statistics,
    breathing marks, iota subscript) and in its "plain" form, i.e. after
    ``strip_greek_diacritics`` and ``sigma_normalize``.

    Args:
        tokens: Token strings. Empty strings are skipped for feature counting
            but still contribute to ``token_count``.

    Returns:
        A dictionary with ``token_count`` plus one sub-dictionary of counts per
        feature family (``particles``, ``endings``, ``infinitives``,
        ``epic_endings``, ``dative_plural_endings``, ``epic_particles``,
        ``epic_words``, ``prepositions``, ``koine_words``, ``lexical_cues``,
        ``patterns``, ``orthography``, ``script``, ``doric_cues``,
        ``poetic_morph``). Every known key is present even when its count is 0.
    """
    token_count = len(tokens)

    particles = Counter({p: 0 for p in PARTICLES})
    endings = Counter({e: 0 for e in (*ENDINGS_PLAIN, "ᾳ")})
    infinitives = Counter({e: 0 for e in INFINITIVE_ENDINGS_PLAIN})
    epic_endings = Counter({e: 0 for e in EPIC_ENDINGS_PLAIN})
    dative_plural_endings = Counter({e: 0 for e in DATIVE_PLURAL_ENDINGS_PLAIN})
    epic_particles = Counter({p: 0 for p in EPIC_PARTICLES_PLAIN})
    epic_words = Counter({w: 0 for w in EPIC_WORDS_PLAIN})
    prepositions = Counter({p: 0 for p in PREPOSITIONS_PLAIN})
    koine_words = Counter({w: 0 for w in KOINE_FUNCTION_WORDS_PLAIN})
    lexical_cues = Counter(
        {
            "attic_tt": 0,
            "ionic_ss": 0,
        }
    )

    # Mild Doric cue: initial rough-breathed alpha (e.g., ἁ as article in Doric).
    doric_ha_initial = 0
    poetic_morph = Counter({k: 0 for k in POETIC_MORPH_CUES})

    # Orthographic patterns
    tt_count = 0
    ss_count = 0
    alpha_endings = 0
    eta_endings = 0

    # Script evidence: helps detect non-Greek input or encoding issues.
    greek_alpha_chars = 0
    alpha_chars = 0

    for tok in tokens:
        if not tok:
            continue

        for ch in tok:
            if not ch.isalpha():
                continue
            alpha_chars += 1
            code = ord(ch)
            # Greek and Coptic block, or Greek Extended block.
            if (0x0370 <= code <= 0x03FF) or (0x1F00 <= code <= 0x1FFF):
                greek_alpha_chars += 1

        plain = sigma_normalize(strip_greek_diacritics(tok))

        # Doric 1pl -μες (sigma-normalized: -μεσ).
        # Guard against counting very short tokens.
        if len(plain) >= 5 and plain.endswith("μεσ"):
            poetic_morph["verb_1pl_mes"] += 1

        # Aeolic pronoun forms (very high signal).
        if plain == "αμμι":
            poetic_morph["aeolic_ammi"] += 1
        if plain == "υμμι":
            poetic_morph["aeolic_ummi"] += 1

        # Doric cue: token begins with alpha + rough breathing.
        # This is intentionally weak; lots of words can have rough breathing.
        if _starts_with_rough_alpha(tok):
            doric_ha_initial += 1

        # Count orthographic patterns (occurrences, not just token presence)
        tt_count += plain.count("ττ")
        ss_count += plain.count("σσ")

        # Whole-token lexical matches.
        if plain in particles:
            particles[plain] += 1
        if plain in epic_particles:
            epic_particles[plain] += 1
        if plain in epic_words:
            epic_words[plain] += 1
        if plain in prepositions:
            prepositions[plain] += 1
        if plain in koine_words:
            koine_words[plain] += 1

        # Lexicalized Attic/Ionic cues
        if any(stem in plain for stem in ATTIC_TT_STEMS):
            lexical_cues["attic_tt"] += 1
        if any(stem in plain for stem in IONIC_SS_STEMS):
            lexical_cues["ionic_ss"] += 1

        for ending in ENDINGS_PLAIN:
            if plain.endswith(ending):
                endings[ending] += 1

        # Infinitive endings (prefer longer endings first to avoid double-counting)
        # Guard against short function words like the particle "μεν".
        if len(plain) >= 5:
            if plain.endswith("μεναι"):
                infinitives["μεναι"] += 1
            elif plain.endswith("ειν"):
                infinitives["ειν"] += 1
            elif plain.endswith("μεν"):
                infinitives["μεν"] += 1

        for ending in EPIC_ENDINGS_PLAIN:
            if plain.endswith(ending):
                epic_endings[ending] += 1

        # Counted under the original key, matched in either sigma spelling so
        # that "οις"/"αις" are not silently stuck at zero.
        for ending, match_forms in _DATIVE_PLURAL_MATCH_FORMS:
            if plain.endswith(match_forms):
                dative_plural_endings[ending] += 1

        # The iota subscript is a diacritic, so it must be checked on the raw token.
        if _ends_with_iota_subscript_cluster(tok, "α"):
            endings["ᾳ"] += 1

        if plain.endswith(_ALPHA_ENDING_FORMS):
            alpha_endings += 1
        if plain.endswith(_ETA_ENDING_FORMS):
            eta_endings += 1

    return {
        "token_count": token_count,
        "particles": dict(particles),
        "endings": dict(endings),
        "infinitives": dict(infinitives),
        "epic_endings": dict(epic_endings),
        "dative_plural_endings": dict(dative_plural_endings),
        "epic_particles": dict(epic_particles),
        "epic_words": dict(epic_words),
        "prepositions": dict(prepositions),
        "koine_words": dict(koine_words),
        "lexical_cues": dict(lexical_cues),
        "patterns": {
            "tt": tt_count,
            "ss": ss_count,
        },
        "orthography": {
            "alpha_endings": alpha_endings,
            "eta_endings": eta_endings,
        },
        "script": {
            "greek_alpha_chars": greek_alpha_chars,
            "alpha_chars": alpha_chars,
        },
        "doric_cues": {
            "ha_initial": doric_ha_initial,
        },
        "poetic_morph": dict(poetic_morph),
    }


def rate_per_100(count: int, token_count: int) -> float:
    """Return ``count`` expressed per 100 tokens; 0.0 when ``token_count`` <= 0."""
    if token_count <= 0:
        return 0.0
    return 100.0 * (count / token_count)


def _rates_for(
    counts: Mapping[str, int],
    keys: Tuple[str, ...],
    token_count: int,
) -> Dict[str, float]:
    """Map each key to its per-100-token rate; missing or falsy counts are 0."""
    return {
        key: rate_per_100(int(counts.get(key, 0) or 0), token_count)
        for key in keys
    }


def compute_rates(feature_dict: Mapping[str, Any]) -> Dict[str, Any]:
    """Compute per-100-token rates from feature counts.

    Args:
        feature_dict: A mapping shaped like the result of ``extract_features``.
            Missing or falsy sections and counts are treated as zero.

    Returns:
        A dictionary of ``*_per_100`` entries: one sub-dictionary per feature
        family, plus the scalar ``alpha_endings_per_100``,
        ``eta_endings_per_100`` and ``marked_endings_per_100`` (the sum of the
        "οισι", "ηι" and "ᾳ" ending rates).
    """
    token_count = int(feature_dict.get("token_count", 0) or 0)

    def section(name: str) -> Mapping[str, int]:
        # Tolerate absent sections as well as explicit None / empty values.
        return feature_dict.get(name, {}) or {}

    orth = section("orthography")

    ending_rates = _rates_for(section("endings"), (*ENDINGS_PLAIN, "ᾳ"), token_count)
    alpha_rate = rate_per_100(int(orth.get("alpha_endings", 0) or 0), token_count)
    eta_rate = rate_per_100(int(orth.get("eta_endings", 0) or 0), token_count)
    marked_rate = (
        ending_rates.get("οισι", 0.0)
        + ending_rates.get("ηι", 0.0)
        + ending_rates.get("ᾳ", 0.0)
    )

    return {
        "particles_per_100": _rates_for(section("particles"), PARTICLES, token_count),
        "endings_per_100": ending_rates,
        "infinitives_per_100": _rates_for(
            section("infinitives"), INFINITIVE_ENDINGS_PLAIN, token_count
        ),
        "patterns_per_100": _rates_for(section("patterns"), ("tt", "ss"), token_count),
        "epic_particles_per_100": _rates_for(
            section("epic_particles"), EPIC_PARTICLES_PLAIN, token_count
        ),
        "epic_endings_per_100": _rates_for(
            section("epic_endings"), EPIC_ENDINGS_PLAIN, token_count
        ),
        "dative_plural_endings_per_100": _rates_for(
            section("dative_plural_endings"), DATIVE_PLURAL_ENDINGS_PLAIN, token_count
        ),
        "prepositions_per_100": _rates_for(
            section("prepositions"), PREPOSITIONS_PLAIN, token_count
        ),
        "koine_words_per_100": _rates_for(
            section("koine_words"), KOINE_FUNCTION_WORDS_PLAIN, token_count
        ),
        "lexical_cues_per_100": _rates_for(
            section("lexical_cues"), ("attic_tt", "ionic_ss"), token_count
        ),
        "doric_cues_per_100": _rates_for(
            section("doric_cues"), ("ha_initial",), token_count
        ),
        "poetic_morph_per_100": _rates_for(
            section("poetic_morph"), POETIC_MORPH_CUES, token_count
        ),
        "alpha_endings_per_100": alpha_rate,
        "eta_endings_per_100": eta_rate,
        "marked_endings_per_100": marked_rate,
    }