"""Heuristic feature extraction for Ancient Greek dialect cues (MVP).

Counts interpretable linguistic signals (particles, endings, orthographic
patterns) and converts the raw counts to per-100-token rates.
"""
| from __future__ import annotations | |
| import unicodedata | |
| from collections import Counter | |
| from typing import Any, Dict, List, Mapping, Tuple | |
| from .normalization import sigma_normalize, strip_greek_diacritics | |
# Common Greek discourse particles. All *_PLAIN inventories in this module
# are matched against diacritic-stripped, sigma-normalized tokens.
PARTICLES: Tuple[str, ...] = ("μεν", "δε", "γαρ", "τε", "δη", "ουν")

# Coarse ending cues (suffix-matched on diacritic-stripped tokens).
ENDINGS_PLAIN: Tuple[str, ...] = ("οι", "αι", "ηι", "οισι")

# Infinitive endings (high-signal morphology when present).
# These are matched on diacritic-stripped, sigma-normalized tokens.
INFINITIVE_ENDINGS_PLAIN: Tuple[str, ...] = (
    "ειν",  # common Attic/Ionic/Koine infinitive
    "μεναι",  # Aeolic-style infinitive
    "μεν",  # Doric/Aeolic-style infinitive
)

# A few additional, high-signal Homeric / epic-Ionic patterns (MVP).
# Matched on diacritic-stripped tokens.
EPIC_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οιο",  # e.g., Ἠελίοιο
    "φι",  # e.g., -φι instrumental
    "εσσι",  # -εσσι(ν)
    # Epic/Ionic genitive (sigma-normalized): -ηος (e.g., Ἀχιλῆος -> αχιληοσ)
    "ηοσ",
    # Epic patronymic genitive (e.g., Πηληϊάδεω, Ἀτρεΐδεω)
    "αδεω",
    "ιδεω",
)

# Dative plural patterns (useful for Ionic/Epic vs Attic/Koine tendencies).
# Matched on diacritic-stripped, sigma-normalized tokens.
DATIVE_PLURAL_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οισι",
    "ηισι",
    "αισι",
    # BUG FIX: the original "οις"/"αις" end in FINAL sigma (ς, U+03C2), which
    # can never match sigma-normalized tokens (ς -> σ). The normalized forms
    # "οισ"/"αισ" are added so these endings actually count; the original
    # spellings are retained only to keep downstream rate-dict keys stable.
    "οις",
    "αις",
    "οισ",
    "αισ",
)

# Epic particles (very small MVP subset; diacritics stripped and sigma-normalized).
EPIC_PARTICLES_PLAIN: Tuple[str, ...] = (
    "κε",
    "κεν",
    # Very common Homeric particle (often written ἄρ/ἄρ᾽)
    "αρ",
    # Homeric/epic pronoun form
    "μιν",
)

# A few very common Homeric-vocabulary tokens (NOT dialect-specific in isolation).
# We only treat these as weak epic-Ionic evidence when multiple hits occur.
EPIC_WORDS_PLAIN: Tuple[str, ...] = (
    "εννεπε",
    "αειδε",
    "μουσα",
    "μηνιν",
    "θεα",
)

# Very small lexicalized Attic-vs-Ionic spelling cues (MVP).
# These are substring-based to catch inflectional variants.
ATTIC_TT_STEMS: Tuple[str, ...] = (
    "θαλαττ",  # θάλαττα
    "γλωττ",  # γλῶττα
    "πραττ",  # πράττω
    "ταττ",  # τάττω
)

IONIC_SS_STEMS: Tuple[str, ...] = (
    "θαλασσ",  # θάλασσα
    "γλωσσ",  # γλῶσσα
    "πρασσ",  # πράσσω
    "τασσ",  # τάσσω
)

# Preposition preference (edition-dependent but often helpful): εἰς vs ἐς.
PREPOSITIONS_PLAIN: Tuple[str, ...] = (
    # NOTE: these are *sigma-normalized* (final ς -> σ)
    "εισ",
    "εσ",
)

# Koine-leaning function words (very small MVP set; genre-sensitive).
# These should be low-weight, positive-only cues.
KOINE_FUNCTION_WORDS_PLAIN: Tuple[str, ...] = (
    "ινα",
    "οτι",
    # NOTE: sigma-normalized
    "καθωσ",
    # NT-style narrative formula is common in Koine
    "εγενετο",
)

# Literary/poetic morphology cues.
# - Doric 1pl active ending often appears as -μες (vs -μεν).
# - Aeolic pronoun forms like ἄμμι/ὔμμι are strong when they occur.
POETIC_MORPH_CUES: Tuple[str, ...] = (
    "verb_1pl_mes",
    "aeolic_ammi",
    "aeolic_ummi",
)
| def _ends_with_iota_subscript_cluster(token: str, base_letter: str) -> bool: | |
| """True if token ends with base_letter + iota-subscript (any accents allowed).""" | |
| if not token: | |
| return False | |
| decomposed = unicodedata.normalize("NFD", token) | |
| i = len(decomposed) - 1 | |
| saw_ypogegrammeni = False | |
| while i >= 0 and unicodedata.combining(decomposed[i]): | |
| if decomposed[i] == "\u0345": | |
| saw_ypogegrammeni = True | |
| i -= 1 | |
| if i < 0: | |
| return False | |
| base = decomposed[i] | |
| return base == base_letter and saw_ypogegrammeni | |
def extract_features(tokens: List[str]) -> Dict[str, Any]:
    """Extract interpretable linguistic feature counts from tokens.

    Each token is diacritic-stripped and sigma-normalized before matching
    against the module-level cue inventories; the raw token is still used
    for combining-mark checks (iota subscript, rough breathing).

    Args:
        tokens: Pre-tokenized word forms. Empty strings are skipped.

    Returns:
        A dict of raw counts (not rates), keyed by feature family; every
        expected cue key is present even when its count is zero.
    """
    token_count = len(tokens)
    # Pre-seed counters with zeros so all expected keys always appear in the
    # returned dicts, even for cues that never occur in the input.
    particles = Counter({p: 0 for p in PARTICLES})
    endings = Counter({e: 0 for e in (*ENDINGS_PLAIN, "ᾳ")})
    infinitives = Counter({e: 0 for e in INFINITIVE_ENDINGS_PLAIN})
    epic_endings = Counter({e: 0 for e in EPIC_ENDINGS_PLAIN})
    dative_plural_endings = Counter({e: 0 for e in DATIVE_PLURAL_ENDINGS_PLAIN})
    epic_particles = Counter({p: 0 for p in EPIC_PARTICLES_PLAIN})
    epic_words = Counter({w: 0 for w in EPIC_WORDS_PLAIN})
    prepositions = Counter({p: 0 for p in PREPOSITIONS_PLAIN})
    koine_words = Counter({w: 0 for w in KOINE_FUNCTION_WORDS_PLAIN})
    lexical_cues = Counter(
        {
            "attic_tt": 0,
            "ionic_ss": 0,
        }
    )
    # Mild Doric cue: initial rough-breathed alpha (e.g., ἁ as article in Doric).
    doric_ha_initial = 0
    poetic_morph = Counter({k: 0 for k in POETIC_MORPH_CUES})
    # Orthographic patterns
    tt_count = 0
    ss_count = 0
    alpha_endings = 0
    eta_endings = 0
    # Script evidence: helps detect non-Greek input or encoding issues.
    greek_alpha_chars = 0
    alpha_chars = 0
    for tok in tokens:
        if not tok:
            continue
        # Tally alphabetic chars and how many fall in the Greek Unicode
        # blocks (Greek & Coptic U+0370–U+03FF, Greek Extended U+1F00–U+1FFF).
        for ch in tok:
            if not ch.isalpha():
                continue
            alpha_chars += 1
            code = ord(ch)
            if (0x0370 <= code <= 0x03FF) or (0x1F00 <= code <= 0x1FFF):
                greek_alpha_chars += 1
        # All exact/suffix/substring cue matches below use this normalized form.
        plain = sigma_normalize(strip_greek_diacritics(tok))
        # Doric 1pl -μες (sigma-normalized: -μεσ).
        # Guard against counting very short tokens.
        if len(plain) >= 5 and plain.endswith("μεσ"):
            poetic_morph["verb_1pl_mes"] += 1
        # Aeolic pronoun forms (very high signal).
        if plain == "αμμι":
            poetic_morph["aeolic_ammi"] += 1
        if plain == "υμμι":
            poetic_morph["aeolic_ummi"] += 1
        # Doric cue: token begins with alpha + rough breathing.
        # This is intentionally weak; lots of words can have rough breathing.
        nfd = unicodedata.normalize("NFD", tok)
        if nfd:
            base0 = nfd[0]
            # Collect leading combining marks
            j = 1
            has_rough = False
            while j < len(nfd) and unicodedata.combining(nfd[j]):
                # COMBINING REVERSED COMMA ABOVE (rough breathing)
                if nfd[j] == "\u0314":
                    has_rough = True
                j += 1
            if base0 == "α" and has_rough:
                doric_ha_initial += 1
        # Count orthographic patterns (occurrences, not just token presence)
        tt_count += plain.count("ττ")
        ss_count += plain.count("σσ")
        # Exact-match inventories (membership tests check the Counter's keys).
        if plain in particles:
            particles[plain] += 1
        if plain in epic_particles:
            epic_particles[plain] += 1
        if plain in epic_words:
            epic_words[plain] += 1
        if plain in prepositions:
            prepositions[plain] += 1
        if plain in koine_words:
            koine_words[plain] += 1
        # Lexicalized Attic/Ionic cues
        if any(stem in plain for stem in ATTIC_TT_STEMS):
            lexical_cues["attic_tt"] += 1
        if any(stem in plain for stem in IONIC_SS_STEMS):
            lexical_cues["ionic_ss"] += 1
        for ending in ENDINGS_PLAIN:
            if plain.endswith(ending):
                endings[ending] += 1
        # Infinitive endings (prefer longer endings first to avoid double-counting)
        # Guard against short function words like the particle "μεν".
        if len(plain) >= 5:
            if plain.endswith("μεναι"):
                infinitives["μεναι"] += 1
            elif plain.endswith("ειν"):
                infinitives["ειν"] += 1
            elif plain.endswith("μεν"):
                infinitives["μεν"] += 1
        for ending in EPIC_ENDINGS_PLAIN:
            if plain.endswith(ending):
                epic_endings[ending] += 1
        for ending in DATIVE_PLURAL_ENDINGS_PLAIN:
            if plain.endswith(ending):
                dative_plural_endings[ending] += 1
        # Iota subscript must be checked on the RAW token: stripping
        # diacritics removes the ypogegrammeni before `plain` is formed.
        if _ends_with_iota_subscript_cluster(tok, "α"):
            endings["ᾳ"] += 1
        # BUG FIX: `plain` is sigma-normalized (final ς -> σ), so the original
        # suffixes "ας"/"ης" (with final sigma ς) could never match. The
        # normalized "ασ"/"ησ" forms are added; the old spellings are kept
        # harmlessly (they simply never fire).
        if plain.endswith(("α", "ας", "αν", "ασ")):
            alpha_endings += 1
        if plain.endswith(("η", "ης", "ην", "ησ")):
            eta_endings += 1
    return {
        "token_count": token_count,
        "particles": dict(particles),
        "endings": dict(endings),
        "infinitives": dict(infinitives),
        "epic_endings": dict(epic_endings),
        "dative_plural_endings": dict(dative_plural_endings),
        "epic_particles": dict(epic_particles),
        "epic_words": dict(epic_words),
        "prepositions": dict(prepositions),
        "koine_words": dict(koine_words),
        "lexical_cues": dict(lexical_cues),
        "patterns": {
            "tt": tt_count,
            "ss": ss_count,
        },
        "orthography": {
            "alpha_endings": alpha_endings,
            "eta_endings": eta_endings,
        },
        "script": {
            "greek_alpha_chars": greek_alpha_chars,
            "alpha_chars": alpha_chars,
        },
        "doric_cues": {
            "ha_initial": doric_ha_initial,
        },
        "poetic_morph": dict(poetic_morph),
    }
def rate_per_100(count: int, token_count: int) -> float:
    """Normalize a raw count to a per-100-token rate.

    Returns 0.0 when token_count is zero or negative instead of raising.
    """
    return 0.0 if token_count <= 0 else 100.0 * (count / token_count)
def compute_rates(feature_dict: Mapping[str, Any]) -> Dict[str, Any]:
    """Compute per-100-token rates from feature counts.

    Args:
        feature_dict: Output of ``extract_features`` (or any mapping with the
            same sections). Missing sections and ``None`` values count as 0.

    Returns:
        A dict of per-100-token rates mirroring the count sections, plus the
        composite ``marked_endings_per_100`` cue.
    """
    token_count = int(feature_dict.get("token_count", 0) or 0)

    def _section(name: str) -> Mapping[str, int]:
        # Missing or explicitly-None sections behave like empty mappings.
        return feature_dict.get(name, {}) or {}

    def _rate(counts: Mapping[str, int], key: str) -> float:
        # Coerce missing/None counts to 0 before converting to a rate.
        return rate_per_100(int(counts.get(key, 0) or 0), token_count)

    def _rate_map(counts: Mapping[str, int], keys: Tuple[str, ...]) -> Dict[str, float]:
        # One rate per canonical key, so output keys are stable regardless
        # of which counts are present in the input.
        return {k: _rate(counts, k) for k in keys}

    endings = _section("endings")
    orth = _section("orthography")
    patterns = _section("patterns")
    lexical_cues = _section("lexical_cues")
    doric_cues = _section("doric_cues")

    ending_rates = _rate_map(endings, (*ENDINGS_PLAIN, "ᾳ"))
    # Composite cue: sum of the "marked" (non-default) ending rates.
    marked_rate = ending_rates.get("οισι", 0.0) + ending_rates.get("ηι", 0.0) + ending_rates.get("ᾳ", 0.0)

    return {
        "particles_per_100": _rate_map(_section("particles"), PARTICLES),
        "endings_per_100": ending_rates,
        "infinitives_per_100": _rate_map(_section("infinitives"), INFINITIVE_ENDINGS_PLAIN),
        "patterns_per_100": {
            "tt": _rate(patterns, "tt"),
            "ss": _rate(patterns, "ss"),
        },
        "epic_particles_per_100": _rate_map(_section("epic_particles"), EPIC_PARTICLES_PLAIN),
        "epic_endings_per_100": _rate_map(_section("epic_endings"), EPIC_ENDINGS_PLAIN),
        "dative_plural_endings_per_100": _rate_map(
            _section("dative_plural_endings"), DATIVE_PLURAL_ENDINGS_PLAIN
        ),
        "prepositions_per_100": _rate_map(_section("prepositions"), PREPOSITIONS_PLAIN),
        "koine_words_per_100": _rate_map(_section("koine_words"), KOINE_FUNCTION_WORDS_PLAIN),
        "lexical_cues_per_100": {
            "attic_tt": _rate(lexical_cues, "attic_tt"),
            "ionic_ss": _rate(lexical_cues, "ionic_ss"),
        },
        "doric_cues_per_100": {
            "ha_initial": _rate(doric_cues, "ha_initial"),
        },
        "poetic_morph_per_100": _rate_map(_section("poetic_morph"), POETIC_MORPH_CUES),
        "alpha_endings_per_100": _rate(orth, "alpha_endings"),
        "eta_endings_per_100": _rate(orth, "eta_endings"),
        "marked_endings_per_100": marked_rate,
    }