# Provenance (repository upload metadata, preserved as comments):
# uploaded by thomascerniglia — "Upload 8 files" (commit d0326ea, verified)
from __future__ import annotations
import unicodedata
from collections import Counter
from typing import Any, Dict, List, Mapping, Tuple
from .normalization import sigma_normalize, strip_greek_diacritics
# Common Greek connective/discourse particles (diacritic-stripped, sigma-normalized).
PARTICLES: Tuple[str, ...] = ("μεν", "δε", "γαρ", "τε", "δη", "ουν")
# Noun/adjective endings tracked individually; the iota-subscript ending "ᾳ"
# is detected separately on the un-stripped token (see extract_features).
ENDINGS_PLAIN: Tuple[str, ...] = ("οι", "αι", "ηι", "οισι")
# Infinitive endings (high-signal morphology when present).
# These are matched on diacritic-stripped, sigma-normalized tokens.
INFINITIVE_ENDINGS_PLAIN: Tuple[str, ...] = (
    "ειν",  # common Attic/Ionic/Koine infinitive
    "μεναι",  # Aeolic-style infinitive
    "μεν",  # Doric/Aeolic-style infinitive
)
# A few additional, high-signal Homeric / epic-Ionic patterns (MVP).
# Matched on diacritic-stripped tokens.
EPIC_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οιο",  # e.g., Ἠελίοιο
    "φι",  # e.g., -φι instrumental
    "εσσι",  # -εσσι(ν)
    # Epic/Ionic genitive (sigma-normalized): -ηος (e.g., Ἀχιλῆος -> αχιληοσ)
    "ηοσ",
    # Epic patronymic genitive (e.g., Πηληϊάδεω, Ἀτρεΐδεω)
    "αδεω",
    "ιδεω",
)
# Dative plural patterns (useful for Ionic/Epic vs Attic/Koine tendencies).
# Matched on diacritic-stripped tokens.
DATIVE_PLURAL_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οισι",
    "ηισι",
    "αισι",
    "οις",
    "αις",
)
# Epic particles (very small MVP subset; diacritics stripped and sigma-normalized).
EPIC_PARTICLES_PLAIN: Tuple[str, ...] = (
    "κε",
    "κεν",
    # Very common Homeric particle (often written ἄρ/ἄρ᾽)
    "αρ",
    # Homeric/epic pronoun form
    "μιν",
)
# A few very common Homeric-vocabulary tokens (NOT dialect-specific in isolation).
# We only treat these as weak epic-Ionic evidence when multiple hits occur.
EPIC_WORDS_PLAIN: Tuple[str, ...] = (
    "εννεπε",
    "αειδε",
    "μουσα",
    "μηνιν",
    "θεα",
)
# Very small lexicalized Attic-vs-Ionic spelling cues (MVP).
# These are substring-based to catch inflectional variants.
ATTIC_TT_STEMS: Tuple[str, ...] = (
    "θαλαττ",  # θάλαττα
    "γλωττ",  # γλῶττα
    "πραττ",  # πράττω
    "ταττ",  # τάττω
)
IONIC_SS_STEMS: Tuple[str, ...] = (
    "θαλασσ",  # θάλασσα
    "γλωσσ",  # γλῶσσα
    "πρασσ",  # πράσσω
    "τασσ",  # τάσσω
)
# Preposition preference (edition-dependent but often helpful): εἰς vs ἐς.
PREPOSITIONS_PLAIN: Tuple[str, ...] = (
    # NOTE: these are *sigma-normalized* (final ς -> σ)
    "εισ",
    "εσ",
)
# Koine-leaning function words (very small MVP set; genre-sensitive).
# These should be low-weight, positive-only cues.
KOINE_FUNCTION_WORDS_PLAIN: Tuple[str, ...] = (
    "ινα",
    "οτι",
    # NOTE: sigma-normalized
    "καθωσ",
    # NT-style narrative formula is common in Koine
    "εγενετο",
)
# Literary/poetic morphology cues.
# - Doric 1pl active ending often appears as -μες (vs -μεν).
# - Aeolic pronoun forms like ἄμμι/ὔμμι are strong when they occur.
POETIC_MORPH_CUES: Tuple[str, ...] = (
    "verb_1pl_mes",
    "aeolic_ammi",
    "aeolic_ummi",
)
def _ends_with_iota_subscript_cluster(token: str, base_letter: str) -> bool:
"""True if token ends with base_letter + iota-subscript (any accents allowed)."""
if not token:
return False
decomposed = unicodedata.normalize("NFD", token)
i = len(decomposed) - 1
saw_ypogegrammeni = False
while i >= 0 and unicodedata.combining(decomposed[i]):
if decomposed[i] == "\u0345":
saw_ypogegrammeni = True
i -= 1
if i < 0:
return False
base = decomposed[i]
return base == base_letter and saw_ypogegrammeni
def extract_features(tokens: List[str]) -> Dict[str, Any]:
    """Extract interpretable linguistic feature counts from tokens.

    Args:
        tokens: Word tokens, expected to be (mostly) polytonic Greek text.
            Empty tokens are skipped.

    Returns:
        Nested dict of raw integer counts keyed by feature family
        (particles, endings, infinitives, epic cues, orthography, script
        evidence, ...). Every expected key is pre-seeded with 0 so
        downstream consumers never hit a missing key.
    """
    token_count = len(tokens)
    # Pre-seed all counters with 0 so every key is present in the output.
    particles = Counter({p: 0 for p in PARTICLES})
    endings = Counter({e: 0 for e in (*ENDINGS_PLAIN, "ᾳ")})
    infinitives = Counter({e: 0 for e in INFINITIVE_ENDINGS_PLAIN})
    epic_endings = Counter({e: 0 for e in EPIC_ENDINGS_PLAIN})
    dative_plural_endings = Counter({e: 0 for e in DATIVE_PLURAL_ENDINGS_PLAIN})
    epic_particles = Counter({p: 0 for p in EPIC_PARTICLES_PLAIN})
    epic_words = Counter({w: 0 for w in EPIC_WORDS_PLAIN})
    prepositions = Counter({p: 0 for p in PREPOSITIONS_PLAIN})
    koine_words = Counter({w: 0 for w in KOINE_FUNCTION_WORDS_PLAIN})
    lexical_cues = Counter(
        {
            "attic_tt": 0,
            "ionic_ss": 0,
        }
    )
    # Mild Doric cue: initial rough-breathed alpha (e.g., ἁ as article in Doric).
    doric_ha_initial = 0
    poetic_morph = Counter({k: 0 for k in POETIC_MORPH_CUES})
    # Orthographic patterns
    tt_count = 0
    ss_count = 0
    alpha_endings = 0
    eta_endings = 0
    # Script evidence: helps detect non-Greek input or encoding issues.
    greek_alpha_chars = 0
    alpha_chars = 0
    for tok in tokens:
        if not tok:
            continue
        # Character-level script evidence: which alphabetic characters fall in
        # the Greek and Coptic (U+0370-03FF) or Greek Extended (U+1F00-1FFF)
        # Unicode blocks.
        for ch in tok:
            if not ch.isalpha():
                continue
            alpha_chars += 1
            code = ord(ch)
            if (0x0370 <= code <= 0x03FF) or (0x1F00 <= code <= 0x1FFF):
                greek_alpha_chars += 1
        # Canonical matching form: diacritics stripped, final ς normalized to σ.
        plain = sigma_normalize(strip_greek_diacritics(tok))
        # Doric 1pl -μες (sigma-normalized: -μεσ).
        # Guard against counting very short tokens.
        if len(plain) >= 5 and plain.endswith("μεσ"):
            poetic_morph["verb_1pl_mes"] += 1
        # Aeolic pronoun forms (very high signal).
        if plain == "αμμι":
            poetic_morph["aeolic_ammi"] += 1
        if plain == "υμμι":
            poetic_morph["aeolic_ummi"] += 1
        # Doric cue: token begins with alpha + rough breathing.
        # This is intentionally weak; lots of words can have rough breathing.
        nfd = unicodedata.normalize("NFD", tok)
        if nfd:
            base0 = nfd[0]
            # Collect leading combining marks
            j = 1
            has_rough = False
            while j < len(nfd) and unicodedata.combining(nfd[j]):
                # COMBINING REVERSED COMMA ABOVE (rough breathing)
                if nfd[j] == "\u0314":
                    has_rough = True
                j += 1
            if base0 == "α" and has_rough:
                doric_ha_initial += 1
        # Count orthographic patterns (occurrences, not just token presence)
        tt_count += plain.count("ττ")
        ss_count += plain.count("σσ")
        # Whole-token lexical matches (Counter membership == key presence).
        if plain in particles:
            particles[plain] += 1
        if plain in epic_particles:
            epic_particles[plain] += 1
        if plain in epic_words:
            epic_words[plain] += 1
        if plain in prepositions:
            prepositions[plain] += 1
        if plain in koine_words:
            koine_words[plain] += 1
        # Lexicalized Attic/Ionic cues
        if any(stem in plain for stem in ATTIC_TT_STEMS):
            lexical_cues["attic_tt"] += 1
        if any(stem in plain for stem in IONIC_SS_STEMS):
            lexical_cues["ionic_ss"] += 1
        for ending in ENDINGS_PLAIN:
            if plain.endswith(ending):
                endings[ending] += 1
        # Infinitive endings (prefer longer endings first to avoid double-counting)
        # Guard against short function words like the particle "μεν".
        if len(plain) >= 5:
            if plain.endswith("μεναι"):
                infinitives["μεναι"] += 1
            elif plain.endswith("ειν"):
                infinitives["ειν"] += 1
            elif plain.endswith("μεν"):
                infinitives["μεν"] += 1
        for ending in EPIC_ENDINGS_PLAIN:
            if plain.endswith(ending):
                epic_endings[ending] += 1
        for ending in DATIVE_PLURAL_ENDINGS_PLAIN:
            if plain.endswith(ending):
                dative_plural_endings[ending] += 1
        # Iota-subscript dative cue is checked on the *original* token, since
        # diacritic stripping removes the ypogegrammeni.
        if _ends_with_iota_subscript_cluster(tok, "α"):
            endings["ᾳ"] += 1
        # Broad alpha- vs eta-final tendencies.
        # FIX: `plain` is sigma-normalized (final ς -> σ), so the genitive
        # endings must be spelled with σ; the previous final-sigma spellings
        # ("ας", "ης") could never match and silently dropped -ας/-ης forms.
        if plain.endswith(("α", "ασ", "αν")):
            alpha_endings += 1
        if plain.endswith(("η", "ησ", "ην")):
            eta_endings += 1
    return {
        "token_count": token_count,
        "particles": dict(particles),
        "endings": dict(endings),
        "infinitives": dict(infinitives),
        "epic_endings": dict(epic_endings),
        "dative_plural_endings": dict(dative_plural_endings),
        "epic_particles": dict(epic_particles),
        "epic_words": dict(epic_words),
        "prepositions": dict(prepositions),
        "koine_words": dict(koine_words),
        "lexical_cues": dict(lexical_cues),
        "patterns": {
            "tt": tt_count,
            "ss": ss_count,
        },
        "orthography": {
            "alpha_endings": alpha_endings,
            "eta_endings": eta_endings,
        },
        "script": {
            "greek_alpha_chars": greek_alpha_chars,
            "alpha_chars": alpha_chars,
        },
        "doric_cues": {
            "ha_initial": doric_ha_initial,
        },
        "poetic_morph": dict(poetic_morph),
    }
def rate_per_100(count: int, token_count: int) -> float:
    """Scale a raw count to a per-100-token rate; 0.0 when there are no tokens."""
    # Keep the exact original expression shape so float results are identical.
    return 100.0 * (count / token_count) if token_count > 0 else 0.0
def _rate_map(
    counts: Mapping[str, int], keys: Tuple[str, ...], token_count: int
) -> Dict[str, float]:
    """Per-100-token rate for each key in *keys*; missing/None counts become 0."""
    return {k: rate_per_100(int(counts.get(k, 0) or 0), token_count) for k in keys}


def compute_rates(feature_dict: Mapping[str, Any]) -> Dict[str, Any]:
    """Compute per-100-token rates from feature counts.

    Args:
        feature_dict: Output of ``extract_features``; missing or None-valued
            families are tolerated and treated as all-zero.

    Returns:
        Dict mirroring each count family as a ``*_per_100`` rate map, plus
        scalar alpha/eta/marked-ending rates. Also includes
        ``epic_words_per_100`` — ``extract_features`` counts epic vocabulary,
        which previously was never rated here.
    """
    token_count = int(feature_dict.get("token_count", 0) or 0)

    def family(key: str) -> Mapping[str, int]:
        # Tolerate absent or None-valued sub-dicts.
        return feature_dict.get(key, {}) or {}

    particle_rates = _rate_map(family("particles"), PARTICLES, token_count)
    ending_rates = _rate_map(family("endings"), (*ENDINGS_PLAIN, "ᾳ"), token_count)
    infinitive_rates = _rate_map(family("infinitives"), INFINITIVE_ENDINGS_PLAIN, token_count)
    pattern_rates = _rate_map(family("patterns"), ("tt", "ss"), token_count)
    epic_particle_rates = _rate_map(family("epic_particles"), EPIC_PARTICLES_PLAIN, token_count)
    epic_ending_rates = _rate_map(family("epic_endings"), EPIC_ENDINGS_PLAIN, token_count)
    dative_plural_ending_rates = _rate_map(
        family("dative_plural_endings"), DATIVE_PLURAL_ENDINGS_PLAIN, token_count
    )
    preposition_rates = _rate_map(family("prepositions"), PREPOSITIONS_PLAIN, token_count)
    koine_word_rates = _rate_map(family("koine_words"), KOINE_FUNCTION_WORDS_PLAIN, token_count)
    # NEW (backward-compatible): rate the epic-vocabulary counts too.
    epic_word_rates = _rate_map(family("epic_words"), EPIC_WORDS_PLAIN, token_count)
    lexical_cue_rates = _rate_map(family("lexical_cues"), ("attic_tt", "ionic_ss"), token_count)
    doric_cue_rates = _rate_map(family("doric_cues"), ("ha_initial",), token_count)
    poetic_morph_rates = _rate_map(family("poetic_morph"), POETIC_MORPH_CUES, token_count)

    orth = family("orthography")
    alpha_rate = rate_per_100(int(orth.get("alpha_endings", 0) or 0), token_count)
    eta_rate = rate_per_100(int(orth.get("eta_endings", 0) or 0), token_count)
    # "Marked" = the dialectally distinctive dative endings taken together.
    marked_rate = ending_rates.get("οισι", 0.0) + ending_rates.get("ηι", 0.0) + ending_rates.get("ᾳ", 0.0)

    return {
        "particles_per_100": particle_rates,
        "endings_per_100": ending_rates,
        "infinitives_per_100": infinitive_rates,
        "patterns_per_100": pattern_rates,
        "epic_particles_per_100": epic_particle_rates,
        "epic_endings_per_100": epic_ending_rates,
        "dative_plural_endings_per_100": dative_plural_ending_rates,
        "prepositions_per_100": preposition_rates,
        "koine_words_per_100": koine_word_rates,
        "epic_words_per_100": epic_word_rates,
        "lexical_cues_per_100": lexical_cue_rates,
        "doric_cues_per_100": doric_cue_rates,
        "poetic_morph_per_100": poetic_morph_rates,
        "alpha_endings_per_100": alpha_rate,
        "eta_endings_per_100": eta_rate,
        "marked_endings_per_100": marked_rate,
    }