from __future__ import annotations

import math
from collections import Counter
from typing import Any, Dict, Mapping, Tuple

from .features import ENDINGS_PLAIN, INFINITIVE_ENDINGS_PLAIN, PARTICLES, compute_rates

DIALECTS: Tuple[str, ...] = ("Attic", "Ionic", "Doric", "Aeolic", "Koine")


def _clamp(x: float, lo: float, hi: float) -> float:
    """Return *x* limited to the closed interval [lo, hi]."""
    upper_bounded = min(hi, x)
    return max(lo, upper_bounded)


def _softmax_percent(
    raw_scores: Mapping[str, float], *, temperature: float = 2.0
) -> Dict[str, float]:
    """Turn raw dialect scores into percentages with a tempered softmax.

    Args:
        raw_scores: Mapping of dialect name to raw score. When non-empty it
            must contain an entry for every name in ``DIALECTS``.
        temperature: Softmax temperature; it is floored at ``1e-6``. Larger
            values flatten the distribution (less confident output).

    Returns:
        A dict with one entry per dialect in ``DIALECTS``. All values are
        ``0.0`` when ``raw_scores`` is empty.

    Raises:
        KeyError: If ``raw_scores`` is non-empty but lacks a dialect.
    """
    if not raw_scores:
        return dict.fromkeys(DIALECTS, 0.0)

    temp = max(1e-6, float(temperature))
    # Subtract the maximum before exponentiating so every exponent is <= 0.
    peak = max(float(value) for value in raw_scores.values())

    exponentials: Dict[str, float] = {}
    for dialect in DIALECTS:
        exponentials[dialect] = math.exp((float(raw_scores[dialect]) - peak) / temp)

    denominator = sum(exponentials.values()) or 1.0
    return {
        dialect: 100.0 * (value / denominator)
        for dialect, value in exponentials.items()
    }


def score_dialects(feature_dict: Mapping[str, Any]) -> Dict[str, float]:
    """Score each dialect with a weighted, rule-based system.

    Every feature contributes ``weight * value * scale`` to the raw score of
    each dialect listed for it in the weight table. The raw scores (which
    start at 1.0) are then converted to percentages by a tempered softmax.

    The weights are MVP placeholders meant to be tuned as the rule-set grows.

    Args:
        feature_dict: Extracted features. This function reads the keys
            ``token_count``, ``script``, ``infinitives``, ``poetic_morph``,
            ``epic_words``, ``prepositions``, ``koine_words``,
            ``lexical_cues``, ``doric_cues`` and ``patterns``; it is also
            passed to ``compute_rates``.

    Returns:
        A dict mapping each dialect in ``DIALECTS`` to a percentage.

    Side effects:
        When ``feature_dict`` is a ``dict`` instance, the keys ``rates``,
        ``diagnostics``, ``_raw_scores`` and ``_contributions`` are written
        into it for explainability.
    """

    def count_of(mapping: Mapping[str, Any], key: str) -> int:
        # Missing or falsy entries (None, 0, "") count as zero.
        return int(mapping.get(key, 0) or 0)

    def rate_of(mapping: Mapping[str, Any], key: str) -> float:
        # Missing or falsy entries (None, 0.0) count as a zero rate.
        return float(mapping.get(key, 0.0) or 0.0)

    rates = compute_rates(feature_dict)

    token_count = int(feature_dict.get("token_count", 0) or 0)

    script = feature_dict.get("script", {}) or {}
    greek_alpha = count_of(script, "greek_alpha_chars")
    alpha_chars = count_of(script, "alpha_chars")
    if alpha_chars > 0:
        greek_ratio = greek_alpha / alpha_chars
    else:
        greek_ratio = 0.0

    particle_rates: Mapping[str, float] = rates["particles_per_100"]
    ending_rates: Mapping[str, float] = rates["endings_per_100"]
    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
    epic_particle_rates: Mapping[str, float] = rates.get("epic_particles_per_100", {}) or {}
    epic_ending_rates: Mapping[str, float] = rates.get("epic_endings_per_100", {}) or {}
    epic_words: Mapping[str, int] = feature_dict.get("epic_words", {}) or {}
    dative_plural_rates: Mapping[str, float] = rates.get("dative_plural_endings_per_100", {}) or {}
    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}

    marked_rate = float(rates["marked_endings_per_100"])

    epic_ending: Dict[str, float] = {
        suffix: rate_of(epic_ending_rates, suffix)
        for suffix in ("οιο", "εσσι", "φι", "ηοσ", "αδεω", "ιδεω")
    }
    epic_particle: Dict[str, float] = {
        particle: rate_of(epic_particle_rates, particle)
        for particle in ("κε", "κεν", "αρ", "μιν")
    }
    # κε and κεν are scored together as a single feature.
    epic_ke_ken_rate = epic_particle["κε"] + epic_particle["κεν"]

    tt_count = count_of(patterns, "tt")
    ss_count = count_of(patterns, "ss")

    # --- Weights (MVP placeholders) ---
    weights: Dict[str, Dict[str, float]] = {
        "particle_μεν": {"Attic": 0.25, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.05},
        "particle_δε": {"Attic": 0.20, "Ionic": 0.20, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.10},
        "particle_γαρ": {"Attic": 0.20, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.10},
        "particle_τε": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.20, "Aeolic": 0.12, "Koine": 0.05},
        "particle_δη": {"Attic": 0.10, "Ionic": 0.10, "Doric": 0.10, "Aeolic": 0.08, "Koine": 0.05},
        "particle_ουν": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.05, "Aeolic": 0.05, "Koine": 0.10},
        "ending_οισι": {"Ionic": 3.50, "Attic": -1.00, "Doric": 0.50, "Aeolic": 0.20, "Koine": -1.50},
        "ending_ηι": {"Attic": 1.10, "Ionic": 0.80, "Doric": 0.10, "Aeolic": 0.20, "Koine": -0.30},
        "ending_ᾳ": {"Attic": 0.80, "Ionic": 0.60, "Doric": 0.30, "Aeolic": 0.20, "Koine": -0.60},
        "ending_οι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
        "ending_αι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
        # NOTE: Deliberately low-weight. A scarcity of strong markers is not
        # unique to Koine; it also fits many Attic passages.
        "low_marked_endings": {"Koine": 0.25, "Attic": 0.05, "Ionic": -0.05, "Doric": 0.05, "Aeolic": -0.05},
        # Homeric / epic-Ionic signal
        "epic_ending_οιο": {"Ionic": 4.00, "Attic": -0.50, "Doric": -0.50, "Aeolic": -0.30, "Koine": -0.50},
        # Epic endings and particles (conservative; only matter when present)
        "epic_ending_εσσι": {"Ionic": 3.00, "Attic": -0.40, "Doric": -0.20, "Aeolic": -0.20, "Koine": -0.80},
        "epic_ending_φι": {"Ionic": 1.50, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.50},
        "epic_particle_κεκεν": {"Ionic": 2.00, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.70},
        "epic_ending_ηοσ": {"Ionic": 2.60, "Attic": -0.30, "Doric": -0.10, "Aeolic": -0.10, "Koine": -0.60},
        "epic_ending_αδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
        "epic_ending_ιδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
        # Homeric / epic particles (individually ambiguous; modest weights)
        "epic_particle_αρ": {"Ionic": 0.80, "Attic": -0.05, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.15},
        "epic_particle_μιν": {"Ionic": 1.20, "Attic": -0.10, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.25},
        # Homeric vocabulary: only applied when there are multiple hits
        "epic_word_hits": {"Ionic": 1.80, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
        # Orthographic patterns (COUNT-based; avoids short-text rate blowups)
        "pattern_tt": {"Attic": 0.45, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.05},
        "pattern_ss": {"Ionic": 0.10, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
        # Dative plural endings: -οισι/-αισι/-ηισι vs -οις/-αις
        "dative_οισι": {"Ionic": 0.90, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.40},
        "dative_αισι": {"Ionic": 2.20, "Attic": -0.40, "Doric": 0.20, "Aeolic": 0.10, "Koine": -0.80},
        "dative_ηισι": {"Ionic": 2.20, "Attic": -0.30, "Doric": 0.10, "Aeolic": 0.10, "Koine": -0.80},
        "dative_οις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
        "dative_αις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
        # εἰς vs ἐς (COUNT-based; keys are sigma-normalized: εισ / εσ)
        "prep_εισ": {"Koine": 0.30, "Attic": 0.05, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        "prep_εσ": {"Attic": 0.25, "Ionic": 0.15, "Koine": 0.05, "Doric": 0.00, "Aeolic": 0.05},
        # Koine-ish function words (COUNT-based; sigma-normalized: καθωσ)
        "koine_ινα": {"Koine": 0.60, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
        "koine_οτι": {"Koine": 0.40, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
        "koine_καθωσ": {"Koine": 0.35, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
        "koine_εγενετο": {"Koine": 0.90, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        # Lexicalized ττ/σσ stems (COUNT-based)
        "lexical_attic_tt": {"Attic": 0.75, "Koine": 0.08, "Ionic": 0.00, "Doric": 0.00},
        "lexical_ionic_ss": {"Ionic": 0.25, "Attic": 0.00, "Doric": 0.00, "Koine": 0.00},
        # Doric-ish ἁ- (very weak; COUNT-based)
        "doric_ha_initial": {"Doric": 0.12, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        # Infinitives (morphology): strong signal when present.
        # COUNT-based to avoid short-text rate blowups.
        "inf_μεναι": {"Aeolic": 2.40, "Doric": 0.40, "Ionic": 0.05, "Attic": 0.00, "Koine": 0.00},
        "inf_μεν": {"Doric": 1.20, "Aeolic": 0.80, "Ionic": 0.00, "Attic": 0.00, "Koine": 0.00},
        "inf_ειν": {"Koine": 0.55, "Attic": 0.35, "Ionic": 0.35, "Doric": 0.00, "Aeolic": 0.00},
        # Poetic morphology cues (COUNT-based)
        "verb_1pl_mes": {"Doric": 1.30, "Aeolic": 0.30, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        "aeolic_ammi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        "aeolic_ummi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
    }

    raw_scores: Dict[str, float] = dict.fromkeys(DIALECTS, 1.0)
    contributions: Dict[str, Counter[str]] = {name: Counter() for name in DIALECTS}

    # Evidence scaling: short passages must not produce extreme confidence,
    # and mostly non-Greek input is discounted further.
    evidence_scale = _clamp(token_count / 40.0, 0.0, 1.0)
    if greek_ratio < 0.30:
        evidence_scale *= 0.15

    # Tier-A features are highly diagnostic morphology: they are applied with
    # a floor on the scale so they still register in short passages, while
    # the floor stays modest to avoid overconfidence.
    tier_a_scale = max(evidence_scale, 0.25)

    def accumulate(feature_name: str, feature_value: float, scale: float) -> None:
        # Add this feature's weighted value to every dialect it is weighted
        # for, and record the per-feature contribution for diagnostics.
        for dialect, weight in weights.get(feature_name, {}).items():
            delta = weight * feature_value * scale
            raw_scores[dialect] += delta
            contributions[dialect][feature_name] += delta

    # Particles and plain endings (rate-based).
    for particle in PARTICLES:
        accumulate(
            f"particle_{particle}",
            float(particle_rates.get(particle, 0.0)),
            evidence_scale,
        )
    for ending in (*ENDINGS_PLAIN, "ᾳ"):
        accumulate(
            f"ending_{ending}",
            float(ending_rates.get(ending, 0.0)),
            evidence_scale,
        )

    # Infinitive morphology, then poetic morphology (tier A, count-based).
    tier_a_counts = (
        ("inf_μεναι", infinitives, "μεναι"),
        ("inf_μεν", infinitives, "μεν"),
        ("inf_ειν", infinitives, "ειν"),
        ("verb_1pl_mes", poetic_morph, "verb_1pl_mes"),
        ("aeolic_ammi", poetic_morph, "aeolic_ammi"),
        ("aeolic_ummi", poetic_morph, "aeolic_ummi"),
    )
    for feature_name, source, key in tier_a_counts:
        accumulate(feature_name, float(count_of(source, key)), tier_a_scale)

    # The Koine scarcity heuristic only applies once there is enough text.
    if token_count >= 20:
        accumulate("low_marked_endings", max(0.0, 1.5 - marked_rate), evidence_scale)

    # Epic endings and particles (rate-based).
    epic_signals = (
        ("epic_ending_οιο", epic_ending["οιο"]),
        ("epic_ending_εσσι", epic_ending["εσσι"]),
        ("epic_ending_φι", epic_ending["φι"]),
        ("epic_particle_κεκεν", epic_ke_ken_rate),
        ("epic_ending_ηοσ", epic_ending["ηοσ"]),
        ("epic_ending_αδεω", epic_ending["αδεω"]),
        ("epic_ending_ιδεω", epic_ending["ιδεω"]),
        ("epic_particle_αρ", epic_particle["αρ"]),
        ("epic_particle_μιν", epic_particle["μιν"]),
    )
    for feature_name, value in epic_signals:
        accumulate(feature_name, value, evidence_scale)

    # Homeric vocabulary: needs at least two hits, and is capped at four.
    epic_word_hits = 0
    for word in ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα"):
        epic_word_hits += count_of(epic_words, word)
    if epic_word_hits >= 2:
        accumulate("epic_word_hits", float(min(4, epic_word_hits)), tier_a_scale)

    # tt/ss orthography (separate, conservative).
    accumulate("pattern_tt", float(tt_count), evidence_scale)
    accumulate("pattern_ss", float(ss_count), evidence_scale)

    # Dative plural endings (rate-based).
    # NOTE(review): "οις"/"αις" use final sigma (ς) while other keys in this
    # function are sigma-normalized (σ) — confirm against the extractor.
    dative_features = (
        ("dative_οισι", "οισι"),
        ("dative_αισι", "αισι"),
        ("dative_ηισι", "ηισι"),
        ("dative_οις", "οις"),
        ("dative_αις", "αις"),
    )
    for feature_name, key in dative_features:
        accumulate(feature_name, rate_of(dative_plural_rates, key), evidence_scale)

    # Count-based lexical features, in order: εἰς / ἐς prepositions,
    # Koine-ish function words, then lexicalized ττ/σσ stems.
    count_features = (
        ("prep_εισ", prepositions, "εισ"),
        ("prep_εσ", prepositions, "εσ"),
        ("koine_ινα", koine_words, "ινα"),
        ("koine_οτι", koine_words, "οτι"),
        ("koine_καθωσ", koine_words, "καθωσ"),
        ("koine_εγενετο", koine_words, "εγενετο"),
        ("lexical_attic_tt", lexical_cues, "attic_tt"),
        ("lexical_ionic_ss", lexical_cues, "ionic_ss"),
    )
    for feature_name, source, key in count_features:
        accumulate(feature_name, float(count_of(source, key)), evidence_scale)

    # Doric cue (very noisy): requires longer text and multiple hits.
    ha_hits = count_of(doric_cues, "ha_initial")
    if token_count >= 30 and ha_hits >= 2:
        accumulate("doric_ha_initial", float(ha_hits), evidence_scale)

    # Only a real dict can be written to; other mappings are left untouched.
    can_persist = isinstance(feature_dict, dict)
    if can_persist:
        feature_dict["rates"] = rates
        feature_dict["diagnostics"] = {
            "greek_ratio": greek_ratio,
            "evidence_scale": evidence_scale,
        }
        feature_dict["_raw_scores"] = dict(raw_scores)
        feature_dict["_contributions"] = {
            name: dict(contributions[name]) for name in DIALECTS
        }

    # Lower the temperature (sharper output) only as evidence gets stronger.
    temperature = _clamp(2.0 - 0.6 * evidence_scale, 1.4, 2.0)
    scores = _softmax_percent(raw_scores, temperature=temperature)

    # Post-hoc discrimination diagnostics: the top two percentages and the
    # gap between them.
    ranked = sorted(scores.values(), reverse=True)
    best_pct = float(ranked[0]) if ranked else 0.0
    second_pct = float(ranked[1]) if len(ranked) > 1 else 0.0
    top_gap_pct = best_pct - second_pct

    if can_persist:
        diagnostics = feature_dict.get("diagnostics", {}) or {}
        diagnostics.update(
            {
                "best_pct": best_pct,
                "second_pct": second_pct,
                "top_gap_pct": top_gap_pct,
            }
        )
        feature_dict["diagnostics"] = diagnostics

    return scores