Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import math | |
| from collections import Counter | |
| from typing import Any, Dict, Mapping, Tuple | |
| from .features import ENDINGS_PLAIN, INFINITIVE_ENDINGS_PLAIN, PARTICLES, compute_rates | |
# Dialect labels scored by this module.  The per-dialect mappings below are
# built by iterating this tuple, so its order is also their key order.
DIALECTS: Tuple[str, ...] = (
    "Attic",
    "Ionic",
    "Doric",
    "Aeolic",
    "Koine",
)
def _clamp(x: float, lo: float, hi: float) -> float:
    """Limit *x* to the closed interval [lo, hi].

    The upper bound is applied first and the lower bound second, so when
    ``lo > hi`` the result is always ``lo``.
    """
    capped_above = min(hi, x)
    return max(lo, capped_above)
def _softmax_percent(raw_scores: Mapping[str, float], *, temperature: float = 2.0) -> Dict[str, float]:
    """Convert raw dialect scores into percentages with a tempered softmax.

    Scores are shifted by the largest value in ``raw_scores`` before
    exponentiation, and the temperature is floored at 1e-6.  An empty
    ``raw_scores`` yields 0.0 for every dialect in ``DIALECTS``; otherwise
    every dialect in ``DIALECTS`` must be present as a key.
    """
    if not raw_scores:
        return dict.fromkeys(DIALECTS, 0.0)

    temp = max(1e-6, float(temperature))
    peak = max(float(value) for value in raw_scores.values())

    exponentials: Dict[str, float] = {}
    for dialect in DIALECTS:
        shifted = float(raw_scores[dialect]) - peak
        exponentials[dialect] = math.exp(shifted / temp)

    # Guard against a zero denominator (all exponentials underflowed).
    denominator = sum(exponentials.values()) or 1.0

    percentages: Dict[str, float] = {}
    for dialect in DIALECTS:
        percentages[dialect] = 100.0 * (exponentials[dialect] / denominator)
    return percentages
def score_dialects(feature_dict: Mapping[str, Any]) -> Dict[str, float]:
    """Score dialects using a weighted, rule-based scoring system.

    Each feature value (a per-100 rate or a raw count, depending on the
    feature) is multiplied by a per-dialect weight and an evidence scale and
    added to that dialect's raw score.  Raw scores start at 1.0 and are
    converted to percentages with a temperature softmax.

    Args:
        feature_dict: Extracted features.  Keys read directly here are
            ``token_count``, ``script``, ``infinitives``, ``poetic_morph``,
            ``epic_words``, ``prepositions``, ``koine_words``,
            ``lexical_cues``, ``doric_cues`` and ``patterns``; the mapping is
            also handed to ``compute_rates``.

    Returns:
        A dict mapping each dialect in ``DIALECTS`` to a confidence
        percentage (0-100).

    Raises:
        KeyError: if the result of ``compute_rates`` lacks
            ``particles_per_100``, ``endings_per_100`` or
            ``marked_endings_per_100``.

    Side effects:
        When ``feature_dict`` is a ``dict`` it is updated in place with the
        keys ``rates``, ``diagnostics``, ``_raw_scores`` and
        ``_contributions`` for explainability.

    Weights are placeholders intended to be edited as the rule-set grows.
    """

    def _rate(table: Mapping[str, Any], key: str) -> float:
        # Missing or None entries count as a rate of 0.0.
        return float(table.get(key, 0.0) or 0.0)

    def _count(table: Mapping[str, Any], key: str) -> float:
        # Counts are truncated to int first, then widened for the arithmetic.
        return float(int(table.get(key, 0) or 0))

    rates = compute_rates(feature_dict)
    token_count = int(feature_dict.get("token_count", 0) or 0)

    script = feature_dict.get("script", {}) or {}
    greek_alpha = int(script.get("greek_alpha_chars", 0) or 0)
    alpha_chars = int(script.get("alpha_chars", 0) or 0)
    greek_ratio = (greek_alpha / alpha_chars) if alpha_chars > 0 else 0.0

    particle_rates: Mapping[str, float] = rates["particles_per_100"]
    ending_rates: Mapping[str, float] = rates["endings_per_100"]
    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
    epic_particle_rates: Mapping[str, float] = rates.get("epic_particles_per_100", {}) or {}
    epic_ending_rates: Mapping[str, float] = rates.get("epic_endings_per_100", {}) or {}
    epic_words: Mapping[str, int] = feature_dict.get("epic_words", {}) or {}
    dative_plural_rates: Mapping[str, float] = rates.get("dative_plural_endings_per_100", {}) or {}
    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}

    marked_rate = float(rates["marked_endings_per_100"])
    # κε and κεν are scored together as a single feature.
    epic_ke_ken_rate = _rate(epic_particle_rates, "κε") + _rate(epic_particle_rates, "κεν")

    # --- Weights (MVP placeholders) ---
    # Every row lists all five dialects so that the persisted
    # ``_contributions`` carry the same feature keys for each dialect.
    weights: Dict[str, Dict[str, float]] = {
        "particle_μεν": {"Attic": 0.25, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.05},
        "particle_δε": {"Attic": 0.20, "Ionic": 0.20, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.10},
        "particle_γαρ": {"Attic": 0.20, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.10},
        "particle_τε": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.20, "Aeolic": 0.12, "Koine": 0.05},
        "particle_δη": {"Attic": 0.10, "Ionic": 0.10, "Doric": 0.10, "Aeolic": 0.08, "Koine": 0.05},
        "particle_ουν": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.05, "Aeolic": 0.05, "Koine": 0.10},
        "ending_οισι": {"Ionic": 3.50, "Attic": -1.00, "Doric": 0.50, "Aeolic": 0.20, "Koine": -1.50},
        "ending_ηι": {"Attic": 1.10, "Ionic": 0.80, "Doric": 0.10, "Aeolic": 0.20, "Koine": -0.30},
        "ending_ᾳ": {"Attic": 0.80, "Ionic": 0.60, "Doric": 0.30, "Aeolic": 0.20, "Koine": -0.60},
        "ending_οι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
        "ending_αι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
        # NOTE: This is intentionally low-weight. "Few strong markers" is not
        # uniquely Koine; it can also describe many Attic passages.
        "low_marked_endings": {"Koine": 0.25, "Attic": 0.05, "Ionic": -0.05, "Doric": 0.05, "Aeolic": -0.05},
        # Homeric / epic-Ionic signal
        "epic_ending_οιο": {"Ionic": 4.00, "Attic": -0.50, "Doric": -0.50, "Aeolic": -0.30, "Koine": -0.50},
        # Epic endings and particles (conservative; only meaningful when present)
        "epic_ending_εσσι": {"Ionic": 3.00, "Attic": -0.40, "Doric": -0.20, "Aeolic": -0.20, "Koine": -0.80},
        "epic_ending_φι": {"Ionic": 1.50, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.50},
        "epic_particle_κεκεν": {"Ionic": 2.00, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.70},
        "epic_ending_ηοσ": {"Ionic": 2.60, "Attic": -0.30, "Doric": -0.10, "Aeolic": -0.10, "Koine": -0.60},
        "epic_ending_αδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
        "epic_ending_ιδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
        # Homeric / epic particles (ambiguous individually; keep weights modest)
        "epic_particle_αρ": {"Ionic": 0.80, "Attic": -0.05, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.15},
        "epic_particle_μιν": {"Ionic": 1.20, "Attic": -0.10, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.25},
        # Homeric vocabulary: apply only when multiple hits occur (see logic below)
        "epic_word_hits": {"Ionic": 1.80, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
        # Orthographic patterns (COUNT-based; prevents short-text rate blowups)
        "pattern_tt": {"Attic": 0.45, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.05},
        "pattern_ss": {"Ionic": 0.10, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
        # Dative plural endings: -οισι/-αισι/-ηισι vs -οις/-αις
        "dative_οισι": {"Ionic": 0.90, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.40},
        "dative_αισι": {"Ionic": 2.20, "Attic": -0.40, "Doric": 0.20, "Aeolic": 0.10, "Koine": -0.80},
        "dative_ηισι": {"Ionic": 2.20, "Attic": -0.30, "Doric": 0.10, "Aeolic": 0.10, "Koine": -0.80},
        "dative_οις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
        "dative_αις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
        # εἰς vs ἐς (COUNT-based; keys are sigma-normalized: εισ / εσ)
        "prep_εισ": {"Koine": 0.30, "Attic": 0.05, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        "prep_εσ": {"Attic": 0.25, "Ionic": 0.15, "Koine": 0.05, "Doric": 0.00, "Aeolic": 0.05},
        # Koine-ish function words (COUNT-based; sigma-normalized: καθωσ)
        "koine_ινα": {"Koine": 0.60, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        "koine_οτι": {"Koine": 0.40, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        "koine_καθωσ": {"Koine": 0.35, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        "koine_εγενετο": {"Koine": 0.90, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        # Lexicalized ττ/σσ stems (COUNT-based)
        "lexical_attic_tt": {"Attic": 0.75, "Koine": 0.08, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        "lexical_ionic_ss": {"Ionic": 0.25, "Attic": 0.00, "Doric": 0.00, "Koine": 0.00, "Aeolic": 0.00},
        # Doric-ish ἁ- (very weak; COUNT-based)
        "doric_ha_initial": {"Doric": 0.12, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00, "Aeolic": 0.00},
        # Infinitives (morphology): strong signal when present
        # These are COUNT-based to avoid short-text rate blowups.
        "inf_μεναι": {"Aeolic": 2.40, "Doric": 0.40, "Ionic": 0.05, "Attic": 0.00, "Koine": 0.00},
        "inf_μεν": {"Doric": 1.20, "Aeolic": 0.80, "Ionic": 0.00, "Attic": 0.00, "Koine": 0.00},
        "inf_ειν": {"Koine": 0.55, "Attic": 0.35, "Ionic": 0.35, "Doric": 0.00, "Aeolic": 0.00},
        # Poetic morphology cues (COUNT-based)
        "verb_1pl_mes": {"Doric": 1.30, "Aeolic": 0.30, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        "aeolic_ammi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        "aeolic_ummi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
    }

    raw_scores: Dict[str, float] = {d: 1.0 for d in DIALECTS}
    contributions: Dict[str, Counter[str]] = {d: Counter() for d in DIALECTS}

    # Evidence scaling: short passages should not yield extreme confidence.
    evidence_scale = _clamp(token_count / 40.0, 0.0, 1.0)
    if greek_ratio < 0.30:
        evidence_scale *= 0.15

    # Tier-A features keep a minimum scale: some morphology is genuinely
    # strong evidence even in short passages, but the floor stays modest to
    # avoid overconfidence.
    tier_a_scale = max(evidence_scale, 0.25)

    def _accumulate(feature_name: str, feature_value: float, scale: float) -> None:
        # Unknown feature names have no weight row and contribute nothing.
        for dialect, w in weights.get(feature_name, {}).items():
            delta = w * feature_value * scale
            raw_scores[dialect] += delta
            contributions[dialect][feature_name] += delta

    def apply_feature(feature_name: str, feature_value: float) -> None:
        """Apply a feature at the regular evidence scale."""
        _accumulate(feature_name, feature_value, evidence_scale)

    def apply_tier_a(feature_name: str, feature_value: float) -> None:
        """Apply a highly diagnostic feature with the floored evidence scale."""
        _accumulate(feature_name, feature_value, tier_a_scale)

    for p in PARTICLES:
        apply_feature(f"particle_{p}", _rate(particle_rates, p))
    for e in (*ENDINGS_PLAIN, "ᾳ"):
        apply_feature(f"ending_{e}", _rate(ending_rates, e))

    # Infinitive morphology
    apply_tier_a("inf_μεναι", _count(infinitives, "μεναι"))
    apply_tier_a("inf_μεν", _count(infinitives, "μεν"))
    apply_tier_a("inf_ειν", _count(infinitives, "ειν"))

    # Poetic morphology
    apply_tier_a("verb_1pl_mes", _count(poetic_morph, "verb_1pl_mes"))
    apply_tier_a("aeolic_ammi", _count(poetic_morph, "aeolic_ammi"))
    apply_tier_a("aeolic_ummi", _count(poetic_morph, "aeolic_ummi"))

    # Only apply the Koine scarcity heuristic when we have enough text.
    if token_count >= 20:
        apply_feature("low_marked_endings", max(0.0, 1.5 - marked_rate))

    # Epic marker
    apply_feature("epic_ending_οιο", _rate(epic_ending_rates, "οιο"))

    # Additional epic markers
    apply_feature("epic_ending_εσσι", _rate(epic_ending_rates, "εσσι"))
    apply_feature("epic_ending_φι", _rate(epic_ending_rates, "φι"))
    apply_feature("epic_particle_κεκεν", epic_ke_ken_rate)
    apply_feature("epic_ending_ηοσ", _rate(epic_ending_rates, "ηοσ"))
    apply_feature("epic_ending_αδεω", _rate(epic_ending_rates, "αδεω"))
    apply_feature("epic_ending_ιδεω", _rate(epic_ending_rates, "ιδεω"))
    apply_feature("epic_particle_αρ", _rate(epic_particle_rates, "αρ"))
    apply_feature("epic_particle_μιν", _rate(epic_particle_rates, "μιν"))

    # Epic vocabulary only counts with at least two hits, capped at four.
    epic_word_hits = sum(
        int(epic_words.get(w, 0) or 0)
        for w in ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα")
    )
    if epic_word_hits >= 2:
        apply_tier_a("epic_word_hits", float(min(4, epic_word_hits)))

    # tt/ss orthography (separate, conservative)
    apply_feature("pattern_tt", _count(patterns, "tt"))
    apply_feature("pattern_ss", _count(patterns, "ss"))

    # Dative plural endings
    apply_feature("dative_οισι", _rate(dative_plural_rates, "οισι"))
    apply_feature("dative_αισι", _rate(dative_plural_rates, "αισι"))
    apply_feature("dative_ηισι", _rate(dative_plural_rates, "ηισι"))
    apply_feature("dative_οις", _rate(dative_plural_rates, "οις"))
    apply_feature("dative_αις", _rate(dative_plural_rates, "αις"))

    # εἰς / ἐς (counts; sigma-normalized)
    apply_feature("prep_εισ", _count(prepositions, "εισ"))
    apply_feature("prep_εσ", _count(prepositions, "εσ"))

    # Koine-ish function words (counts; sigma-normalized)
    apply_feature("koine_ινα", _count(koine_words, "ινα"))
    apply_feature("koine_οτι", _count(koine_words, "οτι"))
    apply_feature("koine_καθωσ", _count(koine_words, "καθωσ"))
    apply_feature("koine_εγενετο", _count(koine_words, "εγενετο"))

    # Lexicalized ττ/σσ stems (counts)
    apply_feature("lexical_attic_tt", _count(lexical_cues, "attic_tt"))
    apply_feature("lexical_ionic_ss", _count(lexical_cues, "ionic_ss"))

    # Doric cue (very noisy): require longer text + multiple hits
    ha_hits = int(doric_cues.get("ha_initial", 0) or 0)
    if token_count >= 30 and ha_hits >= 2:
        apply_feature("doric_ha_initial", float(ha_hits))

    # If mutable, persist diagnostics for explainability.
    if isinstance(feature_dict, dict):
        feature_dict["rates"] = rates
        feature_dict["diagnostics"] = {
            "greek_ratio": greek_ratio,
            "evidence_scale": evidence_scale,
        }
        feature_dict["_raw_scores"] = dict(raw_scores)
        feature_dict["_contributions"] = {d: dict(contributions[d]) for d in DIALECTS}

    # Slightly increase confidence only when evidence is strong.
    temperature = _clamp(2.0 - 0.6 * evidence_scale, 1.4, 2.0)
    scores = _softmax_percent(raw_scores, temperature=temperature)

    # Post-hoc discrimination diagnostics.
    ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best_pct = float(ordered[0][1]) if ordered else 0.0
    second_pct = float(ordered[1][1]) if len(ordered) > 1 else 0.0
    top_gap_pct = best_pct - second_pct

    if isinstance(feature_dict, dict):
        diagnostics = feature_dict.get("diagnostics", {}) or {}
        diagnostics.update(
            {
                "best_pct": best_pct,
                "second_pct": second_pct,
                "top_gap_pct": top_gap_pct,
            }
        )
        feature_dict["diagnostics"] = diagnostics

    return scores