# thomascerniglia's picture
# Upload 8 files
# d0326ea verified
from __future__ import annotations
import math
from collections import Counter
from typing import Any, Dict, Mapping, Tuple
from .features import ENDINGS_PLAIN, INFINITIVE_ENDINGS_PLAIN, PARTICLES, compute_rates
# Canonical dialect labels. This tuple fixes the iteration order used by the
# scorer and the softmax below; every returned score dict has exactly these keys.
DIALECTS: Tuple[str, ...] = ("Attic", "Ionic", "Doric", "Aeolic", "Koine")
def _clamp(x: float, lo: float, hi: float) -> float:
return max(lo, min(hi, x))
def _softmax_percent(raw_scores: Mapping[str, float], *, temperature: float = 2.0) -> Dict[str, float]:
    """Convert per-dialect raw scores into percentages via a tempered softmax.

    Parameters
    ----------
    raw_scores:
        Mapping of dialect name -> raw (unnormalized) score. Dialects from
        ``DIALECTS`` that are missing from the mapping are treated as having
        the minimum observed score (previously this raised ``KeyError`` for
        any non-empty mapping missing a dialect key).
    temperature:
        Softmax temperature; larger values flatten the distribution to
        reduce overconfidence. Floored at a small positive value to avoid
        division by zero.

    Returns
    -------
    Dict mapping every dialect in ``DIALECTS`` to a percentage in [0, 100].
    An empty ``raw_scores`` yields all zeros.
    """
    if not raw_scores:
        return {d: 0.0 for d in DIALECTS}
    t = max(1e-6, float(temperature))
    values = [float(v) for v in raw_scores.values()]
    max_raw = max(values)
    min_raw = min(values)
    # Subtract max_raw for numerical stability; .get() guards against partial
    # mappings (missing dialects fall back to the weakest observed score).
    exp_scores = {
        d: math.exp((float(raw_scores.get(d, min_raw)) - max_raw) / t)
        for d in DIALECTS
    }
    total = sum(exp_scores.values()) or 1.0
    return {d: 100.0 * (exp_scores[d] / total) for d in DIALECTS}
def score_dialects(feature_dict: Mapping[str, Any]) -> Dict[str, float]:
    """Score dialects using a weighted, rule-based scoring system.

    Returns a dict mapping dialect -> confidence percentage (0-100).
    Weights are placeholders intended to be edited as the rule-set grows.

    Side effects: when ``feature_dict`` is a mutable ``dict``, this function
    writes ``rates``, ``diagnostics``, ``_raw_scores`` and ``_contributions``
    back into it for explainability.
    """
    rates = compute_rates(feature_dict)
    token_count = int(feature_dict.get("token_count", 0) or 0)
    script = feature_dict.get("script", {}) or {}
    greek_alpha = int(script.get("greek_alpha_chars", 0) or 0)
    alpha_chars = int(script.get("alpha_chars", 0) or 0)
    # Fraction of alphabetic characters that are Greek; mostly-non-Greek
    # input gets its evidence scale slashed below.
    greek_ratio = (greek_alpha / alpha_chars) if alpha_chars > 0 else 0.0
    particle_rates: Mapping[str, float] = rates["particles_per_100"]
    ending_rates: Mapping[str, float] = rates["endings_per_100"]
    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
    epic_particle_rates: Mapping[str, float] = rates.get("epic_particles_per_100", {}) or {}
    epic_ending_rates: Mapping[str, float] = rates.get("epic_endings_per_100", {}) or {}
    epic_words: Mapping[str, int] = feature_dict.get("epic_words", {}) or {}
    dative_plural_rates: Mapping[str, float] = rates.get("dative_plural_endings_per_100", {}) or {}
    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}
    marked_rate = float(rates["marked_endings_per_100"])
    # Individual epic-feature rates; each defaults to 0.0 when absent.
    epic_oio_rate = float(epic_ending_rates.get("οιο", 0.0) or 0.0)
    epic_essi_rate = float(epic_ending_rates.get("εσσι", 0.0) or 0.0)
    epic_fi_rate = float(epic_ending_rates.get("φι", 0.0) or 0.0)
    epic_eta_os_rate = float(epic_ending_rates.get("ηοσ", 0.0) or 0.0)
    epic_adeo_rate = float(epic_ending_rates.get("αδεω", 0.0) or 0.0)
    epic_ideo_rate = float(epic_ending_rates.get("ιδεω", 0.0) or 0.0)
    epic_ke_rate = float(epic_particle_rates.get("κε", 0.0) or 0.0)
    epic_ken_rate = float(epic_particle_rates.get("κεν", 0.0) or 0.0)
    # κε and κεν are scored jointly (single weight entry below).
    epic_ke_ken_rate = epic_ke_rate + epic_ken_rate
    epic_ar_rate = float(epic_particle_rates.get("αρ", 0.0) or 0.0)
    epic_min_rate = float(epic_particle_rates.get("μιν", 0.0) or 0.0)
    tt_count = int(patterns.get("tt", 0) or 0)
    ss_count = int(patterns.get("ss", 0) or 0)
    # --- Weights (MVP placeholders) ---
    # feature name -> {dialect: weight}. Dialects missing from an entry
    # simply receive no delta for that feature (equivalent to weight 0).
    weights: Dict[str, Dict[str, float]] = {
        "particle_μεν": {"Attic": 0.25, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.05},
        "particle_δε": {"Attic": 0.20, "Ionic": 0.20, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.10},
        "particle_γαρ": {"Attic": 0.20, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.10},
        "particle_τε": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.20, "Aeolic": 0.12, "Koine": 0.05},
        "particle_δη": {"Attic": 0.10, "Ionic": 0.10, "Doric": 0.10, "Aeolic": 0.08, "Koine": 0.05},
        "particle_ουν": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.05, "Aeolic": 0.05, "Koine": 0.10},
        "ending_οισι": {"Ionic": 3.50, "Attic": -1.00, "Doric": 0.50, "Aeolic": 0.20, "Koine": -1.50},
        "ending_ηι": {"Attic": 1.10, "Ionic": 0.80, "Doric": 0.10, "Aeolic": 0.20, "Koine": -0.30},
        "ending_ᾳ": {"Attic": 0.80, "Ionic": 0.60, "Doric": 0.30, "Aeolic": 0.20, "Koine": -0.60},
        "ending_οι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
        "ending_αι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
        # NOTE: This is intentionally low-weight. "Few strong markers" is not
        # uniquely Koine; it can also describe many Attic passages.
        "low_marked_endings": {"Koine": 0.25, "Attic": 0.05, "Ionic": -0.05, "Doric": 0.05, "Aeolic": -0.05},
        # Homeric / epic-Ionic signal
        "epic_ending_οιο": {"Ionic": 4.00, "Attic": -0.50, "Doric": -0.50, "Aeolic": -0.30, "Koine": -0.50},
        # Epic endings and particles (conservative; only meaningful when present)
        "epic_ending_εσσι": {"Ionic": 3.00, "Attic": -0.40, "Doric": -0.20, "Aeolic": -0.20, "Koine": -0.80},
        "epic_ending_φι": {"Ionic": 1.50, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.50},
        "epic_particle_κεκεν": {"Ionic": 2.00, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.70},
        "epic_ending_ηοσ": {"Ionic": 2.60, "Attic": -0.30, "Doric": -0.10, "Aeolic": -0.10, "Koine": -0.60},
        "epic_ending_αδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
        "epic_ending_ιδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
        # Homeric / epic particles (ambiguous individually; keep weights modest)
        "epic_particle_αρ": {"Ionic": 0.80, "Attic": -0.05, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.15},
        "epic_particle_μιν": {"Ionic": 1.20, "Attic": -0.10, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.25},
        # Homeric vocabulary: apply only when multiple hits occur (see logic below)
        "epic_word_hits": {"Ionic": 1.80, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
        # Orthographic patterns (COUNT-based; prevents short-text rate blowups)
        "pattern_tt": {"Attic": 0.45, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.05},
        "pattern_ss": {"Ionic": 0.10, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
        # Dative plural endings: -οισι/-αισι/-ηισι vs -οις/-αις
        "dative_οισι": {"Ionic": 0.90, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.40},
        "dative_αισι": {"Ionic": 2.20, "Attic": -0.40, "Doric": 0.20, "Aeolic": 0.10, "Koine": -0.80},
        "dative_ηισι": {"Ionic": 2.20, "Attic": -0.30, "Doric": 0.10, "Aeolic": 0.10, "Koine": -0.80},
        "dative_οις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
        "dative_αις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
        # εἰς vs ἐς (COUNT-based; keys are sigma-normalized: εισ / εσ)
        "prep_εισ": {"Koine": 0.30, "Attic": 0.05, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        "prep_εσ": {"Attic": 0.25, "Ionic": 0.15, "Koine": 0.05, "Doric": 0.00, "Aeolic": 0.05},
        # Koine-ish function words (COUNT-based; sigma-normalized: καθωσ)
        "koine_ινα": {"Koine": 0.60, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
        "koine_οτι": {"Koine": 0.40, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
        "koine_καθωσ": {"Koine": 0.35, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
        "koine_εγενετο": {"Koine": 0.90, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
        # Lexicalized ττ/σσ stems (COUNT-based)
        "lexical_attic_tt": {"Attic": 0.75, "Koine": 0.08, "Ionic": 0.00, "Doric": 0.00},
        "lexical_ionic_ss": {"Ionic": 0.25, "Attic": 0.00, "Doric": 0.00, "Koine": 0.00},
        # Doric-ish ἁ- (very weak; COUNT-based)
        "doric_ha_initial": {"Doric": 0.12, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        # Infinitives (morphology): strong signal when present
        # These are COUNT-based to avoid short-text rate blowups.
        "inf_μεναι": {"Aeolic": 2.40, "Doric": 0.40, "Ionic": 0.05, "Attic": 0.00, "Koine": 0.00},
        "inf_μεν": {"Doric": 1.20, "Aeolic": 0.80, "Ionic": 0.00, "Attic": 0.00, "Koine": 0.00},
        "inf_ειν": {"Koine": 0.55, "Attic": 0.35, "Ionic": 0.35, "Doric": 0.00, "Aeolic": 0.00},
        # Poetic morphology cues (COUNT-based)
        "verb_1pl_mes": {"Doric": 1.30, "Aeolic": 0.30, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        "aeolic_ammi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
        "aeolic_ummi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
    }
    # Every dialect starts from the same baseline of 1.0 so the softmax over
    # zero evidence is uniform.
    raw_scores: Dict[str, float] = {d: 1.0 for d in DIALECTS}
    # Per-dialect, per-feature deltas kept for the "_contributions" diagnostic.
    contributions: Dict[str, Counter[str]] = {d: Counter() for d in DIALECTS}
    # Evidence scaling: short passages should not yield extreme confidence.
    # Full weight is reached at 40+ tokens.
    evidence_scale = _clamp(token_count / 40.0, 0.0, 1.0)
    if greek_ratio < 0.30:
        evidence_scale *= 0.15

    def apply_feature(feature_name: str, feature_value: float) -> None:
        # Add weighted, evidence-scaled deltas to raw_scores and record each
        # delta for explainability. Unknown feature names are silently no-ops.
        for dialect, w in weights.get(feature_name, {}).items():
            delta = w * feature_value * evidence_scale
            raw_scores[dialect] += delta
            contributions[dialect][feature_name] += delta

    def apply_tier_a(feature_name: str, feature_value: float) -> None:
        """Apply highly diagnostic features with a minimum evidence scale.
        Rationale: some morphology is genuinely strong evidence even in short
        passages; we still keep the scale modest to avoid overconfidence.
        """
        # Floor of 0.25 so short texts still register tier-A morphology.
        tier_scale = max(evidence_scale, 0.25)
        for dialect, w in weights.get(feature_name, {}).items():
            delta = w * feature_value * tier_scale
            raw_scores[dialect] += delta
            contributions[dialect][feature_name] += delta

    # Rate-based particle and ending features.
    for p in PARTICLES:
        apply_feature(f"particle_{p}", float(particle_rates.get(p, 0.0)))
    for e in (*ENDINGS_PLAIN, "ᾳ"):
        apply_feature(f"ending_{e}", float(ending_rates.get(e, 0.0)))
    # Infinitive morphology
    apply_tier_a("inf_μεναι", float(int(infinitives.get("μεναι", 0) or 0)))
    apply_tier_a("inf_μεν", float(int(infinitives.get("μεν", 0) or 0)))
    apply_tier_a("inf_ειν", float(int(infinitives.get("ειν", 0) or 0)))
    # Poetic morphology
    apply_tier_a("verb_1pl_mes", float(int(poetic_morph.get("verb_1pl_mes", 0) or 0)))
    apply_tier_a("aeolic_ammi", float(int(poetic_morph.get("aeolic_ammi", 0) or 0)))
    apply_tier_a("aeolic_ummi", float(int(poetic_morph.get("aeolic_ummi", 0) or 0)))
    # Only apply the Koine scarcity heuristic when we have enough text.
    if token_count >= 20:
        apply_feature("low_marked_endings", max(0.0, 1.5 - marked_rate))
    # Epic marker
    apply_feature("epic_ending_οιο", epic_oio_rate)
    # Additional epic markers
    apply_feature("epic_ending_εσσι", epic_essi_rate)
    apply_feature("epic_ending_φι", epic_fi_rate)
    apply_feature("epic_particle_κεκεν", epic_ke_ken_rate)
    apply_feature("epic_ending_ηοσ", epic_eta_os_rate)
    apply_feature("epic_ending_αδεω", epic_adeo_rate)
    apply_feature("epic_ending_ιδεω", epic_ideo_rate)
    apply_feature("epic_particle_αρ", epic_ar_rate)
    apply_feature("epic_particle_μιν", epic_min_rate)
    # Homeric vocabulary: require at least two distinct/total hits before
    # scoring, and cap the applied value at 4 to bound the contribution.
    epic_word_hits = sum(
        int(epic_words.get(w, 0) or 0)
        for w in ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα")
    )
    if epic_word_hits >= 2:
        apply_tier_a("epic_word_hits", float(min(4, epic_word_hits)))
    # tt/ss orthography (separate, conservative)
    apply_feature("pattern_tt", float(tt_count))
    apply_feature("pattern_ss", float(ss_count))
    # Dative plural endings
    apply_feature("dative_οισι", float(dative_plural_rates.get("οισι", 0.0) or 0.0))
    apply_feature("dative_αισι", float(dative_plural_rates.get("αισι", 0.0) or 0.0))
    apply_feature("dative_ηισι", float(dative_plural_rates.get("ηισι", 0.0) or 0.0))
    apply_feature("dative_οις", float(dative_plural_rates.get("οις", 0.0) or 0.0))
    apply_feature("dative_αις", float(dative_plural_rates.get("αις", 0.0) or 0.0))
    # εἰς / ἐς (counts; sigma-normalized)
    apply_feature("prep_εισ", float(int(prepositions.get("εισ", 0) or 0)))
    apply_feature("prep_εσ", float(int(prepositions.get("εσ", 0) or 0)))
    # Koine-ish function words (counts; sigma-normalized)
    apply_feature("koine_ινα", float(int(koine_words.get("ινα", 0) or 0)))
    apply_feature("koine_οτι", float(int(koine_words.get("οτι", 0) or 0)))
    apply_feature("koine_καθωσ", float(int(koine_words.get("καθωσ", 0) or 0)))
    apply_feature("koine_εγενετο", float(int(koine_words.get("εγενετο", 0) or 0)))
    # Lexicalized ττ/σσ stems (counts)
    apply_feature("lexical_attic_tt", float(int(lexical_cues.get("attic_tt", 0) or 0)))
    apply_feature("lexical_ionic_ss", float(int(lexical_cues.get("ionic_ss", 0) or 0)))
    # Doric cue (very noisy): require longer text + multiple hits
    ha_hits = int(doric_cues.get("ha_initial", 0) or 0)
    if token_count >= 30 and ha_hits >= 2:
        apply_feature("doric_ha_initial", float(ha_hits))
    # If mutable, persist diagnostics for explainability.
    if isinstance(feature_dict, dict):
        feature_dict["rates"] = rates
        feature_dict["diagnostics"] = {
            "greek_ratio": greek_ratio,
            "evidence_scale": evidence_scale,
        }
        feature_dict["_raw_scores"] = dict(raw_scores)
        feature_dict["_contributions"] = {d: dict(contributions[d]) for d in DIALECTS}
    # Slightly increase confidence only when evidence is strong.
    # Temperature falls from 2.0 toward 1.4 as evidence_scale approaches 1.0.
    temperature = _clamp(2.0 - 0.6 * evidence_scale, 1.4, 2.0)
    scores = _softmax_percent(raw_scores, temperature=temperature)
    # Post-hoc discrimination diagnostics.
    ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    best_pct = float(ordered[0][1]) if ordered else 0.0
    second_pct = float(ordered[1][1]) if len(ordered) > 1 else 0.0
    # Gap between the top two dialects: a small gap means low discrimination.
    top_gap_pct = best_pct - second_pct
    if isinstance(feature_dict, dict):
        diagnostics = feature_dict.get("diagnostics", {}) or {}
        diagnostics.update(
            {
                "best_pct": best_pct,
                "second_pct": second_pct,
                "top_gap_pct": top_gap_pct,
            }
        )
        feature_dict["diagnostics"] = diagnostics
    return scores