Spaces:

thomascerniglia
/

DialectAnalysis

Sleeping

App Files Files Community

thomascerniglia committed on Feb 19

Commit

d0326ea

verified ·

1 Parent(s): ba2bf14

Upload 8 files

Browse files

Files changed (8) hide show

dialect_analysis/__main__.py +11 -0
dialect_analysis/cli.py +96 -0
dialect_analysis/explanation.py +146 -0
dialect_analysis/features.py +373 -0
dialect_analysis/normalization.py +60 -0
dialect_analysis/pipeline.py +35 -0
dialect_analysis/scoring.py +285 -0
dialect_analysis/tokenization.py +11 -0

dialect_analysis/__main__.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from __future__ import annotations
+from .cli import run_cli
+def main() -> int:
+    return run_cli()
+if __name__ == "__main__":
+    raise SystemExit(main())

dialect_analysis/cli.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from __future__ import annotations
+import sys
+from typing import List
+from .pipeline import classify_text
+from .scoring import DIALECTS
+def _decode_stdin_bytes(data: bytes) -> str:
+    """Decode piped stdin bytes robustly on Windows/PowerShell.
+    PowerShell (especially Windows PowerShell 5.x) may pipe text to native
+    executables as UTF-16LE, which can appear in Python as NUL-padded bytes or
+    mojibake if decoded with a legacy code page.
+    """
+    if not data:
+        return ""
+    # Heuristic: lots of NUL bytes strongly suggests UTF-16.
+    nul_ratio = data.count(b"\x00") / max(1, len(data))
+    if nul_ratio > 0.10:
+        for enc in ("utf-16", "utf-16-le", "utf-16-be"):
+            try:
+                return data.decode(enc)
+            except UnicodeDecodeError:
+                continue
+    # Otherwise, try UTF-8 first (common in PowerShell 7+), then UTF-16 just in case.
+    for enc in ("utf-8-sig", "utf-8", "utf-16", "utf-16-le", "utf-16-be"):
+        try:
+            return data.decode(enc)
+        except UnicodeDecodeError:
+            continue
+    # Fallback: replace undecodable bytes.
+    return data.decode("utf-8", errors="replace")
+def read_multiline_stdin() -> str:
+    """Read multi-line input.
+    - If text is piped in, read all of stdin.
+    - If interactive, read until an empty line or EOF.
+    """
+    if not sys.stdin.isatty():
+        data = sys.stdin.buffer.read()
+        return _decode_stdin_bytes(data)
+    print("Enter Greek text (finish with an empty line, or Ctrl-Z then Enter on Windows):")
+    lines: List[str] = []
+    while True:
+        try:
+            line = input()
+        except EOFError:
+            break
+        if line.strip() == "":
+            break
+        lines.append(line)
+    return "\n".join(lines)
+def run_cli() -> int:
+    # Best-effort Windows console UTF-8 handling.
+    # This does not affect piped-input decoding (handled separately).
+    try:
+        if sys.stdin.isatty():
+            sys.stdin.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
+        if sys.stdout.isatty():
+            sys.stdout.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
+    except Exception:
+        pass
+    text = read_multiline_stdin()
+    if not text.strip():
+        print("No input provided.")
+        return 2
+    # If the console encoding is wrong, Greek often turns into '?'.
+    if text.count("?") >= 10 and sys.stdin.isatty():
+        print(
+            "Warning: many '?' characters detected; your terminal may not be using UTF-8. "
+            "In PowerShell, try: chcp 65001"
+        )
+    result = classify_text(text)
+    print(f"Dialect: {result['dialect']}")
+    print(f"Confidence: {result['confidence'] * 100:.1f}%")
+    print("Scores (%):")
+    for d in DIALECTS:
+        print(f"  {d}: {float(result['scores'].get(d, 0.0)):.1f}")
+    print("")
+    print(result["explanation"])
+    return 0

dialect_analysis/explanation.py ADDED Viewed

	@@ -0,0 +1,146 @@

+from __future__ import annotations
+from typing import Any, List, Mapping, Tuple
+from .features import ENDINGS_PLAIN, PARTICLES
+def explain_results(feature_dict: Mapping[str, Any], scores: Mapping[str, float]) -> str:
+    """Generate a human-readable explanation of the classification."""
+    if not scores:
+        return "No scores were produced."
+    best_dialect = max(scores.items(), key=lambda kv: kv[1])[0]
+    best_pct = float(scores[best_dialect])
+    token_count = int(feature_dict.get("token_count", 0) or 0)
+    particles: Mapping[str, int] = feature_dict.get("particles", {}) or {}
+    endings: Mapping[str, int] = feature_dict.get("endings", {}) or {}
+    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
+    dative_plural: Mapping[str, int] = feature_dict.get("dative_plural_endings", {}) or {}
+    epic_endings: Mapping[str, int] = feature_dict.get("epic_endings", {}) or {}
+    epic_particles: Mapping[str, int] = feature_dict.get("epic_particles", {}) or {}
+    epic_words: Mapping[str, int] = feature_dict.get("epic_words", {}) or {}
+    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
+    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
+    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
+    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
+    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
+    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}
+    orth: Mapping[str, int] = feature_dict.get("orthography", {}) or {}
+    diagnostics = feature_dict.get("diagnostics", {}) or {}
+    greek_ratio = diagnostics.get("greek_ratio", None)
+    top_gap_pct = diagnostics.get("top_gap_pct", None)
+    contrib = (feature_dict.get("_contributions", {}) or {}).get(best_dialect, {})  # type: ignore[assignment]
+    top_contrib: List[Tuple[str, float]] = sorted(contrib.items(), key=lambda kv: abs(kv[1]), reverse=True)[:8]
+    particle_bits = ", ".join(f"{p}={int(particles.get(p, 0) or 0)}" for p in PARTICLES)
+    ending_bits = ", ".join(f"-{e}={int(endings.get(e, 0) or 0)}" for e in (*ENDINGS_PLAIN, "ᾳ"))
+    orth_bits = (
+        f"alpha_endings={int(orth.get('alpha_endings', 0) or 0)}, "
+        f"eta_endings={int(orth.get('eta_endings', 0) or 0)}"
+    )
+    lines: List[str] = []
+    lines.append(f"Prediction: {best_dialect} (confidence {best_pct:.1f}%)")
+    lines.append(f"Tokens analyzed: {token_count}")
+    if isinstance(greek_ratio, (int, float)):
+        lines.append(f"Greek-script ratio (letters): {float(greek_ratio):.2f}")
+        if float(greek_ratio) < 0.30:
+            lines.append("Warning: input contains little/no Greek; classification is low-evidence.")
+    if token_count < 20:
+        lines.append("Warning: very short passage; confidence may be unreliable.")
+    if isinstance(top_gap_pct, (int, float)) and float(top_gap_pct) < 10.0:
+        lines.append("Warning: scores are clustered; dialect signal is weak.")
+    lines.append("")
+    lines.append("Observed feature counts:")
+    lines.append(f"  Particles: {particle_bits}")
+    lines.append(f"  Endings: {ending_bits}")
+    lines.append(
+        "  Infinitives: "
+        + ", ".join(
+            [
+                f"-ειν={int(infinitives.get('ειν', 0) or 0)}",
+                f"-μεναι={int(infinitives.get('μεναι', 0) or 0)}",
+                f"-μεν={int(infinitives.get('μεν', 0) or 0)}",
+            ]
+        )
+    )
+    lines.append(
+        "  Dative plural endings: "
+        + ", ".join(
+            f"-{e}={int(dative_plural.get(e, 0) or 0)}" for e in ("οισι", "ηισι", "αισι", "οις", "αις")
+        )
+    )
+    lines.append(
+        "  Epic: "
+        + ", ".join(
+            [
+                f"-{e}={int(epic_endings.get(e, 0) or 0)}" for e in ("οιο", "εσσι", "φι", "ηοσ", "αδεω", "ιδεω")
+            ]
+            + [
+                f"{p}={int(epic_particles.get(p, 0) or 0)}" for p in ("κε", "κεν", "αρ", "μιν")
+            ]
+            + [
+                f"{w}={int(epic_words.get(w, 0) or 0)}" for w in ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα")
+            ]
+        )
+    )
+    lines.append(
+        f"  Patterns: ττ={int(patterns.get('tt', 0) or 0)}, σσ={int(patterns.get('ss', 0) or 0)}"
+    )
+    lines.append(
+        "  Prepositions: "
+        + ", ".join(
+            [
+                f"εἰς={int(prepositions.get('εισ', 0) or 0)}",
+                f"ἐς={int(prepositions.get('εσ', 0) or 0)}",
+            ]
+        )
+    )
+    lines.append(
+        "  Koine function words: "
+        + ", ".join(
+            [
+                f"ἵνα={int(koine_words.get('ινα', 0) or 0)}",
+                f"ὅτι={int(koine_words.get('οτι', 0) or 0)}",
+                f"καθώς={int(koine_words.get('καθωσ', 0) or 0)}",
+                f"ἐγένετο={int(koine_words.get('εγενετο', 0) or 0)}",
+            ]
+        )
+    )
+    lines.append(
+        "  Lexicalized cues: "
+        + ", ".join(
+            [
+                f"TT-stems={int(lexical_cues.get('attic_tt', 0) or 0)}",
+                f"SS-stems={int(lexical_cues.get('ionic_ss', 0) or 0)}",
+            ]
+        )
+    )
+    lines.append(f"  Doric cue: ἁ-initial={int(doric_cues.get('ha_initial', 0) or 0)}")
+    if poetic_morph:
+        lines.append(
+            "  Poetic morph: "
+            + ", ".join(
+                [
+                    f"-μες(1pl)={int(poetic_morph.get('verb_1pl_mes', 0) or 0)}",
+                    f"ἄμμι={int(poetic_morph.get('aeolic_ammi', 0) or 0)}",
+                    f"ὔμμι={int(poetic_morph.get('aeolic_ummi', 0) or 0)}",
+                ]
+            )
+        )
+    lines.append(f"  Orthography: {orth_bits}")
+    if top_contrib:
+        lines.append("")
+        lines.append(f"Top contributing rules for {best_dialect}:")
+        for name, delta in top_contrib:
+            lines.append(f"  {name}: {delta:+.3f}")
+    lines.append("")
+    lines.append("Note: weights are MVP placeholders; edit dialect_analysis/scoring.py to refine rules.")
+    return "\n".join(lines)

dialect_analysis/features.py ADDED Viewed

	@@ -0,0 +1,373 @@

+from __future__ import annotations
+import unicodedata
+from collections import Counter
+from typing import Any, Dict, List, Mapping, Tuple
+from .normalization import sigma_normalize, strip_greek_diacritics
+# Particles counted by exact match against diacritic-stripped, sigma-normalized tokens.
+PARTICLES: Tuple[str, ...] = ("μεν", "δε", "γαρ", "τε", "δη", "ουν")
+# Word-final endings counted by suffix match on the same plain token form.
+# (The iota-subscript ending "ᾳ" is tracked alongside these but detected separately.)
+ENDINGS_PLAIN: Tuple[str, ...] = ("οι", "αι", "ηι", "οισι")
+# Infinitive endings (high-signal morphology when present).
+# These are matched on diacritic-stripped, sigma-normalized tokens.
+INFINITIVE_ENDINGS_PLAIN: Tuple[str, ...] = (
+    "ειν",    # common Attic/Ionic/Koine infinitive
+    "μεναι",  # Aeolic-style infinitive
+    "μεν",    # Doric/Aeolic-style infinitive
+)
+# A few additional, high-signal Homeric / epic-Ionic patterns (MVP).
+# Matched on diacritic-stripped tokens.
+EPIC_ENDINGS_PLAIN: Tuple[str, ...] = (
+    "οιο",   # e.g., Ἠελίοιο
+    "φι",    # e.g., -φι instrumental
+    "εσσι",  # -εσσι(ν)
+    # Epic/Ionic genitive (sigma-normalized): -ηος (e.g., Ἀχιλῆος -> αχιληοσ)
+    "ηοσ",
+    # Epic patronymic genitive (e.g., Πηληϊάδεω, Ἀτρεΐδεω)
+    "αδεω",
+    "ιδεω",
+)
+# Dative plural patterns (useful for Ionic/Epic vs Attic/Koine tendencies).
+# Matched on diacritic-stripped tokens.
+# NOTE(review): "οις" and "αις" are spelled with final sigma (ς), while tokens
+# are sigma-normalized (ς -> σ) before suffix matching; confirm these entries
+# are normalized the same way wherever they are compared against tokens.
+DATIVE_PLURAL_ENDINGS_PLAIN: Tuple[str, ...] = (
+    "οισι",
+    "ηισι",
+    "αισι",
+    "οις",
+    "αις",
+)
+# Epic particles (very small MVP subset; diacritics stripped and sigma-normalized).
+EPIC_PARTICLES_PLAIN: Tuple[str, ...] = (
+    "κε",
+    "κεν",
+    # Very common Homeric particle (often written ἄρ/ἄρ᾽)
+    "αρ",
+    # Homeric/epic pronoun form
+    "μιν",
+)
+# A few very common Homeric-vocabulary tokens (NOT dialect-specific in isolation).
+# We only treat these as weak epic-Ionic evidence when multiple hits occur.
+EPIC_WORDS_PLAIN: Tuple[str, ...] = (
+    "εννεπε",
+    "αειδε",
+    "μουσα",
+    "μηνιν",
+    "θεα",
+)
+# Very small lexicalized Attic-vs-Ionic spelling cues (MVP).
+# These are substring-based to catch inflectional variants.
+ATTIC_TT_STEMS: Tuple[str, ...] = (
+    "θαλαττ",  # θάλαττα
+    "γλωττ",   # γλῶττα
+    "πραττ",   # πράττω
+    "ταττ",    # τάττω
+)
+IONIC_SS_STEMS: Tuple[str, ...] = (
+    "θαλασσ",  # θάλασσα
+    "γλωσσ",   # γλῶσσα
+    "πρασσ",   # πράσσω
+    "τασσ",    # τάσσω
+)
+# Preposition preference (edition-dependent but often helpful): εἰς vs ἐς.
+PREPOSITIONS_PLAIN: Tuple[str, ...] = (
+    # NOTE: these are *sigma-normalized* (final ς -> σ)
+    "εισ",
+    "εσ",
+)
+# Koine-leaning function words (very small MVP set; genre-sensitive).
+# These should be low-weight, positive-only cues.
+KOINE_FUNCTION_WORDS_PLAIN: Tuple[str, ...] = (
+    "ινα",
+    "οτι",
+    # NOTE: sigma-normalized
+    "καθωσ",
+    # NT-style narrative formula is common in Koine
+    "εγενετο",
+)
+# Literary/poetic morphology cues.
+# - Doric 1pl active ending often appears as -μες (vs -μεν).
+# - Aeolic pronoun forms like ἄμμι/ὔμμι are strong when they occur.
+# These are counter keys (cue names), not Greek strings to match directly.
+POETIC_MORPH_CUES: Tuple[str, ...] = (
+    "verb_1pl_mes",
+    "aeolic_ammi",
+    "aeolic_ummi",
+)
+def _ends_with_iota_subscript_cluster(token: str, base_letter: str) -> bool:
+    """True if token ends with base_letter + iota-subscript (any accents allowed)."""
+    if not token:
+        return False
+    decomposed = unicodedata.normalize("NFD", token)
+    i = len(decomposed) - 1
+    saw_ypogegrammeni = False
+    while i >= 0 and unicodedata.combining(decomposed[i]):
+        if decomposed[i] == "\u0345":
+            saw_ypogegrammeni = True
+        i -= 1
+    if i < 0:
+        return False
+    base = decomposed[i]
+    return base == base_letter and saw_ypogegrammeni
+def extract_features(tokens: List[str]) -> Dict[str, Any]:
+    """Extract interpretable linguistic feature counts from tokens."""
+    token_count = len(tokens)
+    particles = Counter({p: 0 for p in PARTICLES})
+    endings = Counter({e: 0 for e in (*ENDINGS_PLAIN, "ᾳ")})
+    infinitives = Counter({e: 0 for e in INFINITIVE_ENDINGS_PLAIN})
+    epic_endings = Counter({e: 0 for e in EPIC_ENDINGS_PLAIN})
+    dative_plural_endings = Counter({e: 0 for e in DATIVE_PLURAL_ENDINGS_PLAIN})
+    epic_particles = Counter({p: 0 for p in EPIC_PARTICLES_PLAIN})
+    epic_words = Counter({w: 0 for w in EPIC_WORDS_PLAIN})
+    prepositions = Counter({p: 0 for p in PREPOSITIONS_PLAIN})
+    koine_words = Counter({w: 0 for w in KOINE_FUNCTION_WORDS_PLAIN})
+    lexical_cues = Counter(
+        {
+            "attic_tt": 0,
+            "ionic_ss": 0,
+        }
+    )
+    # Mild Doric cue: initial rough-breathed alpha (e.g., ἁ as article in Doric).
+    doric_ha_initial = 0
+    poetic_morph = Counter({k: 0 for k in POETIC_MORPH_CUES})
+    # Orthographic patterns
+    tt_count = 0
+    ss_count = 0
+    alpha_endings = 0
+    eta_endings = 0
+    # Script evidence: helps detect non-Greek input or encoding issues.
+    greek_alpha_chars = 0
+    alpha_chars = 0
+    for tok in tokens:
+        if not tok:
+            continue
+        for ch in tok:
+            if not ch.isalpha():
+                continue
+            alpha_chars += 1
+            code = ord(ch)
+            if (0x0370 <= code <= 0x03FF) or (0x1F00 <= code <= 0x1FFF):
+                greek_alpha_chars += 1
+        plain = sigma_normalize(strip_greek_diacritics(tok))
+        # Doric 1pl -μες (sigma-normalized: -μεσ).
+        # Guard against counting very short tokens.
+        if len(plain) >= 5 and plain.endswith("μεσ"):
+            poetic_morph["verb_1pl_mes"] += 1
+        # Aeolic pronoun forms (very high signal).
+        if plain == "αμμι":
+            poetic_morph["aeolic_ammi"] += 1
+        if plain == "υμμι":
+            poetic_morph["aeolic_ummi"] += 1
+        # Doric cue: token begins with alpha + rough breathing.
+        # This is intentionally weak; lots of words can have rough breathing.
+        nfd = unicodedata.normalize("NFD", tok)
+        if nfd:
+            base0 = nfd[0]
+            # Collect leading combining marks
+            j = 1
+            has_rough = False
+            while j < len(nfd) and unicodedata.combining(nfd[j]):
+                # COMBINING REVERSED COMMA ABOVE (rough breathing)
+                if nfd[j] == "\u0314":
+                    has_rough = True
+                j += 1
+            if base0 == "α" and has_rough:
+                doric_ha_initial += 1
+        # Count orthographic patterns (occurrences, not just token presence)
+        tt_count += plain.count("ττ")
+        ss_count += plain.count("σσ")
+        if plain in particles:
+            particles[plain] += 1
+        if plain in epic_particles:
+            epic_particles[plain] += 1
+        if plain in epic_words:
+            epic_words[plain] += 1
+        if plain in prepositions:
+            prepositions[plain] += 1
+        if plain in koine_words:
+            koine_words[plain] += 1
+        # Lexicalized Attic/Ionic cues
+        if any(stem in plain for stem in ATTIC_TT_STEMS):
+            lexical_cues["attic_tt"] += 1
+        if any(stem in plain for stem in IONIC_SS_STEMS):
+            lexical_cues["ionic_ss"] += 1
+        for ending in ENDINGS_PLAIN:
+            if plain.endswith(ending):
+                endings[ending] += 1
+        # Infinitive endings (prefer longer endings first to avoid double-counting)
+        # Guard against short function words like the particle "μεν".
+        if len(plain) >= 5:
+            if plain.endswith("μεναι"):
+                infinitives["μεναι"] += 1
+            elif plain.endswith("ειν"):
+                infinitives["ειν"] += 1
+            elif plain.endswith("μεν"):
+                infinitives["μεν"] += 1
+        for ending in EPIC_ENDINGS_PLAIN:
+            if plain.endswith(ending):
+                epic_endings[ending] += 1
+        for ending in DATIVE_PLURAL_ENDINGS_PLAIN:
+            if plain.endswith(ending):
+                dative_plural_endings[ending] += 1
+        if _ends_with_iota_subscript_cluster(tok, "α"):
+            endings["ᾳ"] += 1
+        if plain.endswith(("α", "ας", "αν")):
+            alpha_endings += 1
+        if plain.endswith(("η", "ης", "ην")):
+            eta_endings += 1
+    return {
+        "token_count": token_count,
+        "particles": dict(particles),
+        "endings": dict(endings),
+        "infinitives": dict(infinitives),
+        "epic_endings": dict(epic_endings),
+        "dative_plural_endings": dict(dative_plural_endings),
+        "epic_particles": dict(epic_particles),
+        "epic_words": dict(epic_words),
+        "prepositions": dict(prepositions),
+        "koine_words": dict(koine_words),
+        "lexical_cues": dict(lexical_cues),
+        "patterns": {
+            "tt": tt_count,
+            "ss": ss_count,
+        },
+        "orthography": {
+            "alpha_endings": alpha_endings,
+            "eta_endings": eta_endings,
+        },
+        "script": {
+            "greek_alpha_chars": greek_alpha_chars,
+            "alpha_chars": alpha_chars,
+        },
+        "doric_cues": {
+            "ha_initial": doric_ha_initial,
+        },
+        "poetic_morph": dict(poetic_morph),
+    }
+def rate_per_100(count: int, token_count: int) -> float:
+    if token_count <= 0:
+        return 0.0
+    return 100.0 * (count / token_count)
+def compute_rates(feature_dict: Mapping[str, Any]) -> Dict[str, Any]:
+    """Compute per-100-token rates from feature counts."""
+    token_count = int(feature_dict.get("token_count", 0) or 0)
+    particles: Mapping[str, int] = feature_dict.get("particles", {}) or {}
+    endings: Mapping[str, int] = feature_dict.get("endings", {}) or {}
+    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
+    orth: Mapping[str, int] = feature_dict.get("orthography", {}) or {}
+    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}
+    epic_particles: Mapping[str, int] = feature_dict.get("epic_particles", {}) or {}
+    epic_endings: Mapping[str, int] = feature_dict.get("epic_endings", {}) or {}
+    dative_plural_endings: Mapping[str, int] = feature_dict.get("dative_plural_endings", {}) or {}
+    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
+    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
+    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
+    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
+    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
+    particle_rates = {p: rate_per_100(int(particles.get(p, 0) or 0), token_count) for p in PARTICLES}
+    ending_rates = {e: rate_per_100(int(endings.get(e, 0) or 0), token_count) for e in (*ENDINGS_PLAIN, "ᾳ")}
+    infinitive_rates = {
+        e: rate_per_100(int(infinitives.get(e, 0) or 0), token_count) for e in INFINITIVE_ENDINGS_PLAIN
+    }
+    alpha_rate = rate_per_100(int(orth.get("alpha_endings", 0) or 0), token_count)
+    eta_rate = rate_per_100(int(orth.get("eta_endings", 0) or 0), token_count)
+    marked_rate = ending_rates.get("οισι", 0.0) + ending_rates.get("ηι", 0.0) + ending_rates.get("ᾳ", 0.0)
+    pattern_rates = {
+        "tt": rate_per_100(int(patterns.get("tt", 0) or 0), token_count),
+        "ss": rate_per_100(int(patterns.get("ss", 0) or 0), token_count),
+    }
+    epic_particle_rates = {p: rate_per_100(int(epic_particles.get(p, 0) or 0), token_count) for p in EPIC_PARTICLES_PLAIN}
+    epic_ending_rates = {e: rate_per_100(int(epic_endings.get(e, 0) or 0), token_count) for e in EPIC_ENDINGS_PLAIN}
+    dative_plural_ending_rates = {
+        e: rate_per_100(int(dative_plural_endings.get(e, 0) or 0), token_count)
+        for e in DATIVE_PLURAL_ENDINGS_PLAIN
+    }
+    preposition_rates = {p: rate_per_100(int(prepositions.get(p, 0) or 0), token_count) for p in PREPOSITIONS_PLAIN}
+    koine_word_rates = {w: rate_per_100(int(koine_words.get(w, 0) or 0), token_count) for w in KOINE_FUNCTION_WORDS_PLAIN}
+    lexical_cue_rates = {
+        "attic_tt": rate_per_100(int(lexical_cues.get("attic_tt", 0) or 0), token_count),
+        "ionic_ss": rate_per_100(int(lexical_cues.get("ionic_ss", 0) or 0), token_count),
+    }
+    doric_cue_rates = {
+        "ha_initial": rate_per_100(int(doric_cues.get("ha_initial", 0) or 0), token_count),
+    }
+    poetic_morph_rates = {
+        k: rate_per_100(int(poetic_morph.get(k, 0) or 0), token_count) for k in POETIC_MORPH_CUES
+    }
+    return {
+        "particles_per_100": particle_rates,
+        "endings_per_100": ending_rates,
+        "infinitives_per_100": infinitive_rates,
+        "patterns_per_100": pattern_rates,
+        "epic_particles_per_100": epic_particle_rates,
+        "epic_endings_per_100": epic_ending_rates,
+        "dative_plural_endings_per_100": dative_plural_ending_rates,
+        "prepositions_per_100": preposition_rates,
+        "koine_words_per_100": koine_word_rates,
+        "lexical_cues_per_100": lexical_cue_rates,
+        "doric_cues_per_100": doric_cue_rates,
+        "poetic_morph_per_100": poetic_morph_rates,
+        "alpha_endings_per_100": alpha_rate,
+        "eta_endings_per_100": eta_rate,
+        "marked_endings_per_100": marked_rate,
+    }

dialect_analysis/normalization.py ADDED Viewed

	@@ -0,0 +1,60 @@

+from __future__ import annotations
+import re
+import unicodedata
+from typing import List
+# A small punctuation set that commonly appears in Greek texts.
+_EXTRA_PUNCT = "··;;—–…«»‹›“”‘’"  # ano teleia, Greek question mark, dashes, quotes
+def strip_greek_diacritics(text: str) -> str:
+    """Strip diacritics while preserving iota subscript as an explicit iota.
+    - Converts combining GREEK YPOGEGRAMMENI (U+0345) to 'ι'.
+    - Removes other combining marks (accents, breathings, etc.).
+    """
+    decomposed = unicodedata.normalize("NFD", text)
+    out_chars: List[str] = []
+    for ch in decomposed:
+        if ch == "\u0345":
+            out_chars.append("ι")
+            continue
+        if unicodedata.combining(ch):
+            continue
+        out_chars.append(ch)
+    return unicodedata.normalize("NFC", "".join(out_chars))
+def sigma_normalize(token: str) -> str:
+    """Normalize sigma variants for matching."""
+    return token.replace("ς", "σ")
+def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
+    """Normalize input Greek text.
+    - Lowercase
+    - Remove punctuation
+    - Optionally strip diacritics
+    Keep diacritics by default so feature extraction can detect iota-subscript
+    endings like -ᾳ.
+    """
+    lowered = text.lower()
+    # Replace tabs/newlines with spaces.
+    cleaned = lowered.translate(str.maketrans({"\n": " ", "\t": " "}))
+    cleaned = cleaned.translate(str.maketrans({ch: " " for ch in _EXTRA_PUNCT}))
+    # Remove remaining punctuation/symbols while keeping word chars and spaces.
+    cleaned = re.sub(r"[^\w\s]", " ", cleaned, flags=re.UNICODE)
+    cleaned = re.sub(r"\s+", " ", cleaned).strip()
+    if strip_diacritics:
+        cleaned = strip_greek_diacritics(cleaned)
+    return cleaned

dialect_analysis/pipeline.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from __future__ import annotations
+from typing import Any, Dict, Mapping
+from .explanation import explain_results
+from .features import extract_features
+from .normalization import normalize_text
+from .scoring import DIALECTS, score_dialects
+from .tokenization import tokenize
+def classify_text(text: str, *, strip_diacritics: bool = False) -> Dict[str, Any]:
+    """End-to-end dialect classification pipeline."""
+    normalized = normalize_text(text, strip_diacritics=strip_diacritics)
+    tokens = tokenize(normalized)
+    features = extract_features(tokens)
+    scores = score_dialects(features)
+    dialect = max(scores.items(), key=lambda kv: kv[1])[0] if scores else "Unknown"
+    confidence = (float(scores.get(dialect, 0.0)) / 100.0) if scores else 0.0
+    top_features: Dict[str, Any] = {}
+    contrib_map: Mapping[str, float] = (features.get("_contributions", {}) or {}).get(dialect, {})  # type: ignore[assignment]
+    for name, delta in sorted(contrib_map.items(), key=lambda kv: abs(kv[1]), reverse=True)[:6]:
+        top_features[name] = {"contribution": float(delta)}
+    explanation = explain_results(features, scores)
+    return {
+        "dialect": dialect,
+        "confidence": confidence,
+        "scores": scores,
+        "top_features": top_features,
+        "explanation": explanation,
+    }

dialect_analysis/scoring.py ADDED Viewed

	@@ -0,0 +1,285 @@

+from __future__ import annotations
+import math
+from collections import Counter
+from typing import Any, Dict, Mapping, Tuple
+from .features import ENDINGS_PLAIN, INFINITIVE_ENDINGS_PLAIN, PARTICLES, compute_rates
+# Dialect labels; score tables are built and reported in this order.
+DIALECTS: Tuple[str, ...] = ("Attic", "Ionic", "Doric", "Aeolic", "Koine")
+def _clamp(x: float, lo: float, hi: float) -> float:
+    return max(lo, min(hi, x))
+def _softmax_percent(raw_scores: Mapping[str, float], *, temperature: float = 2.0) -> Dict[str, float]:
+    """Softmax over dialect scores with temperature to reduce overconfidence."""
+    if not raw_scores:
+        return {d: 0.0 for d in DIALECTS}
+    t = max(1e-6, float(temperature))
+    max_raw = max(float(v) for v in raw_scores.values())
+    exp_scores = {d: math.exp((float(raw_scores[d]) - max_raw) / t) for d in DIALECTS}
+    total = sum(exp_scores.values()) or 1.0
+    return {d: 100.0 * (exp_scores[d] / total) for d in DIALECTS}
+def score_dialects(feature_dict: Mapping[str, Any]) -> Dict[str, float]:
+    """Score dialects using a weighted, rule-based scoring system.
+    Returns a dict mapping dialect -> confidence percentage (0-100).
+    Weights are placeholders intended to be edited as the rule-set grows.
+    """
+    rates = compute_rates(feature_dict)
+    token_count = int(feature_dict.get("token_count", 0) or 0)
+    script = feature_dict.get("script", {}) or {}
+    greek_alpha = int(script.get("greek_alpha_chars", 0) or 0)
+    alpha_chars = int(script.get("alpha_chars", 0) or 0)
+    greek_ratio = (greek_alpha / alpha_chars) if alpha_chars > 0 else 0.0
+    particle_rates: Mapping[str, float] = rates["particles_per_100"]
+    ending_rates: Mapping[str, float] = rates["endings_per_100"]
+    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
+    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
+    epic_particle_rates: Mapping[str, float] = rates.get("epic_particles_per_100", {}) or {}
+    epic_ending_rates: Mapping[str, float] = rates.get("epic_endings_per_100", {}) or {}
+    epic_words: Mapping[str, int] = feature_dict.get("epic_words", {}) or {}
+    dative_plural_rates: Mapping[str, float] = rates.get("dative_plural_endings_per_100", {}) or {}
+    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
+    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
+    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
+    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
+    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}
+    marked_rate = float(rates["marked_endings_per_100"])
+    epic_oio_rate = float(epic_ending_rates.get("οιο", 0.0) or 0.0)
+    epic_essi_rate = float(epic_ending_rates.get("εσσι", 0.0) or 0.0)
+    epic_fi_rate = float(epic_ending_rates.get("φι", 0.0) or 0.0)
+    epic_eta_os_rate = float(epic_ending_rates.get("ηοσ", 0.0) or 0.0)
+    epic_adeo_rate = float(epic_ending_rates.get("αδεω", 0.0) or 0.0)
+    epic_ideo_rate = float(epic_ending_rates.get("ιδεω", 0.0) or 0.0)
+    epic_ke_rate = float(epic_particle_rates.get("κε", 0.0) or 0.0)
+    epic_ken_rate = float(epic_particle_rates.get("κεν", 0.0) or 0.0)
+    epic_ke_ken_rate = epic_ke_rate + epic_ken_rate
+    epic_ar_rate = float(epic_particle_rates.get("αρ", 0.0) or 0.0)
+    epic_min_rate = float(epic_particle_rates.get("μιν", 0.0) or 0.0)
+    tt_count = int(patterns.get("tt", 0) or 0)
+    ss_count = int(patterns.get("ss", 0) or 0)
+    # --- Weights (MVP placeholders) ---
+    weights: Dict[str, Dict[str, float]] = {
+        "particle_μεν": {"Attic": 0.25, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.05},
+        "particle_δε": {"Attic": 0.20, "Ionic": 0.20, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.10},
+        "particle_γαρ": {"Attic": 0.20, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.10},
+        "particle_τε": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.20, "Aeolic": 0.12, "Koine": 0.05},
+        "particle_δη": {"Attic": 0.10, "Ionic": 0.10, "Doric": 0.10, "Aeolic": 0.08, "Koine": 0.05},
+        "particle_ουν": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.05, "Aeolic": 0.05, "Koine": 0.10},
+        "ending_οισι": {"Ionic": 3.50, "Attic": -1.00, "Doric": 0.50, "Aeolic": 0.20, "Koine": -1.50},
+        "ending_ηι": {"Attic": 1.10, "Ionic": 0.80, "Doric": 0.10, "Aeolic": 0.20, "Koine": -0.30},
+        "ending_ᾳ": {"Attic": 0.80, "Ionic": 0.60, "Doric": 0.30, "Aeolic": 0.20, "Koine": -0.60},
+        "ending_οι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
+        "ending_αι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
+        # NOTE: This is intentionally low-weight. "Few strong markers" is not
+        # uniquely Koine; it can also describe many Attic passages.
+        "low_marked_endings": {"Koine": 0.25, "Attic": 0.05, "Ionic": -0.05, "Doric": 0.05, "Aeolic": -0.05},
+        # Homeric / epic-Ionic signal
+        "epic_ending_οιο": {"Ionic": 4.00, "Attic": -0.50, "Doric": -0.50, "Aeolic": -0.30, "Koine": -0.50},
+        # Epic endings and particles (conservative; only meaningful when present)
+        "epic_ending_εσσι": {"Ionic": 3.00, "Attic": -0.40, "Doric": -0.20, "Aeolic": -0.20, "Koine": -0.80},
+        "epic_ending_φι": {"Ionic": 1.50, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.50},
+        "epic_particle_κεκεν": {"Ionic": 2.00, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.70},
+        "epic_ending_ηοσ": {"Ionic": 2.60, "Attic": -0.30, "Doric": -0.10, "Aeolic": -0.10, "Koine": -0.60},
+        "epic_ending_αδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
+        "epic_ending_ιδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
+        # Homeric / epic particles (ambiguous individually; keep weights modest)
+        "epic_particle_αρ": {"Ionic": 0.80, "Attic": -0.05, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.15},
+        "epic_particle_μιν": {"Ionic": 1.20, "Attic": -0.10, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.25},
+        # Homeric vocabulary: apply only when multiple hits occur (see logic below)
+        "epic_word_hits": {"Ionic": 1.80, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
+        # Orthographic patterns (COUNT-based; prevents short-text rate blowups)
+        "pattern_tt": {"Attic": 0.45, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.05},
+        "pattern_ss": {"Ionic": 0.10, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
+        # Dative plural endings: -οισι/-αισι/-ηισι vs -οις/-αις
+        "dative_οισι": {"Ionic": 0.90, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.40},
+        "dative_αισι": {"Ionic": 2.20, "Attic": -0.40, "Doric": 0.20, "Aeolic": 0.10, "Koine": -0.80},
+        "dative_ηισι": {"Ionic": 2.20, "Attic": -0.30, "Doric": 0.10, "Aeolic": 0.10, "Koine": -0.80},
+        "dative_οις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
+        "dative_αις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
+        # εἰς vs ἐς (COUNT-based; keys are sigma-normalized: εισ / εσ)
+        "prep_εισ": {"Koine": 0.30, "Attic": 0.05, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
+        "prep_εσ": {"Attic": 0.25, "Ionic": 0.15, "Koine": 0.05, "Doric": 0.00, "Aeolic": 0.05},
+        # Koine-ish function words (COUNT-based; sigma-normalized: καθωσ)
+        "koine_ινα": {"Koine": 0.60, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
+        "koine_οτι": {"Koine": 0.40, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
+        "koine_καθωσ": {"Koine": 0.35, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
+        "koine_εγενετο": {"Koine": 0.90, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
+        # Lexicalized ττ/σσ stems (COUNT-based)
+        "lexical_attic_tt": {"Attic": 0.75, "Koine": 0.08, "Ionic": 0.00, "Doric": 0.00},
+        "lexical_ionic_ss": {"Ionic": 0.25, "Attic": 0.00, "Doric": 0.00, "Koine": 0.00},
+        # Doric-ish ἁ- (very weak; COUNT-based)
+        "doric_ha_initial": {"Doric": 0.12, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
+        # Infinitives (morphology): strong signal when present
+        # These are COUNT-based to avoid short-text rate blowups.
+        "inf_μεναι": {"Aeolic": 2.40, "Doric": 0.40, "Ionic": 0.05, "Attic": 0.00, "Koine": 0.00},
+        "inf_μεν": {"Doric": 1.20, "Aeolic": 0.80, "Ionic": 0.00, "Attic": 0.00, "Koine": 0.00},
+        "inf_ειν": {"Koine": 0.55, "Attic": 0.35, "Ionic": 0.35, "Doric": 0.00, "Aeolic": 0.00},
+        # Poetic morphology cues (COUNT-based)
+        "verb_1pl_mes": {"Doric": 1.30, "Aeolic": 0.30, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
+        "aeolic_ammi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
+        "aeolic_ummi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
+    }
+    raw_scores: Dict[str, float] = {d: 1.0 for d in DIALECTS}
+    contributions: Dict[str, Counter[str]] = {d: Counter() for d in DIALECTS}
+    # Evidence scaling: short passages should not yield extreme confidence.
+    evidence_scale = _clamp(token_count / 40.0, 0.0, 1.0)
+    if greek_ratio < 0.30:
+        evidence_scale *= 0.15
+    def apply_feature(feature_name: str, feature_value: float) -> None:
+        for dialect, w in weights.get(feature_name, {}).items():
+            delta = w * feature_value * evidence_scale
+            raw_scores[dialect] += delta
+            contributions[dialect][feature_name] += delta
+    def apply_tier_a(feature_name: str, feature_value: float) -> None:
+        """Apply highly diagnostic features with a minimum evidence scale.
+        Rationale: some morphology is genuinely strong evidence even in short
+        passages; we still keep the scale modest to avoid overconfidence.
+        """
+        tier_scale = max(evidence_scale, 0.25)
+        for dialect, w in weights.get(feature_name, {}).items():
+            delta = w * feature_value * tier_scale
+            raw_scores[dialect] += delta
+            contributions[dialect][feature_name] += delta
+    for p in PARTICLES:
+        apply_feature(f"particle_{p}", float(particle_rates.get(p, 0.0)))
+    for e in (*ENDINGS_PLAIN, "ᾳ"):
+        apply_feature(f"ending_{e}", float(ending_rates.get(e, 0.0)))
+    # Infinitive morphology
+    apply_tier_a("inf_μεναι", float(int(infinitives.get("μεναι", 0) or 0)))
+    apply_tier_a("inf_μεν", float(int(infinitives.get("μεν", 0) or 0)))
+    apply_tier_a("inf_ειν", float(int(infinitives.get("ειν", 0) or 0)))
+    # Poetic morphology
+    apply_tier_a("verb_1pl_mes", float(int(poetic_morph.get("verb_1pl_mes", 0) or 0)))
+    apply_tier_a("aeolic_ammi", float(int(poetic_morph.get("aeolic_ammi", 0) or 0)))
+    apply_tier_a("aeolic_ummi", float(int(poetic_morph.get("aeolic_ummi", 0) or 0)))
+    # Only apply the Koine scarcity heuristic when we have enough text.
+    if token_count >= 20:
+        apply_feature("low_marked_endings", max(0.0, 1.5 - marked_rate))
+    # Epic marker
+    apply_feature("epic_ending_οιο", epic_oio_rate)
+    # Additional epic markers
+    apply_feature("epic_ending_εσσι", epic_essi_rate)
+    apply_feature("epic_ending_φι", epic_fi_rate)
+    apply_feature("epic_particle_κεκεν", epic_ke_ken_rate)
+    apply_feature("epic_ending_ηοσ", epic_eta_os_rate)
+    apply_feature("epic_ending_αδεω", epic_adeo_rate)
+    apply_feature("epic_ending_ιδεω", epic_ideo_rate)
+    apply_feature("epic_particle_αρ", epic_ar_rate)
+    apply_feature("epic_particle_μιν", epic_min_rate)
+    epic_word_hits = sum(
+        int(epic_words.get(w, 0) or 0)
+        for w in ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα")
+    )
+    if epic_word_hits >= 2:
+        apply_tier_a("epic_word_hits", float(min(4, epic_word_hits)))
+    # tt/ss orthography (separate, conservative)
+    apply_feature("pattern_tt", float(tt_count))
+    apply_feature("pattern_ss", float(ss_count))
+    # Dative plural endings
+    apply_feature("dative_οισι", float(dative_plural_rates.get("οισι", 0.0) or 0.0))
+    apply_feature("dative_αισι", float(dative_plural_rates.get("αισι", 0.0) or 0.0))
+    apply_feature("dative_ηισι", float(dative_plural_rates.get("ηισι", 0.0) or 0.0))
+    apply_feature("dative_οις", float(dative_plural_rates.get("οις", 0.0) or 0.0))
+    apply_feature("dative_αις", float(dative_plural_rates.get("αις", 0.0) or 0.0))
+    # εἰς / ἐς (counts; sigma-normalized)
+    apply_feature("prep_εισ", float(int(prepositions.get("εισ", 0) or 0)))
+    apply_feature("prep_εσ", float(int(prepositions.get("εσ", 0) or 0)))
+    # Koine-ish function words (counts; sigma-normalized)
+    apply_feature("koine_ινα", float(int(koine_words.get("ινα", 0) or 0)))
+    apply_feature("koine_οτι", float(int(koine_words.get("οτι", 0) or 0)))
+    apply_feature("koine_καθωσ", float(int(koine_words.get("καθωσ", 0) or 0)))
+    apply_feature("koine_εγενετο", float(int(koine_words.get("εγενετο", 0) or 0)))
+    # Lexicalized ττ/σσ stems (counts)
+    apply_feature("lexical_attic_tt", float(int(lexical_cues.get("attic_tt", 0) or 0)))
+    apply_feature("lexical_ionic_ss", float(int(lexical_cues.get("ionic_ss", 0) or 0)))
+    # Doric cue (very noisy): require longer text + multiple hits
+    ha_hits = int(doric_cues.get("ha_initial", 0) or 0)
+    if token_count >= 30 and ha_hits >= 2:
+        apply_feature("doric_ha_initial", float(ha_hits))
+    # If mutable, persist diagnostics for explainability.
+    if isinstance(feature_dict, dict):
+        feature_dict["rates"] = rates
+        feature_dict["diagnostics"] = {
+            "greek_ratio": greek_ratio,
+            "evidence_scale": evidence_scale,
+        }
+        feature_dict["_raw_scores"] = dict(raw_scores)
+        feature_dict["_contributions"] = {d: dict(contributions[d]) for d in DIALECTS}
+    # Slightly increase confidence only when evidence is strong.
+    temperature = _clamp(2.0 - 0.6 * evidence_scale, 1.4, 2.0)
+    scores = _softmax_percent(raw_scores, temperature=temperature)
+    # Post-hoc discrimination diagnostics.
+    ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
+    best_pct = float(ordered[0][1]) if ordered else 0.0
+    second_pct = float(ordered[1][1]) if len(ordered) > 1 else 0.0
+    top_gap_pct = best_pct - second_pct
+    if isinstance(feature_dict, dict):
+        diagnostics = feature_dict.get("diagnostics", {}) or {}
+        diagnostics.update(
+            {
+                "best_pct": best_pct,
+                "second_pct": second_pct,
+                "top_gap_pct": top_gap_pct,
+            }
+        )
+        feature_dict["diagnostics"] = diagnostics
+    return scores

dialect_analysis/tokenization.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from __future__ import annotations
+from typing import List
def tokenize(text: str) -> List[str]:
    """Tokenize a normalized text into whitespace-delimited tokens.

    Splitting is done on runs of any whitespace (spaces, tabs, newlines and
    other Unicode whitespace), so the result never contains empty tokens or
    tokens with embedded whitespace. For text that only uses single or
    repeated spaces as separators the result is the same as splitting on
    ``" "`` and discarding empty pieces.

    Args:
        text: The text to split; expected to be already normalized.

    Returns:
        The tokens in their original order, or an empty list when ``text``
        is empty or consists only of whitespace.
    """
    if not text:
        return []
    # str.split() with no separator collapses whitespace runs and strips the
    # ends, which matches the documented "whitespace-delimited" contract.
    return text.split()