Spaces:
Running
Running
Upload 8 files
Browse files- .gitignore +14 -0
- app.py +74 -0
- evaluate_samples.py +143 -0
- generate_synthetic_samples.py +275 -0
- main.py +21 -0
- requirements.txt +1 -0
- samples.jsonl +28 -0
- samples.synthetic.jsonl +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
|
| 5 |
+
# Virtualenvs
|
| 6 |
+
.venv/
|
| 7 |
+
venv/
|
| 8 |
+
|
| 9 |
+
# OS/editor
|
| 10 |
+
.DS_Store
|
| 11 |
+
.vscode/
|
| 12 |
+
|
| 13 |
+
# Local artifacts
|
| 14 |
+
*.log
|
app.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any, Dict, Mapping, Tuple
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
from dialect_analysis.pipeline import classify_text
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _format_scores(scores: Mapping[str, float]) -> str:
|
| 11 |
+
ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
|
| 12 |
+
return "\n".join(f"- {d}: {pct:.1f}%" for d, pct in ordered)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def analyze_text(text: str, *, strip_diacritics: bool) -> Tuple[str, str, str]:
    """Classify *text* and return (summary, scores, explanation) Markdown strings.

    Blank or whitespace-only input yields three empty strings so the UI clears.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return "", "", ""

    result: Dict[str, Any] = classify_text(cleaned, strip_diacritics=strip_diacritics)

    dialect = str(result.get("dialect", ""))
    # `or 0.0` guards against a None confidence coming back from the pipeline.
    confidence_pct = float(result.get("confidence", 0.0) or 0.0) * 100.0
    score_map: Mapping[str, float] = result.get("scores", {}) or {}
    explanation_text = str(result.get("explanation", ""))

    summary_md = f"**Dialect:** {dialect}\n\n**Confidence:** {confidence_pct:.1f}%"
    scores_md = _format_scores(score_map)
    explanation_md = f"```\n{explanation_text}\n```" if explanation_text else ""

    return summary_md, scores_md, explanation_md
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Build the Gradio UI. `demo` stays module-level so Spaces can discover it.
with gr.Blocks(title="Ancient Greek Dialect Classifier") as demo:
    gr.Markdown(
        "# Ancient Greek Dialect Classifier\n"
        "Rule-based, explainable classifier for Attic / Ionic / Doric / Aeolic / Koine.\n"
        "\nPaste Greek text below and click **Analyze**."
    )

    with gr.Row():
        strip_box = gr.Checkbox(value=True, label="Strip diacritics (recommended)")

    text_input = gr.Textbox(
        label="Greek text",
        lines=12,
        placeholder="Paste Ancient Greek text here…",
    )

    analyze_btn = gr.Button("Analyze")

    with gr.Row():
        summary_out = gr.Markdown(label="Summary")

    with gr.Row():
        scores_out = gr.Markdown(label="Scores")

    explanation_out = gr.Markdown(label="Explanation")

    # Both triggers share the same wiring: button click and textbox submit.
    result_outputs = [summary_out, scores_out, explanation_out]
    analyze_btn.click(
        fn=analyze_text,
        inputs=[text_input, strip_box],
        outputs=result_outputs,
    )
    # Nice-to-have for Spaces: enable Shift+Enter submit.
    text_input.submit(
        fn=analyze_text,
        inputs=[text_input, strip_box],
        outputs=result_outputs,
    )


if __name__ == "__main__":
    demo.launch()
|
evaluate_samples.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from collections import Counter, defaultdict
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, DefaultDict, Dict, Iterable, List, Mapping, Tuple
|
| 9 |
+
|
| 10 |
+
from dialect_analysis.pipeline import classify_text
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass(frozen=True)
class Sample:
    """One labelled evaluation example loaded from a JSONL row."""

    # Identifier used in the MISS report (auto-generated when absent in the file).
    id: str
    # Gold dialect label, e.g. "Attic" or "Koine".
    label: str
    # The Greek passage to classify.
    text: str
    # Whether the classifier should strip diacritics before scoring.
    strip_diacritics: bool = True
    # True for generated samples; lets the report split real vs. synthetic accuracy.
    synthetic: bool = False
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_samples(path: Path) -> List[Sample]:
    """Parse a JSONL sample file, skipping blank lines and '#' comment lines.

    The 1-based line number is used to synthesize an id for rows without one.
    """
    out: List[Sample] = []
    raw_lines = path.read_text(encoding="utf-8").splitlines()
    for lineno, raw in enumerate(raw_lines, start=1):
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        row = json.loads(stripped)
        sample = Sample(
            id=str(row.get("id") or f"sample_{lineno}"),
            label=str(row["label"]),
            text=str(row["text"]),
            strip_diacritics=bool(row.get("strip_diacritics", True)),
            synthetic=bool(row.get("synthetic", False)),
        )
        out.append(sample)
    return out
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse CLI options for the evaluator.

    Args:
        argv: Explicit argument list (useful for tests); defaults to
            ``sys.argv[1:]`` when ``None``, preserving the old behavior.
    """
    p = argparse.ArgumentParser(description="Evaluate dialect classifier against a JSONL sample set.")
    p.add_argument(
        "--samples",
        type=Path,
        default=Path(__file__).with_name("samples.jsonl"),
        help="Path to JSONL file with {id,label,text,strip_diacritics[,synthetic]}",
    )
    return p.parse_args(argv)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def confusion_matrix(rows: Iterable[Tuple[str, str]]) -> Tuple[List[str], List[List[int]]]:
    """Build a confusion matrix from (true_label, predicted_label) pairs.

    Returns:
        ``(labels, matrix)`` where ``labels`` is the sorted union of all true
        and predicted labels, and ``matrix[i][j]`` counts pairs with true
        label ``labels[i]`` and predicted label ``labels[j]``.
    """
    # Materialize first: `rows` may be a one-shot iterator, and we traverse it
    # more than once below. The original exhausted a generator on the first
    # set comprehension and produced an empty/wrong matrix.
    pairs = list(rows)
    labels = sorted({t for t, _ in pairs} | {p for _, p in pairs})
    idx = {label: i for i, label in enumerate(labels)}
    mat = [[0] * len(labels) for _ in labels]
    for true_label, pred_label in pairs:
        mat[idx[true_label]][idx[pred_label]] += 1
    return labels, mat
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def main() -> int:
    """Run the evaluation and print accuracy, confusion matrix, and per-label stats.

    Exit codes: 0 on success, 2 when the sample file is missing or empty.
    """
    args = parse_args()
    samples_path = Path(args.samples)
    if not samples_path.exists():
        print(f"Missing samples file: {samples_path}")
        return 2

    samples = load_samples(samples_path)
    if not samples:
        print("No samples found.")
        return 2

    all_pairs: List[Tuple[str, str]] = []
    real_pairs: List[Tuple[str, str]] = []
    synth_pairs: List[Tuple[str, str]] = []
    hits = 0
    hits_real = 0
    hits_synth = 0
    confidences: List[float] = []
    by_label: DefaultDict[str, Counter[str]] = defaultdict(Counter)

    for sample in samples:
        outcome: Mapping[str, Any] = classify_text(sample.text, strip_diacritics=sample.strip_diacritics)
        predicted = str(outcome.get("dialect", ""))
        confidence = float(outcome.get("confidence", 0.0) or 0.0)
        confidences.append(confidence)

        all_pairs.append((sample.label, predicted))
        hit = predicted == sample.label
        if sample.synthetic:
            synth_pairs.append((sample.label, predicted))
            hits_synth += int(hit)
        else:
            real_pairs.append((sample.label, predicted))
            hits_real += int(hit)
        by_label[sample.label][predicted] += 1

        if hit:
            hits += 1
        else:
            # Report each miss with the two strongest competing scores.
            score_map: Mapping[str, float] = outcome.get("scores", {}) or {}
            top2 = sorted(score_map.items(), key=lambda kv: kv[1], reverse=True)[:2]
            top2_str = ", ".join(f"{d}={pct:.1f}%" for d, pct in top2)
            print(f"MISS {sample.id}: true={sample.label} pred={predicted} conf={confidence*100:.1f}% top2=({top2_str})")

    accuracy = hits / max(1, len(samples))
    mean_conf = sum(confidences) / max(1, len(confidences))

    print("\nSummary")
    print(f"  File: {samples_path.name}")
    print(f"  Samples: {len(samples)}")
    print(f"  Accuracy: {accuracy*100:.1f}%")
    print(f"  Avg confidence: {mean_conf*100:.1f}%")

    # Split accuracy is only meaningful when both subsets are present.
    if real_pairs and synth_pairs:
        acc_real = hits_real / max(1, len(real_pairs))
        acc_synth = hits_synth / max(1, len(synth_pairs))
        print(f"  Accuracy (real): {acc_real*100:.1f}% (n={len(real_pairs)})")
        print(f"  Accuracy (synthetic): {acc_synth*100:.1f}% (n={len(synth_pairs)})")

    labels, mat = confusion_matrix(all_pairs)
    print("\nConfusion matrix (rows=true, cols=pred)")
    print(" " * 14 + " ".join(label[:10].ljust(10) for label in labels))
    for row_i, true_label in enumerate(labels):
        cells = " ".join(str(mat[row_i][col]).ljust(10) for col in range(len(labels)))
        print(true_label[:12].ljust(14) + cells)

    print("\nPer-label predictions")
    for true_label in sorted(by_label):
        counts = by_label[true_label]
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        dist = ", ".join(f"{p}:{c}" for p, c in ranked)
        print(f"  {true_label} (n={sum(counts.values())}): {dist}")

    return 0
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
if __name__ == "__main__":
|
| 143 |
+
raise SystemExit(main())
|
generate_synthetic_samples.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import random
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Iterable, List, Mapping, Sequence, Tuple
|
| 9 |
+
|
| 10 |
+
DIALECTS: Tuple[str, ...] = ("Attic", "Ionic", "Doric", "Aeolic", "Koine")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass(frozen=True)
class DialectRecipe:
    """Ingredients for synthesizing a passage that should score as ``label``."""

    label: str
    # Tokens that strongly (but not exclusively) signal the label in our rule set.
    # These should be diacritic-stripped and sigma-normalized, matching features.py.
    marker_tokens: Tuple[str, ...]
    # Tokens that *tend* to be neutral (particles etc.)
    neutral_tokens: Tuple[str, ...]
    # One or more groups of tokens where we must include at least one token per group.
    # This prevents generating mostly-neutral passages that collapse to Attic by design.
    required_token_groups: Tuple[Tuple[str, ...], ...] = ()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# IMPORTANT: keep these *unscored* by our feature rules.
|
| 27 |
+
# Do NOT include the particles in dialect_analysis.features.PARTICLES here,
|
| 28 |
+
# or the synthetic set will be systematically biased toward Attic.
|
| 29 |
+
NEUTRAL_TOKENS: Tuple[str, ...] = (
|
| 30 |
+
"και",
|
| 31 |
+
"εστι",
|
| 32 |
+
"ανηρ",
|
| 33 |
+
"γυνη",
|
| 34 |
+
"λογος",
|
| 35 |
+
"εργον",
|
| 36 |
+
"οικος",
|
| 37 |
+
"πολις",
|
| 38 |
+
"θεος",
|
| 39 |
+
"χρονος",
|
| 40 |
+
"βιος",
|
| 41 |
+
"αγαθος",
|
| 42 |
+
"καλος",
|
| 43 |
+
"μεγας",
|
| 44 |
+
"μικρος",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Minimal templates are intentionally simple (no attempt at grammaticality);
|
| 48 |
+
# the goal is to exercise the *feature extractors* at scale.
|
| 49 |
+
RECIPES: Mapping[str, DialectRecipe] = {
|
| 50 |
+
"Attic": DialectRecipe(
|
| 51 |
+
label="Attic",
|
| 52 |
+
marker_tokens=(
|
| 53 |
+
# Lexicalized TT stems
|
| 54 |
+
"θαλαττα",
|
| 55 |
+
"γλωττα",
|
| 56 |
+
"πραττω",
|
| 57 |
+
# Preposition preference (edition-dependent but useful)
|
| 58 |
+
"εσ",
|
| 59 |
+
# Attic dative -ῃ often shows up as plain -ηι after stripping
|
| 60 |
+
"τηιδε",
|
| 61 |
+
),
|
| 62 |
+
required_token_groups=(
|
| 63 |
+
("θαλαττα", "γλωττα", "πραττω"),
|
| 64 |
+
("εσ", "τηιδε"),
|
| 65 |
+
),
|
| 66 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 67 |
+
),
|
| 68 |
+
"Ionic": DialectRecipe(
|
| 69 |
+
label="Ionic",
|
| 70 |
+
marker_tokens=(
|
| 71 |
+
# SS stems
|
| 72 |
+
"θαλασσα",
|
| 73 |
+
"γλωσσα",
|
| 74 |
+
"τασσω",
|
| 75 |
+
# Dative plural -οισι and epic endings
|
| 76 |
+
"λογοισι",
|
| 77 |
+
"ηελιοιο",
|
| 78 |
+
"αχιληοσ",
|
| 79 |
+
"πηληιαδεω",
|
| 80 |
+
# Epic particles/words
|
| 81 |
+
"αρ",
|
| 82 |
+
"μιν",
|
| 83 |
+
"εννεπε",
|
| 84 |
+
"μουσα",
|
| 85 |
+
"μηνιν",
|
| 86 |
+
"αειδε",
|
| 87 |
+
"θεα",
|
| 88 |
+
),
|
| 89 |
+
required_token_groups=(
|
| 90 |
+
("λογοισι", "ηελιοιο", "αχιληοσ", "πηληιαδεω"),
|
| 91 |
+
("αρ", "μιν"),
|
| 92 |
+
("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα"),
|
| 93 |
+
),
|
| 94 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 95 |
+
),
|
| 96 |
+
"Doric": DialectRecipe(
|
| 97 |
+
label="Doric",
|
| 98 |
+
marker_tokens=(
|
| 99 |
+
# Infinitive -μεν
|
| 100 |
+
"ποιεμεν",
|
| 101 |
+
# 1pl -μες (sigma-normalized -μεσ)
|
| 102 |
+
"λεγομεσ",
|
| 103 |
+
# Mild Doric-ish article form sometimes represented with rough breathing;
|
| 104 |
+
# but we avoid diacritics here. Keep other cues carrying the recipe.
|
| 105 |
+
),
|
| 106 |
+
required_token_groups=(("ποιεμεν",), ("λεγομεσ",)),
|
| 107 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 108 |
+
),
|
| 109 |
+
"Aeolic": DialectRecipe(
|
| 110 |
+
label="Aeolic",
|
| 111 |
+
marker_tokens=(
|
| 112 |
+
# Aeolic pronoun forms
|
| 113 |
+
"αμμι",
|
| 114 |
+
"υμμι",
|
| 115 |
+
# Infinitive -μεναι
|
| 116 |
+
"ποιεμεναι",
|
| 117 |
+
),
|
| 118 |
+
required_token_groups=(("ποιεμεναι",), ("αμμι", "υμμι")),
|
| 119 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 120 |
+
),
|
| 121 |
+
"Koine": DialectRecipe(
|
| 122 |
+
label="Koine",
|
| 123 |
+
marker_tokens=(
|
| 124 |
+
# Koine-ish function words
|
| 125 |
+
"ινα",
|
| 126 |
+
"οτι",
|
| 127 |
+
"καθωσ",
|
| 128 |
+
"εγενετο",
|
| 129 |
+
# Preposition preference
|
| 130 |
+
"εισ",
|
| 131 |
+
),
|
| 132 |
+
required_token_groups=(("εγενετο",), ("ινα", "οτι", "καθωσ", "εισ")),
|
| 133 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 134 |
+
),
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _choose_markers(rng: random.Random, markers: Sequence[str], *, min_markers: int, max_markers: int) -> List[str]:
|
| 139 |
+
k = rng.randint(min_markers, max_markers)
|
| 140 |
+
if k <= 0:
|
| 141 |
+
return []
|
| 142 |
+
if k >= len(markers):
|
| 143 |
+
# Shuffle copy
|
| 144 |
+
out = list(markers)
|
| 145 |
+
rng.shuffle(out)
|
| 146 |
+
return out
|
| 147 |
+
return rng.sample(list(markers), k=k)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _choose_neutrals(rng: random.Random, neutrals: Sequence[str], *, count: int) -> List[str]:
|
| 151 |
+
if count <= 0:
|
| 152 |
+
return []
|
| 153 |
+
return [rng.choice(list(neutrals)) for _ in range(count)]
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def _make_text(
    rng: random.Random,
    recipe: DialectRecipe,
    *,
    min_tokens: int,
    max_tokens: int,
    min_markers: int,
    max_markers: int,
) -> str:
    """Assemble one synthetic passage for ``recipe`` as a space-joined token string.

    Guarantees one token from every required group, tops up with random
    markers, pads to the target length with neutral fillers, then shuffles.
    """
    target_len = rng.randint(min_tokens, max_tokens)

    # One mandatory pick per required group keeps the passage diagnostic.
    mandatory: List[str] = [rng.choice(list(group)) for group in recipe.required_token_groups if group]

    extra = _choose_markers(
        rng,
        recipe.marker_tokens,
        min_markers=max(0, min_markers - len(mandatory)),
        max_markers=max(0, max_markers - len(mandatory)),
    )
    marker_pool = mandatory + extra

    # Pad with neutral fillers up to the target length, then mix everything.
    fillers = _choose_neutrals(rng, recipe.neutral_tokens, count=max(0, target_len - len(marker_pool)))
    tokens = marker_pool + fillers
    rng.shuffle(tokens)

    # Small chance of repeating a marker (to emulate multiple feature hits).
    if marker_pool and rng.random() < 0.25:
        tokens.insert(rng.randrange(0, len(tokens) + 1), rng.choice(marker_pool))

    return " ".join(tokens)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def generate_samples(
    *,
    seed: int,
    n_per_dialect: int,
    min_tokens: int,
    max_tokens: int,
    min_markers: int,
    max_markers: int,
) -> List[Dict[str, object]]:
    """Produce ``n_per_dialect`` labelled synthetic rows per dialect.

    A single seeded PRNG drives both text generation and the final shuffle,
    so identical arguments always reproduce the same file.
    """
    rng = random.Random(seed)
    rows: List[Dict[str, object]] = []

    for label in DIALECTS:
        recipe = RECIPES[label]
        for idx in range(n_per_dialect):
            text = _make_text(
                rng,
                recipe,
                min_tokens=min_tokens,
                max_tokens=max_tokens,
                min_markers=min_markers,
                max_markers=max_markers,
            )
            rows.append(
                {
                    "id": f"synthetic_{label.lower()}_{idx+1}",
                    "label": label,
                    "text": text,
                    "strip_diacritics": True,
                    "synthetic": True,
                    "seed": seed,
                }
            )

    # Stable-ish shuffle to avoid grouped labels in file
    rng.shuffle(rows)
    return rows
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def write_jsonl(path: Path, rows: Iterable[Mapping[str, object]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines (one compact object per line).

    An empty iterable now produces an empty file; the original wrote a stray
    lone newline ("\\n".join([]) + "\\n").
    """
    lines = [json.dumps(dict(row), ensure_ascii=False) for row in rows]
    payload = "\n".join(lines) + "\n" if lines else ""
    path.write_text(payload, encoding="utf-8")
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse CLI options for the generator.

    Args:
        argv: Explicit argument list (useful for tests); defaults to
            ``sys.argv[1:]`` when ``None``, preserving the old behavior.
    """
    p = argparse.ArgumentParser(description="Generate a large synthetic dialect sample set (JSONL).")
    p.add_argument("--out", type=Path, default=Path("samples.synthetic.jsonl"), help="Output JSONL path")
    p.add_argument("--seed", type=int, default=1, help="PRNG seed for reproducibility")
    p.add_argument("--n-per-dialect", type=int, default=200, help="How many samples to generate per dialect")
    p.add_argument("--min-tokens", type=int, default=30)
    p.add_argument("--max-tokens", type=int, default=60)
    p.add_argument("--min-markers", type=int, default=4)
    p.add_argument("--max-markers", type=int, default=8)
    return p.parse_args(argv)
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def main() -> int:
    """Validate CLI arguments, generate the synthetic set, and write it out."""
    args = parse_args()
    _validate_ranges(args)

    generated = generate_samples(
        seed=int(args.seed),
        n_per_dialect=int(args.n_per_dialect),
        min_tokens=int(args.min_tokens),
        max_tokens=int(args.max_tokens),
        min_markers=int(args.min_markers),
        max_markers=int(args.max_markers),
    )
    write_jsonl(Path(args.out), generated)
    print(f"Wrote {len(generated)} samples to {args.out}")
    return 0


def _validate_ranges(args: argparse.Namespace) -> None:
    """Abort via SystemExit on nonsensical CLI ranges (messages unchanged)."""
    if args.n_per_dialect <= 0:
        raise SystemExit("--n-per-dialect must be > 0")
    if args.min_tokens <= 0 or args.max_tokens < args.min_tokens:
        raise SystemExit("Invalid token range")
    if args.min_markers < 0 or args.max_markers < args.min_markers:
        raise SystemExit("Invalid marker range")
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
if __name__ == "__main__":
|
| 275 |
+
raise SystemExit(main())
|
main.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Entry point for the DialectAnalysis MVP.
|
| 2 |
+
|
| 3 |
+
The main implementation lives in the `dialect_analysis/` package.
|
| 4 |
+
|
| 5 |
+
You can run either:
|
| 6 |
+
- python main.py
|
| 7 |
+
- python -m dialect_analysis
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from dialect_analysis.cli import run_cli
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def main() -> int:
    """Delegate to the package CLI and return its exit code."""
    exit_code = run_cli()
    return exit_code
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
|
| 20 |
+
raise SystemExit(main())
|
| 21 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
samples.jsonl
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal development set (public-domain style snippets; meant for iterative tuning, not evaluation).
|
| 2 |
+
# label: one of Attic/Ionic/Doric/Aeolic/Koine
|
| 3 |
+
|
| 4 |
+
{"id":"attic_tt_1","label":"Attic","text":"ἡ θάλαττα καὶ ἡ γλῶττα· πράττω ἃ δεῖ.","strip_diacritics":true}
|
| 5 |
+
{"id":"ionic_ss_1","label":"Ionic","text":"θάλασσα γλῶσσα τάσσω.","strip_diacritics":true}
|
| 6 |
+
{"id":"koine_fn_1","label":"Koine","text":"ἵνα γνῶτε ὅτι εἰσῆλθεν εἰς τὸν οἶκον.","strip_diacritics":true}
|
| 7 |
+
{"id":"aeolic_inf_1","label":"Aeolic","text":"βουλομαι ποιεμεναι ταδε.","strip_diacritics":true}
|
| 8 |
+
{"id":"doric_inf_1","label":"Doric","text":"βουλομαι ποιεμεν ταδε.","strip_diacritics":true}
|
| 9 |
+
|
| 10 |
+
# Epic/Ionic-like marker example (very small):
|
| 11 |
+
{"id":"ionic_epic_1","label":"Ionic","text":"Ἠελίοιο φαεινοῦ.","strip_diacritics":true}
|
| 12 |
+
|
| 13 |
+
# Homeric epic (Ionic epic language; short excerpt)
|
| 14 |
+
{"id":"homer_od_1","label":"Ionic","text":"Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ πλάγχθη.","strip_diacritics":true}
|
| 15 |
+
{"id":"homer_il_1","label":"Ionic","text":"Μῆνιν ἄειδε, θεά, Πηληϊάδεω Ἀχιλῆος.","strip_diacritics":true}
|
| 16 |
+
|
| 17 |
+
# Attic tragedy (Aeschylus; short excerpt)
|
| 18 |
+
{"id":"aesch_1","label":"Attic","text":"πρῶτον μὲν εὐχῇ τῇδε πρεσβεύω θεῶν.","strip_diacritics":true}
|
| 19 |
+
{"id":"aesch_2","label":"Attic","text":"ἐς τήνδε γαῖαν ἦλθε Παρνησοῦ θ᾽ ἕδρας.","strip_diacritics":true}
|
| 20 |
+
|
| 21 |
+
# Aeolic lyric-style markers (short, with pronoun)
|
| 22 |
+
{"id":"aeolic_pron_1","label":"Aeolic","text":"αμμι δ᾽ ἄνασσα.","strip_diacritics":true}
|
| 23 |
+
|
| 24 |
+
# Doric choral-like morphology (very small synthetic but diagnostic)
|
| 25 |
+
{"id":"doric_1pl_mes_1","label":"Doric","text":"λεγομεσ ταδε.","strip_diacritics":true}
|
| 26 |
+
|
| 27 |
+
# Koine (NT-style; short)
|
| 28 |
+
{"id":"koine_nt_1","label":"Koine","text":"καὶ ἐγένετο ἐν ταῖς ἡμέραις ἐκείναις.","strip_diacritics":true}
|
samples.synthetic.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|