Spaces:
Running
Running
Upload 8 files
Browse files- .gitignore +14 -0
- app.py +74 -0
- evaluate_samples.py +143 -0
- generate_synthetic_samples.py +275 -0
- main.py +21 -0
- requirements.txt +1 -0
- samples.jsonl +28 -0
- samples.synthetic.jsonl +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
|
| 5 |
+
# Virtualenvs
|
| 6 |
+
.venv/
|
| 7 |
+
venv/
|
| 8 |
+
|
| 9 |
+
# OS/editor
|
| 10 |
+
.DS_Store
|
| 11 |
+
.vscode/
|
| 12 |
+
|
| 13 |
+
# Local artifacts
|
| 14 |
+
*.log
|
app.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from typing import Any, Dict, Mapping, Tuple
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
from dialect_analysis.pipeline import classify_text
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def _format_scores(scores: Mapping[str, float]) -> str:
|
| 11 |
+
ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
|
| 12 |
+
return "\n".join(f"- {d}: {pct:.1f}%" for d, pct in ordered)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def analyze_text(text: str, *, strip_diacritics: bool) -> Tuple[str, str, str]:
    """Classify *text* and return (summary, scores, explanation) Markdown strings.

    Blank or whitespace-only input yields three empty strings so the UI clears.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return "", "", ""

    result: Dict[str, Any] = classify_text(cleaned, strip_diacritics=strip_diacritics)

    dialect = str(result.get("dialect", ""))
    # `or 0.0` guards against a None confidence coming back from the pipeline.
    confidence_pct = float(result.get("confidence", 0.0) or 0.0) * 100.0
    score_map: Mapping[str, float] = result.get("scores", {}) or {}
    explanation_text = str(result.get("explanation", ""))

    summary_md = f"**Dialect:** {dialect}\n\n**Confidence:** {confidence_pct:.1f}%"
    scores_md = _format_scores(score_map)
    explanation_md = f"```\n{explanation_text}\n```" if explanation_text else ""

    return summary_md, scores_md, explanation_md
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Build the Gradio UI. `demo` stays module-level so Spaces can discover it.
with gr.Blocks(title="Ancient Greek Dialect Classifier") as demo:
    gr.Markdown(
        "# Ancient Greek Dialect Classifier\n"
        "Rule-based, explainable classifier for Attic / Ionic / Doric / Aeolic / Koine.\n"
        "\nPaste Greek text below and click **Analyze**."
    )

    with gr.Row():
        strip_box = gr.Checkbox(value=True, label="Strip diacritics (recommended)")

    text_input = gr.Textbox(
        label="Greek text",
        lines=12,
        placeholder="Paste Ancient Greek text here…",
    )

    analyze_btn = gr.Button("Analyze")

    with gr.Row():
        summary_out = gr.Markdown(label="Summary")

    with gr.Row():
        scores_out = gr.Markdown(label="Scores")

    explanation_out = gr.Markdown(label="Explanation")

    # Both triggers share the same wiring: button click and textbox submit.
    result_outputs = [summary_out, scores_out, explanation_out]
    analyze_btn.click(
        fn=analyze_text,
        inputs=[text_input, strip_box],
        outputs=result_outputs,
    )
    # Nice-to-have for Spaces: enable Shift+Enter submit.
    text_input.submit(
        fn=analyze_text,
        inputs=[text_input, strip_box],
        outputs=result_outputs,
    )


if __name__ == "__main__":
    demo.launch()
|
evaluate_samples.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
from collections import Counter, defaultdict
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Any, DefaultDict, Dict, Iterable, List, Mapping, Tuple
|
| 9 |
+
|
| 10 |
+
from dialect_analysis.pipeline import classify_text
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass(frozen=True)
class Sample:
    """One labelled evaluation example loaded from a JSONL row."""

    # Identifier used in the MISS report (auto-generated when absent in the file).
    id: str
    # Gold dialect label, e.g. "Attic" or "Koine".
    label: str
    # The Greek passage to classify.
    text: str
    # Whether the classifier should strip diacritics before scoring.
    strip_diacritics: bool = True
    # True for generated samples; lets the report split real vs. synthetic accuracy.
    synthetic: bool = False
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_samples(path: Path) -> List[Sample]:
    """Parse a JSONL sample file, skipping blank lines and '#' comment lines.

    The 1-based line number is used to synthesize an id for rows without one.
    """
    out: List[Sample] = []
    raw_lines = path.read_text(encoding="utf-8").splitlines()
    for lineno, raw in enumerate(raw_lines, start=1):
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        row = json.loads(stripped)
        sample = Sample(
            id=str(row.get("id") or f"sample_{lineno}"),
            label=str(row["label"]),
            text=str(row["text"]),
            strip_diacritics=bool(row.get("strip_diacritics", True)),
            synthetic=bool(row.get("synthetic", False)),
        )
        out.append(sample)
    return out
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse CLI options for the evaluator.

    Args:
        argv: Explicit argument list (useful for tests); defaults to
            ``sys.argv[1:]`` when ``None``, preserving the old behavior.
    """
    p = argparse.ArgumentParser(description="Evaluate dialect classifier against a JSONL sample set.")
    p.add_argument(
        "--samples",
        type=Path,
        default=Path(__file__).with_name("samples.jsonl"),
        help="Path to JSONL file with {id,label,text,strip_diacritics[,synthetic]}",
    )
    return p.parse_args(argv)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def confusion_matrix(rows: Iterable[Tuple[str, str]]) -> Tuple[List[str], List[List[int]]]:
    """Build a confusion matrix from (true_label, predicted_label) pairs.

    Returns:
        ``(labels, matrix)`` where ``labels`` is the sorted union of all true
        and predicted labels, and ``matrix[i][j]`` counts pairs with true
        label ``labels[i]`` and predicted label ``labels[j]``.
    """
    # Materialize first: `rows` may be a one-shot iterator, and we traverse it
    # more than once below. The original exhausted a generator on the first
    # set comprehension and produced an empty/wrong matrix.
    pairs = list(rows)
    labels = sorted({t for t, _ in pairs} | {p for _, p in pairs})
    idx = {label: i for i, label in enumerate(labels)}
    mat = [[0] * len(labels) for _ in labels]
    for true_label, pred_label in pairs:
        mat[idx[true_label]][idx[pred_label]] += 1
    return labels, mat
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def main() -> int:
    """Run the evaluation and print accuracy, confusion matrix, and per-label stats.

    Exit codes: 0 on success, 2 when the sample file is missing or empty.
    """
    args = parse_args()
    samples_path = Path(args.samples)
    if not samples_path.exists():
        print(f"Missing samples file: {samples_path}")
        return 2

    samples = load_samples(samples_path)
    if not samples:
        print("No samples found.")
        return 2

    all_pairs: List[Tuple[str, str]] = []
    real_pairs: List[Tuple[str, str]] = []
    synth_pairs: List[Tuple[str, str]] = []
    hits = 0
    hits_real = 0
    hits_synth = 0
    confidences: List[float] = []
    by_label: DefaultDict[str, Counter[str]] = defaultdict(Counter)

    for sample in samples:
        outcome: Mapping[str, Any] = classify_text(sample.text, strip_diacritics=sample.strip_diacritics)
        predicted = str(outcome.get("dialect", ""))
        confidence = float(outcome.get("confidence", 0.0) or 0.0)
        confidences.append(confidence)

        all_pairs.append((sample.label, predicted))
        hit = predicted == sample.label
        if sample.synthetic:
            synth_pairs.append((sample.label, predicted))
            hits_synth += int(hit)
        else:
            real_pairs.append((sample.label, predicted))
            hits_real += int(hit)
        by_label[sample.label][predicted] += 1

        if hit:
            hits += 1
        else:
            # Report each miss with the two strongest competing scores.
            score_map: Mapping[str, float] = outcome.get("scores", {}) or {}
            top2 = sorted(score_map.items(), key=lambda kv: kv[1], reverse=True)[:2]
            top2_str = ", ".join(f"{d}={pct:.1f}%" for d, pct in top2)
            print(f"MISS {sample.id}: true={sample.label} pred={predicted} conf={confidence*100:.1f}% top2=({top2_str})")

    accuracy = hits / max(1, len(samples))
    mean_conf = sum(confidences) / max(1, len(confidences))

    print("\nSummary")
    print(f"  File: {samples_path.name}")
    print(f"  Samples: {len(samples)}")
    print(f"  Accuracy: {accuracy*100:.1f}%")
    print(f"  Avg confidence: {mean_conf*100:.1f}%")

    # Split accuracy is only meaningful when both subsets are present.
    if real_pairs and synth_pairs:
        acc_real = hits_real / max(1, len(real_pairs))
        acc_synth = hits_synth / max(1, len(synth_pairs))
        print(f"  Accuracy (real): {acc_real*100:.1f}% (n={len(real_pairs)})")
        print(f"  Accuracy (synthetic): {acc_synth*100:.1f}% (n={len(synth_pairs)})")

    labels, mat = confusion_matrix(all_pairs)
    print("\nConfusion matrix (rows=true, cols=pred)")
    print(" " * 14 + " ".join(label[:10].ljust(10) for label in labels))
    for row_i, true_label in enumerate(labels):
        cells = " ".join(str(mat[row_i][col]).ljust(10) for col in range(len(labels)))
        print(true_label[:12].ljust(14) + cells)

    print("\nPer-label predictions")
    for true_label in sorted(by_label):
        counts = by_label[true_label]
        ranked = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        dist = ", ".join(f"{p}:{c}" for p, c in ranked)
        print(f"  {true_label} (n={sum(counts.values())}): {dist}")

    return 0
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
if __name__ == "__main__":
|
| 143 |
+
raise SystemExit(main())
|
generate_synthetic_samples.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import random
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Iterable, List, Mapping, Sequence, Tuple
|
| 9 |
+
|
| 10 |
+
DIALECTS: Tuple[str, ...] = ("Attic", "Ionic", "Doric", "Aeolic", "Koine")
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass(frozen=True)
class DialectRecipe:
    """Ingredients for synthesizing a passage that should score as ``label``."""

    label: str
    # Tokens that strongly (but not exclusively) signal the label in our rule set.
    # These should be diacritic-stripped and sigma-normalized, matching features.py.
    marker_tokens: Tuple[str, ...]
    # Tokens that *tend* to be neutral (particles etc.)
    neutral_tokens: Tuple[str, ...]
    # One or more groups of tokens where we must include at least one token per group.
    # This prevents generating mostly-neutral passages that collapse to Attic by design.
    required_token_groups: Tuple[Tuple[str, ...], ...] = ()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
# IMPORTANT: keep these *unscored* by our feature rules.
|
| 27 |
+
# Do NOT include the particles in dialect_analysis.features.PARTICLES here,
|
| 28 |
+
# or the synthetic set will be systematically biased toward Attic.
|
| 29 |
+
NEUTRAL_TOKENS: Tuple[str, ...] = (
|
| 30 |
+
"και",
|
| 31 |
+
"εστι",
|
| 32 |
+
"ανηρ",
|
| 33 |
+
"γυνη",
|
| 34 |
+
"λογος",
|
| 35 |
+
"εργον",
|
| 36 |
+
"οικος",
|
| 37 |
+
"πολις",
|
| 38 |
+
"θεος",
|
| 39 |
+
"χρονος",
|
| 40 |
+
"βιος",
|
| 41 |
+
"αγαθος",
|
| 42 |
+
"καλος",
|
| 43 |
+
"μεγας",
|
| 44 |
+
"μικρος",
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
# Minimal templates are intentionally simple (no attempt at grammaticality);
|
| 48 |
+
# the goal is to exercise the *feature extractors* at scale.
|
| 49 |
+
RECIPES: Mapping[str, DialectRecipe] = {
|
| 50 |
+
"Attic": DialectRecipe(
|
| 51 |
+
label="Attic",
|
| 52 |
+
marker_tokens=(
|
| 53 |
+
# Lexicalized TT stems
|
| 54 |
+
"θαλαττα",
|
| 55 |
+
"γλωττα",
|
| 56 |
+
"πραττω",
|
| 57 |
+
# Preposition preference (edition-dependent but useful)
|
| 58 |
+
"εσ",
|
| 59 |
+
# Attic dative -ῃ often shows up as plain -ηι after stripping
|
| 60 |
+
"τηιδε",
|
| 61 |
+
),
|
| 62 |
+
required_token_groups=(
|
| 63 |
+
("θαλαττα", "γλωττα", "πραττω"),
|
| 64 |
+
("εσ", "τηιδε"),
|
| 65 |
+
),
|
| 66 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 67 |
+
),
|
| 68 |
+
"Ionic": DialectRecipe(
|
| 69 |
+
label="Ionic",
|
| 70 |
+
marker_tokens=(
|
| 71 |
+
# SS stems
|
| 72 |
+
"θαλασσα",
|
| 73 |
+
"γλωσσα",
|
| 74 |
+
"τασσω",
|
| 75 |
+
# Dative plural -οισι and epic endings
|
| 76 |
+
"λογοισι",
|
| 77 |
+
"ηελιοιο",
|
| 78 |
+
"αχιληοσ",
|
| 79 |
+
"πηληιαδεω",
|
| 80 |
+
# Epic particles/words
|
| 81 |
+
"αρ",
|
| 82 |
+
"μιν",
|
| 83 |
+
"εννεπε",
|
| 84 |
+
"μουσα",
|
| 85 |
+
"μηνιν",
|
| 86 |
+
"αειδε",
|
| 87 |
+
"θεα",
|
| 88 |
+
),
|
| 89 |
+
required_token_groups=(
|
| 90 |
+
("λογοισι", "ηελιοιο", "αχιληοσ", "πηληιαδεω"),
|
| 91 |
+
("αρ", "μιν"),
|
| 92 |
+
("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα"),
|
| 93 |
+
),
|
| 94 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 95 |
+
),
|
| 96 |
+
"Doric": DialectRecipe(
|
| 97 |
+
label="Doric",
|
| 98 |
+
marker_tokens=(
|
| 99 |
+
# Infinitive -μεν
|
| 100 |
+
"ποιεμεν",
|
| 101 |
+
# 1pl -μες (sigma-normalized -μεσ)
|
| 102 |
+
"λεγομεσ",
|
| 103 |
+
# Mild Doric-ish article form sometimes represented with rough breathing;
|
| 104 |
+
# but we avoid diacritics here. Keep other cues carrying the recipe.
|
| 105 |
+
),
|
| 106 |
+
required_token_groups=(("ποιεμεν",), ("λεγομεσ",)),
|
| 107 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 108 |
+
),
|
| 109 |
+
"Aeolic": DialectRecipe(
|
| 110 |
+
label="Aeolic",
|
| 111 |
+
marker_tokens=(
|
| 112 |
+
# Aeolic pronoun forms
|
| 113 |
+
"αμμι",
|
| 114 |
+
"υμμι",
|
| 115 |
+
# Infinitive -μεναι
|
| 116 |
+
"ποιεμεναι",
|
| 117 |
+
),
|
| 118 |
+
required_token_groups=(("ποιεμεναι",), ("αμμι", "υμμι")),
|
| 119 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 120 |
+
),
|
| 121 |
+
"Koine": DialectRecipe(
|
| 122 |
+
label="Koine",
|
| 123 |
+
marker_tokens=(
|
| 124 |
+
# Koine-ish function words
|
| 125 |
+
"ινα",
|
| 126 |
+
"οτι",
|
| 127 |
+
"καθωσ",
|
| 128 |
+
"εγενετο",
|
| 129 |
+
# Preposition preference
|
| 130 |
+
"εισ",
|
| 131 |
+
),
|
| 132 |
+
required_token_groups=(("εγενετο",), ("ινα", "οτι", "καθωσ", "εισ")),
|
| 133 |
+
neutral_tokens=NEUTRAL_TOKENS,
|
| 134 |
+
),
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _choose_markers(rng: random.Random, markers: Sequence[str], *, min_markers: int, max_markers: int) -> List[str]:
|
| 139 |
+
k = rng.randint(min_markers, max_markers)
|
| 140 |
+
if k <= 0:
|
| 141 |
+
return []
|
| 142 |
+
if k >= len(markers):
|
| 143 |
+
# Shuffle copy
|
| 144 |
+
out = list(markers)
|
| 145 |
+
rng.shuffle(out)
|
| 146 |
+
return out
|
| 147 |
+
return rng.sample(list(markers), k=k)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _choose_neutrals(rng: random.Random, neutrals: Sequence[str], *, count: int) -> List[str]:
|
| 151 |
+
if count <= 0:
|
| 152 |
+
return []
|
| 153 |
+
return [rng.choice(list(neutrals)) for _ in range(count)]
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def _make_text(
    rng: random.Random,
    recipe: DialectRecipe,
    *,
    min_tokens: int,
    max_tokens: int,
    min_markers: int,
    max_markers: int,
) -> str:
    """Assemble one synthetic passage for ``recipe`` as a space-joined token string.

    Guarantees one token from every required group, tops up with random
    markers, pads to the target length with neutral fillers, then shuffles.
    """
    target_len = rng.randint(min_tokens, max_tokens)

    # One mandatory pick per required group keeps the passage diagnostic.
    mandatory: List[str] = [rng.choice(list(group)) for group in recipe.required_token_groups if group]

    extra = _choose_markers(
        rng,
        recipe.marker_tokens,
        min_markers=max(0, min_markers - len(mandatory)),
        max_markers=max(0, max_markers - len(mandatory)),
    )
    marker_pool = mandatory + extra

    # Pad with neutral fillers up to the target length, then mix everything.
    fillers = _choose_neutrals(rng, recipe.neutral_tokens, count=max(0, target_len - len(marker_pool)))
    tokens = marker_pool + fillers
    rng.shuffle(tokens)

    # Small chance of repeating a marker (to emulate multiple feature hits).
    if marker_pool and rng.random() < 0.25:
        tokens.insert(rng.randrange(0, len(tokens) + 1), rng.choice(marker_pool))

    return " ".join(tokens)
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def generate_samples(
    *,
    seed: int,
    n_per_dialect: int,
    min_tokens: int,
    max_tokens: int,
    min_markers: int,
    max_markers: int,
) -> List[Dict[str, object]]:
    """Produce ``n_per_dialect`` labelled synthetic rows per dialect.

    A single seeded PRNG drives both text generation and the final shuffle,
    so identical arguments always reproduce the same file.
    """
    rng = random.Random(seed)
    rows: List[Dict[str, object]] = []

    for label in DIALECTS:
        recipe = RECIPES[label]
        for idx in range(n_per_dialect):
            text = _make_text(
                rng,
                recipe,
                min_tokens=min_tokens,
                max_tokens=max_tokens,
                min_markers=min_markers,
                max_markers=max_markers,
            )
            rows.append(
                {
                    "id": f"synthetic_{label.lower()}_{idx+1}",
                    "label": label,
                    "text": text,
                    "strip_diacritics": True,
                    "synthetic": True,
                    "seed": seed,
                }
            )

    # Stable-ish shuffle to avoid grouped labels in file
    rng.shuffle(rows)
    return rows
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def write_jsonl(path: Path, rows: Iterable[Mapping[str, object]]) -> None:
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines (one compact object per line).

    An empty iterable now produces an empty file; the original wrote a stray
    lone newline ("\\n".join([]) + "\\n").
    """
    lines = [json.dumps(dict(row), ensure_ascii=False) for row in rows]
    payload = "\n".join(lines) + "\n" if lines else ""
    path.write_text(payload, encoding="utf-8")
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse CLI options for the generator.

    Args:
        argv: Explicit argument list (useful for tests); defaults to
            ``sys.argv[1:]`` when ``None``, preserving the old behavior.
    """
    p = argparse.ArgumentParser(description="Generate a large synthetic dialect sample set (JSONL).")
    p.add_argument("--out", type=Path, default=Path("samples.synthetic.jsonl"), help="Output JSONL path")
    p.add_argument("--seed", type=int, default=1, help="PRNG seed for reproducibility")
    p.add_argument("--n-per-dialect", type=int, default=200, help="How many samples to generate per dialect")
    p.add_argument("--min-tokens", type=int, default=30)
    p.add_argument("--max-tokens", type=int, default=60)
    p.add_argument("--min-markers", type=int, default=4)
    p.add_argument("--max-markers", type=int, default=8)
    return p.parse_args(argv)
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def main() -> int:
    """Validate CLI arguments, generate the synthetic set, and write it out."""
    args = parse_args()
    _validate_ranges(args)

    generated = generate_samples(
        seed=int(args.seed),
        n_per_dialect=int(args.n_per_dialect),
        min_tokens=int(args.min_tokens),
        max_tokens=int(args.max_tokens),
        min_markers=int(args.min_markers),
        max_markers=int(args.max_markers),
    )
    write_jsonl(Path(args.out), generated)
    print(f"Wrote {len(generated)} samples to {args.out}")
    return 0


def _validate_ranges(args: argparse.Namespace) -> None:
    """Abort via SystemExit on nonsensical CLI ranges (messages unchanged)."""
    if args.n_per_dialect <= 0:
        raise SystemExit("--n-per-dialect must be > 0")
    if args.min_tokens <= 0 or args.max_tokens < args.min_tokens:
        raise SystemExit("Invalid token range")
    if args.min_markers < 0 or args.max_markers < args.min_markers:
        raise SystemExit("Invalid marker range")
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
if __name__ == "__main__":
|
| 275 |
+
raise SystemExit(main())
|
main.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Entry point for the DialectAnalysis MVP.
|
| 2 |
+
|
| 3 |
+
The main implementation lives in the `dialect_analysis/` package.
|
| 4 |
+
|
| 5 |
+
You can run either:
|
| 6 |
+
- python main.py
|
| 7 |
+
- python -m dialect_analysis
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from dialect_analysis.cli import run_cli
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def main() -> int:
    """Delegate to the package CLI and return its exit code."""
    exit_code = run_cli()
    return exit_code
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
if __name__ == "__main__":
|
| 20 |
+
raise SystemExit(main())
|
| 21 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gradio>=4.0.0
|
samples.jsonl
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Minimal development set (public-domain style snippets; meant for iterative tuning, not evaluation).
|
| 2 |
+
# label: one of Attic/Ionic/Doric/Aeolic/Koine
|
| 3 |
+
|
| 4 |
+
{"id":"attic_tt_1","label":"Attic","text":"ἡ θάλαττα καὶ ἡ γλῶττα· πράττω ἃ δεῖ.","strip_diacritics":true}
|
| 5 |
+
{"id":"ionic_ss_1","label":"Ionic","text":"θάλασσα γλῶσσα τάσσω.","strip_diacritics":true}
|
| 6 |
+
{"id":"koine_fn_1","label":"Koine","text":"ἵνα γνῶτε ὅτι εἰσῆλθεν εἰς τὸν οἶκον.","strip_diacritics":true}
|
| 7 |
+
{"id":"aeolic_inf_1","label":"Aeolic","text":"βουλομαι ποιεμεναι ταδε.","strip_diacritics":true}
|
| 8 |
+
{"id":"doric_inf_1","label":"Doric","text":"βουλομαι ποιεμεν ταδε.","strip_diacritics":true}
|
| 9 |
+
|
| 10 |
+
# Epic/Ionic-like marker example (very small):
|
| 11 |
+
{"id":"ionic_epic_1","label":"Ionic","text":"Ἠελίοιο φαεινοῦ.","strip_diacritics":true}
|
| 12 |
+
|
| 13 |
+
# Homeric epic (Ionic epic language; short excerpt)
|
| 14 |
+
{"id":"homer_od_1","label":"Ionic","text":"Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ πλάγχθη.","strip_diacritics":true}
|
| 15 |
+
{"id":"homer_il_1","label":"Ionic","text":"Μῆνιν ἄειδε, θεά, Πηληϊάδεω Ἀχιλῆος.","strip_diacritics":true}
|
| 16 |
+
|
| 17 |
+
# Attic tragedy (Aeschylus; short excerpt)
|
| 18 |
+
{"id":"aesch_1","label":"Attic","text":"πρῶτον μὲν εὐχῇ τῇδε πρεσβεύω θεῶν.","strip_diacritics":true}
|
| 19 |
+
{"id":"aesch_2","label":"Attic","text":"ἐς τήνδε γαῖαν ἦλθε Παρνησοῦ θ᾽ ἕδρας.","strip_diacritics":true}
|
| 20 |
+
|
| 21 |
+
# Aeolic lyric-style markers (short, with pronoun)
|
| 22 |
+
{"id":"aeolic_pron_1","label":"Aeolic","text":"αμμι δ᾽ ἄνασσα.","strip_diacritics":true}
|
| 23 |
+
|
| 24 |
+
# Doric choral-like morphology (very small synthetic but diagnostic)
|
| 25 |
+
{"id":"doric_1pl_mes_1","label":"Doric","text":"λεγομεσ ταδε.","strip_diacritics":true}
|
| 26 |
+
|
| 27 |
+
# Koine (NT-style; short)
|
| 28 |
+
{"id":"koine_nt_1","label":"Koine","text":"καὶ ἐγένετο ἐν ταῖς ἡμέραις ἐκείναις.","strip_diacritics":true}
|
samples.synthetic.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|