thomascerniglia committed on
Commit
05b9702
·
verified ·
1 Parent(s): d1fb3cf

Upload 8 files

Browse files
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+
5
+ # Virtualenvs
6
+ .venv/
7
+ venv/
8
+
9
+ # OS/editor
10
+ .DS_Store
11
+ .vscode/
12
+
13
+ # Local artifacts
14
+ *.log
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Mapping, Tuple
4
+
5
+ import gradio as gr
6
+
7
+ from dialect_analysis.pipeline import classify_text
8
+
9
+
10
+ def _format_scores(scores: Mapping[str, float]) -> str:
11
+ ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
12
+ return "\n".join(f"- {d}: {pct:.1f}%" for d, pct in ordered)
13
+
14
+
15
def analyze_text(text: str, *, strip_diacritics: bool) -> Tuple[str, str, str]:
    """Classify *text* and return (summary, scores, explanation) Markdown strings.

    Blank or whitespace-only input short-circuits to three empty strings
    without invoking the classifier.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return "", "", ""

    result: Dict[str, Any] = classify_text(cleaned, strip_diacritics=strip_diacritics)

    dialect = str(result.get("dialect", ""))
    # Confidence arrives as a 0..1 fraction; present it as a percentage.
    confidence_pct = float(result.get("confidence", 0.0) or 0.0) * 100.0
    score_map: Mapping[str, float] = result.get("scores", {}) or {}
    explanation = str(result.get("explanation", ""))

    summary_md = f"**Dialect:** {dialect}\n\n**Confidence:** {confidence_pct:.1f}%"
    scores_md = _format_scores(score_map)
    explanation_md = f"```\n{explanation}\n```" if explanation else ""

    return summary_md, scores_md, explanation_md
31
+
32
+
33
# Gradio UI wiring. `demo` is built at import time (so a hosting platform can
# discover it) and launched only when this file runs as a script.
with gr.Blocks(title="Ancient Greek Dialect Classifier") as demo:
    gr.Markdown(
        "# Ancient Greek Dialect Classifier\n"
        "Rule-based, explainable classifier for Attic / Ionic / Doric / Aeolic / Koine.\n"
        "\nPaste Greek text below and click **Analyze**."
    )

    with gr.Row():
        # Keyword-only flag forwarded to analyze_text().
        strip = gr.Checkbox(value=True, label="Strip diacritics (recommended)")

    inp = gr.Textbox(
        label="Greek text",
        lines=12,
        placeholder="Paste Ancient Greek text here…",
    )

    btn = gr.Button("Analyze")

    # Three Markdown panes receive the tuple returned by analyze_text().
    with gr.Row():
        summary = gr.Markdown(label="Summary")

    with gr.Row():
        scores = gr.Markdown(label="Scores")

    explanation = gr.Markdown(label="Explanation")

    btn.click(
        fn=analyze_text,
        inputs=[inp, strip],
        outputs=[summary, scores, explanation],
    )

    # Nice-to-have for Spaces: enable Shift+Enter submit.
    # Same handler and outputs as the button click above.
    inp.submit(
        fn=analyze_text,
        inputs=[inp, strip],
        outputs=[summary, scores, explanation],
    )


if __name__ == "__main__":
    demo.launch()
evaluate_samples.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from collections import Counter, defaultdict
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, DefaultDict, Dict, Iterable, List, Mapping, Tuple
9
+
10
+ from dialect_analysis.pipeline import classify_text
11
+
12
+
13
@dataclass(frozen=True)
class Sample:
    """One labeled evaluation example parsed from a JSONL sample file."""

    id: str  # unique sample identifier (auto-generated when absent in the file)
    label: str  # gold dialect label, e.g. "Attic"
    text: str  # raw Greek text passed to the classifier
    strip_diacritics: bool = True  # forwarded to classify_text()
    synthetic: bool = False  # True for generated (non-corpus) samples
20
+
21
+
22
def load_samples(path: Path) -> List[Sample]:
    """Parse a JSONL sample file, skipping blank lines and '#' comment lines."""
    loaded: List[Sample] = []
    raw_lines = path.read_text(encoding="utf-8").splitlines()
    for lineno, raw in enumerate(raw_lines, start=1):
        stripped = raw.strip()
        if not stripped or stripped.startswith("#"):
            continue
        record = json.loads(stripped)
        sample = Sample(
            id=str(record.get("id") or f"sample_{lineno}"),
            label=str(record["label"]),
            text=str(record["text"]),
            strip_diacritics=bool(record.get("strip_diacritics", True)),
            synthetic=bool(record.get("synthetic", False)),
        )
        loaded.append(sample)
    return loaded
39
+
40
+
41
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="Evaluate dialect classifier against a JSONL sample set."
    )
    default_samples = Path(__file__).with_name("samples.jsonl")
    parser.add_argument(
        "--samples",
        type=Path,
        default=default_samples,
        help="Path to JSONL file with {id,label,text,strip_diacritics[,synthetic]}",
    )
    return parser.parse_args()
50
+
51
+
52
def confusion_matrix(rows: Iterable[Tuple[str, str]]) -> Tuple[List[str], List[List[int]]]:
    """Build a confusion matrix from (true_label, predicted_label) pairs.

    Returns:
        ``(labels, matrix)`` where ``labels`` is the sorted union of all true
        and predicted labels, and ``matrix[i][j]`` counts samples whose true
        label is ``labels[i]`` and predicted label is ``labels[j]``.
    """
    # Materialize first: the parameter is an Iterable and we traverse it more
    # than once (label scan + counting loop). The original consumed the input
    # three times, so a generator argument produced an empty/wrong matrix.
    pairs = list(rows)
    labels = sorted({t for t, _ in pairs} | {p for _, p in pairs})
    idx = {label: i for i, label in enumerate(labels)}
    mat = [[0 for _ in labels] for _ in labels]
    for true_label, pred_label in pairs:
        mat[idx[true_label]][idx[pred_label]] += 1
    return labels, mat
59
+
60
+
61
def main() -> int:
    """Run the classifier over every sample and print accuracy diagnostics.

    Returns a process exit code: 0 on success, 2 when the sample file is
    missing or contains no usable samples.
    """
    args = parse_args()
    path = Path(args.samples)
    if not path.exists():
        print(f"Missing samples file: {path}")
        return 2

    samples = load_samples(path)
    if not samples:
        print("No samples found.")
        return 2

    # (true_label, predicted_label) pairs for the confusion matrix.
    pairs: List[Tuple[str, str]] = []
    correct = 0
    confidences: List[float] = []

    # Parallel tracking so real-corpus and synthetic accuracy can be
    # reported separately when both kinds are present.
    pairs_real: List[Tuple[str, str]] = []
    pairs_synth: List[Tuple[str, str]] = []
    correct_real = 0
    correct_synth = 0

    # Per-true-label distribution of predicted labels.
    by_label: DefaultDict[str, Counter[str]] = defaultdict(Counter)

    for s in samples:
        result: Mapping[str, Any] = classify_text(s.text, strip_diacritics=s.strip_diacritics)
        pred = str(result.get("dialect", ""))
        conf = float(result.get("confidence", 0.0) or 0.0)
        confidences.append(conf)

        pairs.append((s.label, pred))
        if s.synthetic:
            pairs_synth.append((s.label, pred))
            if pred == s.label:
                correct_synth += 1
        else:
            pairs_real.append((s.label, pred))
            if pred == s.label:
                correct_real += 1
        by_label[s.label][pred] += 1
        if pred == s.label:
            correct += 1
        else:
            # Print each miss with its two highest-scoring dialects for triage.
            scores: Mapping[str, float] = result.get("scores", {}) or {}
            top2 = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:2]
            top2_str = ", ".join(f"{d}={pct:.1f}%" for d, pct in top2)
            print(f"MISS {s.id}: true={s.label} pred={pred} conf={conf*100:.1f}% top2=({top2_str})")

    # max(1, ...) guards the division, though samples is non-empty here.
    acc = correct / max(1, len(samples))
    avg_conf = sum(confidences) / max(1, len(confidences))

    print("\nSummary")
    print(f" File: {path.name}")
    print(f" Samples: {len(samples)}")
    print(f" Accuracy: {acc*100:.1f}%")
    print(f" Avg confidence: {avg_conf*100:.1f}%")

    # Only split the report when the file actually mixes both kinds.
    if pairs_real and pairs_synth:
        acc_real = correct_real / max(1, len(pairs_real))
        acc_synth = correct_synth / max(1, len(pairs_synth))
        print(f" Accuracy (real): {acc_real*100:.1f}% (n={len(pairs_real)})")
        print(f" Accuracy (synthetic): {acc_synth*100:.1f}% (n={len(pairs_synth)})")

    labels, mat = confusion_matrix(pairs)
    print("\nConfusion matrix (rows=true, cols=pred)")
    # Fixed-width columns: 14 chars for the row label, 10 per prediction column.
    header = "".ljust(14) + " ".join(l[:10].ljust(10) for l in labels)
    print(header)
    for i, true_label in enumerate(labels):
        row = " ".join(str(mat[i][j]).ljust(10) for j in range(len(labels)))
        print(true_label[:12].ljust(14) + row)

    print("\nPer-label predictions")
    for true_label in sorted(by_label.keys()):
        counts = by_label[true_label]
        total = sum(counts.values())
        ordered = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
        dist = ", ".join(f"{p}:{c}" for p, c in ordered)
        print(f" {true_label} (n={total}): {dist}")

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
generate_synthetic_samples.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import random
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Dict, Iterable, List, Mapping, Sequence, Tuple
9
+
10
# Canonical label set; generation in generate_samples() iterates this order.
DIALECTS: Tuple[str, ...] = ("Attic", "Ionic", "Doric", "Aeolic", "Koine")
11
+
12
+
13
@dataclass(frozen=True)
class DialectRecipe:
    """Token ingredients used to synthesize passages for one dialect label."""

    label: str
    # Tokens that strongly (but not exclusively) signal the label in our rule set.
    # These should be diacritic-stripped and sigma-normalized, matching features.py.
    marker_tokens: Tuple[str, ...]
    # Tokens that *tend* to be neutral (particles etc.)
    neutral_tokens: Tuple[str, ...]
    # One or more groups of tokens where we must include at least one token per group.
    # This prevents generating mostly-neutral passages that collapse to Attic by design.
    required_token_groups: Tuple[Tuple[str, ...], ...] = ()
24
+
25
+
26
# IMPORTANT: keep these *unscored* by our feature rules.
# Do NOT include the particles in dialect_analysis.features.PARTICLES here,
# or the synthetic set will be systematically biased toward Attic.
# All entries are written without diacritics, so they pass through stripping unchanged.
NEUTRAL_TOKENS: Tuple[str, ...] = (
    "και",
    "εστι",
    "ανηρ",
    "γυνη",
    "λογος",
    "εργον",
    "οικος",
    "πολις",
    "θεος",
    "χρονος",
    "βιος",
    "αγαθος",
    "καλος",
    "μεγας",
    "μικρος",
)
46
+
47
# Minimal templates are intentionally simple (no attempt at grammaticality);
# the goal is to exercise the *feature extractors* at scale.
# Keys must cover every entry in DIALECTS: generate_samples() indexes RECIPES[label].
RECIPES: Mapping[str, DialectRecipe] = {
    "Attic": DialectRecipe(
        label="Attic",
        marker_tokens=(
            # Lexicalized TT stems
            "θαλαττα",
            "γλωττα",
            "πραττω",
            # Preposition preference (edition-dependent but useful)
            "εσ",
            # Attic dative -ῃ often shows up as plain -ηι after stripping
            "τηιδε",
        ),
        required_token_groups=(
            ("θαλαττα", "γλωττα", "πραττω"),
            ("εσ", "τηιδε"),
        ),
        neutral_tokens=NEUTRAL_TOKENS,
    ),
    "Ionic": DialectRecipe(
        label="Ionic",
        marker_tokens=(
            # SS stems
            "θαλασσα",
            "γλωσσα",
            "τασσω",
            # Dative plural -οισι and epic endings
            "λογοισι",
            "ηελιοιο",
            "αχιληοσ",
            "πηληιαδεω",
            # Epic particles/words
            "αρ",
            "μιν",
            "εννεπε",
            "μουσα",
            "μηνιν",
            "αειδε",
            "θεα",
        ),
        required_token_groups=(
            ("λογοισι", "ηελιοιο", "αχιληοσ", "πηληιαδεω"),
            ("αρ", "μιν"),
            ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα"),
        ),
        neutral_tokens=NEUTRAL_TOKENS,
    ),
    "Doric": DialectRecipe(
        label="Doric",
        marker_tokens=(
            # Infinitive -μεν
            "ποιεμεν",
            # 1pl -μες (sigma-normalized -μεσ)
            "λεγομεσ",
            # Mild Doric-ish article form sometimes represented with rough breathing;
            # but we avoid diacritics here. Keep other cues carrying the recipe.
        ),
        required_token_groups=(("ποιεμεν",), ("λεγομεσ",)),
        neutral_tokens=NEUTRAL_TOKENS,
    ),
    "Aeolic": DialectRecipe(
        label="Aeolic",
        marker_tokens=(
            # Aeolic pronoun forms
            "αμμι",
            "υμμι",
            # Infinitive -μεναι
            "ποιεμεναι",
        ),
        required_token_groups=(("ποιεμεναι",), ("αμμι", "υμμι")),
        neutral_tokens=NEUTRAL_TOKENS,
    ),
    "Koine": DialectRecipe(
        label="Koine",
        marker_tokens=(
            # Koine-ish function words
            "ινα",
            "οτι",
            "καθωσ",
            "εγενετο",
            # Preposition preference
            "εισ",
        ),
        required_token_groups=(("εγενετο",), ("ινα", "οτι", "καθωσ", "εισ")),
        neutral_tokens=NEUTRAL_TOKENS,
    ),
}
136
+
137
+
138
+ def _choose_markers(rng: random.Random, markers: Sequence[str], *, min_markers: int, max_markers: int) -> List[str]:
139
+ k = rng.randint(min_markers, max_markers)
140
+ if k <= 0:
141
+ return []
142
+ if k >= len(markers):
143
+ # Shuffle copy
144
+ out = list(markers)
145
+ rng.shuffle(out)
146
+ return out
147
+ return rng.sample(list(markers), k=k)
148
+
149
+
150
+ def _choose_neutrals(rng: random.Random, neutrals: Sequence[str], *, count: int) -> List[str]:
151
+ if count <= 0:
152
+ return []
153
+ return [rng.choice(list(neutrals)) for _ in range(count)]
154
+
155
+
156
def _make_text(
    rng: random.Random,
    recipe: DialectRecipe,
    *,
    min_tokens: int,
    max_tokens: int,
    min_markers: int,
    max_markers: int,
) -> str:
    """Build one space-joined synthetic passage for *recipe*.

    Total length is uniform in [min_tokens, max_tokens]; marker count
    (including one mandatory token per required group) is bounded by
    [min_markers, max_markers]. Deterministic for a given rng state.
    """
    n = rng.randint(min_tokens, max_tokens)

    # One token from each required group guarantees the passage carries
    # at least some diagnostic signal for its label.
    required: List[str] = []
    for group in recipe.required_token_groups:
        if group:
            required.append(rng.choice(list(group)))

    # Budget for extra markers is reduced by what the groups already supplied.
    markers = _choose_markers(
        rng,
        recipe.marker_tokens,
        min_markers=max(0, min_markers - len(required)),
        max_markers=max(0, max_markers - len(required)),
    )
    markers = required + markers

    # Fill the rest with neutrals.
    remaining = max(0, n - len(markers))
    neutrals = _choose_neutrals(rng, recipe.neutral_tokens, count=remaining)

    # Mix and add a tiny amount of noise by shuffling.
    tokens = markers + neutrals
    rng.shuffle(tokens)

    # Add a small chance of repeating a marker (to emulate multiple hits).
    if markers and rng.random() < 0.25:
        tokens.insert(rng.randrange(0, len(tokens) + 1), rng.choice(markers))

    return " ".join(tokens)
193
+
194
+
195
def generate_samples(
    *,
    seed: int,
    n_per_dialect: int,
    min_tokens: int,
    max_tokens: int,
    min_markers: int,
    max_markers: int,
) -> List[Dict[str, object]]:
    """Produce n_per_dialect synthetic rows per dialect, shuffled, seeded by *seed*."""
    rng = random.Random(seed)

    generated: List[Dict[str, object]] = []
    for label in DIALECTS:
        recipe = RECIPES[label]
        for ordinal in range(1, n_per_dialect + 1):
            passage = _make_text(
                rng,
                recipe,
                min_tokens=min_tokens,
                max_tokens=max_tokens,
                min_markers=min_markers,
                max_markers=max_markers,
            )
            generated.append(
                {
                    "id": f"synthetic_{label.lower()}_{ordinal}",
                    "label": label,
                    "text": passage,
                    "strip_diacritics": True,
                    "synthetic": True,
                    "seed": seed,
                }
            )

    # Shuffle with the same seeded rng so file order stays reproducible
    # and labels are not grouped together in the output.
    rng.shuffle(generated)
    return generated
232
+
233
+
234
def write_jsonl(path: Path, rows: Iterable[Mapping[str, object]]) -> None:
    """Write *rows* to *path* as UTF-8 JSON Lines with a trailing newline."""
    serialized = "\n".join(json.dumps(dict(row), ensure_ascii=False) for row in rows)
    path.write_text(serialized + "\n", encoding="utf-8")
237
+
238
+
239
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options for the generator."""
    parser = argparse.ArgumentParser(
        description="Generate a large synthetic dialect sample set (JSONL)."
    )
    parser.add_argument("--out", type=Path, default=Path("samples.synthetic.jsonl"), help="Output JSONL path")
    parser.add_argument("--seed", type=int, default=1, help="PRNG seed for reproducibility")
    parser.add_argument("--n-per-dialect", type=int, default=200, help="How many samples to generate per dialect")
    # Token/marker bounds are validated in main() before generation.
    parser.add_argument("--min-tokens", type=int, default=30)
    parser.add_argument("--max-tokens", type=int, default=60)
    parser.add_argument("--min-markers", type=int, default=4)
    parser.add_argument("--max-markers", type=int, default=8)
    return parser.parse_args()
249
+
250
+
251
def main() -> int:
    """Validate CLI options, generate the synthetic set, and write it to disk."""
    args = parse_args()

    # Reject option combinations that generate_samples() cannot honor.
    if args.n_per_dialect <= 0:
        raise SystemExit("--n-per-dialect must be > 0")
    if args.min_tokens <= 0 or args.max_tokens < args.min_tokens:
        raise SystemExit("Invalid token range")
    if args.min_markers < 0 or args.max_markers < args.min_markers:
        raise SystemExit("Invalid marker range")

    samples = generate_samples(
        seed=int(args.seed),
        n_per_dialect=int(args.n_per_dialect),
        min_tokens=int(args.min_tokens),
        max_tokens=int(args.max_tokens),
        min_markers=int(args.min_markers),
        max_markers=int(args.max_markers),
    )
    write_jsonl(Path(args.out), samples)
    print(f"Wrote {len(samples)} samples to {args.out}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
main.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Entry point for the DialectAnalysis MVP.
2
+
3
+ The main implementation lives in the `dialect_analysis/` package.
4
+
5
+ You can run either:
6
+ - python main.py
7
+ - python -m dialect_analysis
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from dialect_analysis.cli import run_cli
13
+
14
+
15
def main() -> int:
    """Delegate to the package CLI and return its exit status."""
    return run_cli()


if __name__ == "__main__":
    raise SystemExit(main())
21
+
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio>=4.0.0
samples.jsonl ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Minimal development set (public-domain style snippets; meant for iterative tuning, not evaluation).
2
+ # label: one of Attic/Ionic/Doric/Aeolic/Koine
3
+
4
+ {"id":"attic_tt_1","label":"Attic","text":"ἡ θάλαττα καὶ ἡ γλῶττα· πράττω ἃ δεῖ.","strip_diacritics":true}
5
+ {"id":"ionic_ss_1","label":"Ionic","text":"θάλασσα γλῶσσα τάσσω.","strip_diacritics":true}
6
+ {"id":"koine_fn_1","label":"Koine","text":"ἵνα γνῶτε ὅτι εἰσῆλθεν εἰς τὸν οἶκον.","strip_diacritics":true}
7
+ {"id":"aeolic_inf_1","label":"Aeolic","text":"βουλομαι ποιεμεναι ταδε.","strip_diacritics":true}
8
+ {"id":"doric_inf_1","label":"Doric","text":"βουλομαι ποιεμεν ταδε.","strip_diacritics":true}
9
+
10
+ # Epic/Ionic-like marker example (very small):
11
+ {"id":"ionic_epic_1","label":"Ionic","text":"Ἠελίοιο φαεινοῦ.","strip_diacritics":true}
12
+
13
+ # Homeric epic (Ionic epic language; short excerpt)
14
+ {"id":"homer_od_1","label":"Ionic","text":"Ἄνδρα μοι ἔννεπε, Μοῦσα, πολύτροπον, ὃς μάλα πολλὰ πλάγχθη.","strip_diacritics":true}
15
+ {"id":"homer_il_1","label":"Ionic","text":"Μῆνιν ἄειδε, θεά, Πηληϊάδεω Ἀχιλῆος.","strip_diacritics":true}
16
+
17
+ # Attic tragedy (Aeschylus; short excerpt)
18
+ {"id":"aesch_1","label":"Attic","text":"πρῶτον μὲν εὐχῇ τῇδε πρεσβεύω θεῶν.","strip_diacritics":true}
19
+ {"id":"aesch_2","label":"Attic","text":"ἐς τήνδε γαῖαν ἦλθε Παρνησοῦ θ᾽ ἕδρας.","strip_diacritics":true}
20
+
21
+ # Aeolic lyric-style markers (short, with pronoun)
22
+ {"id":"aeolic_pron_1","label":"Aeolic","text":"αμμι δ᾽ ἄνασσα.","strip_diacritics":true}
23
+
24
+ # Doric choral-like morphology (very small synthetic but diagnostic)
25
+ {"id":"doric_1pl_mes_1","label":"Doric","text":"λεγομεσ ταδε.","strip_diacritics":true}
26
+
27
+ # Koine (NT-style; short)
28
+ {"id":"koine_nt_1","label":"Koine","text":"καὶ ἐγένετο ἐν ταῖς ἡμέραις ἐκείναις.","strip_diacritics":true}
samples.synthetic.jsonl ADDED
The diff for this file is too large to render. See raw diff