thomascerniglia committed on
Commit
d0326ea
·
verified ·
1 Parent(s): ba2bf14

Upload 8 files

Browse files
dialect_analysis/__main__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations

from .cli import run_cli


def main() -> int:
    """Console entry point: run the CLI and return its exit code."""
    return run_cli()


if __name__ == "__main__":
    # SystemExit carries main()'s int return value as the process exit code.
    raise SystemExit(main())
dialect_analysis/cli.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from typing import List
5
+
6
+ from .pipeline import classify_text
7
+ from .scoring import DIALECTS
8
+
9
+
10
+ def _decode_stdin_bytes(data: bytes) -> str:
11
+ """Decode piped stdin bytes robustly on Windows/PowerShell.
12
+
13
+ PowerShell (especially Windows PowerShell 5.x) may pipe text to native
14
+ executables as UTF-16LE, which can appear in Python as NUL-padded bytes or
15
+ mojibake if decoded with a legacy code page.
16
+ """
17
+
18
+ if not data:
19
+ return ""
20
+
21
+ # Heuristic: lots of NUL bytes strongly suggests UTF-16.
22
+ nul_ratio = data.count(b"\x00") / max(1, len(data))
23
+ if nul_ratio > 0.10:
24
+ for enc in ("utf-16", "utf-16-le", "utf-16-be"):
25
+ try:
26
+ return data.decode(enc)
27
+ except UnicodeDecodeError:
28
+ continue
29
+
30
+ # Otherwise, try UTF-8 first (common in PowerShell 7+), then UTF-16 just in case.
31
+ for enc in ("utf-8-sig", "utf-8", "utf-16", "utf-16-le", "utf-16-be"):
32
+ try:
33
+ return data.decode(enc)
34
+ except UnicodeDecodeError:
35
+ continue
36
+
37
+ # Fallback: replace undecodable bytes.
38
+ return data.decode("utf-8", errors="replace")
39
+
40
+
41
def read_multiline_stdin() -> str:
    """Read multi-line input.

    - If text is piped in, read all of stdin.
    - If interactive, read until an empty line or EOF.
    """

    # Piped/redirected input: consume the raw bytes and decode defensively.
    if not sys.stdin.isatty():
        return _decode_stdin_bytes(sys.stdin.buffer.read())

    # Interactive session: collect lines until a blank line or EOF.
    print("Enter Greek text (finish with an empty line, or Ctrl-Z then Enter on Windows):")
    collected: List[str] = []
    while True:
        try:
            entry = input()
        except EOFError:
            break
        if not entry.strip():
            break
        collected.append(entry)
    return "\n".join(collected)
63
+
64
+
65
def run_cli() -> int:
    """Run the dialect-classification CLI.

    Reads Greek text (piped or interactive), classifies it, and prints a
    report. Returns a process exit code: 0 on success, 2 when no input
    was provided.
    """
    # Best-effort Windows console UTF-8 handling.
    # This does not affect piped-input decoding (handled separately).
    try:
        if sys.stdin.isatty():
            sys.stdin.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
        if sys.stdout.isatty():
            sys.stdout.reconfigure(encoding="utf-8", errors="replace")  # type: ignore[attr-defined]
    except Exception:
        # reconfigure() may be unavailable on wrapped/exotic streams; proceed anyway.
        pass

    text = read_multiline_stdin()
    if not text.strip():
        print("No input provided.")
        return 2

    # If the console encoding is wrong, Greek often turns into '?'.
    if text.count("?") >= 10 and sys.stdin.isatty():
        print(
            "Warning: many '?' characters detected; your terminal may not be using UTF-8. "
            "In PowerShell, try: chcp 65001"
        )

    result = classify_text(text)
    # `confidence` is a 0-1 fraction; render as a percentage.
    print(f"Dialect: {result['dialect']}")
    print(f"Confidence: {result['confidence'] * 100:.1f}%")
    print("Scores (%):")
    for d in DIALECTS:
        print(f" {d}: {float(result['scores'].get(d, 0.0)):.1f}")
    print("")
    print(result["explanation"])
    return 0
dialect_analysis/explanation.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, List, Mapping, Tuple
4
+
5
+ from .features import ENDINGS_PLAIN, PARTICLES
6
+
7
+
8
def explain_results(feature_dict: Mapping[str, Any], scores: Mapping[str, float]) -> str:
    """Generate a human-readable explanation of the classification.

    Args:
        feature_dict: Feature-count mapping (as produced by
            ``features.extract_features``), optionally carrying a private
            ``_contributions`` entry and a ``diagnostics`` entry
            (presumably attached downstream by scoring — TODO confirm).
        scores: Dialect -> percentage score mapping.

    Returns:
        A multi-line report: prediction, warnings, observed feature counts,
        and the top contributing rules for the winning dialect.
    """

    if not scores:
        return "No scores were produced."

    # Winning dialect = highest percentage score.
    best_dialect = max(scores.items(), key=lambda kv: kv[1])[0]
    best_pct = float(scores[best_dialect])

    # Pull each feature group defensively; missing keys become empty mappings.
    token_count = int(feature_dict.get("token_count", 0) or 0)
    particles: Mapping[str, int] = feature_dict.get("particles", {}) or {}
    endings: Mapping[str, int] = feature_dict.get("endings", {}) or {}
    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
    dative_plural: Mapping[str, int] = feature_dict.get("dative_plural_endings", {}) or {}
    epic_endings: Mapping[str, int] = feature_dict.get("epic_endings", {}) or {}
    epic_particles: Mapping[str, int] = feature_dict.get("epic_particles", {}) or {}
    epic_words: Mapping[str, int] = feature_dict.get("epic_words", {}) or {}
    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}
    orth: Mapping[str, int] = feature_dict.get("orthography", {}) or {}
    diagnostics = feature_dict.get("diagnostics", {}) or {}
    greek_ratio = diagnostics.get("greek_ratio", None)
    top_gap_pct = diagnostics.get("top_gap_pct", None)

    # Per-rule score contributions for the winning dialect, largest magnitude first.
    contrib = (feature_dict.get("_contributions", {}) or {}).get(best_dialect, {})  # type: ignore[assignment]
    top_contrib: List[Tuple[str, float]] = sorted(contrib.items(), key=lambda kv: abs(kv[1]), reverse=True)[:8]

    particle_bits = ", ".join(f"{p}={int(particles.get(p, 0) or 0)}" for p in PARTICLES)
    ending_bits = ", ".join(f"-{e}={int(endings.get(e, 0) or 0)}" for e in (*ENDINGS_PLAIN, "ᾳ"))
    orth_bits = (
        f"alpha_endings={int(orth.get('alpha_endings', 0) or 0)}, "
        f"eta_endings={int(orth.get('eta_endings', 0) or 0)}"
    )

    lines: List[str] = []
    lines.append(f"Prediction: {best_dialect} (confidence {best_pct:.1f}%)")
    lines.append(f"Tokens analyzed: {token_count}")

    # Reliability warnings (only when diagnostics are present / thresholds hit).
    if isinstance(greek_ratio, (int, float)):
        lines.append(f"Greek-script ratio (letters): {float(greek_ratio):.2f}")
        if float(greek_ratio) < 0.30:
            lines.append("Warning: input contains little/no Greek; classification is low-evidence.")
    if token_count < 20:
        lines.append("Warning: very short passage; confidence may be unreliable.")
    if isinstance(top_gap_pct, (int, float)) and float(top_gap_pct) < 10.0:
        lines.append("Warning: scores are clustered; dialect signal is weak.")
    lines.append("")
    lines.append("Observed feature counts:")
    lines.append(f" Particles: {particle_bits}")
    lines.append(f" Endings: {ending_bits}")
    lines.append(
        " Infinitives: "
        + ", ".join(
            [
                f"-ειν={int(infinitives.get('ειν', 0) or 0)}",
                f"-μεναι={int(infinitives.get('μεναι', 0) or 0)}",
                f"-μεν={int(infinitives.get('μεν', 0) or 0)}",
            ]
        )
    )
    lines.append(
        " Dative plural endings: "
        + ", ".join(
            f"-{e}={int(dative_plural.get(e, 0) or 0)}" for e in ("οισι", "ηισι", "αισι", "οις", "αις")
        )
    )
    lines.append(
        " Epic: "
        + ", ".join(
            [
                f"-{e}={int(epic_endings.get(e, 0) or 0)}" for e in ("οιο", "εσσι", "φι", "ηοσ", "αδεω", "ιδεω")
            ]
            + [
                f"{p}={int(epic_particles.get(p, 0) or 0)}" for p in ("κε", "κεν", "αρ", "μιν")
            ]
            + [
                f"{w}={int(epic_words.get(w, 0) or 0)}" for w in ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα")
            ]
        )
    )
    lines.append(
        f" Patterns: ττ={int(patterns.get('tt', 0) or 0)}, σσ={int(patterns.get('ss', 0) or 0)}"
    )
    lines.append(
        " Prepositions: "
        + ", ".join(
            [
                f"εἰς={int(prepositions.get('εισ', 0) or 0)}",
                f"ἐς={int(prepositions.get('εσ', 0) or 0)}",
            ]
        )
    )
    lines.append(
        " Koine function words: "
        + ", ".join(
            [
                f"ἵνα={int(koine_words.get('ινα', 0) or 0)}",
                f"ὅτι={int(koine_words.get('οτι', 0) or 0)}",
                f"καθώς={int(koine_words.get('καθωσ', 0) or 0)}",
                f"ἐγένετο={int(koine_words.get('εγενετο', 0) or 0)}",
            ]
        )
    )
    lines.append(
        " Lexicalized cues: "
        + ", ".join(
            [
                f"TT-stems={int(lexical_cues.get('attic_tt', 0) or 0)}",
                f"SS-stems={int(lexical_cues.get('ionic_ss', 0) or 0)}",
            ]
        )
    )
    lines.append(f" Doric cue: ἁ-initial={int(doric_cues.get('ha_initial', 0) or 0)}")
    if poetic_morph:
        lines.append(
            " Poetic morph: "
            + ", ".join(
                [
                    f"-μες(1pl)={int(poetic_morph.get('verb_1pl_mes', 0) or 0)}",
                    f"ἄμμι={int(poetic_morph.get('aeolic_ammi', 0) or 0)}",
                    f"ὔμμι={int(poetic_morph.get('aeolic_ummi', 0) or 0)}",
                ]
            )
        )
    lines.append(f" Orthography: {orth_bits}")

    # Show the per-rule contributions only if the scorer attached them.
    if top_contrib:
        lines.append("")
        lines.append(f"Top contributing rules for {best_dialect}:")
        for name, delta in top_contrib:
            lines.append(f" {name}: {delta:+.3f}")

    lines.append("")
    lines.append("Note: weights are MVP placeholders; edit dialect_analysis/scoring.py to refine rules.")
    return "\n".join(lines)
dialect_analysis/features.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import unicodedata
4
+ from collections import Counter
5
+ from typing import Any, Dict, List, Mapping, Tuple
6
+
7
+ from .normalization import sigma_normalize, strip_greek_diacritics
8
+
9
+
10
# Common connective particles; matched on diacritic-stripped, sigma-normalized tokens.
PARTICLES: Tuple[str, ...] = ("μεν", "δε", "γαρ", "τε", "δη", "ουν")
# Baseline noun/adjective ending cues (diacritic-stripped forms).
ENDINGS_PLAIN: Tuple[str, ...] = ("οι", "αι", "ηι", "οισι")

# Infinitive endings (high-signal morphology when present).
# These are matched on diacritic-stripped, sigma-normalized tokens.
INFINITIVE_ENDINGS_PLAIN: Tuple[str, ...] = (
    "ειν",  # common Attic/Ionic/Koine infinitive
    "μεναι",  # Aeolic-style infinitive
    "μεν",  # Doric/Aeolic-style infinitive
)

# A few additional, high-signal Homeric / epic-Ionic patterns (MVP).
# Matched on diacritic-stripped tokens.
EPIC_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οιο",  # e.g., Ἠελίοιο
    "φι",  # e.g., -φι instrumental
    "εσσι",  # -εσσι(ν)
    # Epic/Ionic genitive (sigma-normalized): -ηος (e.g., Ἀχιλῆος -> αχιληοσ)
    "ηοσ",
    # Epic patronymic genitive (e.g., Πηληϊάδεω, Ἀτρεΐδεω)
    "αδεω",
    "ιδεω",
)

# Dative plural patterns (useful for Ionic/Epic vs Attic/Koine tendencies).
# Matched on diacritic-stripped tokens.
DATIVE_PLURAL_ENDINGS_PLAIN: Tuple[str, ...] = (
    "οισι",
    "ηισι",
    "αισι",
    "οις",
    "αις",
)

# Epic particles (very small MVP subset; diacritics stripped and sigma-normalized).
EPIC_PARTICLES_PLAIN: Tuple[str, ...] = (
    "κε",
    "κεν",
    # Very common Homeric particle (often written ἄρ/ἄρ᾽)
    "αρ",
    # Homeric/epic pronoun form
    "μιν",
)

# A few very common Homeric-vocabulary tokens (NOT dialect-specific in isolation).
# We only treat these as weak epic-Ionic evidence when multiple hits occur.
EPIC_WORDS_PLAIN: Tuple[str, ...] = (
    "εννεπε",
    "αειδε",
    "μουσα",
    "μηνιν",
    "θεα",
)

# Very small lexicalized Attic-vs-Ionic spelling cues (MVP).
# These are substring-based to catch inflectional variants.
ATTIC_TT_STEMS: Tuple[str, ...] = (
    "θαλαττ",  # θάλαττα
    "γλωττ",  # γλῶττα
    "πραττ",  # πράττω
    "ταττ",  # τάττω
)

IONIC_SS_STEMS: Tuple[str, ...] = (
    "θαλασσ",  # θάλασσα
    "γλωσσ",  # γλῶσσα
    "πρασσ",  # πράσσω
    "τασσ",  # τάσσω
)

# Preposition preference (edition-dependent but often helpful): εἰς vs ἐς.
PREPOSITIONS_PLAIN: Tuple[str, ...] = (
    # NOTE: these are *sigma-normalized* (final ς -> σ)
    "εισ",
    "εσ",
)

# Koine-leaning function words (very small MVP set; genre-sensitive).
# These should be low-weight, positive-only cues.
KOINE_FUNCTION_WORDS_PLAIN: Tuple[str, ...] = (
    "ινα",
    "οτι",
    # NOTE: sigma-normalized
    "καθωσ",
    # NT-style narrative formula is common in Koine
    "εγενετο",
)

# Literary/poetic morphology cues.
# - Doric 1pl active ending often appears as -μες (vs -μεν).
# - Aeolic pronoun forms like ἄμμι/ὔμμι are strong when they occur.
# These are *keys* into the poetic_morph counter, not Greek strings.
POETIC_MORPH_CUES: Tuple[str, ...] = (
    "verb_1pl_mes",
    "aeolic_ammi",
    "aeolic_ummi",
)
106
+
107
+
108
+ def _ends_with_iota_subscript_cluster(token: str, base_letter: str) -> bool:
109
+ """True if token ends with base_letter + iota-subscript (any accents allowed)."""
110
+
111
+ if not token:
112
+ return False
113
+
114
+ decomposed = unicodedata.normalize("NFD", token)
115
+ i = len(decomposed) - 1
116
+ saw_ypogegrammeni = False
117
+ while i >= 0 and unicodedata.combining(decomposed[i]):
118
+ if decomposed[i] == "\u0345":
119
+ saw_ypogegrammeni = True
120
+ i -= 1
121
+
122
+ if i < 0:
123
+ return False
124
+
125
+ base = decomposed[i]
126
+ return base == base_letter and saw_ypogegrammeni
127
+
128
+
129
def extract_features(tokens: List[str]) -> Dict[str, Any]:
    """Extract interpretable linguistic feature counts from tokens.

    Args:
        tokens: Normalized (lowercased, punctuation-free) word tokens;
            diacritics may still be present.

    Returns:
        A dict of named count groups (particles, endings, infinitives,
        epic cues, orthography, script evidence, etc.) consumed by
        ``compute_rates`` and ``explain_results``.
    """

    token_count = len(tokens)
    # Pre-seed every counter with zeros so downstream reporting always
    # sees every expected key, even when nothing matched.
    particles = Counter({p: 0 for p in PARTICLES})
    endings = Counter({e: 0 for e in (*ENDINGS_PLAIN, "ᾳ")})
    infinitives = Counter({e: 0 for e in INFINITIVE_ENDINGS_PLAIN})

    epic_endings = Counter({e: 0 for e in EPIC_ENDINGS_PLAIN})

    dative_plural_endings = Counter({e: 0 for e in DATIVE_PLURAL_ENDINGS_PLAIN})
    epic_particles = Counter({p: 0 for p in EPIC_PARTICLES_PLAIN})

    epic_words = Counter({w: 0 for w in EPIC_WORDS_PLAIN})

    prepositions = Counter({p: 0 for p in PREPOSITIONS_PLAIN})
    koine_words = Counter({w: 0 for w in KOINE_FUNCTION_WORDS_PLAIN})

    lexical_cues = Counter(
        {
            "attic_tt": 0,
            "ionic_ss": 0,
        }
    )

    # Mild Doric cue: initial rough-breathed alpha (e.g., ἁ as article in Doric).
    doric_ha_initial = 0

    poetic_morph = Counter({k: 0 for k in POETIC_MORPH_CUES})

    # Orthographic patterns
    tt_count = 0
    ss_count = 0

    alpha_endings = 0
    eta_endings = 0

    # Script evidence: helps detect non-Greek input or encoding issues.
    greek_alpha_chars = 0
    alpha_chars = 0

    for tok in tokens:
        if not tok:
            continue

        # Count alphabetic characters, and how many fall in the Greek /
        # Greek Extended Unicode blocks.
        for ch in tok:
            if not ch.isalpha():
                continue
            alpha_chars += 1
            code = ord(ch)
            if (0x0370 <= code <= 0x03FF) or (0x1F00 <= code <= 0x1FFF):
                greek_alpha_chars += 1

        # All substring/suffix matching happens on the diacritic-stripped,
        # sigma-normalized (ς -> σ) form of the token.
        plain = sigma_normalize(strip_greek_diacritics(tok))
        # Doric 1pl -μες (sigma-normalized: -μεσ).
        # Guard against counting very short tokens.
        if len(plain) >= 5 and plain.endswith("μεσ"):
            poetic_morph["verb_1pl_mes"] += 1

        # Aeolic pronoun forms (very high signal).
        if plain == "αμμι":
            poetic_morph["aeolic_ammi"] += 1
        if plain == "υμμι":
            poetic_morph["aeolic_ummi"] += 1

        # Doric cue: token begins with alpha + rough breathing.
        # This is intentionally weak; lots of words can have rough breathing.
        nfd = unicodedata.normalize("NFD", tok)
        if nfd:
            base0 = nfd[0]
            # Collect leading combining marks
            j = 1
            has_rough = False
            while j < len(nfd) and unicodedata.combining(nfd[j]):
                # COMBINING REVERSED COMMA ABOVE (rough breathing)
                if nfd[j] == "\u0314":
                    has_rough = True
                j += 1
            if base0 == "α" and has_rough:
                doric_ha_initial += 1

        # Count orthographic patterns (occurrences, not just token presence)
        tt_count += plain.count("ττ")
        ss_count += plain.count("σσ")

        if plain in particles:
            particles[plain] += 1

        if plain in epic_particles:
            epic_particles[plain] += 1

        if plain in epic_words:
            epic_words[plain] += 1

        if plain in prepositions:
            prepositions[plain] += 1

        if plain in koine_words:
            koine_words[plain] += 1

        # Lexicalized Attic/Ionic cues
        if any(stem in plain for stem in ATTIC_TT_STEMS):
            lexical_cues["attic_tt"] += 1
        if any(stem in plain for stem in IONIC_SS_STEMS):
            lexical_cues["ionic_ss"] += 1

        for ending in ENDINGS_PLAIN:
            if plain.endswith(ending):
                endings[ending] += 1

        # Infinitive endings (prefer longer endings first to avoid double-counting)
        # Guard against short function words like the particle "μεν".
        if len(plain) >= 5:
            if plain.endswith("μεναι"):
                infinitives["μεναι"] += 1
            elif plain.endswith("ειν"):
                infinitives["ειν"] += 1
            elif plain.endswith("μεν"):
                infinitives["μεν"] += 1

        for ending in EPIC_ENDINGS_PLAIN:
            if plain.endswith(ending):
                epic_endings[ending] += 1

        for ending in DATIVE_PLURAL_ENDINGS_PLAIN:
            if plain.endswith(ending):
                dative_plural_endings[ending] += 1

        if _ends_with_iota_subscript_cluster(tok, "α"):
            endings["ᾳ"] += 1

        # BUGFIX: `plain` is sigma-normalized (ς -> σ), so the original
        # final-sigma forms "ας"/"ης" could never match; use "ασ"/"ησ".
        if plain.endswith(("α", "ασ", "αν")):
            alpha_endings += 1
        if plain.endswith(("η", "ησ", "ην")):
            eta_endings += 1

    return {
        "token_count": token_count,
        "particles": dict(particles),
        "endings": dict(endings),
        "infinitives": dict(infinitives),
        "epic_endings": dict(epic_endings),
        "dative_plural_endings": dict(dative_plural_endings),
        "epic_particles": dict(epic_particles),
        "epic_words": dict(epic_words),
        "prepositions": dict(prepositions),
        "koine_words": dict(koine_words),
        "lexical_cues": dict(lexical_cues),
        "patterns": {
            "tt": tt_count,
            "ss": ss_count,
        },
        "orthography": {
            "alpha_endings": alpha_endings,
            "eta_endings": eta_endings,
        },
        "script": {
            "greek_alpha_chars": greek_alpha_chars,
            "alpha_chars": alpha_chars,
        },
        "doric_cues": {
            "ha_initial": doric_ha_initial,
        },
        "poetic_morph": dict(poetic_morph),
    }
294
+
295
+
296
def rate_per_100(count: int, token_count: int) -> float:
    """Convert a raw count into a per-100-token rate (0.0 when no tokens)."""
    if token_count > 0:
        return 100.0 * (count / token_count)
    return 0.0
300
+
301
+
302
def compute_rates(feature_dict: Mapping[str, Any]) -> Dict[str, Any]:
    """Compute per-100-token rates from feature counts.

    Args:
        feature_dict: Count groups as produced by ``extract_features``;
            missing groups are tolerated and treated as empty.

    Returns:
        A dict of rate groups (one "<group>_per_100" entry per count
        group) plus three scalar convenience rates.
    """

    token_count = int(feature_dict.get("token_count", 0) or 0)
    # Fetch each group defensively; `or {}` also guards against explicit None.
    particles: Mapping[str, int] = feature_dict.get("particles", {}) or {}
    endings: Mapping[str, int] = feature_dict.get("endings", {}) or {}
    infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
    orth: Mapping[str, int] = feature_dict.get("orthography", {}) or {}
    patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}
    epic_particles: Mapping[str, int] = feature_dict.get("epic_particles", {}) or {}
    epic_endings: Mapping[str, int] = feature_dict.get("epic_endings", {}) or {}
    dative_plural_endings: Mapping[str, int] = feature_dict.get("dative_plural_endings", {}) or {}
    prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
    koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
    lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
    doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
    poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}

    # Iterate the canonical key tuples (not the dict keys) so every expected
    # key appears in the output even when absent from the input.
    particle_rates = {p: rate_per_100(int(particles.get(p, 0) or 0), token_count) for p in PARTICLES}
    ending_rates = {e: rate_per_100(int(endings.get(e, 0) or 0), token_count) for e in (*ENDINGS_PLAIN, "ᾳ")}
    infinitive_rates = {
        e: rate_per_100(int(infinitives.get(e, 0) or 0), token_count) for e in INFINITIVE_ENDINGS_PLAIN
    }

    alpha_rate = rate_per_100(int(orth.get("alpha_endings", 0) or 0), token_count)
    eta_rate = rate_per_100(int(orth.get("eta_endings", 0) or 0), token_count)

    # Combined rate of the strongly dialect-marked endings.
    marked_rate = ending_rates.get("οισι", 0.0) + ending_rates.get("ηι", 0.0) + ending_rates.get("ᾳ", 0.0)

    pattern_rates = {
        "tt": rate_per_100(int(patterns.get("tt", 0) or 0), token_count),
        "ss": rate_per_100(int(patterns.get("ss", 0) or 0), token_count),
    }

    epic_particle_rates = {p: rate_per_100(int(epic_particles.get(p, 0) or 0), token_count) for p in EPIC_PARTICLES_PLAIN}
    epic_ending_rates = {e: rate_per_100(int(epic_endings.get(e, 0) or 0), token_count) for e in EPIC_ENDINGS_PLAIN}
    dative_plural_ending_rates = {
        e: rate_per_100(int(dative_plural_endings.get(e, 0) or 0), token_count)
        for e in DATIVE_PLURAL_ENDINGS_PLAIN
    }

    preposition_rates = {p: rate_per_100(int(prepositions.get(p, 0) or 0), token_count) for p in PREPOSITIONS_PLAIN}
    koine_word_rates = {w: rate_per_100(int(koine_words.get(w, 0) or 0), token_count) for w in KOINE_FUNCTION_WORDS_PLAIN}
    lexical_cue_rates = {
        "attic_tt": rate_per_100(int(lexical_cues.get("attic_tt", 0) or 0), token_count),
        "ionic_ss": rate_per_100(int(lexical_cues.get("ionic_ss", 0) or 0), token_count),
    }
    doric_cue_rates = {
        "ha_initial": rate_per_100(int(doric_cues.get("ha_initial", 0) or 0), token_count),
    }

    poetic_morph_rates = {
        k: rate_per_100(int(poetic_morph.get(k, 0) or 0), token_count) for k in POETIC_MORPH_CUES
    }

    return {
        "particles_per_100": particle_rates,
        "endings_per_100": ending_rates,
        "infinitives_per_100": infinitive_rates,
        "patterns_per_100": pattern_rates,
        "epic_particles_per_100": epic_particle_rates,
        "epic_endings_per_100": epic_ending_rates,
        "dative_plural_endings_per_100": dative_plural_ending_rates,
        "prepositions_per_100": preposition_rates,
        "koine_words_per_100": koine_word_rates,
        "lexical_cues_per_100": lexical_cue_rates,
        "doric_cues_per_100": doric_cue_rates,
        "poetic_morph_per_100": poetic_morph_rates,
        "alpha_endings_per_100": alpha_rate,
        "eta_endings_per_100": eta_rate,
        "marked_endings_per_100": marked_rate,
    }
dialect_analysis/normalization.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import unicodedata
5
+ from typing import List
6
+
7
+ # A small punctuation set that commonly appears in Greek texts.
8
+ _EXTRA_PUNCT = "··;;—–…«»‹›“”‘’" # ano teleia, Greek question mark, dashes, quotes
9
+
10
+
11
def strip_greek_diacritics(text: str) -> str:
    """Strip diacritics while preserving iota subscript as an explicit iota.

    - Converts combining GREEK YPOGEGRAMMENI (U+0345) to 'ι'.
    - Removes other combining marks (accents, breathings, etc.).
    """

    pieces: List[str] = []
    for ch in unicodedata.normalize("NFD", text):
        if ch == "\u0345":
            # Keep the iota subscript, but promote it to a full iota letter.
            pieces.append("ι")
        elif not unicodedata.combining(ch):
            pieces.append(ch)
    # Recompose so callers see ordinary precomposed characters.
    return unicodedata.normalize("NFC", "".join(pieces))
28
+
29
+
30
def sigma_normalize(token: str) -> str:
    """Map final sigma (ς) to medial sigma (σ) so suffix matching is uniform."""

    return token.translate({ord("ς"): "σ"})
34
+
35
+
36
def normalize_text(text: str, *, strip_diacritics: bool = False) -> str:
    """Normalize input Greek text.

    - Lowercase
    - Remove punctuation
    - Optionally strip diacritics

    Keep diacritics by default so feature extraction can detect iota-subscript
    endings like -ᾳ.
    """

    cleaned = text.lower()

    # Tabs/newlines, then Greek-specific punctuation, all become spaces.
    cleaned = cleaned.translate(str.maketrans({"\n": " ", "\t": " "}))
    cleaned = cleaned.translate(str.maketrans({ch: " " for ch in _EXTRA_PUNCT}))

    # Drop any remaining punctuation/symbols, keeping word chars and spaces,
    # then collapse runs of whitespace into single spaces.
    cleaned = re.sub(r"[^\w\s]", " ", cleaned, flags=re.UNICODE)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    return strip_greek_diacritics(cleaned) if strip_diacritics else cleaned
dialect_analysis/pipeline.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Dict, Mapping
4
+
5
+ from .explanation import explain_results
6
+ from .features import extract_features
7
+ from .normalization import normalize_text
8
+ from .scoring import DIALECTS, score_dialects
9
+ from .tokenization import tokenize
10
+
11
+
12
def classify_text(text: str, *, strip_diacritics: bool = False) -> Dict[str, Any]:
    """End-to-end dialect classification pipeline."""

    # normalize -> tokenize -> count features -> score.
    tokens = tokenize(normalize_text(text, strip_diacritics=strip_diacritics))
    features = extract_features(tokens)
    scores = score_dialects(features)

    if scores:
        dialect = max(scores.items(), key=lambda kv: kv[1])[0]
        confidence = float(scores.get(dialect, 0.0)) / 100.0
    else:
        dialect = "Unknown"
        confidence = 0.0

    # Keep the six largest-magnitude rule contributions for the winner.
    contrib_map: Mapping[str, float] = (features.get("_contributions", {}) or {}).get(dialect, {})  # type: ignore[assignment]
    ranked = sorted(contrib_map.items(), key=lambda kv: abs(kv[1]), reverse=True)
    top_features: Dict[str, Any] = {
        name: {"contribution": float(delta)} for name, delta in ranked[:6]
    }

    return {
        "dialect": dialect,
        "confidence": confidence,
        "scores": scores,
        "top_features": top_features,
        "explanation": explain_results(features, scores),
    }
dialect_analysis/scoring.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import math
4
+ from collections import Counter
5
+ from typing import Any, Dict, Mapping, Tuple
6
+
7
+ from .features import ENDINGS_PLAIN, INFINITIVE_ENDINGS_PLAIN, PARTICLES, compute_rates
8
+
9
+
10
+ DIALECTS: Tuple[str, ...] = ("Attic", "Ionic", "Doric", "Aeolic", "Koine")
11
+
12
+
13
+ def _clamp(x: float, lo: float, hi: float) -> float:
14
+ return max(lo, min(hi, x))
15
+
16
+
17
+ def _softmax_percent(raw_scores: Mapping[str, float], *, temperature: float = 2.0) -> Dict[str, float]:
18
+ """Softmax over dialect scores with temperature to reduce overconfidence."""
19
+
20
+ if not raw_scores:
21
+ return {d: 0.0 for d in DIALECTS}
22
+
23
+ t = max(1e-6, float(temperature))
24
+ max_raw = max(float(v) for v in raw_scores.values())
25
+ exp_scores = {d: math.exp((float(raw_scores[d]) - max_raw) / t) for d in DIALECTS}
26
+ total = sum(exp_scores.values()) or 1.0
27
+ return {d: 100.0 * (exp_scores[d] / total) for d in DIALECTS}
28
+
29
+
30
+ def score_dialects(feature_dict: Mapping[str, Any]) -> Dict[str, float]:
31
+ """Score dialects using a weighted, rule-based scoring system.
32
+
33
+ Returns a dict mapping dialect -> confidence percentage (0-100).
34
+
35
+ Weights are placeholders intended to be edited as the rule-set grows.
36
+ """
37
+
38
+ rates = compute_rates(feature_dict)
39
+
40
+ token_count = int(feature_dict.get("token_count", 0) or 0)
41
+ script = feature_dict.get("script", {}) or {}
42
+ greek_alpha = int(script.get("greek_alpha_chars", 0) or 0)
43
+ alpha_chars = int(script.get("alpha_chars", 0) or 0)
44
+ greek_ratio = (greek_alpha / alpha_chars) if alpha_chars > 0 else 0.0
45
+
46
+ particle_rates: Mapping[str, float] = rates["particles_per_100"]
47
+ ending_rates: Mapping[str, float] = rates["endings_per_100"]
48
+ infinitives: Mapping[str, int] = feature_dict.get("infinitives", {}) or {}
49
+ poetic_morph: Mapping[str, int] = feature_dict.get("poetic_morph", {}) or {}
50
+ epic_particle_rates: Mapping[str, float] = rates.get("epic_particles_per_100", {}) or {}
51
+ epic_ending_rates: Mapping[str, float] = rates.get("epic_endings_per_100", {}) or {}
52
+ epic_words: Mapping[str, int] = feature_dict.get("epic_words", {}) or {}
53
+ dative_plural_rates: Mapping[str, float] = rates.get("dative_plural_endings_per_100", {}) or {}
54
+ prepositions: Mapping[str, int] = feature_dict.get("prepositions", {}) or {}
55
+ koine_words: Mapping[str, int] = feature_dict.get("koine_words", {}) or {}
56
+ lexical_cues: Mapping[str, int] = feature_dict.get("lexical_cues", {}) or {}
57
+ doric_cues: Mapping[str, int] = feature_dict.get("doric_cues", {}) or {}
58
+ patterns: Mapping[str, int] = feature_dict.get("patterns", {}) or {}
59
+ marked_rate = float(rates["marked_endings_per_100"])
60
+
61
+ epic_oio_rate = float(epic_ending_rates.get("οιο", 0.0) or 0.0)
62
+ epic_essi_rate = float(epic_ending_rates.get("εσσι", 0.0) or 0.0)
63
+ epic_fi_rate = float(epic_ending_rates.get("φι", 0.0) or 0.0)
64
+
65
+ epic_eta_os_rate = float(epic_ending_rates.get("ηοσ", 0.0) or 0.0)
66
+ epic_adeo_rate = float(epic_ending_rates.get("αδεω", 0.0) or 0.0)
67
+ epic_ideo_rate = float(epic_ending_rates.get("ιδεω", 0.0) or 0.0)
68
+
69
+ epic_ke_rate = float(epic_particle_rates.get("κε", 0.0) or 0.0)
70
+ epic_ken_rate = float(epic_particle_rates.get("κεν", 0.0) or 0.0)
71
+ epic_ke_ken_rate = epic_ke_rate + epic_ken_rate
72
+
73
+ epic_ar_rate = float(epic_particle_rates.get("αρ", 0.0) or 0.0)
74
+ epic_min_rate = float(epic_particle_rates.get("μιν", 0.0) or 0.0)
75
+
76
+ tt_count = int(patterns.get("tt", 0) or 0)
77
+ ss_count = int(patterns.get("ss", 0) or 0)
78
+
79
+ # --- Weights (MVP placeholders) ---
80
+ weights: Dict[str, Dict[str, float]] = {
81
+ "particle_μεν": {"Attic": 0.25, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.05},
82
+ "particle_δε": {"Attic": 0.20, "Ionic": 0.20, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.10},
83
+ "particle_γαρ": {"Attic": 0.20, "Ionic": 0.15, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.10},
84
+ "particle_τε": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.20, "Aeolic": 0.12, "Koine": 0.05},
85
+ "particle_δη": {"Attic": 0.10, "Ionic": 0.10, "Doric": 0.10, "Aeolic": 0.08, "Koine": 0.05},
86
+ "particle_ουν": {"Attic": 0.15, "Ionic": 0.10, "Doric": 0.05, "Aeolic": 0.05, "Koine": 0.10},
87
+
88
+ "ending_οισι": {"Ionic": 3.50, "Attic": -1.00, "Doric": 0.50, "Aeolic": 0.20, "Koine": -1.50},
89
+ "ending_ηι": {"Attic": 1.10, "Ionic": 0.80, "Doric": 0.10, "Aeolic": 0.20, "Koine": -0.30},
90
+ "ending_ᾳ": {"Attic": 0.80, "Ionic": 0.60, "Doric": 0.30, "Aeolic": 0.20, "Koine": -0.60},
91
+ "ending_οι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
92
+ "ending_αι": {"Attic": 0.15, "Ionic": 0.15, "Doric": 0.15, "Aeolic": 0.15, "Koine": 0.15},
93
+
94
+ # NOTE: This is intentionally low-weight. "Few strong markers" is not
95
+ # uniquely Koine; it can also describe many Attic passages.
96
+ "low_marked_endings": {"Koine": 0.25, "Attic": 0.05, "Ionic": -0.05, "Doric": 0.05, "Aeolic": -0.05},
97
+
98
+ # Homeric / epic-Ionic signal
99
+ "epic_ending_οιο": {"Ionic": 4.00, "Attic": -0.50, "Doric": -0.50, "Aeolic": -0.30, "Koine": -0.50},
100
+
101
+ # Epic endings and particles (conservative; only meaningful when present)
102
+ "epic_ending_εσσι": {"Ionic": 3.00, "Attic": -0.40, "Doric": -0.20, "Aeolic": -0.20, "Koine": -0.80},
103
+ "epic_ending_φι": {"Ionic": 1.50, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.50},
104
+ "epic_particle_κεκεν": {"Ionic": 2.00, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.70},
105
+
106
+ "epic_ending_ηοσ": {"Ionic": 2.60, "Attic": -0.30, "Doric": -0.10, "Aeolic": -0.10, "Koine": -0.60},
107
+ "epic_ending_αδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
108
+ "epic_ending_ιδεω": {"Ionic": 2.80, "Attic": -0.20, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.70},
109
+
110
+ # Homeric / epic particles (ambiguous individually; keep weights modest)
111
+ "epic_particle_αρ": {"Ionic": 0.80, "Attic": -0.05, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.15},
112
+ "epic_particle_μιν": {"Ionic": 1.20, "Attic": -0.10, "Doric": 0.00, "Aeolic": 0.00, "Koine": -0.25},
113
+
114
+ # Homeric vocabulary: apply only when multiple hits occur (see logic below)
115
+ "epic_word_hits": {"Ionic": 1.80, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
116
+
117
+ # Orthographic patterns (COUNT-based; prevents short-text rate blowups)
118
+ "pattern_tt": {"Attic": 0.45, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.05},
119
+ "pattern_ss": {"Ionic": 0.10, "Attic": 0.00, "Doric": 0.00, "Aeolic": 0.00, "Koine": 0.00},
120
+
121
+ # Dative plural endings: -οισι/-αισι/-ηισι vs -οις/-αις
122
+ "dative_οισι": {"Ionic": 0.90, "Attic": -0.20, "Doric": 0.10, "Aeolic": 0.05, "Koine": -0.40},
123
+ "dative_αισι": {"Ionic": 2.20, "Attic": -0.40, "Doric": 0.20, "Aeolic": 0.10, "Koine": -0.80},
124
+ "dative_ηισι": {"Ionic": 2.20, "Attic": -0.30, "Doric": 0.10, "Aeolic": 0.10, "Koine": -0.80},
125
+ "dative_οις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
126
+ "dative_αις": {"Attic": 0.20, "Ionic": 0.05, "Doric": 0.10, "Aeolic": 0.10, "Koine": 0.15},
127
+
128
+ # εἰς vs ἐς (COUNT-based; keys are sigma-normalized: εισ / εσ)
129
+ "prep_εισ": {"Koine": 0.30, "Attic": 0.05, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
130
+ "prep_εσ": {"Attic": 0.25, "Ionic": 0.15, "Koine": 0.05, "Doric": 0.00, "Aeolic": 0.05},
131
+
132
+ # Koine-ish function words (COUNT-based; sigma-normalized: καθωσ)
133
+ "koine_ινα": {"Koine": 0.60, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
134
+ "koine_οτι": {"Koine": 0.40, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
135
+ "koine_καθωσ": {"Koine": 0.35, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00},
136
+ "koine_εγενετο": {"Koine": 0.90, "Attic": 0.00, "Ionic": 0.00, "Doric": 0.00, "Aeolic": 0.00},
137
+
138
+ # Lexicalized ττ/σσ stems (COUNT-based)
139
+ "lexical_attic_tt": {"Attic": 0.75, "Koine": 0.08, "Ionic": 0.00, "Doric": 0.00},
140
+ "lexical_ionic_ss": {"Ionic": 0.25, "Attic": 0.00, "Doric": 0.00, "Koine": 0.00},
141
+
142
+ # Doric-ish ἁ- (very weak; COUNT-based)
143
+ "doric_ha_initial": {"Doric": 0.12, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
144
+
145
+ # Infinitives (morphology): strong signal when present
146
+ # These are COUNT-based to avoid short-text rate blowups.
147
+ "inf_μεναι": {"Aeolic": 2.40, "Doric": 0.40, "Ionic": 0.05, "Attic": 0.00, "Koine": 0.00},
148
+ "inf_μεν": {"Doric": 1.20, "Aeolic": 0.80, "Ionic": 0.00, "Attic": 0.00, "Koine": 0.00},
149
+ "inf_ειν": {"Koine": 0.55, "Attic": 0.35, "Ionic": 0.35, "Doric": 0.00, "Aeolic": 0.00},
150
+
151
+ # Poetic morphology cues (COUNT-based)
152
+ "verb_1pl_mes": {"Doric": 1.30, "Aeolic": 0.30, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
153
+ "aeolic_ammi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
154
+ "aeolic_ummi": {"Aeolic": 2.20, "Doric": 0.20, "Attic": 0.00, "Ionic": 0.00, "Koine": 0.00},
155
+ }
156
+
157
+ raw_scores: Dict[str, float] = {d: 1.0 for d in DIALECTS}
158
+ contributions: Dict[str, Counter[str]] = {d: Counter() for d in DIALECTS}
159
+
160
+ # Evidence scaling: short passages should not yield extreme confidence.
161
+ evidence_scale = _clamp(token_count / 40.0, 0.0, 1.0)
162
+ if greek_ratio < 0.30:
163
+ evidence_scale *= 0.15
164
+
165
+ def apply_feature(feature_name: str, feature_value: float) -> None:
166
+ for dialect, w in weights.get(feature_name, {}).items():
167
+ delta = w * feature_value * evidence_scale
168
+ raw_scores[dialect] += delta
169
+ contributions[dialect][feature_name] += delta
170
+
171
+ def apply_tier_a(feature_name: str, feature_value: float) -> None:
172
+ """Apply highly diagnostic features with a minimum evidence scale.
173
+
174
+ Rationale: some morphology is genuinely strong evidence even in short
175
+ passages; we still keep the scale modest to avoid overconfidence.
176
+ """
177
+
178
+ tier_scale = max(evidence_scale, 0.25)
179
+ for dialect, w in weights.get(feature_name, {}).items():
180
+ delta = w * feature_value * tier_scale
181
+ raw_scores[dialect] += delta
182
+ contributions[dialect][feature_name] += delta
183
+
184
+ for p in PARTICLES:
185
+ apply_feature(f"particle_{p}", float(particle_rates.get(p, 0.0)))
186
+
187
+ for e in (*ENDINGS_PLAIN, "ᾳ"):
188
+ apply_feature(f"ending_{e}", float(ending_rates.get(e, 0.0)))
189
+
190
+ # Infinitive morphology
191
+ apply_tier_a("inf_μεναι", float(int(infinitives.get("μεναι", 0) or 0)))
192
+ apply_tier_a("inf_μεν", float(int(infinitives.get("μεν", 0) or 0)))
193
+ apply_tier_a("inf_ειν", float(int(infinitives.get("ειν", 0) or 0)))
194
+
195
+ # Poetic morphology
196
+ apply_tier_a("verb_1pl_mes", float(int(poetic_morph.get("verb_1pl_mes", 0) or 0)))
197
+ apply_tier_a("aeolic_ammi", float(int(poetic_morph.get("aeolic_ammi", 0) or 0)))
198
+ apply_tier_a("aeolic_ummi", float(int(poetic_morph.get("aeolic_ummi", 0) or 0)))
199
+
200
+ # Only apply the Koine scarcity heuristic when we have enough text.
201
+ if token_count >= 20:
202
+ apply_feature("low_marked_endings", max(0.0, 1.5 - marked_rate))
203
+
204
+ # Epic marker
205
+ apply_feature("epic_ending_οιο", epic_oio_rate)
206
+
207
+ # Additional epic markers
208
+ apply_feature("epic_ending_εσσι", epic_essi_rate)
209
+ apply_feature("epic_ending_φι", epic_fi_rate)
210
+ apply_feature("epic_particle_κεκεν", epic_ke_ken_rate)
211
+ apply_feature("epic_ending_ηοσ", epic_eta_os_rate)
212
+ apply_feature("epic_ending_αδεω", epic_adeo_rate)
213
+ apply_feature("epic_ending_ιδεω", epic_ideo_rate)
214
+ apply_feature("epic_particle_αρ", epic_ar_rate)
215
+ apply_feature("epic_particle_μιν", epic_min_rate)
216
+
217
+ epic_word_hits = sum(
218
+ int(epic_words.get(w, 0) or 0)
219
+ for w in ("εννεπε", "αειδε", "μουσα", "μηνιν", "θεα")
220
+ )
221
+ if epic_word_hits >= 2:
222
+ apply_tier_a("epic_word_hits", float(min(4, epic_word_hits)))
223
+
224
+ # tt/ss orthography (separate, conservative)
225
+ apply_feature("pattern_tt", float(tt_count))
226
+ apply_feature("pattern_ss", float(ss_count))
227
+
228
+ # Dative plural endings
229
+ apply_feature("dative_οισι", float(dative_plural_rates.get("οισι", 0.0) or 0.0))
230
+ apply_feature("dative_αισι", float(dative_plural_rates.get("αισι", 0.0) or 0.0))
231
+ apply_feature("dative_ηισι", float(dative_plural_rates.get("ηισι", 0.0) or 0.0))
232
+ apply_feature("dative_οις", float(dative_plural_rates.get("οις", 0.0) or 0.0))
233
+ apply_feature("dative_αις", float(dative_plural_rates.get("αις", 0.0) or 0.0))
234
+
235
+ # εἰς / ἐς (counts; sigma-normalized)
236
+ apply_feature("prep_εισ", float(int(prepositions.get("εισ", 0) or 0)))
237
+ apply_feature("prep_εσ", float(int(prepositions.get("εσ", 0) or 0)))
238
+
239
+ # Koine-ish function words (counts; sigma-normalized)
240
+ apply_feature("koine_ινα", float(int(koine_words.get("ινα", 0) or 0)))
241
+ apply_feature("koine_οτι", float(int(koine_words.get("οτι", 0) or 0)))
242
+ apply_feature("koine_καθωσ", float(int(koine_words.get("καθωσ", 0) or 0)))
243
+ apply_feature("koine_εγενετο", float(int(koine_words.get("εγενετο", 0) or 0)))
244
+
245
+ # Lexicalized ττ/σσ stems (counts)
246
+ apply_feature("lexical_attic_tt", float(int(lexical_cues.get("attic_tt", 0) or 0)))
247
+ apply_feature("lexical_ionic_ss", float(int(lexical_cues.get("ionic_ss", 0) or 0)))
248
+
249
+ # Doric cue (very noisy): require longer text + multiple hits
250
+ ha_hits = int(doric_cues.get("ha_initial", 0) or 0)
251
+ if token_count >= 30 and ha_hits >= 2:
252
+ apply_feature("doric_ha_initial", float(ha_hits))
253
+
254
+ # If mutable, persist diagnostics for explainability.
255
+ if isinstance(feature_dict, dict):
256
+ feature_dict["rates"] = rates
257
+ feature_dict["diagnostics"] = {
258
+ "greek_ratio": greek_ratio,
259
+ "evidence_scale": evidence_scale,
260
+ }
261
+ feature_dict["_raw_scores"] = dict(raw_scores)
262
+ feature_dict["_contributions"] = {d: dict(contributions[d]) for d in DIALECTS}
263
+
264
+ # Slightly increase confidence only when evidence is strong.
265
+ temperature = _clamp(2.0 - 0.6 * evidence_scale, 1.4, 2.0)
266
+ scores = _softmax_percent(raw_scores, temperature=temperature)
267
+
268
+ # Post-hoc discrimination diagnostics.
269
+ ordered = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
270
+ best_pct = float(ordered[0][1]) if ordered else 0.0
271
+ second_pct = float(ordered[1][1]) if len(ordered) > 1 else 0.0
272
+ top_gap_pct = best_pct - second_pct
273
+
274
+ if isinstance(feature_dict, dict):
275
+ diagnostics = feature_dict.get("diagnostics", {}) or {}
276
+ diagnostics.update(
277
+ {
278
+ "best_pct": best_pct,
279
+ "second_pct": second_pct,
280
+ "top_gap_pct": top_gap_pct,
281
+ }
282
+ )
283
+ feature_dict["diagnostics"] = diagnostics
284
+
285
+ return scores
dialect_analysis/tokenization.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import List
4
+
5
+
6
def tokenize(text: str) -> List[str]:
    """Tokenize a normalized text into whitespace-delimited tokens.

    Parameters
    ----------
    text:
        The (already normalized) input text. May be empty.

    Returns
    -------
    List[str]
        Non-empty tokens split on any run of whitespace.
    """

    # str.split() with no separator splits on runs of ANY whitespace
    # (spaces, tabs, newlines) and never yields empty tokens, matching
    # the "whitespace-delimited" contract above; the previous
    # text.split(" ") treated only the literal space character as a
    # delimiter. "".split() is already [], so no empty-guard is needed.
    return text.split()