Commit ·
fa9c240
1
Parent(s): 3ca29ed
WWHO
Browse files- EVALUATION.md +167 -127
- encoder.py +245 -27
- linguis_trie.py +249 -166
- meta_config.json +8 -0
- router.py +247 -0
- tokenizer.json +0 -0
- vocab.json +0 -0
EVALUATION.md
CHANGED
|
@@ -1,11 +1,6 @@
|
|
| 1 |
-
# SGPE Battle Test and Evaluation Report
|
| 2 |
================================================================================
|
| 3 |
-
BATTERY 1: LINGUISTIC COMPLEXITY
|
| 4 |
================================================================================
|
| 5 |
-
Generated 2000 complex words across multiple categories
|
| 6 |
-
Layer1 integrity: 100%|████████████████████████████| 2000/2000 [00:00<00:00, 32898.70 word/s]
|
| 7 |
-
Testing with leading-space prefix...
|
| 8 |
-
leading-space check: 100%|███████████████████████████| 500/500 [00:00<00:00, 49599.17 word/s]
|
| 9 |
|
| 10 |
Category Total Pass Fail
|
| 11 |
------------------------------------------------------
|
|
@@ -28,17 +23,16 @@ BATTERY 1: LINGUISTIC COMPLEXITY TEST (2,000 Edge-Case Words)
|
|
| 28 |
brahmaya 1 1 0
|
| 29 |
chandrikaa 1 1 0
|
| 30 |
chhandas 1 1 0
|
| 31 |
-
conjunct_anusvara
|
| 32 |
-
conjunct_pili_anusvara
|
| 33 |
-
constructed_multisyllable
|
| 34 |
cricket 1 1 0
|
| 35 |
dangling_zwj 1 1 0
|
| 36 |
dhammachakka 1 1 0
|
| 37 |
dhyaanaya 1 1 0
|
| 38 |
-
double_conjunct
|
| 39 |
dravyaya 1 1 0
|
| 40 |
duhkhaya 1 1 0
|
| 41 |
-
filler_conjunct 190 190 0
|
| 42 |
grahanaya 1 1 0
|
| 43 |
granthaya 1 1 0
|
| 44 |
indriya 1 1 0
|
|
@@ -70,7 +64,7 @@ BATTERY 1: LINGUISTIC COMPLEXITY TEST (2,000 Edge-Case Words)
|
|
| 70 |
premaya 1 1 0
|
| 71 |
quad_stack 1 1 0
|
| 72 |
quad_virama_chain 1 1 0
|
| 73 |
-
rakaransaya_form
|
| 74 |
ritvija 1 1 0
|
| 75 |
saammpradaayika 1 1 0
|
| 76 |
samasth 1 1 0
|
|
@@ -92,7 +86,7 @@ BATTERY 1: LINGUISTIC COMPLEXITY TEST (2,000 Edge-Case Words)
|
|
| 92 |
svachchhand 1 1 0
|
| 93 |
tantraya 1 1 0
|
| 94 |
triple_conjunct 1 1 0
|
| 95 |
-
triple_conjunct_gen
|
| 96 |
trividha 1 1 0
|
| 97 |
udghoshanaya 1 1 0
|
| 98 |
upaadaanaya 1 1 0
|
|
@@ -109,148 +103,117 @@ BATTERY 1: LINGUISTIC COMPLEXITY TEST (2,000 Edge-Case Words)
|
|
| 109 |
vyatirekaya 1 1 0
|
| 110 |
vyavahaarika 1 1 0
|
| 111 |
vyavasthaava 1 1 0
|
| 112 |
-
yansaya_form
|
| 113 |
yantraya 1 1 0
|
| 114 |
zwnj_middle 1 1 0
|
| 115 |
|
| 116 |
-
Result: PASS — Tested
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
|
|
|
| 123 |
|
|
|
|
| 124 |
|
| 125 |
================================================================================
|
| 126 |
-
BATTERY
|
| 127 |
================================================================================
|
| 128 |
-
Counting token usage across test corpus...
|
| 129 |
-
scanning: 100%|█████████████████████████████████| 536508/536508 [01:46<00:00, 5057.98 sent/s]
|
| 130 |
-
Total vocab size: 100,000
|
| 131 |
-
Zero-usage tokens: 34,868
|
| 132 |
-
Near-zero (< 3) tokens: 8,942
|
| 133 |
-
Glitched tokens (bare ZWJ/HAL): 4
|
| 134 |
-
Encoding errors during scan: 0
|
| 135 |
|
| 136 |
-
|
| 137 |
-
stress-test: 100%|██████████████████████████████████████| 34868/34868 [04:08<00:00, 140.42 tok/s]
|
| 138 |
-
near-zero test: 100%|██████████████████████████████████████| 500/500 [00:00<00:00, 9508.09 tok/s]
|
| 139 |
|
| 140 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
|
|
|
|
| 155 |
|
| 156 |
-
============
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
DeepSeek V3 5.965 54,977,828 1.08 Local
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
Sample tokenizations:
|
| 171 |
-
'ක්රෝෂ්ඨ්ර':
|
| 172 |
-
SGPE ['ක්\u200dරෝ', '[UNK]'] (2 tokens)
|
| 173 |
-
OpenAI (o200k_base) [9 tokens]
|
| 174 |
-
Llama 4 Scout [8 tokens]
|
| 175 |
-
DeepSeek V3 [14 tokens]
|
| 176 |
-
'ශාස්ත්රීය':
|
| 177 |
-
SGPE ['ශාස්ත්\u200dරීය'] (1 tokens)
|
| 178 |
-
OpenAI (o200k_base) [6 tokens]
|
| 179 |
-
Llama 4 Scout [6 tokens]
|
| 180 |
-
DeepSeek V3 [10 tokens]
|
| 181 |
-
'ව්යාකරණය':
|
| 182 |
-
SGPE ['ව්\u200dයා', 'කරණය'] (2 tokens)
|
| 183 |
-
OpenAI (o200k_base) [5 tokens]
|
| 184 |
-
Llama 4 Scout [5 tokens]
|
| 185 |
-
DeepSeek V3 [10 tokens]
|
| 186 |
-
'ප්රත්යක්ෂ':
|
| 187 |
-
SGPE ['ප්\u200dරත්\u200dය', 'ක්ෂ'] (2 tokens)
|
| 188 |
-
OpenAI (o200k_base) [5 tokens]
|
| 189 |
-
Llama 4 Scout [5 tokens]
|
| 190 |
-
DeepSeek V3 [11 tokens]
|
| 191 |
-
'ධම්මචක්කප්පවත්තන':
|
| 192 |
-
SGPE ['ධම්ම', 'චක්ක', 'ප්ප', 'වත්තන'] (4 tokens)
|
| 193 |
-
OpenAI (o200k_base) [11 tokens]
|
| 194 |
-
Llama 4 Scout [11 tokens]
|
| 195 |
-
DeepSeek V3 [17 tokens]
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
│ SGPE TWR: 1.438 │
|
| 206 |
-
│ GPT-4o TWR (o200k_base): 3.515 │
|
| 207 |
-
│ SGPE reduction vs GPT-4o: 59.1% │
|
| 208 |
-
│ SGPE reduction vs Llama 4: 60.8% │
|
| 209 |
-
└───────────────────────────────────────────────────────────────┘
|
| 210 |
-
|
| 211 |
|
| 212 |
================================================================================
|
| 213 |
BATTERY 4: ROUND-TRIP CONSISTENCY
|
| 214 |
================================================================================
|
| 215 |
|
| 216 |
-
Sentences tested:
|
| 217 |
-
Total
|
| 218 |
-
Total
|
|
|
|
| 219 |
Mismatches (non-UNK): 0
|
| 220 |
-
Mismatches (with UNK loss):
|
| 221 |
Crashes: 0
|
| 222 |
|
| 223 |
-
Result: PASS — Tested
|
| 224 |
-
|
| 225 |
-
Test Battery Status Key Metric
|
| 226 |
-
────────────────────────────────────────────────────────────────────────────────
|
| 227 |
-
Round-Trip Consistency (1M sentences) ✓ PASS 0 mismatches
|
| 228 |
-
────────────────────────────────────────────────────────────────────────────────
|
| 229 |
-
TOTAL P:1 F:0 W:0
|
| 230 |
-
|
| 231 |
|
| 232 |
================================================================================
|
| 233 |
-
BATTERY 5: BOUNDARY & LEADING SPACE EDGE-CASES
|
| 234 |
================================================================================
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
|
| 249 |
-
|
| 250 |
-
────────────────────────────────────────────────────────────────────────────────
|
| 251 |
-
Boundary & Leading Space Edge-Cases ✓ PASS 0 violations
|
| 252 |
-
────────────────────────────────────────────────────────────────────────────────
|
| 253 |
-
TOTAL P:1 F:0 W:0
|
| 254 |
|
| 255 |
================================================================================
|
| 256 |
BATTERY 6: ZERO-BREAKAGE GUARANTEE
|
|
@@ -266,8 +229,85 @@ BATTERY 6: ZERO-BREAKAGE GUARANTEE
|
|
| 266 |
|
| 267 |
Result: PASS — Ran 1,703 exhaustive breakage tests. Violations: 0
|
| 268 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
Test Battery Status Key Metric
|
| 270 |
────────────────────────────────────────────────────────────────────────────────
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
────────────────────────────────────────────────────────────────────────────────
|
| 273 |
-
TOTAL
|
|
|
|
|
|
|
| 1 |
================================================================================
|
| 2 |
+
BATTERY 1: SINHALA LINGUISTIC COMPLEXITY (2,000 Edge-Case Words)
|
| 3 |
================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
Category Total Pass Fail
|
| 6 |
------------------------------------------------------
|
|
|
|
| 23 |
brahmaya 1 1 0
|
| 24 |
chandrikaa 1 1 0
|
| 25 |
chhandas 1 1 0
|
| 26 |
+
conjunct_anusvara 28 28 0
|
| 27 |
+
conjunct_pili_anusvara 22 22 0
|
| 28 |
+
constructed_multisyllable 252 252 0
|
| 29 |
cricket 1 1 0
|
| 30 |
dangling_zwj 1 1 0
|
| 31 |
dhammachakka 1 1 0
|
| 32 |
dhyaanaya 1 1 0
|
| 33 |
+
double_conjunct 29 29 0
|
| 34 |
dravyaya 1 1 0
|
| 35 |
duhkhaya 1 1 0
|
|
|
|
| 36 |
grahanaya 1 1 0
|
| 37 |
granthaya 1 1 0
|
| 38 |
indriya 1 1 0
|
|
|
|
| 64 |
premaya 1 1 0
|
| 65 |
quad_stack 1 1 0
|
| 66 |
quad_virama_chain 1 1 0
|
| 67 |
+
rakaransaya_form 3 3 0
|
| 68 |
ritvija 1 1 0
|
| 69 |
saammpradaayika 1 1 0
|
| 70 |
samasth 1 1 0
|
|
|
|
| 86 |
svachchhand 1 1 0
|
| 87 |
tantraya 1 1 0
|
| 88 |
triple_conjunct 1 1 0
|
| 89 |
+
triple_conjunct_gen 64 64 0
|
| 90 |
trividha 1 1 0
|
| 91 |
udghoshanaya 1 1 0
|
| 92 |
upaadaanaya 1 1 0
|
|
|
|
| 103 |
vyatirekaya 1 1 0
|
| 104 |
vyavahaarika 1 1 0
|
| 105 |
vyavasthaava 1 1 0
|
| 106 |
+
yansaya_form 7 7 0
|
| 107 |
yantraya 1 1 0
|
| 108 |
zwnj_middle 1 1 0
|
| 109 |
|
| 110 |
+
Result: PASS — Tested 500 complex words. Violations: 0, Leading-space violations: 0
|
| 111 |
|
| 112 |
+
================================================================================
|
| 113 |
+
BATTERY 2: GLITCHED TOKEN DETECTION (v2 Multi-Script)
|
| 114 |
+
================================================================================
|
| 115 |
+
Total unified vocab size: 328,020 (SGPE component: 128,001)
|
| 116 |
+
Zero-usage SGPE tokens: 1,394
|
| 117 |
+
Near-zero (< 3) tokens: 3,163
|
| 118 |
|
| 119 |
+
Result: PASS — Zero: 1394, Near-Zero: 3163, Glitched: 0
|
| 120 |
|
| 121 |
================================================================================
|
| 122 |
+
BATTERY 3: FRONTIER BENCHMARKING (V2 STRATIFIED)
|
| 123 |
================================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
|
| 125 |
+
1. Tokenization Anatomy (Visual Examples)
|
|
|
|
|
|
|
| 126 |
|
| 127 |
+
'ව්යාකරණය':
|
| 128 |
+
SGPE ['ව්යා', 'කරණය'] (2 tokens)
|
| 129 |
+
OpenAI (o200k_base) ['ව්', 'යා', 'ක', 'රණ', 'ය'] (5 tokens)
|
| 130 |
+
Llama 4 Scout ['ව්', 'යා', 'කර', 'ණය'] (4 tokens)
|
| 131 |
+
DeepSeek V3 ['ව', '්', 'ය', 'ා', 'ක', 'ර', '�', '�', 'ය'] (9 tokens)
|
| 132 |
|
| 133 |
+
'ශ්රී ලංකාව':
|
| 134 |
+
SGPE ['ශ්\u200dරී', ' ලංකාව'] (2 tokens)
|
| 135 |
+
OpenAI (o200k_base) ['ශ්', '\u200dරී', ' ලංක', 'ාව'] (4 tokens)
|
| 136 |
+
Llama 4 Scout ['ශ්', '\u200dර', 'ී', ' ල', 'ං', 'ක', 'ාව'] (7 tokens)
|
| 137 |
+
DeepSeek V3 ['�', '�', '්', '\u200d', 'ර', 'ී', ' �', '�', '�', '�', 'ක', 'ා', 'ව'] (13 tokens)
|
| 138 |
|
| 139 |
+
'अंतर्राष्ट्रीय':
|
| 140 |
+
SGPE ['अंतर्राष्ट्रीय'] (1 tokens)
|
| 141 |
+
OpenAI (o200k_base) ['अ', 'ंतर', '्र', 'ाष्ट्रीय'] (4 tokens)
|
| 142 |
+
Llama 4 Scout ['अ', 'ंतर', '्र', 'ाष्ट्रीय'] (4 tokens)
|
| 143 |
+
DeepSeek V3 ['अ', 'ंत', 'र', '्र', 'ाष', '्ट', '्री', 'य'] (8 tokens)
|
| 144 |
|
| 145 |
+
'कृत्रिम बुद्धिमत्ता':
|
| 146 |
+
SGPE ['कृत्रिम', ' बुद्धिमत्ता'] (2 tokens)
|
| 147 |
+
OpenAI (o200k_base) ['क', 'ृ', 'त्र', 'िम', ' बुद्ध', 'िम', 'त्ता'] (7 tokens)
|
| 148 |
+
Llama 4 Scout ['क', 'ृ', 'त्र', 'िम', ' ब', 'ुद्ध', 'िम', 'त्ता'] (8 tokens)
|
| 149 |
+
DeepSeek V3 ['क', 'ृ', 'त्र', 'िम', ' ब', 'ुद', '्ध', 'िम', 'त्त', 'ा'] (10 tokens)
|
| 150 |
|
| 151 |
+
Evaluating 1,499,950 sentences...
|
| 152 |
|
| 153 |
+
====== Sinhala Results ======
|
| 154 |
+
Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
|
| 155 |
+
----------------------------------------------------------------------
|
| 156 |
+
SGPE | 6,665,177 | 1.276 | 4.83 | -
|
| 157 |
+
OpenAI (o200k_base) | 17,360,196 | 3.324 | 1.85 | 61.6%
|
| 158 |
+
Llama 4 Scout | 18,157,707 | 3.476 | 1.77 | 63.3%
|
| 159 |
+
DeepSeek V3 | 29,152,698 | 5.581 | 1.10 | 77.1%
|
| 160 |
|
| 161 |
+
====== Hindi Results ======
|
| 162 |
+
Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
|
| 163 |
+
----------------------------------------------------------------------
|
| 164 |
+
SGPE | 13,432,763 | 1.181 | 4.29 | -
|
| 165 |
+
OpenAI (o200k_base) | 18,394,075 | 1.617 | 3.13 | 27.0%
|
| 166 |
+
Llama 4 Scout | 19,566,121 | 1.720 | 2.94 | 31.3%
|
| 167 |
+
DeepSeek V3 | 31,682,218 | 2.786 | 1.82 | 57.6%
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
| 169 |
+
====== English Results ======
|
| 170 |
+
Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
|
| 171 |
+
----------------------------------------------------------------------
|
| 172 |
+
SGPE | 7,240,151 | 1.330 | 4.46 | -
|
| 173 |
+
OpenAI (o200k_base) | 7,420,527 | 1.364 | 4.35 | 2.4%
|
| 174 |
+
Llama 4 Scout | 7,512,843 | 1.381 | 4.30 | 3.6%
|
| 175 |
+
DeepSeek V3 | 7,904,670 | 1.453 | 4.09 | 8.4%
|
| 176 |
|
| 177 |
+
========================= OVERALL Results =========================
|
| 178 |
+
Tokenizer | Tokens | TWR | Chr/Tok | % Reduction
|
| 179 |
+
----------------------------------------------------------------------
|
| 180 |
+
SGPE | 27,338,091 | 1.241 | 4.47 | -
|
| 181 |
+
OpenAI (o200k_base) | 43,174,798 | 1.959 | 2.83 | 36.7%
|
| 182 |
+
Llama 4 Scout | 45,236,671 | 2.053 | 2.70 | 39.6%
|
| 183 |
+
DeepSeek V3 | 68,739,586 | 3.119 | 1.78 | 60.2%
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
================================================================================
|
| 186 |
BATTERY 4: ROUND-TRIP CONSISTENCY
|
| 187 |
================================================================================
|
| 188 |
|
| 189 |
+
Sentences tested: 1,499,950
|
| 190 |
+
Total words: 22,190,730
|
| 191 |
+
Total characters tested: 122,274,117
|
| 192 |
+
Total tokens generated: 27,503,859
|
| 193 |
Mismatches (non-UNK): 0
|
| 194 |
+
Mismatches (with UNK loss): 19,320
|
| 195 |
Crashes: 0
|
| 196 |
|
| 197 |
+
Result: PASS — Tested 1,499,950 sentences (122,274,117 chars). Non-UNK mismatches: 0, UNK-caused losses: 19320, Crashes: 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
================================================================================
|
| 200 |
+
BATTERY 5: BOUNDARY & LEADING SPACE EDGE-CASES
|
| 201 |
================================================================================
|
| 202 |
+
[✓] [B01-Sinhala-leading-space ] ' සිංහල' -> '[UNK]හල'
|
| 203 |
+
[✓] [B02-Sinhala-no-leading-space] 'සිංහල' -> '[UNK]හල'
|
| 204 |
+
[✓] [B03-Sinhala-trailing-punct ] 'සිංහල.' -> '[UNK]හල.'
|
| 205 |
+
[✓] [B04-Sinhala-multi-word ] 'දරුවන් පාසලට' -> 'දරුවන් පාසලට'
|
| 206 |
+
[✓] [D01-Devanagari-leading-space] ' हिंदी' -> '[UNK]दी'
|
| 207 |
+
[✓] [D02-Devanagari-no-leading ] 'नमस्ते' -> 'नमस्ते'
|
| 208 |
+
[✓] [D03-Devanagari-trailing-danda] 'नमस्ते।' -> 'नमस्ते।'
|
| 209 |
+
[✓] [D04-Devanagari-multi-word ] 'भारत देश' -> 'भारत देश'
|
| 210 |
+
[✓] [D05-Devanagari-anusvara ] 'संस्कृत' -> 'संस्कृत'
|
| 211 |
+
[✓] [F01-SinhalaEng ] 'සිංහලදABC' -> '[UNK]හලදABC'
|
| 212 |
+
[✓] [F02-DevanagariEng ] 'हिंदीDEF' -> '[UNK]दीDEF'
|
| 213 |
+
[✓] [F03-Sinhala-Devanagari ] 'සිංහල हिंदी' -> '[UNK]හල[UNK]दी'
|
| 214 |
+
[✓] [G01-Mixed-3-scripts ] ' සිංහල123ABCहिंदी ' -> '[UNK]හල123ABC[UNK]दी '
|
| 215 |
|
| 216 |
+
Result: PASS — Violations: 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
|
| 218 |
================================================================================
|
| 219 |
BATTERY 6: ZERO-BREAKAGE GUARANTEE
|
|
|
|
| 229 |
|
| 230 |
Result: PASS — Ran 1,703 exhaustive breakage tests. Violations: 0
|
| 231 |
|
| 232 |
+
================================================================================
|
| 233 |
+
BATTERY 6: ZERO-BREAKAGE GUARANTEE (v2 Multi-Script)
|
| 234 |
+
================================================================================
|
| 235 |
+
Testing Devanagari C + HAL + C pairs (implicit conjuncts)...
|
| 236 |
+
Testing Devanagari C + vowel_sign...
|
| 237 |
+
Testing Devanagari C + HAL (terminal virama)...
|
| 238 |
+
Testing Devanagari C + anusvara / visarga / chandrabindu...
|
| 239 |
+
Testing Devanagari C + vowel_sign + modifier...
|
| 240 |
+
|
| 241 |
+
Result: PASS — Devanagari Violations: 0
|
| 242 |
+
|
| 243 |
+
================================================================================
|
| 244 |
+
BATTERY 7: DEVANAGARI LINGUISTIC COMPLEXITY
|
| 245 |
+
================================================================================
|
| 246 |
+
|
| 247 |
+
Category Total Pass Fail
|
| 248 |
+
----------------------------------------------------
|
| 249 |
+
anusvara 1 1 0
|
| 250 |
+
anusvara_prefix 5 5 0
|
| 251 |
+
complex 2 2 0
|
| 252 |
+
conjunct 3 3 0
|
| 253 |
+
conjunct_anusvara 4 4 0
|
| 254 |
+
double_conjunct 1 1 0
|
| 255 |
+
double_conjunct_gen 470 470 0
|
| 256 |
+
extreme_compound 1 1 0
|
| 257 |
+
matra 3 3 0
|
| 258 |
+
sanskrit 4 4 0
|
| 259 |
+
simple 4 4 0
|
| 260 |
+
super_compound 1 1 0
|
| 261 |
+
very_complex 1 1 0
|
| 262 |
+
|
| 263 |
+
Result: PASS — Tested 500 Devanagari words. Violations: 0
|
| 264 |
+
|
| 265 |
+
================================================================================
|
| 266 |
+
BATTERY 8: CODE-SWITCHING INTEGRITY
|
| 267 |
+
================================================================================
|
| 268 |
+
[simple_sinhala_english ] 5 tokens | ['Hello', ',', ' ශ්\u200dරී', ' ලංකාව', '!']
|
| 269 |
+
[code_sinhala ] 5 tokens | ['const', ' x', ' =', ' ප්\u200dරකාශය', ';']
|
| 270 |
+
[devanagari_english ] 7 tokens | ['मेरा', ' नाम', ' है', ' और', ' I', ' love', ' Python']
|
| 271 |
+
[code_sinhala_mixed ] 9 tokens | ['function', ' foo', '()', ' {', ' return', " '", 'ශ්\u200dරී', "';"]
|
| 272 |
+
[sinhala_english_mixed ] 8 tokens | ['ශ', '\u200d', '්', '\u200d', 'රී', ' ලංකාව', ' is', ' beautiful']
|
| 273 |
+
[python_devanagari_comment ] 7 tokens | ['print', "('", 'नमस्ते', "')", ' #', ' Say', ' Hello']
|
| 274 |
+
[sinhala_english_complex ] 8 tokens | ['ඒ', ' කියන්නේ', ',', ' G', 'PE', ' Token', 'izer', ' English']
|
| 275 |
+
[python_sinhala_comment ] 10 tokens | ['for', ' i', ' in', ' range', '(', '10', '):', ' #']
|
| 276 |
+
[sql_devanagari ] 9 tokens | ['SELECT', ' *', ' FROM', ' users', ' WHERE', ' नाम', "='", 'राम']
|
| 277 |
+
[arrow_fn_sinhala ] 22 tokens | ['const', ' create', '_func', ' =', ' (', 'p', '1', ',']
|
| 278 |
+
[math_sinhala ] 6 tokens | ['123', ' +', ' ', '456', ' =', ' ෆ']
|
| 279 |
+
|
| 280 |
+
Result: PASS — Tested 13 code-switching cases. Violations: 0, Crashes: 0
|
| 281 |
+
|
| 282 |
+
================================================================================
|
| 283 |
+
BATTERY 9: META-VOCAB ROUND-TRIP (SGPEMetaEncoder)
|
| 284 |
+
================================================================================
|
| 285 |
+
|
| 286 |
+
Sentences: 1,499,950
|
| 287 |
+
Round-trip failures: 0 (100.00% lossless)
|
| 288 |
+
Avg tokens/sentence: 18.3
|
| 289 |
+
UNK rate: 0.08%
|
| 290 |
+
|
| 291 |
+
Result: PASS — Tested 1,499,950 sentences. Failures: 0, Crashes: 0, Lossless: 100.00%, UNK rate: 0.08%
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
████████████████████████████████████████████████████████████████████████████████
|
| 295 |
+
█ █
|
| 296 |
+
█ SGPE - BATTLE TEST REPORT █
|
| 297 |
+
█ █
|
| 298 |
+
████████████████████████████████████████████████████████████████████████████████
|
| 299 |
+
|
| 300 |
Test Battery Status Key Metric
|
| 301 |
────────────────────────────────────────────────────────────────────────────────
|
| 302 |
+
Linguistic Complexity (2K Sanskrit/Pali Words) ✓ PASS 0 violations
|
| 303 |
+
Glitched Token Detection (v2) ✓ PASS
|
| 304 |
+
Frontier Benchmarking (Stratified) ✓ PASS
|
| 305 |
+
Round-Trip Consistency (v2) ✓ PASS 0 mismatches
|
| 306 |
+
Boundary Edge-Cases (v2) ✓ PASS
|
| 307 |
+
Zero-Breakage Guarantee (Extended) ✓ PASS 0 violations
|
| 308 |
+
Zero-Breakage Guarantee (v2 Devanagari) ✓ PASS
|
| 309 |
+
Devanagari Linguistic Complexity ✓ PASS 0 violations
|
| 310 |
+
Code-Switching Integrity ✓ PASS 0 violations
|
| 311 |
+
Meta-Vocab Round-Trip (SGPEMetaEncoder) ✓ PASS
|
| 312 |
────────────────────────────────────────────────────────────────────────────────
|
| 313 |
+
TOTAL P:10 F:0 W:0
|
encoder.py
CHANGED
|
@@ -1,22 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import argparse
|
| 2 |
import json
|
|
|
|
| 3 |
|
| 4 |
-
from linguis_trie import LinguisTrie
|
| 5 |
from gpe_trainer import segment_into_words, _is_boundary_token
|
| 6 |
|
| 7 |
-
|
| 8 |
class SGPEEncoder:
|
| 9 |
|
| 10 |
def __init__(self, vocab_path: str):
|
| 11 |
with open(vocab_path, "r", encoding="utf-8") as f:
|
| 12 |
data = json.load(f)
|
| 13 |
|
| 14 |
-
self.vocab: dict[str, int]
|
| 15 |
-
self.merges: list[tuple[str, str]]
|
| 16 |
-
self.special_tokens: list[str]
|
| 17 |
-
self.tokenizer
|
| 18 |
-
self.unk_id
|
| 19 |
-
self.leading_space: bool
|
| 20 |
|
| 21 |
self._merge_priority: dict[tuple[str, str], int] = {
|
| 22 |
(a, b): rank for rank, (a, b) in enumerate(self.merges)
|
|
@@ -32,18 +40,15 @@ class SGPEEncoder:
|
|
| 32 |
|
| 33 |
while True:
|
| 34 |
best_rank = len(self.merges)
|
| 35 |
-
best_idx
|
| 36 |
-
|
| 37 |
for i in range(len(tokens) - 1):
|
| 38 |
pair = (tokens[i], tokens[i + 1])
|
| 39 |
rank = self._merge_priority.get(pair)
|
| 40 |
if rank is not None and rank < best_rank:
|
| 41 |
best_rank = rank
|
| 42 |
-
best_idx
|
| 43 |
-
|
| 44 |
if best_idx == -1:
|
| 45 |
break
|
| 46 |
-
|
| 47 |
merged = tokens[best_idx] + tokens[best_idx + 1]
|
| 48 |
tokens = tokens[:best_idx] + [merged] + tokens[best_idx + 2:]
|
| 49 |
|
|
@@ -51,21 +56,17 @@ class SGPEEncoder:
|
|
| 51 |
|
| 52 |
def tokenize(self, text: str) -> list[str]:
|
| 53 |
syllables = self.layer1_tokenize(text)
|
| 54 |
-
words
|
| 55 |
-
|
| 56 |
result: list[str] = []
|
| 57 |
for word_tokens in words:
|
| 58 |
if len(word_tokens) == 1 and _is_boundary_token(word_tokens[0]):
|
| 59 |
result.append(word_tokens[0])
|
| 60 |
continue
|
| 61 |
-
|
| 62 |
cleaned = [t if t in self.vocab else "[UNK]" for t in word_tokens]
|
| 63 |
result.extend(self._apply_merges_to_word(cleaned))
|
| 64 |
-
|
| 65 |
return result
|
| 66 |
|
| 67 |
def layer1_tokenize(self, text: str) -> list[str]:
|
| 68 |
-
"""Layer 1: Deterministic LinguisTrie pre-tokenization (Syllables)."""
|
| 69 |
return self.tokenizer.tokenize(text, leading_space=self.leading_space)
|
| 70 |
|
| 71 |
def decode(self, ids: list[int]) -> str:
|
|
@@ -73,18 +74,235 @@ class SGPEEncoder:
|
|
| 73 |
return "".join(id_to_token.get(i, "") for i in ids)
|
| 74 |
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
def main():
|
| 77 |
-
parser = argparse.ArgumentParser(description="
|
| 78 |
-
parser.add_argument("--vocab",
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
args = parser.parse_args()
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
|
| 90 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
==========================================
|
| 3 |
+
WWHO Encoder (Unified Meta-Vocabulary)
|
| 4 |
+
==========================================
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
import argparse
|
| 10 |
import json
|
| 11 |
+
from typing import Optional
|
| 12 |
|
| 13 |
+
from linguis_trie import LinguisTrie, build_sinhala_linguis_trie
|
| 14 |
from gpe_trainer import segment_into_words, _is_boundary_token
|
| 15 |
|
|
|
|
| 16 |
class SGPEEncoder:
|
| 17 |
|
| 18 |
def __init__(self, vocab_path: str):
|
| 19 |
with open(vocab_path, "r", encoding="utf-8") as f:
|
| 20 |
data = json.load(f)
|
| 21 |
|
| 22 |
+
self.vocab: dict[str, int] = data["vocab"]
|
| 23 |
+
self.merges: list[tuple[str, str]] = [tuple(m) for m in data["merges"]]
|
| 24 |
+
self.special_tokens: list[str] = data["special_tokens"]
|
| 25 |
+
self.tokenizer = build_sinhala_linguis_trie()
|
| 26 |
+
self.unk_id = self.vocab.get("[UNK]", 1)
|
| 27 |
+
self.leading_space: bool = data.get("leading_space", False)
|
| 28 |
|
| 29 |
self._merge_priority: dict[tuple[str, str], int] = {
|
| 30 |
(a, b): rank for rank, (a, b) in enumerate(self.merges)
|
|
|
|
| 40 |
|
| 41 |
while True:
|
| 42 |
best_rank = len(self.merges)
|
| 43 |
+
best_idx = -1
|
|
|
|
| 44 |
for i in range(len(tokens) - 1):
|
| 45 |
pair = (tokens[i], tokens[i + 1])
|
| 46 |
rank = self._merge_priority.get(pair)
|
| 47 |
if rank is not None and rank < best_rank:
|
| 48 |
best_rank = rank
|
| 49 |
+
best_idx = i
|
|
|
|
| 50 |
if best_idx == -1:
|
| 51 |
break
|
|
|
|
| 52 |
merged = tokens[best_idx] + tokens[best_idx + 1]
|
| 53 |
tokens = tokens[:best_idx] + [merged] + tokens[best_idx + 2:]
|
| 54 |
|
|
|
|
| 56 |
|
| 57 |
def tokenize(self, text: str) -> list[str]:
|
| 58 |
syllables = self.layer1_tokenize(text)
|
| 59 |
+
words = segment_into_words(syllables)
|
|
|
|
| 60 |
result: list[str] = []
|
| 61 |
for word_tokens in words:
|
| 62 |
if len(word_tokens) == 1 and _is_boundary_token(word_tokens[0]):
|
| 63 |
result.append(word_tokens[0])
|
| 64 |
continue
|
|
|
|
| 65 |
cleaned = [t if t in self.vocab else "[UNK]" for t in word_tokens]
|
| 66 |
result.extend(self._apply_merges_to_word(cleaned))
|
|
|
|
| 67 |
return result
|
| 68 |
|
| 69 |
def layer1_tokenize(self, text: str) -> list[str]:
|
|
|
|
| 70 |
return self.tokenizer.tokenize(text, leading_space=self.leading_space)
|
| 71 |
|
| 72 |
def decode(self, ids: list[int]) -> str:
|
|
|
|
| 74 |
return "".join(id_to_token.get(i, "") for i in ids)
|
| 75 |
|
| 76 |
|
| 77 |
+
# ============================================================================
|
| 78 |
+
# MetaVocab — unified ID space
|
| 79 |
+
# ============================================================================
|
| 80 |
+
|
| 81 |
+
class MetaVocab:
|
| 82 |
+
def __init__(self, sgpe_vocab: dict[str, int], tiktoken_size: int):
|
| 83 |
+
self.tiktoken_size: int = tiktoken_size
|
| 84 |
+
self._sgpe_raw: dict[str, int] = sgpe_vocab
|
| 85 |
+
self._sgpe_offset: dict[str, int] = {
|
| 86 |
+
tok: idx + tiktoken_size for tok, idx in sgpe_vocab.items()
|
| 87 |
+
}
|
| 88 |
+
self._sgpe_reverse: dict[int, str] = {
|
| 89 |
+
v: k for k, v in self._sgpe_offset.items()
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
@property
|
| 93 |
+
def total_size(self) -> int:
|
| 94 |
+
return self.tiktoken_size + len(self._sgpe_raw)
|
| 95 |
+
|
| 96 |
+
def encode_sgpe_token(self, token: str, unk_id_raw: int) -> int:
|
| 97 |
+
return self._sgpe_offset.get(token, unk_id_raw + self.tiktoken_size)
|
| 98 |
+
|
| 99 |
+
def decode_id(self, uid: int) -> Optional[str]:
|
| 100 |
+
if uid < self.tiktoken_size:
|
| 101 |
+
return None
|
| 102 |
+
return self._sgpe_reverse.get(uid)
|
| 103 |
+
|
| 104 |
+
def is_tiktoken_id(self, uid: int) -> bool:
|
| 105 |
+
return uid < self.tiktoken_size
|
| 106 |
+
|
| 107 |
+
def sgpe_unk_id(self, raw_unk: int) -> int:
|
| 108 |
+
return raw_unk + self.tiktoken_size
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ============================================================================
|
| 112 |
+
# WWHOMetaEncoder
|
| 113 |
+
# ============================================================================
|
| 114 |
+
|
| 115 |
+
class WWHOMetaEncoder:
|
| 116 |
+
|
| 117 |
+
def __init__(self, vocab_path: str, tiktoken_model: str = "o200k_base"):
|
| 118 |
+
# Load SGPE vocab
|
| 119 |
+
with open(vocab_path, "r", encoding="utf-8") as f:
|
| 120 |
+
data = json.load(f)
|
| 121 |
+
|
| 122 |
+
sgpe_vocab: dict[str, int] = data["vocab"]
|
| 123 |
+
self._merges: list[tuple[str, str]] = [tuple(m) for m in data["merges"]]
|
| 124 |
+
self._special_tokens: list[str] = data["special_tokens"]
|
| 125 |
+
self._leading_space: bool = data.get("leading_space", False)
|
| 126 |
+
self._raw_unk_id: int = sgpe_vocab.get("[UNK]", 1)
|
| 127 |
+
|
| 128 |
+
if " " not in sgpe_vocab:
|
| 129 |
+
next_id = max(sgpe_vocab.values()) + 1
|
| 130 |
+
sgpe_vocab[" "] = next_id
|
| 131 |
+
|
| 132 |
+
try:
|
| 133 |
+
from router import _INDIC_PUNCT_CHARS
|
| 134 |
+
for ch in _INDIC_PUNCT_CHARS:
|
| 135 |
+
if ch not in sgpe_vocab:
|
| 136 |
+
next_id = max(sgpe_vocab.values()) + 1
|
| 137 |
+
sgpe_vocab[ch] = next_id
|
| 138 |
+
except ImportError:
|
| 139 |
+
pass
|
| 140 |
+
|
| 141 |
+
self._merge_priority: dict[tuple[str, str], int] = {
|
| 142 |
+
(a, b): rank for rank, (a, b) in enumerate(self._merges)
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
# tiktoken
|
| 146 |
+
try:
|
| 147 |
+
import tiktoken as _tiktoken
|
| 148 |
+
self._tik = _tiktoken.get_encoding(tiktoken_model)
|
| 149 |
+
except Exception as e:
|
| 150 |
+
raise RuntimeError(
|
| 151 |
+
f"tiktoken ({tiktoken_model!r}) unavailable: {e}. "
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
# Unified vocab
|
| 155 |
+
self._meta = MetaVocab(sgpe_vocab, self._tik.n_vocab)
|
| 156 |
+
self._space_id: int = self._meta._sgpe_offset[" "]
|
| 157 |
+
|
| 158 |
+
# Router
|
| 159 |
+
from router import CodeSwitchSegmenter, Script
|
| 160 |
+
self._segmenter = CodeSwitchSegmenter()
|
| 161 |
+
self._Script = Script
|
| 162 |
+
|
| 163 |
+
# Indic LinguisTries
|
| 164 |
+
from linguis_trie import build_sinhala_linguis_trie, build_devanagari_linguis_trie
|
| 165 |
+
self._sinhala_dfa = build_sinhala_linguis_trie()
|
| 166 |
+
self._devanagari_dfa = build_devanagari_linguis_trie()
|
| 167 |
+
|
| 168 |
+
# ------------------------------------------------------------------
|
| 169 |
+
# Public API
|
| 170 |
+
# ------------------------------------------------------------------
|
| 171 |
+
|
| 172 |
+
@property
|
| 173 |
+
def vocab_size(self) -> int:
|
| 174 |
+
return self._meta.total_size
|
| 175 |
+
|
| 176 |
+
@property
|
| 177 |
+
def tiktoken_size(self) -> int:
|
| 178 |
+
return self._meta.tiktoken_size
|
| 179 |
+
|
| 180 |
+
@property
|
| 181 |
+
def vocab(self) -> dict[str, int]:
|
| 182 |
+
return self._meta._sgpe_raw
|
| 183 |
+
|
| 184 |
+
def encode(self, text: str) -> list[int]:
|
| 185 |
+
ids: list[int] = []
|
| 186 |
+
for seg in self._segmenter.segment(text):
|
| 187 |
+
if seg.script == self._Script.LATIN:
|
| 188 |
+
ids.extend(self._tik.encode(seg.text))
|
| 189 |
+
else:
|
| 190 |
+
dfa = (
|
| 191 |
+
self._sinhala_dfa
|
| 192 |
+
if seg.script == self._Script.SINHALA
|
| 193 |
+
else self._devanagari_dfa
|
| 194 |
+
)
|
| 195 |
+
syllables = dfa.tokenize(seg.text, leading_space=seg.has_leading_space)
|
| 196 |
+
words = segment_into_words(syllables)
|
| 197 |
+
for word_toks in words:
|
| 198 |
+
if len(word_toks) == 1 and _is_boundary_token(word_toks[0]):
|
| 199 |
+
ids.extend(self._tik.encode(word_toks[0]))
|
| 200 |
+
continue
|
| 201 |
+
merged = self._apply_merges(word_toks)
|
| 202 |
+
for tok in merged:
|
| 203 |
+
ids.append(self._meta.encode_sgpe_token(tok, self._raw_unk_id))
|
| 204 |
+
return ids
|
| 205 |
+
|
| 206 |
+
def decode(self, ids: list[int]) -> str:
    """Decode unified IDs back to text.

    Consecutive tiktoken IDs are buffered and decoded together (byte-level
    BPE pieces are only valid as a run); SGPE IDs are decoded one at a
    time via the meta-vocabulary, with unknown IDs rendered as "".
    """
    pieces: list[str] = []
    pending: list[int] = []

    def flush() -> None:
        # Decode and drain any buffered tiktoken run.
        if pending:
            pieces.append(self._tik.decode(pending))
            pending.clear()

    for uid in ids:
        if self._meta.is_tiktoken_id(uid):
            pending.append(uid)
            continue
        flush()
        tok = self._meta.decode_id(uid)
        pieces.append(tok if tok is not None else "")

    flush()
    return "".join(pieces)
|
| 225 |
+
|
| 226 |
+
def tokenize(self, text: str) -> list[str]:
    """Return the token strings (not IDs) for mixed-script text.

    Mirrors encode(): Latin and boundary words yield per-ID tiktoken
    piece strings; Indic words yield their merged SGPE tokens.
    """
    out: list[str] = []
    for segment in self._segmenter.segment(text):
        if segment.script == self._Script.LATIN:
            for tid in self._tik.encode(segment.text):
                out.append(self._tik.decode([tid]))
            continue

        is_sinhala = segment.script == self._Script.SINHALA
        dfa = self._sinhala_dfa if is_sinhala else self._devanagari_dfa

        syllables = dfa.tokenize(segment.text, leading_space=segment.has_leading_space)
        for word in segment_into_words(syllables):
            if len(word) == 1 and _is_boundary_token(word[0]):
                # Boundary word: show the tiktoken piece strings.
                for tid in self._tik.encode(word[0]):
                    out.append(self._tik.decode([tid]))
            else:
                out.extend(self._apply_merges(word))
    return out
|
| 247 |
+
|
| 248 |
+
def _apply_merges(self, tokens: list[str]) -> list[str]:
    """Greedily apply learned BPE merges to one word's syllable tokens.

    Tokens missing from the SGPE vocabulary are first replaced by "[UNK]".
    On each pass the lowest-ranked (highest-priority) adjacent pair is
    fused; iteration stops when no adjacent pair has a learned merge.
    """
    if len(tokens) < 2:
        return tokens

    vocab = self._meta._sgpe_raw
    seq = [tok if tok in vocab else "[UNK]" for tok in tokens]

    sentinel = len(self._merges)  # rank meaning "no merge known"
    while len(seq) > 1:
        ranks = [self._merge_priority.get(pair, sentinel)
                 for pair in zip(seq, seq[1:])]
        idx = min(range(len(ranks)), key=ranks.__getitem__)
        if ranks[idx] >= sentinel:
            break  # no mergeable pair left
        seq[idx:idx + 2] = [seq[idx] + seq[idx + 1]]
    return seq
|
| 267 |
+
|
| 268 |
+
|
| 269 |
+
# ============================================================================
|
| 270 |
+
# CLI
|
| 271 |
+
# ============================================================================
|
| 272 |
+
|
| 273 |
def main():
    """CLI entry point: encode --text with either the pure SGPE encoder
    ('sgpe' mode) or the unified meta-encoder ('meta' mode), then print
    tokens, IDs, and (for meta) a lossless round-trip check."""
    parser = argparse.ArgumentParser(description="WWHO Encoder (Unified Meta-Vocabulary)")
    parser.add_argument("--vocab", type=str, default="output/vocab.json",
                        help="Path to WWHO vocab.json")
    parser.add_argument("--text", type=str, required=True,
                        help="Text to encode (supports mixed Latin + Indic)")
    parser.add_argument("--mode", type=str, default="meta",
                        choices=["sgpe", "meta"],
                        help="'sgpe' = pure SGPE encoder; 'meta' = unified meta-encoder")
    parser.add_argument("--tiktoken_model", type=str, default="o200k_base")
    args = parser.parse_args()

    if args.mode == "sgpe":
        # Pure SGPE path: no tiktoken fallback, SGPE vocabulary only.
        enc = SGPEEncoder(args.vocab)
        tokens = enc.tokenize(args.text)
        ids = enc.encode(args.text)
        print(f"[SGPEEncoder]")
        print(f" tokens : {tokens}")
        print(f" ids : {ids}")
        print(f" count : {len(tokens)}")
    else:
        # Unified path: tiktoken for Latin + SGPE for Indic scripts.
        enc = WWHOMetaEncoder(args.vocab, tiktoken_model=args.tiktoken_model)
        tokens = enc.tokenize(args.text)
        ids = enc.encode(args.text)
        decoded = enc.decode(ids)
        print(f"[WWHOMetaEncoder]")
        print(f" vocab_size : {enc.vocab_size:,} "
              f"(tiktoken={enc.tiktoken_size:,} + SGPE={enc.vocab_size - enc.tiktoken_size:,})")
        print(f" tokens : {tokens}")
        print(f" ids : {ids}")
        print(f" count : {len(tokens)}")
        print(f" decoded: {decoded!r}")
        print(f" lossless: {decoded == args.text}")
|
| 306 |
|
| 307 |
|
| 308 |
if __name__ == "__main__":
|
linguis_trie.py
CHANGED
|
@@ -1,119 +1,146 @@
|
|
| 1 |
"""
|
| 2 |
==========================================
|
| 3 |
-
|
| 4 |
==========================================
|
| 5 |
"""
|
| 6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# --- Independent vowels (svara) ---
|
| 13 |
-
VOWELS: set[str] = {
|
| 14 |
-
'\u0D85', # අ
|
| 15 |
-
'\u0D86', # ආ
|
| 16 |
-
'\u0D87', # ඇ
|
| 17 |
-
'\u0D88', # ඈ
|
| 18 |
-
'\u0D89', # ඉ
|
| 19 |
-
'\u0D8A', # ඊ
|
| 20 |
-
'\u0D8B', # උ
|
| 21 |
-
'\u0D8C', # ඌ
|
| 22 |
-
'\u0D8D', # ඍ
|
| 23 |
-
'\u0D8E', # ඎ
|
| 24 |
-
'\u0D8F', # ඏ
|
| 25 |
-
'\u0D90', # ඐ
|
| 26 |
-
'\u0D91', # එ
|
| 27 |
-
'\u0D92', # ඒ
|
| 28 |
-
'\u0D93', # ඓ
|
| 29 |
-
'\u0D94', # ඔ
|
| 30 |
-
'\u0D95', # ඕ
|
| 31 |
-
'\u0D96', # ඖ
|
| 32 |
-
}
|
| 33 |
-
|
| 34 |
-
# --- Consonants (vyanjana) ---
|
| 35 |
-
CONSONANTS: set[str] = {chr(c) for c in range(0x0D9A, 0x0DC7)}
|
| 36 |
-
|
| 37 |
-
# --- Dependent vowel signs (pili) ---
|
| 38 |
-
VOWEL_SIGNS: set[str] = {
|
| 39 |
-
'\u0DCF', # ා
|
| 40 |
-
'\u0DD0', # ැ
|
| 41 |
-
'\u0DD1', # ෑ
|
| 42 |
-
'\u0DD2', # ි
|
| 43 |
-
'\u0DD3', # ී
|
| 44 |
-
'\u0DD4', # ු
|
| 45 |
-
'\u0DD5', # (rare/archaic)
|
| 46 |
-
'\u0DD6', # ූ
|
| 47 |
-
'\u0DD7', # (rare/archaic)
|
| 48 |
-
'\u0DD8', # ෘ
|
| 49 |
-
'\u0DD9', # ෙ
|
| 50 |
-
'\u0DDA', # ේ
|
| 51 |
-
'\u0DDB', # ෛ
|
| 52 |
-
'\u0DDC', # ො
|
| 53 |
-
'\u0DDD', # ෝ
|
| 54 |
-
'\u0DDE', # ෞ
|
| 55 |
-
'\u0DDF', # ෟ
|
| 56 |
-
'\u0DF2', # ෲ
|
| 57 |
-
'\u0DF3', # ෳ
|
| 58 |
-
}
|
| 59 |
-
|
| 60 |
-
# --- Post-consonant modifiers (anusvara, visarga) ---
|
| 61 |
-
POST_MODIFIERS: set[str] = {
|
| 62 |
-
'\u0D82', # ං anusvara
|
| 63 |
-
'\u0D83', # ඃ visarga
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
def _is_consonant(ch: str) -> bool:
|
| 69 |
-
return ch in CONSONANTS
|
| 70 |
-
|
| 71 |
-
def _is_vowel(ch: str) -> bool:
|
| 72 |
-
return ch in VOWELS
|
| 73 |
-
|
| 74 |
-
def _is_vowel_sign(ch: str) -> bool:
|
| 75 |
-
return ch in VOWEL_SIGNS
|
| 76 |
-
|
| 77 |
-
def _is_post_modifier(ch: str) -> bool:
|
| 78 |
-
return ch in POST_MODIFIERS
|
| 79 |
-
|
| 80 |
-
def _is_hal(ch: str) -> bool:
|
| 81 |
-
return ch == HAL
|
| 82 |
-
|
| 83 |
-
def _is_zwj(ch: str) -> bool:
|
| 84 |
-
return ch == ZWJ
|
| 85 |
-
|
| 86 |
-
def _is_sinhala(ch: str) -> bool:
|
| 87 |
-
"""Any character in the Sinhala Unicode block or ZWJ."""
|
| 88 |
-
cp = ord(ch)
|
| 89 |
-
return (0x0D80 <= cp <= 0x0DFF) or cp == 0x200D
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
class LinguisTrie:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
def tokenize(self, text: str, leading_space: bool = False) -> list[str]:
|
| 95 |
-
"""
|
| 96 |
-
Tokenize Sinhala text into atomic syllable tokens.
|
| 97 |
-
Example: "මම යනවා" → [" මම", " ය", "න", "වා"]
|
| 98 |
-
"""
|
| 99 |
tokens: list[str] = []
|
| 100 |
-
n
|
| 101 |
-
pos
|
| 102 |
-
|
|
|
|
| 103 |
|
| 104 |
while pos < n:
|
| 105 |
ch = text[pos]
|
| 106 |
|
| 107 |
-
# ─── Whitespace handling (leading-space mode) ─────────
|
| 108 |
-
if leading_space and ch in (
|
| 109 |
ws_buffer = ""
|
| 110 |
-
while pos < n and text[pos] in (
|
| 111 |
ws_buffer += text[pos]
|
| 112 |
pos += 1
|
| 113 |
-
|
| 114 |
-
if ws_buffer.endswith(
|
| 115 |
for ws_char in ws_buffer[:-1]:
|
| 116 |
-
|
| 117 |
pending_space = " "
|
| 118 |
else:
|
| 119 |
for ws_char in ws_buffer:
|
|
@@ -121,92 +148,119 @@ class LinguisTrie:
|
|
| 121 |
pending_space = ""
|
| 122 |
continue
|
| 123 |
|
| 124 |
-
# ───
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
pos += 1
|
| 128 |
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
pos += 3
|
| 136 |
-
continue
|
| 137 |
-
else:
|
| 138 |
-
# Stray HAL+ZWJ at end — absorb HAL+ZWJ
|
| 139 |
-
pos += 2
|
| 140 |
-
break
|
| 141 |
-
|
| 142 |
-
elif pos + 1 < n and _is_consonant(text[pos + 1]):
|
| 143 |
-
# HAL + C (implicit conjunct, no ZWJ)
|
| 144 |
-
pos += 2
|
| 145 |
-
continue
|
| 146 |
-
|
| 147 |
-
else:
|
| 148 |
-
break
|
| 149 |
-
|
| 150 |
-
# ── Post-cluster modifiers ──
|
| 151 |
-
|
| 152 |
-
if pos < n and _is_vowel_sign(text[pos]):
|
| 153 |
-
pos += 1 # pili
|
| 154 |
-
elif pos < n and _is_hal(text[pos]):
|
| 155 |
-
pos += 1 # virama
|
| 156 |
-
|
| 157 |
-
if pos < n and _is_post_modifier(text[pos]):
|
| 158 |
-
pos += 1 # anusvara/visarga
|
| 159 |
-
|
| 160 |
-
tokens.append(pending_space + text[start:pos])
|
| 161 |
-
pending_space = ""
|
| 162 |
-
continue
|
| 163 |
-
|
| 164 |
-
# ─── Independent vowel ────────────────────────────────
|
| 165 |
-
if _is_vowel(ch):
|
| 166 |
-
start = pos
|
| 167 |
pos += 1
|
| 168 |
-
|
| 169 |
-
# Vowel + post-modifier (e.g. අං)
|
| 170 |
-
if pos < n and _is_post_modifier(text[pos]):
|
| 171 |
-
pos += 1
|
| 172 |
-
|
| 173 |
-
tokens.append(pending_space + text[start:pos])
|
| 174 |
-
pending_space = ""
|
| 175 |
continue
|
| 176 |
|
| 177 |
-
|
| 178 |
-
if _is_post_modifier(ch) or _is_hal(ch) or _is_vowel_sign(ch):
|
| 179 |
tokens.append(pending_space + ch)
|
| 180 |
pending_space = ""
|
| 181 |
pos += 1
|
| 182 |
continue
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
else:
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
if pending_space:
|
| 193 |
tokens.append(pending_space)
|
| 194 |
|
| 195 |
return tokens
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
|
|
|
|
|
|
| 198 |
|
| 199 |
-
def build_linguistrie() -> LinguisTrie:
|
| 200 |
-
"""Build and return the LinguisTrie."""
|
| 201 |
-
return LinguisTrie()
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
"ශ්රී ලංකා ද්වීපයේ ස්වෛරීභාවය සහ ත්රිවිධ හමුදාව.",
|
| 211 |
"භාෂාවේ ප්රෞඪත්වය විදහාපායි",
|
| 212 |
"ආචාර්යවරයාගේ වෛද්ය විද්යා පර්යේෂණය සාර්ථකයි.",
|
|
@@ -214,13 +268,42 @@ if __name__ == '__main__':
|
|
| 214 |
"මම ක්ෂණිකව ගඟට පැන්නා",
|
| 215 |
"සඤ්ඤක ක්ෂමතාවය ක්රමය සහ ඥානය",
|
| 216 |
"ද්වී ත්වේ ලං කඃ",
|
| 217 |
-
"න්ද්රී ක්ෂි ඤ්ඤ",
|
| 218 |
"2026 වසරේ AI තාක්ෂණය 60% දියුණුයි!",
|
| 219 |
]
|
| 220 |
|
| 221 |
-
for text in
|
| 222 |
-
|
| 223 |
-
print(f"Input:
|
| 224 |
-
print(f"
|
| 225 |
-
print(f"Count:
|
| 226 |
-
print("-" *
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
==========================================
|
| 3 |
+
Table-Driven DFA Tokenizer
|
| 4 |
==========================================
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import os
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# ---------------------------------------------------------------------------
|
| 16 |
+
# Schema loading and validation
|
| 17 |
+
# ---------------------------------------------------------------------------
|
| 18 |
+
|
| 19 |
+
class SchemaError(ValueError):
    """Raised when a language schema JSON is malformed or incomplete."""


@dataclass
class LanguageSchema:
    """Declarative description of one script's syllable DFA, loaded from a
    schema JSON file.

    Attributes:
        language:         schema's language name.
        grammar_notation: human-readable grammar string for diagnostics.
        char_classes:     class-label -> set of codepoints.
        transitions:      state -> (class-label -> next_state | None).
        start_state:      DFA start state name.
        accept_states:    states at which a syllable may end.
        emit_states:      states that immediately terminate a token.
    """
    language: str
    grammar_notation: str
    char_classes: dict[str, set[int]]  # class-label -> set of codepoints
    transitions: dict[str, dict[str, Optional[str]]]  # state -> (class -> next_state | None)
    start_state: str
    accept_states: set[str]
    emit_states: set[str]

    def get_regex(self) -> str:
        """Return a regex character class matching every codepoint known to
        this schema (e.g. "[...]+"), or "" when no classes are defined.

        Fix: the original joined an *unordered set* of characters, so the
        pattern text differed between runs; codepoints are now de-duplicated
        and sorted, making the output deterministic (and still equivalent).
        """
        cps: set[int] = set()
        for class_cps in self.char_classes.values():
            cps.update(class_cps)

        if not cps:
            return ""

        escaped: list[str] = []
        for cp in sorted(cps):
            ch = chr(cp)
            # These characters are special inside a regex character class.
            if ch in ('-', ']', '\\', '^'):
                escaped.append('\\' + ch)
            else:
                escaped.append(ch)
        return f"[{''.join(escaped)}]+"
|
| 50 |
+
|
| 51 |
|
| 52 |
+
class SchemaLoader:
    """Loads and validates a language-schema JSON file into a LanguageSchema."""

    def load(self, path: str) -> LanguageSchema:
        """Parse the schema at *path*.

        Raises SchemaError when the required 'char_classes' or 'dfa' keys
        are absent. Class labels starting with "_" are treated as private
        commentary entries and skipped. Codepoints are given in hex, either
        as inclusive [lo, hi] ranges or as individual "codepoints" entries.
        """
        with open(path, "r", encoding="utf-8") as fh:
            raw = json.load(fh)

        for required in ("char_classes", "dfa"):
            if required not in raw:
                raise SchemaError(f"[{path}] Missing '{required}' key.")

        char_classes: dict[str, set[int]] = {}
        for label, spec in raw["char_classes"].items():
            if label.startswith("_"):
                continue  # commentary/private entry, not a real class
            members: set[int] = set()
            for rng in spec.get("ranges", []):
                members.update(range(int(rng[0], 16), int(rng[1], 16) + 1))
            members.update(int(cp_hex, 16) for cp_hex in spec.get("codepoints", []))
            char_classes[label] = members

        dfa_spec = raw["dfa"]
        return LanguageSchema(
            language=raw.get("language", "unknown"),
            grammar_notation=raw.get("grammar_notation", ""),
            char_classes=char_classes,
            transitions=dfa_spec.get("transitions", {}),
            start_state=dfa_spec.get("start", "START"),
            accept_states=set(dfa_spec.get("accept_states", [])),
            emit_states=set(dfa_spec.get("emit_states", [])),
        )
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
# Codepoint classifier
|
| 96 |
+
# ---------------------------------------------------------------------------
|
| 97 |
+
|
| 98 |
+
class CharClassifier:
    """Maps single characters to their schema char-class label."""

    def __init__(self, schema: LanguageSchema):
        # Build codepoint -> label table. When two classes claim the same
        # codepoint, the first label (schema insertion order) wins.
        table: dict[int, str] = {}
        for label, codepoints in schema.char_classes.items():
            for cp in codepoints:
                table.setdefault(cp, label)
        self._table = table

    def classify(self, ch: str) -> str:
        """Return the class label for *ch*, or "O" (other) when unknown."""
        return self._table.get(ord(ch), "O")


class LinguisTrie:
    """Table-driven DFA syllable tokenizer for one script, configured
    entirely by a LanguageSchema."""

    def __init__(self, schema: LanguageSchema):
        self._schema = schema
        self._classifier = CharClassifier(schema)
        self._transitions = schema.transitions
        self._start = schema.start_state
        self._accept = schema.accept_states
        self._emit = schema.emit_states

    def tokenize(self, text: str, leading_space: bool = False) -> list[str]:
        """Split *text* into syllable tokens via longest-accepted DFA runs.

        In leading-space mode, the single space that precedes a token is
        carried onto that token as a prefix; any other whitespace becomes
        standalone one-character tokens. Characters that cannot start a
        syllable are emitted alone.
        """
        WS = (" ", "\t", "\n", "\r")
        classify = self._classifier.classify
        trans = self._transitions

        tokens: list[str] = []
        n = len(text)
        i = 0

        # If the segment had an absorbed boundary space, prefix it onto the
        # first real token (unless the text itself starts with whitespace).
        pending = " " if (leading_space and text and text[0] not in WS) else ""

        while i < n:
            ch = text[i]

            # ─── Whitespace run (leading-space mode only) ───
            if leading_space and ch in WS:
                j = i
                while j < n and text[j] in WS:
                    j += 1
                run = text[i:j]
                i = j
                if run.endswith(" "):
                    # Final space becomes the next token's prefix.
                    tokens.extend(run[:-1])
                    pending = " "
                else:
                    tokens.extend(run)
                    pending = ""
                continue

            first = trans.get(self._start, {}).get(classify(ch))

            if first is None:
                # Character cannot start a syllable: emit it alone.
                tokens.append(pending + ch if pending else ch)
                pending = ""
                i += 1
                continue

            if first in self._emit:
                # Single-character emit state: token ends immediately.
                tokens.append(pending + ch)
                pending = ""
                i += 1
                continue

            # ─── Longest-match scan ───
            start = i
            state = first
            i += 1
            best_end = i if state in self._accept else -1

            while i < n:
                nxt = trans.get(state, {}).get(classify(text[i]))
                if nxt is None:
                    break
                state = nxt
                i += 1
                if state in self._accept:
                    best_end = i
                elif state in self._emit:
                    best_end = i
                    break

            # Fall back to the raw scan end when no accept state was seen.
            end = best_end if best_end > start else i
            tokens.append(pending + text[start:end])
            pending = ""
            i = end

        if pending:
            tokens.append(pending)

        return tokens

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @property
    def language(self) -> str:
        """Language name from the schema."""
        return self._schema.language

    @property
    def regex(self) -> str:
        """Catch-all character-class regex derived from the schema."""
        return self._schema.get_regex()

    @property
    def grammar(self) -> str:
        """Human-readable grammar notation from the schema."""
        return self._schema.grammar_notation
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
# ---------------------------------------------------------------------------
|
| 224 |
+
# Factory
|
| 225 |
+
# ---------------------------------------------------------------------------
|
| 226 |
+
|
| 227 |
+
# Directory holding per-language schema JSON files, resolved relative to this
# module so imports work regardless of the current working directory.
_SCHEMA_DIR = os.path.join(os.path.dirname(__file__), "schemas")

_schema_loader = SchemaLoader()
_dfa_cache: dict[str, LinguisTrie] = {}


def build_linguis_trie(schema_path: str) -> LinguisTrie:
    """Build (or return a cached) LinguisTrie for *schema_path*.

    Fix: the cache is now keyed on the absolute path, so the same schema
    reached via different relative spellings shares one DFA instance
    (previously the raw string keyed the cache, duplicating work).
    """
    key = os.path.abspath(schema_path)
    if key not in _dfa_cache:
        _dfa_cache[key] = LinguisTrie(_schema_loader.load(schema_path))
    return _dfa_cache[key]


def build_sinhala_linguis_trie() -> LinguisTrie:
    """Factory for the Sinhala syllable DFA (schemas/sinhala.json)."""
    return build_linguis_trie(os.path.join(_SCHEMA_DIR, "sinhala.json"))


def build_devanagari_linguis_trie() -> LinguisTrie:
    """Factory for the Devanagari syllable DFA (schemas/devanagari.json)."""
    return build_linguis_trie(os.path.join(_SCHEMA_DIR, "devanagari.json"))
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
# ---------------------------------------------------------------------------
|
| 249 |
+
# Self-test
|
| 250 |
+
# ---------------------------------------------------------------------------
|
| 251 |
+
|
| 252 |
+
if __name__ == "__main__":
|
| 253 |
+
import sys
|
| 254 |
+
|
| 255 |
+
print("=" * 65)
|
| 256 |
+
print("DFA Tokenizer — self-test")
|
| 257 |
+
print("=" * 65)
|
| 258 |
+
|
| 259 |
+
# --- Sinhala ---
|
| 260 |
+
sinhala_dfa = build_sinhala_linguis_trie()
|
| 261 |
+
print(f"\n[Sinhala DFA] grammar: {sinhala_dfa.grammar}\n")
|
| 262 |
+
|
| 263 |
+
sinhala_tests = [
|
| 264 |
"ශ්රී ලංකා ද්වීපයේ ස්වෛරීභාවය සහ ත්රිවිධ හමුදාව.",
|
| 265 |
"භාෂාවේ ප්රෞඪත්වය විදහාපායි",
|
| 266 |
"ආචාර්යවරයාගේ වෛද්ය විද්යා පර්යේෂණය සාර්ථකයි.",
|
|
|
|
| 268 |
"මම ක්ෂණිකව ගඟට පැන්නා",
|
| 269 |
"සඤ්ඤක ක්ෂමතාවය ක්රමය සහ ඥානය",
|
| 270 |
"ද්වී ත්වේ ලං කඃ",
|
|
|
|
| 271 |
"2026 වසරේ AI තාක්ෂණය 60% දියුණුයි!",
|
| 272 |
]
|
| 273 |
|
| 274 |
+
for text in sinhala_tests:
|
| 275 |
+
toks = sinhala_dfa.tokenize(text, leading_space=True)
|
| 276 |
+
print(f" Input : {text}")
|
| 277 |
+
print(f" Syllables: {toks}")
|
| 278 |
+
print(f" Count : {len(toks)}")
|
| 279 |
+
print("-" * 65)
|
| 280 |
+
|
| 281 |
+
# --- Devanagari ---
|
| 282 |
+
deva_dfa = build_devanagari_linguis_trie()
|
| 283 |
+
print(f"\n[Devanagari DFA] grammar: {deva_dfa.grammar}\n")
|
| 284 |
+
|
| 285 |
+
deva_tests = [
|
| 286 |
+
"नमस्ते",
|
| 287 |
+
"भारत",
|
| 288 |
+
"हिन्दी",
|
| 289 |
+
"संस्कृत",
|
| 290 |
+
"क़िला",
|
| 291 |
+
"ज़िंदगी",
|
| 292 |
+
"प्रेम",
|
| 293 |
+
"द्वारा",
|
| 294 |
+
"श्रीमान्",
|
| 295 |
+
"हिन्दुस्तान",
|
| 296 |
+
"नमस्कार दुनिया",
|
| 297 |
+
"मैं ठीक हूँ",
|
| 298 |
+
"विद्यालय में पढ़ाई होती है।",
|
| 299 |
+
]
|
| 300 |
+
|
| 301 |
+
for text in deva_tests:
|
| 302 |
+
toks = deva_dfa.tokenize(text, leading_space=True)
|
| 303 |
+
print(f" Input : {text}")
|
| 304 |
+
print(f" Syllables: {toks}")
|
| 305 |
+
print(f" Count : {len(toks)}")
|
| 306 |
+
print("-" * 65)
|
| 307 |
+
|
| 308 |
+
print("\nAll self-tests complete.")
|
| 309 |
+
sys.exit(0)
|
meta_config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"tiktoken_model": "o200k_base",
|
| 3 |
+
"tiktoken_vocab_size": 200019,
|
| 4 |
+
"sgpe_vocab_size": 128000,
|
| 5 |
+
"sgpe_id_offset": 200019,
|
| 6 |
+
"script_mode": "mixed",
|
| 7 |
+
"sgpe_vocab_path": "vocab.json"
|
| 8 |
+
}
|
router.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
==========================================
|
| 3 |
+
Code-Switching Router
|
| 4 |
+
==========================================
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
from dataclasses import dataclass
|
| 11 |
+
from enum import Enum, auto
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
+
import tiktoken
|
| 15 |
+
|
| 16 |
+
from linguis_trie import build_sinhala_linguis_trie, build_devanagari_linguis_trie, LinguisTrie
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ---------------------------------------------------------------------------
|
| 20 |
+
# Script-block detection
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
class Script(Enum):
    """Coarse script classes used to route text segments to a tokenizer."""
    LATIN = auto() # ASCII, Latin, digits, punctuation, code, emoji, etc.
    SINHALA = auto()
    DEVANAGARI = auto()
|
| 27 |
+
|
| 28 |
+
# Module-level DFA singletons.
# NOTE(review): nothing in this module references these; CodeSwitchRouter
# builds its own instances. Kept in case other modules import them — confirm
# before removing.
_sinhala_dfa = build_sinhala_linguis_trie()
_devanagari_dfa = build_devanagari_linguis_trie()

# Danda (U+0964) and double danda (U+0965).
_INDIC_PUNCT_CHARS = "\u0964\u0965"


def _get_char_script(ch: str) -> Optional[Script]:
    """Return the Script block for *ch*, or None for non-Indic characters."""
    cp = ord(ch)
    if 0x0D80 <= cp <= 0x0DFF:
        return Script.SINHALA
    if 0x0900 <= cp <= 0x097F:
        return Script.DEVANAGARI
    if ch in _INDIC_PUNCT_CHARS:
        return Script.SINHALA  # Dandas handled identically by both schemas
    return None


def _is_indic_joiner(ch: str) -> bool:
    """True for ZWNJ (U+200C) or ZWJ (U+200D)."""
    return ch in ('\u200C', '\u200D')
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
# Segment dataclass
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
|
| 51 |
+
@dataclass
class TextSegment:
    """One contiguous run of text belonging to a single script."""
    text: str
    script: Script
    has_leading_space: bool = False # True if a boundary space was absorbed
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# Segmenter
|
| 60 |
+
# ---------------------------------------------------------------------------
|
| 61 |
+
|
| 62 |
+
class CodeSwitchSegmenter:
    """Splits mixed-script text into contiguous per-script TextSegments.

    Latin (anything non-Indic) runs are emitted as-is; a single trailing
    space before an Indic run is absorbed and recorded on the Indic
    segment as has_leading_space. Joiners (ZWJ/ZWNJ) and dandas never
    break an Indic run.
    """

    def segment(self, text: str) -> list[TextSegment]:
        """Partition *text* into script-homogeneous segments, in order."""
        if not text:
            return []

        segments: list[TextSegment] = []
        n = len(text)
        pos = 0

        while pos < n:
            if _get_char_script(text[pos]) is None:
                # ─── 1. Accumulate Latin block ───
                start = pos
                while pos < n and _get_char_script(text[pos]) is None:
                    pos += 1

                latin_chunk = text[start:pos]

                # Absorb one boundary space before a following Indic run.
                has_ls = False
                if pos < n and latin_chunk.endswith(" "):
                    latin_chunk = latin_chunk[:-1]
                    has_ls = True

                if latin_chunk:
                    segments.append(TextSegment(text=latin_chunk, script=Script.LATIN))

                if has_ls and pos < n:
                    seg, pos = self._consume_indic(text, pos, has_leading_space=True)
                    segments.append(seg)
            else:
                # ─── 2. Accumulate Indic block (no absorbed space) ───
                seg, pos = self._consume_indic(text, pos, has_leading_space=False)
                segments.append(seg)

        return segments

    def _consume_indic(self, text: str, pos: int, *, has_leading_space: bool) -> tuple[TextSegment, int]:
        # Consume one same-script Indic run starting at pos; joiners and
        # dandas are included without breaking the run. (Refactor: this
        # loop previously appeared verbatim in two branches of segment().)
        n = len(text)
        start = pos
        script = _get_char_script(text[pos]) or Script.SINHALA

        while pos < n:
            c = text[pos]
            c_script = _get_char_script(c)
            if _is_indic_joiner(c):
                pos += 1
            elif c_script is not None:
                if c_script != script and c not in _INDIC_PUNCT_CHARS:
                    break  # script switch mid-run
                pos += 1
            else:
                break  # back to Latin

        segment = TextSegment(
            text=text[start:pos],
            script=script,
            has_leading_space=has_leading_space,
        )
        return segment, pos
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ---------------------------------------------------------------------------
|
| 145 |
+
# Router
|
| 146 |
+
# ---------------------------------------------------------------------------
|
| 147 |
+
|
| 148 |
+
class CodeSwitchRouter:
    """Routes mixed-script text: Latin segments to tiktoken, Indic segments
    to the appropriate script DFA."""

    def __init__(
        self,
        tiktoken_model: str = "o200k_base",
        sinhala_schema: Optional[str] = None,
        devanagari_schema: Optional[str] = None,
    ):
        # NOTE(review): sinhala_schema / devanagari_schema are currently
        # ignored — the default schema factories are always used. Kept for
        # interface compatibility; TODO wire them through the factory.
        self._sinhala_dfa: LinguisTrie = build_sinhala_linguis_trie()
        self._devanagari_dfa: LinguisTrie = build_devanagari_linguis_trie()

        self._enc = tiktoken.get_encoding(tiktoken_model)

        self._segmenter = CodeSwitchSegmenter()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def tokenize_to_strings(self, text: str) -> list[str]:
        """Tokenize *text* into token strings, routed per script segment."""
        result: list[str] = []
        for seg in self._segmenter.segment(text):
            result.extend(self._route_segment_strings(seg))
        return result

    def tokenize_to_ids(self, text: str) -> list[int]:
        """Intentionally unsupported on the raw router.

        Fix: the original had an unreachable `return self._enc.encode(text)`
        statement after this raise; that dead code is removed.
        """
        raise NotImplementedError(
            "Use WWHOMetaEncoder.encode() for unified IDs. "
            "tokenize_to_ids() on the raw router is intentionally not implemented "
            "to prevent accidental ID space collision."
        )

    def tiktoken_decode(self, ids: list[int]) -> str:
        """Decode raw tiktoken IDs back to text."""
        return self._enc.decode(ids)

    def tiktoken_vocab_size(self) -> int:
        """Size of the underlying tiktoken vocabulary."""
        return self._enc.n_vocab

    # ------------------------------------------------------------------
    # Internal routing
    # ------------------------------------------------------------------

    def _route_segment_strings(self, seg: TextSegment) -> list[str]:
        # Latin: per-ID tiktoken piece strings.
        if seg.script == Script.LATIN:
            ids = self._enc.encode(seg.text)
            return [self._enc.decode([i]) for i in ids]

        # Indic — route to the appropriate DFA.
        dfa = (
            self._sinhala_dfa
            if seg.script == Script.SINHALA
            else self._devanagari_dfa
        )
        return dfa.tokenize(seg.text, leading_space=seg.has_leading_space)


# ---------------------------------------------------------------------------
# Self-test
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Smoke test: push a handful of mono- and mixed-script inputs through
    # the router and print the segment/token breakdown for manual review.
    demo_router = CodeSwitchRouter()

    samples = [
        "ශ්රී ලංකාව",  # pure Sinhala
        "Hello, world!",  # pure English
        "The capital is කොළඹ.",  # mixed — English then Sinhala
        "ලංකාව is beautiful.",  # mixed — Sinhala then English
        "Hello नमस्ते world",  # mixed — Devanagari
        "2026 AI සහ machine learning",  # code-switching with numbers
        "GPT-4 ශ්රී ලංකා",  # boundary space edge-case
        "ආචාර්යවරයාගේ වෛද්ය විද්යා පර්යේෂණය සාර්ථකයි.",  # dense Sinhala
        "विद्यालय में पढ़ाई होती है।",  # dense Devanagari
        "AI (Artificial Intelligence) සහ देवनागरी text.",  # multi-script
    ]

    banner = "=" * 70
    print(banner)
    print("CodeSwitchRouter — self-test")
    print(banner)

    splitter = CodeSwitchSegmenter()
    for sample in samples:
        toks = demo_router.tokenize_to_strings(sample)
        spans = splitter.segment(sample)
        print(f"\n Input : {sample!r}")
        print(f" Blocks : {[(b.text, b.script.name, b.has_leading_space) for b in spans]}")
        print(f" Tokens : {toks}")
        print(f" Count : {len(toks)}")
|
tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
vocab.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|