omarkamali committed
Commit cda4232 · verified · 1 Parent(s): 7713c5e

Upload all models and assets for alt (latest)

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +197 -160
  3. models/embeddings/aligned/alt_128d.bin +3 -0
  4. models/embeddings/aligned/alt_128d.meta.json +1 -0
  5. models/embeddings/aligned/alt_128d.projection.npy +3 -0
  6. models/embeddings/aligned/alt_128d_metadata.json +8 -0
  7. models/embeddings/aligned/alt_32d.bin +3 -0
  8. models/embeddings/aligned/alt_32d.meta.json +1 -0
  9. models/embeddings/aligned/alt_32d.projection.npy +3 -0
  10. models/embeddings/aligned/alt_32d_metadata.json +8 -0
  11. models/embeddings/aligned/alt_64d.bin +3 -0
  12. models/embeddings/aligned/alt_64d.meta.json +1 -0
  13. models/embeddings/aligned/alt_64d.projection.npy +3 -0
  14. models/embeddings/aligned/alt_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/alt_128d.bin +2 -2
  16. models/embeddings/monolingual/alt_128d_metadata.json +1 -1
  17. models/embeddings/monolingual/alt_32d.bin +2 -2
  18. models/embeddings/monolingual/alt_32d_metadata.json +1 -1
  19. models/embeddings/monolingual/alt_64d.bin +2 -2
  20. models/embeddings/monolingual/alt_64d_metadata.json +1 -1
  21. models/subword_markov/alt_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/alt_markov_ctx1_subword_metadata.json +1 -1
  23. models/subword_markov/alt_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/alt_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/alt_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/alt_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/alt_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/alt_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/alt_2gram_subword.parquet +2 -2
  30. models/subword_ngram/alt_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/alt_3gram_subword.parquet +2 -2
  32. models/subword_ngram/alt_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/alt_4gram_subword.parquet +2 -2
  34. models/subword_ngram/alt_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/alt_5gram_subword.parquet +3 -0
  36. models/subword_ngram/alt_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/alt_tokenizer_16k.model +2 -2
  38. models/tokenizer/alt_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/alt_tokenizer_8k.model +2 -2
  40. models/tokenizer/alt_tokenizer_8k.vocab +0 -0
  41. models/vocabulary/alt_vocabulary.parquet +2 -2
  42. models/vocabulary/alt_vocabulary_metadata.json +9 -9
  43. models/word_markov/alt_markov_ctx1_word.parquet +2 -2
  44. models/word_markov/alt_markov_ctx1_word_metadata.json +2 -2
  45. models/word_markov/alt_markov_ctx2_word.parquet +2 -2
  46. models/word_markov/alt_markov_ctx2_word_metadata.json +2 -2
  47. models/word_markov/alt_markov_ctx3_word.parquet +2 -2
  48. models/word_markov/alt_markov_ctx3_word_metadata.json +2 -2
  49. models/word_markov/alt_markov_ctx4_word.parquet +2 -2
  50. models/word_markov/alt_markov_ctx4_word_metadata.json +2 -2
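For orientation, the changed files fall into the model families described in the README below (embeddings, word/subword Markov chains, word/subword n-grams, tokenizers, vocabulary). A minimal sketch for listing a local copy of this repository by component — the local path and the use of `huggingface_hub` are assumptions for illustration, not part of this commit:

```python
from collections import defaultdict
from pathlib import Path

# Assumption: the repository has been downloaded locally, e.g. with
# huggingface_hub.snapshot_download("<this-repo-id>"); the path is a placeholder.
repo_root = Path("./alt-wikilangs-models")

groups = defaultdict(list)
for path in sorted(repo_root.glob("models/**/*")):
    if path.is_file():
        # Group by the component directory (embeddings, subword_ngram, tokenizer, ...).
        component = path.relative_to(repo_root / "models").parts[0]
        groups[component].append(path.name)

for component, files in groups.items():
    print(f"{component}: {len(files)} files")
```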
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
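The added line routes the new multilingual t-SNE image through Git LFS, like the other visualization PNGs. A small Python sketch (hypothetical usage, assuming a local checkout) that reads a `.gitattributes` file and reports which entries carry the `filter=lfs` attribute:

```python
from pathlib import Path

def lfs_tracked_patterns(gitattributes: Path) -> list[str]:
    """Return the path patterns that .gitattributes routes through Git LFS."""
    patterns = []
    for line in gitattributes.read_text().splitlines():
        parts = line.split()
        if parts and "filter=lfs" in parts[1:]:
            patterns.append(parts[0])
    return patterns

# Example (assumes a local checkout of this repository):
# print(lfs_tracked_patterns(Path(".gitattributes")))
```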
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  language: alt
3
- language_name: ALT
4
  language_family: turkic_siberian
5
  tags:
6
  - wikilangs
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
 
 
 
 
 
 
 
 
13
  - monolingual
14
  - family-turkic_siberian
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 3.681
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.8352
30
  - name: vocabulary_size
31
  type: vocab
32
  value: 0
33
  generated: 2026-01-03
34
  ---
35
 
36
- # ALT - Wikilangs Models
37
  ## Comprehensive Research Report & Full Ablation Study
38
 
39
- This repository contains NLP models trained and evaluated by Wikilangs, specifically on **ALT** Wikipedia data.
40
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
41
 
42
  ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
60
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
61
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
62
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
63
- - [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
64
  - [7. Summary & Recommendations](#7-summary--recommendations)
65
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
66
  - [Visualizations Index](#visualizations-index)
@@ -80,39 +90,39 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
80
 
81
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
82
  |------------|-------------|---------------|----------|--------------|
83
- | **8k** | 3.483x | 3.48 | 0.3997% | 976,020 |
84
- | **16k** | 3.681x 🏆 | 3.68 | 0.4223% | 923,645 |
85
 
86
  ### Tokenization Examples
87
 
88
  Below are sample sentences tokenized with each vocabulary size:
89
 
90
- **Sample 1:** `Тижимеева Галина ИвановнаКан-Оозы аймактыҥ аймак депутатды. Ӱстӱги Јалаҥый Ба...`
91
 
92
  | Vocab | Tokens | Count |
93
  |-------|--------|-------|
94
- | 8k | `▁ти жи ме ева ▁галина ▁ивановна ▁— ▁кан - оозы ... (+12 more)` | 22 |
95
- | 16k | `▁тижимеева ▁галина ▁ивановна ▁— ▁кан - оозы ▁аймактыҥ ▁аймак ▁депутатды ... (+8 more)` | 18 |
96
 
97
- **Sample 2:** `«Кызалаҥду јылдар» (орус. «Трудные годы») — баштапкы алтай тӱӱкилик роман. Автор...`
98
 
99
  | Vocab | Tokens | Count |
100
  |-------|--------|-------|
101
- | 8k | `▁« кы за ла ҥ ду ▁јылдар » ▁( орус ... (+19 more)` | 29 |
102
- | 16k | `▁« кызалаҥду ▁јылдар » ▁( орус . ▁« трудные ▁годы ... (+14 more)` | 24 |
103
 
104
- **Sample 3:** `Эски Чечкаб (, ) јурт Россияда Татарстан Республиканыҥ Кайбыч аймагында кирет....`
105
 
106
  | Vocab | Tokens | Count |
107
  |-------|--------|-------|
108
- | 8k | `▁эски ▁че ч ка б ▁(, ▁) ▁— ▁јурт ▁россияда ... (+12 more)` | 22 |
109
- | 16k | `▁эски ▁чечкаб ▁(, ▁) ▁— ▁јурт ▁россияда ▁татарстан ▁республиканыҥ ▁кайбыч ... (+7 more)` | 17 |
110
 
111
 
112
  ### Key Findings
113
 
114
- - **Best Compression:** 16k achieves 3.681x compression
115
- - **Lowest UNK Rate:** 8k with 0.3997% unknown tokens
116
  - **Trade-off:** Larger vocabularies improve compression but increase model size
117
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
118
 
@@ -129,12 +139,14 @@ Below are sample sentences tokenized with each vocabulary size:
129
 
130
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
131
  |--------|---------|------------|---------|----------------|------------------|-------------------|
132
- | **2-gram** | Word | 4,436 | 12.12 | 12,008 | 16.5% | 55.5% |
133
- | **2-gram** | Subword | 413 🏆 | 8.69 | 2,712 | 55.2% | 98.2% |
134
- | **3-gram** | Word | 5,478 | 12.42 | 16,272 | 15.6% | 52.1% |
135
- | **3-gram** | Subword | 3,295 | 11.69 | 22,501 | 19.5% | 62.8% |
136
- | **4-gram** | Word | 8,026 | 12.97 | 27,756 | 15.3% | 46.2% |
137
- | **4-gram** | Subword | 14,033 | 13.78 | 96,739 | 10.5% | 35.6% |
 
 
138
 
139
  ### Top 5 N-grams by Size
140
 
@@ -142,10 +154,10 @@ Below are sample sentences tokenized with each vocabulary size:
142
 
143
  | Rank | N-gram | Count |
144
  |------|--------|-------|
145
- | 1 | `республики алтай` | 1,480 |
146
  | 2 | `ј чык` | 1,391 |
147
  | 3 | `горно алтайск` | 1,246 |
148
- | 4 | `алтай республиканыҥ` | 1,222 |
149
  | 5 | `ј бож` | 1,072 |
150
 
151
  **3-grams (Word):**
@@ -156,7 +168,7 @@ Below are sample sentences tokenized with each vocabulary size:
156
  | 2 | `ӱлӱрген айыныҥ 15` | 730 |
157
  | 3 | `алтайск ау ра` | 511 |
158
  | 4 | `горно алтайск ау` | 511 |
159
- | 5 | `јон јаткан јерлери` | 504 |
160
 
161
  **4-grams (Word):**
162
 
@@ -165,45 +177,65 @@ Below are sample sentences tokenized with each vocabulary size:
165
  | 1 | `јылдыҥ ӱлӱрген айыныҥ 15` | 730 |
166
  | 2 | `горно алтайск ау ра` | 511 |
167
  | 3 | `болгон јылдыҥ ӱлӱрген айыныҥ` | 367 |
168
- | 4 | `тоолоорго окылу конвертер датла` | 365 |
169
- | 5 | `окылу конвертер датла тузаланарга` | 365 |
 
 
 
 
 
 
 
 
 
 
170
 
171
  **2-grams (Subword):**
172
 
173
  | Rank | N-gram | Count |
174
  |------|--------|-------|
175
- | 1 | `_ к` | 74,491 |
176
- | 2 | `, _` | 64,716 |
177
- | 3 | `_ ј` | 55,670 |
178
- | 4 | `а _` | 55,340 |
179
- | 5 | `ҥ _` | 54,127 |
180
 
181
  **3-grams (Subword):**
182
 
183
  | Rank | N-gram | Count |
184
  |------|--------|-------|
185
- | 1 | `ы ҥ _` | 34,280 |
186
- | 2 | `д а _` | 17,047 |
187
- | 3 | `_ — _` | 16,876 |
188
- | 4 | `н ы ҥ` | 15,865 |
189
- | 5 | `_ к а` | 15,102 |
190
 
191
  **4-grams (Subword):**
192
 
193
  | Rank | N-gram | Count |
194
  |------|--------|-------|
195
- | 1 | `н ы ҥ _` | 15,267 |
196
- | 2 | `д ы ҥ _` | 13,210 |
197
- | 3 | `_ к ӱ н` | 11,149 |
198
- | 4 | `а л т а` | 9,638 |
199
- | 5 | `_ ј ы л` | 9,359 |
 
 
 
 
 
 
 
 
 
 
200
 
201
 
202
  ### Key Findings
203
 
204
  - **Best Perplexity:** 2-gram (subword) with 413
205
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
206
- - **Coverage:** Top-1000 patterns cover ~36% of corpus
207
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
208
 
209
  ---
@@ -219,14 +251,14 @@ Below are sample sentences tokenized with each vocabulary size:
219
 
220
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
221
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
222
- | **1** | Word | 0.7272 | 1.655 | 4.24 | 64,506 | 27.3% |
223
- | **1** | Subword | 1.6383 | 3.113 | 16.08 | 301 | 0.0% |
224
- | **2** | Word | 0.1675 | 1.123 | 1.34 | 273,261 | 83.2% |
225
- | **2** | Subword | 1.3152 | 2.488 | 8.05 | 4,839 | 0.0% |
226
- | **3** | Word | 0.0551 | 1.039 | 1.10 | 366,294 | 94.5% |
227
- | **3** | Subword | 0.8839 | 1.845 | 4.16 | 38,940 | 11.6% |
228
- | **4** | Word | 0.0265 🏆 | 1.019 | 1.05 | 402,354 | 97.4% |
229
- | **4** | Subword | 0.6047 | 1.521 | 2.55 | 162,075 | 39.5% |
230
 
231
  ### Generated Text Samples (Word-based)
232
 
@@ -234,27 +266,27 @@ Below are text samples generated from each word-based Markov chain model:
234
 
235
  **Context Size 1:**
236
 
237
- 1. `ла эмчиликте фундаментал шиҥжӱлер эдип чотолот чике тоозын айдып салган аш курсактыҥ томский пивоныҥ...`
238
- 2. `ле бийик эмес ортолой кеми 27 ноября года n 107 об образовании муниципальных образований наделении с...`
239
- 3. `алтай республиканыҥ јурт јеезезине статус ла лесопильный ла иш аайынча министр сорокин почвоведение ...`
240
 
241
  **Context Size 2:**
242
 
243
- 1. `республики алтай и верхний иртыш под ред и м краевед ада тӧрӧл учун улу јууныҥ туружаачызы канча`
244
- 2. `ј чык британ черӱниҥ баштапкы јаан чууганга туштаган театрдыҥ сценазында јылда ачылган зимняя вишня ...`
245
- 3. `горно алтайск гагу 267 с ил библиогр с 233 256 isbn текст электронный сууларда азый балыктыҥ кандыйы`
246
 
247
  **Context Size 3:**
248
 
249
- 1. `јылдыҥ ӱлӱрген айыныҥ 15 кӱнинеҥ ала кочкор айдыҥ 18 кӱнинде восход 2 корабльда космонавт а а леонов...`
250
- 2. `ӱлӱрген айыныҥ 15 кӱнинеҥ ала кочкор айдыҥ 3 кӱни григориан кӱнтизӱде јылдыҥ 208 кӱни високосный јыл...`
251
- 3. `алтайск ау ра литературно издательский дом алтын туу јери ле јолдоры јуртта 3 ором казаковтыҥ кыдраш...`
252
 
253
  **Context Size 4:**
254
 
255
- 1. `јылдыҥ ӱлӱрген айыныҥ 15 кӱнинеҥ ала чаган айдыҥ 17 кӱни юлиан кӱнтизӱ аайынча јылдыҥ ӱлӱрген айыныҥ...`
256
- 2. `горно алтайск ау ра литературно издательский дом алтын туу јери ле јолдоры јурттыҥ текши јери 124 4 ...`
257
- 3. `болгон јылдыҥ ӱлӱрген айыныҥ 15 кӱнине јетире болгон јылдыҥ ӱлӱрген айыныҥ 15 кӱнинеҥ ала кандык айд...`
258
 
259
 
260
  ### Generated Text Samples (Subword-based)
@@ -263,34 +295,34 @@ Below are text samples generated from each subword-based Markov chain model:
263
 
264
  **Context Size 1:**
265
 
266
- 1. `_эдыҥ_оваралетик`
267
- 2. `акен._ј._бачӱ_10`
268
- 3. `рн_орнфилтӧрораа`
269
 
270
  **Context Size 2:**
271
 
272
- 1. `_ка_мештай,_эдищн`
273
- 2. `,_ӱйматкальдынде_`
274
- 3. `_јылдыҥ_мет_башен`
275
 
276
  **Context Size 3:**
277
 
278
- 1. `ыҥ_бичинентизӱлери`
279
- 2. `да_эмчилевич_ј.бож`
280
- 3. `_—_грицаныҥ_јаҥыс_`
281
 
282
  **Context Size 4:**
283
 
284
- 1. `ныҥ_15_кӱнде_фоновы`
285
- 2. `дыҥ_эдеги_келтейинд`
286
- 3. `_кӱн_айдыҥ_15_айдыҥ`
287
 
288
 
289
  ### Key Findings
290
 
291
- - **Best Predictability:** Context-4 (word) with 97.4% predictability
292
  - **Branching Factor:** Decreases with context size (more deterministic)
293
- - **Memory Trade-off:** Larger contexts require more storage (162,075 contexts)
294
  - **Recommendation:** Context-3 or Context-4 for text generation
295
 
296
  ---
@@ -306,9 +338,9 @@ Below are text samples generated from each subword-based Markov chain model:
306
 
307
  | Metric | Value |
308
  |--------|-------|
309
- | Vocabulary Size | 26,456 |
310
- | Total Tokens | 567,020 |
311
- | Mean Frequency | 21.43 |
312
  | Median Frequency | 3 |
313
  | Frequency Std Dev | 124.45 |
314
 
@@ -316,14 +348,14 @@ Below are text samples generated from each subword-based Markov chain model:
316
 
317
  | Rank | Word | Frequency |
318
  |------|------|-----------|
319
- | 1 | ла | 6,612 |
320
- | 2 | ле | 4,973 |
321
- | 3 | алтай | 4,656 |
322
- | 4 | деп | 3,921 |
323
- | 5 | с | 3,896 |
324
- | 6 | јылда | 3,763 |
325
- | 7 | айдыҥ | 3,450 |
326
- | 8 | болгон | 3,231 |
327
  | 9 | км | 3,151 |
328
  | 10 | јурт | 3,140 |
329
 
@@ -346,8 +378,8 @@ Below are text samples generated from each subword-based Markov chain model:
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
- | Zipf Coefficient | 1.1623 |
350
- | R² (Goodness of Fit) | 0.985922 |
351
  | Adherence Quality | **excellent** |
352
 
353
  ### Coverage Analysis
@@ -355,15 +387,15 @@ Below are text samples generated from each subword-based Markov chain model:
355
  | Top N Words | Coverage |
356
  |-------------|----------|
357
  | Top 100 | 27.1% |
358
- | Top 1,000 | 65.6% |
359
- | Top 5,000 | 85.8% |
360
- | Top 10,000 | 92.3% |
361
 
362
  ### Key Findings
363
 
364
  - **Zipf Compliance:** R²=0.9859 indicates excellent adherence to Zipf's law
365
  - **High Frequency Dominance:** Top 100 words cover 27.1% of corpus
366
- - **Long Tail:** 16,456 words needed for remaining 7.7% coverage
367
 
368
  ---
369
  ## 5. Word Embeddings Evaluation
@@ -379,37 +411,40 @@ Below are text samples generated from each subword-based Markov chain model:
379
 
380
  ### 5.1 Cross-Lingual Alignment
381
 
382
- > *Note: Multilingual alignment visualization not available for this language.*
 
 
383
 
384
 
385
  ### 5.2 Model Comparison
386
 
387
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
388
  |-------|-----------|----------|------------------|---------------|----------------|
389
- | **mono_32d** | 32 | 0.8352 🏆 | 0.3587 | N/A | N/A |
390
- | **mono_64d** | 64 | 0.7406 | 0.3005 | N/A | N/A |
391
- | **mono_128d** | 128 | 0.3709 | 0.2867 | N/A | N/A |
 
 
 
392
 
393
  ### Key Findings
394
 
395
- - **Best Isotropy:** mono_32d with 0.8352 (more uniform distribution)
396
- - **Semantic Density:** Average pairwise similarity of 0.3153. Lower values indicate better semantic separation.
397
- - **Alignment Quality:** No aligned models evaluated in this run.
398
  - **Recommendation:** 128d aligned for best cross-lingual performance
399
 
400
  ---
401
  ## 6. Morphological Analysis (Experimental)
402
 
403
- > ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
404
-
405
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
406
 
407
  ### 6.1 Productivity & Complexity
408
 
409
  | Metric | Value | Interpretation | Recommendation |
410
  |--------|-------|----------------|----------------|
411
- | Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
412
- | Idiomaticity Gap | **-1.000** | Low formulaic content | - |
413
 
414
  ### 6.2 Affix Inventory (Productive Units)
415
 
@@ -418,20 +453,20 @@ These are the most productive prefixes and suffixes identified by sampling the v
418
  #### Productive Prefixes
419
  | Prefix | Examples |
420
  |--------|----------|
421
- | `-ко` | корнелия, концертные, коруланар |
422
- | `-ка` | каа, каталанской, казанды |
423
 
424
  #### Productive Suffixes
425
  | Suffix | Examples |
426
  |--------|----------|
427
- | `-ыҥ` | пятницаныҥ, јазатырдыҥ, экспедициязыныҥ |
428
- | `-ий` | автобиографический, университетский, кентерберийский |
429
- | `-кий` | автобиографический, университетский, кентерберийский |
430
- | `-ский` | автобиографический, университетский, кентерберийский |
431
- | `-ныҥ` | пятницаныҥ, экспедициязыныҥ, тартканыныҥ |
432
- | `-иҥ` | унсеттиҥ, билимдериниҥ, эштектиҥ |
433
- | `-да` | фонында, лида, украинада |
434
- | `-ый` | сосновый, туберкулезный, маршрутный |
435
 
436
  ### 6.3 Bound Stems (Lexical Roots)
437
 
@@ -439,18 +474,18 @@ Bound stems are high-frequency subword units that are semantically cohesive but
439
 
440
  | Stem | Cohesion | Substitutability | Examples |
441
  |------|----------|------------------|----------|
442
- | `ский` | 2.13x | 43 contexts | южский, айский, омский |
443
- | `ында` | 1.56x | 51 contexts | мында, адында, ойында |
444
- | `ыныҥ` | 1.77x | 30 contexts | зыныҥ, мыныҥ, ажыныҥ |
445
- | `лтай` | 1.93x | 21 contexts | алтай, шылтай, алтайды |
446
- | `лгон` | 2.28x | 12 contexts | болгон, толгон, болгоны |
447
- | `аныҥ` | 1.77x | 23 contexts | кааныҥ, уфаныҥ, оканыҥ |
448
- | `олго` | 1.78x | 22 contexts | јолго, колго, иолго |
449
- | `осси` | 2.07x | 13 contexts | россии, россий, россия |
450
- | `алта` | 1.64x | 26 contexts | алтам, алтан, алтая |
451
- | `лган` | 1.67x | 24 contexts | алган, салган, алганы |
452
- | `рген` | 1.53x | 27 contexts | юрген, мерген, тӱрген |
453
- | `ылда` | 1.69x | 19 contexts | јылда, дылда, тылда |
454
 
455
  ### 6.4 Affix Compatibility (Co-occurrence)
456
 
@@ -458,16 +493,16 @@ This table shows which prefixes and suffixes most frequently co-occur on the sam
458
 
459
  | Prefix | Suffix | Frequency | Examples |
460
  |--------|--------|-----------|----------|
461
- | `-ко` | `-ыҥ` | 26 words | комедияныҥ, командазыныҥ |
462
- | `-ка` | `-ыҥ` | 23 words | каспаныҥ, кардыҥ |
463
- | `-ко` | `-ныҥ` | 16 words | комедияныҥ, командазыныҥ |
464
- | `-ка` | `-ий` | 15 words | калий, кавказский |
465
- | `-ка` | `-ныҥ` | 13 words | каспаныҥ, калаларыныҥ |
466
- | `-ка` | `-да` | 13 words | картазында, кампанияда |
467
- | `-ка` | `-кий` | 12 words | кавказский, каледонский |
468
- | `-ка` | `-ский` | 12 words | кавказский, каледонский |
469
- | `-ка` | `-ар` | 11 words | кайыҥдар, каналдар |
470
- | `-ко` | `-ар` | 11 words | космонавттар, коллекциялар |
471
 
472
  ### 6.5 Recursive Morpheme Segmentation
473
 
@@ -475,26 +510,28 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
475
 
476
  | Word | Suggested Split | Confidence | Stem |
477
  |------|-----------------|------------|------|
478
- | молотовский | **`молот-ов-ский`** | 6.0 | `молот` |
479
- | логиканыҥ | **`логика-ныҥ`** | 4.5 | `логика` |
480
- | кереестериниҥ | **`кереестерин-иҥ`** | 4.5 | `кереестерин` |
481
- | тӱӱкизиниҥ | **`тӱӱкизин-иҥ`** | 4.5 | `тӱӱкизин` |
482
- | швейцарияда | **`швейцария-да`** | 4.5 | `швейцария` |
483
- | съездиниҥ | **`съездин-иҥ`** | 4.5 | `съездин` |
484
- | јӱрӱминиҥ | **`јӱрӱмин-иҥ`** | 4.5 | `јӱрӱмин` |
485
- | политиканыҥ | **`политика-ныҥ`** | 4.5 | `политика` |
486
- | алексеевский | **`алексеев-ский`** | 4.5 | `алексеев` |
487
- | субъектов | **`субъект-ов`** | 4.5 | `субъект` |
488
- | фабриканыҥ | **`фабрика-ныҥ`** | 4.5 | `фабрика` |
489
- | улаганский | **`улаган-ский`** | 4.5 | `улаган` |
490
- | бийигиниҥ | **`бийигин-иҥ`** | 4.5 | `бийигин` |
491
- | черӱлериниҥ | **`черӱлерин-иҥ`** | 4.5 | `черӱлерин` |
492
- | мьянманыҥ | **`мьянма-ныҥ`** | 4.5 | `мьянма` |
493
 
494
  ### 6.6 Linguistic Interpretation
495
 
496
  > **Automated Insight:**
497
- The language ALT appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 
 
498
 
499
  ---
500
  ## 7. Summary & Recommendations
@@ -505,9 +542,9 @@ The language ALT appears to be more isolating or has a highly fixed vocabulary.
505
 
506
  | Component | Recommended | Rationale |
507
  |-----------|-------------|-----------|
508
- | Tokenizer | **16k BPE** | Best compression (3.68x) |
509
  | N-gram | **2-gram** | Lowest perplexity (413) |
510
- | Markov | **Context-4** | Highest predictability (97.4%) |
511
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
512
 
513
 
@@ -721,4 +758,4 @@ MIT License - Free for academic and commercial use.
721
  ---
722
  *Generated by Wikilangs Models Pipeline*
723
 
724
- *Report Date: 2026-01-03 05:04:55*
 
1
  ---
2
  language: alt
3
+ language_name: Southern Altai
4
  language_family: turkic_siberian
5
  tags:
6
  - wikilangs
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-turkic_siberian
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 3.686
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.8419
40
  - name: vocabulary_size
41
  type: vocab
42
  value: 0
43
  generated: 2026-01-03
44
  ---
45
 
46
+ # Southern Altai - Wikilangs Models
47
  ## Comprehensive Research Report & Full Ablation Study
48
 
49
+ This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Southern Altai** Wikipedia data.
50
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
51
 
52
  ## 📋 Repository Contents
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
74
  - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
 
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 3.486x | 3.49 | 0.3992% | 972,913 |
94
+ | **16k** | 3.686x 🏆 | 3.69 | 0.4221% | 920,240 |
95
 
96
  ### Tokenization Examples
97
 
98
  Below are sample sentences tokenized with each vocabulary size:
99
 
100
+ **Sample 1:** `Оҥныут кошуун ()ӧвӧр моҥолдыҥ кошуун. Этимологиязы Оҥныут (калка моҥолдоп о...`
101
 
102
  | Vocab | Tokens | Count |
103
  |-------|--------|-------|
104
+ | 8k | `▁оҥныут ▁кошуун ▁() ▁— ▁ӧвӧр ▁моҥолдыҥ ▁кошуун . ▁этимологиязы ▁оҥныут ... (+27 more)` | 37 |
105
+ | 16k | `▁оҥныут ▁кошуун ▁() ▁— ▁ӧвӧр ▁моҥолдыҥ ▁кошуун . ▁этимологиязы ▁оҥныут ... (+25 more)` | 35 |
106
 
107
+ **Sample 2:** `Эски Чечкаб (, ) — јурт Россияда Татарстан Республиканыҥ Кайбыч аймагында кирет....`
108
 
109
  | Vocab | Tokens | Count |
110
  |-------|--------|-------|
111
+ | 8k | `▁эски ▁че ч ка б ▁(, ▁) ▁— ▁јурт ▁россияда ... (+12 more)` | 22 |
112
+ | 16k | `▁эски ▁чечкаб ▁(, ▁) ▁— ▁јурт ▁россияда ▁татарстан ▁республиканыҥ ▁кайбыч ... (+7 more)` | 17 |
113
 
114
+ **Sample 3:** `Танк - темирле јабылган тебингиштерлӱ јуучыл машина.`
115
 
116
  | Vocab | Tokens | Count |
117
  |-------|--------|-------|
118
+ | 8k | `▁танк ▁- ▁темир ле ▁ја б ылган ▁тебин ги ш ... (+6 more)` | 16 |
119
+ | 16k | `▁танк ▁- ▁темирле ▁јабылган ▁тебингиштерлӱ ▁јуучыл ▁машина .` | 8 |
120
 
121
 
122
  ### Key Findings
123
 
124
+ - **Best Compression:** 16k achieves 3.686x compression
125
+ - **Lowest UNK Rate:** 8k with 0.3992% unknown tokens
126
  - **Trade-off:** Larger vocabularies improve compression but increase model size
127
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
128
 
 
139
 
140
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
141
  |--------|---------|------------|---------|----------------|------------------|-------------------|
142
+ | **2-gram** | Word | 4,423 | 12.11 | 11,976 | 16.5% | 55.6% |
143
+ | **2-gram** | Subword | 413 🏆 | 8.69 | 2,708 | 55.2% | 98.2% |
144
+ | **3-gram** | Word | 5,471 | 12.42 | 16,254 | 15.6% | 52.1% |
145
+ | **3-gram** | Subword | 3,292 | 11.68 | 22,428 | 19.5% | 62.9% |
146
+ | **4-gram** | Word | 8,010 | 12.97 | 27,702 | 15.3% | 46.3% |
147
+ | **4-gram** | Subword | 14,003 | 13.77 | 96,467 | 10.5% | 35.7% |
148
+ | **5-gram** | Word | 7,318 | 12.84 | 24,542 | 16.3% | 46.7% |
149
+ | **5-gram** | Subword | 33,559 | 15.03 | 198,894 | 7.1% | 25.2% |
150
 
151
  ### Top 5 N-grams by Size
152
 
 
154
 
155
  | Rank | N-gram | Count |
156
  |------|--------|-------|
157
+ | 1 | `республики алтай` | 1,479 |
158
  | 2 | `ј чык` | 1,391 |
159
  | 3 | `горно алтайск` | 1,246 |
160
+ | 4 | `алтай республиканыҥ` | 1,220 |
161
  | 5 | `ј бож` | 1,072 |
162
 
163
  **3-grams (Word):**
 
168
  | 2 | `ӱлӱрген айыныҥ 15` | 730 |
169
  | 3 | `алтайск ау ра` | 511 |
170
  | 4 | `горно алтайск ау` | 511 |
171
+ | 5 | `јон јаткан јерлери` | 503 |
172
 
173
  **4-grams (Word):**
174
 
 
177
  | 1 | `јылдыҥ ӱлӱрген айыныҥ 15` | 730 |
178
  | 2 | `горно алтайск ау ра` | 511 |
179
  | 3 | `болгон јылдыҥ ӱлӱрген айыныҥ` | 367 |
180
+ | 4 | `айыныҥ 15 кӱнине јетире` | 365 |
181
+ | 5 | `аайынча јылдыҥ ӱлӱрген айыныҥ` | 365 |
182
+
183
+ **5-grams (Word):**
184
+
185
+ | Rank | N-gram | Count |
186
+ |------|--------|-------|
187
+ | 1 | `юлиан кӱнтизӱ аайынча јылдыҥ ӱлӱрген` | 365 |
188
+ | 2 | `кӱнтизӱ аайынча јылдыҥ ӱлӱрген айыныҥ` | 365 |
189
+ | 3 | `кӱнине јетире болгон јылдыҥ ӱлӱрген` | 365 |
190
+ | 4 | `юлиан кӱнтизӱни 13 кӱнге озолоп` | 365 |
191
+ | 5 | `кӱнтизӱ юлиан кӱнтизӱни 13 кӱнге` | 365 |
192
 
193
  **2-grams (Subword):**
194
 
195
  | Rank | N-gram | Count |
196
  |------|--------|-------|
197
+ | 1 | `_ к` | 74,208 |
198
+ | 2 | `, _` | 64,571 |
199
+ | 3 | `_ ј` | 55,512 |
200
+ | 4 | `а _` | 55,147 |
201
+ | 5 | `ҥ _` | 53,924 |
202
 
203
  **3-grams (Subword):**
204
 
205
  | Rank | N-gram | Count |
206
  |------|--------|-------|
207
+ | 1 | `ы ҥ _` | 34,158 |
208
+ | 2 | `д а _` | 16,990 |
209
+ | 3 | `_ — _` | 16,847 |
210
+ | 4 | `н ы ҥ` | 15,805 |
211
+ | 5 | `_ к а` | 15,039 |
212
 
213
  **4-grams (Subword):**
214
 
215
  | Rank | N-gram | Count |
216
  |------|--------|-------|
217
+ | 1 | `н ы ҥ _` | 15,207 |
218
+ | 2 | `д ы ҥ _` | 13,173 |
219
+ | 3 | `_ к ӱ н` | 11,135 |
220
+ | 4 | `а л т а` | 9,624 |
221
+ | 5 | `_ ј ы л` | 9,304 |
222
+
223
+ **5-grams (Subword):**
224
+
225
+ | Rank | N-gram | Count |
226
+ |------|--------|-------|
227
+ | 1 | `а л т а й` | 8,736 |
228
+ | 2 | `_ ј ы л д` | 7,756 |
229
+ | 3 | `с к и й _` | 7,663 |
230
+ | 4 | `_ а л т а` | 6,748 |
231
+ | 5 | `й д ы ҥ _` | 5,904 |
232
 
233
 
234
  ### Key Findings
235
 
236
  - **Best Perplexity:** 2-gram (subword) with 413
237
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
238
+ - **Coverage:** Top-1000 patterns cover ~25% of corpus
239
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
240
 
241
  ---
 
251
 
252
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
253
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
254
+ | **1** | Word | 0.7265 | 1.655 | 4.23 | 64,260 | 27.4% |
255
+ | **1** | Subword | 1.6376 | 3.112 | 16.04 | 301 | 0.0% |
256
+ | **2** | Word | 0.1676 | 1.123 | 1.34 | 271,928 | 83.2% |
257
+ | **2** | Subword | 1.3152 | 2.488 | 8.04 | 4,828 | 0.0% |
258
+ | **3** | Word | 0.0551 | 1.039 | 1.10 | 364,496 | 94.5% |
259
+ | **3** | Subword | 0.8837 | 1.845 | 4.16 | 38,825 | 11.6% |
260
+ | **4** | Word | 0.0265 🏆 | 1.019 | 1.05 | 400,428 | 97.3% |
261
+ | **4** | Subword | 0.6047 | 1.521 | 2.55 | 161,528 | 39.5% |
262
 
263
  ### Generated Text Samples (Word-based)
264
 
 
266
 
267
  **Context Size 1:**
268
 
269
+ 1. `ла ӧскӧ кижиниҥ адын масс системы но строеніемъ мерзокъ всё спишет вермахт понёс 90 км јаш`
270
+ 2. `ле јолдоры јуртта 9 кӱнинде москвада в в ломоносова јылда гаагада переплётчик бичиктер берестяная гр...`
271
+ 3. `алтай республика хакасия монголия горно алтайск гагу ныҥ јарымјылдык курстарына аткарылган оныҥ адыл...`
272
 
273
  **Context Size 2:**
274
 
275
+ 1. `республики алтай от 3 марта года n 9 6 о языках народов проживающих на территории республики алтай`
276
+ 2. `ј чык совет ле россий орнитолог јурукчы анималист бу кӱнде божогондор ајарулар 27 айдыҥ 27 кӱни юлиа...`
277
+ 3. `горно алтайск алтайдыҥ бичиктер чыгарар изд возы 1 эл опт диск cd rom на алт яз б`
278
 
279
  **Context Size 3:**
280
 
281
+ 1. `јылдыҥ ӱлӱрген айыныҥ 15 кӱнинеҥ ала тулаан айдыҥ 29 кӱнинде артист россияныҥ театрал ишчилериниҥ би...`
282
+ 2. `ӱлӱрген айыныҥ 15 кӱнинеҥ ала кандык айдыҥ 15 кӱни юлиан кӱнтизӱ аайынча јылдыҥ ӱлӱрген айыныҥ 15 кӱ...`
283
+ 3. `алтайск ау ра литературно издательский дом алтын туу сууда балык кезем астаган да болзо корулу јерле...`
284
 
285
  **Context Size 4:**
286
 
287
+ 1. `јылдыҥ ӱлӱрген айыныҥ 15 кӱнине јетире болгон јылдыҥ ӱлӱрген айыныҥ 15 кӱнине јетире болгон јылдыҥ ӱ...`
288
+ 2. `горно алтайск ау ра литературно издательский дом алтын туу јайдыҥ бойында аркалары койу ла бийик ӧлӧ...`
289
+ 3. `болгон јылдыҥ ӱлӱрген айыныҥ 15 кӱнинеҥ ала кӱӱк айдыҥ 6 кӱни григориан кӱнтизӱде јылдыҥ 360 кӱни ви...`
290
 
291
 
292
  ### Generated Text Samples (Subword-based)
 
295
 
296
  **Context Size 1:**
297
 
298
+ 1. `_гатӱли»)_јектич`
299
+ 2. `аканамикет_јыхих`
300
+ 3. `ртакклан_онла_бь`
301
 
302
  **Context Size 2:**
303
 
304
+ 1. `_кыл,_баснов_кылг`
305
+ 2. `,_29_21,97_малтал`
306
+ 3. `_јуртиреспублик_а`
307
 
308
  **Context Size 3:**
309
 
310
+ 1. `ыҥ_кодондо_инфранс`
311
+ 2. `да_православ_башка`
312
+ 3. `_—_titus_liefs_asb`
313
 
314
  **Context Size 4:**
315
 
316
+ 1. `ныҥ_кандыра_агып_ба`
317
+ 2. `дыҥ_физиканыҥ_ӱӱрел`
318
+ 3. `_кӱнтизӱле_кӱни_гри`
319
 
320
 
321
  ### Key Findings
322
 
323
+ - **Best Predictability:** Context-4 (word) with 97.3% predictability
324
  - **Branching Factor:** Decreases with context size (more deterministic)
325
+ - **Memory Trade-off:** Larger contexts require more storage (161,528 contexts)
326
  - **Recommendation:** Context-3 or Context-4 for text generation
327
 
328
  ---
 
338
 
339
  | Metric | Value |
340
  |--------|-------|
341
+ | Vocabulary Size | 26,328 |
342
+ | Total Tokens | 565,164 |
343
+ | Mean Frequency | 21.47 |
344
  | Median Frequency | 3 |
345
  | Frequency Std Dev | 124.45 |
346
 
 
348
 
349
  | Rank | Word | Frequency |
350
  |------|------|-----------|
351
+ | 1 | ла | 6,601 |
352
+ | 2 | ле | 4,964 |
353
+ | 3 | алтай | 4,646 |
354
+ | 4 | деп | 3,903 |
355
+ | 5 | с | 3,881 |
356
+ | 6 | јылда | 3,745 |
357
+ | 7 | айдыҥ | 3,441 |
358
+ | 8 | болгон | 3,230 |
359
  | 9 | км | 3,151 |
360
  | 10 | јурт | 3,140 |
361
 
 
378
 
379
  | Metric | Value |
380
  |--------|-------|
381
+ | Zipf Coefficient | 1.1627 |
382
+ | R² (Goodness of Fit) | 0.985919 |
383
  | Adherence Quality | **excellent** |
384
 
385
  ### Coverage Analysis
 
387
  | Top N Words | Coverage |
388
  |-------------|----------|
389
  | Top 100 | 27.1% |
390
+ | Top 1,000 | 65.7% |
391
+ | Top 5,000 | 85.9% |
392
+ | Top 10,000 | 92.4% |
393
 
394
  ### Key Findings
395
 
396
  - **Zipf Compliance:** R²=0.9859 indicates excellent adherence to Zipf's law
397
  - **High Frequency Dominance:** Top 100 words cover 27.1% of corpus
398
+ - **Long Tail:** 16,328 words needed for remaining 7.6% coverage
399
 
400
  ---
401
  ## 5. Word Embeddings Evaluation
 
411
 
412
  ### 5.1 Cross-Lingual Alignment
413
 
414
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
415
+
416
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
417
 
418
 
419
  ### 5.2 Model Comparison
420
 
421
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
422
  |-------|-----------|----------|------------------|---------------|----------------|
423
+ | **mono_32d** | 32 | 0.8419 | 0.3607 | N/A | N/A |
424
+ | **mono_64d** | 64 | 0.7375 | 0.3054 | N/A | N/A |
425
+ | **mono_128d** | 128 | 0.3603 | 0.2810 | N/A | N/A |
426
+ | **aligned_32d** | 32 | 0.8419 🏆 | 0.3554 | 0.0260 | 0.1460 |
427
+ | **aligned_64d** | 64 | 0.7375 | 0.2999 | 0.0660 | 0.2980 |
428
+ | **aligned_128d** | 128 | 0.3603 | 0.2823 | 0.1580 | 0.4340 |
429
 
430
  ### Key Findings
431
 
432
+ - **Best Isotropy:** aligned_32d with 0.8419 (more uniform distribution)
433
+ - **Semantic Density:** Average pairwise similarity of 0.3141. Lower values indicate better semantic separation.
434
+ - **Alignment Quality:** Aligned models achieve up to 15.8% R@1 in cross-lingual retrieval.
435
  - **Recommendation:** 128d aligned for best cross-lingual performance
436
 
437
  ---
438
  ## 6. Morphological Analysis (Experimental)
439
 
 
 
440
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
441
 
442
  ### 6.1 Productivity & Complexity
443
 
444
  | Metric | Value | Interpretation | Recommendation |
445
  |--------|-------|----------------|----------------|
446
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
447
+ | Idiomaticity Gap | **0.854** | High formulaic/idiomatic content | - |
448
 
449
  ### 6.2 Affix Inventory (Productive Units)
450
 
 
453
  #### Productive Prefixes
454
  | Prefix | Examples |
455
  |--------|----------|
456
+ | `-ка` | калькутта, калба, кацукава |
457
+ | `-ко` | контр, козерёкова, кожондоп |
458
 
459
  #### Productive Suffixes
460
  | Suffix | Examples |
461
  |--------|----------|
462
+ | `-ыҥ` | филармонияныҥ, транспорттыҥ, британияныҥ |
463
+ | `-ий` | белорусский, макарьевский, исетский |
464
+ | `-кий` | белорусский, макарьевский, исетский |
465
+ | `-ский` | белорусский, макарьевский, исетский |
466
+ | `-ныҥ` | филармонияныҥ, британияныҥ, наралканыҥ |
467
+ | `-иҥ` | јеезезиниҥ, изӱзиниҥ, ӱренчиктердиҥ |
468
+ | `-да` | ордында, совхозында, садуда |
469
+ | `-ый` | государственный, музейный, тёплый |
470
 
471
  ### 6.3 Bound Stems (Lexical Roots)
472
 
 
474
 
475
  | Stem | Cohesion | Substitutability | Examples |
476
  |------|----------|------------------|----------|
477
+ | `ский` | 2.17x | 43 contexts | омский, окский, юрский |
478
+ | `ында` | 1.53x | 51 contexts | мында, айында, сындар |
479
+ | `ыныҥ` | 1.68x | 30 contexts | мыныҥ, зыныҥ, угыныҥ |
480
+ | `лтай` | 1.85x | 21 contexts | алтай, шылтай, алтайды |
481
+ | `лгон` | 2.21x | 12 contexts | толгон, болгон, болгонм |
482
+ | `лган` | 1.70x | 23 contexts | алган, калган, салган |
483
+ | `осси` | 2.03x | 13 contexts | россия, россию, россии |
484
+ | `аныҥ` | 1.67x | 23 contexts | оканыҥ, сшаныҥ, эраныҥ |
485
+ | `олго` | 1.66x | 22 contexts | колго, волго, голго |
486
+ | `алта` | 1.49x | 26 contexts | алтай, алтан, алтам |
487
+ | `јылд` | 1.77x | 15 contexts | јылда, јылды, јылдын |
488
+ | `ылда` | 1.63x | 19 contexts | тылда, дылда, јылда |
489
 
490
  ### 6.4 Affix Compatibility (Co-occurrence)
491
 
 
493
 
494
  | Prefix | Suffix | Frequency | Examples |
495
  |--------|--------|-----------|----------|
496
+ | `-ка` | `-ыҥ` | 21 words | казакстанныҥ, кайырлыктыҥ |
497
+ | `-ко` | `-ыҥ` | 20 words | конституцияныҥ, конкурстардыҥ |
498
+ | `-ка` | `-ий` | 14 words | кадетский, карский |
499
+ | `-ко` | `-ый` | 13 words | консалтинговый, командный |
500
+ | `-ка` | `-ныҥ` | 11 words | казакстанныҥ, канаданыҥ |
501
+ | `-ко` | `-ныҥ` | 11 words | конституцияныҥ, колхозыныҥ |
502
+ | `-ко` | `-ий` | 10 words | комментарий, ковалевский |
503
+ | `-ка` | `-кий` | 10 words | кадетский, карский |
504
+ | `-ка` | `-ский` | 10 words | кадетский, карский |
505
+ | `-ко` | `-да` | 9 words | косметологияда, коруда |
506
 
507
  ### 6.5 Recursive Morpheme Segmentation
508
 
 
510
 
511
  | Word | Suggested Split | Confidence | Stem |
512
  |------|-----------------|------------|------|
513
+ | планеталарында | **`планеталарын-да`** | 4.5 | `планеталарын` |
514
+ | актуруныҥ | **`актуру-ныҥ`** | 4.5 | `актуру` |
515
+ | покровский | **`покров-ский`** | 4.5 | `покров` |
516
+ | искусствоныҥ | **`искусство-ныҥ`** | 4.5 | `искусство` |
517
+ | думазыныҥ | **`думазы-ныҥ`** | 4.5 | `думазы` |
518
+ | медицинада | **`медицина-да`** | 4.5 | `медицина` |
519
+ | балдарыныҥ | **`балдары-ныҥ`** | 4.5 | `балдары` |
520
+ | португалияда | **`португалия-да`** | 4.5 | `португалия` |
521
+ | программада | **`программа-да`** | 4.5 | `программа` |
522
+ | аймагыныҥ | **`аймагы-ныҥ`** | 4.5 | `аймагы` |
523
+ | академияда | **`академия-да`** | 4.5 | `академия` |
524
+ | авиацияныҥ | **`авиация-ныҥ`** | 4.5 | `авиация` |
525
+ | шотландский | **`шотланд-ский`** | 4.5 | `шотланд` |
526
+ | киргизияныҥ | **`киргизия-ныҥ`** | 4.5 | `киргизия` |
527
+ | регрессияныҥ | **`регрессия-ныҥ`** | 4.5 | `регрессия` |
528
 
529
  ### 6.6 Linguistic Interpretation
530
 
531
  > **Automated Insight:**
532
+ The language Southern Altai shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
533
+
534
+ > **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
535
 
536
  ---
537
  ## 7. Summary & Recommendations
 
542
 
543
  | Component | Recommended | Rationale |
544
  |-----------|-------------|-----------|
545
+ | Tokenizer | **16k BPE** | Best compression (3.69x) |
546
  | N-gram | **2-gram** | Lowest perplexity (413) |
547
+ | Markov | **Context-4** | Highest predictability (97.3%) |
548
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
549
 
550
 
 
758
  ---
759
  *Generated by Wikilangs Models Pipeline*
760
 
761
+ *Report Date: 2026-01-03 16:17:03*
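One sanity check that can be read directly off the updated n-gram table above: the reported perplexities are 2 raised to the reported entropies (2^8.69 ≈ 413 for the 2-gram subword model, 2^12.11 ≈ 4,423 for the 2-gram word model). A short sketch of that relationship; small differences come from the rounded entropies printed in the table:

```python
def perplexity_from_entropy(entropy_bits: float) -> float:
    """Perplexity is 2**H when the entropy H is measured in bits."""
    return 2 ** entropy_bits

# Values taken from the n-gram evaluation table in the README above.
for label, entropy in [("2-gram subword", 8.69), ("2-gram word", 12.11)]:
    print(label, round(perplexity_from_entropy(entropy)))
```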
models/embeddings/aligned/alt_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b49db53759f31efa64c4f2d7d7be9b6c4a0dd66f132f278507518e1d880168
3
+ size 1036324583
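The `.bin` file itself is stored through Git LFS, so the entry above is a three-line pointer (spec version, sha256 oid, size in bytes) rather than the embedding weights. A minimal sketch for reading such a pointer from a checkout made without git-lfs:

```python
from pathlib import Path

def read_lfs_pointer(path: Path) -> dict:
    """Parse a Git LFS pointer file into {'version': ..., 'oid': ..., 'size': ...}."""
    fields = {}
    for line in path.read_text().splitlines():
        if not line.strip():
            continue
        key, _, value = line.partition(" ")
        fields[key] = value
    fields["size"] = int(fields["size"])
    return fields

# Example:
# read_lfs_pointer(Path("models/embeddings/aligned/alt_128d.bin"))
# -> {'version': 'https://git-lfs.github.com/spec/v1', 'oid': 'sha256:69b4...', 'size': 1036324583}
```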
models/embeddings/aligned/alt_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "alt", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/alt_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53c0661fbc07341a94b6af5794cec2dd258834958ca585700e24709c25070767
3
+ size 65664
models/embeddings/aligned/alt_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "alt",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 1005,
7
+ "vocab_size": 11761
8
+ }
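Each aligned embedding variant ships a small metadata JSON like the one above (language, dimension, version, hub language, seed and full vocabulary sizes). A sketch that collects them into one overview, assuming a local checkout with the same layout:

```python
import json
from pathlib import Path

overview = []
for meta_path in sorted(Path("models/embeddings/aligned").glob("*_metadata.json")):
    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)
    overview.append((meta["dimension"], meta["hub_language"], meta["vocab_size"]))

for dim, hub, vocab in sorted(overview):
    print(f"{dim}d aligned to '{hub}', vocab size {vocab}")
```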
models/embeddings/aligned/alt_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a6abdeea8e67944fcc5ee1694036589855e874e24a7c77bd4c4209a5d5d8d26
3
+ size 259292135
models/embeddings/aligned/alt_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "alt", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/alt_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b66e7b3da22cdea9d53726b7023049952f4477dae9106655d0f21b6b044e514d
3
+ size 4224
models/embeddings/aligned/alt_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "alt",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 1005,
7
+ "vocab_size": 11761
8
+ }
models/embeddings/aligned/alt_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d37f24d69c323fd9bf8a50980438c9258d8e34ccead6aae9b90a789e9b492c6
3
+ size 518302951
models/embeddings/aligned/alt_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "alt", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/alt_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b70ae5264aab6f216db3e0671fafeebbc01cc568ae1669ac762b6a0bfe0842b
3
+ size 16512
models/embeddings/aligned/alt_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "alt",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 1005,
7
+ "vocab_size": 11761
8
+ }
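The aligned variants also add a `.projection.npy` per dimension. The file sizes (4,224 / 16,512 / 65,664 bytes) match d×d float32 matrices plus the numpy header, which suggests they hold the linear map into the shared hub (`en`) space. The sketch below applies such a projection; treating it as a right-multiplied d×d map is an assumption, not something documented in this commit:

```python
import numpy as np

# Assumption: the projection is a (d, d) matrix mapping monolingual vectors
# into the shared cross-lingual space; the orientation (v @ W vs W @ v) is a guess.
W = np.load("models/embeddings/aligned/alt_128d.projection.npy")
print(W.shape)  # expected (128, 128) based on the 65,664-byte file size

v = np.random.randn(128).astype(np.float32)  # stand-in for a monolingual word vector
aligned_v = v @ W
```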
models/embeddings/monolingual/alt_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e396190247b1c989d377e3a31a5ca94405fd3ee9794d9a1f7bafcef3e5cf2c32
3
- size 1036365432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69b49db53759f31efa64c4f2d7d7be9b6c4a0dd66f132f278507518e1d880168
3
+ size 1036324583
models/embeddings/monolingual/alt_128d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 128
13
  },
14
- "vocab_size": 11800
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 128
13
  },
14
+ "vocab_size": 11761
15
  }
models/embeddings/monolingual/alt_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61f263d2302c0b79944fb6dea7a5410f34344972105b8624236c585557cd9b72
3
- size 259303032
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a6abdeea8e67944fcc5ee1694036589855e874e24a7c77bd4c4209a5d5d8d26
3
+ size 259292135
models/embeddings/monolingual/alt_32d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 32
13
  },
14
- "vocab_size": 11800
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 32
13
  },
14
+ "vocab_size": 11761
15
  }
models/embeddings/monolingual/alt_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0af70072fe6e458bf918c9d11f1a56126a09d6b5cade10dc1cf79494ec3cad2b
3
- size 518323832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d37f24d69c323fd9bf8a50980438c9258d8e34ccead6aae9b90a789e9b492c6
3
+ size 518302951
models/embeddings/monolingual/alt_64d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 64
13
  },
14
- "vocab_size": 11800
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 64
13
  },
14
+ "vocab_size": 11761
15
  }
models/subword_markov/alt_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9835d764d81a373a43d4c69af7b885b80b3e7e6708cce0d6899e9b5ea4672187
3
- size 43649
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5de3a411b37f584ca33da338fbcff27d94573428168a1e648d2b742b2fc8dcdf
3
+ size 43597
models/subword_markov/alt_markov_ctx1_subword_metadata.json CHANGED
@@ -3,5 +3,5 @@
3
  "variant": "subword",
4
  "language": "alt",
5
  "unique_contexts": 301,
6
- "total_transitions": 4392884
7
  }
 
3
  "variant": "subword",
4
  "language": "alt",
5
  "unique_contexts": 301,
6
+ "total_transitions": 4378023
7
  }
models/subword_markov/alt_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d66028d1955552f26a81718406af2dce47d0f3dedd00f8bb0b84c80b869c131
3
- size 310925
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f697eae7fb6cf10b61177dc18c8a55cbae2e40ad0d4afd4a8da85f56afbd2237
3
+ size 309826
models/subword_markov/alt_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "alt",
5
- "unique_contexts": 4839,
6
- "total_transitions": 4391785
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "alt",
5
+ "unique_contexts": 4828,
6
+ "total_transitions": 4376923
7
  }
models/subword_markov/alt_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93204d2e51859eacc761cb0757c0e6456d7c7bc8f68ee9401b438c2b0f12f236
3
- size 1232693
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d155f8f5450de809f8f092c6dbb0c3edf33a9f8fceca3e23b01a9ec33458a89
3
+ size 1242900
models/subword_markov/alt_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "alt",
5
- "unique_contexts": 38940,
6
- "total_transitions": 4390686
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "alt",
5
+ "unique_contexts": 38825,
6
+ "total_transitions": 4375823
7
  }
models/subword_markov/alt_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24dc5e341215c2a39c5ae7484dd8bd985f42205d269ff7f3a91b8cc25d862939
3
- size 3689341
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d059c0f9ee39262493fd9a6d03eae15c65ad6aae24aafe491837dba6ca124d1d
3
+ size 3667084
models/subword_markov/alt_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "alt",
5
- "unique_contexts": 162075,
6
- "total_transitions": 4389587
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "alt",
5
+ "unique_contexts": 161528,
6
+ "total_transitions": 4374723
7
  }
models/subword_ngram/alt_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6cafd90f885b32dd8861ed71808430f5107a59536dc5f4e342a7bdc0fbbba4c
3
- size 38120
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:316d420820493b671016a9b16daf866564bb5d7a7372b5170fd329e3c9c21546
3
+ size 38127
models/subword_ngram/alt_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "alt",
5
- "unique_ngrams": 2712,
6
- "total_ngrams": 4392884
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "alt",
5
+ "unique_ngrams": 2708,
6
+ "total_ngrams": 4378023
7
  }
models/subword_ngram/alt_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34a836528a3396f9306f4ecc25205690e3dd0d56877599f1efb2e2f194507c84
3
- size 295825
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60352b70be6638b28e4a718c309322d12428980bbe91c7bf0970204529910c66
3
+ size 294997
models/subword_ngram/alt_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "alt",
5
- "unique_ngrams": 22501,
6
- "total_ngrams": 4391785
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "alt",
5
+ "unique_ngrams": 22428,
6
+ "total_ngrams": 4376923
7
  }
models/subword_ngram/alt_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:884f3c7557c6823455b4677c54106f38a9691634c5e0cfe29fac18815f11c7a2
3
- size 1241123
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77218c8b8554d3860d25d145b941dd913a161caad61f0365175465c6fe92157e
3
+ size 1238337
models/subword_ngram/alt_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "alt",
5
- "unique_ngrams": 96739,
6
- "total_ngrams": 4390686
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "alt",
5
+ "unique_ngrams": 96467,
6
+ "total_ngrams": 4375823
7
  }
models/subword_ngram/alt_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b8723b876de6f61cb203330f95f792d0e1cdec11ce378450bb745718331c6e1
3
+ size 2724808
models/subword_ngram/alt_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "alt",
5
+ "unique_ngrams": 198894,
6
+ "total_ngrams": 4374723
7
+ }
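The n-gram and Markov tables are plain Parquet files; this commit adds a 5-gram subword table (198,894 unique n-grams) alongside the refreshed 2–4-gram ones. The column layout is not documented in the diff, so the sketch below just loads one table and inspects its schema rather than assuming column names:

```python
import pandas as pd

df = pd.read_parquet("models/subword_ngram/alt_5gram_subword.parquet")
print(df.dtypes)  # discover the actual column names and types
print(df.head())  # peek at a few rows (ordering not guaranteed)
```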
models/tokenizer/alt_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ed995fcbde5668b2f32931d416ecfd444547f4fccf04118ff4bf11e3c248ef4
3
- size 600334
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:192d7e44e0196b96f1303f1a410abfcee217c2ca7c785632537870d57d1b37ba
3
+ size 600913
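The tokenizer updates ship as `.model`/`.vocab` pairs, and the ▁-prefixed pieces in the README examples look like SentencePiece output, so loading them with the `sentencepiece` package is a reasonable but unconfirmed assumption:

```python
import sentencepiece as spm

# Assumption: the .model file is a SentencePiece model (suggested by the .model/.vocab
# pair and the ▁ markers in the README's tokenization examples).
sp = spm.SentencePieceProcessor(model_file="models/tokenizer/alt_tokenizer_16k.model")

# Sample sentence taken from the README's tokenization examples.
pieces = sp.encode("Танк - темирле јабылган тебингиштерлӱ јуучыл машина.", out_type=str)
print(len(pieces), pieces)
```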
models/tokenizer/alt_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/alt_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:452a0aec3e7e4b4e17384e2ff0d3b52a51f9cb273b8e8bbc7addbb7f2e51363f
3
- size 410662
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20270bb636c034ecfc25d110dd94006f754d960d971e1420d5e4f0aaf265e29d
3
+ size 410773
models/tokenizer/alt_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/alt_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c316fcfa2120415d93073b97d54878932bd6da42c81696b2da0093488988631
3
- size 512673
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:405199c1e4ec8eb327312b45b88ddeac4e778fefcefab6ffb844bb2ee7d0952c
3
+ size 508112
models/vocabulary/alt_vocabulary_metadata.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
  "language": "alt",
3
- "vocabulary_size": 26456,
4
  "variant": "full",
5
  "statistics": {
6
- "type_token_ratio": 0.10662391749851259,
7
  "coverage": {
8
- "top_100": 0.2540457460170556,
9
- "top_1000": 0.6146790507040392,
10
- "top_5000": 0.8042837310768824,
11
- "top_10000": 0.8649781847028493
12
  },
13
- "hapax_count": 38060,
14
- "hapax_ratio": 0.5899311798623598,
15
- "total_documents": 1099
16
  }
17
  }
 
1
  {
2
  "language": "alt",
3
+ "vocabulary_size": 26328,
4
  "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.10656501510513906,
7
  "coverage": {
8
+ "top_100": 0.2543251103454451,
9
+ "top_1000": 0.6152218681293172,
10
+ "top_5000": 0.8047888762506094,
11
+ "top_10000": 0.8654481965027706
12
  },
13
+ "hapax_count": 37942,
14
+ "hapax_ratio": 0.5903531974482651,
15
+ "total_documents": 1100
16
  }
17
  }
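The vocabulary metadata above feeds the Zipf analysis in the README (coefficient 1.1627, R² 0.98592). A generic sketch of how such a fit is computed from a descending frequency list; reading frequencies out of `alt_vocabulary.parquet` would require knowing its column names, so the example reuses the top-10 counts from the README instead:

```python
import numpy as np

def zipf_fit(freqs):
    """Fit log(frequency) = a - s * log(rank); return the slope s and R^2."""
    freqs = np.asarray(sorted(freqs, reverse=True), dtype=float)
    ranks = np.arange(1, len(freqs) + 1)
    x, y = np.log(ranks), np.log(freqs)
    slope, intercept = np.polyfit(x, y, 1)
    y_hat = slope * x + intercept
    r2 = 1 - np.sum((y - y_hat) ** 2) / np.sum((y - y.mean()) ** 2)
    return -slope, r2

# Top-10 word frequencies from the README; the real run uses all 26,328 words.
print(zipf_fit([6601, 4964, 4646, 3903, 3881, 3745, 3441, 3230, 3151, 3140]))
```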
models/word_markov/alt_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ed78756b17891f1d853be9b080f0dbe62a2e12d8ac311c5761369520b78a512
3
- size 3264406
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:533e2e91945887ef0d84238b923f84b042558be4dc1ccffd52df6dcd02b274df
3
+ size 3226693
models/word_markov/alt_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "alt",
5
- "unique_contexts": 64506,
6
- "total_transitions": 603981
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "alt",
5
+ "unique_contexts": 64260,
6
+ "total_transitions": 602006
7
  }
models/word_markov/alt_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99cc5861c45675e04db4b52347a14f4dad4cced5b1fde724f96eb05b82e2b557
3
- size 8258854
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9f3092a02b11bc73db586c569a387f0b67b7090a254317b683cc84422219d77
3
+ size 8206564
models/word_markov/alt_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "alt",
5
- "unique_contexts": 273261,
6
- "total_transitions": 602882
7
  }
 
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "alt",
5
+ "unique_contexts": 271928,
6
+ "total_transitions": 600906
7
  }
models/word_markov/alt_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:761a9c53530fa3d9cba8ad3b211af79457306c88812173b71d35bdd3d1faedac
3
- size 11105253
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe3aaa68b6d9bbab96a5dabe2bb678631fbf37dccf6d6af6381ca3f3057e524f
3
+ size 11039046
models/word_markov/alt_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "alt",
5
- "unique_contexts": 366294,
6
- "total_transitions": 601783
7
  }
 
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "alt",
5
+ "unique_contexts": 364496,
6
+ "total_transitions": 599806
7
  }
models/word_markov/alt_markov_ctx4_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4483468f8c0c180566bb8295e44c88b85cd214ea65a3fca1dfdc4c1fd87d8d95
3
- size 13560943
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2c83cdb7a1ac1c3baa04dc828755b3a73934cce5c5cd328499fd099d3c0e46a
3
+ size 13488455
models/word_markov/alt_markov_ctx4_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "word",
4
  "language": "alt",
5
- "unique_contexts": 402354,
6
- "total_transitions": 600684
7
  }
 
2
  "context_size": 4,
3
  "variant": "word",
4
  "language": "alt",
5
+ "unique_contexts": 400428,
6
+ "total_transitions": 598706
7
  }
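The word-level Markov tables that close out this commit back the generation samples shown in the README. A self-contained sketch of the underlying procedure (fixed-size context window, next word sampled in proportion to its count); it builds counts from a toy corpus instead of the parquet files, whose schema is not shown here:

```python
import random
from collections import defaultdict

def build_chain(tokens, context_size):
    """Map each context tuple to a dict of next-token counts."""
    chain = defaultdict(lambda: defaultdict(int))
    for i in range(len(tokens) - context_size):
        context = tuple(tokens[i : i + context_size])
        chain[context][tokens[i + context_size]] += 1
    return chain

def generate(chain, seed, length=20):
    context, out = tuple(seed), list(seed)
    for _ in range(length):
        nexts = chain.get(context)
        if not nexts:
            break
        words, counts = zip(*nexts.items())
        nxt = random.choices(words, weights=counts)[0]
        out.append(nxt)
        context = context[1:] + (nxt,)
    return " ".join(out)

# Toy corpus made of phrases from the README samples; the released models
# were built from the full Southern Altai Wikipedia text.
corpus = "алтай республиканыҥ јурт јеезези алтай республиканыҥ аймагы".split()
chain = build_chain(corpus, context_size=2)
print(generate(chain, seed=corpus[:2]))
```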