omarkamali committed on
Commit
8f235b3
·
verified ·
1 Parent(s): c5df758

Upload all models and assets for ann (latest)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. README.md +190 -153
  3. models/embeddings/aligned/ann_128d.bin +3 -0
  4. models/embeddings/aligned/ann_128d.meta.json +1 -0
  5. models/embeddings/aligned/ann_128d.projection.npy +3 -0
  6. models/embeddings/aligned/ann_128d_metadata.json +8 -0
  7. models/embeddings/aligned/ann_32d.bin +3 -0
  8. models/embeddings/aligned/ann_32d.meta.json +1 -0
  9. models/embeddings/aligned/ann_32d.projection.npy +3 -0
  10. models/embeddings/aligned/ann_32d_metadata.json +8 -0
  11. models/embeddings/aligned/ann_64d.bin +3 -0
  12. models/embeddings/aligned/ann_64d.meta.json +1 -0
  13. models/embeddings/aligned/ann_64d.projection.npy +3 -0
  14. models/embeddings/aligned/ann_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/ann_128d.bin +2 -2
  16. models/embeddings/monolingual/ann_128d_metadata.json +1 -1
  17. models/embeddings/monolingual/ann_32d.bin +2 -2
  18. models/embeddings/monolingual/ann_32d_metadata.json +1 -1
  19. models/embeddings/monolingual/ann_64d.bin +2 -2
  20. models/embeddings/monolingual/ann_64d_metadata.json +1 -1
  21. models/subword_markov/ann_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/ann_markov_ctx1_subword_metadata.json +1 -1
  23. models/subword_markov/ann_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/ann_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/ann_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/ann_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/ann_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/ann_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/ann_2gram_subword.parquet +2 -2
  30. models/subword_ngram/ann_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/ann_3gram_subword.parquet +2 -2
  32. models/subword_ngram/ann_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/ann_4gram_subword.parquet +2 -2
  34. models/subword_ngram/ann_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/ann_5gram_subword.parquet +3 -0
  36. models/subword_ngram/ann_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/ann_tokenizer_16k.model +2 -2
  38. models/tokenizer/ann_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/ann_tokenizer_8k.model +2 -2
  40. models/tokenizer/ann_tokenizer_8k.vocab +0 -0
  41. models/vocabulary/ann_vocabulary.parquet +2 -2
  42. models/vocabulary/ann_vocabulary_metadata.json +8 -8
  43. models/word_markov/ann_markov_ctx1_word.parquet +2 -2
  44. models/word_markov/ann_markov_ctx1_word_metadata.json +2 -2
  45. models/word_markov/ann_markov_ctx2_word.parquet +2 -2
  46. models/word_markov/ann_markov_ctx2_word_metadata.json +2 -2
  47. models/word_markov/ann_markov_ctx3_word.parquet +2 -2
  48. models/word_markov/ann_markov_ctx3_word_metadata.json +2 -2
  49. models/word_markov/ann_markov_ctx4_word.parquet +2 -2
  50. models/word_markov/ann_markov_ctx4_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,5 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
43
+ visualizations/ngram_coverage.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  language: ann
3
- language_name: ANN
4
  language_family: atlantic_other
5
  tags:
6
  - wikilangs
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
 
 
 
 
 
 
 
 
13
  - monolingual
14
  - family-atlantic_other
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 4.351
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.1947
30
  - name: vocabulary_size
31
  type: vocab
32
  value: 0
33
  generated: 2026-01-03
34
  ---
35
 
36
- # ANN - Wikilangs Models
37
  ## Comprehensive Research Report & Full Ablation Study
38
 
39
- This repository contains NLP models trained and evaluated by Wikilangs, specifically on **ANN** Wikipedia data.
40
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
41
 
42
  ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
60
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
61
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
62
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
63
- - [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
64
  - [7. Summary & Recommendations](#7-summary--recommendations)
65
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
66
  - [Visualizations Index](#visualizations-index)
@@ -80,39 +90,39 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
80
 
81
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
82
  |------------|-------------|---------------|----------|--------------|
83
- | **8k** | 4.112x | 4.12 | 0.1464% | 133,892 |
84
- | **16k** | 4.351x 🏆 | 4.36 | 0.1549% | 126,546 |
85
 
86
  ### Tokenization Examples
87
 
88
  Below are sample sentences tokenized with each vocabulary size:
89
 
90
- **Sample 1:** `Luwis òso 14 ìre ogwu ubọọn̄ me Furans bene me acha abayaage ire usen mkpa kan̄ ...`
91
 
92
  | Vocab | Tokens | Count |
93
  |-------|--------|-------|
94
- | 8k | `▁lu wis ▁òso 1 4 ▁ìreogwu ▁ubọọn̄me ... (+23 more)` | 33 |
95
- | 16k | `▁luwis ▁òso1 4 ▁ìreogwu ▁ubọọn̄ ▁mefurans ... (+21 more)` | 31 |
96
 
97
- **Sample 2:** `Mọlita ìre ido me Yurop. thumb|Egop Ido Mọlita thumb|Iman̄-ido Mọlita thumb|Okwa...`
98
 
99
  | Vocab | Tokens | Count |
100
  |-------|--------|-------|
101
- | 8k | `▁mọlita ▁ìre ▁idomeyurop . ▁thumb | egop ido ... (+19 more)` | 29 |
102
- | 16k | `▁mọlita ▁ìre ▁idomeyurop . ▁thumb | egop ido ... (+19 more)` | 29 |
103
 
104
- **Sample 3:** `Saint Marino ìre ido me Yurop. thumb|Egop Ido Saint Marino thumb|Iman̄-ido Saint...`
105
 
106
  | Vocab | Tokens | Count |
107
  |-------|--------|-------|
108
- | 8k | `▁saintmarino ▁ìreido ▁me ▁yurop . thumb | egop ... (+17 more)` | 27 |
109
- | 16k | `▁saintmarino ▁ìreido ▁me ▁yurop .thumb | egop ... (+17 more)` | 27 |
110
 
111
 
112
  ### Key Findings
113
 
114
- - **Best Compression:** 16k achieves 4.351x compression
115
- - **Lowest UNK Rate:** 8k with 0.1464% unknown tokens
116
  - **Trade-off:** Larger vocabularies improve compression but increase model size
117
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
118
 
@@ -129,12 +139,14 @@ Below are sample sentences tokenized with each vocabulary size:
129
 
130
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
131
  |--------|---------|------------|---------|----------------|------------------|-------------------|
132
- | **2-gram** | Word | 1,111 | 10.12 | 2,498 | 36.3% | 78.1% |
133
- | **2-gram** | Subword | 241 🏆 | 7.91 | 1,230 | 67.8% | 99.7% |
134
- | **3-gram** | Word | 1,927 | 10.91 | 3,289 | 25.2% | 65.4% |
135
- | **3-gram** | Subword | 1,414 | 10.47 | 7,165 | 32.1% | 80.6% |
136
- | **4-gram** | Word | 3,376 | 11.72 | 4,802 | 16.9% | 48.5% |
137
- | **4-gram** | Subword | 4,883 | 12.25 | 24,184 | 20.0% | 55.6% |
 
 
138
 
139
  ### Top 5 N-grams by Size
140
 
@@ -142,21 +154,21 @@ Below are sample sentences tokenized with each vocabulary size:
142
 
143
  | Rank | N-gram | Count |
144
  |------|--------|-------|
145
- | 1 | `me lek` | 1,089 |
146
- | 2 | `me agan̄` | 844 |
147
- | 3 | `me emen` | 801 |
148
  | 4 | `ido ya` | 458 |
149
- | 5 | `ichit me` | 381 |
150
 
151
  **3-grams (Word):**
152
 
153
  | Rank | N-gram | Count |
154
  |------|--------|-------|
155
- | 1 | `agan̄ ichep ura` | 217 |
156
  | 2 | `me ido ya` | 190 |
157
- | 3 | `me agan̄ osiki` | 183 |
158
- | 4 | `agan̄ mbum ura` | 176 |
159
- | 5 | `me agan̄ inyọn̄` | 172 |
160
 
161
  **4-grams (Word):**
162
 
@@ -165,45 +177,65 @@ Below are sample sentences tokenized with each vocabulary size:
165
  | 1 | `me agan̄ mbum ura` | 103 |
166
  | 2 | `me agan̄ ichep ura` | 96 |
167
  | 3 | `me ido ya ìre` | 62 |
168
- | 4 | `agan̄ inyọn̄ mbum ura` | 56 |
169
- | 5 | `ewabe ichit me emen` | 50 |
 
 
 
 
 
 
 
 
 
 
170
 
171
  **2-grams (Subword):**
172
 
173
  | Rank | N-gram | Count |
174
  |------|--------|-------|
175
- | 1 | `e _` | 19,443 |
176
- | 2 | `_ i` | 16,978 |
177
- | 3 | `_ m` | 15,100 |
178
- | 4 | `_ e` | 11,773 |
179
- | 5 | `a _` | 9,778 |
180
 
181
  **3-grams (Subword):**
182
 
183
  | Rank | N-gram | Count |
184
  |------|--------|-------|
185
- | 1 | `_ m e` | 7,822 |
186
- | 2 | `m e _` | 7,755 |
187
- | 3 | `a _` | 4,098 |
188
- | 4 | `r e _` | 4,084 |
189
- | 5 | `e _ i` | 3,290 |
190
 
191
  **4-grams (Subword):**
192
 
193
  | Rank | N-gram | Count |
194
  |------|--------|-------|
195
- | 1 | `_ m e _` | 7,635 |
196
- | 2 | `_ m è _` | 2,895 |
197
- | 3 | `l e k _` | 2,350 |
198
- | 4 | `_ a g a` | 1,914 |
199
- | 5 | `a g a n̄` | 1,906 |
 
 
 
 
 
 
 
 
 
 
200
 
201
 
202
  ### Key Findings
203
 
204
- - **Best Perplexity:** 2-gram (subword) with 241
205
  - **Entropy Trend:** Generally increases with larger n-grams (sparser counts, less predictable)
206
- - **Coverage:** Top-1000 patterns cover ~56% of corpus
207
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
208
 
209
  ---
@@ -219,14 +251,14 @@ Below are sample sentences tokenized with each vocabulary size:
219
 
220
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
221
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
222
- | **1** | Word | 0.7773 | 1.714 | 4.68 | 9,818 | 22.3% |
223
- | **1** | Subword | 1.1325 | 2.192 | 8.75 | 290 | 0.0% |
224
- | **2** | Word | 0.2751 | 1.210 | 1.61 | 45,714 | 72.5% |
225
- | **2** | Subword | 1.0635 | 2.090 | 5.50 | 2,537 | 0.0% |
226
- | **3** | Word | 0.1069 | 1.077 | 1.18 | 73,222 | 89.3% |
227
- | **3** | Subword | 0.7725 | 1.708 | 3.21 | 13,932 | 22.7% |
228
- | **4** | Word | 0.0450 🏆 | 1.032 | 1.07 | 86,066 | 95.5% |
229
- | **4** | Subword | 0.4908 | 1.405 | 2.04 | 44,651 | 50.9% |
230
 
231
  ### Generated Text Samples (Word-based)
232
 
@@ -234,27 +266,27 @@ Below are text samples generated from each word-based Markov chain model:
234
 
235
  **Context Size 1:**
236
 
237
- 1. `me akparalek ijọn̄ ikọkọp uji ijọn̄ ìjeen̄ ibe ekọp esaba èwê alt left thumb iman̄ ido`
238
- 2. `mè owuwa ebi barazilu thumb iman̄ kan̄ belgiọm burazil pọtugalu thumb egop ubọọn̄ me zambia`
239
- 3. `agan̄ mkpulu ubọọn̄ yi ìre siera leyon togo me yurop ìniluk me agan̄ ichepura eyi india`
240
 
241
  **Context Size 2:**
242
 
243
- 1. `me lek ike uti ìkatibi me èwê dubai`
244
- 2. `me agan̄ osiki ire ebi ofifi èwê ere òla ijọn̄ eba ìkup ewuuk ewuuk me mgbọ`
245
- 3. `me emen senturi akọp gweregwen ene ewabe me emen mîwa iraka efie ita thumb egop agan̄`
246
 
247
  **Context Size 3:**
248
 
249
- 1. `agan̄ ichep ura me agan̄ ichep ura ruwanda me agan̄ osiki me ido naijiria achubọk inyinyi òrom òkuku...`
250
- 2. `me ido ya bene me senturi 16 re 19 emen awaji atilantik ore achubọk ebon ere ewe inyam`
251
- 3. `me agan̄ osiki naijiria ama mkpulu ìtatap ikana ọmọ ìre ginì ikwetọ me agan̄ inyọn̄ mbum ura ido`
252
 
253
  **Context Size 4:**
254
 
255
- 1. `me agan̄ mbum ura naija me agan̄ inyọn̄ mbum ura silovakia me agan̄ osiki mbum ura me lek`
256
- 2. `me agan̄ ichep ura eyi amerika agan̄ inyọn̄ thumb ọrọsi thumb ọrọsi môkọt ikaan̄ esese mbet unwen mè...`
257
- 3. `me ido ya ìre eyi ebọkọbe itap me 17 akọp onyan̄ ge otu ifuk ene ìluk me ido`
258
 
259
 
260
  ### Generated Text Samples (Subword-based)
@@ -263,34 +295,34 @@ Below are text samples generated from each subword-based Markov chain model:
263
 
264
  **Context Size 1:**
265
 
266
- 1. `_ma_mè_mè_erirup`
267
- 2. `e_ògan̄_chikilukp`
268
- 3. `ituwupanwebọte_m`
269
 
270
  **Context Size 2:**
271
 
272
- 1. `e_lek_mè_ìkuria_m`
273
- 2. `_ififuuke_si_ichọ`
274
- 3. `_me_òkuk_use_agan̄`
275
 
276
  **Context Size 3:**
277
 
278
- 1. `_me_lek_ebi_kibert`
279
- 2. `me_levan_obolo_pas`
280
- 3. `an̄_echieen̄_ya_orọr`
281
 
282
  **Context Size 4:**
283
 
284
- 1. `_me_lek_ìmọnọ_ire_o`
285
- 2. `_mè_ebi_kè_ama-ile_`
286
- 3. `lek_<raw_mate>_igba`
287
 
288
 
289
  ### Key Findings
290
 
291
  - **Best Predictability:** Context-4 (word) with 95.5% predictability
292
  - **Branching Factor:** Decreases with context size (more deterministic)
293
- - **Memory Trade-off:** Larger contexts require more storage (44,651 contexts)
294
  - **Recommendation:** Context-3 or Context-4 for text generation
295
 
296
  ---
@@ -306,36 +338,36 @@ Below are text samples generated from each subword-based Markov chain model:
306
 
307
  | Metric | Value |
308
  |--------|-------|
309
- | Vocabulary Size | 4,243 |
310
- | Total Tokens | 93,606 |
311
- | Mean Frequency | 22.06 |
312
  | Median Frequency | 4 |
313
- | Frequency Std Dev | 154.88 |
314
 
315
  ### Most Common Words
316
 
317
  | Rank | Word | Frequency |
318
  |------|------|-----------|
319
- | 1 | me | 7,683 |
320
- | 2 | mè | 2,927 |
321
- | 3 | agan̄ | 1,906 |
322
- | 4 | ido | 1,757 |
323
- | 5 | ebi | 1,749 |
324
- | 6 | ìre | 1,621 |
325
- | 7 | lek | 1,606 |
326
- | 8 | eyi | 1,291 |
327
- | 9 | ya | 1,169 |
328
- | 10 | emen | 1,082 |
329
 
330
  ### Least Common Words (from vocabulary)
331
 
332
  | Rank | Word | Frequency |
333
  |------|------|-----------|
334
- | 1 | iyaak | 2 |
335
- | 2 | medvedev | 2 |
336
- | 3 | race | 2 |
337
- | 4 | lenin | 2 |
338
- | 5 | walvis | 2 |
339
  | 6 | ọkọlọba | 2 |
340
  | 7 | ǹkọọn̄ | 2 |
341
  | 8 | edeh | 2 |
@@ -346,24 +378,24 @@ Below are text samples generated from each subword-based Markov chain model:
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
- | Zipf Coefficient | 1.1690 |
350
- | R² (Goodness of Fit) | 0.990906 |
351
  | Adherence Quality | **excellent** |
352
 
353
  ### Coverage Analysis
354
 
355
  | Top N Words | Coverage |
356
  |-------------|----------|
357
- | Top 100 | 59.7% |
358
- | Top 1,000 | 87.8% |
359
  | Top 5,000 | 0.0% |
360
  | Top 10,000 | 0.0% |
361
 
362
  ### Key Findings
363
 
364
- - **Zipf Compliance:** R²=0.9909 indicates excellent adherence to Zipf's law
365
- - **High Frequency Dominance:** Top 100 words cover 59.7% of corpus
366
- - **Long Tail:** -5,757 words needed for remaining 100.0% coverage
367
 
368
  ---
369
  ## 5. Word Embeddings Evaluation
@@ -379,37 +411,40 @@ Below are text samples generated from each subword-based Markov chain model:
379
 
380
  ### 5.1 Cross-Lingual Alignment
381
 
382
- > *Note: Multilingual alignment visualization not available for this language.*
 
 
383
 
384
 
385
  ### 5.2 Model Comparison
386
 
387
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
388
  |-------|-----------|----------|------------------|---------------|----------------|
389
- | **mono_32d** | 32 | 0.1947 🏆 | 0.5302 | N/A | N/A |
390
- | **mono_64d** | 64 | 0.0325 | 0.5569 | N/A | N/A |
391
- | **mono_128d** | 128 | 0.0071 | 0.5825 | N/A | N/A |
 
 
 
392
 
393
  ### Key Findings
394
 
395
- - **Best Isotropy:** mono_32d with 0.1947 (more uniform distribution)
396
- - **Semantic Density:** Average pairwise similarity of 0.5565. Lower values indicate better semantic separation.
397
- - **Alignment Quality:** No aligned models evaluated in this run.
398
  - **Recommendation:** 128d aligned for best cross-lingual performance
399
 
400
  ---
401
  ## 6. Morphological Analysis (Experimental)
402
 
403
- > ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
404
-
405
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
406
 
407
  ### 6.1 Productivity & Complexity
408
 
409
  | Metric | Value | Interpretation | Recommendation |
410
  |--------|-------|----------------|----------------|
411
- | Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
412
- | Idiomaticity Gap | **-1.000** | Low formulaic content | - |
413
 
414
  ### 6.2 Affix Inventory (Productive Units)
415
 
@@ -418,14 +453,14 @@ These are the most productive prefixes and suffixes identified by sampling the v
418
  #### Productive Prefixes
419
  | Prefix | Examples |
420
  |--------|----------|
421
- | `-ek` | ekefuk, ekwut, ekpọkbe |
422
- | `-ik` | ikike, ikwuk, ikisip |
423
 
424
  #### Productive Suffixes
425
  | Suffix | Examples |
426
  |--------|----------|
427
- | `-n̄` | mun̄, òrọriọọn̄, ijejeen̄ |
428
- | `-be` | îgebe, olobobe, eweekbe |
429
 
430
  ### 6.3 Bound Stems (Lexical Roots)
431
 
@@ -433,18 +468,18 @@ Bound stems are high-frequency subword units that are semantically cohesive but
433
 
434
  | Stem | Cohesion | Substitutability | Examples |
435
  |------|----------|------------------|----------|
436
- | `gọọk` | 1.55x | 19 contexts | agọọk, igọọk, îgọọk |
437
- | `tumu` | 1.48x | 21 contexts | ìtumu, òtumu, etumu |
438
- | `kpul` | 1.58x | 16 contexts | ikpulu, ìkpulu, îkpulu |
439
- | `sibi` | 1.52x | 18 contexts | ìsibi, osibi, îsibi |
440
- | `kana` | 1.46x | 20 contexts | ikana, okana, ìkana |
441
- | `kikp` | 1.44x | 19 contexts | ikikpa, ikikpọ, ìkikpa |
442
- | `kisa` | 1.46x | 18 contexts | okisa, îkisa, ekisa |
443
- | `chie` | 1.55x | 14 contexts | chief, echieek, ìchieek |
444
- | `kpọk` | 1.42x | 17 contexts | okpọk, akpọk, ọkpọk |
445
- | `gbaa` | 1.46x | 15 contexts | ogbaan̄, egbaan̄, îgbaan̄ |
446
- | `ikaa` | 1.60x | 11 contexts | ikaan̄, ikikaan̄, ekikaan̄ |
447
- | `riọọ` | 1.54x | 12 contexts | nriọọk, riọọn̄, oriọọn̄ |
448
 
449
  ### 6.4 Affix Compatibility (Co-occurrence)
450
 
@@ -452,9 +487,9 @@ This table shows which prefixes and suffixes most frequently co-occur on the sam
452
 
453
  | Prefix | Suffix | Frequency | Examples |
454
  |--------|--------|-----------|----------|
455
- | `-ek` | `-be` | 15 words | ekpọkbe, ekifukbe |
456
- | `-ik` | `-n̄` | 15 words | ikwaan̄, ikikaan̄ |
457
- | `-ek` | `-n̄` | 10 words | ekimọọn̄, ekekaan̄ |
458
 
459
  ### 6.5 Recursive Morpheme Segmentation
460
 
@@ -465,23 +500,25 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
465
  | ekinyambe | **`ek-inyam-be`** | 6.0 | `inyam` |
466
  | ekitumube | **`ek-itumu-be`** | 6.0 | `itumu` |
467
  | ekigwenbe | **`ek-igwen-be`** | 6.0 | `igwen` |
468
- | echichinibe | **`echichini-be`** | 4.5 | `echichini` |
469
- | echieekbe | **`echieek-be`** | 4.5 | `echieek` |
470
- | ekikpulube | **`ek-ik-pulu-be`** | 4.5 | `pulu` |
471
  | ikichieek | **`ik-ichieek`** | 4.5 | `ichieek` |
472
  | ekichichini | **`ek-ichichini`** | 4.5 | `ichichini` |
473
- | ekekikpulu | **`ek-ek-ik-pulu`** | 4.5 | `pulu` |
474
  | ekiweweek | **`ek-iweweek`** | 4.5 | `iweweek` |
475
- | ikibieen̄ | **`ik-ibiee-n̄`** | 3.0 | `ibiee` |
476
- | ekititiin̄ | **`ek-ititii-n̄`** | 3.0 | `ititii` |
477
- | etitiin̄be | **`etitii-n̄-be`** | 3.0 | `etitii` |
478
- | îriọọn̄be | **`îriọọ-n̄-be`** | 3.0 | `îriọọ` |
479
  | ekikpukpo | **`ek-ik-pukpo`** | 3.0 | `pukpo` |
 
 
 
 
480
 
481
  ### 6.6 Linguistic Interpretation
482
 
483
  > **Automated Insight:**
484
- The language ANN appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 
 
485
 
486
  ---
487
  ## 7. Summary & Recommendations
@@ -493,7 +530,7 @@ The language ANN appears to be more isolating or has a highly fixed vocabulary.
493
  | Component | Recommended | Rationale |
494
  |-----------|-------------|-----------|
495
  | Tokenizer | **16k BPE** | Best compression (4.35x) |
496
- | N-gram | **2-gram** | Lowest perplexity (241) |
497
  | Markov | **Context-4** | Highest predictability (95.5%) |
498
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
499
 
@@ -708,4 +745,4 @@ MIT License - Free for academic and commercial use.
708
  ---
709
  *Generated by Wikilangs Models Pipeline*
710
 
711
- *Report Date: 2026-01-03 05:13:43*
 
1
  ---
2
  language: ann
3
+ language_name: Obolo
4
  language_family: atlantic_other
5
  tags:
6
  - wikilangs
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-atlantic_other
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 4.353
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.1710
40
  - name: vocabulary_size
41
  type: vocab
42
  value: 0
43
  generated: 2026-01-03
44
  ---
45
 
46
+ # Obolo - Wikilangs Models
47
  ## Comprehensive Research Report & Full Ablation Study
48
 
49
+ This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Obolo** Wikipedia data.
50
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
51
 
52
  ## 📋 Repository Contents
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6-morphological-analysis-experimental)
74
  - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
 
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 4.116x | 4.12 | 0.1487% | 128,471 |
94
+ | **16k** | 4.353x 🏆 | 4.36 | 0.1572% | 121,476 |
95
 
96
  ### Tokenization Examples
97
 
98
  Below are sample sentences tokenized with each vocabulary size:
99
 
100
+ **Sample 1:** `Ọrọn môkọt ire: Ebi Ọrọn (ife) Ido Ọrọn (ama ere) Ọrọn (Mkpulu-ija) Usem Ọrọn...`
101
 
102
  | Vocab | Tokens | Count |
103
  |-------|--------|-------|
104
+ | 8k | `▁ọrọn ▁môkọtire : ▁ebi ▁ọrọn( ife ) ido ... (+17 more)` | 27 |
105
+ | 16k | `▁ọrọn ▁môkọtire : ▁ebi ▁ọrọn( ife )ido ... (+17 more)` | 27 |
106
 
107
+ **Sample 2:** `Nde ìre oke mgbọ òsoso usen jaaba. Ekisa nde ifuk onyan̄ isa ifuk acha si. Nd...`
108
 
109
  | Vocab | Tokens | Count |
110
  |-------|--------|-------|
111
+ | 8k | `▁nde ▁ìre ▁okemgbọ ▁òsoso usen ▁jaaba . ▁ekisande ... (+27 more)` | 37 |
112
+ | 16k | `▁nde ▁ìre ▁okemgbọ ▁òsoso usen ▁jaaba . ▁ekisande ... (+26 more)` | 36 |
113
 
114
+ **Sample 3:** `Ọngari (òrere Hungary me usem Ebeke, ire Magyarország me usem Ọn̄gari) ìre id...`
115
 
116
  | Vocab | Tokens | Count |
117
  |-------|--------|-------|
118
+ | 8k | `▁ọ n gari ( òrereh ungary ▁me ▁usemebeke ... (+28 more)` | 38 |
119
+ | 16k | `▁ọngari( òrerehungary ▁me ▁usem ▁ebeke , ▁ire ... (+19 more)` | 29 |
120
 
121
 
122
  ### Key Findings
123
 
124
+ - **Best Compression:** 16k achieves 4.353x compression
125
+ - **Lowest UNK Rate:** 8k with 0.1487% unknown tokens
126
  - **Trade-off:** Larger vocabularies improve compression but increase model size
127
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
128
 
 
139
 
140
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
141
  |--------|---------|------------|---------|----------------|------------------|-------------------|
142
+ | **2-gram** | Word | 1,077 | 10.07 | 2,406 | 36.6% | 78.7% |
143
+ | **2-gram** | Subword | 236 🏆 | 7.88 | 1,214 | 68.6% | 99.7% |
144
+ | **3-gram** | Word | 1,871 | 10.87 | 3,155 | 25.1% | 66.0% |
145
+ | **3-gram** | Subword | 1,382 | 10.43 | 7,013 | 32.7% | 80.9% |
146
+ | **4-gram** | Word | 3,277 | 11.68 | 4,661 | 17.3% | 49.0% |
147
+ | **4-gram** | Subword | 4,770 | 12.22 | 23,560 | 20.3% | 56.1% |
148
+ | **5-gram** | Word | 2,207 | 11.11 | 2,786 | 18.0% | 56.1% |
149
+ | **5-gram** | Subword | 9,921 | 13.28 | 38,424 | 14.8% | 42.7% |
150
 
151
  ### Top 5 N-grams by Size
152
 
 
154
 
155
  | Rank | N-gram | Count |
156
  |------|--------|-------|
157
+ | 1 | `me lek` | 1,069 |
158
+ | 2 | `me agan̄` | 831 |
159
+ | 3 | `me emen` | 791 |
160
  | 4 | `ido ya` | 458 |
161
+ | 5 | `ichit me` | 380 |
162
 
163
  **3-grams (Word):**
164
 
165
  | Rank | N-gram | Count |
166
  |------|--------|-------|
167
+ | 1 | `agan̄ ichep ura` | 215 |
168
  | 2 | `me ido ya` | 190 |
169
+ | 3 | `me agan̄ osiki` | 182 |
170
+ | 4 | `agan̄ mbum ura` | 174 |
171
+ | 5 | `me agan̄ inyọn̄` | 171 |
172
 
173
  **4-grams (Word):**
174
 
 
177
  | 1 | `me agan̄ mbum ura` | 103 |
178
  | 2 | `me agan̄ ichep ura` | 96 |
179
  | 3 | `me ido ya ìre` | 62 |
180
+ | 4 | `agan̄ inyọn̄ mbum ura` | 55 |
181
+ | 5 | `me usem uket chieen̄` | 50 |
182
+
183
+ **5-grams (Word):**
184
+
185
+ | Rank | N-gram | Count |
186
+ |------|--------|-------|
187
+ | 1 | `ene ewabe ichit me emen` | 48 |
188
+ | 2 | `me agan̄ inyọn̄ mbum ura` | 38 |
189
+ | 3 | `me agan̄ osiki mbum ura` | 37 |
190
+ | 4 | `me agan̄ osiki ichep ura` | 36 |
191
+ | 5 | `otu ifuk ebi ìluk me` | 33 |
192
 
193
  **2-grams (Subword):**
194
 
195
  | Rank | N-gram | Count |
196
  |------|--------|-------|
197
+ | 1 | `e _` | 19,047 |
198
+ | 2 | `_ i` | 16,640 |
199
+ | 3 | `_ m` | 14,795 |
200
+ | 4 | `_ e` | 11,553 |
201
+ | 5 | `a _` | 9,463 |
202
 
203
  **3-grams (Subword):**
204
 
205
  | Rank | N-gram | Count |
206
  |------|--------|-------|
207
+ | 1 | `_ m e` | 7,633 |
208
+ | 2 | `m e _` | 7,573 |
209
+ | 3 | `r e _` | 4,030 |
210
+ | 4 | `a _` | 3,973 |
211
+ | 5 | `e _ i` | 3,231 |
212
 
213
  **4-grams (Subword):**
214
 
215
  | Rank | N-gram | Count |
216
  |------|--------|-------|
217
+ | 1 | `_ m e _` | 7,454 |
218
+ | 2 | `_ m è _` | 2,866 |
219
+ | 3 | `l e k _` | 2,314 |
220
+ | 4 | `_ a g a` | 1,867 |
221
+ | 5 | `_ e b i` | 1,856 |
222
+
223
+ **5-grams (Subword):**
224
+
225
+ | Rank | N-gram | Count |
226
+ |------|--------|-------|
227
+ | 1 | `_ a g a n̄` | 1,844 |
228
+ | 2 | `_ e b i _` | 1,713 |
229
+ | 3 | `_ m e _ a` | 1,652 |
230
+ | 4 | `_ ì r e _` | 1,547 |
231
+ | 5 | `a g a n̄ _` | 1,513 |
232
 
233
 
234
  ### Key Findings
235
 
236
+ - **Best Perplexity:** 2-gram (subword) with 236
237
  - **Entropy Trend:** Generally increases with larger n-grams (sparser counts, less predictable)
238
+ - **Coverage:** Top-1000 patterns cover ~43% of corpus
239
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
240
 
241
  ---
 
251
 
252
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
253
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
254
+ | **1** | Word | 0.7698 | 1.705 | 4.61 | 9,664 | 23.0% |
255
+ | **1** | Subword | 1.1244 | 2.180 | 8.63 | 290 | 0.0% |
256
+ | **2** | Word | 0.2727 | 1.208 | 1.60 | 44,320 | 72.7% |
257
+ | **2** | Subword | 1.0645 | 2.091 | 5.47 | 2,502 | 0.0% |
258
+ | **3** | Word | 0.1063 | 1.076 | 1.18 | 70,635 | 89.4% |
259
+ | **3** | Subword | 0.7756 | 1.712 | 3.20 | 13,671 | 22.4% |
260
+ | **4** | Word | 0.0452 🏆 | 1.032 | 1.07 | 82,762 | 95.5% |
261
+ | **4** | Subword | 0.4895 | 1.404 | 2.03 | 43,657 | 51.0% |
262
 
263
  ### Generated Text Samples (Word-based)
264
 
 
266
 
267
  **Context Size 1:**
268
 
269
+ 1. `me lek kiban̄ ekitimbe akọn̄ ofirikosok gbọgbọ otu ifuk ikpọk ya ìre isilam inu`
270
+ 2. `mè ijọn̄ ido ya me naijiria agan̄ ichep ura ogwu òkitaak chieen̄ ikpọ mè mgbọ`
271
+ 3. `agan̄ mkpulu uwu usọ ifuk ene ewabe ichit me emen mgbọ etiopia ìkup me agọọk nkween̄`
272
 
273
  **Context Size 2:**
274
 
275
+ 1. `me lek adasi nkwukwuuk cha isa ikije mè isa me ikeya lesoto ìre ge me lek ijọn̄`
276
+ 2. `me agan̄ inyọn̄ mbum ura emen awaji atik me agan̄ inyọn̄ ichep ura agan̄ ichep`
277
+ 3. `me emen wire môkọtbe irọ inu due process odobe`
278
 
279
  **Context Size 3:**
280
 
281
+ 1. `agan̄ ichep ura oniin̄ ikpọkpọk ikire ibot ikọ me ukpatu ebi uga ifuk ibot chereyi kperiọọn̄ owuw...`
282
+ 2. `me ido ya ìnire usem furenchi mèlek usem wolof sa me ebi otoko wolof erebe ebi ìwawa ichit`
283
+ 3. `me agan̄ osiki ruwanda burundi kongo kinshasa ekup me agan̄ inyọn̄ abia akwa ibom me`
284
 
285
  **Context Size 4:**
286
 
287
+ 1. `me agan̄ mbum ura kan̄ emen awaji atilantik otap ikana ọmọ me agan̄ inyọn̄ afirika agan̄ inyọn̄ ò...`
288
+ 2. `me agan̄ ichep ura isi ire iteke indus me agan̄ mbum ura me afirika ire si òso 20`
289
+ 3. `me ido ya ìre furench ire îre akọp irek me efit si re akọp irek go me efit`
290
 
291
 
292
  ### Generated Text Samples (Subword-based)
 
295
 
296
  **Context Size 1:**
297
 
298
+ 1. `_idem_ijan̄_masee`
299
+ 2. `e_erelukp_mi_lup`
300
+ 3. `irit_enyikp_n_si`
301
 
302
  **Context Size 2:**
303
 
304
+ 1. `e_obageeleki_[cor`
305
+ 2. `_ike_ubọ_erere_ik`
306
+ 3. `_me_ebi_mè_mem_ya`
307
 
308
  **Context Size 3:**
309
 
310
+ 1. `_me_emire_ge,_mè_d`
311
+ 2. `me_jodan_ichechich`
312
+ 3. `re_ge,_ìkigwook,_ò`
313
 
314
  **Context Size 4:**
315
 
316
+ 1. `_me_si_inwàn_ikwaan̄`
317
+ 2. `_mè_ikikaan̄ge;_me_<`
318
+ 3. `lek_ebi_ìkike_eriọọ`
319
 
320
 
321
  ### Key Findings
322
 
323
  - **Best Predictability:** Context-4 (word) with 95.5% predictability
324
  - **Branching Factor:** Decreases with context size (more deterministic)
325
+ - **Memory Trade-off:** Larger contexts require more storage (43,657 contexts)
326
  - **Recommendation:** Context-3 or Context-4 for text generation
327
 
328
  ---
 
338
 
339
  | Metric | Value |
340
  |--------|-------|
341
+ | Vocabulary Size | 4,154 |
342
+ | Total Tokens | 89,919 |
343
+ | Mean Frequency | 21.65 |
344
  | Median Frequency | 4 |
345
+ | Frequency Std Dev | 152.24 |
346
 
347
  ### Most Common Words
348
 
349
  | Rank | Word | Frequency |
350
  |------|------|-----------|
351
+ | 1 | me | 7,502 |
352
+ | 2 | mè | 2,898 |
353
+ | 3 | agan̄ | 1,854 |
354
+ | 4 | ebi | 1,728 |
355
+ | 5 | ìre | 1,597 |
356
+ | 6 | lek | 1,576 |
357
+ | 7 | ido | 1,514 |
358
+ | 8 | eyi | 1,242 |
359
+ | 9 | ya | 1,165 |
360
+ | 10 | emen | 1,065 |
361
 
362
  ### Least Common Words (from vocabulary)
363
 
364
  | Rank | Word | Frequency |
365
  |------|------|-----------|
366
+ | 1 | lanzarote | 2 |
367
+ | 2 | iyaak | 2 |
368
+ | 3 | medvedev | 2 |
369
+ | 4 | race | 2 |
370
+ | 5 | lenin | 2 |
371
  | 6 | ọkọlọba | 2 |
372
  | 7 | ǹkọọn̄ | 2 |
373
  | 8 | edeh | 2 |
 
378
 
379
  | Metric | Value |
380
  |--------|-------|
381
+ | Zipf Coefficient | 1.1652 |
382
+ | R² (Goodness of Fit) | 0.990704 |
383
  | Adherence Quality | **excellent** |
384
 
385
  ### Coverage Analysis
386
 
387
  | Top N Words | Coverage |
388
  |-------------|----------|
389
+ | Top 100 | 59.9% |
390
+ | Top 1,000 | 87.9% |
391
  | Top 5,000 | N/A (vocabulary has only 4,154 words) |
392
  | Top 10,000 | N/A (vocabulary has only 4,154 words) |
393
 
394
  ### Key Findings
395
 
396
+ - **Zipf Compliance:** R²=0.9907 indicates excellent adherence to Zipf's law
397
+ - **High Frequency Dominance:** Top 100 words cover 59.9% of corpus
398
+ - **Long Tail:** The remaining 3,154 vocabulary words (beyond the top 1,000) account for the last 12.1% of coverage
399
 
400
  ---
401
  ## 5. Word Embeddings Evaluation
 
411
 
412
  ### 5.1 Cross-Lingual Alignment
413
 
414
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
415
+
416
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
417
 
418
 
419
  ### 5.2 Model Comparison
420
 
421
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
422
  |-------|-----------|----------|------------------|---------------|----------------|
423
+ | **mono_32d** | 32 | 0.1710 🏆 | 0.5365 | N/A | N/A |
424
+ | **mono_64d** | 64 | 0.0323 | 0.5579 | N/A | N/A |
425
+ | **mono_128d** | 128 | 0.0059 | 0.5505 | N/A | N/A |
426
+ | **aligned_32d** | 32 | 0.1710 | 0.5499 | 0.0111 | 0.1274 |
427
+ | **aligned_64d** | 64 | 0.0323 | 0.5740 | 0.0222 | 0.1717 |
428
+ | **aligned_128d** | 128 | 0.0059 | 0.5600 | 0.0139 | 0.1634 |
429
 
430
  ### Key Findings
431
 
432
+ - **Best Isotropy:** mono_32d with 0.1710 (more uniform distribution)
433
+ - **Semantic Density:** Average pairwise similarity of 0.5548. Lower values indicate better semantic separation.
434
+ - **Alignment Quality:** Aligned models achieve up to 2.2% R@1 in cross-lingual retrieval.
435
  - **Recommendation:** 64d aligned for best cross-lingual performance (highest R@1 and R@10 in the table above)
436
 
437
  ---
438
  ## 6. Morphological Analysis (Experimental)
439
 
 
 
440
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
441
 
442
  ### 6.1 Productivity & Complexity
443
 
444
  | Metric | Value | Interpretation | Recommendation |
445
  |--------|-------|----------------|----------------|
446
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
447
+ | Idiomaticity Gap | **0.314** | High formulaic/idiomatic content | - |
448
 
449
  ### 6.2 Affix Inventory (Productive Units)
450
 
 
453
  #### Productive Prefixes
454
  | Prefix | Examples |
455
  |--------|----------|
456
+ | `ek-` | eket, ekpabe, ekibene |
457
+ | `ik-` | ikpele, ikinen̄e, ikinyam |
458
 
459
  #### Productive Suffixes
460
  | Suffix | Examples |
461
  |--------|----------|
462
+ | `-n̄` | ugwun̄, akwaan̄, esun̄ |
463
+ | `-be` | ekpabe, îkwube, ejibibe |
464
 
465
  ### 6.3 Bound Stems (Lexical Roots)
466
 
 
468
 
469
  | Stem | Cohesion | Substitutability | Examples |
470
  |------|----------|------------------|----------|
471
+ | `gọọk` | 1.51x | 19 contexts | igọọk, agọọk, ogọọk |
472
+ | `tumu` | 1.46x | 21 contexts | ntumu, etumu, itumu |
473
+ | `kpul` | 1.58x | 16 contexts | ikpulu, îkpulu, òkpulu |
474
+ | `sibi` | 1.51x | 18 contexts | nsibi, ìsibi, îsibi |
475
+ | `kikp` | 1.44x | 19 contexts | òkikpa, òkikpọ, ekikpọ |
476
+ | `kana` | 1.41x | 20 contexts | ekana, ìkana, nkana |
477
+ | `chie` | 1.55x | 14 contexts | chief, ichieek, ìchieek |
478
+ | `riọọ` | 1.63x | 12 contexts | nriọọk, riọọn̄, nriọọn̄ |
479
+ | `kisa` | 1.43x | 17 contexts | òkisa, ìkisa, îkisa |
480
+ | `gbaa` | 1.46x | 15 contexts | ògbaan̄, egbaan̄, ìgbaan̄ |
481
+ | `ikaa` | 1.61x | 11 contexts | ikaan̄, enikaan̄, ebikaan̄ |
482
+ | `kpọk` | 1.39x | 16 contexts | ukpọk, okpọk, ọkpọk |
483
 
484
  ### 6.4 Affix Compatibility (Co-occurrence)
485
 
 
487
 
488
  | Prefix | Suffix | Frequency | Examples |
489
  |--------|--------|-----------|----------|
490
+ | `ik-` | `-n̄` | 15 words | ikikpan̄, ikwaan̄ |
491
+ | `ek-` | `-be` | 13 words | ekpabe, ekaan̄be |
492
+ | `ek-` | `-n̄` | 10 words | ekijeen̄, ekitoon̄ |
493
 
494
  ### 6.5 Recursive Morpheme Segmentation
495
 
 
500
  | ekinyambe | **`ek-inyam-be`** | 6.0 | `inyam` |
501
  | ekitumube | **`ek-itumu-be`** | 6.0 | `itumu` |
502
  | ekigwenbe | **`ek-igwen-be`** | 6.0 | `igwen` |
 
 
 
503
  | ikichieek | **`ik-ichieek`** | 4.5 | `ichieek` |
504
  | ekichichini | **`ek-ichichini`** | 4.5 | `ichichini` |
505
+ | echichinibe | **`echichini-be`** | 4.5 | `echichini` |
506
  | ekiweweek | **`ek-iweweek`** | 4.5 | `iweweek` |
507
+ | ekikpulube | **`ek-ik-pulu-be`** | 4.5 | `pulu` |
508
+ | ekekikpulu | **`ek-ek-ik-pulu`** | 4.5 | `pulu` |
509
+ | echieekbe | **`echieek-be`** | 4.5 | `echieek` |
 
510
  | ekikpukpo | **`ek-ik-pukpo`** | 3.0 | `pukpo` |
511
+ | ikpọchieen̄ | **`ik-pọchiee-n̄`** | 3.0 | `pọchiee` |
512
+ | ikibieen̄ | **`ik-ibiee-n̄`** | 3.0 | `ibiee` |
513
+ | eriọọn̄be | **`eriọọ-n̄-be`** | 3.0 | `eriọọ` |
514
+ | egbaan̄be | **`egbaa-n̄-be`** | 3.0 | `egbaa` |
515
 
516
  ### 6.6 Linguistic Interpretation
517
 
518
  > **Automated Insight:**
519
+ The language Obolo shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
520
+
521
+ > **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
522
 
523
  ---
524
  ## 7. Summary & Recommendations
 
530
  | Component | Recommended | Rationale |
531
  |-----------|-------------|-----------|
532
  | Tokenizer | **16k BPE** | Best compression (4.35x) |
533
+ | N-gram | **2-gram** | Lowest perplexity (236) |
534
  | Markov | **Context-4** | Highest predictability (95.5%) |
535
  | Embeddings | **64d** | Balanced semantic capture and isotropy |
536
 
 
745
  ---
746
  *Generated by Wikilangs Models Pipeline*
747
 
748
+ *Report Date: 2026-01-03 14:12:13*
models/embeddings/aligned/ann_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc209d93d13b120c8c95cba7ed8ea3511e7f3a3be58c71d6b52f823aba7e3a58
3
+ size 1025972707
models/embeddings/aligned/ann_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ann", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ann_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ecbf9194a3110e9db3f94be28e4d970c89d4ae602cd76c5e4114d930138c369
3
+ size 65664
models/embeddings/aligned/ann_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "ann",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 361,
7
+ "vocab_size": 1896
8
+ }
models/embeddings/aligned/ann_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:046109a4422a9a603a48c2abd0d192b758882631e72ac429557dc6f857e7b375
3
+ size 256516579
models/embeddings/aligned/ann_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ann", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ann_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ac69b84943611045a295f4bb05aa98d313c747e3e3ff20df0a86d90596c100c
3
+ size 4224
models/embeddings/aligned/ann_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "ann",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 361,
7
+ "vocab_size": 1896
8
+ }
models/embeddings/aligned/ann_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b00d28e874e41679c615dce915575ad329279f02507e33677323ebb64d12a95e
3
+ size 513001955
models/embeddings/aligned/ann_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ann", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ann_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8417b28498d4ded38028edb9ebd5965b99df3b52961d1191e017b63208828a47
3
+ size 16512
models/embeddings/aligned/ann_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "ann",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 361,
7
+ "vocab_size": 1896
8
+ }
models/embeddings/monolingual/ann_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ce6196c912d733cb4ecafe0bf16e0d4c89052442dc36f0dbeb46533cf966be8c
3
- size 1026069585
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc209d93d13b120c8c95cba7ed8ea3511e7f3a3be58c71d6b52f823aba7e3a58
3
+ size 1025972707
models/embeddings/monolingual/ann_128d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 128
13
  },
14
- "vocab_size": 1989
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 128
13
  },
14
+ "vocab_size": 1896
15
  }
models/embeddings/monolingual/ann_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a08097b63d64495bc93a7d4418641dbfe3c922fe0270d5f7d226318ad60d9b5c
3
- size 256542033
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:046109a4422a9a603a48c2abd0d192b758882631e72ac429557dc6f857e7b375
3
+ size 256516579
models/embeddings/monolingual/ann_32d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 32
13
  },
14
- "vocab_size": 1989
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 32
13
  },
14
+ "vocab_size": 1896
15
  }
models/embeddings/monolingual/ann_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2afa2ec65fb8007208da56111bface363437e9bec87e59314877a23345b9ad5e
3
- size 513051217
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b00d28e874e41679c615dce915575ad329279f02507e33677323ebb64d12a95e
3
+ size 513001955
models/embeddings/monolingual/ann_64d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 64
13
  },
14
- "vocab_size": 1989
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 64
13
  },
14
+ "vocab_size": 1896
15
  }
models/subword_markov/ann_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:133284a0112df666902ff1c8b3f87c98d928ca5e21cc040ac90519fc9b835b57
3
- size 24522
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14f1e4d2b16ca83acf99005e1d8f2ef8ddd448d536aa3453c2aed6989f746a93
3
+ size 24116
models/subword_markov/ann_markov_ctx1_subword_metadata.json CHANGED
@@ -3,5 +3,5 @@
3
  "variant": "subword",
4
  "language": "ann",
5
  "unique_contexts": 290,
6
- "total_transitions": 538876
7
  }
 
3
  "variant": "subword",
4
  "language": "ann",
5
  "unique_contexts": 290,
6
+ "total_transitions": 517578
7
  }
models/subword_markov/ann_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e64bce1c8aaafc988a5935e2ed628caf16c7fa8930527c46f194ce12fe4dc84a
3
- size 114074
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c0bec0998b02a5eeadf5a6d286401974bf066faefc1a30b5c0c82d32484aab0
3
+ size 106684
models/subword_markov/ann_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ann",
5
- "unique_contexts": 2537,
6
- "total_transitions": 538383
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ann",
5
+ "unique_contexts": 2502,
6
+ "total_transitions": 517106
7
  }
models/subword_markov/ann_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72b96d3df529060de51aa1fc4dcb9ab588af0929fb1e0eb2b958bff6d7a6b633
3
- size 338702
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:969d810e955fb0ffe5cc55f4bb726f3ed8d55b5683f15470101b94fd4aa037d2
3
+ size 339360
models/subword_markov/ann_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ann",
5
- "unique_contexts": 13932,
6
- "total_transitions": 537890
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ann",
5
+ "unique_contexts": 13671,
6
+ "total_transitions": 516634
7
  }
models/subword_markov/ann_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e09659d50aeeca9954a013b1ca95d57253b9f038aa3a3b0c4df7c0d2fc0caa1e
3
- size 785453
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7dcda352ac96f30b9157dcea88dff5aabf8fc2a3e539499978e16442601f454
3
+ size 772993
models/subword_markov/ann_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ann",
5
- "unique_contexts": 44651,
6
- "total_transitions": 537397
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ann",
5
+ "unique_contexts": 43657,
6
+ "total_transitions": 516162
7
  }
models/subword_ngram/ann_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b7879bdc16bf8a3f29b779a338720f6ce381e91fa8ba5e247518dc7ca628446
3
- size 17029
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54f121852b71196304a2a8986075bbf0fff2db6ac1109f14ff830a4533d19c57
3
+ size 16779
models/subword_ngram/ann_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ann",
5
- "unique_ngrams": 1230,
6
- "total_ngrams": 538876
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ann",
5
+ "unique_ngrams": 1214,
6
+ "total_ngrams": 517578
7
  }
models/subword_ngram/ann_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a22334c4abf593637f95f14f3195c7e9b977cab70ffe6f1ebd3ac954408c1e58
3
- size 82404
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:207a1b64805bd946990386c767a8f886d3619c8a8403b02522c13f431e351ae3
3
+ size 80691
models/subword_ngram/ann_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ann",
5
- "unique_ngrams": 7165,
6
- "total_ngrams": 538383
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ann",
5
+ "unique_ngrams": 7013,
6
+ "total_ngrams": 517106
7
  }
models/subword_ngram/ann_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:edff385989d8cb6ddba4a7096924e12c5f750a23c337bf178ed91664b6d637f8
3
- size 292528
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1033bd3465d84b199fe00a7423d9aea18725a1aedee84e66a6041c26eb24017
3
+ size 289755
models/subword_ngram/ann_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ann",
5
- "unique_ngrams": 24184,
6
- "total_ngrams": 537890
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ann",
5
+ "unique_ngrams": 23560,
6
+ "total_ngrams": 516634
7
  }
models/subword_ngram/ann_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afb932726cd84df3cca7c67ff8301e1bf81a11bdf5dd6200fb84663dfb585e6c
3
+ size 498830
models/subword_ngram/ann_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "ann",
5
+ "unique_ngrams": 38424,
6
+ "total_ngrams": 516162
7
+ }
models/tokenizer/ann_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:685a0e9ed7c0107b85e07b0286c03eb8b5cd6fc5ada26786ea117b602d0ca131
3
- size 511767
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b74f59549b94c9bbb91cffef6bc44cacadb4f440a30004c5287afd482a9adcf2
3
+ size 510998
models/tokenizer/ann_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ann_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13e4e094459602ef55a87f4cad2bf9b7eaaeb3ca6f3d5f2b6a36760cf76610a2
3
- size 374711
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89d553eae70a0c59b46d72bb3b7a2e84791949a1b7ea431452fca72f093214e2
3
+ size 374759
models/tokenizer/ann_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/ann_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8145b130fe314515c8d0e309cb26039c412cf86c9b7eeafd8670fe99f9cdda88
3
- size 69333
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9005473acebe3bdbe8c244c492a69eb7e7aad2dd59e7a35a9a4ca06fbaa2f2e5
3
+ size 67918
models/vocabulary/ann_vocabulary_metadata.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "language": "ann",
3
- "vocabulary_size": 4243,
4
  "variant": "full",
5
  "statistics": {
6
- "type_token_ratio": 0.09944472997349618,
7
  "coverage": {
8
- "top_100": 0.562737450998176,
9
- "top_1000": 0.8278562142878737,
10
- "top_5000": 0.9509427497455433
11
  },
12
- "hapax_count": 5625,
13
- "hapax_ratio": 0.5700243210376976,
14
- "total_documents": 493
15
  }
16
  }
 
1
  {
2
  "language": "ann",
3
+ "vocabulary_size": 4154,
4
  "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.10174905739421869,
7
  "coverage": {
8
+ "top_100": 0.564390448261416,
9
+ "top_1000": 0.8281210724759112,
10
+ "top_5000": 0.9506179304566401
11
  },
12
+ "hapax_count": 5561,
13
+ "hapax_ratio": 0.5724137931034483,
14
+ "total_documents": 472
15
  }
16
  }
models/word_markov/ann_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ba0963e3a66739914b91e29d2fbde25bf375a4973d157084f09f818b169e344
3
- size 334500
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2c2df7df4f25457b6030282137cac1a9855ddf787d0f90b75ec2442c37970bc
3
+ size 326225
models/word_markov/ann_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ann",
5
- "unique_contexts": 9818,
6
- "total_transitions": 98738
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ann",
5
+ "unique_contexts": 9664,
6
+ "total_transitions": 95008
7
  }
models/word_markov/ann_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32de02b53a9ca990e598e15d5ee6ad630df14d4ec653fb740b4a7d42bdb26a5f
3
- size 845652
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1421fc6795e12b35a51213ce822b2be78a7f4e7d0d13d3be77f09b266e60c0cd
3
+ size 823564
models/word_markov/ann_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ann",
5
- "unique_contexts": 45714,
6
- "total_transitions": 98245
7
  }
 
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ann",
5
+ "unique_contexts": 44320,
6
+ "total_transitions": 94536
7
  }
models/word_markov/ann_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed08c5a00857486794ad9d25afd118b795f050f0abf1cd60420d0f37b096aa05
3
- size 1239826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e1cc33784190581e139822dc45a48a345ed714d3a644a8855faf21ee3b75546
3
+ size 1200817
models/word_markov/ann_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "ann",
5
- "unique_contexts": 73222,
6
- "total_transitions": 97752
7
  }
 
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "ann",
5
+ "unique_contexts": 70635,
6
+ "total_transitions": 94064
7
  }
models/word_markov/ann_markov_ctx4_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9952b21c610e474602739d4608bcdf47886b1e13139eb17d44e2d73ff21aff7f
3
- size 1490446
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3282749b62f9354b96f20943251ce901348cdeac9a17c0d4a1a76554e03042c7
3
+ size 1439251
models/word_markov/ann_markov_ctx4_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "word",
4
  "language": "ann",
5
- "unique_contexts": 86066,
6
- "total_transitions": 97259
7
  }
 
2
  "context_size": 4,
3
  "variant": "word",
4
  "language": "ann",
5
+ "unique_contexts": 82762,
6
+ "total_transitions": 93592
7
  }