omarkamali committed (verified)
Commit b2b9a92 · Parent: e91ecf9

Upload all models and assets for din (latest)

This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.

Files changed (50):
  1. .gitattributes +1 -0
  2. README.md +310 -126
  3. models/embeddings/aligned/din_128d.bin +3 -0
  4. models/embeddings/aligned/din_128d.meta.json +1 -0
  5. models/embeddings/aligned/din_128d.projection.npy +3 -0
  6. models/embeddings/aligned/din_128d_metadata.json +8 -0
  7. models/embeddings/aligned/din_32d.bin +3 -0
  8. models/embeddings/aligned/din_32d.meta.json +1 -0
  9. models/embeddings/aligned/din_32d.projection.npy +3 -0
  10. models/embeddings/aligned/din_32d_metadata.json +8 -0
  11. models/embeddings/aligned/din_64d.bin +3 -0
  12. models/embeddings/aligned/din_64d.meta.json +1 -0
  13. models/embeddings/aligned/din_64d.projection.npy +3 -0
  14. models/embeddings/aligned/din_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/din_128d.bin +2 -2
  16. models/embeddings/monolingual/din_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/din_32d.bin +2 -2
  18. models/embeddings/monolingual/din_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/din_64d.bin +2 -2
  20. models/embeddings/monolingual/din_64d_metadata.json +5 -3
  21. models/subword_markov/din_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/din_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/din_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/din_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/din_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/din_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/din_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/din_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/din_2gram_subword.parquet +2 -2
  30. models/subword_ngram/din_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/din_3gram_subword.parquet +2 -2
  32. models/subword_ngram/din_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/din_4gram_subword.parquet +2 -2
  34. models/subword_ngram/din_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/din_5gram_subword.parquet +3 -0
  36. models/subword_ngram/din_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/din_tokenizer_16k.model +2 -2
  38. models/tokenizer/din_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/din_tokenizer_32k.model +2 -2
  40. models/tokenizer/din_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/din_tokenizer_8k.model +2 -2
  42. models/tokenizer/din_tokenizer_8k.vocab +0 -0
  43. models/vocabulary/din_vocabulary.parquet +2 -2
  44. models/vocabulary/din_vocabulary_metadata.json +10 -9
  45. models/word_markov/din_markov_ctx1_word.parquet +2 -2
  46. models/word_markov/din_markov_ctx1_word_metadata.json +2 -2
  47. models/word_markov/din_markov_ctx2_word.parquet +2 -2
  48. models/word_markov/din_markov_ctx2_word_metadata.json +2 -2
  49. models/word_markov/din_markov_ctx3_word.parquet +2 -2
  50. models/word_markov/din_markov_ctx3_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 language: din
-language_name: DIN
+language_name: Dinka
 language_family: african_nilotic
 tags:
 - wikilangs
@@ -10,11 +10,21 @@ tags:
 - n-gram
 - markov
 - wikipedia
+- feature-extraction
+- sentence-similarity
+- tokenization
+- n-grams
+- markov-chain
+- text-mining
+- fasttext
+- babelvec
+- vocabulous
+- vocabulary
 - monolingual
 - family-african_nilotic
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
+pipeline_tag: text-generation
 datasets:
 - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
 - name: best_compression_ratio
   type: compression
-  value: 4.266
+  value: 4.248
 - name: best_isotropy
   type: isotropy
-  value: 0.1273
+  value: 0.2108
 - name: vocabulary_size
   type: vocab
-  value: 5872
-generated: 2025-12-30
+  value: 0
+generated: 2026-01-04
 ---
 
-# DIN - Wikilangs Models
+# Dinka - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
 
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **DIN** Wikipedia data.
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Dinka** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 
 ## 📋 Repository Contents
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
+
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 
 ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 
@@ -68,50 +80,53 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
+
 ### Results
 
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.627x | 3.61 | 0.1100% | 146,415 |
-| **16k** | 3.922x | 3.91 | 0.1189% | 135,412 |
-| **32k** | 4.266x 🏆 | 4.25 | 0.1293% | 124,488 |
+| **8k** | 3.696x | 3.70 | 1.0395% | 137,657 |
+| **16k** | 3.984x | 3.99 | 1.1206% | 127,694 |
+| **32k** | 4.248x 🏆 | 4.25 | 1.1949% | 119,761 |
 
 ### Tokenization Examples
 
 Below are sample sentences tokenized with each vocabulary size:
 
-**Sample 1:** `Paankɔc Ciɛl de Libya ee paan thiɔ̈ɔ̈k thïn Apirïka ciɛlic. Genamaatnhomde ay...`
+**Sample 1:** `Ukraine ee paan en Yurop Penëdhiäk ee Volodymyr Zelensky. Genamaatnhomde ayee cɔ...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁paankɔcciɛldelibyaeepaanthiɔ̈ɔ̈k ▁thïn ▁apirïka ... (+11 more)` | 21 |
-| 16k | `▁paankɔcciɛldelibyaeepaanthiɔ̈ɔ̈k ▁thïnapirïka ... (+11 more)` | 21 |
-| 32k | `▁paankɔcciɛldelibyaeepaanthiɔ̈ɔ̈kthïn ▁apirïka ... (+10 more)` | 20 |
+| 8k | `▁ukraineeepaanenyuroppenëdhiäkeev ol od ... (+15 more)` | 25 |
+| 16k | `▁ukraineeepaanenyuroppenëdhiäkeev olodymyrzelensky ... (+8 more)` | 18 |
+| 32k | `▁ukraineeepaanenyuroppenëdhiäkeevolodymyrzelensky . ... (+5 more)` | 15 |
 
-**Sample 2:** `Heen acï puööu miet apeidït ne rin cïï ok rot mat thääi pinynhom yiic.
-
-Piööc k...`
+**Sample 2:** `Monteaguila ee gendït Chile. Cinëkɔcde aa tëcit ruonic`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁h eenacïpu öö u ▁mi et apei dït ... (+21 more)` | 31 |
-| 16k | `▁heen ▁acï ▁pu öö umi et apeidïtnerin ... (+18 more)` | 28 |
-| 32k | `▁heenacïpuööumiet ▁apeidïtnerincïïok ▁rot ... (+15 more)` | 25 |
+| 8k | `▁mon te agu ila eegendït ▁ch ile .cinëkɔcde ... (+3 more)` | 13 |
+| 16k | `▁mon te agu ila ▁eegendïtchile . cinëkɔcdeaa ... (+2 more)` | 12 |
+| 32k | `▁monteaguilaeegendïtchile .cinëkɔcdeaatëcitruonic` | 9 |
 
-**Sample 3:** `+Japan 125px 135px 300px
-Japan ee pamac tɔ Athiɛ. Genamaatnhomde ayee cɔl Tokyo...`
+**Sample 3:** `Dhambia ee Apirïka. Genamaatnhomde ayee cɔl Lusaka.`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁+ j apan 1 2 5 px1 ... (+23 more)` | 33 |
-| 16k | `▁+ japan1 2 5 px1 3 ... (+21 more)` | 31 |
-| 32k | `▁+ japan1 2 5 px1 3 ... (+21 more)` | 31 |
+| 8k | `▁dhambia ▁eeapirïka . ▁genamaatnhomde ▁ayee ▁cɔllu sak a ... (+1 more)` | 11 |
+| 16k | `▁dhambia ▁eeapirïka . ▁genamaatnhomde ▁ayee ▁cɔllusaka .` | 9 |
+| 32k | `▁dhambia ▁eeapirïka . ▁genamaatnhomde ▁ayee ▁cɔllusaka .` | 9 |
 
 ### Key Findings
 
-- **Best Compression:** 32k achieves 4.266x compression
-- **Lowest UNK Rate:** 8k with 0.1100% unknown tokens
+- **Best Compression:** 32k achieves 4.248x compression
+- **Lowest UNK Rate:** 8k with 1.0395% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
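
The `.model`/`.vocab` pairs under `models/tokenizer/` follow SentencePiece's file layout, so the compression figures above can be spot-checked. A minimal sketch, assuming the files really are standard SentencePiece models and that "compression" means characters per token (both assumptions, not documented in this diff):

```python
# Sketch: load the 32k tokenizer and estimate compression on a sample.
# Assumes SentencePiece format (suggested by the .model/.vocab naming);
# "chars per token" as the compression definition is also an assumption.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/din_tokenizer_32k.model")

text = "Dhambia ee Apirïka. Genamaatnhomde ayee cɔl Lusaka."  # Sample 3 above
pieces = sp.encode(text, out_type=str)
print(pieces)
print(f"compression ~ {len(text) / len(pieces):.2f} chars/token")
```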
@@ -120,57 +135,111 @@ Japan ee pamac tɔ Athiɛ. Genamaatnhomde ayee cɔl Tokyo...`
 
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 
+![N-gram Unique](visualizations/ngram_unique.png)
+
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 
 ### Results
 
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 969 🏆 | 9.92 | 2,614 | 42.8% | 79.5% |
-| **2-gram** | 353 🏆 | 8.46 | 1,719 | 60.1% | 98.7% |
-| **3-gram** | 958 | 9.90 | 2,667 | 44.4% | 78.1% |
-| **3-gram** | 2,358 | 11.20 | 10,075 | 24.1% | 70.3% |
-| **4-gram** | 1,138 | 10.15 | 3,607 | 44.8% | 70.6% |
-| **4-gram** | 9,037 | 13.14 | 33,136 | 12.8% | 42.8% |
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 846 | 9.72 | 1,522 | 38.9% | 86.3% |
+| **2-gram** | Subword | 328 | 8.36 | 1,563 | 62.0% | 99.1% |
+| **3-gram** | Word | 240 | 7.90 | 785 | 62.9% | 100.0% |
+| **3-gram** | Subword | 2,240 | 11.13 | 9,446 | 25.3% | 71.0% |
+| **4-gram** | Word | 166 | 7.38 | 882 | 69.6% | 100.0% |
+| **4-gram** | Subword | 8,823 | 13.11 | 31,591 | 13.0% | 43.0% |
+| **5-gram** | Word | 59 🏆 | 5.89 | 373 | 86.5% | 100.0% |
+| **5-gram** | Subword | 18,719 | 14.19 | 51,151 | 8.6% | 31.8% |
 
 ### Top 5 N-grams by Size
 
-**2-grams:**
+**2-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `glossary derived` | 167 |
+| 2 | `derived from` | 167 |
+| 3 | `from sil` | 167 |
+| 4 | `sil internationals` | 167 |
+| 5 | `internationals draft` | 167 |
+
+**3-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `internationals draft dinka` | 167 |
+| 2 | `from sil internationals` | 167 |
+| 3 | `derived from sil` | 167 |
+| 4 | `dinka glossary derived` | 167 |
+| 5 | `educational foundation sil` | 167 |
+
+**4-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `english to dinka glossary` | 167 |
+| 2 | `to dinka glossary derived` | 167 |
+| 3 | `dinka glossary derived from` | 167 |
+| 4 | `glossary derived from sil` | 167 |
+| 5 | `from sil internationals draft` | 167 |
+
+**5-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `, ku` | 881 |
-| 2 | n` | 812 |
-| 3 | k` | 805 |
-| 4 | ̈` | 709 |
-| 5 | ɔ` | 708 |
+| 1 | `dinka glossary derived from sil` | 167 |
+| 2 | `williamson educational foundation sil international` | 167 |
+| 3 | `kay williamson educational foundation sil` | 167 |
+| 4 | `dictionary kay williamson educational foundation` | 167 |
+| 5 | `english dictionary kay williamson educational` | 167 |
 
-**3-grams:**
+**2-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | ɔ ̈` | 704 |
-| 2 | ɛ ̈` | 526 |
-| 3 | ` ̈ k` | 316 |
-| 4 | `bɛ ̈ n` | 233 |
-| 5 | `. bekätakthook :` | 223 |
+| 1 | `_ k` | 14,243 |
+| 2 | `e _` | 10,060 |
+| 3 | `_ a` | 9,948 |
+| 4 | _` | 8,555 |
+| 5 | `n _` | 7,924 |
 
-**4-grams:**
+**3-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `. 2006 . english` | 167 |
-| 2 | `blench . 2006 .` | 167 |
-| 3 | `2006 . english to` | 167 |
-| 4 | `: derived from sil` | 167 |
-| 5 | `. kay williamson educational` | 167 |
+| 1 | `_ k u` | 4,510 |
+| 2 | `n ë _` | 3,923 |
+| 3 | `k u _` | 3,559 |
+| 4 | `_ k e` | 3,459 |
+| 5 | `_ t h` | 3,193 |
+
+**4-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ k u _` | 3,514 |
+| 2 | `_ n ë _` | 2,762 |
+| 3 | `_ d e _` | 2,147 |
+| 4 | `_ k e _` | 1,756 |
+| 5 | `_ y e _` | 1,452 |
+
+**5-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ k ɔ c _` | 1,091 |
+| 2 | `, _ k u _` | 836 |
+| 3 | `_ y e n _` | 729 |
+| 4 | `a t i o n` | 718 |
+| 5 | `t i o n a` | 686 |
 
 ### Key Findings
 
-- **Best Perplexity:** 2-gram with 353
+- **Best Perplexity:** 5-gram (word) with 59
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~43% of corpus
+- **Coverage:** Top-1000 patterns cover ~32% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 
 ---
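
The Perplexity and Entropy columns in the results above are two views of the same quantity: perplexity is 2 raised to the entropy in bits (2^5.89 ≈ 59 for the word 5-gram). A minimal sketch over a toy count table:

```python
# Sketch: Shannon entropy (bits) and perplexity of an n-gram distribution.
# The toy counts stand in for the real tables under models/*_ngram/.
import math

def entropy_and_perplexity(counts):
    total = sum(counts.values())
    h = -sum((c / total) * math.log2(c / total) for c in counts.values())
    return h, 2 ** h  # perplexity = 2^entropy

h, ppl = entropy_and_perplexity({"ku th": 40, "ku de": 25, "ee paan": 10})
print(f"entropy = {h:.2f} bits, perplexity = {ppl:.1f}")
```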
@@ -178,55 +247,86 @@ Japan ee pamac tɔ Athiɛ. Genamaatnhomde ayee cɔl Tokyo...`
 
 ![Markov Entropy](visualizations/markov_entropy.png)
 
+![Markov Contexts](visualizations/markov_contexts.png)
+
 ![Markov Branching](visualizations/markov_branching.png)
 
 ### Results
 
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.6001 | 1.516 | 4.04 | 17,160 | 40.0% |
-| **1** | 1.6144 | 3.062 | 12.68 | 311 | 0.0% |
-| **2** | 0.2159 | 1.161 | 1.46 | 69,220 | 78.4% |
-| **2** | 1.1540 | 2.225 | 5.73 | 3,939 | 0.0% |
-| **3** | 0.0701 | 1.050 | 1.12 | 101,209 | 93.0% |
-| **3** | 0.7580 | 1.691 | 3.07 | 22,573 | 24.2% |
-| **4** | 0.0275 🏆 | 1.019 | 1.04 | 113,437 | 97.2% |
-| **4** | 0.4989 🏆 | 1.413 | 2.04 | 69,374 | 50.1% |
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.6343 | 1.552 | 3.69 | 17,365 | 36.6% |
+| **1** | Subword | 1.5315 | 2.891 | 11.78 | 318 | 0.0% |
+| **2** | Word | 0.1750 | 1.129 | 1.30 | 63,845 | 82.5% |
+| **2** | Subword | 1.1046 | 2.150 | 5.58 | 3,744 | 0.0% |
+| **3** | Word | 0.0333 | 1.023 | 1.04 | 83,004 | 96.7% |
+| **3** | Subword | 0.7588 | 1.692 | 3.12 | 20,888 | 24.1% |
+| **4** | Word | 0.0076 🏆 | 1.005 | 1.01 | 86,340 | 99.2% |
+| **4** | Subword | 0.5088 | 1.423 | 2.08 | 65,173 | 49.1% |
 
-### Generated Text Samples
+### Generated Text Samples (Word-based)
 
-Below are text samples generated from each Markov chain model:
+Below are text samples generated from each word-based Markov chain model:
 
 **Context Size 1:**
 
-1. k ̈ yuganda , ̈ th aacï röth juiɛr ke piny de`
-2. `. ̈ nyë riɛm ̈ ɔ ̈ juëkjuëk apɛi ake mɛt thïriɛa ,`
-3. `, olimpik löŋden cal lɔn yen wën ye lac tïŋ ɣɔn këc guoɔn nyucciëëŋden tueŋ`
+1. `ku gɛɛth puɔɔth ben jam ë kɔcnhiaardiɛtë acik gam ke panmäcalëi french indochina ya kë`
+2. `në bɛ̈ɛ̈i tënë tïmëtïm 57 ku tiem thidhic ku kek aa alëk dɛl miɲ kaːl`
+3. `de spain ku aye raan döŋ acï giit en kɛ̈ɛ̈cë anyak atɔ̈ thïn rin keloirɔt wët`
 
 **Context Size 2:**
 
-1. `, ku tënɔŋ adhande ke keek . kuat yic bɛ ̈ nywut . keekë kɔckɛ ̈ ,`
-2. n ya ̈ nykoormacbaai tueŋ zubair mohamed salih , ̈ n . kɔc ke`
-3. k alëu bïkï nhïïm nɔŋ tuŋ bär ye cɔl mayoŋ . pïu rac cï cöp abuk`
+1. `english dictionary kay williamson educational foundation sil international dikconari thudän`
+2. `english to dinka glossary derived from sil internationals draft dinka english dictionary kay william...`
+3. `to dinka glossary derived from sil internationals draft dinka english dictionary kay williamson educ...`
 
 **Context Size 3:**
 
-1. ɔ ̈ k ɣaa këc pööc lac dööt , ciɛŋden thɛɛr ku pïïr ɣene töŋ maɣëmë ke`
-2. ɛ ̈ r cïke baar rɛɛnken tɔŋbaai de rou kɔc ke thudän ( ylkt ) ku`
-3. ` ̈ k ke 15 , 000 dɔm . burjuŋ aka rilic bï ye thok poc kek`
+1. `and roger blench english to dinka glossary derived from sil internationals draft dinka english dicti...`
+2. `internationals draft dinka english dictionary kay williamson educational foundation sil internationa...`
+3. `roger blench english to dinka glossary derived from sil internationals draft dinka english dictionar...`
 
 **Context Size 4:**
 
-1. `from sil international ' s 2005 draft dinka - english dictionary . kay williamson educational founda...`
-2. `' s 2005 draft dinka - english dictionary . kay williamson educational foundation / sil internationa...`
-3. `draft dinka - english dictionary . kay williamson educational foundation / sil international . dikco...`
+1. `internationals draft dinka english dictionary kay williamson educational foundation sil internationa...`
+2. `to dinka glossary derived from sil internationals draft dinka english dictionary kay williamson educ...`
+3. `derived from sil internationals draft dinka english dictionary kay williamson educational foundation...`
+
+### Generated Text Samples (Subword-based)
+
+Below are text samples generated from each subword-based Markov chain model:
+
+**Context Size 1:**
+
+1. `_adde_cïnapae_lu`
+2. `a_piic_ciän_anya`
+3. `kuɛ̈c_arabo_san_k`
+
+**Context Size 2:**
+
+1. `_ku_acï_raŋdec_bï`
+2. `e_bïk_ëk_cök_de_y`
+3. `_aŋrɛn,_juäi_adhi`
+
+**Context Size 3:**
+
+1. `_ku_yiic,_thudän._`
+2. `në_2._“tx2_awɛ̈ɛ̈rde`
+3. `ku_puses)._ë_makut`
+
+**Context Size 4:**
+
+1. `_ku_cɔl_muɔɔr_aacë_`
+2. `_në_keye,_ee_noŋic_`
+3. `_de_joŋlei_paguot_k`
 
 ### Key Findings
 
-- **Best Predictability:** Context-4 with 97.2% predictability
+- **Best Predictability:** Context-4 (word) with 99.2% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (69,374 contexts)
+- **Memory Trade-off:** Larger contexts require more storage (65,173 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 
 ---
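
The samples above are produced by walking a transition table like the ones shipped in `models/word_markov/`. A minimal generation loop; the `transitions` dict is illustrative, since the parquet schema is not documented in this diff:

```python
# Sketch: sampling from a context-2 word Markov chain with toy counts.
import random

transitions = {
    ("ku", "tënɔŋ"): {"adhande": 3, "kɔc": 1},
    ("tënɔŋ", "adhande"): {"ke": 2},
}

def generate(seed, n_words=10):
    out = list(seed)
    for _ in range(n_words):
        nxt = transitions.get(tuple(out[-2:]))
        if not nxt:
            break  # unseen context: stop (no smoothing in this sketch)
        words, weights = zip(*nxt.items())
        out.append(random.choices(words, weights=weights)[0])
    return " ".join(out)

print(generate(("ku", "tënɔŋ")))
```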
@@ -242,26 +342,26 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 5,872 |
-| Total Tokens | 91,802 |
-| Mean Frequency | 15.63 |
+| Vocabulary Size | 5,848 |
+| Total Tokens | 81,189 |
+| Mean Frequency | 13.88 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 91.07 |
+| Frequency Std Dev | 86.66 |
 
 ### Most Common Words
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | ku | 3,547 |
-| 2 | në | 2,806 |
-| 3 | de | 2,159 |
-| 4 | ë | 1,891 |
-| 5 | ke | 1,779 |
-| 6 | ye | 1,488 |
-| 7 | kɔc | 1,187 |
-| 8 | ee | 1,187 |
+| 1 | ku | 3,546 |
+| 2 | në | 2,775 |
+| 3 | de | 2,158 |
+| 4 | ë | 1,890 |
+| 5 | ke | 1,776 |
+| 6 | ye | 1,484 |
+| 7 | ee | 1,173 |
+| 8 | kɔc | 1,137 |
 | 9 | cï | 883 |
-| 10 | k | 882 |
+| 10 | yen | 747 |
 
 ### Least Common Words (from vocabulary)
 
@@ -269,9 +369,9 @@ Below are text samples generated from each Markov chain model:
 |------|------|-----------|
 | 1 | mayall | 2 |
 | 2 | cream | 2 |
-| 3 | layla | 2 |
-| 4 | adëgëk | 2 |
-| 5 | 1988 | 2 |
+| 3 | puɔ̈k | 2 |
+| 4 | layla | 2 |
+| 5 | adëgëk | 2 |
 | 6 | skobarkä | 2 |
 | 7 | pïlïbït | 2 |
 | 8 | tïgër | 2 |
@@ -282,24 +382,24 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.0740 |
-| R² (Goodness of Fit) | 0.989252 |
+| Zipf Coefficient | 1.0295 |
+| R² (Goodness of Fit) | 0.989261 |
 | Adherence Quality | **excellent** |
 
 ### Coverage Analysis
 
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 48.2% |
-| Top 1,000 | 80.3% |
-| Top 5,000 | 98.1% |
+| Top 100 | 47.4% |
+| Top 1,000 | 78.6% |
+| Top 5,000 | 97.9% |
 | Top 10,000 | 0.0% |
 
 ### Key Findings
 
 - **Zipf Compliance:** R²=0.9893 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 48.2% of corpus
-- **Long Tail:** -4,128 words needed for remaining 100.0% coverage
+- **High Frequency Dominance:** Top 100 words cover 47.4% of corpus
+- **Long Tail:** -4,152 words needed for remaining 100.0% coverage
 
 ---
 ## 5. Word Embeddings Evaluation
@@ -312,24 +412,105 @@ Below are text samples generated from each Markov chain model:
 
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
 
-### Model Comparison
+### 5.1 Cross-Lingual Alignment
 
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 2,175 | 32 | 2.250 | 0.776 | 0.1273 🏆 |
-| **mono_64d** | 2,175 | 64 | 2.220 | 0.783 | 0.0336 |
-| **mono_128d** | 2,175 | 128 | 2.208 | 0.762 | 0.0072 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
+
+### 5.2 Model Comparison
+
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.2108 🏆 | 0.6155 | N/A | N/A |
+| **mono_64d** | 64 | 0.0418 | 0.6059 | N/A | N/A |
+| **mono_128d** | 128 | 0.0088 | 0.6443 | N/A | N/A |
+| **aligned_32d** | 32 | 0.2108 | 0.5998 | 0.0070 | 0.0607 |
+| **aligned_64d** | 64 | 0.0418 | 0.5881 | 0.0187 | 0.1028 |
+| **aligned_128d** | 128 | 0.0088 | 0.6544 | 0.0164 | 0.0911 |
 
 ### Key Findings
 
-- **Best Isotropy:** mono_32d with 0.1273 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 2,175 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
+- **Best Isotropy:** mono_32d with 0.2108 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.6180. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 1.9% R@1 in cross-lingual retrieval.
+- **Recommendation:** 128d aligned for best cross-lingual performance
+
+---
+## 6. Morphological Analysis (Experimental)
+
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+
+### 6.1 Productivity & Complexity
+
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **1.232** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **2.143** | High formulaic/idiomatic content | - |
+
+### 6.2 Affix Inventory (Productive Units)
+
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+| `th-` | thiεkde, thɔ̈r, thiɛɛr |
+
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-ic` | tocdïtic, nyinic, ciaryic |
+
+### 6.3 Bound Stems (Lexical Roots)
+
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `thiä` | 1.36x | 12 contexts | thiär, thiäŋ, thiäi |
+
+### 6.4 Affix Compatibility (Co-occurrence)
+
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `th-` | `-ic` | 10 words | thändïtic, thudänic |
+
+### 6.5 Recursive Morpheme Segmentation
+
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| kathɛɛric | **`kathɛɛr-ic`** | 4.5 | `kathɛɛr` |
+| wëlëmiiric | **`wëlëmiir-ic`** | 4.5 | `wëlëmiir` |
+| ruɔ̈ɔ̈nic | **`ruɔ̈ɔ̈n-ic`** | 4.5 | `ruɔ̈ɔ̈n` |
+| pïïrdenic | **`pïïrden-ic`** | 4.5 | `pïïrden` |
+| manywëëthic | **`manywëëth-ic`** | 4.5 | `manywëëth` |
+| pinynhomic | **`pinynhom-ic`** | 4.5 | `pinynhom` |
+| krïthmathic | **`krïthmath-ic`** | 4.5 | `krïthmath` |
+| käcïpuric | **`käcïpur-ic`** | 4.5 | `käcïpur` |
+| abëkruöönic | **`abëkruöön-ic`** | 4.5 | `abëkruöön` |
+| thändïtic | **`th-ändït-ic`** | 3.0 | `ändït` |
+| thiɛ̈ɛ̈ric | **`th-iɛ̈ɛ̈r-ic`** | 3.0 | `iɛ̈ɛ̈r` |
+| wëljamiic | **`wëljami-ic`** | 1.5 | `wëljami` |
+| pabakciɛlic | **`pabakciɛl-ic`** | 1.5 | `pabakciɛl` |
+| thanypiny | **`th-anypiny`** | 1.5 | `anypiny` |
+| lëkthɛɛric | **`lëkthɛɛr-ic`** | 1.5 | `lëkthɛɛr` |
+
+### 6.6 Linguistic Interpretation
+
+> **Automated Insight:**
+> The language Dinka shows moderate morphological complexity. There is a balanced trade-off between whole-word memorization and subword composition.
+
+> **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
 
 ---
-## 6. Summary & Recommendations
+## 7. Summary & Recommendations
 
 ![Performance Dashboard](visualizations/performance_dashboard.png)
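
Alignment R@1/R@10 in the model comparison above is standard nearest-neighbour retrieval: for each source word, check whether its reference translation ranks among the top k targets by cosine similarity. A sketch with placeholder matrices (the real aligned embeddings ship under `models/embeddings/aligned/`):

```python
# Sketch: recall@k between two row-aligned embedding matrices
# (row i of src and tgt are assumed to be translations of each other).
import numpy as np

rng = np.random.default_rng(0)
src = rng.normal(size=(100, 32))   # stand-ins for real aligned vectors
tgt = rng.normal(size=(100, 32))

def recall_at_k(src, tgt, k):
    src = src / np.linalg.norm(src, axis=1, keepdims=True)
    tgt = tgt / np.linalg.norm(tgt, axis=1, keepdims=True)
    sims = src @ tgt.T                       # cosine similarity matrix
    topk = np.argsort(-sims, axis=1)[:, :k]  # k nearest targets per source
    return np.mean([i in topk[i] for i in range(len(src))])

print(recall_at_k(src, tgt, 1), recall_at_k(src, tgt, 10))
```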
@@ -337,11 +518,12 @@ Below are text samples generated from each Markov chain model:
 
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (4.27x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (353) |
-| Markov | **Context-4** | Highest predictability (97.2%) |
+| Tokenizer | **32k BPE** | Best compression (4.25x) |
+| N-gram | **5-gram** | Lowest perplexity (59) |
+| Markov | **Context-4** | Highest predictability (99.2%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 
+
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
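
Section 6.2 above defines an affix by substitutability: stripping it must leave a stem that is attested elsewhere. A toy version of that check; the report's actual thresholds and sampling procedure are not specified here:

```python
# Sketch: the substitutability heuristic described in section 6.2.
# A suffix counts as productive if stripping it from several words
# leaves stems that occur in the vocabulary on their own.
vocab = {"ruɔ̈ɔ̈n", "ruɔ̈ɔ̈nic", "pinynhom", "pinynhomic", "nyin", "nyinic"}

def productive_suffixes(vocab, min_stems=2):
    found = {}
    for word in vocab:
        for cut in range(1, 4):            # candidate suffixes of length 1-3
            stem, suffix = word[:-cut], word[-cut:]
            if len(stem) > 1 and stem in vocab:
                found.setdefault(suffix, set()).add(stem)
    return {s: st for s, st in found.items() if len(st) >= min_stems}

print(productive_suffixes(vocab))  # expect '-ic' with several stems, as in 6.5
```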
@@ -531,7 +713,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs}
   institution = {Omneity Labs}
 }
@@ -547,7 +730,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
 
-*Report Date: 2025-12-30 08:24:47*
+*Report Date: 2026-01-04 02:12:14*
models/embeddings/aligned/din_128d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f243cc52a4cfb3763acfd48b178dcd64b011e9131ecbb5e1d510de62a856c07
+size 1026179536

models/embeddings/aligned/din_128d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "din", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/din_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:479c369c09f8e5f73f66bd6462abbe1d0234a6e139b63df3eb83c04cd32d0da6
+size 65664

models/embeddings/aligned/din_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "din",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 428,
+  "vocab_size": 2096
+}
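
The three-line bodies above are not the model weights; they are Git LFS pointer files (`version`, `oid`, `size`), and the binaries are fetched by hash at checkout. Parsing one is trivial:

```python
# Sketch: parse a Git LFS pointer like the ones shown in this diff.
def parse_lfs_pointer(text):
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:1f243cc52a4cfb3763acfd48b178dcd64b011e9131ecbb5e1d510de62a856c07
size 1026179536"""
print(parse_lfs_pointer(pointer))  # ~1.03 GB for the 128d embedding binary
```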
models/embeddings/aligned/din_32d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d8768bf52eddd63c85e2ff451fc6b325c0f5aa7b3549ab0750e68e0964bce4c
+size 256569808

models/embeddings/aligned/din_32d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "din", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/din_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f349ee1be134188f25eb38feeb3f55eeaf5be6b9469af829b6290fe44358225b
+size 4224

models/embeddings/aligned/din_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "din",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 428,
+  "vocab_size": 2096
+}

models/embeddings/aligned/din_64d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9c428bdde94fffa409d6936e9d5592ce5b51d6d62b39afa7d55f75c84007349
+size 513106384

models/embeddings/aligned/din_64d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "din", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/din_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f9553aeeafa889361df217fad742e4660917d490c7ce017376513bc69a42cca
+size 16512

models/embeddings/aligned/din_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "din",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 428,
+  "vocab_size": 2096
+}
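
The `*.projection.npy` pointer sizes are consistent with square float32 matrices plus the 128-byte `.npy` header (32×32×4 + 128 = 4,224 bytes; 64×64 → 16,512; 128×128 → 65,664), which suggests the aligned variants are produced by a linear map into the `en` hub space. A sketch, with the matrix orientation an unverified assumption:

```python
# Sketch: apply the alignment projection to a monolingual vector.
# Assumption: aligned_vec = vec @ W; the orientation (vec @ W vs W @ vec)
# is not documented in this diff.
import numpy as np

W = np.load("models/embeddings/aligned/din_32d.projection.npy")
vec = np.random.default_rng(0).normal(size=W.shape[0])  # stand-in word vector
aligned = vec @ W
print(W.shape, aligned.shape)
```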
models/embeddings/monolingual/din_128d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e6c249eaf6de7f76a2eb64012f9c5bd48a0adcdb03e51ba632711de5ceefe19
-size 1026261862
+oid sha256:1f243cc52a4cfb3763acfd48b178dcd64b011e9131ecbb5e1d510de62a856c07
+size 1026179536

models/embeddings/monolingual/din_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
-  "vocab_size": 2175
+  "vocab_size": 2096
 }

models/embeddings/monolingual/din_32d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c9adf0a330f417082c7c5a9c6f4e477197b81549a90f452d392ab2ff63f046d
-size 256591462
+oid sha256:2d8768bf52eddd63c85e2ff451fc6b325c0f5aa7b3549ab0750e68e0964bce4c
+size 256569808

models/embeddings/monolingual/din_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
-  "vocab_size": 2175
+  "vocab_size": 2096
 }

models/embeddings/monolingual/din_64d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1514e0be95a610a658c8c219590572c945576d2e2bc9973d4c5cd0d8feca2e8b
-size 513148262
+oid sha256:c9c428bdde94fffa409d6936e9d5592ce5b51d6d62b39afa7d55f75c84007349
+size 513106384

models/embeddings/monolingual/din_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
-  "vocab_size": 2175
+  "vocab_size": 2096
 }
models/subword_markov/din_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:127eb7a360963d3f6be767e357c4d4a65c5ddf9d6903aacc12e7f0cd0891f2ac
-size 32007
+oid sha256:60a3a8d00b8a00b00689f570a808b1e7dc5f30bfb816215c3b110026a6347151
+size 30663

models/subword_markov/din_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 311,
-  "total_transitions": 530585
+  "unique_contexts": 318,
+  "total_transitions": 500776
 }

models/subword_markov/din_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:723a434808ffddca7d4d3f6fc077b3f4bfec5725f6d980d87d4937facb08b4f1
-size 165205
+oid sha256:a5821973129b3b3c0bafae913bd8daa4491075f89ed2ac47d07eaea79d2b99e7
+size 157513

models/subword_markov/din_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 3939,
-  "total_transitions": 530073
+  "unique_contexts": 3744,
+  "total_transitions": 500284
 }

models/subword_markov/din_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:05df3bb536b33c9fc2ffe30a4887acaefc1931b0e6660ec692b5617c96f69a39
-size 515269
+oid sha256:573f8ad86998c5675ea57679a88b9cb5318f4bfa602f3298340e494bd05f1aef
+size 492554

models/subword_markov/din_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 22573,
-  "total_transitions": 529561
+  "unique_contexts": 20888,
+  "total_transitions": 499792
 }

models/subword_markov/din_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6f58b1bd8dc7995d9eef7140cd002c4ce4033a150549689f0454a94f78e89d6d
-size 1178741
+oid sha256:49e0d14e2a1ac0060a198ad9ff7b1c25aa116e292ebe4392b97f3178aa4e4b7a
+size 1121452

models/subword_markov/din_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 69374,
-  "total_transitions": 529049
+  "unique_contexts": 65173,
+  "total_transitions": 499300
 }
models/subword_ngram/din_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2ddd1b09dd4bed63e4af468ebddd67973f99facc8a20bf0591cbe3849e4ab36
-size 22481
+oid sha256:3b2d07a40cfac91548511f27f396d477958065feb18a3affb05847e9a8f568bf
+size 20862

models/subword_ngram/din_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "din",
-  "unique_ngrams": 1719,
-  "total_ngrams": 530585
+  "unique_ngrams": 1563,
+  "total_ngrams": 500776
 }

models/subword_ngram/din_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:777f53e0084a7f33049dfdaadb8943997d7fb317fb2eac3d6c96df153d9d2149
-size 118141
+oid sha256:573b15c3bd7d2dc9a5be8ac8c98ed3b2ede2b5f3ff8b773ed49a00aaaef950c0
+size 112004

models/subword_ngram/din_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "din",
-  "unique_ngrams": 10075,
-  "total_ngrams": 530073
+  "unique_ngrams": 9446,
+  "total_ngrams": 500284
 }

models/subword_ngram/din_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:89b8f7725cd6aac5b8a68406b158cf5acffba7494d1805c8b4f8c0a8e81884ac
-size 418893
+oid sha256:d995ca621e6e4dafa43f2f57773bb7435eb880e9a50a1437bee6f51df7bc7963
+size 395998

models/subword_ngram/din_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "din",
-  "unique_ngrams": 33136,
-  "total_ngrams": 529561
+  "unique_ngrams": 31591,
+  "total_ngrams": 499792
 }

models/subword_ngram/din_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f5c32b943bd4a3dbb2314319f76dcb6dd3aacafcfcdd0b67c2e120c44454a6f
+size 648882

models/subword_ngram/din_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+{
+  "n": 5,
+  "variant": "subword",
+  "language": "din",
+  "unique_ngrams": 51151,
+  "total_ngrams": 499300
+}
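
The n-gram tables above are ordinary parquet and can be inspected directly. The column names below are hypothetical, so check `df.columns` against the real schema first:

```python
# Sketch: inspect the new 5-gram table. 'count' is a hypothetical
# column name, not a documented schema.
import pandas as pd

df = pd.read_parquet("models/subword_ngram/din_5gram_subword.parquet")
print(df.columns.tolist())
print(len(df))  # should match unique_ngrams = 51,151 in the metadata above
```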
models/tokenizer/din_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9524e8a2e065c61facff1cd75b7a97a9f37237a6e985f395e56f2245a77fb3cf
-size 515580
+oid sha256:6a69148ed9617b47531585cb835971c50916505ad2c383f49f99afe3c493e171
+size 522717

models/tokenizer/din_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff.

models/tokenizer/din_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3852bead6d9965c9944c990c6fd911f0db3e8a212430fdb57a05c11d1045b6c0
-size 803819
+oid sha256:e7ce8fe69a6bfbdc648d3b25788d90376618ed288685ca42d266eabbb5455665
+size 795594

models/tokenizer/din_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff.

models/tokenizer/din_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8455517f8c2cba07e54774765d1d68c9d28cc2cc5b6cc9d5e1be16a7943e20b0
-size 374243
+oid sha256:fac82c37aa86983a062646b307c798be71eb87be63967cdbb9eb4a4afcaa2209
+size 375314

models/tokenizer/din_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff.
models/vocabulary/din_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2820281559a689aceb40dbbe835641b1a46c89b22c6260d174c40ed56187ad25
-size 91682
+oid sha256:c58191781619d7719376a7b98fcfb94dd50f23e69fd48432a64797b3041cb522
+size 92110

models/vocabulary/din_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
 {
   "language": "din",
-  "vocabulary_size": 5872,
+  "vocabulary_size": 5848,
+  "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.16588202176297576,
+    "type_token_ratio": 0.18787323488196614,
     "coverage": {
-      "top_100": 0.4291053106708471,
-      "top_1000": 0.7156349799551539,
-      "top_5000": 0.8741882565352023,
-      "top_10000": 0.9311874508585795
+      "top_100": 0.41450900075455427,
+      "top_1000": 0.6878732348819662,
+      "top_5000": 0.8568826129136574,
+      "top_10000": 0.9199202328338902
     },
-    "hapax_count": 11217,
-    "hapax_ratio": 0.6563871496284159,
-    "total_documents": 512
+    "hapax_count": 11581,
+    "hapax_ratio": 0.6644672671983476,
+    "total_documents": 492
   }
 }
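
The statistics in the metadata above (coverage, type-token ratio) and the Zipf fit reported in section 4 can all be re-derived from this vocabulary parquet. A sketch with a hypothetical `frequency` column name:

```python
# Sketch: re-derive coverage and the Zipf coefficient from the vocabulary
# table. The 'frequency' column name is hypothetical; check the real schema.
import numpy as np
import pandas as pd

df = pd.read_parquet("models/vocabulary/din_vocabulary.parquet")
freq = np.sort(df["frequency"].to_numpy().astype(float))[::-1]
total = freq.sum()
for n in (100, 1000, 5000):
    print(f"top_{n} coverage: {freq[:n].sum() / total:.4f}")

# Zipf's law: least-squares slope of log(freq) vs log(rank), plus R^2.
ranks = np.arange(1, len(freq) + 1)
slope, intercept = np.polyfit(np.log(ranks), np.log(freq), 1)
pred = slope * np.log(ranks) + intercept
ss_res = ((np.log(freq) - pred) ** 2).sum()
ss_tot = ((np.log(freq) - np.log(freq).mean()) ** 2).sum()
print(f"zipf coefficient ~ {-slope:.4f}, R^2 = {1 - ss_res / ss_tot:.4f}")
```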
models/word_markov/din_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c43f4dfe21d25d407d5b5b9d1441679b7523749c31a528129cda5959c6b7253
-size 554267
+oid sha256:246fc144af31744bc8e9c6bec9030fe28bda481e1ef4e97c1be4b93fa4a48dcf
+size 550979

models/word_markov/din_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "din",
-  "unique_contexts": 17160,
-  "total_transitions": 129566
+  "unique_contexts": 17365,
+  "total_transitions": 92278
 }

models/word_markov/din_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:965c69dacc40dac71dc8d8cca1b37275d8fd36766158f235cb2da6dae61c83b0
-size 1217993
+oid sha256:e0c57fde3bcbadba0e30dbb8351a7870d8ad9241eb1706f048ea8133749668e6
+size 1157480

models/word_markov/din_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "din",
-  "unique_contexts": 69220,
-  "total_transitions": 129054
+  "unique_contexts": 63845,
+  "total_transitions": 91786
 }

models/word_markov/din_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0bb0aef0df5a98a9d5197038fe63754cb4b87fcd13fb51b6a3fe4b2ae713dae
-size 1669385
+oid sha256:de2ffee95e8d073a4b4152526a192cdc27fc3537af859f40c8755b3456597503
+size 1467750

models/word_markov/din_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "word",
   "language": "din",
-  "unique_contexts": 101209,
-  "total_transitions": 128542
+  "unique_contexts": 83004,
+  "total_transitions": 91294
 }