omarkamali committed on
Commit
38e662b
·
verified ·
1 Parent(s): 02fa115

Upload all models and assets for ar (latest)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +343 -142
  3. models/embeddings/aligned/ar_128d.bin +3 -0
  4. models/embeddings/aligned/ar_128d.meta.json +1 -0
  5. models/embeddings/aligned/ar_128d.projection.npy +3 -0
  6. models/embeddings/aligned/ar_128d_metadata.json +8 -0
  7. models/embeddings/aligned/ar_32d.bin +3 -0
  8. models/embeddings/aligned/ar_32d.meta.json +1 -0
  9. models/embeddings/aligned/ar_32d.projection.npy +3 -0
  10. models/embeddings/aligned/ar_32d_metadata.json +8 -0
  11. models/embeddings/aligned/ar_64d.bin +3 -0
  12. models/embeddings/aligned/ar_64d.meta.json +1 -0
  13. models/embeddings/aligned/ar_64d.projection.npy +3 -0
  14. models/embeddings/aligned/ar_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/ar_128d.bin +2 -2
  16. models/embeddings/monolingual/ar_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/ar_32d.bin +2 -2
  18. models/embeddings/monolingual/ar_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/ar_64d.bin +2 -2
  20. models/embeddings/monolingual/ar_64d_metadata.json +5 -3
  21. models/subword_markov/ar_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/ar_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/ar_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/ar_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/ar_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/ar_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/ar_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/ar_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/ar_2gram_subword.parquet +2 -2
  30. models/subword_ngram/ar_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/ar_3gram_subword.parquet +2 -2
  32. models/subword_ngram/ar_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/ar_4gram_subword.parquet +2 -2
  34. models/subword_ngram/ar_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/ar_5gram_subword.parquet +3 -0
  36. models/subword_ngram/ar_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/ar_tokenizer_16k.model +2 -2
  38. models/tokenizer/ar_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/ar_tokenizer_32k.model +2 -2
  40. models/tokenizer/ar_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/ar_tokenizer_64k.model +2 -2
  42. models/tokenizer/ar_tokenizer_64k.vocab +0 -0
  43. models/tokenizer/ar_tokenizer_8k.model +2 -2
  44. models/tokenizer/ar_tokenizer_8k.vocab +0 -0
  45. models/vocabulary/ar_vocabulary.parquet +2 -2
  46. models/vocabulary/ar_vocabulary_metadata.json +10 -9
  47. models/vocabulary/ar_vocabulary_top.parquet +3 -0
  48. models/vocabulary/ar_vocabulary_top_metadata.json +20 -0
  49. models/word_markov/ar_markov_ctx1_word.parquet +2 -2
  50. models/word_markov/ar_markov_ctx1_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
 
 
 
 
 
 
 
 
13
  - monolingual
14
  - family-arabic
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,14 +33,14 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 4.103
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.7155
30
  - name: vocabulary_size
31
  type: vocab
32
- value: 1000000
33
- generated: 2025-12-27
34
  ---
35
 
36
  # Arabic - Wikilangs Models
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
- - N-gram models (2, 3, 4-gram)
48
- - Markov chains (context of 1, 2, 3 and 4)
49
  - Subword N-gram and Markov chains
50
- - Embeddings in various sizes and dimensions
51
  - Language Vocabulary
52
  - Language Statistics
 
53
  ![Performance Dashboard](visualizations/performance_dashboard.png)
54
 
55
  ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
59
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
60
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
61
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
62
- - [6. Summary & Recommendations](#6-summary--recommendations)
 
63
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
64
  - [Visualizations Index](#visualizations-index)
65
 
@@ -68,58 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
68
 
69
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
70
 
 
 
 
 
 
 
71
  ### Results
72
 
73
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
74
  |------------|-------------|---------------|----------|--------------|
75
- | **8k** | 3.156x | 3.13 | 0.0848% | 5,982,398 |
76
- | **16k** | 3.513x | 3.49 | 0.0944% | 5,374,291 |
77
- | **32k** | 3.837x | 3.81 | 0.1031% | 4,920,728 |
78
- | **64k** | 4.103x 🏆 | 4.07 | 0.1103% | 4,602,368 |
79
 
80
  ### Tokenization Examples
81
 
82
  Below are sample sentences tokenized with each vocabulary size:
83
 
84
- **Sample 1:** `تحويل ميلفورد (كونيتيكت)`
85
 
86
  | Vocab | Tokens | Count |
87
  |-------|--------|-------|
88
- | 8k | `▁تحويل ▁ميل فورد ▁( ك وني تي كت )` | 9 |
89
- | 16k | `▁تحويل ▁ميل فورد ▁( ك وني تي كت )` | 9 |
90
- | 32k | `▁تحويل ▁ميل فورد ▁( كوني تيكت )` | 7 |
91
- | 64k | `▁تحويل ▁ميل فورد ( كونيتيكت )` | 6 |
92
 
93
- **Sample 2:** `قد يقصد من «الفرفار» :
94
-
95
- الفرفار (إدا وكماض) : دوار تابع لجماعة إدا وڭماض في إقل...`
96
 
97
  | Vocab | Tokens | Count |
98
  |-------|--------|-------|
99
- | 8k | `▁قد ▁يق صد ▁من ▁« الف رف ار » ▁: ... (+43 more)` | 53 |
100
- | 16k | `▁قد ▁يقصد ▁من ▁« الف رف ار » ▁: ▁الف ... (+37 more)` | 47 |
101
- | 32k | `▁قد ▁يقصد ▁من ▁« الف رف ار » ▁: ▁الف ... (+36 more)` | 46 |
102
- | 64k | `▁قد ▁يقصد ▁من ▁« الف رف ار » ▁: ▁الف ... (+34 more)` | 44 |
103
-
104
- **Sample 3:** `المراجع
105
 
106
- تصنيف:أنهار إفريقية دولية
107
- تصنيف:أنهار بوروندي
108
- تصنيف:أنهار تنزانيا
109
- تصني...`
110
 
111
  | Vocab | Tokens | Count |
112
  |-------|--------|-------|
113
- | 8k | `▁المراجع ▁تصنيف : أن هار ▁إ فريقية ▁دولية ▁تصنيف : ... (+16 more)` | 26 |
114
- | 16k | `▁المراجع ▁تصنيف : أنهار ▁إفريقية ▁دولية ▁تصنيف : أنهار ▁بور ... (+12 more)` | 22 |
115
- | 32k | `▁المراجع ▁تصنيف : أنهار ▁إفريقية ▁دولية ▁تصنيف : أنهار ▁بور ... (+9 more)` | 19 |
116
- | 64k | `▁المراجع ▁تصنيف : أنهار ▁إفريقية ▁دولية ▁تصنيف : أنهار ▁بوروندي ... (+8 more)` | 18 |
117
 
118
 
119
  ### Key Findings
120
 
121
- - **Best Compression:** 64k achieves 4.103x compression
122
- - **Lowest UNK Rate:** 8k with 0.0848% unknown tokens
123
  - **Trade-off:** Larger vocabularies improve compression but increase model size
124
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
125
 
@@ -128,57 +139,111 @@ Below are sample sentences tokenized with each vocabulary size:
128
 
129
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
130
 
 
 
131
  ![N-gram Coverage](visualizations/ngram_coverage.png)
132
 
133
  ### Results
134
 
135
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
136
- |--------|------------|---------|----------------|------------------|-------------------|
137
- | **2-gram** | 224,018 🏆 | 17.77 | 6,245,473 | 10.5% | 22.3% |
138
- | **2-gram** | 514 🏆 | 9.00 | 52,884 | 52.6% | 94.6% |
139
- | **3-gram** | 831,530 | 19.67 | 14,344,223 | 6.6% | 16.3% |
140
- | **3-gram** | 4,885 | 12.25 | 487,957 | 23.0% | 53.9% |
141
- | **4-gram** | 1,784,666 | 20.77 | 25,822,600 | 4.6% | 13.6% |
142
- | **4-gram** | 29,916 | 14.87 | 3,376,435 | 13.3% | 31.6% |
 
 
143
 
144
  ### Top 5 N-grams by Size
145
 
146
- **2-grams:**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  | Rank | N-gram | Count |
149
  |------|--------|-------|
150
- | 1 | `تصنيف :` | 9,397,729 |
151
- | 2 | ا` | 2,647,403 |
152
- | 3 | `: لاعبو` | 1,539,560 |
153
- | 4 | `| |` | 1,324,145 |
154
- | 5 | `كرة قدم` | 758,315 |
155
 
156
- **3-grams:**
157
 
158
  | Rank | N-gram | Count |
159
  |------|--------|-------|
160
- | 1 | `تصنيف : لاعبو` | 1,539,552 |
161
- | 2 | `تصنيف : مواليد` | 617,808 |
162
- | 3 | `: لاعبو كرة` | 498,223 |
163
- | 4 | `| | |` | 459,400 |
164
- | 5 | `تصنيف : أشخاص` | 441,938 |
165
 
166
- **4-grams:**
167
 
168
  | Rank | N-gram | Count |
169
  |------|--------|-------|
170
- | 1 | `تصنيف : لاعبو كرة` | 498,220 |
171
- | 2 | `: لاعبو كرة قدم` | 381,016 |
172
- | 3 | `القرن 20 تصنيف :` | 278,900 |
173
- | 4 | `في القرن 20 تصنيف` | 266,135 |
174
- | 5 | `| | | |` | 255,908 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
 
176
 
177
  ### Key Findings
178
 
179
- - **Best Perplexity:** 2-gram with 514
180
  - **Entropy Trend:** Entropy generally increases with larger n-grams in the results table (longer n-grams are spread over many more unique patterns)
181
- - **Coverage:** Top-1000 patterns cover ~32% of corpus
182
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
183
 
184
  ---
@@ -186,55 +251,86 @@ Below are sample sentences tokenized with each vocabulary size:
186
 
187
  ![Markov Entropy](visualizations/markov_entropy.png)
188
 
 
 
189
  ![Markov Branching](visualizations/markov_branching.png)
190
 
191
  ### Results
192
 
193
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
194
- |---------|-------------|------------|------------------|-----------------|----------------|
195
- | **1** | 0.7411 | 1.671 | 12.81 | 5,367,543 | 25.9% |
196
- | **1** | 1.8418 | 3.585 | 17.44 | 13,038 | 0.0% |
197
- | **2** | 0.4074 | 1.326 | 2.70 | 68,744,585 | 59.3% |
198
- | **2** | 0.7015 | 1.626 | 5.08 | 227,339 | 29.9% |
199
- | **3** | 0.1748 | 1.129 | 1.44 | 185,531,332 | 82.5% |
200
- | **3** | 0.8426 | 1.793 | 5.22 | 1,153,787 | 15.7% |
201
- | **4** | 0.0757 🏆 | 1.054 | 1.16 | 267,713,644 | 92.4% |
202
- | **4** | 0.7645 🏆 | 1.699 | 3.88 | 6,025,353 | 23.6% |
203
 
204
- ### Generated Text Samples
205
 
206
- Below are text samples generated from each Markov chain model:
207
 
208
  **Context Size 1:**
209
 
210
- 1. `. اكت ُ ص م َ ّ ة . أخته ، وافتتاح مشروع مرصد أونديجوف |`
211
- 2. `في المدار في جمهورية ألمانيا تصنيف : خلافات في . أنظر : تشغيل الحواسيب . 1`
212
- 3. واستحوذت أيضا التحريفية للبلغاريين والأجانب أو تمليح اللحوم والأتواب والملابس والبطانيات الى القاه...`
213
 
214
  **Context Size 2:**
215
 
216
- 1. `تصنيف : كتاب ومؤلفو قصص مصورة تصنيف : فائزون بميداليات برونزية في ألعاب الكومنولث في إنجلترا تصنيف`
217
- 2. ا للاغتسال . وقال القرطبي في تفسيره على أنه آمن خلال الرضاعة الطبيعية يسبب زيادة الكوليسترول`
218
- 3. `: لاعبو كرة قدم صرب مغتربون في روسيا تصنيف : أفلام دراما باللغة الإنجليزية تصنيف : سائقو`
219
 
220
  **Context Size 3:**
221
 
222
- 1. `تصنيف : لاعبو بوتكيت ريد سوكس تصنيف : مواليد 1955 تصنيف : مؤيدون لتنظيم ملكية الأسلحة تصنيف :`
223
- 2. `تصنيف : مواليد 1986 تصنيف : لاعبو وسط كرة قدم رجالية تصنيف : مواليد 1390 هـ تصنيف :`
224
- 3. `: لاعبو كرة قدم مغاربة تصنيف : عداؤو مسافات متوسطة نيوزيلنديون تصنيف : مواليد 1981 تصنيف : مواليد`
225
 
226
  **Context Size 4:**
227
 
228
- 1. `تصنيف : لاعبو كرة قدم مغتربون في المجر تصنيف : لاعبو كرة اليد في الألعاب الأولمبية الصيفية 1956 تصني...`
229
- 2. `: لاعبو كرة قدم مغتربون في إنجلترا تصنيف : لاعبو كرة قدم مغتربون في إيطاليا تصنيف : أماكن مأهولة`
230
- 3. `القرن 20 تصنيف : كاتبات أمريكيات في القرن 20 تصنيف : شعراء بالعربية في القرن 21 تصنيف : لاعبو`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
 
232
 
233
  ### Key Findings
234
 
235
- - **Best Predictability:** Context-4 with 92.4% predictability
236
  - **Branching Factor:** Decreases with context size (more deterministic)
237
- - **Memory Trade-off:** Larger contexts require more storage (6,025,353 contexts)
238
  - **Recommendation:** Context-3 or Context-4 for text generation
239
 
240
  ---
@@ -250,64 +346,64 @@ Below are text samples generated from each Markov chain model:
250
 
251
  | Metric | Value |
252
  |--------|-------|
253
- | Vocabulary Size | 1,000,000 |
254
- | Total Tokens | 366,842,150 |
255
- | Mean Frequency | 366.84 |
256
- | Median Frequency | 12 |
257
- | Frequency Std Dev | 20900.79 |
258
 
259
  ### Most Common Words
260
 
261
  | Rank | Word | Frequency |
262
  |------|------|-----------|
263
- | 1 | في | 14,346,570 |
264
- | 2 | تصنيف | 9,437,038 |
265
- | 3 | من | 8,350,052 |
266
- | 4 | على | 3,295,037 |
267
- | 5 | ا | 2,755,855 |
268
- | 6 | إلى | 2,451,934 |
269
- | 7 | عام | 1,684,151 |
270
- | 8 | لاعبو | 1,540,822 |
271
- | 9 | أن | 1,441,897 |
272
- | 10 | مع | 1,171,753 |
273
 
274
  ### Least Common Words (from vocabulary)
275
 
276
  | Rank | Word | Frequency |
277
  |------|------|-----------|
278
- | 1 | твоим | 4 |
279
- | 2 | своему | 4 |
280
- | 3 | вашей | 4 |
281
- | 4 | нашу | 4 |
282
- | 5 | кого | 4 |
283
- | 6 | чьей | 4 |
284
- | 7 | работать | 4 |
285
- | 8 | говорит | 4 |
286
- | 9 | говорят | 4 |
287
- | 10 | идёт | 4 |
288
 
289
  ### Zipf's Law Analysis
290
 
291
  | Metric | Value |
292
  |--------|-------|
293
- | Zipf Coefficient | 0.9655 |
294
- | R² (Goodness of Fit) | 0.990109 |
295
  | Adherence Quality | **excellent** |
296
 
297
  ### Coverage Analysis
298
 
299
  | Top N Words | Coverage |
300
  |-------------|----------|
301
- | Top 100 | 24.9% |
302
- | Top 1,000 | 48.1% |
303
- | Top 5,000 | 68.6% |
304
- | Top 10,000 | 76.6% |
305
 
306
  ### Key Findings
307
 
308
- - **Zipf Compliance:** R²=0.9901 indicates excellent adherence to Zipf's law
309
- - **High Frequency Dominance:** Top 100 words cover 24.9% of corpus
310
- - **Long Tail:** 990,000 words needed for remaining 23.4% coverage
311
 
312
  ---
313
  ## 5. Word Embeddings Evaluation
@@ -320,24 +416,126 @@ Below are text samples generated from each Markov chain model:
320
 
321
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
322
 
323
- ### Model Comparison
324
 
325
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
326
- |-------|------------|-----------|----------|----------|----------|
327
- | **mono_32d** | 1,505,991 | 32 | 3.562 | 1.491 | 0.7155 🏆 |
328
- | **mono_64d** | 1,505,991 | 64 | 3.899 | 1.405 | 0.7134 |
329
- | **mono_128d** | 1,505,991 | 128 | 4.337 | 1.358 | 0.6849 |
330
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 
 
 
 
 
 
 
 
 
 
 
331
 
332
  ### Key Findings
333
 
334
- - **Best Isotropy:** mono_32d with 0.7155 (more uniform distribution)
335
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
336
- - **Vocabulary Coverage:** All models cover 1,505,991 words
337
- - **Recommendation:** 100d for balanced semantic capture and efficiency
338
 
339
  ---
340
- ## 6. Summary & Recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
 
342
  ![Performance Dashboard](visualizations/performance_dashboard.png)
343
 
@@ -345,11 +543,12 @@ Below are text samples generated from each Markov chain model:
345
 
346
  | Component | Recommended | Rationale |
347
  |-----------|-------------|-----------|
348
- | Tokenizer | **32k BPE** | Best compression (4.10x) with low UNK rate |
349
- | N-gram | **5-gram** | Lowest perplexity (514) |
350
- | Markov | **Context-4** | Highest predictability (92.4%) |
351
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
352
 
 
353
  ---
354
  ## Appendix: Metrics Glossary & Interpretation Guide
355
 
@@ -539,7 +738,8 @@ If you use these models in your research, please cite:
539
  author = {Kamali, Omar},
540
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
541
  year = {2025},
542
- publisher = {HuggingFace},
 
543
  url = {https://huggingface.co/wikilangs},
544
  institution = {Omneity Labs}
545
  }
@@ -555,7 +755,8 @@ MIT License - Free for academic and commercial use.
555
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
556
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
557
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 
558
  ---
559
  *Generated by Wikilangs Models Pipeline*
560
 
561
- *Report Date: 2025-12-27 16:32:09*
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-arabic
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 4.347
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.7394
40
  - name: vocabulary_size
41
  type: vocab
42
+ value: 1950572
43
+ generated: 2026-01-07
44
  ---
45
 
46
  # Arabic - Wikilangs Models
 
54
  ### Models & Assets
55
 
56
  - Tokenizers (8k, 16k, 32k, 64k)
57
+ - N-gram models (2, 3, 4, 5-gram)
58
+ - Markov chains (context of 1, 2, 3, 4 and 5)
59
  - Subword N-gram and Markov chains
60
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
61
  - Language Vocabulary
62
  - Language Statistics
63
+
64
  ![Performance Dashboard](visualizations/performance_dashboard.png)
65
 
66
  ### Analysis and Evaluation
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6-morphological-analysis-experimental)
74
+ - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
77
 
 
80
 
81
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
82
 
83
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
84
+
85
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
86
+
87
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
88
+
89
  ### Results
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 3.252x | 3.25 | 0.0704% | 5,499,500 |
94
+ | **16k** | 3.655x | 3.65 | 0.0791% | 4,893,689 |
95
+ | **32k** | 4.034x | 4.03 | 0.0873% | 4,433,903 |
96
+ | **64k** | 4.347x 🏆 | 4.35 | 0.0941% | 4,114,555 |
97
 
98
  ### Tokenization Examples
99
 
100
  Below are sample sentences tokenized with each vocabulary size:
101
 
102
+ **Sample 1:** `بيغجة خاتون هي قرية في مقاطعة شبستر، إيران. يقدر عدد سكانها بـ 635 نسمة بحسب إحص...`
103
 
104
  | Vocab | Tokens | Count |
105
  |-------|--------|-------|
106
+ | 8k | `▁بي غ جة ▁خ ات ون ▁هي ▁قرية ▁في ▁مقاطعة ... (+26 more)` | 36 |
107
+ | 16k | `▁بي غ جة ▁خ ات ون ▁هي ▁قرية ▁في ▁مقاطعة ... (+23 more)` | 33 |
108
+ | 32k | `▁بيغ جة ▁خاتون ▁هي ▁قرية ▁في ▁مقاطعة ▁شب ستر ، ... (+20 more)` | 30 |
109
+ | 64k | `▁بيغ جة ▁خاتون ▁هي ▁قرية ▁في ▁مقاطعة ▁شب ستر ، ... (+20 more)` | 30 |
110
 
111
+ **Sample 2:** `IL18BP (Interleukin 18 binding protein) هوَ بروتين يُشَفر بواسطة جين IL18BP في ا...`
 
 
112
 
113
  | Vocab | Tokens | Count |
114
  |-------|--------|-------|
115
+ | 8k | `▁ il 1 8 b p ▁( in ter le ... (+51 more)` | 61 |
116
+ | 16k | `▁il 1 8 b p ▁( in ter le uk ... (+44 more)` | 54 |
117
+ | 32k | `▁il 1 8 b p ▁( inter le uk in ... (+39 more)` | 49 |
118
+ | 64k | `▁il 1 8 b p ▁( inter le uk in ... (+36 more)` | 46 |
 
 
119
 
120
+ **Sample 3:** `هي مقاطعة في ولاية قشقداريا في أوزبكستان، ومركزها مدينة شهرسبز. المصادر مأهولة ف...`
 
 
 
121
 
122
  | Vocab | Tokens | Count |
123
  |-------|--------|-------|
124
+ | 8k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁ق ش قد اريا ▁في ▁أوزب ... (+18 more)` | 28 |
125
+ | 16k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁ق ش قد اريا ▁في ▁أوزبكستان ... (+16 more)` | 26 |
126
+ | 32k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁قش قد اريا ▁في ▁أوزبكستان ، ... (+13 more)` | 23 |
127
+ | 64k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁قش قد اريا ▁في ▁أوزبكستان ، ... (+13 more)` | 23 |
128
 
129
 
130
  ### Key Findings
131
 
132
+ - **Best Compression:** 64k achieves 4.347x compression
133
+ - **Lowest UNK Rate:** 8k with 0.0704% unknown tokens
134
  - **Trade-off:** Larger vocabularies improve compression but increase model size
135
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
136
 
 
139
 
140
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
141
 
142
+ ![N-gram Unique](visualizations/ngram_unique.png)
143
+
144
  ![N-gram Coverage](visualizations/ngram_coverage.png)
145
 
146
  ### Results
147
 
148
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
149
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
150
+ | **2-gram** | Word | 452,226 | 18.79 | 5,760,373 | 5.7% | 16.3% |
151
+ | **2-gram** | Subword | 436 🏆 | 8.77 | 70,700 | 55.9% | 96.1% |
152
+ | **3-gram** | Word | 1,074,568 | 20.04 | 10,101,258 | 4.3% | 14.7% |
153
+ | **3-gram** | Subword | 4,203 | 12.04 | 528,264 | 23.7% | 56.2% |
154
+ | **4-gram** | Word | 1,869,871 | 20.83 | 16,693,684 | 3.8% | 14.3% |
155
+ | **4-gram** | Subword | 26,613 | 14.70 | 2,851,427 | 13.2% | 31.9% |
156
+ | **5-gram** | Word | 1,422,629 | 20.44 | 12,591,346 | 4.2% | 15.4% |
157
+ | **5-gram** | Subword | 126,300 | 16.95 | 9,618,770 | 6.2% | 19.5% |
158
 
159
  ### Top 5 N-grams by Size
160
 
161
+ **2-grams (Word):**
162
+
163
+ | Rank | N-gram | Count |
164
+ |------|--------|-------|
165
+ | 1 | `كرة قدم` | 754,062 |
166
+ | 2 | `في القرن` | 693,987 |
167
+ | 3 | `في عام` | 580,274 |
168
+ | 4 | `الولايات المتحدة` | 468,192 |
169
+ | 5 | `وصلات خارجية` | 357,388 |
170
+
171
+ **3-grams (Word):**
172
+
173
+ | Rank | N-gram | Count |
174
+ |------|--------|-------|
175
+ | 1 | `في القرن 20` | 274,915 |
176
+ | 2 | `مراجع وصلات خارجية` | 255,117 |
177
+ | 3 | `في الولايات المتحدة` | 245,241 |
178
+ | 4 | `في القرن 21` | 238,844 |
179
+ | 5 | `أمريكيون في القرن` | 166,269 |
180
+
181
+ **4-grams (Word):**
182
 
183
  | Rank | N-gram | Count |
184
  |------|--------|-------|
185
+ | 1 | `كرة قدم مغتربون في` | 94,639 |
186
+ | 2 | `تحت سن الثامنة عشر` | 93,897 |
187
+ | 3 | `هو لاعب كرة قدم` | 93,478 |
188
+ | 4 | `أمريكيون في القرن 20` | 87,276 |
189
+ | 5 | `في الألعاب الأولمبية الصيفية` | 66,167 |
190
 
191
+ **5-grams (Word):**
192
 
193
  | Rank | N-gram | Count |
194
  |------|--------|-------|
195
+ | 1 | `تعداد عام بلغ عدد سكان` | 38,914 |
196
+ | 2 | `بحسب تعداد عام وبلغ عدد` | 38,787 |
197
+ | 3 | `تعداد عام وبلغ عدد الأسر` | 38,786 |
198
+ | 4 | `نسمة بحسب تعداد عام وبلغ` | 38,783 |
199
+ | 5 | `في الفئة العمرية ما بين` | 38,744 |
200
 
201
+ **2-grams (Subword):**
202
 
203
  | Rank | N-gram | Count |
204
  |------|--------|-------|
205
+ | 1 | `ا ل` | 88,022,277 |
206
+ | 2 | `_ ا` | 75,496,816 |
207
+ | 3 | _` | 45,404,729 |
208
+ | 4 | _` | 32,155,198 |
209
+ | 5 | _` | 31,357,117 |
210
+
211
+ **3-grams (Subword):**
212
+
213
+ | Rank | N-gram | Count |
214
+ |------|--------|-------|
215
+ | 1 | `_ ا ل` | 71,328,243 |
216
+ | 2 | `_ ف ي` | 15,404,541 |
217
+ | 3 | `ف ي _` | 15,103,296 |
218
+ | 4 | `ي ة _` | 14,752,185 |
219
+ | 5 | `ا ل م` | 13,544,149 |
220
+
221
+ **4-grams (Subword):**
222
+
223
+ | Rank | N-gram | Count |
224
+ |------|--------|-------|
225
+ | 1 | `_ ف ي _` | 14,189,454 |
226
+ | 2 | `ة _ ا ل` | 12,269,528 |
227
+ | 3 | `_ ا ل م` | 11,772,138 |
228
+ | 4 | `_ م ن _` | 8,237,350 |
229
+ | 5 | `ي _ ا ل` | 7,703,248 |
230
+
231
+ **5-grams (Subword):**
232
+
233
+ | Rank | N-gram | Count |
234
+ |------|--------|-------|
235
+ | 1 | `ف ي _ ا ل` | 4,810,645 |
236
+ | 2 | `_ ف ي _ ا` | 4,774,417 |
237
+ | 3 | `ا ت _ ا ل` | 3,857,996 |
238
+ | 4 | `ي ة _ ا ل` | 3,696,976 |
239
+ | 5 | `_ ع ل ى _` | 3,259,756 |
240
 
241
 
242
  ### Key Findings
243
 
244
+ - **Best Perplexity:** 2-gram (subword) with 436
245
  - **Entropy Trend:** Entropy generally increases with larger n-grams in the results table (longer n-grams are spread over many more unique patterns)
246
+ - **Coverage:** Top-1000 patterns cover ~19% of corpus
247
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
248
 
249
  ---
 
251
 
252
  ![Markov Entropy](visualizations/markov_entropy.png)
253
 
254
+ ![Markov Contexts](visualizations/markov_contexts.png)
255
+
256
  ![Markov Branching](visualizations/markov_branching.png)
257
 
258
  ### Results
259
 
260
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
261
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
262
+ | **1** | Word | 0.9908 | 1.987 | 17.58 | 4,471,621 | 0.9% |
263
+ | **1** | Subword | 1.3702 | 2.585 | 13.33 | 18,570 | 0.0% |
264
+ | **2** | Word | 0.3659 | 1.289 | 2.31 | 78,540,786 | 63.4% |
265
+ | **2** | Subword | 0.7295 | 1.658 | 5.21 | 247,596 | 27.1% |
266
+ | **3** | Word | 0.1310 | 1.095 | 1.29 | 181,002,468 | 86.9% |
267
+ | **3** | Subword | 0.6782 | 1.600 | 4.14 | 1,290,623 | 32.2% |
268
+ | **4** | Word | 0.0499 🏆 | 1.035 | 1.09 | 233,679,791 | 95.0% |
269
+ | **4** | Subword | 0.6490 | 1.568 | 3.51 | 5,343,485 | 35.1% |
270
 
271
+ ### Generated Text Samples (Word-based)
272
 
273
+ Below are text samples generated from each word-based Markov chain model:
274
 
275
  **Context Size 1:**
276
 
277
+ 1. `في المدائن وهي منتزه نيقولا الصايغ أميناً عاماً ونسبة 22 مايو حين سجلت في مجال تعليم`
278
+ 2. `من مونتريال اسمه إلى الساحل في الإصدار الرابع قبل الرابطة مع نادي ثون نادي سيون ببطولة`
279
+ 3. `على الصيد فلا يطالب بتنفيذها أو وجود منافسة ألعاب البحر في حين احتفظت بهويتها الجديدة بقيمة`
280
 
281
  **Context Size 2:**
282
 
283
+ 1. `كرة قدم من قصرش مقاطعة إسبان من كتالونيا إسبانيات في القرن 20 استمر التعليم التطوري أو التنموي`
284
+ 2. `في القرن 11 في وقتٍ واحد غابرييلا قرنفل وقرفة ترجمة عوض أحمد بن عبد الله الأميرة منيرة`
285
+ 3. `في عام أن تكلفة الوجبة البسيطة في نسج الظهارية ثخانة الجلد وتصلبه المترافقين مع المشكلات التي تنشأ`
286
 
287
  **Context Size 3:**
288
 
289
+ 1. `في القرن 20 أمريكيون أفارقة في القرن 21 كرة قدم رجالية أحياء دوري الدرجة الأولى الأرجنتيني فيليز سار...`
290
+ 2. `مراجع وصلات خارجية كرة قدم رجالية مغتربون في روسيا على أنها قوة بحرية صغيرة إلى مدينة تشهد حركة`
291
+ 3. `في الولايات المتحدة مراجع وصلات خارجية تلفزيونية مصرية بدأ عرضها في كوميديا سوداء تلفزيونية بريطانية...`
292
 
293
  **Context Size 4:**
294
 
295
+ 1. `كرة قدم مغتربون في السلفادور كرة قدم هندوراسيون كرة قدم هندوراسيون مغتربون كوبا سينتروأمريكانا منتخب...`
296
+ 2. `تحت سن الثامنة عشر تعيش معهم وبلغت نسبة الأزواج القاطنين مع بعضهم البعض 46 3 من أصل المجموع الكلي`
297
+ 3. `هو لاعب كرة قدم بريطاني في مركز لعب مع برادفورد سيتي وريث روفرز ونادي بارتيك ثيسل ونادي رينجرز ونادي`
298
+
299
+
300
+ ### Generated Text Samples (Subword-based)
301
+
302
+ Below are text samples generated from each subword-based Markov chain model:
303
+
304
+ **Context Size 1:**
305
+
306
+ 1. `_فيا،_دارب_ي_أمر`
307
+ 2. `اقصالمعب_ع_حمالم`
308
+ 3. `لبطة_قالمندواب_ا`
309
+
310
+ **Context Size 2:**
311
+
312
+ 1. `الأخرها_تشت_علية_`
313
+ 2. `_الممثل_أصدققه_حا`
314
+ 3. `ة_لدعار_الة)_جوزي`
315
+
316
+ **Context Size 3:**
317
+
318
+ 1. `_الذين_حليلار_رُزِق_`
319
+ 2. `_في_إحصاءات_الله)،`
320
+ 3. `في_الوالصحيحًا_كرة_`
321
+
322
+ **Context Size 4:**
323
+
324
+ 1. `_في_جمهور._جسدت_ديك`
325
+ 2. `ة_البلدي_في_اخترعه_`
326
+ 3. `_المتحدة._يقدمه_في_`
327
 
328
 
329
  ### Key Findings
330
 
331
+ - **Best Predictability:** Context-4 (word) with 95.0% predictability
332
  - **Branching Factor:** Decreases with context size (more deterministic)
333
+ - **Memory Trade-off:** Larger contexts require more storage (5,343,485 contexts)
334
  - **Recommendation:** Context-3 or Context-4 for text generation
335
 
336
  ---
 
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
+ | Vocabulary Size | 1,950,572 |
350
+ | Total Tokens | 322,254,287 |
351
+ | Mean Frequency | 165.21 |
352
+ | Median Frequency | 4 |
353
+ | Frequency Std Dev | 12979.56 |
354
 
355
  ### Most Common Words
356
 
357
  | Rank | Word | Frequency |
358
  |------|------|-----------|
359
+ | 1 | في | 14,286,084 |
360
+ | 2 | من | 8,287,878 |
361
+ | 3 | على | 3,284,746 |
362
+ | 4 | إلى | 2,443,493 |
363
+ | 5 | عام | 1,621,280 |
364
+ | 6 | أن | 1,387,527 |
365
+ | 7 | مع | 1,153,439 |
366
+ | 8 | عن | 1,144,208 |
367
+ | 9 | أو | 1,098,905 |
368
+ | 10 | التي | 1,084,821 |
369
 
370
  ### Least Common Words (from vocabulary)
371
 
372
  | Rank | Word | Frequency |
373
  |------|------|-----------|
374
+ | 1 | dekréty | 2 |
375
+ | 2 | تادينا | 2 |
376
+ | 3 | بوكسوري | 2 |
377
+ | 4 | نموذجاالأدب | 2 |
378
+ | 5 | كنونالأدب | 2 |
379
+ | 6 | وليتاز | 2 |
380
+ | 7 | حكمٌّ | 2 |
381
+ | 8 | أسديراكي | 2 |
382
+ | 9 | إنتركوليجيت | 2 |
383
+ | 10 | للفيزيولوجية | 2 |
384
 
385
  ### Zipf's Law Analysis
386
 
387
  | Metric | Value |
388
  |--------|-------|
389
+ | Zipf Coefficient | 0.9488 |
390
+ | R² (Goodness of Fit) | 0.991144 |
391
  | Adherence Quality | **excellent** |
392
 
393
  ### Coverage Analysis
394
 
395
  | Top N Words | Coverage |
396
  |-------------|----------|
397
+ | Top 100 | 23.1% |
398
+ | Top 1,000 | 45.9% |
399
+ | Top 5,000 | 66.1% |
400
+ | Top 10,000 | 74.2% |
401
 
402
  ### Key Findings
403
 
404
+ - **Zipf Compliance:** R²=0.9911 indicates excellent adherence to Zipf's law
405
+ - **High Frequency Dominance:** Top 100 words cover 23.1% of corpus
406
+ - **Long Tail:** 1,940,572 words needed for remaining 25.8% coverage
407
 
408
  ---
409
  ## 5. Word Embeddings Evaluation
 
416
 
417
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
418
 
 
419
 
420
+ ### 5.1 Cross-Lingual Alignment
421
+
422
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
423
+
424
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
425
+
426
+
427
+ ### 5.2 Model Comparison
428
+
429
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
430
+ |-------|-----------|----------|------------------|---------------|----------------|
431
+ | **mono_32d** | 32 | 0.7379 | 0.3519 | N/A | N/A |
432
+ | **mono_64d** | 64 | 0.7394 🏆 | 0.2816 | N/A | N/A |
433
+ | **mono_128d** | 128 | 0.7002 | 0.2259 | N/A | N/A |
434
+ | **aligned_32d** | 32 | 0.7379 | 0.3528 | 0.2700 | 0.6440 |
435
+ | **aligned_64d** | 64 | 0.7394 | 0.2881 | 0.4140 | 0.8200 |
436
+ | **aligned_128d** | 128 | 0.7002 | 0.2283 | 0.6000 | 0.8940 |
437
 
438
  ### Key Findings
439
 
440
+ - **Best Isotropy:** mono_64d with 0.7394 (more uniform distribution)
441
+ - **Semantic Density:** Average pairwise similarity of 0.2881. Lower values indicate better semantic separation.
442
+ - **Alignment Quality:** Aligned models achieve up to 60.0% R@1 in cross-lingual retrieval.
443
+ - **Recommendation:** 128d aligned for best cross-lingual performance
444
 
445
  ---
446
+ ## 6. Morphological Analysis (Experimental)
447
+
448
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
449
+
450
+ ### 6.1 Productivity & Complexity
451
+
452
+ | Metric | Value | Interpretation | Recommendation |
453
+ |--------|-------|----------------|----------------|
454
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
455
+ | Idiomaticity Gap | **-0.210** | Low formulaic content | - |
456
+
457
+ ### 6.2 Affix Inventory (Productive Units)
458
+
459
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
460
+
461
+ #### Productive Prefixes
462
+ | Prefix | Examples |
463
+ |--------|----------|
464
+ | `-ال` | الألمانينصف, الاعتياديّ, الباكترية |
465
+ | `-وا` | والشجرية, والكاحِل, والميلانين |
466
+ | `-وال` | والشجرية, والكاحِل, والميلانين |
467
+ | `-الم` | المُحاضرة, المورينو, الممنوعة |
468
+
469
+ #### Productive Suffixes
470
+ | Suffix | Examples |
471
+ |--------|----------|
472
+ | `-ين` | ضوئيتين, بقلبين, نحوين |
473
+ | `-ات` | وخصوصيات, نانديات, دويركات |
474
+ | `-ية` | والشجرية, الباكترية, الّدودية |
475
+ | `-ها` | هاماريتيها, اختها, أُصولها |
476
+
477
+ ### 6.3 Bound Stems (Lexical Roots)
478
+
479
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
480
+
481
+ | Stem | Cohesion | Substitutability | Examples |
482
+ |------|----------|------------------|----------|
483
+ | `تخدا` | 2.86x | 173 contexts | متخدا, كتخدا, متخداً |
484
+ | `ستخد` | 2.18x | 623 contexts | مستخد, استخد, تستخد |
485
+ | `ألعا` | 2.68x | 82 contexts | ألعاد, ألعاب, ألعالم |
486
+ | `والع` | 1.74x | 629 contexts | والعز, والعي, والعى |
487
+ | `اطعة` | 3.13x | 28 contexts | قاطعة, ساطعة, ساطعةً |
488
+ | `التع` | 1.63x | 578 contexts | التعة, التعس, التعب |
489
+ | `رنسي` | 1.82x | 179 contexts | درنسي, رنسيس, فرنسي |
490
+ | `استخ` | 1.79x | 192 contexts | استخم, استخد, استخر |
491
+ | `ريطا` | 2.08x | 85 contexts | غريطا, شريطا, وشريطا |
492
+ | `لمنا` | 1.37x | 729 contexts | تلمنا, ظلمنا, ألمنا |
493
+ | `غترب` | 2.44x | 39 contexts | اغترب, مغترب, يغترب |
494
+ | `الحا` | 1.34x | 693 contexts | الحاء, مالحا, الحاص |
495
+
496
+ ### 6.4 Affix Compatibility (Co-occurrence)
497
+
498
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
499
+
500
+ | Prefix | Suffix | Frequency | Examples |
501
+ |--------|--------|-----------|----------|
502
+ | `-ال` | `-ية` | 95 words | الائتمانية, الويبرية |
503
+ | `-ال` | `-ات` | 76 words | الهباءات, الكوميديات |
504
+ | `-ال` | `-ين` | 68 words | البحـرين, المتوارثين |
505
+ | `-وا` | `-ية` | 35 words | والعضدية, والهانرية |
506
+ | `-وا` | `-ات` | 24 words | والمطرزات, والسلوريات |
507
+ | `-وا` | `-ين` | 17 words | والمُغنين, والميكرونيزيين |
508
+ | `-وا` | `-ها` | 4 words | واعترضتها, واستبعدتها |
509
+
510
+ ### 6.5 Recursive Morpheme Segmentation
511
+
512
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
513
+
514
+ | Word | Suggested Split | Confidence | Stem |
515
+ |------|-----------------|------------|------|
516
+ | البروتينين | **`ال-بروت-ين-ين`** | 7.5 | `بروت` |
517
+ | والكاظمية | **`وال-كاظم-ية`** | 6.0 | `كاظم` |
518
+ | والسرورية | **`وال-سرور-ية`** | 6.0 | `سرور` |
519
+ | الغيلوغية | **`ال-غيلوغ-ية`** | 6.0 | `غيلوغ` |
520
+ | والحطابين | **`وال-حطاب-ين`** | 6.0 | `حطاب` |
521
+ | والمقدسيين | **`وال-مقدسي-ين`** | 6.0 | `مقدسي` |
522
+ | والنجومية | **`وال-نجوم-ية`** | 6.0 | `نجوم` |
523
+ | والرباعيات | **`وال-رباعي-ات`** | 6.0 | `رباعي` |
524
+ | الكلابشات | **`ال-كلابش-ات`** | 6.0 | `كلابش` |
525
+ | السبعينات | **`ال-سبعين-ات`** | 6.0 | `سبعين` |
526
+ | لاحتجاجاتها | **`لاحتجاج-ات-ها`** | 6.0 | `لاحتجاج` |
527
+ | والمكسّرات | **`وال-مكسّر-ات`** | 6.0 | `مكسّر` |
528
+ | والسكيريين | **`وال-سكيري-ين`** | 6.0 | `سكيري` |
529
+ | إسقاطاتها | **`إسقاط-ات-ها`** | 6.0 | `إسقاط` |
530
+ | واستثمارها | **`وا-ستثمار-ها`** | 6.0 | `ستثمار` |
531
+
532
+ ### 6.6 Linguistic Interpretation
533
+
534
+ > **Automated Insight:**
535
+ The language Arabic shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
536
+
537
+ ---
538
+ ## 7. Summary & Recommendations
539
 
540
  ![Performance Dashboard](visualizations/performance_dashboard.png)
541
 
 
543
 
544
  | Component | Recommended | Rationale |
545
  |-----------|-------------|-----------|
546
+ | Tokenizer | **64k BPE** | Best compression (4.35x) |
547
+ | N-gram | **2-gram** | Lowest perplexity (436) |
548
+ | Markov | **Context-4** | Highest predictability (95.0%) |
549
  | Embeddings | **64d** | Balanced semantic capture and isotropy |
550
 
551
+
552
  ---
553
  ## Appendix: Metrics Glossary & Interpretation Guide
554
 
 
738
  author = {Kamali, Omar},
739
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
740
  year = {2025},
741
+ doi = {10.5281/zenodo.18073153},
742
+ publisher = {Zenodo},
743
  url = {https://huggingface.co/wikilangs},
744
  institution = {Omneity Labs}
745
  }
 
755
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
756
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
757
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
758
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
759
  ---
760
  *Generated by Wikilangs Models Pipeline*
761
 
762
+ *Report Date: 2026-01-07 13:14:53*
models/embeddings/aligned/ar_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef219238cabd8a9ea12da66dbcd80332fbd38875433dc953df85a37a71b899aa
3
+ size 2486763168
models/embeddings/aligned/ar_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ar", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ar_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d61b78158be557df612ca2ef8343029b39babaae6f038fcd428534279135917
3
+ size 65664
models/embeddings/aligned/ar_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "ar",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 200166,
7
+ "vocab_size": 1398324
8
+ }
models/embeddings/aligned/ar_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70e78275bfd0f11c47d9f144be703b7d9277b2e2698a9f3c400035b92f124640
3
+ size 644850336
models/embeddings/aligned/ar_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ar", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ar_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:baedb1283a79af0fe218dcef59dcfa703ea3550812bcf17a0edcc064c20d6b39
3
+ size 4224
models/embeddings/aligned/ar_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "ar",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 200166,
7
+ "vocab_size": 1398324
8
+ }
models/embeddings/aligned/ar_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02bc5f2f59c273f6f58b680abc0742d269afaafa47c336c6429c6a81304b006c
3
+ size 1258821280
models/embeddings/aligned/ar_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ar", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ar_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9efa73eb7ec91ab64fdf021d74e55418d0906b47c6bc624aab5b6a37dbd88bb9
3
+ size 16512
models/embeddings/aligned/ar_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "ar",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 200166,
7
+ "vocab_size": 1398324
8
+ }
models/embeddings/monolingual/ar_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5a15e8d3f91894ef733d19288cf8f9905570f8b2cb6c96d55ef1706a098238e3
3
- size 2599525002
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef219238cabd8a9ea12da66dbcd80332fbd38875433dc953df85a37a71b899aa
3
+ size 2486763168
models/embeddings/monolingual/ar_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 128,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1505991
13
  }
 
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 128
13
  },
14
+ "vocab_size": 1398324
15
  }
models/embeddings/monolingual/ar_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c69754e00f4a7215bd624c3eada95e3f38a60dfc1f0e7cb54a049f1e7c7dfe5b
3
- size 674923914
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70e78275bfd0f11c47d9f144be703b7d9277b2e2698a9f3c400035b92f124640
3
+ size 644850336
models/embeddings/monolingual/ar_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 32,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1505991
13
  }
 
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 32
13
  },
14
+ "vocab_size": 1398324
15
  }
models/embeddings/monolingual/ar_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b216bcc9d5802fb5f2765d412ed8e269961ce1d314758d5c5986a12d8d067325
3
- size 1316457610
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02bc5f2f59c273f6f58b680abc0742d269afaafa47c336c6429c6a81304b006c
3
+ size 1258821280
models/embeddings/monolingual/ar_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 64,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 1505991
13
  }
 
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 64
13
  },
14
+ "vocab_size": 1398324
15
  }
models/subword_markov/ar_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f138ceea502bc716af3c32b8672e5560c86fb05241a4c6df25882580b5a54208
3
- size 1240058
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:574edae048b28cd710d49d346838e995d3075aa7e9404f1962349f0f2b606f4f
3
+ size 1510871
models/subword_markov/ar_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "ar",
5
- "unique_contexts": 13038,
6
- "total_transitions": 2227602784
7
  }
 
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "ar",
5
+ "unique_contexts": 18570,
6
+ "total_transitions": 1931607401
7
  }
models/subword_markov/ar_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3927d88cac7fc454b48e23919fdaacb6c000902069154a165316b426b04cb9d2
3
- size 9037901
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c019e23cfe8e01e37162bcd6b57200033e2d76cdc0cc88a628d6e7fbd6649b8
3
+ size 10138087
models/subword_markov/ar_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ar",
5
- "unique_contexts": 227339,
6
- "total_transitions": 2226268866
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ar",
5
+ "unique_contexts": 247596,
6
+ "total_transitions": 1930342017
7
  }
models/subword_markov/ar_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19911a0827041ed4348c506978baaae745ceac594b0d61c980d2bc3777aa6a97
3
- size 44692812
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef7a7d5ec14ab20dc8f9e56c4e3c940701ba628aca3d6c03fe556dff843d17d
3
+ size 44109485
models/subword_markov/ar_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ar",
5
- "unique_contexts": 1153787,
6
- "total_transitions": 2224934948
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ar",
5
+ "unique_contexts": 1290623,
6
+ "total_transitions": 1929076633
7
  }
models/subword_markov/ar_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67f755180f4df9e71b72cbe5cce476a0f75aa2d3bb2be4d74074ae69f6f89b19
3
- size 190825986
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7ee34b80789c6d5441f10ed5482fffa62a5db280978fce263f369b0031fc1f2
3
+ size 165497966
models/subword_markov/ar_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ar",
5
- "unique_contexts": 6025353,
6
- "total_transitions": 2223601030
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ar",
5
+ "unique_contexts": 5343485,
6
+ "total_transitions": 1927811249
7
  }
models/subword_ngram/ar_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71ed96bee2deb0bad70f5449c7c15403d96685f635321bae37b67c5fac7ce756
3
- size 750740
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:051d0c469791c68d0aab623ee6f2f4956d7da4e670aec6403120a79e8734828f
3
+ size 1010043
models/subword_ngram/ar_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ar",
5
- "unique_ngrams": 52884,
6
- "total_ngrams": 2227602784
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ar",
5
+ "unique_ngrams": 70700,
6
+ "total_ngrams": 1931607401
7
  }
models/subword_ngram/ar_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3bbdad1db2897a969c03e3f47d560f9d1dea5e8a3448c3f90f1c693656c1a2ff
3
- size 6329144
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11567f60179d7d679ac0fe53473a44a51239bc6838cc731298039e543e3539ec
3
+ size 7513640
models/subword_ngram/ar_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ar",
5
- "unique_ngrams": 487957,
6
- "total_ngrams": 2226268866
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ar",
5
+ "unique_ngrams": 528264,
6
+ "total_ngrams": 1930342017
7
  }
models/subword_ngram/ar_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1d75bfcaa4ba765d4a698064ad06b8d77a93f04f7e84e5b96dc323009209b04
3
- size 43956365
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf2ba8cd29e1c4e91b5db8c9a037dd559bec62bb1a44450fe5ffc882168bea52
3
+ size 40025731
models/subword_ngram/ar_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ar",
5
- "unique_ngrams": 3376435,
6
- "total_ngrams": 2224934948
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ar",
5
+ "unique_ngrams": 2851427,
6
+ "total_ngrams": 1929076633
7
  }
models/subword_ngram/ar_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d827290f0e4a08a5ac6ad25f1e75c878bc6b8a34c0c2c90d128f1fea76e05bd
3
+ size 139944679
models/subword_ngram/ar_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "ar",
5
+ "unique_ngrams": 9618770,
6
+ "total_ngrams": 1927811249
7
+ }
models/tokenizer/ar_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7edaf2dbdc3ada01d30b8717ebf8add3d42cafa3d77bcdd80f720a97d9746d1
3
- size 559100
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:762cad48886cec152e6e3f7ed18568586bc3c9f4e9953b3185639b108ba8aac0
3
+ size 560431
models/tokenizer/ar_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ar_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:442456b5ec25cb0683e0fe13294df9823fad435a42a56c7742c1cae081aa137a
3
- size 896676
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c76c24bcacf6d2321aead7205693601874ae17f908952dc1038e02c3176e6192
3
+ size 898238
models/tokenizer/ar_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ar_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95c3e0d8a7a2ddcdb77379df9ab674034abf4e5f512785ebba09ecfc8b354f66
3
- size 1589031
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f086baad9b6b110330fd920dd11031d9f3ef7f5660ab6ac1c35134bb867b9124
3
+ size 1589613
models/tokenizer/ar_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ar_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f348b2c21a24d3a2ea09cf2eb93040bd9b203a25a891e426a51af0742693d90
3
- size 395404
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77f3c2a9fceaccd03fdcfabdbaec3cf8ededb0885e5142dd55eb2decced61240
3
+ size 396806
models/tokenizer/ar_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/ar_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a53a54e5478b4d7db731e85c634c1c05678e6956d42d16a8b8195a001195afbf
3
- size 15296577
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b31772fd6a7b4c36fa172ae0fd4a4d5dabd71f58e9966f89680cfe83fda2db9c
3
+ size 30017440
models/vocabulary/ar_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "language": "ar",
3
- "vocabulary_size": 1000000,
 
4
  "statistics": {
5
- "type_token_ratio": 0.014399599085234752,
6
  "coverage": {
7
- "top_100": 0.24468957694891222,
8
- "top_1000": 0.473562596402342,
9
- "top_5000": 0.6748063946199147,
10
- "top_10000": 0.7536829840670893
11
  },
12
- "hapax_count": 3360857,
13
- "hapax_ratio": 0.6262799827295155,
14
- "total_documents": 1333918
15
  }
16
  }
 
1
  {
2
  "language": "ar",
3
+ "vocabulary_size": 1950572,
4
+ "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.01377005207307626,
7
  "coverage": {
8
+ "top_100": 0.22964907469610985,
9
+ "top_1000": 0.4549956533720101,
10
+ "top_5000": 0.6553878216380935,
11
+ "top_10000": 0.7363291209271269
12
  },
13
+ "hapax_count": 2521609,
14
+ "hapax_ratio": 0.5638432344308068,
15
+ "total_documents": 1265384
16
  }
17
  }
models/vocabulary/ar_vocabulary_top.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8f91926cd559196b96927f546da84a97a5bcb514ae35ee9b3c8cb9074f1b7c5
3
+ size 15453105
models/vocabulary/ar_vocabulary_top_metadata.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "ar",
3
+ "vocabulary_size": 1000000,
4
+ "variant": "top",
5
+ "statistics": {
6
+ "type_token_ratio": 0.01377005207307626,
7
+ "coverage": {
8
+ "top_100": 0.22964907469610985,
9
+ "top_1000": 0.4549956533720101,
10
+ "top_5000": 0.6553878216380935,
11
+ "top_10000": 0.7363291209271269
12
+ },
13
+ "hapax_count": 2521609,
14
+ "hapax_ratio": 0.5638432344308068,
15
+ "total_documents": 1265384,
16
+ "top_vocab_size": 1000000,
17
+ "coverage_ratio": 0.9850047861926305,
18
+ "tokens_excluded": 950572
19
+ }
20
+ }
models/word_markov/ar_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:595835c3288deb49535347878aaafaab9a18550487e54619d262b4d69c8215b8
3
- size 680761022
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9fddcbf1a23594543219647432ec4ead2edb2b2e1de36c8a8f47889080afef7
3
+ size 765307915
models/word_markov/ar_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ar",
5
- "unique_contexts": 5367543,
6
- "total_transitions": 454840210
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ar",
5
+ "unique_contexts": 4471621,
6
+ "total_transitions": 323510512
7
  }