omarkamali committed (verified)
Commit bf5a127 · 1 Parent(s): d637a00

Upload all models and assets for ckb (latest)

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +345 -151
  3. models/embeddings/aligned/ckb_128d.bin +3 -0
  4. models/embeddings/aligned/ckb_128d.meta.json +1 -0
  5. models/embeddings/aligned/ckb_128d.projection.npy +3 -0
  6. models/embeddings/aligned/ckb_128d_metadata.json +8 -0
  7. models/embeddings/aligned/ckb_32d.bin +3 -0
  8. models/embeddings/aligned/ckb_32d.meta.json +1 -0
  9. models/embeddings/aligned/ckb_32d.projection.npy +3 -0
  10. models/embeddings/aligned/ckb_32d_metadata.json +8 -0
  11. models/embeddings/aligned/ckb_64d.bin +3 -0
  12. models/embeddings/aligned/ckb_64d.meta.json +1 -0
  13. models/embeddings/aligned/ckb_64d.projection.npy +3 -0
  14. models/embeddings/aligned/ckb_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/ckb_128d.bin +2 -2
  16. models/embeddings/monolingual/ckb_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/ckb_32d.bin +2 -2
  18. models/embeddings/monolingual/ckb_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/ckb_64d.bin +2 -2
  20. models/embeddings/monolingual/ckb_64d_metadata.json +5 -3
  21. models/subword_markov/ckb_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/ckb_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/ckb_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/ckb_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/ckb_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/ckb_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/ckb_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/ckb_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/ckb_2gram_subword.parquet +2 -2
  30. models/subword_ngram/ckb_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/ckb_3gram_subword.parquet +2 -2
  32. models/subword_ngram/ckb_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/ckb_4gram_subword.parquet +2 -2
  34. models/subword_ngram/ckb_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/ckb_5gram_subword.parquet +3 -0
  36. models/subword_ngram/ckb_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/ckb_tokenizer_16k.model +2 -2
  38. models/tokenizer/ckb_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/ckb_tokenizer_32k.model +2 -2
  40. models/tokenizer/ckb_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/ckb_tokenizer_64k.model +2 -2
  42. models/tokenizer/ckb_tokenizer_64k.vocab +0 -0
  43. models/tokenizer/ckb_tokenizer_8k.model +2 -2
  44. models/tokenizer/ckb_tokenizer_8k.vocab +0 -0
  45. models/vocabulary/ckb_vocabulary.parquet +2 -2
  46. models/vocabulary/ckb_vocabulary_metadata.json +10 -9
  47. models/word_markov/ckb_markov_ctx1_word.parquet +2 -2
  48. models/word_markov/ckb_markov_ctx1_word_metadata.json +2 -2
  49. models/word_markov/ckb_markov_ctx2_word.parquet +2 -2
  50. models/word_markov/ckb_markov_ctx2_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  language: ckb
3
- language_name: CKB
4
  language_family: iranian_western
5
  tags:
6
  - wikilangs
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
13
  - monolingual
14
  - family-iranian_western
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 4.743
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.7972
30
  - name: vocabulary_size
31
  type: vocab
32
- value: 267929
33
- generated: 2025-12-28
34
  ---
35
 
36
- # CKB - Wikilangs Models
37
  ## Comprehensive Research Report & Full Ablation Study
38
 
39
- This repository contains NLP models trained and evaluated by Wikilangs, specifically on **CKB** Wikipedia data.
40
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
41
 
42
  ## 📋 Repository Contents
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
- - N-gram models (2, 3, 4-gram)
48
- - Markov chains (context of 1, 2, 3 and 4)
49
  - Subword N-gram and Markov chains
50
- - Embeddings in various sizes and dimensions
51
  - Language Vocabulary
52
  - Language Statistics
 
53
  ![Performance Dashboard](visualizations/performance_dashboard.png)
54
 
55
  ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
59
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
60
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
61
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
62
- - [6. Summary & Recommendations](#6-summary--recommendations)
 
63
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
64
  - [Visualizations Index](#visualizations-index)
65
 
@@ -68,71 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
68
 
69
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
70
 
 
 
71
  ### Results
72
 
73
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
74
  |------------|-------------|---------------|----------|--------------|
75
- | **8k** | 3.683x | 3.65 | 0.0660% | 975,730 |
76
- | **16k** | 4.093x | 4.06 | 0.0733% | 878,011 |
77
- | **32k** | 4.448x | 4.41 | 0.0797% | 808,075 |
78
- | **64k** | 4.743x 🏆 | 4.70 | 0.0850% | 757,838 |
79
 
80
  ### Tokenization Examples
81
 
82
  Below are sample sentences tokenized with each vocabulary size:
83
 
84
- **Sample 1:** `شارێکی ویلایەتی جۆرجیایە لە ویلایەتە یەکگرتووەکانی ئەمریکا.
85
-
86
- بەستەرە دەرکییەکان`
87
 
88
  | Vocab | Tokens | Count |
89
  |-------|--------|-------|
90
- | 8k | `▁شارێکی ▁ویلایەتی ▁جۆر جی ایە ▁لە ▁ویلایەتە ▁یەکگرتووەکانی ▁ئەمریکا . ... (+4 more)` | 14 |
91
- | 16k | `▁شارێکی ▁ویلایەتی ▁جۆرجی ایە ▁لە ▁ویلایەتە ▁یەکگرتووەکانی ▁ئەمریکا . ▁بەستەرە ... (+2 more)` | 12 |
92
- | 32k | `▁شارێکی ▁ویلایەتی ▁جۆرجی ایە ▁لە ▁ویلایەتە ▁یەکگرتووەکانی ▁ئەمریکا . ▁بەستەرە ... (+2 more)` | 12 |
93
- | 64k | `▁شارێکی ▁ویلایەتی ▁جۆرجی ایە ▁لە ▁ویلایەتە ▁یەکگرتووەکانی ▁ئەمریکا . ▁بەستەرە ... (+2 more)` | 12 |
94
-
95
- **Sample 2:** `ڕووداوەکان
96
-
97
- لەدایکبوونەکان
98
-
99
- مردنەکان
100
 
101
- سەرچاوەکان
102
-
103
-
104
- پۆل:ساڵەکان`
105
 
106
  | Vocab | Tokens | Count |
107
  |-------|--------|-------|
108
- | 8k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
109
- | 16k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
110
- | 32k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
111
- | 64k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
112
-
113
- **Sample 3:** `ڕووداوەکان
114
-
115
- لەدایکبوونەکان
116
-
117
- مردنەکان
118
-
119
- سەرچاوەکان
120
-
121
 
122
- پۆل:ساڵەکان`
123
 
124
  | Vocab | Tokens | Count |
125
  |-------|--------|-------|
126
- | 8k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
127
- | 16k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
128
- | 32k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
129
- | 64k | `▁ڕووداوەکان ▁لەدایکبوونەکان ▁مردنەکان ▁سەرچاوەکان ▁پۆل : ساڵەکان` | 7 |
130
 
131
 
132
  ### Key Findings
133
 
134
- - **Best Compression:** 64k achieves 4.743x compression
135
- - **Lowest UNK Rate:** 8k with 0.0660% unknown tokens
136
  - **Trade-off:** Larger vocabularies improve compression but increase model size
137
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
138
 
@@ -141,57 +139,111 @@ Below are sample sentences tokenized with each vocabulary size:
141
 
142
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
143
 
 
 
144
  ![N-gram Coverage](visualizations/ngram_coverage.png)
145
 
146
  ### Results
147
 
148
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
149
- |--------|------------|---------|----------------|------------------|-------------------|
150
- | **2-gram** | 35,123 🏆 | 15.10 | 297,076 | 15.8% | 33.2% |
151
- | **2-gram** | 381 🏆 | 8.57 | 15,002 | 62.1% | 96.1% |
152
- | **3-gram** | 68,724 | 16.07 | 466,780 | 12.3% | 28.4% |
153
- | **3-gram** | 3,034 | 11.57 | 125,615 | 27.5% | 67.3% |
154
- | **4-gram** | 110,939 | 16.76 | 754,916 | 10.4% | 25.7% |
155
- | **4-gram** | 15,538 | 13.92 | 641,521 | 13.6% | 40.6% |
 
 
156
 
157
  ### Top 5 N-grams by Size
158
 
159
- **2-grams:**
 
 
160
 
161
  | Rank | N-gram | Count |
162
  |------|--------|-------|
163
- | 1 | `پۆل :` | 315,394 |
164
- | 2 | `. لە` | 57,144 |
165
- | 3 | `لە ساڵی` | 47,235 |
166
- | 4 | `. سەرچاوەکان` | 43,826 |
167
- | 5 | `سەرچاوەکان پۆل` | 31,612 |
168
 
169
- **3-grams:**
170
 
171
  | Rank | N-gram | Count |
172
  |------|--------|-------|
173
- | 1 | `سەرچاوەکان پۆل :` | 31,612 |
174
- | 2 | `: / /` | 24,968 |
175
- | 3 | `پۆل : ئەکتەرەکانی` | 22,109 |
176
- | 4 | `پۆل : لەدایکبووانی` | 22,019 |
177
- | 5 | `ئەمریکییەکان پۆل :` | 18,941 |
178
 
179
- **4-grams:**
180
 
181
  | Rank | N-gram | Count |
182
  |------|--------|-------|
183
- | 1 | `. سەرچاوەکان پۆل :` | 17,589 |
184
- | 2 | `بەستەرە دەرەکییەکان پۆل :` | 16,063 |
185
- | 3 | `سەرچاوەکان بەستەرە دەرەکییەکان پۆل` | 15,316 |
186
- | 4 | `. سەرچاوەکان بەستەرە دەرەکییەکان` | 14,823 |
187
- | 5 | `http : / /` | 12,552 |
188
 
189
 
190
  ### Key Findings
191
 
192
- - **Best Perplexity:** 2-gram with 381
193
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
194
- - **Coverage:** Top-1000 patterns cover ~41% of corpus
195
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
196
 
197
  ---
@@ -199,55 +251,86 @@ Below are sample sentences tokenized with each vocabulary size:
199
 
200
  ![Markov Entropy](visualizations/markov_entropy.png)
201
 
 
 
202
  ![Markov Branching](visualizations/markov_branching.png)
203
 
204
  ### Results
205
 
206
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
207
- |---------|-------------|------------|------------------|-----------------|----------------|
208
- | **1** | 0.7236 | 1.651 | 6.62 | 684,303 | 27.6% |
209
- | **1** | 1.3292 | 2.513 | 9.41 | 4,993 | 0.0% |
210
- | **2** | 0.3062 | 1.236 | 1.94 | 4,525,451 | 69.4% |
211
- | **2** | 0.8890 | 1.852 | 6.06 | 46,978 | 11.1% |
212
- | **3** | 0.1145 | 1.083 | 1.24 | 8,778,035 | 88.5% |
213
- | **3** | 0.8191 | 1.764 | 4.27 | 284,820 | 18.1% |
214
- | **4** | 0.0429 🏆 | 1.030 | 1.08 | 10,844,509 | 95.7% |
215
- | **4** | 0.6245 🏆 | 1.542 | 2.80 | 1,215,545 | 37.5% |
 
 
216
 
217
- ### Generated Text Samples
 
 
218
 
219
- Below are text samples generated from each Markov chain model:
 
220
 
221
  **Context Size 1:**
222
 
223
- 1. `لە ١٩٩٤ وەک کانگ ھون ( ١٧٣ ڕۆژ بوو لە پارتی دیموکراتی لەو ئامێرانە دەبنەهۆی کەمکردنەوەی`
224
- 2. `. کۆمارەکە لە ئاستی عێراق بکات وا بیر و ناودارەکان و پەرە بە گراڤ ئەوە ،`
225
- 3. `، سوپەر فلودی ھیلێۆم وایە پێویستە بۆ ئەو بڕە داھاتێکی باشیش بوو لە تاراوگە زۆرەملێکان لە`
226
 
227
  **Context Size 2:**
228
 
229
- 1. `پۆل : فیلمە بەرھەمھێنراوەکان لەلایەن بیو فلین پۆل : فیلمە ئەنیمەیشنەکانی سۆنی پیکچەرز ، ڤۆگێل لە ساڵ...`
230
- 2. `. لە ڕێکەوتی ١٠ی کانوونی دووەمی ١٩١٩ ، بادە لە ساڵی ١٩٣٧ وەک « ئەگەر ناچار بێت`
231
- 3. `لە ساڵی ١٩٨١ پەنای بردووەتە ویلایەتە یەکگرتووەکانی ئەمریکا . سەنتەرەکە نزیکەی ٢٥٠ کارمەندی تێدابوو ک...`
232
 
233
  **Context Size 3:**
234
 
235
- 1. `سەرچاوەکان پۆل : خۆراک و ژینگە پۆل : پێشەکییەکانی ساڵی ١٩٧٢ پۆل : ئیسرائیل لە ١٩٤٨ پۆل :`
236
- 2. `: / / kurdipedia . org / web / 20090302175610 / http : / / web . archive`
237
- 3. `پۆل : ئەکتەرەکانی تەلەڤیزیۆنی پیاوی ئەمریکی پۆل : ئەو فیلمانەی لە نیویۆرک داندراون پۆل : فیلمە کەنەد...`
238
 
239
  **Context Size 4:**
240
 
241
- 1. `. سەرچاوەکان پۆل : شوێنە ئاوەدانەکانی پارێزگای سلێمانی پۆل : گوندەکانی باشووری کوردستان پۆل : نیشتەج...`
242
- 2. `بەستەرە دەرەکییەکان پۆل : لەدایکبووانی ١٩١٧ پۆل : مردووانی ١٩٩٧ پۆل : ئەکتەرە پیاوە ئەمریکییەکانی سە...`
243
- 3. `سەرچاوەکان بەستەرە دەرەکییەکان پۆل : لەدایکبووانی ١٩٥٧ پۆل : نووسەرە پیاوە ئەمریکییەکانی سەدەی ٢٠ەم ...`
244
 
245
 
246
  ### Key Findings
247
 
248
- - **Best Predictability:** Context-4 with 95.7% predictability
249
  - **Branching Factor:** Decreases with context size (more deterministic)
250
- - **Memory Trade-off:** Larger contexts require more storage (1,215,545 contexts)
251
  - **Recommendation:** Context-3 or Context-4 for text generation
252
 
253
  ---
@@ -263,26 +346,26 @@ Below are text samples generated from each Markov chain model:
263
 
264
  | Metric | Value |
265
  |--------|-------|
266
- | Vocabulary Size | 267,929 |
267
- | Total Tokens | 12,273,016 |
268
- | Mean Frequency | 45.81 |
269
  | Median Frequency | 4 |
270
- | Frequency Std Dev | 1797.49 |
271
 
272
  ### Most Common Words
273
 
274
  | Rank | Word | Frequency |
275
  |------|------|-----------|
276
- | 1 | لە | 634,045 |
277
- | 2 | و | 445,997 |
278
- | 3 | پۆل | 315,758 |
279
- | 4 | بە | 217,465 |
280
- | 5 | کە | 180,742 |
281
- | 6 | بۆ | 132,391 |
282
- | 7 | ساڵی | 84,816 |
283
- | 8 | سەرچاوەکان | 65,009 |
284
- | 9 | بوو | 61,528 |
285
- | 10 | لەگەڵ | 54,424 |
286
 
287
  ### Least Common Words (from vocabulary)
288
 
@@ -290,10 +373,10 @@ Below are text samples generated from each Markov chain model:
290
  |------|------|-----------|
291
  | 1 | microarchitecture | 2 |
292
  | 2 | gigabit | 2 |
293
- | 3 | سوپەرکۆمپیوتەرەکە | 2 |
294
- | 4 | تایوانیا | 2 |
295
- | 5 | بایۆمۆلیکولەر | 2 |
296
- | 6 | gimps | 2 |
297
  | 7 | principatele | 2 |
298
  | 8 | دۆمنیتۆر | 2 |
299
  | 9 | باربو | 2 |
@@ -303,24 +386,24 @@ Below are text samples generated from each Markov chain model:
303
 
304
  | Metric | Value |
305
  |--------|-------|
306
- | Zipf Coefficient | 1.0351 |
307
- | R² (Goodness of Fit) | 0.990570 |
308
  | Adherence Quality | **excellent** |
309
 
310
  ### Coverage Analysis
311
 
312
  | Top N Words | Coverage |
313
  |-------------|----------|
314
- | Top 100 | 31.0% |
315
- | Top 1,000 | 55.5% |
316
- | Top 5,000 | 74.1% |
317
- | Top 10,000 | 80.8% |
318
 
319
  ### Key Findings
320
 
321
- - **Zipf Compliance:** R²=0.9906 indicates excellent adherence to Zipf's law
322
- - **High Frequency Dominance:** Top 100 words cover 31.0% of corpus
323
- - **Long Tail:** 257,929 words needed for remaining 19.2% coverage
324
 
325
  ---
326
  ## 5. Word Embeddings Evaluation
@@ -333,24 +416,132 @@ Below are text samples generated from each Markov chain model:
333
 
334
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
335
 
336
- ### Model Comparison
337
 
338
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
339
- |-------|------------|-----------|----------|----------|----------|
340
- | **mono_32d** | 129,587 | 32 | 3.467 | 1.190 | 0.7972 🏆 |
341
- | **mono_64d** | 129,587 | 64 | 3.957 | 1.135 | 0.7842 |
342
- | **mono_128d** | 129,587 | 128 | 4.525 | 1.098 | 0.7539 |
343
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 
 
344
 
345
  ### Key Findings
346
 
347
- - **Best Isotropy:** mono_32d with 0.7972 (more uniform distribution)
348
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
349
- - **Vocabulary Coverage:** All models cover 129,587 words
350
- - **Recommendation:** 100d for balanced semantic capture and efficiency
351
 
352
  ---
353
- ## 6. Summary & Recommendations
 
 
 
 
354
 
355
  ![Performance Dashboard](visualizations/performance_dashboard.png)
356
 
@@ -358,11 +549,12 @@ Below are text samples generated from each Markov chain model:
358
 
359
  | Component | Recommended | Rationale |
360
  |-----------|-------------|-----------|
361
- | Tokenizer | **32k BPE** | Best compression (4.74x) with low UNK rate |
362
- | N-gram | **5-gram** | Lowest perplexity (381) |
363
- | Markov | **Context-4** | Highest predictability (95.7%) |
364
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
365
 
 
366
  ---
367
  ## Appendix: Metrics Glossary & Interpretation Guide
368
 
@@ -552,7 +744,8 @@ If you use these models in your research, please cite:
552
  author = {Kamali, Omar},
553
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
554
  year = {2025},
555
- publisher = {HuggingFace},
 
556
  url = {https://huggingface.co/wikilangs}
557
  institution = {Omneity Labs}
558
  }
@@ -568,7 +761,8 @@ MIT License - Free for academic and commercial use.
568
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
569
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
570
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 
571
  ---
572
  *Generated by Wikilangs Models Pipeline*
573
 
574
- *Report Date: 2025-12-28 23:07:50*
 
1
  ---
2
  language: ckb
3
+ language_name: Central Kurdish
4
  language_family: iranian_western
5
  tags:
6
  - wikilangs
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-iranian_western
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 4.804
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.8085
40
  - name: vocabulary_size
41
  type: vocab
42
+ value: 0
43
+ generated: 2026-01-04
44
  ---
45
 
46
+ # Central Kurdish - Wikilangs Models
47
  ## Comprehensive Research Report & Full Ablation Study
48
 
49
+ This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Central Kurdish** Wikipedia data.
50
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
51
 
52
  ## 📋 Repository Contents
 
54
  ### Models & Assets
55
 
56
  - Tokenizers (8k, 16k, 32k, 64k)
57
+ - N-gram models (2, 3, 4, 5-gram)
58
+ - Markov chains (context of 1, 2, 3, 4 and 5)
59
  - Subword N-gram and Markov chains
60
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
61
  - Language Vocabulary
62
  - Language Statistics
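The larger binaries in this listing are stored via Git LFS; individual assets can also be fetched on demand. A minimal sketch with `huggingface_hub` (the `repo_id` below is a placeholder, not confirmed by this commit):

```python
# Sketch: download single assets instead of cloning the whole LFS repository.
# The repo_id is a placeholder; replace it with this repository's actual id.
from huggingface_hub import hf_hub_download

tokenizer_path = hf_hub_download(
    repo_id="wikilangs/ckb",  # hypothetical id
    filename="models/tokenizer/ckb_tokenizer_32k.model",
)
vocab_path = hf_hub_download(
    repo_id="wikilangs/ckb",  # hypothetical id
    filename="models/vocabulary/ckb_vocabulary.parquet",
)
print(tokenizer_path, vocab_path)
```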
63
+
64
  ![Performance Dashboard](visualizations/performance_dashboard.png)
65
 
66
  ### Analysis and Evaluation
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
74
+ - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
77
 
 
80
 
81
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
82
 
83
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
84
+
85
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
86
+
87
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
88
+
89
  ### Results
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 3.742x | 3.74 | 0.0597% | 899,331 |
94
+ | **16k** | 4.157x | 4.16 | 0.0663% | 809,551 |
95
+ | **32k** | 4.517x | 4.52 | 0.0721% | 745,101 |
96
+ | **64k** | 4.804x 🏆 | 4.80 | 0.0766% | 700,630 |
97
 
98
  ### Tokenization Examples
99
 
100
  Below are sample sentences tokenized with each vocabulary size:
101
 
102
+ **Sample 1:** `پیشوا () شارێکە لە پارێزگای تاران، ئێران. ئەمانەش ببینە پێڕستی شارەکانی ئێران پێ...`
 
 
103
 
104
  | Vocab | Tokens | Count |
105
  |-------|--------|-------|
106
+ | 8k | `▁پیش وا ▁() ▁شارێکە ▁لە ▁پارێزگای ▁تاران ، ▁ئێران . ... (+12 more)` | 22 |
107
+ | 16k | `▁پیش وا ▁() ▁شارێکە ▁لە ▁پارێزگای ▁تاران ، ▁ئێران . ... (+12 more)` | 22 |
108
+ | 32k | `▁پیش وا ▁() ▁شارێکە ▁لە ▁پارێزگای ▁تاران ، ▁ئێران . ... (+12 more)` | 22 |
109
+ | 64k | `▁پیش وا ▁() ▁شارێکە ▁لە ▁پارێزگای ▁تاران ، ▁ئێران . ... (+12 more)` | 22 |
 
 
110
 
111
+ **Sample 2:** `پەنەما نەتەوەیەکی بەشداربووی ئۆڵۆمپیادی ھاوینەی بوو کە لە ١٧ی ئایار تا ١٢ی ئابی ...`
 
 
 
112
 
113
  | Vocab | Tokens | Count |
114
  |-------|--------|-------|
115
+ | 8k | `▁پەن ەم ا ▁نەتەوەیەکی ▁بەشداربووی ▁ئۆڵۆمپیادی ▁ھاوینەی ▁بوو ▁کە ▁لە ... (+20 more)` | 30 |
116
+ | 16k | `▁پەنەما ▁نەتەوەیەکی ▁بەشداربووی ▁ئۆڵۆمپیادی ▁ھاوینەی ▁بوو ▁کە ▁لە ▁١٧ی ▁ئایار ... (+14 more)` | 24 |
117
+ | 32k | `▁پەنەما ▁نەتەوەیەکی ▁بەشداربووی ▁ئۆڵۆمپیادی ▁ھاوینەی ▁بوو ▁کە ▁لە ▁١٧ی ▁ئایار ... (+14 more)` | 24 |
118
+ | 64k | `▁پەنەما ▁نەتەوەیەکی ▁بەشداربووی ▁ئۆڵۆمپیادی ▁ھاوینەی ▁بوو ▁کە ▁لە ▁١٧ی ▁ئایار ... (+14 more)` | 24 |
 
 
119
 
120
+ **Sample 3:** `بێثێل () شارێکە دەکەوێتە ویلایەتی ئالاسکا، ئەمریکا. ژمارەی دانیشتووانی بەپێی سەر...`
121
 
122
  | Vocab | Tokens | Count |
123
  |-------|--------|-------|
124
+ | 8k | `▁بێ ث ێل ▁() ▁شارێکە ▁دەکەوێتە ▁ویلایەتی ▁ئالاسکا ، ▁ئەمریکا ... (+18 more)` | 28 |
125
+ | 16k | `▁بێ ث ێل ▁() ▁شارێکە ▁دەکەوێتە ▁ویلایەتی ▁ئالاسکا ، ▁ئەمریکا ... (+18 more)` | 28 |
126
+ | 32k | `▁بێ ث ێل ▁() ▁شارێکە ▁دەکەوێتە ▁ویلایەتی ▁ئالاسکا ، ▁ئەمریکا ... (+18 more)` | 28 |
127
+ | 64k | `▁بێ ث ێل ▁() ▁شارێکە ▁دەکەوێتە ▁ویلایەتی ▁ئالاسکا ، ▁ئەمریکا ... (+18 more)` | 28 |
128
 
129
 
130
  ### Key Findings
131
 
132
+ - **Best Compression:** 64k achieves 4.804x compression
133
+ - **Lowest UNK Rate:** 8k with 0.0597% unknown tokens
134
  - **Trade-off:** Larger vocabularies improve compression but increase model size
135
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
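As a quick way to reproduce the token counts above, a minimal sketch assuming the `.model` files under `models/tokenizer/` are SentencePiece models (the `▁` word-boundary marker in the examples suggests this) and that compression is measured as characters per token:

```python
# Sketch: tokenize a sample with the 32k model and estimate its compression ratio.
# Assumptions: SentencePiece format for the .model file; compression = chars / tokens.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/ckb_tokenizer_32k.model")

text = "پیشوا () شارێکە لە پارێزگای تاران، ئێران."
pieces = sp.encode(text, out_type=str)      # e.g. ['▁پیش', 'وا', '▁()', '▁شارێکە', ...]

compression = len(text) / len(pieces)       # characters per token (assumed definition)
print(len(pieces), "tokens; compression ≈", round(compression, 2))
```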
136
 
 
139
 
140
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
141
 
142
+ ![N-gram Unique](visualizations/ngram_unique.png)
143
+
144
  ![N-gram Coverage](visualizations/ngram_coverage.png)
145
 
146
  ### Results
147
 
148
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
149
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
150
+ | **2-gram** | Word | 43,391 | 15.41 | 224,985 | 11.6% | 28.8% |
151
+ | **2-gram** | Subword | 307 🏆 | 8.26 | 12,264 | 66.4% | 97.8% |
152
+ | **3-gram** | Word | 66,250 | 16.02 | 298,666 | 10.5% | 25.9% |
153
+ | **3-gram** | Subword | 2,476 | 11.27 | 92,875 | 29.2% | 70.6% |
154
+ | **4-gram** | Word | 100,774 | 16.62 | 472,614 | 10.7% | 24.7% |
155
+ | **4-gram** | Subword | 13,099 | 13.68 | 482,188 | 14.0% | 42.0% |
156
+ | **5-gram** | Word | 72,668 | 16.15 | 353,585 | 11.8% | 27.3% |
157
+ | **5-gram** | Subword | 47,108 | 15.52 | 1,228,808 | 7.9% | 26.8% |
158
 
159
  ### Top 5 N-grams by Size
160
 
161
+ **2-grams (Word):**
162
+
163
+ | Rank | N-gram | Count |
164
+ |------|--------|-------|
165
+ | 1 | `لە ساڵی` | 47,065 |
166
+ | 2 | `کە لە` | 28,992 |
167
+ | 3 | `و لە` | 26,652 |
168
+ | 4 | `بەستەرە دەرەکییەکان` | 19,291 |
169
+ | 5 | `سەرچاوەکان بەستەرە` | 17,555 |
170
+
171
+ **3-grams (Word):**
172
+
173
+ | Rank | N-gram | Count |
174
+ |------|--------|-------|
175
+ | 1 | `سەرچاوەکان بەستەرە دەرەکییەکان` | 17,516 |
176
+ | 2 | `دەستی بە چالاکی` | 7,882 |
177
+ | 3 | `لە دەستی بە` | 7,873 |
178
+ | 4 | `بە چالاکی کردووە` | 7,857 |
179
+ | 5 | `ئەمریکییەکانی سەدەی ٢٠ەم` | 7,760 |
180
+
181
+ **4-grams (Word):**
182
+
183
+ | Rank | N-gram | Count |
184
+ |------|--------|-------|
185
+ | 1 | `دەستی بە چالاکی کردووە` | 7,857 |
186
+ | 2 | `لە دەستی بە چالاکی` | 7,838 |
187
+ | 3 | `کردووە سەرچاوەکان بەستەرە دەرەکییەکان` | 6,699 |
188
+ | 4 | `پیاوە ئەمریکییەکانی سەدەی ٢٠ەم` | 6,045 |
189
+ | 5 | `ئەمریکییە لە دەستی بە` | 5,227 |
190
+
191
+ **5-grams (Word):**
192
+
193
+ | Rank | N-gram | Count |
194
+ |------|--------|-------|
195
+ | 1 | `لە دەستی بە چالاکی کردووە` | 7,827 |
196
+ | 2 | `ئەمریکییە لە دەستی بە چالاکی` | 5,227 |
197
+ | 3 | `ئەکتەرێکی ئەمریکییە لە دەستی بە` | 5,224 |
198
+ | 4 | `چالاکی کردووە سەرچاوەکان بەستەرە دەرەکییەکان` | 4,624 |
199
+ | 5 | `دەستی بە چالاکی کردووە سەرچاوەکان` | 4,624 |
200
+
201
+ **2-grams (Subword):**
202
+
203
+ | Rank | N-gram | Count |
204
+ |------|--------|-------|
205
+ | 1 | `ی _` | 3,411,049 |
206
+ | 2 | `ە _` | 1,937,601 |
207
+ | 3 | `ا ن` | 1,774,322 |
208
+ | 4 | `_ ب` | 1,264,353 |
209
+ | 5 | `ە ک` | 1,085,531 |
210
+
211
+ **3-grams (Subword):**
212
 
213
  | Rank | N-gram | Count |
214
  |------|--------|-------|
215
+ | 1 | `_ ل ە` | 875,397 |
216
+ | 2 | `ی _` | 698,413 |
217
+ | 3 | `ە _` | 639,579 |
218
+ | 4 | `ن ی` | 592,978 |
219
+ | 5 | `_ ب ە` | 565,735 |
220
 
221
+ **4-grams (Subword):**
222
 
223
  | Rank | N-gram | Count |
224
  |------|--------|-------|
225
+ | 1 | `_ ل ە _` | 625,605 |
226
+ | 2 | `ک ا ن` | 467,335 |
227
+ | 3 | `ن ی _` | 454,442 |
228
+ | 4 | `ا ن _` | 226,640 |
229
+ | 5 | `ا ن ی` | 214,980 |
230
 
231
+ **5-grams (Subword):**
232
 
233
  | Rank | N-gram | Count |
234
  |------|--------|-------|
235
+ | 1 | `ک ا ن _` | 217,466 |
236
+ | 2 | `ا ن ی _` | 198,040 |
237
+ | 3 | `ک ا ن ی` | 193,300 |
238
+ | 4 | `ە ک ا ن` | 146,991 |
239
+ | 5 | `ی ە ک ا` | 135,823 |
240
 
241
 
242
  ### Key Findings
243
 
244
+ - **Best Perplexity:** 2-gram (subword) with 307
245
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
246
+ - **Coverage:** Top-1000 patterns cover ~27% of corpus
247
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
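The per-model statistics above can be recomputed directly from the parquet tables; a minimal sketch, assuming a count column (the actual column names are not documented in this commit):

```python
# Sketch: recompute entropy and top-1000 coverage from an n-gram table.
# The column name "count" is an assumption; inspect df.columns on the real file.
import numpy as np
import pandas as pd

df = pd.read_parquet("models/subword_ngram/ckb_2gram_subword.parquet")
counts = df["count"].to_numpy(dtype=float)          # hypothetical column name

p = counts / counts.sum()
entropy = -(p * np.log2(p)).sum()                   # bits per n-gram
top1000 = np.sort(counts)[::-1][:1000].sum() / counts.sum()
print(f"entropy ≈ {entropy:.2f} bits, top-1000 coverage ≈ {top1000:.1%}")
```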
248
 
249
  ---
 
251
 
252
  ![Markov Entropy](visualizations/markov_entropy.png)
253
 
254
+ ![Markov Contexts](visualizations/markov_contexts.png)
255
+
256
  ![Markov Branching](visualizations/markov_branching.png)
257
 
258
  ### Results
259
 
260
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
261
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
262
+ | **1** | Word | 0.8150 | 1.759 | 7.19 | 625,283 | 18.5% |
263
+ | **1** | Subword | 1.1771 | 2.261 | 7.84 | 5,867 | 0.0% |
264
+ | **2** | Word | 0.2642 | 1.201 | 1.74 | 4,486,871 | 73.6% |
265
+ | **2** | Subword | 0.7063 | 1.632 | 4.63 | 46,011 | 29.4% |
266
+ | **3** | Word | 0.0868 | 1.062 | 1.16 | 7,800,583 | 91.3% |
267
+ | **3** | Subword | 0.7560 | 1.689 | 4.12 | 212,847 | 24.4% |
268
+ | **4** | Word | 0.0293 🏆 | 1.021 | 1.05 | 9,049,668 | 97.1% |
269
+ | **4** | Subword | 0.6434 | 1.562 | 2.94 | 877,504 | 35.7% |
270
+
271
+ ### Generated Text Samples (Word-based)
272
+
273
+ Below are text samples generated from each word-based Markov chain model:
274
+
275
+ **Context Size 1:**
276
+
277
+ 1. `لە ئۆڵۆمپیادی زستانەی پێکھاتووە لە ساڵی لەلایەن ھێربێرت بێرگ لە ڕۆژھەڵاتی ویلایەتە یەکگرتووەکان کە پ...`
278
+ 2. `و تا ٤ی حوزەیرانی لە ١٩ی ئەیلوولی لە بەردەم ماڵەکەی دەسووتێ و بەم شێوەیەن و بنچینەکانی`
279
+ 3. `بە ڕەچەڵەک هەنگاری یانۆس پرۆھاسکا ١٠ی ئەیلوولی بەنەخۆشی لە سەر ڕێڕەوەکە لە ڕێشە وشەی بە زمانی`
280
+
281
+ **Context Size 2:**
282
+
283
+ 1. `لە ساڵی مەحموود پاشا ناردی بۆ لای خوا ڕاکێشێت پیاوێک ھەبوو کە لە ئەڵمانیا دانراون فیلمانەی لە`
284
+ 2. `کە لە ئاشەکاندا بۆ ھاڕینەوەی گەنم بە کار دەھێنن ئەمەش سوود لە باروودۆخی کوژرانی خۆپیشاندەرانی کورد ک...`
285
+ 3. `و لە ساڵی دروستکراوە و کارەکتەری ھونەریەکەی بە جەنەڕاڵی شانۆی کوردی سقز یەکێک لە خودایانی ھیندووەکان...`
286
 
287
+ **Context Size 3:**
288
+
289
+ 1. `سەرچاوەکان بەستەرە دەرەکییەکان فیلمە پیاوە ئەمریکییەکان تەلەڤیزیۆنی پیاوی ئەمریکی نێرەکانی کۆلۆرادۆ ...`
290
+ 2. `دەستی بە چالاکی کردووە بەشداریی لە فیلمی ٣٠ ڕۆژ لە شەو و ڕۆژێکدا تەنھا ٢ کاتژمێر خەوتووە زۆرینەی`
291
+ 3. `لە دەستی بە چالاکی کردووە بەشداریی لە زنجیرەی ھاوسدا کردووە سەرچاوەکان بەستەرە دەرەکییەکان پیاوە ئەم...`
292
+
293
+ **Context Size 4:**
294
 
295
+ 1. `دەستی بە چالاکی کردووە سەرچاوەکان بەستەرە دەرەکییەکان پیاوە ئەمریکییەکانی سەدەی ٢٠ەم فیلمە پیاوە ئەم...`
296
+ 2. `لە دەستی بە چالاکی کردووە و تا بەردەوام بووە سەرچاوەکان بەستەرە دەرەکییەکان پیاوە ئەمریکییەکانی سەدە...`
297
+ 3. `کردووە سەرچاوەکان بەستەرە دەرەکییەکان پیاوە ئەمریکییەکانی سەدەی ٢٠ەم مافەکانی کۆمەڵگەی پەلکەزێڕینە ل...`
298
+
299
+
300
+ ### Generated Text Samples (Subword-based)
301
+
302
+ Below are text samples generated from each subword-based Markov chain model:
303
 
304
  **Context Size 1:**
305
 
306
+ 1. `_خصعە_ڕشدەکدیلە_`
307
+ 2. `ەوەری_بانگەتی_بی`
308
+ 3. `ی_تری_خانۆ_باموی`
309
 
310
  **Context Size 2:**
311
 
312
+ 1. `ی_جیادارەندادەی_ب`
313
+ 2. `ە_پێ_کانە_ئامەزەک`
314
+ 3. `انی_پەی_بۆ_گەکانی`
315
 
316
  **Context Size 3:**
317
 
318
+ 1. `_لە_بڕیاری_موونی_ئ`
319
+ 2. `نی_ژمار_ناو_ھەبوو.`
320
+ 3. `لە_ھاوی_و_لەسەنگی_`
321
 
322
  **Context Size 4:**
323
 
324
+ 1. `_لە_بەکار_و_دانوستا`
325
+ 2. `انی_جینگ،_مایکل_٣_ئ`
326
+ 3. `ەکان_بۆ_نیشتووان_ئە`
327
 
328
 
329
  ### Key Findings
330
 
331
+ - **Best Predictability:** Context-4 (word) with 97.1% predictability
332
  - **Branching Factor:** Decreases with context size (more deterministic)
333
+ - **Memory Trade-off:** Larger contexts require more storage (877,504 contexts)
334
  - **Recommendation:** Context-3 or Context-4 for text generation
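The generation samples above come from walking the Markov transition tables; a minimal sketch of that procedure for the word-level context-2 model, assuming context/next-word/count columns (the real schema is not documented here):

```python
# Sketch: sample a continuation from the context-2 word Markov table.
# The column names ("context", "next_word", "count") are assumptions.
import pandas as pd

df = pd.read_parquet("models/word_markov/ckb_markov_ctx2_word.parquet")

def step(context: str):
    rows = df[df["context"] == context]
    if rows.empty:
        return None
    weights = rows["count"] / rows["count"].sum()
    return rows["next_word"].sample(n=1, weights=weights).iloc[0]

words = "لە ساڵی".split()                 # a frequent bigram from the tables above
for _ in range(15):
    nxt = step(" ".join(words[-2:]))
    if nxt is None:
        break
    words.append(nxt)
print(" ".join(words))
```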
335
 
336
  ---
 
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
+ | Vocabulary Size | 254,727 |
350
+ | Total Tokens | 10,896,559 |
351
+ | Mean Frequency | 42.78 |
352
  | Median Frequency | 4 |
353
+ | Frequency Std Dev | 1719.93 |
354
 
355
  ### Most Common Words
356
 
357
  | Rank | Word | Frequency |
358
  |------|------|-----------|
359
+ | 1 | لە | 632,400 |
360
+ | 2 | و | 442,707 |
361
+ | 3 | بە | 216,191 |
362
+ | 4 | کە | 179,841 |
363
+ | 5 | بۆ | 132,098 |
364
+ | 6 | ساڵی | 84,358 |
365
+ | 7 | سەرچاوەکان | 63,400 |
366
+ | 8 | بوو | 61,016 |
367
+ | 9 | لەگەڵ | 54,346 |
368
+ | 10 | ئەم | 49,216 |
369
 
370
  ### Least Common Words (from vocabulary)
371
 
 
373
  |------|------|-----------|
374
  | 1 | microarchitecture | 2 |
375
  | 2 | gigabit | 2 |
376
+ | 3 | ethernet | 2 |
377
+ | 4 | سوپەرکۆمپیوتەرەکە | 2 |
378
+ | 5 | تایوانیا | 2 |
379
+ | 6 | بایۆمۆلیکولەر | 2 |
380
  | 7 | principatele | 2 |
381
  | 8 | دۆمنیتۆر | 2 |
382
  | 9 | باربو | 2 |
 
386
 
387
  | Metric | Value |
388
  |--------|-------|
389
+ | Zipf Coefficient | 1.0274 |
390
+ | R² (Goodness of Fit) | 0.992430 |
391
  | Adherence Quality | **excellent** |
392
 
393
  ### Coverage Analysis
394
 
395
  | Top N Words | Coverage |
396
  |-------------|----------|
397
+ | Top 100 | 31.2% |
398
+ | Top 1,000 | 55.6% |
399
+ | Top 5,000 | 73.7% |
400
+ | Top 10,000 | 80.5% |
401
 
402
  ### Key Findings
403
 
404
+ - **Zipf Compliance:** R²=0.9924 indicates excellent adherence to Zipf's law
405
+ - **High Frequency Dominance:** Top 100 words cover 31.2% of corpus
406
+ - **Long Tail:** 244,727 words needed for remaining 19.5% coverage
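The Zipf coefficient and coverage numbers can be re-derived from the vocabulary table; a minimal sketch, assuming a frequency column (the column name is a guess):

```python
# Sketch: fit Zipf's law (frequency ∝ rank^-s) and recompute top-100 coverage.
# The column name "frequency" is an assumption about the parquet schema.
import numpy as np
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/ckb_vocabulary.parquet")
freq = np.sort(vocab["frequency"].to_numpy(dtype=float))[::-1]
rank = np.arange(1, len(freq) + 1)

slope, _ = np.polyfit(np.log(rank), np.log(freq), 1)
print("Zipf coefficient ≈", round(-slope, 4))        # table above reports ≈ 1.03

print(f"top-100 coverage ≈ {freq[:100].sum() / freq.sum():.1%}")
```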
407
 
408
  ---
409
  ## 5. Word Embeddings Evaluation
 
416
 
417
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
418
 
 
419
 
420
+ ### 5.1 Cross-Lingual Alignment
421
+
422
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
423
+
424
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
425
+
426
+
427
+ ### 5.2 Model Comparison
428
+
429
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
430
+ |-------|-----------|----------|------------------|---------------|----------------|
431
+ | **mono_32d** | 32 | 0.8085 | 0.3591 | N/A | N/A |
432
+ | **mono_64d** | 64 | 0.8061 | 0.2799 | N/A | N/A |
433
+ | **mono_128d** | 128 | 0.7738 | 0.2134 | N/A | N/A |
434
+ | **aligned_32d** | 32 | 0.8085 🏆 | 0.3647 | 0.0280 | 0.1960 |
435
+ | **aligned_64d** | 64 | 0.8061 | 0.2755 | 0.0680 | 0.3020 |
436
+ | **aligned_128d** | 128 | 0.7738 | 0.2095 | 0.0960 | 0.3920 |
437
 
438
  ### Key Findings
439
 
440
+ - **Best Isotropy:** aligned_32d with 0.8085 (more uniform distribution)
441
+ - **Semantic Density:** Average pairwise similarity of 0.2837. Lower values indicate better semantic separation.
442
+ - **Alignment Quality:** Aligned models achieve up to 9.6% R@1 in cross-lingual retrieval.
443
+ - **Recommendation:** 128d aligned for best cross-lingual performance
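To use the embeddings, a minimal sketch assuming the `.bin` files are fastText binaries (the repository tags mention fasttext) and that the accompanying `*.projection.npy` is a linear map into the shared hub space applied by matrix multiplication; both points are assumptions, not documented in this commit:

```python
# Sketch: load the 128d model and project a word vector into the aligned space.
# Assumptions: fastText .bin format; projection applied as `vec @ W`, W of shape (128, 128).
import fasttext
import numpy as np

model = fasttext.load_model("models/embeddings/aligned/ckb_128d.bin")
W = np.load("models/embeddings/aligned/ckb_128d.projection.npy")

vec = model.get_word_vector("کوردستان")
aligned = vec @ W                      # assumed alignment step into the hub (en) space

cos = float(np.dot(vec, aligned) / (np.linalg.norm(vec) * np.linalg.norm(aligned)))
print(aligned.shape, round(cos, 3))
```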
444
 
445
  ---
446
+ ## 6. Morphological Analysis (Experimental)
447
+
448
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
449
+
450
+ ### 6.1 Productivity & Complexity
451
+
452
+ | Metric | Value | Interpretation | Recommendation |
453
+ |--------|-------|----------------|----------------|
454
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
455
+ | Idiomaticity Gap | **0.020** | Low formulaic content | - |
456
+
457
+ ### 6.2 Affix Inventory (Productive Units)
458
+
459
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
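A toy illustration of this substitutability criterion (not the pipeline's actual implementation), assuming the vocabulary parquet has a word column:

```python
# Toy sketch of the criterion above: a suffix is treated as productive if stripping it
# leaves stems that also occur with other endings. Column name "word" is an assumption.
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/ckb_vocabulary.parquet")
words = set(vocab["word"].astype(str))

def is_productive_suffix(suffix: str, min_stems: int = 20) -> bool:
    stems = {w[: -len(suffix)] for w in words
             if w.endswith(suffix) and len(w) > len(suffix) + 2}
    # a stripped stem counts as "valid" if it reappears with a different ending
    reused = [s for s in stems if any(s + other in words for other in ("ی", "ە", "ان"))]
    return len(reused) >= min_stems

print(is_productive_suffix("ەوە"))     # e.g. the -ەوە suffix listed below
```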
460
+
461
+ #### Productive Prefixes
462
+ | Prefix | Examples |
463
+ |--------|----------|
464
+ | `-ئە` | ئەرمەنستانەوە, ئەرزەروم, ئەمبڕێلاوە |
465
+ | `-بە` | بەڕوودا, بەیاوبەس, بەلتیک |
466
+ | `-دە` | دەدایە, دەشتانە, دەیبرد |
467
+
468
+ #### Productive Suffixes
469
+ | Suffix | Examples |
470
+ |--------|----------|
471
+ | `-ی` | ویکیپدیای, نەوەکەی, جاگتای |
472
+ | `-ە` | ئینگلستانەوە, چۆنە, ناوەکیە |
473
+ | `-ن` | ئامانجەکان, کارلێککارەکان, ھەمەدانیان |
474
+ | `-ان` | ئامانجەکان, کارلێککارەکان, ھەمەدانیان |
475
+ | `-نی` | بووەکانی, مەجنونی, کۆمیکسەکانی |
476
+ | `-وە` | ئینگلستانەوە, تریەوە, ئەرمەنستانەوە |
477
+ | `-ەوە` | ئینگلستانەوە, تریەوە, ئەرمەنستانەوە |
478
+ | `-ەی` | نەوەکەی, وەزیفەی, حەوانەوەی |
479
+
480
+ ### 6.3 Bound Stems (Lexical Roots)
481
+
482
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
483
+
484
+ | Stem | Cohesion | Substitutability | Examples |
485
+ |------|----------|------------------|----------|
486
+ | `انیا` | 1.88x | 226 contexts | کانیا, خانیا, شانیا |
487
+ | `ییەک` | 1.50x | 396 contexts | چییەک, دییەک, دییەکی |
488
+ | `ەمری` | 2.19x | 44 contexts | دەمری, عەمری, کەمری |
489
+ | `مریک` | 2.13x | 48 contexts | ئێمریک, ئیمریک, ئەمریک |
490
+ | `اوەک` | 1.50x | 247 contexts | تاوەک, ماوەک, ڕاوەکە |
491
+ | `وەکا` | 1.61x | 150 contexts | وەکار, بوەکان, وەکاری |
492
+ | `ەڵات` | 1.71x | 100 contexts | هەڵات, سەڵات, خەڵات |
493
+ | `ەسەر` | 1.59x | 133 contexts | بەسەر, ئەسەر, کەسەر |
494
+ | `رەکا` | 1.38x | 274 contexts | ترەکان, چرەکان, مۆرەکان |
495
+ | `ەرچا` | 2.05x | 42 contexts | سەرچاو, بەرچاو, بەرچاون |
496
+ | `رچاو` | 1.84x | 60 contexts | قرچاو, رچاوه, سەرچاو |
497
+ | `ردنی` | 1.72x | 80 contexts | كردنی, مردنی, بردنی |
498
+
499
+ ### 6.4 Affix Compatibility (Co-occurrence)
500
+
501
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
502
+
503
+ | Prefix | Suffix | Frequency | Examples |
504
+ |--------|--------|-----------|----------|
505
+ | `-بە` | `-ی` | 83 words | بەرەوپێشبردنی, بەتانی |
506
+ | `-بە` | `-ە` | 50 words | بەدواوەیە, بەدواداچوونەکە |
507
+ | `-ئە` | `-ە` | 49 words | ئەفسانەییە, ئەستێرەیەکەوە |
508
+ | `-دە` | `-ە` | 45 words | دەروونییەکانییەوە, دەرئەنجامەکە |
509
+ | `-ئە` | `-ی` | 44 words | ئەهێنی, ئەوێی |
510
+ | `-بە` | `-ن` | 38 words | بەرپرسەکەیان, بەرنامەکان |
511
+ | `-دە` | `-ن` | 34 words | دەخرێن, دەکران |
512
+ | `-دە` | `-ی` | 32 words | دەپەیوەندی, دەبیری |
513
+ | `-بە` | `-نی` | 31 words | بەرەوپێشبردنی, بەتانی |
514
+ | `-دە` | `-وە` | 26 words | دەروونییەکانییەوە, دەگوازیتەوە |
515
+
516
+ ### 6.5 Recursive Morpheme Segmentation
517
+
518
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
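A toy recursive affix-stripping sketch of this idea (the actual Recursive Hierarchical Substitutability scoring is not reproduced here); the affix lists are taken from section 6.2 and the stem set is supplied by the caller:

```python
# Toy sketch: peel one affix at a time while the remainder is still a plausible stem.
PREFIXES = ("بە", "ئە", "دە")
SUFFIXES = ("ەوە", "انی", "ان", "ەی", "دا", "ی", "ە", "ن")

def segment(word, stems):
    for suf in SUFFIXES:
        base = word[: -len(suf)]
        if word.endswith(suf) and (base in stems or len(base) > 3):
            return segment(base, stems) + [suf]
    for pre in PREFIXES:
        base = word[len(pre):]
        if word.startswith(pre) and (base in stems or len(base) > 3):
            return [pre] + segment(base, stems)
    return [word]

print("-".join(segment("پسپۆڕانەوە", {"پسپۆڕ"})))   # expected: پسپۆڕ-ان-ەوە
```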
519
+
520
+ | Word | Suggested Split | Confidence | Stem |
521
+ |------|-----------------|------------|------|
522
+ | خراپەکارانەی | **`خراپەکار-ان-ەی`** | 6.0 | `خراپەکار` |
523
+ | گیاندارانەی | **`گیاندار-ان-ەی`** | 6.0 | `گیاندار` |
524
+ | کارانەیان | **`کاران-ەی-ان`** | 6.0 | `کاران` |
525
+ | ئۆرانیەوە | **`ئۆرا-نی-ەوە`** | 6.0 | `ئۆرا` |
526
+ | پسپۆڕانەوە | **`پسپۆڕ-ان-ەوە`** | 6.0 | `پسپۆڕ` |
527
+ | مێیەکانیان | **`مێیەک-انی-ان`** | 6.0 | `مێیەک` |
528
+ | ھاوسەرگیرییاندا | **`ھاوسەرگیریی-ان-دا`** | 6.0 | `ھاوسەرگیریی` |
529
+ | پێشەنگانەی | **`پێشەنگ-ان-ەی`** | 6.0 | `پێشەنگ` |
530
+ | ئابوورییەکانەوە | **`ئابوورییەک-ان-ەوە`** | 6.0 | `ئابوورییەک` |
531
+ | وەرزشکارانەی | **`وەرزشکار-ان-ەی`** | 6.0 | `وەرزشکار` |
532
+ | گۆرانییەکاندا | **`گۆرانییەک-ان-دا`** | 6.0 | `گۆرانییەک` |
533
+ | ئەمیرەکان | **`ئە-میرەک-ان`** | 6.0 | `میرەک` |
534
+ | ڕەبیعەیان | **`ڕەبیع-ەی-ان`** | 6.0 | `ڕەبیع` |
535
+ | بەھاندانی | **`بە-ھاند-انی`** | 6.0 | `ھاند` |
536
+ | ناوخۆییانەی | **`ناوخۆیی-ان-ەی`** | 6.0 | `ناوخۆیی` |
537
+
538
+ ### 6.6 Linguistic Interpretation
539
+
540
+ > **Automated Insight:**
541
+ > Central Kurdish shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
542
+
543
+ ---
544
+ ## 7. Summary & Recommendations
545
 
546
  ![Performance Dashboard](visualizations/performance_dashboard.png)
547
 
 
549
 
550
  | Component | Recommended | Rationale |
551
  |-----------|-------------|-----------|
552
+ | Tokenizer | **64k BPE** | Best compression (4.80x) |
553
+ | N-gram | **2-gram** | Lowest perplexity (307) |
554
+ | Markov | **Context-4** | Highest predictability (97.1%) |
555
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
556
 
557
+
558
  ---
559
  ## Appendix: Metrics Glossary & Interpretation Guide
560
 
 
744
  author = {Kamali, Omar},
745
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
746
  year = {2025},
747
+ doi = {10.5281/zenodo.18073153},
748
+ publisher = {Zenodo},
749
  url = {https://huggingface.co/wikilangs}
750
  institution = {Omneity Labs}
751
  }
 
761
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
762
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
763
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
764
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
765
  ---
766
  *Generated by Wikilangs Models Pipeline*
767
 
768
+ *Report Date: 2026-01-04 00:20:16*
models/embeddings/aligned/ckb_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ea3f53036d267e3d7a7ed2386b681314212fd5b35678baa23adcb7456ce7c4a
3
+ size 1151875214
models/embeddings/aligned/ckb_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ckb", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ckb_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab88852a8e845f1196013144e2e515f744c74f546649f1b798d869cc2d2fd8c1
3
+ size 65664
models/embeddings/aligned/ckb_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
1
+ {
2
+ "language": "ckb",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 10974,
7
+ "vocab_size": 122030
8
+ }
models/embeddings/aligned/ckb_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239758fca11118baea1531fc2cb9622012abf393ad3474262f96044d6ad95106
3
+ size 290156174
models/embeddings/aligned/ckb_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ckb", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ckb_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8072c42fdc02826370f84e43723c4a72769b1e1ad0e7ad733893bec393ad7c83
3
+ size 4224
models/embeddings/aligned/ckb_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
1
+ {
2
+ "language": "ckb",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 10974,
7
+ "vocab_size": 122030
8
+ }
models/embeddings/aligned/ckb_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f57a05805b873811da3a14b739c3f8d85335d7d60cb8556107570f9840986a
3
+ size 577395854
models/embeddings/aligned/ckb_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ckb", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ckb_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0538cd2fd2ae9c9c75580e00cf62852e7e7506b9b98cbf6a0fa9b6ea89722f5
3
+ size 16512
models/embeddings/aligned/ckb_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
1
+ {
2
+ "language": "ckb",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 10974,
7
+ "vocab_size": 122030
8
+ }
models/embeddings/monolingual/ckb_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41b65febdda7187f16f63077ef252551994a6e3947afcc78bbee22f7792d2c3b
3
- size 1159811328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ea3f53036d267e3d7a7ed2386b681314212fd5b35678baa23adcb7456ce7c4a
3
+ size 1151875214
models/embeddings/monolingual/ckb_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 128,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 129587
13
  }
 
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 128
13
  },
14
+ "vocab_size": 122030
15
  }
models/embeddings/monolingual/ckb_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa322184e03fc86097b753549864e28479c19a0f7f851a311fc6c9b07ca61784
3
- size 292288512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:239758fca11118baea1531fc2cb9622012abf393ad3474262f96044d6ad95106
3
+ size 290156174
models/embeddings/monolingual/ckb_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 32,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 129587
13
  }
 
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 32
13
  },
14
+ "vocab_size": 122030
15
  }
models/embeddings/monolingual/ckb_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:64aa558ec3f57c14e06b75aa214fae009c21695a65c7e5e450a744a7bdf362fe
3
- size 581462784
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2f57a05805b873811da3a14b739c3f8d85335d7d60cb8556107570f9840986a
3
+ size 577395854
models/embeddings/monolingual/ckb_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 64,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 129587
13
  }
 
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 64
13
  },
14
+ "vocab_size": 122030
15
  }
models/subword_markov/ckb_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:03d0443702a80a2957e258cfa45a0b3470f1b7c07853a411f35b6d93f4afb2c8
3
- size 328092
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b19ff5b62a242017dba251317302d7f75d17d0178003f3a34c3faa5f393d069f
3
+ size 332993
models/subword_markov/ckb_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "ckb",
5
- "unique_contexts": 4993,
6
- "total_transitions": 85527017
7
  }
 
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "ckb",
5
+ "unique_contexts": 5867,
6
+ "total_transitions": 75476639
7
  }
models/subword_markov/ckb_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8f1ea381df82c1b0e26b22165542771e484be712c855c85606ab8a53a4f60aa4
3
- size 2060794
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ae684f1932a74c79180c526cada0dc191b16b69557cd5c05753770a328b9c94
3
+ size 1737005
models/subword_markov/ckb_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ckb",
5
- "unique_contexts": 46978,
6
- "total_transitions": 85446235
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ckb",
5
+ "unique_contexts": 46011,
6
+ "total_transitions": 75398929
7
  }
models/subword_markov/ckb_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6a716a5f7d117bbb45a7edd9f6a801c8bf99e17c4641490db0e5f6918e64f57
3
- size 9594491
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ba99959a5b1d19af47ddc1fa5beb4e0067de0b294309d77f118b86e2f25c189
3
+ size 6887855
models/subword_markov/ckb_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ckb",
5
- "unique_contexts": 284820,
6
- "total_transitions": 85365453
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ckb",
5
+ "unique_contexts": 212847,
6
+ "total_transitions": 75321219
7
  }
models/subword_markov/ckb_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:146b6fa63bf73ebf6bcb8bd0dbfd02d14ab02b74ec8a0f67e6f8278666bb2770
3
- size 30314193
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96dcf1146cbc8390f23f5793dacfba1d39f4b2d948b7f5b5ff87f63e0dfc3d2c
3
+ size 23328129
models/subword_markov/ckb_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ckb",
5
- "unique_contexts": 1215545,
6
- "total_transitions": 85284671
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ckb",
5
+ "unique_contexts": 877504,
6
+ "total_transitions": 75243509
7
  }
models/subword_ngram/ckb_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8a8d5ed23796744064159155b6a8eb09b9e7ccc31fd3789bc489bcc890fd3950
3
- size 202131
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6229cef5a785c4b9360437efcad7221d457193f7a8b9dee5a5f8a71b8f293d8a
3
+ size 169598
models/subword_ngram/ckb_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ckb",
5
- "unique_ngrams": 15002,
6
- "total_ngrams": 85527017
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ckb",
5
+ "unique_ngrams": 12264,
6
+ "total_ngrams": 75476639
7
  }
models/subword_ngram/ckb_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ae1d7e56ffe951fbc16e1b96e4b9213628ff12fce9be06bef038bd418284ba4
3
- size 1566915
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23db3fa10a208bfa197634365e864d7edbb2d6d1d6ec8788502c09f657064c57
3
+ size 1192079
models/subword_ngram/ckb_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ckb",
5
- "unique_ngrams": 125615,
6
- "total_ngrams": 85446235
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ckb",
5
+ "unique_ngrams": 92875,
6
+ "total_ngrams": 75398929
7
  }
models/subword_ngram/ckb_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a64971d7e7e32eed12ccb8129e7d8457f094683217e6d22fd974784a62f0c1b1
3
- size 8256147
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fb8d23b323f7bd06040fa892cded6b96d6463e8d0e18e8c4045a2d5c5ceb1bf
3
+ size 6262272
models/subword_ngram/ckb_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ckb",
5
- "unique_ngrams": 641521,
6
- "total_ngrams": 85365453
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ckb",
5
+ "unique_ngrams": 482188,
6
+ "total_ngrams": 75321219
7
  }
models/subword_ngram/ckb_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac3dfab20a28a086beb4f139f8066e32cf4c2b63a4a0cf977680f3ffcf930e9b
3
+ size 17128094
models/subword_ngram/ckb_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "ckb",
5
+ "unique_ngrams": 1228808,
6
+ "total_ngrams": 75243509
7
+ }
models/tokenizer/ckb_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5818932355f70a312071fdf012ffcc1f50c4e1451abc5e72acc73d3be72b0626
3
- size 577885
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b9f4163c3c0884c616bc0e749e713f70000f5d98effc2fbbf5a797b4f5c13bc
3
+ size 583728
models/tokenizer/ckb_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ckb_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab68f2fd4bd5528d04b467e28ccd3f89089b8d2fc9a5bda6c8dac38fff5e4996
3
- size 934428
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8027d5cd50e4ee4a4c87424e37cb2d156551deed29f3724829a8ab01d4da748e
3
+ size 941797
models/tokenizer/ckb_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ckb_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7225ca8ff935c1a6affb6e0ac59d01a87f70c6a2205c6f3fe8db054411b7ee5
3
- size 1659196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:256993aa8a7453860c221dbdc80f11956fd6a754be1068e6c76392cf4250204f
3
+ size 1671337
models/tokenizer/ckb_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ckb_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e9166b87186603a55e723d61f0f46f77b7f7d38a03ee0e8d993f1e91703dd00
3
- size 404312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a15e7727e37d01daf14a56a82780e0025eb7dccbfd7751085361cb4433f6d0a2
3
+ size 407812
models/tokenizer/ckb_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/ckb_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ddb813ea1d647c688855dc38fd806dfef2594dd35cdd646090b3888aff0fcbe
3
- size 4435068
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ef22e5092696880d85c9e76edc006f8bae1ff63463f29d570ac8695e39f917c
3
+ size 4252562
models/vocabulary/ckb_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "language": "ckb",
3
- "vocabulary_size": 267929,
 
4
  "statistics": {
5
- "type_token_ratio": 0.053893850732878223,
6
  "coverage": {
7
- "top_100": 0.29993262638188223,
8
- "top_1000": 0.5369777451124179,
9
- "top_5000": 0.7163147474143433,
10
- "top_10000": 0.7817037242581987
11
  },
12
- "hapax_count": 415927,
13
- "hapax_ratio": 0.6082084532416181,
14
- "total_documents": 80782
15
  }
16
  }
 
1
  {
2
  "language": "ckb",
3
+ "vocabulary_size": 254727,
4
+ "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.055516401142947924,
7
  "coverage": {
8
+ "top_100": 0.3021414520089231,
9
+ "top_1000": 0.5373608979214731,
10
+ "top_5000": 0.7128217758293761,
11
+ "top_10000": 0.7785004555195074
12
  },
13
+ "hapax_count": 370796,
14
+ "hapax_ratio": 0.5927775637346668,
15
+ "total_documents": 77710
16
  }
17
  }
models/word_markov/ckb_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dcdc9ceae2045b65dda575029e9fa539662c1e44e10fe9fc75a489868b61c334
3
- size 52948370
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc8a366dfc09cccbe60be8a95bd3b957d88be5ac22f354a64a169988a35d6aa9
3
+ size 52902746
models/word_markov/ckb_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ckb",
5
- "unique_contexts": 684303,
6
- "total_transitions": 15213811
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ckb",
5
+ "unique_contexts": 625283,
6
+ "total_transitions": 11189645
7
  }
models/word_markov/ckb_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19f5e168313ff46635e3fd74d85d9b056bc8a862821538f049e04df57d27bc90
3
- size 163638338
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d422a17822fe65be5611fa6d734e8119912fb90e2c14bc3902193ddd9140f757
3
+ size 161865621
models/word_markov/ckb_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ckb",
5
- "unique_contexts": 4525451,
6
- "total_transitions": 15133047
7
  }
 
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ckb",
5
+ "unique_contexts": 4486871,
6
+ "total_transitions": 11111935
7
  }