omarkamali committed on
Commit
8a4c76f
·
verified ·
1 Parent(s): 20d27c8

Upload all models and assets for chr (latest)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +166 -130
  3. models/embeddings/aligned/chr_128d.bin +3 -0
  4. models/embeddings/aligned/chr_128d.meta.json +1 -0
  5. models/embeddings/aligned/chr_128d.projection.npy +3 -0
  6. models/embeddings/aligned/chr_128d_metadata.json +8 -0
  7. models/embeddings/aligned/chr_32d.bin +3 -0
  8. models/embeddings/aligned/chr_32d.meta.json +1 -0
  9. models/embeddings/aligned/chr_32d.projection.npy +3 -0
  10. models/embeddings/aligned/chr_32d_metadata.json +8 -0
  11. models/embeddings/aligned/chr_64d.bin +3 -0
  12. models/embeddings/aligned/chr_64d.meta.json +1 -0
  13. models/embeddings/aligned/chr_64d.projection.npy +3 -0
  14. models/embeddings/aligned/chr_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/chr_128d.bin +2 -2
  16. models/embeddings/monolingual/chr_128d_metadata.json +1 -1
  17. models/embeddings/monolingual/chr_32d.bin +2 -2
  18. models/embeddings/monolingual/chr_32d_metadata.json +1 -1
  19. models/embeddings/monolingual/chr_64d.bin +2 -2
  20. models/embeddings/monolingual/chr_64d_metadata.json +1 -1
  21. models/subword_markov/chr_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/chr_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/chr_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/chr_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/chr_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/chr_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/chr_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/chr_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/chr_2gram_subword.parquet +2 -2
  30. models/subword_ngram/chr_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/chr_3gram_subword.parquet +2 -2
  32. models/subword_ngram/chr_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/chr_4gram_subword.parquet +2 -2
  34. models/subword_ngram/chr_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/chr_5gram_subword.parquet +3 -0
  36. models/subword_ngram/chr_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/chr_tokenizer_16k.model +2 -2
  38. models/tokenizer/chr_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/chr_tokenizer_32k.model +2 -2
  40. models/tokenizer/chr_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/chr_tokenizer_8k.model +2 -2
  42. models/tokenizer/chr_tokenizer_8k.vocab +0 -0
  43. models/vocabulary/chr_vocabulary.parquet +2 -2
  44. models/vocabulary/chr_vocabulary_metadata.json +9 -9
  45. models/word_markov/chr_markov_ctx1_word.parquet +2 -2
  46. models/word_markov/chr_markov_ctx1_word_metadata.json +2 -2
  47. models/word_markov/chr_markov_ctx2_word.parquet +2 -2
  48. models/word_markov/chr_markov_ctx2_word_metadata.json +2 -2
  49. models/word_markov/chr_markov_ctx3_word.parquet +2 -2
  50. models/word_markov/chr_markov_ctx3_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  language: chr
3
- language_name: CHR
4
  language_family: american_iroquoian
5
  tags:
6
  - wikilangs
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
 
 
 
 
 
 
 
 
13
  - monolingual
14
  - family-american_iroquoian
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 3.573
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.2005
30
  - name: vocabulary_size
31
  type: vocab
32
  value: 0
33
  generated: 2026-01-03
34
  ---
35
 
36
- # CHR - Wikilangs Models
37
  ## Comprehensive Research Report & Full Ablation Study
38
 
39
- This repository contains NLP models trained and evaluated by Wikilangs, specifically on **CHR** Wikipedia data.
40
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
41
 
42
  ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
60
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
61
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
62
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
63
- - [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
64
  - [7. Summary & Recommendations](#7-summary--recommendations)
65
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
66
  - [Visualizations Index](#visualizations-index)
@@ -80,42 +90,42 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
80
 
81
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
82
  |------------|-------------|---------------|----------|--------------|
83
- | **8k** | 2.938x | 2.95 | 0.1472% | 83,560 |
84
- | **16k** | 3.361x | 3.37 | 0.1684% | 73,046 |
85
- | **32k** | 3.573x 🏆 | 3.59 | 0.1790% | 68,709 |
86
 
87
  ### Tokenization Examples
88
 
89
  Below are sample sentences tokenized with each vocabulary size:
90
 
91
- **Sample 1:** `ᎠᏥᏂᏘᏂᎠ ᏙᏱᏗᏢᎦᏚᎲ ᏧᎦᎾᏮ ᎠᎹᏰᏟ-Ꮒ. Buenos Aires ᎠᏰᎵᏗᎦᎳᏫᎢᏍᏗ ᎦᏚᎲᎢ. ᏓᏓᏚᎬ ᎪᏪᎵ ᏙᏯᏗᏢ ᏗᏕᎬᏔᏛ ᎠᎹ...`
92
 
93
  | Vocab | Tokens | Count |
94
  |-------|--------|-------|
95
- | 8k | `▁ᎠᏥᏂᏘᏂᎠ ▁ᏙᏱᏗᏢᎦᏚᎲ ▁ᏧᎦᎾᏮ ▁ᎠᎹᏰᏟ - . ▁buenos ▁aires ▁ᎠᏰᎵᏗᎦᎳᏫᎢᏍᏗ ... (+10 more)` | 20 |
96
- | 16k | `▁ᎠᏥᏂᏘᏂᎠ ▁ᏙᏱᏗᏢᎦᏚᎲ ▁ᏧᎦᎾᏮ ▁ᎠᎹᏰᏟ - . ▁buenosaires ▁ᎠᏰᎵᏗᎦᎳᏫᎢᏍᏗ ... (+10 more)` | 20 |
97
- | 32k | `▁ᎠᏥᏂᏘᏂᎠ ▁ᏙᏱᏗᏢᎦᏚᎲ ▁ᏧᎦᎾᏮ ▁ᎠᎹᏰᏟ - . ▁buenosaires ▁ᎠᏰᎵᏗᎦᎳᏫᎢᏍᏗ ... (+10 more)` | 20 |
98
 
99
- **Sample 2:** `ᎠᎷᏆ"Tsalagi Anagalisgi." ᏙᏱᏗᏢᎦᏚᎲ ᎾᏍᎩ ᏁᏛᎳᏂ-Ꮒ. Oranjestad ᎠᏰᎵᏗᎦᎳᏫᎢᏍᏗ ᎦᏚᎲᎢ. ᏓᏓᏚᎬ ᎪᏪ...`
100
 
101
  | Vocab | Tokens | Count |
102
  |-------|--------|-------|
103
- | 8k | `▁ᎠᎷ " tsalagianagalisgi ." ▁ᏙᏱᏗᏢᎦᏚᎲ ▁ᎾᏍᎩ ▁ᏁᏛᎳᏂ - ... (+18 more)` | 28 |
104
- | 16k | `▁ᎠᎷᏆ " tsalagianagalisgi ." ▁ᏙᏱᏗᏢᎦᏚᎲ ▁ᎾᏍᎩ ▁ᏁᏛᎳᏂ - ... (+14 more)` | 24 |
105
- | 32k | `▁ᎠᎷᏆ " tsalagianagalisgi ." ▁ᏙᏱᏗᏢᎦᏚᎲ ▁ᎾᏍᎩ ▁ᏁᏛᎳᏂ - ... (+13 more)` | 23 |
106
 
107
- **Sample 3:** `ᎴᎹᏂ (Lemani) ᎠᏓᎶᏂᎨᎢ ᎤᏓᏔᏅᎯ ᎠᎩᏍᏗ ᎨᏐᎢ. ᎠᎩᏍᏗ be checked`
108
 
109
  | Vocab | Tokens | Count |
110
  |-------|--------|-------|
111
- | 8k | `▁Ꮄ ᎹᏂ ▁( le mani ) ▁ᎠᏓᎶᏂᎨᎢ ▁ᎤᏓᏔᏅᎯ ▁ᎠᎩᏍᏗ ▁ᎨᏐᎢ ... (+4 more)` | 14 |
112
- | 16k | `▁ᎴᎹᏂ( lemani ) ▁ᎠᏓᎶᏂᎨᎢ ▁ᎤᏓᏔᏅᎯ ▁ᎠᎩᏍᏗ ▁ᎨᏐᎢ . ▁ᎠᎩᏍᏗ ... (+2 more)` | 12 |
113
- | 32k | `▁ᎴᎹᏂ ▁( lemani ) ▁ᎠᏓᎶᏂᎨᎢ ▁ᎤᏓᏔᏅᎯ ▁ᎠᎩᏍᏗ ▁ᎨᏐᎢ . ▁ᎠᎩᏍᏗ ... (+2 more)` | 12 |
114
 
115
 
116
  ### Key Findings
117
 
118
- - **Best Compression:** 32k achieves 3.573x compression
119
  - **Lowest UNK Rate:** 8k with 0.1472% unknown tokens
120
  - **Trade-off:** Larger vocabularies improve compression but increase model size
121
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -133,12 +143,14 @@ Below are sample sentences tokenized with each vocabulary size:
133
 
134
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
135
  |--------|---------|------------|---------|----------------|------------------|-------------------|
136
- | **2-gram** | Word | 160 🏆 | 7.32 | 494 | 67.3% | 100.0% |
137
- | **2-gram** | Subword | 938 | 9.87 | 3,277 | 40.0% | 86.1% |
138
- | **3-gram** | Word | 222 | 7.80 | 667 | 63.1% | 100.0% |
139
- | **3-gram** | Subword | 4,484 | 12.13 | 12,903 | 21.8% | 52.1% |
140
- | **4-gram** | Word | 508 | 8.99 | 1,308 | 48.2% | 89.2% |
141
- | **4-gram** | Subword | 9,908 | 13.27 | 28,877 | 18.5% | 39.2% |
 
 
142
 
143
  ### Top 5 N-grams by Size
144
 
@@ -146,68 +158,88 @@ Below are sample sentences tokenized with each vocabulary size:
146
 
147
  | Rank | N-gram | Count |
148
  |------|--------|-------|
149
- | 1 | `be checked` | 850 |
150
- | 2 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ` | 578 |
151
- | 3 | `ꮣꮣꮪꭼ ꭺꮺꮅ` | 472 |
152
- | 4 | `ꭺꮺꮅ ꮩꮿꮧꮲ` | 431 |
153
- | 5 | `word list` | 345 |
154
 
155
  **3-grams (Word):**
156
 
157
  | Rank | N-gram | Count |
158
  |------|--------|-------|
159
- | 1 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ` | 431 |
160
- | 2 | `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ` | 431 |
161
- | 3 | `consortium word list` | 343 |
162
- | 4 | `ꮧꮥꭼꮤꮫ be checked` | 227 |
163
- | 5 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be` | 216 |
164
 
165
  **4-grams (Word):**
166
 
167
  | Rank | N-gram | Count |
168
  |------|--------|-------|
169
- | 1 | `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ` | 431 |
170
- | 2 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked` | 216 |
171
- | 3 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be` | 163 |
172
  | 4 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ` | 96 |
173
  | 5 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ be` | 96 |
174
 
 
 
 
 
 
 
 
 
 
 
175
  **2-grams (Subword):**
176
 
177
  | Rank | N-gram | Count |
178
  |------|--------|-------|
179
- | 1 | `_ ꭰ` | 5,396 |
180
- | 2 | `_ ꭴ` | 3,446 |
181
- | 3 | `ꮧ _` | 2,866 |
182
- | 4 | `. _` | 2,628 |
183
- | 5 | `, _` | 2,120 |
184
 
185
  **3-grams (Subword):**
186
 
187
  | Rank | N-gram | Count |
188
  |------|--------|-------|
189
- | 1 | `ꮝ ꮧ _` | 1,406 |
190
- | 2 | `_ c h` | 987 |
191
- | 3 | `_ ꮄ` | 975 |
192
- | 4 | `c h e` | 965 |
193
- | 5 | `ꭰ _` | 899 |
194
 
195
  **4-grams (Subword):**
196
 
197
  | Rank | N-gram | Count |
198
  |------|--------|-------|
199
- | 1 | `_ c h e` | 918 |
200
- | 2 | `_ ꭰ ꮄ _` | 892 |
201
- | 3 | `e _ c h` | 857 |
202
- | 4 | `_ b e _` | 851 |
203
- | 5 | `c k e d` | 850 |
 
 
 
 
 
 
 
 
 
 
204
 
205
 
206
  ### Key Findings
207
 
208
- - **Best Perplexity:** 2-gram (word) with 160
209
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
210
- - **Coverage:** Top-1000 patterns cover ~39% of corpus
211
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
212
 
213
  ---
@@ -223,14 +255,14 @@ Below are sample sentences tokenized with each vocabulary size:
223
 
224
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
225
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
226
- | **1** | Word | 0.4942 | 1.409 | 2.32 | 13,243 | 50.6% |
227
- | **1** | Subword | 1.6130 | 3.059 | 16.20 | 448 | 0.0% |
228
- | **2** | Word | 0.0935 | 1.067 | 1.16 | 30,660 | 90.6% |
229
- | **2** | Subword | 1.0024 | 2.003 | 4.68 | 7,258 | 0.0% |
230
- | **3** | Word | 0.0291 | 1.020 | 1.05 | 35,274 | 97.1% |
231
- | **3** | Subword | 0.5824 | 1.497 | 2.33 | 33,930 | 41.8% |
232
- | **4** | Word | 0.0141 🏆 | 1.010 | 1.02 | 36,782 | 98.6% |
233
- | **4** | Subword | 0.2770 | 1.212 | 1.46 | 79,071 | 72.3% |
234
 
235
  ### Generated Text Samples (Word-based)
236
 
@@ -238,27 +270,27 @@ Below are text samples generated from each word-based Markov chain model:
238
 
239
  **Context Size 1:**
240
 
241
- 1. `ꭰꮄ ꭿꭰ ꮳꮃꭹ ꮧꭶꮳꭶꮈᏼ ꮧꮜꮓ ꮪꮎꮜꮓꭾꭲ ꭰꮒꮝꭶꮿꮓ ꭰꮗꮱꮝꮧ ꭰꮏꮼꭲ ꭵꮝꮚ ꭰꭺꮩꮝꭶ ꭴꮒꭺꮫ ᏼꮻ ꭴꮕꮤꮒꮣꮝꮩꮧ v ꮝ`
242
- 2. `be checked ꭿꭺꮹꮤ ꮎꮝꭶ ꮎꮝꮗ ꭰꮝꮪꭲꮫ ꭿꭰ ꭰꮣꮄꮒꮝꭼ ꭰꮫꮝꭼ ꭰᏸꮈ ꭴꮣꮄꮴꮂ consortium word list sdagoi ꮣꮣꮪꭼ`
243
- 3. `ꭿꭰ ꮳꮃꭹ ꭴꭼꮻᏻꭿ ꭸꮢ ꮎꮏ ꮌꮞꮋꮗꭹ ꮎꮝꭹ ꮑꮫꮃꮒ ꮪꮎꮩꮲꮹꮧꮢ be checked ꭱꮃꮧꮬ thumb none thumb`
244
 
245
  **Context Size 2:**
246
 
247
- 1. `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꮪꮎꮩꮲꮹꮧꮢ`
248
- 2. `ꮣꮣꮪꭼ ꭺꮺꮅ ꭴꮣꮞꭶꮴꮧ ꮧꮥꭼꮤꮫ ꮣꮆꮒꭶꮝꮫ ᏻꮃꮫ ꭼꮏꭸꮝꮫ ꮷᏼꮲ ꭰꮉᏸꮯ be checked ꭱꮃꮧꮬ ꮖꮗꭲꮴꭹꭲꮒ`
249
- 3. `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꮤꮝꮊꮒꮿ ꮷꮑꭿ ꭰꮄ ꭿꮈꮝꮧ ꭰᏺꮅꭸ ꭰꮉᏸꮅ ꭷꮓꭾꭽ ꭰꮄ ꮶꭲ ꮏꮎ ꭸꮢ yolngu`
250
 
251
  **Context Size 3:**
252
 
253
- 1. `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꮪꮎꮩꮲꮹꮧꮢ be checked`
254
  2. `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꭱꮃꮧꮬ ꮖꮗꭲꮴꭹꭲꮒ`
255
- 3. `consortium word list dikaneisd digoluwatisgi ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꮔꮫꮏꮥꭼ be checked`
256
 
257
  **Context Size 4:**
258
 
259
- 1. `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮩꮧ be checked`
260
- 2. `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꮷᏼꮲ ꭰꮉᏸꮯ ᏻꮃꮫ ꮣꮆꮒꭶꮝꮫ ꭼꮏꭸꮝꮫ`
261
- 3. `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ be checked`
262
 
263
 
264
  ### Generated Text Samples (Subword-based)
@@ -267,34 +299,34 @@ Below are text samples generated from each subword-based Markov chain model:
267
 
268
  **Context Size 1:**
269
 
270
- 1. `_ꭻꮹꮧꮳꮕᏹꭹꮸꮟꮅꭳꮜꭲꮧ_`
271
- 2. `ꮧꮲ_ꮣꮞꭲᏻꭿꮵꮅ_blipe`
272
- 3. `ꮝꮩꮲꮹꮢ_ꭰ_ꭴꮓꭿᏻꭿꭰ_ꮣ`
273
 
274
  **Context Size 2:**
275
 
276
- 1. `_ꭰꮄ_ꭴꮎꮴꮅ_ꮧꭶꮎꮕꮟꮣꮑꮈ`
277
- 2. `_ꭴꮓꮈꮤꮕꭲ,_.._tr>_<`
278
- 3. `ꮧ_list."_(iaꭰꮒꭺꮫ_`
279
 
280
  **Context Size 3:**
281
 
282
- 1. `ꮝꮧ_ble._galv)._ꮣꮣꮪ`
283
- 2. `_checked_(gila)_ꮗꮅ`
284
- 3. `_ꭰꮄ_ꭴꮣꮔꮦᏺꮂ_ꮞꮅꮎ_seq`
285
 
286
  **Context Size 4:**
287
 
288
- 1. `_checked_(ꭱꮃꮧꮭꭲ_ꭰꮣꭿ`
289
- 2. `_ꭰꮄ_ꭰꮣꭿꭿ_polydeuces`
290
- 3. `e_checked_(ꭱꮃꮧꮬ._re`
291
 
292
 
293
  ### Key Findings
294
 
295
  - **Best Predictability:** Context-4 (word) with 98.6% predictability
296
  - **Branching Factor:** Decreases with context size (more deterministic)
297
- - **Memory Trade-off:** Larger contexts require more storage (79,071 contexts)
298
  - **Recommendation:** Context-3 or Context-4 for text generation
299
 
300
  ---
@@ -310,26 +342,26 @@ Below are text samples generated from each subword-based Markov chain model:
310
 
311
  | Metric | Value |
312
  |--------|-------|
313
- | Vocabulary Size | 4,236 |
314
- | Total Tokens | 35,186 |
315
- | Mean Frequency | 8.31 |
316
  | Median Frequency | 3 |
317
- | Frequency Std Dev | 35.59 |
318
 
319
  ### Most Common Words
320
 
321
  | Rank | Word | Frequency |
322
  |------|------|-----------|
323
- | 1 | ꭰꮄ | 903 |
324
- | 2 | be | 852 |
325
- | 3 | checked | 850 |
326
- | 4 | ꭿꭰ | 797 |
327
- | 5 | ꮧꮥꭼꮤꮫ | 611 |
328
- | 6 | ꮩꮿꮧꮲ | 580 |
329
- | 7 | ꭺꮺꮅ | 524 |
330
- | 8 | ꮣꮣꮪꭼ | 482 |
331
- | 9 | ꮳꮃꭹ | 475 |
332
- | 10 | word | 346 |
333
 
334
  ### Least Common Words (from vocabulary)
335
 
@@ -350,24 +382,24 @@ Below are text samples generated from each subword-based Markov chain model:
350
 
351
  | Metric | Value |
352
  |--------|-------|
353
- | Zipf Coefficient | 0.8714 |
354
- | R² (Goodness of Fit) | 0.984546 |
355
  | Adherence Quality | **excellent** |
356
 
357
  ### Coverage Analysis
358
 
359
  | Top N Words | Coverage |
360
  |-------------|----------|
361
- | Top 100 | 39.8% |
362
- | Top 1,000 | 74.2% |
363
  | Top 5,000 | 0.0% |
364
  | Top 10,000 | 0.0% |
365
 
366
  ### Key Findings
367
 
368
- - **Zipf Compliance:** R²=0.9845 indicates excellent adherence to Zipf's law
369
- - **High Frequency Dominance:** Top 100 words cover 39.8% of corpus
370
- - **Long Tail:** -5,764 words needed for remaining 100.0% coverage
371
 
372
  ---
373
  ## 5. Word Embeddings Evaluation
@@ -383,37 +415,40 @@ Below are text samples generated from each subword-based Markov chain model:
383
 
384
  ### 5.1 Cross-Lingual Alignment
385
 
386
- > *Note: Multilingual alignment visualization not available for this language.*
 
 
387
 
388
 
389
  ### 5.2 Model Comparison
390
 
391
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
392
  |-------|-----------|----------|------------------|---------------|----------------|
393
- | **mono_32d** | 32 | 0.2005 🏆 | 0.5045 | N/A | N/A |
394
- | **mono_64d** | 64 | 0.0662 | 0.4736 | N/A | N/A |
395
- | **mono_128d** | 128 | 0.0103 | 0.4890 | N/A | N/A |
 
 
 
396
 
397
  ### Key Findings
398
 
399
- - **Best Isotropy:** mono_32d with 0.2005 (more uniform distribution)
400
- - **Semantic Density:** Average pairwise similarity of 0.4890. Lower values indicate better semantic separation.
401
- - **Alignment Quality:** No aligned models evaluated in this run.
402
  - **Recommendation:** 128d aligned for best cross-lingual performance
403
 
404
  ---
405
  ## 6. Morphological Analysis (Experimental)
406
 
407
- > ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
408
-
409
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
410
 
411
  ### 6.1 Productivity & Complexity
412
 
413
  | Metric | Value | Interpretation | Recommendation |
414
  |--------|-------|----------------|----------------|
415
- | Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
416
- | Idiomaticity Gap | **-1.000** | Low formulaic content | - |
417
 
418
  ### 6.2 Affix Inventory (Productive Units)
419
 
@@ -426,7 +461,7 @@ These are the most productive prefixes and suffixes identified by sampling the v
426
  #### Productive Suffixes
427
  | Suffix | Examples |
428
  |--------|----------|
429
- | `-ꮝꮧ` | ꮵꮣꮯꮆꮝꮧ, ꭴꮺꮕꮝꮧ, ꭰꭶꮤꮂꮝꮧ |
430
 
431
  ### 6.3 Bound Stems (Lexical Roots)
432
 
@@ -448,14 +483,15 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
448
 
449
  | Word | Suggested Split | Confidence | Stem |
450
  |------|-----------------|------------|------|
451
- | ꮒꭶꮅꮝꮧꮝꭸꮝꮧ | **`ꮒꭶꮅꮝꮧꮝꭸ-ꮝꮧ`** | 1.5 | `ꮒꭶꮅꮝꮧꮝꭸ` |
452
  | ꭰᏸꮅꮧꭶꮃꮻꭲꮝꮧ | **`ꭰᏸꮅꮧꭶꮃꮻꭲ-ꮝꮧ`** | 1.5 | `ꭰᏸꮅꮧꭶꮃꮻꭲ` |
453
- | ꭰꮜꮓꮧꮥꮆꮖꮝꮧ | **`ꭰꮜꮓꮧꮥꮆꮖ-ꮝꮧ`** | 1.5 | `ꭰꮜꮓꮧꮥꮆꮖ` |
454
 
455
  ### 6.6 Linguistic Interpretation
456
 
457
  > **Automated Insight:**
458
- The language CHR appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 
 
459
 
460
  ---
461
  ## 7. Summary & Recommendations
@@ -466,8 +502,8 @@ The language CHR appears to be more isolating or has a highly fixed vocabulary.
466
 
467
  | Component | Recommended | Rationale |
468
  |-----------|-------------|-----------|
469
- | Tokenizer | **32k BPE** | Best compression (3.57x) |
470
- | N-gram | **2-gram** | Lowest perplexity (160) |
471
  | Markov | **Context-4** | Highest predictability (98.6%) |
472
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
473
 
@@ -682,4 +718,4 @@ MIT License - Free for academic and commercial use.
682
  ---
683
  *Generated by Wikilangs Models Pipeline*
684
 
685
- *Report Date: 2026-01-03 10:09:15*
 
1
  ---
2
  language: chr
3
+ language_name: Cherokee
4
  language_family: american_iroquoian
5
  tags:
6
  - wikilangs
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-american_iroquoian
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 3.552
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.2412
40
  - name: vocabulary_size
41
  type: vocab
42
  value: 0
43
  generated: 2026-01-03
44
  ---
45
 
46
+ # Cherokee - Wikilangs Models
47
  ## Comprehensive Research Report & Full Ablation Study
48
 
49
+ This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Cherokee** Wikipedia data.
50
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
51
 
52
  ## 📋 Repository Contents
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6-morphological-analysis-experimental)
74
  - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
 
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 2.919x | 2.93 | 0.1472% | 82,177 |
94
+ | **16k** | 3.358x | 3.37 | 0.1694% | 71,429 |
95
+ | **32k** | 3.552x 🏆 | 3.57 | 0.1792% | 67,524 |
96
 
97
  ### Tokenization Examples
98
 
99
  Below are sample sentences tokenized with each vocabulary size:
100
 
101
+ **Sample 1:** `ᏅᏓᎩ"Consortium Word List." (nvdagi) () ᎦᏚᎲᎢ ᎡᏉ ᏄᏲᎪᎢ, ᏄᏲᎩ, ᎠᎹᏰᏟ. ᏙᏯᏗᏢ ᏗᏕᎬᏔᏛ be ch...`
102
 
103
  | Vocab | Tokens | Count |
104
  |-------|--------|-------|
105
+ | 8k | `▁ᏅᏓᎩ " consortium ▁word ▁list ."( nvda gi ) ... (+13 more)` | 23 |
106
+ | 16k | `▁ᏅᏓᎩ " consortium ▁word ▁list ."( nvdagi ) () ... (+12 more)` | 22 |
107
+ | 32k | `▁ᏅᏓᎩ " consortium ▁word ▁list ."( nvdagi ) () ... (+12 more)` | 22 |
108
 
109
+ **Sample 2:** `ᏳᏈᎳ"Consortium Word List." (yuquila) (). ᏓᏓᏚᎬ ᎪᏪᎵ ᏙᏯᏗᏢ ᏗᏕᎬᏔᏛ be checked`
110
 
111
  | Vocab | Tokens | Count |
112
  |-------|--------|-------|
113
+ | 8k | `▁Ᏻ " consortiumword ▁list ." ▁( yu ... (+9 more)` | 19 |
114
+ | 16k | `▁ᏳᏈᎳ " consortiumword ▁list ." ▁( yuquila ) ▁(). ... (+6 more)` | 16 |
115
+ | 32k | `▁ᏳᏈᎳ " consortiumword ▁list ." ▁( yuquila ) ▁(). ... (+6 more)` | 16 |
116
 
117
+ **Sample 3:** `ᎦᏢᏍᏙᏗ"Consortium Word List." (gatlvsdodi). ᏓᏓᏚᎬ ᎪᏪᎵ ᏙᏯᏗᏢ ᏗᏕᎬᏔᏛ ᎠᎦᏎᏍᏔᏅ be checked`
118
 
119
  | Vocab | Tokens | Count |
120
  |-------|--------|-------|
121
+ | 8k | `▁Ꭶ ᏍᏙᏗ " consortium ▁word ▁list ." ▁( gat ... (+10 more)` | 20 |
122
+ | 16k | `▁ᎦᏢᏍᏙᏗ " consortium word ▁list ." ▁( gatlvs dodi ). ... (+7 more)` | 17 |
123
+ | 32k | `▁ᎦᏢᏍᏙᏗ " consortium ▁word ▁list ." ▁( gatlvsdodi ). ▁ᏓᏓᏚᎬ ... (+6 more)` | 16 |
124
 
125
 
126
  ### Key Findings
127
 
128
+ - **Best Compression:** 32k achieves 3.552x compression
129
  - **Lowest UNK Rate:** 8k with 0.1472% unknown tokens
130
  - **Trade-off:** Larger vocabularies improve compression but increase model size
131
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
 
143
 
144
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
145
  |--------|---------|------------|---------|----------------|------------------|-------------------|
146
+ | **2-gram** | Word | 151 🏆 | 7.24 | 471 | 68.7% | 100.0% |
147
+ | **2-gram** | Subword | 931 | 9.86 | 3,244 | 40.2% | 86.2% |
148
+ | **3-gram** | Word | 218 | 7.76 | 655 | 63.6% | 100.0% |
149
+ | **3-gram** | Subword | 4,428 | 12.11 | 12,716 | 22.1% | 52.3% |
150
+ | **4-gram** | Word | 483 | 8.91 | 1,256 | 49.2% | 90.8% |
151
+ | **4-gram** | Subword | 9,728 | 13.25 | 28,356 | 18.7% | 39.4% |
152
+ | **5-gram** | Word | 414 | 8.69 | 901 | 51.7% | 100.0% |
153
+ | **5-gram** | Subword | 9,506 | 13.21 | 27,480 | 20.2% | 39.5% |
154
 
155
  ### Top 5 N-grams by Size
156
 
 
158
 
159
  | Rank | N-gram | Count |
160
  |------|--------|-------|
161
+ | 1 | `be checked` | 841 |
162
+ | 2 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ` | 577 |
163
+ | 3 | `ꮣꮣꮪꭼ ꭺꮺꮅ` | 470 |
164
+ | 4 | `ꭺꮺꮅ ꮩꮿꮧꮲ` | 430 |
165
+ | 5 | `word list` | 344 |
166
 
167
  **3-grams (Word):**
168
 
169
  | Rank | N-gram | Count |
170
  |------|--------|-------|
171
+ | 1 | `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ` | 430 |
172
+ | 2 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ` | 430 |
173
+ | 3 | `consortium word list` | 342 |
174
+ | 4 | `ꮧꮥꭼꮤꮫ be checked` | 226 |
175
+ | 5 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be` | 215 |
176
 
177
  **4-grams (Word):**
178
 
179
  | Rank | N-gram | Count |
180
  |------|--------|-------|
181
+ | 1 | `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ` | 430 |
182
+ | 2 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked` | 215 |
183
+ | 3 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be` | 162 |
184
  | 4 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ` | 96 |
185
  | 5 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ be` | 96 |
186
 
187
+ **5-grams (Word):**
188
+
189
+ | Rank | N-gram | Count |
190
+ |------|--------|-------|
191
+ | 1 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked` | 162 |
192
+ | 2 | `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be` | 162 |
193
+ | 3 | `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ be checked` | 96 |
194
+ | 4 | `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ be` | 96 |
195
+ | 5 | `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ` | 96 |
196
+
197
  **2-grams (Subword):**
198
 
199
  | Rank | N-gram | Count |
200
  |------|--------|-------|
201
+ | 1 | `_ ꭰ` | 5,288 |
202
+ | 2 | `_ ꭴ` | 3,380 |
203
+ | 3 | `ꮧ _` | 2,778 |
204
+ | 4 | `. _` | 2,562 |
205
+ | 5 | `, _` | 2,084 |
206
 
207
  **3-grams (Subword):**
208
 
209
  | Rank | N-gram | Count |
210
  |------|--------|-------|
211
+ | 1 | `ꮝ ꮧ _` | 1,355 |
212
+ | 2 | `_ c h` | 978 |
213
+ | 3 | `c h e` | 956 |
214
+ | 4 | `_ ꮄ` | 955 |
215
+ | 5 | `ꮧ _` | 882 |
216
 
217
  **4-grams (Subword):**
218
 
219
  | Rank | N-gram | Count |
220
  |------|--------|-------|
221
+ | 1 | `_ c h e` | 909 |
222
+ | 2 | `_ ꭰ ꮄ _` | 874 |
223
+ | 3 | `e _ c h` | 848 |
224
+ | 4 | `_ b e _` | 842 |
225
+ | 5 | `c h e c` | 841 |
226
+
227
+ **5-grams (Subword):**
228
+
229
+ | Rank | N-gram | Count |
230
+ |------|--------|-------|
231
+ | 1 | `e _ c h e` | 846 |
232
+ | 2 | `_ c h e c` | 841 |
233
+ | 3 | `e c k e d` | 841 |
234
+ | 4 | `_ b e _ c` | 841 |
235
+ | 5 | `c h e c k` | 841 |
236
 
237
 
238
  ### Key Findings
239
 
240
+ - **Best Perplexity:** 2-gram (word) with 151
241
  - **Entropy Trend:** Increases with n-gram order up to 4-grams (less predictable per n-gram), then levels off at 5-grams
242
+ - **Coverage:** Top-1000 patterns cover ~40% of corpus
243
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
244
 
245
  ---
 
255
 
256
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
257
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
258
+ | **1** | Word | 0.4882 | 1.403 | 2.29 | 13,116 | 51.2% |
259
+ | **1** | Subword | 1.6098 | 3.052 | 16.02 | 447 | 0.0% |
260
+ | **2** | Word | 0.0920 | 1.066 | 1.15 | 29,975 | 90.8% |
261
+ | **2** | Subword | 1.0061 | 2.008 | 4.67 | 7,162 | 0.0% |
262
+ | **3** | Word | 0.0290 | 1.020 | 1.05 | 34,378 | 97.1% |
263
+ | **3** | Subword | 0.5823 | 1.497 | 2.32 | 33,475 | 41.8% |
264
+ | **4** | Word | 0.0141 🏆 | 1.010 | 1.02 | 35,846 | 98.6% |
265
+ | **4** | Subword | 0.2760 | 1.211 | 1.46 | 77,796 | 72.4% |
266
 
267
  ### Generated Text Samples (Word-based)
268
 
 
270
 
271
  **Context Size 1:**
272
 
273
+ 1. `ꭰꮄ ꭳꮒꮿꭸꮝꮩꮧ ꭳꭶꮃꮀꮋ contributed ꮎꭵ ꮝꮖꮄꮝꮧ ᏹꮹꭹ ꮮꭶ ꮵꮿꮢꮒꮅꮩꮈꭲ ꮓꮚꮕ ꭳꭹꮎꮅꮝꮣᏼꮕꭲ ꭰꮳꮧ ꮄꭼꮎꮋ animalia ꭰꮭꭵꭲ phylum`
274
+ 2. `be checked ꮪꮎꮩꮲꮹꮧꮢ be checked ꭱꮃꮧꮬ ꭰꮥꮫꮝꭺꭲ ꭶꮳꮔꮃ ꭴꮭꮕꮣꮥꮂ ꭰꮣꮕꮝꮧ ordo artiodactyla ꮟꮣꮑꮈꭿ ꭴꮝꮧ subspecies c`
275
+ 3. `ꭿꭰ 50 41 fꭶꮈꮃꮧ 65 f ꭶꮈꭰꮥꭴ 49 fꭶꮈꮃꮧ 50 ꭷꮓꭾꭽ ꭰᏸꮅ ꭴꮢꭷꮅ ꭿꭰ ꭲᏼ ꭰꮒꮩꮎꭵ`
276
 
277
  **Context Size 2:**
278
 
279
+ 1. `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꮷᏼꮲ ꭰꮉᏸꮯ ꭰꮒꭲꮴꭲᏻꮝꮧ ꭲꮴꭲᏻꮝꮧ ꭰꮒꮼꮒꭽ ꮧꮣꮯꮆꮝꮤꮕ be checked ꭱꮃꮧꮬ ꮖꮗꭲꮴꭹꭲꮒ`
280
+ 2. `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꮷᏼꮲ ꭰꮉᏸꮯ ᏻꮃꮫ ꮣꮆꮒꭶꮝꮫ ꭼꮏꭸꮝꮫ`
281
+ 3. `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꮳꮃꭹ ꮷꮒᏼꮻ ꭰꮒꮳꭻꭲ ꭸꭺꮞꮈꭲ ꭰꮄ ꮜꮚ ꭴꮝꮧ ꮴꮆꭿ ꮠꮑꮅꮑ safire william the way we`
282
 
283
  **Context Size 3:**
284
 
285
+ 1. `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꮉᏸꮅ ꮪꮎꮩꮲꮹꮧꮢ be checked`
286
  2. `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꭱꮃꮧꮬ ꮖꮗꭲꮴꭹꭲꮒ`
287
+ 3. `consortium word list amayutlidi saluyi ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꭱꮃꮧꮬ ꮖꮗꭲꮴꭹꭲꮒ`
288
 
289
  **Context Size 4:**
290
 
291
+ 1. `ꮣꮣꮪꭼ ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꮷᏼꮲ ꭰꮉᏸꮯ ᏻꮃꮫ ꮣꮆꮒꭶꮝꮫ ꭼꮏꭸꮝꮫ`
292
+ 2. `ꭺꮺꮅ ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ be checked ꭱꮃꮧꮬ`
293
+ 3. `ꮩꮿꮧꮲ ꮧꮥꭼꮤꮫ ꭰꭶꮞꮝꮤꮕ be checked`
294
 
295
 
296
  ### Generated Text Samples (Subword-based)
 
299
 
300
  **Context Size 1:**
301
 
302
+ 1. `_ꭿ_-_ꮻꮫ_ꭻꭰꮑꭶꮜꮕꭲ_`
303
+ 2. `ꮧꮭꭵ_manotatrdo,_`
304
+ 3. `ꮝꮦꮡꮣꮕꮫꮃꮜꮒꮿꮧ._ꮣꮯ.`
305
 
306
  **Context Size 2:**
307
 
308
+ 1. `_ꭰꮑ,_be_ꮳꮃꭹ,_ꭳꮻ_ꮎ`
309
+ 2. `_ꭴꭼꮻᏻꭿ_ꭶꮑꭶ_ꮒꮧꮝ_ꭲꮿ`
310
+ 3. `ꮧ_ꮣꮒꭺꮫꮢ_be_ꮣꮣꮄꭹ_ꮧ`
311
 
312
  **Context Size 3:**
313
 
314
+ 1. `ꮝꮧ_ꭸꮢꭹ_ꭽꮻꮎꮧꮲ_tassi`
315
+ 2. `_chemispherokee_na`
316
+ 3. `checked_(ꭱꮅꮯꮿ_ꭰꮅꮠ_`
317
 
318
  **Context Size 4:**
319
 
320
+ 1. `_checked_(ꭱꮃꮧꮬ)_(ꮖꮗ`
321
+ 2. `_ꭰꮄ_80,000_ꮎꮝꭶꮕꮎ_ꭶꮆ`
322
+ 3. `e_checked_ꮪᏻꭺꮫ_ꮹꮞꮝꮧ`
323
 
324
 
325
  ### Key Findings
326
 
327
  - **Best Predictability:** Context-4 (word) with 98.6% predictability
328
  - **Branching Factor:** Decreases with context size (more deterministic)
329
+ - **Memory Trade-off:** Larger contexts require more storage (77,796 contexts)
330
  - **Recommendation:** Context-3 or Context-4 for text generation
331
 
332
  ---
 
342
 
343
  | Metric | Value |
344
  |--------|-------|
345
+ | Vocabulary Size | 4,160 |
346
+ | Total Tokens | 34,218 |
347
+ | Mean Frequency | 8.23 |
348
  | Median Frequency | 3 |
349
+ | Frequency Std Dev | 35.30 |
350
 
351
  ### Most Common Words
352
 
353
  | Rank | Word | Frequency |
354
  |------|------|-----------|
355
+ | 1 | ꭰꮄ | 885 |
356
+ | 2 | be | 843 |
357
+ | 3 | checked | 841 |
358
+ | 4 | ꭿꭰ | 767 |
359
+ | 5 | ꮧꮥꭼꮤꮫ | 610 |
360
+ | 6 | ꮩꮿꮧꮲ | 579 |
361
+ | 7 | ꭺꮺꮅ | 521 |
362
+ | 8 | ꮣꮣꮪꭼ | 480 |
363
+ | 9 | ꮳꮃꭹ | 468 |
364
+ | 10 | word | 345 |
365
 
366
  ### Least Common Words (from vocabulary)
367
 
 
382
 
383
  | Metric | Value |
384
  |--------|-------|
385
+ | Zipf Coefficient | 0.8676 |
386
+ | R² (Goodness of Fit) | 0.984121 |
387
  | Adherence Quality | **excellent** |
388
 
389
  ### Coverage Analysis
390
 
391
  | Top N Words | Coverage |
392
  |-------------|----------|
393
+ | Top 100 | 40.0% |
394
+ | Top 1,000 | 74.3% |
395
  | Top 5,000 | 0.0% |
396
  | Top 10,000 | 0.0% |
397
 
398
  ### Key Findings
399
 
400
+ - **Zipf Compliance:** R²=0.9841 indicates excellent adherence to Zipf's law
401
+ - **High Frequency Dominance:** Top 100 words cover 40.0% of corpus
402
+ - **Long Tail:** -5,840 words needed for remaining 100.0% coverage
403
 
404
  ---
405
  ## 5. Word Embeddings Evaluation
 
415
 
416
  ### 5.1 Cross-Lingual Alignment
417
 
418
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
419
+
420
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
421
 
422
 
423
  ### 5.2 Model Comparison
424
 
425
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
426
  |-------|-----------|----------|------------------|---------------|----------------|
427
+ | **mono_32d** | 32 | 0.2412 🏆 | 0.5036 | N/A | N/A |
428
+ | **mono_64d** | 64 | 0.0627 | 0.4822 | N/A | N/A |
429
+ | **mono_128d** | 128 | 0.0098 | 0.4702 | N/A | N/A |
430
+ | **aligned_32d** | 32 | 0.2412 | 0.4975 | 0.0596 | 0.3311 |
431
+ | **aligned_64d** | 64 | 0.0627 | 0.4601 | 0.0861 | 0.4702 |
432
+ | **aligned_128d** | 128 | 0.0098 | 0.4781 | 0.1325 | 0.5033 |
433
 
434
  ### Key Findings
435
 
436
+ - **Best Isotropy:** mono_32d with 0.2412 (more uniform distribution)
437
+ - **Semantic Density:** Average pairwise similarity of 0.4820. Lower values indicate better semantic separation.
438
+ - **Alignment Quality:** Aligned models achieve up to 13.2% R@1 in cross-lingual retrieval.
439
  - **Recommendation:** 128d aligned for best cross-lingual performance
440
 
441
  ---
442
  ## 6. Morphological Analysis (Experimental)
443
 
 
 
444
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
445
 
446
  ### 6.1 Productivity & Complexity
447
 
448
  | Metric | Value | Interpretation | Recommendation |
449
  |--------|-------|----------------|----------------|
450
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
451
+ | Idiomaticity Gap | **1.531** | High formulaic/idiomatic content | - |
452
 
453
  ### 6.2 Affix Inventory (Productive Units)
454
 
 
461
  #### Productive Suffixes
462
  | Suffix | Examples |
463
  |--------|----------|
464
+ | `-ꮝꮧ` | ꭴꮒꭹꮝꮧ, ꮞꮧᏻꮝꮧ, ꭰꮣꮿꮝꮧ |
465
 
466
  ### 6.3 Bound Stems (Lexical Roots)
467
 
 
483
 
484
  | Word | Suggested Split | Confidence | Stem |
485
  |------|-----------------|------------|------|
 
486
  | ꭰᏸꮅꮧꭶꮃꮻꭲꮝꮧ | **`ꭰᏸꮅꮧꭶꮃꮻꭲ-ꮝꮧ`** | 1.5 | `ꭰᏸꮅꮧꭶꮃꮻꭲ` |
487
+ | ꮒꭶꮅꮝꮧꮝꭸꮝꮧ | **`ꮒꭶꮅꮝꮧꮝꭸ-ꮝꮧ`** | 1.5 | `ꮒꭶꮅꮝꮧꮝꭸ` |
488
 
489
  ### 6.6 Linguistic Interpretation
490
 
491
  > **Automated Insight:**
492
+ The language Cherokee shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
493
+
494
+ > **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
495
 
496
  ---
497
  ## 7. Summary & Recommendations
 
502
 
503
  | Component | Recommended | Rationale |
504
  |-----------|-------------|-----------|
505
+ | Tokenizer | **32k BPE** | Best compression (3.55x) |
506
+ | N-gram | **2-gram** | Lowest perplexity (151) |
507
  | Markov | **Context-4** | Highest predictability (98.6%) |
508
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
509
 
 
718
  ---
719
  *Generated by Wikilangs Models Pipeline*
720
 
721
+ *Report Date: 2026-01-03 20:28:09*
models/embeddings/aligned/chr_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9608e76f7cb96be932c1395d1bcfefe1c081dd492d18cab0b28d835a637f44fd
3
+ size 1025224158
models/embeddings/aligned/chr_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "chr", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/chr_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:46bc20b462846457385b6068fd6f90238c7d468e33af3f998fb331ad235f7d6d
3
+ size 65664
models/embeddings/aligned/chr_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "chr",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 151,
7
+ "vocab_size": 1172
8
+ }
models/embeddings/aligned/chr_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f760c33d716161b029922d0b7c3599884ca393f9cd64c7db1c0a7e2f1ec41ad5
3
+ size 256324062
models/embeddings/aligned/chr_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "chr", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/chr_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31f01b51e05d62314cbc89d3fe685678d852c389ad02104624c8c3e80bb8b6af
3
+ size 4224
models/embeddings/aligned/chr_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "chr",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 151,
7
+ "vocab_size": 1172
8
+ }
models/embeddings/aligned/chr_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9a38a8bd94958ef1ccef12f12d92223fefb6a7bed727f0bb340377e1fd05640
3
+ size 512624094
models/embeddings/aligned/chr_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "chr", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/chr_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df8debcbf314f96d35f7d6dc2782e956cd957c7de8bd6fe988ee635cf73d7e7
3
+ size 16512
models/embeddings/aligned/chr_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "chr",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 151,
7
+ "vocab_size": 1172
8
+ }
models/embeddings/monolingual/chr_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b65a441c9c4908f0ffa5c095ba64382b4b323cde80a4f6d979c87e4b93102427
3
- size 1025246107
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9608e76f7cb96be932c1395d1bcfefe1c081dd492d18cab0b28d835a637f44fd
3
+ size 1025224158
models/embeddings/monolingual/chr_128d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 128
13
  },
14
- "vocab_size": 1193
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 128
13
  },
14
+ "vocab_size": 1172
15
  }
models/embeddings/monolingual/chr_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9b5ecb349ca3c9d1d8f1f38b34647edbd029de966e73fe6f8043e5b64357e71
3
- size 256329883
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f760c33d716161b029922d0b7c3599884ca393f9cd64c7db1c0a7e2f1ec41ad5
3
+ size 256324062
models/embeddings/monolingual/chr_32d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 32
13
  },
14
- "vocab_size": 1193
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 32
13
  },
14
+ "vocab_size": 1172
15
  }
models/embeddings/monolingual/chr_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ff3baa5798aac84304b756ac363fd816043d3a9775848343c3a39af84cf09bf
3
- size 512635291
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9a38a8bd94958ef1ccef12f12d92223fefb6a7bed727f0bb340377e1fd05640
3
+ size 512624094
models/embeddings/monolingual/chr_64d_metadata.json CHANGED
@@ -11,5 +11,5 @@
11
  "encoding_method": "rope",
12
  "dim": 64
13
  },
14
- "vocab_size": 1193
15
  }
 
11
  "encoding_method": "rope",
12
  "dim": 64
13
  },
14
+ "vocab_size": 1172
15
  }
models/subword_markov/chr_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47a719104c2e085b3f0210500f2e0d35dd3fad4c62155a28ff1885ad952f603e
3
- size 48128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b4f4c43e8d3846167bdc43f2a3dc4c86699e912a96a34eebf31879dfeda3133
3
+ size 50406
models/subword_markov/chr_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "chr",
5
- "unique_contexts": 448,
6
- "total_transitions": 244557
7
  }
 
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "chr",
5
+ "unique_contexts": 447,
6
+ "total_transitions": 238935
7
  }
models/subword_markov/chr_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90017c14d3413b7443aa8cfd4e8186bbbfa36eb5c061301510a67e33d59ced11
3
- size 213598
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:167401283766adb0d352a1e7805c6c52332a9bc3f6da39a8ebf72250311d9c2e
3
+ size 213571
models/subword_markov/chr_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "chr",
5
- "unique_contexts": 7258,
6
- "total_transitions": 243650
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "chr",
5
+ "unique_contexts": 7162,
6
+ "total_transitions": 238035
7
  }
models/subword_markov/chr_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3aa3158ad438f31c7b0fe3d234fab09e4e04d062e867a5f9276cd4c9c48b1d0c
3
- size 611513
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd7ba6e70ee8f0c93fe6eaf000495cbe01f56f2d91d10c8ecda1ae5671786b71
3
+ size 601400
models/subword_markov/chr_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "chr",
5
- "unique_contexts": 33930,
6
- "total_transitions": 242743
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "chr",
5
+ "unique_contexts": 33475,
6
+ "total_transitions": 237135
7
  }
models/subword_markov/chr_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:041f8fb81d9f01bcb72c1f16d3266d79dd5c44b4454b909b4095a6dc7c4e41c9
3
- size 1161581
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:283f2f059945bfb99068c55820b70093ad31dce7d3d763e2c6051a868b6d6b33
3
+ size 1141008
models/subword_markov/chr_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "chr",
5
- "unique_contexts": 79071,
6
- "total_transitions": 241836
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "chr",
5
+ "unique_contexts": 77796,
6
+ "total_transitions": 236235
7
  }
models/subword_ngram/chr_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d97d4c240d2e544cee601293886cf200a3ca9744dc6f05309d9f3f6dedb5701
3
- size 37349
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b4791f7439af8a9b3785e2ec53c371f4e22342cdd1182d9ee51b3680f1eb235
3
+ size 37002
models/subword_ngram/chr_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "chr",
5
- "unique_ngrams": 3277,
6
- "total_ngrams": 244557
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "chr",
5
+ "unique_ngrams": 3244,
6
+ "total_ngrams": 238935
7
  }
models/subword_ngram/chr_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4909748af2830cbec527b301ed9c11dc5fba2c6a9e7f48e1b13a633d5cfdcd87
3
- size 157760
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b56340ab97d7f0fb81ed597d12abf6ab8e79e4b92a8df5871dc964f8393a888e
3
+ size 156597
models/subword_ngram/chr_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "chr",
5
- "unique_ngrams": 12903,
6
- "total_ngrams": 243650
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "chr",
5
+ "unique_ngrams": 12716,
6
+ "total_ngrams": 238035
7
  }
models/subword_ngram/chr_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a19dcd2eca820072c87f80cca01672f0fafd765e2ca81e6dbfd95249c77828dc
3
- size 374199
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57dc768d1347044e414d260830723b3977b4d8ae1fb1a9fd2e9c8d3254477faf
3
+ size 367072
models/subword_ngram/chr_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "chr",
5
- "unique_ngrams": 28877,
6
- "total_ngrams": 242743
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "chr",
5
+ "unique_ngrams": 28356,
6
+ "total_ngrams": 237135
7
  }
models/subword_ngram/chr_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b62758c1284d9a0997d838e016f14c1198d6ed9997e4161cb502bd8a54ce09d9
3
+ size 383421
models/subword_ngram/chr_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "chr",
5
+ "unique_ngrams": 27480,
6
+ "total_ngrams": 236235
7
+ }
models/tokenizer/chr_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:359423c436a228311e5a63ed11e5f00c42f4e1f43e6dab0726841d8a8cea7899
3
- size 558319
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8302ab6dd99220f27f6a4b918a475cead1f491ee5ac099d34ad1c49a2c58d3ff
3
+ size 558565
models/tokenizer/chr_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/chr_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bd42bf932a4c3f148e083933ebcc3b363a57f53b5b4db22c1e6a60d5017edabe
3
- size 835851
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4a07a8d5403d079ea7eaf7dc3d296dc7f93ca7ebb01caf3019f14dbd83fb421
3
+ size 836496
models/tokenizer/chr_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/chr_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cb436a7afc74fa9ba90ff21df46b3064327fe8aa4d69c5a5859a405d4b67a6c2
3
- size 392805
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:428c97610731c1a1360888f8d67cbdb313e6a2cef8b6bd0816578205bbb32231
3
+ size 392349
models/tokenizer/chr_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/chr_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8068c404442dc10b84690c7100fc813e925cbe51c1e787ef250cbc0cea84d3d3
3
- size 73331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a51ad98285655f3a93b82e37b2328d6103a522ea9879c30c242647089e8a666
3
+ size 71930
models/vocabulary/chr_vocabulary_metadata.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
  "language": "chr",
3
- "vocabulary_size": 4236,
4
  "variant": "full",
5
  "statistics": {
6
- "type_token_ratio": 0.2998054386679336,
7
  "coverage": {
8
- "top_100": 0.3170671010361522,
9
- "top_1000": 0.590561513053708,
10
- "top_5000": 0.8133116148590561,
11
- "top_10000": 0.9264286683860459
12
  },
13
- "hapax_count": 9016,
14
- "hapax_ratio": 0.6803501358285542,
15
- "total_documents": 907
16
  }
17
  }
 
1
  {
2
  "language": "chr",
3
+ "vocabulary_size": 4160,
4
  "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.3039551685809559,
7
  "coverage": {
8
+ "top_100": 0.3169692478695813,
9
+ "top_1000": 0.5890839199703594,
10
+ "top_5000": 0.8118284549833271,
11
+ "top_10000": 0.9276120785476102
12
  },
13
+ "hapax_count": 8966,
14
+ "hapax_ratio": 0.6830717659606887,
15
+ "total_documents": 900
16
  }
17
  }
models/word_markov/chr_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e06367eae370c3314f083ee9fc62374591dc05fc32a00400e11de6f6a357de8
3
- size 388759
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbc9639767558234a47208531de6544ec229a2fca763ae0d3fc97ea5b2b25584
3
+ size 383564
models/word_markov/chr_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "chr",
5
- "unique_contexts": 13243,
6
- "total_transitions": 43295
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "chr",
5
+ "unique_contexts": 13116,
6
+ "total_transitions": 42284
7
  }
models/word_markov/chr_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efac8874c4bac9908d723a0cd4c5b8e19f9b13e1c696f3e7076b4376fd315124
3
- size 694832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff38caa6b856e38a004a07ba1e2a158a2a6e4c726f68e712654eb3fba3960208
3
+ size 679573
models/word_markov/chr_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "chr",
5
- "unique_contexts": 30660,
6
- "total_transitions": 42388
7
  }
 
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "chr",
5
+ "unique_contexts": 29975,
6
+ "total_transitions": 41384
7
  }
models/word_markov/chr_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:61472a61178571af03b7dbace2a9fde2feec3747df77700588b54d959fe32645
3
- size 845527
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b351dfd6a07f58fcb32eb97bbc0e03270e06acb0d0985c95f1e74f0cb41c05c
3
+ size 826860
models/word_markov/chr_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "chr",
5
- "unique_contexts": 35274,
6
- "total_transitions": 41481
7
  }
 
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "chr",
5
+ "unique_contexts": 34378,
6
+ "total_transitions": 40484
7
  }