omarkamali committed on
Commit
d8537d7
·
verified ·
1 Parent(s): d369a56

Upload all models and assets for el (latest)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +353 -140
  3. el_morph_tokenizer.json +0 -0
  4. models/embeddings/aligned/el_128d.bin +3 -0
  5. models/embeddings/aligned/el_128d.meta.json +1 -0
  6. models/embeddings/aligned/el_128d.projection.npy +3 -0
  7. models/embeddings/aligned/el_128d_metadata.json +8 -0
  8. models/embeddings/aligned/el_32d.bin +3 -0
  9. models/embeddings/aligned/el_32d.meta.json +1 -0
  10. models/embeddings/aligned/el_32d.projection.npy +3 -0
  11. models/embeddings/aligned/el_32d_metadata.json +8 -0
  12. models/embeddings/aligned/el_64d.bin +3 -0
  13. models/embeddings/aligned/el_64d.meta.json +1 -0
  14. models/embeddings/aligned/el_64d.projection.npy +3 -0
  15. models/embeddings/aligned/el_64d_metadata.json +8 -0
  16. models/embeddings/monolingual/el_128d.bin +2 -2
  17. models/embeddings/monolingual/el_128d_metadata.json +5 -3
  18. models/embeddings/monolingual/el_32d.bin +2 -2
  19. models/embeddings/monolingual/el_32d_metadata.json +5 -3
  20. models/embeddings/monolingual/el_64d.bin +2 -2
  21. models/embeddings/monolingual/el_64d_metadata.json +5 -3
  22. models/subword_markov/el_markov_ctx1_subword.parquet +2 -2
  23. models/subword_markov/el_markov_ctx1_subword_metadata.json +2 -2
  24. models/subword_markov/el_markov_ctx2_subword.parquet +2 -2
  25. models/subword_markov/el_markov_ctx2_subword_metadata.json +2 -2
  26. models/subword_markov/el_markov_ctx3_subword.parquet +2 -2
  27. models/subword_markov/el_markov_ctx3_subword_metadata.json +2 -2
  28. models/subword_markov/el_markov_ctx4_subword.parquet +2 -2
  29. models/subword_markov/el_markov_ctx4_subword_metadata.json +2 -2
  30. models/subword_ngram/el_2gram_subword.parquet +2 -2
  31. models/subword_ngram/el_2gram_subword_metadata.json +2 -2
  32. models/subword_ngram/el_3gram_subword.parquet +2 -2
  33. models/subword_ngram/el_3gram_subword_metadata.json +2 -2
  34. models/subword_ngram/el_4gram_subword.parquet +2 -2
  35. models/subword_ngram/el_4gram_subword_metadata.json +2 -2
  36. models/subword_ngram/el_5gram_subword.parquet +3 -0
  37. models/subword_ngram/el_5gram_subword_metadata.json +7 -0
  38. models/tokenizer/el_tokenizer_16k.model +2 -2
  39. models/tokenizer/el_tokenizer_16k.vocab +0 -0
  40. models/tokenizer/el_tokenizer_32k.model +2 -2
  41. models/tokenizer/el_tokenizer_32k.vocab +0 -0
  42. models/tokenizer/el_tokenizer_64k.model +2 -2
  43. models/tokenizer/el_tokenizer_64k.vocab +0 -0
  44. models/tokenizer/el_tokenizer_8k.model +2 -2
  45. models/tokenizer/el_tokenizer_8k.vocab +0 -0
  46. models/vocabulary/el_vocabulary.parquet +2 -2
  47. models/vocabulary/el_vocabulary_metadata.json +10 -9
  48. models/vocabulary/el_vocabulary_top.parquet +3 -0
  49. models/vocabulary/el_vocabulary_top_metadata.json +20 -0
  50. models/word_markov/el_markov_ctx1_word.parquet +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
 
 
 
 
 
 
 
 
 
 
13
  - monolingual
14
  - family-greek
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,14 +33,14 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 4.539
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.7952
30
  - name: vocabulary_size
31
  type: vocab
32
- value: 1000000
33
- generated: 2025-12-30
34
  ---
35
 
36
  # Greek - Wikilangs Models
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
- - N-gram models (2, 3, 4-gram)
48
- - Markov chains (context of 1, 2, 3 and 4)
49
  - Subword N-gram and Markov chains
50
- - Embeddings in various sizes and dimensions
51
  - Language Vocabulary
52
  - Language Statistics
 
53
  ![Performance Dashboard](visualizations/performance_dashboard.png)
54
 
55
  ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
59
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
60
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
61
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
62
- - [6. Summary & Recommendations](#6-summary--recommendations)
 
63
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
64
  - [Visualizations Index](#visualizations-index)
65
 
@@ -68,57 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
68
 
69
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
70
 
 
 
 
 
 
 
71
  ### Results
72
 
73
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
74
  |------------|-------------|---------------|----------|--------------|
75
- | **8k** | 3.468x | 3.45 | 0.0627% | 2,950,157 |
76
- | **16k** | 3.866x | 3.85 | 0.0699% | 2,647,020 |
77
- | **32k** | 4.241x | 4.22 | 0.0767% | 2,412,572 |
78
- | **64k** | 4.539x 🏆 | 4.52 | 0.0821% | 2,254,042 |
79
 
80
  ### Tokenization Examples
81
 
82
  Below are sample sentences tokenized with each vocabulary size:
83
 
84
- **Sample 1:** `Το όνομα Hulk μπορεί να αναφέρεται σε κάποιο από τα παρακάτω.
85
-
86
- Hulk (χαρακτήρας...`
87
 
88
  | Vocab | Tokens | Count |
89
  |-------|--------|-------|
90
- | 8k | `▁το ▁όνομα ▁h ul k ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ... (+22 more)` | 32 |
91
- | 16k | `▁το ▁όνομα ▁h ul k ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ... (+22 more)` | 32 |
92
- | 32k | `▁το ▁όνομα ▁h ulk ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ... (+16 more)` | 26 |
93
- | 64k | `▁το ▁όνομα ▁hulk ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ▁τα ... (+12 more)` | 22 |
94
-
95
- **Sample 2:** `Ο όρος Πισίνα μπορεί να αναφέρεται σε κάποιο από τα παρακάτω.
96
 
97
- Πισίνα (δεξαμενή...`
98
 
99
  | Vocab | Tokens | Count |
100
  |-------|--------|-------|
101
- | 8k | `▁ο ▁όροσ ▁π ισ ίνα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ... (+33 more)` | 43 |
102
- | 16k | `▁ο ▁όροσ ▁πισ ίνα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ... (+26 more)` | 36 |
103
- | 32k | `▁ο ▁όροσ ▁πισ ίνα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ... (+23 more)` | 33 |
104
- | 64k | `▁ο ▁όροσ ▁πισίνα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ▁τα ... (+18 more)` | 28 |
105
-
106
- **Sample 3:** `Ο όρος Γράμμα μπορεί να αναφέρεται σε κάποιο από τα παρακάτω.
107
 
108
- Γράμμα (στοιχείο...`
109
 
110
  | Vocab | Tokens | Count |
111
  |-------|--------|-------|
112
- | 8k | `▁ο ▁όροσ ▁γρά μμα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ... (+22 more)` | 32 |
113
- | 16k | `▁ο ▁όροσ ▁γράμμα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ▁τα ... (+18 more)` | 28 |
114
- | 32k | `▁ο ▁όροσ ▁γράμμα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ▁τα ... (+16 more)` | 26 |
115
- | 64k | `▁ο ▁όροσ ▁γράμμα ▁μπορεί ▁να ▁αναφέρεται ▁σε ▁κάποιο ▁από ▁τα ... (+16 more)` | 26 |
116
 
117
 
118
  ### Key Findings
119
 
120
- - **Best Compression:** 64k achieves 4.539x compression
121
- - **Lowest UNK Rate:** 8k with 0.0627% unknown tokens
122
  - **Trade-off:** Larger vocabularies improve compression but increase model size
123
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
124
 
@@ -127,57 +139,111 @@ Below are sample sentences tokenized with each vocabulary size:
127
 
128
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
129
 
 
 
130
  ![N-gram Coverage](visualizations/ngram_coverage.png)
131
 
132
  ### Results
133
 
134
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
135
- |--------|------------|---------|----------------|------------------|-------------------|
136
- | **2-gram** | 184,550 🏆 | 17.49 | 2,734,979 | 9.7% | 21.6% |
137
- | **2-gram** | 524 🏆 | 9.03 | 31,823 | 53.1% | 95.1% |
138
- | **3-gram** | 1,381,139 | 20.40 | 7,476,828 | 3.1% | 8.3% |
139
- | **3-gram** | 4,827 | 12.24 | 338,020 | 22.3% | 56.5% |
140
- | **4-gram** | 4,155,936 | 21.99 | 13,885,545 | 2.1% | 5.3% |
141
- | **4-gram** | 27,979 | 14.77 | 2,196,788 | 11.8% | 32.2% |
 
 
142
 
143
  ### Top 5 N-grams by Size
144
 
145
- **2-grams:**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
  | Rank | N-gram | Count |
148
  |------|--------|-------|
149
- | 1 | `κατηγορία :` | 636,996 |
150
- | 2 | `. η` | 478,250 |
151
- | 3 | `. το` | 448,858 |
152
- | 4 | `) ,` | 447,570 |
153
- | 5 | `. ο` | 442,335 |
154
 
155
- **3-grams:**
156
 
157
  | Rank | N-gram | Count |
158
  |------|--------|-------|
159
- | 1 | `| | |` | 239,466 |
160
- | 2 | `. χ .` | 95,817 |
161
- | 3 | `: / /` | 81,844 |
162
- | 4 | `, σελ .` | 80,384 |
163
- | 5 | `π . χ` | 76,365 |
164
 
165
- **4-grams:**
166
 
167
  | Rank | N-gram | Count |
168
  |------|--------|-------|
169
- | 1 | `| | | |` | 180,521 |
170
- | 2 | `π . χ .` | 74,646 |
171
- | 3 | `: / / www` | 50,485 |
172
- | 4 | `/ / www .` | 50,469 |
173
- | 5 | `. παραπομπές κατηγορία :` | 47,261 |
174
 
175
 
176
  ### Key Findings
177
 
178
- - **Best Perplexity:** 2-gram with 524
179
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
180
- - **Coverage:** Top-1000 patterns cover ~32% of corpus
181
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
182
 
183
  ---
@@ -185,55 +251,86 @@ Below are sample sentences tokenized with each vocabulary size:
185
 
186
  ![Markov Entropy](visualizations/markov_entropy.png)
187
 
 
 
188
  ![Markov Branching](visualizations/markov_branching.png)
189
 
190
  ### Results
191
 
192
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
193
- |---------|-------------|------------|------------------|-----------------|----------------|
194
- | **1** | 0.6618 | 1.582 | 7.87 | 3,151,392 | 33.8% |
195
- | **1** | 1.4637 | 2.758 | 11.20 | 10,325 | 0.0% |
196
- | **2** | 0.4556 | 1.371 | 2.92 | 24,784,714 | 54.4% |
197
- | **2** | 0.8017 | 1.743 | 6.14 | 115,568 | 19.8% |
198
- | **3** | 0.2366 | 1.178 | 1.61 | 72,311,071 | 76.3% |
199
- | **3** | 0.9287 | 1.904 | 5.40 | 709,839 | 7.1% |
200
- | **4** | 0.1100 🏆 | 1.079 | 1.21 | 116,470,687 | 89.0% |
201
- | **4** | 0.7438 🏆 | 1.675 | 3.63 | 3,835,999 | 25.6% |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- ### Generated Text Samples
204
 
205
- Below are text samples generated from each Markov chain model:
 
 
206
 
207
  **Context Size 1:**
208
 
209
- 1. `, του χορού , donado por los kids ' επς βοιωτίας . english renaissance ( 1`
210
- 2. `. 3 : χωριά της νεκρής θάλασσας , δήμοι , στη στρατιωτική ακαδημία αθηνών . ανακτήθηκε`
211
- 3. `του 2017 . λόγω της καμπανίας ) . soundwave melbourne , εργαστήρια και μια « φεκ`
212
 
213
  **Context Size 2:**
214
 
215
- 1. `κατηγορία : αλκυλαλογονίδια κατηγορία : ταινίες με θέμα τη γαλλική πρεσβεία . δηλητηρίαση από αυτό ,...`
216
- 2. `. η γερμανία υπέστη οξεία κρίση δεν αντιστοιχεί σε μια άλλη παράδοση - φυτεύτηκε από την κεντρική`
217
- 3. `. το 1943 και μετά ο εφραίμογλου τους μετέφερε με εμπορικό περιεχόμενο βρέθηκαν στην περιοχή με μεγά...`
218
 
219
  **Context Size 3:**
220
 
221
- 1. `| | | | | | | | | | 4 - 1 πέν . ( 0 ,`
222
- 2. `. χ . ο αννίβας σε απάντηση κινήθηκε κοντά στο pd ( 15 . 441 ή 15 %`
223
- 3. `: / / www . crvenazvezdafk . com / 2020 / 08 / 2018 . ανάκτηση στις 2018`
224
 
225
  **Context Size 4:**
226
 
227
- 1. `| | | | | | | | | | | 5 , 13 | | 2 . 1`
228
- 2. `π . χ . η ονομασία δόθηκε προς τιμήν των ελευθερωτών ( ισπ . / πορτογ . : libertadores`
229
- 3. `: / / www . in . gr , ο ατρόμητος πειραιώς , ο αργοναύτης πειραιώς , ο πειραϊκός`
230
 
231
 
232
  ### Key Findings
233
 
234
- - **Best Predictability:** Context-4 with 89.0% predictability
235
  - **Branching Factor:** Decreases with context size (more deterministic)
236
- - **Memory Trade-off:** Larger contexts require more storage (3,835,999 contexts)
237
  - **Recommendation:** Context-3 or Context-4 for text generation
238
 
239
  ---
@@ -249,64 +346,64 @@ Below are text samples generated from each Markov chain model:
249
 
250
  | Metric | Value |
251
  |--------|-------|
252
- | Vocabulary Size | 1,000,000 |
253
- | Total Tokens | 141,420,269 |
254
- | Mean Frequency | 141.42 |
255
- | Median Frequency | 5 |
256
- | Frequency Std Dev | 9402.05 |
257
 
258
  ### Most Common Words
259
 
260
  | Rank | Word | Frequency |
261
  |------|------|-----------|
262
- | 1 | του | 4,114,595 |
263
- | 2 | και | 3,905,146 |
264
- | 3 | το | 3,246,877 |
265
- | 4 | της | 3,002,879 |
266
- | 5 | η | 1,973,472 |
267
- | 6 | την | 1,903,298 |
268
- | 7 | ο | 1,891,449 |
269
- | 8 | από | 1,890,535 |
270
- | 9 | με | 1,665,302 |
271
- | 10 | τον | 1,309,398 |
272
 
273
  ### Least Common Words (from vocabulary)
274
 
275
  | Rank | Word | Frequency |
276
  |------|------|-----------|
277
- | 1 | hoarder | 2 |
278
- | 2 | λακουέβα | 2 |
279
- | 3 | teasdale | 2 |
280
- | 4 | φλόσι | 2 |
281
- | 5 | μαχαλάθ | 2 |
282
- | 6 | παστούχοφ | 2 |
283
- | 7 | lorring | 2 |
284
- | 8 | ρινοπαρειακές | 2 |
285
- | 9 | λιπομεταμόσχευσης | 2 |
286
- | 10 | παμπίνος | 2 |
287
 
288
  ### Zipf's Law Analysis
289
 
290
  | Metric | Value |
291
  |--------|-------|
292
- | Zipf Coefficient | 0.9633 |
293
- | R² (Goodness of Fit) | 0.996714 |
294
  | Adherence Quality | **excellent** |
295
 
296
  ### Coverage Analysis
297
 
298
  | Top N Words | Coverage |
299
  |-------------|----------|
300
- | Top 100 | 37.1% |
301
- | Top 1,000 | 55.3% |
302
- | Top 5,000 | 71.3% |
303
  | Top 10,000 | 78.0% |
304
 
305
  ### Key Findings
306
 
307
- - **Zipf Compliance:** R²=0.9967 indicates excellent adherence to Zipf's law
308
- - **High Frequency Dominance:** Top 100 words cover 37.1% of corpus
309
- - **Long Tail:** 990,000 words needed for remaining 22.0% coverage
310
 
311
  ---
312
  ## 5. Word Embeddings Evaluation
@@ -319,24 +416,137 @@ Below are text samples generated from each Markov chain model:
319
 
320
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
321
 
322
- ### Model Comparison
323
 
324
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
325
- |-------|------------|-----------|----------|----------|----------|
326
- | **mono_32d** | 858,240 | 32 | 3.095 | 0.904 | 0.7952 🏆 |
327
- | **mono_64d** | 858,240 | 64 | 3.528 | 0.886 | 0.7735 |
328
- | **mono_128d** | 858,240 | 128 | 3.968 | 0.931 | 0.7220 |
329
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 
 
 
 
 
 
 
 
 
 
 
330
 
331
  ### Key Findings
332
 
333
- - **Best Isotropy:** mono_32d with 0.7952 (more uniform distribution)
334
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
335
- - **Vocabulary Coverage:** All models cover 858,240 words
336
- - **Recommendation:** 100d for balanced semantic capture and efficiency
337
 
338
  ---
339
- ## 6. Summary & Recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
  ![Performance Dashboard](visualizations/performance_dashboard.png)
342
 
@@ -344,11 +554,12 @@ Below are text samples generated from each Markov chain model:
344
 
345
  | Component | Recommended | Rationale |
346
  |-----------|-------------|-----------|
347
- | Tokenizer | **32k BPE** | Best compression (4.54x) with low UNK rate |
348
- | N-gram | **5-gram** | Lowest perplexity (524) |
349
- | Markov | **Context-4** | Highest predictability (89.0%) |
350
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
351
 
 
352
  ---
353
  ## Appendix: Metrics Glossary & Interpretation Guide
354
 
@@ -538,7 +749,8 @@ If you use these models in your research, please cite:
538
  author = {Kamali, Omar},
539
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
540
  year = {2025},
541
- publisher = {HuggingFace},
 
542
  url = {https://huggingface.co/wikilangs},
543
  institution = {Omneity Labs}
544
  }
@@ -554,7 +766,8 @@ MIT License - Free for academic and commercial use.
554
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
555
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
556
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 
557
  ---
558
  *Generated by Wikilangs Models Pipeline*
559
 
560
- *Report Date: 2025-12-30 12:18:07*
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-greek
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 4.872
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.8028
40
  - name: vocabulary_size
41
  type: vocab
42
+ value: 1039940
43
+ generated: 2026-01-10
44
  ---
45
 
46
  # Greek - Wikilangs Models
 
54
  ### Models & Assets
55
 
56
  - Tokenizers (8k, 16k, 32k, 64k)
57
+ - N-gram models (2, 3, 4, 5-gram)
58
+ - Markov chains (context of 1, 2, 3, 4 and 5)
59
  - Subword N-gram and Markov chains
60
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
61
  - Language Vocabulary
62
  - Language Statistics
63
+
64
  ![Performance Dashboard](visualizations/performance_dashboard.png)
65
 
66
  ### Analysis and Evaluation
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6-morphological-analysis-experimental)
74
+ - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
77
 
 
80
 
81
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
82
 
83
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
84
+
85
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
86
+
87
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
88
+
89
  ### Results
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 3.621x | 3.62 | 0.0471% | 2,711,752 |
94
+ | **16k** | 4.087x | 4.09 | 0.0531% | 2,402,524 |
95
+ | **32k** | 4.519x | 4.52 | 0.0587% | 2,172,769 |
96
+ | **64k** | 4.872x 🏆 | 4.87 | 0.0633% | 2,015,689 |
97
 
98
  ### Tokenization Examples
99
 
100
  Below are sample sentences tokenized with each vocabulary size:
101
 
102
+ **Sample 1:** `.ms είναι ο top-level domain κωδικός για το Μοντσερράτ στο Διαδίκτυο. Δείτε επίσ...`
 
 
103
 
104
  | Vocab | Tokens | Count |
105
  |-------|--------|-------|
106
+ | 8k | `▁. ms ▁είναι ▁ο ▁top - level ▁domain ▁κω δικόσ ... (+30 more)` | 40 |
107
+ | 16k | `▁. ms ▁είναι ▁ο ▁top - level ▁domain ▁κωδικόσ ▁για ... (+21 more)` | 31 |
108
+ | 32k | `▁. ms ▁είναι ▁ο ▁top - level ▁domain ▁κωδικόσ ▁για ... (+21 more)` | 31 |
109
+ | 64k | `▁. ms ▁είναι ▁ο ▁top - level ▁domain ▁κωδικόσ ▁για ... (+19 more)` | 29 |
 
 
110
 
111
+ **Sample 2:** `Το Φόππολο (ιταλικά: Foppolo) είναι ιταλικός δήμος στην Επαρχία του Μπέργκαμο, σ...`
112
 
113
  | Vocab | Tokens | Count |
114
  |-------|--------|-------|
115
+ | 8k | `▁το ▁φ όπ πο λο ▁( ιταλικά : ▁f op ... (+32 more)` | 42 |
116
+ | 16k | `▁το ▁φ όπ πο λο ▁( ιταλικά : ▁f op ... (+28 more)` | 38 |
117
+ | 32k | `▁το ▁φ όπ πο λο ▁( ιταλικά : ▁f op ... (+25 more)` | 35 |
118
+ | 64k | `▁το ▁φ όπ πο λο ▁( ιταλικά : ▁f op ... (+21 more)` | 31 |
 
 
119
 
120
+ **Sample 3:** `Το Λε Τορ () είναι γαλλική κοινότητα στο νομό της Ερ, στη διοικητική περιοχή της...`
121
 
122
  | Vocab | Tokens | Count |
123
  |-------|--------|-------|
124
+ | 8k | `▁το ▁λε ▁τορ ▁() ▁είναι ▁γαλλική ▁κοινότητα ▁στο ▁νομό ▁τησ ... (+15 more)` | 25 |
125
+ | 16k | `▁το ▁λε ▁τορ ▁() ▁είναι ▁γαλλική ▁κοινότητα ▁στο ▁νομό ▁τησ ... (+14 more)` | 24 |
126
+ | 32k | `▁το ▁λε ▁τορ ▁() ▁είναι ▁γαλλική ▁κοινότητα ▁στο ▁νομό ▁τησ ... (+13 more)` | 23 |
127
+ | 64k | `▁το ▁λε ▁τορ ▁() ▁είναι ▁γαλλική ▁κοινότητα ▁στο ▁νομό ▁τησ ... (+13 more)` | 23 |
128
 
129
 
130
  ### Key Findings
131
 
132
+ - **Best Compression:** 64k achieves 4.872x compression
133
+ - **Lowest UNK Rate:** 8k with 0.0471% unknown tokens
134
  - **Trade-off:** Larger vocabularies improve compression but increase model size
135
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
136
 
 
139
 
140
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
141
 
142
+ ![N-gram Unique](visualizations/ngram_unique.png)
143
+
144
  ![N-gram Coverage](visualizations/ngram_coverage.png)
145
 
146
  ### Results
147
 
148
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
149
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
150
+ | **2-gram** | Word | 254,029 | 17.95 | 2,414,487 | 7.3% | 17.4% |
151
+ | **2-gram** | Subword | 443 🏆 | 8.79 | 26,716 | 56.5% | 96.8% |
152
+ | **3-gram** | Word | 1,488,610 | 20.51 | 5,529,817 | 1.9% | 6.3% |
153
+ | **3-gram** | Subword | 3,933 | 11.94 | 250,216 | 24.2% | 59.6% |
154
+ | **4-gram** | Word | 3,845,615 | 21.87 | 9,144,193 | 1.3% | 3.9% |
155
+ | **4-gram** | Subword | 22,210 | 14.44 | 1,519,855 | 12.8% | 34.2% |
156
+ | **5-gram** | Word | 2,910,168 | 21.47 | 5,914,525 | 1.4% | 4.2% |
157
+ | **5-gram** | Subword | 87,887 | 16.42 | 5,267,290 | 7.2% | 20.9% |
158
 
159
  ### Top 5 N-grams by Size
160
 
161
+ **2-grams (Word):**
162
+
163
+ | Rank | N-gram | Count |
164
+ |------|--------|-------|
165
+ | 1 | `από το` | 323,213 |
166
+ | 2 | `από την` | 290,152 |
167
+ | 3 | `με την` | 252,647 |
168
+ | 4 | `από τον` | 241,108 |
169
+ | 5 | `για την` | 198,175 |
170
+
171
+ **3-grams (Word):**
172
+
173
+ | Rank | N-gram | Count |
174
+ |------|--------|-------|
175
+ | 1 | `κατά τη διάρκεια` | 71,561 |
176
+ | 2 | `παραπομπές εξωτερικοί σύνδεσμοι` | 62,539 |
177
+ | 3 | `τη διάρκεια της` | 34,723 |
178
+ | 4 | `για πρώτη φορά` | 29,480 |
179
+ | 5 | `σύμφωνα με την` | 25,173 |
180
+
181
+ **4-grams (Word):**
182
+
183
+ | Rank | N-gram | Count |
184
+ |------|--------|-------|
185
+ | 1 | `κατά τη διάρκεια της` | 32,537 |
186
+ | 2 | `από το έως το` | 20,094 |
187
+ | 3 | `κατά τη διάρκεια του` | 19,453 |
188
+ | 4 | `γαλλική κοινότητα στο νομό` | 16,152 |
189
+ | 5 | `είναι γαλλική κοινότητα στο` | 16,142 |
190
+
191
+ **5-grams (Word):**
192
+
193
+ | Rank | N-gram | Count |
194
+ |------|--------|-------|
195
+ | 1 | `είναι γαλλική κοινότητα στο νομό` | 16,142 |
196
+ | 2 | `γαλλική κοινότητα στο νομό της` | 10,798 |
197
+ | 3 | `σύμφωνα με την απογραφή του` | 8,977 |
198
+ | 4 | `προβλήματα οργανικής χημείας ν α` | 5,103 |
199
+ | 5 | `οργανικής χημείας ν α πετάση` | 5,103 |
200
+
201
+ **2-grams (Subword):**
202
+
203
+ | Rank | N-gram | Count |
204
+ |------|--------|-------|
205
+ | 1 | `ς _` | 20,530,109 |
206
+ | 2 | `_ τ` | 20,509,338 |
207
+ | 3 | `τ ο` | 15,006,596 |
208
+ | 4 | `ο υ` | 13,459,949 |
209
+ | 5 | `α _` | 12,791,705 |
210
+
211
+ **3-grams (Subword):**
212
 
213
  | Rank | N-gram | Count |
214
  |------|--------|-------|
215
+ | 1 | `_ τ ο` | 9,583,813 |
216
+ | 2 | `ο υ _` | 7,426,167 |
217
+ | 3 | `_ κ α` | 6,229,911 |
218
+ | 4 | `α ι _` | 5,946,159 |
219
+ | 5 | `_ τ η` | 5,812,762 |
220
 
221
+ **4-grams (Subword):**
222
 
223
  | Rank | N-gram | Count |
224
  |------|--------|-------|
225
+ | 1 | `_ τ ο υ` | 4,854,974 |
226
+ | 2 | `τ ο υ _` | 3,990,563 |
227
+ | 3 | `_ κ α ι` | 3,906,895 |
228
+ | 4 | `κ α ι _` | 3,870,183 |
229
+ | 5 | `_ τ ο _` | 3,120,828 |
230
 
231
+ **5-grams (Subword):**
232
 
233
  | Rank | N-gram | Count |
234
  |------|--------|-------|
235
+ | 1 | `_ κ α ι _` | 3,856,808 |
236
+ | 2 | `_ τ ο υ _` | 3,836,821 |
237
+ | 3 | `_ τ η ς _` | 2,888,245 |
238
+ | 4 | `_ τ η ν _` | 1,890,516 |
239
+ | 5 | `_ α π ό _` | 1,864,707 |
240
 
241
 
242
  ### Key Findings
243
 
244
+ - **Best Perplexity:** 2-gram (subword) with 443
245
  - **Entropy Trend:** Increases with larger n-grams (longer sequences are spread across more distinct patterns)
246
+ - **Coverage:** Top-1000 patterns cover ~21% of corpus
247
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
248
 
249
  ---
 
251
 
252
  ![Markov Entropy](visualizations/markov_entropy.png)
253
 
254
+ ![Markov Contexts](visualizations/markov_contexts.png)
255
+
256
  ![Markov Branching](visualizations/markov_branching.png)
257
 
258
  ### Results
259
 
260
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
261
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
262
+ | **1** | Word | 0.9344 | 1.911 | 11.28 | 2,374,710 | 6.6% |
263
+ | **1** | Subword | 1.0861 | 2.123 | 7.80 | 13,425 | 0.0% |
264
+ | **2** | Word | 0.4145 | 1.333 | 2.61 | 26,731,768 | 58.6% |
265
+ | **2** | Subword | 0.7185 | 1.645 | 5.31 | 104,621 | 28.2% |
266
+ | **3** | Word | 0.1946 | 1.144 | 1.46 | 69,637,387 | 80.5% |
267
+ | **3** | Subword | 0.8000 | 1.741 | 4.75 | 555,743 | 20.0% |
268
+ | **4** | Word | 0.0819 🏆 | 1.058 | 1.15 | 101,596,464 | 91.8% |
269
+ | **4** | Subword | 0.7130 | 1.639 | 3.67 | 2,639,831 | 28.7% |
270
+
271
+ ### Generated Text Samples (Word-based)
272
+
273
+ Below are text samples generated from each word-based Markov chain model:
274
+
275
+ **Context Size 1:**
276
+
277
+ 1. `του άγραφος νόμος και εκλογές κερδίζει το ο ν ευστρατίου κώστας καραπατής έλληνας αγωνιστής του οίκο...`
278
+ 2. `και βασανίστηκε σε αντίθεση με τον στρυμόνα ο βοναπάρτης κάλεσε σε κομματικό μάθημα φυκολογία harvey...`
279
+ 3. `το μπρύγκεν κάηκε τρεις πήχεις και τους τύπους κλειδώματος πολλές προσπάθειες ευχρηστίας υπηρετεί ως...`
280
+
281
+ **Context Size 2:**
282
+
283
+ 1. `από το πανί και τον βιότοπο της κέντρο είναι το δεύτερο όσκαρ β τέλεσε τη θεία της`
284
+ 2. `από την αστυνομία ενώ είναι διαθέσι��ο σε 409 αγώνες σκοράροντας 4 γκολ σε όλες τις έδρες δηλαδή`
285
+ 3. `με την οργάνωση και επέκταση των ορίων λειτουργίας των διαδικασιών η εταιρεία το δίκτυο αποχέτευσης ...`
286
+
287
+ **Context Size 3:**
288
+
289
+ 1. `κατά τη διάρκεια της οποίας προέτρεψε να παραδοθούν αφού πρωτύτερα συμφώνησαν να μην ενημερώσουν τον...`
290
+ 2. `παραπομπές εξωτερικοί σύνδεσμοι ψηφιακό αρχείο των δημοσιεύσεων του χ σάιμον με τα πλήρη ίσια μαλλιά...`
291
+ 3. `τη διάρκεια της βασιλείας του τσάρου πέτρου α τα ελεύθερα οικόπεδα αγοράστηκαν και το μια μεταλλική ...`
292
+
293
+ **Context Size 4:**
294
+
295
+ 1. `κατά τη διάρκεια της δεκαετίας του 20 τάφηκε μαζί με την σύζυγο του αυγούστα κόρτενεϋ 8 φεβρουαρίου ...`
296
+ 2. `από το έως το με εξαίρεση εκείνες του μετά την έξωση του όθωνα κατά τη διάρκεια των φιλορωσικών ανατ...`
297
+ 3. `κατά τη διάρκεια του χειμώνα μεταξύ της τελευταίας κυριακής του οκτωβρίου μέχρι τη 1 00 utc της τελε...`
298
 
 
299
 
300
+ ### Generated Text Samples (Subword-based)
301
+
302
+ Below are text samples generated from each subword-based Markov chain model:
303
 
304
  **Context Size 1:**
305
 
306
+ 1. `_ικαθεσυπν_μμε_a`
307
+ 2. `ας._ησο_πόδύπίαι`
308
+ 3. `ούν_πού_κόπρες_ό`
309
 
310
  **Context Size 2:**
311
 
312
+ 1. `ς_εξε_μος_αντρώτο`
313
+ 2. `_τη_για_ήταχματην`
314
+ 3. `το_ναι_από_τοντις`
315
 
316
  **Context Size 3:**
317
 
318
+ 1. `_του_αναλίαρχές_αλ`
319
+ 2. `ου_έγκροτεχνολούν_`
320
+ 3. `_καιρισμοι_/σεβαιω`
321
 
322
  **Context Size 4:**
323
 
324
+ 1. `_τους_χρήση_ο_πτερύ`
325
+ 2. `του_της_ανακάλυψη_σ`
326
+ 3. `_και_τους_δικτίνας_`
327
 
328
 
329
  ### Key Findings
330
 
331
+ - **Best Predictability:** Context-4 (word) with 91.8% predictability
332
  - **Branching Factor:** Decreases with context size (more deterministic)
333
+ - **Memory Trade-off:** Larger contexts require more storage (2,639,831 contexts)
334
  - **Recommendation:** Context-3 or Context-4 for text generation
335
 
336
  ---
 
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
+ | Vocabulary Size | 1,039,940 |
350
+ | Total Tokens | 132,061,031 |
351
+ | Mean Frequency | 126.99 |
352
+ | Median Frequency | 4 |
353
+ | Frequency Std Dev | 9123.56 |
354
 
355
  ### Most Common Words
356
 
357
  | Rank | Word | Frequency |
358
  |------|------|-----------|
359
+ | 1 | του | 4,095,731 |
360
+ | 2 | και | 3,886,615 |
361
+ | 3 | το | 3,228,440 |
362
+ | 4 | της | 2,987,569 |
363
+ | 5 | η | 1,958,228 |
364
+ | 6 | την | 1,895,055 |
365
+ | 7 | από | 1,882,149 |
366
+ | 8 | ο | 1,862,872 |
367
+ | 9 | με | 1,655,296 |
368
+ | 10 | τον | 1,304,224 |
369
 
370
  ### Least Common Words (from vocabulary)
371
 
372
  | Rank | Word | Frequency |
373
  |------|------|-----------|
374
+ | 1 | ωσμωπροστατευτικά | 2 |
375
+ | 2 | ορμπέκη | 2 |
376
+ | 3 | hidronor | 2 |
377
+ | 4 | jpp | 2 |
378
+ | 5 | liebrand | 2 |
379
+ | 6 | οϊρατσουμέ | 2 |
380
+ | 7 | χασιχίτο | 2 |
381
+ | 8 | σεϊσι | 2 |
382
+ | 9 | τακατσουκασά | 2 |
383
+ | 10 | κατσιρέλο | 2 |
384
 
385
  ### Zipf's Law Analysis
386
 
387
  | Metric | Value |
388
  |--------|-------|
389
+ | Zipf Coefficient | 0.9498 |
390
+ | R² (Goodness of Fit) | 0.997066 |
391
  | Adherence Quality | **excellent** |
392
 
393
  ### Coverage Analysis
394
 
395
  | Top N Words | Coverage |
396
  |-------------|----------|
397
+ | Top 100 | 38.6% |
398
+ | Top 1,000 | 55.9% |
399
+ | Top 5,000 | 71.4% |
400
  | Top 10,000 | 78.0% |
401
 
402
  ### Key Findings
403
 
404
+ - **Zipf Compliance:** R²=0.9971 indicates excellent adherence to Zipf's law
405
+ - **High Frequency Dominance:** Top 100 words cover 38.6% of corpus
406
+ - **Long Tail:** 1,029,940 words needed for remaining 22.0% coverage
407
 
408
  ---
409
  ## 5. Word Embeddings Evaluation
 
416
 
417
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
418
 
 
419
 
420
+ ### 5.1 Cross-Lingual Alignment
421
+
422
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
423
+
424
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
425
+
426
+
427
+ ### 5.2 Model Comparison
428
+
429
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
430
+ |-------|-----------|----------|------------------|---------------|----------------|
431
+ | **mono_32d** | 32 | 0.8028 | 0.3648 | N/A | N/A |
432
+ | **mono_64d** | 64 | 0.7821 | 0.3021 | N/A | N/A |
433
+ | **mono_128d** | 128 | 0.7303 | 0.2408 | N/A | N/A |
434
+ | **aligned_32d** | 32 | 0.8028 🏆 | 0.3775 | 0.2640 | 0.6820 |
435
+ | **aligned_64d** | 64 | 0.7821 | 0.2965 | 0.4780 | 0.8720 |
436
+ | **aligned_128d** | 128 | 0.7303 | 0.2330 | 0.6560 | 0.9100 |
437
 
438
  ### Key Findings
439
 
440
+ - **Best Isotropy:** aligned_32d with 0.8028 (more uniform distribution)
441
+ - **Semantic Density:** Average pairwise similarity of 0.3025. Lower values indicate better semantic separation.
442
+ - **Alignment Quality:** Aligned models achieve up to 65.6% R@1 in cross-lingual retrieval.
443
+ - **Recommendation:** 128d aligned for best cross-lingual performance
444
 
445
  ---
446
+ ## 6. Morphological Analysis (Experimental)
447
+
448
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
449
+
450
+ ### 6.1 Productivity & Complexity
451
+
452
+ | Metric | Value | Interpretation | Recommendation |
453
+ |--------|-------|----------------|----------------|
454
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
455
+ | Idiomaticity Gap | **-0.798** | Low formulaic content | - |
456
+
457
+ ### 6.2 Affix Inventory (Productive Units)
458
+
459
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
460
+
461
+ #### Productive Prefixes
462
+ | Prefix | Examples |
463
+ |--------|----------|
464
+ | `α-` | αβρανσάν, απόχρεμψη, αποφέροντάς |
465
+ | `σ-` | συνειδητοποιήσετε, στίβενσον, σπειροτόμησης |
466
+ | `a-` | ayodhya, addicted, apocolo |
467
+ | `s-` | superdome, sembrich, sibling |
468
+ | `κ-` | κίτσεβο, κλειδώνω, κινοσάκι |
469
+ | `κα-` | καριστάνιου, κασιγουαμπάρα, καλλιρροη |
470
+ | `ε-` | ελληνοαλβανικών, επανεξετάζει, ενοργάνιση |
471
+ | `μ-` | μάστερινγκ, μεθυλοβουτανονιτρίλιοασκήσεις, μπαλάφα |
472
+
473
+ #### Productive Suffixes
474
+ | Suffix | Examples |
475
+ |--------|----------|
476
+ | `-ς` | νεπαλέζους, 125ος, μεθυλοβουτανονιτρίλιοασκήσεις |
477
+ | `-ν` | ελληνοαλβανικών, νταγκάν, αβρανσάν |
478
+ | `-α` | οκτωβρίουεφημερίδα, προσωπίδα, τζιτζιμπίρα |
479
+ | `-ι` | χότζι, φρύξουσι, υπονομεύεται |
480
+ | `-ος` | 125ος, φιλαθλος, μπατιστάτος |
481
+ | `-ο` | ζηρίνειο, κίτσεβο, ριβονουκλεοτίδιο |
482
+ | `-ου` | καριστάνιου, ατταβύρου, βερεγγάριου |
483
+ | `-ης` | φαρέλης, απόρθητης, σπειροτόμησης |
484
+
485
+ ### 6.3 Bound Stems (Lexical Roots)
486
+
487
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
488
+
489
+ | Stem | Cohesion | Substitutability | Examples |
490
+ |------|----------|------------------|----------|
491
+ | `ικών` | 2.20x | 163 contexts | δικών, νικών, οικών |
492
+ | `ικής` | 2.14x | 156 contexts | ιικής, τικής, πικής |
493
+ | `ότητ` | 2.07x | 175 contexts | κότητα, νότητα, ἑνότητα |
494
+ | `ικές` | 1.96x | 135 contexts | νικές, μικές, δικές |
495
+ | `ιστι` | 1.52x | 338 contexts | μιστι, ιστική, πιστιν |
496
+ | `ατος` | 1.90x | 92 contexts | ματος, αίατος, υπατος |
497
+ | `ανικ` | 1.44x | 370 contexts | δανικα, δανικό, μανικά |
498
+ | `ήθηκ` | 1.93x | 81 contexts | ψήθηκε, λήθηκε, μυήθηκε |
499
+ | `ολογ` | 1.40x | 399 contexts | ολογρ, υπολογ, οδολογ |
500
+ | `πίση` | 2.06x | 48 contexts | πίσης, επίση, έπίσης |
501
+ | `ατικ` | 1.38x | 317 contexts | ατικέ, ατικά, φατική |
502
+ | `οποι` | 1.45x | 200 contexts | τοποι, οποιά, οποιο |
503
+
504
+ ### 6.4 Affix Compatibility (Co-occurrence)
505
+
506
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
507
+
508
+ | Prefix | Suffix | Frequency | Examples |
509
+ |--------|--------|-----------|----------|
510
+ | `α-` | `-ς` | 188 words | αφηγησεις, ανύπανδρους |
511
+ | `κ-` | `-ς` | 153 words | καλλιοντζής, κωστούλης |
512
+ | `σ-` | `-ς` | 127 words | στηις, σοβαρώς |
513
+ | `ε-` | `-ς` | 116 words | ενελικτικός, επιμορφωτικούς |
514
+ | `μ-` | `-ς` | 110 words | μεταξάςπρωταγωνιστικός, μπούσεβιτς |
515
+ | `α-` | `-ν` | 104 words | αιτωλίαν, απονεμηθέν |
516
+ | `κ-` | `-ν` | 68 words | κηρύκειον, κατακάηκαν |
517
+ | `μ-` | `-ν` | 65 words | μπιέγκαν, μεταβλητών |
518
+ | `ε-` | `-ν` | 65 words | εξεπόνησαν, ερείπωσαν |
519
+ | `α-` | `-α` | 65 words | αυτοκρατόρισσα, αυστραλια |
520
+
521
+ ### 6.5 Recursive Morpheme Segmentation
522
+
523
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
524
+
525
+ | Word | Suggested Split | Confidence | Stem |
526
+ |------|-----------------|------------|------|
527
+ | έτοςχιόνι | **`έτοςχιό-ν-ι`** | 7.5 | `ν` |
528
+ | περισσεία | **`περισσ-ε-ία`** | 7.5 | `ε` |
529
+ | αἰγινήτου | **`αἰγινή-τ-ου`** | 7.5 | `τ` |
530
+ | αντιψυχωσικών | **`αντιψυχωσι-κ-ών`** | 7.5 | `κ` |
531
+ | λανγκλουά | **`λανγκλ-ου-ά`** | 6.0 | `λανγκλ` |
532
+ | μπουνάκιας | **`μπουνάκ-ια-ς`** | 6.0 | `μπουνάκ` |
533
+ | γιαλούρης | **`γιαλούρη-ς`** | 4.5 | `γιαλούρη` |
534
+ | εφαρμόζεις | **`εφαρμόζει-ς`** | 4.5 | `εφαρμόζει` |
535
+ | internationalοι | **`international-οι`** | 4.5 | `international` |
536
+ | λοξότητας | **`λοξότητα-ς`** | 4.5 | `λοξότητα` |
537
+ | δομινικανικής | **`δομινικανική-ς`** | 4.5 | `δομινικανική` |
538
+ | aθλητικός | **`aθλητικό-ς`** | 4.5 | `aθλητικό` |
539
+ | επηρεασμένης | **`επηρεασμένη-ς`** | 4.5 | `επηρεασμένη` |
540
+ | σελτζουκικός | **`σελτζουκικό-ς`** | 4.5 | `σελτζουκικό` |
541
+ | modernisme | **`modernism-e`** | 4.5 | `modernism` |
542
+
543
+ ### 6.6 Linguistic Interpretation
544
+
545
+ > **Automated Insight:**
546
+ Greek shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
547
+
548
+ ---
549
+ ## 7. Summary & Recommendations
550
 
551
  ![Performance Dashboard](visualizations/performance_dashboard.png)
552
 
 
554
 
555
  | Component | Recommended | Rationale |
556
  |-----------|-------------|-----------|
557
+ | Tokenizer | **64k BPE** | Best compression (4.87x) |
558
+ | N-gram | **2-gram** | Lowest perplexity (443) |
559
+ | Markov | **Context-4** | Highest predictability (91.8%) |
560
  | Embeddings | **64d** | Balanced semantic capture and isotropy |
561
 
562
+
563
  ---
564
  ## Appendix: Metrics Glossary & Interpretation Guide
565
 
 
749
  author = {Kamali, Omar},
750
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
751
  year = {2025},
752
+ doi = {10.5281/zenodo.18073153},
753
+ publisher = {Zenodo},
754
  url = {https://huggingface.co/wikilangs},
755
  institution = {Omneity Labs}
756
  }
 
766
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
767
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
768
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
769
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
770
  ---
771
  *Generated by Wikilangs Models Pipeline*
772
 
773
+ *Report Date: 2026-01-10 02:57:50*
el_morph_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/embeddings/aligned/el_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d24aae5f3f717e11d4b52c6ebad13272a0d0f48d2e1eed941416d87a645abfb0
3
+ size 1870546573
models/embeddings/aligned/el_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "el", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/el_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9e9e5993fb99be0a078366fb1dded843fcd1a68273dfa3f681ebf5251e18044
3
+ size 65664
models/embeddings/aligned/el_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "el",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 152614,
7
+ "vocab_size": 807278
8
+ }
models/embeddings/aligned/el_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bc69792f7354a7ef215cbf36d11a33ff2e6875f8795dc6271012465359a0cda
3
+ size 482557069
models/embeddings/aligned/el_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "el", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/el_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42c00c01fc9937727eeda13b4f9f3606dec2ffd1ff0082dd8d2ef01dcb56620e
3
+ size 4224
models/embeddings/aligned/el_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "el",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 152614,
7
+ "vocab_size": 807278
8
+ }
models/embeddings/aligned/el_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d38b9bcf77d3285aef482f1f8541fa0f6edb75ea58c872d42cc26f4a32e8129f
3
+ size 945220237
models/embeddings/aligned/el_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "el", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/el_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ebae6e8843855cfb482cad1e12a89aa9a91a21ff7d34066c82fc1a12534a202
3
+ size 16512
models/embeddings/aligned/el_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "el",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 152614,
7
+ "vocab_size": 807278
8
+ }
models/embeddings/monolingual/el_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9a1d229d45bd9db0b0ef58dd3caf61ea3619feddc6954305e81badb48783424
3
- size 1923977468
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d24aae5f3f717e11d4b52c6ebad13272a0d0f48d2e1eed941416d87a645abfb0
3
+ size 1870546573
models/embeddings/monolingual/el_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 128,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 858240
13
  }
 
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 128
13
  },
14
+ "vocab_size": 807278
15
  }
models/embeddings/monolingual/el_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:397c614ce2b05e40c8a041db110067279bfb45dd294052c4c228bf71f464fc87
3
- size 496849148
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bc69792f7354a7ef215cbf36d11a33ff2e6875f8795dc6271012465359a0cda
3
+ size 482557069
models/embeddings/monolingual/el_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 32,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 858240
13
  }
 
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 32
13
  },
14
+ "vocab_size": 807278
15
  }
models/embeddings/monolingual/el_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8203ff28c10d54380c8d52501c044d8267880b0d08c5763c9243ecf0e05ccbb
3
- size 972558588
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d38b9bcf77d3285aef482f1f8541fa0f6edb75ea58c872d42cc26f4a32e8129f
3
+ size 945220237
models/embeddings/monolingual/el_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 64,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 858240
13
  }
 
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 64
13
  },
14
+ "vocab_size": 807278
15
  }
models/subword_markov/el_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d43118910abbd51bce30b7b4085d172d50d8287e4fe0f2b8997c362a264bd36e
3
- size 747707
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c3d3f919a8e38fa63d36d388598bb5ec35089b6abe5af71a5b6031f4c654d34
3
+ size 738791
models/subword_markov/el_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "el",
5
- "unique_contexts": 10325,
6
- "total_transitions": 972740684
7
  }
 
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "el",
5
+ "unique_contexts": 13425,
6
+ "total_transitions": 893169176
7
  }
models/subword_markov/el_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75f929295791c60a292cd63d2df2bbc8b1347d8ec2b99e05a7a68704190f423e
3
- size 5548801
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b49d89797c1795c4f266b46de7242241abb24c771473be8650e3f5499290b702
3
+ size 4364987
models/subword_markov/el_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "el",
5
- "unique_contexts": 115568,
6
- "total_transitions": 972478877
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "el",
5
+ "unique_contexts": 104621,
6
+ "total_transitions": 892907548
7
  }
models/subword_markov/el_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9582048424f6cd487ab966c5ef6a8d135702840b29ebbb370baefc7455be2256
3
- size 27924193
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a38013ef8bdf59eb4a5305ed7af4962628f5b027d82439b5d2df043ecf64035
3
+ size 20346763
models/subword_markov/el_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "el",
5
- "unique_contexts": 709839,
6
- "total_transitions": 972217070
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "el",
5
+ "unique_contexts": 555743,
6
+ "total_transitions": 892645920
7
  }
models/subword_markov/el_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:10071d63decd48666e7bb2e50c066b3f510a119574d202e7a433377c893f1bbe
3
- size 113514704
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c565660c99264651a752472b234f2e6a5baeb990079c89e30892b8ea2ee6e4c
3
+ size 81307991
models/subword_markov/el_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "el",
5
- "unique_contexts": 3835999,
6
- "total_transitions": 971955263
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "el",
5
+ "unique_contexts": 2639831,
6
+ "total_transitions": 892384292
7
  }
models/subword_ngram/el_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dce733cf11ef4a3c20fc3f75fdfa7741ded5bcfda6477587aabaf5141081c5ad
3
- size 445348
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29710a4ed8a273e7683c18190c329676ec5055e3a5c1d0acfdb70b3e1c60f3e8
3
+ size 375274
models/subword_ngram/el_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "el",
5
- "unique_ngrams": 31823,
6
- "total_ngrams": 972740684
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "el",
5
+ "unique_ngrams": 26716,
6
+ "total_ngrams": 893169176
7
  }
models/subword_ngram/el_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:360894419b4d87b279cef9bdf0a456304b82bb7ec2f94f3a5f46fbebfc9ba1c7
3
- size 4237331
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88ec61a4687c86fe02551eb6e1cd24a41d0bf2663422331eff899d4d3b0859ef
3
+ size 3212153
models/subword_ngram/el_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "el",
5
- "unique_ngrams": 338020,
6
- "total_ngrams": 972478877
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "el",
5
+ "unique_ngrams": 250216,
6
+ "total_ngrams": 892907548
7
  }
models/subword_ngram/el_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c47b9d545847a519cfdcb9d5416ef26218017b555c56e1b7e2fb6cd06e36959a
3
- size 28199306
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c2a95caf50ea0244b4d6ad6a58ef0d4097f721649976efd100116044477605a
3
+ size 19937899
models/subword_ngram/el_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "el",
5
- "unique_ngrams": 2196788,
6
- "total_ngrams": 972217070
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "el",
5
+ "unique_ngrams": 1519855,
6
+ "total_ngrams": 892645920
7
  }
models/subword_ngram/el_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b959c76f30a16bce0c3ca0c7c293006542ab75514b6ae80db511ecfaf1a35ab
3
+ size 72561386
models/subword_ngram/el_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "el",
5
+ "unique_ngrams": 5267290,
6
+ "total_ngrams": 892384292
7
+ }
models/tokenizer/el_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee24fef1dbf383947f73c7391363b20e4e04a2d3b368ac7ed9732a24230ff48e
3
- size 577771
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bd22c02b5d99c61a00c3c7850be0586b48ed713358f09d422d66015eea661bc
3
+ size 576910
models/tokenizer/el_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/el_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4ec50a3c46f80d93c0b5fe85aedead911111d3532f998fcbf635a1279ed1f7fb
3
- size 945435
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e404d8bb694c55dac892595e51ea1e17be3aa2732fe1b053b35a59478c72fcc0
3
+ size 936827
models/tokenizer/el_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/el_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2c4b56d5c304be93e252df2515b0d937c55cffddb067598ed9ee890b95a2bb05
3
- size 1704150
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e73f1f2c6a5cb2df1246fe4ef4a75b51b8249ae1992fc659348638af39aedd9
3
+ size 1679605
models/tokenizer/el_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/el_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c8c2fa0a692569bec407b66ef8151613687857acda815dfb8e2e4f1a06994ac
3
- size 402798
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8b06722ff78d5311e781f629e61e35ae2aba248d3ad27242513c12e05237c19
3
+ size 403152
models/tokenizer/el_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/el_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0737b68d2c2836bbc01e785bb01bd14d2b86664af442ee49008b4acfce0dee2b
3
- size 17050356
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:483fbd106beda1141b0a2387770005c497e22d0dba6bcc1b7b8687668e19e4a5
3
+ size 17790677
models/vocabulary/el_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "language": "el",
3
- "vocabulary_size": 1000000,
 
4
  "statistics": {
5
- "type_token_ratio": 0.021967606730078443,
6
  "coverage": {
7
- "top_100": 0.3645029680593136,
8
- "top_1000": 0.5437770227902385,
9
- "top_5000": 0.7010292403499135,
10
- "top_10000": 0.7670692713472435
11
  },
12
- "hapax_count": 1979361,
13
- "hapax_ratio": 0.6267777498064436,
14
- "total_documents": 261807
15
  }
16
  }
 
1
  {
2
  "language": "el",
3
+ "vocabulary_size": 1039940,
4
+ "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.01786437503595743,
7
  "coverage": {
8
+ "top_100": 0.38223912989295133,
9
+ "top_1000": 0.5537963832118573,
10
+ "top_5000": 0.7065002302212579,
11
+ "top_10000": 0.772519756207213
12
  },
13
+ "hapax_count": 1343244,
14
+ "hapax_ratio": 0.5636341969398921,
15
+ "total_documents": 261628
16
  }
17
  }
models/vocabulary/el_vocabulary_top.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7736e0a29e44411a7455de6d7adb61fbd4f8e5d189ffa70db773bb07ea9ac2da
3
+ size 17131678
models/vocabulary/el_vocabulary_top_metadata.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "el",
3
+ "vocabulary_size": 1000000,
4
+ "variant": "top",
5
+ "statistics": {
6
+ "type_token_ratio": 0.01786437503595743,
7
+ "coverage": {
8
+ "top_100": 0.38223912989295133,
9
+ "top_1000": 0.5537963832118573,
10
+ "top_5000": 0.7065002302212579,
11
+ "top_10000": 0.772519756207213
12
+ },
13
+ "hapax_count": 1343244,
14
+ "hapax_ratio": 0.5636341969398921,
15
+ "total_documents": 261628,
16
+ "top_vocab_size": 1000000,
17
+ "coverage_ratio": 0.9893322459119095,
18
+ "tokens_excluded": 39940
19
+ }
20
+ }
models/word_markov/el_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c096917e95f279ae5eb42278332bcbf2f029af072a217bcbe0094f2229fc768a
3
- size 304857111
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48415cd7ae5de8bc3bf2222c7fc19297bd30fc3e5d904c8b9743712e21bd2245
3
+ size 317379224