omarkamali committed (verified)
Commit b2b9a92 · Parent: e91ecf9

Upload all models and assets for din (latest)

This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.

Files changed (50):
  1. .gitattributes +1 -0
  2. README.md +310 -126
  3. models/embeddings/aligned/din_128d.bin +3 -0
  4. models/embeddings/aligned/din_128d.meta.json +1 -0
  5. models/embeddings/aligned/din_128d.projection.npy +3 -0
  6. models/embeddings/aligned/din_128d_metadata.json +8 -0
  7. models/embeddings/aligned/din_32d.bin +3 -0
  8. models/embeddings/aligned/din_32d.meta.json +1 -0
  9. models/embeddings/aligned/din_32d.projection.npy +3 -0
  10. models/embeddings/aligned/din_32d_metadata.json +8 -0
  11. models/embeddings/aligned/din_64d.bin +3 -0
  12. models/embeddings/aligned/din_64d.meta.json +1 -0
  13. models/embeddings/aligned/din_64d.projection.npy +3 -0
  14. models/embeddings/aligned/din_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/din_128d.bin +2 -2
  16. models/embeddings/monolingual/din_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/din_32d.bin +2 -2
  18. models/embeddings/monolingual/din_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/din_64d.bin +2 -2
  20. models/embeddings/monolingual/din_64d_metadata.json +5 -3
  21. models/subword_markov/din_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/din_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/din_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/din_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/din_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/din_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/din_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/din_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/din_2gram_subword.parquet +2 -2
  30. models/subword_ngram/din_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/din_3gram_subword.parquet +2 -2
  32. models/subword_ngram/din_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/din_4gram_subword.parquet +2 -2
  34. models/subword_ngram/din_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/din_5gram_subword.parquet +3 -0
  36. models/subword_ngram/din_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/din_tokenizer_16k.model +2 -2
  38. models/tokenizer/din_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/din_tokenizer_32k.model +2 -2
  40. models/tokenizer/din_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/din_tokenizer_8k.model +2 -2
  42. models/tokenizer/din_tokenizer_8k.vocab +0 -0
  43. models/vocabulary/din_vocabulary.parquet +2 -2
  44. models/vocabulary/din_vocabulary_metadata.json +10 -9
  45. models/word_markov/din_markov_ctx1_word.parquet +2 -2
  46. models/word_markov/din_markov_ctx1_word_metadata.json +2 -2
  47. models/word_markov/din_markov_ctx2_word.parquet +2 -2
  48. models/word_markov/din_markov_ctx2_word_metadata.json +2 -2
  49. models/word_markov/din_markov_ctx3_word.parquet +2 -2
  50. models/word_markov/din_markov_ctx3_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 language: din
-language_name: DIN
+language_name: Dinka
 language_family: african_nilotic
 tags:
 - wikilangs
@@ -10,11 +10,21 @@ tags:
 - n-gram
 - markov
 - wikipedia
+- feature-extraction
+- sentence-similarity
+- tokenization
+- n-grams
+- markov-chain
+- text-mining
+- fasttext
+- babelvec
+- vocabulous
+- vocabulary
 - monolingual
 - family-african_nilotic
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
+pipeline_tag: text-generation
 datasets:
 - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
 - name: best_compression_ratio
   type: compression
-  value: 4.266
+  value: 4.248
 - name: best_isotropy
   type: isotropy
-  value: 0.1273
+  value: 0.2108
 - name: vocabulary_size
   type: vocab
-  value: 5872
-generated: 2025-12-30
+  value: 0
+generated: 2026-01-04
 ---
 
-# DIN - Wikilangs Models
+# Dinka - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
 
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **DIN** Wikipedia data.
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Dinka** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 
 ## 📋 Repository Contents
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
+
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 
 ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 
@@ -68,50 +80,53 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
+
 ### Results
 
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.627x | 3.61 | 0.1100% | 146,415 |
-| **16k** | 3.922x | 3.91 | 0.1189% | 135,412 |
-| **32k** | 4.266x 🏆 | 4.25 | 0.1293% | 124,488 |
+| **8k** | 3.696x | 3.70 | 1.0395% | 137,657 |
+| **16k** | 3.984x | 3.99 | 1.1206% | 127,694 |
+| **32k** | 4.248x 🏆 | 4.25 | 1.1949% | 119,761 |
 
 ### Tokenization Examples
 
 Below are sample sentences tokenized with each vocabulary size:
 
-**Sample 1:** `Paankɔc Ciɛl de Libya ee paan thiɔ̈ɔ̈k thïn Apirïka ciɛlic. Genamaatnhomde ay...`
+**Sample 1:** `Ukraine ee paan en Yurop Penëdhiäk ee Volodymyr Zelensky. Genamaatnhomde ayee cɔ...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁paankɔcciɛldelibyaeepaanthiɔ̈ɔ̈k ▁thïn ▁apirïka ... (+11 more)` | 21 |
-| 16k | `▁paankɔcciɛldelibyaeepaanthiɔ̈ɔ̈k ▁thïnapirïka ... (+11 more)` | 21 |
-| 32k | `▁paankɔcciɛldelibyaeepaanthiɔ̈ɔ̈kthïn ▁apirïka ... (+10 more)` | 20 |
+| 8k | `▁ukraineeepaanenyuroppenëdhiäkeev ol od ... (+15 more)` | 25 |
+| 16k | `▁ukraineeepaanenyuroppenëdhiäkeev olodymyrzelensky ... (+8 more)` | 18 |
+| 32k | `▁ukraineeepaanenyuroppenëdhiäkeevolodymyrzelensky . ... (+5 more)` | 15 |
 
-**Sample 2:** `Heen acï puööu miet apeidït ne rin cïï ok rot mat thääi pinynhom yiic.
-
-Piööc k...`
+**Sample 2:** `Monteaguila ee gendït Chile. Cinëkɔcde aa tëcit ruonic`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁h eenacïpu öö u ▁mi et apei dït ... (+21 more)` | 31 |
-| 16k | `▁heen ▁acï ▁pu öö umi et apeidïtnerin ... (+18 more)` | 28 |
-| 32k | `▁heenacïpuööumiet ▁apeidïtnerincïïok ▁rot ... (+15 more)` | 25 |
+| 8k | `▁mon te agu ila eegendït ▁ch ile .cinëkɔcde ... (+3 more)` | 13 |
+| 16k | `▁mon te agu ila ▁eegendïtchile . cinëkɔcdeaa ... (+2 more)` | 12 |
+| 32k | `▁monteaguilaeegendïtchile .cinëkɔcdeaatëcitruonic` | 9 |
 
-**Sample 3:** `+Japan 125px 135px 300px
-Japan ee pamac tɔ Athiɛ. Genamaatnhomde ayee cɔl Tokyo...`
+**Sample 3:** `Dhambia ee Apirïka. Genamaatnhomde ayee cɔl Lusaka.`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁+ j apan 1 2 5 px1 ... (+23 more)` | 33 |
-| 16k | `▁+ japan1 2 5 px1 3 ... (+21 more)` | 31 |
-| 32k | `▁+ japan1 2 5 px1 3 ... (+21 more)` | 31 |
+| 8k | `▁dhambia ▁eeapirïka . ▁genamaatnhomde ▁ayee ▁cɔllu sak a ... (+1 more)` | 11 |
+| 16k | `▁dhambia ▁eeapirïka . ▁genamaatnhomde ▁ayee ▁cɔllusaka .` | 9 |
+| 32k | `▁dhambia ▁eeapirïka . ▁genamaatnhomde ▁ayee ▁cɔllusaka .` | 9 |
 
 ### Key Findings
 
-- **Best Compression:** 32k achieves 4.266x compression
-- **Lowest UNK Rate:** 8k with 0.1100% unknown tokens
+- **Best Compression:** 32k achieves 4.248x compression
+- **Lowest UNK Rate:** 8k with 1.0395% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
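
The `.model`/`.vocab` pairs under `models/tokenizer/` follow SentencePiece's file layout, so the compression figures above can be spot-checked. A minimal sketch, assuming the files really are standard SentencePiece models and that "compression" means characters per token (both assumptions, not documented in this diff):

```python
# Sketch: load the 32k tokenizer and estimate compression on a sample.
# Assumes SentencePiece format (suggested by the .model/.vocab naming);
# "chars per token" as the compression definition is also an assumption.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/din_tokenizer_32k.model")

text = "Dhambia ee Apirïka. Genamaatnhomde ayee cɔl Lusaka."  # Sample 3 above
pieces = sp.encode(text, out_type=str)
print(pieces)
print(f"compression ~ {len(text) / len(pieces):.2f} chars/token")
```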
@@ -120,57 +135,111 @@ Japan ee pamac tɔ Athiɛ. Genamaatnhomde ayee cɔl Tokyo...`
 
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 
+![N-gram Unique](visualizations/ngram_unique.png)
+
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 
 ### Results
 
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 969 🏆 | 9.92 | 2,614 | 42.8% | 79.5% |
-| **2-gram** | 353 🏆 | 8.46 | 1,719 | 60.1% | 98.7% |
-| **3-gram** | 958 | 9.90 | 2,667 | 44.4% | 78.1% |
-| **3-gram** | 2,358 | 11.20 | 10,075 | 24.1% | 70.3% |
-| **4-gram** | 1,138 | 10.15 | 3,607 | 44.8% | 70.6% |
-| **4-gram** | 9,037 | 13.14 | 33,136 | 12.8% | 42.8% |
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 846 | 9.72 | 1,522 | 38.9% | 86.3% |
+| **2-gram** | Subword | 328 | 8.36 | 1,563 | 62.0% | 99.1% |
+| **3-gram** | Word | 240 | 7.90 | 785 | 62.9% | 100.0% |
+| **3-gram** | Subword | 2,240 | 11.13 | 9,446 | 25.3% | 71.0% |
+| **4-gram** | Word | 166 | 7.38 | 882 | 69.6% | 100.0% |
+| **4-gram** | Subword | 8,823 | 13.11 | 31,591 | 13.0% | 43.0% |
+| **5-gram** | Word | 59 🏆 | 5.89 | 373 | 86.5% | 100.0% |
+| **5-gram** | Subword | 18,719 | 14.19 | 51,151 | 8.6% | 31.8% |
 
 ### Top 5 N-grams by Size
 
-**2-grams:**
+**2-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `glossary derived` | 167 |
+| 2 | `derived from` | 167 |
+| 3 | `from sil` | 167 |
+| 4 | `sil internationals` | 167 |
+| 5 | `internationals draft` | 167 |
+
+**3-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `internationals draft dinka` | 167 |
+| 2 | `from sil internationals` | 167 |
+| 3 | `derived from sil` | 167 |
+| 4 | `dinka glossary derived` | 167 |
+| 5 | `educational foundation sil` | 167 |
+
+**4-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `english to dinka glossary` | 167 |
+| 2 | `to dinka glossary derived` | 167 |
+| 3 | `dinka glossary derived from` | 167 |
+| 4 | `glossary derived from sil` | 167 |
+| 5 | `from sil internationals draft` | 167 |
+
+**5-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `, ku` | 881 |
-| 2 | n` | 812 |
-| 3 | k` | 805 |
-| 4 | ̈` | 709 |
-| 5 | ɔ` | 708 |
+| 1 | `dinka glossary derived from sil` | 167 |
+| 2 | `williamson educational foundation sil international` | 167 |
+| 3 | `kay williamson educational foundation sil` | 167 |
+| 4 | `dictionary kay williamson educational foundation` | 167 |
+| 5 | `english dictionary kay williamson educational` | 167 |
 
-**3-grams:**
+**2-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | ɔ ̈` | 704 |
-| 2 | ɛ ̈` | 526 |
-| 3 | ` ̈ k` | 316 |
-| 4 | `bɛ ̈ n` | 233 |
-| 5 | `. bekätakthook :` | 223 |
+| 1 | `_ k` | 14,243 |
+| 2 | `e _` | 10,060 |
+| 3 | `_ a` | 9,948 |
+| 4 | _` | 8,555 |
+| 5 | `n _` | 7,924 |
 
-**4-grams:**
+**3-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `. 2006 . english` | 167 |
-| 2 | `blench . 2006 .` | 167 |
-| 3 | `2006 . english to` | 167 |
-| 4 | `: derived from sil` | 167 |
-| 5 | `. kay williamson educational` | 167 |
+| 1 | `_ k u` | 4,510 |
+| 2 | `n ë _` | 3,923 |
+| 3 | `k u _` | 3,559 |
+| 4 | `_ k e` | 3,459 |
+| 5 | `_ t h` | 3,193 |
+
+**4-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ k u _` | 3,514 |
+| 2 | `_ n ë _` | 2,762 |
+| 3 | `_ d e _` | 2,147 |
+| 4 | `_ k e _` | 1,756 |
+| 5 | `_ y e _` | 1,452 |
+
+**5-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ k ɔ c _` | 1,091 |
+| 2 | `, _ k u _` | 836 |
+| 3 | `_ y e n _` | 729 |
+| 4 | `a t i o n` | 718 |
+| 5 | `t i o n a` | 686 |
 
 ### Key Findings
 
-- **Best Perplexity:** 2-gram with 353
+- **Best Perplexity:** 5-gram (word) with 59
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~43% of corpus
+- **Coverage:** Top-1000 patterns cover ~32% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 
 ---
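
The Perplexity and Entropy columns in the results above are two views of the same quantity: perplexity is 2 raised to the entropy in bits (2^5.89 ≈ 59 for the word 5-gram). A minimal sketch over a toy count table:

```python
# Sketch: Shannon entropy (bits) and perplexity of an n-gram distribution.
# The toy counts stand in for the real tables under models/*_ngram/.
import math

def entropy_and_perplexity(counts):
    total = sum(counts.values())
    h = -sum((c / total) * math.log2(c / total) for c in counts.values())
    return h, 2 ** h  # perplexity = 2^entropy

h, ppl = entropy_and_perplexity({"ku th": 40, "ku de": 25, "ee paan": 10})
print(f"entropy = {h:.2f} bits, perplexity = {ppl:.1f}")
```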
@@ -178,55 +247,86 @@ Japan ee pamac tɔ Athiɛ. Genamaatnhomde ayee cɔl Tokyo...`
 
 ![Markov Entropy](visualizations/markov_entropy.png)
 
+![Markov Contexts](visualizations/markov_contexts.png)
+
 ![Markov Branching](visualizations/markov_branching.png)
 
 ### Results
 
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.6001 | 1.516 | 4.04 | 17,160 | 40.0% |
-| **1** | 1.6144 | 3.062 | 12.68 | 311 | 0.0% |
-| **2** | 0.2159 | 1.161 | 1.46 | 69,220 | 78.4% |
-| **2** | 1.1540 | 2.225 | 5.73 | 3,939 | 0.0% |
-| **3** | 0.0701 | 1.050 | 1.12 | 101,209 | 93.0% |
-| **3** | 0.7580 | 1.691 | 3.07 | 22,573 | 24.2% |
-| **4** | 0.0275 🏆 | 1.019 | 1.04 | 113,437 | 97.2% |
-| **4** | 0.4989 🏆 | 1.413 | 2.04 | 69,374 | 50.1% |
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.6343 | 1.552 | 3.69 | 17,365 | 36.6% |
+| **1** | Subword | 1.5315 | 2.891 | 11.78 | 318 | 0.0% |
+| **2** | Word | 0.1750 | 1.129 | 1.30 | 63,845 | 82.5% |
+| **2** | Subword | 1.1046 | 2.150 | 5.58 | 3,744 | 0.0% |
+| **3** | Word | 0.0333 | 1.023 | 1.04 | 83,004 | 96.7% |
+| **3** | Subword | 0.7588 | 1.692 | 3.12 | 20,888 | 24.1% |
+| **4** | Word | 0.0076 🏆 | 1.005 | 1.01 | 86,340 | 99.2% |
+| **4** | Subword | 0.5088 | 1.423 | 2.08 | 65,173 | 49.1% |
 
-### Generated Text Samples
+### Generated Text Samples (Word-based)
 
-Below are text samples generated from each Markov chain model:
+Below are text samples generated from each word-based Markov chain model:
 
 **Context Size 1:**
 
-1. k ̈ yuganda , ̈ th aacï röth juiɛr ke piny de`
-2. `. ̈ nyë riɛm ̈ ɔ ̈ juëkjuëk apɛi ake mɛt thïriɛa ,`
-3. `, olimpik löŋden cal lɔn yen wën ye lac tïŋ ɣɔn këc guoɔn nyucciëëŋden tueŋ`
+1. `ku gɛɛth puɔɔth ben jam ë kɔcnhiaardiɛtë acik gam ke panmäcalëi french indochina ya kë`
+2. `në bɛ̈ɛ̈i tënë tïmëtïm 57 ku tiem thidhic ku kek aa alëk dɛl miɲ kaːl`
+3. `de spain ku aye raan döŋ acï giit en kɛ̈ɛ̈cë anyak atɔ̈ thïn rin keloirɔt wët`
 
 **Context Size 2:**
 
-1. `, ku tënɔŋ adhande ke keek . kuat yic bɛ ̈ nywut . keekë kɔckɛ ̈ ,`
-2. n ya ̈ nykoormacbaai tueŋ zubair mohamed salih , ̈ n . kɔc ke`
-3. k alëu bïkï nhïïm nɔŋ tuŋ bär ye cɔl mayoŋ . pïu rac cï cöp abuk`
+1. `english dictionary kay williamson educational foundation sil international dikconari thudän`
+2. `english to dinka glossary derived from sil internationals draft dinka english dictionary kay william...`
+3. `to dinka glossary derived from sil internationals draft dinka english dictionary kay williamson educ...`
 
 **Context Size 3:**
 
-1. ɔ ̈ k ɣaa këc pööc lac dööt , ciɛŋden thɛɛr ku pïïr ɣene töŋ maɣëmë ke`
-2. ɛ ̈ r cïke baar rɛɛnken tɔŋbaai de rou kɔc ke thudän ( ylkt ) ku`
-3. ` ̈ k ke 15 , 000 dɔm . burjuŋ aka rilic bï ye thok poc kek`
+1. `and roger blench english to dinka glossary derived from sil internationals draft dinka english dicti...`
+2. `internationals draft dinka english dictionary kay williamson educational foundation sil internationa...`
+3. `roger blench english to dinka glossary derived from sil internationals draft dinka english dictionar...`
 
 **Context Size 4:**
 
-1. `from sil international ' s 2005 draft dinka - english dictionary . kay williamson educational founda...`
-2. `' s 2005 draft dinka - english dictionary . kay williamson educational foundation / sil internationa...`
-3. `draft dinka - english dictionary . kay williamson educational foundation / sil international . dikco...`
+1. `internationals draft dinka english dictionary kay williamson educational foundation sil internationa...`
+2. `to dinka glossary derived from sil internationals draft dinka english dictionary kay williamson educ...`
+3. `derived from sil internationals draft dinka english dictionary kay williamson educational foundation...`
+
+### Generated Text Samples (Subword-based)
+
+Below are text samples generated from each subword-based Markov chain model:
+
+**Context Size 1:**
+
+1. `_adde_cïnapae_lu`
+2. `a_piic_ciän_anya`
+3. `kuɛ̈c_arabo_san_k`
+
+**Context Size 2:**
+
+1. `_ku_acï_raŋdec_bï`
+2. `e_bïk_ëk_cök_de_y`
+3. `_aŋrɛn,_juäi_adhi`
+
+**Context Size 3:**
+
+1. `_ku_yiic,_thudän._`
+2. `në_2._“tx2_awɛ̈ɛ̈rde`
+3. `ku_puses)._ë_makut`
+
+**Context Size 4:**
+
+1. `_ku_cɔl_muɔɔr_aacë_`
+2. `_në_keye,_ee_noŋic_`
+3. `_de_joŋlei_paguot_k`
 
 ### Key Findings
 
-- **Best Predictability:** Context-4 with 97.2% predictability
+- **Best Predictability:** Context-4 (word) with 99.2% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (69,374 contexts)
+- **Memory Trade-off:** Larger contexts require more storage (65,173 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 
 ---
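
The samples above are produced by walking a transition table like the ones shipped in `models/word_markov/`. A minimal generation loop; the `transitions` dict is illustrative, since the parquet schema is not documented in this diff:

```python
# Sketch: sampling from a context-2 word Markov chain with toy counts.
import random

transitions = {
    ("ku", "tënɔŋ"): {"adhande": 3, "kɔc": 1},
    ("tënɔŋ", "adhande"): {"ke": 2},
}

def generate(seed, n_words=10):
    out = list(seed)
    for _ in range(n_words):
        nxt = transitions.get(tuple(out[-2:]))
        if not nxt:
            break  # unseen context: stop (no smoothing in this sketch)
        words, weights = zip(*nxt.items())
        out.append(random.choices(words, weights=weights)[0])
    return " ".join(out)

print(generate(("ku", "tënɔŋ")))
```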
@@ -242,26 +342,26 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 5,872 |
-| Total Tokens | 91,802 |
-| Mean Frequency | 15.63 |
+| Vocabulary Size | 5,848 |
+| Total Tokens | 81,189 |
+| Mean Frequency | 13.88 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 91.07 |
+| Frequency Std Dev | 86.66 |
 
 ### Most Common Words
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | ku | 3,547 |
-| 2 | në | 2,806 |
-| 3 | de | 2,159 |
-| 4 | ë | 1,891 |
-| 5 | ke | 1,779 |
-| 6 | ye | 1,488 |
-| 7 | kɔc | 1,187 |
-| 8 | ee | 1,187 |
+| 1 | ku | 3,546 |
+| 2 | në | 2,775 |
+| 3 | de | 2,158 |
+| 4 | ë | 1,890 |
+| 5 | ke | 1,776 |
+| 6 | ye | 1,484 |
+| 7 | ee | 1,173 |
+| 8 | kɔc | 1,137 |
 | 9 | cï | 883 |
-| 10 | k | 882 |
+| 10 | yen | 747 |
 
 ### Least Common Words (from vocabulary)
 
@@ -269,9 +369,9 @@ Below are text samples generated from each Markov chain model:
 |------|------|-----------|
 | 1 | mayall | 2 |
 | 2 | cream | 2 |
-| 3 | layla | 2 |
-| 4 | adëgëk | 2 |
-| 5 | 1988 | 2 |
+| 3 | puɔ̈k | 2 |
+| 4 | layla | 2 |
+| 5 | adëgëk | 2 |
 | 6 | skobarkä | 2 |
 | 7 | pïlïbït | 2 |
 | 8 | tïgër | 2 |
@@ -282,24 +382,24 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.0740 |
-| R² (Goodness of Fit) | 0.989252 |
+| Zipf Coefficient | 1.0295 |
+| R² (Goodness of Fit) | 0.989261 |
 | Adherence Quality | **excellent** |
 
 ### Coverage Analysis
 
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 48.2% |
-| Top 1,000 | 80.3% |
-| Top 5,000 | 98.1% |
+| Top 100 | 47.4% |
+| Top 1,000 | 78.6% |
+| Top 5,000 | 97.9% |
 | Top 10,000 | 0.0% |
 
 ### Key Findings
 
 - **Zipf Compliance:** R²=0.9893 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 48.2% of corpus
-- **Long Tail:** -4,128 words needed for remaining 100.0% coverage
+- **High Frequency Dominance:** Top 100 words cover 47.4% of corpus
+- **Long Tail:** -4,152 words needed for remaining 100.0% coverage
 
 ---
 ## 5. Word Embeddings Evaluation
@@ -312,24 +412,105 @@ Below are text samples generated from each Markov chain model:
 
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
 
-### Model Comparison
+### 5.1 Cross-Lingual Alignment
 
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 2,175 | 32 | 2.250 | 0.776 | 0.1273 🏆 |
-| **mono_64d** | 2,175 | 64 | 2.220 | 0.783 | 0.0336 |
-| **mono_128d** | 2,175 | 128 | 2.208 | 0.762 | 0.0072 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
+
+### 5.2 Model Comparison
+
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.2108 🏆 | 0.6155 | N/A | N/A |
+| **mono_64d** | 64 | 0.0418 | 0.6059 | N/A | N/A |
+| **mono_128d** | 128 | 0.0088 | 0.6443 | N/A | N/A |
+| **aligned_32d** | 32 | 0.2108 | 0.5998 | 0.0070 | 0.0607 |
+| **aligned_64d** | 64 | 0.0418 | 0.5881 | 0.0187 | 0.1028 |
+| **aligned_128d** | 128 | 0.0088 | 0.6544 | 0.0164 | 0.0911 |
 
 ### Key Findings
 
-- **Best Isotropy:** mono_32d with 0.1273 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 2,175 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
+- **Best Isotropy:** mono_32d with 0.2108 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.6180. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 1.9% R@1 in cross-lingual retrieval.
+- **Recommendation:** 128d aligned for best cross-lingual performance
+
+---
+## 6. Morphological Analysis (Experimental)
+
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+
+### 6.1 Productivity & Complexity
+
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **1.232** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **2.143** | High formulaic/idiomatic content | - |
+
+### 6.2 Affix Inventory (Productive Units)
+
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+| `th-` | thiεkde, thɔ̈r, thiɛɛr |
+
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-ic` | tocdïtic, nyinic, ciaryic |
+
+### 6.3 Bound Stems (Lexical Roots)
+
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `thiä` | 1.36x | 12 contexts | thiär, thiäŋ, thiäi |
+
+### 6.4 Affix Compatibility (Co-occurrence)
+
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `th-` | `-ic` | 10 words | thändïtic, thudänic |
+
+### 6.5 Recursive Morpheme Segmentation
+
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| kathɛɛric | **`kathɛɛr-ic`** | 4.5 | `kathɛɛr` |
+| wëlëmiiric | **`wëlëmiir-ic`** | 4.5 | `wëlëmiir` |
+| ruɔ̈ɔ̈nic | **`ruɔ̈ɔ̈n-ic`** | 4.5 | `ruɔ̈ɔ̈n` |
+| pïïrdenic | **`pïïrden-ic`** | 4.5 | `pïïrden` |
+| manywëëthic | **`manywëëth-ic`** | 4.5 | `manywëëth` |
+| pinynhomic | **`pinynhom-ic`** | 4.5 | `pinynhom` |
+| krïthmathic | **`krïthmath-ic`** | 4.5 | `krïthmath` |
+| käcïpuric | **`käcïpur-ic`** | 4.5 | `käcïpur` |
+| abëkruöönic | **`abëkruöön-ic`** | 4.5 | `abëkruöön` |
+| thändïtic | **`th-ändït-ic`** | 3.0 | `ändït` |
+| thiɛ̈ɛ̈ric | **`th-iɛ̈ɛ̈r-ic`** | 3.0 | `iɛ̈ɛ̈r` |
+| wëljamiic | **`wëljami-ic`** | 1.5 | `wëljami` |
+| pabakciɛlic | **`pabakciɛl-ic`** | 1.5 | `pabakciɛl` |
+| thanypiny | **`th-anypiny`** | 1.5 | `anypiny` |
+| lëkthɛɛric | **`lëkthɛɛr-ic`** | 1.5 | `lëkthɛɛr` |
+
+### 6.6 Linguistic Interpretation
+
+> **Automated Insight:**
+> The language Dinka shows moderate morphological complexity. There is a balanced trade-off between whole-word memorization and subword composition.
+
+> **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
 
 ---
-## 6. Summary & Recommendations
+## 7. Summary & Recommendations
 
 ![Performance Dashboard](visualizations/performance_dashboard.png)
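
Alignment R@1/R@10 in the model comparison above is standard nearest-neighbour retrieval: for each source word, check whether its reference translation ranks among the top k targets by cosine similarity. A sketch with placeholder matrices (the real aligned embeddings ship under `models/embeddings/aligned/`):

```python
# Sketch: recall@k between two row-aligned embedding matrices
# (row i of src and tgt are assumed to be translations of each other).
import numpy as np

rng = np.random.default_rng(0)
src = rng.normal(size=(100, 32))   # stand-ins for real aligned vectors
tgt = rng.normal(size=(100, 32))

def recall_at_k(src, tgt, k):
    src = src / np.linalg.norm(src, axis=1, keepdims=True)
    tgt = tgt / np.linalg.norm(tgt, axis=1, keepdims=True)
    sims = src @ tgt.T                       # cosine similarity matrix
    topk = np.argsort(-sims, axis=1)[:, :k]  # k nearest targets per source
    return np.mean([i in topk[i] for i in range(len(src))])

print(recall_at_k(src, tgt, 1), recall_at_k(src, tgt, 10))
```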
@@ -337,11 +518,12 @@ Below are text samples generated from each Markov chain model:
 
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (4.27x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (353) |
-| Markov | **Context-4** | Highest predictability (97.2%) |
+| Tokenizer | **32k BPE** | Best compression (4.25x) |
+| N-gram | **5-gram** | Lowest perplexity (59) |
+| Markov | **Context-4** | Highest predictability (99.2%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 
+
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
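
Section 6.2 above defines an affix by substitutability: stripping it must leave a stem that is attested elsewhere. A toy version of that check; the report's actual thresholds and sampling procedure are not specified here:

```python
# Sketch: the substitutability heuristic described in section 6.2.
# A suffix counts as productive if stripping it from several words
# leaves stems that occur in the vocabulary on their own.
vocab = {"ruɔ̈ɔ̈n", "ruɔ̈ɔ̈nic", "pinynhom", "pinynhomic", "nyin", "nyinic"}

def productive_suffixes(vocab, min_stems=2):
    found = {}
    for word in vocab:
        for cut in range(1, 4):            # candidate suffixes of length 1-3
            stem, suffix = word[:-cut], word[-cut:]
            if len(stem) > 1 and stem in vocab:
                found.setdefault(suffix, set()).add(stem)
    return {s: st for s, st in found.items() if len(st) >= min_stems}

print(productive_suffixes(vocab))  # expect '-ic' with several stems, as in 6.5
```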
@@ -531,7 +713,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs}
   institution = {Omneity Labs}
 }
@@ -547,7 +730,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
 
-*Report Date: 2025-12-30 08:24:47*
+*Report Date: 2026-01-04 02:12:14*
models/embeddings/aligned/din_128d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1f243cc52a4cfb3763acfd48b178dcd64b011e9131ecbb5e1d510de62a856c07
+size 1026179536

models/embeddings/aligned/din_128d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "din", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/din_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:479c369c09f8e5f73f66bd6462abbe1d0234a6e139b63df3eb83c04cd32d0da6
+size 65664

models/embeddings/aligned/din_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "din",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 428,
+  "vocab_size": 2096
+}
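
The three-line bodies above are not the model weights; they are Git LFS pointer files (`version`, `oid`, `size`), and the binaries are fetched by hash at checkout. Parsing one is trivial:

```python
# Sketch: parse a Git LFS pointer like the ones shown in this diff.
def parse_lfs_pointer(text):
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:1f243cc52a4cfb3763acfd48b178dcd64b011e9131ecbb5e1d510de62a856c07
size 1026179536"""
print(parse_lfs_pointer(pointer))  # ~1.03 GB for the 128d embedding binary
```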
models/embeddings/aligned/din_32d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d8768bf52eddd63c85e2ff451fc6b325c0f5aa7b3549ab0750e68e0964bce4c
+size 256569808

models/embeddings/aligned/din_32d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "din", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/din_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f349ee1be134188f25eb38feeb3f55eeaf5be6b9469af829b6290fe44358225b
+size 4224

models/embeddings/aligned/din_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "din",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 428,
+  "vocab_size": 2096
+}

models/embeddings/aligned/din_64d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9c428bdde94fffa409d6936e9d5592ce5b51d6d62b39afa7d55f75c84007349
+size 513106384

models/embeddings/aligned/din_64d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "din", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/din_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8f9553aeeafa889361df217fad742e4660917d490c7ce017376513bc69a42cca
+size 16512

models/embeddings/aligned/din_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "din",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 428,
+  "vocab_size": 2096
+}
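
The `*.projection.npy` pointer sizes are consistent with square float32 matrices plus the 128-byte `.npy` header (32×32×4 + 128 = 4,224 bytes; 64×64 → 16,512; 128×128 → 65,664), which suggests the aligned variants are produced by a linear map into the `en` hub space. A sketch, with the matrix orientation an unverified assumption:

```python
# Sketch: apply the alignment projection to a monolingual vector.
# Assumption: aligned_vec = vec @ W; the orientation (vec @ W vs W @ vec)
# is not documented in this diff.
import numpy as np

W = np.load("models/embeddings/aligned/din_32d.projection.npy")
vec = np.random.default_rng(0).normal(size=W.shape[0])  # stand-in word vector
aligned = vec @ W
print(W.shape, aligned.shape)
```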
models/embeddings/monolingual/din_128d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8e6c249eaf6de7f76a2eb64012f9c5bd48a0adcdb03e51ba632711de5ceefe19
-size 1026261862
+oid sha256:1f243cc52a4cfb3763acfd48b178dcd64b011e9131ecbb5e1d510de62a856c07
+size 1026179536

models/embeddings/monolingual/din_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
-  "vocab_size": 2175
+  "vocab_size": 2096
 }

models/embeddings/monolingual/din_32d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1c9adf0a330f417082c7c5a9c6f4e477197b81549a90f452d392ab2ff63f046d
-size 256591462
+oid sha256:2d8768bf52eddd63c85e2ff451fc6b325c0f5aa7b3549ab0750e68e0964bce4c
+size 256569808

models/embeddings/monolingual/din_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
-  "vocab_size": 2175
+  "vocab_size": 2096
 }

models/embeddings/monolingual/din_64d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1514e0be95a610a658c8c219590572c945576d2e2bc9973d4c5cd0d8feca2e8b
-size 513148262
+oid sha256:c9c428bdde94fffa409d6936e9d5592ce5b51d6d62b39afa7d55f75c84007349
+size 513106384

models/embeddings/monolingual/din_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
-  "vocab_size": 2175
+  "vocab_size": 2096
 }
models/subword_markov/din_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:127eb7a360963d3f6be767e357c4d4a65c5ddf9d6903aacc12e7f0cd0891f2ac
-size 32007
+oid sha256:60a3a8d00b8a00b00689f570a808b1e7dc5f30bfb816215c3b110026a6347151
+size 30663

models/subword_markov/din_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 311,
-  "total_transitions": 530585
+  "unique_contexts": 318,
+  "total_transitions": 500776
 }

models/subword_markov/din_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:723a434808ffddca7d4d3f6fc077b3f4bfec5725f6d980d87d4937facb08b4f1
-size 165205
+oid sha256:a5821973129b3b3c0bafae913bd8daa4491075f89ed2ac47d07eaea79d2b99e7
+size 157513

models/subword_markov/din_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 3939,
-  "total_transitions": 530073
+  "unique_contexts": 3744,
+  "total_transitions": 500284
 }

models/subword_markov/din_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:05df3bb536b33c9fc2ffe30a4887acaefc1931b0e6660ec692b5617c96f69a39
-size 515269
+oid sha256:573f8ad86998c5675ea57679a88b9cb5318f4bfa602f3298340e494bd05f1aef
+size 492554

models/subword_markov/din_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 22573,
-  "total_transitions": 529561
+  "unique_contexts": 20888,
+  "total_transitions": 499792
 }

models/subword_markov/din_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6f58b1bd8dc7995d9eef7140cd002c4ce4033a150549689f0454a94f78e89d6d
-size 1178741
+oid sha256:49e0d14e2a1ac0060a198ad9ff7b1c25aa116e292ebe4392b97f3178aa4e4b7a
+size 1121452

models/subword_markov/din_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "din",
-  "unique_contexts": 69374,
-  "total_transitions": 529049
+  "unique_contexts": 65173,
+  "total_transitions": 499300
 }
models/subword_ngram/din_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2ddd1b09dd4bed63e4af468ebddd67973f99facc8a20bf0591cbe3849e4ab36
-size 22481
+oid sha256:3b2d07a40cfac91548511f27f396d477958065feb18a3affb05847e9a8f568bf
+size 20862

models/subword_ngram/din_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "din",
-  "unique_ngrams": 1719,
-  "total_ngrams": 530585
+  "unique_ngrams": 1563,
+  "total_ngrams": 500776
 }

models/subword_ngram/din_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:777f53e0084a7f33049dfdaadb8943997d7fb317fb2eac3d6c96df153d9d2149
-size 118141
+oid sha256:573b15c3bd7d2dc9a5be8ac8c98ed3b2ede2b5f3ff8b773ed49a00aaaef950c0
+size 112004

models/subword_ngram/din_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "din",
-  "unique_ngrams": 10075,
-  "total_ngrams": 530073
+  "unique_ngrams": 9446,
+  "total_ngrams": 500284
 }

models/subword_ngram/din_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:89b8f7725cd6aac5b8a68406b158cf5acffba7494d1805c8b4f8c0a8e81884ac
-size 418893
+oid sha256:d995ca621e6e4dafa43f2f57773bb7435eb880e9a50a1437bee6f51df7bc7963
+size 395998

models/subword_ngram/din_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "din",
-  "unique_ngrams": 33136,
-  "total_ngrams": 529561
+  "unique_ngrams": 31591,
+  "total_ngrams": 499792
 }

models/subword_ngram/din_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f5c32b943bd4a3dbb2314319f76dcb6dd3aacafcfcdd0b67c2e120c44454a6f
+size 648882

models/subword_ngram/din_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+{
+  "n": 5,
+  "variant": "subword",
+  "language": "din",
+  "unique_ngrams": 51151,
+  "total_ngrams": 499300
+}
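
The n-gram tables above are ordinary parquet and can be inspected directly. The column names below are hypothetical, so check `df.columns` against the real schema first:

```python
# Sketch: inspect the new 5-gram table. 'count' is a hypothetical
# column name, not a documented schema.
import pandas as pd

df = pd.read_parquet("models/subword_ngram/din_5gram_subword.parquet")
print(df.columns.tolist())
print(len(df))  # should match unique_ngrams = 51,151 in the metadata above
```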
models/tokenizer/din_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9524e8a2e065c61facff1cd75b7a97a9f37237a6e985f395e56f2245a77fb3cf
-size 515580
+oid sha256:6a69148ed9617b47531585cb835971c50916505ad2c383f49f99afe3c493e171
+size 522717

models/tokenizer/din_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff.

models/tokenizer/din_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3852bead6d9965c9944c990c6fd911f0db3e8a212430fdb57a05c11d1045b6c0
-size 803819
+oid sha256:e7ce8fe69a6bfbdc648d3b25788d90376618ed288685ca42d266eabbb5455665
+size 795594

models/tokenizer/din_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff.

models/tokenizer/din_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8455517f8c2cba07e54774765d1d68c9d28cc2cc5b6cc9d5e1be16a7943e20b0
-size 374243
+oid sha256:fac82c37aa86983a062646b307c798be71eb87be63967cdbb9eb4a4afcaa2209
+size 375314

models/tokenizer/din_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff.
models/vocabulary/din_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2820281559a689aceb40dbbe835641b1a46c89b22c6260d174c40ed56187ad25
-size 91682
+oid sha256:c58191781619d7719376a7b98fcfb94dd50f23e69fd48432a64797b3041cb522
+size 92110

models/vocabulary/din_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
 {
   "language": "din",
-  "vocabulary_size": 5872,
+  "vocabulary_size": 5848,
+  "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.16588202176297576,
+    "type_token_ratio": 0.18787323488196614,
     "coverage": {
-      "top_100": 0.4291053106708471,
-      "top_1000": 0.7156349799551539,
-      "top_5000": 0.8741882565352023,
-      "top_10000": 0.9311874508585795
+      "top_100": 0.41450900075455427,
+      "top_1000": 0.6878732348819662,
+      "top_5000": 0.8568826129136574,
+      "top_10000": 0.9199202328338902
     },
-    "hapax_count": 11217,
-    "hapax_ratio": 0.6563871496284159,
-    "total_documents": 512
+    "hapax_count": 11581,
+    "hapax_ratio": 0.6644672671983476,
+    "total_documents": 492
   }
 }
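
The statistics in the metadata above (coverage, type-token ratio) and the Zipf fit reported in section 4 can all be re-derived from this vocabulary parquet. A sketch with a hypothetical `frequency` column name:

```python
# Sketch: re-derive coverage and the Zipf coefficient from the vocabulary
# table. The 'frequency' column name is hypothetical; check the real schema.
import numpy as np
import pandas as pd

df = pd.read_parquet("models/vocabulary/din_vocabulary.parquet")
freq = np.sort(df["frequency"].to_numpy().astype(float))[::-1]
total = freq.sum()
for n in (100, 1000, 5000):
    print(f"top_{n} coverage: {freq[:n].sum() / total:.4f}")

# Zipf's law: least-squares slope of log(freq) vs log(rank), plus R^2.
ranks = np.arange(1, len(freq) + 1)
slope, intercept = np.polyfit(np.log(ranks), np.log(freq), 1)
pred = slope * np.log(ranks) + intercept
ss_res = ((np.log(freq) - pred) ** 2).sum()
ss_tot = ((np.log(freq) - np.log(freq).mean()) ** 2).sum()
print(f"zipf coefficient ~ {-slope:.4f}, R^2 = {1 - ss_res / ss_tot:.4f}")
```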
models/word_markov/din_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4c43f4dfe21d25d407d5b5b9d1441679b7523749c31a528129cda5959c6b7253
-size 554267
+oid sha256:246fc144af31744bc8e9c6bec9030fe28bda481e1ef4e97c1be4b93fa4a48dcf
+size 550979

models/word_markov/din_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "din",
-  "unique_contexts": 17160,
-  "total_transitions": 129566
+  "unique_contexts": 17365,
+  "total_transitions": 92278
 }

models/word_markov/din_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:965c69dacc40dac71dc8d8cca1b37275d8fd36766158f235cb2da6dae61c83b0
-size 1217993
+oid sha256:e0c57fde3bcbadba0e30dbb8351a7870d8ad9241eb1706f048ea8133749668e6
+size 1157480

models/word_markov/din_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "din",
-  "unique_contexts": 69220,
-  "total_transitions": 129054
+  "unique_contexts": 63845,
+  "total_transitions": 91786
 }

models/word_markov/din_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a0bb0aef0df5a98a9d5197038fe63754cb4b87fcd13fb51b6a3fe4b2ae713dae
-size 1669385
+oid sha256:de2ffee95e8d073a4b4152526a192cdc27fc3537af859f40c8755b3456597503
+size 1467750

models/word_markov/din_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "word",
   "language": "din",
-  "unique_contexts": 101209,
-  "total_transitions": 128542
+  "unique_contexts": 83004,
+  "total_transitions": 91294
 }