omarkamali committed
Commit c982f3a · verified · 1 Parent(s): 2e1be1d

Upload all models and assets for bi (20251001)

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +273 -126
  3. models/embeddings/monolingual/bi_128d.bin +2 -2
  4. models/embeddings/monolingual/bi_128d_metadata.json +5 -3
  5. models/embeddings/monolingual/bi_32d.bin +2 -2
  6. models/embeddings/monolingual/bi_32d_metadata.json +5 -3
  7. models/embeddings/monolingual/bi_64d.bin +2 -2
  8. models/embeddings/monolingual/bi_64d_metadata.json +5 -3
  9. models/subword_markov/bi_markov_ctx1_subword.parquet +2 -2
  10. models/subword_markov/bi_markov_ctx1_subword_metadata.json +2 -2
  11. models/subword_markov/bi_markov_ctx2_subword.parquet +2 -2
  12. models/subword_markov/bi_markov_ctx2_subword_metadata.json +2 -2
  13. models/subword_markov/bi_markov_ctx3_subword.parquet +2 -2
  14. models/subword_markov/bi_markov_ctx3_subword_metadata.json +2 -2
  15. models/subword_markov/bi_markov_ctx4_subword.parquet +2 -2
  16. models/subword_markov/bi_markov_ctx4_subword_metadata.json +2 -2
  17. models/subword_ngram/bi_2gram_subword.parquet +2 -2
  18. models/subword_ngram/bi_2gram_subword_metadata.json +2 -2
  19. models/subword_ngram/bi_3gram_subword.parquet +2 -2
  20. models/subword_ngram/bi_3gram_subword_metadata.json +2 -2
  21. models/subword_ngram/bi_4gram_subword.parquet +2 -2
  22. models/subword_ngram/bi_4gram_subword_metadata.json +2 -2
  23. models/tokenizer/bi_tokenizer_16k.model +2 -2
  24. models/tokenizer/bi_tokenizer_16k.vocab +0 -0
  25. models/tokenizer/bi_tokenizer_8k.model +2 -2
  26. models/tokenizer/bi_tokenizer_8k.vocab +0 -0
  27. models/vocabulary/bi_vocabulary.parquet +2 -2
  28. models/vocabulary/bi_vocabulary_metadata.json +9 -8
  29. models/word_markov/bi_markov_ctx1_word.parquet +2 -2
  30. models/word_markov/bi_markov_ctx1_word_metadata.json +2 -2
  31. models/word_markov/bi_markov_ctx2_word.parquet +2 -2
  32. models/word_markov/bi_markov_ctx2_word_metadata.json +2 -2
  33. models/word_markov/bi_markov_ctx3_word.parquet +2 -2
  34. models/word_markov/bi_markov_ctx3_word_metadata.json +2 -2
  35. models/word_markov/bi_markov_ctx4_word.parquet +2 -2
  36. models/word_markov/bi_markov_ctx4_word_metadata.json +2 -2
  37. models/word_ngram/bi_2gram_word.parquet +2 -2
  38. models/word_ngram/bi_2gram_word_metadata.json +2 -2
  39. models/word_ngram/bi_3gram_word.parquet +2 -2
  40. models/word_ngram/bi_3gram_word_metadata.json +2 -2
  41. models/word_ngram/bi_4gram_word.parquet +2 -2
  42. models/word_ngram/bi_4gram_word_metadata.json +2 -2
  43. visualizations/embedding_isotropy.png +0 -0
  44. visualizations/embedding_norms.png +0 -0
  45. visualizations/embedding_similarity.png +2 -2
  46. visualizations/markov_branching.png +0 -0
  47. visualizations/markov_contexts.png +0 -0
  48. visualizations/markov_entropy.png +0 -0
  49. visualizations/model_sizes.png +0 -0
  50. visualizations/ngram_coverage.png +0 -0
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/ngram_coverage.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -23,14 +23,14 @@ dataset_info:
   metrics:
   - name: best_compression_ratio
     type: compression
-    value: 4.017
+    value: 4.443
   - name: best_isotropy
     type: isotropy
-    value: 0.0541
+    value: 0.0388
   - name: vocabulary_size
     type: vocab
-    value: 3655
-  generated: 2025-12-28
+    value: 0
+  generated: 2026-01-03
 ---
 
 # BI - Wikilangs Models
@@ -44,12 +44,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
+
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 
 ### Analysis and Evaluation
@@ -59,7 +60,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
+- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 
@@ -68,46 +70,49 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
+
 ### Results
 
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.698x | 3.65 | 0.1622% | 57,343 |
-| **16k** | 4.017x 🏆 | 3.96 | 0.1762% | 52,790 |
+| **8k** | 4.032x | 4.05 | 0.1444% | 47,092 |
+| **16k** | 4.443x 🏆 | 4.47 | 0.1591% | 42,734 |
 
 ### Tokenization Examples
 
 Below are sample sentences tokenized with each vocabulary size:
 
-**Sample 1:** `Minsk, hem i bigtaon long senta blong Belarus, mo hemi kapitol blong kaontri ia....`
+**Sample 1:** `Copenhagen (toktok Denmak: København), hem i kapitol blong Denmak. Long yia popu...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁minsk , hem ▁ibigtaonlongsentablongbelarus , ... (+10 more)` | 20 |
-| 16k | `▁minsk , hem ▁ibigtaonlongsentablongbelarus , ... (+10 more)` | 20 |
+| 8k | `▁copenhagen( toktokdenmak : københavn ), hemikapitol ... (+20 more)` | 30 |
+| 16k | `▁copenhagen( toktokdenmak : københavn ), hemikapitol ... (+20 more)` | 30 |
 
-**Sample 2:** `+UetersenRosenstadt Uetersen 125px 125px 300px
-Uetersen i stap smol taon blong...`
+**Sample 2:** `Emily Elizabeth Dickinson (10 Desemba – 15 May em i bin wan poet blong Amerika. ...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁+ ue ter sen ros ens tad t uetersen ... (+41 more)` | 51 |
-| 16k | `▁+ uetersen rosenstadt uetersen ▁ 1 2 5 px ▁ ... (+36 more)` | 46 |
+| 8k | `▁em il y ▁elizabeth ▁dick ins on( 1 0 ... (+19 more)` | 29 |
+| 16k | `▁emily ▁elizabethdickinson( 1 0 ▁desemba ▁–1 ... (+15 more)` | 25 |
 
-**Sample 3:** `Prayut Chan-o-cha (boen 1954) i praem minista blong Thailand.
-
-Category:Praem mi...`
+**Sample 3:** `Narafala kaen blong spot long Vanuatu i stap pleiplei tru long kaontri long yumi...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁pra y utchan - o - cha( boen ... (+18 more)` | 28 |
-| 16k | `▁prayutchan - o - cha( boen1 ... (+16 more)` | 26 |
+| 8k | `▁narafala ▁kaen ▁blongspot ▁long ▁vanuatu ▁i ▁stappleiplei ▁tru ... (+7 more)` | 17 |
+| 16k | `▁narafalakaen ▁blong ▁spot ▁long ▁vanuatui ▁stappleiplei ▁tru ... (+7 more)` | 17 |
 
 ### Key Findings
 
-- **Best Compression:** 16k achieves 4.017x compression
-- **Lowest UNK Rate:** 8k with 0.1622% unknown tokens
+- **Best Compression:** 16k achieves 4.443x compression
+- **Lowest UNK Rate:** 8k with 0.1444% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
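
The tokenizer files touched by this commit (`models/tokenizer/bi_tokenizer_8k.model`, `bi_tokenizer_16k.model`) appear to be standard SentencePiece models (a `.model`/`.vocab` pair and `▁` word-boundary markers), so the compression figures above can be sanity-checked locally. A minimal sketch, assuming `sentencepiece` is installed and the model has been downloaded from this repo; reading "compression" as characters per token is our assumption, since the report does not spell out its formula:

```python
# Sanity-check the compression numbers with the shipped 16k model.
# Assumes: pip install sentencepiece, and the .model file downloaded locally.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/bi_tokenizer_16k.model")

text = "Copenhagen (toktok Denmak: København), hem i kapitol blong Denmak."
pieces = sp.encode(text, out_type=str)
print(pieces[:10], f"... ({len(pieces)} tokens)")

# Compression read as characters per token (assumed definition).
print(f"~{len(text) / len(pieces):.2f} chars/token")
```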
@@ -116,57 +121,89 @@ Category:Praem mi...`
 
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 
+![N-gram Unique](visualizations/ngram_unique.png)
+
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 
 ### Results
 
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 483 🏆 | 8.92 | 1,634 | 54.9% | 91.1% |
-| **2-gram** | 264 🏆 | 8.05 | 1,233 | 68.8% | 99.6% |
-| **3-gram** | 712 | 9.48 | 2,346 | 48.6% | 84.2% |
-| **3-gram** | 1,434 | 10.49 | 7,760 | 37.2% | 75.8% |
-| **4-gram** | 1,319 | 10.36 | 4,093 | 39.6% | 72.5% |
-| **4-gram** | 3,949 | 11.95 | 23,770 | 28.7% | 57.4% |
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 362 | 8.50 | 1,049 | 58.8% | 98.9% |
+| **2-gram** | Subword | 209 🏆 | 7.71 | 983 | 73.7% | 100.0% |
+| **3-gram** | Word | 496 | 8.95 | 1,408 | 53.1% | 92.0% |
+| **3-gram** | Subword | 1,182 | 10.21 | 5,848 | 38.2% | 79.4% |
+| **4-gram** | Word | 887 | 9.79 | 2,457 | 43.9% | 77.4% |
+| **4-gram** | Subword | 3,532 | 11.79 | 19,225 | 28.5% | 58.2% |
 
 ### Top 5 N-grams by Size
 
-**2-grams:**
+**2-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `category :` | 2,068 |
-| 2 | `. category` | 1,177 |
-| 3 | `hem i` | 759 |
-| 4 | `stet blong` | 711 |
-| 5 | `yunaeted stet` | 643 |
+| 1 | `hem i` | 738 |
+| 2 | `stet blong` | 729 |
+| 3 | `em i` | 617 |
+| 4 | `blong amerika` | 598 |
+| 5 | `blong yunaeted` | 535 |
 
-**3-grams:**
+**3-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `. category :` | 1,177 |
-| 2 | `yunaeted stet blong` | 587 |
-| 3 | `stet blong amerika` | 585 |
-| 4 | `blong yunaeted stet` | 481 |
-| 5 | `category : pipol` | 468 |
+| 1 | `stet blong amerika` | 583 |
+| 2 | `yunaeted stet blong` | 479 |
+| 3 | `blong yunaeted stet` | 479 |
+| 4 | `blong singsing blong` | 292 |
+| 5 | `blong hem i` | 259 |
 
-**4-grams:**
+**4-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `yunaeted stet blong amerika` | 585 |
-| 2 | `blong yunaeted stet blong` | 472 |
-| 3 | `category : pipol blong` | 413 |
-| 4 | `. category : ol` | 274 |
-| 5 | `stet blong amerika .` | 229 |
+| 1 | `yunaeted stet blong amerika` | 477 |
+| 2 | `blong yunaeted stet blong` | 470 |
+| 3 | `akta blong yunaeted stet` | 210 |
+| 4 | `woman blong singsing blong` | 182 |
+| 5 | `blong singsing blong japan` | 150 |
+
+**2-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `o n` | 9,093 |
+| 2 | `n g` | 8,780 |
+| 3 | `l o` | 8,027 |
+| 4 | `g _` | 7,936 |
+| 5 | `_ b` | 7,059 |
+
+**3-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `n g _` | 7,795 |
+| 2 | `o n g` | 7,296 |
+| 3 | `l o n` | 7,257 |
+| 4 | `_ b l` | 5,277 |
+| 5 | `b l o` | 5,252 |
+
+**4-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `o n g _` | 7,200 |
+| 2 | `l o n g` | 7,191 |
+| 3 | `_ b l o` | 5,238 |
+| 4 | `b l o n` | 5,015 |
+| 5 | `_ l o n` | 2,153 |
 
 ### Key Findings
 
-- **Best Perplexity:** 2-gram with 264
+- **Best Perplexity:** 2-gram (subword) with 209
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~57% of corpus
+- **Coverage:** Top-1000 patterns cover ~58% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 
 ---
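
The Top-100/Top-1000 coverage columns above are plain cumulative-frequency statistics, so they can be recomputed from the n-gram parquet files in this commit. A sketch assuming the files expose `ngram` and `count` columns; the actual schema is not documented here, so inspect `df.columns` first:

```python
# Recompute top-k coverage for the word 2-gram model.
# Assumed schema: columns `ngram` and `count` (verify with df.columns).
import pandas as pd

df = pd.read_parquet("models/word_ngram/bi_2gram_word.parquet")
counts = df["count"].sort_values(ascending=False)

total = counts.sum()
for k in (100, 1000):
    print(f"top-{k} coverage: {counts.head(k).sum() / total:.1%}")
```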
@@ -174,55 +211,86 @@ Category:Praem mi...`
 
 ![Markov Entropy](visualizations/markov_entropy.png)
 
+![Markov Contexts](visualizations/markov_contexts.png)
+
 ![Markov Branching](visualizations/markov_branching.png)
 
 ### Results
 
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.5518 | 1.466 | 3.32 | 9,727 | 44.8% |
-| **1** | 1.0793 | 2.113 | 7.70 | 381 | 0.0% |
-| **2** | 0.2268 | 1.170 | 1.46 | 32,184 | 77.3% |
-| **2** | 1.0810 | 2.115 | 5.52 | 2,929 | 0.0% |
-| **3** | 0.0853 | 1.061 | 1.15 | 46,928 | 91.5% |
-| **3** | 0.7971 | 1.738 | 3.07 | 16,165 | 20.3% |
-| **4** | 0.0386 🏆 | 1.027 | 1.07 | 53,803 | 96.1% |
-| **4** | 0.4284 🏆 | 1.346 | 1.80 | 49,589 | 57.2% |
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.5840 | 1.499 | 3.04 | 8,338 | 41.6% |
+| **1** | Subword | 0.9602 | 1.946 | 6.50 | 364 | 4.0% |
+| **2** | Word | 0.1997 | 1.148 | 1.41 | 24,957 | 80.0% |
+| **2** | Subword | 0.9911 | 1.988 | 5.10 | 2,361 | 0.9% |
+| **3** | Word | 0.0755 | 1.054 | 1.13 | 34,724 | 92.4% |
+| **3** | Subword | 0.7964 | 1.737 | 3.17 | 12,016 | 20.4% |
+| **4** | Word | 0.0328 🏆 | 1.023 | 1.06 | 38,736 | 96.7% |
+| **4** | Subword | 0.4627 | 1.378 | 1.90 | 38,018 | 53.7% |
 
-### Generated Text Samples
+### Generated Text Samples (Word-based)
 
-Below are text samples generated from each Markov chain model:
+Below are text samples generated from each word-based Markov chain model:
 
 **Context Size 1:**
 
-1. `blong itali . category : politikis blong yunaeted stet blong yunaeted stet blong afrika category :`
-2. `. - mackie - 19 novemba 1962 long polan . plante samting ikam inside long not`
-3. `i stap wetem pas . king blong polanbol . ph / ebchecked / cette / /`
+1. `blong olgeta mo yu ol disaepol blong dover long wol plante fasin blong court i wan`
+2. `i bin wan strongfala win if you s 84 913 km2 populaesen blong stet blong hem`
+3. `long saed blong tekem carbondioxde mo wanwan aelan gaua o aoba hem i wokem long milly`
 
 **Context Size 2:**
 
-1. `category : yunaeted stet blong amerika . category : ol krietiv daerekta long tv show yo gabba`
-2. `. category : politikis blong franis , spain ) bibliothèque nationale de france le bulletin de la`
-3. `hem i wan kaontri long pasifik mo save mekem i gat seven koninens ( nasa , 2022`
+1. `hem i stap insaet long solwota everi man i save sindaon o silip long hem islam relijon`
+2. `stet blong philippines blong stet blong amerika blong stet blong amerika blong stet blong amerika mo...`
+3. `em i bin ded 8 septemba em i woman blong singsing blong japan man blong singsing blong`
 
 **Context Size 3:**
 
-1. `. category : pipol blong jemani category : politikis blong franis category : saentis category : pipo...`
-2. `yunaeted stet blong amerika . category : praem minista blong japan . category : kaontri category : s...`
-3. `blong yunaeted stet blong amerika . category : spen`
+1. `blong yunaeted stet blong amerika model akta blong pornografi blong ajentina em i stap popiula from ...`
+2. `yunaeted stet blong amerika akta blong yunaeted stet blong amerika blong yunaeted stet blong amerika...`
+3. `blong singsing blong japan thumb anna iriyama man blong singsing blong kanada man blong singsing blo...`
 
 **Context Size 4:**
 
-1. `blong yunaeted stet blong amerika . category : hed blong stet category : politikis blong taelan`
-2. `category : pipol blong jemani category : politikis`
-3. `yunaeted stet blong amerika category : ol woman blong singsing category : pipol blong yunaeted kingd...`
+1. `blong yunaeted stet blong amerika blong stet blong yunaeted stet blong amerika blong yunaeted stet b...`
+2. `yunaeted stet blong amerika blong stet blong yunaeted stet blong amerika blong yunaeted stet blong a...`
+3. `akta blong yunaeted stet blong amerika akta blong yunaeted stet blong amerika blong stet blong yunae...`
+
+### Generated Text Samples (Subword-based)
+
+Below are text samples generated from each subword-based Markov chain model:
+
+**Context Size 1:**
+
+1. `_dimo_ste_lon_i_`
+2. `a_blong_bl_19_s_`
+3. `ngstang_yulolem:`
+
+**Context Size 2:**
+
+1. `ong_300px_12_3_44`
+2. `ng_st_boetexanblo`
+3. `long_prol_no,_рос`
+
+**Context Size 3:**
+
+1. `ng_amerika._akta_b`
+2. `ong_savela_taeland`
+3. `long_amerika_maura`
+
+**Context Size 4:**
+
+1. `ong_amerika._praem_`
+2. `long_not_prize_nigh`
+3. `_blong_21_man_blong`
 
 ### Key Findings
 
-- **Best Predictability:** Context-4 with 96.1% predictability
+- **Best Predictability:** Context-4 (word) with 96.7% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (49,589 contexts)
+- **Memory Trade-off:** Larger contexts require more storage (38,018 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 
 ---
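
Samples like the ones above can be reproduced from the transition tables shipped under `models/word_markov/`. A minimal weighted-sampling sketch; the column names `context`, `next_token`, and `count`, and contexts being keyed as space-joined token strings, are assumptions about the parquet schema rather than documented facts, so inspect the file before relying on them:

```python
# Generate text from the context-2 word Markov table by weighted sampling.
# Assumed schema: columns `context`, `next_token`, `count` -- verify first.
import random
import pandas as pd

df = pd.read_parquet("models/word_markov/bi_markov_ctx2_word.parquet")

def step(context: str):
    rows = df[df["context"] == context]
    if rows.empty:                      # unseen context: stop generating
        return None
    return random.choices(rows["next_token"].tolist(),
                          weights=rows["count"].tolist())[0]

tokens = ["stet", "blong"]              # seed must match the context size (2)
for _ in range(15):
    nxt = step(" ".join(tokens[-2:]))
    if nxt is None:
        break
    tokens.append(nxt)
print(" ".join(tokens))
```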
@@ -238,64 +306,64 @@ Below are text samples generated from each Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 3,655 |
-| Total Tokens | 57,331 |
-| Mean Frequency | 15.69 |
+| Vocabulary Size | 3,117 |
+| Total Tokens | 48,872 |
+| Mean Frequency | 15.68 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 124.23 |
+| Frequency Std Dev | 124.49 |
 
 ### Most Common Words
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | blong | 5,072 |
-| 2 | i | 3,328 |
-| 3 | long | 2,237 |
-| 4 | category | 2,068 |
-| 5 | ol | 1,320 |
-| 6 | mo | 1,059 |
-| 7 | hem | 1,034 |
-| 8 | wan | 894 |
-| 9 | stet | 842 |
-| 10 | yunaeted | 714 |
+| 1 | blong | 5,014 |
+| 2 | i | 3,182 |
+| 3 | long | 2,146 |
+| 4 | mo | 1,031 |
+| 5 | hem | 1,008 |
+| 6 | ol | 886 |
+| 7 | wan | 875 |
+| 8 | stet | 840 |
+| 9 | amerika | 673 |
+| 10 | em | 660 |
 
 ### Least Common Words (from vocabulary)
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | sftp | 2 |
-| 2 | operating | 2 |
-| 3 | guide | 2 |
-| 4 | spesifikesen | 2 |
-| 5 | firewall | 2 |
-| 6 | 2428 | 2 |
-| 7 | sapot | 2 |
-| 8 | lesin | 2 |
-| 9 | sanem | 2 |
-| 10 | extended | 2 |
+| 1 | lotta | 2 |
+| 2 | continua | 2 |
+| 3 | ekshumesen | 2 |
+| 4 | suspension | 2 |
+| 5 | fulwan | 2 |
+| 6 | konfirm | 2 |
+| 7 | trial | 2 |
+| 8 | window | 2 |
+| 9 | piazza | 2 |
+| 10 | fontana | 2 |
 
 ### Zipf's Law Analysis
 
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.0336 |
-| R² (Goodness of Fit) | 0.989882 |
+| Zipf Coefficient | 1.0400 |
+| R² (Goodness of Fit) | 0.989215 |
 | Adherence Quality | **excellent** |
 
 ### Coverage Analysis
 
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 60.4% |
-| Top 1,000 | 86.7% |
+| Top 100 | 62.1% |
+| Top 1,000 | 88.5% |
 | Top 5,000 | 0.0% |
 | Top 10,000 | 0.0% |
 
 ### Key Findings
 
-- **Zipf Compliance:** R²=0.9899 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 60.4% of corpus
-- **Long Tail:** -6,345 words needed for remaining 100.0% coverage
+- **Zipf Compliance:** R²=0.9892 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 62.1% of corpus
+- **Long Tail:** -6,883 words needed for remaining 100.0% coverage
 
 ---
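
The Zipf coefficient and R² in the vocabulary analysis above come from an ordinary least-squares fit in log-log space (Zipf's law predicts log f ≈ c - s·log r for rank r). They can be re-fit from the vocabulary parquet in this commit; the `frequency` column name is an assumption, so check the schema first:

```python
# Re-fit the Zipf coefficient from the vocabulary parquet.
# Assumed schema: a `frequency` column (verify with df.columns).
import numpy as np
import pandas as pd

df = pd.read_parquet("models/vocabulary/bi_vocabulary.parquet")
freq = np.sort(df["frequency"].to_numpy())[::-1].astype(float)
rank = np.arange(1, len(freq) + 1)

# Fit log(freq) = intercept + slope * log(rank); the Zipf coefficient is -slope.
slope, intercept = np.polyfit(np.log(rank), np.log(freq), 1)
pred = slope * np.log(rank) + intercept
resid = np.log(freq) - pred
r2 = 1 - (resid**2).sum() / ((np.log(freq) - np.log(freq).mean())**2).sum()
print(f"zipf coefficient ≈ {-slope:.4f}, R² ≈ {r2:.4f}")
```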
 ## 5. Word Embeddings Evaluation
@@ -308,24 +376,100 @@ Below are text samples generated from each Markov chain model:
 
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
 
-### Model Comparison
+### 5.1 Cross-Lingual Alignment
+
+> *Note: Multilingual alignment visualization not available for this language.*
+
+### 5.2 Model Comparison
 
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 1,195 | 32 | 2.350 | 0.505 | 0.0541 🏆 |
-| **mono_64d** | 1,195 | 64 | 2.278 | 0.491 | 0.0110 |
-| **mono_128d** | 1,195 | 128 | 2.279 | 0.484 | 0.0021 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.0388 🏆 | 0.6777 | N/A | N/A |
+| **mono_64d** | 64 | 0.0097 | 0.6676 | N/A | N/A |
+| **mono_128d** | 128 | 0.0021 | 0.6720 | N/A | N/A |
 
 ### Key Findings
 
-- **Best Isotropy:** mono_32d with 0.0541 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 1,195 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
+- **Best Isotropy:** mono_32d with 0.0388 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.6724. Lower values indicate better semantic separation.
+- **Alignment Quality:** No aligned models evaluated in this run.
+- **Recommendation:** 128d aligned for best cross-lingual performance
+
+---
+## 6. Morphological Analysis (Experimental)
+
+> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
+
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+
+### 6.1 Productivity & Complexity
+
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
+| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
+
+### 6.2 Affix Inventory (Productive Units)
+
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-en` | ren, disisen, citizen |
+| `-an` | givhan, kirgistan, wan |
+| `-em` | shoem, wokem, blem |
+
+### 6.3 Bound Stems (Lexical Roots)
+
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `amba` | 1.38x | 8 contexts | ambae, namba, bambae |
+
+### 6.4 Affix Compatibility (Co-occurrence)
+
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+
+*No significant affix co-occurrences detected.*
+
+### 6.5 Recursive Morpheme Segmentation
+
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| republican | **`republic-an`** | 4.5 | `republic` |
+| andastanem | **`andast-an-em`** | 3.0 | `andast` |
+| niutesteman | **`niutest-em-an`** | 3.0 | `niutest` |
+| kirgistan | **`kirgist-an`** | 1.5 | `kirgist` |
+| valencian | **`valenci-an`** | 1.5 | `valenci` |
+| singaotem | **`singaot-em`** | 1.5 | `singaot` |
+| defdefren | **`defdefr-en`** | 1.5 | `defdefr` |
+| melanesian | **`melanesi-an`** | 1.5 | `melanesi` |
+| konstitusen | **`konstitus-en`** | 1.5 | `konstitus` |
+| komposisen | **`komposis-en`** | 1.5 | `komposis` |
+| smithsonian | **`smithsoni-an`** | 1.5 | `smithsoni` |
+| kompitisen | **`kompitis-en`** | 1.5 | `kompitis` |
+| bisnesman | **`bisnesm-an`** | 1.5 | `bisnesm` |
+| protestan | **`protest-an`** | 1.5 | `protest` |
+| ekshumesen | **`ekshumes-en`** | 1.5 | `ekshumes` |
+
+### 6.6 Linguistic Interpretation
+
+> **Automated Insight:**
+The language BI appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 
 ---
-## 6. Summary & Recommendations
+## 7. Summary & Recommendations
 
 ![Performance Dashboard](visualizations/performance_dashboard.png)
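
Of the embedding metrics in §5.2 above, semantic density has a standard reading (mean pairwise cosine similarity, self-pairs excluded); isotropy is reported without a formula, so the proxy below is one common definition and an assumption on our part. The layout of the shipped `.bin` files is also undocumented here, so loading is stubbed with a placeholder matrix:

```python
# Two diagnostics from the 5.2 table, computed on an (n, d) vector matrix.
# A random placeholder stands in for the real loaded vectors.
import numpy as np

rng = np.random.default_rng(0)
vecs = rng.normal(size=(1046, 32))        # placeholder: replace with real vectors

# Semantic density: mean pairwise cosine similarity, self-pairs excluded
# (lower = better separation, matching the report's reading).
unit = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
sims = unit @ unit.T
n = len(unit)
print(f"semantic density ≈ {(sims.sum() - n) / (n * (n - 1)):.4f}")

# Isotropy proxy: smallest-to-largest singular-value energy of the centered
# matrix -- one common definition, assumed rather than taken from the report.
s = np.linalg.svd(vecs - vecs.mean(axis=0), compute_uv=False)
print(f"isotropy proxy ≈ {(s.min() / s.max())**2:.4f}")
```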
@@ -333,11 +477,12 @@ Below are text samples generated from each Markov chain model:
 
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (4.02x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (264) |
-| Markov | **Context-4** | Highest predictability (96.1%) |
+| Tokenizer | **16k BPE** | Best compression (4.44x) |
+| N-gram | **2-gram** | Lowest perplexity (209) |
+| Markov | **Context-4** | Highest predictability (96.7%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 
+
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
@@ -527,7 +672,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs}
   institution = {Omneity Labs}
 }
@@ -543,7 +689,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
 
-*Report Date: 2025-12-28 05:14:46*
+*Report Date: 2026-01-03 07:17:54*
models/embeddings/monolingual/bi_128d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:caab5935973c28d2f5da72f9cf5b5188fa2c9edce13d7ebd5d742833957f536a
-size 1025242321
+oid sha256:5ba043557491d9bd866f4880ed63951e9cedd258275bb5ab99cc6a4588a48418
+size 1025087100
models/embeddings/monolingual/bi_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
-  "vocab_size": 1195
+  "vocab_size": 1046
 }
models/embeddings/monolingual/bi_32d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e416f9f9b0b23b2d2883958880cacc4980aa5e0694b6ccedc1110f52e55bce2b
-size 256324561
+oid sha256:77f00c5cf1334339a3726d39a681bacc6c8c5b7d226141cb5bd2ec7d3d93731c
+size 256283772
models/embeddings/monolingual/bi_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
-  "vocab_size": 1195
+  "vocab_size": 1046
 }
models/embeddings/monolingual/bi_64d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90ad9fe1ab3aad85875dd7c7e530975798cadf8deef91de6e32dd2d5a0f21735
-size 512630481
+oid sha256:64be2d287f8ece0f9696fe2372c048f2ea1c8dc331d051d642a50efe1563bc8b
+size 512551548
models/embeddings/monolingual/bi_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
-  "vocab_size": 1195
+  "vocab_size": 1046
 }
models/subword_markov/bi_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:939056bb57c6d57cef88397915123fe30ff6db2b6bd41444716de44e04e02910
-size 25094
+oid sha256:2e9372210a853979f0e4b07d4830a256cffa139a1fece177ff9c6038eed35df5
+size 22700
models/subword_markov/bi_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 381,
-  "total_transitions": 375436
+  "unique_contexts": 364,
+  "total_transitions": 308852
 }
models/subword_markov/bi_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6322d05a7e7c19ac152a2f6ef1c2a6c75e3259cb3e86f72c1b9d1766212caf00
-size 115200
+oid sha256:6ca6b0d701fee7db455da7e30950385f2806f14cec4f336ce82c4779e09621b4
+size 91544
models/subword_markov/bi_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 2929,
-  "total_transitions": 373835
+  "unique_contexts": 2361,
+  "total_transitions": 307409
 }
models/subword_markov/bi_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4ea1c08f391c5c276bbab53af1851c581076fa215e1ba890bc059391e01ef855
-size 346792
+oid sha256:68a8c133f845c55867a0fe53a9d2c945b085e56ce9e1e895558840e0a31ab900
+size 279694
models/subword_markov/bi_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 16165,
-  "total_transitions": 372234
+  "unique_contexts": 12016,
+  "total_transitions": 305966
 }
models/subword_markov/bi_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d4e91d074a40f87ad5464f595f8b4a666e20ee3e50b49909f00856a81624693d
-size 770026
+oid sha256:6e769ec2bcec378d6b983577c7020c20977d3c068c15135f0bc9e18b6c7e264f
+size 620331
models/subword_markov/bi_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 49589,
-  "total_transitions": 370633
+  "unique_contexts": 38018,
+  "total_transitions": 304523
 }
models/subword_ngram/bi_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fe618ebafef2b68121e2f9be785bcaad54d3ef638542480c5567752c647086fc
-size 16793
+oid sha256:4527bcf666ffd2dd1c578cfc5840d02a71199e369b0c8a627659726d39e176f6
+size 14130
models/subword_ngram/bi_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "bi",
-  "unique_ngrams": 1233,
-  "total_ngrams": 375436
+  "unique_ngrams": 983,
+  "total_ngrams": 308852
 }
models/subword_ngram/bi_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e39f7c0cae8ddad16ffa9c4e7a5b2d47bd3d3d8c1fe65906a276c905a9e65f52
-size 83670
+oid sha256:b80fdffb1d982a5003ad057c3864a9f483b951ec85c6662f9c20d596201a26ff
+size 63627
models/subword_ngram/bi_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "bi",
-  "unique_ngrams": 7760,
-  "total_ngrams": 373835
+  "unique_ngrams": 5848,
+  "total_ngrams": 307409
 }
models/subword_ngram/bi_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:17352c622203d2149e4ce41acb530a75712c65ae857e09d52dc983df854b7d50
-size 276251
+oid sha256:c4ebfe1944f1ab6833eb266e7debe2291ba5a0381a209e1ff6ac58781421a2c2
+size 221014
models/subword_ngram/bi_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "bi",
-  "unique_ngrams": 23770,
-  "total_ngrams": 372234
+  "unique_ngrams": 19225,
+  "total_ngrams": 305966
 }
models/tokenizer/bi_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2acc0817a4fe6cae5cc586d4dd39f982bca4830e346325d6cb762281d84b105b
-size 505184
+oid sha256:cb93c36553e5bad9dc7098178cff3fc9a58ec9892899f622abc48bd993de075f
+size 506618
models/tokenizer/bi_tokenizer_16k.vocab CHANGED
(diff too large to render; see the raw diff)
models/tokenizer/bi_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5e03b02df521285dacf413c63825ccbb0b39cfd1e07e4eef79fb3f0a904fbbb3
-size 369743
+oid sha256:6a4efeb4ad5e3a8b678a5c79bd0331f01122c9bb189bbfd294c1172390d5a05c
+size 370520
models/tokenizer/bi_tokenizer_8k.vocab CHANGED
(diff too large to render; see the raw diff)
models/vocabulary/bi_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ea974cb6d740ce7a68a58cd7c62ff6fb4a801623a68be726034ab99cd8cec698
-size 58570
+oid sha256:87f70e9b39cbf89b9a192034747f166c231ea9bf458bb2112e79e97ae15e247a
+size 50916
models/vocabulary/bi_vocabulary_metadata.json CHANGED
@@ -1,15 +1,16 @@
 {
   "language": "bi",
-  "vocabulary_size": 3655,
+  "vocabulary_size": 3117,
+  "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.1527339310519005,
+    "type_token_ratio": 0.15550018456995202,
     "coverage": {
-      "top_100": 0.5463442353832555,
-      "top_1000": 0.7844109104684935,
-      "top_5000": 0.926190175527213
+      "top_100": 0.5605020302694721,
+      "top_1000": 0.7978589885566629,
+      "top_5000": 0.9367847914359543
     },
-    "hapax_count": 6021,
-    "hapax_ratio": 0.6222612649855312,
-    "total_documents": 1601
+    "hapax_count": 5308,
+    "hapax_ratio": 0.6300296735905044,
+    "total_documents": 1443
   }
 }
models/word_markov/bi_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9a608e9becc7f5d7c66e918204585bc6f42d121e71342fc07761cbdd3d6e91c8
-size 280800
+oid sha256:5bcf4783672108d39279e5b6f88455605e79e8765b7ffe3f48acaac9ccda1c56
+size 230383
models/word_markov/bi_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 9727,
-  "total_transitions": 78262
+  "unique_contexts": 8338,
+  "total_transitions": 52737
 }
models/word_markov/bi_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c7bde5b341c93d0a47f391cffe0dd3ea52489ebc29b6ce3d7e8f4b769e4ca41f
-size 554076
+oid sha256:c04695f69e6a2692e750e9bff816affd14058dc6384bd028ad4dea65463972ae
+size 440900
models/word_markov/bi_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 32184,
-  "total_transitions": 76661
+  "unique_contexts": 24957,
+  "total_transitions": 51294
 }
models/word_markov/bi_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8ef088c98e551ec44ba505c42d24cea3dbd22e67a6c29605173bf026569dae20
-size 760704
+oid sha256:1acfd63f57a3dbfe43f8279cf4b22712ba4b1311bd026f0bab886fd886ead857
+size 595208
models/word_markov/bi_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 46928,
-  "total_transitions": 75061
+  "unique_contexts": 34724,
+  "total_transitions": 49851
 }
models/word_markov/bi_markov_ctx4_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb81655fd8ff6cd36bee3c1dad6d43a0abbc7b68e8224d93e29b6357fc239525
-size 872672
+oid sha256:19edc02c0845063d5a953f889ccfbfd26a2b1ec828f00bb70ee81b9c16697c22
+size 659846
models/word_markov/bi_markov_ctx4_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 53803,
-  "total_transitions": 73462
+  "unique_contexts": 38736,
+  "total_transitions": 48408
 }
models/word_ngram/bi_2gram_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8bfe8c99bffba619da7ea499b717c3bf2cdb7a25364a768303e4d121191d6311
-size 22529
+oid sha256:f3910c59ef8e48f0aeff2cd337c0f4e24ef16de9c532784d7421359489476c53
+size 15823
models/word_ngram/bi_2gram_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "word",
   "language": "bi",
-  "unique_ngrams": 1634,
-  "total_ngrams": 78262
+  "unique_ngrams": 1049,
+  "total_ngrams": 52737
 }
models/word_ngram/bi_3gram_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8b213bdfbdd16ae2d5503bf1464193c03508683c778da7372faaa9faaf04dea2
-size 34234
+oid sha256:d152b84b7da74969f8c046b4f1bba6ef10e369d30da93155c74ac1d1709d09c8
+size 22298
models/word_ngram/bi_3gram_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "word",
   "language": "bi",
-  "unique_ngrams": 2346,
-  "total_ngrams": 76661
+  "unique_ngrams": 1408,
+  "total_ngrams": 51294
 }
models/word_ngram/bi_4gram_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:536b37aa666b8ae8ed8dbf9e56e71d2a9ce630cafc6f1d687c6f1e72355b0ac7
-size 61965
+oid sha256:5f230fb5fe00313288af8a9d573f021c638e7aeeae11122fd8d9d1bc87cd5351
+size 39653
models/word_ngram/bi_4gram_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "word",
   "language": "bi",
-  "unique_ngrams": 4093,
-  "total_ngrams": 75061
+  "unique_ngrams": 2457,
+  "total_ngrams": 49851
 }
visualizations/embedding_isotropy.png CHANGED
visualizations/embedding_norms.png CHANGED
visualizations/embedding_similarity.png CHANGED
  Git LFS: SHA256 1abb3d3e56f8605218af31f2a2f649bebc6193436a569b3b3c9633233a0b47a1 → 8cafc862ea4e3fc3fc997eb871b4bac8d7a9be24f83c03a2ba8444f611629ef6 (pointer size 131 B, remote file 160 kB)
visualizations/markov_branching.png CHANGED
visualizations/markov_contexts.png CHANGED
visualizations/markov_entropy.png CHANGED
visualizations/model_sizes.png CHANGED
visualizations/ngram_coverage.png CHANGED
  Git LFS: SHA256 f4c353ffb68ec064ef4bf9839d7167db969c6eb29e3c2abb21d978202196188c (pointer size 131 B, remote file 100 kB)