omarkamali committed
Commit a13b387 · verified · 1 parent: 3b32958

Upload all models and assets for ary (20251001)

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +305 -138
  3. models/embeddings/monolingual/ary_128d.bin +2 -2
  4. models/embeddings/monolingual/ary_128d_metadata.json +5 -3
  5. models/embeddings/monolingual/ary_32d.bin +2 -2
  6. models/embeddings/monolingual/ary_32d_metadata.json +5 -3
  7. models/embeddings/monolingual/ary_64d.bin +2 -2
  8. models/embeddings/monolingual/ary_64d_metadata.json +5 -3
  9. models/subword_markov/ary_markov_ctx1_subword.parquet +2 -2
  10. models/subword_markov/ary_markov_ctx1_subword_metadata.json +2 -2
  11. models/subword_markov/ary_markov_ctx2_subword.parquet +2 -2
  12. models/subword_markov/ary_markov_ctx2_subword_metadata.json +2 -2
  13. models/subword_markov/ary_markov_ctx3_subword.parquet +2 -2
  14. models/subword_markov/ary_markov_ctx3_subword_metadata.json +2 -2
  15. models/subword_markov/ary_markov_ctx4_subword.parquet +2 -2
  16. models/subword_markov/ary_markov_ctx4_subword_metadata.json +2 -2
  17. models/subword_ngram/ary_2gram_subword.parquet +2 -2
  18. models/subword_ngram/ary_2gram_subword_metadata.json +2 -2
  19. models/subword_ngram/ary_3gram_subword.parquet +2 -2
  20. models/subword_ngram/ary_3gram_subword_metadata.json +2 -2
  21. models/subword_ngram/ary_4gram_subword.parquet +2 -2
  22. models/subword_ngram/ary_4gram_subword_metadata.json +2 -2
  23. models/tokenizer/ary_tokenizer_16k.model +2 -2
  24. models/tokenizer/ary_tokenizer_16k.vocab +0 -0
  25. models/tokenizer/ary_tokenizer_32k.model +2 -2
  26. models/tokenizer/ary_tokenizer_32k.vocab +0 -0
  27. models/tokenizer/ary_tokenizer_64k.model +2 -2
  28. models/tokenizer/ary_tokenizer_64k.vocab +0 -0
  29. models/tokenizer/ary_tokenizer_8k.model +2 -2
  30. models/tokenizer/ary_tokenizer_8k.vocab +0 -0
  31. models/vocabulary/ary_vocabulary.parquet +2 -2
  32. models/vocabulary/ary_vocabulary_metadata.json +10 -9
  33. models/word_markov/ary_markov_ctx1_word.parquet +2 -2
  34. models/word_markov/ary_markov_ctx1_word_metadata.json +2 -2
  35. models/word_markov/ary_markov_ctx2_word.parquet +2 -2
  36. models/word_markov/ary_markov_ctx2_word_metadata.json +2 -2
  37. models/word_markov/ary_markov_ctx3_word.parquet +2 -2
  38. models/word_markov/ary_markov_ctx3_word_metadata.json +2 -2
  39. models/word_markov/ary_markov_ctx4_word.parquet +2 -2
  40. models/word_markov/ary_markov_ctx4_word_metadata.json +2 -2
  41. models/word_ngram/ary_2gram_word.parquet +2 -2
  42. models/word_ngram/ary_2gram_word_metadata.json +2 -2
  43. models/word_ngram/ary_3gram_word.parquet +2 -2
  44. models/word_ngram/ary_3gram_word_metadata.json +2 -2
  45. models/word_ngram/ary_4gram_word.parquet +2 -2
  46. models/word_ngram/ary_4gram_word_metadata.json +2 -2
  47. visualizations/embedding_isotropy.png +0 -0
  48. visualizations/embedding_norms.png +0 -0
  49. visualizations/embedding_similarity.png +2 -2
  50. visualizations/markov_branching.png +0 -0
.gitattributes CHANGED
@@ -38,3 +38,4 @@ visualizations/performance_dashboard.png filter=lfs diff=lfs merge=lfs -text
38
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
39
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
38
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
39
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
41
+ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -23,14 +23,14 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 3.683
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.8264
30
  - name: vocabulary_size
31
  type: vocab
32
- value: 81712
33
- generated: 2025-12-27
34
  ---
35
 
36
  # Moroccan Arabic - Wikilangs Models
@@ -44,12 +44,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
- - N-gram models (2, 3, 4-gram)
48
- - Markov chains (context of 1, 2, 3 and 4)
49
  - Subword N-gram and Markov chains
50
- - Embeddings in various sizes and dimensions
51
  - Language Vocabulary
52
  - Language Statistics
 
53
  ![Performance Dashboard](visualizations/performance_dashboard.png)
54
 
55
  ### Analysis and Evaluation
@@ -59,7 +60,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
59
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
60
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
61
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
62
- - [6. Summary & Recommendations](#6-summary--recommendations)
 
63
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
64
  - [Visualizations Index](#visualizations-index)
65
 
@@ -68,56 +70,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
68
 
69
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
70
 
 
 
 
 
 
 
71
  ### Results
72
 
73
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
74
  |------------|-------------|---------------|----------|--------------|
75
- | **8k** | 3.134x | 3.09 | 0.0472% | 379,309 |
76
- | **16k** | 3.346x | 3.30 | 0.0504% | 355,311 |
77
- | **32k** | 3.535x | 3.49 | 0.0532% | 336,296 |
78
- | **64k** | 3.683x 🏆 | 3.64 | 0.0555% | 322,761 |
79
 
80
  ### Tokenization Examples
81
 
82
  Below are sample sentences tokenized with each vocabulary size:
83
 
84
- **Sample 1:** `معمر زين العاشقين قاري و حافظ د لقرآن.
85
-
86
- مصادر
87
-
88
- تصنيف:زيادة 1954
89
- تصنيف:ناس حيين...`
90
 
91
  | Vocab | Tokens | Count |
92
  |-------|--------|-------|
93
- | 8k | `▁مع مر ▁زين ▁الع اش قين ▁ق اري ▁و ▁ح ... (+21 more)` | 31 |
94
- | 16k | `▁مع مر ▁زين ▁الع اش قين ▁ق اري ▁و ▁حافظ ... (+20 more)` | 30 |
95
- | 32k | `▁معمر ▁زين ▁الع اش قين ▁قاري ▁و ▁حافظ ▁د ▁لقرآن ... (+18 more)` | 28 |
96
- | 64k | `▁معمر ▁زين ▁العاش قين ▁قاري ▁و ▁حافظ ▁د ▁لقرآن . ... (+17 more)` | 27 |
97
 
98
- **Sample 2:** `ضريب لمؤخرة (ب ) فبي دي إس إم عملية جنسية كاتخدّم كا عقاب ولا ل لإتارة لجنسية ما...`
99
 
100
  | Vocab | Tokens | Count |
101
  |-------|--------|-------|
102
- | 8k | `▁ض ريب ▁لمؤ خرة ▁( ب ▁) ▁ف بي ▁دي ... (+40 more)` | 50 |
103
- | 16k | `▁ض ريب ▁لمؤ خرة ▁( ب ▁) ▁ف بي ▁دي ... (+36 more)` | 46 |
104
- | 32k | `▁ض ريب ▁لمؤ خرة ▁( ب ▁) ▁ف بي ▁دي ... (+32 more)` | 42 |
105
- | 64k | `▁ضريب ▁لمؤخرة ▁( ب ▁) ▁ف بي ▁دي ▁إس ▁إم ... (+28 more)` | 38 |
106
 
107
- **Sample 3:** `ضباب هوّا إيروصول كيتشاف ب لْعين، مكوّن من قطرات صغار ديال لما ؤلا كريستالات دي...`
108
 
109
  | Vocab | Tokens | Count |
110
  |-------|--------|-------|
111
- | 8k | `▁ض باب ▁هوّا ▁إير وص ول ▁كيت شاف ▁ب ▁لْ ... (+34 more)` | 44 |
112
- | 16k | `▁ض باب ▁هوّا ▁إير وص ول ▁كيت شاف ▁ب ▁لْ ... (+31 more)` | 41 |
113
- | 32k | `▁ض باب ▁هوّا ▁إير وصول ▁كيتشاف ▁ب ▁لْ عين ، ... (+27 more)` | 37 |
114
- | 64k | `▁ض باب ▁هوّا ▁إير وصول ▁كيتشاف ▁ب ▁لْ عين ، ... (+24 more)` | 34 |
115
 
116
 
117
  ### Key Findings
118
 
119
- - **Best Compression:** 64k achieves 3.683x compression
120
- - **Lowest UNK Rate:** 8k with 0.0472% unknown tokens
121
  - **Trade-off:** Larger vocabularies improve compression but increase model size
122
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
123
 
@@ -126,57 +129,89 @@ Below are sample sentences tokenized with each vocabulary size:
126
 
127
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
128
 
 
 
129
  ![N-gram Coverage](visualizations/ngram_coverage.png)
130
 
131
  ### Results
132
 
133
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
134
- |--------|------------|---------|----------------|------------------|-------------------|
135
- | **2-gram** | 7,187 🏆 | 12.81 | 56,749 | 24.4% | 53.2% |
136
- | **2-gram** | 486 🏆 | 8.93 | 6,227 | 54.9% | 95.4% |
137
- | **3-gram** | 8,812 | 13.11 | 76,888 | 21.3% | 52.8% |
138
- | **3-gram** | 4,295 | 12.07 | 51,256 | 22.1% | 58.7% |
139
- | **4-gram** | 12,168 | 13.57 | 124,859 | 20.1% | 50.4% |
140
- | **4-gram** | 22,008 | 14.43 | 260,844 | 12.0% | 35.5% |
141
 
142
  ### Top 5 N-grams by Size
143
 
144
- **2-grams:**
145
 
146
  | Rank | N-gram | Count |
147
  |------|--------|-------|
148
- | 1 | `تصنيف :` | 37,187 |
149
- | 2 | و` | 18,746 |
150
- | 3 | ّ` | 10,639 |
151
- | 4 | `) :` | 10,185 |
152
- | 5 | `مصادر تصنيف` | 10,087 |
153
 
154
- **3-grams:**
155
 
156
  | Rank | N-gram | Count |
157
  |------|--------|-------|
158
- | 1 | `مصادر تصنيف :` | 10,087 |
159
- | 2 | `تصنيف : مقالات` | 7,001 |
160
- | 3 | ّ اس` | 6,981 |
161
- | 4 | ّ ي` | 6,914 |
162
- | 5 | `: دوار ف` | 5,007 |
163
 
164
- **4-grams:**
165
 
166
  | Rank | N-gram | Count |
167
  |------|--------|-------|
168
- | 1 | `تصنيف : دوار ف` | 5,005 |
169
- | 2 | `نسبة ن ّ اس` | 4,061 |
170
- | 3 | `. مصادر تصنيف :` | 3,827 |
171
- | 4 | `تصنيف : مقالات زادهوم` | 3,506 |
172
- | 5 | `: مقالات زادهوم داريجابوت` | 3,506 |
173
 
174
 
175
  ### Key Findings
176
 
177
- - **Best Perplexity:** 2-gram with 486
178
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
179
- - **Coverage:** Top-1000 patterns cover ~35% of corpus
180
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
181
 
182
  ---
@@ -184,55 +219,86 @@ Below are sample sentences tokenized with each vocabulary size:
184
 
185
  ![Markov Entropy](visualizations/markov_entropy.png)
186
 
 
 
187
  ![Markov Branching](visualizations/markov_branching.png)
188
 
189
  ### Results
190
 
191
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
192
- |---------|-------------|------------|------------------|-----------------|----------------|
193
- | **1** | 0.7813 | 1.719 | 5.36 | 189,320 | 21.9% |
194
- | **1** | 1.1519 | 2.222 | 8.71 | 1,931 | 0.0% |
195
- | **2** | 0.2761 | 1.211 | 1.68 | 1,014,676 | 72.4% |
196
- | **2** | 0.9863 | 1.981 | 6.24 | 16,826 | 1.4% |
197
- | **3** | 0.0931 | 1.067 | 1.18 | 1,701,309 | 90.7% |
198
- | **3** | 0.8744 | 1.833 | 4.33 | 104,928 | 12.6% |
199
- | **4** | 0.0366 🏆 | 1.026 | 1.07 | 2,000,181 | 96.3% |
200
- | **4** | 0.6731 🏆 | 1.594 | 2.82 | 454,694 | 32.7% |
 
 
201
 
202
- ### Generated Text Samples
203
 
204
- Below are text samples generated from each Markov chain model:
205
 
206
  **Context Size 1:**
207
 
208
- 1. `. لخصوبة عند الجواج ف لكامبيانة د فلوسها من ݣوجارات ف لمغريب تصنيف : لقرن 20`
209
- 2. `، منهوم 816 , geerat j . ولادها بجوج فالإليادة ، عاود قاسها قبل منهوم 154`
210
- 3. `ف إقليم لخميسات تصنيف : سلطان شرعي . ناس د الكاسترد تصنيف : 29 مارس 1920`
211
 
212
  **Context Size 2:**
213
 
214
- 1. `تصنيف : مارس تصنيف : زيادة 1961 تصنيف : أفلام د 2005 . لمحطة التانية فيها 66`
215
- 2. `، و معتاقل سياسي روسي . كان خدا لجايزة د لأوسكار لأحسن فيلم قصير ( 4 )`
216
- 3. `ن ّ اس ل ّ ي قاريين فوق الليسي ( ليسي و جامعة ) : 12 ,`
217
 
218
  **Context Size 3:**
219
 
220
- 1. `مصادر تصنيف : پاناما تصنيف : عواصم ديال بلدان تصنيف : بانݣلاديش تصنيف : بزوليات د جنوب آسيا`
221
- 2. `تصنيف : مقالات فيها مصدر و 3000 بايت تصنيف : مقالات فيها مصدر و 3000 بايت تصنيف :`
222
- 3. `ن ّ اس ل ّ ي كتعتابر لوغة كيلتية ، ؤ ل ّ يسي . كروص كانت تتحيد`
223
 
224
  **Context Size 4:**
225
 
226
- 1. `تصنيف : دوار ف عمالة مكناس تصنيف : مقالات زادهوم داريجابوت تصنيف : ناس حيين تصنيف : زيادة 1987`
227
- 2. `نسبة ن ّ اس اللي خدامين ف د ّ ولة : 8 , 3 % نسبة ن ّ اس`
228
- 3. `. مصادر تصنيف : لوغات أمازيغية تصنيف : مقالات فيها مصدر و 3000 بايت تصنيف : مقالات زادهوم داريجابوت`
229
 
230
 
231
  ### Key Findings
232
 
233
- - **Best Predictability:** Context-4 with 96.3% predictability
234
  - **Branching Factor:** Decreases with context size (more deterministic)
235
- - **Memory Trade-off:** Larger contexts require more storage (454,694 contexts)
236
  - **Recommendation:** Context-3 or Context-4 for text generation
237
 
238
  ---
@@ -248,64 +314,64 @@ Below are text samples generated from each Markov chain model:
248
 
249
  | Metric | Value |
250
  |--------|-------|
251
- | Vocabulary Size | 81,712 |
252
- | Total Tokens | 2,308,873 |
253
- | Mean Frequency | 28.26 |
254
  | Median Frequency | 4 |
255
- | Frequency Std Dev | 559.90 |
256
 
257
  ### Most Common Words
258
 
259
  | Rank | Word | Frequency |
260
  |------|------|-----------|
261
- | 1 | ف | 84,463 |
262
- | 2 | د | 69,201 |
263
- | 3 | و | 61,463 |
264
- | 4 | تصنيف | 37,231 |
265
- | 5 | ل | 34,076 |
266
- | 6 | ديال | 32,761 |
267
- | 7 | من | 29,612 |
268
- | 8 | على | 19,717 |
269
- | 9 | لي | 18,627 |
270
- | 10 | ب | 18,189 |
271
 
272
  ### Least Common Words (from vocabulary)
273
 
274
  | Rank | Word | Frequency |
275
  |------|------|-----------|
276
- | 1 | بيتسي | 2 |
277
- | 2 | وصانعي | 2 |
278
- | 3 | وأهميتها | 2 |
279
- | 4 | بورديو | 2 |
280
- | 5 | بلومر | 2 |
281
- | 6 | مقترحة | 2 |
282
- | 7 | anchor | 2 |
283
- | 8 | الرسميةاللي | 2 |
284
- | 9 | بعصبة | 2 |
285
- | 10 | ماڭي | 2 |
286
 
287
  ### Zipf's Law Analysis
288
 
289
  | Metric | Value |
290
  |--------|-------|
291
- | Zipf Coefficient | 1.0380 |
292
- | R² (Goodness of Fit) | 0.999162 |
293
  | Adherence Quality | **excellent** |
294
 
295
  ### Coverage Analysis
296
 
297
  | Top N Words | Coverage |
298
  |-------------|----------|
299
- | Top 100 | 39.3% |
300
- | Top 1,000 | 63.8% |
301
- | Top 5,000 | 78.6% |
302
- | Top 10,000 | 84.8% |
303
 
304
  ### Key Findings
305
 
306
- - **Zipf Compliance:** R²=0.9992 indicates excellent adherence to Zipf's law
307
- - **High Frequency Dominance:** Top 100 words cover 39.3% of corpus
308
- - **Long Tail:** 71,712 words needed for remaining 15.2% coverage
309
 
310
  ---
311
  ## 5. Word Embeddings Evaluation
@@ -318,24 +384,122 @@ Below are text samples generated from each Markov chain model:
318
 
319
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
320
 
321
- ### Model Comparison
322
 
323
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
324
- |-------|------------|-----------|----------|----------|----------|
325
- | **mono_32d** | 37,528 | 32 | 4.010 | 1.183 | 0.8264 🏆 |
326
- | **mono_64d** | 37,528 | 64 | 4.579 | 1.040 | 0.8183 |
327
- | **mono_128d** | 37,528 | 128 | 5.112 | 0.875 | 0.7212 |
328
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 
 
 
 
 
 
329
 
330
  ### Key Findings
331
 
332
- - **Best Isotropy:** mono_32d with 0.8264 (more uniform distribution)
333
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
334
- - **Vocabulary Coverage:** All models cover 37,528 words
335
- - **Recommendation:** 100d for balanced semantic capture and efficiency
336
 
337
  ---
338
- ## 6. Summary & Recommendations
339
 
340
  ![Performance Dashboard](visualizations/performance_dashboard.png)
341
 
@@ -343,11 +507,12 @@ Below are text samples generated from each Markov chain model:
343
 
344
  | Component | Recommended | Rationale |
345
  |-----------|-------------|-----------|
346
- | Tokenizer | **32k BPE** | Best compression (3.68x) with low UNK rate |
347
- | N-gram | **5-gram** | Lowest perplexity (486) |
348
- | Markov | **Context-4** | Highest predictability (96.3%) |
349
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
350
 
 
351
  ---
352
  ## Appendix: Metrics Glossary & Interpretation Guide
353
 
@@ -537,7 +702,8 @@ If you use these models in your research, please cite:
537
  author = {Kamali, Omar},
538
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
539
  year = {2025},
540
- publisher = {HuggingFace},
 
541
  url = {https://huggingface.co/wikilangs}
542
  institution = {Omneity Labs}
543
  }
@@ -553,7 +719,8 @@ MIT License - Free for academic and commercial use.
553
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
554
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
555
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 
556
  ---
557
  *Generated by Wikilangs Models Pipeline*
558
 
559
- *Report Date: 2025-12-27 04:26:59*
 
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
+ value: 4.180
27
  - name: best_isotropy
28
  type: isotropy
29
+ value: 0.8384
30
  - name: vocabulary_size
31
  type: vocab
32
+ value: 0
33
+ generated: 2026-01-03
34
  ---
35
 
36
  # Moroccan Arabic - Wikilangs Models
 
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
+ - N-gram models (2, 3, 4, 5-gram)
48
+ - Markov chains (context of 1, 2, 3, 4 and 5)
49
  - Subword N-gram and Markov chains
50
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
51
  - Language Vocabulary
52
  - Language Statistics
53
+
54
  ![Performance Dashboard](visualizations/performance_dashboard.png)
55
 
56
  ### Analysis and Evaluation
 
60
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
61
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
62
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
63
+ - [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
64
+ - [7. Summary & Recommendations](#7-summary--recommendations)
65
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
66
  - [Visualizations Index](#visualizations-index)
67
 
 
70
 
71
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
72
 
73
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
74
+
75
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
76
+
77
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
78
+
79
  ### Results
80
 
81
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
82
  |------------|-------------|---------------|----------|--------------|
83
+ | **8k** | 3.512x | 3.52 | 0.0922% | 278,716 |
84
+ | **16k** | 3.778x | 3.78 | 0.0992% | 259,059 |
85
+ | **32k** | 4.002x | 4.01 | 0.1051% | 244,561 |
86
+ | **64k** | 4.180x 🏆 | 4.18 | 0.1098% | 234,163 |
87
 
88
  ### Tokenization Examples
89
 
90
  Below are sample sentences tokenized with each vocabulary size:
91
 
92
+ **Sample 1:** `مصادر شوف تا داريجة تاريخ لكتابة ب داريجة ليستة د لمكتوبات ب داريجة ليستة د لكتو...`
 
 
 
 
 
93
 
94
  | Vocab | Tokens | Count |
95
  |-------|--------|-------|
96
+ | 8k | `▁مصادر ▁شوف ▁تا ▁داريجة ▁تاريخ ▁لكتابة ▁ب ▁داريجة ▁ليستة ▁د ... (+22 more)` | 32 |
97
+ | 16k | `▁مصادر ▁شوف ▁تا ▁داريجة ▁تاريخ ▁لكتابة ▁ب ▁داريجة ▁ليستة ▁د ... (+20 more)` | 30 |
98
+ | 32k | `▁مصادر ▁شوف ▁تا ▁داريجة ▁تاريخ ▁لكتابة ▁ب ▁داريجة ▁ليستة ▁د ... (+20 more)` | 30 |
99
+ | 64k | `▁مصادر ▁شوف ▁تا ▁داريجة ▁تاريخ ▁لكتابة ▁ب ▁داريجة ▁ليستة ▁د ... (+20 more)` | 30 |
100
 
101
+ **Sample 2:** `أمين رباطي (مزيود ف يوليوز هو كوايري مغريبي. مصادر مغريبي د رجال حيين`
102
 
103
  | Vocab | Tokens | Count |
104
  |-------|--------|-------|
105
+ | 8k | `▁أمين ▁رباط ي ▁( مزيود ▁ف ▁يوليوز ▁هو ▁كوايري ▁مغريبي ... (+6 more)` | 16 |
106
+ | 16k | `▁أمين ▁رباط ي ▁( مزيود ▁ف ▁يوليوز ▁هو ▁كوايري ▁مغريبي ... (+6 more)` | 16 |
107
+ | 32k | `▁أمين ▁رباطي ▁( مزيود ▁ف ▁يوليوز ▁هو ▁كوايري ▁مغريبي . ... (+5 more)` | 15 |
108
+ | 64k | `▁أمين ▁رباطي ▁( مزيود ▁ف ▁يوليوز ▁هو ▁كوايري ▁مغريبي . ... (+5 more)` | 15 |
109
 
110
+ **Sample 3:** `هادي صفحة د التوضيح، كلمة دوري يمكن يكونو عندها هاد لمعاني: طابلو دوري دوري أبطا...`
111
 
112
  | Vocab | Tokens | Count |
113
  |-------|--------|-------|
114
+ | 8k | `▁هادي ▁صفحة ▁د ▁التوضيح ، ▁كلمة ▁دوري ▁يمكن ▁يكونو ▁عندها ... (+10 more)` | 20 |
115
+ | 16k | `▁هادي ▁صفحة ▁د ▁التوضيح ، ▁كلمة ▁دوري ▁يمكن ▁يكونو ▁عندها ... (+9 more)` | 19 |
116
+ | 32k | `▁هادي ▁صفحة ▁د ▁التوضيح ، ▁كلمة ▁دوري ▁يمكن ▁يكونو ▁عندها ... (+9 more)` | 19 |
117
+ | 64k | `▁هادي ▁صفحة ▁د ▁التوضيح ، ▁كلمة ▁دوري ▁يمكن ▁يكونو ▁عندها ... (+9 more)` | 19 |
118
 
119
 
120
  ### Key Findings
121
 
122
+ - **Best Compression:** 64k achieves 4.180x compression
123
+ - **Lowest UNK Rate:** 8k with 0.0922% unknown tokens
124
  - **Trade-off:** Larger vocabularies improve compression but increase model size
125
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
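
The tokenizers ship as standalone `.model` files; the `▁` word-boundary marker in the samples above suggests they are SentencePiece models. Below is a minimal sketch of loading the 32k variant under that assumption, using the repository's file layout; the per-row token counts in the tables above can be reproduced the same way.

```python
# Hedged sketch: assumes models/tokenizer/ary_tokenizer_32k.model is a
# SentencePiece model (suggested by the "▁" pieces in the examples above).
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/ary_tokenizer_32k.model")

text = "هادي صفحة د التوضيح"            # any Moroccan Arabic sentence
pieces = sp.encode(text, out_type=str)   # subword strings, e.g. ["▁هادي", "▁صفحة", ...]
ids = sp.encode(text, out_type=int)      # integer ids for downstream models

print(len(pieces), pieces)
print(sp.decode(ids))                    # decoding round-trips to the original text
```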
126
 
 
129
 
130
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
131
 
132
+ ![N-gram Unique](visualizations/ngram_unique.png)
133
+
134
  ![N-gram Coverage](visualizations/ngram_coverage.png)
135
 
136
  ### Results
137
 
138
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
139
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
140
+ | **2-gram** | Word | 6,129 | 12.58 | 35,218 | 24.5% | 53.4% |
141
+ | **2-gram** | Subword | 415 🏆 | 8.70 | 5,585 | 58.6% | 96.6% |
142
+ | **3-gram** | Word | 4,994 | 12.29 | 39,702 | 28.5% | 58.9% |
143
+ | **3-gram** | Subword | 3,624 | 11.82 | 41,944 | 23.5% | 61.8% |
144
+ | **4-gram** | Word | 6,987 | 12.77 | 63,706 | 28.4% | 55.4% |
145
+ | **4-gram** | Subword | 18,675 | 14.19 | 204,568 | 12.3% | 37.2% |
146
 
147
  ### Top 5 N-grams by Size
148
 
149
+ **2-grams (Word):**
150
+
151
+ | Rank | N-gram | Count |
152
+ |------|--------|-------|
153
+ | 1 | `واصلة ل` | 8,540 |
154
+ | 2 | `نسبة د` | 7,170 |
155
+ | 3 | `ف لمغريب` | 6,247 |
156
+ | 4 | `ف إقليم` | 6,016 |
157
+ | 5 | `ف نسبة` | 4,265 |
158
+
159
+ **3-grams (Word):**
160
 
161
  | Rank | N-gram | Count |
162
  |------|--------|-------|
163
+ | 1 | نسبة د` | 4,264 |
164
+ | 2 | `فيها مصدر و` | 3,236 |
165
+ | 3 | نسبة د` | 2,894 |
166
+ | 4 | `مصدر و بايت` | 2,856 |
167
+ | 5 | `اللي خدامين ف` | 2,759 |
168
 
169
+ **4-grams (Word):**
170
 
171
  | Rank | N-gram | Count |
172
  |------|--------|-------|
173
+ | 1 | `فيها مصدر و بايت` | 2,856 |
174
+ | 2 | `نسبة نّاس اللي خدامين` | 2,705 |
175
+ | 3 | `نّاس اللي خدامين ف` | 2,593 |
176
+ | 4 | `على حساب لإحصاء الرسمي` | 2,501 |
177
+ | 5 | `لعاداد د سّكان ديالو` | 2,500 |
178
 
179
+ **2-grams (Subword):**
180
 
181
  | Rank | N-gram | Count |
182
  |------|--------|-------|
183
+ | 1 | ل` | 293,281 |
184
+ | 2 | `_ ل` | 265,615 |
185
+ | 3 | _` | 209,034 |
186
+ | 4 | `_ ا` | 180,710 |
187
+ | 5 | `_ م` | 141,509 |
188
+
189
+ **3-grams (Subword):**
190
+
191
+ | Rank | N-gram | Count |
192
+ |------|--------|-------|
193
+ | 1 | `_ ا ل` | 176,897 |
194
+ | 2 | `_ ف _` | 80,240 |
195
+ | 3 | `_ د _` | 57,749 |
196
+ | 4 | `_ و _` | 57,033 |
197
+ | 5 | `ا ت _` | 56,985 |
198
+
199
+ **4-grams (Subword):**
200
+
201
+ | Rank | N-gram | Count |
202
+ |------|--------|-------|
203
+ | 1 | `_ د ي ا` | 43,807 |
204
+ | 2 | `د ي ا ل` | 43,597 |
205
+ | 3 | `ي ا ل _` | 30,362 |
206
+ | 4 | `د _ ا ل` | 29,177 |
207
+ | 5 | `_ م ن _` | 25,265 |
208
 
209
 
210
  ### Key Findings
211
 
212
+ - **Best Perplexity:** 2-gram (subword) with 415
213
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
214
+ - **Coverage:** Top-1000 patterns cover ~37% of corpus
215
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
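
As a rough illustration of how the coverage figures above can be recomputed from the released parquet tables, here is a hedged sketch using pandas. The column names (`ngram`, `count`) are assumptions about the schema, not documented, so inspect the file before relying on them.

```python
# Hedged sketch: recompute top-k coverage from a released n-gram table.
# Column names ("ngram", "count") are assumed; check the parquet schema first.
import pandas as pd

df = pd.read_parquet("models/word_ngram/ary_2gram_word.parquet")
df = df.sort_values("count", ascending=False)

total = df["count"].sum()
for k in (100, 1000):
    print(f"top-{k} coverage: {df['count'].head(k).sum() / total:.1%}")
```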
216
 
217
  ---
 
219
 
220
  ![Markov Entropy](visualizations/markov_entropy.png)
221
 
222
+ ![Markov Contexts](visualizations/markov_contexts.png)
223
+
224
  ![Markov Branching](visualizations/markov_branching.png)
225
 
226
  ### Results
227
 
228
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
229
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
230
+ | **1** | Word | 0.8416 | 1.792 | 5.23 | 162,378 | 15.8% |
231
+ | **1** | Subword | 1.1133 | 2.163 | 8.05 | 2,149 | 0.0% |
232
+ | **2** | Word | 0.2252 | 1.169 | 1.49 | 849,251 | 77.5% |
233
+ | **2** | Subword | 0.8048 | 1.747 | 4.99 | 17,291 | 19.5% |
234
+ | **3** | Word | 0.0625 | 1.044 | 1.10 | 1,262,316 | 93.8% |
235
+ | **3** | Subword | 0.8001 | 1.741 | 4.09 | 86,361 | 20.0% |
236
+ | **4** | Word | 0.0215 🏆 | 1.015 | 1.04 | 1,391,141 | 97.9% |
237
+ | **4** | Subword | 0.6559 | 1.576 | 2.83 | 352,807 | 34.4% |
238
+
239
+ ### Generated Text Samples (Word-based)
240
 
241
+ Below are text samples generated from each word-based Markov chain model:
242
 
243
+ **Context Size 1:**
244
+
245
+ 1. `ف لجولة اللولة ديالو ماسك ب الريحة فاميلة ديال لوغات الأمازيغية هويتنا الوطنية بحال بنادم بشكل`
246
+ 2. `د الشوماج واصلة ل كانت وحدة من جيهت بّاه إيرول ماسك أسس جمعية الشرف هو اللعاب`
247
+ 3. `و بايت زادهوم داريجابوت 19 فاش كانو كايطراو ف نسبة لبطالة نّاس نّشيطين لّي يقدرو يخدمو`
248
+
249
+ **Context Size 2:**
250
+
251
+ 1. `واصلة ل 5 و عدد لفاميلات تزاد ب 12 2 لمشاركات ف كأس افريقيا في البطولة ديال`
252
+ 2. `نسبة د الناس النشيطين ف دوار أمرس واصلة ل 96 3 و نسبة د الجواج ف امزرو`
253
+ 3. `ف لمغريب ف إقليم تارودانت جهة سوس ماسة ف لمغريب ف إقليم وارزازات جهة درعا تافيلالت ساكنين`
254
+
255
+ **Context Size 3:**
256
+
257
+ 1. `ف نسبة د الناس النشيطين ف دوار تامكونسي واصلة ل 49 7 و لموعدّال د لعمر عند الجواج`
258
+ 2. `فيها مصدر و علاين بايت د الصويرة`
259
+ 3. `و نسبة د الشوماج واصلة ل 14 7 نوطات مصادر ف لمغريب ف إقليم لحوز زادهوم داريجابوت`
260
+
261
+ **Context Size 4:**
262
+
263
+ 1. `نسبة نّاس اللي خدامين ف دّولة ولا لبيطاليين اللي سبق ليهوم مصادر طنجة تطوان الحسيمة قروية ف إقليم لح...`
264
+ 2. `نّاس اللي خدامين ف دّولة ولا لبيطاليين اللي سبق ليهوم خدمو 6 7 نسبة نّاس اللي خدامين ف لپريڤي`
265
+ 3. `على حساب لإحصاء الرسمي د عام إحصائيات إحصائيات عامة عدد السكان ديال تمزاوروت تزاد ب 18 6 و عدد`
266
+
267
+
268
+ ### Generated Text Samples (Subword-based)
269
+
270
+ Below are text samples generated from each subword-based Markov chain model:
271
 
272
  **Context Size 1:**
273
 
274
+ 1. `_"أكابي_مناتحسن_`
275
+ 2. `ايلممرسية_اهة،_ل`
276
+ 3. `لم"ليعن_لنف_لميم`
277
 
278
  **Context Size 2:**
279
 
280
+ 1. `ال_لليزنيز،_إسلة_`
281
+ 2. `_لعام_نخب_ور_تقرو`
282
+ 3. `ة_سويسها_كولا_بحو`
283
 
284
  **Context Size 3:**
285
 
286
+ 1. `_اللات،_سورين._لڭر`
287
+ 2. `_ف_نسبة_شبه_ولكرور`
288
+ 3. `_د_لعالمغريب._هوّ_و`
289
 
290
  **Context Size 4:**
291
 
292
+ 1. `_ديال_على_حساب_لإحص`
293
+ 2. `ديالو،_(a)_–_bringe`
294
+ 3. `يال_التاني_توفى_عوا`
295
 
296
 
297
  ### Key Findings
298
 
299
+ - **Best Predictability:** Context-4 (word) with 97.9% predictability
300
  - **Branching Factor:** Decreases with context size (more deterministic)
301
+ - **Memory Trade-off:** Larger contexts require more storage (352,807 contexts)
302
  - **Recommendation:** Context-3 or Context-4 for text generation
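
For readers who want to reproduce samples like the ones above, a minimal generation loop over the released transition tables might look like the sketch below. The column names (`context`, `next_word`, `count`) are assumptions about the parquet schema, and the two-word seed is ad hoc.

```python
# Hedged sketch: sample from the context-2 word-level Markov chain.
# Column names ("context", "next_word", "count") are assumed, not documented.
import random
import pandas as pd

df = pd.read_parquet("models/word_markov/ary_markov_ctx2_word.parquet")

def sample_next(context):
    rows = df[df["context"] == context]
    if rows.empty:
        return None
    return random.choices(rows["next_word"].tolist(),
                          weights=rows["count"].tolist(), k=1)[0]

words = ["ف", "لمغريب"]                      # seed of length = context size
for _ in range(15):
    nxt = sample_next(" ".join(words[-2:]))  # last two words form the context
    if nxt is None:
        break
    words.append(nxt)
print(" ".join(words))
```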
303
 
304
  ---
 
314
 
315
  | Metric | Value |
316
  |--------|-------|
317
+ | Vocabulary Size | 70,940 |
318
+ | Total Tokens | 1,845,717 |
319
+ | Mean Frequency | 26.02 |
320
  | Median Frequency | 4 |
321
+ | Frequency Std Dev | 518.94 |
322
 
323
  ### Most Common Words
324
 
325
  | Rank | Word | Frequency |
326
  |------|------|-----------|
327
+ | 1 | ف | 80,525 |
328
+ | 2 | د | 57,913 |
329
+ | 3 | و | 57,274 |
330
+ | 4 | ديال | 29,978 |
331
+ | 5 | من | 25,568 |
332
+ | 6 | ل | 23,006 |
333
+ | 7 | على | 17,625 |
334
+ | 8 | لي | 17,540 |
335
+ | 9 | نسبة | 16,376 |
336
+ | 10 | ب | 16,161 |
337
 
338
  ### Least Common Words (from vocabulary)
339
 
340
  | Rank | Word | Frequency |
341
  |------|------|-----------|
342
+ | 1 | تعاونيات | 2 |
343
+ | 2 | خواني | 2 |
344
+ | 3 | والمصطلحات | 2 |
345
+ | 4 | والنقدية | 2 |
346
+ | 5 | شرقًا | 2 |
347
+ | 6 | غربًا | 2 |
348
+ | 7 | المتري | 2 |
349
+ | 8 | بالمدّ | 2 |
350
+ | 9 | والعبارات | 2 |
351
+ | 10 | الكرم | 2 |
352
 
353
  ### Zipf's Law Analysis
354
 
355
  | Metric | Value |
356
  |--------|-------|
357
+ | Zipf Coefficient | 1.0352 |
358
+ | R² (Goodness of Fit) | 0.998696 |
359
  | Adherence Quality | **excellent** |
360
 
361
  ### Coverage Analysis
362
 
363
  | Top N Words | Coverage |
364
  |-------------|----------|
365
+ | Top 100 | 40.4% |
366
+ | Top 1,000 | 64.9% |
367
+ | Top 5,000 | 79.3% |
368
+ | Top 10,000 | 85.4% |
369
 
370
  ### Key Findings
371
 
372
+ - **Zipf Compliance:** R²=0.9987 indicates excellent adherence to Zipf's law
373
+ - **High Frequency Dominance:** Top 100 words cover 40.4% of corpus
374
+ - **Long Tail:** 60,940 words needed for remaining 14.6% coverage
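
The Zipf coefficient and R² above come from a log-log fit of frequency against rank. A hedged sketch of that fit is shown below; it assumes the vocabulary parquet exposes a `frequency` column, which is unverified.

```python
# Hedged sketch: fit log(frequency) ~ log(rank); the slope magnitude is the
# Zipf coefficient. The "frequency" column name is an assumption.
import numpy as np
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/ary_vocabulary.parquet")
freqs = np.sort(vocab["frequency"].to_numpy())[::-1]   # descending frequencies
ranks = np.arange(1, len(freqs) + 1)

slope, intercept = np.polyfit(np.log(ranks), np.log(freqs), 1)
pred = slope * np.log(ranks) + intercept
resid = np.log(freqs) - pred
r2 = 1 - (resid ** 2).sum() / ((np.log(freqs) - np.log(freqs).mean()) ** 2).sum()

print(f"Zipf coefficient ≈ {-slope:.4f}, R² ≈ {r2:.6f}")
```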
375
 
376
  ---
377
  ## 5. Word Embeddings Evaluation
 
384
 
385
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
386
 
 
387
 
388
+ ### 5.1 Cross-Lingual Alignment
389
+
390
+ > *Note: Multilingual alignment visualization not available for this language.*
391
+
392
+
393
+ ### 5.2 Model Comparison
394
+
395
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
396
+ |-------|-----------|----------|------------------|---------------|----------------|
397
+ | **mono_32d** | 32 | 0.8384 🏆 | 0.3320 | N/A | N/A |
398
+ | **mono_64d** | 64 | 0.8149 | 0.2519 | N/A | N/A |
399
+ | **mono_128d** | 128 | 0.6695 | 0.2114 | N/A | N/A |
400
 
401
  ### Key Findings
402
 
403
+ - **Best Isotropy:** mono_32d with 0.8384 (more uniform distribution)
404
+ - **Semantic Density:** Average pairwise similarity of 0.2651. Lower values indicate better semantic separation.
405
+ - **Alignment Quality:** No aligned models evaluated in this run.
406
+ - **Recommendation:** 128d aligned for best cross-lingual performance
407
 
408
  ---
409
+ ## 6. Morphological Analysis (Experimental)
410
+
411
+ > ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
412
+
413
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
414
+
415
+ ### 6.1 Productivity & Complexity
416
+
417
+ | Metric | Value | Interpretation | Recommendation |
418
+ |--------|-------|----------------|----------------|
419
+ | Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
420
+ | Idiomaticity Gap | **-1.000** | Low formulaic content | - |
421
+
422
+ ### 6.2 Affix Inventory (Productive Units)
423
+
424
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
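
A toy version of this substitutability test is sketched below. It assumes the vocabulary parquet has a `word` column (unverified), only handles prefixes, and is an illustration of the idea rather than the pipeline's actual scoring.

```python
# Hedged sketch: a candidate prefix is counted as productive when stripping it
# leaves a stem that is itself an attested vocabulary word.
from collections import Counter
import pandas as pd

words = set(pd.read_parquet("models/vocabulary/ary_vocabulary.parquet")["word"])

def productive_prefixes(vocab, min_len=2, max_len=3, top=10):
    hits = Counter()
    for w in vocab:
        for k in range(min_len, max_len + 1):
            if len(w) > k + 2 and w[k:] in vocab:   # stripped stem must exist
                hits[w[:k]] += 1
    return hits.most_common(top)

print(productive_prefixes(words))
```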
425
+
426
+ #### Productive Prefixes
427
+ | Prefix | Examples |
428
+ |--------|----------|
429
+ | `-ال` | التار, العادات, الواري |
430
+ | `-لم` | لموتقافين, لمحمية, لموتيفات |
431
+ | `-كا` | كايعطيهوم, كايتبناو, كايلمح |
432
+
433
+ #### Productive Suffixes
434
+ | Suffix | Examples |
435
+ |--------|----------|
436
+ | `-ات` | العادات, باللوغات, وزّعات |
437
+ | `-ية` | حيمائية, لافريقية, ليدارية |
438
+ | `-ين` | نّازيين, فالميادين, لموتقافين |
439
+
440
+ ### 6.3 Bound Stems (Lexical Roots)
441
+
442
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
443
+
444
+ | Stem | Cohesion | Substitutability | Examples |
445
+ |------|----------|------------------|----------|
446
+ | `انية` | 1.82x | 63 contexts | تانية, كانية, دانية |
447
+ | `الات` | 1.79x | 57 contexts | تالات, صالات, سالات |
448
+ | `جماع` | 1.93x | 37 contexts | تجماع, إجماع, جماعة |
449
+ | `لمغر` | 2.01x | 28 contexts | لمغرب, لمغربي, دلمغرب |
450
+ | `اللو` | 1.65x | 57 contexts | اللوت, اللوز, اللوح |
451
+ | `النا` | 1.64x | 55 contexts | النار, الناس, الناتو |
452
+ | `دهوم` | 2.21x | 16 contexts | ضدهوم, جهدهوم, بعدهوم |
453
+ | `مغري` | 2.02x | 18 contexts | مغرية, مغريب, مغريبي |
454
+ | `قليم` | 2.06x | 15 contexts | اقليم, فقليم, إقليم |
455
+ | `لجوا` | 1.76x | 24 contexts | لجواب, الجوا, لجوائر |
456
+ | `اميل` | 1.78x | 23 contexts | كاميل, عاميل, ݣاميلة |
457
+ | `إحصا` | 2.08x | 14 contexts | لإحصا, إحصاء, إحصائي |
458
+
459
+ ### 6.4 Affix Compatibility (Co-occurrence)
460
+
461
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
462
+
463
+ | Prefix | Suffix | Frequency | Examples |
464
+ |--------|--------|-----------|----------|
465
+ | `-ال` | `-ية` | 126 words | الكوانتية, الشهية |
466
+ | `-ال` | `-ات` | 123 words | العقوبات, الدبانيات |
467
+ | `-ال` | `-ين` | 70 words | الرينين, الثلاثين |
468
+ | `-لم` | `-ات` | 41 words | لمسراحيات, لمانيفولضات |
469
+ | `-لم` | `-ين` | 37 words | لمعروفين, لموليكيين |
470
+ | `-لم` | `-ية` | 18 words | لماركسية, لمرساوية |
471
+ | `-كا` | `-ين` | 2 words | كاتبيين, كالكيريين |
472
+ | `-كا` | `-ات` | 2 words | كارنيڤورات, كاريكاتورات |
473
+
474
+ ### 6.5 Recursive Morpheme Segmentation
475
+
476
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
477
+
478
+ | Word | Suggested Split | Confidence | Stem |
479
+ |------|-----------------|------------|------|
480
+ | لمعلوماتية | **`لم-علوم-ات-ية`** | 7.5 | `علوم` |
481
+ | الثلاثينات | **`ال-ثلاث-ين-ات`** | 7.5 | `ثلاث` |
482
+ | التأريخية | **`ال-تأريخ-ية`** | 6.0 | `تأريخ` |
483
+ | المهندسين | **`ال-مهندس-ين`** | 6.0 | `مهندس` |
484
+ | التيليفونات | **`ال-تيليفون-ات`** | 6.0 | `تيليفون` |
485
+ | السيشيلية | **`ال-سيشيل-ية`** | 6.0 | `سيشيل` |
486
+ | المجتمعين | **`ال-مجتمع-ين`** | 6.0 | `مجتمع` |
487
+ | التجهيزات | **`ال-تجهيز-ات`** | 6.0 | `تجهيز` |
488
+ | العثمانية | **`ال-عثمان-ية`** | 6.0 | `عثمان` |
489
+ | المعتقدات | **`ال-معتقد-ات`** | 6.0 | `معتقد` |
490
+ | البوليسية | **`ال-بوليس-ية`** | 6.0 | `بوليس` |
491
+ | التشكالات | **`ال-تشكال-ات`** | 6.0 | `تشكال` |
492
+ | المستشارين | **`ال-مستشار-ين`** | 6.0 | `مستشار` |
493
+ | السيركويات | **`ال-سيركوي-ات`** | 6.0 | `سيركوي` |
494
+ | التحضيرية | **`ال-تحضير-ية`** | 6.0 | `تحضير` |
495
+
496
+ ### 6.6 Linguistic Interpretation
497
+
498
+ > **Automated Insight:**
499
+ > The language Moroccan Arabic appears to be more isolating or to have a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating few productive morphological processes.
500
+
501
+ ---
502
+ ## 7. Summary & Recommendations
503
 
504
  ![Performance Dashboard](visualizations/performance_dashboard.png)
505
 
 
507
 
508
  | Component | Recommended | Rationale |
509
  |-----------|-------------|-----------|
510
+ | Tokenizer | **64k BPE** | Best compression (4.18x) |
511
+ | N-gram | **2-gram** | Lowest perplexity (415) |
512
+ | Markov | **Context-4** | Highest predictability (97.9%) |
513
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
514
 
515
+
516
  ---
517
  ## Appendix: Metrics Glossary & Interpretation Guide
518
 
 
702
  author = {Kamali, Omar},
703
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
704
  year = {2025},
705
+ doi = {10.5281/zenodo.18073153},
706
+ publisher = {Zenodo},
707
  url = {https://huggingface.co/wikilangs}
708
  institution = {Omneity Labs}
709
  }
 
719
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
720
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
721
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
722
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
723
  ---
724
  *Generated by Wikilangs Models Pipeline*
725
 
726
+ *Report Date: 2026-01-03 05:20:40*
models/embeddings/monolingual/ary_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d5cc5293cf8429fb5399c588bd22cb8909420d94a23901f2c928f46ae56e690f
3
- size 1063209927
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec0c8ea941d4e589712d1341927459bcf79927b09c35a78a960c9f6d7e10e2d9
3
+ size 1056923528
models/embeddings/monolingual/ary_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 128,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 37528
13
  }
 
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 128
13
  },
14
+ "vocab_size": 31513
15
  }
models/embeddings/monolingual/ary_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2f8b934f3ef0b52a7ad968cdbdcf1f979c0b7a075cbb32fa213a893a5c78d2f
3
- size 266388423
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eeac1e5f452245ffe93038f0b7258b2f1f16a3bbf65b19fe05b1a6db4fae8474
3
+ size 264721544
models/embeddings/monolingual/ary_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 32,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 37528
13
  }
 
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 32
13
  },
14
+ "vocab_size": 31513
15
  }
models/embeddings/monolingual/ary_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83735f6b6acbb5e848400813f89e7a33d7fc8d847ad66d17693f0e0573ec6fd0
3
- size 531995591
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b01a57a7d1b52d182e7b0bb4f504588432f74b447abf3627c5c7314bbffab5f2
3
+ size 528788872
models/embeddings/monolingual/ary_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 64,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 37528
13
  }
 
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 64
13
  },
14
+ "vocab_size": 31513
15
  }
models/subword_markov/ary_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02d6d2bdc8d7dbce6c10868f4220e2d2d66cf1a06f3491e669371f1b65129d05
3
- size 127609
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459fa6b743dbcf395519af9bf18ba67b777ddd0e0cc864ac3bb6a7bf3189d793
3
+ size 133878
models/subword_markov/ary_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "ary",
5
- "unique_contexts": 1931,
6
- "total_transitions": 13216554
7
  }
 
2
  "context_size": 1,
3
  "variant": "subword",
4
  "language": "ary",
5
+ "unique_contexts": 2149,
6
+ "total_transitions": 10496838
7
  }
models/subword_markov/ary_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:322a9496ec76e3be61fdafb53d8b77843c56708cd0dbe0e62dcd5edd91548627
3
- size 835900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:552c831bedb22d7f7b1c00d88c34681f2f3325dd23a3f89d85c02eb0c4ac3281
3
+ size 749221
models/subword_markov/ary_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ary",
5
- "unique_contexts": 16826,
6
- "total_transitions": 13205631
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ary",
5
+ "unique_contexts": 17291,
6
+ "total_transitions": 10486078
7
  }
models/subword_markov/ary_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d5187cefb9bc3bda8ef32e536676e0c3b42f01c2c61bed6abaeb131d627b820
3
- size 3229260
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a0e7a2836b00fdfdf3ecab6fd3ff4f39a94e43eeb24ea7dfa6bbdea5ac77430
3
+ size 2667571
models/subword_markov/ary_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ary",
5
- "unique_contexts": 104928,
6
- "total_transitions": 13194708
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ary",
5
+ "unique_contexts": 86361,
6
+ "total_transitions": 10475318
7
  }
models/subword_markov/ary_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd8baa92bb2fa6351748ab87b35f8ba19922364e0eae3c6cbd772e527604f163
3
- size 10673093
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85296dab212dcfcc7a9ab757176cd9fbd76a93f9401830d56004280e4a7377f8
3
+ size 8465099
models/subword_markov/ary_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ary",
5
- "unique_contexts": 454694,
6
- "total_transitions": 13183785
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ary",
5
+ "unique_contexts": 352807,
6
+ "total_transitions": 10464558
7
  }
models/subword_ngram/ary_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:62a74b4e9a08aa132c61be02cbdead1b65df56d23212d4d13b44eee1de72ca5c
3
- size 85260
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78ea13aba22c9b182dee2ab8bd64573851cedf52939c98249335191a3f23f274
3
+ size 80021
models/subword_ngram/ary_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ary",
5
- "unique_ngrams": 6227,
6
- "total_ngrams": 13216554
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ary",
5
+ "unique_ngrams": 5585,
6
+ "total_ngrams": 10496838
7
  }
models/subword_ngram/ary_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33a4ba5238a6f6fb79055b4420c426248fbd61bc3060e6af9de75c84a51cb747
3
- size 664928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24b08ece7a38c3125e70f5a75f6ead1fd500f694cb05ad5c6248ba7b85fd5634
3
+ size 573468
models/subword_ngram/ary_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ary",
5
- "unique_ngrams": 51256,
6
- "total_ngrams": 13205631
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ary",
5
+ "unique_ngrams": 41944,
6
+ "total_ngrams": 10486078
7
  }
models/subword_ngram/ary_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a9c53578c3475bc96d000d9db9a4d19e545b73cbbe4948d2149968e06900585
3
- size 3262126
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1dd4f7cdfe642d380522ea08b1e34ceadbe57558bcccafcd2c10d5783f15f75
3
+ size 2617929
models/subword_ngram/ary_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ary",
5
- "unique_ngrams": 260844,
6
- "total_ngrams": 13194708
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ary",
5
+ "unique_ngrams": 204568,
6
+ "total_ngrams": 10475318
7
  }
models/tokenizer/ary_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66f08427ec3757387ee07eb1bbb3518ed56da42a7b4a144381b3c9d0e2a75fd2
3
- size 550569
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b2380e695c2c3ff7f9eec16ecc69d1a82452d80f0c0884635d764801141b6e5
3
+ size 559353
models/tokenizer/ary_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ary_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dfbe42ab44ef4ba9c324b96993e42805c7a7def9a21e8af3f8467142eef0b615
3
- size 880065
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:334ec9fb02ebfd6451b7961710b7f94c763275ce5022015434501d439ac06ad4
3
+ size 894340
models/tokenizer/ary_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ary_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46f721a26f1f518a4334f4a8844591eb94957318f9a1d14d80668720663635cc
3
- size 1546284
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:228ac76778adf794a1d8de7ed6648a2bbfd80b7498be5530f090627f915c0436
3
+ size 1593462
models/tokenizer/ary_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/tokenizer/ary_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21bb3111788de7f71f160230b53c90faec9e158982f1fc3936144c7bb95bc0a1
3
- size 391386
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa0d6cbad79ebc5208a11f4bc61982c6584a2f87a08a41d8c2c90d8c0941ea81
3
+ size 396437
models/tokenizer/ary_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See raw diff
 
models/vocabulary/ary_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e9e3400acc6760b029fbf22dea48ff18cbf62eda489e8d001ff473945dde76d
3
- size 1446167
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6a100896fb6a926f24685f0b974ea8144580d8025c46f0451fa0eb2a1cccb88
3
+ size 1278866
models/vocabulary/ary_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "language": "ary",
3
- "vocabulary_size": 81712,
 
4
  "statistics": {
5
- "type_token_ratio": 0.07826179445772023,
6
  "coverage": {
7
- "top_100": 0.3755263295122797,
8
- "top_1000": 0.6095531030989189,
9
- "top_5000": 0.7510592808230233,
10
- "top_10000": 0.8101195151850255
11
  },
12
- "hapax_count": 107389,
13
- "hapax_ratio": 0.5678922903633508,
14
- "total_documents": 10923
15
  }
16
  }
 
1
  {
2
  "language": "ary",
3
+ "vocabulary_size": 70940,
4
+ "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.08383333066622203,
7
  "coverage": {
8
+ "top_100": 0.38467935557772986,
9
+ "top_1000": 0.6186641695622032,
10
+ "top_5000": 0.7558627838344147,
11
+ "top_10000": 0.8141512107566836
12
  },
13
+ "hapax_count": 91460,
14
+ "hapax_ratio": 0.5631773399014778,
15
+ "total_documents": 10760
16
  }
17
  }
models/word_markov/ary_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf369ad231704eef182a92e5d64241ae5294d89bedc2393e46ffa50e68be61c3
3
- size 10974758
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c9d4b1eb5fe667ef9075fa89ad5a4ea4ea33df8dba16dd4243bd25ca92d4926
3
+ size 9279588
models/word_markov/ary_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ary",
5
- "unique_contexts": 189320,
6
- "total_transitions": 3013648
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ary",
5
+ "unique_contexts": 162378,
6
+ "total_transitions": 1926417
7
  }
models/word_markov/ary_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6feb87132c6a3a8881dab7c3af6268a3da4e594d21311b665e71b2e95a0af000
3
- size 28263486
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29a939f97ef93392311582b9ad3f3b729f547581513acb58fd36113bf5b295e2
3
+ size 23360695
models/word_markov/ary_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ary",
5
- "unique_contexts": 1014676,
6
- "total_transitions": 3002725
7
  }
 
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ary",
5
+ "unique_contexts": 849251,
6
+ "total_transitions": 1915657
7
  }
models/word_markov/ary_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:354a3b1b04e0bdb94e0c7a15c339df347f9c313c580fe88c039f721a8a1e4237
3
- size 41576319
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fc3870d90e96a6d0f094fa6751c6f78ed9b0e520e72f7882513945cab8f1373
3
+ size 32010448
models/word_markov/ary_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "ary",
5
- "unique_contexts": 1701309,
6
- "total_transitions": 2991803
7
  }
 
2
  "context_size": 3,
3
  "variant": "word",
4
  "language": "ary",
5
+ "unique_contexts": 1262316,
6
+ "total_transitions": 1904897
7
  }
models/word_markov/ary_markov_ctx4_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cc0395cfc1cc580368483ee202676a9d7b57cdbdf737c5c7422dfd39ecdf963
3
- size 50573861
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:526783a2fed42680ed61c32c03a44df01d06d2396568770540907bffacc2d4b5
3
+ size 38343134
models/word_markov/ary_markov_ctx4_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "word",
4
  "language": "ary",
5
- "unique_contexts": 2000181,
6
- "total_transitions": 2980882
7
  }
 
2
  "context_size": 4,
3
  "variant": "word",
4
  "language": "ary",
5
+ "unique_contexts": 1391141,
6
+ "total_transitions": 1894137
7
  }
models/word_ngram/ary_2gram_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60dc5550877fc074d5df5a925b63f1f3975f796566e9a5a5de18357cd8485faa
3
- size 1017637
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfeda04510e2c553b8ed89b81167eb362bcaad06b1cd9106f259c8829124e7bd
3
+ size 673580
models/word_ngram/ary_2gram_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "word",
4
  "language": "ary",
5
- "unique_ngrams": 56749,
6
- "total_ngrams": 3013648
7
  }
 
2
  "n": 2,
3
  "variant": "word",
4
  "language": "ary",
5
+ "unique_ngrams": 35218,
6
+ "total_ngrams": 1926417
7
  }
models/word_ngram/ary_3gram_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:450d7e949b8c457d9c72e4207cc5a90c1e77b46a5e5091df8c971c29082d1978
3
- size 1493910
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42e065ff96cac0f99796887363b07f7ed4c89751e6ffab760b5684e38bd7c11c
3
+ size 851553
models/word_ngram/ary_3gram_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "word",
4
  "language": "ary",
5
- "unique_ngrams": 76888,
6
- "total_ngrams": 3002725
7
  }
 
2
  "n": 3,
3
  "variant": "word",
4
  "language": "ary",
5
+ "unique_ngrams": 39702,
6
+ "total_ngrams": 1915657
7
  }
models/word_ngram/ary_4gram_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8ee2009d481c48c5635cbb99107a721e89fb3733f2698b8960ac7a70030bbabc
3
- size 2616301
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bad83de9507de5b6f2183bc341880b4b18a95728eb2e74422e18e4dc7410f5f
3
+ size 1506246
models/word_ngram/ary_4gram_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "word",
4
  "language": "ary",
5
- "unique_ngrams": 124859,
6
- "total_ngrams": 2991803
7
  }
 
2
  "n": 4,
3
  "variant": "word",
4
  "language": "ary",
5
+ "unique_ngrams": 63706,
6
+ "total_ngrams": 1904897
7
  }
visualizations/embedding_isotropy.png CHANGED
visualizations/embedding_norms.png CHANGED
visualizations/embedding_similarity.png CHANGED

Git LFS Details

  • SHA256: 9064dad352d5ebaf66faab7af9f30561273b0bea4b37f5bd447d52bca13f3de7
  • Pointer size: 131 Bytes
  • Size of remote file: 146 kB

Git LFS Details

  • SHA256: ae893f8efdae6e185be6538b4fb1f9796b0c82044d0e0de25823a01b38a4fd09
  • Pointer size: 131 Bytes
  • Size of remote file: 142 kB
visualizations/markov_branching.png CHANGED