omarkamali committed on Jan 7

Commit

38e662b

verified ·

1 Parent(s): 02fa115

Upload all models and assets for ar (latest)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
README.md +343 -142
models/embeddings/aligned/ar_128d.bin +3 -0
models/embeddings/aligned/ar_128d.meta.json +1 -0
models/embeddings/aligned/ar_128d.projection.npy +3 -0
models/embeddings/aligned/ar_128d_metadata.json +8 -0
models/embeddings/aligned/ar_32d.bin +3 -0
models/embeddings/aligned/ar_32d.meta.json +1 -0
models/embeddings/aligned/ar_32d.projection.npy +3 -0
models/embeddings/aligned/ar_32d_metadata.json +8 -0
models/embeddings/aligned/ar_64d.bin +3 -0
models/embeddings/aligned/ar_64d.meta.json +1 -0
models/embeddings/aligned/ar_64d.projection.npy +3 -0
models/embeddings/aligned/ar_64d_metadata.json +8 -0
models/embeddings/monolingual/ar_128d.bin +2 -2
models/embeddings/monolingual/ar_128d_metadata.json +5 -3
models/embeddings/monolingual/ar_32d.bin +2 -2
models/embeddings/monolingual/ar_32d_metadata.json +5 -3
models/embeddings/monolingual/ar_64d.bin +2 -2
models/embeddings/monolingual/ar_64d_metadata.json +5 -3
models/subword_markov/ar_markov_ctx1_subword.parquet +2 -2
models/subword_markov/ar_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/ar_markov_ctx2_subword.parquet +2 -2
models/subword_markov/ar_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/ar_markov_ctx3_subword.parquet +2 -2
models/subword_markov/ar_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/ar_markov_ctx4_subword.parquet +2 -2
models/subword_markov/ar_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/ar_2gram_subword.parquet +2 -2
models/subword_ngram/ar_2gram_subword_metadata.json +2 -2
models/subword_ngram/ar_3gram_subword.parquet +2 -2
models/subword_ngram/ar_3gram_subword_metadata.json +2 -2
models/subword_ngram/ar_4gram_subword.parquet +2 -2
models/subword_ngram/ar_4gram_subword_metadata.json +2 -2
models/subword_ngram/ar_5gram_subword.parquet +3 -0
models/subword_ngram/ar_5gram_subword_metadata.json +7 -0
models/tokenizer/ar_tokenizer_16k.model +2 -2
models/tokenizer/ar_tokenizer_16k.vocab +0 -0
models/tokenizer/ar_tokenizer_32k.model +2 -2
models/tokenizer/ar_tokenizer_32k.vocab +0 -0
models/tokenizer/ar_tokenizer_64k.model +2 -2
models/tokenizer/ar_tokenizer_64k.vocab +0 -0
models/tokenizer/ar_tokenizer_8k.model +2 -2
models/tokenizer/ar_tokenizer_8k.vocab +0 -0
models/vocabulary/ar_vocabulary.parquet +2 -2
models/vocabulary/ar_vocabulary_metadata.json +10 -9
models/vocabulary/ar_vocabulary_top.parquet +3 -0
models/vocabulary/ar_vocabulary_top_metadata.json +20 -0
models/word_markov/ar_markov_ctx1_word.parquet +2 -2
models/word_markov/ar_markov_ctx1_word_metadata.json +2 -2

.gitattributes CHANGED Viewed

@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text

 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -10,11 +10,21 @@ tags:
   - n-gram
   - markov
   - wikipedia
   - monolingual
   - family-arabic
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,14 +33,14 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 4.103
   - name: best_isotropy
     type: isotropy
-    value: 0.7155
   - name: vocabulary_size
     type: vocab
-    value: 1000000
-generated: 2025-12-27
 ---
 # Arabic - Wikilangs Models
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -68,58 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.156x | 3.13 | 0.0848% | 5,982,398 |
-| **16k** | 3.513x | 3.49 | 0.0944% | 5,374,291 |
-| **32k** | 3.837x | 3.81 | 0.1031% | 4,920,728 |
-| **64k** | 4.103x 🏆 | 4.07 | 0.1103% | 4,602,368 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `تحويل ميلفورد (كونيتيكت)`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁تحويل ▁ميل فورد ▁( ك وني تي كت )` | 9 |
-| 16k | `▁تحويل ▁ميل فورد ▁( ك وني تي كت )` | 9 |
-| 32k | `▁تحويل ▁ميل فورد ▁( كوني تيكت )` | 7 |
-| 64k | `▁تحويل ▁ميل فورد ▁( كونيتيكت )` | 6 |
-**Sample 2:** `قد يقصد من «الفرفار» :
- الفرفار (إدا وكماض) : دوار تابع لجماعة إدا وڭماض في إقل...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁قد ▁يق صد ▁من ▁« الف رف ار » ▁: ... (+43 more)` | 53 |
-| 16k | `▁قد ▁يقصد ▁من ▁« الف رف ار » ▁: ▁الف ... (+37 more)` | 47 |
-| 32k | `▁قد ▁يقصد ▁من ▁« الف رف ار » ▁: ▁الف ... (+36 more)` | 46 |
-| 64k | `▁قد ▁يقصد ▁من ▁« الف رف ار » ▁: ▁الف ... (+34 more)` | 44 |
-**Sample 3:** `المراجع
-تصنيف:أنهار إفريقية دولية
-تصنيف:أنهار بوروندي
-تصنيف:أنهار تنزانيا
-تصني...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁المراجع ▁تصنيف : أن هار ▁إ فريقية ▁دولية ▁تصنيف : ... (+16 more)` | 26 |
-| 16k | `▁المراجع ▁تصنيف : أنهار ▁إفريقية ▁دولية ▁تصنيف : أنهار ▁بور ... (+12 more)` | 22 |
-| 32k | `▁المراجع ▁تصنيف : أنهار ▁إفريقية ▁دولية ▁تصنيف : أنهار ▁بور ... (+9 more)` | 19 |
-| 64k | `▁المراجع ▁تصنيف : أنهار ▁إفريقية ▁دولية ▁تصنيف : أنهار ▁بوروندي ... (+8 more)` | 18 |
 ### Key Findings
-- **Best Compression:** 64k achieves 4.103x compression
-- **Lowest UNK Rate:** 8k with 0.0848% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -128,57 +139,111 @@ Below are sample sentences tokenized with each vocabulary size:
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 224,018 🏆 | 17.77 | 6,245,473 | 10.5% | 22.3% |
-| **2-gram** | 514 🏆 | 9.00 | 52,884 | 52.6% | 94.6% |
-| **3-gram** | 831,530 | 19.67 | 14,344,223 | 6.6% | 16.3% |
-| **3-gram** | 4,885 | 12.25 | 487,957 | 23.0% | 53.9% |
-| **4-gram** | 1,784,666 | 20.77 | 25,822,600 | 4.6% | 13.6% |
-| **4-gram** | 29,916 | 14.87 | 3,376,435 | 13.3% | 31.6% |
 ### Top 5 N-grams by Size
-**2-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `تصنيف :` | 9,397,729 |
-| 2 | `ً ا` | 2,647,403 |
-| 3 | `: لاعبو` | 1,539,560 |
-| 4 | `| |` | 1,324,145 |
-| 5 | `كرة قدم` | 758,315 |
-**3-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `تصنيف : لاعبو` | 1,539,552 |
-| 2 | `تصنيف : مواليد` | 617,808 |
-| 3 | `: لاعبو كرة` | 498,223 |
-| 4 | `| | |` | 459,400 |
-| 5 | `تصنيف : أشخاص` | 441,938 |
-**4-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `تصنيف : لاعبو كرة` | 498,220 |
-| 2 | `: لاعبو كرة قدم` | 381,016 |
-| 3 | `القرن 20 تصنيف :` | 278,900 |
-| 4 | `في القرن 20 تصنيف` | 266,135 |
-| 5 | `| | | |` | 255,908 |
 ### Key Findings
-- **Best Perplexity:** 2-gram with 514
 - **Entropy Trend:** Increases with larger n-grams (e.g. 9.00 → 14.87 bits from 2-gram to 4-gram), as longer n-grams spread over more distinct patterns
-- **Coverage:** Top-1000 patterns cover ~32% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -186,55 +251,86 @@ Below are sample sentences tokenized with each vocabulary size:
 ![Markov Entropy](visualizations/markov_entropy.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.7411 | 1.671 | 12.81 | 5,367,543 | 25.9% |
-| **1** | 1.8418 | 3.585 | 17.44 | 13,038 | 0.0% |
-| **2** | 0.4074 | 1.326 | 2.70 | 68,744,585 | 59.3% |
-| **2** | 0.7015 | 1.626 | 5.08 | 227,339 | 29.9% |
-| **3** | 0.1748 | 1.129 | 1.44 | 185,531,332 | 82.5% |
-| **3** | 0.8426 | 1.793 | 5.22 | 1,153,787 | 15.7% |
-| **4** | 0.0757 🏆 | 1.054 | 1.16 | 267,713,644 | 92.4% |
-| **4** | 0.7645 🏆 | 1.699 | 3.88 | 6,025,353 | 23.6% |
-### Generated Text Samples
-Below are text samples generated from each Markov chain model:
 **Context Size 1:**
-1. `. اكت ُ ص م َ ّ ة . أخته ، وافتتاح مشروع مرصد أونديجوف |`
-2. `في المدار في جمهورية ألمانيا تصنيف : خلافات في . أنظر : تشغيل الحواسيب . 1`
-3. `، واستحوذت أيضا التحريفية للبلغاريين والأجانب أو تمليح اللحوم والأتواب والملابس والبطانيات الى القاه...`
 **Context Size 2:**
-1. `تصنيف : كتاب ومؤلفو قصص مصورة تصنيف : فائزون بميداليات برونزية في ألعاب الكومنولث في إنجلترا تصنيف`
-2. `ً ا للاغتسال . وقال القرطبي في تفسيره على أنه آمن خلال الرضاعة الطبيعية يسبب زيادة الكوليسترول`
-3. `: لاعبو كرة قدم صرب مغتربون في روسيا تصنيف : أفلام دراما باللغة الإنجليزية تصنيف : سائقو`
 **Context Size 3:**
-1. `تصنيف : لاعبو بوتكيت ريد سوكس تصنيف : مواليد 1955 تصنيف : مؤيدون لتنظيم ملكية الأسلحة تصنيف :`
-2. `تصنيف : مواليد 1986 تصنيف : لاعبو وسط كرة قدم رجالية تصنيف : مواليد 1390 هـ تصنيف :`
-3. `: لاعبو كرة قدم مغاربة تصنيف : عداؤو مسافات متوسطة نيوزيلنديون تصنيف : مواليد 1981 تصنيف : مواليد`
 **Context Size 4:**
-1. `تصنيف : لاعبو كرة قدم مغتربون في المجر تصنيف : لاعبو كرة اليد في الألعاب الأولمبية الصيفية 1956 تصني...`
-2. `: لاعبو كرة قدم مغتربون في إنجلترا تصنيف : لاعبو كرة قدم مغتربون في إيطاليا تصنيف : أماكن مأهولة`
-3. `القرن 20 تصنيف : كاتبات أمريكيات في القرن 20 تصنيف : شعراء بالعربية في القرن 21 تصنيف : لاعبو`
 ### Key Findings
-- **Best Predictability:** Context-4 with 92.4% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (6,025,353 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -250,64 +346,64 @@ Below are text samples generated from each Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 1,000,000 |
-| Total Tokens | 366,842,150 |
-| Mean Frequency | 366.84 |
-| Median Frequency | 12 |
-| Frequency Std Dev | 20900.79 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | في | 14,346,570 |
-| 2 | تصنيف | 9,437,038 |
-| 3 | من | 8,350,052 |
-| 4 | على | 3,295,037 |
-| 5 | ا | 2,755,855 |
-| 6 | إلى | 2,451,934 |
-| 7 | عام | 1,684,151 |
-| 8 | لاعبو | 1,540,822 |
-| 9 | أن | 1,441,897 |
-| 10 | مع | 1,171,753 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | твоим | 4 |
-| 2 | своему | 4 |
-| 3 | вашей | 4 |
-| 4 | нашу | 4 |
-| 5 | кого | 4 |
-| 6 | чьей | 4 |
-| 7 | работать | 4 |
-| 8 | говорит | 4 |
-| 9 | говорят | 4 |
-| 10 | идёт | 4 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 0.9655 |
-| R² (Goodness of Fit) | 0.990109 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 24.9% |
-| Top 1,000 | 48.1% |
-| Top 5,000 | 68.6% |
-| Top 10,000 | 76.6% |
 ### Key Findings
-- **Zipf Compliance:** R²=0.9901 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 24.9% of corpus
-- **Long Tail:** 990,000 words needed for remaining 23.4% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -320,24 +416,126 @@ Below are text samples generated from each Markov chain model:
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
-### Model Comparison
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 1,505,991 | 32 | 3.562 | 1.491 | 0.7155 🏆 |
-| **mono_64d** | 1,505,991 | 64 | 3.899 | 1.405 | 0.7134 |
-| **mono_128d** | 1,505,991 | 128 | 4.337 | 1.358 | 0.6849 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 ### Key Findings
-- **Best Isotropy:** mono_32d with 0.7155 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 1,505,991 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
 ---
-## 6. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
@@ -345,11 +543,12 @@ Below are text samples generated from each Markov chain model:
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (4.10x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (514) |
-| Markov | **Context-4** | Highest predictability (92.4%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
@@ -539,7 +738,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
@@ -555,7 +755,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2025-12-27 16:32:09*

   - n-gram
   - markov
   - wikipedia
+  - feature-extraction
+  - sentence-similarity
+  - tokenization
+  - n-grams
+  - markov-chain
+  - text-mining
+  - fasttext
+  - babelvec
+  - vocabulous
+  - vocabulary
   - monolingual
   - family-arabic
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 4.347
   - name: best_isotropy
     type: isotropy
+    value: 0.7394
   - name: vocabulary_size
     type: vocab
+    value: 1950572
+generated: 2026-01-07
 ---
 # Arabic - Wikilangs Models
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 3.252x | 3.25 | 0.0704% | 5,499,500 |
+| **16k** | 3.655x | 3.65 | 0.0791% | 4,893,689 |
+| **32k** | 4.034x | 4.03 | 0.0873% | 4,433,903 |
+| **64k** | 4.347x 🏆 | 4.35 | 0.0941% | 4,114,555 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `بيغجة خاتون هي قرية في مقاطعة شبستر، إيران. يقدر عدد سكانها بـ 635 نسمة بحسب إحص...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁بي غ جة ▁خ ات ون ▁هي ▁قرية ▁في ▁مقاطعة ... (+26 more)` | 36 |
+| 16k | `▁بي غ جة ▁خ ات ون ▁هي ▁قرية ▁في ▁مقاطعة ... (+23 more)` | 33 |
+| 32k | `▁بيغ جة ▁خاتون ▁هي ▁قرية ▁في ▁مقاطعة ▁شب ستر ، ... (+20 more)` | 30 |
+| 64k | `▁بيغ جة ▁خاتون ▁هي ▁قرية ▁في ▁مقاطعة ▁شب ستر ، ... (+20 more)` | 30 |
+**Sample 2:** `IL18BP (Interleukin 18 binding protein) هوَ بروتين يُشَفر بواسطة جين IL18BP في ا...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁ il 1 8 b p ▁( in ter le ... (+51 more)` | 61 |
+| 16k | `▁il 1 8 b p ▁( in ter le uk ... (+44 more)` | 54 |
+| 32k | `▁il 1 8 b p ▁( inter le uk in ... (+39 more)` | 49 |
+| 64k | `▁il 1 8 b p ▁( inter le uk in ... (+36 more)` | 46 |
+**Sample 3:** `هي مقاطعة في ولاية قشقداريا في أوزبكستان، ومركزها مدينة شهرسبز. المصادر مأهولة ف...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁ق ش قد اريا ▁في ▁أوزب ... (+18 more)` | 28 |
+| 16k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁ق ش قد اريا ▁في ▁أوزبكستان ... (+16 more)` | 26 |
+| 32k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁قش قد اريا ▁في ▁أوزبكستان ، ... (+13 more)` | 23 |
+| 64k | `▁هي ▁مقاطعة ▁في ▁ولاية ▁قش قد اريا ▁في ▁أوزبكستان ، ... (+13 more)` | 23 |
 ### Key Findings
+- **Best Compression:** 64k achieves 4.347x compression
+- **Lowest UNK Rate:** 8k with 0.0704% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
+![N-gram Unique](visualizations/ngram_unique.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 452,226 | 18.79 | 5,760,373 | 5.7% | 16.3% |
+| **2-gram** | Subword | 436 🏆 | 8.77 | 70,700 | 55.9% | 96.1% |
+| **3-gram** | Word | 1,074,568 | 20.04 | 10,101,258 | 4.3% | 14.7% |
+| **3-gram** | Subword | 4,203 | 12.04 | 528,264 | 23.7% | 56.2% |
+| **4-gram** | Word | 1,869,871 | 20.83 | 16,693,684 | 3.8% | 14.3% |
+| **4-gram** | Subword | 26,613 | 14.70 | 2,851,427 | 13.2% | 31.9% |
+| **5-gram** | Word | 1,422,629 | 20.44 | 12,591,346 | 4.2% | 15.4% |
+| **5-gram** | Subword | 126,300 | 16.95 | 9,618,770 | 6.2% | 19.5% |
 ### Top 5 N-grams by Size
+**2-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `كرة قدم` | 754,062 |
+| 2 | `في القرن` | 693,987 |
+| 3 | `في عام` | 580,274 |
+| 4 | `الولايات المتحدة` | 468,192 |
+| 5 | `وصلات خارجية` | 357,388 |
+**3-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `في القرن 20` | 274,915 |
+| 2 | `مراجع وصلات خارجية` | 255,117 |
+| 3 | `في الولايات المتحدة` | 245,241 |
+| 4 | `في القرن 21` | 238,844 |
+| 5 | `أمريكيون في القرن` | 166,269 |
+**4-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `كرة قدم مغتربون في` | 94,639 |
+| 2 | `تحت سن الثامنة عشر` | 93,897 |
+| 3 | `هو لاعب كرة قدم` | 93,478 |
+| 4 | `أمريكيون في القرن 20` | 87,276 |
+| 5 | `في الألعاب الأولمبية الصيفية` | 66,167 |
+**5-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `تعداد عام بلغ عدد سكان` | 38,914 |
+| 2 | `بحسب تعداد عام وبلغ عدد` | 38,787 |
+| 3 | `تعداد عام وبلغ عدد الأسر` | 38,786 |
+| 4 | `نسمة بحسب تعداد عام وبلغ` | 38,783 |
+| 5 | `في الفئة العمرية ما بين` | 38,744 |
+**2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `ا ل` | 88,022,277 |
+| 2 | `_ ا` | 75,496,816 |
+| 3 | `ة _` | 45,404,729 |
+| 4 | `ي _` | 32,155,198 |
+| 5 | `ن _` | 31,357,117 |
+**3-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ ا ل` | 71,328,243 |
+| 2 | `_ ف ي` | 15,404,541 |
+| 3 | `ف ي _` | 15,103,296 |
+| 4 | `ي ة _` | 14,752,185 |
+| 5 | `ا ل م` | 13,544,149 |
+**4-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ ف ي _` | 14,189,454 |
+| 2 | `ة _ ا ل` | 12,269,528 |
+| 3 | `_ ا ل م` | 11,772,138 |
+| 4 | `_ م ن _` | 8,237,350 |
+| 5 | `ي _ ا ل` | 7,703,248 |
+**5-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `ف ي _ ا ل` | 4,810,645 |
+| 2 | `_ ف ي _ ا` | 4,774,417 |
+| 3 | `ا ت _ ا ل` | 3,857,996 |
+| 4 | `ي ة _ ا ل` | 3,696,976 |
+| 5 | `_ ع ل ى _` | 3,259,756 |
 ### Key Findings
+- **Best Perplexity:** 2-gram (subword) with 436
 - **Entropy Trend:** Increases with larger n-grams for subword models (8.77 → 16.95 bits from 2-gram to 5-gram), as longer n-grams spread over more distinct patterns
+- **Coverage:** Top-1000 subword 5-gram patterns cover ~19% of corpus (19.5%); coverage falls as n grows
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 ![Markov Entropy](visualizations/markov_entropy.png)
+![Markov Contexts](visualizations/markov_contexts.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.9908 | 1.987 | 17.58 | 4,471,621 | 0.9% |
+| **1** | Subword | 1.3702 | 2.585 | 13.33 | 18,570 | 0.0% |
+| **2** | Word | 0.3659 | 1.289 | 2.31 | 78,540,786 | 63.4% |
+| **2** | Subword | 0.7295 | 1.658 | 5.21 | 247,596 | 27.1% |
+| **3** | Word | 0.1310 | 1.095 | 1.29 | 181,002,468 | 86.9% |
+| **3** | Subword | 0.6782 | 1.600 | 4.14 | 1,290,623 | 32.2% |
+| **4** | Word | 0.0499 🏆 | 1.035 | 1.09 | 233,679,791 | 95.0% |
+| **4** | Subword | 0.6490 | 1.568 | 3.51 | 5,343,485 | 35.1% |
+### Generated Text Samples (Word-based)
+Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
+1. `في المدائن وهي منتزه نيقولا الصايغ أميناً عاماً ونسبة 22 مايو حين سجلت في مجال تعليم`
+2. `من مونتريال اسمه إلى الساحل في الإصدار الرابع قبل الرابطة مع نادي ثون نادي سيون ببطولة`
+3. `على الصيد فلا يطالب بتنفيذها أو وجود منافسة ألعاب البحر في حين احتفظت بهويتها الجديدة بقيمة`
 **Context Size 2:**
+1. `كرة قدم من قصرش مقاطعة إسبان من كتالونيا إسبانيات في القرن 20 استمر التعليم التطوري أو التنموي`
+2. `في القرن 11 في وقتٍ واحد غابرييلا قرنفل وقرفة ترجمة عوض أحمد بن عبد الله الأميرة منيرة`
+3. `في عام أن تكلفة الوجبة البسيطة في نسج الظهارية ثخانة الجلد وتصلبه المترافقين مع المشكلات التي تنشأ`
 **Context Size 3:**
+1. `في القرن 20 أمريكيون أفارقة في القرن 21 كرة قدم رجالية أحياء دوري الدرجة الأولى الأرجنتيني فيليز سار...`
+2. `مراجع وصلات خارجية كرة قدم رجالية مغتربون في روسيا على أنها قوة بحرية صغيرة إلى مدينة تشهد حركة`
+3. `في الولايات المتحدة مراجع وصلات خارجية تلفزيونية مصرية بدأ عرضها في كوميديا سوداء تلفزيونية بريطانية...`
 **Context Size 4:**
+1. `كرة قدم مغتربون في السلفادور كرة قدم هندوراسيون كرة قدم هندوراسيون مغتربون كوبا سينتروأمريكانا منتخب...`
+2. `تحت سن الثامنة عشر تعيش معهم وبلغت نسبة الأزواج القاطنين مع بعضهم البعض 46 3 من أصل المجموع الكلي`
+3. `هو لاعب كرة قدم بريطاني في مركز لعب مع برادفورد سيتي وريث روفرز ونادي بارتيك ثيسل ونادي رينجرز ونادي`
+### Generated Text Samples (Subword-based)
+Below are text samples generated from each subword-based Markov chain model:
+**Context Size 1:**
+1. `_فيا،_دارب_ي_أمر`
+2. `اقصالمعب_ع_حمالم`
+3. `لبطة_قالمندواب_ا`
+**Context Size 2:**
+1. `الأخرها_تشت_علية_`
+2. `_الممثل_أصدققه_حا`
+3. `ة_لدعار_الة)_جوزي`
+**Context Size 3:**
+1. `_الذين_حليلار_رُزِق_`
+2. `_في_إحصاءات_الله)،`
+3. `في_الوالصحيحًا_كرة_`
+**Context Size 4:**
+1. `_في_جمهور._جسدت_ديك`
+2. `ة_البلدي_في_اخترعه_`
+3. `_المتحدة._يقدمه_في_`
 ### Key Findings
+- **Best Predictability:** Context-4 (word) with 95.0% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (at context size 4: 233,679,791 word contexts and 5,343,485 subword contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 1,950,572 |
+| Total Tokens | 322,254,287 |
+| Mean Frequency | 165.21 |
+| Median Frequency | 4 |
+| Frequency Std Dev | 12979.56 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | في | 14,286,084 |
+| 2 | من | 8,287,878 |
+| 3 | على | 3,284,746 |
+| 4 | إلى | 2,443,493 |
+| 5 | عام | 1,621,280 |
+| 6 | أن | 1,387,527 |
+| 7 | مع | 1,153,439 |
+| 8 | عن | 1,144,208 |
+| 9 | أو | 1,098,905 |
+| 10 | التي | 1,084,821 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | dekréty | 2 |
+| 2 | تادينا | 2 |
+| 3 | بوكسوري | 2 |
+| 4 | نموذجاالأدب | 2 |
+| 5 | كنونالأدب | 2 |
+| 6 | وليتاز | 2 |
+| 7 | حكمٌّ | 2 |
+| 8 | أسديراكي | 2 |
+| 9 | إنتركوليجيت | 2 |
+| 10 | للفيزيولوجية | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 0.9488 |
+| R² (Goodness of Fit) | 0.991144 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 23.1% |
+| Top 1,000 | 45.9% |
+| Top 5,000 | 66.1% |
+| Top 10,000 | 74.2% |
 ### Key Findings
+- **Zipf Compliance:** R²=0.9911 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 23.1% of corpus
+- **Long Tail:** 1,940,572 words needed for remaining 25.8% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
+### 5.1 Cross-Lingual Alignment
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
+### 5.2 Model Comparison
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.7379 | 0.3519 | N/A | N/A |
+| **mono_64d** | 64 | 0.7394 🏆 | 0.2816 | N/A | N/A |
+| **mono_128d** | 128 | 0.7002 | 0.2259 | N/A | N/A |
+| **aligned_32d** | 32 | 0.7379 | 0.3528 | 0.2700 | 0.6440 |
+| **aligned_64d** | 64 | 0.7394 | 0.2881 | 0.4140 | 0.8200 |
+| **aligned_128d** | 128 | 0.7002 | 0.2283 | 0.6000 | 0.8940 |
 ### Key Findings
+- **Best Isotropy:** mono_64d with 0.7394 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.2881. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 60.0% R@1 in cross-lingual retrieval.
+- **Recommendation:** 128d aligned for best cross-lingual performance
 ---
+## 6.  Morphological Analysis (Experimental)
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+### 6.1 Productivity & Complexity
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **-0.210** | Low formulaic content | - |
+### 6.2 Affix Inventory (Productive Units)
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+| `-ال` | الألمانينصف, الاعتياديّ, الباكترية |
+| `-وا` | والشجرية, والكاحِل, والميلانين |
+| `-وال` | والشجرية, والكاحِل, والميلانين |
+| `-الم` | المُحاضرة, المورينو, الممنوعة |
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-ين` | ضوئيتين, بقلبين, نحوين |
+| `-ات` | وخصوصيات, نانديات, دويركات |
+| `-ية` | والشجرية, الباكترية, الّدودية |
+| `-ها` | هاماريتيها, اختها, أُصولها |
+### 6.3 Bound Stems (Lexical Roots)
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `تخدا` | 2.86x | 173 contexts | متخدا, كتخدا, متخداً |
+| `ستخد` | 2.18x | 623 contexts | مستخد, استخد, تستخد |
+| `ألعا` | 2.68x | 82 contexts | ألعاد, ألعاب, ألعالم |
+| `والع` | 1.74x | 629 contexts | والعز, والعي, والعى |
+| `اطعة` | 3.13x | 28 contexts | قاطعة, ساطعة, ساطعةً |
+| `التع` | 1.63x | 578 contexts | التعة, التعس, التعب |
+| `رنسي` | 1.82x | 179 contexts | درنسي, رنسيس, فرنسي |
+| `استخ` | 1.79x | 192 contexts | استخم, استخد, استخر |
+| `ريطا` | 2.08x | 85 contexts | غريطا, شريطا, وشريطا |
+| `لمنا` | 1.37x | 729 contexts | تلمنا, ظلمنا, ألمنا |
+| `غترب` | 2.44x | 39 contexts | اغترب, مغترب, يغترب |
+| `الحا` | 1.34x | 693 contexts | الحاء, مالحا, الحاص |
+### 6.4 Affix Compatibility (Co-occurrence)
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `-ال` | `-ية` | 95 words | الائتمانية, الويبرية |
+| `-ال` | `-ات` | 76 words | الهباءات, الكوميديات |
+| `-ال` | `-ين` | 68 words | البحـرين, المتوارثين |
+| `-وا` | `-ية` | 35 words | والعضدية, والهانرية |
+| `-وا` | `-ات` | 24 words | والمطرزات, والسلوريات |
+| `-وا` | `-ين` | 17 words | والمُغنين, والميكرونيزيين |
+| `-وا` | `-ها` | 4 words | واعترضتها, واستبعدتها |
+### 6.5 Recursive Morpheme Segmentation
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| البروتينين | **`ال-بروت-ين-ين`** | 7.5 | `بروت` |
+| والكاظمية | **`وال-كاظم-ية`** | 6.0 | `كاظم` |
+| والسرورية | **`وال-سرور-ية`** | 6.0 | `سرور` |
+| الغيلوغية | **`ال-غيلوغ-ية`** | 6.0 | `غيلوغ` |
+| والحطابين | **`وال-حطاب-ين`** | 6.0 | `حطاب` |
+| والمقدسيين | **`وال-مقدسي-ين`** | 6.0 | `مقدسي` |
+| والنجومية | **`وال-نجوم-ية`** | 6.0 | `نجوم` |
+| والرباعيات | **`وال-رباعي-ات`** | 6.0 | `رباعي` |
+| الكلابشات | **`ال-كلابش-ات`** | 6.0 | `كلابش` |
+| السبعينات | **`ال-سبعين-ات`** | 6.0 | `سبعين` |
+| لاحتجاجاتها | **`لاحتجاج-ات-ها`** | 6.0 | `لاحتجاج` |
+| والمكسّرات | **`وال-مكسّر-ات`** | 6.0 | `مكسّر` |
+| والسكيريين | **`وال-سكيري-ين`** | 6.0 | `سكيري` |
+| إسقاطاتها | **`إسقاط-ات-ها`** | 6.0 | `إسقاط` |
+| واستثمارها | **`وا-ستثمار-ها`** | 6.0 | `ستثمار` |
+### 6.6 Linguistic Interpretation
+> **Automated Insight:**
+Arabic shows high morphological productivity. The subword models are significantly more efficient than the word models, suggesting a rich system of affixation or compounding.
+---
+## 7. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
+| Tokenizer | **64k BPE** | Best compression (4.35x) |
+| N-gram | **2-gram** | Lowest perplexity (436) |
+| Markov | **Context-4** | Highest predictability (95.0%) |
 | Embeddings | **128d aligned** | Best cross-lingual alignment (60.0% R@1) |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-07 13:14:53*

models/embeddings/aligned/ar_128d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef219238cabd8a9ea12da66dbcd80332fbd38875433dc953df85a37a71b899aa
+size 2486763168

models/embeddings/aligned/ar_128d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "ar", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ar_128d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2d61b78158be557df612ca2ef8343029b39babaae6f038fcd428534279135917
+size 65664

models/embeddings/aligned/ar_128d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "ar",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 200166,
+  "vocab_size": 1398324
+}

models/embeddings/aligned/ar_32d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70e78275bfd0f11c47d9f144be703b7d9277b2e2698a9f3c400035b92f124640
+size 644850336

models/embeddings/aligned/ar_32d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "ar", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ar_32d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:baedb1283a79af0fe218dcef59dcfa703ea3550812bcf17a0edcc064c20d6b39
+size 4224

models/embeddings/aligned/ar_32d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "ar",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 200166,
+  "vocab_size": 1398324
+}

models/embeddings/aligned/ar_64d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02bc5f2f59c273f6f58b680abc0742d269afaafa47c336c6429c6a81304b006c
+size 1258821280

models/embeddings/aligned/ar_64d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "ar", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ar_64d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9efa73eb7ec91ab64fdf021d74e55418d0906b47c6bc624aab5b6a37dbd88bb9
+size 16512

models/embeddings/aligned/ar_64d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "ar",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 200166,
+  "vocab_size": 1398324
+}

models/embeddings/monolingual/ar_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5a15e8d3f91894ef733d19288cf8f9905570f8b2cb6c96d55ef1706a098238e3
-size 2599525002

 version https://git-lfs.github.com/spec/v1
+oid sha256:ef219238cabd8a9ea12da66dbcd80332fbd38875433dc953df85a37a71b899aa
+size 2486763168

models/embeddings/monolingual/ar_128d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 1505991
 }

   "dimension": 128,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
+  "vocab_size": 1398324
 }

models/embeddings/monolingual/ar_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c69754e00f4a7215bd624c3eada95e3f38a60dfc1f0e7cb54a049f1e7c7dfe5b
-size 674923914

 version https://git-lfs.github.com/spec/v1
+oid sha256:70e78275bfd0f11c47d9f144be703b7d9277b2e2698a9f3c400035b92f124640
+size 644850336

models/embeddings/monolingual/ar_32d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 1505991
 }

   "dimension": 32,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
+  "vocab_size": 1398324
 }

models/embeddings/monolingual/ar_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b216bcc9d5802fb5f2765d412ed8e269961ce1d314758d5c5986a12d8d067325
-size 1316457610

 version https://git-lfs.github.com/spec/v1
+oid sha256:02bc5f2f59c273f6f58b680abc0742d269afaafa47c336c6429c6a81304b006c
+size 1258821280

models/embeddings/monolingual/ar_64d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 1505991
 }

   "dimension": 64,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
+  "vocab_size": 1398324
 }

models/subword_markov/ar_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f138ceea502bc716af3c32b8672e5560c86fb05241a4c6df25882580b5a54208
-size 1240058

 version https://git-lfs.github.com/spec/v1
+oid sha256:574edae048b28cd710d49d346838e995d3075aa7e9404f1962349f0f2b606f4f
+size 1510871

models/subword_markov/ar_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "ar",
-  "unique_contexts": 13038,
-  "total_transitions": 2227602784
 }

   "context_size": 1,
   "variant": "subword",
   "language": "ar",
+  "unique_contexts": 18570,
+  "total_transitions": 1931607401
 }

models/subword_markov/ar_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3927d88cac7fc454b48e23919fdaacb6c000902069154a165316b426b04cb9d2
-size 9037901

 version https://git-lfs.github.com/spec/v1
+oid sha256:4c019e23cfe8e01e37162bcd6b57200033e2d76cdc0cc88a628d6e7fbd6649b8
+size 10138087

models/subword_markov/ar_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "ar",
-  "unique_contexts": 227339,
-  "total_transitions": 2226268866
 }

   "context_size": 2,
   "variant": "subword",
   "language": "ar",
+  "unique_contexts": 247596,
+  "total_transitions": 1930342017
 }

models/subword_markov/ar_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:19911a0827041ed4348c506978baaae745ceac594b0d61c980d2bc3777aa6a97
-size 44692812

 version https://git-lfs.github.com/spec/v1
+oid sha256:1ef7a7d5ec14ab20dc8f9e56c4e3c940701ba628aca3d6c03fe556dff843d17d
+size 44109485

models/subword_markov/ar_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "ar",
-  "unique_contexts": 1153787,
-  "total_transitions": 2224934948
 }

   "context_size": 3,
   "variant": "subword",
   "language": "ar",
+  "unique_contexts": 1290623,
+  "total_transitions": 1929076633
 }

models/subword_markov/ar_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:67f755180f4df9e71b72cbe5cce476a0f75aa2d3bb2be4d74074ae69f6f89b19
-size 190825986

 version https://git-lfs.github.com/spec/v1
+oid sha256:f7ee34b80789c6d5441f10ed5482fffa62a5db280978fce263f369b0031fc1f2
+size 165497966

models/subword_markov/ar_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "ar",
-  "unique_contexts": 6025353,
-  "total_transitions": 2223601030
 }

   "context_size": 4,
   "variant": "subword",
   "language": "ar",
+  "unique_contexts": 5343485,
+  "total_transitions": 1927811249
 }

models/subword_ngram/ar_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:71ed96bee2deb0bad70f5449c7c15403d96685f635321bae37b67c5fac7ce756
-size 750740

 version https://git-lfs.github.com/spec/v1
+oid sha256:051d0c469791c68d0aab623ee6f2f4956d7da4e670aec6403120a79e8734828f
+size 1010043

models/subword_ngram/ar_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "ar",
-  "unique_ngrams": 52884,
-  "total_ngrams": 2227602784
 }

   "n": 2,
   "variant": "subword",
   "language": "ar",
+  "unique_ngrams": 70700,
+  "total_ngrams": 1931607401
 }

models/subword_ngram/ar_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3bbdad1db2897a969c03e3f47d560f9d1dea5e8a3448c3f90f1c693656c1a2ff
-size 6329144

 version https://git-lfs.github.com/spec/v1
+oid sha256:11567f60179d7d679ac0fe53473a44a51239bc6838cc731298039e543e3539ec
+size 7513640

models/subword_ngram/ar_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "ar",
-  "unique_ngrams": 487957,
-  "total_ngrams": 2226268866
 }

   "n": 3,
   "variant": "subword",
   "language": "ar",
+  "unique_ngrams": 528264,
+  "total_ngrams": 1930342017
 }

models/subword_ngram/ar_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1d75bfcaa4ba765d4a698064ad06b8d77a93f04f7e84e5b96dc323009209b04
-size 43956365

 version https://git-lfs.github.com/spec/v1
+oid sha256:cf2ba8cd29e1c4e91b5db8c9a037dd559bec62bb1a44450fe5ffc882168bea52
+size 40025731

models/subword_ngram/ar_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "ar",
-  "unique_ngrams": 3376435,
-  "total_ngrams": 2224934948
 }

   "n": 4,
   "variant": "subword",
   "language": "ar",
+  "unique_ngrams": 2851427,
+  "total_ngrams": 1929076633
 }

models/subword_ngram/ar_5gram_subword.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d827290f0e4a08a5ac6ad25f1e75c878bc6b8a34c0c2c90d128f1fea76e05bd
+size 139944679

models/subword_ngram/ar_5gram_subword_metadata.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "n": 5,
+  "variant": "subword",
+  "language": "ar",
+  "unique_ngrams": 9618770,
+  "total_ngrams": 1927811249
+}

models/tokenizer/ar_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d7edaf2dbdc3ada01d30b8717ebf8add3d42cafa3d77bcdd80f720a97d9746d1
-size 559100

 version https://git-lfs.github.com/spec/v1
+oid sha256:762cad48886cec152e6e3f7ed18568586bc3c9f4e9953b3185639b108ba8aac0
+size 560431

models/tokenizer/ar_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ar_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:442456b5ec25cb0683e0fe13294df9823fad435a42a56c7742c1cae081aa137a
-size 896676

 version https://git-lfs.github.com/spec/v1
+oid sha256:c76c24bcacf6d2321aead7205693601874ae17f908952dc1038e02c3176e6192
+size 898238

models/tokenizer/ar_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ar_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:95c3e0d8a7a2ddcdb77379df9ab674034abf4e5f512785ebba09ecfc8b354f66
-size 1589031

 version https://git-lfs.github.com/spec/v1
+oid sha256:f086baad9b6b110330fd920dd11031d9f3ef7f5660ab6ac1c35134bb867b9124
+size 1589613

models/tokenizer/ar_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ar_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f348b2c21a24d3a2ea09cf2eb93040bd9b203a25a891e426a51af0742693d90
-size 395404

 version https://git-lfs.github.com/spec/v1
+oid sha256:77f3c2a9fceaccd03fdcfabdbaec3cf8ededb0885e5142dd55eb2decced61240
+size 396806

models/tokenizer/ar_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/ar_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a53a54e5478b4d7db731e85c634c1c05678e6956d42d16a8b8195a001195afbf
-size 15296577

 version https://git-lfs.github.com/spec/v1
+oid sha256:b31772fd6a7b4c36fa172ae0fd4a4d5dabd71f58e9966f89680cfe83fda2db9c
+size 30017440

models/vocabulary/ar_vocabulary_metadata.json CHANGED Viewed

@@ -1,16 +1,17 @@
 {
   "language": "ar",
-  "vocabulary_size": 1000000,
   "statistics": {
-    "type_token_ratio": 0.014399599085234752,
     "coverage": {
-      "top_100": 0.24468957694891222,
-      "top_1000": 0.473562596402342,
-      "top_5000": 0.6748063946199147,
-      "top_10000": 0.7536829840670893
     },
-    "hapax_count": 3360857,
-    "hapax_ratio": 0.6262799827295155,
-    "total_documents": 1333918
   }
 }

 {
   "language": "ar",
+  "vocabulary_size": 1950572,
+  "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.01377005207307626,
     "coverage": {
+      "top_100": 0.22964907469610985,
+      "top_1000": 0.4549956533720101,
+      "top_5000": 0.6553878216380935,
+      "top_10000": 0.7363291209271269
     },
+    "hapax_count": 2521609,
+    "hapax_ratio": 0.5638432344308068,
+    "total_documents": 1265384
   }
 }

models/vocabulary/ar_vocabulary_top.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8f91926cd559196b96927f546da84a97a5bcb514ae35ee9b3c8cb9074f1b7c5
+size 15453105

models/vocabulary/ar_vocabulary_top_metadata.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "language": "ar",
+  "vocabulary_size": 1000000,
+  "variant": "top",
+  "statistics": {
+    "type_token_ratio": 0.01377005207307626,
+    "coverage": {
+      "top_100": 0.22964907469610985,
+      "top_1000": 0.4549956533720101,
+      "top_5000": 0.6553878216380935,
+      "top_10000": 0.7363291209271269
+    },
+    "hapax_count": 2521609,
+    "hapax_ratio": 0.5638432344308068,
+    "total_documents": 1265384,
+    "top_vocab_size": 1000000,
+    "coverage_ratio": 0.9850047861926305,
+    "tokens_excluded": 950572
+  }
+}

models/word_markov/ar_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:595835c3288deb49535347878aaafaab9a18550487e54619d262b4d69c8215b8
-size 680761022

 version https://git-lfs.github.com/spec/v1
+oid sha256:a9fddcbf1a23594543219647432ec4ead2edb2b2e1de36c8a8f47889080afef7
+size 765307915

models/word_markov/ar_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "ar",
-  "unique_contexts": 5367543,
-  "total_transitions": 454840210
 }

   "context_size": 1,
   "variant": "word",
   "language": "ar",
+  "unique_contexts": 4471621,
+  "total_transitions": 323510512
 }