omarkamali committed on Jan 3

Commit

bd17a4a

verified ·

1 Parent(s): 7ca7ea7

Upload all models and assets for arz (latest)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
README.md +212 -171
models/embeddings/aligned/arz_128d.bin +3 -0
models/embeddings/aligned/arz_128d.meta.json +1 -0
models/embeddings/aligned/arz_128d.projection.npy +3 -0
models/embeddings/aligned/arz_128d_metadata.json +8 -0
models/embeddings/aligned/arz_32d.bin +3 -0
models/embeddings/aligned/arz_32d.meta.json +1 -0
models/embeddings/aligned/arz_32d.projection.npy +3 -0
models/embeddings/aligned/arz_32d_metadata.json +8 -0
models/embeddings/aligned/arz_64d.bin +3 -0
models/embeddings/aligned/arz_64d.meta.json +1 -0
models/embeddings/aligned/arz_64d.projection.npy +3 -0
models/embeddings/aligned/arz_64d_metadata.json +8 -0
models/embeddings/monolingual/arz_128d.bin +2 -2
models/embeddings/monolingual/arz_128d_metadata.json +1 -1
models/embeddings/monolingual/arz_32d.bin +2 -2
models/embeddings/monolingual/arz_32d_metadata.json +1 -1
models/embeddings/monolingual/arz_64d.bin +2 -2
models/embeddings/monolingual/arz_64d_metadata.json +1 -1
models/subword_markov/arz_markov_ctx1_subword.parquet +2 -2
models/subword_markov/arz_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/arz_markov_ctx2_subword.parquet +2 -2
models/subword_markov/arz_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/arz_markov_ctx3_subword.parquet +2 -2
models/subword_markov/arz_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/arz_markov_ctx4_subword.parquet +2 -2
models/subword_markov/arz_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/arz_2gram_subword.parquet +2 -2
models/subword_ngram/arz_2gram_subword_metadata.json +2 -2
models/subword_ngram/arz_3gram_subword.parquet +2 -2
models/subword_ngram/arz_3gram_subword_metadata.json +2 -2
models/subword_ngram/arz_4gram_subword.parquet +2 -2
models/subword_ngram/arz_4gram_subword_metadata.json +2 -2
models/subword_ngram/arz_5gram_subword.parquet +3 -0
models/subword_ngram/arz_5gram_subword_metadata.json +7 -0
models/tokenizer/arz_tokenizer_16k.model +2 -2
models/tokenizer/arz_tokenizer_16k.vocab +0 -0
models/tokenizer/arz_tokenizer_32k.model +2 -2
models/tokenizer/arz_tokenizer_32k.vocab +0 -0
models/tokenizer/arz_tokenizer_64k.model +2 -2
models/tokenizer/arz_tokenizer_64k.vocab +0 -0
models/tokenizer/arz_tokenizer_8k.model +2 -2
models/tokenizer/arz_tokenizer_8k.vocab +0 -0
models/vocabulary/arz_vocabulary.parquet +2 -2
models/vocabulary/arz_vocabulary_metadata.json +9 -9
models/word_markov/arz_markov_ctx1_word.parquet +2 -2
models/word_markov/arz_markov_ctx1_word_metadata.json +2 -2
models/word_markov/arz_markov_ctx2_word.parquet +2 -2
models/word_markov/arz_markov_ctx2_word_metadata.json +2 -2

.gitattributes CHANGED Viewed

@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text

 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -10,11 +10,21 @@ tags:
   - n-gram
   - markov
   - wikipedia
   - monolingual
   - family-arabic
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,10 +33,10 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 3.905
   - name: best_isotropy
     type: isotropy
-    value: 0.7897
   - name: vocabulary_size
     type: vocab
     value: 0
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -80,47 +90,47 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 2.876x | 2.88 | 0.8210% | 1,709,035 |
-| **16k** | 3.215x | 3.22 | 0.9180% | 1,528,463 |
-| **32k** | 3.559x | 3.56 | 1.0163% | 1,380,735 |
-| **64k** | 3.905x 🏆 | 3.91 | 1.1149% | 1,258,558 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `تاملكوت هوا دوار فى المغرب. المكان تاملكوت موجود فى منطقه اداريه اسمها تماسين. س...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁تام لك وت ▁هوا ▁دوار ▁فى ▁المغرب . ▁المك ان ... (+24 more)` | 34 |
-| 16k | `▁تام لك وت ▁هوا ▁دوار ▁فى ▁المغرب . ▁المك ان ... (+24 more)` | 34 |
-| 32k | `▁تام لك وت ▁هوا ▁دوار ▁فى ▁المغرب . ▁المكان ▁تام ... (+23 more)` | 33 |
-| 64k | `▁تام لك وت ▁هوا ▁دوار ▁فى ▁المغرب . ▁المكان ▁تام ... (+23 more)` | 33 |
-**Sample 2:** `جيريمى ديفيدسون مخرج افلام من امريكا. حياته جيريمى ديفيدسون من مواليد يوم 24 ديس...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁جير يمى ▁ديفيد سون ▁مخرج ▁افلام ▁من ▁امريكا . ▁حياته ... (+23 more)` | 33 |
-| 16k | `▁جيريمى ▁ديفيد سون ▁مخرج ▁افلام ▁من ▁امريكا . ▁حياته ▁جيريمى ... (+21 more)` | 31 |
-| 32k | `▁جيريمى ▁ديفيدسون ▁مخرج ▁افلام ▁من ▁امريكا . ▁حياته ▁جيريمى ▁ديفيدسون ... (+19 more)` | 29 |
-| 64k | `▁جيريمى ▁ديفيدسون ▁مخرج ▁افلام ▁من ▁امريكا . ▁حياته ▁جيريمى ▁ديفيدسون ... (+19 more)` | 29 |
-**Sample 3:** `ابهينايا ممثله من الهند. حياتها ابهينايا من مواليد يوم 13 نوفمبر سنة فى كارناتاك...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁اب ه ينا يا ▁ممثله ▁من ▁الهند . ▁حياتها ▁اب ... (+28 more)` | 38 |
-| 16k | `▁اب ه ينا يا ▁ممثله ▁من ▁الهند . ▁حياتها ▁اب ... (+27 more)` | 37 |
-| 32k | `▁ابه ينا يا ▁ممثله ▁من ▁الهند . ▁حياتها ▁ابه ينا ... (+25 more)` | 35 |
-| 64k | `▁ابه ينا يا ▁ممثله ▁من ▁الهند . ▁حياتها ▁ابه ينا ... (+24 more)` | 34 |
 ### Key Findings
-- **Best Compression:** 64k achieves 3.905x compression
-- **Lowest UNK Rate:** 8k with 0.8210% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -137,12 +147,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | Word | 5,793 | 12.50 | 1,073,861 | 30.2% | 66.4% |
-| **2-gram** | Subword | 316 🏆 | 8.30 | 15,451 | 62.6% | 98.6% |
-| **3-gram** | Word | 8,299 | 13.02 | 1,682,809 | 28.5% | 62.7% |
-| **3-gram** | Subword | 2,021 | 10.98 | 129,923 | 30.1% | 74.0% |
-| **4-gram** | Word | 12,842 | 13.65 | 3,054,922 | 27.3% | 59.4% |
-| **4-gram** | Subword | 7,215 | 12.82 | 788,718 | 19.6% | 56.9% |
 ### Top 5 N-grams by Size
@@ -150,21 +162,21 @@ Below are sample sentences tokenized with each vocabulary size:
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `لينكات برانيه` | 1,293,684 |
-| 2 | `برانيه مصادر` | 1,167,581 |
-| 3 | `من مواليد` | 829,322 |
-| 4 | `مواليد يوم` | 809,177 |
 | 5 | `الاستوا السماوى` | 668,876 |
 **3-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `لينكات برانيه مصادر` | 1,164,952 |
-| 2 | `من مواليد يوم` | 809,029 |
 | 3 | `خط الاستوا السماوى` | 630,228 |
-| 4 | `الدايره الساعيه لجرم` | 445,892 |
-| 5 | `الساعيه لجرم سماوى` | 445,892 |
 **4-grams (Word):**
@@ -172,46 +184,66 @@ Below are sample sentences tokenized with each vocabulary size:
 |------|--------|-------|
 | 1 | `الدايره الساعيه لجرم سماوى` | 445,892 |
 | 2 | `السماوى تكون قيمة بعده` | 445,860 |
-| 3 | `خط الاستوا السماوى تكون` | 445,860 |
-| 4 | `الاستوا السماوى تكون قيمة` | 445,860 |
-| 5 | `لينكات برانيه مصادر من` | 320,727 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `_ ا` | 31,144,333 |
-| 2 | `ا ل` | 30,224,243 |
-| 3 | `ه _` | 17,180,633 |
-| 4 | `_ م` | 13,559,836 |
-| 5 | `ى _` | 11,805,719 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `_ ا ل` | 25,116,125 |
-| 2 | `ي ه _` | 6,396,587 |
-| 3 | `ه _ ا` | 6,346,797 |
-| 4 | `ا ل م` | 5,946,692 |
-| 5 | `_ م ن` | 4,537,386 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `ه _ ا ل` | 5,297,759 |
-| 2 | `_ ا ل م` | 5,200,038 |
-| 3 | `_ ف ى _` | 4,251,301 |
-| 4 | `_ م ن _` | 3,906,606 |
-| 5 | `_ ا ل ا` | 3,578,656 |
 ### Key Findings
-- **Best Perplexity:** 2-gram (subword) with 316
 - **Entropy Trend:** Increases with larger n-grams in the table above (more unique n-grams, so a higher measured entropy)
-- **Coverage:** Top-1000 patterns cover ~57% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -227,14 +259,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | Word | 1.2217 | 2.332 | 9.13 | 1,353,062 | 0.0% |
-| **1** | Subword | 1.0533 | 2.075 | 8.28 | 5,726 | 0.0% |
-| **2** | Word | 0.3648 | 1.288 | 1.91 | 12,336,484 | 63.5% |
-| **2** | Subword | 0.7848 | 1.723 | 5.54 | 47,379 | 21.5% |
-| **3** | Word | 0.1139 | 1.082 | 1.28 | 23,517,673 | 88.6% |
-| **3** | Subword | 0.7673 | 1.702 | 4.73 | 262,420 | 23.3% |
-| **4** | Word | 0.0625 🏆 | 1.044 | 1.17 | 29,894,419 | 93.7% |
-| **4** | Subword | 0.7433 | 1.674 | 3.81 | 1,241,425 | 25.7% |
 ### Generated Text Samples (Word-based)
@@ -242,27 +274,27 @@ Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
-1. `فى العالم حسب المساحه لستة اكبر بحيرات اوروبا لينكات مصادر من مملكه ايطاليا حياته الرياضيه بيلعب`
-2. `من مواليد يوم 16 يناير سنة فى ذا ماتشيس بتقدم الانواع الفنيه كانت دى لوبو من`
-3. `و بتنقاس بالانزياح الاحمر المطلع المستقيم ممكن يتقاس بقوس دايره الاستواء السماويه من الجرى و نادى`
 **Context Size 2:**
-1. `لينكات برانيه مصادر عجل ناريه من المانيا حياته اليكساندر انتونوڤيتش ريزونى اليكساندر انستروثير اليكس...`
-2. `برانيه مصادر كوره قدم من الميكسيك حياته اڤير كاباليرو اڤيرالدو فيريرا لاعب كورة قدم من اليابان حياته`
-3. `من مواليد يوم 19 اغسطس لسا عايشين فى استانبول لينكات برانيه مصادر هوكى الجليد من امريكا حياته`
 **Context Size 3:**
-1. `لينكات برانيه مصادر سكان سكان فى ايران المكان ادم درهسى عليا adam darrehsi ye olya هيا تجمع سكان`
-2. `من مواليد يوم 7 ديسمبر فى مونتفيدو الحياه الرياضيه بيلعب فى مركز مُدَافِع و لعب مع فريق ريال`
-3. `خط الاستوا السماوى تكون قيمة بعده بالموجب و لو النجم جنوب خط الاستوا السماوى لو كان النجم شمال`
 **Context Size 4:**
 1. `الدايره الساعيه لجرم سماوى و الدايره الساعيه لنقطة الاعتدال الربيعى المطلع المستقيم ممكن يتقاس بقوس ...`
-2. `الاستوا السماوى تكون قيمة بعده بالسالب مصادر كوكبه`
-3. `السماوى تكون قيمة بعده بالسالب مصادر 2ماس كوكبه`
 ### Generated Text Samples (Subword-based)
@@ -271,34 +303,34 @@ Below are text samples generated from each subword-based Markov chain model:
 **Context Size 1:**
-1. `_مرا_ارو_لسيه_س_`
-2. `الدريه_اثر_كالال`
-3. `لخطونالمطة_جو_عب`
 **Context Size 2:**
-1. `_الكريتالسمات_فى_`
-2. `اليه_عاعيه_مطلحجم`
-3. `ه_بقه_ليكا_بقوى_ا`
 **Context Size 3:**
-1. `_المستقيم_محمد_بيس`
-2. `يه_مصادر_كورة_قدم_`
-3. `ه_العقبت_برات_السم`
 **Context Size 4:**
-1. `ه_السماوى_مع_فريق_ن`
-2. `_المكافئ_الفلك._الم`
-3. `_فى_باردوه_مصادر_اس`
 ### Key Findings
-- **Best Predictability:** Context-4 (word) with 93.7% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (1,241,425 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -314,48 +346,48 @@ Below are text samples generated from each subword-based Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 856,070 |
-| Total Tokens | 116,711,182 |
-| Mean Frequency | 136.33 |
 | Median Frequency | 4 |
-| Frequency Std Dev | 9391.59 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | فى | 4,414,661 |
-| 2 | من | 3,909,776 |
-| 3 | و | 3,512,508 |
-| 4 | مصادر | 1,612,463 |
-| 5 | لينكات | 1,359,404 |
-| 6 | برانيه | 1,298,834 |
-| 7 | هيا | 1,062,266 |
-| 8 | اللى | 965,103 |
-| 9 | يوم | 853,034 |
-| 10 | مواليد | 836,295 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | algeriens | 2 |
-| 2 | وبتينا | 2 |
-| 3 | روتلُف | 2 |
-| 4 | bouabdellah | 2 |
-| 5 | الخُضرة | 2 |
-| 6 | impressionisms | 2 |
-| 7 | assyriaca | 2 |
-| 8 | جروكبيديا | 2 |
-| 9 | grokipedia | 2 |
-| 10 | grok | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.2602 |
-| R² (Goodness of Fit) | 0.994644 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
@@ -363,15 +395,15 @@ Below are text samples generated from each subword-based Markov chain model:
 | Top N Words | Coverage |
 |-------------|----------|
 | Top 100 | 46.0% |
-| Top 1,000 | 76.7% |
-| Top 5,000 | 85.9% |
-| Top 10,000 | 89.0% |
 ### Key Findings
-- **Zipf Compliance:** R²=0.9946 indicates excellent adherence to Zipf's law
 - **High Frequency Dominance:** Top 100 words cover 46.0% of corpus
-- **Long Tail:** 846,070 words needed for remaining 11.0% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -387,37 +419,40 @@ Below are text samples generated from each subword-based Markov chain model:
 ### 5.1 Cross-Lingual Alignment
-> *Note: Multilingual alignment visualization not available for this language.*
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
-| **mono_32d** | 32 | 0.7897 🏆 | 0.3482 | N/A | N/A |
-| **mono_64d** | 64 | 0.7690 | 0.2976 | N/A | N/A |
-| **mono_128d** | 128 | 0.7177 | 0.2526 | N/A | N/A |
 ### Key Findings
-- **Best Isotropy:** mono_32d with 0.7897 (more uniform distribution)
-- **Semantic Density:** Average pairwise similarity of 0.2995. Lower values indicate better semantic separation.
-- **Alignment Quality:** No aligned models evaluated in this run.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
-> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
-| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
-| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
 ### 6.2 Affix Inventory (Productive Units)
@@ -426,13 +461,15 @@ These are the most productive prefixes and suffixes identified by sampling the v
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
-| `-ال` | الخوذ, المندوبين, الدمرداشيه |
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
-| `-ين` | كلوكيرين, بيرجرين, المندوبين |
-| `-ان` | مالڤان, ملازمان, پايرلمان |
 ### 6.3 Bound Stems (Lexical Roots)
@@ -440,18 +477,18 @@ Bound stems are high-frequency subword units that are semantically cohesive but
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
-| `العا` | 1.85x | 296 contexts | العام, العاج, العال |
-| `المج` | 1.79x | 267 contexts | المجد, المجر, المجئ |
-| `انزي` | 1.95x | 165 contexts | انزيا, انزيت, انزيد |
-| `الشع` | 2.11x | 103 contexts | الشعب, الشعف, الشعز |
-| `ياته` | 2.11x | 96 contexts | عياته, آياته, حياته |
-| `الاع` | 2.00x | 107 contexts | الاعور, الاعتر, الاعدا |
-| `مستق` | 2.01x | 80 contexts | مستقل, مستقر, مستقله |
-| `الاح` | 1.79x | 110 contexts | الاحد, صالاحى, الاحرش |
-| `لموج` | 2.13x | 48 contexts | لموجة, الموج, الموجب |
-| `لمجر` | 1.85x | 71 contexts | لمجره, المجر, لمجرة |
-| `لساع` | 2.34x | 28 contexts | لساعة, الساعة, لساعات |
-| `مريك` | 1.69x | 102 contexts | لمريك, مريكا, مريكن |
 ### 6.4 Affix Compatibility (Co-occurrence)
@@ -459,8 +496,12 @@ This table shows which prefixes and suffixes most frequently co-occur on the sam
 | Prefix | Suffix | Frequency | Examples |
 |--------|--------|-----------|----------|
-| `-ال` | `-ين` | 47 words | الصديقين, الحدوديين |
-| `-ال` | `-ان` | 11 words | الأخوان, الترامان |
 ### 6.5 Recursive Morpheme Segmentation
@@ -468,26 +509,26 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
-| السريانيين | **`ال-سرياني-ين`** | 6.0 | `سرياني` |
-| كانتيلينين | **`كانتيل-ين-ين`** | 6.0 | `كانتيل` |
-| الجينومية | **`ال-جينومية`** | 4.5 | `جينومية` |
-| البرمجيات | **`ال-برمجيات`** | 4.5 | `برمجيات` |
-| الاستعلامات | **`ال-استعلامات`** | 4.5 | `استعلامات` |
-| بيجلاندسفچوردين | **`بيجلاندسفچورد-ين`** | 4.5 | `بيجلاندسفچورد` |
-| السينابون | **`ال-سينابون`** | 4.5 | `سينابون` |
-| الديمقراطي | **`ال-ديمقراطي`** | 4.5 | `ديمقراطي` |
-| الانبعاثية | **`ال-انبعاثية`** | 4.5 | `انبعاثية` |
-| الميتانيه | **`ال-ميتانيه`** | 4.5 | `ميتانيه` |
-| الطويحينه | **`ال-طويحينه`** | 4.5 | `طويحينه` |
-| الصابونجى | **`ال-صابونجى`** | 4.5 | `صابونجى` |
-| البنغاليه | **`ال-بنغاليه`** | 4.5 | `بنغاليه` |
-| المتحدثون | **`ال-متحدثون`** | 4.5 | `متحدثون` |
-| ستشميدلين | **`ستشميدل-ين`** | 4.5 | `ستشميدل` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
-The language Egyptian Arabic appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 ---
 ## 7. Summary & Recommendations
@@ -498,9 +539,9 @@ The language Egyptian Arabic appears to be more isolating or has a highly fixed
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **64k BPE** | Best compression (3.91x) |
-| N-gram | **2-gram** | Lowest perplexity (316) |
-| Markov | **Context-4** | Highest predictability (93.7%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
@@ -714,4 +755,4 @@ MIT License - Free for academic and commercial use.
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2026-01-03 07:45:31*

   - n-gram
   - markov
   - wikipedia
+  - feature-extraction
+  - sentence-similarity
+  - tokenization
+  - n-grams
+  - markov-chain
+  - text-mining
+  - fasttext
+  - babelvec
+  - vocabulous
+  - vocabulary
   - monolingual
   - family-arabic
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 3.899
   - name: best_isotropy
     type: isotropy
+    value: 0.7938
   - name: vocabulary_size
     type: vocab
     value: 0
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 2.872x | 2.87 | 0.8437% | 1,716,209 |
+| **16k** | 3.211x | 3.21 | 0.9431% | 1,535,351 |
+| **32k** | 3.553x | 3.55 | 1.0437% | 1,387,311 |
+| **64k** | 3.899x 🏆 | 3.90 | 1.1453% | 1,264,296 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `سينافريدى ( الاسم العلمى: Synaphridae ) هوا فصيله من العنكبيات بيتبع عنكبوت. لين...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁سين اف ريد ى ▁( ▁الاسم ▁العلم ى : ▁s ... (+29 more)` | 39 |
+| 16k | `▁سين اف ريدى ▁( ▁الاسم ▁العلمى : ▁s yn ap ... (+24 more)` | 34 |
+| 32k | `▁سين اف ريدى ▁( ▁الاسم ▁العلمى : ▁syn ap h ... (+22 more)` | 32 |
+| 64k | `▁سين اف ريدى ▁( ▁الاسم ▁العلمى : ▁syn aph rida ... (+20 more)` | 30 |
+**Sample 2:** `اينديرا باچت لاعبه شطرنج من سلوفينيا و كازاخستان. حياتها اينديرا باچت من مواليد ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁ايند يرا ▁با چ ت ▁لاعبه ▁شطرنج ▁من ▁سلوفينيا ▁و ... (+24 more)` | 34 |
+| 16k | `▁ايند يرا ▁با چ ت ▁لاعبه ▁شطرنج ▁من ▁سلوفينيا ▁و ... (+24 more)` | 34 |
+| 32k | `▁ايند يرا ▁باچ ت ▁لاعبه ▁شطرنج ▁من ▁سلوفينيا ▁و ▁كازاخستان ... (+22 more)` | 32 |
+| 64k | `▁ايند يرا ▁باچ ت ▁لاعبه ▁شطرنج ▁من ▁سلوفينيا ▁و ▁كازاخستان ... (+22 more)` | 32 |
+**Sample 3:** `مفطورة الخنازير ( الاسم العلمى: Mycoplasma suis ) هوا نوع من بدائيات النوى بيتبع...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁مف ط ورة ▁الخ نا زير ▁( ▁الاسم ▁العلم ى ... (+32 more)` | 42 |
+| 16k | `▁مف ط ورة ▁الخ نا زير ▁( ▁الاسم ▁العلمى : ... (+30 more)` | 40 |
+| 32k | `▁مف ط ورة ▁الخ نا زير ▁( ▁الاسم ▁العلمى : ... (+30 more)` | 40 |
+| 64k | `▁مف ط ورة ▁الخ نا زير ▁( ▁الاسم ▁العلمى : ... (+29 more)` | 39 |
 ### Key Findings
+- **Best Compression:** 64k achieves 3.899x compression
+- **Lowest UNK Rate:** 8k with 0.8437% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 5,833 | 12.51 | 1,079,967 | 30.2% | 66.4% |
+| **2-gram** | Subword | 317 🏆 | 8.31 | 15,559 | 62.6% | 98.6% |
+| **3-gram** | Word | 8,334 | 13.02 | 1,690,048 | 28.5% | 62.7% |
+| **3-gram** | Subword | 2,031 | 10.99 | 130,688 | 30.0% | 73.9% |
+| **4-gram** | Word | 12,878 | 13.65 | 3,065,781 | 27.3% | 59.4% |
+| **4-gram** | Subword | 7,269 | 12.83 | 793,433 | 19.5% | 56.8% |
+| **5-gram** | Word | 13,448 | 13.72 | 3,166,704 | 28.9% | 59.2% |
+| **5-gram** | Subword | 18,103 | 14.14 | 2,865,423 | 14.0% | 48.6% |
 ### Top 5 N-grams by Size
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `لينكات برانيه` | 1,294,219 |
+| 2 | `برانيه مصادر` | 1,167,266 |
+| 3 | `من مواليد` | 829,316 |
+| 4 | `مواليد يوم` | 809,154 |
 | 5 | `الاستوا السماوى` | 668,876 |
 **3-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `لينكات برانيه مصادر` | 1,164,637 |
+| 2 | `من مواليد يوم` | 809,006 |
 | 3 | `خط الاستوا السماوى` | 630,228 |
+| 4 | `الساعيه لجرم سماوى` | 445,892 |
+| 5 | `الدايره الساعيه لجرم` | 445,892 |
 **4-grams (Word):**
 |------|--------|-------|
 | 1 | `الدايره الساعيه لجرم سماوى` | 445,892 |
 | 2 | `السماوى تكون قيمة بعده` | 445,860 |
+| 3 | `الاستوا السماوى تكون قيمة` | 445,860 |
+| 4 | `خط الاستوا السماوى تكون` | 445,860 |
+| 5 | `لينكات برانيه مصادر من` | 320,790 |
+**5-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `خط الاستوا السماوى تكون قيمة` | 445,860 |
+| 2 | `الاستوا السماوى تكون قيمة بعده` | 445,860 |
+| 3 | `لستة اكبر بحيرات العالم حسب` | 255,463 |
+| 4 | `السماويه اللى المجره جزء منها` | 222,981 |
+| 5 | `صوره و هيا مجال الكره` | 222,975 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ ا` | 31,094,853 |
+| 2 | `ا ل` | 30,178,157 |
+| 3 | `ه _` | 17,208,514 |
+| 4 | `_ م` | 13,583,995 |
+| 5 | `ى _` | 11,832,103 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ ا ل` | 25,055,980 |
+| 2 | `ي ه _` | 6,400,461 |
+| 3 | `ه _ ا` | 6,229,523 |
+| 4 | `ا ل م` | 5,957,557 |
+| 5 | `_ م ن` | 4,545,069 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ ا ل م` | 5,209,448 |
+| 2 | `ه _ ا ل` | 5,178,964 |
+| 3 | `_ ف ى _` | 4,259,956 |
+| 4 | `_ م ن _` | 3,913,053 |
+| 5 | `_ ا ل ا` | 3,581,934 |
+**5-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ م ن _ ا` | 1,823,528 |
+| 2 | `ر ه _ ا ل` | 1,712,451 |
+| 3 | `م ص ا د ر` | 1,614,472 |
+| 4 | `_ م ص ا د` | 1,612,850 |
+| 5 | `_ ل ي ن ك` | 1,400,053 |
 ### Key Findings
+- **Best Perplexity:** 2-gram (subword) with 317
 - **Entropy Trend:** Increases with larger n-grams in the table above (word: 12.51 → 13.72; subword: 8.31 → 14.14), as the number of unique n-grams grows
+- **Coverage:** Top-1000 patterns cover ~49% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 1.2202 | 2.330 | 9.16 | 1,361,925 | 0.0% |
+| **1** | Subword | 1.0545 | 2.077 | 8.26 | 5,787 | 0.0% |
+| **2** | Word | 0.3640 | 1.287 | 1.91 | 12,454,727 | 63.6% |
+| **2** | Subword | 0.7835 | 1.721 | 5.53 | 47,806 | 21.7% |
+| **3** | Word | 0.1137 | 1.082 | 1.27 | 23,730,854 | 88.6% |
+| **3** | Subword | 0.7666 | 1.701 | 4.73 | 264,404 | 23.3% |
+| **4** | Word | 0.0623 🏆 | 1.044 | 1.17 | 30,143,409 | 93.8% |
+| **4** | Subword | 0.7433 | 1.674 | 3.81 | 1,249,901 | 25.7% |
 ### Generated Text Samples (Word-based)
 **Context Size 1:**
+1. `فى مرصد لويل للتدوير عن تشغيلها willer trains wales police beats and diocesan links milwaukee holy`
+2. `من امستردام 16 اكتوبر فى مركز الكواكب الصغيره مصادر من النجوم اللى جايه لينا من البرتغال`
+3. `و بكده عملية فى الحزب الديمقراطى المسيحى اشتغل فى ابوت توريبيو الكوليا مساحتها 4 سبتمبر سنة`
 **Context Size 2:**
+1. `لينكات برانيه مصادر اليمن يمنيه`
+2. `برانيه مصادر صدرى من المملكه المتحده عضو برلمان المملكه المتحده حياته نيل ماثيوز ميك ديسبوروج ريس تش...`
+3. `من مواليد يوم 12 يونيه فى لوس انجليس اغانى اغانى نيو ويڤ جوايز لينكات برانيه مصادر من`
 **Context Size 3:**
+1. `لينكات برانيه مصادر من النرويج فى جامعة كوبينهاجين و جامعة جوتينجن و جامعة زيورخ و المعهد الفدرالى ا...`
+2. `من مواليد يوم 3 يناير فى تارنوف مات فى 16 يناير الحياه العمليه كان عضو فى academic division`
+3. `خط الاستوا السماوى تكون قيمة بعده بالسالب مصادر مايور 2ماس`
 **Context Size 4:**
 1. `الدايره الساعيه لجرم سماوى و الدايره الساعيه لنقطة الاعتدال الربيعى المطلع المستقيم ممكن يتقاس بقوس ...`
+2. `الاستوا السماوى تكون قيمة بعده بالموجب و لو النجم جنوب خط الاستوا السماوى تكون قيمة بعده بالموجب و ل...`
+3. `السماوى تكون قيمة بعده بالسالب مصادر مايور 2ماس`
 ### Generated Text Samples (Subword-based)
 **Context Size 1:**
+1. `_اعاده_اكلمطقص_ا`
+2. `انجنجويناتيلودره`
+3. `ل_اوانيره._حيا_ا`
 **Context Size 2:**
+1. `_الحارض_فراكريفيا`
+2. `النظمى_نقطه_الشعا`
+3. `ه_الربيس_الداد_(+`
 **Context Size 3:**
+1. `_الاكتوردشت_كندا._`
+2. `يه_لجرم_الحرة._نظا`
+3. `ه_المقرا_جبات_فى_م`
 **Context Size 4:**
+1. `_المتحده_فضاء_منها.`
+2. `ه_السكان_سكان_فى_كو`
+3. `_فى_مركز_مُدَافِع,_و_ه`
 ### Key Findings
+- **Best Predictability:** Context-4 (word) with 93.8% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (1,249,901 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 859,607 |
+| Total Tokens | 116,985,057 |
+| Mean Frequency | 136.09 |
 | Median Frequency | 4 |
+| Frequency Std Dev | 9386.65 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | فى | 4,423,347 |
+| 2 | من | 3,916,260 |
+| 3 | و | 3,516,072 |
+| 4 | مصادر | 1,612,738 |
+| 5 | لينكات | 1,359,751 |
+| 6 | برانيه | 1,299,373 |
+| 7 | هيا | 1,062,774 |
+| 8 | اللى | 967,317 |
+| 9 | يوم | 853,586 |
+| 10 | مواليد | 836,389 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | ثاكراي | 2 |
+| 2 | تشوهاتها | 2 |
+| 3 | جبائر | 2 |
+| 4 | jesuss | 2 |
+| 5 | وأران | 2 |
+| 6 | مرثير | 2 |
+| 7 | راثماينز | 2 |
+| 8 | غرانغغورمان | 2 |
+| 9 | grangegorman | 2 |
+| 10 | ditsu | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 1.2584 |
+| R² (Goodness of Fit) | 0.994685 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
 | Top 100 | 46.0% |
+| Top 1,000 | 76.5% |
+| Top 5,000 | 85.8% |
+| Top 10,000 | 88.9% |
 ### Key Findings
+- **Zipf Compliance:** R²=0.9947 indicates excellent adherence to Zipf's law
 - **High Frequency Dominance:** Top 100 words cover 46.0% of corpus
+- **Long Tail:** 849,607 words needed for remaining 11.1% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ### 5.1 Cross-Lingual Alignment
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.7938 | 0.3446 | N/A | N/A |
+| **mono_64d** | 64 | 0.7682 | 0.2977 | N/A | N/A |
+| **mono_128d** | 128 | 0.7168 | 0.2564 | N/A | N/A |
+| **aligned_32d** | 32 | 0.7938 🏆 | 0.3389 | 0.1080 | 0.4340 |
+| **aligned_64d** | 64 | 0.7682 | 0.3004 | 0.2180 | 0.6240 |
+| **aligned_128d** | 128 | 0.7168 | 0.2666 | 0.3440 | 0.7120 |
 ### Key Findings
+- **Best Isotropy:** aligned_32d with 0.7938 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.3008. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 34.4% R@1 in cross-lingual retrieval.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **0.218** | High formulaic/idiomatic content | - |
 ### 6.2 Affix Inventory (Productive Units)
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
+| `-ال` | الطبقه, التحاقه, التنسيق |
+| `-وا` | واعتراف, وازواجها, والسحالى |
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
+| `-ين` | ڤيكيلين, لالغليمين, كورجتچارنين |
+| `-ان` | فالسارتان, نيوبان, تيزمان |
+| `-ون` | اندريلتون, ازانون, السيويون |
 ### 6.3 Bound Stems (Lexical Roots)
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
+| `المج` | 1.77x | 271 contexts | المجن, المجد, المجل |
+| `ياته` | 2.08x | 97 contexts | بياته, آياته, عياته |
+| `الشع` | 2.04x | 104 contexts | الشعف, الشعر, الشعب |
+| `انزي` | 1.84x | 164 contexts | انزيچ, انزيت, انزيغ |
+| `الاع` | 1.91x | 107 contexts | الاعمل, الاعدا, الاعيب |
+| `لموج` | 2.21x | 48 contexts | لموجة, الموج, الموجة |
+| `الاح` | 1.75x | 110 contexts | الاحد, الاحرد, والاحد |
+| `مستق` | 1.86x | 81 contexts | مستقر, مستقل, ومستقل |
+| `لمجر` | 1.87x | 71 contexts | لمجرى, لمجرم, للمجر |
+| `لساع` | 2.28x | 28 contexts | لساعة, الساعى, لساعته |
+| `لمطل` | 2.23x | 29 contexts | لمطلع, المطل, المطله |
+| `لسما` | 1.60x | 110 contexts | لسماء, للسما, لسماع |
 ### 6.4 Affix Compatibility (Co-occurrence)
 | Prefix | Suffix | Frequency | Examples |
 |--------|--------|-----------|----------|
+| `-ال` | `-ين` | 42 words | المسؤولين, الهواريين |
+| `-ال` | `-ون` | 27 words | الغويلفيون, المراديون |
+| `-ال` | `-ان` | 16 words | الشخصان, اليرقان |
+| `-وا` | `-ين` | 6 words | والاصلاحيين, والمخبرين |
+| `-وا` | `-ان` | 4 words | وايزمان, والغثيان |
+| `-وا` | `-ون` | 4 words | واسيون, وايتيلون |
 ### 6.5 Recursive Morpheme Segmentation
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
+| الرومانيتين | **`ال-رومانيت-ين`** | 6.0 | `رومانيت` |
+| والمنظمين | **`وا-لمنظم-ين`** | 6.0 | `لمنظم` |
+| والخريجون | **`وا-لخريج-ون`** | 6.0 | `لخريج` |
+| اوليمبيين | **`اوليمبي-ين`** | 4.5 | `اوليمبي` |
+| الفينلاندى | **`ال-فينلاندى`** | 4.5 | `فينلاندى` |
+| لوڤتچارنين | **`لوڤتچارن-ين`** | 4.5 | `لوڤتچارن` |
+| الرحمانوف | **`ال-رحمانوف`** | 4.5 | `رحمانوف` |
+| الإرسالية | **`ال-إرسالية`** | 4.5 | `إرسالية` |
+| جيريدهاران | **`جيريدهار-ان`** | 4.5 | `جيريدهار` |
+| البرمائيات | **`ال-برمائيات`** | 4.5 | `برمائيات` |
+| المتبادلة | **`ال-متبادلة`** | 4.5 | `متبادلة` |
+| المستخرجة | **`ال-مستخرجة`** | 4.5 | `مستخرجة` |
+| الباراجواى | **`ال-باراجواى`** | 4.5 | `باراجواى` |
+| الايرلندى | **`ال-ايرلندى`** | 4.5 | `ايرلندى` |
+| التصميمات | **`ال-تصميمات`** | 4.5 | `تصميمات` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
+Egyptian Arabic shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
 ---
 ## 7. Summary & Recommendations
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
+| Tokenizer | **64k BPE** | Best compression (3.90x) |
+| N-gram | **2-gram** | Lowest perplexity (317) |
+| Markov | **Context-4** | Highest predictability (93.8%) |
 | Embeddings | **128d (aligned)** | Best cross-lingual retrieval (34.4% R@1) |
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-03 20:14:21*

models/embeddings/aligned/arz_128d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a93d1b1d200a873b6e9a9096c7b18c4397ac9a643ca73002034e62b536fd9f20
+size 1529650607

models/embeddings/aligned/arz_128d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "arz", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/arz_128d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45855ffa01ef1ccad3191df9709fde0e11b3fef8b54f24f7bb3fc0f139ab0ab4
+size 65664

models/embeddings/aligned/arz_128d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "arz",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 51508,
+  "vocab_size": 483420
+}

models/embeddings/aligned/arz_32d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c6256df8926d38898c2a7314db52af2add7a2924515e162e6bf2132f5dcc152
+size 390384047

models/embeddings/aligned/arz_32d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "arz", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/arz_32d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:879d846167a92c16b6141f9bf5e57809288cfd40ec199b718f6fe5433fddc2da
+size 4224

models/embeddings/aligned/arz_32d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "arz",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 51508,
+  "vocab_size": 483420
+}

models/embeddings/aligned/arz_64d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc863c082135ba1da04b2bd1dc6317a645458adf2de617806c8169b70f01318d
+size 770139567

models/embeddings/aligned/arz_64d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "arz", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/arz_64d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e4a7cbecfb3aca876da085b9eca7016b1a274c8e52c96adf1a43793d6d9ffff7
+size 16512

models/embeddings/aligned/arz_64d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "arz",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 51508,
+  "vocab_size": 483420
+}

models/embeddings/monolingual/arz_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5bdb36e12bc1a678fd5c157ad32e02341de1eca60c4bc2b5ef93fe49b61bd555
-size 1527330535

 version https://git-lfs.github.com/spec/v1
+oid sha256:a93d1b1d200a873b6e9a9096c7b18c4397ac9a643ca73002034e62b536fd9f20
+size 1529650607

models/embeddings/monolingual/arz_128d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 128
   },
-  "vocab_size": 481203
 }

     "encoding_method": "rope",
     "dim": 128
   },
+  "vocab_size": 483420
 }

models/embeddings/monolingual/arz_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:30582a755835303bdd9d0ea4ad7243b43b631aa397c3e567b21c8d4a5c4449d3
-size 389766631

 version https://git-lfs.github.com/spec/v1
+oid sha256:8c6256df8926d38898c2a7314db52af2add7a2924515e162e6bf2132f5dcc152
+size 390384047

models/embeddings/monolingual/arz_32d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 32
   },
-  "vocab_size": 481203
 }

     "encoding_method": "rope",
     "dim": 32
   },
+  "vocab_size": 483420
 }

models/embeddings/monolingual/arz_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5068a8b810efa630cdf0314c353a4495557bcd65811e48356936ecd7a645b35c
-size 768954599

 version https://git-lfs.github.com/spec/v1
+oid sha256:bc863c082135ba1da04b2bd1dc6317a645458adf2de617806c8169b70f01318d
+size 770139567

models/embeddings/monolingual/arz_64d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 64
   },
-  "vocab_size": 481203
 }

     "encoding_method": "rope",
     "dim": 64
   },
+  "vocab_size": 483420
 }

models/subword_markov/arz_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:93b9ef162389968b455ca3710639cb0faa0e67079e19324a4f3d232dc220101e
-size 347493

 version https://git-lfs.github.com/spec/v1
+oid sha256:d88aa401545a7cb868030c3e704a39018094110969e595475748f34cfa468195
+size 351656

models/subword_markov/arz_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "arz",
-  "unique_contexts": 5726,
-  "total_transitions": 693777470
 }

   "context_size": 1,
   "variant": "subword",
   "language": "arz",
+  "unique_contexts": 5787,
+  "total_transitions": 695246839
 }

models/subword_markov/arz_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:98db3946a7c34d13376e437f6401ae1d31aa408e141da100b0793536f9682ba1
-size 2267694

 version https://git-lfs.github.com/spec/v1
+oid sha256:ef4b04112512f9152b8d2a8aead3dae96220811aada1e938a3a2fd3032aa1151
+size 2279775

models/subword_markov/arz_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "arz",
-  "unique_contexts": 47379,
-  "total_transitions": 692148775
 }

   "context_size": 2,
   "variant": "subword",
   "language": "arz",
+  "unique_contexts": 47806,
+  "total_transitions": 693617577
 }

models/subword_markov/arz_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:791a2be4e464c30d58ac081746362b42037f50dd8d0c4b552c3b0e68c2518dea
-size 10864076

 version https://git-lfs.github.com/spec/v1
+oid sha256:c74fb24b2618a95b7dcdf3dfe34859bd85a929b291258c2ce22882405f437ee8
+size 10965216

models/subword_markov/arz_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "arz",
-  "unique_contexts": 262420,
-  "total_transitions": 690520080
 }

   "context_size": 3,
   "variant": "subword",
   "language": "arz",
+  "unique_contexts": 264404,
+  "total_transitions": 691988315
 }

models/subword_markov/arz_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f9412f5ef232c0c613c068829b486f969ed98f144b0956cfba48ca7651d697cc
-size 40934252

 version https://git-lfs.github.com/spec/v1
+oid sha256:d9949ab9e4f3a093304bfe0dffdbb24f12be13664e07076bc4388ea6a8b3e2e1
+size 41091997

models/subword_markov/arz_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "arz",
-  "unique_contexts": 1241425,
-  "total_transitions": 688891385
 }

   "context_size": 4,
   "variant": "subword",
   "language": "arz",
+  "unique_contexts": 1249901,
+  "total_transitions": 690359053
 }

models/subword_ngram/arz_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:54b1e94823bf6eda78e610a31f50839e3eb6945296345628a51c2775bbc535b5
-size 225336

 version https://git-lfs.github.com/spec/v1
+oid sha256:5800d1d849b8e18cc45db646a0757e75ac10bd88fb659b78004d6cebf8166c07
+size 226684

models/subword_ngram/arz_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "arz",
-  "unique_ngrams": 15451,
-  "total_ngrams": 693777470
 }

   "n": 2,
   "variant": "subword",
   "language": "arz",
+  "unique_ngrams": 15559,
+  "total_ngrams": 695246839
 }

models/subword_ngram/arz_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6241036ff50c386080b4d36854fdc224c1c5099af471b3862e1464cd644b63ae
-size 1697106

 version https://git-lfs.github.com/spec/v1
+oid sha256:ad77c5c7d64974ac45c5a464440d8672c08a593d927e741aa02bc7aa9c8b3068
+size 1710417

models/subword_ngram/arz_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "arz",
-  "unique_ngrams": 129923,
-  "total_ngrams": 692148775
 }

   "n": 3,
   "variant": "subword",
   "language": "arz",
+  "unique_ngrams": 130688,
+  "total_ngrams": 693617577
 }

models/subword_ngram/arz_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c000b6a963ccc11a1bb0feebe6d49196497cbb6667e247ab3df4764a4e91003c
-size 10220298

 version https://git-lfs.github.com/spec/v1
+oid sha256:785394e1c4ea1d480da60e9f5a6a5f1c79f89c12ac018998bb84854d099919ff
+size 10290736

models/subword_ngram/arz_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "arz",
-  "unique_ngrams": 788718,
-  "total_ngrams": 690520080
 }

   "n": 4,
   "variant": "subword",
   "language": "arz",
+  "unique_ngrams": 793433,
+  "total_ngrams": 691988315
 }

models/subword_ngram/arz_5gram_subword.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:027d1bef375f12bc153fff490b146974f8a468599aea359db02d4447fd9dedbb
+size 39354928

models/subword_ngram/arz_5gram_subword_metadata.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "n": 5,
+  "variant": "subword",
+  "language": "arz",
+  "unique_ngrams": 2865423,
+  "total_ngrams": 690359053
+}

models/tokenizer/arz_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8182c7627a2b642bc8213e1f478fb8483ae43e1decdfb9d36a8b81c0b1e5db70
-size 553522

 version https://git-lfs.github.com/spec/v1
+oid sha256:c7fe49582ea5c0bd25a74c4bd56aa7935cb61efc4d2207d8478b9f55c1deb88e
+size 553566

models/tokenizer/arz_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/arz_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3f58411587d93e3605bc0bfb493f2adb152e457978754608d4bfa1098bbe3484
-size 874271

 version https://git-lfs.github.com/spec/v1
+oid sha256:65590445b44ef400b2d4f5156f78a4370c48fef1256a27ea8cd6dc9331f5494f
+size 874340

models/tokenizer/arz_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/arz_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b7caa9ae0cc952f588395f385251d423792d50e832f759c19f62d2debfa53c97
-size 1535709

 version https://git-lfs.github.com/spec/v1
+oid sha256:624e67c5ca7f66135be05eb14503d5c8a3bba01e8de29e986a29eb35a8738253
+size 1535714

models/tokenizer/arz_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/arz_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:75cf05a2c2e3d6160ef1a57d3e6c1105fcab185e64a3ffd3fdab63dacc685a1f
-size 396360

 version https://git-lfs.github.com/spec/v1
+oid sha256:61a540a4e4713c4e1bef88d1433bc6ccd1d63553a9ef85c3114448773ca9f029
+size 396191

models/tokenizer/arz_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/arz_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f49f18512e3b41813735d519aef4205264437e74599a3808b6c8fbea97f3bb17
-size 12321602

 version https://git-lfs.github.com/spec/v1
+oid sha256:1aa9c6e17d25fc4c2264c9a06b8ad84b9a303e652725edfac492f8aeacab09e3
+size 12376425

models/vocabulary/arz_vocabulary_metadata.json CHANGED Viewed

@@ -1,17 +1,17 @@
 {
   "language": "arz",
-  "vocabulary_size": 856070,
   "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.011546453807845636,
     "coverage": {
-      "top_100": 0.4579160902506231,
-      "top_1000": 0.7632735433913325,
-      "top_5000": 0.8553371329341142,
-      "top_10000": 0.885855315521865
     },
-    "hapax_count": 497272,
-    "hapax_ratio": 0.36744001146790684,
-    "total_documents": 1628695
   }
 }

 {
   "language": "arz",
+  "vocabulary_size": 859607,
   "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.011594418548720495,
     "coverage": {
+      "top_100": 0.45771756046088624,
+      "top_1000": 0.762176545686491,
+      "top_5000": 0.8545014147912448,
+      "top_10000": 0.8851748938277777
     },
+    "hapax_count": 502594,
+    "hapax_ratio": 0.3689572977849818,
+    "total_documents": 1629262
   }
 }

models/word_markov/arz_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27fac99b38072bd41d19f9620ff6d00511cdf0cb2c23718cc24825c3425b0c81
-size 125535197

 version https://git-lfs.github.com/spec/v1
+oid sha256:3ccbdd7f412f35796c8d761dd8df63b88ffbaae4d67baaced253ef9f0a25fa0d
+size 126951261

models/word_markov/arz_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "arz",
-  "unique_contexts": 1353062,
-  "total_transitions": 115579759
 }

   "context_size": 1,
   "variant": "word",
   "language": "arz",
+  "unique_contexts": 1361925,
+  "total_transitions": 115858389
 }

models/word_markov/arz_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9256f52202a6653f8df7a39b233dce04b0731a6961f54294e451f0ff631a642f
-size 413546149

 version https://git-lfs.github.com/spec/v1
+oid sha256:205fa0c25e52a899e1ca07f401999e2e9d1bfb2c14765ab1e5a8ef141f5c442c
+size 418545596

models/word_markov/arz_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "arz",
-  "unique_contexts": 12336484,
-  "total_transitions": 113951064
 }

   "context_size": 2,
   "variant": "word",
   "language": "arz",
+  "unique_contexts": 12454727,
+  "total_transitions": 114229127
 }