omarkamali committed on
Commit 8c45e31 · verified · 1 Parent(s): 95b76b9

Upload all models and assets for ay (latest)

Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +339 -141
  3. models/embeddings/aligned/ay_128d.bin +3 -0
  4. models/embeddings/aligned/ay_128d.meta.json +1 -0
  5. models/embeddings/aligned/ay_128d.projection.npy +3 -0
  6. models/embeddings/aligned/ay_128d_metadata.json +8 -0
  7. models/embeddings/aligned/ay_32d.bin +3 -0
  8. models/embeddings/aligned/ay_32d.meta.json +1 -0
  9. models/embeddings/aligned/ay_32d.projection.npy +3 -0
  10. models/embeddings/aligned/ay_32d_metadata.json +8 -0
  11. models/embeddings/aligned/ay_64d.bin +3 -0
  12. models/embeddings/aligned/ay_64d.meta.json +1 -0
  13. models/embeddings/aligned/ay_64d.projection.npy +3 -0
  14. models/embeddings/aligned/ay_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/ay_128d.bin +2 -2
  16. models/embeddings/monolingual/ay_128d_metadata.json +5 -3
  17. models/embeddings/monolingual/ay_32d.bin +2 -2
  18. models/embeddings/monolingual/ay_32d_metadata.json +5 -3
  19. models/embeddings/monolingual/ay_64d.bin +2 -2
  20. models/embeddings/monolingual/ay_64d_metadata.json +5 -3
  21. models/subword_markov/ay_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/ay_markov_ctx1_subword_metadata.json +1 -1
  23. models/subword_markov/ay_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/ay_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/ay_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/ay_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/ay_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/ay_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/ay_2gram_subword.parquet +2 -2
  30. models/subword_ngram/ay_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/ay_3gram_subword.parquet +2 -2
  32. models/subword_ngram/ay_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/ay_4gram_subword.parquet +2 -2
  34. models/subword_ngram/ay_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/ay_5gram_subword.parquet +3 -0
  36. models/subword_ngram/ay_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/ay_tokenizer_16k.model +2 -2
  38. models/tokenizer/ay_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/ay_tokenizer_32k.model +2 -2
  40. models/tokenizer/ay_tokenizer_32k.vocab +0 -0
  41. models/tokenizer/ay_tokenizer_64k.model +2 -2
  42. models/tokenizer/ay_tokenizer_64k.vocab +0 -0
  43. models/tokenizer/ay_tokenizer_8k.model +2 -2
  44. models/tokenizer/ay_tokenizer_8k.vocab +0 -0
  45. models/vocabulary/ay_vocabulary.parquet +2 -2
  46. models/vocabulary/ay_vocabulary_metadata.json +10 -9
  47. models/word_markov/ay_markov_ctx1_word.parquet +2 -2
  48. models/word_markov/ay_markov_ctx1_word_metadata.json +2 -2
  49. models/word_markov/ay_markov_ctx2_word.parquet +2 -2
  50. models/word_markov/ay_markov_ctx2_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 
 
39
  visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
40
  visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
41
  visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
42
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  language: ay
3
- language_name: AY
4
  language_family: american_aymara
5
  tags:
6
  - wikilangs
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
13
  - monolingual
14
  - family-american_aymara
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 4.018
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.7731
30
  - name: vocabulary_size
31
  type: vocab
32
- value: 25329
33
- generated: 2025-12-27
34
  ---
35
 
36
- # AY - Wikilangs Models
37
  ## Comprehensive Research Report & Full Ablation Study
38
 
39
- This repository contains NLP models trained and evaluated by Wikilangs, specifically on **AY** Wikipedia data.
40
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
41
 
42
  ## 📋 Repository Contents
@@ -44,12 +54,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
44
  ### Models & Assets
45
 
46
  - Tokenizers (8k, 16k, 32k, 64k)
47
- - N-gram models (2, 3, 4-gram)
48
- - Markov chains (context of 1, 2, 3 and 4)
49
  - Subword N-gram and Markov chains
50
- - Embeddings in various sizes and dimensions
51
  - Language Vocabulary
52
  - Language Statistics
 
53
  ![Performance Dashboard](visualizations/performance_dashboard.png)
54
 
55
  ### Analysis and Evaluation
@@ -59,7 +70,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
59
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
60
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
61
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
62
- - [6. Summary & Recommendations](#6-summary--recommendations)
 
63
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
64
  - [Visualizations Index](#visualizations-index)
65
 
@@ -68,62 +80,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
68
 
69
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
70
71
  ### Results
72
 
73
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
74
  |------------|-------------|---------------|----------|--------------|
75
- | **8k** | 3.240x | 3.13 | 0.2414% | 169,821 |
76
- | **16k** | 3.519x | 3.40 | 0.2623% | 156,332 |
77
- | **32k** | 3.774x | 3.65 | 0.2812% | 145,778 |
78
- | **64k** | 4.018x 🏆 | 3.88 | 0.2994% | 136,918 |
79
 
80
  ### Tokenization Examples
81
 
82
  Below are sample sentences tokenized with each vocabulary size:
83
 
84
- **Sample 1:** `Phuyu (), jamach'inakana janchipa imxatiriquña yänaka. Lulina phuyupaxa k'acha q...`
85
 
86
  | Vocab | Tokens | Count |
87
  |-------|--------|-------|
88
- | 8k | `▁phuyu ▁(), ▁jamach ' i nakanajanchipa ▁im x atiri ... (+22 more)` | 32 |
89
- | 16k | `▁phuyu ▁(), ▁jamach ' i nakana janchipaim x atiri ... (+22 more)` | 32 |
90
- | 32k | `▁phuyu ▁(), ▁jamach ' inakana janchipaim x atiri quña ... (+18 more)` | 28 |
91
- | 64k | `▁phuyu ▁(), ▁jamach ' inakana janchipaim x atiri quña ... (+16 more)` | 26 |
92
-
93
- **Sample 2:** `1920 - mara.
94
-
95
- Yuriña
96
- Toshiro Mifune.
97
-
98
- Jiwaña
99
 
100
- Uruyaña
101
- Categoría:Maranaka`
102
 
103
  | Vocab | Tokens | Count |
104
  |-------|--------|-------|
105
- | 8k | `▁ 1 9 2 0 ▁- ▁mara . ▁yuriña ▁tos ... (+11 more)` | 21 |
106
- | 16k | `▁ 1 9 2 0 ▁- ▁mara . ▁yuriña ▁tos ... (+8 more)` | 18 |
107
- | 32k | `▁ 1 9 2 0 ▁- ▁mara . ▁yuriña ▁toshiro ... (+7 more)` | 17 |
108
- | 64k | `▁ 1 9 2 0 ▁- ▁mara . ▁yuriña ▁toshiro ... (+7 more)` | 17 |
109
 
110
- **Sample 3:** `Oaxaca (), marka istadu Mïxiku.
111
- Oaxaca nayriri marka: Oaxaca de Juárez.
112
-
113
- Catego...`
114
 
115
  | Vocab | Tokens | Count |
116
  |-------|--------|-------|
117
- | 8k | `▁o axa ca ▁(), ▁marka ▁istadu ▁mïxiku .o axa ... (+13 more)` | 23 |
118
- | 16k | `▁oaxaca ▁(), ▁marka ▁istadu ▁mïxiku .oaxaca ▁nayriri ▁marka : ... (+7 more)` | 17 |
119
- | 32k | `▁oaxaca ▁(), ▁marka ▁istadumïxiku . ▁oaxaca ▁nayriri ▁marka : ... (+7 more)` | 17 |
120
- | 64k | `▁oaxaca ▁(), ▁marka ▁istadumïxiku . ▁oaxacanayririmarka : ... (+7 more)` | 17 |
121
 
122
 
123
  ### Key Findings
124
 
125
- - **Best Compression:** 64k achieves 4.018x compression
126
- - **Lowest UNK Rate:** 8k with 0.2414% unknown tokens
127
  - **Trade-off:** Larger vocabularies improve compression but increase model size
128
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
129
 
@@ -132,57 +139,111 @@ Catego...`
132
 
133
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
134
 
 
 
135
  ![N-gram Coverage](visualizations/ngram_coverage.png)
136
 
137
  ### Results
138
 
139
- | N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
140
- |--------|------------|---------|----------------|------------------|-------------------|
141
- | **2-gram** | 1,629 🏆 | 10.67 | 14,390 | 42.1% | 70.8% |
142
- | **2-gram** | 335 🏆 | 8.39 | 3,058 | 62.7% | 98.6% |
143
- | **3-gram** | 2,436 | 11.25 | 23,975 | 39.1% | 65.6% |
144
- | **3-gram** | 2,346 | 11.20 | 22,295 | 28.7% | 70.7% |
145
- | **4-gram** | 4,646 | 12.18 | 47,728 | 34.2% | 57.4% |
146
- | **4-gram** | 9,050 | 13.14 | 97,814 | 19.6% | 48.1% |
 
 
147
 
148
  ### Top 5 N-grams by Size
149
 
150
- **2-grams:**
151
 
152
  | Rank | N-gram | Count |
153
  |------|--------|-------|
154
- | 1 | `' a` | 22,172 |
155
- | 2 | `jisk '` | 16,424 |
156
- | 3 | `categoría :` | 12,913 |
157
- | 4 | `t '` | 11,645 |
158
- | 5 | `' aqa` | 10,721 |
159
 
160
- **3-grams:**
161
 
162
  | Rank | N-gram | Count |
163
  |------|--------|-------|
164
- | 1 | `jisk ' a` | 16,399 |
165
- | 2 | `t ' aqa` | 10,718 |
166
- | 3 | `a t '` | 10,632 |
167
- | 4 | `' a t` | 10,632 |
168
- | 5 | `' aqa suyu` | 8,509 |
169
 
170
- **4-grams:**
171
 
172
  | Rank | N-gram | Count |
173
  |------|--------|-------|
174
- | 1 | `' a t '` | 10,632 |
175
- | 2 | `a t ' aqa` | 10,628 |
176
- | 3 | `jisk ' a t` | 10,616 |
177
- | 4 | `t ' aqa suyu` | 8,508 |
178
- | 5 | `: jisk ' a` | 4,073 |
179
 
180
 
181
  ### Key Findings
182
 
183
- - **Best Perplexity:** 2-gram with 335
184
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
185
- - **Coverage:** Top-1000 patterns cover ~48% of corpus
186
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
187
 
188
  ---
@@ -190,55 +251,86 @@ Catego...`
190
 
191
  ![Markov Entropy](visualizations/markov_entropy.png)
192
 
 
 
193
  ![Markov Branching](visualizations/markov_branching.png)
194
 
195
  ### Results
196
 
197
- | Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
198
- |---------|-------------|------------|------------------|-----------------|----------------|
199
- | **1** | 0.6043 | 1.520 | 3.69 | 65,172 | 39.6% |
200
- | **1** | 0.9793 | 1.971 | 7.56 | 953 | 2.1% |
201
- | **2** | 0.2085 | 1.155 | 1.54 | 239,858 | 79.2% |
202
- | **2** | 1.0055 | 2.008 | 6.09 | 7,201 | 0.0% |
203
- | **3** | 0.0898 | 1.064 | 1.20 | 369,469 | 91.0% |
204
- | **3** | 0.8336 | 1.782 | 3.92 | 43,833 | 16.6% |
205
- | **4** | 0.0469 🏆 | 1.033 | 1.11 | 443,066 | 95.3% |
206
- | **4** | 0.6060 🏆 | 1.522 | 2.52 | 171,955 | 39.4% |
207
 
208
- ### Generated Text Samples
209
 
210
- Below are text samples generated from each Markov chain model:
 
 
211
 
212
  **Context Size 1:**
213
 
214
- 1. `' a t ' aqa suyu kantarawi kantarawi tarata , punu jach ' a t '`
215
- 2. `. wali alwat tukuñkamaw yuntampi wayt asitapjaniw . caritatis studium , bibliaxa mä jisk ' aq`
216
- 3. `: freistaat thüringen suyu ( mendoza , ukax kunas manq ' a suyu asu 1889 mara`
217
 
218
  **Context Size 2:**
219
 
220
- 1. `' a t ' aqa suyu ( piruw ) categoría : warmi categoría : warmi : consuelo`
221
- 2. `jisk ' a t ' aqa suyu asu jaqinaka kurakanaka 2011 - 2014 : bernabé martí .`
222
- 3. `categoría : warmi categoría : jiwäwi 1890 categoría : jisk ' a suyu nayriri marka ) ,`
223
 
224
  **Context Size 3:**
225
 
226
- 1. `jisk ' a t ' aqa suyu asu jaqinaka kurakanaka 2011 - 2014 : wilfredo felimón vargas cotrado`
227
- 2. `t ' aqa suyuxa ( kastilla aru : distrito de carabayllo ) nisqaqa huk jisk ' a t`
228
- 3. `a t ' aqa suyu ( piruw ) categoría : jisk ' a t ' aqa suyu ;`
229
 
230
  **Context Size 4:**
231
 
232
- 1. `' a t ' aqa suyuwa , amarujawira jach ' a suyu . nayra sarnaqawi kamasqa chichu phaxsi 29`
233
- 2. `a t ' aqa suyuwa , 22px | la libertad la libertad jach ' a suyu , wankayu jisk`
234
- 3. `jisk ' a t ' aqa suyu ( piruw ) categoría : jisk ' a suyu munisipyu web (`
235
 
236
 
237
  ### Key Findings
238
 
239
- - **Best Predictability:** Context-4 with 95.3% predictability
240
  - **Branching Factor:** Decreases with context size (more deterministic)
241
- - **Memory Trade-off:** Larger contexts require more storage (171,955 contexts)
242
  - **Recommendation:** Context-3 or Context-4 for text generation
243
 
244
  ---
@@ -254,37 +346,37 @@ Below are text samples generated from each Markov chain model:
254
 
255
  | Metric | Value |
256
  |--------|-------|
257
- | Vocabulary Size | 25,329 |
258
- | Total Tokens | 596,604 |
259
- | Mean Frequency | 23.55 |
260
- | Median Frequency | 4 |
261
- | Frequency Std Dev | 287.74 |
262
 
263
  ### Most Common Words
264
 
265
  | Rank | Word | Frequency |
266
  |------|------|-----------|
267
- | 1 | a | 23,874 |
268
- | 2 | jisk | 16,567 |
269
- | 3 | suyu | 14,640 |
270
- | 4 | categoría | 12,921 |
271
- | 5 | t | 11,916 |
272
- | 6 | de | 11,655 |
273
- | 7 | aqa | 10,725 |
274
- | 8 | jach | 7,168 |
275
- | 9 | piruw | 5,369 |
276
- | 10 | jaqinaka | 5,120 |
277
 
278
  ### Least Common Words (from vocabulary)
279
 
280
  | Rank | Word | Frequency |
281
  |------|------|-----------|
282
- | 1 | wirnisa | 2 |
283
  | 2 | sawaru | 2 |
284
  | 3 | tuminku | 2 |
285
  | 4 | urupawa | 2 |
286
  | 5 | capitalapawa | 2 |
287
- | 6 | halloween | 2 |
288
  | 7 | uttar | 2 |
289
  | 8 | pradesh | 2 |
290
  | 9 | quqanakampi | 2 |
@@ -294,24 +386,24 @@ Below are text samples generated from each Markov chain model:
294
 
295
  | Metric | Value |
296
  |--------|-------|
297
- | Zipf Coefficient | 1.1050 |
298
- | R² (Goodness of Fit) | 0.997739 |
299
  | Adherence Quality | **excellent** |
300
 
301
  ### Coverage Analysis
302
 
303
  | Top N Words | Coverage |
304
  |-------------|----------|
305
- | Top 100 | 46.7% |
306
- | Top 1,000 | 73.4% |
307
- | Top 5,000 | 87.8% |
308
- | Top 10,000 | 93.3% |
309
 
310
  ### Key Findings
311
 
312
- - **Zipf Compliance:** R²=0.9977 indicates excellent adherence to Zipf's law
313
- - **High Frequency Dominance:** Top 100 words cover 46.7% of corpus
314
- - **Long Tail:** 15,329 words needed for remaining 6.7% coverage
315
 
316
  ---
317
  ## 5. Word Embeddings Evaluation
@@ -324,24 +416,127 @@ Below are text samples generated from each Markov chain model:
324
 
325
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
326
 
327
- ### Model Comparison
328
 
329
- | Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
330
- |-------|------------|-----------|----------|----------|----------|
331
- | **mono_32d** | 10,362 | 32 | 4.255 | 0.785 | 0.7731 🏆 |
332
- | **mono_64d** | 10,362 | 64 | 4.432 | 0.729 | 0.4752 |
333
- | **mono_128d** | 10,362 | 128 | 4.521 | 0.731 | 0.1287 |
334
- | **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 
 
 
 
 
 
 
 
 
 
 
335
 
336
  ### Key Findings
337
 
338
- - **Best Isotropy:** mono_32d with 0.7731 (more uniform distribution)
339
- - **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
340
- - **Vocabulary Coverage:** All models cover 10,362 words
341
- - **Recommendation:** 100d for balanced semantic capture and efficiency
342
 
343
  ---
344
- ## 6. Summary & Recommendations
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
345
 
346
  ![Performance Dashboard](visualizations/performance_dashboard.png)
347
 
@@ -349,11 +544,12 @@ Below are text samples generated from each Markov chain model:
349
 
350
  | Component | Recommended | Rationale |
351
  |-----------|-------------|-----------|
352
- | Tokenizer | **32k BPE** | Best compression (4.02x) with low UNK rate |
353
- | N-gram | **5-gram** | Lowest perplexity (335) |
354
- | Markov | **Context-4** | Highest predictability (95.3%) |
355
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
356
 
 
357
  ---
358
  ## Appendix: Metrics Glossary & Interpretation Guide
359
 
@@ -543,7 +739,8 @@ If you use these models in your research, please cite:
543
  author = {Kamali, Omar},
544
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
545
  year = {2025},
546
- publisher = {HuggingFace},
 
547
  url = {https://huggingface.co/wikilangs}
548
  institution = {Omneity Labs}
549
  }
@@ -559,7 +756,8 @@ MIT License - Free for academic and commercial use.
559
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
560
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
561
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 
562
  ---
563
  *Generated by Wikilangs Models Pipeline*
564
 
565
- *Report Date: 2025-12-27 20:48:58*
 
1
  ---
2
  language: ay
3
+ language_name: Aymara
4
  language_family: american_aymara
5
  tags:
6
  - wikilangs
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-american_aymara
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 4.252
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.7572
40
  - name: vocabulary_size
41
  type: vocab
42
+ value: 0
43
+ generated: 2026-01-03
44
  ---
45
 
46
+ # Aymara - Wikilangs Models
47
  ## Comprehensive Research Report & Full Ablation Study
48
 
49
+ This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Aymara** Wikipedia data.
50
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
51
 
52
  ## 📋 Repository Contents
 
54
  ### Models & Assets
55
 
56
  - Tokenizers (8k, 16k, 32k, 64k)
57
+ - N-gram models (2, 3, 4, 5-gram)
58
+ - Markov chains (context of 1, 2, 3, 4 and 5)
59
  - Subword N-gram and Markov chains
60
+ - Embeddings in various sizes and dimensions (aligned and unaligned)
61
  - Language Vocabulary
62
  - Language Statistics
63
+
64
  ![Performance Dashboard](visualizations/performance_dashboard.png)
65
 
66
  ### Analysis and Evaluation
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
74
+ - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
77
 
 
80
 
81
  ![Tokenizer Compression](visualizations/tokenizer_compression.png)
82
 
83
+ ![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
84
+
85
+ ![Tokenizer OOV](visualizations/tokenizer_oov.png)
86
+
87
+ ![Total Tokens](visualizations/tokenizer_total_tokens.png)
88
+
89
  ### Results
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 3.398x | 3.40 | 0.2746% | 168,272 |
94
+ | **16k** | 3.708x | 3.72 | 0.2996% | 154,209 |
95
+ | **32k** | 3.989x | 4.00 | 0.3223% | 143,366 |
96
+ | **64k** | 4.252x 🏆 | 4.26 | 0.3435% | 134,499 |
97
 
98
  ### Tokenization Examples
99
 
100
  Below are sample sentences tokenized with each vocabulary size:
101
 
102
+ **Sample 1:** `Dublin (), nayriri marka Irlandiya Jisk'a t'aqa suyunaka Irpirinaka Wali uñt'at ...`
103
 
104
  | Vocab | Tokens | Count |
105
  |-------|--------|-------|
106
+ | 8k | `▁du blin ▁(), ▁nayriri ▁marka ▁ir landiyajisk ' a ... (+14 more)` | 24 |
107
+ | 16k | `▁dublin ▁(), ▁nayriri ▁marka ▁irlandiyajisk ' a t ' ... (+11 more)` | 21 |
108
+ | 32k | `▁dublin ▁(), ▁nayriri ▁markairlandiyajisk ' a ▁t ' ... (+11 more)` | 21 |
109
+ | 64k | `▁dublin ▁(), ▁nayriri ▁markairlandiyajisk ' a ▁t ' ... (+11 more)` | 21 |
 
 
 
 
 
 
 
110
 
111
+ **Sample 2:** `- mara. Yuriña Jiwaña Uruyaña Payïr Jachʼa Chʼaxwäwi tukuyxäna.`
 
112
 
113
  | Vocab | Tokens | Count |
114
  |-------|--------|-------|
115
+ | 8k | `▁- ▁mara . ▁yuriña ▁jiwaña ▁uruyaña ▁payïr ▁jach ʼ a ... (+8 more)` | 18 |
116
+ | 16k | `▁- ▁mara . ▁yuriña ▁jiwaña ▁uruyaña ▁payïr ▁jach ʼ a ... (+6 more)` | 16 |
117
+ | 32k | `▁- ▁mara . ▁yuriña ▁jiwaña ▁uruyaña ▁payïr ▁jach ʼ a ... (+6 more)` | 16 |
118
+ | 64k | `▁- ▁mara . ▁yuriña ▁jiwaña ▁uruyaña ▁payïr ▁jach ʼ a ... (+6 more)` | 16 |
119
 
120
+ **Sample 3:** `Chika uru (), qharatatata ch’amakthapkama uruna taypipa.`
 
 
 
121
 
122
  | Vocab | Tokens | Count |
123
  |-------|--------|-------|
124
+ | 8k | `▁chika ▁uru ▁(), ▁qh ara tata tach ama ... (+9 more)` | 19 |
125
+ | 16k | `▁chikauru ▁(), ▁qh ara tata tach ama ... (+8 more)` | 18 |
126
+ | 32k | `▁chikauru ▁(), ▁qhara tatatach amak thap kama ... (+5 more)` | 15 |
127
+ | 64k | `▁chikauru ▁(), ▁qhara tatatach amakthapkamaurunataypipa ... (+1 more)` | 11 |
128
 
129
 
130
  ### Key Findings
131
 
132
+ - **Best Compression:** 64k achieves 4.252x compression
133
+ - **Lowest UNK Rate:** 8k with 0.2746% unknown tokens
134
  - **Trade-off:** Larger vocabularies improve compression but increase model size
135
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
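
To make the compression figures above concrete, here is a minimal sketch that loads the 32k tokenizer shipped in this repository with the `sentencepiece` library and measures characters per token on one of the sample sentences. Reading "compression" as characters per token is an assumption about the metric, not something this report specifies.

```python
# Minimal sketch: load the 32k SentencePiece model from this repo and compute
# a rough characters-per-token ratio. Whether this matches the pipeline's
# exact compression metric is an assumption.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/ay_tokenizer_32k.model")

text = "Dublin (), nayriri marka Irlandiya"
pieces = sp.encode(text, out_type=str)

print(pieces)                      # e.g. ['▁dublin', '▁(),', '▁nayriri', ...]
print(len(text) / len(pieces))     # crude compression: characters per token
```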
136
 
 
139
 
140
  ![N-gram Perplexity](visualizations/ngram_perplexity.png)
141
 
142
+ ![N-gram Unique](visualizations/ngram_unique.png)
143
+
144
  ![N-gram Coverage](visualizations/ngram_coverage.png)
145
 
146
  ### Results
147
 
148
+ | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
149
+ |--------|---------|------------|---------|----------------|------------------|-------------------|
150
+ | **2-gram** | Word | 1,093 | 10.09 | 8,159 | 47.5% | 75.3% |
151
+ | **2-gram** | Subword | 282 🏆 | 8.14 | 2,432 | 66.7% | 99.2% |
152
+ | **3-gram** | Word | 1,711 | 10.74 | 12,666 | 42.2% | 69.1% |
153
+ | **3-gram** | Subword | 2,030 | 10.99 | 18,023 | 29.5% | 73.5% |
154
+ | **4-gram** | Word | 4,113 | 12.01 | 28,447 | 33.5% | 56.4% |
155
+ | **4-gram** | Subword | 8,227 | 13.01 | 79,517 | 19.1% | 48.7% |
156
+ | **5-gram** | Word | 4,963 | 12.28 | 27,121 | 30.7% | 52.7% |
157
+ | **5-gram** | Subword | 18,419 | 14.17 | 172,494 | 15.5% | 41.0% |
158
 
159
  ### Top 5 N-grams by Size
160
 
161
+ **2-grams (Word):**
162
 
163
  | Rank | N-gram | Count |
164
  |------|--------|-------|
165
+ | 1 | `jisk a` | 12,410 |
166
+ | 2 | `t aqa` | 10,719 |
167
+ | 3 | `aqa suyu` | 8,507 |
168
+ | 4 | `a t` | 6,972 |
169
+ | 5 | `a suyu` | 5,247 |
170
 
171
+ **3-grams (Word):**
172
 
173
  | Rank | N-gram | Count |
174
  |------|--------|-------|
175
+ | 1 | `t aqa suyu` | 8,506 |
176
+ | 2 | `a t aqa` | 6,963 |
177
+ | 3 | `jisk a t` | 6,951 |
178
+ | 4 | `jisk a suyu` | 3,603 |
179
+ | 5 | `piruw t aqa` | 2,712 |
180
 
181
+ **4-grams (Word):**
182
 
183
  | Rank | N-gram | Count |
184
  |------|--------|-------|
185
+ | 1 | `jisk a t aqa` | 6,950 |
186
+ | 2 | `a t aqa suyu` | 4,765 |
187
+ | 3 | `piruw t aqa suyu` | 2,712 |
188
+ | 4 | `t aqa suyu asu` | 1,947 |
189
+ | 5 | `aqa suyu asu jaqinaka` | 1,947 |
190
+
191
+ **5-grams (Word):**
192
+
193
+ | Rank | N-gram | Count |
194
+ |------|--------|-------|
195
+ | 1 | `jisk a t aqa suyu` | 4,757 |
196
+ | 2 | `t aqa suyu asu jaqinaka` | 1,947 |
197
+ | 3 | `a t aqa suyu asu` | 1,947 |
198
+ | 4 | `suyu piruw t aqa suyu` | 1,830 |
199
+ | 5 | `t aqa suyu piruw t` | 1,830 |
200
+
201
+ **2-grams (Subword):**
202
+
203
+ | Rank | N-gram | Count |
204
+ |------|--------|-------|
205
+ | 1 | `a _` | 131,245 |
206
+ | 2 | `k a` | 69,413 |
207
+ | 3 | `n a` | 64,712 |
208
+ | 4 | `a n` | 60,547 |
209
+ | 5 | `a r` | 59,718 |
210
+
211
+ **3-grams (Subword):**
212
+
213
+ | Rank | N-gram | Count |
214
+ |------|--------|-------|
215
+ | 1 | `a k a` | 37,061 |
216
+ | 2 | `n a k` | 33,828 |
217
+ | 3 | `a _ s` | 26,955 |
218
+ | 4 | `_ m a` | 24,357 |
219
+ | 5 | `_ j a` | 23,674 |
220
+
221
+ **4-grams (Subword):**
222
+
223
+ | Rank | N-gram | Count |
224
+ |------|--------|-------|
225
+ | 1 | `n a k a` | 32,697 |
226
+ | 2 | `s u y u` | 19,816 |
227
+ | 3 | `_ s u y` | 19,711 |
228
+ | 4 | `a _ s u` | 19,361 |
229
+ | 5 | `_ m a r` | 19,102 |
230
+
231
+ **5-grams (Subword):**
232
+
233
+ | Rank | N-gram | Count |
234
+ |------|--------|-------|
235
+ | 1 | `_ s u y u` | 19,654 |
236
+ | 2 | `a _ s u y` | 18,833 |
237
+ | 3 | `n a k a _` | 16,761 |
238
+ | 4 | `a n a k a` | 16,081 |
239
+ | 5 | `_ j i s k` | 12,416 |
240
 
241
 
242
  ### Key Findings
243
 
244
+ - **Best Perplexity:** 2-gram (subword) with 282
245
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
246
+ - **Coverage:** Top-1000 patterns cover ~41% of corpus
247
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
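
The Perplexity and Entropy columns are consistent with perplexity = 2^entropy (for example, 2^8.14 ≈ 282 for the subword 2-gram). The sketch below recomputes both from one of the n-gram parquet files; the column names `ngram` and `count` are assumptions about the parquet schema, which this report does not document.

```python
# Sketch: recompute Shannon entropy and perplexity (2 ** entropy) from an
# n-gram frequency table. Column names are assumed, not documented here.
import numpy as np
import pandas as pd

df = pd.read_parquet("models/subword_ngram/ay_2gram_subword.parquet")

counts = df["count"].to_numpy(dtype=float)   # assumed column name
p = counts / counts.sum()                    # relative frequency of each n-gram

entropy = -(p * np.log2(p)).sum()            # bits
perplexity = 2.0 ** entropy

print(f"entropy={entropy:.2f} bits  perplexity={perplexity:.0f}")
```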
248
 
249
  ---
 
251
 
252
  ![Markov Entropy](visualizations/markov_entropy.png)
253
 
254
+ ![Markov Contexts](visualizations/markov_contexts.png)
255
+
256
  ![Markov Branching](visualizations/markov_branching.png)
257
 
258
  ### Results
259
 
260
+ | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
261
+ |---------|---------|-------------|------------|------------------|-----------------|----------------|
262
+ | **1** | Word | 0.6845 | 1.607 | 3.61 | 60,169 | 31.6% |
263
+ | **1** | Subword | 0.8600 | 1.815 | 6.42 | 953 | 14.0% |
264
+ | **2** | Word | 0.1508 | 1.110 | 1.33 | 216,093 | 84.9% |
265
+ | **2** | Subword | 0.9055 | 1.873 | 5.55 | 6,117 | 9.5% |
266
+ | **3** | Word | 0.0575 | 1.041 | 1.13 | 286,627 | 94.3% |
267
+ | **3** | Subword | 0.8121 | 1.756 | 3.93 | 33,906 | 18.8% |
268
+ | **4** | Word | 0.0351 🏆 | 1.025 | 1.08 | 322,229 | 96.5% |
269
+ | **4** | Subword | 0.6399 | 1.558 | 2.64 | 133,072 | 36.0% |
270
+
271
+ ### Generated Text Samples (Word-based)
272
+
273
+ Below are text samples generated from each word-based Markov chain model:
274
+
275
+ **Context Size 1:**
276
+
277
+ 1. `a crespo madrid mara fernando belaúnde umalliq uraqipa san huwan bosco giuseppe verdi nabucco italiy...`
278
+ 2. `suyu asu jaqinaka kurakanaka mario hinostroza ppc carlos milla batres lima jisk a suyupi piruw porta...`
279
+ 3. `jisk a t aqa suyu wankawillka mons karu puriy sulli phutti charqui kanka champhayna plato paceño`
280
+
281
+ **Context Size 2:**
282
+
283
+ 1. `jisk a suyuxa wuliwya nayriri marka sport fa šiauliai fc gintra fc šiauliai lituaña marka sport fk`
284
+ 2. `t aqa suyu piruw t aqa suyu kastilla arupi distrito de chambara na mä jisk a suyu`
285
+ 3. `aqa suyu bongara jisk a suyu nayra sarnaqawi santa rusa yachay tarpuy yachaychiy asu utanaka huch uy`
286
+
287
+ **Context Size 3:**
288
+
289
+ 1. `t aqa suyu kurunku jisk a suyu suyu piruw suyu piwra jach a suyu jisk a suyunaka aruskipäwi`
290
+ 2. `a t aqa suyu asu jaqinaka kurakanaka amílcar gerardo ramos collachagua bloque popular junín jne auto...`
291
+ 3. `jisk a t aqa suyuxa kastilla aru distrito de bambamarca na mä jisk a t aqa suyu nayriri`
292
+
293
+ **Context Size 4:**
294
+
295
+ 1. `jisk a t aqa suyu kastilla arupi distrito de pucyura nisqaqa huk jisk a t aqa suyu pallasqa jisk`
296
+ 2. `a t aqa suyu nayriri marka shanao 270 msnm qullunaka jawiranaka qutanaka qullqinchäwi jaqinaka 9 104...`
297
+ 3. `piruw t aqa suyu ariqipa jisk a suyupi ariqipa jach a suyupi piruw jach a markapi nayra sarnaqawi qu...`
298
 
 
299
 
300
+ ### Generated Text Samples (Subword-based)
301
+
302
+ Below are text samples generated from each subword-based Markov chain model:
303
 
304
  **Context Size 1:**
305
 
306
+ 1. `arererulu_jax_yu`
307
+ 2. `_ma_uycho_smtera`
308
+ 3. `i_lorma_-_si_lel`
309
 
310
  **Context Size 2:**
311
 
312
+ 1. `a_mujisqa_34_300_`
313
+ 2. `ka_jisk'aqäwiru)_`
314
+ 3. `nayrin_jisychérro`
315
 
316
  **Context Size 3:**
317
 
318
+ 1. `aka_nayriri_irpiru`
319
+ 2. `nakapi._maraka_-_l`
320
+ 3. `a_sasa_uywa_baldi_`
321
 
322
  **Context Size 4:**
323
 
324
+ 1. `naka:_musampïmwa._j`
325
+ 2. `suyu;_(kasti_wat'ay`
326
+ 3. `_suyuwa,_209,12_km2`
327
 
328
 
329
  ### Key Findings
330
 
331
+ - **Best Predictability:** Context-4 (word) with 96.5% predictability
332
  - **Branching Factor:** Decreases with context size (more deterministic)
333
+ - **Memory Trade-off:** Larger contexts require more storage (133,072 contexts)
334
  - **Recommendation:** Context-3 or Context-4 for text generation
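
A sketch of how such generation works with the published context-2 word table is shown below. The parquet schema (assumed here to be `context`, `next_token`, and `count` columns, with space-joined contexts) is an assumption; the actual column layout may differ.

```python
# Sketch: weighted sampling from the context-2 word Markov table.
# The column names and the space-joined context format are assumptions.
import random
import pandas as pd

df = pd.read_parquet("models/word_markov/ay_markov_ctx2_word.parquet")

def sample_next(context):
    rows = df[df["context"] == context]
    if rows.empty:
        return None
    return random.choices(rows["next_token"].tolist(),
                          weights=rows["count"].tolist(), k=1)[0]

tokens = ["jisk", "a"]                     # seed with a 2-token context
for _ in range(15):
    nxt = sample_next(" ".join(tokens[-2:]))
    if nxt is None:
        break
    tokens.append(nxt)

print(" ".join(tokens))
```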
335
 
336
  ---
 
346
 
347
  | Metric | Value |
348
  |--------|-------|
349
+ | Vocabulary Size | 24,208 |
350
+ | Total Tokens | 520,495 |
351
+ | Mean Frequency | 21.50 |
352
+ | Median Frequency | 3 |
353
+ | Frequency Std Dev | 253.66 |
354
 
355
  ### Most Common Words
356
 
357
  | Rank | Word | Frequency |
358
  |------|------|-----------|
359
+ | 1 | a | 19,357 |
360
+ | 2 | suyu | 14,560 |
361
+ | 3 | jisk | 12,473 |
362
+ | 4 | t | 11,844 |
363
+ | 5 | de | 11,521 |
364
+ | 6 | aqa | 10,723 |
365
+ | 7 | jach | 6,951 |
366
+ | 8 | jaqinaka | 5,107 |
367
+ | 9 | piruw | 5,076 |
368
+ | 10 | la | 4,233 |
369
 
370
  ### Least Common Words (from vocabulary)
371
 
372
  | Rank | Word | Frequency |
373
  |------|------|-----------|
374
+ | 1 | lunisa | 2 |
375
  | 2 | sawaru | 2 |
376
  | 3 | tuminku | 2 |
377
  | 4 | urupawa | 2 |
378
  | 5 | capitalapawa | 2 |
379
+ | 6 | kurunawirus | 2 |
380
  | 7 | uttar | 2 |
381
  | 8 | pradesh | 2 |
382
  | 9 | quqanakampi | 2 |
 
386
 
387
  | Metric | Value |
388
  |--------|-------|
389
+ | Zipf Coefficient | 1.0705 |
390
+ | R² (Goodness of Fit) | 0.996948 |
391
  | Adherence Quality | **excellent** |
392
 
393
  ### Coverage Analysis
394
 
395
  | Top N Words | Coverage |
396
  |-------------|----------|
397
+ | Top 100 | 47.7% |
398
+ | Top 1,000 | 73.0% |
399
+ | Top 5,000 | 87.2% |
400
+ | Top 10,000 | 93.0% |
401
 
402
  ### Key Findings
403
 
404
+ - **Zipf Compliance:** R²=0.9969 indicates excellent adherence to Zipf's law
405
+ - **High Frequency Dominance:** Top 100 words cover 47.7% of corpus
406
+ - **Long Tail:** 14,208 words needed for remaining 7.0% coverage
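
The Zipf coefficient and R² above can be approximately reproduced by a least-squares fit of log frequency against log rank over the vocabulary table, as sketched below. The `frequency` column name is an assumption about the parquet schema.

```python
# Sketch: estimate the Zipf coefficient and R² by fitting log(freq) vs log(rank).
# The "frequency" column name is an assumption about the vocabulary parquet.
import numpy as np
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/ay_vocabulary.parquet")
freq = np.sort(vocab["frequency"].to_numpy(dtype=float))[::-1]   # descending
rank = np.arange(1, len(freq) + 1)

log_r, log_f = np.log(rank), np.log(freq)
slope, intercept = np.polyfit(log_r, log_f, 1)

residuals = log_f - (slope * log_r + intercept)
r2 = 1 - (residuals ** 2).sum() / ((log_f - log_f.mean()) ** 2).sum()

print(f"Zipf coefficient ≈ {-slope:.4f}, R² ≈ {r2:.6f}")
```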
407
 
408
  ---
409
  ## 5. Word Embeddings Evaluation
 
416
 
417
  ![t-SNE Sentences](visualizations/tsne_sentences.png)
418
 
 
419
 
420
+ ### 5.1 Cross-Lingual Alignment
421
+
422
+ ![Alignment Quality](visualizations/embedding_alignment_quality.png)
423
+
424
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
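
The aligned variants added by this commit ship a projection matrix (`ay_*.projection.npy`) alongside the vectors, and their metadata lists `hub_language: en`. A sketch of applying that projection is below; loading the `.bin` with the `fasttext` package and the `vector @ projection` orientation are assumptions, since neither is documented in this report.

```python
# Sketch: map a monolingual Aymara vector into the shared hub space using the
# projection matrix added in this commit. Loading the .bin via the `fasttext`
# package and the `vector @ projection` orientation are assumptions.
import fasttext
import numpy as np

model = fasttext.load_model("models/embeddings/aligned/ay_128d.bin")
proj = np.load("models/embeddings/aligned/ay_128d.projection.npy")  # 128 x 128

v = model.get_word_vector("marka")      # language-specific vector
v_hub = v @ proj                        # vector in the shared (English-hub) space
print(v_hub.shape)
```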
425
+
426
+
427
+ ### 5.2 Model Comparison
428
+
429
+ | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
430
+ |-------|-----------|----------|------------------|---------------|----------------|
431
+ | **mono_32d** | 32 | 0.7572 🏆 | 0.3779 | N/A | N/A |
432
+ | **mono_64d** | 64 | 0.4924 | 0.3361 | N/A | N/A |
433
+ | **mono_128d** | 128 | 0.1272 | 0.3426 | N/A | N/A |
434
+ | **aligned_32d** | 32 | 0.7572 | 0.3748 | 0.0400 | 0.2060 |
435
+ | **aligned_64d** | 64 | 0.4924 | 0.3390 | 0.0480 | 0.2520 |
436
+ | **aligned_128d** | 128 | 0.1272 | 0.3283 | 0.0740 | 0.3280 |
437
 
438
  ### Key Findings
439
 
440
+ - **Best Isotropy:** mono_32d with 0.7572 (more uniform distribution)
441
+ - **Semantic Density:** Average pairwise similarity of 0.3498. Lower values indicate better semantic separation.
442
+ - **Alignment Quality:** Aligned models achieve up to 7.4% R@1 in cross-lingual retrieval.
443
+ - **Recommendation:** 128d aligned for best cross-lingual performance
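
For reference, one common way to compute isotropy and the "semantic density" reported above is sketched below: eigenvalue spread of the embedding covariance for isotropy, and mean pairwise cosine similarity for density. Whether these are the exact definitions used by the pipeline is not stated in this report, so treat the output as illustrative only.

```python
# Sketch: isotropy as the min/max eigenvalue ratio of the embedding covariance,
# and semantic density as mean pairwise cosine similarity over a vocabulary
# sample. These definitions are assumptions about the pipeline's metrics.
import fasttext
import numpy as np

model = fasttext.load_model("models/embeddings/monolingual/ay_32d.bin")
words = model.get_words()[:2000]                      # vocabulary sample
X = np.stack([model.get_word_vector(w) for w in words])

eig = np.linalg.eigvalsh(np.cov(X, rowvar=False))
print("isotropy ~", eig.min() / eig.max())            # 1.0 = perfectly isotropic

Xn = X / np.linalg.norm(X, axis=1, keepdims=True)
sims = Xn @ Xn.T
print("semantic density ~", sims[np.triu_indices_from(sims, k=1)].mean())
```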
444
 
445
  ---
446
+ ## 6. Morphological Analysis (Experimental)
447
+
448
+ This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
449
+
450
+ ### 6.1 Productivity & Complexity
451
+
452
+ | Metric | Value | Interpretation | Recommendation |
453
+ |--------|-------|----------------|----------------|
454
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
455
+ | Idiomaticity Gap | **0.285** | High formulaic/idiomatic content | - |
456
+
457
+ ### 6.2 Affix Inventory (Productive Units)
458
+
459
+ These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
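
A minimal sketch of that substitutability test: strip candidate suffixes from each vocabulary word and count how often the remaining stem is itself a vocabulary entry. The `word` column name and the length thresholds are assumptions; the real pipeline's criteria are not documented here.

```python
# Sketch of the substitutability heuristic: a suffix is productive if stripping
# it frequently leaves a stem that also occurs in the vocabulary.
# Column name "word" and the thresholds are assumptions.
from collections import Counter
import pandas as pd

vocab = set(pd.read_parquet("models/vocabulary/ay_vocabulary.parquet")["word"])

suffix_hits = Counter()
for w in vocab:
    for k in (1, 2, 3):                    # candidate suffix lengths
        if len(w) > k + 2 and w[:-k] in vocab:
            suffix_hits[w[-k:]] += 1

print(suffix_hits.most_common(10))
```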
460
+
461
+ #### Productive Prefixes
462
+ | Prefix | Examples |
463
+ |--------|----------|
464
+ | `-ma` | mayura, manon, marcona |
465
+ | `-pa` | pallasqa, palestina, pachakutiq |
466
+
467
+ #### Productive Suffixes
468
+ | Suffix | Examples |
469
+ |--------|----------|
470
+ | `-a` | horadnia, enlacenaka, pukllaykuna |
471
+ | `-as` | cotabambas, caritas, chinapas |
472
+ | `-na` | pukllaykuna, pukyukuna, amasuna |
473
+ | `-es` | desapariciones, regiones, crueles |
474
+
475
+ ### 6.3 Bound Stems (Lexical Roots)
476
+
477
+ Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
478
+
479
+ | Stem | Cohesion | Substitutability | Examples |
480
+ |------|----------|------------------|----------|
481
+ | `kana` | 2.06x | 39 contexts | ukana, kanal, akana |
482
+ | `arka` | 2.00x | 39 contexts | arkañ, marka, markaq |
483
+ | `qull` | 1.97x | 27 contexts | qulla, qullu, qullq |
484
+ | `raqi` | 2.19x | 19 contexts | uraqi, uraqiw, saraqi |
485
+ | `hach` | 1.91x | 29 contexts | hacha, qhach, chacha |
486
+ | `hana` | 1.93x | 25 contexts | chana, hanaq, ghana |
487
+ | `tana` | 1.88x | 26 contexts | utana, utanak, patana |
488
+ | `aqin` | 2.00x | 19 contexts | taqin, jaqin, jaqinx |
489
+ | `rkan` | 2.10x | 15 contexts | hirkan, markan, markani |
490
+ | `ista` | 1.57x | 31 contexts | vista, lista, wista |
491
+ | `irin` | 1.96x | 14 contexts | irina, irinak, irineo |
492
+ | `arus` | 1.90x | 15 contexts | arusa, larus, arust |
493
+
494
+ ### 6.4 Affix Compatibility (Co-occurrence)
495
+
496
+ This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
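
The same vocabulary pass can be extended to count prefix/suffix co-occurrence on a single word, which is roughly what this table summarizes. The affix sets below are simply the ones listed in 6.2, and the minimum stem length is an arbitrary assumption.

```python
# Sketch: count how often a prefix and a suffix from the inventory above appear
# on the same vocabulary word. Stem-length filtering is an assumption.
from collections import Counter
import pandas as pd

vocab = set(pd.read_parquet("models/vocabulary/ay_vocabulary.parquet")["word"])
PREFIXES = {"ma", "pa"}
SUFFIXES = {"a", "as", "na", "es"}

pairs = Counter(
    (p, s)
    for w in vocab
    for p in PREFIXES if w.startswith(p)
    for s in SUFFIXES if w.endswith(s) and len(w) > len(p) + len(s) + 2
)

print(pairs.most_common(8))
```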
497
+
498
+ | Prefix | Suffix | Frequency | Examples |
499
+ |--------|--------|-----------|----------|
500
+ | `-ma` | `-a` | 66 words | maceda, marakama |
501
+ | `-pa` | `-a` | 52 words | patunka, paulina |
502
+ | `-ma` | `-na` | 11 words | maradona, martina |
503
+ | `-pa` | `-na` | 9 words | paulina, pagina |
504
+ | `-pa` | `-es` | 8 words | patrones, pacajes |
505
+ | `-ma` | `-as` | 5 words | matorras, maravillas |
506
+ | `-ma` | `-es` | 4 words | marques, mayores |
507
+ | `-pa` | `-as` | 2 words | palabras, pachas |
508
+
509
+ ### 6.5 Recursive Morpheme Segmentation
510
+
511
+ Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
512
+
513
+ | Word | Suggested Split | Confidence | Stem |
514
+ |------|-----------------|------------|------|
515
+ | populares | **`popular-es`** | 4.5 | `popular` |
516
+ | ceremoniales | **`ceremonial-es`** | 4.5 | `ceremonial` |
517
+ | apóstoles | **`apóstol-es`** | 4.5 | `apóstol` |
518
+ | uywanakana | **`uywanaka-na`** | 4.5 | `uywanaka` |
519
+ | funerales | **`funeral-es`** | 4.5 | `funeral` |
520
+ | christies | **`christi-es`** | 4.5 | `christi` |
521
+ | regulares | **`regular-es`** | 4.5 | `regular` |
522
+ | familiares | **`familiar-es`** | 4.5 | `familiar` |
523
+ | wawanakana | **`wawanaka-na`** | 4.5 | `wawanaka` |
524
+ | australiana | **`australia-na`** | 4.5 | `australia` |
525
+ | magisteriales | **`ma-gisterial-es`** | 3.0 | `gisterial` |
526
+ | pacoricona | **`pa-corico-na`** | 3.0 | `corico` |
527
+ | maranakana | **`ma-ranaka-na`** | 3.0 | `ranaka` |
528
+ | partituras | **`pa-rtitur-as`** | 3.0 | `rtitur` |
529
+ | pallaytas | **`pa-llayt-as`** | 3.0 | `llayt` |
530
+
531
+ ### 6.6 Linguistic Interpretation
532
+
533
+ > **Automated Insight:**
534
+ The language Aymara shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
535
+
536
+ > **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
537
+
538
+ ---
539
+ ## 7. Summary & Recommendations
540
 
541
  ![Performance Dashboard](visualizations/performance_dashboard.png)
542
 
 
544
 
545
  | Component | Recommended | Rationale |
546
  |-----------|-------------|-----------|
547
+ | Tokenizer | **64k BPE** | Best compression (4.25x) |
548
+ | N-gram | **2-gram** | Lowest perplexity (282) |
549
+ | Markov | **Context-4** | Highest predictability (96.5%) |
550
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
551
 
552
+
553
  ---
554
  ## Appendix: Metrics Glossary & Interpretation Guide
555
 
 
739
  author = {Kamali, Omar},
740
  title = {Wikilangs: Open NLP Models for Wikipedia Languages},
741
  year = {2025},
742
+ doi = {10.5281/zenodo.18073153},
743
+ publisher = {Zenodo},
744
  url = {https://huggingface.co/wikilangs}
745
  institution = {Omneity Labs}
746
  }
 
756
  - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
757
  - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
758
  - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
759
+ - 🤝 Sponsor: [Featherless AI](https://featherless.ai)
760
  ---
761
  *Generated by Wikilangs Models Pipeline*
762
 
763
+ *Report Date: 2026-01-03 18:29:39*
models/embeddings/aligned/ay_128d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75d888891253a1aa92de8ddd80df98c7db29b4648e540064983fb281a81a46b9
3
+ size 1033669280
models/embeddings/aligned/ay_128d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ay", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ay_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1a78f4b80b51986269486d859a4ec88792de8afc75d5bdd9ed8953d6a1015ed
3
+ size 65664
models/embeddings/aligned/ay_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "language": "ay",
3
+ "dimension": 128,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 5455,
7
+ "vocab_size": 9294
8
+ }
models/embeddings/aligned/ay_32d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fff45db79d8ee2218013d4417e66c2e2ac670f5f02f8cdf69f450bf46ef79529
3
+ size 258531488
models/embeddings/aligned/ay_32d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ay", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ay_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98d6d9d23db529e1ac02363a32ffe259502e4f09e2698ab8e12658ca422ae72c
3
+ size 4224
models/embeddings/aligned/ay_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "language": "ay",
3
+ "dimension": 32,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 5455,
7
+ "vocab_size": 9294
8
+ }
models/embeddings/aligned/ay_64d.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773718fb46417e3a4eacea82a8b66c181ebf71989859ce733616e8890622e928
3
+ size 516910752
models/embeddings/aligned/ay_64d.meta.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"lang": "ay", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/ay_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9255d50061c7fc8c3ef059fcdd7c027c81541f541931be94e5a16bd6668b9e1
3
+ size 16512
models/embeddings/aligned/ay_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "language": "ay",
3
+ "dimension": 64,
4
+ "version": "aligned",
5
+ "hub_language": "en",
6
+ "seed_vocab_size": 5455,
7
+ "vocab_size": 9294
8
+ }
models/embeddings/monolingual/ay_128d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:593d231b5c862c91d99a298b35d7a8a92af5889f7195a1096550a6cf493db26c
3
- size 1034781490
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75d888891253a1aa92de8ddd80df98c7db29b4648e540064983fb281a81a46b9
3
+ size 1033669280
models/embeddings/monolingual/ay_128d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 128,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 10362
13
  }
 
3
  "dimension": 128,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 128
13
  },
14
+ "vocab_size": 9294
15
  }
models/embeddings/monolingual/ay_32d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1148550baa373f84d4bcb57f3b890769155df89eb33f5ca94e031220a6178d22
3
- size 258823474
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fff45db79d8ee2218013d4417e66c2e2ac670f5f02f8cdf69f450bf46ef79529
3
+ size 258531488
models/embeddings/monolingual/ay_32d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 32,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 10362
13
  }
 
3
  "dimension": 32,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 32
13
  },
14
+ "vocab_size": 9294
15
  }
models/embeddings/monolingual/ay_64d.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:11f64944a6949f805320a9a0366bed7ddb5434a527e1796dc32cbf647cb04617
3
- size 517476146
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773718fb46417e3a4eacea82a8b66c181ebf71989859ce733616e8890622e928
3
+ size 516910752
models/embeddings/monolingual/ay_64d_metadata.json CHANGED
@@ -3,11 +3,13 @@
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
- "dim": 64,
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
- "epochs": 5
 
 
11
  },
12
- "vocab_size": 10362
13
  }
 
3
  "dimension": 64,
4
  "version": "monolingual",
5
  "training_params": {
6
+ "algorithm": "skipgram",
7
  "min_count": 5,
8
  "window": 5,
9
  "negative": 5,
10
+ "epochs": 5,
11
+ "encoding_method": "rope",
12
+ "dim": 64
13
  },
14
+ "vocab_size": 9294
15
  }
models/subword_markov/ay_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a501564aa7bab749452432f4296ba393aa4888e0600291301c8acb9b46087910
3
- size 60606
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f654f9a1af166ba7100d504c0a42b9c28d1153746d67b7f513b89cced108560
3
+ size 53963
models/subword_markov/ay_markov_ctx1_subword_metadata.json CHANGED
@@ -3,5 +3,5 @@
3
  "variant": "subword",
4
  "language": "ay",
5
  "unique_contexts": 953,
6
- "total_transitions": 4432265
7
  }
 
3
  "variant": "subword",
4
  "language": "ay",
5
  "unique_contexts": 953,
6
+ "total_transitions": 3713215
7
  }
models/subword_markov/ay_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:66f2745ff633512b340bdf8201dd98192e37615cc9042243528ef27b8cf231bb
3
- size 339502
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3498f52b616acab68c6d811d03502a2e2c453365e62690eacde2f61f3013b0b3
3
+ size 279369
models/subword_markov/ay_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ay",
5
- "unique_contexts": 7201,
6
- "total_transitions": 4426837
7
  }
 
2
  "context_size": 2,
3
  "variant": "subword",
4
  "language": "ay",
5
+ "unique_contexts": 6117,
6
+ "total_transitions": 3707986
7
  }
models/subword_markov/ay_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fa6ef1a990648f3d87315c9fbfaba1abc08319ae83afd78f5259b68a6e9622fd
3
- size 1261852
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95e497c1994158f6b7131bbb16a2806fd1970f63daa68c89d1aaec498a96fb9e
3
+ size 1014554
models/subword_markov/ay_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ay",
5
- "unique_contexts": 43833,
6
- "total_transitions": 4421409
7
  }
 
2
  "context_size": 3,
3
  "variant": "subword",
4
  "language": "ay",
5
+ "unique_contexts": 33906,
6
+ "total_transitions": 3702757
7
  }
models/subword_markov/ay_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a65fd0941e7426fe960b1fee56ea748a94605c4412ad68c9094e3ae9643915cc
3
- size 3425718
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:228f396f37e804f0c24770ca609187f92bcca3d3b4e4ecb74df73086171c3807
3
+ size 2704254
models/subword_markov/ay_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ay",
5
- "unique_contexts": 171955,
6
- "total_transitions": 4415981
7
  }
 
2
  "context_size": 4,
3
  "variant": "subword",
4
  "language": "ay",
5
+ "unique_contexts": 133072,
6
+ "total_transitions": 3697528
7
  }
models/subword_ngram/ay_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b4e4597fceea0d7c8119438b61b0b4477dd0e8b8fde6f7f68febdb690bf5333
3
- size 41037
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaf8602bddd857d99fc6ee56ddafe1df2adeceac60fc6026e61cff247852b46a
3
+ size 33850
models/subword_ngram/ay_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ay",
5
- "unique_ngrams": 3058,
6
- "total_ngrams": 4432265
7
  }
 
2
  "n": 2,
3
  "variant": "subword",
4
  "language": "ay",
5
+ "unique_ngrams": 2432,
6
+ "total_ngrams": 3713215
7
  }
models/subword_ngram/ay_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00b41467becbb77b0b605997b7fefc5dfb74440266d63b49cbe7a413cc01e080
3
- size 270869
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a957cd69c5e6f74077658fcc5090387010c39f492ec195f8a2b7af6c68c1f33a
3
+ size 219992
models/subword_ngram/ay_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ay",
5
- "unique_ngrams": 22295,
6
- "total_ngrams": 4426837
7
  }
 
2
  "n": 3,
3
  "variant": "subword",
4
  "language": "ay",
5
+ "unique_ngrams": 18023,
6
+ "total_ngrams": 3707986
7
  }
models/subword_ngram/ay_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47d09c14900c8b9ae9a7d8df02df50e5b5485695a6fc0b931bef0d14df9ce8b1
3
- size 1123557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ef9413d4a89505e3f91ccd040c9ace37f81b42675324390c4ced638def69df
3
+ size 922265
models/subword_ngram/ay_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ay",
5
- "unique_ngrams": 97814,
6
- "total_ngrams": 4421409
7
  }
 
2
  "n": 4,
3
  "variant": "subword",
4
  "language": "ay",
5
+ "unique_ngrams": 79517,
6
+ "total_ngrams": 3702757
7
  }
models/subword_ngram/ay_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:803216eeaa51e93098ac61273d0da7e7bc6132dea2fb7a682860709e829438f9
3
+ size 1968450
models/subword_ngram/ay_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "n": 5,
3
+ "variant": "subword",
4
+ "language": "ay",
5
+ "unique_ngrams": 172494,
6
+ "total_ngrams": 3697528
7
+ }
models/tokenizer/ay_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bacd6b6b67eecb376d4d6587a319913a3beba867c2d3084857448cfac3757e48
3
- size 503226
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0adbcb26da0a585d5342e5e812427af91bee2f91b636980dd3e95f0b4a1c1196
3
+ size 505737
models/tokenizer/ay_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render.
 
models/tokenizer/ay_tokenizer_32k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff707a0d80083da5281395b809fd5fa3d8f96da2ef90f415d80f52d3eef2121a
3
- size 777476
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a389e8657895e37f80c387c1c012d6587ba2c6873de28392f63b10239696e1aa
3
+ size 775069
models/tokenizer/ay_tokenizer_32k.vocab CHANGED
The diff for this file is too large to render.
 
models/tokenizer/ay_tokenizer_64k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e56a192de4ae4f2d2a0ff3ab901c71c57726cc607781da09d9ccbcc07365fc0
3
- size 1370860
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52c41ca6fce3fc21331e55bad07971d936bae44bee2a2da570c6c24a802646be
3
+ size 1367039
models/tokenizer/ay_tokenizer_64k.vocab CHANGED
The diff for this file is too large to render.
 
models/tokenizer/ay_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06a75da91da908126d1fafd3569251faaa4952976ef9181f20ec4f333bf6005d
3
- size 371206
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2e40fb62f5c7d884cf77e55e5a5e857f6acb519b28d3093b7f6c0c61798bc063
3
+ size 371955
models/tokenizer/ay_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render.
 
models/vocabulary/ay_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de3094a5549fa43e268c332bf5eb6ff19a145b1edf8b5ccfbb2e2d399a2272e4
3
- size 427486
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96a6d04850ad9f727fbc2e68b852da29a3b2e054174228817863e3cd55eff52b
3
+ size 406214
models/vocabulary/ay_vocabulary_metadata.json CHANGED
@@ -1,16 +1,17 @@
1
  {
2
  "language": "ay",
3
- "vocabulary_size": 25329,
 
4
  "statistics": {
5
- "type_token_ratio": 0.10227701301150292,
6
  "coverage": {
7
- "top_100": 0.43783393047960273,
8
- "top_1000": 0.6883022188698221,
9
- "top_5000": 0.8231488465648376,
10
- "top_10000": 0.8748287133069332
11
  },
12
- "hapax_count": 39756,
13
- "hapax_ratio": 0.6108319889375432,
14
- "total_documents": 5428
15
  }
16
  }
 
1
  {
2
  "language": "ay",
3
+ "vocabulary_size": 24208,
4
+ "variant": "full",
5
  "statistics": {
6
+ "type_token_ratio": 0.10855027401615537,
7
  "coverage": {
8
+ "top_100": 0.44600956676528014,
9
+ "top_1000": 0.6823514196569544,
10
+ "top_5000": 0.8155496758687956,
11
+ "top_10000": 0.869902051124535
12
  },
13
+ "hapax_count": 36224,
14
+ "hapax_ratio": 0.5994175271379402,
15
+ "total_documents": 5229
16
  }
17
  }
models/word_markov/ay_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9235d44170713caeb8fc80b3dba5782d1f796ed803992b0531ceb65661a19aa4
3
- size 2137508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f3d0e7e728161dbcc80a73ac28997f5e0feb79512426474650a8a566101fc6
3
+ size 1954702
models/word_markov/ay_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ay",
5
- "unique_contexts": 65172,
6
- "total_transitions": 888867
7
  }
 
2
  "context_size": 1,
3
  "variant": "word",
4
  "language": "ay",
5
+ "unique_contexts": 60169,
6
+ "total_transitions": 551490
7
  }
models/word_markov/ay_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56c569c3d809da2135b500827e40c8db0797fbcb18c0c4e781c6af0595edd8c7
3
- size 4305196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a79438f48542e0b474e7e91879950895c9f265f99669a75acc68549273f2b2a6
3
+ size 3803866
models/word_markov/ay_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ay",
5
- "unique_contexts": 239858,
6
- "total_transitions": 883439
7
  }
 
2
  "context_size": 2,
3
  "variant": "word",
4
  "language": "ay",
5
+ "unique_contexts": 216093,
6
+ "total_transitions": 546261
7
  }