omarkamali committed on Jan 3

Commit

b7c9fa4

verified ·

1 Parent(s): f085801

Upload all models and assets for ast (20251001)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +313 -153
models/embeddings/monolingual/ast_128d.bin +2 -2
models/embeddings/monolingual/ast_128d_metadata.json +5 -3
models/embeddings/monolingual/ast_32d.bin +2 -2
models/embeddings/monolingual/ast_32d_metadata.json +5 -3
models/embeddings/monolingual/ast_64d.bin +2 -2
models/embeddings/monolingual/ast_64d_metadata.json +5 -3
models/subword_markov/ast_markov_ctx1_subword.parquet +2 -2
models/subword_markov/ast_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/ast_markov_ctx2_subword.parquet +2 -2
models/subword_markov/ast_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/ast_markov_ctx3_subword.parquet +2 -2
models/subword_markov/ast_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/ast_markov_ctx4_subword.parquet +2 -2
models/subword_markov/ast_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/ast_2gram_subword.parquet +2 -2
models/subword_ngram/ast_2gram_subword_metadata.json +2 -2
models/subword_ngram/ast_3gram_subword.parquet +2 -2
models/subword_ngram/ast_3gram_subword_metadata.json +2 -2
models/subword_ngram/ast_4gram_subword.parquet +2 -2
models/subword_ngram/ast_4gram_subword_metadata.json +2 -2
models/tokenizer/ast_tokenizer_16k.model +2 -2
models/tokenizer/ast_tokenizer_16k.vocab +0 -0
models/tokenizer/ast_tokenizer_32k.model +2 -2
models/tokenizer/ast_tokenizer_32k.vocab +0 -0
models/tokenizer/ast_tokenizer_64k.model +2 -2
models/tokenizer/ast_tokenizer_64k.vocab +0 -0
models/tokenizer/ast_tokenizer_8k.model +2 -2
models/tokenizer/ast_tokenizer_8k.vocab +0 -0
models/vocabulary/ast_vocabulary.parquet +2 -2
models/vocabulary/ast_vocabulary_metadata.json +10 -9
models/word_markov/ast_markov_ctx1_word.parquet +2 -2
models/word_markov/ast_markov_ctx1_word_metadata.json +2 -2
models/word_markov/ast_markov_ctx2_word.parquet +2 -2
models/word_markov/ast_markov_ctx2_word_metadata.json +2 -2
models/word_markov/ast_markov_ctx3_word.parquet +2 -2
models/word_markov/ast_markov_ctx3_word_metadata.json +2 -2
models/word_markov/ast_markov_ctx4_word.parquet +2 -2
models/word_markov/ast_markov_ctx4_word_metadata.json +2 -2
models/word_ngram/ast_2gram_word.parquet +2 -2
models/word_ngram/ast_2gram_word_metadata.json +2 -2
models/word_ngram/ast_3gram_word.parquet +2 -2
models/word_ngram/ast_3gram_word_metadata.json +2 -2
models/word_ngram/ast_4gram_word.parquet +2 -2
models/word_ngram/ast_4gram_word_metadata.json +2 -2
visualizations/embedding_isotropy.png +0 -0
visualizations/embedding_norms.png +0 -0
visualizations/embedding_similarity.png +2 -2
visualizations/markov_branching.png +0 -0
visualizations/markov_contexts.png +0 -0

README.md CHANGED Viewed

@@ -23,14 +23,14 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 3.924
   - name: best_isotropy
     type: isotropy
-    value: 0.7692
   - name: vocabulary_size
     type: vocab
-    value: 654549
-generated: 2025-12-27
 ---
 # AST - Wikilangs Models
@@ -44,12 +44,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
@@ -59,7 +60,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -68,71 +70,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.259x | 3.22 | 0.0290% | 1,033,064 |
-| **16k** | 3.531x | 3.48 | 0.0315% | 953,475 |
-| **32k** | 3.753x | 3.70 | 0.0334% | 897,137 |
-| **64k** | 3.924x 🏆 | 3.87 | 0.0350% | 858,173 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `Fechos
- Personaxes importantes
- Referencies
- Enllaces esternos
-Categoría...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁fechos ▁personaxes ▁importantes ▁referencies ▁enllaces ▁esternos ▁categoría : sieglu ▁viii ... (+4 more)` | 14 |
-| 16k | `▁fechos ▁personaxes ▁importantes ▁referencies ▁enllaces ▁esternos ▁categoría : sieglu ▁viii ... (+4 more)` | 14 |
-| 32k | `▁fechos ▁personaxes ▁importantes ▁referencies ▁enllaces ▁esternos ▁categoría : sieglu ▁viii ... (+4 more)` | 14 |
-| 64k | `▁fechos ▁personaxes ▁importantes ▁referencies ▁enllaces ▁esternos ▁categoría : sieglu ▁viii ... (+4 more)` | 14 |
-**Sample 2:** `Armental ye un llugar de la parroquia de Talarén nel conceyu asturianu de Navia....`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁ar mental ▁ye ▁un ▁llugar ▁de ▁la ▁parroquia ▁de ▁tal ... (+18 more)` | 28 |
-| 16k | `▁ar mental ▁ye ▁un ▁llugar ▁de ▁la ▁parroquia ▁de ▁tal ... (+18 more)` | 28 |
-| 32k | `▁ar mental ▁ye ▁un ▁llugar ▁de ▁la ▁parroquia ▁de ▁tal ... (+16 more)` | 26 |
-| 64k | `▁ar mental ▁ye ▁un ▁llugar ▁de ▁la ▁parroquia ▁de ▁tal ... (+16 more)` | 26 |
-**Sample 3:** `Fechos
-  -
- Nacencies
-  -
- Muertes
-  -
- Referencies
- Enllaces esternos
-...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁fechos ▁- ▁nacencies ▁- ▁muertes ▁- ▁referencies ▁enllaces ▁esternos ▁categoría ... (+7 more)` | 17 |
-| 16k | `▁fechos ▁- ▁nacencies ▁- ▁muertes ▁- ▁referencies ▁enllaces ▁esternos ▁categoría ... (+7 more)` | 17 |
-| 32k | `▁fechos ▁- ▁nacencies ▁- ▁muertes ▁- ▁referencies ▁enllaces ▁esternos ▁categoría ... (+7 more)` | 17 |
-| 64k | `▁fechos ▁- ▁nacencies ▁- ▁muertes ▁- ▁referencies ▁enllaces ▁esternos ▁categoría ... (+7 more)` | 17 |
 ### Key Findings
-- **Best Compression:** 64k achieves 3.924x compression
-- **Lowest UNK Rate:** 8k with 0.0290% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -141,57 +129,89 @@ Categoría...`
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 95,540 🏆 | 16.54 | 1,568,799 | 13.6% | 26.9% |
-| **2-gram** | 311 🏆 | 8.28 | 23,389 | 65.5% | 98.4% |
-| **3-gram** | 573,984 | 19.13 | 3,974,147 | 5.1% | 13.4% |
-| **3-gram** | 2,766 | 11.43 | 195,082 | 25.8% | 68.5% |
-| **4-gram** | 1,609,317 | 20.62 | 7,247,181 | 3.9% | 9.3% |
-| **4-gram** | 16,954 | 14.05 | 1,178,742 | 12.7% | 37.0% |
 ### Top 5 N-grams by Size
-**2-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `d '` | 1,196,313 |
-| 2 | `de la` | 875,667 |
-| 3 | `' l` | 534,478 |
-| 4 | `| |` | 438,858 |
-| 5 | `l '` | 403,691 |
-**3-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `| - |` | 128,285 |
-| 2 | `referencies enllaces esternos` | 104,162 |
-| 3 | `- | |` | 89,758 |
-| 4 | `- - -` | 81,514 |
-| 5 | `d ' un` | 69,529 |
-**4-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `- - - -` | 69,470 |
-| 2 | `enllaces esternos categoría :` | 63,833 |
-| 3 | `referencies enllaces esternos categoría` | 60,665 |
-| 4 | `. referencies enllaces esternos` | 51,144 |
-| 5 | `| linear | -` | 50,481 |
 ### Key Findings
-- **Best Perplexity:** 2-gram with 311
 - **Entropy Trend:** Increases with n-gram order in the results table (larger n-grams have higher entropy and perplexity)
-- **Coverage:** Top-1000 patterns cover ~37% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -199,55 +219,86 @@ Categoría...`
 ![Markov Entropy](visualizations/markov_entropy.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.7150 | 1.641 | 8.69 | 1,669,949 | 28.5% |
-| **1** | 1.5193 | 2.866 | 10.68 | 8,875 | 0.0% |
-| **2** | 0.4611 | 1.377 | 2.90 | 14,499,551 | 53.9% |
-| **2** | 0.7271 | 1.655 | 4.90 | 94,766 | 27.3% |
-| **3** | 0.2234 | 1.167 | 1.58 | 42,031,886 | 77.7% |
-| **3** | 0.8068 | 1.749 | 4.59 | 464,259 | 19.3% |
-| **4** | 0.1062 🏆 | 1.076 | 1.22 | 66,322,442 | 89.4% |
-| **4** | 0.7182 🏆 | 1.645 | 3.49 | 2,131,889 | 28.2% |
-### Generated Text Samples
-Below are text samples generated from each Markov chain model:
 **Context Size 1:**
-1. `de gossip girl play ye como xenofonte en dussel , y el 7 d ' amuesa`
-2. `, pero la botánica referencies ver , collaboró en valdivia . mientres la humanidá al chinu`
-3. `. ( 2 ) - ḥḏ horusmuriu blancumennefermenfismit rahina . isbn 0 british lion , yera`
 **Context Size 2:**
-1. `d ' ellos yera detectáu polos enemigos . shiva prakash ( 1997 ) , nel conceyu sevillanu`
-2. `de la litografía y l ' ala posterior : chronica majora : una « inocente ya inconsciente`
-3. `' l xeneral prats tamién pudo ante fernando verdasco david ferrer por 6 - 2 | ríu`
 **Context Size 3:**
-1. `| - | 38378 - | | 1997 tb18 | | 4 | align = right | [`
-2. `referencies enllaces esternos categoría : montserrat`
-3. `- | | 2001 sd35 | | 16 | | 592 | | < small > 1911 <`
 **Context Size 4:**
-1. `- - - - - - - - - - - - - - - - - - -`
-2. `enllaces esternos categoría : pintores de parís categoría : sabios de la torre eiffel , los nacional...`
-3. `referencies enllaces esternos categoría : comuñes de nord`
 ### Key Findings
-- **Best Predictability:** Context-4 with 89.4% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (2,131,889 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -263,64 +314,64 @@ Below are text samples generated from each Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 654,549 |
-| Total Tokens | 80,184,102 |
-| Mean Frequency | 122.50 |
 | Median Frequency | 4 |
-| Frequency Std Dev | 8722.95 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | de | 5,075,921 |
-| 2 | la | 2,521,840 |
-| 3 | y | 2,071,360 |
-| 4 | d | 1,229,266 |
-| 5 | a | 1,176,335 |
-| 6 | del | 1,090,980 |
-| 7 | en | 1,071,173 |
-| 8 | que | 1,020,518 |
-| 9 | los | 971,499 |
-| 10 | l | 968,352 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | leptafeke | 2 |
-| 2 | haua | 2 |
-| 3 | küzdoblani | 2 |
-| 4 | contrarrellatu | 2 |
-| 5 | semilleru | 2 |
-| 6 | bisterca | 2 |
-| 7 | šafarsko | 2 |
-| 8 | vyfalu | 2 |
-| 9 | ribich | 2 |
-| 10 | lacos | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.0077 |
-| R² (Goodness of Fit) | 0.995140 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 40.0% |
-| Top 1,000 | 60.0% |
-| Top 5,000 | 76.4% |
-| Top 10,000 | 82.7% |
 ### Key Findings
-- **Zipf Compliance:** R²=0.9951 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 40.0% of corpus
-- **Long Tail:** 644,549 words needed for remaining 17.3% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -333,24 +384,130 @@ Below are text samples generated from each Markov chain model:
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
-### Model Comparison
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 510,373 | 32 | 3.008 | 0.935 | 0.7692 🏆 |
-| **mono_64d** | 510,373 | 64 | 3.395 | 0.938 | 0.7616 |
-| **mono_128d** | 510,373 | 128 | 3.842 | 0.965 | 0.6988 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 ### Key Findings
-- **Best Isotropy:** mono_32d with 0.7692 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 510,373 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
 ---
-## 6. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
@@ -358,11 +515,12 @@ Below are text samples generated from each Markov chain model:
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (3.92x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (311) |
-| Markov | **Context-4** | Highest predictability (89.4%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
@@ -552,7 +710,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
@@ -568,7 +727,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2025-12-27 20:35:27*

 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 4.427
   - name: best_isotropy
     type: isotropy
+    value: 0.7909
   - name: vocabulary_size
     type: vocab
+    value: 555056
+generated: 2026-01-03
 ---
 # AST - Wikilangs Models
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 3.569x | 3.57 | 0.0259% | 871,221 |
+| **16k** | 3.921x | 3.92 | 0.0285% | 793,006 |
+| **32k** | 4.204x | 4.21 | 0.0306% | 739,567 |
+| **64k** | 4.427x 🏆 | 4.43 | 0.0322% | 702,254 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `Luiz Diallisson de Souza Alves ye un futbolista brasilanu. Clubes Kuban Referenc...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁lu iz ▁di all is son ▁de ▁sou za ▁al ... (+14 more)` | 24 |
+| 16k | `▁lu iz ▁di all is son ▁de ▁sou za ▁al ... (+14 more)` | 24 |
+| 32k | `▁luiz ▁di all is son ▁de ▁souza ▁alves ▁ye ▁un ... (+11 more)` | 21 |
+| 64k | `▁luiz ▁di all isson ▁de ▁souza ▁alves ▁ye ▁un ▁futbolista ... (+10 more)` | 20 |
+**Sample 2:** `Vagner da Silva Sarti ye un ex-futbolista brasilanu. Clubes Referencies Enllaces...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁va gn er ▁da ▁silva ▁sar ti ▁ye ▁un ▁ex ... (+10 more)` | 20 |
+| 16k | `▁va gner ▁da ▁silva ▁sar ti ▁ye ▁un ▁ex - ... (+9 more)` | 19 |
+| 32k | `▁va gner ▁da ▁silva ▁sar ti ▁ye ▁un ▁ex - ... (+9 more)` | 19 |
+| 64k | `▁va gner ▁da ▁silva ▁sar ti ▁ye ▁un ▁ex - ... (+9 more)` | 19 |
+**Sample 3:** `(MMLXXXII) va ser un añu normal entamáu en xueves nel calendariu gregorianu. Ref...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁( m m l xx x ii ) ▁va ▁ser ... (+17 more)` | 27 |
+| 16k | `▁( mm l xx x ii ) ▁va ▁ser ▁un ... (+14 more)` | 24 |
+| 32k | `▁( mm l xx xii ) ▁va ▁ser ▁un ▁añu ... (+12 more)` | 22 |
+| 64k | `▁( mm lxx xii ) ▁va ▁ser ▁un ▁añu ▁normal ... (+11 more)` | 21 |
 ### Key Findings
+- **Best Compression:** 64k achieves 4.427x compression
+- **Lowest UNK Rate:** 8k with 0.0259% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
+![N-gram Unique](visualizations/ngram_unique.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 133,027 | 17.02 | 1,354,323 | 9.8% | 21.6% |
+| **2-gram** | Subword | 260 🏆 | 8.02 | 19,069 | 69.7% | 99.1% |
+| **3-gram** | Word | 646,899 | 19.30 | 2,908,394 | 4.2% | 10.7% |
+| **3-gram** | Subword | 2,223 | 11.12 | 139,212 | 28.0% | 72.3% |
+| **4-gram** | Word | 1,559,764 | 20.57 | 4,707,856 | 3.3% | 7.5% |
+| **4-gram** | Subword | 13,372 | 13.71 | 791,795 | 13.9% | 39.3% |
 ### Top 5 N-grams by Size
+**2-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `de la` | 891,402 |
+| 2 | `de los` | 329,410 |
+| 3 | `la so` | 220,083 |
+| 4 | `a la` | 215,036 |
+| 5 | `de les` | 208,071 |
+**3-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `referencies enllaces esternos` | 101,643 |
+| 2 | `de la so` | 48,838 |
+| 3 | `d estaos xuníos` | 34,333 |
+| 4 | `enllaces esternos de` | 33,237 |
+| 5 | `una población de` | 30,269 |
+**4-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `referencies enllaces esternos de` | 32,314 |
+| 2 | `tien una población de` | 26,720 |
+| 3 | `una población de y` | 19,598 |
+| 4 | `y una superficie de` | 19,554 |
+| 5 | `una superficie de km` | 19,519 |
+**2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `a _` | 12,346,491 |
+| 2 | `e _` | 10,275,492 |
+| 3 | `s _` | 10,054,248 |
+| 4 | `_ d` | 9,863,919 |
+| 5 | `e s` | 9,411,923 |
+**3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ d e` | 7,215,701 |
+| 2 | `d e _` | 5,349,035 |
+| 3 | `e s _` | 4,769,369 |
+| 4 | `o s _` | 3,909,790 |
+| 5 | `l a _` | 3,068,189 |
+**4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `_ d e _` | 4,975,922 |
+| 2 | `_ l a _` | 2,468,941 |
+| 3 | `d e _ l` | 1,667,072 |
+| 4 | `a _ d e` | 1,422,241 |
+| 5 | `s _ d e` | 1,380,334 |
 ### Key Findings
+- **Best Perplexity:** 2-gram (subword) with 260
 - **Entropy Trend:** Increases with n-gram order in the results table (larger n-grams have higher entropy and perplexity)
+- **Coverage:** Top-1000 patterns cover ~39% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 ![Markov Entropy](visualizations/markov_entropy.png)
+![Markov Contexts](visualizations/markov_contexts.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 1.0381 | 2.054 | 12.99 | 1,204,316 | 0.0% |
+| **1** | Subword | 1.1983 | 2.295 | 7.97 | 10,463 | 0.0% |
+| **2** | Word | 0.4193 | 1.337 | 2.57 | 15,634,564 | 58.1% |
+| **2** | Subword | 0.6558 | 1.576 | 4.28 | 83,437 | 34.4% |
+| **3** | Word | 0.1865 | 1.138 | 1.44 | 40,202,890 | 81.4% |
+| **3** | Subword | 0.6846 | 1.607 | 4.03 | 357,207 | 31.5% |
+| **4** | Word | 0.0789 🏆 | 1.056 | 1.15 | 57,817,277 | 92.1% |
+| **4** | Subword | 0.6846 | 1.607 | 3.51 | 1,439,883 | 31.5% |
+### Generated Text Samples (Word-based)
+Below are text samples generated from each word-based Markov chain model:
+**Context Size 1:**
+1. `de xunu empieza a centros multimodales funciones nel nectariu semilunar o n ucraín y comunicaciones ...`
+2. `la pieza cornelius coffin fit cuando el algebasó 503 mariña d una solución bonal o nun`
+3. `y 15 m sobre l uniforme del postreru gran midida china tales from here mirror weekly`
+**Context Size 2:**
+1. `de la provincia dende esti tornéu surdió en y persuadió a eliza dushku en películes d estudiante`
+2. `de los documentos relativos al mercáu l so antiguu nome dau más tarde l empresariu estremeñu dueñu`
+3. `la so base na isla parker llogró atrapar la pelota vasca que se llevó a empecipiar una`
+**Context Size 3:**
+1. `referencies enllaces esternos el salín nel suelu y ente vexetación trupa sicasí en marismas y ribere...`
+2. `de la so agua h havagazı gas o otobüs bus y t troleybüs trolebús magar que los entamos`
+3. `enllaces esternos de côte d or na rexón de gran este llenda con tien una población de 1`
+**Context Size 4:**
+1. `referencies enllaces esternos de saboya de francia de bretaña de dreux de bretaña`
+2. `tien una población de 1 690 471 habitantes y un puertu fluvial sobre l paraná amás tien importancia ...`
+3. `una población de y una superficie de km ver tamién referencies enllaces esternos de xapón de la pref...`
+### Generated Text Samples (Subword-based)
+Below are text samples generated from each subword-based Markov chain model:
 **Context Size 1:**
+1. `_fén_an_yaconyíc`
+2. `er,_ciesunton_a_`
+3. `a_tostelociz_ce_`
 **Context Size 2:**
+1. `a_gasainel_tabaro`
+2. `e_es_de_y_chel_má`
+3. `s_astamudia_de_ll`
 **Context Size 3:**
+1. `_de_scharacióse_le`
+2. `de_tragar_primera_`
+3. `es_so_títulu_miliz`
 **Context Size 4:**
+1. `_de_los_sobres_del_`
+2. `_la_ermistoria_dife`
+3. `de_los_xeneia,_cons`
 ### Key Findings
+- **Best Predictability:** Context-4 (word) with 92.1% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (1,439,883 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 555,056 |
+| Total Tokens | 75,071,637 |
+| Mean Frequency | 135.25 |
 | Median Frequency | 4 |
+| Frequency Std Dev | 9337.66 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | de | 4,994,843 |
+| 2 | la | 2,512,518 |
+| 3 | y | 2,055,358 |
+| 4 | d | 1,181,646 |
+| 5 | a | 1,163,388 |
+| 6 | del | 1,091,464 |
+| 7 | en | 1,070,328 |
+| 8 | que | 1,013,684 |
+| 9 | los | 966,280 |
+| 10 | l | 958,680 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | sverlo | 2 |
+| 2 | kmca | 2 |
+| 3 | antimaterialistas | 2 |
+| 4 | infectados | 2 |
+| 5 | historietistas | 2 |
+| 6 | curtmetratxe | 2 |
+| 7 | rugna | 2 |
+| 8 | lleáu | 2 |
+| 9 | queña | 2 |
+| 10 | nkoghe | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 0.9991 |
+| R² (Goodness of Fit) | 0.995555 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 41.7% |
+| Top 1,000 | 60.8% |
+| Top 5,000 | 76.8% |
+| Top 10,000 | 83.1% |
 ### Key Findings
+- **Zipf Compliance:** R²=0.9956 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 41.7% of corpus
+- **Long Tail:** 545,056 words needed for remaining 16.9% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
+### 5.1 Cross-Lingual Alignment
+> *Note: Multilingual alignment visualization not available for this language.*
+### 5.2 Model Comparison
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.7909 🏆 | 0.3827 | N/A | N/A |
+| **mono_64d** | 64 | 0.7802 | 0.3065 | N/A | N/A |
+| **mono_128d** | 128 | 0.7192 | 0.2391 | N/A | N/A |
 ### Key Findings
+- **Best Isotropy:** mono_32d with 0.7909 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.3094. Lower values indicate better semantic separation.
+- **Alignment Quality:** No aligned models evaluated in this run.
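+- **Recommendation:** 128d aligned for best cross-lingual performance (note: no aligned models were evaluated in this run, so this recommendation is not backed by the results above)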
 ---
+## 6. Morphological Analysis (Experimental)
+> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+### 6.1 Productivity & Complexity
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
+| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
+### 6.2 Affix Inventory (Productive Units)
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+| `co-` | comíen, compelidos, conciliable |
+| `ma-` | maravíase, maça, matematización |
+| `re-` | reescalada, reprimió, reconociéralu |
+| `de-` | deduz, declaratorio, desfila |
+| `ca-` | caminómetru, castromil, caecilia |
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-s` | phrygilus, anticolinérgicos, friulianos |
+| `-a` | raksasa, estendería, reescalada |
+| `-es` | ibes, distopíes, ziríes |
+| `-os` | anticolinérgicos, friulianos, afogadiegos |
+| `-se` | esmoreciérase, maravíase, cuayábase |
+| `-as` | monarquistas, gorgas, mimeografiadas |
+| `-en` | altshausen, comíen, blegen |
+### 6.3 Bound Stems (Lexical Roots)
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `iend` | 1.80x | 206 contexts | fiend, iendo, viendi |
+| `renc` | 2.05x | 99 contexts | frenc, wrench, rencor |
+| `ient` | 1.67x | 271 contexts | vient, iente, aient |
+| `enci` | 1.52x | 262 contexts | venci, benci, cenci |
+| `acio` | 1.63x | 166 contexts | nacio, cacio, tacio |
+| `ació` | 1.79x | 94 contexts | lació, xació, ñació |
+| `nter` | 1.38x | 335 contexts | inter, enter, unter |
+| `ontr` | 1.63x | 118 contexts | contr, contra, montra |
+| `ener` | 1.42x | 205 contexts | enerc, tener, enero |
+| `ntos` | 1.79x | 67 contexts | antos, entos, tintos |
+| `ntes` | 1.49x | 144 contexts | antes, entes, fontes |
+| `efer` | 1.61x | 86 contexts | sefer, nefer, refer |
+### 6.4 Affix Compatibility (Co-occurrence)
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `co-` | `-s` | 59 words | conversas, concinnus |
+| `ca-` | `-s` | 53 words | cancelaciones, caloiros |
+| `ca-` | `-a` | 49 words | cartajima, campana |
+| `co-` | `-a` | 44 words | comella, copia |
+| `ma-` | `-a` | 38 words | matina, matrioshka |
+| `re-` | `-s` | 34 words | rectos, restaurantes |
+| `ma-` | `-s` | 31 words | maniobres, maderensis |
+| `de-` | `-s` | 31 words | descatados, definitives |
+| `co-` | `-es` | 25 words | cotidales, coleicionables |
+| `re-` | `-a` | 24 words | renombraría, retomara |
+### 6.5 Recursive Morpheme Segmentation
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| retractores | **`re-tractor-es`** | 6.0 | `tractor` |
+| aseguráronse | **`aseguráron-se`** | 4.5 | `aseguráron` |
+| tendiéronse | **`tendiéron-se`** | 4.5 | `tendiéron` |
+| tresversales | **`tresversal-es`** | 4.5 | `tresversal` |
+| redefiniéronse | **`re-de-finiéron-se`** | 4.5 | `finiéron` |
+| redistributivo | **`re-distributivo`** | 4.5 | `distributivo` |
+| prométese | **`prométe-se`** | 4.5 | `prométe` |
+| escaecíen | **`escaecí-en`** | 4.5 | `escaecí` |
+| domadores | **`domador-es`** | 4.5 | `domador` |
+| consérvense | **`co-nsérv-en-se`** | 4.5 | `nsérv` |
+| descripto | **`de-scripto`** | 4.5 | `scripto` |
+| esaxeróse | **`esaxeró-se`** | 4.5 | `esaxeró` |
+| acentores | **`acentor-es`** | 4.5 | `acentor` |
+| detrayendo | **`de-trayendo`** | 4.5 | `trayendo` |
+| renormalización | **`re-normalización`** | 4.5 | `normalización` |
+### 6.6 Linguistic Interpretation
+> **Automated Insight:**
+The language AST appears to be more isolating or to have a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
+---
+## 7. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
+| Tokenizer | **64k BPE** | Best compression (4.43x) |
+| N-gram | **2-gram** | Lowest perplexity (260) |
+| Markov | **Context-4** | Highest predictability (92.1%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-03 09:38:21*

models/embeddings/monolingual/ast_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:39f5f134a23b0ae5b3bba7a4864ecc3599e28dd6a5c44748c538e6eb50542a1b
-size 1555994683

 version https://git-lfs.github.com/spec/v1
+oid sha256:8c4500458da720e9717e0be530291de837dfe1937d937d8f59cd5975214f6d01
+size 1526234273

models/embeddings/monolingual/ast_128d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 510373
 }

   "dimension": 128,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
+  "vocab_size": 481846
 }

models/embeddings/monolingual/ast_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aecd461d0dd299f160b9fefcb4d6bbc47ffe601253410b6bf66aa59353683649
-size 396028219

 version https://git-lfs.github.com/spec/v1
+oid sha256:aa3cdba099d3af49ab89043ad5713c1fe468060d520334f5200b000925a5e30e
+size 388176545

models/embeddings/monolingual/ast_32d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 510373
 }

   "dimension": 32,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
+  "vocab_size": 481846
 }

models/embeddings/monolingual/ast_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a7552caa3e9f8ef8607c05afb5b9266e4c9a3e70fdba272075a7b9912e995045
-size 782683707

 version https://git-lfs.github.com/spec/v1
+oid sha256:7e24d4561e7c9f15e391c915d7d4096c8b8adf794324cd6df9a60e78e6f5fc6a
+size 767529121

models/embeddings/monolingual/ast_64d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 510373
 }

   "dimension": 64,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
+  "vocab_size": 481846
 }

models/subword_markov/ast_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:797c1dfcb6c516214d2dc02ed941e64e840e49d291e2cc758d5482d86ae04640
-size 602821

 version https://git-lfs.github.com/spec/v1
+oid sha256:1b9be5fbc395f6cb378f05193e1ba7234945d9bca4e71b4cdad6aea648215809
+size 555949

models/subword_markov/ast_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "ast",
-  "unique_contexts": 8875,
-  "total_transitions": 508763835
 }

   "context_size": 1,
   "variant": "subword",
   "language": "ast",
+  "unique_contexts": 10463,
+  "total_transitions": 464881103
 }

models/subword_markov/ast_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e0ee540fb4354bf9779b55ea96cde4252defd9f562e14d445ac6b6c9b457c3f1
-size 3577755

 version https://git-lfs.github.com/spec/v1
+oid sha256:a0a7c5f99ccca8b527cebd7afb3189b4a99a6cc219cd1b861b501d17180c93b0
+size 2891072

models/subword_markov/ast_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "ast",
-  "unique_contexts": 94766,
-  "total_transitions": 508625606
 }

   "context_size": 2,
   "variant": "subword",
   "language": "ast",
+  "unique_contexts": 83437,
+  "total_transitions": 464744539
 }

models/subword_markov/ast_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:db3f45e2155b76125f075309cbec6b285d5ee2401fd063d4b596638cb534193e
-size 16073938

 version https://git-lfs.github.com/spec/v1
+oid sha256:6c060d1174629162cb8911f78054119018ce6e2fc628936bfdeb36691d35489c
+size 11949355

models/subword_markov/ast_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "ast",
-  "unique_contexts": 464259,
-  "total_transitions": 508487377
 }

   "context_size": 3,
   "variant": "subword",
   "language": "ast",
+  "unique_contexts": 357207,
+  "total_transitions": 464607975
 }

models/subword_markov/ast_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dd0b9d4e9db8c19eeb444ba13e49b7a48ec4b84512f949ef315285aff0822a0e
-size 57292008

 version https://git-lfs.github.com/spec/v1
+oid sha256:60988880010bce63ffacea78379f2d4b447c453dc44f6445a08b1d6e123f645c
+size 39925220

models/subword_markov/ast_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "ast",
-  "unique_contexts": 2131889,
-  "total_transitions": 508349148
 }

   "context_size": 4,
   "variant": "subword",
   "language": "ast",
+  "unique_contexts": 1439883,
+  "total_transitions": 464471411
 }

models/subword_ngram/ast_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:417e9408d0c904ed21f23501e55082a7494276c2cfce4650855bdde090aa9e15
-size 314961

 version https://git-lfs.github.com/spec/v1
+oid sha256:0c50ba055002320ed277521dcbb25feecc7139a7b02e23e72ca8255ada347410
+size 259482

models/subword_ngram/ast_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "ast",
-  "unique_ngrams": 23389,
-  "total_ngrams": 508763835
 }

   "n": 2,
   "variant": "subword",
   "language": "ast",
+  "unique_ngrams": 19069,
+  "total_ngrams": 464881103
 }

models/subword_ngram/ast_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:730e803a7b1373e12874f3e3ee787c6229e54c4f714b7b5b45ecbc90e7e1f2ef
-size 2302607

 version https://git-lfs.github.com/spec/v1
+oid sha256:d1ee7e63a9f92e3bf668df5f64f593ca6671568eb54bc02ab9e3b50ee2ae7ae4
+size 1721257

models/subword_ngram/ast_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "ast",
-  "unique_ngrams": 195082,
-  "total_ngrams": 508625606
 }

   "n": 3,
   "variant": "subword",
   "language": "ast",
+  "unique_ngrams": 139212,
+  "total_ngrams": 464744539
 }

models/subword_ngram/ast_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b418657cc08b4f91e0b2c2eda00aef5c4302d754fe9dccc5feb84ff1b24a6291
-size 13600667

 version https://git-lfs.github.com/spec/v1
+oid sha256:390181c0c7fcc8b118b108e18cb830493109f584b4b87f76850993e3e9dda0be
+size 9353492

models/subword_ngram/ast_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "ast",
-  "unique_ngrams": 1178742,
-  "total_ngrams": 508487377
 }

   "n": 4,
   "variant": "subword",
   "language": "ast",
+  "unique_ngrams": 791795,
+  "total_ngrams": 464607975
 }

models/tokenizer/ast_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:907f4bece802b1d3656fdf65b960935cdd809ca5733b7b73b0823ce18af2c113
-size 513109

 version https://git-lfs.github.com/spec/v1
+oid sha256:f5014581384d443a91ffa0f71150e3ecc09244a1056ad046785f8b8cbc4ce42c
+size 511145

models/tokenizer/ast_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ast_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:88d4563db9be715e9225ff77615f89d344e1f2db1b32e1119ffcb1adf873068a
-size 795952

 version https://git-lfs.github.com/spec/v1
+oid sha256:13c458e53178d158dee7ca9d93a562de7bf19975484f7e21af497697c021bb6a
+size 791500

models/tokenizer/ast_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ast_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bab637cbda657ea59afedbac3a722b8a7d18839ff031e3a67f1d79f914cb1edd
-size 1371599

 version https://git-lfs.github.com/spec/v1
+oid sha256:b619430c9173d8e202e7905a328c5bca3d915f85a3540adb29b3d6230f10d749
+size 1364026

models/tokenizer/ast_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ast_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:53538ed204ba1e90b9a3ec72f89f3a9639ddf5f7e06a5d91daf2182e1dbacf32
-size 374583

 version https://git-lfs.github.com/spec/v1
+oid sha256:8172313a7a1d41e2c61618ad0fab8a1cbd16688f798c007cba80fce5c693133c
+size 374227

models/tokenizer/ast_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/ast_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e95a025d0a2e0d5c5fc9e67b2724543af5eaadf84e83944b5601b9e113deee95
-size 9493041

 version https://git-lfs.github.com/spec/v1
+oid sha256:42abd03dfba1393adae66a9e07842fcb85830c5f2dbcb0e037ab56b27fbc8846
+size 8277795

models/vocabulary/ast_vocabulary_metadata.json CHANGED Viewed

@@ -1,16 +1,17 @@
 {
   "language": "ast",
-  "vocabulary_size": 654549,
   "statistics": {
-    "type_token_ratio": 0.020582308161438793,
     "coverage": {
-      "top_100": 0.39513983436676664,
-      "top_1000": 0.5923739422740554,
-      "top_5000": 0.7541024014353347,
-      "top_10000": 0.8168048946874377
     },
-    "hapax_count": 1016752,
-    "hapax_ratio": 0.608359595309283,
-    "total_documents": 138229
   }
 }

 {
   "language": "ast",
+  "vocabulary_size": 555056,
+  "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.015923489955309757,
     "coverage": {
+      "top_100": 0.4129270164572954,
+      "top_1000": 0.6027767233040656,
+      "top_5000": 0.7612431178669916,
+      "top_10000": 0.8237422388332004
     },
+    "hapax_count": 650708,
+    "hapax_ratio": 0.5396644782892838,
+    "total_documents": 136564
   }
 }

models/word_markov/ast_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e50511b1eedd319fddb7babd82679e20996cdc8c40bb7414b7729fa744061d38
-size 125168928

 version https://git-lfs.github.com/spec/v1
+oid sha256:df2720891f77ec4fc87c424807076f856c9ebed7d35e8609e740c581e849ba43
+size 130011519

models/word_markov/ast_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "ast",
-  "unique_contexts": 1669949,
-  "total_transitions": 101266716
 }

   "context_size": 1,
   "variant": "word",
   "language": "ast",
+  "unique_contexts": 1204316,
+  "total_transitions": 75585781
 }

models/word_markov/ast_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5e97c5fda98a744b9ecdbc923910e1c4de28e3c699d3c7a3519ea2d5826e515a
-size 436569711

 version https://git-lfs.github.com/spec/v1
+oid sha256:360e77ba671d9d79ef0176080b3098535eb366bbb657d0a5d965de6cbcaaf01a
+size 443953571

models/word_markov/ast_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "ast",
-  "unique_contexts": 14499551,
-  "total_transitions": 101128488
 }

   "context_size": 2,
   "variant": "word",
   "language": "ast",
+  "unique_contexts": 15634564,
+  "total_transitions": 75449217
 }

models/word_markov/ast_markov_ctx3_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3d70d8996da08da859f83f17208f24ec5fe8a079a8cdc33b458965bdf29e6a65
-size 863814981

 version https://git-lfs.github.com/spec/v1
+oid sha256:dd3378d79db7d6144445a3f22f70dd19d9f2db6b768c7e3368bcecdeffa20cbe
+size 807770212

models/word_markov/ast_markov_ctx3_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "word",
   "language": "ast",
-  "unique_contexts": 42031886,
-  "total_transitions": 100990260
 }

   "context_size": 3,
   "variant": "word",
   "language": "ast",
+  "unique_contexts": 40202890,
+  "total_transitions": 75312653
 }

models/word_markov/ast_markov_ctx4_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:edd90f355e0db9ed093dd1b20ea832c072fe6c4eb8eda9897b82053102ed9e22
-size 1203044127

 version https://git-lfs.github.com/spec/v1
+oid sha256:9217b205ffd93647009467d670ec677b3039ae8a9d817ec195bc651d54e1f8c6
+size 1060990691

models/word_markov/ast_markov_ctx4_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "word",
   "language": "ast",
-  "unique_contexts": 66322442,
-  "total_transitions": 100852034
 }

   "context_size": 4,
   "variant": "word",
   "language": "ast",
+  "unique_contexts": 57817277,
+  "total_transitions": 75176089
 }

models/word_ngram/ast_2gram_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4498d3f067003db47603a9cd549742083283922e1e3bcfac7e995862e92c13a0
-size 22263161

 version https://git-lfs.github.com/spec/v1
+oid sha256:6de399f50958b26f56a582db88426ccf2fe013228b455ccdf670d96c90d70911
+size 19677216

models/word_ngram/ast_2gram_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "word",
   "language": "ast",
-  "unique_ngrams": 1568799,
-  "total_ngrams": 101266716
 }

   "n": 2,
   "variant": "word",
   "language": "ast",
+  "unique_ngrams": 1354323,
+  "total_ngrams": 75585781
 }

models/word_ngram/ast_3gram_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77d87442f951bfee562611f31c08d2afebd188296c7dc1f3fc4691abc873c580
-size 61024149

 version https://git-lfs.github.com/spec/v1
+oid sha256:d09ec9dd887391dcc7d6cf6c2c1d8f84dece68e3a9cba2f5b80d23407481818d
+size 46625187

models/word_ngram/ast_3gram_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "word",
   "language": "ast",
-  "unique_ngrams": 3974147,
-  "total_ngrams": 101128488
 }

   "n": 3,
   "variant": "word",
   "language": "ast",
+  "unique_ngrams": 2908394,
+  "total_ngrams": 75449217
 }

models/word_ngram/ast_4gram_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5a09cc0505f23b8e2addb0fd5ea158db1477037a7668eedab235820dfbcb3df5
-size 117588970