omarkamali committed on Jan 3

Commit

06c77fe

verified ·

1 Parent(s): 966f45e

Upload all models and assets for ace (20251001)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

README.md +307 -136
models/embeddings/monolingual/ace_128d.bin +2 -2
models/embeddings/monolingual/ace_128d_metadata.json +5 -3
models/embeddings/monolingual/ace_32d.bin +2 -2
models/embeddings/monolingual/ace_32d_metadata.json +5 -3
models/embeddings/monolingual/ace_64d.bin +2 -2
models/embeddings/monolingual/ace_64d_metadata.json +5 -3
models/subword_markov/ace_markov_ctx1_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/ace_markov_ctx2_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/ace_markov_ctx3_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/ace_markov_ctx4_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/ace_2gram_subword.parquet +2 -2
models/subword_ngram/ace_2gram_subword_metadata.json +2 -2
models/subword_ngram/ace_3gram_subword.parquet +2 -2
models/subword_ngram/ace_3gram_subword_metadata.json +2 -2
models/subword_ngram/ace_4gram_subword.parquet +2 -2
models/subword_ngram/ace_4gram_subword_metadata.json +2 -2
models/tokenizer/ace_tokenizer_16k.model +2 -2
models/tokenizer/ace_tokenizer_16k.vocab +0 -0
models/tokenizer/ace_tokenizer_32k.model +2 -2
models/tokenizer/ace_tokenizer_32k.vocab +0 -0
models/tokenizer/ace_tokenizer_64k.model +2 -2
models/tokenizer/ace_tokenizer_64k.vocab +0 -0
models/tokenizer/ace_tokenizer_8k.model +2 -2
models/tokenizer/ace_tokenizer_8k.vocab +0 -0
models/vocabulary/ace_vocabulary.parquet +2 -2
models/vocabulary/ace_vocabulary_metadata.json +10 -9
models/word_markov/ace_markov_ctx1_word.parquet +2 -2
models/word_markov/ace_markov_ctx1_word_metadata.json +2 -2
models/word_markov/ace_markov_ctx2_word.parquet +2 -2
models/word_markov/ace_markov_ctx2_word_metadata.json +2 -2
models/word_markov/ace_markov_ctx3_word.parquet +2 -2
models/word_markov/ace_markov_ctx3_word_metadata.json +2 -2
models/word_markov/ace_markov_ctx4_word.parquet +2 -2
models/word_markov/ace_markov_ctx4_word_metadata.json +2 -2
models/word_ngram/ace_2gram_word.parquet +2 -2
models/word_ngram/ace_2gram_word_metadata.json +2 -2
models/word_ngram/ace_3gram_word.parquet +2 -2
models/word_ngram/ace_3gram_word_metadata.json +2 -2
models/word_ngram/ace_4gram_word.parquet +2 -2
models/word_ngram/ace_4gram_word_metadata.json +2 -2
visualizations/embedding_isotropy.png +0 -0
visualizations/embedding_norms.png +0 -0
visualizations/embedding_similarity.png +2 -2
visualizations/markov_branching.png +0 -0
visualizations/markov_contexts.png +0 -0

README.md CHANGED Viewed

@@ -23,14 +23,14 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 4.814
   - name: best_isotropy
     type: isotropy
-    value: 0.5452
   - name: vocabulary_size
     type: vocab
-    value: 16834
-generated: 2025-12-27
 ---
 # ACE - Wikilangs Models
@@ -44,12 +44,13 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
-- N-gram models (2, 3, 4-gram)
-- Markov chains (context of 1, 2, 3 and 4)
 - Subword N-gram and Markov chains
-- Embeddings in various sizes and dimensions
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
@@ -59,7 +60,8 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Summary & Recommendations](#6-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -68,55 +70,57 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.989x | 3.94 | 0.2625% | 138,651 |
-| **16k** | 4.326x | 4.27 | 0.2847% | 127,870 |
-| **32k** | 4.588x | 4.53 | 0.3019% | 120,551 |
-| **64k** | 4.814x 🏆 | 4.76 | 0.3168% | 114,908 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `Jeremiah "Jerry" O'Connell (1974 – ) nakeuh sidroe aktor asay Amirika Syarikat.
-...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁jer em iah ▁" j er ry " ▁o ' ... (+20 more)` | 30 |
-| 16k | `▁jer em iah ▁" j er ry " ▁o ' ... (+19 more)` | 29 |
-| 32k | `▁jer em iah ▁" jerry " ▁o ' con nell ... (+17 more)` | 27 |
-| 64k | `▁jeremiah ▁" jerry " ▁o ' connell ▁( 1 9 ... (+14 more)` | 24 |
-**Sample 2:** `Darul Aman nakeuh saboh gampông nyang na lam keucamatan Permata, Kabupaten Bener...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁darul ▁aman ▁nakeuh ▁saboh ▁gampông ▁nyang ▁na ▁lam ▁keucamatan ▁permata ... (+10 more)` | 20 |
-| 16k | `▁darul ▁aman ▁nakeuh ▁saboh ▁gampông ▁nyang ▁na ▁lam ▁keucamatan ▁permata ... (+10 more)` | 20 |
-| 32k | `▁darul ▁aman ▁nakeuh ▁saboh ▁gampông ▁nyang ▁na ▁lam ▁keucamatan ▁permata ... (+10 more)` | 20 |
-| 64k | `▁darul ▁aman ▁nakeuh ▁saboh ▁gampông ▁nyang ▁na ▁lam ▁keucamatan ▁permata ... (+10 more)` | 20 |
-**Sample 3:** `Nè
-Kawan:Gampông di Subulussalam
-Kawan:Longkib, Subulussalam`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁nè ▁kawan : gampông ▁di ▁subulussalam ▁kawan : longkib , ... (+1 more)` | 11 |
-| 16k | `▁nè ▁kawan : gampông ▁di ▁subulussalam ▁kawan : longkib , ... (+1 more)` | 11 |
-| 32k | `▁nè ▁kawan : gampông ▁di ▁subulussalam ▁kawan : longkib , ... (+1 more)` | 11 |
-| 64k | `▁nè ▁kawan : gampông ▁di ▁subulussalam ▁kawan : longkib , ... (+1 more)` | 11 |
 ### Key Findings
-- **Best Compression:** 64k achieves 4.814x compression
-- **Lowest UNK Rate:** 8k with 0.2625% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -125,57 +129,89 @@ Kawan:Longkib, Subulussalam`
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
-| N-gram | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
-|--------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | 936 🏆 | 9.87 | 9,992 | 52.8% | 80.2% |
-| **2-gram** | 261 🏆 | 8.03 | 2,689 | 68.6% | 99.2% |
-| **3-gram** | 1,100 | 10.10 | 14,167 | 50.6% | 79.4% |
-| **3-gram** | 1,398 | 10.45 | 17,865 | 36.2% | 82.1% |
-| **4-gram** | 1,474 | 10.53 | 24,432 | 49.2% | 75.0% |
-| **4-gram** | 4,053 | 11.98 | 71,115 | 25.9% | 65.3% |
 ### Top 5 N-grams by Size
-**2-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `kawan :` | 14,756 |
-| 2 | `bak laman` | 7,389 |
-| 3 | `gunong nyoe` | 7,388 |
-| 4 | `gampông di` | 6,982 |
-| 5 | `. nè` | 6,946 |
-**3-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
 | 1 | `gunong nyoe bak` | 5,541 |
-| 2 | `nè kawan :` | 4,508 |
-| 3 | `. nè kawan` | 4,441 |
-| 4 | `nyoe bak laman` | 3,694 |
-| 5 | `, acèh .` | 3,576 |
-**4-grams:**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `. nè kawan :` | 4,441 |
-| 2 | `gunong nyoe bak laman` | 3,694 |
-| 3 | `. lumbôi gampông nyoe` | 3,567 |
-| 4 | `acèh . lumbôi gampông` | 3,564 |
-| 5 | `, acèh . lumbôi` | 3,564 |
 ### Key Findings
-- **Best Perplexity:** 2-gram with 261
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~65% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -183,55 +219,86 @@ Kawan:Longkib, Subulussalam`
 ![Markov Entropy](visualizations/markov_entropy.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
-| Context | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
-|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | 0.7141 | 1.640 | 4.47 | 38,800 | 28.6% |
-| **1** | 1.0130 | 2.018 | 7.11 | 1,076 | 0.0% |
-| **2** | 0.2550 | 1.193 | 1.56 | 172,855 | 74.5% |
-| **2** | 0.8567 | 1.811 | 4.89 | 7,655 | 14.3% |
-| **3** | 0.0849 | 1.061 | 1.15 | 269,187 | 91.5% |
-| **3** | 0.7773 | 1.714 | 3.54 | 37,427 | 22.3% |
-| **4** | 0.0363 🏆 | 1.025 | 1.07 | 309,167 | 96.4% |
-| **4** | 0.5505 🏆 | 1.465 | 2.29 | 132,546 | 44.9% |
-### Generated Text Samples
-Below are text samples generated from each Markov chain model:
 **Context Size 1:**
-1. `. kawan : kalimantan seulatan china ) dan la ' el republik indonesia . bab manga`
-2. `, bah seulamat nibak watèe jitamong lam data cuaca daerah gunong nyoe geuneuk teuka nibak limong`
-3. `di kabupaten acèh . kawan : lawe sumur nakeuh saboh nanggroe nyang geuseumurat lé shogakkukan .`
 **Context Size 2:**
-1. `kawan : langsa lama langsa timur ( langsa timur ( langsa timur ) nakeuh sidroe aktor asay`
-2. `bak laman nasa data matauroe teubiet & teunom di da ' irah bak laman nasa data matauroe`
-3. `gunong nyoe bak wikidata data cuaca daerah gunong nyoe bak wikidata data cuaca daerah gunong nyoe ba...`
 **Context Size 3:**
-1. `gunong nyoe bak laman geonames data gunong nyoe bak laman geonames data gunong nyoe bak wikidata dat...`
-2. `nè kawan : gampông di pidie jaya kawan : meurah dua , pidie jaya . ngon keudèe panté`
-3. `. nè kawan : gampông di simeulue kawan : teupah teungöh , kabupatèn simeulue , acèh . lumbôi`
 **Context Size 4:**
-1. `. nè kawan : gampông di acèh rayek kawan : darussalam , acèh rayek . meunurôt riwayat , kuta`
-2. `gunong nyoe bak laman nasa data matauroe teubiet & teunom di da ' irah bak laman sunrisesunset . com`
-3. `. lumbôi gampông nyoe lam data peumeurèntah nakeuh 11 . 11 . 03 . 06 . 2040 . nè`
 ### Key Findings
-- **Best Predictability:** Context-4 with 96.4% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (132,546 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -247,64 +314,64 @@ Below are text samples generated from each Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 16,834 |
-| Total Tokens | 576,109 |
-| Mean Frequency | 34.22 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 429.12 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | di | 21,413 |
-| 2 | nakeuh | 20,825 |
-| 3 | bak | 18,295 |
-| 4 | acèh | 17,701 |
-| 5 | kawan | 14,998 |
-| 6 | nyoe | 13,193 |
-| 7 | gampông | 12,105 |
-| 8 | gunong | 11,874 |
-| 9 | data | 11,091 |
-| 10 | nyang | 9,069 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | own | 2 |
-| 2 | became | 2 |
-| 3 | influence | 2 |
-| 4 | across | 2 |
-| 5 | represent | 2 |
-| 6 | raising | 2 |
-| 7 | ceremony | 2 |
-| 8 | flown | 2 |
-| 9 | reconstructions | 2 |
-| 10 | jawatimu | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.1968 |
-| R² (Goodness of Fit) | 0.996896 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 61.4% |
-| Top 1,000 | 83.7% |
-| Top 5,000 | 93.9% |
-| Top 10,000 | 97.4% |
 ### Key Findings
-- **Zipf Compliance:** R²=0.9969 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 61.4% of corpus
-- **Long Tail:** 6,834 words needed for remaining 2.6% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -317,24 +384,125 @@ Below are text samples generated from each Markov chain model:
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
-### Model Comparison
-| Model | Vocab Size | Dimension | Avg Norm | Std Norm | Isotropy |
-|-------|------------|-----------|----------|----------|----------|
-| **mono_32d** | 6,710 | 32 | 3.746 | 0.875 | 0.5452 🏆 |
-| **mono_64d** | 6,710 | 64 | 3.801 | 0.865 | 0.1802 |
-| **mono_128d** | 6,710 | 128 | 3.849 | 0.857 | 0.0333 |
-| **embeddings_enhanced** | 0 | 0 | 0.000 | 0.000 | 0.0000 |
 ### Key Findings
-- **Best Isotropy:** mono_32d with 0.5452 (more uniform distribution)
-- **Dimension Trade-off:** Higher dimensions capture more semantics but reduce isotropy
-- **Vocabulary Coverage:** All models cover 6,710 words
-- **Recommendation:** 100d for balanced semantic capture and efficiency
 ---
-## 6. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
@@ -342,11 +510,12 @@ Below are text samples generated from each Markov chain model:
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **32k BPE** | Best compression (4.81x) with low UNK rate |
-| N-gram | **5-gram** | Lowest perplexity (261) |
-| Markov | **Context-4** | Highest predictability (96.4%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
@@ -536,7 +705,8 @@ If you use these models in your research, please cite:
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
-  publisher = {HuggingFace},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
@@ -552,7 +722,8 @@ MIT License - Free for academic and commercial use.
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2025-12-27 04:33:03*

 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 4.925
   - name: best_isotropy
     type: isotropy
+    value: 0.5172
   - name: vocabulary_size
     type: vocab
+    value: 15502
+generated: 2026-01-03
 ---
 # ACE - Wikilangs Models
 ### Models & Assets
 - Tokenizers (8k, 16k, 32k, 64k)
+- N-gram models (2, 3, 4, 5-gram)
+- Markov chains (context of 1, 2, 3, 4 and 5)
 - Subword N-gram and Markov chains
+- Embeddings in various sizes and dimensions (aligned and unaligned)
 - Language Vocabulary
 - Language Statistics
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 ### Analysis and Evaluation
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
+- [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 ![Tokenizer Compression](visualizations/tokenizer_compression.png)
+![Tokenizer Fertility](visualizations/tokenizer_fertility.png)
+![Tokenizer OOV](visualizations/tokenizer_oov.png)
+![Total Tokens](visualizations/tokenizer_total_tokens.png)
 ### Results
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 4.119x | 4.13 | 0.2682% | 125,632 |
+| **16k** | 4.488x | 4.50 | 0.2923% | 115,301 |
+| **32k** | 4.727x | 4.74 | 0.3079% | 109,452 |
+| **64k** | 4.925x 🏆 | 4.93 | 0.3208% | 105,066 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `Mukim Sepakat nakeuh saboh mukim di keucamatan Lawe Sigala-Gala Kabupatèn Acèh T...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
+| 16k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
+| 32k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
+| 64k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
+**Sample 2:** `Propinsi Nakhon Ratchasima nakeuh saboh propinsi di timu baroh Muangthai. Nang n...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁propinsi ▁nakhon ▁ratch asi ma ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ... (+11 more)` | 21 |
+| 16k | `▁propinsi ▁nakhon ▁ratchasima ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ▁baroh ▁muangthai ... (+7 more)` | 17 |
+| 32k | `▁propinsi ▁nakhon ▁ratchasima ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ▁baroh ▁muangthai ... (+7 more)` | 17 |
+| 64k | `▁propinsi ▁nakhon ▁ratchasima ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ▁baroh ▁muangthai ... (+7 more)` | 17 |
+**Sample 3:** `Kandang nakeuh gampông di Keucamatan Samalanga, Kabupatèn Bireuen, Acèh. Lumbôi ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
+| 16k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
+| 32k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
+| 64k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
 ### Key Findings
+- **Best Compression:** 64k achieves 4.925x compression
+- **Lowest UNK Rate:** 8k with 0.2682% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 ![N-gram Perplexity](visualizations/ngram_perplexity.png)
+![N-gram Unique](visualizations/ngram_unique.png)
 ![N-gram Coverage](visualizations/ngram_coverage.png)
 ### Results
+| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
+|--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 637 | 9.32 | 7,009 | 62.6% | 83.4% |
+| **2-gram** | Subword | 224 🏆 | 7.80 | 2,204 | 71.8% | 99.5% |
+| **3-gram** | Word | 577 | 9.17 | 8,214 | 65.4% | 85.5% |
+| **3-gram** | Subword | 1,194 | 10.22 | 14,605 | 37.9% | 84.9% |
+| **4-gram** | Word | 673 | 9.39 | 12,805 | 64.5% | 83.7% |
+| **4-gram** | Subword | 3,551 | 11.79 | 59,251 | 26.2% | 67.5% |
 ### Top 5 N-grams by Size
+**2-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `bak laman` | 7,389 |
+| 2 | `gunong nyoe` | 7,388 |
+| 3 | `nyoe bak` | 5,543 |
+| 4 | `nakeuh saboh` | 5,045 |
+| 5 | `di acèh` | 4,748 |
+**3-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
 | 1 | `gunong nyoe bak` | 5,541 |
+| 2 | `nyoe bak laman` | 3,694 |
+| 3 | `lumbôi gampông nyoe` | 3,567 |
+| 4 | `acèh lumbôi gampông` | 3,564 |
+| 5 | `nyoe lam data` | 3,499 |
+**4-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `gunong nyoe bak laman` | 3,694 |
+| 2 | `acèh lumbôi gampông nyoe` | 3,564 |
+| 3 | `nyoe lam data peumeurèntah` | 3,499 |
+| 4 | `gampông nyoe lam data` | 3,499 |
+| 5 | `lam data peumeurèntah nakeuh` | 3,499 |
+**2-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `e u` | 117,818 |
+| 2 | `_ n` | 79,411 |
+| 3 | `a n` | 69,436 |
+| 4 | `h _` | 68,029 |
+| 5 | `n g` | 67,573 |
+**3-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `n g _` | 44,439 |
+| 2 | `_ n a` | 31,640 |
+| 3 | `_ b a` | 30,463 |
+| 4 | `k e u` | 30,322 |
+| 5 | `_ n y` | 26,537 |
+**4-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `e u h _` | 23,348 |
+| 2 | `b a k _` | 23,260 |
+| 3 | `_ d i _` | 21,144 |
+| 4 | `k e u h` | 21,117 |
+| 5 | `a k e u` | 20,691 |
 ### Key Findings
+- **Best Perplexity:** 2-gram (subword) with 224
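 - **Entropy Trend:** Increases with larger n-grams for subword models (7.80 → 11.79) and stays roughly flat for word models (9.17–9.39); larger n-grams are not more predictable on this corpus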
+- **Coverage:** Top-1000 patterns cover ~68% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 ![Markov Entropy](visualizations/markov_entropy.png)
+![Markov Contexts](visualizations/markov_contexts.png)
 ![Markov Branching](visualizations/markov_branching.png)
 ### Results
+| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
+|---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.7515 | 1.684 | 4.35 | 36,025 | 24.8% |
+| **1** | Subword | 0.8633 | 1.819 | 5.38 | 1,269 | 13.7% |
+| **2** | Word | 0.2148 | 1.161 | 1.44 | 155,224 | 78.5% |
+| **2** | Subword | 0.7739 | 1.710 | 4.50 | 6,822 | 22.6% |
+| **3** | Word | 0.0655 | 1.046 | 1.11 | 221,018 | 93.4% |
+| **3** | Subword | 0.7559 | 1.689 | 3.54 | 30,615 | 24.4% |
+| **4** | Word | 0.0242 🏆 | 1.017 | 1.04 | 242,720 | 97.6% |
+| **4** | Subword | 0.5660 | 1.480 | 2.36 | 108,223 | 43.4% |
+### Generated Text Samples (Word-based)
+Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
+1. `di pidie acèh timu acèh indonesia the colour of life seuneubeuet bak saboh spèsiès nibak takson`
+2. `nakeuh gunong nyoe geupeuteubiet bak wikidata data peumeurèntah nakeuh gunong di teungoh ngon geukeu...`
+3. `bak wikidata data matauroe teubiet teunom di ateuh babah la ôt peunawôt luwa data gunong nyoe`
 **Context Size 2:**
+1. `bak laman sunrisesunset com di acèh seulatan acèh lumbôi gampông nyoe lam data peumeurèntah nakeuh n...`
+2. `gunong nyoe bak laman geonames data gunong nyoe bak laman sunrisesunset com di acèh nakeuh gampông d...`
+3. `nyoe bak wikidata data cuaca daerah gunong nyoe nakeuh kagoshima banda`
 **Context Size 3:**
+1. `gunong nyoe bak laman geonames data gunong nyoe bak wikidata data cuaca daerah gunong nyoe bak wikid...`
+2. `nyoe bak laman geonames data gunong nyoe bak laman geonames data gunong nyoe bak wikidata data cuaca...`
+3. `lumbôi gampông nyoe lam data peumeurèntah nakeuh nè di acèh rayek kawan ingin jaya acèh rayek nibak ...`
 **Context Size 4:**
+1. `gunong nyoe bak laman nasa data matauroe teubiet teunom di da irah bak laman sunrisesunset com di ac...`
+2. `acèh lumbôi gampông nyoe lam data peumeurèntah nakeuh nè di acèh rayek acèh acèh rayek`
+3. `nyoe lam data peumeurèntah nakeuh nè di bireuen bireuen`
+### Generated Text Samples (Subword-based)
+Below are text samples generated from each subword-based Markov chain model:
+**Context Size 1:**
+1. `_da_geriè_kahara`
+2. `ata_jeetabam_lab`
+3. `ng_ngeung_teukeu`
+**Context Size 2:**
+1. `euna_preunomyza_d`
+2. `_nya_-_diet_lis_a`
+3. `h_nak_lam_diversi`
+**Context Size 3:**
+1. `ng_udeh_nyoe_lam_d`
+2. `_nakeuh_spèsi_acèh`
+3. `_bagiang_bak_lagèe`
+**Context Size 4:**
+1. `euh_tarèh_seuë_deun`
+2. `bak_encyclopedia_of`
+3. `_di_surat_lé_gosho_`
 ### Key Findings
+- **Best Predictability:** Context-4 (word) with 97.6% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (108,223 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 15,502 |
+| Total Tokens | 515,006 |
+| Mean Frequency | 33.22 |
 | Median Frequency | 3 |
+| Frequency Std Dev | 415.97 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | di | 21,196 |
+| 2 | nakeuh | 20,604 |
+| 3 | bak | 18,159 |
+| 4 | acèh | 17,511 |
+| 5 | nyoe | 13,184 |
+| 6 | data | 11,090 |
+| 7 | gunong | 10,023 |
+| 8 | nyang | 9,025 |
+| 9 | gampông | 8,794 |
+| 10 | lam | 7,941 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | saûdep | 2 |
+| 2 | teuleungah | 2 |
+| 3 | mutuskeun | 2 |
+| 4 | ekshumasi | 2 |
+| 5 | teukeuh | 2 |
+| 6 | dilegalisasikan | 2 |
+| 7 | jendela | 2 |
+| 8 | prosès | 2 |
+| 9 | piazza | 2 |
+| 10 | fontana | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 1.1704 |
+| R² (Goodness of Fit) | 0.995382 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 63.2% |
+| Top 1,000 | 84.2% |
+| Top 5,000 | 94.2% |
+| Top 10,000 | 97.8% |
 ### Key Findings
+- **Zipf Compliance:** R²=0.9954 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 63.2% of corpus
+- **Long Tail:** 5,502 words needed for remaining 2.2% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ![t-SNE Sentences](visualizations/tsne_sentences.png)
+### 5.1 Cross-Lingual Alignment
+> *Note: Multilingual alignment visualization not available for this language.*
+### 5.2 Model Comparison
+| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
+|-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.5172 🏆 | 0.4104 | N/A | N/A |
+| **mono_64d** | 64 | 0.1209 | 0.4362 | N/A | N/A |
+| **mono_128d** | 128 | 0.0271 | 0.4092 | N/A | N/A |
 ### Key Findings
+- **Best Isotropy:** mono_32d with 0.5172 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.4186. Lower values indicate better semantic separation.
+- **Alignment Quality:** No aligned models evaluated in this run.
+- **Recommendation:** 128d aligned for best cross-lingual performance
 ---
+## 6. Morphological Analysis (Experimental)
+> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
+This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
+### 6.1 Productivity & Complexity
+| Metric | Value | Interpretation | Recommendation |
+|--------|-------|----------------|----------------|
+| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
+| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
+### 6.2 Affix Inventory (Productive Units)
+These are the most productive prefixes and suffixes identified by sampling the vocabulary for global substitutability patterns. A unit is considered an affix if stripping it leaves a valid stem that appears in other contexts.
+#### Productive Prefixes
+| Prefix | Examples |
+|--------|----------|
+| `me-` | meulagu, meukeunong, meulabôh |
+| `ge-` | geumeuhoi, geupasoe, geupeuresmi |
+| `geu-` | geumeuhoi, geupasoe, geupeuresmi |
+| `meu-` | meulagu, meukeunong, meulabôh |
+| `pe-` | peunuman, peureudee, peumurah |
+#### Productive Suffixes
+| Suffix | Examples |
+|--------|----------|
+| `-ng` | meukeunong, gelampang, seberang |
+| `-an` | jonathan, peunuman, kyrgyzstan |
+| `-ah` | bawah, geupeujeulah, jumlah |
+### 6.3 Bound Stems (Lexical Roots)
+Bound stems are high-frequency subword units that are semantically cohesive but rarely appear as standalone words. These often correspond to the 'core' of a word that requires inflection or derivation to be valid.
+| Stem | Cohesion | Substitutability | Examples |
+|------|----------|------------------|----------|
+| `eung` | 1.41x | 64 contexts | reung, meung, jeung |
+| `uneu` | 1.70x | 28 contexts | runeu, uneun, seuneu |
+| `euen` | 1.54x | 38 contexts | eueng, meuen, leuen |
+| `euna` | 1.36x | 59 contexts | peuna, beuna, keuna |
+| `ubeu` | 1.47x | 22 contexts | ubeut, neubeu, ubeuet |
+| `umeu` | 1.44x | 23 contexts | jumeu, geumeu, jeumeu |
+| `meur` | 1.63x | 15 contexts | meuri, meurô, meurôn |
+| `anga` | 1.36x | 23 contexts | panga, manga, langa |
+| `teun` | 1.32x | 25 contexts | uteun, ateung, teunga |
+| `neub` | 1.57x | 14 contexts | neuba, neubeu, neubôk |
+| `eube` | 1.48x | 16 contexts | leube, teubee, leubeh |
+| `eune` | 1.63x | 12 contexts | seuneu, geuneu, keuneu |
+### 6.4 Affix Compatibility (Co-occurrence)
+This table shows which prefixes and suffixes most frequently co-occur on the same stems, revealing the 'stacking' rules of the language's morphology.
+| Prefix | Suffix | Frequency | Examples |
+|--------|--------|-----------|----------|
+| `ge-` | `-ng` | 56 words | geupeutrang, geudông |
+| `pe-` | `-an` | 51 words | penyiaran, permukaan |
+| `me-` | `-ng` | 40 words | meulinteueng, meuhubông |
+| `pe-` | `-ng` | 22 words | perang, peukeumang |
+| `pe-` | `-ah` | 18 words | peujeunajah, peuleumah |
+| `ge-` | `-ah` | 17 words | geupeuglah, geupeuluwah |
+| `me-` | `-ah` | 16 words | meujumeulah, meurah |
+| `me-` | `-an` | 10 words | meridian, meukeujadian |
+| `ge-` | `-an` | 6 words | geurakan, geuritan |
+### 6.5 Recursive Morpheme Segmentation
+Using **Recursive Hierarchical Substitutability**, we decompose complex words into their constituent morphemes. This approach handles nested affixes (e.g., `prefix-prefix-root-suffix`).
+| Word | Suggested Split | Confidence | Stem |
+|------|-----------------|------------|------|
+| geumeudong | **`geu-meu-dong`** | 6.0 | `dong` |
+| geumeututô | **`geu-meu-tutô`** | 6.0 | `tutô` |
+| meubileueng | **`meu-bileue-ng`** | 6.0 | `bileue` |
+| geulumbang | **`geu-lumba-ng`** | 6.0 | `lumba` |
+| geumeupakat | **`geu-meu-pakat`** | 6.0 | `pakat` |
+| geumeuniaga | **`geu-meu-niaga`** | 6.0 | `niaga` |
+| geumeuturi | **`geu-meu-turi`** | 6.0 | `turi` |
+| geuseubarô | **`geu-seubarô`** | 4.5 | `seubarô` |
+| geudapeuta | **`geu-dapeuta`** | 4.5 | `dapeuta` |
+| meusampoe | **`meu-sampoe`** | 4.5 | `sampoe` |
+| geubayeuë | **`geu-bayeuë`** | 4.5 | `bayeuë` |
+| meulingka | **`meu-lingka`** | 4.5 | `lingka` |
+| meusiyasat | **`meu-siyasat`** | 4.5 | `siyasat` |
+| meulaksana | **`meu-laksana`** | 4.5 | `laksana` |
+| geubayeue | **`geu-bayeue`** | 4.5 | `bayeue` |
+### 6.6 Linguistic Interpretation
+> **Automated Insight:**
+The language ACE appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
+---
+## 7. Summary & Recommendations
 ![Performance Dashboard](visualizations/performance_dashboard.png)
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
+| Tokenizer | **64k BPE** | Best compression (4.92x) |
+| N-gram | **2-gram** | Lowest perplexity (224) |
+| Markov | **Context-4** | Highest predictability (97.6%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 ---
 ## Appendix: Metrics Glossary & Interpretation Guide
   author = {Kamali, Omar},
   title = {Wikilangs: Open NLP Models for Wikipedia Languages},
   year = {2025},
+  doi = {10.5281/zenodo.18073153},
+  publisher = {Zenodo},
   url = {https://huggingface.co/wikilangs},
   institution = {Omneity Labs}
 }
 - 🤗 Models: [huggingface.co/wikilangs](https://huggingface.co/wikilangs)
 - 📊 Data: [wikipedia-monthly](https://huggingface.co/datasets/omarkamali/wikipedia-monthly)
 - 👤 Author: [Omar Kamali](https://huggingface.co/omarkamali)
+- 🤝 Sponsor: [Featherless AI](https://featherless.ai)
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-03 05:05:30*

models/embeddings/monolingual/ace_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2046e3b06e8436b76db682bbcbcff52a79ae03beb79e02a5c143feb4aa30e845
-size 1030981956

 version https://git-lfs.github.com/spec/v1
+oid sha256:0cf768b8cc26ee028b74fbc795a0084120d5ec9fa9ee7265d5277f3228676402
+size 1030413699

models/embeddings/monolingual/ace_128d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 128,
   "version": "monolingual",
   "training_params": {
-    "dim": 128,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 6710
 }

   "dimension": 128,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 128
   },
+  "vocab_size": 6165
 }

models/embeddings/monolingual/ace_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:865bc53096b61029d31129feb2421f97605a4e1b1fc71a16f09ae2a11377dc30
-size 257828676

 version https://git-lfs.github.com/spec/v1
+oid sha256:b69885b365da465ab4bdfb040ee28e5b1f011ab235bd18a0e87c25eae3925f8f
+size 257678979

models/embeddings/monolingual/ace_32d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 32,
   "version": "monolingual",
   "training_params": {
-    "dim": 32,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 6710
 }

   "dimension": 32,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 32
   },
+  "vocab_size": 6165
 }

models/embeddings/monolingual/ace_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e407ad194bf72cb228e2d6744a4203d66d913abff54ac372bb7ee406098236c3
-size 515546436

 version https://git-lfs.github.com/spec/v1
+oid sha256:79de1c214df5334f0fa005b795ccfce46a4ac070f1ea7654cfaab5c2dc34dda5
+size 515257219

models/embeddings/monolingual/ace_64d_metadata.json CHANGED Viewed

@@ -3,11 +3,13 @@
   "dimension": 64,
   "version": "monolingual",
   "training_params": {
-    "dim": 64,
     "min_count": 5,
     "window": 5,
     "negative": 5,
-    "epochs": 5
   },
-  "vocab_size": 6710
 }

   "dimension": 64,
   "version": "monolingual",
   "training_params": {
+    "algorithm": "skipgram",
     "min_count": 5,
     "window": 5,
     "negative": 5,
+    "epochs": 5,
+    "encoding_method": "rope",
+    "dim": 64
   },
+  "vocab_size": 6165
 }

models/subword_markov/ace_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:994945c1c8459c3679e3e00e0210f36d27f7b92323ac2b7120e6d2efd5d2d126
-size 61873

 version https://git-lfs.github.com/spec/v1
+oid sha256:0f3a9abf57afb060a211a69e96c23c22c115a944e460bb459448be1adea274b0
+size 59763

models/subword_markov/ace_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 1076,
-  "total_transitions": 3908719
 }

   "context_size": 1,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 1269,
+  "total_transitions": 3470466
 }

models/subword_markov/ace_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c7276dd9bca94f9f03917ff721ff8bf88482f94093dfa6019b62501feb464811
-size 312446

 version https://git-lfs.github.com/spec/v1
+oid sha256:d76a3d707c4ae33b25f22e9cbdcb78745638dfcdc99ed80df41d977cbb81cca0
+size 268452

models/subword_markov/ace_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 7655,
-  "total_transitions": 3895602
 }

   "context_size": 2,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 6822,
+  "total_transitions": 3457577
 }

models/subword_markov/ace_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fa959fe08fc66a84cabcff2aea19b15147f3ed325699ecddb1034ee1e56ba332
-size 1020456

 version https://git-lfs.github.com/spec/v1
+oid sha256:652d2af547c89ae398ecf44a87cac49a050a3836bbeb328bda5d380d30c7c8c0
+size 900905

models/subword_markov/ace_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 37427,
-  "total_transitions": 3882485
 }

   "context_size": 3,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 30615,
+  "total_transitions": 3444688
 }

models/subword_markov/ace_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0ae4f621898a6636e9b6c7126e341387c206ed707f6ed0fba958ebfe9fc9fbb4
-size 2470198

 version https://git-lfs.github.com/spec/v1
+oid sha256:32adbfca2120175fa43f1b0bd2cedc5d7bd19099677319e6128cab29e66007d4
+size 2085174

models/subword_markov/ace_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 132546,
-  "total_transitions": 3869368
 }

   "context_size": 4,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 108223,
+  "total_transitions": 3431799
 }

models/subword_ngram/ace_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4a5e9b80103fed58ac45a1a1dc5018d7bf762a47692e47d327a50b683698f3ad
-size 36523

 version https://git-lfs.github.com/spec/v1
+oid sha256:0e6d83ed77f7cb491e9f8059b986f03ae648a2fb694c94ba29ac349e3055f671
+size 30955

models/subword_ngram/ace_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "ace",
-  "unique_ngrams": 2689,
-  "total_ngrams": 3908719
 }

   "n": 2,
   "variant": "subword",
   "language": "ace",
+  "unique_ngrams": 2204,
+  "total_ngrams": 3470466
 }

models/subword_ngram/ace_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8d4bacbf3031b3897258cf39672bf0992a78d116a540da2c29da63b6fe9f5442
-size 218763

 version https://git-lfs.github.com/spec/v1
+oid sha256:5f58aa5cf358032d5b4d7a52f7a9b81c4a9bcf9dda5c142a65fdae14ae7135f4
+size 178456

models/subword_ngram/ace_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "ace",
-  "unique_ngrams": 17865,
-  "total_ngrams": 3895602
 }

   "n": 3,
   "variant": "subword",
   "language": "ace",
+  "unique_ngrams": 14605,
+  "total_ngrams": 3457577
 }

models/subword_ngram/ace_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1535b59d3ffbe8075678699c37413a180fe8e0b10d9a35c6873de3dbef8e23f7
-size 844170

 version https://git-lfs.github.com/spec/v1
+oid sha256:529d84e3eda9c7669ad3732639f36554c4e26803ae7ddfb8b4ca0fbcc326fbe7
+size 709589

models/subword_ngram/ace_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "ace",
-  "unique_ngrams": 71115,
-  "total_ngrams": 3882485
 }

   "n": 4,
   "variant": "subword",
   "language": "ace",
+  "unique_ngrams": 59251,
+  "total_ngrams": 3444688
 }

models/tokenizer/ace_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e0486832283fb4af24e9a049fd7267139c87b145310f599caa110fb2824272a2
-size 502107

 version https://git-lfs.github.com/spec/v1
+oid sha256:772eccdeefe417a39cb6d9efddc039fe1535ce5b6b9728ed7a7a9f133127fb6c
+size 503799

models/tokenizer/ace_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ace_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:68ba228d17ef3b883e0874b7c05972fa9a2da79bae088ee77a799ec8ba684117
-size 780590

 version https://git-lfs.github.com/spec/v1
+oid sha256:1fc76a32a006f73f304c48abbc3874f1bbba66904ea507a5989c19d84064eb2b
+size 784975

models/tokenizer/ace_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ace_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6eaefce9e3832566d4b03402b226a5055d116d560a4b6f5c9d79e2f9014b5be2
-size 1365161

 version https://git-lfs.github.com/spec/v1
+oid sha256:638ca8a1f7bfb0862f2a4e2f334f6a0b19cd0775fcc9ea20c4d6f11edfca90d1
+size 1328218

models/tokenizer/ace_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ace_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8afabfa554db88a64b6659881e7d87e20f7717006505bdc6bbb199898a6e53a2
-size 371036

 version https://git-lfs.github.com/spec/v1
+oid sha256:4f2cf6a3125500fbc0a6568259881ab1bdc31c6fd6b01f32fd845acf44e03d4a
+size 371130

models/tokenizer/ace_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/ace_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0cdcdc6259363ef9c7e4d76eeee62ba0c6f4da7b3a86d0e338193a83e61e901a
-size 279594

 version https://git-lfs.github.com/spec/v1
+oid sha256:2ba6b44c1dbf28322c8da89370cfc09afb1c3724b0f3302b03929700e5dc3332
+size 251932

models/vocabulary/ace_vocabulary_metadata.json CHANGED Viewed

@@ -1,16 +1,17 @@
 {
   "language": "ace",
-  "vocabulary_size": 16834,
   "statistics": {
-    "type_token_ratio": 0.06473562926222352,
     "coverage": {
-      "top_100": 0.591821547661651,
-      "top_1000": 0.8065606887117759,
-      "top_5000": 0.9046783703966313,
-      "top_10000": 0.9388346884375219
     },
-    "hapax_count": 21877,
-    "hapax_ratio": 0.5651365245020795,
-    "total_documents": 13117
   }
 }

 {
   "language": "ace",
+  "vocabulary_size": 15502,
+  "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.06761988315009426,
     "coverage": {
+      "top_100": 0.6077445728258638,
+      "top_1000": 0.8095215873667706,
+      "top_5000": 0.9057864969294234,
+      "top_10000": 0.9405017452821384
     },
+    "hapax_count": 20724,
+    "hapax_ratio": 0.5720753050295369,
+    "total_documents": 12889
   }
 }

models/word_markov/ace_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:064d6047862cf68da2b906472e39980d8a5b9be018d078f381c09a061281db10
-size 1341704

 version https://git-lfs.github.com/spec/v1
+oid sha256:f453896eb7f6ff049a8bcdf5e17901d47f42c1057f32a9f36b112d6d94fd70e3
+size 1218673

models/word_markov/ace_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "ace",
-  "unique_contexts": 38800,
-  "total_transitions": 712935
 }

   "context_size": 1,
   "variant": "word",
   "language": "ace",
+  "unique_contexts": 36025,
+  "total_transitions": 522841
 }

models/word_markov/ace_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:80655ab7d24dd0562f6dd00bf8454b23ac16d8f6145ab9327656208ee794c040
-size 2960527

 version https://git-lfs.github.com/spec/v1
+oid sha256:5f80eb40706cfeda9139c3371c96005f0fea0733f1ee4c3a2e5eecc815929a6a
+size 2625141

models/word_markov/ace_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "ace",
-  "unique_contexts": 172855,
-  "total_transitions": 699818
 }

   "context_size": 2,
   "variant": "word",
   "language": "ace",
+  "unique_contexts": 155224,
+  "total_transitions": 509952
 }

models/word_markov/ace_markov_ctx3_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c713caa0041630e45e31c03c7212c262abebdd3fc3bd44d99b0c957bc7aa8187
-size 4205022

 version https://git-lfs.github.com/spec/v1
+oid sha256:6af7f232b9a0d1fc8f7b6bc13752d652f933bd8f13f2d7ddea846f7703f26706
+size 3563465

models/word_markov/ace_markov_ctx3_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "word",
   "language": "ace",
-  "unique_contexts": 269187,
-  "total_transitions": 686702
 }

   "context_size": 3,
   "variant": "word",
   "language": "ace",
+  "unique_contexts": 221018,
+  "total_transitions": 497063
 }

models/word_markov/ace_markov_ctx4_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:26e6fd733dee534e51fd8ce9dac5ca06b9de38d8fcabd817bd83cb1b54510e03
-size 4916810