omarkamali committed on Jan 3

Commit

247cf96

verified ·

1 Parent(s): 06c77fe

Upload all models and assets for ace (latest)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
README.md +205 -168
models/embeddings/aligned/ace_128d.bin +3 -0
models/embeddings/aligned/ace_128d.meta.json +1 -0
models/embeddings/aligned/ace_128d.projection.npy +3 -0
models/embeddings/aligned/ace_128d_metadata.json +8 -0
models/embeddings/aligned/ace_32d.bin +3 -0
models/embeddings/aligned/ace_32d.meta.json +1 -0
models/embeddings/aligned/ace_32d.projection.npy +3 -0
models/embeddings/aligned/ace_32d_metadata.json +8 -0
models/embeddings/aligned/ace_64d.bin +3 -0
models/embeddings/aligned/ace_64d.meta.json +1 -0
models/embeddings/aligned/ace_64d.projection.npy +3 -0
models/embeddings/aligned/ace_64d_metadata.json +8 -0
models/embeddings/monolingual/ace_128d.bin +2 -2
models/embeddings/monolingual/ace_128d_metadata.json +1 -1
models/embeddings/monolingual/ace_32d.bin +2 -2
models/embeddings/monolingual/ace_32d_metadata.json +1 -1
models/embeddings/monolingual/ace_64d.bin +2 -2
models/embeddings/monolingual/ace_64d_metadata.json +1 -1
models/subword_markov/ace_markov_ctx1_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/ace_markov_ctx2_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/ace_markov_ctx3_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/ace_markov_ctx4_subword.parquet +2 -2
models/subword_markov/ace_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/ace_2gram_subword.parquet +2 -2
models/subword_ngram/ace_2gram_subword_metadata.json +2 -2
models/subword_ngram/ace_3gram_subword.parquet +2 -2
models/subword_ngram/ace_3gram_subword_metadata.json +2 -2
models/subword_ngram/ace_4gram_subword.parquet +2 -2
models/subword_ngram/ace_4gram_subword_metadata.json +2 -2
models/subword_ngram/ace_5gram_subword.parquet +3 -0
models/subword_ngram/ace_5gram_subword_metadata.json +7 -0
models/tokenizer/ace_tokenizer_16k.model +2 -2
models/tokenizer/ace_tokenizer_16k.vocab +0 -0
models/tokenizer/ace_tokenizer_32k.model +2 -2
models/tokenizer/ace_tokenizer_32k.vocab +0 -0
models/tokenizer/ace_tokenizer_64k.model +2 -2
models/tokenizer/ace_tokenizer_64k.vocab +0 -0
models/tokenizer/ace_tokenizer_8k.model +2 -2
models/tokenizer/ace_tokenizer_8k.vocab +0 -0
models/vocabulary/ace_vocabulary.parquet +2 -2
models/vocabulary/ace_vocabulary_metadata.json +9 -9
models/word_markov/ace_markov_ctx1_word.parquet +2 -2
models/word_markov/ace_markov_ctx1_word_metadata.json +2 -2
models/word_markov/ace_markov_ctx2_word.parquet +2 -2
models/word_markov/ace_markov_ctx2_word_metadata.json +2 -2

.gitattributes CHANGED Viewed

@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text

 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 language: ace
-language_name: ACE
 language_family: austronesian_malay
 tags:
   - wikilangs
@@ -10,11 +10,21 @@ tags:
   - n-gram
   - markov
   - wikipedia
   - monolingual
   - family-austronesian_malay
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
@@ -26,17 +36,17 @@ metrics:
     value: 4.925
   - name: best_isotropy
     type: isotropy
-    value: 0.5172
   - name: vocabulary_size
     type: vocab
     value: 0
 generated: 2026-01-03
 ---
-# ACE - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **ACE** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -80,47 +90,47 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 4.119x | 4.13 | 0.2682% | 125,632 |
-| **16k** | 4.488x | 4.50 | 0.2923% | 115,301 |
-| **32k** | 4.727x | 4.74 | 0.3079% | 109,452 |
-| **64k** | 4.925x 🏆 | 4.93 | 0.3208% | 105,066 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `Mukim Sepakat nakeuh saboh mukim di keucamatan Lawe Sigala-Gala Kabupatèn Acèh T...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
-| 16k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
-| 32k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
-| 64k | `▁mukim ▁sepakat ▁nakeuh ▁saboh ▁mukim ▁di ▁keucamatan ▁lawe ▁sigala - ... (+12 more)` | 22 |
-**Sample 2:** `Propinsi Nakhon Ratchasima nakeuh saboh propinsi di timu baroh Muangthai. Nang n...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁propinsi ▁nakhon ▁ratch asi ma ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ... (+11 more)` | 21 |
-| 16k | `▁propinsi ▁nakhon ▁ratchasima ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ▁baroh ▁muangthai ... (+7 more)` | 17 |
-| 32k | `▁propinsi ▁nakhon ▁ratchasima ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ▁baroh ▁muangthai ... (+7 more)` | 17 |
-| 64k | `▁propinsi ▁nakhon ▁ratchasima ▁nakeuh ▁saboh ▁propinsi ▁di ▁timu ▁baroh ▁muangthai ... (+7 more)` | 17 |
-**Sample 3:** `Kandang nakeuh gampông di Keucamatan Samalanga, Kabupatèn Bireuen, Acèh. Lumbôi ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
-| 16k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
-| 32k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
-| 64k | `▁kandang ▁nakeuh ▁gampông ▁di ▁keucamatan ▁samalanga , ▁kabupatèn ▁bireuen , ... (+13 more)` | 23 |
 ### Key Findings
 - **Best Compression:** 64k achieves 4.925x compression
-- **Lowest UNK Rate:** 8k with 0.2682% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -137,12 +147,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | Word | 637 | 9.32 | 7,009 | 62.6% | 83.4% |
-| **2-gram** | Subword | 224 🏆 | 7.80 | 2,204 | 71.8% | 99.5% |
-| **3-gram** | Word | 577 | 9.17 | 8,214 | 65.4% | 85.5% |
-| **3-gram** | Subword | 1,194 | 10.22 | 14,605 | 37.9% | 84.9% |
-| **4-gram** | Word | 673 | 9.39 | 12,805 | 64.5% | 83.7% |
-| **4-gram** | Subword | 3,551 | 11.79 | 59,251 | 26.2% | 67.5% |
 ### Top 5 N-grams by Size
@@ -153,8 +165,8 @@ Below are sample sentences tokenized with each vocabulary size:
 | 1 | `bak laman` | 7,389 |
 | 2 | `gunong nyoe` | 7,388 |
 | 3 | `nyoe bak` | 5,543 |
-| 4 | `nakeuh saboh` | 5,045 |
-| 5 | `di acèh` | 4,748 |
 **3-grams (Word):**
@@ -164,7 +176,7 @@ Below are sample sentences tokenized with each vocabulary size:
 | 2 | `nyoe bak laman` | 3,694 |
 | 3 | `lumbôi gampông nyoe` | 3,567 |
 | 4 | `acèh lumbôi gampông` | 3,564 |
-| 5 | `nyoe lam data` | 3,499 |
 **4-grams (Word):**
@@ -173,45 +185,65 @@ Below are sample sentences tokenized with each vocabulary size:
 | 1 | `gunong nyoe bak laman` | 3,694 |
 | 2 | `acèh lumbôi gampông nyoe` | 3,564 |
 | 3 | `nyoe lam data peumeurèntah` | 3,499 |
-| 4 | `gampông nyoe lam data` | 3,499 |
-| 5 | `lam data peumeurèntah nakeuh` | 3,499 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `e u` | 117,818 |
-| 2 | `_ n` | 79,411 |
-| 3 | `a n` | 69,436 |
-| 4 | `h _` | 68,029 |
-| 5 | `n g` | 67,573 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `n g _` | 44,439 |
-| 2 | `_ n a` | 31,640 |
-| 3 | `_ b a` | 30,463 |
-| 4 | `k e u` | 30,322 |
-| 5 | `_ n y` | 26,537 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `e u h _` | 23,348 |
-| 2 | `b a k _` | 23,260 |
-| 3 | `_ d i _` | 21,144 |
-| 4 | `k e u h` | 21,117 |
-| 5 | `a k e u` | 20,691 |
 ### Key Findings
 - **Best Perplexity:** 2-gram (subword) with 224
 - **Entropy Trend:** Subword entropy increases with larger n-grams (sparser, less predictable), while word-level entropy stays roughly flat
-- **Coverage:** Top-1000 patterns cover ~68% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -227,14 +259,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | Word | 0.7515 | 1.684 | 4.35 | 36,025 | 24.8% |
-| **1** | Subword | 0.8633 | 1.819 | 5.38 | 1,269 | 13.7% |
-| **2** | Word | 0.2148 | 1.161 | 1.44 | 155,224 | 78.5% |
-| **2** | Subword | 0.7739 | 1.710 | 4.50 | 6,822 | 22.6% |
-| **3** | Word | 0.0655 | 1.046 | 1.11 | 221,018 | 93.4% |
-| **3** | Subword | 0.7559 | 1.689 | 3.54 | 30,615 | 24.4% |
-| **4** | Word | 0.0242 🏆 | 1.017 | 1.04 | 242,720 | 97.6% |
-| **4** | Subword | 0.5660 | 1.480 | 2.36 | 108,223 | 43.4% |
 ### Generated Text Samples (Word-based)
@@ -242,27 +274,27 @@ Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
-1. `di pidie acèh timu acèh indonesia the colour of life seuneubeuet bak saboh spèsiès nibak takson`
-2. `nakeuh gunong nyoe geupeuteubiet bak wikidata data peumeurèntah nakeuh gunong di teungoh ngon geukeu...`
-3. `bak wikidata data matauroe teubiet teunom di ateuh babah la ôt peunawôt luwa data gunong nyoe`
 **Context Size 2:**
-1. `bak laman sunrisesunset com di acèh seulatan acèh lumbôi gampông nyoe lam data peumeurèntah nakeuh n...`
-2. `gunong nyoe bak laman geonames data gunong nyoe bak laman sunrisesunset com di acèh nakeuh gampông d...`
-3. `nyoe bak wikidata data cuaca daerah gunong nyoe nakeuh kagoshima banda`
 **Context Size 3:**
-1. `gunong nyoe bak laman geonames data gunong nyoe bak wikidata data cuaca daerah gunong nyoe bak wikid...`
-2. `nyoe bak laman geonames data gunong nyoe bak laman geonames data gunong nyoe bak wikidata data cuaca...`
-3. `lumbôi gampông nyoe lam data peumeurèntah nakeuh nè di acèh rayek kawan ingin jaya acèh rayek nibak ...`
 **Context Size 4:**
 1. `gunong nyoe bak laman nasa data matauroe teubiet teunom di da irah bak laman sunrisesunset com di ac...`
-2. `acèh lumbôi gampông nyoe lam data peumeurèntah nakeuh nè di acèh rayek acèh acèh rayek`
-3. `nyoe lam data peumeurèntah nakeuh nè di bireuen bireuen`
 ### Generated Text Samples (Subword-based)
@@ -271,34 +303,34 @@ Below are text samples generated from each subword-based Markov chain model:
 **Context Size 1:**
-1. `_da_geriè_kahara`
-2. `ata_jeetabam_lab`
-3. `ng_ngeung_teukeu`
 **Context Size 2:**
-1. `euna_preunomyza_d`
-2. `_nya_-_diet_lis_a`
-3. `h_nak_lam_diversi`
 **Context Size 3:**
-1. `ng_udeh_nyoe_lam_d`
-2. `_nakeuh_spèsi_acèh`
-3. `_bagiang_bak_lagèe`
 **Context Size 4:**
-1. `euh_tarèh_seuë_deun`
 2. `bak_encyclopedia_of`
-3. `_di_surat_lé_gosho_`
 ### Key Findings
 - **Best Predictability:** Context-4 (word) with 97.6% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (108,223 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -314,64 +346,64 @@ Below are text samples generated from each subword-based Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 15,502 |
-| Total Tokens | 515,006 |
-| Mean Frequency | 33.22 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 415.97 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | di | 21,196 |
-| 2 | nakeuh | 20,604 |
-| 3 | bak | 18,159 |
-| 4 | acèh | 17,511 |
-| 5 | nyoe | 13,184 |
 | 6 | data | 11,090 |
 | 7 | gunong | 10,023 |
-| 8 | nyang | 9,025 |
 | 9 | gampông | 8,794 |
-| 10 | lam | 7,941 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | saûdep | 2 |
-| 2 | teuleungah | 2 |
-| 3 | mutuskeun | 2 |
-| 4 | ekshumasi | 2 |
-| 5 | teukeuh | 2 |
-| 6 | dilegalisasikan | 2 |
-| 7 | jendela | 2 |
-| 8 | prosès | 2 |
-| 9 | piazza | 2 |
-| 10 | fontana | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.1704 |
-| R² (Goodness of Fit) | 0.995382 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 63.2% |
-| Top 1,000 | 84.2% |
 | Top 5,000 | 94.2% |
 | Top 10,000 | 97.8% |
 ### Key Findings
-- **Zipf Compliance:** R²=0.9954 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 63.2% of corpus
-- **Long Tail:** 5,502 words needed for remaining 2.2% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -387,37 +419,40 @@ Below are text samples generated from each subword-based Markov chain model:
 ### 5.1 Cross-Lingual Alignment
-> *Note: Multilingual alignment visualization not available for this language.*
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
-| **mono_32d** | 32 | 0.5172 🏆 | 0.4104 | N/A | N/A |
-| **mono_64d** | 64 | 0.1209 | 0.4362 | N/A | N/A |
-| **mono_128d** | 128 | 0.0271 | 0.4092 | N/A | N/A |
 ### Key Findings
-- **Best Isotropy:** mono_32d with 0.5172 (more uniform distribution)
-- **Semantic Density:** Average pairwise similarity of 0.4186. Lower values indicate better semantic separation.
-- **Alignment Quality:** No aligned models evaluated in this run.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
-> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
-| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
-| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
 ### 6.2 Affix Inventory (Productive Units)
@@ -426,18 +461,18 @@ These are the most productive prefixes and suffixes identified by sampling the v
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
-| `-me` | meulagu, meukeunong, meulabôh |
-| `-ge` | geumeuhoi, geupasoe, geupeuresmi |
-| `-geu` | geumeuhoi, geupasoe, geupeuresmi |
-| `-meu` | meulagu, meukeunong, meulabôh |
-| `-pe` | peunuman, peureudee, peumurah |
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
-| `-ng` | meukeunong, gelampang, seberang |
-| `-an` | jonathan, peunuman, kyrgyzstan |
-| `-ah` | bawah, geupeujeulah, jumlah |
 ### 6.3 Bound Stems (Lexical Roots)
@@ -445,18 +480,18 @@ Bound stems are high-frequency subword units that are semantically cohesive but
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
-| `eung` | 1.41x | 64 contexts | reung, meung, jeung |
-| `uneu` | 1.70x | 28 contexts | runeu, uneun, seuneu |
-| `euen` | 1.54x | 38 contexts | eueng, meuen, leuen |
-| `euna` | 1.36x | 59 contexts | peuna, beuna, keuna |
-| `ubeu` | 1.47x | 22 contexts | ubeut, neubeu, ubeuet |
-| `umeu` | 1.44x | 23 contexts | jumeu, geumeu, jeumeu |
-| `meur` | 1.63x | 15 contexts | meuri, meurô, meurôn |
-| `anga` | 1.36x | 23 contexts | panga, manga, langa |
-| `teun` | 1.32x | 25 contexts | uteun, ateung, teunga |
-| `neub` | 1.57x | 14 contexts | neuba, neubeu, neubôk |
-| `eube` | 1.48x | 16 contexts | leube, teubee, leubeh |
-| `eune` | 1.63x | 12 contexts | seuneu, geuneu, keuneu |
 ### 6.4 Affix Compatibility (Co-occurrence)
@@ -464,15 +499,15 @@ This table shows which prefixes and suffixes most frequently co-occur on the sam
 | Prefix | Suffix | Frequency | Examples |
 |--------|--------|-----------|----------|
-| `-ge` | `-ng` | 56 words | geupeutrang, geudông |
-| `-pe` | `-an` | 51 words | penyiaran, permukaan |
-| `-me` | `-ng` | 40 words | meulinteueng, meuhubông |
-| `-pe` | `-ng` | 22 words | perang, peukeumang |
-| `-pe` | `-ah` | 18 words | peujeunajah, peuleumah |
-| `-ge` | `-ah` | 17 words | geupeuglah, geupeuluwah |
-| `-me` | `-ah` | 16 words | meujumeulah, meurah |
-| `-me` | `-an` | 10 words | meridian, meukeujadian |
-| `-ge` | `-an` | 6 words | geurakan, geuritan |
 ### 6.5 Recursive Morpheme Segmentation
@@ -480,26 +515,28 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
-| geumeudong | **`geu-meu-dong`** | 6.0 | `dong` |
-| geumeututô | **`geu-meu-tutô`** | 6.0 | `tutô` |
-| meubileueng | **`meu-bileue-ng`** | 6.0 | `bileue` |
 | geulumbang | **`geu-lumba-ng`** | 6.0 | `lumba` |
 | geumeupakat | **`geu-meu-pakat`** | 6.0 | `pakat` |
-| geumeuniaga | **`geu-meu-niaga`** | 6.0 | `niaga` |
-| geumeuturi | **`geu-meu-turi`** | 6.0 | `turi` |
-| geuseubarô | **`geu-seubarô`** | 4.5 | `seubarô` |
-| geudapeuta | **`geu-dapeuta`** | 4.5 | `dapeuta` |
-| meusampoe | **`meu-sampoe`** | 4.5 | `sampoe` |
-| geubayeuë | **`geu-bayeuë`** | 4.5 | `bayeuë` |
-| meulingka | **`meu-lingka`** | 4.5 | `lingka` |
-| meusiyasat | **`meu-siyasat`** | 4.5 | `siyasat` |
-| meulaksana | **`meu-laksana`** | 4.5 | `laksana` |
-| geubayeue | **`geu-bayeue`** | 4.5 | `bayeue` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
-The language ACE appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 ---
 ## 7. Summary & Recommendations
@@ -510,7 +547,7 @@ The language ACE appears to be more isolating or has a highly fixed vocabulary.
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **64k BPE** | Best compression (4.92x) |
 | N-gram | **2-gram** | Lowest perplexity (224) |
 | Markov | **Context-4** | Highest predictability (97.6%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
@@ -726,4 +763,4 @@ MIT License - Free for academic and commercial use.
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2026-01-03 05:05:30*

 ---
 language: ace
+language_name: Acehnese
 language_family: austronesian_malay
 tags:
   - wikilangs
   - n-gram
   - markov
   - wikipedia
+  - feature-extraction
+  - sentence-similarity
+  - tokenization
+  - n-grams
+  - markov-chain
+  - text-mining
+  - fasttext
+  - babelvec
+  - vocabulous
+  - vocabulary
   - monolingual
   - family-austronesian_malay
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
     value: 4.925
   - name: best_isotropy
     type: isotropy
+    value: 0.5616
   - name: vocabulary_size
     type: vocab
     value: 0
 generated: 2026-01-03
 ---
+# Acehnese - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Acehnese** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 4.118x | 4.13 | 0.2676% | 125,584 |
+| **16k** | 4.487x | 4.50 | 0.2916% | 115,243 |
+| **32k** | 4.726x | 4.74 | 0.3071% | 109,414 |
+| **64k** | 4.925x 🏆 | 4.93 | 0.3200% | 104,998 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `Propinsi Champasak nakeuh saboh propinsi di Laos. Nang nanggroejih nakeuh Pakse.`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁propinsi ▁champ as ak ▁nakeuh ▁saboh ▁propinsi ▁di ▁laos . ... (+6 more)` | 16 |
+| 16k | `▁propinsi ▁champ asak ▁nakeuh ▁saboh ▁propinsi ▁di ▁laos . ▁nang ... (+5 more)` | 15 |
+| 32k | `▁propinsi ▁champasak ▁nakeuh ▁saboh ▁propinsi ▁di ▁laos . ▁nang ▁nanggroejih ... (+4 more)` | 14 |
+| 64k | `▁propinsi ▁champasak ▁nakeuh ▁saboh ▁propinsi ▁di ▁laos . ▁nang ▁nanggroejih ... (+3 more)` | 13 |
+**Sample 2:** `Mesjid Keumangan nakeuh gampông di Mutiara, Kabupatèn Pidie, Acèh. Lumbôi gampôn...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁mesjid ▁keum angan ▁nakeuh ▁gampông ▁di ▁mutiara , ▁kabupatèn ▁pidie ... (+14 more)` | 24 |
+| 16k | `▁mesjid ▁keumangan ▁nakeuh ▁gampông ▁di ▁mutiara , ▁kabupatèn ▁pidie , ... (+13 more)` | 23 |
+| 32k | `▁mesjid ▁keumangan ▁nakeuh ▁gampông ▁di ▁mutiara , ▁kabupatèn ▁pidie , ... (+13 more)` | 23 |
+| 64k | `▁mesjid ▁keumangan ▁nakeuh ▁gampông ▁di ▁mutiara , ▁kabupatèn ▁pidie , ... (+13 more)` | 23 |
+**Sample 3:** `Jurông Pandé nakeuh gampông di Geulumpang Tiga, Kabupatèn Pidie, Acèh. Lumbôi ga...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁jurông ▁pand é ▁nakeuh ▁gampông ▁di ▁geulumpang ▁tiga , ▁kabupatèn ... (+17 more)` | 27 |
+| 16k | `▁jurông ▁pandé ▁nakeuh ▁gampông ▁di ▁geulumpang ▁tiga , ▁kabupatèn ▁pidie ... (+16 more)` | 26 |
+| 32k | `▁jurông ▁pandé ▁nakeuh ▁gampông ▁di ▁geulumpang ▁tiga , ▁kabupatèn ▁pidie ... (+16 more)` | 26 |
+| 64k | `▁jurông ▁pandé ▁nakeuh ▁gampông ▁di ▁geulumpang ▁tiga , ▁kabupatèn ▁pidie ... (+16 more)` | 26 |
 ### Key Findings
 - **Best Compression:** 64k achieves 4.925x compression
+- **Lowest UNK Rate:** 8k with 0.2676% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 640 | 9.32 | 7,037 | 62.5% | 83.3% |
+| **2-gram** | Subword | 224 🏆 | 7.81 | 2,200 | 71.8% | 99.5% |
+| **3-gram** | Word | 582 | 9.19 | 8,345 | 65.3% | 85.4% |
+| **3-gram** | Subword | 1,199 | 10.23 | 14,644 | 37.8% | 84.8% |
+| **4-gram** | Word | 678 | 9.41 | 12,913 | 64.4% | 83.6% |
+| **4-gram** | Subword | 3,579 | 11.81 | 59,564 | 26.1% | 67.4% |
+| **5-gram** | Word | 585 | 9.19 | 10,187 | 66.3% | 85.3% |
+| **5-gram** | Subword | 6,530 | 12.67 | 114,683 | 21.4% | 60.4% |
 ### Top 5 N-grams by Size
 | 1 | `bak laman` | 7,389 |
 | 2 | `gunong nyoe` | 7,388 |
 | 3 | `nyoe bak` | 5,543 |
+| 4 | `nakeuh saboh` | 5,048 |
+| 5 | `di acèh` | 4,747 |
 **3-grams (Word):**
 | 2 | `nyoe bak laman` | 3,694 |
 | 3 | `lumbôi gampông nyoe` | 3,567 |
 | 4 | `acèh lumbôi gampông` | 3,564 |
+| 5 | `lam data peumeurèntah` | 3,499 |
 **4-grams (Word):**
 | 1 | `gunong nyoe bak laman` | 3,694 |
 | 2 | `acèh lumbôi gampông nyoe` | 3,564 |
 | 3 | `nyoe lam data peumeurèntah` | 3,499 |
+| 4 | `lam data peumeurèntah nakeuh` | 3,499 |
+| 5 | `gampông nyoe lam data` | 3,499 |
+**5-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `gampông nyoe lam data peumeurèntah` | 3,499 |
+| 2 | `nyoe lam data peumeurèntah nakeuh` | 3,499 |
+| 3 | `lumbôi gampông nyoe lam data` | 3,498 |
+| 4 | `acèh lumbôi gampông nyoe lam` | 3,495 |
+| 5 | `lam data peumeurèntah nakeuh nè` | 3,489 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `e u` | 118,044 |
+| 2 | `_ n` | 79,550 |
+| 3 | `a n` | 69,741 |
+| 4 | `h _` | 68,205 |
+| 5 | `n g` | 67,768 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `n g _` | 44,547 |
+| 2 | `_ n a` | 31,665 |
+| 3 | `_ b a` | 30,517 |
+| 4 | `k e u` | 30,367 |
+| 5 | `_ n y` | 26,591 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `e u h _` | 23,358 |
+| 2 | `b a k _` | 23,289 |
+| 3 | `_ d i _` | 21,170 |
+| 4 | `k e u h` | 21,124 |
+| 5 | `a k e u` | 20,698 |
+**5-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `k e u h _` | 21,003 |
+| 2 | `n a k e u` | 20,623 |
+| 3 | `a k e u h` | 20,621 |
+| 4 | `_ n a k e` | 20,596 |
+| 5 | `_ b a k _` | 18,136 |
 ### Key Findings
 - **Best Perplexity:** 2-gram (subword) with 224
 - **Entropy Trend:** Subword entropy increases with larger n-grams (sparser, less predictable), while word-level entropy stays roughly flat
+- **Coverage:** Top-1000 patterns cover ~60% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.7505 | 1.682 | 4.34 | 36,359 | 25.0% |
+| **1** | Subword | 0.8631 | 1.819 | 5.38 | 1,270 | 13.7% |
+| **2** | Word | 0.2142 | 1.160 | 1.44 | 156,380 | 78.6% |
+| **2** | Subword | 0.7734 | 1.709 | 4.50 | 6,829 | 22.7% |
+| **3** | Word | 0.0653 | 1.046 | 1.11 | 222,450 | 93.5% |
+| **3** | Subword | 0.7578 | 1.691 | 3.55 | 30,660 | 24.2% |
+| **4** | Word | 0.0241 🏆 | 1.017 | 1.04 | 244,189 | 97.6% |
+| **4** | Subword | 0.5683 | 1.483 | 2.36 | 108,651 | 43.2% |
 ### Generated Text Samples (Word-based)
 **Context Size 1:**
+1. `di da irah bak wikidata data cuaca daerah gunong nyoe bak di acèh indonesia laos nang`
+2. `nakeuh saboh propinsi acèh timu burundi rwanda madagaskar nakeuh gampông lam data peumeurèntah nakeu...`
+3. `bak laman geonames data peumeurèntah nakeuh nè di gayo lues provinsi acèh barat pulo wèh lam`
 **Context Size 2:**
+1. `bak laman nasa data matauroe teubiet teunom di da irah bak laman nasa data matauroe teubiet teunom`
+2. `gunong nyoe nakeuh bagian nibak inggréh pangiran maurits dari beulanda natom cit meukirém surat keu ...`
+3. `nyoe bak laman nasa data matauroe teubiet teunom di da irah bak laman sunrisesunset com di acèh`
 **Context Size 3:**
+1. `gunong nyoe bak laman nasa data matauroe teubiet teunom di da irah bak laman sunrisesunset com di ac...`
+2. `nyoe bak laman nasa data matauroe teubiet teunom di da irah bak laman sunrisesunset com di acèh`
+3. `lumbôi gampông nyoe lam data peumeurèntah nakeuh nè di acèh timu jernih acèh timu`
 **Context Size 4:**
 1. `gunong nyoe bak laman nasa data matauroe teubiet teunom di da irah bak laman sunrisesunset com di ac...`
+2. `acèh lumbôi gampông nyoe lam data peumeurèntah nakeuh nè di acèh barôh acèh barôh`
+3. `gampông nyoe lam data peumeurèntah nakeuh nè di acèh rayek kawan peukan bada acèh rayek ngön nan awa...`
 ### Generated Text Samples (Subword-based)
 **Context Size 1:**
+1. `_onirastak_lh_ak`
+2. `ansa_pônng_39_n.`
+3. `naneum_l_()._dam`
 **Context Size 2:**
+1. `euh_aoyatèktiong_`
+2. `_nya_droë:_teukeu`
+3. `an_ak_di_istreng_`
 **Context Size 3:**
+1. `ng_geukheungui_gam`
+2. `_na_data_pranté_ab`
+3. `_bak_da'irahmada_u`
 **Context Size 4:**
+1. `euh_gampông_na_di_a`
 2. `bak_encyclopedia_of`
+3. `_di_tunong_nyoë,_bh`
 ### Key Findings
 - **Best Predictability:** Context-4 (word) with 97.6% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (108,651 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 15,619 |
+| Total Tokens | 516,593 |
+| Mean Frequency | 33.07 |
 | Median Frequency | 3 |
+| Frequency Std Dev | 414.79 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | di | 21,222 |
+| 2 | nakeuh | 20,611 |
+| 3 | bak | 18,176 |
+| 4 | acèh | 17,532 |
+| 5 | nyoe | 13,191 |
 | 6 | data | 11,090 |
 | 7 | gunong | 10,023 |
+| 8 | nyang | 9,056 |
 | 9 | gampông | 8,794 |
+| 10 | lam | 7,951 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | influence | 2 |
+| 2 | across | 2 |
+| 3 | represent | 2 |
+| 4 | raising | 2 |
+| 5 | ceremony | 2 |
+| 6 | flown | 2 |
+| 7 | reconstructions | 2 |
+| 8 | bendera | 2 |
+| 9 | bekas | 2 |
+| 10 | jawatimu | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 1.1698 |
+| R² (Goodness of Fit) | 0.995531 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 63.1% |
+| Top 1,000 | 84.1% |
 | Top 5,000 | 94.2% |
 | Top 10,000 | 97.8% |
 ### Key Findings
+- **Zipf Compliance:** R²=0.9955 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 63.1% of corpus
+- **Long Tail:** 5,619 words needed for remaining 2.2% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ### 5.1 Cross-Lingual Alignment
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.5616 🏆 | 0.3940 | N/A | N/A |
+| **mono_64d** | 64 | 0.2087 | 0.3984 | N/A | N/A |
+| **mono_128d** | 128 | 0.0274 | 0.4044 | N/A | N/A |
+| **aligned_32d** | 32 | 0.5616 | 0.4083 | 0.0220 | 0.1860 |
+| **aligned_64d** | 64 | 0.2087 | 0.4071 | 0.0460 | 0.2660 |
+| **aligned_128d** | 128 | 0.0274 | 0.4087 | 0.0440 | 0.2760 |
 ### Key Findings
+- **Best Isotropy:** mono_32d with 0.5616 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.4035. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 4.6% R@1 in cross-lingual retrieval.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6. Morphological Analysis (Experimental)
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **0.411** | High formulaic/idiomatic content | - |
 ### 6.2 Affix Inventory (Productive Units)
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
+| `me-` | meusuci, meutapi, meukheuluk |
+| `meu-` | meusuci, meutapi, meukheuluk |
+| `ge-` | geuôseuha, geuplueng, geupeuhu |
+| `geu-` | geuôseuha, geuplueng, geupeuhu |
+| `pe-` | peuneujeutneuh, peutinggai, pelelangan |
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
+| `-ng` | geuplueng, loyang, berperang |
+| `-an` | pelelangan, onekotan, kerobokan |
+| `-ah` | ketukah, beulasah, beudarah |
 ### 6.3 Bound Stems (Lexical Roots)
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
+| `eung` | 1.41x | 64 contexts | meung, reung, jeung |
+| `uneu` | 1.70x | 28 contexts | runeu, uneun, meuneu |
+| `euen` | 1.53x | 38 contexts | meuen, leuen, eueng |
+| `euna` | 1.35x | 60 contexts | beuna, keuna, peuna |
+| `ubeu` | 1.43x | 22 contexts | ubeut, neubeu, keubeu |
+| `umeu` | 1.40x | 23 contexts | jumeu, jeumeu, geumeu |
+| `meur` | 1.59x | 15 contexts | meuri, meurô, meurah |
+| `neub` | 1.58x | 14 contexts | neuba, neubôk, neubut |
+| `teun` | 1.31x | 25 contexts | uteun, ateung, teuntè |
+| `beue` | 1.49x | 16 contexts | beuet, tabeue, abeuek |
+| `anga` | 1.31x | 23 contexts | langa, panga, manga |
+| `eune` | 1.61x | 12 contexts | jeuneh, meuneu, geuneu |
 ### 6.4 Affix Compatibility (Co-occurrence)
 | Prefix | Suffix | Frequency | Examples |
 |--------|--------|-----------|----------|
+| `pe-` | `-an` | 53 words | peureumponan, pertahanan |
+| `ge-` | `-ng` | 52 words | geumeugabong, geutamöng |
+| `me-` | `-ng` | 33 words | meuulang, meunatang |
+| `pe-` | `-ng` | 27 words | peunayông, peudong |
+| `ge-` | `-ah` | 21 words | geupeuleumah, geupisah |
+| `me-` | `-ah` | 17 words | meubatah, meuseudeukah |
+| `pe-` | `-ah` | 15 words | peuneugah, peumerintah |
+| `me-` | `-an` | 13 words | meukawan, mediterranian |
+| `ge-` | `-an` | 4 words | geurakan, gerakan |
 ### 6.5 Recursive Morpheme Segmentation
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
+| geumeujuang | **`geu-meu-juang`** | 6.0 | `juang` |
 | geulumbang | **`geu-lumba-ng`** | 6.0 | `lumba` |
+| geumeunarit | **`geu-meu-narit`** | 6.0 | `narit` |
+| geumeuripèe | **`geu-meu-ripèe`** | 6.0 | `ripèe` |
 | geumeupakat | **`geu-meu-pakat`** | 6.0 | `pakat` |
+| geumeusipheuët | **`geu-meu-sipheuët`** | 6.0 | `sipheuët` |
+| geumeuduëk | **`geu-meu-duëk`** | 6.0 | `duëk` |
+| meubintéh | **`meu-bintéh`** | 4.5 | `bintéh` |
+| geutanyöe | **`geu-tanyöe`** | 4.5 | `tanyöe` |
+| geupeuriwang | **`geu-pe-uriwa-ng`** | 4.5 | `uriwa` |
+| meuadaptasi | **`meu-adaptasi`** | 4.5 | `adaptasi` |
+| geumigrasi | **`geu-migrasi`** | 4.5 | `migrasi` |
+| geutimbak | **`geu-timbak`** | 4.5 | `timbak` |
+| geupageuë | **`geu-pageuë`** | 4.5 | `pageuë` |
+| meutugaih | **`meu-tugaih`** | 4.5 | `tugaih` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
+> Acehnese shows high morphological productivity. The subword models are significantly more efficient than the word-level models, suggesting a rich system of affixation or compounding.
+> **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
 ---
 ## 7. Summary & Recommendations
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
+| Tokenizer | **64k BPE** | Best compression (4.93x) |
 | N-gram | **2-gram** | Lowest perplexity (224) |
 | Markov | **Context-4** | Highest predictability (97.6%) |
 | Embeddings | **64d** | Balanced semantic capture and isotropy |
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-03 14:04:07*

models/embeddings/aligned/ace_128d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f03f3819979ca174222ea3d23b6d7de0b1f4b570a29fb44a02a8e759136e5c0
+size 1030450066

models/embeddings/aligned/ace_128d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "ace", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ace_128d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88867f0cac00844ea8cc080676475c23d28d323b7517734505a58aaf2d4a6181
+size 65664

models/embeddings/aligned/ace_128d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "ace",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 2127,
+  "vocab_size": 6200
+}

models/embeddings/aligned/ace_32d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e5448de312894c92e7b4bed52b5ecf83aadef491e286bed26ea293aaac01108
+size 257688466

models/embeddings/aligned/ace_32d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "ace", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ace_32d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52dcedd993e567a2687112252defc4b0171e62f32ccd9169e40c8f9542aa7dd6
+size 4224

models/embeddings/aligned/ace_32d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "ace",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 2127,
+  "vocab_size": 6200
+}

models/embeddings/aligned/ace_64d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e85918583ce1dcf80f4ed4c21ef5bf9fbbcfc6a1b4562cef02b8c398a799c834
+size 515275666

models/embeddings/aligned/ace_64d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "ace", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/ace_64d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:507f0da0845b6878874d0c64c9cc6d2df612728219f1020eb11838e0d5193067
+size 16512

models/embeddings/aligned/ace_64d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "ace",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 2127,
+  "vocab_size": 6200
+}

models/embeddings/monolingual/ace_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0cf768b8cc26ee028b74fbc795a0084120d5ec9fa9ee7265d5277f3228676402
-size 1030413699

 version https://git-lfs.github.com/spec/v1
+oid sha256:5f03f3819979ca174222ea3d23b6d7de0b1f4b570a29fb44a02a8e759136e5c0
+size 1030450066

models/embeddings/monolingual/ace_128d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 128
   },
-  "vocab_size": 6165
 }

     "encoding_method": "rope",
     "dim": 128
   },
+  "vocab_size": 6200
 }

models/embeddings/monolingual/ace_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b69885b365da465ab4bdfb040ee28e5b1f011ab235bd18a0e87c25eae3925f8f
-size 257678979

 version https://git-lfs.github.com/spec/v1
+oid sha256:9e5448de312894c92e7b4bed52b5ecf83aadef491e286bed26ea293aaac01108
+size 257688466

models/embeddings/monolingual/ace_32d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 32
   },
-  "vocab_size": 6165
 }

     "encoding_method": "rope",
     "dim": 32
   },
+  "vocab_size": 6200
 }

models/embeddings/monolingual/ace_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:79de1c214df5334f0fa005b795ccfce46a4ac070f1ea7654cfaab5c2dc34dda5
-size 515257219

 version https://git-lfs.github.com/spec/v1
+oid sha256:e85918583ce1dcf80f4ed4c21ef5bf9fbbcfc6a1b4562cef02b8c398a799c834
+size 515275666

models/embeddings/monolingual/ace_64d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 64
   },
-  "vocab_size": 6165
 }

     "encoding_method": "rope",
     "dim": 64
   },
+  "vocab_size": 6200
 }

models/subword_markov/ace_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0f3a9abf57afb060a211a69e96c23c22c115a944e460bb459448be1adea274b0
-size 59763

 version https://git-lfs.github.com/spec/v1
+oid sha256:3e072c754c56da9e95c9c200c695cd6b471ee6689a0e1a698d30935c43cdafe4
+size 59998

models/subword_markov/ace_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 1269,
-  "total_transitions": 3470466
 }

   "context_size": 1,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 1270,
+  "total_transitions": 3482091
 }

models/subword_markov/ace_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d76a3d707c4ae33b25f22e9cbdcb78745638dfcdc99ed80df41d977cbb81cca0
-size 268452

 version https://git-lfs.github.com/spec/v1
+oid sha256:e6afbd7f0d3da86c66f7176070cff11e496daaa5c85cd0f88eec18a4b951b764
+size 268894

models/subword_markov/ace_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 6822,
-  "total_transitions": 3457577
 }

   "context_size": 2,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 6829,
+  "total_transitions": 3469188
 }

models/subword_markov/ace_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:652d2af547c89ae398ecf44a87cac49a050a3836bbeb328bda5d380d30c7c8c0
-size 900905

 version https://git-lfs.github.com/spec/v1
+oid sha256:99d97f9386bffc3efb8fe0009cd6a6ed6d18de98abe9815278c8893869470442
+size 892058

models/subword_markov/ace_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 30615,
-  "total_transitions": 3444688
 }

   "context_size": 3,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 30660,
+  "total_transitions": 3456285
 }

models/subword_markov/ace_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:32adbfca2120175fa43f1b0bd2cedc5d7bd19099677319e6128cab29e66007d4
-size 2085174

 version https://git-lfs.github.com/spec/v1
+oid sha256:5c75772e985795218463a1d4dbea72b935f78d17de04a246cd54921d15bd6127
+size 2090712

models/subword_markov/ace_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "ace",
-  "unique_contexts": 108223,
-  "total_transitions": 3431799
 }

   "context_size": 4,
   "variant": "subword",
   "language": "ace",
+  "unique_contexts": 108651,
+  "total_transitions": 3443382
 }

models/subword_ngram/ace_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0e6d83ed77f7cb491e9f8059b986f03ae648a2fb694c94ba29ac349e3055f671
-size 30955

 version https://git-lfs.github.com/spec/v1
+oid sha256:ee94d573e335a2d4be7247c2a12242fe807e58afabb45eeefc19c58537fc242c
+size 30922

models/subword_ngram/ace_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "ace",
-  "unique_ngrams": 2204,
-  "total_ngrams": 3470466
 }

   "n": 2,
   "variant": "subword",
   "language": "ace",
+  "unique_ngrams": 2200,
+  "total_ngrams": 3482091
 }

models/subword_ngram/ace_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f58aa5cf358032d5b4d7a52f7a9b81c4a9bcf9dda5c142a65fdae14ae7135f4
-size 178456

 version https://git-lfs.github.com/spec/v1
+oid sha256:6089db142d0a01a8ea615485f0561eafea4b0f9e38c1b90e77d230871c2eb996
+size 178914

models/subword_ngram/ace_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "ace",
-  "unique_ngrams": 14605,
-  "total_ngrams": 3457577
 }

   "n": 3,
   "variant": "subword",
   "language": "ace",
+  "unique_ngrams": 14644,
+  "total_ngrams": 3469188
 }

models/subword_ngram/ace_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:529d84e3eda9c7669ad3732639f36554c4e26803ae7ddfb8b4ca0fbcc326fbe7
-size 709589

 version https://git-lfs.github.com/spec/v1
+oid sha256:23c45b8e33584a590113811267828deedd9f32c90ce26cd12b20b61ad8507758
+size 711121

models/subword_ngram/ace_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "ace",
-  "unique_ngrams": 59251,
-  "total_ngrams": 3444688
 }

   "n": 4,
   "variant": "subword",
   "language": "ace",
+  "unique_ngrams": 59564,
+  "total_ngrams": 3456285
 }

models/subword_ngram/ace_5gram_subword.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b59e6034705aedd81a216a5c9cafd068958aef9196abf3285e0a0a7b64325b24
+size 1335561

models/subword_ngram/ace_5gram_subword_metadata.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "n": 5,
+  "variant": "subword",
+  "language": "ace",
+  "unique_ngrams": 114683,
+  "total_ngrams": 3443382
+}

models/tokenizer/ace_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:772eccdeefe417a39cb6d9efddc039fe1535ce5b6b9728ed7a7a9f133127fb6c
-size 503799

 version https://git-lfs.github.com/spec/v1
+oid sha256:72d32d166640b817121ffe9e19b8c9c7a5746b6b0771e6230d16250f372188cf
+size 504006

models/tokenizer/ace_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ace_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1fc76a32a006f73f304c48abbc3874f1bbba66904ea507a5989c19d84064eb2b
-size 784975

 version https://git-lfs.github.com/spec/v1
+oid sha256:165e0dfc018dd109d7f186031ebab0ecf3322a59360785ef69e9cc6df607a025
+size 784687

models/tokenizer/ace_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ace_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:638ca8a1f7bfb0862f2a4e2f334f6a0b19cd0775fcc9ea20c4d6f11edfca90d1
-size 1328218

 version https://git-lfs.github.com/spec/v1
+oid sha256:3edc9a3b51a0f7207be7594e5268c9ae5beb9e82faeccbc7d45b72c6f6c8dd74
+size 1329031

models/tokenizer/ace_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/ace_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4f2cf6a3125500fbc0a6568259881ab1bdc31c6fd6b01f32fd845acf44e03d4a
-size 371130

 version https://git-lfs.github.com/spec/v1
+oid sha256:5a75923b49929bbad8c27677f13bfd6d07a5e1e8f2cf1c69010042def7150a74
+size 371090

models/tokenizer/ace_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/ace_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2ba6b44c1dbf28322c8da89370cfc09afb1c3724b0f3302b03929700e5dc3332
-size 251932

 version https://git-lfs.github.com/spec/v1
+oid sha256:a52cd9780a36a6b32b6162ad116f556ac1e99250e23a8ba801de9a38eb1ae92e
+size 254661

models/vocabulary/ace_vocabulary_metadata.json CHANGED Viewed

@@ -1,17 +1,17 @@
 {
   "language": "ace",
-  "vocabulary_size": 15502,
   "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.06761988315009426,
     "coverage": {
-      "top_100": 0.6077445728258638,
-      "top_1000": 0.8095215873667706,
-      "top_5000": 0.9057864969294234,
-      "top_10000": 0.9405017452821384
     },
-    "hapax_count": 20724,
-    "hapax_ratio": 0.5720753050295369,
-    "total_documents": 12889
   }
 }

 {
   "language": "ace",
+  "vocabulary_size": 15619,
   "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.06801256853067626,
     "coverage": {
+      "top_100": 0.6062678942502134,
+      "top_1000": 0.8083819970122764,
+      "top_5000": 0.904856073952669,
+      "top_10000": 0.9397339326143698
     },
+    "hapax_count": 20940,
+    "hapax_ratio": 0.572772778248858,
+    "total_documents": 12903
   }
 }

models/word_markov/ace_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f453896eb7f6ff049a8bcdf5e17901d47f42c1057f32a9f36b112d6d94fd70e3
-size 1218673

 version https://git-lfs.github.com/spec/v1
+oid sha256:948199af90e300f25e496a4eedb3192c80a7bfe576a56d57660902469ac200fa
+size 1247545

models/word_markov/ace_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "ace",
-  "unique_contexts": 36025,
-  "total_transitions": 522841
 }

   "context_size": 1,
   "variant": "word",
   "language": "ace",
+  "unique_contexts": 36359,
+  "total_transitions": 524630
 }

models/word_markov/ace_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5f80eb40706cfeda9139c3371c96005f0fea0733f1ee4c3a2e5eecc815929a6a
-size 2625141

 version https://git-lfs.github.com/spec/v1
+oid sha256:626d87fcc6d59b74ebbbe7bd7226acbac283028cc291478986933611b278d2c7
+size 2658404

models/word_markov/ace_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "ace",
-  "unique_contexts": 155224,
-  "total_transitions": 509952
 }

   "context_size": 2,
   "variant": "word",
   "language": "ace",
+  "unique_contexts": 156380,
+  "total_transitions": 511727
 }