omarkamali committed
Commit 1ca515d · verified · 1 Parent(s): c982f3a

Upload all models and assets for bi (latest)

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +184 -147
  3. models/embeddings/aligned/bi_128d.bin +3 -0
  4. models/embeddings/aligned/bi_128d.meta.json +1 -0
  5. models/embeddings/aligned/bi_128d.projection.npy +3 -0
  6. models/embeddings/aligned/bi_128d_metadata.json +8 -0
  7. models/embeddings/aligned/bi_32d.bin +3 -0
  8. models/embeddings/aligned/bi_32d.meta.json +1 -0
  9. models/embeddings/aligned/bi_32d.projection.npy +3 -0
  10. models/embeddings/aligned/bi_32d_metadata.json +8 -0
  11. models/embeddings/aligned/bi_64d.bin +3 -0
  12. models/embeddings/aligned/bi_64d.meta.json +1 -0
  13. models/embeddings/aligned/bi_64d.projection.npy +3 -0
  14. models/embeddings/aligned/bi_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/bi_128d.bin +2 -2
  16. models/embeddings/monolingual/bi_128d_metadata.json +1 -1
  17. models/embeddings/monolingual/bi_32d.bin +2 -2
  18. models/embeddings/monolingual/bi_32d_metadata.json +1 -1
  19. models/embeddings/monolingual/bi_64d.bin +2 -2
  20. models/embeddings/monolingual/bi_64d_metadata.json +1 -1
  21. models/subword_markov/bi_markov_ctx1_subword.parquet +2 -2
  22. models/subword_markov/bi_markov_ctx1_subword_metadata.json +2 -2
  23. models/subword_markov/bi_markov_ctx2_subword.parquet +2 -2
  24. models/subword_markov/bi_markov_ctx2_subword_metadata.json +2 -2
  25. models/subword_markov/bi_markov_ctx3_subword.parquet +2 -2
  26. models/subword_markov/bi_markov_ctx3_subword_metadata.json +2 -2
  27. models/subword_markov/bi_markov_ctx4_subword.parquet +2 -2
  28. models/subword_markov/bi_markov_ctx4_subword_metadata.json +2 -2
  29. models/subword_ngram/bi_2gram_subword.parquet +2 -2
  30. models/subword_ngram/bi_2gram_subword_metadata.json +2 -2
  31. models/subword_ngram/bi_3gram_subword.parquet +2 -2
  32. models/subword_ngram/bi_3gram_subword_metadata.json +2 -2
  33. models/subword_ngram/bi_4gram_subword.parquet +2 -2
  34. models/subword_ngram/bi_4gram_subword_metadata.json +2 -2
  35. models/subword_ngram/bi_5gram_subword.parquet +3 -0
  36. models/subword_ngram/bi_5gram_subword_metadata.json +7 -0
  37. models/tokenizer/bi_tokenizer_16k.model +2 -2
  38. models/tokenizer/bi_tokenizer_16k.vocab +0 -0
  39. models/tokenizer/bi_tokenizer_8k.model +2 -2
  40. models/tokenizer/bi_tokenizer_8k.vocab +0 -0
  41. models/vocabulary/bi_vocabulary.parquet +2 -2
  42. models/vocabulary/bi_vocabulary_metadata.json +8 -8
  43. models/word_markov/bi_markov_ctx1_word.parquet +2 -2
  44. models/word_markov/bi_markov_ctx1_word_metadata.json +2 -2
  45. models/word_markov/bi_markov_ctx2_word.parquet +2 -2
  46. models/word_markov/bi_markov_ctx2_word_metadata.json +2 -2
  47. models/word_markov/bi_markov_ctx3_word.parquet +2 -2
  48. models/word_markov/bi_markov_ctx3_word_metadata.json +2 -2
  49. models/word_markov/bi_markov_ctx4_word.parquet +2 -2
  50. models/word_markov/bi_markov_ctx4_word_metadata.json +2 -2
.gitattributes CHANGED
@@ -40,3 +40,4 @@ visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
 visualizations/ngram_coverage.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 language: bi
-language_name: BI
+language_name: Bislama
 language_family: germanic_west_anglofrisian
 tags:
 - wikilangs
@@ -10,11 +10,21 @@ tags:
 - n-gram
 - markov
 - wikipedia
+- feature-extraction
+- sentence-similarity
+- tokenization
+- n-grams
+- markov-chain
+- text-mining
+- fasttext
+- babelvec
+- vocabulous
+- vocabulary
 - monolingual
 - family-germanic_west_anglofrisian
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
+pipeline_tag: text-generation
 datasets:
 - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
 - name: best_compression_ratio
   type: compression
-  value: 4.443
+  value: 4.441
 - name: best_isotropy
   type: isotropy
-  value: 0.0388
+  value: 0.0691
 - name: vocabulary_size
   type: vocab
   value: 0
 generated: 2026-01-03
 ---
 
-# BI - Wikilangs Models
+# Bislama - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
 
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **BI** Wikipedia data.
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Bislama** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 
 ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -80,39 +90,39 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 4.032x | 4.05 | 0.1444% | 47,092 |
-| **16k** | 4.443x 🏆 | 4.47 | 0.1591% | 42,734 |
+| **8k** | 4.034x | 4.06 | 0.1436% | 45,948 |
+| **16k** | 4.441x 🏆 | 4.46 | 0.1581% | 41,742 |
 
 ### Tokenization Examples
 
 Below are sample sentences tokenized with each vocabulary size:
 
-**Sample 1:** `Copenhagen (toktok Denmak: København), hem i kapitol blong Denmak. Long yia popu...`
+**Sample 1:** `Spiro Theodore "Ted" Agnew (9 Novemba 17 Septemba em i politikis blong Yunaete...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁copenhagen( toktok denmak : ▁københavn ), hemikapitol ... (+20 more)` | 30 |
-| 16k | `▁copenhagen( toktok denmak : ▁københavn ), hemikapitol ... (+20 more)` | 30 |
+| 8k | `▁spi ro theodore" ted "agnew( 9 novemba ... (+19 more)` | 29 |
+| 16k | `▁spirotheodore" ted "agnew( 9 novemba ▁– ... (+18 more)` | 28 |
 
-**Sample 2:** `Emily Elizabeth Dickinson (10 Desemba 15 May em i bin wan poet blong Amerika. ...`
+**Sample 2:** `Xi Jinping (boen i hed blong stet blong Jaena. blong Stet blong Jaena`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁em il yelizabethdick ins on( 1 0 ... (+19 more)` | 29 |
-| 16k | `▁emilyelizabethdickinson( 1 0desemba ▁–1 ... (+15 more)` | 25 |
+| 8k | `▁xi ▁jinping ▁( boen ihed ▁blong ▁stetblong ▁jaena ... (+5 more)` | 15 |
+| 16k | `▁xijinping( boen i ▁hed ▁blongstet ▁blongjaena ... (+5 more)` | 15 |
 
-**Sample 3:** `Narafala kaen blong spot long Vanuatu i stap pleiplei tru long kaontri long yumi...`
+**Sample 3:** `Miori Ichikawa (boen 12 Februari em i bin woman blong singsing blong Japan. woma...`
 
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁narafala ▁kaenblong ▁spot ▁longvanuatu ▁istap ▁pleiplei ▁tru ... (+7 more)` | 17 |
-| 16k | `▁narafalakaenblong ▁spotlong ▁vanuatu ▁istappleipleitru ... (+7 more)` | 17 |
+| 8k | `▁mi oriich ika wa( boen1 2 ... (+16 more)` | 26 |
+| 16k | `▁mioriichikawa( boen1 2februariemi ... (+13 more)` | 23 |
 
 
 ### Key Findings
 
-- **Best Compression:** 16k achieves 4.443x compression
-- **Lowest UNK Rate:** 8k with 0.1444% unknown tokens
+- **Best Compression:** 16k achieves 4.441x compression
+- **Lowest UNK Rate:** 8k with 0.1436% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 
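The compression column above is straightforward to sanity-check against the shipped SentencePiece models. A minimal sketch, assuming the `sentencepiece` package and that compression is measured as characters per token (the report does not spell out its exact formula):

```python
import sentencepiece as spm

# 16k BPE model shipped under models/tokenizer/ in this commit.
sp = spm.SentencePieceProcessor(model_file="models/tokenizer/bi_tokenizer_16k.model")

def compression_ratio(text: str) -> float:
    """Characters per token; one common definition of tokenizer compression."""
    tokens = sp.encode(text, out_type=str)
    return len(text) / len(tokens)

print(compression_ratio("Xi Jinping i hed blong stet blong Jaena."))
```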
@@ -129,12 +139,14 @@ Below are sample sentences tokenized with each vocabulary size:
 
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | Word | 362 | 8.50 | 1,049 | 58.8% | 98.9% |
-| **2-gram** | Subword | 209 🏆 | 7.71 | 983 | 73.7% | 100.0% |
-| **3-gram** | Word | 496 | 8.95 | 1,408 | 53.1% | 92.0% |
-| **3-gram** | Subword | 1,182 | 10.21 | 5,848 | 38.2% | 79.4% |
-| **4-gram** | Word | 887 | 9.79 | 2,457 | 43.9% | 77.4% |
-| **4-gram** | Subword | 3,532 | 11.79 | 19,225 | 28.5% | 58.2% |
+| **2-gram** | Word | 362 | 8.50 | 1,045 | 58.8% | 99.0% |
+| **2-gram** | Subword | 208 🏆 | 7.70 | 976 | 73.9% | 100.0% |
+| **3-gram** | Word | 494 | 8.95 | 1,403 | 53.1% | 92.1% |
+| **3-gram** | Subword | 1,176 | 10.20 | 5,825 | 38.3% | 79.5% |
+| **4-gram** | Word | 875 | 9.77 | 2,432 | 44.2% | 77.7% |
+| **4-gram** | Subword | 3,512 | 11.78 | 19,179 | 28.6% | 58.3% |
+| **5-gram** | Word | 727 | 9.51 | 1,831 | 46.0% | 82.2% |
+| **5-gram** | Subword | 5,192 | 12.34 | 26,363 | 25.9% | 52.6% |
 
 ### Top 5 N-grams by Size
 
@@ -142,68 +154,88 @@ Below are sample sentences tokenized with each vocabulary size:
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `hem i` | 738 |
-| 2 | `stet blong` | 729 |
-| 3 | `em i` | 617 |
-| 4 | `blong amerika` | 598 |
-| 5 | `blong yunaeted` | 535 |
+| 1 | `hem i` | 741 |
+| 2 | `stet blong` | 731 |
+| 3 | `em i` | 611 |
+| 4 | `blong amerika` | 599 |
+| 5 | `blong yunaeted` | 537 |
 
 **3-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `stet blong amerika` | 583 |
-| 2 | `yunaeted stet blong` | 479 |
-| 3 | `blong yunaeted stet` | 479 |
-| 4 | `blong singsing blong` | 292 |
+| 1 | `stet blong amerika` | 585 |
+| 2 | `blong yunaeted stet` | 481 |
+| 3 | `yunaeted stet blong` | 481 |
+| 4 | `blong singsing blong` | 291 |
 | 5 | `blong hem i` | 259 |
 
 **4-grams (Word):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `yunaeted stet blong amerika` | 477 |
-| 2 | `blong yunaeted stet blong` | 470 |
+| 1 | `yunaeted stet blong amerika` | 479 |
+| 2 | `blong yunaeted stet blong` | 472 |
 | 3 | `akta blong yunaeted stet` | 210 |
-| 4 | `woman blong singsing blong` | 182 |
+| 4 | `woman blong singsing blong` | 181 |
 | 5 | `blong singsing blong japan` | 150 |
 
+**5-grams (Word):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `blong yunaeted stet blong amerika` | 471 |
+| 2 | `akta blong yunaeted stet blong` | 210 |
+| 3 | `woman blong singsing blong japan` | 129 |
+| 4 | `em i woman blong singsing` | 100 |
+| 5 | `i woman blong singsing blong` | 96 |
+
 **2-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `o n` | 9,093 |
-| 2 | `n g` | 8,780 |
-| 3 | `l o` | 8,027 |
-| 4 | `g _` | 7,936 |
-| 5 | `_ b` | 7,059 |
+| 1 | `o n` | 9,097 |
+| 2 | `n g` | 8,801 |
+| 3 | `l o` | 8,033 |
+| 4 | `g _` | 7,960 |
+| 5 | `_ b` | 7,074 |
 
 **3-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `n g _` | 7,795 |
-| 2 | `o n g` | 7,296 |
-| 3 | `l o n` | 7,257 |
-| 4 | `_ b l` | 5,277 |
-| 5 | `b l o` | 5,252 |
+| 1 | `n g _` | 7,816 |
+| 2 | `o n g` | 7,315 |
+| 3 | `l o n` | 7,271 |
+| 4 | `_ b l` | 5,295 |
+| 5 | `b l o` | 5,265 |
 
 **4-grams (Subword):**
 
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `o n g _` | 7,200 |
-| 2 | `l o n g` | 7,191 |
-| 3 | `_ b l o` | 5,238 |
-| 4 | `b l o n` | 5,015 |
-| 5 | `_ l o n` | 2,153 |
+| 1 | `o n g _` | 7,216 |
+| 2 | `l o n g` | 7,207 |
+| 3 | `_ b l o` | 5,255 |
+| 4 | `b l o n` | 5,031 |
+| 5 | `_ l o n` | 2,154 |
+
+**5-grams (Subword):**
+
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `l o n g _` | 7,179 |
+| 2 | `b l o n g` | 5,030 |
+| 3 | `_ b l o n` | 5,028 |
+| 4 | `_ l o n g` | 2,151 |
+| 5 | `e m _ i _` | 1,374 |
 
 
 ### Key Findings
 
-- **Best Perplexity:** 2-gram (subword) with 209
+- **Best Perplexity:** 2-gram (subword) with 208
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~58% of corpus
+- **Coverage:** Top-1000 patterns cover ~53% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 
 ---
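The perplexity and entropy columns above are mutually consistent: every row satisfies perplexity = 2^entropy, i.e. entropy is taken over the empirical n-gram distribution and exponentiated. A minimal sketch against the shipped count tables; the column names `ngram` and `count` are assumptions, so check the parquet schema first:

```python
import numpy as np
import pandas as pd

df = pd.read_parquet("models/subword_ngram/bi_2gram_subword.parquet")

p = df["count"] / df["count"].sum()        # empirical n-gram probabilities
entropy = float(-(p * np.log2(p)).sum())   # bits per n-gram
print(entropy, 2 ** entropy)               # ≈ 7.70 and ≈ 208 for this table
```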
@@ -219,14 +251,14 @@ Below are sample sentences tokenized with each vocabulary size:
 
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | Word | 0.5840 | 1.499 | 3.04 | 8,338 | 41.6% |
-| **1** | Subword | 0.9602 | 1.946 | 6.50 | 364 | 4.0% |
-| **2** | Word | 0.1997 | 1.148 | 1.41 | 24,957 | 80.0% |
-| **2** | Subword | 0.9911 | 1.988 | 5.10 | 2,361 | 0.9% |
-| **3** | Word | 0.0755 | 1.054 | 1.13 | 34,724 | 92.4% |
-| **3** | Subword | 0.7964 | 1.737 | 3.17 | 12,016 | 20.4% |
-| **4** | Word | 0.0328 🏆 | 1.023 | 1.06 | 38,736 | 96.7% |
-| **4** | Subword | 0.4627 | 1.378 | 1.90 | 38,018 | 53.7% |
+| **1** | Word | 0.5784 | 1.493 | 3.02 | 8,408 | 42.2% |
+| **1** | Subword | 0.9577 | 1.942 | 6.51 | 362 | 4.2% |
+| **2** | Word | 0.1997 | 1.148 | 1.41 | 25,020 | 80.0% |
+| **2** | Subword | 0.9916 | 1.988 | 5.13 | 2,350 | 0.8% |
+| **3** | Word | 0.0750 | 1.053 | 1.13 | 34,806 | 92.5% |
+| **3** | Subword | 0.7944 | 1.734 | 3.18 | 12,029 | 20.6% |
+| **4** | Word | 0.0323 🏆 | 1.023 | 1.05 | 38,812 | 96.8% |
+| **4** | Subword | 0.4624 | 1.378 | 1.90 | 38,112 | 53.8% |
 
 ### Generated Text Samples (Word-based)
 
@@ -234,27 +266,27 @@ Below are text samples generated from each word-based Markov chain model:
 
 **Context Size 1:**
 
-1. `blong olgeta mo yu ol disaepol blong dover long wol plante fasin blong court i wan`
-2. `i bin wan strongfala win if you s 84 913 km2 populaesen blong stet blong hem`
-3. `long saed blong tekem carbondioxde mo wanwan aelan gaua o aoba hem i wokem long milly`
+1. `blong miusik grup i praem minista blong pasifik tu kristianiti islam jeinisim i praem minista blong`
+2. `i stap wetem graon kavremap 29 septemba hem hemi sapraesm ol pipol likem kakae we i`
+3. `long septemba i stap mekem afta blong et et i wan fruit kakae we ol komposisen`
 
 **Context Size 2:**
 
-1. `hem i stap insaet long solwota everi man i save sindaon o silip long hem islam relijon`
-2. `stet blong philippines blong stet blong amerika blong stet blong amerika blong stet blong amerika mo...`
-3. `em i bin ded 8 septemba em i woman blong singsing blong japan man blong singsing blong`
+1. `hem i wan miusik grup stet blong philippines blong stet blong amerika man blong singsing blong japan`
+2. `stet blong peru bik kaontri long saot blong yurop we i stap araon 860 090 external links`
+3. `em i bin transletem niu testeman i kam mo watchem kustom danis wetem good fren pipol`
 
 **Context Size 3:**
 
-1. `blong yunaeted stet blong amerika model akta blong pornografi blong ajentina em i stap popiula from ...`
-2. `yunaeted stet blong amerika akta blong yunaeted stet blong amerika blong yunaeted stet blong amerika...`
-3. `blong singsing blong japan thumb anna iriyama man blong singsing blong kanada man blong singsing blo...`
+1. `yunaeted stet blong amerika akta blong yunaeted stet blong amerika risos long internet www vilnius l...`
+2. `blong yunaeted stet blong amerika blong yunaeted stet blong amerika akta blong yunaeted stet blong a...`
+3. `blong singsing blong taelan woman blong singsing blong japan woman blong singsing blong japan man bl...`
 
 **Context Size 4:**
 
-1. `blong yunaeted stet blong amerika blong stet blong yunaeted stet blong amerika blong yunaeted stet b...`
-2. `yunaeted stet blong amerika blong stet blong yunaeted stet blong amerika blong yunaeted stet blong a...`
-3. `akta blong yunaeted stet blong amerika akta blong yunaeted stet blong amerika blong stet blong yunae...`
+1. `blong yunaeted stet blong amerika akta blong yunaeted stet blong amerika blong stet blong yunaeted s...`
+2. `yunaeted stet blong amerika bara lyle crist images of america alliance arcadia publishing s 41 isbn ...`
+3. `akta blong yunaeted stet blong amerika akta blong yunaeted stet blong amerika akta blong yunaeted st...`
 
 
 ### Generated Text Samples (Subword-based)
@@ -263,34 +295,34 @@ Below are text samples generated from each subword-based Markov chain model:
 
 **Context Size 1:**
 
-1. `_dimo_ste_lon_i_`
-2. `a_blong_bl_19_s_`
-3. `ngstang_yulolem:`
+1. `_stakthae_m_blon`
+2. `ak_25paryulgraju`
+3. `ng_lons_i_we_d_p`
 
 **Context Size 2:**
 
-1. `ong_300px_12_3_44`
-2. `ng_st_boetexanblo`
-3. `long_prol_no,_рос`
+1. `ong_yun_wosing_i_`
+2. `ng_noasol_ww.cita`
+3. `long_en_lon_i_sol`
 
 **Context Size 3:**
 
-1. `ng_amerika._akta_b`
-2. `ong_savela_taeland`
-3. `long_amerika_maura`
+1. `ng_nara_(cano_red_`
+2. `ong_wan_blong_mius`
+3. `long_(long_blong_y`
 
 **Context Size 4:**
 
-1. `ong_amerika._praem_`
-2. `long_not_prize_nigh`
-3. `_blong_21_man_blong`
+1. `ong_nolej,_televis_`
+2. `long_gud_fasin_muha`
+3. `_blong_stet_blong_s`
 
 
 ### Key Findings
 
-- **Best Predictability:** Context-4 (word) with 96.7% predictability
+- **Best Predictability:** Context-4 (word) with 96.8% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (38,018 contexts)
+- **Memory Trade-off:** Larger contexts require more storage (38,112 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 
 ---
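The word-based samples above come from an order-k chain: keep the last k words, look up the observed transitions, and sample the next word proportionally to its count. A minimal sketch, assuming a hypothetical parquet schema with `context`, `next`, and `count` columns (verify against the shipped files):

```python
import random
from collections import defaultdict

import pandas as pd

df = pd.read_parquet("models/word_markov/bi_markov_ctx2_word.parquet")

# context -> [(next_word, count), ...]; assumes contexts are space-joined strings
table = defaultdict(list)
for ctx, nxt, cnt in df[["context", "next", "count"]].itertuples(index=False):
    table[ctx].append((nxt, cnt))

def generate(seed: list[str], max_len: int = 18) -> str:
    out = list(seed)
    while len(out) < max_len:
        options = table.get(" ".join(out[-2:]))
        if not options:
            break  # unseen context: stop, since no smoothing is stored
        words, weights = zip(*options)
        out.append(random.choices(words, weights=weights)[0])
    return " ".join(out)

print(generate(["stet", "blong"]))
```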
@@ -306,48 +338,48 @@ Below are text samples generated from each subword-based Markov chain model:
 
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 3,117 |
-| Total Tokens | 48,872 |
-| Mean Frequency | 15.68 |
+| Vocabulary Size | 3,106 |
+| Total Tokens | 48,839 |
+| Mean Frequency | 15.72 |
 | Median Frequency | 3 |
-| Frequency Std Dev | 124.49 |
+| Frequency Std Dev | 125.16 |
 
 ### Most Common Words
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | blong | 5,014 |
-| 2 | i | 3,182 |
-| 3 | long | 2,146 |
-| 4 | mo | 1,031 |
-| 5 | hem | 1,008 |
-| 6 | ol | 886 |
-| 7 | wan | 875 |
-| 8 | stet | 840 |
-| 9 | amerika | 673 |
-| 10 | em | 660 |
+| 1 | blong | 5,030 |
+| 2 | i | 3,201 |
+| 3 | long | 2,145 |
+| 4 | mo | 1,056 |
+| 5 | hem | 1,010 |
+| 6 | ol | 899 |
+| 7 | wan | 870 |
+| 8 | stet | 842 |
+| 9 | amerika | 672 |
+| 10 | em | 654 |
 
 ### Least Common Words (from vocabulary)
 
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | lotta | 2 |
-| 2 | continua | 2 |
-| 3 | ekshumesen | 2 |
-| 4 | suspension | 2 |
-| 5 | fulwan | 2 |
-| 6 | konfirm | 2 |
-| 7 | trial | 2 |
-| 8 | window | 2 |
-| 9 | piazza | 2 |
-| 10 | fontana | 2 |
+| 1 | ftps | 2 |
+| 2 | sftp | 2 |
+| 3 | operating | 2 |
+| 4 | guide | 2 |
+| 5 | spesifikesen | 2 |
+| 6 | firewall | 2 |
+| 7 | sapot | 2 |
+| 8 | lesin | 2 |
+| 9 | sanem | 2 |
+| 10 | extended | 2 |
 
 ### Zipf's Law Analysis
 
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.0400 |
-| R² (Goodness of Fit) | 0.989215 |
+| Zipf Coefficient | 1.0402 |
+| R² (Goodness of Fit) | 0.989274 |
 | Adherence Quality | **excellent** |
 
 ### Coverage Analysis
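The Zipf rows above come from a log-log fit: regress log frequency on log rank, take the negative slope as the coefficient and R² as the adherence score. A minimal sketch, assuming the vocabulary parquet exposes a `frequency` column:

```python
import numpy as np
import pandas as pd

df = pd.read_parquet("models/vocabulary/bi_vocabulary.parquet")
freq = np.sort(df["frequency"].to_numpy())[::-1]   # sort descending: rank order
rank = np.arange(1, len(freq) + 1)

slope, intercept = np.polyfit(np.log(rank), np.log(freq), 1)
log_f = np.log(freq)
pred = slope * np.log(rank) + intercept
r2 = 1 - np.sum((log_f - pred) ** 2) / np.sum((log_f - log_f.mean()) ** 2)
print(-slope, r2)  # ≈ 1.04 and ≈ 0.989 per the table above
```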
@@ -361,9 +393,9 @@ Below are text samples generated from each subword-based Markov chain model:
 
 ### Key Findings
 
-- **Zipf Compliance:** R²=0.9892 indicates excellent adherence to Zipf's law
+- **Zipf Compliance:** R²=0.9893 indicates excellent adherence to Zipf's law
 - **High Frequency Dominance:** Top 100 words cover 62.1% of corpus
-- **Long Tail:** -6,883 words needed for remaining 100.0% coverage
+- **Long Tail:** -6,894 words needed for remaining 100.0% coverage
 
 ---
 ## 5. Word Embeddings Evaluation
@@ -379,37 +411,40 @@ Below are text samples generated from each subword-based Markov chain model:
 
 ### 5.1 Cross-Lingual Alignment
 
-> *Note: Multilingual alignment visualization not available for this language.*
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
 
 
 ### 5.2 Model Comparison
 
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
-| **mono_32d** | 32 | 0.0388 🏆 | 0.6777 | N/A | N/A |
-| **mono_64d** | 64 | 0.0097 | 0.6676 | N/A | N/A |
-| **mono_128d** | 128 | 0.0021 | 0.6720 | N/A | N/A |
+| **mono_32d** | 32 | 0.0691 🏆 | 0.6642 | N/A | N/A |
+| **mono_64d** | 64 | 0.0097 | 0.6595 | N/A | N/A |
+| **mono_128d** | 128 | 0.0022 | 0.6755 | N/A | N/A |
+| **aligned_32d** | 32 | 0.0691 | 0.6741 | 0.0060 | 0.0420 |
+| **aligned_64d** | 64 | 0.0097 | 0.6519 | 0.0080 | 0.0860 |
+| **aligned_128d** | 128 | 0.0022 | 0.6801 | 0.0200 | 0.0920 |
 
 ### Key Findings
 
-- **Best Isotropy:** mono_32d with 0.0388 (more uniform distribution)
-- **Semantic Density:** Average pairwise similarity of 0.6724. Lower values indicate better semantic separation.
-- **Alignment Quality:** No aligned models evaluated in this run.
+- **Best Isotropy:** mono_32d with 0.0691 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.6675. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 2.0% R@1 in cross-lingual retrieval.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 
 ---
 ## 6. Morphological Analysis (Experimental)
 
-> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
-
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 
 ### 6.1 Productivity & Complexity
 
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
-| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
-| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **0.564** | High formulaic/idiomatic content | - |
 
 ### 6.2 Affix Inventory (Productive Units)
 
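On the Semantic Density column in §5.2 above: it is described as average pairwise similarity, which can be estimated by sampling word vectors and averaging cosine similarity over distinct pairs. A sketch of that estimator only; how the `.bin` embedding files are laid out is not documented in this commit, so loading is left abstract:

```python
import numpy as np

def semantic_density(vectors: np.ndarray, sample: int = 1000, seed: int = 0) -> float:
    """Mean cosine similarity over distinct pairs of sampled embedding rows."""
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(vectors), size=min(sample, len(vectors)), replace=False)
    v = vectors[idx]
    v = v / np.linalg.norm(v, axis=1, keepdims=True)   # unit-normalise rows
    sims = v @ v.T
    upper = np.triu_indices_from(sims, k=1)            # exclude self-similarity
    return float(sims[upper].mean())
```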
@@ -422,9 +457,9 @@ These are the most productive prefixes and suffixes identified by sampling the v
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
-| `-en` | ren, disisen, citizen |
-| `-an` | givhan, kirgistan, wan |
-| `-em` | shoem, wokem, blem |
+| `-en` | warren, truiden, paten |
+| `-em` | katem, raonem, sanem |
+| `-an` | ejukesan, busan, giaman |
 
 ### 6.3 Bound Stems (Lexical Roots)
 
@@ -432,7 +467,7 @@ Bound stems are high-frequency subword units that are semantically cohesive but
 
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
-| `amba` | 1.38x | 8 contexts | ambae, namba, bambae |
+| `amba` | 1.40x | 8 contexts | ambae, namba, stamba |
 
 ### 6.4 Affix Compatibility (Co-occurrence)
 
@@ -450,23 +485,25 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
 | republican | **`republic-an`** | 4.5 | `republic` |
 | andastanem | **`andast-an-em`** | 3.0 | `andast` |
 | niutesteman | **`niutest-em-an`** | 3.0 | `niutest` |
-| kirgistan | **`kirgist-an`** | 1.5 | `kirgist` |
-| valencian | **`valenci-an`** | 1.5 | `valenci` |
-| singaotem | **`singaot-em`** | 1.5 | `singaot` |
-| defdefren | **`defdefr-en`** | 1.5 | `defdefr` |
-| melanesian | **`melanesi-an`** | 1.5 | `melanesi` |
-| konstitusen | **`konstitus-en`** | 1.5 | `konstitus` |
-| komposisen | **`komposis-en`** | 1.5 | `komposis` |
-| smithsonian | **`smithsoni-an`** | 1.5 | `smithsoni` |
-| kompitisen | **`kompitis-en`** | 1.5 | `kompitis` |
-| bisnesman | **`bisnesm-an`** | 1.5 | `bisnesm` |
-| protestan | **`protest-an`** | 1.5 | `protest` |
+| komunikesen | **`komunikes-en`** | 1.5 | `komunikes` |
+| oganaesesen | **`oganaeses-en`** | 1.5 | `oganaeses` |
+| sustreksen | **`sustreks-en`** | 1.5 | `sustreks` |
+| vaespresiden | **`vaespresid-en`** | 1.5 | `vaespresid` |
+| populesen | **`popules-en`** | 1.5 | `popules` |
 | ekshumesen | **`ekshumes-en`** | 1.5 | `ekshumes` |
+| komposisen | **`komposis-en`** | 1.5 | `komposis` |
+| konstitusen | **`konstitus-en`** | 1.5 | `konstitus` |
+| sébastien | **`sébasti-en`** | 1.5 | `sébasti` |
+| austronesian | **`austronesi-an`** | 1.5 | `austronesi` |
+| divelopem | **`divelop-em`** | 1.5 | `divelop` |
+| christian | **`christi-an`** | 1.5 | `christi` |
 
 ### 6.6 Linguistic Interpretation
 
 > **Automated Insight:**
-The language BI appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
+The language Bislama shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
+
+> **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
 
 ---
 ## 7. Summary & Recommendations
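The segmentations above lean on the §6.2 suffix inventory (`-en`, `-em`, `-an`). The Recursive Hierarchical Substitutability scorer itself is not published in this repository, so the following is a toy illustration only, reproducing the surface pattern of the table by greedily peeling one productive suffix:

```python
SUFFIXES = ("en", "em", "an")  # productive suffixes from section 6.2

def segment(word: str) -> str:
    """Toy segmenter: split off one known suffix if a plausible stem remains."""
    for suffix in SUFFIXES:
        if word.endswith(suffix) and len(word) > len(suffix) + 2:
            return f"{word[:-len(suffix)]}-{suffix}"
    return word

print(segment("populesen"), segment("divelopem"))  # popules-en divelop-em
```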
@@ -478,8 +515,8 @@ The language BI appears to be more isolating or has a highly fixed vocabulary. W
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
 | Tokenizer | **16k BPE** | Best compression (4.44x) |
-| N-gram | **2-gram** | Lowest perplexity (209) |
-| Markov | **Context-4** | Highest predictability (96.7%) |
+| N-gram | **2-gram** | Lowest perplexity (208) |
+| Markov | **Context-4** | Highest predictability (96.8%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
 
 
@@ -693,4 +730,4 @@ MIT License - Free for academic and commercial use.
 ---
 *Generated by Wikilangs Models Pipeline*
 
-*Report Date: 2026-01-03 07:17:54*
+*Report Date: 2026-01-03 18:57:38*
models/embeddings/aligned/bi_128d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b04089d63affbd01946ea22712a42b9270a9e9c1e3642b2c4557f09e64125f45
+size 1025093310
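The three lines above are a Git LFS pointer, not the embedding matrix itself: the repository stores the blob's SHA-256 (`oid`) and byte `size`, and LFS substitutes the real ~1 GB file at checkout. A minimal reader for this pointer format, as it appears in the diff:

```python
from pathlib import Path

def read_lfs_pointer(path: str) -> dict[str, str]:
    """Parse a 'key value' pointer file per the git-lfs spec line format."""
    fields = {}
    for line in Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

ptr = read_lfs_pointer("models/embeddings/aligned/bi_128d.bin")
print(ptr["oid"], int(ptr["size"]))  # sha256:b040... 1025093310
```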
models/embeddings/aligned/bi_128d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "bi", "dim": 128, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/bi_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a0187132ed142b2493533fcad026c5d7be9bcd7cd03af41e20ed204911939b9
+size 65664
models/embeddings/aligned/bi_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "bi",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 565,
+  "vocab_size": 1052
+}
models/embeddings/aligned/bi_32d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6a1b8bc649eeb41d1b7c45409caed59f6f1e6fc8c3a618675db998527999e92d
+size 256285374
models/embeddings/aligned/bi_32d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "bi", "dim": 32, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/bi_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af2e9ecddbabbf795c354094a776879f0e3b6edcdaff3d42ad65641f2bf82a11
+size 4224
models/embeddings/aligned/bi_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "bi",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 565,
+  "vocab_size": 1052
+}
models/embeddings/aligned/bi_64d.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfe3f902fbea272ab483848ef0bfa3a8f447352ec83746e7036947e7d16daefc
+size 512554686
models/embeddings/aligned/bi_64d.meta.json ADDED
@@ -0,0 +1 @@
+{"lang": "bi", "dim": 64, "max_seq_len": 512, "is_aligned": true}
models/embeddings/aligned/bi_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0fbe174e9560c7369b8587d1eec8c4597aa4afff653cdea5e66616f728f25bc
+size 16512
models/embeddings/aligned/bi_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+{
+  "language": "bi",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 565,
+  "vocab_size": 1052
+}
models/embeddings/monolingual/bi_128d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5ba043557491d9bd866f4880ed63951e9cedd258275bb5ab99cc6a4588a48418
-size 1025087100
+oid sha256:b04089d63affbd01946ea22712a42b9270a9e9c1e3642b2c4557f09e64125f45
+size 1025093310
models/embeddings/monolingual/bi_128d_metadata.json CHANGED
@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 128
   },
-  "vocab_size": 1046
+  "vocab_size": 1052
 }
models/embeddings/monolingual/bi_32d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:77f00c5cf1334339a3726d39a681bacc6c8c5b7d226141cb5bd2ec7d3d93731c
-size 256283772
+oid sha256:6a1b8bc649eeb41d1b7c45409caed59f6f1e6fc8c3a618675db998527999e92d
+size 256285374
models/embeddings/monolingual/bi_32d_metadata.json CHANGED
@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 32
   },
-  "vocab_size": 1046
+  "vocab_size": 1052
 }
models/embeddings/monolingual/bi_64d.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:64be2d287f8ece0f9696fe2372c048f2ea1c8dc331d051d642a50efe1563bc8b
-size 512551548
+oid sha256:bfe3f902fbea272ab483848ef0bfa3a8f447352ec83746e7036947e7d16daefc
+size 512554686
models/embeddings/monolingual/bi_64d_metadata.json CHANGED
@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 64
  },
-  "vocab_size": 1046
+  "vocab_size": 1052
 }
models/subword_markov/bi_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2e9372210a853979f0e4b07d4830a256cffa139a1fece177ff9c6038eed35df5
-size 22700
+oid sha256:e8524f1a9c10520cc90998cb812247a37e806c4685ba4e3d908cfe064d56fe25
+size 22675
models/subword_markov/bi_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 364,
-  "total_transitions": 308852
+  "unique_contexts": 362,
+  "total_transitions": 308808
 }
models/subword_markov/bi_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6ca6b0d701fee7db455da7e30950385f2806f14cec4f336ce82c4779e09621b4
-size 91544
+oid sha256:34494d2c3fd7277869199119468c66a51dacc18e713456fcb18965e6b21e05a3
+size 89295
models/subword_markov/bi_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 2361,
-  "total_transitions": 307409
+  "unique_contexts": 2350,
+  "total_transitions": 307368
 }
models/subword_markov/bi_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:68a8c133f845c55867a0fe53a9d2c945b085e56ce9e1e895558840e0a31ab900
-size 279694
+oid sha256:57d3e82e14b72150fc4f4cc52d7d27341f4386a1f1c03172174f9aed47b3cd22
+size 274739
models/subword_markov/bi_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 12016,
-  "total_transitions": 305966
+  "unique_contexts": 12029,
+  "total_transitions": 305928
 }
models/subword_markov/bi_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6e769ec2bcec378d6b983577c7020c20977d3c068c15135f0bc9e18b6c7e264f
-size 620331
+oid sha256:24f23097b8ef424ce0be18fd51ceab87432f9a9ea51a0dc91ba21933142679f3
+size 614238
models/subword_markov/bi_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "bi",
-  "unique_contexts": 38018,
-  "total_transitions": 304523
+  "unique_contexts": 38112,
+  "total_transitions": 304488
 }
models/subword_ngram/bi_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4527bcf666ffd2dd1c578cfc5840d02a71199e369b0c8a627659726d39e176f6
-size 14130
+oid sha256:e4829e48479c7969991243f36dfea6f4e6d74ea2f630ea2043c13d2ca2c89511
+size 13975
models/subword_ngram/bi_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "bi",
-  "unique_ngrams": 983,
-  "total_ngrams": 308852
+  "unique_ngrams": 976,
+  "total_ngrams": 308808
 }
models/subword_ngram/bi_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b80fdffb1d982a5003ad057c3864a9f483b951ec85c6662f9c20d596201a26ff
-size 63627
+oid sha256:9f5fd82b92dfc6d4356afce05270d72ac4e39277a4cfecdc7ed558590eaed9ee
+size 63337
models/subword_ngram/bi_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "bi",
-  "unique_ngrams": 5848,
-  "total_ngrams": 307409
+  "unique_ngrams": 5825,
+  "total_ngrams": 307368
 }
models/subword_ngram/bi_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4ebfe1944f1ab6833eb266e7debe2291ba5a0381a209e1ff6ac58781421a2c2
-size 221014
+oid sha256:20e7ff332ff6bbddf062a55c5f6828a679b47b4c79847b4ba098c8a3eccea847
+size 219083
models/subword_ngram/bi_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "bi",
-  "unique_ngrams": 19225,
-  "total_ngrams": 305966
+  "unique_ngrams": 19179,
+  "total_ngrams": 305928
 }
models/subword_ngram/bi_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de55d59bf119f4b44768dc3b459ec322e227c6b8d17fb1a1a538913bae9a97fe
+size 319412
models/subword_ngram/bi_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+{
+  "n": 5,
+  "variant": "subword",
+  "language": "bi",
+  "unique_ngrams": 26363,
+  "total_ngrams": 304488
+}
models/tokenizer/bi_tokenizer_16k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cb93c36553e5bad9dc7098178cff3fc9a58ec9892899f622abc48bd993de075f
-size 506618
+oid sha256:e828f5e2c1b1fc98dac5a27a95c31b59be52a8402143d113689a11719d8ace9a
+size 506724
models/tokenizer/bi_tokenizer_16k.vocab CHANGED
The diff for this file is too large to render.
models/tokenizer/bi_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6a4efeb4ad5e3a8b678a5c79bd0331f01122c9bb189bbfd294c1172390d5a05c
-size 370520
+oid sha256:28f725c6cb47e38ae99fa39137b89f2abb928cc14a1ee5d46ef16a3017e64106
+size 370903
models/tokenizer/bi_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render.
models/vocabulary/bi_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:87f70e9b39cbf89b9a192034747f166c231ea9bf458bb2112e79e97ae15e247a
-size 50916
+oid sha256:33e06d16f52a22d02ec5ebd57923482baa956d867f4a4e6664539ddda180960e
+size 50487
models/vocabulary/bi_vocabulary_metadata.json CHANGED
@@ -1,16 +1,16 @@
 {
   "language": "bi",
-  "vocabulary_size": 3117,
+  "vocabulary_size": 3106,
   "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.15550018456995202,
+    "type_token_ratio": 0.15657562289065527,
     "coverage": {
-      "top_100": 0.5605020302694721,
-      "top_1000": 0.7978589885566629,
-      "top_5000": 0.9367847914359543
+      "top_100": 0.5595227117643804,
+      "top_1000": 0.7971893845784999,
+      "top_5000": 0.9356361691533113
     },
-    "hapax_count": 5308,
-    "hapax_ratio": 0.6300296735905044,
-    "total_documents": 1443
+    "hapax_count": 5384,
+    "hapax_ratio": 0.6341578327444052,
+    "total_documents": 1440
   }
 }
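The statistics in this metadata file are standard corpus measures. A minimal sketch of how they are defined; the token stream here is a stand-in, and note the shipped `hapax_count` exceeds `vocabulary_size`, which suggests hapaxes are counted before the vocabulary is pruned to frequency ≥ 2:

```python
from collections import Counter

def vocab_stats(tokens: list[str]) -> dict[str, float]:
    counts = Counter(tokens)
    hapax = sum(1 for c in counts.values() if c == 1)  # words seen exactly once
    return {
        "type_token_ratio": len(counts) / len(tokens),
        "hapax_ratio": hapax / len(counts),
    }

print(vocab_stats("hem i stap long hem i go long taon".split()))
```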
models/word_markov/bi_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5bcf4783672108d39279e5b6f88455605e79e8765b7ffe3f48acaac9ccda1c56
-size 230383
+oid sha256:ecb13023f55478121529681449b7ba5a4ee949d8ecb72247965a859fcffcc5e2
+size 231420
models/word_markov/bi_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 8338,
-  "total_transitions": 52737
+  "unique_contexts": 8408,
+  "total_transitions": 52783
 }
models/word_markov/bi_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c04695f69e6a2692e750e9bff816affd14058dc6384bd028ad4dea65463972ae
-size 440900
+oid sha256:3a773d109fde8766031a00b61fe2c016e512310440b2793f45f0c3b3ccf3d501
+size 441478
models/word_markov/bi_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 24957,
-  "total_transitions": 51294
+  "unique_contexts": 25020,
+  "total_transitions": 51343
 }
models/word_markov/bi_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1acfd63f57a3dbfe43f8279cf4b22712ba4b1311bd026f0bab886fd886ead857
-size 595208
+oid sha256:fca25f8469c5430353b0b9ba3c70930045f3bc7297b5e8d5418d50fa61aa1094
+size 596165
models/word_markov/bi_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 34724,
-  "total_transitions": 49851
+  "unique_contexts": 34806,
+  "total_transitions": 49903
 }
models/word_markov/bi_markov_ctx4_word.parquet CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:19edc02c0845063d5a953f889ccfbfd26a2b1ec828f00bb70ee81b9c16697c22
-size 659846
+oid sha256:c6503dd4b9dff191c13926ba12557bf311d6b5db8c6f259a39a282c2b106887a
+size 663890
models/word_markov/bi_markov_ctx4_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "word",
   "language": "bi",
-  "unique_contexts": 38736,
-  "total_transitions": 48408
+  "unique_contexts": 38812,
+  "total_transitions": 48463
 }