omarkamali committed
Commit 6afc7d3 · verified · 1 parent: e32d67b

Upload all models and assets for cr (latest)

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.

Files changed (50)
  1. .gitattributes +1 -0
  2. README.md +131 -96
  3. models/embeddings/aligned/cr_128d.bin +3 -0
  4. models/embeddings/aligned/cr_128d.meta.json +1 -0
  5. models/embeddings/aligned/cr_128d.projection.npy +3 -0
  6. models/embeddings/aligned/cr_128d_metadata.json +8 -0
  7. models/embeddings/aligned/cr_32d.bin +3 -0
  8. models/embeddings/aligned/cr_32d.meta.json +1 -0
  9. models/embeddings/aligned/cr_32d.projection.npy +3 -0
  10. models/embeddings/aligned/cr_32d_metadata.json +8 -0
  11. models/embeddings/aligned/cr_64d.bin +3 -0
  12. models/embeddings/aligned/cr_64d.meta.json +1 -0
  13. models/embeddings/aligned/cr_64d.projection.npy +3 -0
  14. models/embeddings/aligned/cr_64d_metadata.json +8 -0
  15. models/embeddings/monolingual/cr_128d.bin +1 -1
  16. models/embeddings/monolingual/cr_32d.bin +1 -1
  17. models/embeddings/monolingual/cr_64d.bin +1 -1
  18. models/subword_markov/cr_markov_ctx1_subword.parquet +2 -2
  19. models/subword_markov/cr_markov_ctx1_subword_metadata.json +2 -2
  20. models/subword_markov/cr_markov_ctx2_subword.parquet +2 -2
  21. models/subword_markov/cr_markov_ctx2_subword_metadata.json +2 -2
  22. models/subword_markov/cr_markov_ctx3_subword.parquet +2 -2
  23. models/subword_markov/cr_markov_ctx3_subword_metadata.json +2 -2
  24. models/subword_markov/cr_markov_ctx4_subword.parquet +2 -2
  25. models/subword_markov/cr_markov_ctx4_subword_metadata.json +2 -2
  26. models/subword_ngram/cr_2gram_subword.parquet +2 -2
  27. models/subword_ngram/cr_2gram_subword_metadata.json +2 -2
  28. models/subword_ngram/cr_3gram_subword.parquet +2 -2
  29. models/subword_ngram/cr_3gram_subword_metadata.json +2 -2
  30. models/subword_ngram/cr_4gram_subword.parquet +2 -2
  31. models/subword_ngram/cr_4gram_subword_metadata.json +2 -2
  32. models/subword_ngram/cr_5gram_subword.parquet +3 -0
  33. models/subword_ngram/cr_5gram_subword_metadata.json +7 -0
  34. models/tokenizer/cr_tokenizer_8k.model +2 -2
  35. models/tokenizer/cr_tokenizer_8k.vocab +0 -0
  36. models/vocabulary/cr_vocabulary.parquet +2 -2
  37. models/vocabulary/cr_vocabulary_metadata.json +6 -6
  38. models/word_markov/cr_markov_ctx1_word.parquet +2 -2
  39. models/word_markov/cr_markov_ctx1_word_metadata.json +2 -2
  40. models/word_markov/cr_markov_ctx2_word.parquet +2 -2
  41. models/word_markov/cr_markov_ctx2_word_metadata.json +2 -2
  42. models/word_markov/cr_markov_ctx3_word.parquet +2 -2
  43. models/word_markov/cr_markov_ctx3_word_metadata.json +2 -2
  44. models/word_markov/cr_markov_ctx4_word.parquet +2 -2
  45. models/word_markov/cr_markov_ctx4_word_metadata.json +2 -2
  46. models/word_ngram/cr_2gram_word_metadata.json +1 -1
  47. models/word_ngram/cr_3gram_word_metadata.json +1 -1
  48. models/word_ngram/cr_4gram_word.parquet +2 -2
  49. models/word_ngram/cr_4gram_word_metadata.json +2 -2
  50. models/word_ngram/cr_5gram_word.parquet +3 -0
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  visualizations/embedding_similarity.png filter=lfs diff=lfs merge=lfs -text
  visualizations/performance_dashboard.png filter=lfs diff=lfs merge=lfs -text
  visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
+ visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  language: cr
3
- language_name: CR
4
  language_family: american_algonquian
5
  tags:
6
  - wikilangs
@@ -10,11 +10,21 @@ tags:
10
  - n-gram
11
  - markov
12
  - wikipedia
13
  - monolingual
14
  - family-american_algonquian
15
  license: mit
16
  library_name: wikilangs
17
- pipeline_tag: feature-extraction
18
  datasets:
19
  - omarkamali/wikipedia-monthly
20
  dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
23
  metrics:
24
  - name: best_compression_ratio
25
  type: compression
26
- value: 3.182
27
  - name: best_isotropy
28
  type: isotropy
29
- value: 0.0381
30
  - name: vocabulary_size
31
  type: vocab
32
  value: 0
33
  generated: 2026-01-03
34
  ---
35
 
36
- # CR - Wikilangs Models
37
  ## Comprehensive Research Report & Full Ablation Study
38
 
39
- This repository contains NLP models trained and evaluated by Wikilangs, specifically on **CR** Wikipedia data.
40
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
41
 
42
  ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
60
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
61
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
62
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
63
- - [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
64
  - [7. Summary & Recommendations](#7-summary--recommendations)
65
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
66
  - [Visualizations Index](#visualizations-index)
@@ -80,35 +90,35 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
80
 
81
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
82
  |------------|-------------|---------------|----------|--------------|
83
- | **8k** | 3.182x 🏆 | 3.19 | 2.9567% | 6,629 |
84
 
85
  ### Tokenization Examples
86
 
87
  Below are sample sentences tokenized with each vocabulary size:
88
 
89
- **Sample 1:** `ᐊᓐ ᐊᒋᐦᑖᓱᓐ ᐯᔭᒄ ᐃᔑᓂᐦᑳᑌᒡ, ᐋᐸᑎᓐ ᐃᑣᓅᐦᒡ ᐯᔭᒄ ᒉᒀᓐ ᒫᒃ ᐊᐌᓐ᙮ ᐊᓐ ᒫᒃ ᐊᒋᐦᑖᓱᓐ ᐯᔭᒄ, ᐁᐅᑯᓐ ᓃ...`
90
 
91
  | Vocab | Tokens | Count |
92
  |-------|--------|-------|
93
- | 8k | `▁ᐊᓐ ▁ᐊᒋᐦᑖᓱᓐ ▁ᐯᔭᒄ ▁ᑲ ▁ᐃᔑᓂᐦᑳᑌᒡ , ▁ᐋᐸᑎᓐ ▁ᒉ ▁ᒌ ▁ᐃᑣᓅᐦᒡ ... (+19 more)` | 29 |
94
 
95
- **Sample 2:** `ᓀᐦᐃᔭᐁᐧᐃᐧᐣ ᑕᐣᓯᐃᓯᐲᑭᐢᑫᐧᕁ ᓵᓴᕀ ᐳᓂ ᐱᑭᐢᑫᐧᐃᐧᐣ ᐱᐦᒑᔨᕁ ᑳᓇᑕ. ᓵᓴᕀ ᐳᓂ ᐱᑭᐢᑫᐧᐃᐧᐣ ᓇᐊᐧᐨ ᐳᑯ ᒌᑳᐦᑕ...`
96
 
97
  | Vocab | Tokens | Count |
98
  |-------|--------|-------|
99
- | 8k | `▁ᓀᐦᐃᔭᐁᐧᐃᐧᐣ ▁ᑕᐣᓯ ▁ᑲ ▁ᐃᓯᐲᑭᐢᑫᐧᕁ ▁ᓵᓴᕀ ▁ᐳᓂ ▁ᐱᑭᐢᑫᐧᐃᐧᐣ ▁ᐱᐦᒑᔨᕁ ▁ᑳᓇᑕ . ... (+11 more)` | 21 |
100
 
101
- **Sample 3:** `ᒨᔅ, Muus, Mush ( ; ) n.a. ᐊᐧᐁᓰᔅ ᐆ᙮ ᒨᔅ ᒥᐦᒑᐱᔅᒋᓲ᙮ ᓂᒥᑕᐦᐊᒻ ᑲᔦᐦ᙮ ᐸᐹᒦᒋᓲ᙮ ᒨᒥᓀᐤ᙮ ᒨᔅ ᒦᒎ ᓂᐦ...`
102
 
103
  | Vocab | Tokens | Count |
104
  |-------|--------|-------|
105
- | 8k | `▁ᒨᔅ , muus , ▁mush( ▁; ▁) ▁n . ... (+17 more)` | 27 |
106
 
107
 
108
  ### Key Findings
109
 
110
- - **Best Compression:** 8k achieves 3.182x compression
111
- - **Lowest UNK Rate:** 8k with 2.9567% unknown tokens
112
  - **Trade-off:** Larger vocabularies improve compression but increase model size
113
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
114
 
@@ -126,11 +136,13 @@ Below are sample sentences tokenized with each vocabulary size:
126
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
127
  |--------|---------|------------|---------|----------------|------------------|-------------------|
128
  | **2-gram** | Word | 16 | 4.04 | 17 | 100.0% | 100.0% |
129
- | **2-gram** | Subword | 492 | 8.94 | 848 | 48.2% | 100.0% |
130
  | **3-gram** | Word | 15 🏆 | 3.88 | 16 | 100.0% | 100.0% |
131
- | **3-gram** | Subword | 1,528 | 10.58 | 1,986 | 19.4% | 75.4% |
132
- | **4-gram** | Word | 163 | 7.35 | 166 | 62.1% | 100.0% |
133
- | **4-gram** | Subword | 3,131 | 11.61 | 3,878 | 11.9% | 50.9% |
 
 
134
 
135
  ### Top 5 N-grams by Size
136
 
@@ -162,23 +174,33 @@ Below are sample sentences tokenized with each vocabulary size:
162
  | 2 | `in standard roman orthography` | 5 |
163
  | 3 | `written in standard roman` | 5 |
164
  | 4 | `ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ` | 4 |
165
- | 5 | `of articles some articles` | 3 |
166
 
167
  **2-grams (Subword):**
168
 
169
  | Rank | N-gram | Count |
170
  |------|--------|-------|
171
- | 1 | `i n` | 215 |
172
- | 2 | `, _` | 213 |
173
- | 3 | `_ ᐊ` | 179 |
174
- | 4 | `i k` | 168 |
175
- | 5 | `n _` | 165 |
176
 
177
  **3-grams (Subword):**
178
 
179
  | Rank | N-gram | Count |
180
  |------|--------|-------|
181
- | 1 | `i n _` | 61 |
182
  | 2 | `a n i` | 49 |
183
  | 3 | `w i n` | 48 |
184
  | 4 | `_ k i` | 47 |
@@ -190,16 +212,26 @@ Below are sample sentences tokenized with each vocabulary size:
190
  |------|--------|-------|
191
  | 1 | `w a k _` | 33 |
192
  | 2 | `w i n _` | 27 |
193
- | 3 | `t i o n` | 24 |
194
- | 4 | `k a n i` | 23 |
195
- | 5 | `i k a n` | 22 |
196
 
197
 
198
  ### Key Findings
199
 
200
  - **Best Perplexity:** 3-gram (word) with 15
201
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
202
- - **Coverage:** Top-1000 patterns cover ~51% of corpus
203
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
204
 
205
  ---
@@ -215,14 +247,14 @@ Below are sample sentences tokenized with each vocabulary size:
215
 
216
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
217
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
218
- | **1** | Word | 0.2827 | 1.216 | 1.47 | 1,787 | 71.7% |
219
- | **1** | Subword | 1.9100 | 3.758 | 10.53 | 273 | 0.0% |
220
- | **2** | Word | 0.0424 | 1.030 | 1.05 | 2,607 | 95.8% |
221
- | **2** | Subword | 0.6919 | 1.615 | 2.63 | 2,872 | 30.8% |
222
- | **3** | Word | 0.0178 | 1.012 | 1.02 | 2,724 | 98.2% |
223
- | **3** | Subword | 0.3559 | 1.280 | 1.57 | 7,557 | 64.4% |
224
- | **4** | Word | 0.0086 🏆 | 1.006 | 1.01 | 2,765 | 99.1% |
225
- | **4** | Subword | 0.1591 | 1.117 | 1.21 | 11,842 | 84.1% |
226
 
227
  ### Generated Text Samples (Word-based)
228
 
@@ -230,27 +262,27 @@ Below are text samples generated from each word-based Markov chain model:
230
 
231
  **Context Size 1:**
232
 
233
- 1. `ᐁ ᐊᐧᐃᐢᑮᐦᐃᑲᐣ ᐃᐧᐊ ᐁᔥᐃᐦᑕᒧᐃᐧᐣ ᐋᐸᐦᐄᔥᑌᒡ english and montana some articles in iyuw iyimuun natuashish dia...`
234
- 2. `e kašcihot e wîcit e iskwewit mâk atimwa wes namawîy nataweyihtam cecî cisceyihtâkwaniyic ekw wenâpe...`
235
- 3. `of nonkilling channel on l nehirâmowin qc r s t u v w ᐎ ᐒ`
236
 
237
  **Context Size 2:**
238
 
239
- 1. `some articles in ininiwi išikišwēwin eastern dialect la romaine mingan natashquan pakuashipi and she...`
240
- 2. `articles in lehlueun western dialect list of articles some articles in nīhithawīwin list of articles...`
241
- 3. `ēkwa mīna otaskānitik e ka naskahtamēw nikiskihcēta anihi tahki itēhk pēhtahkik tānpahtiwin ē mic...`
242
 
243
  **Context Size 3:**
244
 
245
- 1. `some articles in nēhiyawēwin âpihtâkosisânak isiwepahki maskisin ᐸᐦᑵᓯᑲᐣ pimîhkân tipahikan itasin...`
246
- 2. `list of articles wikipedias in other native american languages atikamekw avañe aymar choctaw ꮳꮃꭹ c...`
247
- 3. `dialect list of articles some articles in ililîmowin list of articles ᐃᓕᓖᒧᐎᓐ ililîmowin ililîmowin p...`
248
 
249
  **Context Size 4:**
250
 
251
- 1. `dialect list of articles some articles in iyuw iyimuun kawawachikamach dialect list of articles nīhi...`
252
  2. `written in standard roman orthography`
253
- 3. `ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᐋᐱᐦᑖᒌᔑᑳᐤ ᐋᐱᐦᑖᑎᐱᔅᑳᐤ 1 05 ᐯᔭᒄ ᑎᐸᐦᐄᑲᓐ ᒦᓐ ᓂᔮᔪ ᒥᓂᑯᔥ ᓂᔮᔪ ᒥᓂᑯᔥ ᒥᔮ...`
254
 
255
 
256
  ### Generated Text Samples (Subword-based)
@@ -259,34 +291,34 @@ Below are text samples generated from each subword-based Markov chain model:
259
 
260
  **Context Size 1:**
261
 
262
- 1. `_ᒉᒀᓐᓂᓕᐅᕝᕙᓪᓗ_ᐃᓐᓂᓂ`
263
- 2. `ik;_ᑲᐤ_(_ᑕᐦᐁᐧᔭᐍᑎ`
264
- 3. `am_ᐁr_ē-nata_ost`
265
 
266
  **Context Size 2:**
267
 
268
- 1. `inēhiyiy-âyot_ayi`
269
- 2. `,_ᐱᔪᓐᓇᖅ_ᖂᑉ_ᒪᓕᒋᐊᓕᖕ`
270
- 3. `_ᐊᑕᐦᑐᒥᒃ_ᑐᒃᓯᓪᓗᓂ_ᐊᓯ`
271
 
272
  **Context Size 3:**
273
 
274
- 1. `in_nešt_mâk_ekwa_a`
275
- 2. `anininisiniw._pask`
276
- 3. `win_okiskān_tipēna`
277
 
278
  **Context Size 4:**
279
 
280
- 1. `wak_tāpihikan_ᐆᒪ_ᐊᐢ`
281
- 2. `win_(statistics_(10`
282
- 3. `tion_métis_federati`
283
 
284
 
285
  ### Key Findings
286
 
287
  - **Best Predictability:** Context-4 (word) with 99.1% predictability
288
  - **Branching Factor:** Decreases with context size (more deterministic)
289
- - **Memory Trade-off:** Larger contexts require more storage (11,842 contexts)
290
  - **Recommendation:** Context-3 or Context-4 for text generation
291
 
292
  ---
@@ -302,9 +334,9 @@ Below are text samples generated from each subword-based Markov chain model:
302
 
303
  | Metric | Value |
304
  |--------|-------|
305
- | Vocabulary Size | 489 |
306
- | Total Tokens | 1,731 |
307
- | Mean Frequency | 3.54 |
308
  | Median Frequency | 2 |
309
  | Frequency Std Dev | 3.40 |
310
 
@@ -312,11 +344,11 @@ Below are text samples generated from each subword-based Markov chain model:
312
 
313
  | Rank | Word | Frequency |
314
  |------|------|-----------|
315
- | 1 | ᐁ | 34 |
316
  | 2 | e | 30 |
317
  | 3 | and | 22 |
318
- | 4 | in | 22 |
319
- | 5 | of | 22 |
320
  | 6 | pîsim | 19 |
321
  | 7 | articles | 18 |
322
  | 8 | cree | 16 |
@@ -327,39 +359,39 @@ Below are text samples generated from each subword-based Markov chain model:
327
 
328
  | Rank | Word | Frequency |
329
  |------|------|-----------|
330
- | 1 | ᐸᑦᑕᖕᓂᑦ | 2 |
331
- | 2 | ordinateur | 2 |
332
- | 3 | demandez | 2 |
333
- | 4 | le | 2 |
334
- | 5 | programme | 2 |
335
- | 6 | eurêka | 2 |
336
- | 7 | culture | 2 |
337
- | 8 | 18 | 2 |
338
- | 9 | août | 2 |
339
  | 10 | ᖃᐅᔨᓴᖅᑎᐅᔪᓄᑦ | 2 |
340
 
341
  ### Zipf's Law Analysis
342
 
343
  | Metric | Value |
344
  |--------|-------|
345
- | Zipf Coefficient | 0.5522 |
346
- | R² (Goodness of Fit) | 0.947702 |
347
  | Adherence Quality | **excellent** |
348
 
349
  ### Coverage Analysis
350
 
351
  | Top N Words | Coverage |
352
  |-------------|----------|
353
- | Top 100 | 47.6% |
354
  | Top 1,000 | 0.0% |
355
  | Top 5,000 | 0.0% |
356
  | Top 10,000 | 0.0% |
357
 
358
  ### Key Findings
359
 
360
- - **Zipf Compliance:** R²=0.9477 indicates excellent adherence to Zipf's law
361
- - **High Frequency Dominance:** Top 100 words cover 47.6% of corpus
362
- - **Long Tail:** -9,511 words needed for remaining 100.0% coverage
363
 
364
  ---
365
  ## 5. Word Embeddings Evaluation
@@ -375,37 +407,38 @@ Below are text samples generated from each subword-based Markov chain model:
375
 
376
  ### 5.1 Cross-Lingual Alignment
377
 
378
- > *Note: Multilingual alignment visualization not available for this language.*
379
 
380
 
381
  ### 5.2 Model Comparison
382
 
383
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
384
  |-------|-----------|----------|------------------|---------------|----------------|
385
- | **mono_32d** | 32 | 0.0381 🏆 | 0.0000 | N/A | N/A |
386
- | **mono_64d** | 64 | 0.0033 | 0.0000 | N/A | N/A |
387
  | **mono_128d** | 128 | 0.0000 | 0.0000 | N/A | N/A |
388
 
389
  ### Key Findings
390
 
391
- - **Best Isotropy:** mono_32d with 0.0381 (more uniform distribution)
392
  - **Semantic Density:** Average pairwise similarity of 0.0000. Lower values indicate better semantic separation.
393
- - **Alignment Quality:** No aligned models evaluated in this run.
394
  - **Recommendation:** 128d aligned for best cross-lingual performance
395
 
396
  ---
397
  ## 6. Morphological Analysis (Experimental)
398
 
399
- > ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
400
-
401
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
402
 
403
  ### 6.1 Productivity & Complexity
404
 
405
  | Metric | Value | Interpretation | Recommendation |
406
  |--------|-------|----------------|----------------|
407
- | Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
408
- | Idiomaticity Gap | **-1.000** | Low formulaic content | - |
409
 
410
  ### 6.2 Affix Inventory (Productive Units)
411
 
@@ -438,7 +471,9 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
438
  ### 6.6 Linguistic Interpretation
439
 
440
  > **Automated Insight:**
441
- The language CR appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 
 
442
 
443
  ---
444
  ## 7. Summary & Recommendations
@@ -449,7 +484,7 @@ The language CR appears to be more isolating or has a highly fixed vocabulary. W
449
 
450
  | Component | Recommended | Rationale |
451
  |-----------|-------------|-----------|
452
- | Tokenizer | **8k BPE** | Best compression (3.18x) |
453
  | N-gram | **3-gram** | Lowest perplexity (15) |
454
  | Markov | **Context-4** | Highest predictability (99.1%) |
455
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
@@ -665,4 +700,4 @@ MIT License - Free for academic and commercial use.
665
  ---
666
  *Generated by Wikilangs Models Pipeline*
667
 
668
- *Report Date: 2026-01-03 10:19:03*
 
1
  ---
2
  language: cr
3
+ language_name: Cree
4
  language_family: american_algonquian
5
  tags:
6
  - wikilangs
 
10
  - n-gram
11
  - markov
12
  - wikipedia
13
+ - feature-extraction
14
+ - sentence-similarity
15
+ - tokenization
16
+ - n-grams
17
+ - markov-chain
18
+ - text-mining
19
+ - fasttext
20
+ - babelvec
21
+ - vocabulous
22
+ - vocabulary
23
  - monolingual
24
  - family-american_algonquian
25
  license: mit
26
  library_name: wikilangs
27
+ pipeline_tag: text-generation
28
  datasets:
29
  - omarkamali/wikipedia-monthly
30
  dataset_info:
 
33
  metrics:
34
  - name: best_compression_ratio
35
  type: compression
36
+ value: 3.238
37
  - name: best_isotropy
38
  type: isotropy
39
+ value: 0.0354
40
  - name: vocabulary_size
41
  type: vocab
42
  value: 0
43
  generated: 2026-01-03
44
  ---
45
 
46
+ # Cree - Wikilangs Models
47
  ## Comprehensive Research Report & Full Ablation Study
48
 
49
+ This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Cree** Wikipedia data.
50
  We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
51
 
52
  ## 📋 Repository Contents
 
70
  - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
71
  - [4. Vocabulary Analysis](#4-vocabulary-analysis)
72
  - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
73
+ - [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
74
  - [7. Summary & Recommendations](#7-summary--recommendations)
75
  - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
76
  - [Visualizations Index](#visualizations-index)
 
90
 
91
  | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
92
  |------------|-------------|---------------|----------|--------------|
93
+ | **8k** | 3.238x 🏆 | 3.24 | 2.7764% | 6,267 |
94
 
95
  ### Tokenization Examples
96
 
97
  Below are sample sentences tokenized with each vocabulary size:
98
 
99
+ **Sample 1:** `ᓀᐦᐃᔭᐁᐧᐃᐧᐣ ᑕᐣᓯᐃᓯᐲᑭᐢᑫᐧᕁ ᓵᓴᕀ ᐳᓂ ᐱᑭᐢᑫᐧᐃᐧᐣ ᐱᐦᒑᔨᕁ ᑳᓇᑕ. ᓵᓴᕀ ᐳᓂ ᐱᑭᐢᑫᐧᐃᐧᐣ ᓇᐊᐧᐨ ᐳᑯ ᒌᑳᐦᑕ...`
100
 
101
  | Vocab | Tokens | Count |
102
  |-------|--------|-------|
103
+ | 8k | `▁ᓀᐦᐃᔭᐁᐧᐃᐧᐣ ▁ᑕᐣᓯ ▁ᑲ ▁ᐃᓯᐲᑭᐢᑫᐧᕁ ▁ᓵᓴᕀ ▁ᐳᓂ ▁ᐱᑭᐢᑫᐧᐃᐧᐣ ▁ᐱᐦᒑᔨᕁ ▁ᑳᓇᑕ . ... (+11 more)` | 21 |
104
 
105
+ **Sample 2:** `ᐊᓐ ᐊᒋᐦᑖᓱᓐ ᐯᔭᒄ ᐃᔑᓂᐦᑳᑌᒡ, ᐋᐸᑎᓐ ᐃᑣᓅᐦᒡ ᐯᔭᒄ ᒉᒀᓐ ᒫᒃ ᐊᐌᓐ᙮ ᐊᓐ ᒫᒃ ᐊᒋᐦᑖᓱᓐ ᐯᔭᒄ, ᐁᐅᑯᓐ ᓃ...`
106
 
107
  | Vocab | Tokens | Count |
108
  |-------|--------|-------|
109
+ | 8k | `▁ᐊᓐ ▁ᐊᒋᐦᑖᓱᓐ ▁ᐯᔭᒄ ▁ᑲ ▁ᐃᔑᓂᐦᑳᑌᒡ , ▁ᐋᐸᑎᓐ ▁ᒉ ▁ᒌ ▁ᐃᑣᓅᐦᒡ ... (+19 more)` | 29 |
110
 
111
+ **Sample 3:** `ᒦᒃᓰᖂ (english : Mexico) ᐊᐢᑭᐩ ᑮᐍᑎᐣ ᐊᒣᕒᐃᑲ ᐆᐦᒋ᙮ ᐊᔨᓯᔨᓂᐘᐠ ᐑᑭᐘᐠ ᐆᒪ ᐊᐢᑭᔭᕽ᙮ </center>`
112
 
113
  | Vocab | Tokens | Count |
114
  |-------|--------|-------|
115
+ | 8k | `▁ᒦᒃᓰᖂ( english ▁:mexico ) ▁ᐊᐢᑭᐩ ▁ᑮᐍᑎᐣ ▁ᐊᒣᕒᐃᑲ ▁ᐆᐦᒋ᙮ ... (+7 more)` | 17 |
116
 
117
 
118
  ### Key Findings
119
 
120
+ - **Best Compression:** 8k achieves 3.238x compression
121
+ - **Lowest UNK Rate:** 8k with 2.7764% unknown tokens
122
  - **Trade-off:** Larger vocabularies improve compression but increase model size
123
  - **Recommendation:** 32k vocabulary provides optimal balance for production use
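The compression and UNK figures above can be sanity-checked against the released tokenizer. A minimal sketch, assuming `models/tokenizer/cr_tokenizer_8k.model` is a standard SentencePiece model (the `▁` boundary markers in the samples suggest it is) and reading "compression" as characters per token, which may differ from the pipeline's exact definition:

```python
import sentencepiece as spm

# Assumption: the released .model file is a standard SentencePiece model.
sp = spm.SentencePieceProcessor(model_file="models/tokenizer/cr_tokenizer_8k.model")

text = "ᐊᓐ ᐊᒋᐦᑖᓱᓐ ᐯᔭᒄ ᐃᔑᓂᐦᑳᑌᒡ"          # any sample sentence from the corpus
pieces = sp.encode(text, out_type=str)         # e.g. ['▁ᐊᓐ', '▁ᐊᒋᐦᑖᓱᓐ', ...]
ids = sp.encode(text)

compression = len(text) / len(pieces)          # characters per token (approximation)
unk_rate = sum(i == sp.unk_id() for i in ids) / len(ids)
print(pieces)
print(f"compression ≈ {compression:.3f}x, UNK rate ≈ {unk_rate:.2%}")
```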
124
 
 
136
  | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
137
  |--------|---------|------------|---------|----------------|------------------|-------------------|
138
  | **2-gram** | Word | 16 | 4.04 | 17 | 100.0% | 100.0% |
139
+ | **2-gram** | Subword | 473 | 8.89 | 812 | 49.1% | 100.0% |
140
  | **3-gram** | Word | 15 🏆 | 3.88 | 16 | 100.0% | 100.0% |
141
+ | **3-gram** | Subword | 1,468 | 10.52 | 1,902 | 19.8% | 76.9% |
142
+ | **4-gram** | Word | 157 | 7.29 | 160 | 64.3% | 100.0% |
143
+ | **4-gram** | Subword | 2,988 | 11.54 | 3,702 | 12.2% | 52.2% |
144
+ | **5-gram** | Word | 137 | 7.10 | 138 | 73.1% | 100.0% |
145
+ | **5-gram** | Subword | 2,771 | 11.44 | 3,264 | 12.2% | 51.7% |
146
 
147
  ### Top 5 N-grams by Size
148
 
 
174
  | 2 | `in standard roman orthography` | 5 |
175
  | 3 | `written in standard roman` | 5 |
176
  | 4 | `ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ` | 4 |
177
+ | 5 | `center for global nonkilling` | 3 |
178
+
179
+ **5-grams (Word):**
180
+
181
+ | Rank | N-gram | Count |
182
+ |------|--------|-------|
183
+ | 1 | `written in standard roman orthography` | 5 |
184
+ | 2 | `list of articles some articles` | 3 |
185
+ | 3 | `of articles some articles in` | 3 |
186
+ | 4 | `dialect list of articles some` | 3 |
187
+ | 5 | `ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ` | 3 |
188
 
189
  **2-grams (Subword):**
190
 
191
  | Rank | N-gram | Count |
192
  |------|--------|-------|
193
+ | 1 | `i n` | 207 |
194
+ | 2 | `, _` | 202 |
195
+ | 3 | `i k` | 169 |
196
+ | 4 | `_ ᐊ` | 164 |
197
+ | 5 | `i s` | 159 |
198
 
199
  **3-grams (Subword):**
200
 
201
  | Rank | N-gram | Count |
202
  |------|--------|-------|
203
+ | 1 | `i n _` | 58 |
204
  | 2 | `a n i` | 49 |
205
  | 3 | `w i n` | 48 |
206
  | 4 | `_ k i` | 47 |
 
212
  |------|--------|-------|
213
  | 1 | `w a k _` | 33 |
214
  | 2 | `w i n _` | 27 |
215
+ | 3 | `k a n i` | 23 |
216
+ | 4 | `t i o n` | 23 |
217
+ | 5 | `_ o f _` | 22 |
218
+
219
+ **5-grams (Subword):**
220
+
221
+ | Rank | N-gram | Count |
222
+ |------|--------|-------|
223
+ | 1 | `_ a n d _` | 22 |
224
+ | 2 | `a t i o n` | 21 |
225
+ | 3 | `p î s i m` | 20 |
226
+ | 4 | `- p î s i` | 19 |
227
+ | 5 | `a r t i c` | 19 |
228
 
229
 
230
  ### Key Findings
231
 
232
  - **Best Perplexity:** 3-gram (word) with 15
233
  - **Entropy Trend:** Decreases with larger n-grams (more predictable)
234
+ - **Coverage:** Top-1000 patterns cover ~52% of corpus
235
  - **Recommendation:** 4-gram or 5-gram for best predictive performance
236
 
237
  ---
 
247
 
248
  | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
249
  |---------|---------|-------------|------------|------------------|-----------------|----------------|
250
+ | **1** | Word | 0.2841 | 1.218 | 1.47 | 1,711 | 71.6% |
251
+ | **1** | Subword | 1.8933 | 3.715 | 10.31 | 271 | 0.0% |
252
+ | **2** | Word | 0.0442 | 1.031 | 1.05 | 2,501 | 95.6% |
253
+ | **2** | Subword | 0.6883 | 1.611 | 2.62 | 2,789 | 31.2% |
254
+ | **3** | Word | 0.0186 | 1.013 | 1.02 | 2,617 | 98.1% |
255
+ | **3** | Subword | 0.3514 | 1.276 | 1.56 | 7,299 | 64.9% |
256
+ | **4** | Word | 0.0089 🏆 | 1.006 | 1.01 | 2,657 | 99.1% |
257
+ | **4** | Subword | 0.1579 | 1.116 | 1.21 | 11,392 | 84.2% |
258
 
259
  ### Generated Text Samples (Word-based)
260
 
 
262
 
263
  **Context Size 1:**
264
 
265
+ 1. `ᐁ p q r s ᓰ`
266
+ 2. `e kiskatcik e tašitwâw awesîsac sašimuve nîštam atim nâpeštimw išinihkâtâkaniwiw simpohanin âtayôhkâ...`
267
+ 3. `of articles in ininiwi išikišwēwin eastern dialect western montagnais iso 639 crk location québec an...`
268
 
269
  **Context Size 2:**
270
 
271
+ 1. `some articles in nēhiyawēwin âpihtâkosisânak isiwepahki maskisin ᐸᐦᑵᓯᑲᐣ pimîhkân tipahikan itasin...`
272
+ 2. `articles in iyuw iyimuun natuashish dialect list of articles ᐃᔨᔨᐤ ᐊᔨᒧᐧᐃᓐ iyyû ayimuwin nēhiyawēwin p...`
273
+ 3. `list of articles ᐃᔨᔨᐤ ᐊᔨᒧᐧᐃᓐ iyyû ayimuwin northern dialect chisasibi eastmain waskaganish wemindji ...`
274
 
275
  **Context Size 3:**
276
 
277
+ 1. `some articles in lehlueun western dialect betsiamites mashteuiatsh matimekosh and uashat maliotenam ...`
278
+ 2. `list of articles ᐃᓕᓖᒧᐎᓐ ililîmowin ililîmowin portal english name woods cree iso 639 crk location sa...`
279
+ 3. `dialect list of articles ᐃᓕᓖᒧᐎᓐ ililîmowin ililîmowin portal english name moose cree iso 639 csw loc...`
280
 
281
  **Context Size 4:**
282
 
283
+ 1. `dialect list of articles nīhithawīwin portal english name woods cree iso 639 cwd location manitoba a...`
284
  2. `written in standard roman orthography`
285
+ 3. `ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᑎᐸᐦᐄᑲᓐ ᐋᐱᐦᑖᒌᔑᑳᐤ ᐋᐱᐦᑖᑎᐱᔅᑳᐤ 1 05 ᐯᔭᒄ ᑎᐸᐦᐄᑲᓐ ᒦᓐ ᓂᔮᔪ ᒥᓂᑯᔥ ᓂᔮᔪ ᒥᓂᑯᔥ ᒥᔮᐧᐃᐸᔩᐤ ᐯᔭᒄ 1 30`
286
 
287
 
288
  ### Generated Text Samples (Subword-based)
 
291
 
292
  **Context Size 1:**
293
 
294
+ 1. `_ck.._ntahkwiwre`
295
+ 2. `iw._ey_îskānakat`
296
+ 3. `asuét):_ᓅᐦᑭᑫᓂᐤ..`
297
 
298
  **Context Size 2:**
299
 
300
+ 1. `initahtâw._ᑭᒋᒧᐏᐣ_`
301
+ 2. `,_miyis_nawamēwik`
302
+ 3. `ikawahtawāt_kin_o`
303
 
304
  **Context Size 3:**
305
 
306
+ 1. `in_itakwa_é-nipaho`
307
+ 2. `anitināw_ōnahkân_a`
308
+ 3. `winaka_kikamîw-sîp`
309
 
310
  **Context Size 4:**
311
 
312
+ 1. `wak_*`
313
+ 2. `win_ᐊᑎᒽ_ᐯᔭᒄ_ᓀᐦᐃᔭᐍᐏᐣ`
314
+ 3. `tion:_saskapi_qc_y_`
315
 
316
 
317
  ### Key Findings
318
 
319
  - **Best Predictability:** Context-4 (word) with 99.1% predictability
320
  - **Branching Factor:** Decreases with context size (more deterministic)
321
+ - **Memory Trade-off:** Larger contexts require more storage (11,392 contexts)
322
  - **Recommendation:** Context-3 or Context-4 for text generation
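The per-context metrics in the table above can be recomputed from the transition tables. A minimal sketch, assuming a hypothetical parquet schema with columns `context`, `next`, and `count`, and reading "Predictability" as the share of contexts with a single possible successor (the report does not state its exact definition):

```python
import numpy as np
import pandas as pd

df = pd.read_parquet("models/word_markov/cr_markov_ctx4_word.parquet")
groups = df.groupby("context")["count"]

def entropy_bits(counts: np.ndarray) -> float:
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())

avg_entropy = groups.apply(lambda c: entropy_bits(c.to_numpy(float))).mean()
perplexity = 2.0 ** avg_entropy               # "Perplexity", if defined as 2^H
branching = groups.size().mean()              # average successors per context
predictability = (groups.size() == 1).mean()  # share of deterministic contexts

print(f"avg H={avg_entropy:.4f}  PPL={perplexity:.3f}  "
      f"branching={branching:.2f}  predictability={predictability:.1%}")
```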
323
 
324
  ---
 
334
 
335
  | Metric | Value |
336
  |--------|-------|
337
+ | Vocabulary Size | 468 |
338
+ | Total Tokens | 1,673 |
339
+ | Mean Frequency | 3.57 |
340
  | Median Frequency | 2 |
341
  | Frequency Std Dev | 3.40 |
342
 
 
344
 
345
  | Rank | Word | Frequency |
346
  |------|------|-----------|
347
+ | 1 | ᐁ | 31 |
348
  | 2 | e | 30 |
349
  | 3 | and | 22 |
350
+ | 4 | of | 22 |
351
+ | 5 | in | 21 |
352
  | 6 | pîsim | 19 |
353
  | 7 | articles | 18 |
354
  | 8 | cree | 16 |
 
359
 
360
  | Rank | Word | Frequency |
361
  |------|------|-----------|
362
+ | 1 | ᑯᓐᓄᑦ | 2 |
363
+ | 2 | ᐊᒻᒪᐃᓛᒃ | 2 |
364
+ | 3 | ᐊᑎᕐᒥᒃ | 2 |
365
+ | 4 | ᖃᕆᑕᐅᔭᕐᒧᑦ | 2 |
366
+ | 5 | ᐅᖃᐅᓯᕐᒥᒃ | 2 |
367
+ | 6 | ᐊᔾᔨᐅᖏᑦᑐᒥᒃ | 2 |
368
+ | 7 | ᑖᓐᓇ | 2 |
369
+ | 8 | ᑕᐃᓐᓇ | 2 |
370
+ | 9 | ᖃᕆᑕᐅᔭᒃᑯᑦ | 2 |
371
  | 10 | ᖃᐅᔨᓴᖅᑎᐅᔪᓄᑦ | 2 |
372
 
373
  ### Zipf's Law Analysis
374
 
375
  | Metric | Value |
376
  |--------|-------|
377
+ | Zipf Coefficient | 0.5578 |
378
+ | R² (Goodness of Fit) | 0.947960 |
379
  | Adherence Quality | **excellent** |
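The coefficient and R² above come from a straight-line fit in log-log space. A minimal sketch, assuming the vocabulary parquet exposes hypothetical `word` and `frequency` columns:

```python
import numpy as np
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/cr_vocabulary.parquet")
freq = np.sort(vocab["frequency"].to_numpy(dtype=float))[::-1]
rank = np.arange(1, len(freq) + 1)

# Zipf's law predicts log(frequency) ≈ -s·log(rank) + c; fit s by least squares.
x, y = np.log(rank), np.log(freq)
slope, intercept = np.polyfit(x, y, 1)
y_hat = slope * x + intercept
r2 = 1.0 - ((y - y_hat) ** 2).sum() / ((y - y.mean()) ** 2).sum()

print(f"Zipf coefficient ≈ {-slope:.4f}, R² ≈ {r2:.6f}")
```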
380
 
381
  ### Coverage Analysis
382
 
383
  | Top N Words | Coverage |
384
  |-------------|----------|
385
+ | Top 100 | 48.8% |
386
  | Top 1,000 | 0.0% |
387
  | Top 5,000 | 0.0% |
388
  | Top 10,000 | 0.0% |
389
 
390
  ### Key Findings
391
 
392
+ - **Zipf Compliance:** R²=0.9480 indicates excellent adherence to Zipf's law
393
+ - **High Frequency Dominance:** Top 100 words cover 48.8% of corpus
394
+ - **Long Tail:** -9,532 words needed for remaining 100.0% coverage
395
 
396
  ---
397
  ## 5. Word Embeddings Evaluation
 
407
 
408
  ### 5.1 Cross-Lingual Alignment
409
 
410
+ ![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
411
 
412
 
413
  ### 5.2 Model Comparison
414
 
415
  | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
416
  |-------|-----------|----------|------------------|---------------|----------------|
417
+ | **mono_32d** | 32 | 0.0354 | 0.0000 | N/A | N/A |
418
+ | **mono_64d** | 64 | 0.0038 | 0.0000 | N/A | N/A |
419
  | **mono_128d** | 128 | 0.0000 | 0.0000 | N/A | N/A |
420
+ | **aligned_32d** | 32 | 0.0354 🏆 | 0.0000 | 0.0000 | 0.0000 |
421
+ | **aligned_64d** | 64 | 0.0038 | 0.0000 | 0.0000 | 0.0000 |
422
+ | **aligned_128d** | 128 | 0.0000 | 0.0000 | 0.0000 | 0.0000 |
423
 
424
  ### Key Findings
425
 
426
+ - **Best Isotropy:** aligned_32d with 0.0354 (more uniform distribution)
427
  - **Semantic Density:** Average pairwise similarity of 0.0000. Lower values indicate better semantic separation.
428
+ - **Alignment Quality:** Aligned models evaluated but achieved 0% recall.
429
  - **Recommendation:** 128d aligned for best cross-lingual performance
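The report does not state which isotropy or density formulas it uses, so the sketch below shows only one common way to compute comparable diagnostics from an embedding matrix (rows = word vectors): the ratio of smallest to largest singular value for isotropy, and mean pairwise cosine similarity for semantic density. Absolute values may not match the table.

```python
import numpy as np

def isotropy(embeddings: np.ndarray) -> float:
    """One common isotropy proxy: min/max singular value of the centred matrix."""
    centred = embeddings - embeddings.mean(axis=0)
    s = np.linalg.svd(centred, compute_uv=False)
    return float(s.min() / s.max())

def semantic_density(embeddings: np.ndarray, sample: int = 1000, seed: int = 0) -> float:
    """Average pairwise cosine similarity over a random sample of vectors."""
    rng = np.random.default_rng(seed)
    idx = rng.choice(len(embeddings), size=min(sample, len(embeddings)), replace=False)
    x = embeddings[idx]
    x = x / np.linalg.norm(x, axis=1, keepdims=True)
    sims = x @ x.T
    return float(sims[~np.eye(len(x), dtype=bool)].mean())
```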
430
 
431
  ---
432
  ## 6. Morphological Analysis (Experimental)
433
 
 
 
434
  This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
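The productivity-index formula itself is not published with the report; as a purely illustrative stand-in for the word-vs-subword divergence it describes, one can compare the two Markov perplexity columns from section 3 at each context size (all numbers below are copied from that table):

```python
# Perplexities from the section-3 Markov table (word vs subword, per context size).
word_ppl    = {1: 1.218, 2: 1.031, 3: 1.013, 4: 1.006}
subword_ppl = {1: 3.715, 2: 1.611, 3: 1.276, 4: 1.116}

# Illustrative divergence signal: how much harder the subword stream is to predict
# than the word stream. Ratios well above 1 at short contexts are consistent with
# productive word-internal structure; this is not the pipeline's actual formula.
divergence = {k: round(subword_ppl[k] / word_ppl[k], 2) for k in word_ppl}
print(divergence)   # {1: 3.05, 2: 1.56, 3: 1.26, 4: 1.11}
```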
435
 
436
  ### 6.1 Productivity & Complexity
437
 
438
  | Metric | Value | Interpretation | Recommendation |
439
  |--------|-------|----------------|----------------|
440
+ | Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
441
+ | Idiomaticity Gap | **0.933** | High formulaic/idiomatic content | - |
442
 
443
  ### 6.2 Affix Inventory (Productive Units)
444
 
 
471
  ### 6.6 Linguistic Interpretation
472
 
473
  > **Automated Insight:**
474
+ The language Cree shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
475
+
476
+ > **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.
477
 
478
  ---
479
  ## 7. Summary & Recommendations
 
484
 
485
  | Component | Recommended | Rationale |
486
  |-----------|-------------|-----------|
487
+ | Tokenizer | **8k BPE** | Best compression (3.24x) |
488
  | N-gram | **3-gram** | Lowest perplexity (15) |
489
  | Markov | **Context-4** | Highest predictability (99.1%) |
490
  | Embeddings | **100d** | Balanced semantic capture and isotropy |
 
700
  ---
701
  *Generated by Wikilangs Models Pipeline*
702
 
703
+ *Report Date: 2026-01-03 20:39:39*
models/embeddings/aligned/cr_128d.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5dacb587c7d15197d345442364b1889a8b7a457b453f7df9253e97673b4fb352
+ size 1024067754

models/embeddings/aligned/cr_128d.meta.json ADDED
@@ -0,0 +1 @@
+ {"lang": "cr", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/cr_128d.projection.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2b6ebec184c4af672ce0f571958b634809f46f6a1ea2dc7e6f8f2e7f53555387
+ size 65664

models/embeddings/aligned/cr_128d_metadata.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "language": "cr",
+ "dimension": 128,
+ "version": "aligned",
+ "hub_language": "en",
+ "seed_vocab_size": 39,
+ "vocab_size": 65
+ }
models/embeddings/aligned/cr_32d.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6e6a077873b64cad1912b66853d07d4f890bc6606e6b08083d4bfd3153121129
+ size 256017834

models/embeddings/aligned/cr_32d.meta.json ADDED
@@ -0,0 +1 @@
+ {"lang": "cr", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/cr_32d.projection.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b14cf327498f9c92f06cc00df8284bfbdfd03069966ad20b3ada72dc5ce02488
+ size 4224

models/embeddings/aligned/cr_32d_metadata.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "language": "cr",
+ "dimension": 32,
+ "version": "aligned",
+ "hub_language": "en",
+ "seed_vocab_size": 39,
+ "vocab_size": 65
+ }

models/embeddings/aligned/cr_64d.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fd5960af306a49bc16d12139a057de3daae86c9d43e31642ee7dbd085eb553b7
+ size 512034474

models/embeddings/aligned/cr_64d.meta.json ADDED
@@ -0,0 +1 @@
+ {"lang": "cr", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/cr_64d.projection.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d846b3fae902e708babece2e6c2e3c62275450ddb7096ca337af53dcf63a96b6
+ size 16512

models/embeddings/aligned/cr_64d_metadata.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "language": "cr",
+ "dimension": 64,
+ "version": "aligned",
+ "hub_language": "en",
+ "seed_vocab_size": 39,
+ "vocab_size": 65
+ }
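How the aligned artifacts above fit together is not spelled out in the repository; below is a minimal sketch (taking the 128d set as an example) under two assumptions: the `.bin` file is a fastText binary (its size matches the monolingual model), and the `.projection.npy` holds a dim×dim float32 matrix (65,664 bytes ≈ 128·128·4 bytes plus the `.npy` header) that maps monolingual vectors into the shared hub (`en`) space. The multiplication order is also an assumption, and the LFS files must be pulled (`git lfs pull`) before loading.

```python
import fasttext
import numpy as np

model = fasttext.load_model("models/embeddings/aligned/cr_128d.bin")
projection = np.load("models/embeddings/aligned/cr_128d.projection.npy")  # assumed (128, 128)

vec = model.get_word_vector("ᐯᔭᒄ")    # monolingual 128-d vector
aligned = vec @ projection              # projected into the aligned space (assumed order)
print(vec.shape, projection.shape, aligned.shape)
```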
models/embeddings/monolingual/cr_128d.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:322799eab6ee84e7d28a0f76de24efb5b70f9e1fcd0eef88c9b61829ce272397
+ oid sha256:5dacb587c7d15197d345442364b1889a8b7a457b453f7df9253e97673b4fb352
  size 1024067754

models/embeddings/monolingual/cr_32d.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:095584a1fc693fcab2c3fd7f0ac831d533933eb58ce565c3b082866e303db9f6
+ oid sha256:6e6a077873b64cad1912b66853d07d4f890bc6606e6b08083d4bfd3153121129
  size 256017834

models/embeddings/monolingual/cr_64d.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:fdd5c40a0696cfa8faa26c56bda982ab2a5efbbc0bc7d30fcea9c944b571b542
+ oid sha256:fd5960af306a49bc16d12139a057de3daae86c9d43e31642ee7dbd085eb553b7
  size 512034474
models/subword_markov/cr_markov_ctx1_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d336a63f8ad5e082086c17ec41db170270495929a8dd340a6abcd2e5998bc0e6
- size 19362
+ oid sha256:d58185c9c750453fec2b1be4bea145ca19c8154e9c1851c770fcfe60e0945619
+ size 19062

models/subword_markov/cr_markov_ctx1_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 1,
  "variant": "subword",
  "language": "cr",
- "unique_contexts": 273,
- "total_transitions": 21066
+ "unique_contexts": 271,
+ "total_transitions": 20269
  }

models/subword_markov/cr_markov_ctx2_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:88cd25c549a06a864e6af9c0a79d29d91815a704038e65f2ce7d2c821349c095
- size 54769
+ oid sha256:3f22b06562e4d08f3c27092ecc548a0cae5a9446cc2ae3e54f2073e8b8900e95
+ size 52808

models/subword_markov/cr_markov_ctx2_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 2,
  "variant": "subword",
  "language": "cr",
- "unique_contexts": 2872,
- "total_transitions": 21041
+ "unique_contexts": 2789,
+ "total_transitions": 20244
  }

models/subword_markov/cr_markov_ctx3_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:493e48b8388399c0693ca1644064c6aff29ebd8768e02dc362bd13a9beb9923c
- size 109170
+ oid sha256:f2ad975814a144c1719a36efe4086171171e4415b69c07c0cfc7589d7b44408b
+ size 105153

models/subword_markov/cr_markov_ctx3_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 3,
  "variant": "subword",
  "language": "cr",
- "unique_contexts": 7557,
- "total_transitions": 21016
+ "unique_contexts": 7299,
+ "total_transitions": 20219
  }

models/subword_markov/cr_markov_ctx4_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:77ef47f052b70e08ed38cb704ef21e780c36aa884a719e5d4963f672dd6af637
- size 165104
+ oid sha256:fbe0924ddd06b05b7652bd329b4be5d86de19b757a42631954cd9abc0a1910f8
+ size 158381

models/subword_markov/cr_markov_ctx4_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 4,
  "variant": "subword",
  "language": "cr",
- "unique_contexts": 11842,
- "total_transitions": 20991
+ "unique_contexts": 11392,
+ "total_transitions": 20194
  }
models/subword_ngram/cr_2gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4ec4f4bbb6a07a46393df0aae6d9e7264c3913c983048c199b187ccf1637509f
- size 10150
+ oid sha256:437302b4fde849c0752032e34257d718792b347ef32757c168db42d3025abc26
+ size 9822

models/subword_ngram/cr_2gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "n": 2,
  "variant": "subword",
  "language": "cr",
- "unique_ngrams": 848,
- "total_ngrams": 21066
+ "unique_ngrams": 812,
+ "total_ngrams": 20269
  }

models/subword_ngram/cr_3gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3a341e935103c45c5dd7be03811b9df507bbbcb941614f0a4d3949665defe332
- size 22181
+ oid sha256:7980b5aac9cba90ebe083cb0bfd2b3416465c2a52b83cbcf3a73434e722de7a0
+ size 21356

models/subword_ngram/cr_3gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "n": 3,
  "variant": "subword",
  "language": "cr",
- "unique_ngrams": 1986,
- "total_ngrams": 21041
+ "unique_ngrams": 1902,
+ "total_ngrams": 20244
  }

models/subword_ngram/cr_4gram_subword.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:36e06026aa7388f2d7e58a4f337a1ff541dac38696cfa56eaaaec940804c17a1
- size 46333
+ oid sha256:b2bcb275f3772e5f6b8bc913b0b382afa84dac97f36e76b7d03ee8a6db106e56
+ size 44325

models/subword_ngram/cr_4gram_subword_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "n": 4,
  "variant": "subword",
  "language": "cr",
- "unique_ngrams": 3878,
- "total_ngrams": 21016
+ "unique_ngrams": 3702,
+ "total_ngrams": 20219
  }

models/subword_ngram/cr_5gram_subword.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:18b2af82413c2d5e0b7124bc18273388641db7427c102a2465c939e7d69ffc67
+ size 42412

models/subword_ngram/cr_5gram_subword_metadata.json ADDED
@@ -0,0 +1,7 @@
+ {
+ "n": 5,
+ "variant": "subword",
+ "language": "cr",
+ "unique_ngrams": 3264,
+ "total_ngrams": 20194
+ }
models/tokenizer/cr_tokenizer_8k.model CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a0973f38231d8be6cc17f5bb11b4adcec87ca7e994a2d91d59f4a7f15ea4655f
- size 379309
+ oid sha256:aafdacc6f2d991f561954af6f998ab1116a7904c0d6408e0e6f25d2ce7b6f625
+ size 379259

models/tokenizer/cr_tokenizer_8k.vocab CHANGED
The diff for this file is too large to render. See the raw diff.
models/vocabulary/cr_vocabulary.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c9a339a558f9a91e507cf4c6a9afd7657b8a650a75b8594311ae35047326f22c
- size 10707
+ oid sha256:4da6445f4ebee272af36858a529222db9605c42d9f0ad70060cc98b59cfaf5ee
+ size 10298

models/vocabulary/cr_vocabulary_metadata.json CHANGED
@@ -1,15 +1,15 @@
  {
  "language": "cr",
- "vocabulary_size": 489,
+ "vocabulary_size": 468,
  "variant": "full",
  "statistics": {
- "type_token_ratio": 0.5915817165406116,
+ "type_token_ratio": 0.5884562841530054,
  "coverage": {
- "top_100": 0.2709634988490628,
- "top_1000": 0.7372574810917462
+ "top_100": 0.2786885245901639,
+ "top_1000": 0.7530737704918032
  },
- "hapax_count": 1310,
- "hapax_ratio": 0.7281823235130628,
+ "hapax_count": 1255,
+ "hapax_ratio": 0.7283807312826466,
  "total_documents": 25
  }
  }
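The statistics in this metadata file can in principle be recomputed from the vocabulary table. A minimal sketch, assuming hypothetical `word` and `frequency` columns; the pipeline's exact counting scope (e.g. pre- vs post-filtering) is not documented, so the results will not necessarily match the values above:

```python
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/cr_vocabulary.parquet")
freq = vocab["frequency"]

total_tokens = int(freq.sum())
type_token_ratio = len(vocab) / total_tokens
hapax_count = int((freq == 1).sum())              # words seen exactly once
hapax_ratio = hapax_count / len(vocab)
top_100_coverage = float(freq.sort_values(ascending=False).head(100).sum() / total_tokens)

print(type_token_ratio, hapax_count, hapax_ratio, top_100_coverage)
```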
models/word_markov/cr_markov_ctx1_word.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1b9cad226d4f9247d3628f1a8d45e964b5ac7aee3588940f10b795c7db079363
- size 51656
+ oid sha256:67ec91aad046d21a2680c190c002d6bbdf3e246b18b9dffb01c01c60fdd45417
+ size 49544

models/word_markov/cr_markov_ctx1_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 1,
  "variant": "word",
  "language": "cr",
- "unique_contexts": 1787,
- "total_transitions": 3016
+ "unique_contexts": 1711,
+ "total_transitions": 2903
  }

models/word_markov/cr_markov_ctx2_word.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b1f017105092b235e6c58f3c540d7b81c2ec84d6afd12ad9d42b7f06f32711cb
- size 68727
+ oid sha256:da940dffbcc51f50754424866b5a37358c2ebc38b4236083ea24a6f1806004df
+ size 65639

models/word_markov/cr_markov_ctx2_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 2,
  "variant": "word",
  "language": "cr",
- "unique_contexts": 2607,
- "total_transitions": 2991
+ "unique_contexts": 2501,
+ "total_transitions": 2878
  }

models/word_markov/cr_markov_ctx3_word.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ba438b38b527c374838426729a544c3af06986da131dcd7b6efc96c6174fdf64
- size 78351
+ oid sha256:d5fd696554eca1524675c807015f1565335c8e8d03c64184b7283aecaf4877ac
+ size 75064

models/word_markov/cr_markov_ctx3_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 3,
  "variant": "word",
  "language": "cr",
- "unique_contexts": 2724,
- "total_transitions": 2966
+ "unique_contexts": 2617,
+ "total_transitions": 2853
  }

models/word_markov/cr_markov_ctx4_word.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:de165eca955a9457fb7d104a709440b37d8fe11272266a6eaf29d8aca13bcb96
- size 86120
+ oid sha256:241b5162742d910baedb70c153556b238227f4b436220e840d48f82f5b69800a
+ size 82423

models/word_markov/cr_markov_ctx4_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "context_size": 4,
  "variant": "word",
  "language": "cr",
- "unique_contexts": 2765,
- "total_transitions": 2941
+ "unique_contexts": 2657,
+ "total_transitions": 2828
  }
models/word_ngram/cr_2gram_word_metadata.json CHANGED
@@ -3,5 +3,5 @@
  "variant": "word",
  "language": "cr",
  "unique_ngrams": 17,
- "total_ngrams": 3016
+ "total_ngrams": 2903
  }

models/word_ngram/cr_3gram_word_metadata.json CHANGED
@@ -3,5 +3,5 @@
  "variant": "word",
  "language": "cr",
  "unique_ngrams": 16,
- "total_ngrams": 2991
+ "total_ngrams": 2878
  }

models/word_ngram/cr_4gram_word.parquet CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:8c1ac43c0e3b24b9b058c232c3156ef3f4650c2f5098663b05aff7249ed3efd3
- size 5830
+ oid sha256:bf88b51861c7b3e616572bab59d3354c7c36ad29a0cdfcbfb5afb5e47dca5e8c
+ size 5691

models/word_ngram/cr_4gram_word_metadata.json CHANGED
@@ -2,6 +2,6 @@
  "n": 4,
  "variant": "word",
  "language": "cr",
- "unique_ngrams": 166,
- "total_ngrams": 2966
+ "unique_ngrams": 160,
+ "total_ngrams": 2853
  }

models/word_ngram/cr_5gram_word.parquet ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:70760bb1c7ef26efafbcc96f57e199a10946cfebe8a2d04b08e9a1105d4bfa70
+ size 5647