Upload all models and assets for csb (latest)
This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -0
- README.md +215 -178
- models/embeddings/aligned/csb_128d.bin +3 -0
- models/embeddings/aligned/csb_128d.meta.json +1 -0
- models/embeddings/aligned/csb_128d.projection.npy +3 -0
- models/embeddings/aligned/csb_128d_metadata.json +8 -0
- models/embeddings/aligned/csb_32d.bin +3 -0
- models/embeddings/aligned/csb_32d.meta.json +1 -0
- models/embeddings/aligned/csb_32d.projection.npy +3 -0
- models/embeddings/aligned/csb_32d_metadata.json +8 -0
- models/embeddings/aligned/csb_64d.bin +3 -0
- models/embeddings/aligned/csb_64d.meta.json +1 -0
- models/embeddings/aligned/csb_64d.projection.npy +3 -0
- models/embeddings/aligned/csb_64d_metadata.json +8 -0
- models/embeddings/monolingual/csb_128d.bin +2 -2
- models/embeddings/monolingual/csb_128d_metadata.json +1 -1
- models/embeddings/monolingual/csb_32d.bin +2 -2
- models/embeddings/monolingual/csb_32d_metadata.json +1 -1
- models/embeddings/monolingual/csb_64d.bin +2 -2
- models/embeddings/monolingual/csb_64d_metadata.json +1 -1
- models/subword_markov/csb_markov_ctx1_subword.parquet +2 -2
- models/subword_markov/csb_markov_ctx1_subword_metadata.json +2 -2
- models/subword_markov/csb_markov_ctx2_subword.parquet +2 -2
- models/subword_markov/csb_markov_ctx2_subword_metadata.json +2 -2
- models/subword_markov/csb_markov_ctx3_subword.parquet +2 -2
- models/subword_markov/csb_markov_ctx3_subword_metadata.json +2 -2
- models/subword_markov/csb_markov_ctx4_subword.parquet +2 -2
- models/subword_markov/csb_markov_ctx4_subword_metadata.json +2 -2
- models/subword_ngram/csb_2gram_subword.parquet +2 -2
- models/subword_ngram/csb_2gram_subword_metadata.json +2 -2
- models/subword_ngram/csb_3gram_subword.parquet +2 -2
- models/subword_ngram/csb_3gram_subword_metadata.json +2 -2
- models/subword_ngram/csb_4gram_subword.parquet +2 -2
- models/subword_ngram/csb_4gram_subword_metadata.json +2 -2
- models/subword_ngram/csb_5gram_subword.parquet +3 -0
- models/subword_ngram/csb_5gram_subword_metadata.json +7 -0
- models/tokenizer/csb_tokenizer_16k.model +2 -2
- models/tokenizer/csb_tokenizer_16k.vocab +0 -0
- models/tokenizer/csb_tokenizer_32k.model +2 -2
- models/tokenizer/csb_tokenizer_32k.vocab +0 -0
- models/tokenizer/csb_tokenizer_64k.model +2 -2
- models/tokenizer/csb_tokenizer_64k.vocab +0 -0
- models/tokenizer/csb_tokenizer_8k.model +2 -2
- models/tokenizer/csb_tokenizer_8k.vocab +0 -0
- models/vocabulary/csb_vocabulary.parquet +2 -2
- models/vocabulary/csb_vocabulary_metadata.json +9 -9
- models/word_markov/csb_markov_ctx1_word.parquet +2 -2
- models/word_markov/csb_markov_ctx1_word_metadata.json +2 -2
- models/word_markov/csb_markov_ctx2_word.parquet +2 -2
- models/word_markov/csb_markov_ctx2_word_metadata.json +2 -2
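The binary artifacts in this commit are large (the 128d embedding binary alone is roughly 1 GB), so fetching individual files is usually preferable to cloning the whole repository. Below is a minimal sketch using `huggingface_hub`; the `repo_id` is a placeholder, since the actual repository id is not shown in this view.

```python
# Minimal sketch: download single artifacts instead of cloning the full repo.
# REPO_ID is a placeholder, not the real repository id.
from huggingface_hub import hf_hub_download

REPO_ID = "your-org/csb-models"  # assumption: replace with the actual repo id

tokenizer_path = hf_hub_download(repo_id=REPO_ID,
                                 filename="models/tokenizer/csb_tokenizer_32k.model")
vocab_path = hf_hub_download(repo_id=REPO_ID,
                             filename="models/vocabulary/csb_vocabulary.parquet")
print(tokenizer_path, vocab_path)
```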
.gitattributes
CHANGED
@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,6 +1,6 @@
---
language: csb
language_name: Kashubian
language_family: slavic_west
tags:
- wikilangs
@@ -10,11 +10,21 @@
- n-gram
- markov
- wikipedia
- feature-extraction
- sentence-similarity
- tokenization
- n-grams
- markov-chain
- text-mining
- fasttext
- babelvec
- vocabulous
- vocabulary
- monolingual
- family-slavic_west
license: mit
library_name: wikilangs
pipeline_tag: text-generation
datasets:
- omarkamali/wikipedia-monthly
dataset_info:
@@ -23,20 +33,20 @@
metrics:
- name: best_compression_ratio
  type: compression
  value: 4.520
- name: best_isotropy
  type: isotropy
  value: 0.7585
- name: vocabulary_size
  type: vocab
  value: 0
generated: 2026-01-03
---

# Kashubian - Wikilangs Models
## Comprehensive Research Report & Full Ablation Study

This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Kashubian** Wikipedia data.
We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.

## 📋 Repository Contents
@@ -60,7 +70,7 @@
- [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
- [4. Vocabulary Analysis](#4-vocabulary-analysis)
- [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
- [7. Summary & Recommendations](#7-summary--recommendations)
- [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
- [Visualizations Index](#visualizations-index)
@@ -80,47 +90,47 @@

| Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
|------------|-------------|---------------|----------|--------------|
| **8k** | 3.576x | 3.58 | 0.1685% | 179,827 |
| **16k** | 3.912x | 3.92 | 0.1843% | 164,376 |
| **32k** | 4.229x | 4.24 | 0.1993% | 152,042 |
| **64k** | 4.520x 🏆 | 4.53 | 0.2130% | 142,258 |

### Tokenization Examples

Below are sample sentences tokenized with each vocabulary size:

**Sample 1:** `Mòrzebób abò lësy ògón (Lycopodium clavatum L.) - to je wielelatnô roscëna z rod...`

| Vocab | Tokens | Count |
|-------|--------|-------|
| 8k | `▁mòrze b ób ▁abò ▁lë sy ▁ògón ▁( ly co ... (+29 more)` | 39 |
| 16k | `▁mòrze b ób ▁abò ▁lë sy ▁ògón ▁( ly copo ... (+26 more)` | 36 |
| 32k | `▁mòrze b ób ▁abò ▁lë sy ▁ògón ▁( lycopo dium ... (+22 more)` | 32 |
| 64k | `▁mòrze b ób ▁abò ▁lë sy ▁ògón ▁( lycopodium ▁cla ... (+21 more)` | 31 |

**Sample 2:** `Niemieckô Karznica (pòl. Karzniczka) - to je wies w pòmòrsczim wòjewództwie, w s...`

| Vocab | Tokens | Count |
|-------|--------|-------|
| 8k | `▁niemie ckô ▁ka rz nica ▁( pòl . ▁ka rz ... (+19 more)` | 29 |
| 16k | `▁niemieckô ▁karz nica ▁( pòl . ▁karz niczka ) ▁- ... (+16 more)` | 26 |
| 32k | `▁niemieckô ▁karznica ▁( pòl . ▁karz niczka ) ▁- ▁to ... (+15 more)` | 25 |
| 64k | `▁niemieckô ▁karznica ▁( pòl . ▁karzniczka ) ▁- ▁to ▁je ... (+14 more)` | 24 |

**Sample 3:** `Wëdarzenia Pòlsczi król Władisłôw I Herman wëdôł rozkôz spôleniô gardów w Gduńsc...`

| Vocab | Tokens | Count |
|-------|--------|-------|
| 8k | `▁wëdarzenia ▁pòlsczi ▁król ▁władisłôw ▁i ▁her man ▁wëdôł ▁roz kôz ... (+6 more)` | 16 |
| 16k | `▁wëdarzenia ▁pòlsczi ▁król ▁władisłôw ▁i ▁her man ▁wëdôł ▁roz kôz ... (+6 more)` | 16 |
| 32k | `▁wëdarzenia ▁pòlsczi ▁król ▁władisłôw ▁i ▁herman ▁wëdôł ▁roz kôz ▁spô ... (+5 more)` | 15 |
| 64k | `▁wëdarzenia ▁pòlsczi ▁król ▁władisłôw ▁i ▁herman ▁wëdôł ▁rozkôz ▁spôleniô ▁gardów ... (+3 more)` | 13 |

### Key Findings

- **Best Compression:** 64k achieves 4.520x compression
- **Lowest UNK Rate:** 8k with 0.1685% unknown tokens
- **Trade-off:** Larger vocabularies improve compression but increase model size
- **Recommendation:** 32k vocabulary provides optimal balance for production use
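The `models/tokenizer/csb_tokenizer_*.model` files can be used to reproduce the tokenization examples above. The sketch below assumes they are SentencePiece models (the `▁`-prefixed pieces and the paired `.model`/`.vocab` files suggest this); treat the file format and API as an assumption rather than a documented interface.

```python
# Minimal sketch, assuming the *.model files are SentencePiece models.
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="models/tokenizer/csb_tokenizer_32k.model")

text = "Niemieckô Karznica (pòl. Karzniczka) - to je wies w pòmòrsczim wòjewództwie"
pieces = sp.encode(text, out_type=str)      # list of subword pieces
print(len(pieces), pieces[:10])

# Characters per token, roughly the "compression" figure the table reports.
print(len(text) / len(pieces))
```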
@@ -137,12 +147,14 @@

| N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
|--------|---------|------------|---------|----------------|------------------|-------------------|
| **2-gram** | Word | 1,947 | 10.93 | 6,180 | 31.4% | 68.7% |
| **2-gram** | Subword | 457 🏆 | 8.84 | 2,749 | 53.5% | 98.1% |
| **3-gram** | Word | 2,094 | 11.03 | 7,716 | 31.5% | 69.0% |
| **3-gram** | Subword | 3,953 | 11.95 | 22,499 | 18.9% | 58.2% |
| **4-gram** | Word | 3,732 | 11.87 | 15,312 | 28.0% | 59.5% |
| **4-gram** | Subword | 18,873 | 14.20 | 102,765 | 10.0% | 33.1% |
| **5-gram** | Word | 3,059 | 11.58 | 12,171 | 29.4% | 62.6% |
| **5-gram** | Subword | 46,114 | 15.49 | 210,801 | 7.4% | 25.0% |

### Top 5 N-grams by Size

@@ -150,11 +162,11 @@

| Rank | N-gram | Count |
|------|--------|-------|
| 1 | `to je` | 2,500 |
| 2 | `bùtnowé lënczi` | 1,440 |
| 3 | `ùrodzëlë sã` | 991 |
| 4 | `w gminie` | 982 |
| 5 | `m jin` | 870 |

**3-grams (Word):**

@@ -173,45 +185,65 @@
| 1 | `wëdarzenia ùrodzëlë sã ùmarlë` | 753 |
| 2 | `p p p p` | 566 |
| 3 | `w pòmòrsczim wòjewództwie w` | 537 |
| 4 | `i jinëch słowiańsczich krajów` | 489 |
| 5 | `królestwa i jinëch słowiańsczich` | 489 |

**5-grams (Word):**

| Rank | N-gram | Count |
|------|--------|-------|
| 1 | `p p p p p` | 532 |
| 2 | `pòlsczégò królestwa i jinëch słowiańsczich` | 489 |
| 3 | `królestwa i jinëch słowiańsczich krajów` | 489 |
| 4 | `słowôrzu pòlsczégò królestwa i jinëch` | 488 |
| 5 | `geògraficznym słowôrzu pòlsczégò królestwa i` | 487 |

**2-grams (Subword):**

| Rank | N-gram | Count |
|------|--------|-------|
| 1 | `c z` | 39,727 |
| 2 | `a _` | 38,964 |
| 3 | `_ w` | 38,073 |
| 4 | `. _` | 33,276 |
| 5 | `_ p` | 32,909 |

**3-grams (Subword):**

| Rank | N-gram | Count |
|------|--------|-------|
| 1 | `c z i` | 17,503 |
| 2 | `_ w _` | 16,830 |
| 3 | `s c z` | 14,512 |
| 4 | `_ p ò` | 12,375 |
| 5 | `n a _` | 10,995 |

**4-grams (Subword):**

| Rank | N-gram | Count |
|------|--------|-------|
| 1 | `s c z i` | 9,919 |
| 2 | `c z i _` | 8,412 |
| 3 | `_ j e _` | 7,786 |
| 4 | `é g ò _` | 7,710 |
| 5 | `_ n a _` | 6,352 |

**5-grams (Subword):**

| Rank | N-gram | Count |
|------|--------|-------|
| 1 | `_ k a s z` | 5,271 |
| 2 | `k a s z ë` | 4,572 |
| 3 | `a s z ë b` | 4,569 |
| 4 | `s c z i _` | 4,317 |
| 5 | `z é g ò _` | 4,004 |

### Key Findings

- **Best Perplexity:** 2-gram (subword) with 457
- **Entropy Trend:** Decreases with larger n-grams (more predictable)
- **Coverage:** Top-1000 patterns cover ~25% of corpus
- **Recommendation:** 4-gram or 5-gram for best predictive performance
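The statistics above come from the `models/*_ngram/*.parquet` count tables. A minimal inspection sketch follows; the column names `ngram` and `count` are assumptions about the schema, so check `df.columns` against the actual file first. The reported perplexities are consistent with `2 ** entropy` of the corresponding count distribution (e.g. 2^8.84 ≈ 457 for the subword 2-gram).

```python
# Minimal sketch: recompute top-k coverage and entropy from an n-gram table.
# Column names ("ngram", "count") are assumptions about the parquet schema.
import numpy as np
import pandas as pd

df = pd.read_parquet("models/subword_ngram/csb_2gram_subword.parquet")
print(df.columns.tolist())                      # verify the schema first

df = df.sort_values("count", ascending=False)
total = df["count"].sum()

print(df.head(5))                               # top-5 n-grams
print(df["count"].head(1000).sum() / total)     # top-1000 coverage

p = df["count"].to_numpy() / total
entropy = float(-(p * np.log2(p)).sum())        # distribution entropy in bits
print(entropy, 2 ** entropy)                    # perplexity as 2**entropy
```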
---
@@ -227,14 +259,14 @@

| Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
|---------|---------|-------------|------------|------------------|-----------------|----------------|
| **1** | Word | 0.5411 | 1.455 | 2.97 | 80,925 | 45.9% |
| **1** | Subword | 1.0139 | 2.019 | 7.32 | 979 | 0.0% |
| **2** | Word | 0.1312 | 1.095 | 1.25 | 237,972 | 86.9% |
| **2** | Subword | 0.9776 | 1.969 | 6.00 | 7,156 | 2.2% |
| **3** | Word | 0.0409 | 1.029 | 1.07 | 295,594 | 95.9% |
| **3** | Subword | 0.8837 | 1.845 | 4.13 | 42,873 | 11.6% |
| **4** | Word | 0.0202 🏆 | 1.014 | 1.03 | 312,105 | 98.0% |
| **4** | Subword | 0.6519 | 1.571 | 2.59 | 176,892 | 34.8% |

### Generated Text Samples (Word-based)

@@ -242,27 +274,27 @@

**Context Size 1:**

1. `w drëdżich wëstąpiwo nacygnienié i bùtnową z eùropejsczégò partnerstwa pòrtë to ekònomicznô rzôdzëzn...`
2. `je w geògraficznym słowôrzu pòlsczégò królestwa i pierre bourdieu francësczi jãzëk to bëło jich rozm...`
3. `i jedzenié wedle wielënë lëdztwa z kaszëbsczégò krôjòbraznégò parkù òn béł wërëti òn pisôł m jin`

**Context Size 2:**

1. `to je susk z rodzëznë swiniowatëch suidae na kaszëbach ten łëzgôcz żëwi sã roscënama`
2. `bùtnowé lënczi picus viridis to je roscëna z rodzëznë cyperaceae òn rosce m jin w gardze dérowałë`
3. `ùrodzëlë sã ùmarlë gregòriańsczi kalãdôrz zaczął bëc ùżiwóny dopiérze w na zôczątkù leno w niechtërn...`

**Context Size 3:**

1. `wëdarzenia ùrodzëlë sã ùmarlë przësłowia barbara swiãtô ò rëbôkach pamiãtô jak na barbarã mróz schòw...`
2. `ùrodzëlë sã ùmarlë augùstin dominik chtëren napisôł m jin że kaszëbi cassubiorum gôdają pò wandalskù...`
3. `w pòmòrsczim wòjewództwie w bëtowsczim krézu w pòmòrsczim wòjewództwie tu je pałac a w nim klôsztór ...`

**Context Size 4:**

1. `wëdarzenia ùrodzëlë sã ùmarlë przësłowié w stôrim piéckù diabeł pôli`
2. `p p p p p p p p p p p p p p p swiãta ë ùroczëznë midzënôrodné`
3. `w pòmòrsczim wòjewództwie w kartësczim krézu w gminie kartuzë tu ùrodzył sã gerard labùda niedalek ò...`

### Generated Text Samples (Subword-based)

@@ -271,34 +303,34 @@

**Context Size 1:**

1. `_jeczącz_wierëne`
2. `a_xycok_w_słowin`
3. `i_pò_aromstë_adz`

**Context Size 2:**

1. `cz_gmik_47_iniewò`
2. `a_z_pòzwëbski)_na`
3. `_w_rok_drólotam_p`

**Context Size 3:**

1. `czim_jãzëkã._strzé`
2. `_w_pòzwa_«lucjonal`
3. `sczi_kaszëbsczégò_`

**Context Size 4:**

1. `sczi)._wiesłowie_ho`
2. `czi_lëdztwa_kaszëbs`
3. `_je_w_tim_célu_gduń`

### Key Findings

- **Best Predictability:** Context-4 (word) with 98.0% predictability
- **Branching Factor:** Decreases with context size (more deterministic)
- **Memory Trade-off:** Larger contexts require more storage (176,892 contexts)
- **Recommendation:** Context-3 or Context-4 for text generation
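The transition tables in `models/word_markov/` can be sampled directly to produce text like the samples above. The sketch below is a simplified weighted random walk; the column names (`context`, `next`, `count`) and the space-joined context encoding are assumptions about the parquet schema, and the repo's own `wikilangs` tooling may expose a proper generation API.

```python
# Minimal sketch: sampled generation from the context-2 word Markov chain.
# Schema ("context", "next", "count") is an assumption; adapt as needed.
import random
import pandas as pd

df = pd.read_parquet("models/word_markov/csb_markov_ctx2_word.parquet")

def step(context):
    rows = df[df["context"] == context]
    if rows.empty:
        return None
    # Sample the next word proportionally to its transition count.
    return random.choices(rows["next"].tolist(), weights=rows["count"].tolist(), k=1)[0]

words = ["to", "je"]                 # seed matching the context size of 2
for _ in range(15):
    nxt = step(" ".join(words[-2:]))
    if nxt is None:
        break
    words.append(nxt)
print(" ".join(words))
```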
---
@@ -314,64 +346,64 @@

| Metric | Value |
|--------|-------|
| Vocabulary Size | 28,419 |
| Total Tokens | 363,789 |
| Mean Frequency | 12.80 |
| Median Frequency | 3 |
| Frequency Std Dev | 147.85 |

### Most Common Words

| Rank | Word | Frequency |
|------|------|-----------|
| 1 | w | 17,269 |
| 2 | je | 7,835 |
| 3 | i | 6,858 |
| 4 | na | 6,665 |
| 5 | z | 4,968 |
| 6 | to | 4,725 |
| 7 | sã | 3,705 |
| 8 | do | 3,388 |
| 9 | rok | 3,182 |
| 10 | a | 2,483 |

### Least Common Words (from vocabulary)

| Rank | Word | Frequency |
|------|------|-----------|
| 1 | krakowska | 2 |
| 2 | włãczëne | 2 |
| 3 | союз | 2 |
| 4 | eliminowanié | 2 |
| 5 | pòliticznich | 2 |
| 6 | pôłna | 2 |
| 7 | kòntrola | 2 |
| 8 | ùmòwã | 2 |
| 9 | stalinizm | 2 |
| 10 | fssr | 2 |

### Zipf's Law Analysis

| Metric | Value |
|--------|-------|
| Zipf Coefficient | 0.9915 |
| R² (Goodness of Fit) | 0.995964 |
| Adherence Quality | **excellent** |

### Coverage Analysis

| Top N Words | Coverage |
|-------------|----------|
| Top 100 | 36.1% |
| Top 1,000 | 63.4% |
| Top 5,000 | 80.0% |
| Top 10,000 | 87.6% |

### Key Findings

- **Zipf Compliance:** R²=0.9960 indicates excellent adherence to Zipf's law
- **High Frequency Dominance:** Top 100 words cover 36.1% of corpus
- **Long Tail:** 18,419 words needed for remaining 12.4% coverage
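The Zipf coefficient and R² above can be re-derived from the vocabulary table with a log-log least-squares fit. The sketch assumes the parquet exposes `word` and `frequency` columns; verify the schema before relying on it.

```python
# Minimal sketch: fit Zipf's law log(freq) ≈ -s*log(rank) + c to the vocabulary.
# Column name "frequency" is an assumption about the parquet schema.
import numpy as np
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/csb_vocabulary.parquet")
freq = np.sort(vocab["frequency"].to_numpy())[::-1]   # descending frequencies
rank = np.arange(1, len(freq) + 1)

slope, intercept = np.polyfit(np.log(rank), np.log(freq), 1)
pred = slope * np.log(rank) + intercept
ss_res = np.sum((np.log(freq) - pred) ** 2)
ss_tot = np.sum((np.log(freq) - np.log(freq).mean()) ** 2)

# Report coefficient and R²; the report lists ≈0.99 and ≈0.996 respectively.
print(-slope, 1 - ss_res / ss_tot)
```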
---
## 5. Word Embeddings Evaluation
@@ -387,37 +419,40 @@

### 5.1 Cross-Lingual Alignment

[Figures: cross-lingual alignment visualizations]

### 5.2 Model Comparison

| Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
|-------|-----------|----------|------------------|---------------|----------------|
| **mono_32d** | 32 | 0.7585 | 0.3620 | N/A | N/A |
| **mono_64d** | 64 | 0.5824 | 0.3234 | N/A | N/A |
| **mono_128d** | 128 | 0.1382 | 0.3213 | N/A | N/A |
| **aligned_32d** | 32 | 0.7585 🏆 | 0.3595 | 0.0200 | 0.1880 |
| **aligned_64d** | 64 | 0.5824 | 0.3217 | 0.0600 | 0.2480 |
| **aligned_128d** | 128 | 0.1382 | 0.3200 | 0.1040 | 0.3580 |

### Key Findings

- **Best Isotropy:** aligned_32d with 0.7585 (more uniform distribution)
- **Semantic Density:** Average pairwise similarity of 0.3347. Lower values indicate better semantic separation.
- **Alignment Quality:** Aligned models achieve up to 10.4% R@1 in cross-lingual retrieval.
- **Recommendation:** 128d aligned for best cross-lingual performance
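The aligned variants ship a projection matrix next to each embedding binary (`csb_*.projection.npy`; its 65,664-byte size for the 128d model matches a 128×128 float32 array plus the NumPy header). A minimal loading sketch follows. Treating the `.bin` files as fastText binaries (suggested by the `fasttext` tag) and applying the projection as a plain matrix product are both assumptions, not documented behaviour.

```python
# Minimal sketch, assuming fastText .bin embeddings and a linear alignment
# projection stored as a 128x128 .npy matrix.
import fasttext
import numpy as np

model = fasttext.load_model("models/embeddings/monolingual/csb_128d.bin")
W = np.load("models/embeddings/aligned/csb_128d.projection.npy")  # assumed 128x128

v = model.get_word_vector("kaszëbsczi")   # monolingual vector
aligned_v = v @ W                          # assumed orientation; transpose if shapes disagree
print(aligned_v.shape)
```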
---
## 6. Morphological Analysis (Experimental)
This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.

### 6.1 Productivity & Complexity

| Metric | Value | Interpretation | Recommendation |
|--------|-------|----------------|----------------|
| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
| Idiomaticity Gap | **1.504** | High formulaic/idiomatic content | - |

### 6.2 Affix Inventory (Productive Units)

@@ -426,17 +461,17 @@

#### Productive Prefixes
| Prefix | Examples |
|--------|----------|
| `-pr` | przednik, przistãpną, prowincëjã |
| `-pò` | pòzycji, pòkòrë, pòdôwô |

#### Productive Suffixes
| Suffix | Examples |
|--------|----------|
| `-a` | gdùńska, chòrobama, tradycja |
| `-ch` | griphenberch, błãdnëch, pòdwòrzach |
| `-zi` | czedrowsczi, krëszczi, amerikansczi |
| `-czi` | czedrowsczi, krëszczi, amerikansczi |
| `-ów` | ùrządzeniów, wëdôwków, dzélëków |
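These affix inventories can be spot-checked against the vocabulary table. A small sketch, again assuming `word` and `frequency` columns in the parquet:

```python
# Minimal sketch: list the most frequent vocabulary items carrying one of the
# productive suffixes reported above.
import pandas as pd

vocab = pd.read_parquet("models/vocabulary/csb_vocabulary.parquet")
suffix = "czi"
hits = vocab[vocab["word"].str.endswith(suffix)]
print(len(hits),
      hits.sort_values("frequency", ascending=False)["word"].head(10).tolist())
```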
### 6.3 Bound Stems (Lexical Roots)

@@ -444,18 +479,18 @@

| Stem | Cohesion | Substitutability | Examples |
|------|----------|------------------|----------|
| `tërn` | 1.98x | 29 contexts | chtërny, chtërno, chtërnë |
| `chtë` | 2.02x | 27 contexts | chtërë, sëchtë, zëchtë |
| `htër` | 2.06x | 23 contexts | chtërë, chtëre, chtërô |
| `szëb` | 2.02x | 22 contexts | kaszëb, kaszëbą, kaszëbã |
| `sczi` | 1.43x | 67 contexts | bùsczi, łasczi, bòsczi |
| `zeni` | 1.61x | 32 contexts | zenice, grzenia, ùczeniô |
| `odzë` | 1.76x | 23 contexts | rodzëc, rodzënë, rodzëcë |
| `stol` | 1.81x | 20 contexts | stolp, stole, stolpe |
| `rodz` | 1.40x | 45 contexts | rodzą, rodzy, rodze |
| `aszë` | 1.93x | 14 contexts | kaszëb, kaszëbą, kaszëbã |
| `sczé` | 1.44x | 30 contexts | rusczé, nisczé, wąsczé |
| `zëbs` | 2.09x | 9 contexts | kaszëbsko, kaszëbsce, kaszëbskù |

### 6.4 Affix Compatibility (Co-occurrence)

@@ -463,16 +498,16 @@

| Prefix | Suffix | Frequency | Examples |
|--------|--------|-----------|----------|
| `-pr` | `-ów` | 23 words | prawów, przezeblôkańców |
| `-pr` | `-a` | 20 words | procesama, praha |
| `-pò` | `-a` | 14 words | pòsłëga, pòlsczima |
| `-pò` | `-ch` | 13 words | pòłączeniach, pòdwòdnëch |
| `-pò` | `-ów` | 9 words | pòzwów, pòspólnotów |
| `-pr` | `-ch` | 7 words | prawach, prezidencczich |
| `-pò` | `-zi` | 6 words | pòlszczi, pòmerénczi |
| `-pò` | `-czi` | 6 words | pòlszczi, pòmerénczi |
| `-pr` | `-zi` | 6 words | prëczkòwsczi, prasczi |
| `-pr` | `-czi` | 4 words | prëczkòwsczi, prasczi |

### 6.5 Recursive Morpheme Segmentation

@@ -480,26 +515,28 @@

| Word | Suggested Split | Confidence | Stem |
|------|-----------------|------------|------|
| państwòwich | **`państwòwi-ch`** | 4.5 | `państwòwi` |
| mòdlëtwów | **`mòdlëtw-ów`** | 4.5 | `mòdlëtw` |
| przebendowsczich | **`pr-zebendows-czi-ch`** | 4.5 | `zebendows` |
| czerënków | **`czerënk-ów`** | 4.5 | `czerënk` |
| gòspòdarztwach | **`gòspòdarztwa-ch`** | 4.5 | `gòspòdarztwa` |
| kòmpùtrach | **`kòmpùtra-ch`** | 4.5 | `kòmpùtra` |
| chternych | **`chterny-ch`** | 4.5 | `chterny` |
| instrumentów | **`instrument-ów`** | 4.5 | `instrument` |
| wiérztczi | **`wiérzt-czi`** | 4.5 | `wiérzt` |
| etnicznych | **`etniczny-ch`** | 4.5 | `etniczny` |
| kònkùrsów | **`kònkùrs-ów`** | 4.5 | `kònkùrs` |
| wòjskòwich | **`wòjskòwi-ch`** | 4.5 | `wòjskòwi` |
| miemiecczich | **`miemiec-czi-ch`** | 3.0 | `miemiec` |
| pòległëch | **`pò-ległë-ch`** | 3.0 | `ległë` |
| programach | **`pr-ograma-ch`** | 3.0 | `ograma` |

### 6.6 Linguistic Interpretation

> **Automated Insight:**
> The language Kashubian shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.

> **Note on Idiomaticity:** The high Idiomaticity Gap suggests a large number of frequent multi-word expressions or formulaic sequences that are statistically distinct from their component parts.

---
## 7. Summary & Recommendations

@@ -511,7 +548,7 @@

| Component | Recommended | Rationale |
|-----------|-------------|-----------|
| Tokenizer | **64k BPE** | Best compression (4.52x) |
| N-gram | **2-gram** | Lowest perplexity (457) |
| Markov | **Context-4** | Highest predictability (98.0%) |
| Embeddings | **100d** | Balanced semantic capture and isotropy |

@@ -726,4 +763,4 @@

---
*Generated by Wikilangs Models Pipeline*

*Report Date: 2026-01-03 20:55:59*
models/embeddings/aligned/ (12 files ADDED)

New Git LFS pointers:

| File | oid (sha256) | size (bytes) |
|------|--------------|--------------|
| csb_32d.bin | f2bb2172bdfcc3d2a8f53b936c3f8101b78d6dfdb5eae5e396a887a3a7169611 | 258287190 |
| csb_32d.projection.npy | 70af405e299a4b5f2a106a46c4ed598039a321b33e20e6e79b429efe14b53bad | 4224 |
| csb_64d.bin | 8d0920564c633a2a22bc60f797cc31f28ae35c1dd6531daa7cd32db1ffe2b9da | 516427862 |
| csb_64d.projection.npy | fe6a56697e1060de28a3e13138ae748199b88287d6e1fbdc82d3c6c23e8da33c | 16512 |
| csb_128d.bin | 43c16a3877fc0cb095ba4ff3327fee74803119be2a9e7f20081afa461282b882 | 1032709206 |
| csb_128d.projection.npy | 82a25567e36768d3dd6c94cb4d646624c69d2b3259a8acd77b503d8bd46816d6 | 65664 |

New metadata files:

csb_32d.meta.json: {"lang": "csb", "dim": 32, "max_seq_len": 512, "is_aligned": true}
csb_64d.meta.json: {"lang": "csb", "dim": 64, "max_seq_len": 512, "is_aligned": true}
csb_128d.meta.json: {"lang": "csb", "dim": 128, "max_seq_len": 512, "is_aligned": true}

csb_32d_metadata.json: {"language": "csb", "dimension": 32, "version": "aligned", "hub_language": "en", "seed_vocab_size": 1997, "vocab_size": 8362}
csb_64d_metadata.json: {"language": "csb", "dimension": 64, "version": "aligned", "hub_language": "en", "seed_vocab_size": 1997, "vocab_size": 8362}
csb_128d_metadata.json: {"language": "csb", "dimension": 128, "version": "aligned", "hub_language": "en", "seed_vocab_size": 1997, "vocab_size": 8362}
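Downloaded artifacts can be checked against the oid and size recorded in these pointers, since Git LFS oids are plain SHA-256 digests of the file contents:

```python
# Minimal sketch: verify a downloaded file against its Git LFS pointer oid.
import hashlib

def sha256_of(path, chunk=1 << 20):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()

path = "models/embeddings/aligned/csb_128d.projection.npy"
expected = "82a25567e36768d3dd6c94cb4d646624c69d2b3259a8acd77b503d8bd46816d6"
print(sha256_of(path) == expected)
```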
models/embeddings/monolingual/ (6 files CHANGED)

Updated Git LFS pointers:

| File | new oid (sha256) | new size (bytes) |
|------|------------------|------------------|
| csb_32d.bin | f2bb2172bdfcc3d2a8f53b936c3f8101b78d6dfdb5eae5e396a887a3a7169611 | 258287190 |
| csb_64d.bin | 8d0920564c633a2a22bc60f797cc31f28ae35c1dd6531daa7cd32db1ffe2b9da | 516427862 |
| csb_128d.bin | 43c16a3877fc0cb095ba4ff3327fee74803119be2a9e7f20081afa461282b882 | 1032709206 |

Updated metadata (csb_32d_metadata.json, csb_64d_metadata.json, csb_128d_metadata.json): each now records "vocab_size": 8362; the surrounding fields ("encoding_method": "rope", "dim") are unchanged.
models/subword_markov/ (8 files CHANGED)

Updated Git LFS pointers:

| File | new oid (sha256) | new size (bytes) |
|------|------------------|------------------|
| csb_markov_ctx1_subword.parquet | 07eb6d169e3ffd28d93cc892cd907a2570cae0d5f4d502f701e7b56450cbaf41 | 59418 |
| csb_markov_ctx2_subword.parquet | 00453bfd0428c74f169fc250cc72839e9b5be33c55c1202acd67810aa99b42ad | 338971 |
| csb_markov_ctx3_subword.parquet | d83e6ab8b9405b08960d7b9e06958ed76a019d9fcfaf38af0f021e65364e14c0 | 1286095 |
| csb_markov_ctx4_subword.parquet | 78238e27a8f4af66d93ad597fd4f60ddc27e630f9ba723af590e6c88ec4b2e48 | 3564109 |

Updated metadata (new values):

| File | unique_contexts | total_transitions |
|------|-----------------|-------------------|
| csb_markov_ctx1_subword_metadata.json | 979 | 2875478 |
| csb_markov_ctx2_subword_metadata.json | 7156 | 2870015 |
| csb_markov_ctx3_subword_metadata.json | 42873 | 2864552 |
| csb_markov_ctx4_subword_metadata.json | 176892 | 2859089 |
models/subword_ngram/ (6 files CHANGED, 2 files ADDED)

Updated and new Git LFS pointers:

| File | Change | new oid (sha256) | new size (bytes) |
|------|--------|------------------|------------------|
| csb_2gram_subword.parquet | CHANGED | 40917317e28e13d4228b19162f4f9e76a456818b2b363f27993f2281ea575568 | 39268 |
| csb_3gram_subword.parquet | CHANGED | f5596a9d5bb811dc3f44b4afba898558c7c00cb30c04f5784909945808e7ca65 | 281995 |
| csb_4gram_subword.parquet | CHANGED | f9c7a747d028ad968f03169031ba3678448ae4a3d57bfa89fdc232e93d6a3be1 | 1200627 |
| csb_5gram_subword.parquet | ADDED | 33e69de94b839b835a3a0e7e9affd94bf27d26e11d1673802cdb8940bef59ee0 | 2541917 |

Metadata (new values):

| File | Change | unique_ngrams | total_ngrams |
|------|--------|---------------|--------------|
| csb_2gram_subword_metadata.json | CHANGED | 2749 | 2875478 |
| csb_3gram_subword_metadata.json | CHANGED | 22499 | 2870015 |
| csb_4gram_subword_metadata.json | CHANGED | 102765 | 2864552 |
| csb_5gram_subword_metadata.json | ADDED | 210801 | 2859089 |
models/tokenizer/ (8 files CHANGED)

Updated Git LFS pointers:

| File | new oid (sha256) | new size (bytes) |
|------|------------------|------------------|
| csb_tokenizer_8k.model | 171f6e45e564f0a1c09a28b681680e14f1de999eb483707de14434d19ab1a060 | 374708 |
| csb_tokenizer_16k.model | d02e0145e192ef700356d843d6237f98715147f78634bb2300d6f2eea840f7db | 512250 |
| csb_tokenizer_32k.model | 7ceaf1030ec78c2d3886b42ae9d8848969e8efddf8cdd8bec2c51fddc56fd37c | 794778 |
| csb_tokenizer_64k.model | f73c8de3bbadc860535a7b50d36e6fe3a2b224807b2b54cf36c21121e611b23a | 1420921 |

The matching vocabulary files (csb_tokenizer_8k.vocab, csb_tokenizer_16k.vocab, csb_tokenizer_32k.vocab, csb_tokenizer_64k.vocab) also changed; their diffs are too large to render in this view.
models/vocabulary/ (2 files CHANGED)

csb_vocabulary.parquet, updated Git LFS pointer:
oid sha256:fe22e4af6ddea2fa9868d9195a53c2762291d71d1b71388fed2f82923821c1cf
size 495850

csb_vocabulary_metadata.json, new contents:
{
  "language": "csb",
  "vocabulary_size": 28419,
  "variant": "full",
  "statistics": {
    "type_token_ratio": 0.19500058808139045,
    "coverage": {
      "top_100": 0.31544685784512577,
      "top_1000": 0.5532501698235036,
      "top_5000": 0.6984774692817486,
      "top_10000": 0.7649330667364364
    },
    "hapax_count": 52820,
    "hapax_ratio": 0.6501803321065006,
    "total_documents": 5463
  }
}
models/word_markov/ (4 files CHANGED)

Updated Git LFS pointers:

| File | new oid (sha256) | new size (bytes) |
|------|------------------|------------------|
| csb_markov_ctx1_word.parquet | fc6bdec0fd1ec6bc5a83035f365843669521b25c56ba1798b6558918e6d997f2 | 2559387 |
| csb_markov_ctx2_word.parquet | ab631eed2145ba815309e4d6cb61f5fa7d512d6972d20426c14fe86f8d2d72aa | 4833234 |

Updated metadata (new values):

| File | unique_contexts | total_transitions |
|------|-----------------|-------------------|
| csb_markov_ctx1_word_metadata.json | 80925 | 411146 |
| csb_markov_ctx2_word_metadata.json | 237972 | 405683 |