omarkamali committed on Jan 4

Commit

5e6fc93

verified ·

1 Parent(s): 3171718

Upload all models and assets for dag (latest)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +1 -0
README.md +208 -175
models/embeddings/aligned/dag_128d.bin +3 -0
models/embeddings/aligned/dag_128d.meta.json +1 -0
models/embeddings/aligned/dag_128d.projection.npy +3 -0
models/embeddings/aligned/dag_128d_metadata.json +8 -0
models/embeddings/aligned/dag_32d.bin +3 -0
models/embeddings/aligned/dag_32d.meta.json +1 -0
models/embeddings/aligned/dag_32d.projection.npy +3 -0
models/embeddings/aligned/dag_32d_metadata.json +8 -0
models/embeddings/aligned/dag_64d.bin +3 -0
models/embeddings/aligned/dag_64d.meta.json +1 -0
models/embeddings/aligned/dag_64d.projection.npy +3 -0
models/embeddings/aligned/dag_64d_metadata.json +8 -0
models/embeddings/monolingual/dag_128d.bin +2 -2
models/embeddings/monolingual/dag_128d_metadata.json +1 -1
models/embeddings/monolingual/dag_32d.bin +2 -2
models/embeddings/monolingual/dag_32d_metadata.json +1 -1
models/embeddings/monolingual/dag_64d.bin +2 -2
models/embeddings/monolingual/dag_64d_metadata.json +1 -1
models/subword_markov/dag_markov_ctx1_subword.parquet +2 -2
models/subword_markov/dag_markov_ctx1_subword_metadata.json +2 -2
models/subword_markov/dag_markov_ctx2_subword.parquet +2 -2
models/subword_markov/dag_markov_ctx2_subword_metadata.json +2 -2
models/subword_markov/dag_markov_ctx3_subword.parquet +2 -2
models/subword_markov/dag_markov_ctx3_subword_metadata.json +2 -2
models/subword_markov/dag_markov_ctx4_subword.parquet +2 -2
models/subword_markov/dag_markov_ctx4_subword_metadata.json +2 -2
models/subword_ngram/dag_2gram_subword.parquet +2 -2
models/subword_ngram/dag_2gram_subword_metadata.json +2 -2
models/subword_ngram/dag_3gram_subword.parquet +2 -2
models/subword_ngram/dag_3gram_subword_metadata.json +2 -2
models/subword_ngram/dag_4gram_subword.parquet +2 -2
models/subword_ngram/dag_4gram_subword_metadata.json +2 -2
models/subword_ngram/dag_5gram_subword.parquet +3 -0
models/subword_ngram/dag_5gram_subword_metadata.json +7 -0
models/tokenizer/dag_tokenizer_16k.model +2 -2
models/tokenizer/dag_tokenizer_16k.vocab +0 -0
models/tokenizer/dag_tokenizer_32k.model +2 -2
models/tokenizer/dag_tokenizer_32k.vocab +0 -0
models/tokenizer/dag_tokenizer_64k.model +2 -2
models/tokenizer/dag_tokenizer_64k.vocab +0 -0
models/tokenizer/dag_tokenizer_8k.model +2 -2
models/tokenizer/dag_tokenizer_8k.vocab +0 -0
models/vocabulary/dag_vocabulary.parquet +2 -2
models/vocabulary/dag_vocabulary_metadata.json +9 -9
models/word_markov/dag_markov_ctx1_word.parquet +2 -2
models/word_markov/dag_markov_ctx1_word_metadata.json +2 -2
models/word_markov/dag_markov_ctx2_word.parquet +2 -2
models/word_markov/dag_markov_ctx2_word_metadata.json +2 -2

.gitattributes CHANGED Viewed

@@ -39,3 +39,4 @@ visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -t
 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text

 visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text
 visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text
 visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text
+visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 language: dag
-language_name: DAG
 language_family: atlantic_gur
 tags:
   - wikilangs
@@ -10,11 +10,21 @@ tags:
   - n-gram
   - markov
   - wikipedia
   - monolingual
   - family-atlantic_gur
 license: mit
 library_name: wikilangs
-pipeline_tag: feature-extraction
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
@@ -23,20 +33,20 @@ dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
-    value: 3.797
   - name: best_isotropy
     type: isotropy
-    value: 0.8190
   - name: vocabulary_size
     type: vocab
     value: 0
-generated: 2026-01-03
 ---
-# DAG - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
-This repository contains NLP models trained and evaluated by Wikilangs, specifically on **DAG** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
@@ -60,7 +70,7 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
-- [6. Morphological Analysis (Experimental)](#6-morphological-analysis)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
@@ -80,47 +90,47 @@ We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
-| **8k** | 3.299x | 3.30 | 0.0715% | 902,227 |
-| **16k** | 3.519x | 3.52 | 0.0763% | 845,892 |
-| **32k** | 3.683x | 3.68 | 0.0798% | 808,030 |
-| **64k** | 3.797x 🏆 | 3.80 | 0.0823% | 783,801 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
-**Sample 1:** `Tamale International School (TIS) nyɛla kariŋ zuŋ ti talli m bɛ Jisonayili, Sagn...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁tamale ▁international ▁school ▁( tis ) ▁nyɛla ▁kariŋ ▁zu ŋ ... (+14 more)` | 24 |
-| 16k | `▁tamale ▁international ▁school ▁( tis ) ▁nyɛla ▁kariŋ ▁zu ŋ ... (+11 more)` | 21 |
-| 32k | `▁tamale ▁international ▁school ▁( tis ) ▁nyɛla ▁kariŋ ▁zuŋ ▁ti ... (+10 more)` | 20 |
-| 64k | `▁tamale ▁international ▁school ▁( tis ) ▁nyɛla ▁kariŋ ▁zuŋ ▁ti ... (+10 more)` | 20 |
-**Sample 2:** `Bɛ nyɛla ti gbansabila paɣiba ban nyɛ toondanim bee tiŋgbani zuɣulanima`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁bɛ ▁nyɛla ▁ti ▁gbansabila ▁paɣiba ▁ban ▁nyɛ ▁toond anim ▁bee ... (+3 more)` | 13 |
-| 16k | `▁bɛ ▁nyɛla ▁ti ▁gbansabila ▁paɣiba ▁ban ▁nyɛ ▁toond anim ▁bee ... (+3 more)` | 13 |
-| 32k | `▁bɛ ▁nyɛla ▁ti ▁gbansabila ▁paɣiba ▁ban ▁nyɛ ▁toond anim ▁bee ... (+3 more)` | 13 |
-| 64k | `▁bɛ ▁nyɛla ▁ti ▁gbansabila ▁paɣiba ▁ban ▁nyɛ ▁toond anim ▁bee ... (+3 more)` | 13 |
-**Sample 3:** `GoondaaNaden, Tony. Dagbani dictionary. Webonary. Kundivihira`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
-| 8k | `▁go on da anaden , ▁tony . ▁dagbani ▁dictionary . ... (+3 more)` | 13 |
-| 16k | `▁go on da anaden , ▁tony . ▁dagbani ▁dictionary . ... (+3 more)` | 13 |
-| 32k | `▁go on da anaden , ▁tony . ▁dagbani ▁dictionary . ... (+3 more)` | 13 |
-| 64k | `▁go onda anaden , ▁tony . ▁dagbani ▁dictionary . ▁webonary ... (+2 more)` | 12 |
 ### Key Findings
-- **Best Compression:** 64k achieves 3.797x compression
-- **Lowest UNK Rate:** 8k with 0.0715% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
@@ -137,12 +147,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
-| **2-gram** | Word | 32,119 | 14.97 | 135,454 | 12.8% | 30.2% |
-| **2-gram** | Subword | 338 🏆 | 8.40 | 6,662 | 61.1% | 98.8% |
-| **3-gram** | Word | 61,294 | 15.90 | 205,054 | 9.7% | 22.3% |
-| **3-gram** | Subword | 3,287 | 11.68 | 48,860 | 19.7% | 63.9% |
-| **4-gram** | Word | 122,956 | 16.91 | 377,494 | 8.8% | 17.3% |
-| **4-gram** | Subword | 20,734 | 14.34 | 281,639 | 9.1% | 31.1% |
 ### Top 5 N-grams by Size
@@ -150,68 +162,88 @@ Below are sample sentences tokenized with each vocabulary size:
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `of the` | 21,384 |
-| 2 | `n ti` | 15,953 |
-| 3 | `o daa` | 10,685 |
-| 4 | `din be` | 10,124 |
-| 5 | `ni daa` | 9,962 |
 **3-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `of the year` | 4,890 |
-| 2 | `n ti pahi` | 4,503 |
 | 3 | `zaŋ n ti` | 3,966 |
-| 4 | `nyɛla bɛ ni` | 3,607 |
-| 5 | `bɛ ni daa` | 3,248 |
 **4-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `ninsali biɛlim kalibu baŋsim` | 2,948 |
-| 2 | `biɛlim kalibu baŋsim bɔhimbu` | 2,948 |
 | 3 | `zalikpana mini gɔmnanti tali` | 2,947 |
 | 4 | `ni nyamma soya economy` | 2,945 |
 | 5 | `demographics ninsali biɛlim kalibu` | 2,944 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `a _` | 739,697 |
-| 2 | `i _` | 724,304 |
-| 3 | `n _` | 498,067 |
-| 4 | `a n` | 496,882 |
-| 5 | `, _` | 495,235 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `n i _` | 221,639 |
-| 2 | `_ n i` | 165,629 |
-| 3 | `_ m a` | 130,342 |
-| 4 | `l i _` | 130,046 |
-| 5 | `_ d a` | 129,510 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
-| 1 | `t h e _` | 98,150 |
-| 2 | `_ t h e` | 92,918 |
-| 3 | `_ n i _` | 91,122 |
-| 4 | `_ o f _` | 87,857 |
-| 5 | `_ d a a` | 76,848 |
 ### Key Findings
 - **Best Perplexity:** 2-gram (subword) with 338
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
-- **Coverage:** Top-1000 patterns cover ~31% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
@@ -227,14 +259,14 @@ Below are sample sentences tokenized with each vocabulary size:
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
-| **1** | Word | 0.7248 | 1.653 | 6.35 | 344,988 | 27.5% |
-| **1** | Subword | 1.1283 | 2.186 | 6.69 | 4,037 | 0.0% |
-| **2** | Word | 0.2745 | 1.210 | 1.73 | 2,189,455 | 72.6% |
-| **2** | Subword | 0.6262 | 1.543 | 4.19 | 27,009 | 37.4% |
-| **3** | Word | 0.1110 | 1.080 | 1.21 | 3,779,471 | 88.9% |
-| **3** | Subword | 0.7294 | 1.658 | 4.22 | 113,279 | 27.1% |
-| **4** | Word | 0.0538 🏆 | 1.038 | 1.09 | 4,582,569 | 94.6% |
-| **4** | Subword | 0.7212 | 1.649 | 3.38 | 478,359 | 27.9% |
 ### Generated Text Samples (Word-based)
@@ -242,26 +274,26 @@ Below are text samples generated from each word-based Markov chain model:
 **Context Size 1:**
-1. `ni nyamma soya economy zalikpana mini polish o nyɛla bɛ tooi lahi sabiri yɛltɔɣa 23 47`
-2. `the break media binkpɛra transportation kundivihira pubu yaɣali tum yuuni fifa confederations cup s ...`
-3. `of the kurds of a african american lens nyɛla dolodolo mabiligu zaa tinsi salima di ni`
 **Context Size 2:**
-1. `of the visual arts general science karimba ni climatologist o piligu mini o tumo tarsi tɔ taali`
-2. `n ti wɔbigi paati jintɔra justice baah mathuselah daa nyɛla nigeria sasabira niriba bela n daa tɔ`
-3. `o daa lahi sôå kpaåsi kaya ni taɣada culture lahabali churi media binkpɛra transportation kundivihir...`
 **Context Size 3:**
-1. `of the year featuring farruko la familia urban album of the year lo siento bb himself best male`
-2. `n ti pahi metropolitan museum of art contemporary black artists july 1 31 counterpoints 23 march 16 ...`
-3. `zaŋ n ti daily graphic graphic communications group limited nima n daa ti o photographic curatorship...`
 **Context Size 4:**
-1. `biɛlim kalibu baŋsim bɔhimbu bomma ni nyamma soya economy zalikpana mini gɔmnanti tali law and gover...`
-2. `ninsali biɛlim kalibu baŋsim bɔhimbu bomma ni nyamma soya economy zalikpana mini gɔmnanti tali law a...`
 3. `zalikpana mini gɔmnanti tali law and government baŋsim bɔbu education kaya ni taada lahabali churi m...`
@@ -271,34 +303,34 @@ Below are text samples generated from each subword-based Markov chain model:
 **Context Size 1:**
-1. `_tamprecstessia_`
-2. `abrae_devineri_f`
-3. `ir_imaa_munghica`
 **Context Size 2:**
-1. `a_noadoma_pause_a`
-2. `i_smi_bortion_ght`
-3. `n_sh_ana_/_mankss`
 **Context Size 3:**
-1. `ni_sologic_schardk`
-2. `_ni_bɛ_tumahaba_pv`
-3. `_may_les_populi_ma`
 **Context Size 4:**
-1. `the_cissued_tieth_c`
-2. `_the_sunships,_larr`
-3. `_ni_lebowalestory_c`
 ### Key Findings
 - **Best Predictability:** Context-4 (word) with 94.6% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
-- **Memory Trade-off:** Larger contexts require more storage (478,359 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
@@ -314,64 +346,64 @@ Below are text samples generated from each subword-based Markov chain model:
 | Metric | Value |
 |--------|-------|
-| Vocabulary Size | 131,668 |
-| Total Tokens | 5,761,123 |
-| Mean Frequency | 43.75 |
 | Median Frequency | 4 |
-| Frequency Std Dev | 757.65 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | ni | 104,103 |
-| 2 | the | 91,175 |
-| 3 | of | 87,976 |
-| 4 | daa | 75,182 |
-| 5 | o | 70,845 |
-| 6 | ka | 69,699 |
-| 7 | n | 51,684 |
-| 8 | nyɛla | 49,641 |
-| 9 | din | 47,965 |
-| 10 | di | 44,711 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
-| 1 | menteith | 2 |
-| 2 | marischal | 2 |
-| 3 | dupplin | 2 |
-| 4 | malakula | 2 |
-| 5 | ambrym | 2 |
-| 6 | malekula | 2 |
-| 7 | biili | 2 |
-| 8 | chaira | 2 |
-| 9 | juŋ | 2 |
-| 10 | surim | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
-| Zipf Coefficient | 1.0503 |
-| R² (Goodness of Fit) | 0.994826 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
-| Top 100 | 31.5% |
 | Top 1,000 | 58.6% |
 | Top 5,000 | 77.5% |
 | Top 10,000 | 84.5% |
 ### Key Findings
-- **Zipf Compliance:** R²=0.9948 indicates excellent adherence to Zipf's law
-- **High Frequency Dominance:** Top 100 words cover 31.5% of corpus
-- **Long Tail:** 121,668 words needed for remaining 15.5% coverage
 ---
 ## 5. Word Embeddings Evaluation
@@ -387,37 +419,40 @@ Below are text samples generated from each subword-based Markov chain model:
 ### 5.1 Cross-Lingual Alignment
-> *Note: Multilingual alignment visualization not available for this language.*
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
-| **mono_32d** | 32 | 0.7977 | 0.3405 | N/A | N/A |
-| **mono_64d** | 64 | 0.8086 | 0.2759 | N/A | N/A |
-| **mono_128d** | 128 | 0.8190 🏆 | 0.2136 | N/A | N/A |
 ### Key Findings
-- **Best Isotropy:** mono_128d with 0.8190 (more uniform distribution)
-- **Semantic Density:** Average pairwise similarity of 0.2767. Lower values indicate better semantic separation.
-- **Alignment Quality:** No aligned models evaluated in this run.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
-> ⚠️ **Warning:** This language shows low morphological productivity. The statistical signals used for this analysis may be noisy or less reliable than for morphologically rich languages.
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
-| Productivity Index | **0.000** | Low morphological productivity | ⚠️ Likely unreliable |
-| Idiomaticity Gap | **-1.000** | Low formulaic content | - |
 ### 6.2 Affix Inventory (Productive Units)
@@ -426,16 +461,15 @@ These are the most productive prefixes and suffixes identified by sampling the v
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
-| `-ma` | maresca, malaquais, maehara |
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
-| `-er` | abaranger, bridgwater, alencier |
-| `-an` | seyitan, weitman, eghan |
-| `-ed` | crowned, programmed, loosed |
-| `-ng` | rongguang, invading, watling |
-| `-on` | ferguson, kongaction, turgeon |
 ### 6.3 Bound Stems (Lexical Roots)
@@ -443,18 +477,18 @@ Bound stems are high-frequency subword units that are semantically cohesive but
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
-| `ihir` | 2.44x | 42 contexts | vihir, vihiri, lihira |
-| `ison` | 2.20x | 60 contexts | sison, bison, isong |
-| `uuni` | 2.39x | 37 contexts | tuuni, nuuni, guuni |
-| `nter` | 1.87x | 69 contexts | unter, enter, inter |
-| `ctor` | 1.94x | 43 contexts | actor, actors, actora |
-| `riso` | 2.31x | 23 contexts | prison, bɔriso, arison |
-| `reen` | 1.99x | 37 contexts | green, breen, reena |
-| `atio` | 1.84x | 46 contexts | patio, ation, ratio |
-| `tern` | 1.80x | 48 contexts | terna, stern, terns |
-| `ture` | 1.74x | 54 contexts | cuture, mature, nature |
-| `rect` | 2.18x | 23 contexts | recta, recto, direct |
-| `awar` | 1.86x | 40 contexts | aware, pawar, yawar |
 ### 6.4 Affix Compatibility (Co-occurrence)
@@ -462,11 +496,10 @@ This table shows which prefixes and suffixes most frequently co-occur on the sam
 | Prefix | Suffix | Frequency | Examples |
 |--------|--------|-----------|----------|
-| `-ma` | `-ng` | 4 words | managing, mating |
-| `-ma` | `-ed` | 3 words | maherunited, manhandled |
-| `-ma` | `-on` | 2 words | manon, mathison |
-| `-ma` | `-an` | 2 words | magpakailanman, marjan |
-| `-ma` | `-er` | 1 words | manger, mater |
 ### 6.5 Recursive Morpheme Segmentation
@@ -474,26 +507,26 @@ Using **Recursive Hierarchical Substitutability**, we decompose complex words in
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
-| kambangan | **`kamba-ng-an`** | 6.0 | `kamba` |
-| illumination | **`illuminati-on`** | 4.5 | `illuminati` |
-| parenting | **`parenti-ng`** | 4.5 | `parenti` |
-| gregorian | **`gregori-an`** | 4.5 | `gregori` |
-| transkeian | **`transkei-an`** | 4.5 | `transkei` |
-| sheltered | **`shelt-er-ed`** | 3.0 | `shelt` |
-| abandoned | **`aband-on-ed`** | 3.0 | `aband` |
-| mannheimer | **`ma-nnheim-er`** | 3.0 | `nnheim` |
 | malnutrition | **`ma-lnutriti-on`** | 3.0 | `lnutriti` |
-| homemaker | **`homemak-er`** | 1.5 | `homemak` |
-| swintonunited | **`swintonunit-ed`** | 1.5 | `swintonunit` |
-| xiaoxiang | **`xiaoxia-ng`** | 1.5 | `xiaoxia` |
-| venneraunited | **`venneraunit-ed`** | 1.5 | `venneraunit` |
-| grantunited | **`grantunit-ed`** | 1.5 | `grantunit` |
-| substation | **`substati-on`** | 1.5 | `substati` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
-The language DAG appears to be more isolating or has a highly fixed vocabulary. Word-level models perform nearly as well as subword models, indicating fewer productive morphological processes.
 ---
 ## 7. Summary & Recommendations
@@ -504,7 +537,7 @@ The language DAG appears to be more isolating or has a highly fixed vocabulary.
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
-| Tokenizer | **64k BPE** | Best compression (3.80x) |
 | N-gram | **2-gram** | Lowest perplexity (338) |
 | Markov | **Context-4** | Highest predictability (94.6%) |
 | Embeddings | **100d** | Balanced semantic capture and isotropy |
@@ -720,4 +753,4 @@ MIT License - Free for academic and commercial use.
 ---
 *Generated by Wikilangs Models Pipeline*
-*Report Date: 2026-01-03 11:48:18*

 ---
 language: dag
+language_name: Dagbani
 language_family: atlantic_gur
 tags:
   - wikilangs
   - n-gram
   - markov
   - wikipedia
+  - feature-extraction
+  - sentence-similarity
+  - tokenization
+  - n-grams
+  - markov-chain
+  - text-mining
+  - fasttext
+  - babelvec
+  - vocabulous
+  - vocabulary
   - monolingual
   - family-atlantic_gur
 license: mit
 library_name: wikilangs
+pipeline_tag: text-generation
 datasets:
   - omarkamali/wikipedia-monthly
 dataset_info:
 metrics:
   - name: best_compression_ratio
     type: compression
+    value: 3.794
   - name: best_isotropy
     type: isotropy
+    value: 0.8139
   - name: vocabulary_size
     type: vocab
     value: 0
+generated: 2026-01-04
 ---
+# Dagbani - Wikilangs Models
 ## Comprehensive Research Report & Full Ablation Study
+This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Dagbani** Wikipedia data.
 We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings.
 ## 📋 Repository Contents
 - [3. Markov Chain Evaluation](#3-markov-chain-evaluation)
 - [4. Vocabulary Analysis](#4-vocabulary-analysis)
 - [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation)
+- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental)
 - [7. Summary & Recommendations](#7-summary--recommendations)
 - [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide)
 - [Visualizations Index](#visualizations-index)
 | Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens |
 |------------|-------------|---------------|----------|--------------|
+| **8k** | 3.300x | 3.30 | 0.0720% | 894,994 |
+| **16k** | 3.518x | 3.52 | 0.0767% | 839,477 |
+| **32k** | 3.682x | 3.68 | 0.0803% | 801,972 |
+| **64k** | 3.794x 🏆 | 3.80 | 0.0827% | 778,290 |
 ### Tokenization Examples
 Below are sample sentences tokenized with each vocabulary size:
+**Sample 1:** `Nyuwɔɣu / Nawɔɣu (wateryam)Naden, Tony. Dagbani dictionary. Webonary. Kundivihir...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁nyu w ɔɣu ▁/ ▁na w ɔɣu ▁( water yam ... (+11 more)` | 21 |
+| 16k | `▁nyu w ɔɣu ▁/ ▁na w ɔɣu ▁( water yam ... (+11 more)` | 21 |
+| 32k | `▁nyu w ɔɣu ▁/ ▁naw ɔɣu ▁( water yam ) ... (+10 more)` | 20 |
+| 64k | `▁nyu wɔɣu ▁/ ▁naw ɔɣu ▁( water yam ) naden ... (+9 more)` | 19 |
+**Sample 2:** `Nakɔhigu nyɛla daankali tuma Dagbaŋ. Ban be di puuni kuri la nima. Di Piligu Be ...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁na kɔ higu ▁nyɛla ▁daan kali ▁tuma ▁dagbaŋ . ▁ban ... (+12 more)` | 22 |
+| 16k | `▁nakɔ higu ▁nyɛla ▁daan kali ▁tuma ▁dagbaŋ . ▁ban ▁be ... (+11 more)` | 21 |
+| 32k | `▁nakɔhigu ▁nyɛla ▁daankali ▁tuma ▁dagbaŋ . ▁ban ▁be ▁di ▁puuni ... (+9 more)` | 19 |
+| 64k | `▁nakɔhigu ▁nyɛla ▁daankali ▁tuma ▁dagbaŋ . ▁ban ▁be ▁di ▁puuni ... (+9 more)` | 19 |
+**Sample 3:** `LaniNaden, Tony. Dagbani dictionary. Webonary.nyɛla doo dabilim yaɣishɛli. Kundi...`
 | Vocab | Tokens | Count |
 |-------|--------|-------|
+| 8k | `▁lan inaden , ▁tony . ▁dagbani ▁dictionary . ▁webonary . ... (+9 more)` | 19 |
+| 16k | `▁lan inaden , ▁tony . ▁dagbani ▁dictionary . ▁webonary . ... (+8 more)` | 18 |
+| 32k | `▁lan inaden , ▁tony . ▁dagbani ▁dictionary . ▁webonary . ... (+7 more)` | 17 |
+| 64k | `▁lan inaden , ▁tony . ▁dagbani ▁dictionary . ▁webonary . ... (+7 more)` | 17 |
 ### Key Findings
+- **Best Compression:** 64k achieves 3.794x compression
+- **Lowest UNK Rate:** 8k with 0.0720% unknown tokens
 - **Trade-off:** Larger vocabularies improve compression but increase model size
 - **Recommendation:** 32k vocabulary provides optimal balance for production use
 | N-gram | Variant | Perplexity | Entropy | Unique N-grams | Top-100 Coverage | Top-1000 Coverage |
 |--------|---------|------------|---------|----------------|------------------|-------------------|
+| **2-gram** | Word | 31,979 | 14.96 | 135,270 | 12.8% | 30.3% |
+| **2-gram** | Subword | 338 🏆 | 8.40 | 6,640 | 61.2% | 98.8% |
+| **3-gram** | Word | 61,233 | 15.90 | 205,091 | 9.7% | 22.3% |
+| **3-gram** | Subword | 3,279 | 11.68 | 48,644 | 19.8% | 63.9% |
+| **4-gram** | Word | 122,791 | 16.91 | 377,150 | 8.8% | 17.3% |
+| **4-gram** | Subword | 20,666 | 14.33 | 280,804 | 9.1% | 31.2% |
+| **5-gram** | Word | 83,218 | 16.34 | 277,989 | 11.4% | 19.8% |
+| **5-gram** | Subword | 81,311 | 16.31 | 863,645 | 5.8% | 20.0% |
 ### Top 5 N-grams by Size
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `of the` | 21,162 |
+| 2 | `n ti` | 16,066 |
+| 3 | `o daa` | 10,740 |
+| 4 | `din be` | 10,157 |
+| 5 | `ka di` | 10,044 |
 **3-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `of the year` | 4,882 |
+| 2 | `n ti pahi` | 4,540 |
 | 3 | `zaŋ n ti` | 3,966 |
+| 4 | `nyɛla bɛ ni` | 3,631 |
+| 5 | `bɛ ni daa` | 3,273 |
 **4-grams (Word):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `biɛlim kalibu baŋsim bɔhimbu` | 2,948 |
+| 2 | `ninsali biɛlim kalibu baŋsim` | 2,948 |
 | 3 | `zalikpana mini gɔmnanti tali` | 2,947 |
 | 4 | `ni nyamma soya economy` | 2,945 |
 | 5 | `demographics ninsali biɛlim kalibu` | 2,944 |
+**5-grams (Word):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `ninsali biɛlim kalibu baŋsim bɔhimbu` | 2,948 |
+| 2 | `demographics ninsali biɛlim kalibu baŋsim` | 2,944 |
+| 3 | `tali law and government baŋsim` | 2,943 |
+| 4 | `gɔmnanti tali law and government` | 2,943 |
+| 5 | `mini gɔmnanti tali law and` | 2,943 |
 **2-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `a _` | 742,691 |
+| 2 | `i _` | 729,151 |
+| 3 | `n _` | 496,810 |
+| 4 | `a n` | 496,260 |
+| 5 | `, _` | 494,751 |
 **3-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `n i _` | 223,179 |
+| 2 | `_ n i` | 166,766 |
+| 3 | `l i _` | 131,067 |
+| 4 | `_ m a` | 130,487 |
+| 5 | `_ d a` | 130,222 |
 **4-grams (Subword):**
 | Rank | N-gram | Count |
 |------|--------|-------|
+| 1 | `t h e _` | 96,966 |
+| 2 | `_ n i _` | 91,865 |
+| 3 | `_ t h e` | 91,838 |
+| 4 | `_ o f _` | 86,951 |
+| 5 | `_ d a a` | 77,547 |
+**5-grams (Subword):**
+| Rank | N-gram | Count |
+|------|--------|-------|
+| 1 | `_ t h e _` | 86,257 |
+| 2 | `_ d a a _` | 73,635 |
+| 3 | `y ɛ l a _` | 50,822 |
+| 4 | `n y ɛ l a` | 50,735 |
+| 5 | `_ n y ɛ l` | 49,922 |
 ### Key Findings
 - **Best Perplexity:** 2-gram (subword) with 338
 - **Entropy Trend:** Decreases with larger n-grams (more predictable)
+- **Coverage:** Top-1000 patterns cover ~20% of corpus
 - **Recommendation:** 4-gram or 5-gram for best predictive performance
 ---
 | Context | Variant | Avg Entropy | Perplexity | Branching Factor | Unique Contexts | Predictability |
 |---------|---------|-------------|------------|------------------|-----------------|----------------|
+| **1** | Word | 0.7239 | 1.652 | 6.34 | 344,700 | 27.6% |
+| **1** | Subword | 1.1279 | 2.185 | 6.69 | 4,036 | 0.0% |
+| **2** | Word | 0.2746 | 1.210 | 1.73 | 2,184,048 | 72.5% |
+| **2** | Subword | 0.6246 | 1.542 | 4.19 | 26,994 | 37.5% |
+| **3** | Word | 0.1113 | 1.080 | 1.21 | 3,772,159 | 88.9% |
+| **3** | Subword | 0.7278 | 1.656 | 4.22 | 112,970 | 27.2% |
+| **4** | Word | 0.0540 🏆 | 1.038 | 1.09 | 4,576,663 | 94.6% |
+| **4** | Subword | 0.7217 | 1.649 | 3.38 | 476,865 | 27.8% |
 ### Generated Text Samples (Word-based)
 **Context Size 1:**
+1. `ni 146 naɣila ni bɛ 3 mini periodic teebuli maa zaa di yuuni puuni ka buɣujɛmdiba`
+2. `the title close to score after the laws ebube ordinary john brascia lucille la kasbah n`
+3. `of china art museum swarthmore fullback gene quintano screenplay by burroughsrob bridgett tina mensa...`
 **Context Size 2:**
+1. `of the treasure of pancho villa as mimi alexis puig as militar adriana russo kundiviha the film`
+2. `n ti best supporting actress go go girl m net mytv formerly astv newzroom afrika nongoma tv`
+3. `o daa pilli shɛli yuuni puuni n nyɛ toon tibo suhudoo dabsili yuuni ŋɔ churi critics lists`
 **Context Size 3:**
+1. `of the year amy grant southern gospel album of the year invade my soul by the tree chuck`
+2. `n ti pahi 503 votes ntoso daa dolila ghanas independence din daa n niŋ ka bindirigu bi niŋ`
+3. `zaŋ n ti master of medicine mmed in internal medicine since master of medicine n ti pahi princess`
 **Context Size 4:**
+1. `ninsali biɛlim kalibu baŋsim bɔhimbu bomma ni nyamma soya economy zalikpana mini gɔmnanti tali law a...`
+2. `biɛlim kalibu baŋsim bɔhimbu bomma ni nyamma soya economy zalikpana mini gɔmnanti tali law and gover...`
 3. `zalikpana mini gɔmnanti tali law and government baŋsim bɔbu education kaya ni taada lahabali churi m...`
 **Context Size 1:**
+1. `_ryɛld_baninasou`
+2. `a_y_benteso_plag`
+3. `iound_n_na_ni_er`
 **Context Size 2:**
+1. `a_bes_tuma_prishe`
+2. `i_st_a_le_rickinm`
+3. `n_naner_fation,_d`
 **Context Size 3:**
+1. `ni_daa_niŋ_maŋsim_`
+2. `_ni_sam_kyung_high`
+3. `li_ary_la_of_the_d`
 **Context Size 4:**
+1. `the_illum,_alexande`
+2. `_ni_di_rhondon_hee-`
+3. `_the_museum._frases`
 ### Key Findings
 - **Best Predictability:** Context-4 (word) with 94.6% predictability
 - **Branching Factor:** Decreases with context size (more deterministic)
+- **Memory Trade-off:** Larger contexts require more storage (476,865 contexts)
 - **Recommendation:** Context-3 or Context-4 for text generation
 ---
 | Metric | Value |
 |--------|-------|
+| Vocabulary Size | 131,415 |
+| Total Tokens | 5,756,455 |
+| Mean Frequency | 43.80 |
 | Median Frequency | 4 |
+| Frequency Std Dev | 759.26 |
 ### Most Common Words
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | ni | 104,912 |
+| 2 | the | 89,996 |
+| 3 | of | 87,067 |
+| 4 | daa | 75,848 |
+| 5 | o | 71,090 |
+| 6 | ka | 70,258 |
+| 7 | n | 52,198 |
+| 8 | nyɛla | 49,965 |
+| 9 | din | 48,314 |
+| 10 | di | 45,125 |
 ### Least Common Words (from vocabulary)
 | Rank | Word | Frequency |
 |------|------|-----------|
+| 1 | yikonim | 2 |
+| 2 | asj | 2 |
+| 3 | fiqhi | 2 |
+| 4 | sapuhi | 2 |
+| 5 | hoti | 2 |
+| 6 | breams | 2 |
+| 7 | xai | 2 |
+| 8 | coloboma | 2 |
+| 9 | ziɛ | 2 |
+| 10 | bɔɔlɔ | 2 |
 ### Zipf's Law Analysis
 | Metric | Value |
 |--------|-------|
+| Zipf Coefficient | 1.0507 |
+| R² (Goodness of Fit) | 0.994879 |
 | Adherence Quality | **excellent** |
 ### Coverage Analysis
 | Top N Words | Coverage |
 |-------------|----------|
+| Top 100 | 31.6% |
 | Top 1,000 | 58.6% |
 | Top 5,000 | 77.5% |
 | Top 10,000 | 84.5% |
 ### Key Findings
+- **Zipf Compliance:** R²=0.9949 indicates excellent adherence to Zipf's law
+- **High Frequency Dominance:** Top 100 words cover 31.6% of corpus
+- **Long Tail:** 121,415 words needed for remaining 15.5% coverage
 ---
 ## 5. Word Embeddings Evaluation
 ### 5.1 Cross-Lingual Alignment
+![Alignment Quality](visualizations/embedding_alignment_quality.png)
+![Multilingual t-SNE](visualizations/embedding_tsne_multilingual.png)
 ### 5.2 Model Comparison
 | Model | Dimension | Isotropy | Semantic Density | Alignment R@1 | Alignment R@10 |
 |-------|-----------|----------|------------------|---------------|----------------|
+| **mono_32d** | 32 | 0.7990 | 0.3615 | N/A | N/A |
+| **mono_64d** | 64 | 0.8035 | 0.2926 | N/A | N/A |
+| **mono_128d** | 128 | 0.8139 | 0.2158 | N/A | N/A |
+| **aligned_32d** | 32 | 0.7990 | 0.3542 | 0.1220 | 0.4920 |
+| **aligned_64d** | 64 | 0.8035 | 0.2751 | 0.2420 | 0.6800 |
+| **aligned_128d** | 128 | 0.8139 🏆 | 0.2184 | 0.3840 | 0.7540 |
 ### Key Findings
+- **Best Isotropy:** aligned_128d with 0.8139 (more uniform distribution)
+- **Semantic Density:** Average pairwise similarity of 0.2863. Lower values indicate better semantic separation.
+- **Alignment Quality:** Aligned models achieve up to 38.4% R@1 in cross-lingual retrieval.
 - **Recommendation:** 128d aligned for best cross-lingual performance
 ---
 ## 6.  Morphological Analysis (Experimental)
 This section presents an automated morphological analysis derived from the statistical divergence between word-level and subword-level models. By analyzing where subword predictability spikes and where word-level coverage fails, we can infer linguistic structures without supervised data.
 ### 6.1 Productivity & Complexity
 | Metric | Value | Interpretation | Recommendation |
 |--------|-------|----------------|----------------|
+| Productivity Index | **5.000** | High morphological productivity | Reliable analysis |
+| Idiomaticity Gap | **-0.010** | Low formulaic content | - |
 ### 6.2 Affix Inventory (Productive Units)
 #### Productive Prefixes
 | Prefix | Examples |
 |--------|----------|
+| `ma-` | mazzotta, malvína, manilyn |
 #### Productive Suffixes
 | Suffix | Examples |
 |--------|----------|
+| `-er` | sanger, schmucker, reefroger |
+| `-ed` | aliunited, hayekunited, affected |
+| `-an` | statestarzan, parisian, cappleman |
+| `-on` | gudnason, bronston, verdon |
 ### 6.3 Bound Stems (Lexical Roots)
 | Stem | Cohesion | Substitutability | Examples |
 |------|----------|------------------|----------|
+| `uuni` | 2.43x | 37 contexts | guuni, yuuni, duuni |
+| `ihir` | 2.32x | 42 contexts | vihir, pihiri, lihira |
+| `ison` | 2.11x | 60 contexts | isong, mison, isono |
+| `nter` | 1.90x | 69 contexts | enter, inter, unter |
+| `ctor` | 1.95x | 43 contexts | actor, sector, factor |
+| `atio` | 1.88x | 46 contexts | ratio, patio, ation |
+| `ture` | 1.79x | 54 contexts | mature, cuture, future |
+| `reen` | 1.97x | 37 contexts | reena, breen, green |
+| `tern` | 1.84x | 48 contexts | stern, terns, terna |
+| `riso` | 2.21x | 23 contexts | arison, prison, bɔriso |
+| `rect` | 2.19x | 22 contexts | recta, rector, direct |
+| `ogra` | 1.95x | 32 contexts | dogra, yograj, biograd |
 ### 6.4 Affix Compatibility (Co-occurrence)
 | Prefix | Suffix | Frequency | Examples |
 |--------|--------|-----------|----------|
+| `ma-` | `-an` | 8 words | mariaan, mailman |
+| `ma-` | `-ed` | 8 words | matched, marloweunited |
+| `ma-` | `-on` | 5 words | malnutrition, marsbyron |
+| `ma-` | `-er` | 1 word | marmer, mayweather |
 ### 6.5 Recursive Morpheme Segmentation
 | Word | Suggested Split | Confidence | Stem |
 |------|-----------------|------------|------|
+| nyankpalan | **`nyankpal-an`** | 4.5 | `nyankpal` |
+| schweiger | **`schweig-er`** | 4.5 | `schweig` |
+| cricketer | **`cricket-er`** | 4.5 | `cricket` |
+| michelson | **`michels-on`** | 4.5 | `michels` |
+| shipwrecked | **`shipwreck-ed`** | 4.5 | `shipwreck` |
+| macgruber | **`ma-cgrub-er`** | 3.0 | `cgrub` |
+| madhunandan | **`ma-dhunand-an`** | 3.0 | `dhunand` |
+| chalcedon | **`chalc-ed-on`** | 3.0 | `chalc` |
+| skycameron | **`skycam-er-on`** | 3.0 | `skycam` |
 | malnutrition | **`ma-lnutriti-on`** | 3.0 | `lnutriti` |
+| metropolitansan | **`metropolitans-an`** | 1.5 | `metropolitans` |
+| trevorunited | **`trevorunit-ed`** | 1.5 | `trevorunit` |
+| meaneyunited | **`meaneyunit-ed`** | 1.5 | `meaneyunit` |
+| cattrallunited | **`cattrallunit-ed`** | 1.5 | `cattrallunit` |
+| margherita | **`ma-rgherita`** | 1.5 | `rgherita` |
 ### 6.6 Linguistic Interpretation
 > **Automated Insight:**
+The language Dagbani shows high morphological productivity. The subword models are significantly more efficient than word models, suggesting a rich system of affixation or compounding.
 ---
 ## 7. Summary & Recommendations
 | Component | Recommended | Rationale |
 |-----------|-------------|-----------|
+| Tokenizer | **64k BPE** | Best compression (3.79x) |
 | N-gram | **2-gram** | Lowest perplexity (338) |
 | Markov | **Context-4** | Highest predictability (94.6%) |
 | Embeddings | **128d** | Balanced semantic capture and isotropy |
 ---
 *Generated by Wikilangs Models Pipeline*
+*Report Date: 2026-01-04 01:58:15*

models/embeddings/aligned/dag_128d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0ef4cab20d7d846a747c842268d9a69a885a3cc99e7887e085e5d2aa609521d7
+size 1103732268

models/embeddings/aligned/dag_128d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "dag", "dim": 128, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/dag_128d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf7db2ef0c006f0c6b132ca7dfe121320db1daaa836a9098e7b6aa91756e4a67
+size 65664

models/embeddings/aligned/dag_128d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "dag",
+  "dimension": 128,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 52551,
+  "vocab_size": 76599
+}

models/embeddings/aligned/dag_32d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3c31d67ee8ec12ecb0701edf236727a4414ead88fcd4576d237ef8745363df8
+size 276904236

models/embeddings/aligned/dag_32d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "dag", "dim": 32, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/dag_32d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f010c07a936f9a33a429afa2989044ddff3a6d5570cb7f0d311790f8a2c80cb
+size 4224

models/embeddings/aligned/dag_32d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "dag",
+  "dimension": 32,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 52551,
+  "vocab_size": 76599
+}

models/embeddings/aligned/dag_64d.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1bfa2bed5eb16db4d819df2090f0ed2020de9883792b0b746680ec495b7efdb5
+size 552513580

models/embeddings/aligned/dag_64d.meta.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"lang": "dag", "dim": 64, "max_seq_len": 512, "is_aligned": true}

models/embeddings/aligned/dag_64d.projection.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e45a8a2b4707cb35f615f38363658accb1cc41ad4eae125c36ed53d785bbdd6b
+size 16512

models/embeddings/aligned/dag_64d_metadata.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "language": "dag",
+  "dimension": 64,
+  "version": "aligned",
+  "hub_language": "en",
+  "seed_vocab_size": 52551,
+  "vocab_size": 76599
+}

models/embeddings/monolingual/dag_128d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4d00f16d9be6613f95ad245b565c957bc4b77c2ffe3c6c97ed55fc35874cbb02
-size 1103962437

 version https://git-lfs.github.com/spec/v1
+oid sha256:0ef4cab20d7d846a747c842268d9a69a885a3cc99e7887e085e5d2aa609521d7
+size 1103732268

models/embeddings/monolingual/dag_128d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 128
   },
-  "vocab_size": 76820
 }

     "encoding_method": "rope",
     "dim": 128
   },
+  "vocab_size": 76599
 }

models/embeddings/monolingual/dag_32d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7a71b8db5dcabf646b63006ac127c9431a0329bc1b838c8056ba69701f137080
-size 276964677

 version https://git-lfs.github.com/spec/v1
+oid sha256:f3c31d67ee8ec12ecb0701edf236727a4414ead88fcd4576d237ef8745363df8
+size 276904236

models/embeddings/monolingual/dag_32d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 32
   },
-  "vocab_size": 76820
 }

     "encoding_method": "rope",
     "dim": 32
   },
+  "vocab_size": 76599
 }

models/embeddings/monolingual/dag_64d.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b0cbe4ba5bff3c754108b7e2cdf941847dabea53417274bd516d6558b4d8fe15
-size 552630597

 version https://git-lfs.github.com/spec/v1
+oid sha256:1bfa2bed5eb16db4d819df2090f0ed2020de9883792b0b746680ec495b7efdb5
+size 552513580

models/embeddings/monolingual/dag_64d_metadata.json CHANGED Viewed

@@ -11,5 +11,5 @@
     "encoding_method": "rope",
     "dim": 64
   },
-  "vocab_size": 76820
 }

     "encoding_method": "rope",
     "dim": 64
   },
+  "vocab_size": 76599
 }

models/subword_markov/dag_markov_ctx1_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4dcab6aeb23a2a118f72e7f0aeae2b8870148e09f24543081b5214fa182f8de7
-size 199146

 version https://git-lfs.github.com/spec/v1
+oid sha256:a7fbbd485192387cc3345b23955b9b12dc8636d00d74dbb0f2cb93397b1e8c71
+size 198647

models/subword_markov/dag_markov_ctx1_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "subword",
   "language": "dag",
-  "unique_contexts": 4037,
-  "total_transitions": 37349732
 }

   "context_size": 1,
   "variant": "subword",
   "language": "dag",
+  "unique_contexts": 4036,
+  "total_transitions": 37304308
 }

models/subword_markov/dag_markov_ctx2_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:08f53c1fdb7f4c76ae169e25b12f81666d01dcfab8b202b8a44bfcdd2d782375
-size 968006

 version https://git-lfs.github.com/spec/v1
+oid sha256:ee153d990a13d120518a559b6dffaf5ab0ccc6c0c109c64bae3d6c45b173f2f3
+size 967998

models/subword_markov/dag_markov_ctx2_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "subword",
   "language": "dag",
-  "unique_contexts": 27009,
-  "total_transitions": 37334959
 }

   "context_size": 2,
   "variant": "subword",
   "language": "dag",
+  "unique_contexts": 26994,
+  "total_transitions": 37289457
 }

models/subword_markov/dag_markov_ctx3_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c51b7acbe6f43a51b3c918db657fb4e7e81da4d78e8c3084a0a87dad548b73a0
-size 3766626

 version https://git-lfs.github.com/spec/v1
+oid sha256:764871aecaf38761005d29748a1ab0046e433dad36bb604b1153fa687fa3bc90
+size 3755341

models/subword_markov/dag_markov_ctx3_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 3,
   "variant": "subword",
   "language": "dag",
-  "unique_contexts": 113279,
-  "total_transitions": 37320186
 }

   "context_size": 3,
   "variant": "subword",
   "language": "dag",
+  "unique_contexts": 112970,
+  "total_transitions": 37274606
 }

models/subword_markov/dag_markov_ctx4_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:282669cd7b9e381bac5380875c673efece93e4c4cd175997c0d421789b5b7d2b
-size 12359006

 version https://git-lfs.github.com/spec/v1
+oid sha256:a596ae68e137618b2ef06622af69ec5775b1bd475d969182c3f53388cd7c1707
+size 12383895

models/subword_markov/dag_markov_ctx4_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 4,
   "variant": "subword",
   "language": "dag",
-  "unique_contexts": 478359,
-  "total_transitions": 37305413
 }

   "context_size": 4,
   "variant": "subword",
   "language": "dag",
+  "unique_contexts": 476865,
+  "total_transitions": 37259755
 }

models/subword_ngram/dag_2gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:424eb991c814c6bded4d9b5b7ae5b9c29b14dbb12e16ca36447fc358ba63b9d4
-size 88328

 version https://git-lfs.github.com/spec/v1
+oid sha256:687a27c0ccf7aea7be86852b038245eb76e2a105867768b797a62501e3e0734e
+size 88101

models/subword_ngram/dag_2gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 2,
   "variant": "subword",
   "language": "dag",
-  "unique_ngrams": 6662,
-  "total_ngrams": 37349732
 }

   "n": 2,
   "variant": "subword",
   "language": "dag",
+  "unique_ngrams": 6640,
+  "total_ngrams": 37304308
 }

models/subword_ngram/dag_3gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1710397b15aa7f5e18eafea8221a048c648e298ec1ab262bbdb99a61b7eed2fb
-size 627540

 version https://git-lfs.github.com/spec/v1
+oid sha256:3bebdf7b44da58fafeee0630d7fef8ae0e6ead7b7b7e2cd6daa0851ab095da28
+size 625053

models/subword_ngram/dag_3gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 3,
   "variant": "subword",
   "language": "dag",
-  "unique_ngrams": 48860,
-  "total_ngrams": 37334959
 }

   "n": 3,
   "variant": "subword",
   "language": "dag",
+  "unique_ngrams": 48644,
+  "total_ngrams": 37289457
 }

models/subword_ngram/dag_4gram_subword.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d741ca12b56a31b6cec08e7dd10051326c0bffcc6e6f714ba6dbb0a748fc2af4
-size 3149822

 version https://git-lfs.github.com/spec/v1
+oid sha256:f98de4938612ef44476e7365374df5e8a440f60615d7f92d021ace73736c78c6
+size 3132988

models/subword_ngram/dag_4gram_subword_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "n": 4,
   "variant": "subword",
   "language": "dag",
-  "unique_ngrams": 281639,
-  "total_ngrams": 37320186
 }

   "n": 4,
   "variant": "subword",
   "language": "dag",
+  "unique_ngrams": 280804,
+  "total_ngrams": 37274606
 }

models/subword_ngram/dag_5gram_subword.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f049b9afeadacd99e7d205d817ec9fdaeac5a2db96086f8220771bc221a0c82c
+size 9856970

models/subword_ngram/dag_5gram_subword_metadata.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "n": 5,
+  "variant": "subword",
+  "language": "dag",
+  "unique_ngrams": 863645,
+  "total_ngrams": 37259755
+}

models/tokenizer/dag_tokenizer_16k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5ec34742853f7f09d3c027bd4c59d25d4f51a22ea4ec1a56a84487038d0a3c7c
-size 501847

 version https://git-lfs.github.com/spec/v1
+oid sha256:cc8a3b681e49ce814d3689ac74ef572658a6f79b9fa097d09835b3dcb485e4c6
+size 501712

models/tokenizer/dag_tokenizer_16k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/dag_tokenizer_32k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c741184d829058c5b71ec3219c290dd8af79ec0d95c96d7e85772dbcb15368e6
-size 767801

 version https://git-lfs.github.com/spec/v1
+oid sha256:946b778a66a4611a099f165078357f3e114b20b4fdeab46c250ac3bae44ab50d
+size 767956

models/tokenizer/dag_tokenizer_32k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/dag_tokenizer_64k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:816c70ef2066989f637164f982242822307293ed74ca5704893e8d0e90d112c5
-size 1308307

 version https://git-lfs.github.com/spec/v1
+oid sha256:b53e42b09ac7b4dc8418d691a251621be26bceb091fb781053c8765717ad0fd4
+size 1308481

models/tokenizer/dag_tokenizer_64k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/tokenizer/dag_tokenizer_8k.model CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4eb6d54a65133acd4ad994c022d72028d652288dbb0a9cdbb3aaf9bbb3d462aa
-size 370487

 version https://git-lfs.github.com/spec/v1
+oid sha256:95b33bd56135c430453a278c280788d97a57e06cac40cd397f0c178a7e007bec
+size 370600

models/tokenizer/dag_tokenizer_8k.vocab CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/vocabulary/dag_vocabulary.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:68dddcb0559f6c5317590e5b88cf3a16779cc57890a35aaac09a1125ef4e9d2f
-size 2215170

 version https://git-lfs.github.com/spec/v1
+oid sha256:d583405fb087bb40e47c826c82db7c359570749d6f3924b4da11fe957cfd802e
+size 2212190

models/vocabulary/dag_vocabulary_metadata.json CHANGED Viewed

@@ -1,17 +1,17 @@
 {
   "language": "dag",
-  "vocabulary_size": 131668,
   "variant": "full",
   "statistics": {
-    "type_token_ratio": 0.05775705051748038,
     "coverage": {
-      "top_100": 0.3041086439325898,
-      "top_1000": 0.564692496107641,
-      "top_5000": 0.7471635406725152,
-      "top_10000": 0.8147734230297098
     },
-    "hapax_count": 213403,
-    "hapax_ratio": 0.6184321487462,
-    "total_documents": 14773
   }
 }

 {
   "language": "dag",
+  "vocabulary_size": 131415,
   "variant": "full",
   "statistics": {
+    "type_token_ratio": 0.057754466463332925,
     "coverage": {
+      "top_100": 0.3048215491779992,
+      "top_1000": 0.565483002513977,
+      "top_5000": 0.7476411364891159,
+      "top_10000": 0.8151446340796646
     },
+    "hapax_count": 213369,
+    "hapax_ratio": 0.6188483224279548,
+    "total_documents": 14851
   }
 }

models/word_markov/dag_markov_ctx1_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f0e0f6a3232922b668c546d7e25e65d33a440e4451d4066dfab8999e9c34c44e
-size 19122863

 version https://git-lfs.github.com/spec/v1
+oid sha256:d691165089cd63a640e66e6f8e7d294d3252176dd444dd6b6890706b69fdbca2
+size 19084487

models/word_markov/dag_markov_ctx1_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 1,
   "variant": "word",
   "language": "dag",
-  "unique_contexts": 344988,
-  "total_transitions": 5959753
 }

   "context_size": 1,
   "variant": "word",
   "language": "dag",
+  "unique_contexts": 344700,
+  "total_transitions": 5954973
 }

models/word_markov/dag_markov_ctx2_word.parquet CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:57303e729f982b046fc9eb4582b38507579236a925f3ed865a2128ee0d272a33
-size 44774701

 version https://git-lfs.github.com/spec/v1
+oid sha256:a2421bb60ac7e149c714abf55e1fd8b203d6fec01a06867731630c612eed47fe
+size 44614824

models/word_markov/dag_markov_ctx2_word_metadata.json CHANGED Viewed

@@ -2,6 +2,6 @@
   "context_size": 2,
   "variant": "word",
   "language": "dag",
-  "unique_contexts": 2189455,
-  "total_transitions": 5944980
 }

   "context_size": 2,
   "variant": "word",
   "language": "dag",
+  "unique_contexts": 2184048,
+  "total_transitions": 5940122
 }