diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..291f62bc2f4728f8016ad17ad14264621981f304 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +visualizations/embedding_similarity.png filter=lfs diff=lfs merge=lfs -text +visualizations/embedding_tsne_multilingual.png filter=lfs diff=lfs merge=lfs -text +visualizations/performance_dashboard.png filter=lfs diff=lfs merge=lfs -text +visualizations/position_encoding_comparison.png filter=lfs diff=lfs merge=lfs -text +visualizations/tsne_sentences.png filter=lfs diff=lfs merge=lfs -text +visualizations/tsne_words.png filter=lfs diff=lfs merge=lfs -text +visualizations/zipf_law.png filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..05742d203d2e2051ba3a42574ee670350b0591b6 --- /dev/null +++ b/README.md @@ -0,0 +1,763 @@ +--- +language: rsk +language_name: Unknown language [rsk] +language_family: slavic_south +tags: + - wikilangs + - nlp + - tokenizer + - embeddings + - n-gram + - markov + - wikipedia + - feature-extraction + - sentence-similarity + - tokenization + - n-grams + - markov-chain + - text-mining + - fasttext + - babelvec + - vocabulous + - vocabulary + - monolingual + - family-slavic_south +license: mit +library_name: wikilangs +pipeline_tag: text-generation +datasets: + - omarkamali/wikipedia-monthly +dataset_info: + name: wikipedia-monthly + description: Monthly snapshots of Wikipedia articles across 300+ languages +metrics: + - name: best_compression_ratio + type: compression + value: 4.008 + - name: best_isotropy + type: isotropy + value: 0.8518 + - name: vocabulary_size + type: vocab + value: 0 +generated: 2026-01-10 +--- + +# Unknown language [rsk] - Wikilangs Models +## 
Comprehensive Research Report & Full Ablation Study + +This repository contains NLP models trained and evaluated by Wikilangs, specifically on **Pannonian Rusyn [rsk]** Wikipedia data. +We analyze tokenizers, n-gram models, Markov chains, vocabulary statistics, and word embeddings. + +## 📋 Repository Contents + +### Models & Assets + +- Tokenizers (8k, 16k, 32k, 64k) +- N-gram models (2-, 3-, 4-, and 5-gram) +- Markov chains (context of 1, 2, 3, 4 and 5) +- Subword n-gram models and Markov chains +- Embeddings in various sizes and dimensions (aligned and unaligned) +- Language Vocabulary +- Language Statistics + +![Performance Dashboard](visualizations/performance_dashboard.png) + +### Analysis and Evaluation + +- [1. Tokenizer Evaluation](#1-tokenizer-evaluation) +- [2. N-gram Model Evaluation](#2-n-gram-model-evaluation) +- [3. Markov Chain Evaluation](#3-markov-chain-evaluation) +- [4. Vocabulary Analysis](#4-vocabulary-analysis) +- [5. Word Embeddings Evaluation](#5-word-embeddings-evaluation) +- [6. Morphological Analysis (Experimental)](#6--morphological-analysis-experimental) +- [7. Summary & Recommendations](#7-summary--recommendations) +- [Metrics Glossary](#appendix-metrics-glossary--interpretation-guide) +- [Visualizations Index](#visualizations-index) + +--- +## 1. 
Tokenizer Evaluation + +![Tokenizer Compression](visualizations/tokenizer_compression.png) + +![Tokenizer Fertility](visualizations/tokenizer_fertility.png) + +![Tokenizer OOV](visualizations/tokenizer_oov.png) + +![Total Tokens](visualizations/tokenizer_total_tokens.png) + +### Results + +| Vocab Size | Compression | Avg Token Len | UNK Rate | Total Tokens | +|------------|-------------|---------------|----------|--------------| +| **8k** | 3.410x | 3.41 | 0.1603% | 1,061,780 | +| **16k** | 3.743x | 3.74 | 0.1760% | 967,123 | +| **32k** | 4.008x 🏆 | 4.01 | 0.1884% | 903,354 | + +### Tokenization Examples + +Below are sample sentences tokenized with each vocabulary size: + +**Sample 1:** `Митра (вецейзначна одреднїца) Митра (церковне швето) Митра (владикова коруна)` + +| Vocab | Tokens | Count | +|-------|--------|-------| +| 8k | `▁митра ▁( вецейзначна ▁одреднїца ) ▁митра ▁( цер ков не ... (+9 more)` | 19 | +| 16k | `▁митра ▁( вецейзначна ▁одреднїца ) ▁митра ▁( цер ков не ... (+8 more)` | 18 | +| 32k | `▁митра ▁( вецейзначна ▁одреднїца ) ▁митра ▁( церковне ▁швето ) ... (+5 more)` | 15 | + +**Sample 2:** `