feat: simplified mon tokenizer in hf format
- .gitignore +10 -0
- .python-version +1 -0
- README.md +38 -0
- convert_to_hf.py +258 -0
- generation_config.json +9 -0
- mon_tokenizer.meta.json +728 -0
- mon_tokenizer.model +3 -0
- pyproject.toml +42 -0
- special_tokens_map.json +30 -0
- test_tokenizer.py +108 -0
- tokenizer_config.json +19 -0
- upload_to_hub.py +128 -0
- uv.lock +0 -0
.gitignore
ADDED
@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
.python-version
ADDED
@@ -0,0 +1 @@
+3.11
README.md
ADDED
@@ -0,0 +1,38 @@
+---
+language:
+- mon
+library_name: transformers
+license: mit
+tags:
+- tokenizer
+- mon
+- myanmar
+- sentencepiece
+---
+
+# mon language tokenizer
+
+sentencepiece tokenizer for the mon language with a 4,000-token vocabulary.
+
+## usage
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
+
+text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
+tokens = tokenizer(text, return_tensors="pt")
+decoded = tokenizer.decode(tokens["input_ids"][0])
+```
+
+## details
+
+- vocabulary size: 4,000
+- algorithm: sentencepiece
+- model type: unigram
+- special tokens: `<s>`, `</s>`, `<unk>`, `<pad>`
+
+## training data
+
+trained on a mon language corpus including wikipedia articles, news, and books.
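note: the `<pad>` token listed under details also enables batched encoding; a minimal sketch (not part of this commit, assuming the same repo id as the usage example):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")

# pad the shorter text up to the longest one in the batch
batch = tokenizer(
    ["ဘာသာမန်", "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape)    # (2, longest sequence length)
print(batch["attention_mask"][0])  # zeros mark the padded positions
```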
convert_to_hf.py
ADDED
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+
+"""
+convert mon sentencepiece tokenizer to hugging face format
+creates required config files for transformers library
+"""
+
+import json
+import shutil
+import os
+from pathlib import Path
+from typing import Dict, Any
+import sentencepiece as spm
+
+
+def load_metadata(meta_file: str = "mon_tokenizer.meta.json") -> Dict[str, Any]:
+    """load tokenizer metadata"""
+    print(f"loading metadata from {meta_file}")
+
+    if not os.path.exists(meta_file):
+        print(f"warning: metadata file not found: {meta_file}")
+        return {}
+
+    with open(meta_file, 'r', encoding='utf-8') as f:
+        metadata = json.load(f)
+
+    print(f"loaded metadata - vocab size: {metadata.get('vocab_size', 'unknown')}")
+    return metadata
+
+
+def analyze_model(model_file: str = "mon_tokenizer.model") -> Dict[str, Any]:
+    """analyze sentencepiece model"""
+    print(f"analyzing model: {model_file}")
+
+    if not os.path.exists(model_file):
+        raise FileNotFoundError(f"model file not found: {model_file}")
+
+    sp = spm.SentencePieceProcessor()
+    sp.load(model_file)
+
+    vocab_size = sp.get_piece_size()
+    bos_id = sp.bos_id()
+    eos_id = sp.eos_id()
+    unk_id = sp.unk_id()
+    pad_id = sp.pad_id() if sp.pad_id() != -1 else vocab_size
+
+    analysis = {
+        "vocab_size": vocab_size,
+        "bos_token": sp.id_to_piece(bos_id) if bos_id != -1 else "<s>",
+        "eos_token": sp.id_to_piece(eos_id) if eos_id != -1 else "</s>",
+        "unk_token": sp.id_to_piece(unk_id) if unk_id != -1 else "<unk>",
+        "pad_token": "<pad>",
+        "bos_token_id": bos_id if bos_id != -1 else 1,
+        "eos_token_id": eos_id if eos_id != -1 else 2,
+        "unk_token_id": unk_id if unk_id != -1 else 0,
+        "pad_token_id": pad_id
+    }
+
+    print(f"analysis complete - vocab: {vocab_size}")
+    return analysis
+
+
+def create_tokenizer_config(analysis: Dict[str, Any]) -> Dict[str, Any]:
+    """create tokenizer_config.json"""
+    return {
+        "model_type": "llama",
+        "tokenizer_class": "LlamaTokenizer",
+        "vocab_file": "mon_tokenizer.model",
+        "vocab_size": analysis["vocab_size"],
+        "bos_token": analysis["bos_token"],
+        "eos_token": analysis["eos_token"],
+        "unk_token": analysis["unk_token"],
+        "pad_token": analysis["pad_token"],
+        "bos_token_id": analysis["bos_token_id"],
+        "eos_token_id": analysis["eos_token_id"],
+        "unk_token_id": analysis["unk_token_id"],
+        "pad_token_id": analysis["pad_token_id"],
+        "clean_up_tokenization_spaces": False,
+        "sp_model_kwargs": {},
+        "add_bos_token": True,
+        "add_eos_token": False,
+        "model_max_length": 2048
+    }
+
+
+def create_special_tokens_map(analysis: Dict[str, Any]) -> Dict[str, Any]:
+    """create special_tokens_map.json"""
+    return {
+        "bos_token": {
+            "content": analysis["bos_token"],
+            "lstrip": False,
+            "normalized": False,
+            "rstrip": False,
+            "single_word": False
+        },
+        "eos_token": {
+            "content": analysis["eos_token"],
+            "lstrip": False,
+            "normalized": False,
+            "rstrip": False,
+            "single_word": False
+        },
+        "pad_token": {
+            "content": analysis["pad_token"],
+            "lstrip": False,
+            "normalized": False,
+            "rstrip": False,
+            "single_word": False
+        },
+        "unk_token": {
+            "content": analysis["unk_token"],
+            "lstrip": False,
+            "normalized": False,
+            "rstrip": False,
+            "single_word": False
+        }
+    }
+
+
+def create_generation_config() -> Dict[str, Any]:
+    """create generation_config.json"""
+    return {
+        "bos_token_id": 1,
+        "eos_token_id": 2,
+        "pad_token_id": 4000,
+        "do_sample": True,
+        "max_length": 2048,
+        "temperature": 0.8,
+        "top_p": 0.9
+    }
+
+
+def create_readme(analysis: Dict[str, Any], metadata: Dict[str, Any]) -> str:
+    """create readme model card"""
+    return f"""---
+language:
+- mon
+library_name: transformers
+license: mit
+tags:
+- tokenizer
+- mon
+- myanmar
+- sentencepiece
+---
+
+# mon language tokenizer
+
+sentencepiece tokenizer for the mon language with a {analysis["vocab_size"]:,}-token vocabulary.
+
+## usage
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("janakhpon/mon_tokenizer")
+
+text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
+tokens = tokenizer(text, return_tensors="pt")
+decoded = tokenizer.decode(tokens["input_ids"][0])
+```
+
+## details
+
+- vocabulary size: {analysis["vocab_size"]:,}
+- algorithm: sentencepiece
+- model type: unigram
+- special tokens: {analysis["bos_token"]}, {analysis["eos_token"]}, {analysis["unk_token"]}, {analysis["pad_token"]}
+
+## training data
+
+trained on a mon language corpus including wikipedia articles, news, and books.
+"""
+
+
+def create_gitattributes() -> str:
+    """create .gitattributes for git lfs"""
+    return "mon_tokenizer.model filter=lfs diff=lfs merge=lfs -text\n"
+
+
+def test_tokenizer(output_dir: str) -> bool:
+    """test converted tokenizer"""
+    print("testing tokenizer")
+
+    try:
+        from transformers import AutoTokenizer
+
+        tokenizer = AutoTokenizer.from_pretrained(output_dir)
+        test_text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
+
+        tokens = tokenizer(test_text, return_tensors="pt")
+        decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
+
+        print(f"test passed - vocab: {tokenizer.vocab_size:,}")
+        return test_text == decoded
+
+    except Exception as e:
+        print(f"test failed: {e}")
+        return False
+
+
+def convert_to_huggingface(
+    input_model: str = "mon_tokenizer.model",
+    input_meta: str = "mon_tokenizer.meta.json",
+    output_dir: str = "."
+):
+    """convert mon tokenizer to hugging face format"""
+
+    print("converting mon tokenizer to hugging face format")
+
+    # create output directory
+    output_path = Path(output_dir)
+    output_path.mkdir(exist_ok=True)
+
+    # load metadata and analyze model
+    metadata = load_metadata(input_meta)
+    analysis = analyze_model(input_model)
+
+    # copy model file if needed
+    model_dest = output_path / "mon_tokenizer.model"
+    if not model_dest.exists() or model_dest.resolve() != Path(input_model).resolve():
+        print("copying model file")
+        shutil.copy2(input_model, model_dest)
+    else:
+        print("model file already in place")
+
+    # create config files
+    print("creating config files")
+
+    configs = {
+        "tokenizer_config.json": create_tokenizer_config(analysis),
+        "special_tokens_map.json": create_special_tokens_map(analysis),
+        "generation_config.json": create_generation_config()
+    }
+
+    for filename, config in configs.items():
+        with open(output_path / filename, 'w') as f:
+            json.dump(config, f, indent=2)
+        print(f"created {filename}")
+
+    # create readme and gitattributes
+    with open(output_path / "README.md", 'w', encoding='utf-8') as f:
+        f.write(create_readme(analysis, metadata))
+    print("created README.md")
+
+    with open(output_path / ".gitattributes", 'w') as f:
+        f.write(create_gitattributes())
+    print("created .gitattributes")
+
+    # test
+    success = test_tokenizer(str(output_path))
+    print(f"conversion {'successful' if success else 'completed with warnings'}")
+
+    return success
+
+
+if __name__ == "__main__":
+    convert_to_huggingface()
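note: the fallback ids that `analyze_model` writes (1/2/0, plus `vocab_size` for `<pad>`) can be checked directly against the sentencepiece model; a quick sketch (not part of this commit):

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("mon_tokenizer.model")

# -1 means the model defines no such token, so the converter's fallback applies
print(sp.get_piece_size())                    # 4000
print(sp.bos_id(), sp.eos_id(), sp.unk_id())  # expected 1, 2, 0 per the generated configs
print(sp.pad_id())                            # expected -1 -> pad id becomes vocab_size (4000)
```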
generation_config.json
ADDED
@@ -0,0 +1,9 @@
+{
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 4000,
+  "do_sample": true,
+  "max_length": 2048,
+  "temperature": 0.8,
+  "top_p": 0.9
+}
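note: transformers picks this file up automatically when a model is loaded from the repo; it can also be inspected on its own — a sketch (not part of this commit, run from the repo directory):

```python
from transformers import GenerationConfig

# reads generation_config.json from the current directory
gen_cfg = GenerationConfig.from_pretrained(".")
print(gen_cfg.temperature, gen_cfg.top_p, gen_cfg.do_sample)  # 0.8 0.9 True
```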
mon_tokenizer.meta.json
ADDED
@@ -0,0 +1,728 @@
+{
+  "model_path": "mon_tokenizer.model",
+  "vocab_path": "mon_tokenizer.vocab",
+  "lines_trained": 32412,
+  "total_characters": 2453293,
+  "model_type": "unigram",
+  "vocab_size": 4000,
+  "original_vocab_size": 4000,
+  "character_coverage": 0.9995,
+  "byte_fallback": true,
+  "user_defined_symbols": [
+    "<mask>",
+    "<sep>",
+    "<cls>"
+  ],
+  "evaluation": {
+    "သ္ဂံသ္ဂံပါ။ ကျာ်တြဲ ပရိတ်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
+      "num_pieces": 24,
+      "pieces": [
+        "▁",
+        "သ္",
+        "ဂ",
+        "ံ",
+        "သ္",
+        "ဂ",
+        "ံ",
+        "ပါ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>",
+        "▁",
+        "ကျာ်တြဲ",
+        "▁",
+        "ပရိ",
+        "တ်",
+        "တံဂှ်",
+        "▁",
+        "ကၠောန်",
+        "ဗဒှ်",
+        "လဝ်ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        610,
+        324,
+        381,
+        610,
+        324,
+        381,
+        495,
+        231,
+        135,
+        145,
+        262,
+        1733,
+        262,
+        2158,
+        339,
+        1148,
+        262,
+        286,
+        726,
+        1097,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 1.9166666666666667
+    },
+    "ဒေါံဏံ ဍာ်မိုဟ် ကြဴကြဴဏောၚ်။": {
+      "num_pieces": 14,
+      "pieces": [
+        "▁",
+        "ဒေါ",
+        "ံ",
+        "ဏံ",
+        "▁ဍာ်",
+        "မ",
+        "ိုဟ်",
+        "▁",
+        "ကြဴ",
+        "ကြဴ",
+        "ဏောၚ်",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        1865,
+        381,
+        596,
+        1178,
+        272,
+        1255,
+        262,
+        1752,
+        1752,
+        2484,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 2.0
+    },
+    "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
+      "num_pieces": 12,
+      "pieces": [
+        "▁",
+        "ဘာသာမန်",
+        "▁",
+        "ပရူပရာ",
+        "တံဂှ်",
+        "▁",
+        "ကၠောန်",
+        "ဗဒှ်",
+        "လဝ်ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        1179,
+        262,
+        3651,
+        1148,
+        262,
+        286,
+        726,
+        1097,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 2.9166666666666665
+    },
+    "ဘာသာအင်္ဂလိက် ကဵု ဘာသာမန် နွံပၟိက်ရ။": {
+      "num_pieces": 11,
+      "pieces": [
+        "▁",
+        "ဘာသာအင်္ဂလိက်",
+        "▁ကဵု",
+        "▁",
+        "ဘာသာမန်",
+        "▁",
+        "နွံပၟိက်",
+        "ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        1970,
+        387,
+        262,
+        1179,
+        262,
+        1205,
+        264,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 3.272727272727273
+    },
+    "သၞာံ ၂၀၂၄ ဂိတုဇန္နဝါရဳ ၁၅ မံက်": {
+      "num_pieces": 10,
+      "pieces": [
+        "▁သၞာံ",
+        "▁၂၀၂၄",
+        "▁ဂိတု",
+        "ဇ",
+        "န္န",
+        "ဝါ",
+        "ရဳ",
+        "▁၁၅",
+        "▁",
+        "မံက်"
+      ],
+      "ids_head": [
+        287,
+        2730,
+        732,
+        384,
+        2733,
+        463,
+        1248,
+        1059,
+        262,
+        967
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 3.0
+    },
+    "ၚၛၜၝၞၟၠ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
+      "num_pieces": 20,
+      "pieces": [
+        "▁",
+        "ၚ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x9B>",
+        "ၜ",
+        "ၝ",
+        "ၞ",
+        "ၟ",
+        "ၠ",
+        "▁",
+        "မန်",
+        "တံဂှ်",
+        "▁",
+        "ကၠောန်",
+        "ဗဒှ်",
+        "လဝ်ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        1062,
+        231,
+        135,
+        161,
+        844,
+        1937,
+        554,
+        3999,
+        922,
+        262,
+        294,
+        1148,
+        262,
+        286,
+        726,
+        1097,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 1.6
+    },
+    "ဨဩဪဥဦဧ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
+      "num_pieces": 23,
+      "pieces": [
+        "▁",
+        "ဨ",
+        "<0xE1>",
+        "<0x80>",
+        "<0xA9>",
+        "<0xE1>",
+        "<0x80>",
+        "<0xAA>",
+        "ဥ",
+        "ဦ",
+        "<0xE1>",
+        "<0x80>",
+        "<0xA7>",
+        "▁",
+        "မန်",
+        "တံဂှ်",
+        "▁",
+        "ကၠောန်",
+        "ဗဒှ်",
+        "လဝ်ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        1052,
+        231,
+        134,
+        175,
+        231,
+        134,
+        176,
+        1157,
+        3995,
+        231,
+        134,
+        173,
+        262,
+        294,
+        1148,
+        262,
+        286,
+        726,
+        1097,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 1.3478260869565217
+    },
+    "ါာူးေိီဲံ်္ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
+      "num_pieces": 22,
+      "pieces": [
+        "▁",
+        "ါ",
+        "ာ",
+        "ူ",
+        "း",
+        "ေ",
+        "ိ",
+        "ီ",
+        "ဲ",
+        "ံ",
+        "်",
+        "္",
+        "▁",
+        "မန်",
+        "တံဂှ်",
+        "▁",
+        "ကၠောန်",
+        "ဗဒှ်",
+        "လဝ်ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        580,
+        328,
+        634,
+        304,
+        445,
+        478,
+        649,
+        340,
+        381,
+        276,
+        483,
+        262,
+        294,
+        1148,
+        262,
+        286,
+        726,
+        1097,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 1.6363636363636365
+    },
+    "ျြွှဿ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
+      "num_pieces": 16,
+      "pieces": [
+        "▁",
+        "ျ",
+        "ြ",
+        "ွ",
+        "ှ",
+        "ဿ",
+        "▁",
+        "မန်",
+        "တံဂှ်",
+        "▁",
+        "ကၠောန်",
+        "ဗဒှ်",
+        "လဝ်ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        2040,
+        2674,
+        738,
+        753,
+        1251,
+        262,
+        294,
+        1148,
+        262,
+        286,
+        726,
+        1097,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 1.875
+    },
+    "မန်တံဂှ်၊ ကၠောန်ဗဒှ်လဝ်ရ။ ပရူပရာတံဂှ်၌ နွံပၟိက်ရ။": {
+      "num_pieces": 23,
+      "pieces": [
+        "▁",
+        "မန်",
+        "တံဂှ်",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8A>",
+        "▁",
+        "ကၠောန်",
+        "ဗဒှ်",
+        "လဝ်ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>",
+        "▁",
+        "ပရူပရာ",
+        "တံဂှ်",
+        "၌",
+        "▁",
+        "နွံပၟိက်",
+        "ရ",
+        "<0xE1>",
+        "<0x81>",
+        "<0x8B>"
+      ],
+      "ids_head": [
+        262,
+        294,
+        1148,
+        231,
+        135,
+        144,
+        262,
+        286,
+        726,
+        1097,
+        231,
+        135,
+        145,
+        262,
+        3651,
+        1148,
+        3430,
+        262,
+        1205,
+        264,
+        231,
+        135,
+        145
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 2.130434782608696
+    },
+    "သၞာံ ၂၀၂၄ ÷ ၄ = ၅၀၆ × ၁၀ = ၅၀၆၀": {
+      "num_pieces": 18,
+      "pieces": [
+        "▁သၞာံ",
+        "▁၂၀၂၄",
+        "▁",
+        "<0xC3>",
+        "<0xB7>",
+        "▁၄",
+        "▁=",
+        "▁",
+        "၅၀",
+        "၆",
+        "▁",
+        "<0xC3>",
+        "<0x97>",
+        "▁၁၀",
+        "▁=",
+        "▁",
+        "၅၀",
+        "၆၀"
+      ],
+      "ids_head": [
+        287,
+        2730,
+        262,
+        201,
+        189,
+        705,
+        533,
+        262,
+        1287,
+        936,
+        262,
+        201,
+        157,
+        782,
+        533,
+        262,
+        1287,
+        1812
+      ],
+      "round_trip_ok": true,
+      "compression_ratio": 1.7222222222222223
+    },
+    "_stats": {
+      "avg_compression_ratio": 1.9896373056994818,
+      "round_trip_accuracy": 1.0,
+      "total_samples": 11,
+      "vocab_size": 4000
+    }
+  },
+  "character_analysis": {
+    "total_chars": 2453293,
+    "mon_chars": 1907807,
+    "unique_mon_chars": 94,
+    "mon_char_ratio": 0.7776515075859264,
+    "categories": {
+      "base_consonants": [
+        "က",
+        "ခ",
+        "ဂ",
+        "ဃ",
+        "င",
+        "စ",
+        "ဆ",
+        "ဇ",
+        "ဉ",
+        "ည",
+        "ဋ",
+        "ဌ",
+        "ဍ",
+        "ဎ",
+        "ဏ",
+        "တ",
+        "ထ",
+        "ဒ",
+        "ဓ",
+        "န",
+        "ပ",
+        "ဖ",
+        "ဗ",
+        "ဘ",
+        "မ",
+        "ယ",
+        "ရ",
+        "လ",
+        "ဝ",
+        "သ",
+        "ဟ",
+        "ဠ",
+        "အ"
+      ],
+      "extended_mon": [
+        "ၚ",
+        "ၛ",
+        "ၜ",
+        "ၝ",
+        "ၞ",
+        "ၟ",
+        "ၠ"
+      ],
+      "extended_vowels": [
+        "ဥ",
+        "ဦ",
+        "ဧ",
+        "ဨ",
+        "ဩ"
+      ],
+      "vowel_signs": [
+        "ါ",
+        "ာ",
+        "ိ",
+        "ီ",
+        "ူ",
+        "ေ",
+        "ဲ",
+        "ံ",
+        "း",
+        "္",
+        "်"
+      ],
+      "media_chars": [
+        "ျ",
+        "ြ",
+        "ွ",
+        "ှ"
+      ],
+      "punctuation": [
+        "၌",
+        "၏"
+      ],
+      "mathematical": [
+        "=",
+        "×"
+      ],
+      "other": [
+        "ဣ",
+        "ဤ",
+        "ု",
+        "ဳ",
+        "ဴ",
+        "ဵ",
+        "့",
+        "ဿ",
+        "၀",
+        "၁",
+        "၂",
+        "၃",
+        "၄",
+        "၅",
+        "၆",
+        "၇",
+        "၈",
+        "၉",
+        "ၐ",
+        "ၑ",
+        "ၢ",
+        "ၤ",
+        "ႄ",
+        "ႅ",
+        "ႆ",
+        "ႇ",
+        "ႈ",
+        "႓",
+        "႕",
+        "ႝ"
+      ]
+    },
+    "all_found_chars": [
+      "=",
+      "×",
+      "က",
+      "ခ",
+      "ဂ",
+      "ဃ",
+      "င",
+      "စ",
+      "ဆ",
+      "ဇ",
+      "ဉ",
+      "ည",
+      "ဋ",
+      "ဌ",
+      "ဍ",
+      "ဎ",
+      "ဏ",
+      "တ",
+      "ထ",
+      "ဒ",
+      "ဓ",
+      "န",
+      "ပ",
+      "ဖ",
+      "ဗ",
+      "ဘ",
+      "မ",
+      "ယ",
+      "ရ",
+      "လ",
+      "ဝ",
+      "သ",
+      "ဟ",
+      "ဠ",
+      "အ",
+      "ဣ",
+      "ဤ",
+      "ဥ",
+      "ဦ",
+      "ဧ",
+      "ဨ",
+      "ဩ",
+      "ါ",
+      "ာ",
+      "ိ",
+      "ီ",
+      "ု",
+      "ူ",
+      "ေ",
+      "ဲ",
+      "ဳ",
+      "ဴ",
+      "ဵ",
+      "ံ",
+      "့",
+      "း",
+      "္",
+      "်",
+      "ျ",
+      "ြ",
+      "ွ",
+      "ှ",
+      "ဿ",
+      "၀",
+      "၁",
+      "၂",
+      "၃",
+      "၄",
+      "၅",
+      "၆",
+      "၇",
+      "၈",
+      "၉",
+      "၌",
+      "၏",
+      "ၐ",
+      "ၑ",
+      "ၚ",
+      "ၛ",
+      "ၜ",
+      "ၝ",
+      "ၞ",
+      "ၟ",
+      "ၠ",
+      "ၢ",
+      "ၤ",
+      "ႄ",
+      "ႅ",
+      "ႆ",
+      "ႇ",
+      "ႈ",
+      "႓",
+      "႕",
+      "ႝ"
+    ]
+  },
+  "resource_limits": {
+    "max_cpu_percent": 90,
+    "max_memory_percent": 85,
+    "max_disk_percent": 90
+  }
+}
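note: the per-sample numbers above appear to follow compression_ratio = len(text) / num_pieces (e.g. 35 characters / 12 pieces ≈ 2.92 for the third sample); a sketch reproducing them with sentencepiece (not part of this commit):

```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("mon_tokenizer.model")

text = "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။"
pieces = sp.encode(text, out_type=str)

print(len(pieces))                          # 12, matching "num_pieces"
print(len(text) / len(pieces))              # ~2.9167, matching "compression_ratio"
print(sp.decode(sp.encode(text)) == text)   # "round_trip_ok": byte fallback covers ။
```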
mon_tokenizer.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a0b3e772c4f414d2540c3f68474d14b037ec00f8e5ac9bce637938d7e82998d3
+size 338422
pyproject.toml
ADDED
@@ -0,0 +1,42 @@
+[project]
+name = "mon-tokenizer-hf"
+version = "1.0.0"
+description = "mon language tokenizer for hugging face transformers"
+readme = "README.md"
+requires-python = ">=3.8.1"
+license = {text = "MIT"}
+authors = [
+    {name = "Mon Language Project", email = "contact@example.com"}
+]
+keywords = ["tokenizer", "mon", "myanmar", "nlp", "huggingface", "sentencepiece"]
+
+dependencies = [
+    "transformers>=4.30.0",
+    "torch>=1.12.0",
+    "sentencepiece>=0.1.99",
+    "huggingface_hub>=0.15.0",
+    "protobuf>=3.20.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+    "black>=23.0.0",
+    "isort>=5.12.0",
+]
+
+[project.urls]
+Homepage = "https://github.com/yourusername/mon-tokenizer-hf"
+Repository = "https://github.com/yourusername/mon-tokenizer-hf"
+Documentation = "https://github.com/yourusername/mon-tokenizer-hf#readme"
+"Bug Tracker" = "https://github.com/yourusername/mon-tokenizer-hf/issues"
+"Hugging Face" = "https://huggingface.co/janakhpon/mon_tokenizer"
+
+[tool.black]
+line-length = 88
+target-version = ['py38']
+include = '\.pyi?$'
+
+[tool.isort]
+profile = "black"
+multi_line_output = 3
special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
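note: once loaded, these entries surface as the tokenizer's special-token attributes; a quick check (a sketch, not part of this commit, run from the repo directory):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.bos_token_id)  # <s> 1
print(tok.eos_token, tok.eos_token_id)  # </s> 2
print(tok.unk_token, tok.unk_token_id)  # <unk> 0
print(tok.pad_token, tok.pad_token_id)  # <pad> 4000 per tokenizer_config.json
```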
test_tokenizer.py
ADDED
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+
+"""
+test mon tokenizer hugging face integration
+"""
+
+import torch
+from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config
+
+
+def test_tokenizer():
+    """test tokenizer loading and basic functionality"""
+    print("testing mon tokenizer")
+
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(".")
+        print(f"tokenizer loaded - vocab: {tokenizer.vocab_size:,}")
+
+        # test tokenization
+        test_texts = [
+            "ဘာသာမန်",
+            "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။",
+            "မန်တံဂှ် မံင်ပ္ဍဲ ရးမန် ကဵု ရးသေံ။"
+        ]
+
+        for text in test_texts:
+            inputs = tokenizer(text, return_tensors="pt")
+            decoded = tokenizer.decode(inputs["input_ids"][0], skip_special_tokens=True)
+
+            print(f"input: '{text}'")
+            print(f"tokens: {inputs['input_ids'].shape}")
+            print(f"decoded: '{decoded}'")
+            print(f"round-trip: {'ok' if text == decoded else 'failed'}")
+            print()
+
+        return True
+
+    except Exception as e:
+        print(f"tokenizer test failed: {e}")
+        return False
+
+
+def test_model_integration():
+    """test tokenizer with gpt2 model"""
+    print("testing model integration")
+
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(".")
+
+        # create small gpt2 model
+        config = GPT2Config(
+            vocab_size=tokenizer.vocab_size,
+            n_positions=512,
+            n_embd=256,
+            n_layer=4,
+            n_head=4,
+            bos_token_id=tokenizer.bos_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            pad_token_id=tokenizer.pad_token_id,
+        )
+
+        model = GPT2LMHeadModel(config)
+        print(f"model created - params: {sum(p.numel() for p in model.parameters()):,}")
+
+        # test generation
+        prompt = "ဘာသာမန်"
+        inputs = tokenizer(prompt, return_tensors="pt")
+
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_length=inputs['input_ids'].shape[1] + 10,
+                do_sample=False,
+                pad_token_id=tokenizer.pad_token_id
+            )
+
+        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        print(f"generated: '{generated}'")
+
+        return True
+
+    except Exception as e:
+        print(f"model integration test failed: {e}")
+        return False
+
+
+def main():
+    """run all tests"""
+    print("mon tokenizer test suite")
+
+    tests = [
+        ("tokenizer", test_tokenizer),
+        ("model integration", test_model_integration)
+    ]
+
+    results = []
+    for name, test_func in tests:
+        print(f"\n--- {name} test ---")
+        success = test_func()
+        results.append(success)
+        print(f"{name}: {'passed' if success else 'failed'}")
+
+    print(f"\ntest results: {sum(results)}/{len(results)} passed")
+    return all(results)
+
+
+if __name__ == "__main__":
+    main()
tokenizer_config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "model_type": "llama",
+  "tokenizer_class": "LlamaTokenizer",
+  "vocab_file": "mon_tokenizer.model",
+  "vocab_size": 4000,
+  "bos_token": "<s>",
+  "eos_token": "</s>",
+  "unk_token": "<unk>",
+  "pad_token": "<pad>",
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "unk_token_id": 0,
+  "pad_token_id": 4000,
+  "clean_up_tokenization_spaces": false,
+  "sp_model_kwargs": {},
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "model_max_length": 2048
+}
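note: this config makes `AutoTokenizer` instantiate a `LlamaTokenizer` around the sentencepiece model; the equivalent direct construction looks roughly like this (a sketch, not part of this commit, requires sentencepiece and protobuf):

```python
from transformers import LlamaTokenizer

tok = LlamaTokenizer(
    vocab_file="mon_tokenizer.model",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    add_bos_token=True,
    add_eos_token=False,
)
print(tok("ဘာသာမန်")["input_ids"][0])  # 1, since add_bos_token is true
```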
upload_to_hub.py
ADDED
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+
+"""
+upload mon tokenizer to hugging face hub
+"""
+
+import os
+from pathlib import Path
+from huggingface_hub import HfApi, login
+from transformers import AutoTokenizer
+
+
+def validate_tokenizer(directory: str = ".") -> bool:
+    """validate tokenizer before upload"""
+    print("validating tokenizer")
+
+    required_files = [
+        "mon_tokenizer.model",
+        "tokenizer_config.json",
+        "special_tokens_map.json",
+        "README.md"
+    ]
+
+    for file in required_files:
+        if not os.path.exists(os.path.join(directory, file)):
+            print(f"missing required file: {file}")
+            return False
+
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(directory)
+        test_text = "ဘာသာမန်"
+        tokens = tokenizer(test_text, return_tensors="pt")
+        decoded = tokenizer.decode(tokens["input_ids"][0], skip_special_tokens=True)
+
+        if test_text != decoded:
+            print("tokenizer round-trip test failed")
+            return False
+
+        print("validation passed")
+        return True
+
+    except Exception as e:
+        print(f"validation failed: {e}")
+        return False
+
+
+def upload_to_hub(
+    repo_id: str = "janakhpon/mon_tokenizer",
+    directory: str = ".",
+    private: bool = False,
+    commit_message: str = "upload mon tokenizer"
+):
+    """upload tokenizer to hugging face hub"""
+
+    print(f"uploading to {repo_id}")
+
+    # validate first
+    if not validate_tokenizer(directory):
+        print("upload cancelled - validation failed")
+        return False
+
+    try:
+        # login
+        print("logging in to hugging face")
+        login()
+
+        # create api client
+        api = HfApi()
+
+        # create/update repository
+        print(f"creating repository: {repo_id}")
+        api.create_repo(
+            repo_id=repo_id,
+            private=private,
+            exist_ok=True,
+            repo_type="model"
+        )
+
+        # upload files
+        print("uploading files")
+        api.upload_folder(
+            folder_path=directory,
+            repo_id=repo_id,
+            commit_message=commit_message,
+            ignore_patterns=[
+                "*.pyc",
+                "__pycache__/",
+                ".git/",
+                ".venv/",
+                "*.lock",
+                "datasets/"
+            ]
+        )
+
+        print(f"upload successful: https://huggingface.co/{repo_id}")
+        return True
+
+    except Exception as e:
+        print(f"upload failed: {e}")
+        return False
+
+
+def main():
+    """main upload function"""
+    print("mon tokenizer hub uploader")
+
+    # get repo info
+    repo_id = input("repository id (janakhpon/mon_tokenizer): ").strip()
+    if not repo_id:
+        repo_id = "janakhpon/mon_tokenizer"
+
+    private = input("private repository? (y/n): ").strip().lower() == 'y'
+
+    # upload
+    success = upload_to_hub(
+        repo_id=repo_id,
+        private=private,
+        commit_message="updated mon tokenizer"
+    )
+
+    if success:
+        print("tokenizer successfully uploaded to hugging face hub")
+    else:
+        print("upload failed")
+
+
+if __name__ == "__main__":
+    main()
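note: `login()` above prompts for a token interactively; the same hub calls also work non-interactively — a sketch (not part of this commit, assumes a hypothetical `HF_TOKEN` environment variable holding an access token):

```python
import os
from huggingface_hub import HfApi, login

login(token=os.environ["HF_TOKEN"])  # HF_TOKEN is an assumed env var; skips the prompt

api = HfApi()
api.create_repo(repo_id="janakhpon/mon_tokenizer", exist_ok=True, repo_type="model")
api.upload_folder(
    folder_path=".",
    repo_id="janakhpon/mon_tokenizer",
    commit_message="upload mon tokenizer",
    ignore_patterns=["*.pyc", "__pycache__/", ".git/", ".venv/", "*.lock", "datasets/"],
)
```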
uv.lock
ADDED
The diff for this file is too large to render.