mon_tokenizer / mon_tokenizer.meta.json
janakhpon's picture
feat: simplified mon tokenizer in hf format, updated tags, resolve the legacy issue
e9d0f85
{
"model_path": "tokenizer.model",
"vocab_path": "mon_tokenizer.vocab",
"lines_trained": 32412,
"total_characters": 2453293,
"model_type": "unigram",
"vocab_size": 4000,
"original_vocab_size": 4000,
"character_coverage": 0.9995,
"byte_fallback": true,
"user_defined_symbols": [
"<mask>",
"<sep>",
"<cls>"
],
"evaluation": {
"သ္ဂံသ္ဂံပါ။ ကျာ်တြဲ ပရိတ်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
"num_pieces": 24,
"pieces": [
"▁",
"သ္",
"ဂ",
"ံ",
"သ္",
"ဂ",
"ံ",
"ပါ",
"<0xE1>",
"<0x81>",
"<0x8B>",
"▁",
"ကျာ်တြဲ",
"▁",
"ပရိ",
"တ်",
"တံဂှ်",
"▁",
"ကၠောန်",
"ဗဒှ်",
"လဝ်ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
610,
324,
381,
610,
324,
381,
495,
231,
135,
145,
262,
1733,
262,
2158,
339,
1148,
262,
286,
726,
1097,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 1.9166666666666667
},
"ဒေါံဏံ ဍာ်မိုဟ် ကြဴကြဴဏောၚ်။": {
"num_pieces": 14,
"pieces": [
"▁",
"ဒေါ",
"ံ",
"ဏံ",
"▁ဍာ်",
"မ",
"ိုဟ်",
"▁",
"ကြဴ",
"ကြဴ",
"ဏောၚ်",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
1865,
381,
596,
1178,
272,
1255,
262,
1752,
1752,
2484,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 2.0
},
"ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
"num_pieces": 12,
"pieces": [
"▁",
"ဘာသာမန်",
"▁",
"ပရူပရာ",
"တံဂှ်",
"▁",
"ကၠောန်",
"ဗဒှ်",
"လဝ်ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
1179,
262,
3651,
1148,
262,
286,
726,
1097,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 2.9166666666666665
},
"ဘာသာအင်္ဂလိက် ကဵု ဘာသာမန် နွံပၟိက်ရ။": {
"num_pieces": 11,
"pieces": [
"▁",
"ဘာသာအင်္ဂလိက်",
"▁ကဵု",
"▁",
"ဘာသာမန်",
"▁",
"နွံပၟိက်",
"ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
1970,
387,
262,
1179,
262,
1205,
264,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 3.272727272727273
},
"သၞာံ ၂၀၂၄ ဂိတုဇန္နဝါရဳ ၁၅ မံက်": {
"num_pieces": 10,
"pieces": [
"▁သၞာံ",
"▁၂၀၂၄",
"▁ဂိတု",
"ဇ",
"န္န",
"ဝါ",
"ရဳ",
"▁၁၅",
"▁",
"မံက်"
],
"ids_head": [
287,
2730,
732,
384,
2733,
463,
1248,
1059,
262,
967
],
"round_trip_ok": true,
"compression_ratio": 3.0
},
"ၚၛၜၝၞၟၠ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
"num_pieces": 20,
"pieces": [
"▁",
"ၚ",
"<0xE1>",
"<0x81>",
"<0x9B>",
"ၜ",
"ၝ",
"ၞ",
"ၟ",
"ၠ",
"▁",
"မန်",
"တံဂှ်",
"▁",
"ကၠောန်",
"ဗဒှ်",
"လဝ်ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
1062,
231,
135,
161,
844,
1937,
554,
3999,
922,
262,
294,
1148,
262,
286,
726,
1097,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 1.6
},
"ဨဩဪဥဦဧ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
"num_pieces": 23,
"pieces": [
"▁",
"ဨ",
"<0xE1>",
"<0x80>",
"<0xA9>",
"<0xE1>",
"<0x80>",
"<0xAA>",
"ဥ",
"ဦ",
"<0xE1>",
"<0x80>",
"<0xA7>",
"▁",
"မန်",
"တံဂှ်",
"▁",
"ကၠောန်",
"ဗဒှ်",
"လဝ်ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
1052,
231,
134,
175,
231,
134,
176,
1157,
3995,
231,
134,
173,
262,
294,
1148,
262,
286,
726,
1097,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 1.3478260869565217
},
"ါာူးေိီဲံ်္ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
"num_pieces": 22,
"pieces": [
"▁",
"ါ",
"ာ",
"ူ",
"း",
"ေ",
"ိ",
"ီ",
"ဲ",
"ံ",
"်",
"္",
"▁",
"မန်",
"တံဂှ်",
"▁",
"ကၠောန်",
"ဗဒှ်",
"လဝ်ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
580,
328,
634,
304,
445,
478,
649,
340,
381,
276,
483,
262,
294,
1148,
262,
286,
726,
1097,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 1.6363636363636365
},
"ျြွှဿ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": {
"num_pieces": 16,
"pieces": [
"▁",
"ျ",
"ြ",
"ွ",
"ှ",
"ဿ",
"▁",
"မန်",
"တံဂှ်",
"▁",
"ကၠောန်",
"ဗဒှ်",
"လဝ်ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
2040,
2674,
738,
753,
1251,
262,
294,
1148,
262,
286,
726,
1097,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 1.875
},
"မန်တံဂှ်၊ ကၠောန်ဗဒှ်လဝ်ရ။ ပရူပရာတံဂှ်၌ နွံပၟိက်ရ။": {
"num_pieces": 23,
"pieces": [
"▁",
"မန်",
"တံဂှ်",
"<0xE1>",
"<0x81>",
"<0x8A>",
"▁",
"ကၠောန်",
"ဗဒှ်",
"လဝ်ရ",
"<0xE1>",
"<0x81>",
"<0x8B>",
"▁",
"ပရူပရာ",
"တံဂှ်",
"၌",
"▁",
"နွံပၟိက်",
"ရ",
"<0xE1>",
"<0x81>",
"<0x8B>"
],
"ids_head": [
262,
294,
1148,
231,
135,
144,
262,
286,
726,
1097,
231,
135,
145,
262,
3651,
1148,
3430,
262,
1205,
264,
231,
135,
145
],
"round_trip_ok": true,
"compression_ratio": 2.130434782608696
},
"သၞာံ ၂၀၂၄ ÷ ၄ = ၅၀၆ × ၁၀ = ၅၀၆၀": {
"num_pieces": 18,
"pieces": [
"▁သၞာံ",
"▁၂၀၂၄",
"▁",
"<0xC3>",
"<0xB7>",
"▁၄",
"▁=",
"▁",
"၅၀",
"၆",
"▁",
"<0xC3>",
"<0x97>",
"▁၁၀",
"▁=",
"▁",
"၅၀",
"၆၀"
],
"ids_head": [
287,
2730,
262,
201,
189,
705,
533,
262,
1287,
936,
262,
201,
157,
782,
533,
262,
1287,
1812
],
"round_trip_ok": true,
"compression_ratio": 1.7222222222222223
},
"_stats": {
"avg_compression_ratio": 1.9896373056994818,
"round_trip_accuracy": 1.0,
"total_samples": 11,
"vocab_size": 4000
}
},
"character_analysis": {
"total_chars": 2453293,
"mon_chars": 1907807,
"unique_mon_chars": 94,
"mon_char_ratio": 0.7776515075859264,
"categories": {
"base_consonants": [
"က",
"ခ",
"ဂ",
"ဃ",
"င",
"စ",
"ဆ",
"ဇ",
"ဉ",
"ည",
"ဋ",
"ဌ",
"ဍ",
"ဎ",
"ဏ",
"တ",
"ထ",
"ဒ",
"ဓ",
"န",
"ပ",
"ဖ",
"ဗ",
"ဘ",
"မ",
"ယ",
"ရ",
"လ",
"ဝ",
"သ",
"ဟ",
"ဠ",
"အ"
],
"extended_mon": [
"ၚ",
"ၛ",
"ၜ",
"ၝ",
"ၞ",
"ၟ",
"ၠ"
],
"extended_vowels": [
"ဥ",
"ဦ",
"ဧ",
"ဨ",
"ဩ"
],
"vowel_signs": [
"ါ",
"ာ",
"ိ",
"ီ",
"ူ",
"ေ",
"ဲ",
"ံ",
"း",
"္",
"်"
],
"media_chars": [
"ျ",
"ြ",
"ွ",
"ှ"
],
"punctuation": [
"၌",
"၏"
],
"mathematical": [
"=",
"×"
],
"other": [
"ဣ",
"ဤ",
"ု",
"ဳ",
"ဴ",
"ဵ",
"့",
"ဿ",
"၀",
"၁",
"၂",
"၃",
"၄",
"၅",
"၆",
"၇",
"၈",
"၉",
"ၐ",
"ၑ",
"ၢ",
"ၤ",
"ႄ",
"ႅ",
"ႆ",
"ႇ",
"ႈ",
"႓",
"႕",
"ႝ"
]
},
"all_found_chars": [
"=",
"×",
"က",
"ခ",
"ဂ",
"ဃ",
"င",
"စ",
"ဆ",
"ဇ",
"ဉ",
"ည",
"ဋ",
"ဌ",
"ဍ",
"ဎ",
"ဏ",
"တ",
"ထ",
"ဒ",
"ဓ",
"န",
"ပ",
"ဖ",
"ဗ",
"ဘ",
"မ",
"ယ",
"ရ",
"လ",
"ဝ",
"သ",
"ဟ",
"ဠ",
"အ",
"ဣ",
"ဤ",
"ဥ",
"ဦ",
"ဧ",
"ဨ",
"ဩ",
"ါ",
"ာ",
"ိ",
"ီ",
"ု",
"ူ",
"ေ",
"ဲ",
"ဳ",
"ဴ",
"ဵ",
"ံ",
"့",
"း",
"္",
"်",
"ျ",
"ြ",
"ွ",
"ှ",
"ဿ",
"၀",
"၁",
"၂",
"၃",
"၄",
"၅",
"၆",
"၇",
"၈",
"၉",
"၌",
"၏",
"ၐ",
"ၑ",
"ၚ",
"ၛ",
"ၜ",
"ၝ",
"ၞ",
"ၟ",
"ၠ",
"ၢ",
"ၤ",
"ႄ",
"ႅ",
"ႆ",
"ႇ",
"ႈ",
"႓",
"႕",
"ႝ"
]
},
"resource_limits": {
"max_cpu_percent": 90,
"max_memory_percent": 85,
"max_disk_percent": 90
}
}