| { | |
| "model_path": "tokenizer.model", | |
| "vocab_path": "mon_tokenizer.vocab", | |
| "lines_trained": 32412, | |
| "total_characters": 2453293, | |
| "model_type": "unigram", | |
| "vocab_size": 4000, | |
| "original_vocab_size": 4000, | |
| "character_coverage": 0.9995, | |
| "byte_fallback": true, | |
| "user_defined_symbols": [ | |
| "<mask>", | |
| "<sep>", | |
| "<cls>" | |
| ], | |
| "evaluation": { | |
| "သ္ဂံသ္ဂံပါ။ ကျာ်တြဲ ပရိတ်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": { | |
| "num_pieces": 24, | |
| "pieces": [ | |
| "▁", | |
| "သ္", | |
| "ဂ", | |
| "ံ", | |
| "သ္", | |
| "ဂ", | |
| "ံ", | |
| "ပါ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>", | |
| "▁", | |
| "ကျာ်တြဲ", | |
| "▁", | |
| "ပရိ", | |
| "တ်", | |
| "တံဂှ်", | |
| "▁", | |
| "ကၠောန်", | |
| "ဗဒှ်", | |
| "လဝ်ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 610, | |
| 324, | |
| 381, | |
| 610, | |
| 324, | |
| 381, | |
| 495, | |
| 231, | |
| 135, | |
| 145, | |
| 262, | |
| 1733, | |
| 262, | |
| 2158, | |
| 339, | |
| 1148, | |
| 262, | |
| 286, | |
| 726, | |
| 1097, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 1.9166666666666667 | |
| }, | |
| "ဒေါံဏံ ဍာ်မိုဟ် ကြဴကြဴဏောၚ်။": { | |
| "num_pieces": 14, | |
| "pieces": [ | |
| "▁", | |
| "ဒေါ", | |
| "ံ", | |
| "ဏံ", | |
| "▁ဍာ်", | |
| "မ", | |
| "ိုဟ်", | |
| "▁", | |
| "ကြဴ", | |
| "ကြဴ", | |
| "ဏောၚ်", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 1865, | |
| 381, | |
| 596, | |
| 1178, | |
| 272, | |
| 1255, | |
| 262, | |
| 1752, | |
| 1752, | |
| 2484, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 2.0 | |
| }, | |
| "ဘာသာမန် ပရူပရာတံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": { | |
| "num_pieces": 12, | |
| "pieces": [ | |
| "▁", | |
| "ဘာသာမန်", | |
| "▁", | |
| "ပရူပရာ", | |
| "တံဂှ်", | |
| "▁", | |
| "ကၠောန်", | |
| "ဗဒှ်", | |
| "လဝ်ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 1179, | |
| 262, | |
| 3651, | |
| 1148, | |
| 262, | |
| 286, | |
| 726, | |
| 1097, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 2.9166666666666665 | |
| }, | |
| "ဘာသာအင်္ဂလိက် ကဵု ဘာသာမန် နွံပၟိက်ရ။": { | |
| "num_pieces": 11, | |
| "pieces": [ | |
| "▁", | |
| "ဘာသာအင်္ဂလိက်", | |
| "▁ကဵု", | |
| "▁", | |
| "ဘာသာမန်", | |
| "▁", | |
| "နွံပၟိက်", | |
| "ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 1970, | |
| 387, | |
| 262, | |
| 1179, | |
| 262, | |
| 1205, | |
| 264, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 3.272727272727273 | |
| }, | |
| "သၞာံ ၂၀၂၄ ဂိတုဇန္နဝါရဳ ၁၅ မံက်": { | |
| "num_pieces": 10, | |
| "pieces": [ | |
| "▁သၞာံ", | |
| "▁၂၀၂၄", | |
| "▁ဂိတု", | |
| "ဇ", | |
| "န္န", | |
| "ဝါ", | |
| "ရဳ", | |
| "▁၁၅", | |
| "▁", | |
| "မံက်" | |
| ], | |
| "ids_head": [ | |
| 287, | |
| 2730, | |
| 732, | |
| 384, | |
| 2733, | |
| 463, | |
| 1248, | |
| 1059, | |
| 262, | |
| 967 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 3.0 | |
| }, | |
| "ၚၛၜၝၞၟၠ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": { | |
| "num_pieces": 20, | |
| "pieces": [ | |
| "▁", | |
| "ၚ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x9B>", | |
| "ၜ", | |
| "ၝ", | |
| "ၞ", | |
| "ၟ", | |
| "ၠ", | |
| "▁", | |
| "မန်", | |
| "တံဂှ်", | |
| "▁", | |
| "ကၠောန်", | |
| "ဗဒှ်", | |
| "လဝ်ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 1062, | |
| 231, | |
| 135, | |
| 161, | |
| 844, | |
| 1937, | |
| 554, | |
| 3999, | |
| 922, | |
| 262, | |
| 294, | |
| 1148, | |
| 262, | |
| 286, | |
| 726, | |
| 1097, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 1.6 | |
| }, | |
| "ဨဩဪဥဦဧ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": { | |
| "num_pieces": 23, | |
| "pieces": [ | |
| "▁", | |
| "ဨ", | |
| "<0xE1>", | |
| "<0x80>", | |
| "<0xA9>", | |
| "<0xE1>", | |
| "<0x80>", | |
| "<0xAA>", | |
| "ဥ", | |
| "ဦ", | |
| "<0xE1>", | |
| "<0x80>", | |
| "<0xA7>", | |
| "▁", | |
| "မန်", | |
| "တံဂှ်", | |
| "▁", | |
| "ကၠောန်", | |
| "ဗဒှ်", | |
| "လဝ်ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 1052, | |
| 231, | |
| 134, | |
| 175, | |
| 231, | |
| 134, | |
| 176, | |
| 1157, | |
| 3995, | |
| 231, | |
| 134, | |
| 173, | |
| 262, | |
| 294, | |
| 1148, | |
| 262, | |
| 286, | |
| 726, | |
| 1097, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 1.3478260869565217 | |
| }, | |
| "ါာူးေိီဲံ်္ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": { | |
| "num_pieces": 22, | |
| "pieces": [ | |
| "▁", | |
| "ါ", | |
| "ာ", | |
| "ူ", | |
| "း", | |
| "ေ", | |
| "ိ", | |
| "ီ", | |
| "ဲ", | |
| "ံ", | |
| "်", | |
| "္", | |
| "▁", | |
| "မန်", | |
| "တံဂှ်", | |
| "▁", | |
| "ကၠောန်", | |
| "ဗဒှ်", | |
| "လဝ်ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 580, | |
| 328, | |
| 634, | |
| 304, | |
| 445, | |
| 478, | |
| 649, | |
| 340, | |
| 381, | |
| 276, | |
| 483, | |
| 262, | |
| 294, | |
| 1148, | |
| 262, | |
| 286, | |
| 726, | |
| 1097, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 1.6363636363636365 | |
| }, | |
| "ျြွှဿ မန်တံဂှ် ကၠောန်ဗဒှ်လဝ်ရ။": { | |
| "num_pieces": 16, | |
| "pieces": [ | |
| "▁", | |
| "ျ", | |
| "ြ", | |
| "ွ", | |
| "ှ", | |
| "ဿ", | |
| "▁", | |
| "မန်", | |
| "တံဂှ်", | |
| "▁", | |
| "ကၠောန်", | |
| "ဗဒှ်", | |
| "လဝ်ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 2040, | |
| 2674, | |
| 738, | |
| 753, | |
| 1251, | |
| 262, | |
| 294, | |
| 1148, | |
| 262, | |
| 286, | |
| 726, | |
| 1097, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 1.875 | |
| }, | |
| "မန်တံဂှ်၊ ကၠောန်ဗဒှ်လဝ်ရ။ ပရူပရာတံဂှ်၌ နွံပၟိက်ရ။": { | |
| "num_pieces": 23, | |
| "pieces": [ | |
| "▁", | |
| "မန်", | |
| "တံဂှ်", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8A>", | |
| "▁", | |
| "ကၠောန်", | |
| "ဗဒှ်", | |
| "လဝ်ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>", | |
| "▁", | |
| "ပရူပရာ", | |
| "တံဂှ်", | |
| "၌", | |
| "▁", | |
| "နွံပၟိက်", | |
| "ရ", | |
| "<0xE1>", | |
| "<0x81>", | |
| "<0x8B>" | |
| ], | |
| "ids_head": [ | |
| 262, | |
| 294, | |
| 1148, | |
| 231, | |
| 135, | |
| 144, | |
| 262, | |
| 286, | |
| 726, | |
| 1097, | |
| 231, | |
| 135, | |
| 145, | |
| 262, | |
| 3651, | |
| 1148, | |
| 3430, | |
| 262, | |
| 1205, | |
| 264, | |
| 231, | |
| 135, | |
| 145 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 2.130434782608696 | |
| }, | |
| "သၞာံ ၂၀၂၄ ÷ ၄ = ၅၀၆ × ၁၀ = ၅၀၆၀": { | |
| "num_pieces": 18, | |
| "pieces": [ | |
| "▁သၞာံ", | |
| "▁၂၀၂၄", | |
| "▁", | |
| "<0xC3>", | |
| "<0xB7>", | |
| "▁၄", | |
| "▁=", | |
| "▁", | |
| "၅၀", | |
| "၆", | |
| "▁", | |
| "<0xC3>", | |
| "<0x97>", | |
| "▁၁၀", | |
| "▁=", | |
| "▁", | |
| "၅၀", | |
| "၆၀" | |
| ], | |
| "ids_head": [ | |
| 287, | |
| 2730, | |
| 262, | |
| 201, | |
| 189, | |
| 705, | |
| 533, | |
| 262, | |
| 1287, | |
| 936, | |
| 262, | |
| 201, | |
| 157, | |
| 782, | |
| 533, | |
| 262, | |
| 1287, | |
| 1812 | |
| ], | |
| "round_trip_ok": true, | |
| "compression_ratio": 1.7222222222222223 | |
| }, | |
| "_stats": { | |
| "avg_compression_ratio": 1.9896373056994818, | |
| "round_trip_accuracy": 1.0, | |
| "total_samples": 11, | |
| "vocab_size": 4000 | |
| } | |
| }, | |
| "character_analysis": { | |
| "total_chars": 2453293, | |
| "mon_chars": 1907807, | |
| "unique_mon_chars": 94, | |
| "mon_char_ratio": 0.7776515075859264, | |
| "categories": { | |
| "base_consonants": [ | |
| "က", | |
| "ခ", | |
| "ဂ", | |
| "ဃ", | |
| "င", | |
| "စ", | |
| "ဆ", | |
| "ဇ", | |
| "ဉ", | |
| "ည", | |
| "ဋ", | |
| "ဌ", | |
| "ဍ", | |
| "ဎ", | |
| "ဏ", | |
| "တ", | |
| "ထ", | |
| "ဒ", | |
| "ဓ", | |
| "န", | |
| "ပ", | |
| "ဖ", | |
| "ဗ", | |
| "ဘ", | |
| "မ", | |
| "ယ", | |
| "ရ", | |
| "လ", | |
| "ဝ", | |
| "သ", | |
| "ဟ", | |
| "ဠ", | |
| "အ" | |
| ], | |
| "extended_mon": [ | |
| "ၚ", | |
| "ၛ", | |
| "ၜ", | |
| "ၝ", | |
| "ၞ", | |
| "ၟ", | |
| "ၠ" | |
| ], | |
| "extended_vowels": [ | |
| "ဥ", | |
| "ဦ", | |
| "ဧ", | |
| "ဨ", | |
| "ဩ" | |
| ], | |
| "vowel_signs": [ | |
| "ါ", | |
| "ာ", | |
| "ိ", | |
| "ီ", | |
| "ူ", | |
| "ေ", | |
| "ဲ", | |
| "ံ", | |
| "း", | |
| "္", | |
| "်" | |
| ], | |
| "media_chars": [ | |
| "ျ", | |
| "ြ", | |
| "ွ", | |
| "ှ" | |
| ], | |
| "punctuation": [ | |
| "၌", | |
| "၏" | |
| ], | |
| "mathematical": [ | |
| "=", | |
| "×" | |
| ], | |
| "other": [ | |
| "ဣ", | |
| "ဤ", | |
| "ု", | |
| "ဳ", | |
| "ဴ", | |
| "ဵ", | |
| "့", | |
| "ဿ", | |
| "၀", | |
| "၁", | |
| "၂", | |
| "၃", | |
| "၄", | |
| "၅", | |
| "၆", | |
| "၇", | |
| "၈", | |
| "၉", | |
| "ၐ", | |
| "ၑ", | |
| "ၢ", | |
| "ၤ", | |
| "ႄ", | |
| "ႅ", | |
| "ႆ", | |
| "ႇ", | |
| "ႈ", | |
| "႓", | |
| "႕", | |
| "ႝ" | |
| ] | |
| }, | |
| "all_found_chars": [ | |
| "=", | |
| "×", | |
| "က", | |
| "ခ", | |
| "ဂ", | |
| "ဃ", | |
| "င", | |
| "စ", | |
| "ဆ", | |
| "ဇ", | |
| "ဉ", | |
| "ည", | |
| "ဋ", | |
| "ဌ", | |
| "ဍ", | |
| "ဎ", | |
| "ဏ", | |
| "တ", | |
| "ထ", | |
| "ဒ", | |
| "ဓ", | |
| "န", | |
| "ပ", | |
| "ဖ", | |
| "ဗ", | |
| "ဘ", | |
| "မ", | |
| "ယ", | |
| "ရ", | |
| "လ", | |
| "ဝ", | |
| "သ", | |
| "ဟ", | |
| "ဠ", | |
| "အ", | |
| "ဣ", | |
| "ဤ", | |
| "ဥ", | |
| "ဦ", | |
| "ဧ", | |
| "ဨ", | |
| "ဩ", | |
| "ါ", | |
| "ာ", | |
| "ိ", | |
| "ီ", | |
| "ု", | |
| "ူ", | |
| "ေ", | |
| "ဲ", | |
| "ဳ", | |
| "ဴ", | |
| "ဵ", | |
| "ံ", | |
| "့", | |
| "း", | |
| "္", | |
| "်", | |
| "ျ", | |
| "ြ", | |
| "ွ", | |
| "ှ", | |
| "ဿ", | |
| "၀", | |
| "၁", | |
| "၂", | |
| "၃", | |
| "၄", | |
| "၅", | |
| "၆", | |
| "၇", | |
| "၈", | |
| "၉", | |
| "၌", | |
| "၏", | |
| "ၐ", | |
| "ၑ", | |
| "ၚ", | |
| "ၛ", | |
| "ၜ", | |
| "ၝ", | |
| "ၞ", | |
| "ၟ", | |
| "ၠ", | |
| "ၢ", | |
| "ၤ", | |
| "ႄ", | |
| "ႅ", | |
| "ႆ", | |
| "ႇ", | |
| "ႈ", | |
| "႓", | |
| "႕", | |
| "ႝ" | |
| ] | |
| }, | |
| "resource_limits": { | |
| "max_cpu_percent": 90, | |
| "max_memory_percent": 85, | |
| "max_disk_percent": 90 | |
| } | |
| } |