{ "model_info": { "version": "2.0", "model_type": "unigram", "vocab_size": 35590, "creation_date": "2025-07-02", "language": "thai", "description": "Advanced Thai tokenizer with improved handling of Thai text, mixed content, and modern vocabulary" }, "performance": { "test_results": { "overall": { "passed": 24, "total": 24 }, "categories": { "basic_thai": { "passed": 4, "total": 4, "details": [ { "input": "สวัสดี", "tokens": [ "สวัสด", "ี" ], "token_count": 2, "decoded": "สวัสดี", "success": true }, { "input": "ขอบคุณ", "tokens": [ "ขอบ", "คุณ" ], "token_count": 2, "decoded": "ขอบคุณ", "success": true }, { "input": "ครับ", "tokens": [ "ครับ" ], "token_count": 1, "decoded": "ครับ", "success": true }, { "input": "ค่ะ", "tokens": [ "ค่ะ" ], "token_count": 1, "decoded": "ค่ะ", "success": true } ] }, "thai_with_spaces": { "passed": 3, "total": 3, "details": [ { "input": "กิน ข้าว อร่อย", "tokens": [ "กิน", " ", "ข้าว", " ", "อ", "ร่อย" ], "token_count": 6, "decoded": "กิน ข้าว อร่อย", "success": true }, { "input": "วันนี้ อากาศ ดี", "tokens": [ "วัน", "นี้", " ", "อากาศ", " ", "ด", "ี" ], "token_count": 7, "decoded": "วันนี้ อากาศ ดี", "success": true }, { "input": "ผม ชื่อ จอห์น", "tokens": [ "ผ", "ม", " ", "ชื่อ", " ", "จอห์น" ], "token_count": 6, "decoded": "ผม ชื่อ จอห์น", "success": true } ] }, "mixed_content": { "passed": 3, "total": 3, "details": [ { "input": "123 สวัสดี abc", "tokens": [ "1", "2", "3", " ", "สวัสด", "ี", " ", "abc" ], "token_count": 8, "decoded": "123 สวัสดี abc", "success": true }, { "input": "Hello ครับ", "tokens": [ "Hello", " ", "ครับ" ], "token_count": 3, "decoded": "Hello ครับ", "success": true }, { "input": "COVID-19 ระบาด", "tokens": [ "COVID", "-", "1", "9", " ", "ระบาด" ], "token_count": 6, "decoded": "COVID-19 ระบาด", "success": true } ] }, "formal_thai": { "passed": 2, "total": 2, "details": [ { "input": "พระบาทสมเด็จพระเจ้าอยู่หัว", "tokens": [ "พระบาทสมเด็จพระ", "เจ้าอยู่หัว" ], "token_count": 2, "decoded": 
"พระบาทสมเด็จพระเจ้าอยู่หัว", "success": true }, { "input": "การประชุมสำคัญ", "tokens": [ "การประชุม", "สำคัญ" ], "token_count": 2, "decoded": "การประชุมสำคัญ", "success": true } ] }, "casual_thai": { "passed": 3, "total": 3, "details": [ { "input": "อร่อยจัง", "tokens": [ "อ", "ร่อย", "จัง" ], "token_count": 3, "decoded": "อร่อยจัง", "success": true }, { "input": "แพงมาก", "tokens": [ "แพง", "มาก" ], "token_count": 2, "decoded": "แพงมาก", "success": true }, { "input": "ถูกมาก", "tokens": [ "ถูก", "มาก" ], "token_count": 2, "decoded": "ถูกมาก", "success": true } ] }, "complex_thai": { "passed": 3, "total": 3, "details": [ { "input": "กรุงเทพมหานคร", "tokens": [ "กรุงเทพ", "มหา", "นคร" ], "token_count": 3, "decoded": "กรุงเทพมหานคร", "success": true }, { "input": "ราชมงคลธัญบุรี", "tokens": [ "ราช", "มงคล", "ธัญ", "บุรี" ], "token_count": 4, "decoded": "ราชมงคลธัญบุรี", "success": true }, { "input": "จุฬาลงกรณ์มหาวิทยาลัย", "tokens": [ "จุฬาล", "ง", "กรณ์", "มหาวิทยาลัย" ], "token_count": 4, "decoded": "จุฬาลงกรณ์มหาวิทยาลัย", "success": true } ] }, "numbers_dates": { "passed": 3, "total": 3, "details": [ { "input": "1 มกราคม 2567", "tokens": [ "1", " ", "มกรา", "ค", "ม", " ", "2", "567" ], "token_count": 8, "decoded": "1 มกราคม 2567", "success": true }, { "input": "เวลา 14:30 น.", "tokens": [ "เวลา", " ", "1", "4", ":", "30", " ", "น", "." 
], "token_count": 9, "decoded": "เวลา 14:30 น.", "success": true }, { "input": "ราคา 1,234 บาท", "tokens": [ "ราคา", " ", "1", ",", "2", "34", " ", "บาท" ], "token_count": 8, "decoded": "ราคา 1,234 บาท", "success": true } ] }, "technology": { "passed": 3, "total": 3, "details": [ { "input": "อินเทอร์เน็ต", "tokens": [ "อินเทอร์เน็ต" ], "token_count": 1, "decoded": "อินเทอร์เน็ต", "success": true }, { "input": "โทรศัพท์มือถือ", "tokens": [ "โทรศัพท์", "มือถือ" ], "token_count": 2, "decoded": "โทรศัพท์มือถือ", "success": true }, { "input": "แอปพลิเคชัน", "tokens": [ "แอปพลิเคชั", "น" ], "token_count": 2, "decoded": "แอปพลิเคชัน", "success": true } ] } } }, "efficiency": { "compression_ratios": [ 3.0, 2.75, 2.6470588235294117, 6.7, 2.1666666666666665 ], "avg_tokens_per_char": 0.30726256983240224, "vocab_coverage": 0.0010958134307389716, "details": [ { "sentence": "สวัสดี", "char_count": 6, "token_count": 2, "compression_ratio": 3.0, "tokens": [ "สวัสด", "ี" ] }, { "sentence": "สวัสดีครับ ผมชื่อจอห์น", "char_count": 22, "token_count": 8, "compression_ratio": 2.75, "tokens": [ "สวัสด", "ี", "ครับ", " ", "ผ", "ม", "ชื่อ", "จอห์น" ] }, { "sentence": "วันนี้อากาศดีมาก ผมจึงไปเดินเล่นที่สวนสาธารณะ", "char_count": 45, "token_count": 17, "compression_ratio": 2.6470588235294117, "tokens": [ "วัน", "นี้", "อากาศ", "ด", "ี", "มาก", " ", "ผ", "ม", "จึง", "ไป", "เดิน", "เล่น", "ที่", "สวน", "สาธารณ", "ะ" ] }, { "sentence": "พระบาทสมเด็จพระเจ้าอยู่หัวทรงพระกรุณาโปรดเกล้าฯ ให้จัดงานพระราชพิธี", "char_count": 67, "token_count": 10, "compression_ratio": 6.7, "tokens": [ "พระบาทสมเด็จพระ", "เจ้าอยู่หัว", "ทรง", "พระกรุณา", "โปรดเกล้า", "ฯ ", "ให้", "จัด", "งาน", "พระราชพิธี" ] }, { "sentence": "555 อร่อยมากกก กินข้าวยัง? #อาหารไทย 🇹🇭", "char_count": 39, "token_count": 18, "compression_ratio": 2.1666666666666665, "tokens": [ "555", " ", "อ", "ร่อย", "มาก", "ก", "ก", " ", "กิน", "ข้าว", "ยัง", "?", " ", "#", "อาหาร", "ไทย", " ", "🇹🇭" ] } ] }, "overall_accuracy": "24/24" }, "features": [ "No normalization (preserves Thai characters)", "Smart punctuation handling", "Mixed Thai-English support", "Modern vocabulary coverage", "Efficient compression", "Direct decoding without artifacts" ], "usage_notes": { "best_decoding": "manual concatenation of non-special tokens", "recommended_for": [ "Thai NLP", "LLM training", "Text processing", "Social media analysis" ], "avoid": [ "Text normalization", "Byte-level fallback", "Aggressive post-processing" ] } }