| { | |
| "model_info": { | |
| "version": "2.0", | |
| "model_type": "unigram", | |
| "vocab_size": 35590, | |
| "creation_date": "2025-07-02", | |
| "language": "thai", | |
| "description": "Advanced Thai tokenizer with improved handling of Thai text, mixed content, and modern vocabulary" | |
| }, | |
| "performance": { | |
| "test_results": { | |
| "overall": { | |
| "passed": 24, | |
| "total": 24 | |
| }, | |
| "categories": { | |
| "basic_thai": { | |
| "passed": 4, | |
| "total": 4, | |
| "details": [ | |
| { | |
| "input": "สวัสดี", | |
| "tokens": [ | |
| "สวัสด", | |
| "ี" | |
| ], | |
| "token_count": 2, | |
| "decoded": "สวัสดี", | |
| "success": true | |
| }, | |
| { | |
| "input": "ขอบคุณ", | |
| "tokens": [ | |
| "ขอบ", | |
| "คุณ" | |
| ], | |
| "token_count": 2, | |
| "decoded": "ขอบคุณ", | |
| "success": true | |
| }, | |
| { | |
| "input": "ครับ", | |
| "tokens": [ | |
| "ครับ" | |
| ], | |
| "token_count": 1, | |
| "decoded": "ครับ", | |
| "success": true | |
| }, | |
| { | |
| "input": "ค่ะ", | |
| "tokens": [ | |
| "ค่ะ" | |
| ], | |
| "token_count": 1, | |
| "decoded": "ค่ะ", | |
| "success": true | |
| } | |
| ] | |
| }, | |
| "thai_with_spaces": { | |
| "passed": 3, | |
| "total": 3, | |
| "details": [ | |
| { | |
| "input": "กิน ข้าว อร่อย", | |
| "tokens": [ | |
| "กิน", | |
| " ", | |
| "ข้าว", | |
| " ", | |
| "อ", | |
| "ร่อย" | |
| ], | |
| "token_count": 6, | |
| "decoded": "กิน ข้าว อร่อย", | |
| "success": true | |
| }, | |
| { | |
| "input": "วันนี้ อากาศ ดี", | |
| "tokens": [ | |
| "วัน", | |
| "นี้", | |
| " ", | |
| "อากาศ", | |
| " ", | |
| "ด", | |
| "ี" | |
| ], | |
| "token_count": 7, | |
| "decoded": "วันนี้ อากาศ ดี", | |
| "success": true | |
| }, | |
| { | |
| "input": "ผม ชื่อ จอห์น", | |
| "tokens": [ | |
| "ผ", | |
| "ม", | |
| " ", | |
| "ชื่อ", | |
| " ", | |
| "จอห์น" | |
| ], | |
| "token_count": 6, | |
| "decoded": "ผม ชื่อ จอห์น", | |
| "success": true | |
| } | |
| ] | |
| }, | |
| "mixed_content": { | |
| "passed": 3, | |
| "total": 3, | |
| "details": [ | |
| { | |
| "input": "123 สวัสดี abc", | |
| "tokens": [ | |
| "1", | |
| "2", | |
| "3", | |
| " ", | |
| "สวัสด", | |
| "ี", | |
| " ", | |
| "abc" | |
| ], | |
| "token_count": 8, | |
| "decoded": "123 สวัสดี abc", | |
| "success": true | |
| }, | |
| { | |
| "input": "Hello ครับ", | |
| "tokens": [ | |
| "Hello", | |
| " ", | |
| "ครับ" | |
| ], | |
| "token_count": 3, | |
| "decoded": "Hello ครับ", | |
| "success": true | |
| }, | |
| { | |
| "input": "COVID-19 ระบาด", | |
| "tokens": [ | |
| "COVID", | |
| "-", | |
| "1", | |
| "9", | |
| " ", | |
| "ระบาด" | |
| ], | |
| "token_count": 6, | |
| "decoded": "COVID-19 ระบาด", | |
| "success": true | |
| } | |
| ] | |
| }, | |
| "formal_thai": { | |
| "passed": 2, | |
| "total": 2, | |
| "details": [ | |
| { | |
| "input": "พระบาทสมเด็จพระเจ้าอยู่หัว", | |
| "tokens": [ | |
| "พระบาทสมเด็จพระ", | |
| "เจ้าอยู่หัว" | |
| ], | |
| "token_count": 2, | |
| "decoded": "พระบาทสมเด็จพระเจ้าอยู่หัว", | |
| "success": true | |
| }, | |
| { | |
| "input": "การประชุมสำคัญ", | |
| "tokens": [ | |
| "การประชุม", | |
| "สำคัญ" | |
| ], | |
| "token_count": 2, | |
| "decoded": "การประชุมสำคัญ", | |
| "success": true | |
| } | |
| ] | |
| }, | |
| "casual_thai": { | |
| "passed": 3, | |
| "total": 3, | |
| "details": [ | |
| { | |
| "input": "อร่อยจัง", | |
| "tokens": [ | |
| "อ", | |
| "ร่อย", | |
| "จัง" | |
| ], | |
| "token_count": 3, | |
| "decoded": "อร่อยจัง", | |
| "success": true | |
| }, | |
| { | |
| "input": "แพงมาก", | |
| "tokens": [ | |
| "แพง", | |
| "มาก" | |
| ], | |
| "token_count": 2, | |
| "decoded": "แพงมาก", | |
| "success": true | |
| }, | |
| { | |
| "input": "ถูกมาก", | |
| "tokens": [ | |
| "ถูก", | |
| "มาก" | |
| ], | |
| "token_count": 2, | |
| "decoded": "ถูกมาก", | |
| "success": true | |
| } | |
| ] | |
| }, | |
| "complex_thai": { | |
| "passed": 3, | |
| "total": 3, | |
| "details": [ | |
| { | |
| "input": "กรุงเทพมหานคร", | |
| "tokens": [ | |
| "กรุงเทพ", | |
| "มหา", | |
| "นคร" | |
| ], | |
| "token_count": 3, | |
| "decoded": "กรุงเทพมหานคร", | |
| "success": true | |
| }, | |
| { | |
| "input": "ราชมงคลธัญบุรี", | |
| "tokens": [ | |
| "ราช", | |
| "มงคล", | |
| "ธัญ", | |
| "บุรี" | |
| ], | |
| "token_count": 4, | |
| "decoded": "ราชมงคลธัญบุรี", | |
| "success": true | |
| }, | |
| { | |
| "input": "จุฬาลงกรณ์มหาวิทยาลัย", | |
| "tokens": [ | |
| "จุฬาล", | |
| "ง", | |
| "กรณ์", | |
| "มหาวิทยาลัย" | |
| ], | |
| "token_count": 4, | |
| "decoded": "จุฬาลงกรณ์มหาวิทยาลัย", | |
| "success": true | |
| } | |
| ] | |
| }, | |
| "numbers_dates": { | |
| "passed": 3, | |
| "total": 3, | |
| "details": [ | |
| { | |
| "input": "1 มกราคม 2567", | |
| "tokens": [ | |
| "1", | |
| " ", | |
| "มกรา", | |
| "ค", | |
| "ม", | |
| " ", | |
| "2", | |
| "567" | |
| ], | |
| "token_count": 8, | |
| "decoded": "1 มกราคม 2567", | |
| "success": true | |
| }, | |
| { | |
| "input": "เวลา 14:30 น.", | |
| "tokens": [ | |
| "เวลา", | |
| " ", | |
| "1", | |
| "4", | |
| ":", | |
| "30", | |
| " ", | |
| "น", | |
| "." | |
| ], | |
| "token_count": 9, | |
| "decoded": "เวลา 14:30 น.", | |
| "success": true | |
| }, | |
| { | |
| "input": "ราคา 1,234 บาท", | |
| "tokens": [ | |
| "ราคา", | |
| " ", | |
| "1", | |
| ",", | |
| "2", | |
| "34", | |
| " ", | |
| "บาท" | |
| ], | |
| "token_count": 8, | |
| "decoded": "ราคา 1,234 บาท", | |
| "success": true | |
| } | |
| ] | |
| }, | |
| "technology": { | |
| "passed": 3, | |
| "total": 3, | |
| "details": [ | |
| { | |
| "input": "อินเทอร์เน็ต", | |
| "tokens": [ | |
| "อินเทอร์เน็ต" | |
| ], | |
| "token_count": 1, | |
| "decoded": "อินเทอร์เน็ต", | |
| "success": true | |
| }, | |
| { | |
| "input": "โทรศัพท์มือถือ", | |
| "tokens": [ | |
| "โทรศัพท์", | |
| "มือถือ" | |
| ], | |
| "token_count": 2, | |
| "decoded": "โทรศัพท์มือถือ", | |
| "success": true | |
| }, | |
| { | |
| "input": "แอปพลิเคชัน", | |
| "tokens": [ | |
| "แอปพลิเคชั", | |
| "น" | |
| ], | |
| "token_count": 2, | |
| "decoded": "แอปพลิเคชัน", | |
| "success": true | |
| } | |
| ] | |
| } | |
| } | |
| }, | |
| "efficiency": { | |
| "compression_ratios": [ | |
| 3.0, | |
| 2.75, | |
| 2.6470588235294117, | |
| 6.7, | |
| 2.1666666666666665 | |
| ], | |
| "avg_tokens_per_char": 0.30726256983240224, | |
| "vocab_coverage": 0.0010958134307389716, | |
| "details": [ | |
| { | |
| "sentence": "สวัสดี", | |
| "char_count": 6, | |
| "token_count": 2, | |
| "compression_ratio": 3.0, | |
| "tokens": [ | |
| "สวัสด", | |
| "ี" | |
| ] | |
| }, | |
| { | |
| "sentence": "สวัสดีครับ ผมชื่อจอห์น", | |
| "char_count": 22, | |
| "token_count": 8, | |
| "compression_ratio": 2.75, | |
| "tokens": [ | |
| "สวัสด", | |
| "ี", | |
| "ครับ", | |
| " ", | |
| "ผ", | |
| "ม", | |
| "ชื่อ", | |
| "จอห์น" | |
| ] | |
| }, | |
| { | |
| "sentence": "วันนี้อากาศดีมาก ผมจึงไปเดินเล่นที่สวนสาธารณะ", | |
| "char_count": 45, | |
| "token_count": 17, | |
| "compression_ratio": 2.6470588235294117, | |
| "tokens": [ | |
| "วัน", | |
| "นี้", | |
| "อากาศ", | |
| "ด", | |
| "ี", | |
| "มาก", | |
| " ", | |
| "ผ", | |
| "ม", | |
| "จึง", | |
| "ไป", | |
| "เดิน", | |
| "เล่น", | |
| "ที่", | |
| "สวน", | |
| "สาธารณ", | |
| "ะ" | |
| ] | |
| }, | |
| { | |
| "sentence": "พระบาทสมเด็จพระเจ้าอยู่หัวทรงพระกรุณาโปรดเกล้าฯ ให้จัดงานพระราชพิธี", | |
| "char_count": 67, | |
| "token_count": 10, | |
| "compression_ratio": 6.7, | |
| "tokens": [ | |
| "พระบาทสมเด็จพระ", | |
| "เจ้าอยู่หัว", | |
| "ทรง", | |
| "พระกรุณา", | |
| "โปรดเกล้า", | |
| "ฯ ", | |
| "ให้", | |
| "จัด", | |
| "งาน", | |
| "พระราชพิธี" | |
| ] | |
| }, | |
| { | |
| "sentence": "555 อร่อยมากกก กินข้าวยัง? #อาหารไทย 🇹🇭", | |
| "char_count": 39, | |
| "token_count": 18, | |
| "compression_ratio": 2.1666666666666665, | |
| "tokens": [ | |
| "555", | |
| " ", | |
| "อ", | |
| "ร่อย", | |
| "มาก", | |
| "ก", | |
| "ก", | |
| " ", | |
| "กิน", | |
| "ข้าว", | |
| "ยัง", | |
| "?", | |
| " ", | |
| "#", | |
| "อาหาร", | |
| "ไทย", | |
| " ", | |
| "🇹🇭" | |
| ] | |
| } | |
| ] | |
| }, | |
| "overall_accuracy": "24/24" | |
| }, | |
| "features": [ | |
| "No normalization (preserves Thai characters)", | |
| "Smart punctuation handling", | |
| "Mixed Thai-English support", | |
| "Modern vocabulary coverage", | |
| "Efficient compression", | |
| "Direct decoding without artifacts" | |
| ], | |
| "usage_notes": { | |
| "best_decoding": "manual concatenation of non-special tokens", | |
| "recommended_for": [ | |
| "Thai NLP", | |
| "LLM training", | |
| "Text processing", | |
| "Social media analysis" | |
| ], | |
| "avoid": [ | |
| "Text normalization", | |
| "Byte-level fallback", | |
| "Aggressive post-processing" | |
| ] | |
| } | |
| } |