Thaitokenizer / metadata.json
JonusNattapong's picture
Upload folder using huggingface_hub
55f2143 verified
{
"model_info": {
"version": "2.0",
"model_type": "unigram",
"vocab_size": 35590,
"creation_date": "2025-07-02",
"language": "thai",
"description": "Advanced Thai tokenizer with improved handling of Thai text, mixed content, and modern vocabulary"
},
"performance": {
"test_results": {
"overall": {
"passed": 24,
"total": 24
},
"categories": {
"basic_thai": {
"passed": 4,
"total": 4,
"details": [
{
"input": "สวัสดี",
"tokens": [
"สวัสด",
"ี"
],
"token_count": 2,
"decoded": "สวัสดี",
"success": true
},
{
"input": "ขอบคุณ",
"tokens": [
"ขอบ",
"คุณ"
],
"token_count": 2,
"decoded": "ขอบคุณ",
"success": true
},
{
"input": "ครับ",
"tokens": [
"ครับ"
],
"token_count": 1,
"decoded": "ครับ",
"success": true
},
{
"input": "ค่ะ",
"tokens": [
"ค่ะ"
],
"token_count": 1,
"decoded": "ค่ะ",
"success": true
}
]
},
"thai_with_spaces": {
"passed": 3,
"total": 3,
"details": [
{
"input": "กิน ข้าว อร่อย",
"tokens": [
"กิน",
" ",
"ข้าว",
" ",
"อ",
"ร่อย"
],
"token_count": 6,
"decoded": "กิน ข้าว อร่อย",
"success": true
},
{
"input": "วันนี้ อากาศ ดี",
"tokens": [
"วัน",
"นี้",
" ",
"อากาศ",
" ",
"ด",
"ี"
],
"token_count": 7,
"decoded": "วันนี้ อากาศ ดี",
"success": true
},
{
"input": "ผม ชื่อ จอห์น",
"tokens": [
"ผ",
"ม",
" ",
"ชื่อ",
" ",
"จอห์น"
],
"token_count": 6,
"decoded": "ผม ชื่อ จอห์น",
"success": true
}
]
},
"mixed_content": {
"passed": 3,
"total": 3,
"details": [
{
"input": "123 สวัสดี abc",
"tokens": [
"1",
"2",
"3",
" ",
"สวัสด",
"ี",
" ",
"abc"
],
"token_count": 8,
"decoded": "123 สวัสดี abc",
"success": true
},
{
"input": "Hello ครับ",
"tokens": [
"Hello",
" ",
"ครับ"
],
"token_count": 3,
"decoded": "Hello ครับ",
"success": true
},
{
"input": "COVID-19 ระบาด",
"tokens": [
"COVID",
"-",
"1",
"9",
" ",
"ระบาด"
],
"token_count": 6,
"decoded": "COVID-19 ระบาด",
"success": true
}
]
},
"formal_thai": {
"passed": 2,
"total": 2,
"details": [
{
"input": "พระบาทสมเด็จพระเจ้าอยู่หัว",
"tokens": [
"พระบาทสมเด็จพระ",
"เจ้าอยู่หัว"
],
"token_count": 2,
"decoded": "พระบาทสมเด็จพระเจ้าอยู่หัว",
"success": true
},
{
"input": "การประชุมสำคัญ",
"tokens": [
"การประชุม",
"สำคัญ"
],
"token_count": 2,
"decoded": "การประชุมสำคัญ",
"success": true
}
]
},
"casual_thai": {
"passed": 3,
"total": 3,
"details": [
{
"input": "อร่อยจัง",
"tokens": [
"อ",
"ร่อย",
"จัง"
],
"token_count": 3,
"decoded": "อร่อยจัง",
"success": true
},
{
"input": "แพงมาก",
"tokens": [
"แพง",
"มาก"
],
"token_count": 2,
"decoded": "แพงมาก",
"success": true
},
{
"input": "ถูกมาก",
"tokens": [
"ถูก",
"มาก"
],
"token_count": 2,
"decoded": "ถูกมาก",
"success": true
}
]
},
"complex_thai": {
"passed": 3,
"total": 3,
"details": [
{
"input": "กรุงเทพมหานคร",
"tokens": [
"กรุงเทพ",
"มหา",
"นคร"
],
"token_count": 3,
"decoded": "กรุงเทพมหานคร",
"success": true
},
{
"input": "ราชมงคลธัญบุรี",
"tokens": [
"ราช",
"มงคล",
"ธัญ",
"บุรี"
],
"token_count": 4,
"decoded": "ราชมงคลธัญบุรี",
"success": true
},
{
"input": "จุฬาลงกรณ์มหาวิทยาลัย",
"tokens": [
"จุฬาล",
"ง",
"กรณ์",
"มหาวิทยาลัย"
],
"token_count": 4,
"decoded": "จุฬาลงกรณ์มหาวิทยาลัย",
"success": true
}
]
},
"numbers_dates": {
"passed": 3,
"total": 3,
"details": [
{
"input": "1 มกราคม 2567",
"tokens": [
"1",
" ",
"มกรา",
"ค",
"ม",
" ",
"2",
"567"
],
"token_count": 8,
"decoded": "1 มกราคม 2567",
"success": true
},
{
"input": "เวลา 14:30 น.",
"tokens": [
"เวลา",
" ",
"1",
"4",
":",
"30",
" ",
"น",
"."
],
"token_count": 9,
"decoded": "เวลา 14:30 น.",
"success": true
},
{
"input": "ราคา 1,234 บาท",
"tokens": [
"ราคา",
" ",
"1",
",",
"2",
"34",
" ",
"บาท"
],
"token_count": 8,
"decoded": "ราคา 1,234 บาท",
"success": true
}
]
},
"technology": {
"passed": 3,
"total": 3,
"details": [
{
"input": "อินเทอร์เน็ต",
"tokens": [
"อินเทอร์เน็ต"
],
"token_count": 1,
"decoded": "อินเทอร์เน็ต",
"success": true
},
{
"input": "โทรศัพท์มือถือ",
"tokens": [
"โทรศัพท์",
"มือถือ"
],
"token_count": 2,
"decoded": "โทรศัพท์มือถือ",
"success": true
},
{
"input": "แอปพลิเคชัน",
"tokens": [
"แอปพลิเคชั",
"น"
],
"token_count": 2,
"decoded": "แอปพลิเคชัน",
"success": true
}
]
}
}
},
"efficiency": {
"compression_ratios": [
3.0,
2.75,
2.6470588235294117,
6.7,
2.1666666666666665
],
"avg_tokens_per_char": 0.30726256983240224,
"vocab_coverage": 0.0010958134307389716,
"details": [
{
"sentence": "สวัสดี",
"char_count": 6,
"token_count": 2,
"compression_ratio": 3.0,
"tokens": [
"สวัสด",
"ี"
]
},
{
"sentence": "สวัสดีครับ ผมชื่อจอห์น",
"char_count": 22,
"token_count": 8,
"compression_ratio": 2.75,
"tokens": [
"สวัสด",
"ี",
"ครับ",
" ",
"ผ",
"ม",
"ชื่อ",
"จอห์น"
]
},
{
"sentence": "วันนี้อากาศดีมาก ผมจึงไปเดินเล่นที่สวนสาธารณะ",
"char_count": 45,
"token_count": 17,
"compression_ratio": 2.6470588235294117,
"tokens": [
"วัน",
"นี้",
"อากาศ",
"ด",
"ี",
"มาก",
" ",
"ผ",
"ม",
"จึง",
"ไป",
"เดิน",
"เล่น",
"ที่",
"สวน",
"สาธารณ",
"ะ"
]
},
{
"sentence": "พระบาทสมเด็จพระเจ้าอยู่หัวทรงพระกรุณาโปรดเกล้าฯ ให้จัดงานพระราชพิธี",
"char_count": 67,
"token_count": 10,
"compression_ratio": 6.7,
"tokens": [
"พระบาทสมเด็จพระ",
"เจ้าอยู่หัว",
"ทรง",
"พระกรุณา",
"โปรดเกล้า",
"ฯ ",
"ให้",
"จัด",
"งาน",
"พระราชพิธี"
]
},
{
"sentence": "555 อร่อยมากกก กินข้าวยัง? #อาหารไทย 🇹🇭",
"char_count": 39,
"token_count": 18,
"compression_ratio": 2.1666666666666665,
"tokens": [
"555",
" ",
"อ",
"ร่อย",
"มาก",
"ก",
"ก",
" ",
"กิน",
"ข้าว",
"ยัง",
"?",
" ",
"#",
"อาหาร",
"ไทย",
" ",
"🇹🇭"
]
}
]
},
"overall_accuracy": "24/24"
},
"features": [
"No normalization (preserves Thai characters)",
"Smart punctuation handling",
"Mixed Thai-English support",
"Modern vocabulary coverage",
"Efficient compression",
"Direct decoding without artifacts"
],
"usage_notes": {
"best_decoding": "manual concatenation of non-special tokens",
"recommended_for": [
"Thai NLP",
"LLM training",
"Text processing",
"Social media analysis"
],
"avoid": [
"Text normalization",
"Byte-level fallback",
"Aggressive post-processing"
]
}
}