Thaitokenizer / metadata.json

Upload folder using huggingface_hub

55f2143 verified 6 months ago

14.2 kB

	{
	"model_info": {
	"version": "2.0",
	"model_type": "unigram",
	"vocab_size": 35590,
	"creation_date": "2025-07-02",
	"language": "thai",
	"description": "Advanced Thai tokenizer with improved handling of Thai text, mixed content, and modern vocabulary"
	},
	"performance": {
	"test_results": {
	"overall": {
	"passed": 24,
	"total": 24
	},
	"categories": {
	"basic_thai": {
	"passed": 4,
	"total": 4,
	"details": [
	{
	"input": "สวัสดี",
	"tokens": [
	"สวัสด",
	"ี"
	],
	"token_count": 2,
	"decoded": "สวัสดี",
	"success": true
	},
	{
	"input": "ขอบคุณ",
	"tokens": [
	"ขอบ",
	"คุณ"
	],
	"token_count": 2,
	"decoded": "ขอบคุณ",
	"success": true
	},
	{
	"input": "ครับ",
	"tokens": [
	"ครับ"
	],
	"token_count": 1,
	"decoded": "ครับ",
	"success": true
	},
	{
	"input": "ค่ะ",
	"tokens": [
	"ค่ะ"
	],
	"token_count": 1,
	"decoded": "ค่ะ",
	"success": true
	}
	]
	},
	"thai_with_spaces": {
	"passed": 3,
	"total": 3,
	"details": [
	{
	"input": "กิน ข้าว อร่อย",
	"tokens": [
	"กิน",
	" ",
	"ข้าว",
	" ",
	"อ",
	"ร่อย"
	],
	"token_count": 6,
	"decoded": "กิน ข้าว อร่อย",
	"success": true
	},
	{
	"input": "วันนี้ อากาศ ดี",
	"tokens": [
	"วัน",
	"นี้",
	" ",
	"อากาศ",
	" ",
	"ด",
	"ี"
	],
	"token_count": 7,
	"decoded": "วันนี้ อากาศ ดี",
	"success": true
	},
	{
	"input": "ผม ชื่อ จอห์น",
	"tokens": [
	"ผ",
	"ม",
	" ",
	"ชื่อ",
	" ",
	"จอห์น"
	],
	"token_count": 6,
	"decoded": "ผม ชื่อ จอห์น",
	"success": true
	}
	]
	},
	"mixed_content": {
	"passed": 3,
	"total": 3,
	"details": [
	{
	"input": "123 สวัสดี abc",
	"tokens": [
	"1",
	"2",
	"3",
	" ",
	"สวัสด",
	"ี",
	" ",
	"abc"
	],
	"token_count": 8,
	"decoded": "123 สวัสดี abc",
	"success": true
	},
	{
	"input": "Hello ครับ",
	"tokens": [
	"Hello",
	" ",
	"ครับ"
	],
	"token_count": 3,
	"decoded": "Hello ครับ",
	"success": true
	},
	{
	"input": "COVID-19 ระบาด",
	"tokens": [
	"COVID",
	"-",
	"1",
	"9",
	" ",
	"ระบาด"
	],
	"token_count": 6,
	"decoded": "COVID-19 ระบาด",
	"success": true
	}
	]
	},
	"formal_thai": {
	"passed": 2,
	"total": 2,
	"details": [
	{
	"input": "พระบาทสมเด็จพระเจ้าอยู่หัว",
	"tokens": [
	"พระบาทสมเด็จพระ",
	"เจ้าอยู่หัว"
	],
	"token_count": 2,
	"decoded": "พระบาทสมเด็จพระเจ้าอยู่หัว",
	"success": true
	},
	{
	"input": "การประชุมสำคัญ",
	"tokens": [
	"การประชุม",
	"สำคัญ"
	],
	"token_count": 2,
	"decoded": "การประชุมสำคัญ",
	"success": true
	}
	]
	},
	"casual_thai": {
	"passed": 3,
	"total": 3,
	"details": [
	{
	"input": "อร่อยจัง",
	"tokens": [
	"อ",
	"ร่อย",
	"จัง"
	],
	"token_count": 3,
	"decoded": "อร่อยจัง",
	"success": true
	},
	{
	"input": "แพงมาก",
	"tokens": [
	"แพง",
	"มาก"
	],
	"token_count": 2,
	"decoded": "แพงมาก",
	"success": true
	},
	{
	"input": "ถูกมาก",
	"tokens": [
	"ถูก",
	"มาก"
	],
	"token_count": 2,
	"decoded": "ถูกมาก",
	"success": true
	}
	]
	},
	"complex_thai": {
	"passed": 3,
	"total": 3,
	"details": [
	{
	"input": "กรุงเทพมหานคร",
	"tokens": [
	"กรุงเทพ",
	"มหา",
	"นคร"
	],
	"token_count": 3,
	"decoded": "กรุงเทพมหานคร",
	"success": true
	},
	{
	"input": "ราชมงคลธัญบุรี",
	"tokens": [
	"ราช",
	"มงคล",
	"ธัญ",
	"บุรี"
	],
	"token_count": 4,
	"decoded": "ราชมงคลธัญบุรี",
	"success": true
	},
	{
	"input": "จุฬาลงกรณ์มหาวิทยาลัย",
	"tokens": [
	"จุฬาล",
	"ง",
	"กรณ์",
	"มหาวิทยาลัย"
	],
	"token_count": 4,
	"decoded": "จุฬาลงกรณ์มหาวิทยาลัย",
	"success": true
	}
	]
	},
	"numbers_dates": {
	"passed": 3,
	"total": 3,
	"details": [
	{
	"input": "1 มกราคม 2567",
	"tokens": [
	"1",
	" ",
	"มกรา",
	"ค",
	"ม",
	" ",
	"2",
	"567"
	],
	"token_count": 8,
	"decoded": "1 มกราคม 2567",
	"success": true
	},
	{
	"input": "เวลา 14:30 น.",
	"tokens": [
	"เวลา",
	" ",
	"1",
	"4",
	":",
	"30",
	" ",
	"น",
	"."
	],
	"token_count": 9,
	"decoded": "เวลา 14:30 น.",
	"success": true
	},
	{
	"input": "ราคา 1,234 บาท",
	"tokens": [
	"ราคา",
	" ",
	"1",
	",",
	"2",
	"34",
	" ",
	"บาท"
	],
	"token_count": 8,
	"decoded": "ราคา 1,234 บาท",
	"success": true
	}
	]
	},
	"technology": {
	"passed": 3,
	"total": 3,
	"details": [
	{
	"input": "อินเทอร์เน็ต",
	"tokens": [
	"อินเทอร์เน็ต"
	],
	"token_count": 1,
	"decoded": "อินเทอร์เน็ต",
	"success": true
	},
	{
	"input": "โทรศัพท์มือถือ",
	"tokens": [
	"โทรศัพท์",
	"มือถือ"
	],
	"token_count": 2,
	"decoded": "โทรศัพท์มือถือ",
	"success": true
	},
	{
	"input": "แอปพลิเคชัน",
	"tokens": [
	"แอปพลิเคชั",
	"น"
	],
	"token_count": 2,
	"decoded": "แอปพลิเคชัน",
	"success": true
	}
	]
	}
	}
	},
	"efficiency": {
	"compression_ratios": [
	3.0,
	2.75,
	2.6470588235294117,
	6.7,
	2.1666666666666665
	],
	"avg_tokens_per_char": 0.30726256983240224,
	"vocab_coverage": 0.0010958134307389716,
	"details": [
	{
	"sentence": "สวัสดี",
	"char_count": 6,
	"token_count": 2,
	"compression_ratio": 3.0,
	"tokens": [
	"สวัสด",
	"ี"
	]
	},
	{
	"sentence": "สวัสดีครับ ผมชื่อจอห์น",
	"char_count": 22,
	"token_count": 8,
	"compression_ratio": 2.75,
	"tokens": [
	"สวัสด",
	"ี",
	"ครับ",
	" ",
	"ผ",
	"ม",
	"ชื่อ",
	"จอห์น"
	]
	},
	{
	"sentence": "วันนี้อากาศดีมาก ผมจึงไปเดินเล่นที่สวนสาธารณะ",
	"char_count": 45,
	"token_count": 17,
	"compression_ratio": 2.6470588235294117,
	"tokens": [
	"วัน",
	"นี้",
	"อากาศ",
	"ด",
	"ี",
	"มาก",
	" ",
	"ผ",
	"ม",
	"จึง",
	"ไป",
	"เดิน",
	"เล่น",
	"ที่",
	"สวน",
	"สาธารณ",
	"ะ"
	]
	},
	{
	"sentence": "พระบาทสมเด็จพระเจ้าอยู่หัวทรงพระกรุณาโปรดเกล้าฯ ให้จัดงานพระราชพิธี",
	"char_count": 67,
	"token_count": 10,
	"compression_ratio": 6.7,
	"tokens": [
	"พระบาทสมเด็จพระ",
	"เจ้าอยู่หัว",
	"ทรง",
	"พระกรุณา",
	"โปรดเกล้า",
	"ฯ ",
	"ให้",
	"จัด",
	"งาน",
	"พระราชพิธี"
	]
	},
	{
	"sentence": "555 อร่อยมากกก กินข้าวยัง? #อาหารไทย 🇹🇭",
	"char_count": 39,
	"token_count": 18,
	"compression_ratio": 2.1666666666666665,
	"tokens": [
	"555",
	" ",
	"อ",
	"ร่อย",
	"มาก",
	"ก",
	"ก",
	" ",
	"กิน",
	"ข้าว",
	"ยัง",
	"?",
	" ",
	"#",
	"อาหาร",
	"ไทย",
	" ",
	"🇹🇭"
	]
	}
	]
	},
	"overall_accuracy": "24/24"
	},
	"features": [
	"No normalization (preserves Thai characters)",
	"Smart punctuation handling",
	"Mixed Thai-English support",
	"Modern vocabulary coverage",
	"Efficient compression",
	"Direct decoding without artifacts"
	],
	"usage_notes": {
	"best_decoding": "manual concatenation of non-special tokens",
	"recommended_for": [
	"Thai NLP",
	"LLM training",
	"Text processing",
	"Social media analysis"
	],
	"avoid": [
	"Text normalization",
	"Byte-level fallback",
	"Aggressive post-processing"
	]
	}
	}