{
  "model_info": {
    "version": "2.0",
    "model_type": "unigram",
    "vocab_size": 35590,
    "creation_date": "2025-07-02",
    "language": "thai",
    "description": "Advanced Thai tokenizer with improved handling of Thai text, mixed content, and modern vocabulary"
  },
  "performance": {
    "test_results": {
      "overall": {
        "passed": 24,
        "total": 24
      },
      "categories": {
        "basic_thai": {
          "passed": 4,
          "total": 4,
          "details": [
            {
              "input": "สวัสดี",
              "tokens": [
                "สวัสด",
                "ี"
              ],
              "token_count": 2,
              "decoded": "สวัสดี",
              "success": true
            },
            {
              "input": "ขอบคุณ",
              "tokens": [
                "ขอบ",
                "คุณ"
              ],
              "token_count": 2,
              "decoded": "ขอบคุณ",
              "success": true
            },
            {
              "input": "ครับ",
              "tokens": [
                "ครับ"
              ],
              "token_count": 1,
              "decoded": "ครับ",
              "success": true
            },
            {
              "input": "ค่ะ",
              "tokens": [
                "ค่ะ"
              ],
              "token_count": 1,
              "decoded": "ค่ะ",
              "success": true
            }
          ]
        },
        "thai_with_spaces": {
          "passed": 3,
          "total": 3,
          "details": [
            {
              "input": "กิน ข้าว อร่อย",
              "tokens": [
                "กิน",
                " ",
                "ข้าว",
                " ",
                "อ",
                "ร่อย"
              ],
              "token_count": 6,
              "decoded": "กิน ข้าว อร่อย",
              "success": true
            },
            {
              "input": "วันนี้ อากาศ ดี",
              "tokens": [
                "วัน",
                "นี้",
                " ",
                "อากาศ",
                " ",
                "ด",
                "ี"
              ],
              "token_count": 7,
              "decoded": "วันนี้ อากาศ ดี",
              "success": true
            },
            {
              "input": "ผม ชื่อ จอห์น",
              "tokens": [
                "ผ",
                "ม",
                " ",
                "ชื่อ",
                " ",
                "จอห์น"
              ],
              "token_count": 6,
              "decoded": "ผม ชื่อ จอห์น",
              "success": true
            }
          ]
        },
        "mixed_content": {
          "passed": 3,
          "total": 3,
          "details": [
            {
              "input": "123 สวัสดี abc",
              "tokens": [
                "1",
                "2",
                "3",
                " ",
                "สวัสด",
                "ี",
                " ",
                "abc"
              ],
              "token_count": 8,
              "decoded": "123 สวัสดี abc",
              "success": true
            },
            {
              "input": "Hello ครับ",
              "tokens": [
                "Hello",
                " ",
                "ครับ"
              ],
              "token_count": 3,
              "decoded": "Hello ครับ",
              "success": true
            },
            {
              "input": "COVID-19 ระบาด",
              "tokens": [
                "COVID",
                "-",
                "1",
                "9",
                " ",
                "ระบาด"
              ],
              "token_count": 6,
              "decoded": "COVID-19 ระบาด",
              "success": true
            }
          ]
        },
        "formal_thai": {
          "passed": 2,
          "total": 2,
          "details": [
            {
              "input": "พระบาทสมเด็จพระเจ้าอยู่หัว",
              "tokens": [
                "พระบาทสมเด็จพระ",
                "เจ้าอยู่หัว"
              ],
              "token_count": 2,
              "decoded": "พระบาทสมเด็จพระเจ้าอยู่หัว",
              "success": true
            },
            {
              "input": "การประชุมสำคัญ",
              "tokens": [
                "การประชุม",
                "สำคัญ"
              ],
              "token_count": 2,
              "decoded": "การประชุมสำคัญ",
              "success": true
            }
          ]
        },
        "casual_thai": {
          "passed": 3,
          "total": 3,
          "details": [
            {
              "input": "อร่อยจัง",
              "tokens": [
                "อ",
                "ร่อย",
                "จัง"
              ],
              "token_count": 3,
              "decoded": "อร่อยจัง",
              "success": true
            },
            {
              "input": "แพงมาก",
              "tokens": [
                "แพง",
                "มาก"
              ],
              "token_count": 2,
              "decoded": "แพงมาก",
              "success": true
            },
            {
              "input": "ถูกมาก",
              "tokens": [
                "ถูก",
                "มาก"
              ],
              "token_count": 2,
              "decoded": "ถูกมาก",
              "success": true
            }
          ]
        },
        "complex_thai": {
          "passed": 3,
          "total": 3,
          "details": [
            {
              "input": "กรุงเทพมหานคร",
              "tokens": [
                "กรุงเทพ",
                "มหา",
                "นคร"
              ],
              "token_count": 3,
              "decoded": "กรุงเทพมหานคร",
              "success": true
            },
            {
              "input": "ราชมงคลธัญบุรี",
              "tokens": [
                "ราช",
                "มงคล",
                "ธัญ",
                "บุรี"
              ],
              "token_count": 4,
              "decoded": "ราชมงคลธัญบุรี",
              "success": true
            },
            {
              "input": "จุฬาลงกรณ์มหาวิทยาลัย",
              "tokens": [
                "จุฬาล",
                "ง",
                "กรณ์",
                "มหาวิทยาลัย"
              ],
              "token_count": 4,
              "decoded": "จุฬาลงกรณ์มหาวิทยาลัย",
              "success": true
            }
          ]
        },
        "numbers_dates": {
          "passed": 3,
          "total": 3,
          "details": [
            {
              "input": "1 มกราคม 2567",
              "tokens": [
                "1",
                " ",
                "มกรา",
                "ค",
                "ม",
                " ",
                "2",
                "567"
              ],
              "token_count": 8,
              "decoded": "1 มกราคม 2567",
              "success": true
            },
            {
              "input": "เวลา 14:30 น.",
              "tokens": [
                "เวลา",
                " ",
                "1",
                "4",
                ":",
                "30",
                " ",
                "น",
                "."
              ],
              "token_count": 9,
              "decoded": "เวลา 14:30 น.",
              "success": true
            },
            {
              "input": "ราคา 1,234 บาท",
              "tokens": [
                "ราคา",
                " ",
                "1",
                ",",
                "2",
                "34",
                " ",
                "บาท"
              ],
              "token_count": 8,
              "decoded": "ราคา 1,234 บาท",
              "success": true
            }
          ]
        },
        "technology": {
          "passed": 3,
          "total": 3,
          "details": [
            {
              "input": "อินเทอร์เน็ต",
              "tokens": [
                "อินเทอร์เน็ต"
              ],
              "token_count": 1,
              "decoded": "อินเทอร์เน็ต",
              "success": true
            },
            {
              "input": "โทรศัพท์มือถือ",
              "tokens": [
                "โทรศัพท์",
                "มือถือ"
              ],
              "token_count": 2,
              "decoded": "โทรศัพท์มือถือ",
              "success": true
            },
            {
              "input": "แอปพลิเคชัน",
              "tokens": [
                "แอปพลิเคชั",
                "น"
              ],
              "token_count": 2,
              "decoded": "แอปพลิเคชัน",
              "success": true
            }
          ]
        }
      }
    },
    "efficiency": {
      "compression_ratios": [
        3.0,
        2.75,
        2.6470588235294117,
        6.7,
        2.1666666666666665
      ],
      "avg_tokens_per_char": 0.30726256983240224,
      "vocab_coverage": 0.0010958134307389716,
      "details": [
        {
          "sentence": "สวัสดี",
          "char_count": 6,
          "token_count": 2,
          "compression_ratio": 3.0,
          "tokens": [
            "สวัสด",
            "ี"
          ]
        },
        {
          "sentence": "สวัสดีครับ ผมชื่อจอห์น",
          "char_count": 22,
          "token_count": 8,
          "compression_ratio": 2.75,
          "tokens": [
            "สวัสด",
            "ี",
            "ครับ",
            " ",
            "ผ",
            "ม",
            "ชื่อ",
            "จอห์น"
          ]
        },
        {
          "sentence": "วันนี้อากาศดีมาก ผมจึงไปเดินเล่นที่สวนสาธารณะ",
          "char_count": 45,
          "token_count": 17,
          "compression_ratio": 2.6470588235294117,
          "tokens": [
            "วัน",
            "นี้",
            "อากาศ",
            "ด",
            "ี",
            "มาก",
            " ",
            "ผ",
            "ม",
            "จึง",
            "ไป",
            "เดิน",
            "เล่น",
            "ที่",
            "สวน",
            "สาธารณ",
            "ะ"
          ]
        },
        {
          "sentence": "พระบาทสมเด็จพระเจ้าอยู่หัวทรงพระกรุณาโปรดเกล้าฯ ให้จัดงานพระราชพิธี",
          "char_count": 67,
          "token_count": 10,
          "compression_ratio": 6.7,
          "tokens": [
            "พระบาทสมเด็จพระ",
            "เจ้าอยู่หัว",
            "ทรง",
            "พระกรุณา",
            "โปรดเกล้า",
            "ฯ ",
            "ให้",
            "จัด",
            "งาน",
            "พระราชพิธี"
          ]
        },
        {
          "sentence": "555 อร่อยมากกก กินข้าวยัง? #อาหารไทย 🇹🇭",
          "char_count": 39,
          "token_count": 18,
          "compression_ratio": 2.1666666666666665,
          "tokens": [
            "555",
            " ",
            "อ",
            "ร่อย",
            "มาก",
            "ก",
            "ก",
            " ",
            "กิน",
            "ข้าว",
            "ยัง",
            "?",
            " ",
            "#",
            "อาหาร",
            "ไทย",
            " ",
            "🇹🇭"
          ]
        }
      ]
    },
    "overall_accuracy": "24/24"
  },
  "features": [
    "No normalization (preserves Thai characters)",
    "Smart punctuation handling",
    "Mixed Thai-English support",
    "Modern vocabulary coverage",
    "Efficient compression",
    "Direct decoding without artifacts"
  ],
  "usage_notes": {
    "best_decoding": "manual concatenation of non-special tokens",
    "recommended_for": [
      "Thai NLP",
      "LLM training",
      "Text processing",
      "Social media analysis"
    ],
    "avoid": [
      "Text normalization",
      "Byte-level fallback",
      "Aggressive post-processing"
    ]
  }
}