File size: 6,597 Bytes
61c6f2a
fffd3be
a0ba317
cff80e7
 
 
 
 
 
 
 
529fe04
cff80e7
 
 
 
 
9f6019d
a074b70
9f6019d
 
cff80e7
 
13a9b24
cff80e7
 
 
 
49c41eb
eabcfdc
b35ecbb
 
 
 
 
3c584e9
b35ecbb
 
 
 
 
 
 
 
 
 
 
 
49c41eb
 
 
 
 
cff80e7
13a9b24
 
 
 
 
cff80e7
173620b
cff80e7
 
 
 
 
 
 
 
 
 
 
aa7bab3
 
 
 
 
cff80e7
 
502cb8b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cff80e7
 
 
 
 
 
 
f0ac1c7
cff80e7
 
 
944fdd3
 
 
cff80e7
 
 
 
 
 
 
 
 
3ff7370
cff80e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fffd3be
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import html
import os
import random

import streamlit as st
from transformers import AutoTokenizer


##############################
# SETTINGS AND MODEL CHOICES
##############################

# Maps the label shown in the sidebar to the Hugging Face Hub repository whose
# tokenizer is loaded for that label. Labels should name the checkpoint that
# is actually loaded, since the app exists to compare specific tokenizers.
MODEL_CHOICES = {
    "Typhoon (ThaiLLM-8B)": "typhoon-ai/typhoon-s-thaillm-8b-instruct-research-preview",
    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
    "Gemma-2 (9B)": "google/gemma-2-9b-it",
    # Previously labelled "SeaLLM-v3", but the repository below is the v2.5
    # checkpoint; the label now matches what is measured.
    "SeaLLM-v2.5 (7B)": "SeaLLMs/SeaLLM-7B-v2.5",
    "BGE-M3 (Embedding)": "BAAI/bge-m3",
    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased",
    "OpenThaiGPT (7B)": "openthaigpt/openthaigpt1.5-7b-instruct",
    "GPT-2 (Thai)": "flax-community/gpt2-base-thai",
    "Mistral-Nemo (12B)": "mistralai/Mistral-Nemo-Instruct-2407",
}

# Hub access token read from the environment; None when HF_TOKEN is not set.
hf_token = os.getenv("HF_TOKEN")

# use cache
@st.cache_resource
def _load_tokenizer_cached(model_path):
    """Load the tokenizer for ``model_path`` and raise if loading fails.

    This is the cached half of ``load_tokenizer``. It deliberately lets
    exceptions propagate: ``st.cache_resource`` stores whatever a function
    returns, so returning ``None`` on failure from the cached function would
    pin that failure in the cache, while a raised exception stores nothing
    and the next call tries again.
    """
    # SECURITY NOTE(review): trust_remote_code=True allows the Hub repository
    # to execute its own Python code while loading. The repositories come from
    # the fixed MODEL_CHOICES table rather than user input, but confirm that
    # every entry really needs this before keeping it enabled.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        token=hf_token,
        trust_remote_code=True,
    )

    # Some tokenizers ship without a pad token; fall back to the EOS token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return tokenizer


def load_tokenizer(model_path):
    """Return the tokenizer for ``model_path``, or ``None`` if it cannot load.

    Args:
        model_path: Hugging Face Hub repository id of the model.

    Returns:
        The loaded tokenizer. On any loading error an ``st.error`` message is
        shown and ``None`` is returned, so callers must check for ``None``.

    Successful loads are cached for reuse; failed loads are not, so a
    temporary problem (network, missing token) is retried on the next call.
    """
    try:
        return _load_tokenizer_cached(model_path)
    except Exception as e:
        st.error(f"Error loading tokenizer for {model_path}: {e}")
        return None


# Keep the cache-clearing hook that the decorated function exposed before the
# cache was moved onto the helper.
load_tokenizer.clear = _load_tokenizer_cached.clear

# Page configuration runs before anything is drawn: Streamlit documents
# set_page_config as the first Streamlit command of a page, and releases that
# enforce this raise an exception if the token banner below is emitted first.
st.set_page_config(page_title="Thai Tokenizer Visualizer", layout="wide")

# Tell the user up front whether a Hub token was found in the environment.
if hf_token is None:
    st.error("HF_TOKEN is not being detected by the system. Please check 'Settings > Secrets'.")
else:
    st.success("HF_TOKEN successfully loaded from environment!")


def get_random_color():
    """Return a background colour picked at random from a fixed palette.

    Returns:
        One of seven hex colour strings (for example ``"#FFD1DC"``).
    """
    colors = ["#FFD1DC", "#B2F2BB", "#A5D8FF", "#FFEC99", "#FFD8A8", "#D0EBFF", "#EEBEE1"]
    return random.choice(colors)


##############################
# UI
##############################

# page title
st.title("Thai Tokenizer Multi-Benchmark")
# Introductory blurb rendered directly under the page title.
_INTRO_MARKDOWN = """
While Tokenizer Visualizers are standard tools in the global AI landscape, 
there is a significant gap when it comes to the Thai language, especially regarding official and legal contexts. 
Standard models often fail to capture the nuances of complex Thai bureaucratic phrasing and long compound nouns, 
leading to 'Token Inflation'—where fragmented tokenization results in 'Hidden Costs' and significant performance loss. 
This app focuses on comparing how different LLMs 'see' Thai text. Efficient tokenization (lower Token Count) 
usually leads to lower inference costs and better performance for Thai language tasks.
"""
st.markdown(_INTRO_MARKDOWN)

# Background notes, shown inside an expander that starts collapsed.
_BACKGROUND_MARKDOWN = """
    ### **The Problem: Unstructured Thai Government Data**
    When processing **OCR text from official Thai documents**, I face a unique challenge. 
    Thai is a non-segmented language (no spaces between words and with those Thai numerics), and legal/official 
    vocabulary is highly complex.

    ### **What is this?**
    This Arena helps visualize which LLM "understands" Thai document structures most 
    efficiently. A "good" tokenizer sees words as meaningful units; a "bad" one breaks 
    them into meaningless characters.

    ### **Why does this matter?**
    *   **Cost Efficiency:** Models that use fewer tokens to represent the same text are cheaper to run.
    *   **Memory (Context):** Efficient tokenization allows you to feed longer documents into a model without hitting memory limits.
    *   **Accuracy:** Better tokenization leads to fewer hallucinations in RAG (Retrieval-Augmented Generation) systems.

    ### **How to use**
    1. Select models from the sidebar.
    2. Paste your Thai text.
    3. Look for the most cost-effective model for your data (better segmentation and lower number of tokens).
    """
background_panel = st.expander("Why Tokenization matters for Thai OCR & Documents?", expanded=False)
background_panel.markdown(_BACKGROUND_MARKDOWN)

# Sidebar: choose which tokenizers to compare (options are the MODEL_CHOICES labels).
st.sidebar.header("Configuration")
selected_models = st.sidebar.multiselect(
    "Select Models:",
    options=list(MODEL_CHOICES),
    default=["WangchanBERTa", "Llama-3 (8B)"],
)

# Text to tokenize, pre-filled with a sample passage of Thai legal text.
_DEFAULT_INPUT = "ข้อ ๔ ให้เพิ่มความต่อไปนี้เป็นวรรคสองของข้อ ๗ แห่งระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด พ.ศ. ๒๕๔๓ ซึ่งแก้ไขเพิ่มเติมโดยระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด (ฉบับที่ ๒) พ.ศ. ๒๕๕๖"
input_text = st.text_area("Input Thai Text:", value=_DEFAULT_INPUT, height=250)

# result
if selected_models:
    # One column per selected model so the tokenizations sit side by side.
    cols = st.columns(len(selected_models))

    for col, model_name in zip(cols, selected_models):
        with col:
            st.subheader(model_name)
            try:
                tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
                if tokenizer is None:
                    # load_tokenizer already displayed the reason; skip this
                    # column instead of failing on None with an unrelated
                    # AttributeError message.
                    continue

                # NOTE(review): encode() runs with its defaults, so any special
                # tokens the tokenizer adds are included in the count below;
                # confirm that is the intended basis for comparison.
                tokens = tokenizer.encode(input_text)
                # Decode each id on its own to recover the text of one token.
                decoded_tokens = [tokenizer.decode([t]) for t in tokens]

                # num of tokens to compare
                st.metric("Total Tokens", len(tokens))

                # show visual: one coloured chip per token
                chips = []
                for t in decoded_tokens:
                    color = get_random_color()
                    # Escape first: the chips are rendered with
                    # unsafe_allow_html, so a token such as "<s>" would
                    # otherwise be parsed as an HTML tag instead of shown.
                    # Then make whitespace visible: a non-breaking space keeps
                    # space-only tokens from collapsing, and "↵" marks newlines.
                    display_token = (
                        html.escape(t).replace(" ", "\u00a0").replace("\n", "↵")
                    )
                    chips.append(
                        f'<span style="background-color: {color}; padding: 2px 6px; margin: 2px; border-radius: 4px; display: inline-block; color: black; font-family: monospace; border: 1px solid #ddd;">{display_token}</span>'
                    )

                st.markdown("".join(chips), unsafe_allow_html=True)

            except Exception as e:
                st.error(f"Error loading {model_name}: {e}")
else:
    st.info("Please select at least one model from the sidebar.")