Thai-LLM-Token-Comparison / src /streamlit_app.py
ll-monkey's picture
Update src/streamlit_app.py
f0ac1c7 verified
import html
import os
import random

import streamlit as st
from transformers import AutoTokenizer
##############################
# SETTINGS AND MODEL CHOICES
##############################
# Maps the label shown in the UI to the Hugging Face Hub repository whose
# tokenizer is loaded for that column.
MODEL_CHOICES = {
    "Typhoon (ThaiLLM-8B)": "typhoon-ai/typhoon-s-thaillm-8b-instruct-research-preview",
    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
    "Gemma-2 (9B)": "google/gemma-2-9b-it",
    # The label names the checkpoint that is actually loaded (v2.5, 7B) so the
    # comparison column is not attributed to a different model generation.
    "SeaLLM-v2.5 (7B)": "SeaLLMs/SeaLLM-7B-v2.5",
    "BGE-M3 (Embedding)": "BAAI/bge-m3",
    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased",
    "OpenThaiGPT (7B)": "openthaigpt/openthaigpt1.5-7b-instruct",
    "GPT-2 (Thai)": "flax-community/gpt2-base-thai",
    "Mistral-Nemo (12B)": "mistralai/Mistral-Nemo-Instruct-2407",
}

# Hugging Face access token read from the environment; None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")
# use cache
@st.cache_resource
def _load_tokenizer_cached(model_path):
    """Build the tokenizer for ``model_path``, letting any failure propagate.

    Only successful loads should live in the resource cache, so this helper
    deliberately does not catch exceptions: returning ``None`` from a cached
    function would store the failure as the result for that ``model_path``.
    """
    # NOTE(review): trust_remote_code=True lets the Hub repository run its own
    # Python code during loading; keep MODEL_CHOICES limited to trusted repos.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        token=hf_token,
        trust_remote_code=True,
    )
    # Fall back to the EOS token when the tokenizer defines no padding token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def load_tokenizer(model_path):
    """Return the tokenizer for ``model_path``, or ``None`` if it cannot be loaded.

    Args:
        model_path: Hugging Face Hub repository id (a value of MODEL_CHOICES).

    Returns:
        The loaded tokenizer, or ``None`` after reporting the problem with
        ``st.error``. A failed load is retried on the next call instead of
        being served from the cache.
    """
    try:
        return _load_tokenizer_cached(model_path)
    except Exception as e:  # UI boundary: report the failure and degrade
        st.error(f"Error loading tokenizer for {model_path}: {e}")
        return None
# Configure the page before any other Streamlit element is emitted: older
# Streamlit releases reject set_page_config() unless it is the first Streamlit
# command of the script, and the token banner below counts as one.
st.set_page_config(page_title="Thai Tokenizer Visualizer", layout="wide")

# Tell the user whether an access token was found in the environment.
if hf_token is None:
    st.error("HF_TOKEN is not being detected by the system. Please check 'Settings > Secrets'.")
else:
    st.success("HF_TOKEN successfully loaded from environment!")


def get_random_color():
    """Return one randomly chosen pastel hex colour from a fixed palette."""
    colors = ["#FFD1DC", "#B2F2BB", "#A5D8FF", "#FFEC99", "#FFD8A8", "#D0EBFF", "#EEBEE1"]
    return random.choice(colors)


##############################
# UI
##############################
# page title
# Static copy for the landing section, kept apart from the rendering calls.
_APP_TITLE = "Thai Tokenizer Multi-Benchmark"

_INTRO_MARKDOWN = """
While Tokenizer Visualizers are standard tools in the global AI landscape,
there is a significant gap when it comes to the Thai language, especially regarding official and legal contexts.
Standard models often fail to capture the nuances of complex Thai bureaucratic phrasing and long compound nouns,
leading to 'Token Inflation'—where fragmented tokenization results in 'Hidden Costs' and significant performance loss.
This app focuses on comparing how different LLMs 'see' Thai text. Efficient tokenization (lower Token Count)
usually leads to lower inference costs and better performance for Thai language tasks.
"""

_WHY_EXPANDER_LABEL = "Why Tokenization matters for Thai OCR & Documents?"

_WHY_MARKDOWN = """
### **The Problem: Unstructured Thai Government Data**
When processing **OCR text from official Thai documents**, I face a unique challenge.
Thai is a non-segmented language (no spaces between words and with those Thai numerics), and legal/official
vocabulary is highly complex.
### **What is this?**
This Arena helps visualize which LLM "understands" Thai document structures most
efficiently. A "good" tokenizer sees words as meaningful units; a "bad" one breaks
them into meaningless characters.
### **Why does this matter?**
* **Cost Efficiency:** Models that use fewer tokens to represent the same text are cheaper to run.
* **Memory (Context):** Efficient tokenization allows you to feed longer documents into a model without hitting memory limits.
* **Accuracy:** Better tokenization leads to fewer hallucinations in RAG (Retrieval-Augmented Generation) systems.
### **How to use**
1. Select models from the sidebar.
2. Paste your Thai text.
3. Look for the most cost-effective model for your data (better segmentation and lower number of tokens).
"""

# Heading and introduction, followed by a collapsed background section.
st.title(_APP_TITLE)
st.markdown(_INTRO_MARKDOWN)

why_section = st.expander(_WHY_EXPANDER_LABEL, expanded=False)
with why_section:
    st.markdown(_WHY_MARKDOWN)
# Sidebar: let the user pick which tokenizers to compare.
model_labels = list(MODEL_CHOICES)  # labels in declaration order
with st.sidebar:
    st.header("Configuration")
    selected_models = st.multiselect(
        label="Select Models:",
        options=model_labels,
        default=["WangchanBERTa", "Llama-3 (8B)"],
    )

# Main area: text to tokenize, pre-filled with a sample passage.
_DEFAULT_INPUT_TEXT = "ข้อ ๔ ให้เพิ่มความต่อไปนี้เป็นวรรคสองของข้อ ๗ แห่งระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด พ.ศ. ๒๕๔๓ ซึ่งแก้ไขเพิ่มเติมโดยระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด (ฉบับที่ ๒) พ.ศ. ๒๕๕๖"
input_text = st.text_area(
    label="Input Thai Text:",
    value=_DEFAULT_INPUT_TEXT,
    height=250,
)
# result
def _token_chip_html(token_text, color):
    """Return the HTML ``<span>`` chip that displays one decoded token.

    Args:
        token_text: Decoded text of a single token.
        color: CSS colour used as the chip background.
    """
    # Escape before inserting into markup: the chips are rendered with
    # unsafe_allow_html=True, so a token such as "<s>" or any "<" / "&" in the
    # user's text would otherwise be parsed as HTML instead of being shown.
    display_token = (
        html.escape(token_text)
        .replace(" ", "&nbsp;")  # keep whitespace-only tokens from collapsing
        .replace("\n", "↵")      # make line breaks visible
    )
    return f'<span style="background-color: {color}; padding: 2px 6px; margin: 2px; border-radius: 4px; display: inline-block; color: black; font-family: monospace; border: 1px solid #ddd;">{display_token}</span>'


if selected_models:
    # One column per selected model so the tokenizations sit side by side.
    cols = st.columns(len(selected_models))
    for col, model_name in zip(cols, selected_models):
        with col:
            st.subheader(model_name)
            try:
                tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
                if tokenizer is None:
                    # load_tokenizer already reported the failure in this
                    # column; skip instead of failing on a None tokenizer.
                    continue
                tokens = tokenizer.encode(input_text)
                # Decode ids one at a time so each chip shows a single token.
                decoded_tokens = [tokenizer.decode([t]) for t in tokens]
                # num of tokens to compare
                st.metric("Total Tokens", len(tokens))
                # show visual
                html_output = "".join(
                    _token_chip_html(t, get_random_color()) for t in decoded_tokens
                )
                st.markdown(html_output, unsafe_allow_html=True)
            except Exception as e:  # UI boundary: keep the other columns alive
                st.error(f"Error loading {model_name}: {e}")
else:
    st.info("Please select at least one model from the sidebar.")