File size: 6,597 Bytes
61c6f2a fffd3be a0ba317 cff80e7 529fe04 cff80e7 9f6019d a074b70 9f6019d cff80e7 13a9b24 cff80e7 49c41eb eabcfdc b35ecbb 3c584e9 b35ecbb 49c41eb cff80e7 13a9b24 cff80e7 173620b cff80e7 aa7bab3 cff80e7 502cb8b cff80e7 f0ac1c7 cff80e7 944fdd3 cff80e7 3ff7370 cff80e7 fffd3be | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 | import os
import html
import random

import streamlit as st
from transformers import AutoTokenizer
##############################
# SETTINGS AND MODEL CHOICES
##############################
# Maps the label shown in the model picker to the Hugging Face Hub repository
# whose tokenizer is loaded for that label.
MODEL_CHOICES = {
    "Typhoon (ThaiLLM-8B)": "typhoon-ai/typhoon-s-thaillm-8b-instruct-research-preview",
    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
    "Gemma-2 (9B)": "google/gemma-2-9b-it",
    # Label names the checkpoint that is actually loaded (7B v2.5); it used to
    # read "SeaLLM-v3", which did not match the repository below.
    "SeaLLM-v2.5 (7B)": "SeaLLMs/SeaLLM-7B-v2.5",
    "BGE-M3 (Embedding)": "BAAI/bge-m3",
    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased",
    "OpenThaiGPT (7B)": "openthaigpt/openthaigpt1.5-7b-instruct",
    "GPT-2 (Thai)": "flax-community/gpt2-base-thai",
    "Mistral-Nemo (12B)": "mistralai/Mistral-Nemo-Instruct-2407",
}

# Hub access token read from the environment; None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")
# use cache
# Only successful loads are cached: the helper raises on failure, and an
# exception is not stored as a cached result, so the next rerun retries.
@st.cache_resource
def _load_tokenizer_cached(model_path):
    """Load the tokenizer for *model_path* and cache it; raise on failure."""
    # NOTE(security): trust_remote_code=True allows the Hub repository to run
    # its own Python code while loading. Only list repositories you trust.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        token=hf_token,
        trust_remote_code=True,
    )
    # WangchanBERTa is returned exactly as loaded. Every other tokenizer gets
    # its EOS token as a fallback pad token when it defines none.
    is_wangchanberta = "wangchanberta" in model_path.lower()
    if not is_wangchanberta and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def load_tokenizer(model_path):
    """Return the tokenizer for *model_path*, or None if loading fails.

    Args:
        model_path: Hugging Face Hub repository id of the model.

    Returns:
        The loaded tokenizer, or None after showing an ``st.error`` message
        when loading raised. Failures are not cached, so a later call with the
        same *model_path* tries again.
    """
    try:
        return _load_tokenizer_cached(model_path)
    except Exception as e:
        st.error(f"Error loading tokenizer for {model_path}: {e}")
        return None
# page title
# st.set_page_config is documented as having to be the first Streamlit command
# on a page, so it runs before the HF_TOKEN status banner is rendered.
st.set_page_config(page_title="Thai Tokenizer Visualizer", layout="wide")

# Tell the user up front whether an access token was found in the environment.
if hf_token is None:
    st.error("HF_TOKEN is not being detected by the system. Please check 'Settings > Secrets'.")
else:
    st.success("HF_TOKEN successfully loaded from environment!")


def get_random_color():
    """Return one pastel hex colour, chosen at random, for a token background."""
    colors = ["#FFD1DC", "#B2F2BB", "#A5D8FF", "#FFEC99", "#FFD8A8", "#D0EBFF", "#EEBEE1"]
    return random.choice(colors)


##############################
# UI
##############################
st.title("Thai Tokenizer Multi-Benchmark")
st.markdown("""
While Tokenizer Visualizers are standard tools in the global AI landscape,
there is a significant gap when it comes to the Thai language, especially regarding official and legal contexts.
Standard models often fail to capture the nuances of complex Thai bureaucratic phrasing and long compound nouns,
leading to 'Token Inflation'—where fragmented tokenization results in 'Hidden Costs' and significant performance loss.
This app focuses on comparing how different LLMs 'see' Thai text. Efficient tokenization (lower Token Count)
usually leads to lower inference costs and better performance for Thai language tasks.
""")
# Background section, collapsed by default, explaining why the app exists.
WHY_TOKENIZATION_MD = """
### **The Problem: Unstructured Thai Government Data**
When processing **OCR text from official Thai documents**, I face a unique challenge.
Thai is a non-segmented language (no spaces between words and with those Thai numerics), and legal/official
vocabulary is highly complex.
### **What is this?**
This Arena helps visualize which LLM "understands" Thai document structures most
efficiently. A "good" tokenizer sees words as meaningful units; a "bad" one breaks
them into meaningless characters.
### **Why does this matter?**
* **Cost Efficiency:** Models that use fewer tokens to represent the same text are cheaper to run.
* **Memory (Context):** Efficient tokenization allows you to feed longer documents into a model without hitting memory limits.
* **Accuracy:** Better tokenization leads to fewer hallucinations in RAG (Retrieval-Augmented Generation) systems.
### **How to use**
1. Select models from the sidebar.
2. Paste your Thai text.
3. Look for the most cost-effective model for your data (better segmentation and lower number of tokens).
"""

why_panel = st.expander("Why Tokenization matters for Thai OCR & Documents?", expanded=False)
with why_panel:
    st.markdown(WHY_TOKENIZATION_MD)
# Sidebar: choose which tokenizers to compare (labels are the MODEL_CHOICES keys).
model_labels = list(MODEL_CHOICES)
preselected_labels = ["WangchanBERTa", "Llama-3 (8B)"]
with st.sidebar:
    st.header("Configuration")
    selected_models = st.multiselect(
        "Select Models:",
        options=model_labels,
        default=preselected_labels,
    )

# Main area: the text to tokenize, pre-filled with a sample of Thai legal prose.
sample_text = "ข้อ ๔ ให้เพิ่มความต่อไปนี้เป็นวรรคสองของข้อ ๗ แห่งระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด พ.ศ. ๒๕๔๓ ซึ่งแก้ไขเพิ่มเติมโดยระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด (ฉบับที่ ๒) พ.ศ. ๒๕๕๖"
input_text = st.text_area("Input Thai Text:", value=sample_text, height=250)
# result
if selected_models:
    # One column per selected model so the tokenizations sit side by side.
    cols = st.columns(len(selected_models))
    for col, model_name in zip(cols, selected_models):
        with col:
            st.subheader(model_name)

            tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
            if tokenizer is None:
                # load_tokenizer returns None on failure; report it here
                # instead of failing later on `None.encode`.
                st.error(f"Error loading {model_name}: tokenizer could not be loaded.")
                continue

            try:
                tokens = tokenizer.encode(input_text)
                # Decode ids one at a time so each token can be drawn separately.
                decoded_tokens = [tokenizer.decode([t]) for t in tokens]
            except Exception as e:
                st.error(f"Error loading {model_name}: {e}")
                continue

            # num of tokens to compare
            st.metric("Total Tokens", len(tokens))

            # show visual
            spans = []
            for piece in decoded_tokens:
                color = get_random_color()
                # Escape first: the markup below is rendered as raw HTML, so a
                # token such as "<s>" or user text containing "<" / "&" would
                # otherwise be parsed as a tag instead of being displayed.
                # Spaces become &nbsp; so whitespace-only tokens stay visible.
                display_token = (
                    html.escape(piece).replace(" ", "&nbsp;").replace("\n", "↵")
                )
                spans.append(
                    f'<span style="background-color: {color}; padding: 2px 6px; margin: 2px; border-radius: 4px; display: inline-block; color: black; font-family: monospace; border: 1px solid #ddd;">{display_token}</span>'
                )
            st.markdown("".join(spans), unsafe_allow_html=True)
else:
    st.info("Please select at least one model from the sidebar.")
|