"""Streamlit app that compares how different tokenizers split Thai text.

For every model selected in the sidebar the app loads the matching Hugging
Face tokenizer, encodes the text from the input box, shows the total token
count and renders each decoded token as a colored chip.
"""

import html
import os
import random

import streamlit as st
from transformers import AutoTokenizer

##############################
# SETTINGS AND MODEL CHOICES
##############################
# Display name shown in the UI -> Hugging Face repository id.
MODEL_CHOICES = {
    "Typhoon (ThaiLLM-8B)": "typhoon-ai/typhoon-s-thaillm-8b-instruct-research-preview",
    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
    "Gemma-2 (9B)": "google/gemma-2-9b-it",
    "SeaLLM-v3": "SeaLLMs/SeaLLM-7B-v2.5",
    "BGE-M3 (Embedding)": "BAAI/bge-m3",
    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased",
    "OpenThaiGPT (7B)": "openthaigpt/openthaigpt1.5-7b-instruct",
    "GPT-2 (Thai)": "flax-community/gpt2-base-thai",
    "Mistral-Nemo (12B)": "mistralai/Mistral-Nemo-Instruct-2407",
}

# Access token read from the environment; None when the variable is unset.
hf_token = os.getenv("HF_TOKEN")

INTRO_MARKDOWN = """
While Tokenizer Visualizers are standard tools in the global AI landscape,
there is a significant gap when it comes to the Thai language, especially
regarding official and legal contexts.
Standard models often fail to capture the nuances of complex Thai
bureaucratic phrasing and long compound nouns, leading to
'Token Inflation'—where fragmented tokenization results in 'Hidden Costs'
and significant performance loss. This app focuses on comparing how
different LLMs 'see' Thai text. Efficient tokenization (lower Token Count)
usually leads to lower inference costs and better performance for Thai
language tasks.
"""

WHY_MARKDOWN = """
### **The Problem: Unstructured Thai Government Data**
When processing **OCR text from official Thai documents**, I face a unique
challenge. Thai is a non-segmented language (no spaces between words and
with those Thai numerics), and legal/official vocabulary is highly complex.

### **What is this?**
This Arena helps visualize which LLM "understands" Thai document structures
most efficiently. A "good" tokenizer sees words as meaningful units; a "bad"
one breaks them into meaningless characters.

### **Why does this matter?**
* **Cost Efficiency:** Models that use fewer tokens to represent the same text are cheaper to run.
* **Memory (Context):** Efficient tokenization allows you to feed longer documents into a model without hitting memory limits.
* **Accuracy:** Better tokenization leads to fewer hallucinations in RAG (Retrieval-Augmented Generation) systems.

### **How to use**
1. Select models from the sidebar.
2. Paste your Thai text.
3. Look for the most cost-effective model for your data (better segmentation and lower number of tokens).
"""

# Sample legal text pre-filled in the input box.
DEFAULT_INPUT_TEXT = (
    "ข้อ ๔ ให้เพิ่มความต่อไปนี้เป็นวรรคสองของข้อ ๗ "
    "แห่งระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน "
    "ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด พ.ศ. ๒๕๔๓ "
    "ซึ่งแก้ไขเพิ่มเติมโดยระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน "
    "ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด (ฉบับที่ ๒) พ.ศ. ๒๕๕๖"
)

# Page configuration is issued before any other Streamlit command; Streamlit
# documents it as needing to come first, and the HF_TOKEN status message
# below would otherwise precede it.
st.set_page_config(page_title="Thai Tokenizer Visualizer", layout="wide")


@st.cache_resource
def load_tokenizer(model_path):
    """Load the tokenizer for ``model_path`` and cache it across reruns.

    Args:
        model_path: Hugging Face repository id of the model.

    Returns:
        The tokenizer returned by ``AutoTokenizer.from_pretrained``. When it
        has no pad token, its EOS token is assigned as the pad token.

    Raises:
        Exception: Whatever ``AutoTokenizer.from_pretrained`` raises. The
            error is deliberately not swallowed here: returning ``None``
            from a cached function would store the failure in the cache and
            make callers fail later with an unrelated ``AttributeError``.
    """
    # NOTE(review): trust_remote_code=True lets the model repository run its
    # own Python code. Only the fixed ids in MODEL_CHOICES are passed in by
    # this script; confirm that is acceptable for the deployment.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        token=hf_token,
        trust_remote_code=True,
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def get_random_color():
    """Return one pastel background color (hex string) chosen at random."""
    colors = [
        "#FFD1DC",
        "#B2F2BB",
        "#A5D8FF",
        "#FFEC99",
        "#FFD8A8",
        "#D0EBFF",
        "#EEBEE1",
    ]
    return random.choice(colors)


def render_tokens_html(decoded_tokens):
    """Build the HTML snippet that shows each token as a colored chip.

    Args:
        decoded_tokens: Iterable of token strings to display.

    Returns:
        A single-line HTML string: one ``<span>`` per token inside a ``<div>``.
    """
    chips = []
    for token in decoded_tokens:
        # Escape first: tokens such as "<s>" or user-typed markup would
        # otherwise be parsed as HTML because the caller renders this with
        # unsafe_allow_html=True.
        display_token = (
            html.escape(token).replace(" ", "&nbsp;").replace("\n", "↵")
        )
        chips.append(
            f'<span style="background-color: {get_random_color()}; '
            "color: #000; padding: 2px 4px; margin: 1px; "
            f'border-radius: 4px; display: inline-block;">{display_token}</span>'
        )
    # A block-level wrapper keeps the markdown renderer from treating
    # characters such as "*" or "_" inside tokens as markdown syntax.
    return '<div style="line-height: 2.2;">' + "".join(chips) + "</div>"


if hf_token is None:
    st.error("HF_TOKEN is not being detected by the system. Please check 'Settings > Secrets'.")
else:
    st.success("HF_TOKEN successfully loaded from environment!")

##############################
# UI
##############################
# page title
st.title("Thai Tokenizer Multi-Benchmark")
st.markdown(INTRO_MARKDOWN)

with st.expander("Why Tokenization matters for Thai OCR & Documents?", expanded=False):
    st.markdown(WHY_MARKDOWN)

# model selection lives in the sidebar
with st.sidebar:
    st.header("Configuration")
    selected_models = st.multiselect(
        "Select Models:",
        options=list(MODEL_CHOICES.keys()),
        default=["WangchanBERTa", "Llama-3 (8B)"],
    )

# accept input
input_text = st.text_area("Input Thai Text:", DEFAULT_INPUT_TEXT, height=250)

# results: one column per selected model
if selected_models:
    columns = st.columns(len(selected_models))
    for column, model_name in zip(columns, selected_models):
        with column:
            st.subheader(model_name)
            try:
                tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
                token_ids = tokenizer.encode(input_text)
                # Decode ids one at a time so each token can be shown separately.
                decoded_tokens = [tokenizer.decode([t]) for t in token_ids]
            except Exception as e:
                # UI boundary: report the failure for this model and keep
                # rendering the remaining columns.
                st.error(f"Error loading {model_name}: {e}")
                continue

            # number of tokens, for comparison between models
            st.metric("Total Tokens", len(token_ids))
            st.markdown(render_tokens_html(decoded_tokens), unsafe_allow_html=True)
else:
    st.info("Please select at least one model from the sidebar.")