import html
import os
import random

import streamlit as st
from transformers import AutoTokenizer
|
|
|
|
| |
| |
| |
|
|
# Display label -> Hugging Face Hub repository id of the tokenizer to load.
# The labels are what the sidebar multiselect offers and what each result
# column is titled with, so a label must describe the checkpoint it maps to.
MODEL_CHOICES = {
    "Typhoon (ThaiLLM-8B)": "typhoon-ai/typhoon-s-thaillm-8b-instruct-research-preview",
    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
    "Gemma-2 (9B)": "google/gemma-2-9b-it",
    # The repository loaded here is the 7B v2.5 checkpoint. It used to be
    # labelled "SeaLLM-v3", which misreported which tokenizer was measured.
    "SeaLLM-v2.5 (7B)": "SeaLLMs/SeaLLM-7B-v2.5",
    "BGE-M3 (Embedding)": "BAAI/bge-m3",
    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased",
    "OpenThaiGPT (7B)": "openthaigpt/openthaigpt1.5-7b-instruct",
    "GPT-2 (Thai)": "flax-community/gpt2-base-thai",
    "Mistral-Nemo (12B)": "mistralai/Mistral-Nemo-Instruct-2407",
}
|
|
# Hugging Face access token read from the environment once at start-up;
# it is None when the HF_TOKEN variable is not set.
hf_token = os.environ.get("HF_TOKEN")
|
|
| |
@st.cache_resource
def _load_tokenizer_cached(model_path):
    """Load the tokenizer for ``model_path`` and cache it; raise on failure.

    Errors are deliberately allowed to propagate out of the cached function:
    only a successfully returned tokenizer is stored, so a failed download is
    attempted again on the next call instead of a ``None`` being cached.
    """
    # SECURITY NOTE: trust_remote_code=True allows the Hub repository to run
    # its own Python code while loading. Only pass repository ids you trust.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        token=hf_token,
        trust_remote_code=True,
    )

    # WangchanBERTa is returned exactly as loaded. Every other tokenizer that
    # defines no padding token gets its EOS token as the fallback.
    is_wangchanberta = "wangchanberta" in model_path.lower()
    if not is_wangchanberta and tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return tokenizer


def load_tokenizer(model_path):
    """Return the tokenizer for a Hugging Face repository, or ``None``.

    Args:
        model_path: Hub repository id (or local path) handed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer, served from the Streamlit resource cache after
        the first successful load. ``None`` when loading fails; in that case
        the error has already been shown in the app with ``st.error``.
    """
    try:
        return _load_tokenizer_cached(model_path)
    except Exception as e:
        # Broad on purpose: this is the UI boundary, and any loading problem
        # (network, missing token, gated repository, bad id) should surface
        # as a message rather than abort the page.
        st.error(f"Error loading tokenizer for {model_path}: {e}")
        return None


# Keep the cache-clearing hook reachable under the public name.
load_tokenizer.clear = _load_tokenizer_cached.clear
|
|
# Page configuration goes first: Streamlit documents st.set_page_config as
# having to be the first Streamlit command on a page, so it must precede the
# token status banner below.
st.set_page_config(page_title="Thai Tokenizer Visualizer", layout="wide")

# Tell the user up front whether a Hugging Face token was found.
if hf_token is None:
    st.error("HF_TOKEN is not being detected by the system. Please check 'Settings > Secrets'.")
else:
    st.success("HF_TOKEN successfully loaded from environment!")


def get_random_color():
    """Return a randomly chosen background color as a ``#RRGGBB`` string.

    The color is drawn from a fixed seven-entry palette, so two calls may
    return the same value.
    """
    colors = ["#FFD1DC", "#B2F2BB", "#A5D8FF", "#FFEC99", "#FFD8A8", "#D0EBFF", "#EEBEE1"]
    return random.choice(colors)


st.title("Thai Tokenizer Multi-Benchmark")
# Introductory paragraph shown directly under the page title.
_INTRO_MARKDOWN = """
While Tokenizer Visualizers are standard tools in the global AI landscape,
there is a significant gap when it comes to the Thai language, especially regarding official and legal contexts.
Standard models often fail to capture the nuances of complex Thai bureaucratic phrasing and long compound nouns,
leading to 'Token Inflation'—where fragmented tokenization results in 'Hidden Costs' and significant performance loss.
This app focuses on comparing how different LLMs 'see' Thai text. Efficient tokenization (lower Token Count)
usually leads to lower inference costs and better performance for Thai language tasks.
"""

# Background section: motivation, what the app shows, and how to use it.
_BACKGROUND_MARKDOWN = """
### **The Problem: Unstructured Thai Government Data**
When processing **OCR text from official Thai documents**, I face a unique challenge.
Thai is a non-segmented language (no spaces between words and with those Thai numerics), and legal/official
vocabulary is highly complex.

### **What is this?**
This Arena helps visualize which LLM "understands" Thai document structures most
efficiently. A "good" tokenizer sees words as meaningful units; a "bad" one breaks
them into meaningless characters.

### **Why does this matter?**
* **Cost Efficiency:** Models that use fewer tokens to represent the same text are cheaper to run.
* **Memory (Context):** Efficient tokenization allows you to feed longer documents into a model without hitting memory limits.
* **Accuracy:** Better tokenization leads to fewer hallucinations in RAG (Retrieval-Augmented Generation) systems.

### **How to use**
1. Select models from the sidebar.
2. Paste your Thai text.
3. Look for the most cost-effective model for your data (better segmentation and lower number of tokens).
"""

st.markdown(_INTRO_MARKDOWN)

# The expander starts collapsed so the tool itself stays at the top of the page.
with st.expander("Why Tokenization matters for Thai OCR & Documents?", expanded=False):
    st.markdown(_BACKGROUND_MARKDOWN)
|
|
| |
# Sidebar: choose which tokenizers to compare. Two models are preselected so
# the page shows a comparison on first load.
with st.sidebar:
    st.header("Configuration")
    available_labels = list(MODEL_CHOICES)
    selected_models = st.multiselect(
        "Select Models:",
        options=available_labels,
        default=["WangchanBERTa", "Llama-3 (8B)"],
    )
|
|
| |
# Default sample: a clause of Thai legal text written with Thai numerals,
# used to pre-fill the input box.
_DEFAULT_SAMPLE_TEXT = "ข้อ ๔ ให้เพิ่มความต่อไปนี้เป็นวรรคสองของข้อ ๗ แห่งระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด พ.ศ. ๒๕๔๓ ซึ่งแก้ไขเพิ่มเติมโดยระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด (ฉบับที่ ๒) พ.ศ. ๒๕๕๖"

input_text = st.text_area(
    "Input Thai Text:",
    value=_DEFAULT_SAMPLE_TEXT,
    height=250,
)
|
|
| |
def _render_tokens_html(decoded_tokens):
    """Build the HTML for a row of colored token chips.

    Args:
        decoded_tokens: Iterable of strings, one per token, in display order.

    Returns:
        A single HTML string with one ``<span>`` per token. Token text is
        HTML-escaped, spaces are emitted as ``&nbsp;`` and newlines as ``↵``.
    """
    chips = []
    for token_text in decoded_tokens:
        color = get_random_color()
        # Escape before anything else: the result is rendered with
        # unsafe_allow_html=True, so a token such as "<s>" or "</s>" would
        # otherwise be parsed as an HTML tag instead of being displayed.
        safe_text = html.escape(token_text)
        # Make whitespace visible. HTML collapses plain spaces, which would
        # leave a space-only token as an empty chip.
        display_token = safe_text.replace(" ", "&nbsp;").replace("\n", "↵")
        chips.append(
            f'<span style="background-color: {color}; padding: 2px 6px; margin: 2px; border-radius: 4px; display: inline-block; color: black; font-family: monospace; border: 1px solid #ddd;">{display_token}</span>'
        )
    # Join once instead of growing a string inside the loop.
    return "".join(chips)


# One column per selected model: token count on top, token chips underneath.
if selected_models:
    cols = st.columns(len(selected_models))

    for col, model_name in zip(cols, selected_models):
        with col:
            st.subheader(model_name)
            try:
                tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
                if tokenizer is None:
                    # load_tokenizer already displayed the failure; skip this
                    # column instead of failing on a None tokenizer.
                    continue

                tokens = tokenizer.encode(input_text)
                # NOTE(review): ids are decoded one at a time so each chip
                # maps to one token. For byte-level vocabularies this may
                # produce replacement characters when a token covers only
                # part of a multi-byte character; confirm if that matters.
                decoded_tokens = [tokenizer.decode([t]) for t in tokens]

                st.metric("Total Tokens", len(tokens))
                st.markdown(_render_tokens_html(decoded_tokens), unsafe_allow_html=True)

            except Exception as e:
                # Broad on purpose: a failure in one model's column must not
                # stop the remaining columns from rendering.
                st.error(f"Error loading {model_name}: {e}")
else:
    st.info("Please select at least one model from the sidebar.")
|
|
|
|