Spaces:

ll-monkey
/

Thai-LLM-Token-Comparison

Running

App Files Files Community

Thai-LLM-Token-Comparison / src /streamlit_app.py

ll-monkey

Update src/streamlit_app.py

f0ac1c7 verified 9 days ago

raw

history blame contribute delete

6.6 kB

	import os
	import streamlit as st
	from transformers import AutoTokenizer
	import random


	##############################
	# SETTINGS AND MODEL CHOICES
	##############################

	# Display label -> Hugging Face Hub repo id whose tokenizer is loaded for it.
	# Key order is preserved, and the keys are what the model picker lists.
	MODEL_CHOICES = {
	    "Typhoon (ThaiLLM-8B)": "typhoon-ai/typhoon-s-thaillm-8b-instruct-research-preview",
	    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
	    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
	    "Gemma-2 (9B)": "google/gemma-2-9b-it",
	    # NOTE(review): the label says v3 but the repo id is a v2.5 checkpoint;
	    # confirm which one is intended before relying on this column.
	    "SeaLLM-v3": "SeaLLMs/SeaLLM-7B-v2.5",
	    "BGE-M3 (Embedding)": "BAAI/bge-m3",
	    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased",
	    "OpenThaiGPT (7B)": "openthaigpt/openthaigpt1.5-7b-instruct",
	    "GPT-2 (Thai)": "flax-community/gpt2-base-thai",
	    "Mistral-Nemo (12B)": "mistralai/Mistral-Nemo-Instruct-2407",
	}

	# Hub access token read from the environment; None when HF_TOKEN is unset.
	hf_token = os.getenv("HF_TOKEN")


	@st.cache_resource
	def _load_tokenizer_cached(model_path):
	    """Load the tokenizer for *model_path*, cached per distinct path.

	    Any exception from ``AutoTokenizer.from_pretrained`` propagates. Keeping
	    the error handling outside this cached function means a failed load is
	    not stored as a cached ``None`` and will be attempted again on the next
	    rerun.
	    """
	    # SECURITY NOTE(review): trust_remote_code=True allows code shipped in the
	    # Hub repository to run during loading. Kept as in the original; confirm
	    # it is actually needed for the repos listed in MODEL_CHOICES.
	    tokenizer = AutoTokenizer.from_pretrained(
	        model_path,
	        token=hf_token,
	        trust_remote_code=True,
	    )

	    # The original had a separate WangchanBERTa branch making this very same
	    # call and merely skipping the fallback below. The fallback only fills in
	    # a missing pad token, so one code path serves every model.
	    if tokenizer.pad_token is None:
	        tokenizer.pad_token = tokenizer.eos_token

	    return tokenizer


	def load_tokenizer(model_path):
	    """Return the tokenizer for *model_path*, or None if it cannot be loaded.

	    Args:
	        model_path: Hugging Face Hub repo id (a value of MODEL_CHOICES).

	    Returns:
	        The loaded tokenizer, or None after showing an ``st.error`` message
	        when loading fails. Successful loads are cached; failures are not.
	    """
	    try:
	        return _load_tokenizer_cached(model_path)
	    except Exception as e:
	        # UI boundary: report the failure and let the caller carry on.
	        st.error(f"Error loading tokenizer for {model_path}: {e}")
	        return None

	# Tell the user up front whether the Hub token was found in the environment.
	# NOTE(review): these Streamlit calls run before st.set_page_config() further
	# down; Streamlit releases that require set_page_config to be the first
	# command would reject that ordering. Confirm against the pinned version.
	if hf_token is not None:
	    st.success("HF_TOKEN successfully loaded from environment!")
	else:
	    st.error("HF_TOKEN is not being detected by the system. Please check 'Settings > Secrets'.")

	# Background colors for token chips. Built once here instead of on every
	# call, since get_random_color() runs once per rendered token.
	_TOKEN_COLORS = (
	    "#FFD1DC",
	    "#B2F2BB",
	    "#A5D8FF",
	    "#FFEC99",
	    "#FFD8A8",
	    "#D0EBFF",
	    "#EEBEE1",
	)


	def get_random_color():
	    """Return one hex color string picked at random from the chip palette."""
	    return random.choice(_TOKEN_COLORS)


	##############################
	# UI
	##############################

	# Markdown shown directly under the title: what the app compares and why
	# token count matters.
	_INTRO_MD = """
	While Tokenizer Visualizers are standard tools in the global AI landscape,
	there is a significant gap when it comes to the Thai language, especially regarding official and legal contexts.
	Standard models often fail to capture the nuances of complex Thai bureaucratic phrasing and long compound nouns,
	leading to 'Token Inflation'—where fragmented tokenization results in 'Hidden Costs' and significant performance loss.
	This app focuses on comparing how different LLMs 'see' Thai text. Efficient tokenization (lower Token Count)
	usually leads to lower inference costs and better performance for Thai language tasks.
	"""

	# Markdown for the collapsible background / how-to section.
	_BACKGROUND_MD = """
	### The Problem: Unstructured Thai Government Data
	When processing OCR text from official Thai documents, I face a unique challenge.
	Thai is a non-segmented language (no spaces between words and with those Thai numerics), and legal/official
	vocabulary is highly complex.

	### What is this?
	This Arena helps visualize which LLM "understands" Thai document structures most
	efficiently. A "good" tokenizer sees words as meaningful units; a "bad" one breaks
	them into meaningless characters.

	### Why does this matter?
	* Cost Efficiency: Models that use fewer tokens to represent the same text are cheaper to run.
	* Memory (Context): Efficient tokenization allows you to feed longer documents into a model without hitting memory limits.
	* Accuracy: Better tokenization leads to fewer hallucinations in RAG (Retrieval-Augmented Generation) systems.

	### How to use
	1. Select models from the sidebar.
	2. Paste your Thai text.
	3. Look for the most cost-effective model for your data (better segmentation and lower number of tokens).
	"""

	# Page setup, title and introduction.
	st.set_page_config(page_title="Thai Tokenizer Visualizer", layout="wide")
	st.title("Thai Tokenizer Multi-Benchmark")
	st.markdown(_INTRO_MD)

	# Background section, collapsed until the user opens it.
	with st.expander("Why Tokenization matters for Thai OCR & Documents?", expanded=False):
	    st.markdown(_BACKGROUND_MD)

	# Sidebar: choose which models to compare. Options are the MODEL_CHOICES
	# labels; two of them are preselected.
	with st.sidebar:
	    st.header("Configuration")
	    model_labels = list(MODEL_CHOICES)
	    selected_models = st.multiselect(
	        "Select Models:",
	        options=model_labels,
	        default=["WangchanBERTa", "Llama-3 (8B)"],
	    )

	# Text to tokenize. The box is pre-filled with a sample Thai passage that
	# includes Thai numerals.
	default_text = "ข้อ ๔ ให้เพิ่มความต่อไปนี้เป็นวรรคสองของข้อ ๗ แห่งระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด พ.ศ. ๒๕๔๓ ซึ่งแก้ไขเพิ่มเติมโดยระเบียบคณะกรรมการป้องกันและปราบปรามการฟอกเงิน ว่าด้วยการเก็บรักษาและการจัดการทรัพย์สินที่ถูกยึดหรืออายัด (ฉบับที่ ๒) พ.ศ. ๒๕๕๖"
	input_text = st.text_area("Input Thai Text:", value=default_text, height=250)

	def _render_token_html(decoded_tokens):
	    """Return HTML showing each decoded token as a colored inline chip.

	    Args:
	        decoded_tokens: iterable of token strings, one per token id.

	    Returns:
	        One HTML string of concatenated ``<span>`` elements.
	    """
	    import html  # local import so this block does not depend on the file header

	    chip_style = (
	        "padding: 2px 6px; margin: 2px; border-radius: 4px; "
	        "display: inline-block; color: black; font-family: monospace; "
	        "border: 1px solid #ddd;"
	    )
	    chips = []
	    for token_text in decoded_tokens:
	        # Escape first: the markup is rendered with unsafe_allow_html=True, so
	        # token text such as "<s>" or "<unk>", or user input containing "<",
	        # would otherwise be parsed as HTML instead of being displayed.
	        visible = html.escape(token_text)
	        # A non-breaking space keeps whitespace-only tokens visible (HTML
	        # collapses plain spaces); written as an escape so it is unambiguous.
	        visible = visible.replace(" ", "\u00a0").replace("\n", "↵")
	        chips.append(
	            f'<span style="background-color: {get_random_color()}; {chip_style}">{visible}</span>'
	        )
	    return "".join(chips)


	# result: one column per selected model, showing its token count and chips
	if selected_models:
	    cols = st.columns(len(selected_models))

	    for col, model_name in zip(cols, selected_models):
	        with col:
	            st.subheader(model_name)
	            try:
	                tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
	                if tokenizer is None:
	                    # load_tokenizer has already shown the error; skip this
	                    # column instead of failing on None.encode below.
	                    continue

	                tokens = tokenizer.encode(input_text)
	                # NOTE(review): ids are decoded one at a time; with byte-level
	                # tokenizers a Thai character split across tokens may show up
	                # as U+FFFD. Verify per model if exact pieces matter.
	                decoded_tokens = [tokenizer.decode([t]) for t in tokens]

	                # num of tokens to compare
	                st.metric("Total Tokens", len(tokens))

	                # show visual
	                st.markdown(_render_token_html(decoded_tokens), unsafe_allow_html=True)

	            except Exception as e:
	                # UI boundary: keep the other columns rendering.
	                st.error(f"Error loading {model_name}: {e}")
	else:
	    st.info("Please select at least one model from the sidebar.")