Spaces:

ll-monkey
/

Thai-LLM-Token-Comparison

Running

App Files Files Community

ll-monkey committed on 21 days ago

Commit

cff80e7

verified ·

1 Parent(s): d601877

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +82 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,84 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import html
+import os
+import random
+
 import streamlit as st
+from transformers import AutoTokenizer
+##############################
+# SETTINGS AND MODEL CHOICES
+##############################
+# Display name -> model identifier. The keys are the option labels offered in
+# the sidebar multiselect; the values look like Hugging Face Hub repo ids
+# ("org/name") -- presumably what load_tokenizer() is meant to receive.
+# Dict order is the order in which the options are listed.
+MODEL_CHOICES = {
+    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
+    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
+    "Gemma-2 (9B)": "google/gemma-2-9b-it",
+    # NOTE(review): the label says "v3" but the repo id is a v2.5 model --
+    # confirm which one is intended.
+    "SeaLLM-v3": "SeaLLMs/SeaLLM-7B-v2.5",
+    "BGE-M3 (Embedding)": "BAAI/bge-m3",
+    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased"
+}
+# Cached as a Streamlit resource so a tokenizer already loaded for a given
+# model_path is reused instead of being loaded again.
+@st.cache_resource
+def load_tokenizer(model_path):
+    """Load the tokenizer for ``model_path``, authenticating with HF_TOKEN if set.
+
+    The access token is looked up in Streamlit secrets first and, when it is
+    missing there, in the ``HF_TOKEN`` environment variable. If neither is
+    set, ``token=None`` is passed to ``from_pretrained``.
+
+    Args:
+        model_path: Model identifier handed to ``AutoTokenizer.from_pretrained``.
+
+    Returns:
+        Whatever ``AutoTokenizer.from_pretrained`` returns for ``model_path``.
+    """
+    try:
+        token = st.secrets.get("HF_TOKEN")
+    except FileNotFoundError:
+        # No secrets file is configured for this deployment.
+        # NOTE(review): relies on Streamlit's "no secrets found" error deriving
+        # from FileNotFoundError -- confirm for the pinned Streamlit version.
+        token = None
+    # Deployments that expose secrets only as environment variables still work.
+    token = token or os.environ.get("HF_TOKEN")
+    return AutoTokenizer.from_pretrained(model_path, token=token)
+def get_random_color():
+    """Return one pastel hex color, picked at random, for a token's background."""
+    # The original bound this list to ``color`` but read ``colors``, so every
+    # call raised NameError.
+    colors = ["#FFD1DC", "#B2F2BB", "#A5D8FF", "#FFEC99", "#FFD8A8", "#D0EBFF", "#EEBEE1"]
+    return random.choice(colors)
+##############################
+# UI
+##############################
+# Page setup: wide layout leaves room for one result column per selected model.
+st.set_page_config(page_title="Thai Tokenizer Visualizer", layout="wide")
+st.title("Thai Tokenizer Multi-Benchmark")
+st.markdown("""
+Compare how different LLMs 'see' Thai text. Efficient tokenization (lower Token Count)
+usually leads to lower inference costs and better performance for Thai language tasks.
+""")
+# Model selection lives in the sidebar.
+with st.sidebar:
+    st.header("Configuration")
+    selected_models = st.multiselect(
+        "Select Models:",
+        options=list(MODEL_CHOICES.keys()),  # option labels are the display names
+        default=["Typhoon-1.5 (8B)", "Llama-3 (8B)"],  # default choices
+    )
+# Text to tokenize.
+input_text = st.text_area("Input Thai Text:", "การประปานครหลวง ตั้งอยู่ใกล้กับสถานีกลางกรุงเทพอภิวัฒน์", height=120)
+# Results: one column per selected model, in selection order.
+if selected_models:
+    cols = st.columns(len(selected_models))
+    for col, model_name in zip(cols, selected_models):
+        with col:
+            st.subheader(model_name)
+            try:
+                # Look the model id up in MODEL_CHOICES; the original read the
+                # undefined name MODELS_TO_TEST, so every column showed an error.
+                tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
+                tokens = tokenizer.encode(input_text)
+                # Decode ids one at a time so each token can be drawn as its own box.
+                # NOTE(review): encode() may add special tokens, and a single id
+                # may not decode to a complete character -- confirm this is the
+                # intended display.
+                decoded_tokens = [tokenizer.decode([t]) for t in tokens]
+                # Token count is the figure users compare across models.
+                st.metric("Total Tokens", len(tokens))
+                # Build one highlighted <span> per token.
+                spans = []
+                for t in decoded_tokens:
+                    color = get_random_color()
+                    # Escape first: the text is rendered as raw HTML below, so
+                    # "<", ">" and "&" in a token must not be read as markup.
+                    # Then keep whitespace visible: spaces become non-breaking
+                    # (a space-only span would otherwise collapse to nothing)
+                    # and newlines get a visible marker.
+                    display_token = html.escape(t).replace(" ", "&nbsp;").replace("\n", "↵")
+                    spans.append(f'<span style="background-color: {color}; padding: 2px 6px; margin: 2px; border-radius: 4px; display: inline-block; color: black; font-family: monospace; border: 1px solid #ddd;">{display_token}</span>')
+                st.markdown("".join(spans), unsafe_allow_html=True)
+            except Exception as e:
+                # UI boundary: report the failure in this model's column and
+                # keep rendering the remaining models.
+                st.error(f"Error loading {model_name}: {e}")
+else:
+    st.info("Please select at least one model from the sidebar.")