ll-monkey committed on
Commit
cff80e7
·
verified ·
1 Parent(s): d601877

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +82 -38
src/streamlit_app.py CHANGED
@@ -1,40 +1,84 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
import html
import random

import streamlit as st
from transformers import AutoTokenizer
4
+
5
+
6
# ------------------------------
# Settings and model choices
# ------------------------------

# Maps the label offered in the sidebar multiselect to the model identifier
# handed to load_tokenizer(). Insertion order is the order the options appear.
# NOTE(review): the "SeaLLM-v3" label points at a v2.5 repo id — confirm which
# one is intended.
MODEL_CHOICES = {
    "Typhoon-1.5 (8B)": "scb10x/typhoon-v1.5-8b-instruct",
    "Llama-3 (8B)": "meta-llama/Meta-Llama-3-8B",
    "Gemma-2 (9B)": "google/gemma-2-9b-it",
    "SeaLLM-v3": "SeaLLMs/SeaLLM-7B-v2.5",
    "BGE-M3 (Embedding)": "BAAI/bge-m3",
    "WangchanBERTa": "airesearch/wangchanberta-base-att-spm-uncased",
}
18
+
19
+
20
# Cached with st.cache_resource so a tokenizer is not reloaded on every rerun.
@st.cache_resource
def load_tokenizer(model_path):
    """Load the tokenizer for ``model_path`` via ``AutoTokenizer.from_pretrained``.

    The optional ``HF_TOKEN`` entry from Streamlit secrets is forwarded as the
    ``token`` argument. When the app has no secrets file at all, the lookup
    itself raises instead of returning ``None``; that case is treated the same
    as "no token configured" so public models still load.

    Args:
        model_path: Model identifier passed straight to
            ``AutoTokenizer.from_pretrained`` (e.g. a value of MODEL_CHOICES).

    Returns:
        Whatever ``AutoTokenizer.from_pretrained`` returns for ``model_path``.
    """
    try:
        token = st.secrets.get("HF_TOKEN")
    except FileNotFoundError:
        # No secrets file is configured for this deployment.
        # NOTE(review): assumes Streamlit's "secrets not found" error is a
        # FileNotFoundError (sub)class — confirm for the pinned version.
        token = None
    return AutoTokenizer.from_pretrained(model_path, token=token)
25
+
26
def get_random_color():
    """Return one background colour, picked at random, as a ``#RRGGBB`` string.

    Used to tint each token chip; successive calls are independent, so
    neighbouring tokens may receive the same colour.
    """
    # The original bound this list to `color` but read `colors`, which raised
    # NameError on every call.
    colors = (
        "#FFD1DC",
        "#B2F2BB",
        "#A5D8FF",
        "#FFEC99",
        "#FFD8A8",
        "#D0EBFF",
        "#EEBEE1",
    )
    return random.choice(colors)
29
+
30
+
31
+ ##############################
32
+ # UI
33
+ ##############################
34
+
35
# Page setup: browser-tab title and wide layout, then the heading and a short
# introduction explaining what the token counts mean.
_intro_markdown = """
Compare how different LLMs 'see' Thai text. Efficient tokenization (lower Token Count)
usually leads to lower inference costs and better performance for Thai language tasks.
"""

st.set_page_config(layout="wide", page_title="Thai Tokenizer Visualizer")
st.title("Thai Tokenizer Multi-Benchmark")
st.markdown(_intro_markdown)
42
+
43
# Sidebar: let the user pick which models to compare.
_model_labels = list(MODEL_CHOICES)  # option labels are the dict keys, in order

with st.sidebar:
    st.header("Configuration")
    selected_models = st.multiselect(
        "Select Models:",
        options=_model_labels,
        default=["Typhoon-1.5 (8B)", "Llama-3 (8B)"],  # pre-selected on first load
    )
51
+
52
# Text to tokenize; pre-filled with a Thai sample sentence.
input_text = st.text_area(
    label="Input Thai Text:",
    value="การประปานครหลวง ตั้งอยู่ใกล้กับสถานีกลางกรุงเทพอภิวัฒน์",
    height=120,
)
54
+
55
# Results: one column per selected model, each showing the token count and the
# decoded tokens as coloured chips.
if selected_models:
    columns = st.columns(len(selected_models))

    for column, model_name in zip(columns, selected_models):
        with column:
            st.subheader(model_name)
            try:
                # The lookup table is MODEL_CHOICES; the original referenced an
                # undefined MODELS_TO_TEST, so every column showed an error.
                tokenizer = load_tokenizer(MODEL_CHOICES[model_name])
                token_ids = tokenizer.encode(input_text)
                # Decode ids one at a time so each chip shows a single token.
                decoded_tokens = [
                    tokenizer.decode([token_id]) for token_id in token_ids
                ]
            except Exception as e:
                # UI boundary: report the failure in this column and keep
                # rendering the remaining models.
                st.error(f"Error loading {model_name}: {e}")
            else:
                # Number of tokens, the figure users compare across models.
                st.metric("Total Tokens", len(token_ids))

                chips = []
                for token_text in decoded_tokens:
                    color = get_random_color()
                    # Escape first: the text is injected into raw HTML below,
                    # so "<", ">" and "&" from the input must not be read as
                    # markup. Then make whitespace visible: spaces become
                    # non-breaking so they are not collapsed inside the chip,
                    # and newlines are shown as a return arrow.
                    display_token = (
                        html.escape(token_text)
                        .replace(" ", "&nbsp;")
                        .replace("\n", "↵")
                    )
                    chips.append(
                        f'<span style="background-color: {color}; padding: 2px 6px; margin: 2px; border-radius: 4px; display: inline-block; color: black; font-family: monospace; border: 1px solid #ddd;">{display_token}</span>'
                    )

                st.markdown("".join(chips), unsafe_allow_html=True)
else:
    st.info("Please select at least one model from the sidebar.")
84