# muhsin's picture
# added a few more widgets
# e148493
import codecs
import html
import random

import streamlit as st
import tiktoken

from utils.metrics import calculate_compression
# Pastel palette; each rendered token gets one of these as its background.
COLORS = [
    "#FFB7B2",
    "#FFDAC1",
    "#E2F0CB",
    "#B5EAD7",
    "#C7CEEA",
]

# Sample input pre-filled in the text area: two lines mixing Malayalam script,
# Latin-script words and an emoji.
DEFAULT_TEXT = (
    "ഇതൊരു Malayalam ടോക്കനൈസർ ആണ് 🔤.\n"
    "വാചകത്തെ ടോക്കണുകൾ എന്ന് വിളിക്കുന്ന ചെറിയ കഷ്ണങ്ങളായി വിഭജിക്കുന്ന ഒരു method aanu ടോക്കനൈസർ."
)
# Page chrome: wide layout, tab title/icon, and the on-page heading.
st.set_page_config(
    page_title="Malayalam Tokenizer",
    page_icon="🔤",
    layout="wide",
)
st.title("Malayalam Tokenizer 🔤")

# Input box, pre-filled with the sample text; everything below reacts to `text`.
text = st.text_area(
    "Enter your text:",
    value=DEFAULT_TEXT,
    height=150,
)
if text:
    # --- Tokenize -----------------------------------------------------------
    enc = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() makes strings such as "<|endoftext|>" encode as
    # ordinary text; with the default setting encode() raises ValueError when
    # the user's input happens to contain a special-token string.
    tokens = enc.encode(text, disallowed_special=())
    bytes_data = text.encode("utf-8")
    # Ratio comes from the project helper in utils.metrics.
    ratio = calculate_compression(bytes_data, tokens)

    # --- Summary metrics, one per column ------------------------------------
    cols = st.columns(4)
    metrics = [
        ("Text Length", f"{len(text)} chars"),
        ("UTF-8 Size", f"{len(bytes_data)} bytes"),
        ("Tokens", f"{len(tokens)}"),
        ("Compression", f"{ratio:.2f}x"),
    ]
    for col, (label, value) in zip(cols, metrics):
        col.metric(label, value)

    # --- Styles for the token visualization ---------------------------------
    st.write("""<style>
.token-viz {
background: #1E1E1E;
padding: 20px;
border-radius: 10px;
font-family: 'Noto Sans Malayalam', 'Noto Sans', system-ui, -apple-system, sans-serif;
white-space: pre-wrap;
word-break: break-word;
line-height: 2;
}
.token {
display: inline-block;
padding: 2px 4px;
margin: 0 1px;
border-radius: 4px;
vertical-align: middle;
}
</style>""", unsafe_allow_html=True)

    # --- Token visualization ------------------------------------------------
    # A token is a byte sequence, and a token boundary can fall in the middle
    # of a multi-byte UTF-8 character. Decoding each token on its own would
    # then produce replacement characters whose count does not match the
    # input, so the bytes are streamed through an incremental decoder instead:
    # a character is emitted in the span of the token that completes it, and
    # tokens that only contribute leading bytes render as empty chips.
    utf8_decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    spans = []
    previous_color = None
    for token in tokens:
        # Never reuse the previous colour, so neighbouring tokens stay
        # visually distinct.
        color = random.choice([c for c in COLORS if c != previous_color])
        previous_color = color
        token_text = utf8_decoder.decode(enc.decode_single_token_bytes(token))
        # The text is user input rendered with unsafe_allow_html=True, so it
        # must be escaped to be shown literally rather than parsed as markup.
        spans.append(
            f'<span class="token" style="background: {color}">'
            f"{html.escape(token_text)}</span>"
        )
    # Flush the decoder; this only yields output if the byte stream ended in
    # the middle of a character.
    trailing_text = utf8_decoder.decode(b"", final=True)
    if trailing_text:
        spans.append(html.escape(trailing_text))

    token_container = '<div class="token-viz">' + "".join(spans) + "</div>"
    st.write(token_container, unsafe_allow_html=True)

    # --- Raw token ids ------------------------------------------------------
    with st.expander("Token Details"):
        st.code(f"Tokens: {tokens}")