"""Streamlit page that visualises how tiktoken's ``cl100k_base`` splits text.

The page shows size metrics for the entered text (characters, UTF-8 bytes,
token count, compression ratio) followed by the text itself with every token
highlighted in a pastel colour, and finally the raw token ids.
"""
import codecs
import html
import random

import streamlit as st
import tiktoken

from utils.metrics import calculate_compression

COLORS = ["#FFB7B2", "#FFDAC1", "#E2F0CB", "#B5EAD7", "#C7CEEA"]
DEFAULT_TEXT = (
    "ഇതൊരു Malayalam ടോക്കനൈസർ ആണ് 🔤.\n"
    "വാചകത്തെ ടോക്കണുകൾ എന്ന് വിളിക്കുന്ന ചെറിയ കഷ്ണങ്ങളായി വിഭജിക്കുന്ന ഒരു method aanu ടോക്കനൈസർ."
)

# Minimal styling for the highlighted token view. ``pre-wrap`` keeps runs of
# spaces visible; the dark text colour keeps tokens readable on the pastel
# backgrounds regardless of the active Streamlit theme.
TOKEN_CSS = (
    "<style>"
    ".token-container { line-height: 2; white-space: pre-wrap; }"
    ".token { padding: 2px 1px; border-radius: 3px; color: #000000; }"
    "</style>"
)


def _token_pieces(encoding, token_ids):
    """Return the text contributed by each token, aligned to whole characters.

    A byte-level BPE token may end in the middle of a multi-byte UTF-8
    character (very common for Malayalam and emoji). Decoding such a token on
    its own yields U+FFFD replacement characters whose count does not match
    the original text, so slicing the input by ``len(decode([token]))`` drifts.
    Instead the raw bytes of every token are fed through an incremental UTF-8
    decoder: a character is attributed to the token that completes it, and a
    token that completes no character contributes an empty string.

    Args:
        encoding: A tiktoken ``Encoding`` (anything exposing
            ``decode_single_token_bytes``).
        token_ids: Token ids in the order they were produced.

    Returns:
        A list with one string per token; joined together they reproduce the
        decoded text.
    """
    decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
    pieces = []
    for token_id in token_ids:
        pieces.append(decoder.decode(encoding.decode_single_token_bytes(token_id)))
    # Flush any dangling bytes (only possible for a truncated token list).
    tail = decoder.decode(b"", final=True)
    if tail and pieces:
        pieces[-1] += tail
    return pieces


def _render_tokens_html(pieces):
    """Build the HTML for the highlighted token view.

    Every non-empty piece becomes one coloured ``<span>``. Empty pieces
    (tokens whose bytes only finish inside a later token) are skipped, so
    their bytes show up as part of the span of the completing token.
    """
    spans = []
    previous_color = None
    for piece in pieces:
        if not piece:
            continue
        # Never reuse the previous colour, otherwise adjacent tokens would be
        # visually indistinguishable.
        color = random.choice([c for c in COLORS if c != previous_color])
        previous_color = color
        # Escape user text: it is rendered with unsafe_allow_html=True.
        # Newlines become <br> so that a blank line in the input cannot
        # terminate the raw-HTML block during Markdown processing.
        safe_text = html.escape(piece).replace("\n", "<br>")
        spans.append(
            f'<span class="token" style="background-color: {color};">{safe_text}</span>'
        )
    joined = "".join(spans)
    return f'<div class="token-container">{joined}</div>'


st.set_page_config(page_title="Malayalam Tokenizer", page_icon="🔤", layout="wide")
st.title("Malayalam Tokenizer 🔤")

text = st.text_area("Enter your text:", value=DEFAULT_TEXT, height=150)

if text:
    enc = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() makes text such as "<|endoftext|>" tokenize as
    # ordinary characters instead of raising ValueError on user input.
    tokens = enc.encode(text, disallowed_special=())
    bytes_data = text.encode("utf-8")
    ratio = calculate_compression(bytes_data, tokens)

    cols = st.columns(4)
    metrics = [
        ("Text Length", f"{len(text)} chars"),
        ("UTF-8 Size", f"{len(bytes_data)} bytes"),
        ("Tokens", f"{len(tokens)}"),
        ("Compression", f"{ratio:.2f}x"),
    ]
    for col, (label, value) in zip(cols, metrics):
        col.metric(label, value)

    st.write(TOKEN_CSS, unsafe_allow_html=True)
    st.write(_render_tokens_html(_token_pieces(enc, tokens)), unsafe_allow_html=True)

    with st.expander("Token Details"):
        st.code(f"Tokens: {tokens}")