Spaces:
Sleeping
Sleeping
import html
import random

import streamlit as st
import tiktoken

from utils.metrics import calculate_compression
# Streamlit app: tokenize the entered text with tiktoken's "cl100k_base"
# encoding, show size metrics, and render the text with one coloured span
# per token (or per group of tokens, when several tokens share a character).

# Background colours used for the token spans, cycled in order.
COLORS = ["#FFB7B2", "#FFDAC1", "#E2F0CB", "#B5EAD7", "#C7CEEA"]

# Text shown in the input box when the page first loads.
DEFAULT_TEXT = (
    "ഇതൊരു Malayalam ടോക്കനൈസർ ആണ് 🔤.\n"
    "വാചകത്തെ ടോക്കണുകൾ എന്ന് വിളിക്കുന്ന ചെറിയ കഷ്ണങ്ങളായി വിഭജിക്കുന്ന ഒരു method aanu ടോക്കനൈസർ."
)

# Stylesheet for the token visualisation (dark panel, rounded token chips).
TOKEN_CSS = """<style>
.token-viz {
    background: #1E1E1E;
    padding: 20px;
    border-radius: 10px;
    font-family: 'Noto Sans Malayalam', 'Noto Sans', system-ui, -apple-system, sans-serif;
    white-space: pre-wrap;
    word-break: break-word;
    line-height: 2;
}
.token {
    display: inline-block;
    padding: 2px 4px;
    margin: 0 1px;
    border-radius: 4px;
    vertical-align: middle;
}
</style>"""


def _group_tokens(token_ids, token_bytes):
    """Pair token ids with the text they cover.

    A single token can hold only part of a multi-byte UTF-8 character, so its
    bytes may not be decodable on their own. Consecutive tokens are therefore
    merged until their combined bytes form valid UTF-8.

    Args:
        token_ids: Sequence of integer token ids.
        token_bytes: Sequence of ``bytes`` objects, one per token id, in the
            same order.

    Returns:
        A list of ``(text, ids)`` tuples, where ``ids`` is the list of token
        ids whose bytes decode to ``text``.
    """
    groups = []
    pending_ids = []
    pending = b""
    for token_id, chunk in zip(token_ids, token_bytes):
        pending_ids.append(token_id)
        pending += chunk
        try:
            piece = pending.decode("utf-8")
        except UnicodeDecodeError:
            # Character still incomplete: keep accumulating tokens.
            continue
        groups.append((piece, pending_ids))
        pending_ids, pending = [], b""
    if pending_ids:
        # Trailing bytes that never completed a character; show them with
        # replacement characters rather than dropping the tokens.
        groups.append((pending.decode("utf-8", errors="replace"), pending_ids))
    return groups


def _render_token_html(groups):
    """Build the HTML for the token visualisation.

    Args:
        groups: List of ``(text, ids)`` tuples as returned by
            ``_group_tokens``.

    Returns:
        An HTML string: one ``<span class="token">`` per group inside a
        ``<div class="token-viz">``. The text is HTML-escaped and the token
        ids are exposed through the span's ``title`` attribute.
    """
    spans = []
    for index, (piece, ids) in enumerate(groups):
        # Cycling the palette keeps neighbouring spans visually distinct and
        # the colouring stable across reruns.
        color = COLORS[index % len(COLORS)]
        tooltip = ", ".join(str(token_id) for token_id in ids)
        spans.append(
            f'<span class="token" style="background: {color}" '
            f'title="{tooltip}">{html.escape(piece)}</span>'
        )
    return '<div class="token-viz">' + "".join(spans) + "</div>"


st.set_page_config(page_title="Malayalam Tokenizer", page_icon="🔤", layout="wide")
st.title("Malayalam Tokenizer 🔤")
text = st.text_area("Enter your text:", value=DEFAULT_TEXT, height=150)

if text:
    enc = tiktoken.get_encoding("cl100k_base")
    # Treat special-token strings such as "<|endoftext|>" as ordinary text
    # instead of raising ValueError on user input.
    tokens = enc.encode(text, disallowed_special=())
    bytes_data = text.encode("utf-8")
    ratio = calculate_compression(bytes_data, tokens)

    metrics = [
        ("Text Length", f"{len(text)} chars"),
        ("UTF-8 Size", f"{len(bytes_data)} bytes"),
        ("Tokens", f"{len(tokens)}"),
        ("Compression", f"{ratio:.2f}x"),
    ]
    for col, (label, value) in zip(st.columns(len(metrics)), metrics):
        col.metric(label, value)

    st.write(TOKEN_CSS, unsafe_allow_html=True)

    # Work from the raw bytes of each token: decoding tokens one at a time to
    # str yields replacement characters for partial UTF-8 sequences, which
    # misaligns the spans for non-ASCII text.
    token_groups = _group_tokens(tokens, enc.decode_tokens_bytes(tokens))
    st.write(_render_token_html(token_groups), unsafe_allow_html=True)

    with st.expander("Token Details"):
        st.code(f"Tokens: {tokens}")