File size: 2,289 Bytes
689b021
8988399
e148493
8988399
689b021
e148493
 
8988399
e148493
8988399
 
e148493
8988399
 
 
 
e148493
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import streamlit as st
import tiktoken
import random
from utils.metrics import calculate_compression

COLORS = ["#FFB7B2", "#FFDAC1", "#E2F0CB", "#B5EAD7", "#C7CEEA"]
DEFAULT_TEXT = "ഇതൊരു Malayalam ടോക്കനൈസർ ആണ് 🔤.\nവാചകത്തെ ടോക്കണുകൾ എന്ന് വിളിക്കുന്ന ചെറിയ കഷ്ണങ്ങളായി വിഭജിക്കുന്ന ഒരു method aanu ടോക്കനൈസർ."

st.set_page_config(page_title="Malayalam Tokenizer", page_icon="🔤", layout="wide")
st.title("Malayalam Tokenizer 🔤")

text = st.text_area("Enter your text:", value=DEFAULT_TEXT, height=150)

if text:
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    bytes_data = text.encode("utf-8")
    ratio = calculate_compression(bytes_data, tokens)
    
    cols = st.columns(4)
    metrics = [
        ("Text Length", f"{len(text)} chars"),
        ("UTF-8 Size", f"{len(bytes_data)} bytes"),
        ("Tokens", f"{len(tokens)}"),
        ("Compression", f"{ratio:.2f}x")
    ]
    for col, (label, value) in zip(cols, metrics):
        col.metric(label, value)

    st.write("""<style>
        .token-viz {
            background: #1E1E1E;
            padding: 20px;
            border-radius: 10px;
            font-family: 'Noto Sans Malayalam', 'Noto Sans', system-ui, -apple-system, sans-serif;
            white-space: pre-wrap;
            word-break: break-word;
            line-height: 2;
        }
        .token {
            display: inline-block;
            padding: 2px 4px;
            margin: 0 1px;
            border-radius: 4px;
            vertical-align: middle;
        }
    </style>""", unsafe_allow_html=True)
    
    token_container = '<div class="token-viz">'
    current_pos = 0
    
    for token in tokens:
        color = random.choice(COLORS)
        token_text = text[current_pos:current_pos + len(enc.decode([token]))]
        current_pos += len(token_text)
        token_container += f'<span class="token" style="background: {color}">{token_text}</span>'
    
    token_container += '</div>'
    st.write(token_container, unsafe_allow_html=True)
    
    with st.expander("Token Details"):
        st.code(f"Tokens: {tokens}")