File size: 1,651 Bytes
708f4a3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | import streamlit as st
from crayon import CrayonVocab
import time
# Page chrome: configure the browser tab and layout first, then render the
# visible heading and the one-line tagline beneath it.
page_options = {
    "page_title": "CRAYON Tokenizer Demo",
    "layout": "wide",
}
st.set_page_config(**page_options)
st.title("🖍️ CRAYON v5.1.0 Tokenizer Demo")
st.markdown(
    "Interactive tokenization with CRAYON—the hyper-fast specialized tokenizer."
)
# Create the tokenizer once per session and keep it in session state so that
# later reruns of this script reuse the already-loaded instance.
if "vocab" not in st.session_state:
    with st.spinner("Loading vocabulary profile..."):
        # Build and load into a local first, and publish it to session state
        # only after load_profile() has returned. Storing the object before
        # loading would leave an unloaded vocabulary behind if loading
        # raised, and every later rerun would skip this branch and use it.
        loaded_vocab = CrayonVocab(device="cpu")  # Use CPU for cloud compatibility
        loaded_vocab.load_profile("lite")
        st.session_state.vocab = loaded_vocab
    # Shown only on the run that actually performed the load.
    st.success("✓ Profile loaded!")

# Short alias used by the rest of the script.
vocab = st.session_state.vocab
# Input section: a multi-line text box pre-filled with a sample sentence.
# The entered string is bound to `text_input` for the processing step below.
st.subheader("Input Text")
sample_text = "Hello, CRAYON! This is a production-grade tokenizer."
text_input = st.text_area("Enter text to tokenize:", value=sample_text, height=100)
# Tokenize the entered text and render the results. Skipped entirely while
# the text area is empty.
if text_input:
    # Time only the tokenize() call; the result is reported in milliseconds.
    start = time.perf_counter()
    tokens = vocab.tokenize(text_input)
    elapsed = (time.perf_counter() - start) * 1000

    # Round-trip the token ids back to text.
    decoded = vocab.decode(tokens)

    # Side-by-side layout: raw token ids on the left, summary metrics on the
    # right.
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Tokens")
        st.code(str(tokens), language="python")
    with col2:
        st.subheader("Statistics")
        st.metric("Token Count", len(tokens))
        st.metric("Processing Time", f"{elapsed:.3f}ms")

    st.subheader("Decoded Output")
    # Show the decoded string verbatim. st.write() would parse it as
    # Markdown, so input such as "# title" or "*x*" would be restyled instead
    # of displayed as typed.
    # NOTE(review): assumes vocab.decode() returns a str - confirm.
    st.text(decoded)

    # Per-token table: each id next to the text it decodes to on its own.
    with st.expander("📋 Token Breakdown"):
        header = f"{'ID':<8} | {'Substring':<20}"
        # repr() keeps leading/trailing spaces, newlines and quotes inside a
        # token visible and prevents a newline token from breaking its row.
        rows = [f"{tid:<8} | {vocab.decode([tid])!r}" for tid in tokens]
        # Emit the whole table as one preformatted block. Column padding only
        # lines up in preformatted text, a bare row of dashes passed through
        # Markdown becomes a horizontal rule, and a single element avoids
        # issuing one Streamlit call per token.
        st.code("\n".join([header, "-" * 30, *rows]), language=None)
|