| import streamlit as st |
| from crayon import CrayonVocab |
| import time |
|
|
# Page chrome: browser-tab title and wide layout are configured before any
# element is drawn, followed by the visible heading and a one-line tagline.
st.set_page_config(
    page_title="CRAYON Tokenizer Demo",
    layout="wide",
)
st.title("🖍️ CRAYON v5.1.0 Tokenizer Demo")
st.markdown(
    "Interactive tokenization with CRAYON—the hyper-fast specialized tokenizer."
)
|
|
| |
# Create the tokenizer once and keep it in session state so later runs of this
# script reuse the same instance instead of reloading the profile.
if "vocab" not in st.session_state:
    with st.spinner("Loading vocabulary profile..."):
        # Build and load into a local first: if load_profile() raises, nothing
        # has been stored yet, so the next run retries the load instead of
        # silently reusing a vocab whose profile never finished loading.
        loaded_vocab = CrayonVocab(device="cpu")
        loaded_vocab.load_profile("lite")
    st.session_state.vocab = loaded_vocab
    st.success("✓ Profile loaded!")

# Short alias used by the rest of the script.
vocab = st.session_state.vocab
|
|
| |
# Text box for the string to tokenize. It is pre-filled with a sample sentence,
# so the value is non-empty on first load.
_SAMPLE_TEXT = "Hello, CRAYON! This is a production-grade tokenizer."

st.subheader("Input Text")
text_input = st.text_area(
    label="Enter text to tokenize:",
    value=_SAMPLE_TEXT,
    height=100,
)
|
|
# Tokenize the current contents of the text box and render the results.
# An empty text box skips the whole section.
if text_input:
    # Time only the tokenize() call; the elapsed time is reported in milliseconds.
    started_at = time.perf_counter()
    tokens = vocab.tokenize(text_input)
    elapsed = (time.perf_counter() - started_at) * 1000

    # Turn the tokens back into text so the round trip can be inspected.
    decoded = vocab.decode(tokens)

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Tokens")
        st.code(str(tokens), language="python")

    with col2:
        st.subheader("Statistics")
        st.metric("Token Count", len(tokens))
        st.metric("Processing Time", f"{elapsed:.3f}ms")

    st.subheader("Decoded Output")
    # st.text skips Markdown parsing, so characters such as '#', '*' or '_' in
    # the decoded text are shown as typed instead of being turned into markup.
    st.text(decoded)

    with st.expander("📋 Token Breakdown"):
        # The column padding only lines up in a monospace, non-Markdown element,
        # so the table is assembled as plain text and emitted as a single code
        # block (one element in total rather than one per token).
        breakdown_rows = [f"{'ID':<8} | {'Substring':<20}", "-" * 30]
        # repr() keeps each token on one row even when its text is a newline
        # or other whitespace, and makes such tokens visible.
        breakdown_rows.extend(
            f"{tid:<8} | {vocab.decode([tid])!r}" for tid in tokens
        )
        st.code("\n".join(breakdown_rows), language=None)
|
|