import colorsys
import html
import random

import streamlit as st

from bpe import Tokenizer
# Page-level settings: browser tab title and a full-width layout.
st.set_page_config(page_title="English BPE Tokenizer Visualizer", layout="wide")
# Load the trained tokenizer
@st.cache_resource
def load_tokenizer():
    """Create a ``Tokenizer`` and load the trained English BPE model into it.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        The ``Tokenizer`` instance after ``load`` has been called on it.
    """
    model_path = "models/EnglishBPE_6999.model.model"
    bpe_tokenizer = Tokenizer()
    bpe_tokenizer.load(model_path)
    return bpe_tokenizer
# Load example texts
@st.cache_data
def load_examples():
    """Return the two example texts offered by the example selector.

    The texts are read from ``data/testdata1.txt`` and ``data/testdata2.txt``.
    If reading fails, an error is reported through ``st.error`` and any
    example that was not loaded before the failure keeps its built-in
    fallback sentence, so two strings are always returned.

    Returns:
        A ``(example1, example2)`` tuple of strings.
    """
    # Bind the fallbacks first so the return below always has values, even
    # when one of the files is missing or cannot be decoded.
    example1 = "The quick brown fox jumps over the lazy dog."
    example2 = (
        "Byte pair encoding repeatedly merges the most frequent pair of symbols."
    )
    try:
        with open("data/testdata1.txt", "r", encoding="utf-8") as f:
            example1 = f.read().strip()
        with open("data/testdata2.txt", "r", encoding="utf-8") as f:
            example2 = f.read().strip()
    except (OSError, UnicodeError) as e:
        st.error(f"Error loading example texts: {str(e)}")
    return example1, example2
def generate_distinct_colors(n):
    """Return ``n`` hex color strings with evenly spaced hues.

    Hues are spread uniformly over the color wheel; saturation is drawn
    randomly from [0.7, 1.0) and value from [0.8, 1.0).

    Args:
        n: Number of colors to produce.

    Returns:
        A list of ``n`` strings of the form ``"#rrggbb"``.
    """
    palette = []
    for index in range(n):
        # Saturation is drawn before value, one pair per color.
        saturation = 0.7 + random.random() * 0.3
        value = 0.8 + random.random() * 0.2
        red, green, blue = colorsys.hsv_to_rgb(index / n, saturation, value)
        palette.append(
            f"#{int(red * 255):02x}{int(green * 255):02x}{int(blue * 255):02x}"
        )
    return palette
def process_text(text, tokenizer):
    """Tokenize *text* and build a color-coded HTML rendering of its tokens.

    Args:
        text: The string to tokenize.
        tokenizer: Object providing ``encode(text)``, returning a sequence of
            hashable token ids, and ``decode(ids)``, returning a string.

    Returns:
        ``(markup, tokens)``: ``markup`` holds one ``<span>`` per token, with
        the same background color for every occurrence of the same token id,
        and ``tokens`` is the result of ``tokenizer.encode``. If any step
        raises, ``("Error processing text: <message>", None)`` is returned
        instead.
    """
    try:
        tokens = tokenizer.encode(text)

        # One color per distinct token id. dict.fromkeys keeps first-seen
        # order, so color assignment follows the order tokens appear in.
        unique_tokens = list(dict.fromkeys(tokens))
        colors = generate_distinct_colors(len(unique_tokens))

        # Render each distinct token once and reuse the span for repeats.
        spans = {}
        for token, color in zip(unique_tokens, colors):
            # The caller renders this markup as raw HTML, so the decoded text
            # is escaped to keep "<", ">" and "&" from being read as tags.
            token_text = html.escape(tokenizer.decode([token]))
            # Explicit breaks avoid blank lines inside the markup while still
            # showing the line structure of the input.
            token_text = token_text.replace("\n", "<br>")
            spans[token] = (
                f'<span style="background-color: {color}; color: #000; '
                f'white-space: pre-wrap;">{token_text}</span>'
            )

        return "".join(spans[token] for token in tokens), tokens
    except Exception as e:
        return f"Error processing text: {str(e)}", None
def main():
    """Render the app: example picker, text input and tokenization output."""
    tokenizer = load_tokenizer()
    example1, example2 = load_examples()

    # Page heading
    st.title("English BPE Tokenizer Visualizer")
    st.markdown("Enter text to see how it gets tokenized, with color-coded visualization")

    # Let the user pick a preset or type their own text
    example_option = st.selectbox(
        "Choose an example or enter your own text below:",
        ["Custom Input", "Example 1", "Example 2"],
    )

    # Only presets pre-fill the text area; custom input keeps the widget default.
    preset_texts = {"Example 1": example1, "Example 2": example2}
    area_kwargs = {"height": 100}
    if example_option in preset_texts:
        area_kwargs["value"] = preset_texts[example_option]
    text = st.text_area("Enter Text", **area_kwargs)

    # Proceed when the button was clicked or the text area is non-empty.
    if not (st.button("Process Text") or text):
        return

    if not text.strip():
        st.warning("Please enter some text to process.")
        return

    # Visualization on the left (wider), token details on the right.
    col1, col2 = st.columns([2, 1])
    visualization, tokens = process_text(text, tokenizer)

    with col1:
        st.subheader("Visualization")
        st.markdown(visualization, unsafe_allow_html=True)

    with col2:
        # tokens is None when process_text reported an error.
        if tokens is not None:
            st.subheader("Token Information")
            st.write(f"Token count: {len(tokens)}")
            st.write("Tokens:", tokens)
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()