File size: 3,816 Bytes
d6ae8d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import colorsys
import html
import random

import streamlit as st

from bpe import Tokenizer

# Set page config
# Sets the page title and switches the app to the "wide" layout.
# NOTE(review): Streamlit has historically required set_page_config to be the
# first st.* command executed in a script — confirm against the Streamlit
# version in use before moving this call.
st.set_page_config(
    page_title="English BPE Tokenizer Visualizer",
    layout="wide"
)

# Load the trained tokenizer
@st.cache_resource
def load_tokenizer():
    """Create a ``Tokenizer`` and load the trained BPE model file into it.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        The ``Tokenizer`` instance after ``load`` has been called on it.
    """
    model_path = "models/EnglishBPE_6999.model.model"
    bpe_tokenizer = Tokenizer()
    bpe_tokenizer.load(model_path)
    return bpe_tokenizer

# Load example texts
@st.cache_data
def load_examples():
    """Read the two example texts shown in the example selector.

    Each file is read independently. If a file cannot be opened or decoded,
    an error is shown with ``st.error`` and a built-in fallback sentence is
    used for that example, so the function always returns two strings
    instead of failing on an unassigned variable.

    Returns:
        A ``(example1, example2)`` tuple of strings with surrounding
        whitespace stripped.
    """
    # (path, fallback text used when the file cannot be read)
    sources = (
        (
            "data/testdata1.txt",
            "The quick brown fox jumps over the lazy dog.",
        ),
        (
            "data/testdata2.txt",
            "Byte pair encoding merges the most frequent pairs of symbols "
            "into new tokens.",
        ),
    )

    examples = []
    for path, fallback in sources:
        try:
            with open(path, "r", encoding="utf-8") as f:
                examples.append(f.read().strip())
        except (OSError, UnicodeError) as e:
            # Missing/unreadable file or invalid UTF-8: report it, but keep
            # the app usable by substituting the fallback text.
            st.error(f"Error loading example texts: {str(e)}")
            examples.append(fallback)

    example1, example2 = examples
    return example1, example2

def generate_distinct_colors(n):
    """Return ``n`` hex color strings with evenly spaced hues.

    Hues are spread uniformly over [0, 1) as ``index / n``; saturation is
    drawn randomly from [0.7, 1.0] and value from [0.8, 1.0], so the result
    differs between calls unless the ``random`` module is seeded.

    Args:
        n: Number of colors to produce. ``0`` yields an empty list.

    Returns:
        A list of ``n`` strings of the form ``"#rrggbb"``.
    """
    palette = []
    for index in range(n):
        # Saturation is drawn before value; keep this order so a seeded
        # random stream produces the same palette.
        sat = 0.7 + random.random() * 0.3
        val = 0.8 + random.random() * 0.2
        red, green, blue = colorsys.hsv_to_rgb(index / n, sat, val)
        palette.append(
            f"#{int(red * 255):02x}{int(green * 255):02x}{int(blue * 255):02x}"
        )
    return palette

def process_text(text, tokenizer):
    """Tokenize ``text`` and build a color-coded HTML rendering of the tokens.

    Every distinct token ID gets its own background color; each token is
    emitted as a ``<span>`` whose ``title`` shows the token ID.

    Args:
        text: The string to tokenize.
        tokenizer: Object providing ``encode(text)`` (returning an iterable
            of hashable token IDs) and ``decode(ids)`` (returning a string).

    Returns:
        A ``(html, tokens)`` tuple. On success ``html`` is the concatenated
        spans and ``tokens`` is the value returned by ``tokenizer.encode``.
        If anything raises, ``html`` is a red error message and ``tokens``
        is ``None``.
    """
    try:
        # Get tokens
        tokens = tokenizer.encode(text)

        # De-duplicate while keeping first-appearance order, so the color
        # assignment does not depend on set iteration order.
        unique_tokens = list(dict.fromkeys(tokens))
        colors = generate_distinct_colors(len(unique_tokens))
        token_colors = dict(zip(unique_tokens, colors))

        # Create HTML visualization
        html_parts = []
        for token in tokens:
            # The caller renders this markup with unsafe_allow_html=True, so
            # the decoded text must be escaped: otherwise "<", "&" or quotes
            # in the input would be interpreted as HTML.
            token_text = html.escape(tokenizer.decode([token]))
            color = token_colors[token]
            html_parts.append(f'<span style="background-color: {color}; padding: 0 2px; border-radius: 3px;" title="Token ID: {token}">{token_text}</span>')

        return ''.join(html_parts), tokens
    except Exception as e:
        # Best-effort UI boundary: show the failure instead of crashing the
        # page. The message is escaped for the same reason as token text.
        return f"<span style='color: red'>Error processing text: {html.escape(str(e))}</span>", None

def main():
    """Render the tokenizer visualizer page.

    Loads the tokenizer and example texts, shows an example selector and a
    text area, and, when the button is pressed or the text area is
    non-empty, displays the color-coded tokenization next to the token
    count and token list.
    """
    # Load tokenizer and examples
    tokenizer = load_tokenizer()
    example1, example2 = load_examples()

    # Title and description
    st.title("English BPE Tokenizer Visualizer")
    st.markdown("Enter text to see how it gets tokenized, with color-coded visualization")

    # Example selector
    example_option = st.selectbox(
        "Choose an example or enter your own text below:",
        ["Custom Input", "Example 1", "Example 2"]
    )

    # Text input: pre-fill the area only when an example is selected;
    # "Custom Input" leaves the widget's own default value in place.
    prefill = {"Example 1": example1, "Example 2": example2}
    area_kwargs = {"height": 100}
    if example_option in prefill:
        area_kwargs["value"] = prefill[example_option]
    text = st.text_area("Enter Text", **area_kwargs)

    # Process button: the button is always rendered; processing happens when
    # it was clicked or when there is already text in the area.
    clicked = st.button("Process Text")
    if not clicked and not text:
        return

    if not text.strip():
        st.warning("Please enter some text to process.")
        return

    # Create two columns for output
    left, right = st.columns([2, 1])

    # Process the text
    visualization, tokens = process_text(text, tokenizer)

    with left:
        st.subheader("Visualization")
        st.markdown(visualization, unsafe_allow_html=True)

    with right:
        # tokens is None when process_text reported an error.
        if tokens is not None:
            st.subheader("Token Information")
            st.write(f"Token count: {len(tokens)}")
            st.write("Tokens:", tokens)

# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()