File size: 6,119 Bytes
f073607
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import gradio as gr
from transformers import AutoTokenizer
import json
import re

# Registry of selectable models: display name -> Hugging Face Hub repo id.
# Dropdown choices in the UI are built from the keys of this dict.
SUPPORTED_MODELS = {
    "Llama-2": "meta-llama/Llama-2-7b-chat-hf",
    "Llama-3": "meta-llama/Meta-Llama-3-8B-Instruct",
    "Qwen2": "Qwen/Qwen2-7B-Instruct",
    "Gemma-2": "google/gemma-2-9b-it",
    "GPT-2": "gpt2",
    "BERT": "bert-base-uncased",
}

# Module-level cache holding the most recently loaded tokenizer.
# Written by load_tokenizer(); read by visualize_tokens(). None until a
# model has been loaded successfully.
current_tokenizer = None

# Background colors cycled through for consecutive tokens so adjacent
# tokens are visually distinguishable (index = token position % len).
TOKEN_COLORS = [
    "#e3f2fd",  # Light blue
    "#f3e5f5",  # Light purple
    "#e8f5e8",  # Light green
    "#fff3e0",  # Light orange
    "#fce4ec",  # Light pink
    "#e0f2f1",  # Light teal
    "#f1f8e9",  # Light lime
    "#fafafa",  # Light gray
    "#fff8e1",  # Light amber
    "#f3e5f5",  # NOTE: duplicate of the light-purple entry above (was mislabeled "Light indigo")
]

def load_tokenizer(model_name):
    """Load the tokenizer for *model_name* into the module-level cache.

    Args:
        model_name: A display name that must be a key of SUPPORTED_MODELS.

    Returns:
        A human-readable status string for the Gradio status textbox.
    """
    global current_tokenizer
    try:
        # Lookup stays inside the try so an unknown model name is also
        # reported as a loading failure rather than raising.
        repo_id = SUPPORTED_MODELS[model_name]
        current_tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    except Exception as e:
        return f"❌ Loading failed: {str(e)}"
    return f"βœ… Successfully loaded {model_name} tokenizer"

def visualize_tokens(text, model_name):
    """Render the tokenization of *text* as styled HTML plus a stats string.

    Args:
        text: The input text to tokenize.
        model_name: Display name of the selected model. Currently unused;
            tokenization is done with the globally cached tokenizer. Kept
            for interface compatibility with the Gradio click binding.

    Returns:
        A 2-tuple ``(html, stats)`` matching the two Gradio output
        components. On any error, ``html`` carries the message and
        ``stats`` is None.
    """
    global current_tokenizer

    # BUG FIX: these guard paths previously returned THREE values while the
    # click handler has only two outputs; every path now returns two.
    if not current_tokenizer:
        return "Please select and load a model first", None

    if not text.strip():
        return "Please enter text to analyze", None

    def _escape(s):
        # Escape '&' FIRST so entities produced by the later replacements
        # are not double-escaped; also escape '"' so token text is safe
        # inside double-quoted HTML attribute values.
        return (s.replace("&", "&amp;")
                 .replace("<", "&lt;")
                 .replace(">", "&gt;")
                 .replace('"', "&quot;"))

    try:
        # Tokenize with special tokens included so BOS/EOS etc. are visible.
        encoding = current_tokenizer(text, return_tensors="pt", add_special_tokens=True)
        token_ids = encoding['input_ids'][0].tolist()
        tokens = current_tokenizer.convert_ids_to_tokens(token_ids)

        html_output = "<div style='font-family: monospace; font-size: 14px; line-height: 1.5;'>"

        # Section 1: the token strings, each in a colored pill.
        html_output += "<h3>Tokenization Results:</h3>"
        html_output += "<div style='margin-bottom: 20px;'>"

        border_color = "#2196f3"  # shared accent border for both sections
        for i, token in enumerate(tokens):
            # Cycle through the palette so neighbors differ in color.
            color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[color_index]
            escaped_token = _escape(token)
            html_output += (
                f'<span class="token" data-token-id="{token_ids[i]}" '
                f'data-token-string="{escaped_token}" data-token-index="{i}" '
                f'data-color-index="{color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black;">{escaped_token}</span>'
            )

        html_output += "</div>"

        # Section 2: the numeric token ids, same color cycling so the
        # i-th id visually matches the i-th token above.
        html_output += "<h3>Token IDs:</h3>"
        html_output += "<div style='margin-bottom: 20px;'>"

        for i, token_id in enumerate(token_ids):
            color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[color_index]
            html_output += (
                f'<span class="token-id" data-token-id="{token_id}" '
                f'data-token-index="{i}" data-color-index="{color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black; '
                f'text-decoration: none !important;">{token_id}</span>'
            )

        html_output += "</div>"
        html_output += "</div>"

        vocab_size = current_tokenizer.vocab_size
        return html_output, f"Total tokens: {len(tokens)}\nVocabulary size: {vocab_size:,}"

    except Exception as e:
        # Broad catch is intentional: surface any tokenizer failure in the
        # UI rather than crashing the Gradio handler.
        return f"❌ Processing failed: {str(e)}", None



# Create Gradio interface.
# NOTE: component creation order inside the `with` blocks IS the rendered
# layout order, so statements here must not be reordered.
with gr.Blocks(title="Token Visualizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# πŸ” Token Visualizer")
    gr.Markdown("This is a tool for visualizing the text tokenization process. Select a model, input text, and view the tokenization results.")
    
    with gr.Row():
        # Left column (narrow): model selection and text input controls.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Select Model")
            model_dropdown = gr.Dropdown(
                choices=list(SUPPORTED_MODELS.keys()),
                label="Select Model",
                value="GPT-2"
            )
            load_btn = gr.Button("Load Tokenizer", variant="primary")
            load_status = gr.Textbox(label="Loading Status", interactive=False)
            
            gr.Markdown("### 2. Input Text")
            text_input = gr.Textbox(
                label="Enter text to tokenize",
                placeholder="Example: Hello, how are you today?",
                lines=4
            )
            visualize_btn = gr.Button("Visualize", variant="primary")
            
        # Right column (wide): rendered token HTML and summary stats.
        with gr.Column(scale=2):
            gr.Markdown("### 3. Visualization Results")
            html_output = gr.HTML(label="Token Visualization")
            stats_output = gr.Textbox(label="Statistics", interactive=False)
    
    # Event binding
    
    # "Load Tokenizer" -> load_tokenizer(model_name) -> status string.
    load_btn.click(
        fn=load_tokenizer,
        inputs=[model_dropdown],
        outputs=[load_status]
    )
    
    # "Visualize" -> visualize_tokens(text, model_name) -> (html, stats).
    # Two outputs here, so visualize_tokens must return exactly two values.
    visualize_btn.click(
        fn=visualize_tokens,
        inputs=[text_input, model_dropdown],
        outputs=[html_output, stats_output]
    )
    


if __name__ == "__main__":
    # Start the Gradio server with default host/port settings.
    demo.launch()