# app.py — Hugging Face Space: token visualizer (Gradio).
# (Scrape header — Space status, file size, commit f073607, line-number gutter — removed.)
import html
import json
import re

import gradio as gr
from transformers import AutoTokenizer
# Registry of selectable models: display name -> Hugging Face Hub repo id.
SUPPORTED_MODELS = {
    "Llama-2": "meta-llama/Llama-2-7b-chat-hf",
    "Llama-3": "meta-llama/Meta-Llama-3-8B-Instruct",
    "Qwen2": "Qwen/Qwen2-7B-Instruct",
    "Gemma-2": "google/gemma-2-9b-it",
    "GPT-2": "gpt2",
    "BERT": "bert-base-uncased",
}

# Currently loaded tokenizer; None until load_tokenizer() succeeds.
current_tokenizer = None

# Background colors cycled through for consecutive tokens in the HTML view.
# All ten entries are distinct so adjacent tokens in a cycle never share a color.
TOKEN_COLORS = [
    "#e3f2fd",  # Light blue
    "#f3e5f5",  # Light purple
    "#e8f5e8",  # Light green
    "#fff3e0",  # Light orange
    "#fce4ec",  # Light pink
    "#e0f2f1",  # Light teal
    "#f1f8e9",  # Light lime
    "#fafafa",  # Light gray
    "#fff8e1",  # Light amber
    "#e8eaf6",  # Light indigo (was a duplicate of the purple hex above)
]
def load_tokenizer(model_name):
    """Load the tokenizer for *model_name* into the module-global slot.

    Args:
        model_name: A key of SUPPORTED_MODELS.

    Returns:
        A human-readable status string (success or failure).
    """
    global current_tokenizer
    try:
        model_path = SUPPORTED_MODELS[model_name]
        # trust_remote_code allows tokenizers whose code ships with the repo
        # (e.g. some Qwen variants) to load.
        current_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # NOTE(review): the scraped source had mojibake ("β") and a string
        # literal broken across two lines here; restored as the conventional
        # single-line check/cross status messages.
        return f"✅ Successfully loaded {model_name} tokenizer"
    except Exception as e:
        return f"❌ Loading failed: {str(e)}"
def visualize_tokens(text, model_name):
    """Tokenize *text* with the currently loaded tokenizer and render HTML.

    Args:
        text: Input string to tokenize.
        model_name: Selected model display name (unused here; the tokenizer
            is taken from the module-global set by load_tokenizer).

    Returns:
        A 2-tuple ``(html_markup, stats)`` matching the two Gradio output
        components: the token visualization HTML and a statistics string,
        or an error/help message with ``None`` stats.
    """
    global current_tokenizer
    # Guard clauses. These must return exactly TWO values — the click handler
    # is bound to two output components (the original returned three here,
    # which breaks the Gradio callback).
    if not current_tokenizer:
        return "Please select and load a model first", None
    if not text.strip():
        return "Please enter text to analyze", None
    try:
        # Tokenize. Plain Python lists suffice; no tensor round-trip needed.
        token_ids = current_tokenizer(text, add_special_tokens=True)["input_ids"]
        tokens = current_tokenizer.convert_ids_to_tokens(token_ids)

        border_color = "#2196f3"
        parts = ["<div style='font-family: monospace; font-size: 14px; line-height: 1.5;'>"]

        # Section 1: token strings as colored chips, colors cycling per index.
        parts.append("<h3>Tokenization Results:</h3>")
        parts.append("<div style='margin-bottom: 20px;'>")
        for i, token in enumerate(tokens):
            color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[color_index]
            # Escape for both element text and attribute values so markup-like
            # tokens (e.g. "<s>", tokens containing quotes) render literally.
            # (The scraped original's .replace("<", "<") was a no-op.)
            safe_token = html.escape(token, quote=True)
            parts.append(
                f'<span class="token" data-token-id="{token_ids[i]}" '
                f'data-token-string="{safe_token}" data-token-index="{i}" '
                f'data-color-index="{color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black;">{safe_token}</span>'
            )
        parts.append("</div>")

        # Section 2: numeric token ids, colored in lockstep with the chips above.
        parts.append("<h3>Token IDs:</h3>")
        parts.append("<div style='margin-bottom: 20px;'>")
        for i, token_id in enumerate(token_ids):
            color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[color_index]
            parts.append(
                f'<span class="token-id" data-token-id="{token_id}" '
                f'data-token-index="{i}" data-color-index="{color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black; '
                f'text-decoration: none !important;">{token_id}</span>'
            )
        parts.append("</div>")
        parts.append("</div>")

        stats = (
            f"Total tokens: {len(tokens)}\n"
            f"Vocabulary size: {current_tokenizer.vocab_size:,}"
        )
        return "".join(parts), stats
    except Exception as e:
        return f"❌ Processing failed: {str(e)}", None
# Build the Gradio UI: model picker/loader and text input on the left,
# visualization results on the right.
with gr.Blocks(title="Token Visualizer", theme=gr.themes.Soft()) as demo:
    # NOTE(review): "π" appears to be a mojibake'd emoji from the original
    # title — confirm the intended character against the source repo.
    gr.Markdown("# π Token Visualizer")
    gr.Markdown("This is a tool for visualizing the text tokenization process. Select a model, input text, and view the tokenization results.")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Select Model")
            # Choices come from the SUPPORTED_MODELS registry; GPT-2 is the
            # default because it downloads quickly.
            model_dropdown = gr.Dropdown(
                choices=list(SUPPORTED_MODELS.keys()),
                label="Select Model",
                value="GPT-2"
            )
            load_btn = gr.Button("Load Tokenizer", variant="primary")
            # Read-only status line filled by load_tokenizer's return value.
            load_status = gr.Textbox(label="Loading Status", interactive=False)
            gr.Markdown("### 2. Input Text")
            text_input = gr.Textbox(
                label="Enter text to tokenize",
                placeholder="Example: Hello, how are you today?",
                lines=4
            )
            visualize_btn = gr.Button("Visualize", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### 3. Visualization Results")
            # Token chips render here; stats (token count, vocab size) below.
            html_output = gr.HTML(label="Token Visualization")
            stats_output = gr.Textbox(label="Statistics", interactive=False)

    # Event binding: buttons dispatch to the two module-level handlers.
    load_btn.click(
        fn=load_tokenizer,
        inputs=[model_dropdown],
        outputs=[load_status]
    )
    visualize_btn.click(
        fn=visualize_tokens,
        inputs=[text_input, model_dropdown],
        outputs=[html_output, stats_output]
    )
if __name__ == "__main__":
    # Launch the Gradio server. (A stray trailing "|" — a table artifact from
    # the scraped page — was removed; it made this line a syntax error.)
    demo.launch()