# app.py — Hugging Face Space: token visualizer (Gradio).
# (Scrape header — Space status, file size, commit f073607, line-number gutter — removed.)
import html
import json
import re

import gradio as gr
from transformers import AutoTokenizer
# Registry of selectable models: display name -> Hugging Face Hub repo id.
SUPPORTED_MODELS = {
    "Llama-2": "meta-llama/Llama-2-7b-chat-hf",
    "Llama-3": "meta-llama/Meta-Llama-3-8B-Instruct",
    "Qwen2": "Qwen/Qwen2-7B-Instruct",
    "Gemma-2": "google/gemma-2-9b-it",
    "GPT-2": "gpt2",
    "BERT": "bert-base-uncased",
}

# Currently loaded tokenizer; None until load_tokenizer() succeeds.
current_tokenizer = None

# Background colors cycled through for consecutive tokens in the HTML view.
# All ten entries are distinct so adjacent tokens in a cycle never share a color.
TOKEN_COLORS = [
    "#e3f2fd",  # Light blue
    "#f3e5f5",  # Light purple
    "#e8f5e8",  # Light green
    "#fff3e0",  # Light orange
    "#fce4ec",  # Light pink
    "#e0f2f1",  # Light teal
    "#f1f8e9",  # Light lime
    "#fafafa",  # Light gray
    "#fff8e1",  # Light amber
    "#e8eaf6",  # Light indigo (was a duplicate of the purple hex above)
]
def load_tokenizer(model_name):
    """Load the tokenizer for *model_name* into the module-global slot.

    Args:
        model_name: A key of SUPPORTED_MODELS.

    Returns:
        A human-readable status string (success or failure).
    """
    global current_tokenizer
    try:
        model_path = SUPPORTED_MODELS[model_name]
        # trust_remote_code allows tokenizers whose code ships with the repo
        # (e.g. some Qwen variants) to load.
        current_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        # NOTE(review): the scraped source had mojibake ("β") and a string
        # literal broken across two lines here; restored as the conventional
        # single-line check/cross status messages.
        return f"✅ Successfully loaded {model_name} tokenizer"
    except Exception as e:
        return f"❌ Loading failed: {str(e)}"
def visualize_tokens(text, model_name):
    """Tokenize *text* with the currently loaded tokenizer and render HTML.

    Args:
        text: Input string to tokenize.
        model_name: Selected model display name (unused here; the tokenizer
            is taken from the module-global set by load_tokenizer).

    Returns:
        A 2-tuple ``(html_markup, stats)`` matching the two Gradio output
        components: the token visualization HTML and a statistics string,
        or an error/help message with ``None`` stats.
    """
    global current_tokenizer
    # Guard clauses. These must return exactly TWO values — the click handler
    # is bound to two output components (the original returned three here,
    # which breaks the Gradio callback).
    if not current_tokenizer:
        return "Please select and load a model first", None
    if not text.strip():
        return "Please enter text to analyze", None
    try:
        # Tokenize. Plain Python lists suffice; no tensor round-trip needed.
        token_ids = current_tokenizer(text, add_special_tokens=True)["input_ids"]
        tokens = current_tokenizer.convert_ids_to_tokens(token_ids)

        border_color = "#2196f3"
        parts = ["<div style='font-family: monospace; font-size: 14px; line-height: 1.5;'>"]

        # Section 1: token strings as colored chips, colors cycling per index.
        parts.append("<h3>Tokenization Results:</h3>")
        parts.append("<div style='margin-bottom: 20px;'>")
        for i, token in enumerate(tokens):
            color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[color_index]
            # Escape for both element text and attribute values so markup-like
            # tokens (e.g. "<s>", tokens containing quotes) render literally.
            # (The scraped original's .replace("<", "<") was a no-op.)
            safe_token = html.escape(token, quote=True)
            parts.append(
                f'<span class="token" data-token-id="{token_ids[i]}" '
                f'data-token-string="{safe_token}" data-token-index="{i}" '
                f'data-color-index="{color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black;">{safe_token}</span>'
            )
        parts.append("</div>")

        # Section 2: numeric token ids, colored in lockstep with the chips above.
        parts.append("<h3>Token IDs:</h3>")
        parts.append("<div style='margin-bottom: 20px;'>")
        for i, token_id in enumerate(token_ids):
            color_index = i % len(TOKEN_COLORS)
            bg_color = TOKEN_COLORS[color_index]
            parts.append(
                f'<span class="token-id" data-token-id="{token_id}" '
                f'data-token-index="{i}" data-color-index="{color_index}" '
                f'style="display: inline-block; margin: 2px; padding: 4px 8px; '
                f'background-color: {bg_color}; border: 1px solid {border_color}; '
                f'border-radius: 4px; color: black; '
                f'text-decoration: none !important;">{token_id}</span>'
            )
        parts.append("</div>")
        parts.append("</div>")

        stats = (
            f"Total tokens: {len(tokens)}\n"
            f"Vocabulary size: {current_tokenizer.vocab_size:,}"
        )
        return "".join(parts), stats
    except Exception as e:
        return f"❌ Processing failed: {str(e)}", None
# Build the Gradio UI: model picker/loader and text input on the left,
# visualization results on the right.
with gr.Blocks(title="Token Visualizer", theme=gr.themes.Soft()) as demo:
    # NOTE(review): "π" appears to be a mojibake'd emoji from the original
    # title — confirm the intended character against the source repo.
    gr.Markdown("# π Token Visualizer")
    gr.Markdown("This is a tool for visualizing the text tokenization process. Select a model, input text, and view the tokenization results.")
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 1. Select Model")
            # Choices come from the SUPPORTED_MODELS registry; GPT-2 is the
            # default because it downloads quickly.
            model_dropdown = gr.Dropdown(
                choices=list(SUPPORTED_MODELS.keys()),
                label="Select Model",
                value="GPT-2"
            )
            load_btn = gr.Button("Load Tokenizer", variant="primary")
            # Read-only status line filled by load_tokenizer's return value.
            load_status = gr.Textbox(label="Loading Status", interactive=False)
            gr.Markdown("### 2. Input Text")
            text_input = gr.Textbox(
                label="Enter text to tokenize",
                placeholder="Example: Hello, how are you today?",
                lines=4
            )
            visualize_btn = gr.Button("Visualize", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### 3. Visualization Results")
            # Token chips render here; stats (token count, vocab size) below.
            html_output = gr.HTML(label="Token Visualization")
            stats_output = gr.Textbox(label="Statistics", interactive=False)

    # Event binding: buttons dispatch to the two module-level handlers.
    load_btn.click(
        fn=load_tokenizer,
        inputs=[model_dropdown],
        outputs=[load_status]
    )
    visualize_btn.click(
        fn=visualize_tokens,
        inputs=[text_input, model_dropdown],
        outputs=[html_output, stats_output]
    )
if __name__ == "__main__":
    # Launch the Gradio server. (A stray trailing "|" — a table artifact from
    # the scraped page — was removed; it made this line a syntax error.)
    demo.launch()