# Commit e6162a8 (Debanjum): Make url shareable with model, text set via url query params
"""
Tokenizer Web Application
A simple webapp to visualize tokenization from any Hugging Face model.
"""
import os
from flask import Flask, render_template, request, jsonify
from transformers import AutoTokenizer
import hashlib
app = Flask(__name__)
# Cache for loaded tokenizers
tokenizer_cache = {}
def get_color_for_token(token_id: int, total_colors: int = 10) -> str:
"""Generate a consistent color for a token based on its ID."""
colors = [
"#FFEAA7", # Yellow
"#DFE6E9", # Light gray
"#A8E6CF", # Mint green
"#FDCB82", # Peach
"#C3AED6", # Lavender
"#FFB3BA", # Light pink
"#BAFFC9", # Light green
"#BAE1FF", # Light blue
"#FFE4E1", # Misty rose
"#E0BBE4", # Plum
]
return colors[token_id % len(colors)]
def load_tokenizer(model_id: str):
"""Load and cache a tokenizer from Hugging Face."""
if model_id not in tokenizer_cache:
try:
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer_cache[model_id] = tokenizer
except Exception as e:
raise ValueError(f"Failed to load tokenizer for '{model_id}': {str(e)}")
return tokenizer_cache[model_id]
@app.route("/")
def index():
# Get query parameters
model_id = request.args.get("model", "").strip() or request.args.get("model_id", "").strip()
text = request.args.get("text", "").strip()
return render_template("index.html", model_id=model_id, text=text)
@app.route("/tokenize", methods=["POST"])
def tokenize():
data = request.json
model_id = data.get("model_id", "").strip()
text = data.get("text", "")
if not model_id:
return jsonify({"error": "Model ID is required"}), 400
if not text:
return jsonify({"error": "Text is required"}), 400
try:
tokenizer = load_tokenizer(model_id)
# Tokenize the text
encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
token_ids = encoding["input_ids"]
# Get token strings
tokens = []
for i, token_id in enumerate(token_ids):
token_str = tokenizer.decode([token_id])
tokens.append({
"id": token_id,
"text": token_str,
"color": get_color_for_token(i),
})
return jsonify({
"tokens": tokens,
"token_count": len(tokens),
"model_id": model_id,
})
except ValueError as e:
return jsonify({"error": str(e)}), 400
except Exception as e:
return jsonify({"error": f"Tokenization failed: {str(e)}"}), 500
@app.route("/models/suggestions")
def model_suggestions():
"""Return a list of popular model suggestions."""
suggestions = [
"qwen/qwen3-4B",
"google/gemma-3-1b-it",
"openai/gpt-oss-20b",
"meta-llama/llama-3.2-3b",
]
return jsonify(suggestions)
if __name__ == "__main__":
# Use port 7860 for HF Spaces compatibility
port = int(os.environ.get("PORT", 7860))
app.run(debug=False, host="0.0.0.0", port=port)