"""Tokenizer Web Application.

A simple webapp to visualize tokenization from any Hugging Face model.
"""

import hashlib  # NOTE(review): unused in this chunk; kept in case another part of the file needs it
import os

from flask import Flask, jsonify, render_template, request
from transformers import AutoTokenizer

app = Flask(__name__)

# Cache of model_id -> tokenizer so repeated requests skip the slow
# download/initialization step. Grows unbounded; acceptable for a demo app.
tokenizer_cache = {}

# Fixed pastel palette, hoisted to module level so it is built once rather
# than on every call. Adjacent entries are visually distinct so neighbouring
# tokens are easy to tell apart in the UI.
_TOKEN_COLORS = [
    "#FFEAA7",  # Yellow
    "#DFE6E9",  # Light gray
    "#A8E6CF",  # Mint green
    "#FDCB82",  # Peach
    "#C3AED6",  # Lavender
    "#FFB3BA",  # Light pink
    "#BAFFC9",  # Light green
    "#BAE1FF",  # Light blue
    "#FFE4E1",  # Misty rose
    "#E0BBE4",  # Plum
]


def get_color_for_token(token_id: int, total_colors: int = 10) -> str:
    """Return a hex color for a token, cycling through the palette.

    Args:
        token_id: Integer used to pick a palette slot. NOTE(review): the
            only caller in this file passes the token's *position*, not its
            vocabulary id, so adjacent tokens alternate colors.
        total_colors: Upper bound on how many palette entries to cycle
            through (capped at the palette size; default uses all 10).
            Previously this parameter was accepted but ignored.

    Returns:
        A "#RRGGBB" color string.
    """
    # Clamp to the palette size; fall back to the full palette if a caller
    # passes a non-positive bound, so the modulo below can never be by zero.
    span = min(total_colors, len(_TOKEN_COLORS))
    if span <= 0:
        span = len(_TOKEN_COLORS)
    return _TOKEN_COLORS[token_id % span]


def load_tokenizer(model_id: str):
    """Load a tokenizer from the Hugging Face Hub, caching it by model id.

    Args:
        model_id: Hub repo id, e.g. "gpt2" or "org/model-name".

    Returns:
        The loaded ``PreTrainedTokenizer`` instance (cached after first load).

    Raises:
        ValueError: If the tokenizer cannot be downloaded or initialized.
    """
    if model_id not in tokenizer_cache:
        try:
            # SECURITY NOTE: trust_remote_code=True executes Python code
            # shipped with the model repo. Required for some tokenizers, but
            # only safe if the deployment accepts running Hub-provided code.
            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
            tokenizer_cache[model_id] = tokenizer
        except Exception as e:
            # Chain the original exception so the real cause stays debuggable.
            raise ValueError(f"Failed to load tokenizer for '{model_id}': {str(e)}") from e
    return tokenizer_cache[model_id]


@app.route("/")
def index():
    """Render the main page, pre-filling model/text from query parameters.

    Accepts either ``?model=`` or the legacy ``?model_id=`` parameter name.
    """
    model_id = request.args.get("model", "").strip() or request.args.get("model_id", "").strip()
    text = request.args.get("text", "").strip()
    return render_template("index.html", model_id=model_id, text=text)


@app.route("/tokenize", methods=["POST"])
def tokenize():
    """Tokenize posted text with the requested model's tokenizer.

    Expects a JSON body ``{"model_id": str, "text": str}`` and returns
    ``{"tokens": [...], "token_count": int, "model_id": str}`` where each
    token entry has ``id``, ``text`` and ``color`` fields.

    Returns 400 for missing/invalid input, 500 for tokenization failures.
    """
    # request.json raises/returns None on a missing body or wrong
    # Content-Type; get_json(silent=True) lets us answer 400 instead of 500.
    data = request.get_json(silent=True) or {}
    # "or ''" guards against an explicit JSON null, which .strip() would
    # otherwise turn into a 500.
    model_id = (data.get("model_id") or "").strip()
    text = data.get("text", "")

    if not model_id:
        return jsonify({"error": "Model ID is required"}), 400
    if not text:
        return jsonify({"error": "Text is required"}), 400

    try:
        tokenizer = load_tokenizer(model_id)

        # Tokenize the text (special tokens included so counts match model input).
        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
        token_ids = encoding["input_ids"]

        # Decode each id individually so every token gets its own display
        # string; color by position so adjacent tokens alternate.
        tokens = [
            {
                "id": token_id,
                "text": tokenizer.decode([token_id]),
                "color": get_color_for_token(i),
            }
            for i, token_id in enumerate(token_ids)
        ]

        return jsonify({
            "tokens": tokens,
            "token_count": len(tokens),
            "model_id": model_id,
        })
    except ValueError as e:
        # load_tokenizer wraps load failures in ValueError -> client error.
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        return jsonify({"error": f"Tokenization failed: {str(e)}"}), 500


@app.route("/models/suggestions")
def model_suggestions():
    """Return a list of popular model suggestions."""
    suggestions = [
        "qwen/qwen3-4B",
        "google/gemma-3-1b-it",
        "openai/gpt-oss-20b",
        "meta-llama/llama-3.2-3b",
    ]
    return jsonify(suggestions)


if __name__ == "__main__":
    # Use port 7860 for HF Spaces compatibility; PORT env var overrides.
    port = int(os.environ.get("PORT", 7860))
    app.run(debug=False, host="0.0.0.0", port=port)