# NOTE: removed scrape artifacts ("Spaces: / Sleeping" HF Spaces status lines
# captured from the hosting page; not part of the program).
"""
Tokenizer Web Application

A simple webapp to visualize tokenization from any Hugging Face model.
"""
import os

from flask import Flask, render_template, request, jsonify
from transformers import AutoTokenizer

# NOTE(review): hashlib is not used anywhere in the visible code —
# confirm against the rest of the project before removing.
import hashlib
app = Flask(__name__)

# In-process cache of loaded tokenizers, keyed by model id.
# NOTE(review): unbounded — every distinct model id requested stays in memory
# for the life of the process; consider an LRU bound if this endpoint is
# exposed to arbitrary user input.
tokenizer_cache = {}
def get_color_for_token(token_id: int, total_colors: int = 10) -> str:
    """Return a hex color for a token, cycling through a fixed palette.

    Args:
        token_id: Value used to pick a palette entry. (The caller passes the
            token's *position*, so adjacent tokens get different colors.)
        total_colors: Number of palette entries to cycle through, clamped to
            the palette size (10). Previously accepted but silently ignored;
            the default preserves the original behavior exactly.

    Returns:
        A hex color string such as "#FFEAA7".
    """
    palette = [
        "#FFEAA7",  # Yellow
        "#DFE6E9",  # Light gray
        "#A8E6CF",  # Mint green
        "#FDCB82",  # Peach
        "#C3AED6",  # Lavender
        "#FFB3BA",  # Light pink
        "#BAFFC9",  # Light green
        "#BAE1FF",  # Light blue
        "#FFE4E1",  # Misty rose
        "#E0BBE4",  # Plum
    ]
    # Honor total_colors (it was ignored before); clamp so an out-of-range
    # value can neither index past the palette nor cause a modulo-by-zero.
    cycle = max(1, min(total_colors, len(palette)))
    return palette[token_id % cycle]
def load_tokenizer(model_id: str):
    """Load a Hugging Face tokenizer, caching it for subsequent requests.

    Args:
        model_id: Hub repo id (e.g. "gpt2" or "org/model").

    Returns:
        The cached or freshly loaded tokenizer instance.

    Raises:
        ValueError: If the tokenizer cannot be downloaded or instantiated;
            the original exception is chained as the cause.
    """
    if model_id not in tokenizer_cache:
        try:
            # SECURITY: trust_remote_code=True executes Python code shipped
            # inside the model repo. Acceptable only for trusted model ids —
            # do not expose this unauthenticated to arbitrary user input.
            tokenizer = AutoTokenizer.from_pretrained(
                model_id, trust_remote_code=True
            )
        except Exception as e:
            # Chain the cause so the real failure is preserved in tracebacks.
            raise ValueError(
                f"Failed to load tokenizer for '{model_id}': {str(e)}"
            ) from e
        tokenizer_cache[model_id] = tokenizer
    return tokenizer_cache[model_id]
@app.route("/")
def index():
    """Render the main page, pre-filling model/text from query parameters.

    NOTE(review): the original had no route registration at all, so Flask
    never served this view; "/" is the conventional index path — confirm
    against templates/index.html and the original deployment.
    """
    # Accept either ?model= or ?model_id= for convenience; first non-empty wins.
    model_id = request.args.get("model", "").strip() or request.args.get("model_id", "").strip()
    text = request.args.get("text", "").strip()
    return render_template("index.html", model_id=model_id, text=text)
@app.route("/tokenize", methods=["POST"])
def tokenize():
    """Tokenize POSTed text with the requested model's tokenizer.

    Expects a JSON body ``{"model_id": str, "text": str}`` and returns JSON
    with a per-token list of ``{id, text, color}`` plus the token count.

    NOTE(review): the original had no route registration; the path
    "/tokenize" (POST, JSON body) is inferred from the handler — confirm
    against the fetch URL used by templates/index.html.
    """
    # get_json(silent=True) returns None instead of erroring on a missing or
    # non-JSON body (bare request.json would fail before our validation runs).
    data = request.get_json(silent=True) or {}
    model_id = data.get("model_id", "").strip()
    text = data.get("text", "")
    if not model_id:
        return jsonify({"error": "Model ID is required"}), 400
    if not text:
        return jsonify({"error": "Text is required"}), 400
    try:
        tokenizer = load_tokenizer(model_id)
        # Tokenize the full text once; special tokens (BOS/EOS etc.) included.
        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
        token_ids = encoding["input_ids"]
        tokens = []
        for i, token_id in enumerate(token_ids):
            # Decode each id individually so every token gets its own string.
            token_str = tokenizer.decode([token_id])
            tokens.append({
                "id": token_id,
                "text": token_str,
                # Color by *position*, not id, so adjacent tokens alternate
                # through the palette in the UI.
                "color": get_color_for_token(i),
            })
        return jsonify({
            "tokens": tokens,
            "token_count": len(tokens),
            "model_id": model_id,
        })
    except ValueError as e:
        # load_tokenizer wraps load failures in ValueError -> client error.
        return jsonify({"error": str(e)}), 400
    except Exception as e:
        return jsonify({"error": f"Tokenization failed: {str(e)}"}), 500
@app.route("/api/suggestions")
def model_suggestions():
    """Return a JSON list of popular model ids for the UI to suggest.

    NOTE(review): no route was registered in the original; this path is a
    guess — confirm against the request made by templates/index.html.
    """
    suggestions = [
        "qwen/qwen3-4B",
        "google/gemma-3-1b-it",
        "openai/gpt-oss-20b",
        "meta-llama/llama-3.2-3b",
    ]
    return jsonify(suggestions)
if __name__ == "__main__":
    # Hugging Face Spaces expects the app on port 7860; allow an override
    # via the PORT environment variable.
    port = int(os.environ.get("PORT", 7860))
    # Bind to 0.0.0.0 so the container's mapped port is reachable externally.
    app.run(debug=False, host="0.0.0.0", port=port)