Spaces:

khoj-ai
/

token-visualizer

Sleeping

App Files Files Community

Debanjum committed on Nov 25, 2025

Commit

ef61a64

0 Parent(s):

Create app to visualize text tokenization by any LLM on HF

Browse files

Files changed (5) hide show

Dockerfile +16 -0
README.md +36 -0
app.py +105 -0
requirements.txt +4 -0
templates/index.html +550 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+FROM python:3.10-slim
+# Hugging Face Docker Spaces run the container as UID 1000. Create a matching
+# user with a writable home directory up front; otherwise files written at
+# runtime (by default the tokenizer download cache lives under $HOME) can
+# fail with permission errors.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR $HOME/app
+# Install dependencies
+COPY --chown=user requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY --chown=user . .
+# Expose port 7860 (HF Spaces default)
+EXPOSE 7860
+# Run the application
+CMD ["python", "app.py"]

README.md ADDED Viewed

	@@ -0,0 +1,36 @@

+---
+title: HF Tokenizer Visualizer
+emoji: 🔤
+colorFrom: purple
+colorTo: blue
+sdk: docker
+pinned: false
+license: mit
+---
+# HF Tokenizer Visualizer
+A web app to visualize tokenization from any Hugging Face model. Inspired by OpenAI's tokenizer tool.
+## Features
+- Load any tokenizer from Hugging Face Hub by model ID (e.g., `openai-community/gpt2`)
+- Color-coded token visualization
+- Hover to see token IDs
+- Character and token count statistics
+- Popular model suggestions
+## Usage
+1. Enter a Hugging Face model ID (e.g., `meta-llama/Llama-2-7b-hf`)
+2. Type or paste text you want to tokenize
+3. Click "Tokenize" (or press Ctrl+Enter / Cmd+Enter) to see the colorized output
+## Local Development
+```bash
+pip install -r requirements.txt
+python app.py
+```
+Then open http://localhost:7860 in your browser.

app.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""
+Tokenizer Web Application
+A simple webapp to visualize tokenization from any Hugging Face model.
+"""
+import os
+from flask import Flask, render_template, request, jsonify
+from transformers import AutoTokenizer
+import hashlib
+# Flask application instance that the route decorators below attach to
+app = Flask(__name__)
+# Cache for loaded tokenizers: maps model ID -> tokenizer object,
+# filled lazily by load_tokenizer()
+tokenizer_cache = {}
+def get_color_for_token(token_id: int, total_colors: int = 10) -> str:
+    """Generate a consistent color for a token based on its ID."""
+    colors = [
+        "#FFEAA7",  # Yellow
+        "#DFE6E9",  # Light gray
+        "#A8E6CF",  # Mint green
+        "#FDCB82",  # Peach
+        "#C3AED6",  # Lavender
+        "#FFB3BA",  # Light pink
+        "#BAFFC9",  # Light green
+        "#BAE1FF",  # Light blue
+        "#FFE4E1",  # Misty rose
+        "#E0BBE4",  # Plum
+    ]
+    return colors[token_id % len(colors)]
+def load_tokenizer(model_id: str):
+    """Load and cache a tokenizer from Hugging Face."""
+    if model_id not in tokenizer_cache:
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+            tokenizer_cache[model_id] = tokenizer
+        except Exception as e:
+            raise ValueError(f"Failed to load tokenizer for '{model_id}': {str(e)}")
+    return tokenizer_cache[model_id]
+@app.route("/")
+def index():
+    return render_template("index.html")
+@app.route("/tokenize", methods=["POST"])
+def tokenize():
+    data = request.json
+    model_id = data.get("model_id", "").strip()
+    text = data.get("text", "")
+    if not model_id:
+        return jsonify({"error": "Model ID is required"}), 400
+    if not text:
+        return jsonify({"error": "Text is required"}), 400
+    try:
+        tokenizer = load_tokenizer(model_id)
+        # Tokenize the text
+        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
+        token_ids = encoding["input_ids"]
+        # Get token strings
+        tokens = []
+        for i, token_id in enumerate(token_ids):
+            token_str = tokenizer.decode([token_id])
+            tokens.append({
+                "id": token_id,
+                "text": token_str,
+                "color": get_color_for_token(i),
+            })
+        return jsonify({
+            "tokens": tokens,
+            "token_count": len(tokens),
+            "model_id": model_id,
+        })
+    except ValueError as e:
+        return jsonify({"error": str(e)}), 400
+    except Exception as e:
+        return jsonify({"error": f"Tokenization failed: {str(e)}"}), 500
+@app.route("/models/suggestions")
+def model_suggestions():
+    """Return a list of popular model suggestions."""
+    suggestions = [
+        "qwen/qwen3-4B",
+        "google/gemma-3-1b-it",
+        "openai/gpt-oss-20b",
+        "meta-llama/llama-3.2-3b",
+    ]
+    return jsonify(suggestions)
+if __name__ == "__main__":
+    # Use port 7860 for HF Spaces compatibility
+    port = int(os.environ.get("PORT", 7860))
+    app.run(debug=False, host="0.0.0.0", port=port)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+flask>=2.0.0
+transformers>=4.30.0
+huggingface_hub
+gunicorn

templates/index.html ADDED Viewed

	@@ -0,0 +1,550 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>HF Tokenizer Visualizer</title>
+    <style>
+        * {
+            box-sizing: border-box;
+            margin: 0;
+            padding: 0;
+        }
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            min-height: 100vh;
+            padding: 40px 20px;
+        }
+        .container {
+            max-width: 900px;
+            margin: 0 auto;
+        }
+        h1 {
+            color: white;
+            text-align: center;
+            margin-bottom: 10px;
+            font-size: 2.5rem;
+        }
+        .subtitle {
+            color: rgba(255, 255, 255, 0.8);
+            text-align: center;
+            margin-bottom: 30px;
+            font-size: 1.1rem;
+        }
+        .card {
+            background: white;
+            border-radius: 16px;
+            padding: 30px;
+            box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
+        }
+        .input-group {
+            margin-bottom: 20px;
+        }
+        label {
+            display: block;
+            font-weight: 600;
+            margin-bottom: 8px;
+            color: #333;
+        }
+        .model-input-wrapper {
+            position: relative;
+        }
+        input[type="text"] {
+            width: 100%;
+            padding: 14px 16px;
+            border: 2px solid #e0e0e0;
+            border-radius: 10px;
+            font-size: 16px;
+            transition: border-color 0.2s, box-shadow 0.2s;
+        }
+        input[type="text"]:focus {
+            outline: none;
+            border-color: #667eea;
+            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2);
+        }
+        .suggestions {
+            position: absolute;
+            top: 100%;
+            left: 0;
+            right: 0;
+            background: white;
+            border: 1px solid #e0e0e0;
+            border-radius: 10px;
+            margin-top: 4px;
+            box-shadow: 0 10px 40px rgba(0, 0, 0, 0.1);
+            z-index: 100;
+            display: none;
+            max-height: 250px;
+            overflow-y: auto;
+        }
+        .suggestions.show {
+            display: block;
+        }
+        .suggestion-item {
+            padding: 12px 16px;
+            cursor: pointer;
+            transition: background 0.15s;
+            border-bottom: 1px solid #f0f0f0;
+        }
+        .suggestion-item:last-child {
+            border-bottom: none;
+        }
+        .suggestion-item:hover {
+            background: #f5f5f5;
+        }
+        .suggestion-item code {
+            background: #e8e8e8;
+            padding: 2px 6px;
+            border-radius: 4px;
+            font-size: 14px;
+        }
+        textarea {
+            width: 100%;
+            padding: 14px 16px;
+            border: 2px solid #e0e0e0;
+            border-radius: 10px;
+            font-size: 16px;
+            min-height: 150px;
+            resize: vertical;
+            font-family: inherit;
+            transition: border-color 0.2s, box-shadow 0.2s;
+        }
+        textarea:focus {
+            outline: none;
+            border-color: #667eea;
+            box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.2);
+        }
+        .btn {
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            border: none;
+            padding: 14px 32px;
+            font-size: 16px;
+            font-weight: 600;
+            border-radius: 10px;
+            cursor: pointer;
+            transition: transform 0.2s, box-shadow 0.2s;
+            width: 100%;
+        }
+        .btn:hover {
+            transform: translateY(-2px);
+            box-shadow: 0 10px 30px rgba(102, 126, 234, 0.4);
+        }
+        .btn:active {
+            transform: translateY(0);
+        }
+        .btn:disabled {
+            opacity: 0.6;
+            cursor: not-allowed;
+            transform: none;
+        }
+        .results {
+            margin-top: 30px;
+            padding-top: 30px;
+            border-top: 2px solid #f0f0f0;
+        }
+        .stats {
+            display: flex;
+            gap: 20px;
+            margin-bottom: 20px;
+            flex-wrap: wrap;
+        }
+        .stat {
+            background: #f8f9fa;
+            padding: 12px 20px;
+            border-radius: 10px;
+            font-size: 14px;
+        }
+        .stat-value {
+            font-weight: 700;
+            color: #667eea;
+            font-size: 20px;
+        }
+        .stat-label {
+            color: #666;
+            margin-top: 2px;
+        }
+        .tokens-display {
+            background: #f8f9fa;
+            border-radius: 12px;
+            padding: 20px;
+            line-height: 2.2;
+            min-height: 100px;
+            word-wrap: break-word;
+            overflow-wrap: break-word;
+            white-space: pre-wrap;
+        }
+        .token {
+            display: inline;
+            padding: 4px 2px;
+            margin: 2px 0;
+            border-radius: 4px;
+            font-family: 'SF Mono', Monaco, 'Cascadia Code', monospace;
+            font-size: 15px;
+            cursor: pointer;
+            transition: transform 0.1s, box-shadow 0.1s;
+            position: relative;
+        }
+        .token:hover {
+            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.15);
+            z-index: 10;
+        }
+        .token-tooltip {
+            position: absolute;
+            bottom: 100%;
+            left: 50%;
+            transform: translateX(-50%);
+            background: #333;
+            color: white;
+            padding: 6px 10px;
+            border-radius: 6px;
+            font-size: 12px;
+            white-space: nowrap;
+            opacity: 0;
+            pointer-events: none;
+            transition: opacity 0.2s;
+            z-index: 100;
+        }
+        .token:hover .token-tooltip {
+            opacity: 1;
+        }
+        .error {
+            background: #fff5f5;
+            border: 1px solid #fed7d7;
+            color: #c53030;
+            padding: 14px 18px;
+            border-radius: 10px;
+            margin-top: 20px;
+        }
+        .loading {
+            display: none;
+            text-align: center;
+            padding: 40px;
+        }
+        .loading.show {
+            display: block;
+        }
+        .spinner {
+            width: 40px;
+            height: 40px;
+            border: 4px solid #f0f0f0;
+            border-top-color: #667eea;
+            border-radius: 50%;
+            animation: spin 0.8s linear infinite;
+            margin: 0 auto 10px;
+        }
+        @keyframes spin {
+            to { transform: rotate(360deg); }
+        }
+        .hidden {
+            display: none;
+        }
+        .token-ids-toggle {
+            margin-top: 15px;
+        }
+        .token-ids-toggle label {
+            display: inline-flex;
+            align-items: center;
+            gap: 8px;
+            cursor: pointer;
+            font-weight: normal;
+        }
+        .token-ids {
+            background: #2d3748;
+            color: #e2e8f0;
+            padding: 16px;
+            border-radius: 10px;
+            margin-top: 15px;
+            font-family: 'SF Mono', Monaco, monospace;
+            font-size: 13px;
+            overflow-x: auto;
+            white-space: pre-wrap;
+            word-break: break-all;
+        }
+        footer {
+            text-align: center;
+            margin-top: 30px;
+            color: rgba(255, 255, 255, 0.7);
+            font-size: 14px;
+        }
+        footer a {
+            color: white;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>🔤 HF Tokenizer</h1>
+        <p class="subtitle">Visualize tokenization from any Hugging Face model</p>
+        <div class="card">
+            <div class="input-group">
+                <label for="model-id">Model ID (org/model-name)</label>
+                <div class="model-input-wrapper">
+                    <input
+                        type="text"
+                        id="model-id"
+                        placeholder="e.g., openai-community/gpt2"
+                        autocomplete="off"
+                    >
+                    <div class="suggestions" id="suggestions"></div>
+                </div>
+            </div>
+            <div class="input-group">
+                <label for="text-input">Text to tokenize</label>
+                <textarea
+                    id="text-input"
+                    placeholder="Enter text to tokenize..."
+                >Hello, world! This is a test of the tokenizer visualization tool.</textarea>
+            </div>
+            <button class="btn" id="tokenize-btn">Tokenize</button>
+            <div class="loading" id="loading">
+                <div class="spinner"></div>
+                <p>Loading tokenizer and processing...</p>
+            </div>
+            <div class="error hidden" id="error"></div>
+            <div class="results hidden" id="results">
+                <div class="stats">
+                    <div class="stat">
+                        <div class="stat-value" id="token-count">0</div>
+                        <div class="stat-label">Tokens</div>
+                    </div>
+                    <div class="stat">
+                        <div class="stat-value" id="char-count">0</div>
+                        <div class="stat-label">Characters</div>
+                    </div>
+                </div>
+                <label>Tokenized Output <span style="font-weight: normal; color: #888;">(hover for token IDs)</span></label>
+                <div class="tokens-display" id="tokens-display"></div>
+                <div class="token-ids-toggle">
+                    <label>
+                        <input type="checkbox" id="show-ids"> Show token IDs
+                    </label>
+                </div>
+                <div class="token-ids hidden" id="token-ids"></div>
+            </div>
+        </div>
+        <footer>
+            Powered by 🤗 Hugging Face Transformers
+        </footer>
+    </div>
+    <script>
+        // Cached references to the DOM elements this script reads or updates
+        const modelInput = document.getElementById('model-id');
+        const textInput = document.getElementById('text-input');
+        const tokenizeBtn = document.getElementById('tokenize-btn');
+        const suggestionsDiv = document.getElementById('suggestions');
+        const loadingDiv = document.getElementById('loading');
+        const errorDiv = document.getElementById('error');
+        const resultsDiv = document.getElementById('results');
+        const tokensDisplay = document.getElementById('tokens-display');
+        const tokenCount = document.getElementById('token-count');
+        const charCount = document.getElementById('char-count');
+        const showIdsCheckbox = document.getElementById('show-ids');
+        const tokenIdsDiv = document.getElementById('token-ids');
+        // Model IDs fetched from /models/suggestions for the dropdown
+        let suggestions = [];
+        // Tokens from the most recent successful /tokenize response
+        let currentTokens = [];
+        // Load model suggestions
+        fetch('/models/suggestions')
+            .then(res => res.json())
+            .then(data => {
+                suggestions = data;
+            });
+        // Model input focus/blur handling
+        modelInput.addEventListener('focus', () => {
+            if (suggestions.length > 0) {
+                showSuggestions(suggestions);
+            }
+        });
+        modelInput.addEventListener('input', () => {
+            const query = modelInput.value.toLowerCase();
+            const filtered = suggestions.filter(s => s.toLowerCase().includes(query));
+            if (filtered.length > 0 && document.activeElement === modelInput) {
+                showSuggestions(filtered);
+            } else {
+                hideSuggestions();
+            }
+        });
+        document.addEventListener('click', (e) => {
+            if (!modelInput.contains(e.target) && !suggestionsDiv.contains(e.target)) {
+                hideSuggestions();
+            }
+        });
+        function showSuggestions(items) {
+            suggestionsDiv.innerHTML = items.map(item =>
+                `<div class="suggestion-item" data-model="${item}"><code>${item}</code></div>`
+            ).join('');
+            suggestionsDiv.classList.add('show');
+        }
+        function hideSuggestions() {
+            suggestionsDiv.classList.remove('show');
+        }
+        suggestionsDiv.addEventListener('click', (e) => {
+            const item = e.target.closest('.suggestion-item');
+            if (item) {
+                modelInput.value = item.dataset.model;
+                hideSuggestions();
+            }
+        });
+        // Tokenize button click
+        tokenizeBtn.addEventListener('click', tokenize);
+        // Keyboard shortcut
+        document.addEventListener('keydown', (e) => {
+            if ((e.metaKey || e.ctrlKey) && e.key === 'Enter') {
+                tokenize();
+            }
+        });
+        async function tokenize() {
+            const modelId = modelInput.value.trim();
+            const text = textInput.value;
+            if (!modelId) {
+                showError('Please enter a model ID');
+                return;
+            }
+            if (!text) {
+                showError('Please enter some text to tokenize');
+                return;
+            }
+            hideError();
+            resultsDiv.classList.add('hidden');
+            loadingDiv.classList.add('show');
+            tokenizeBtn.disabled = true;
+            try {
+                const response = await fetch('/tokenize', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ model_id: modelId, text })
+                });
+                const data = await response.json();
+                if (!response.ok) {
+                    throw new Error(data.error || 'Tokenization failed');
+                }
+                currentTokens = data.tokens;
+                displayResults(data, text);
+            } catch (error) {
+                showError(error.message);
+            } finally {
+                loadingDiv.classList.remove('show');
+                tokenizeBtn.disabled = false;
+            }
+        }
+        function displayResults(data, text) {
+            tokenCount.textContent = data.token_count;
+            charCount.textContent = text.length;
+            // Display colorized tokens
+            tokensDisplay.innerHTML = data.tokens.map((token, i) => {
+                // Escape HTML and handle special characters
+                let displayText = token.text
+                    .replace(/&/g, '&amp;')
+                    .replace(/</g, '&lt;')
+                    .replace(/>/g, '&gt;')
+                    .replace(/\n/g, '↵\n')
+                    .replace(/\t/g, '→')
+                    .replace(/ /g, '·');
+                // Handle empty or whitespace-only tokens
+                if (displayText.trim() === '' && displayText !== '') {
+                    displayText = displayText || '␣';
+                }
+                return `<span class="token" style="background-color: ${token.color}">` +
+                    `<span class="token-tooltip">ID: ${token.id}</span>` +
+                    `${displayText}</span>`;
+            }).join('');
+            // Update token IDs display
+            tokenIdsDiv.textContent = `[${data.tokens.map(t => t.id).join(', ')}]`;
+            resultsDiv.classList.remove('hidden');
+        }
+        function showError(message) {
+            errorDiv.textContent = message;
+            errorDiv.classList.remove('hidden');
+        }
+        function hideError() {
+            errorDiv.classList.add('hidden');
+        }
+        // Toggle token IDs
+        showIdsCheckbox.addEventListener('change', () => {
+            tokenIdsDiv.classList.toggle('hidden', !showIdsCheckbox.checked);
+        });
+    </script>
+</body>
+</html>