# Source: Hugging Face Space "afeng/tokenizers" (status: Running; file size: 18,889 bytes)

import gradio as gr
from transformers import AutoTokenizer
import json
import traceback
from typing import Optional, Dict, List, Tuple

# Popular tokenizer models offered in the UI pickers.
# Maps Hugging Face Hub model ID -> human-readable display label.
# Every key must be a repository that actually exists on the Hub, because the
# key is passed verbatim to AutoTokenizer.from_pretrained().
TOKENIZER_OPTIONS = {
    # Qwen Series
    "Qwen/Qwen3-0.6B": "Qwen 3 (0.6B)",
    # Qwen 3 dense checkpoints are published as 1.7B and 8B; there is no
    # "Qwen3-1.8B" or "Qwen3-7B" repository, so those IDs could never load.
    "Qwen/Qwen3-1.7B": "Qwen 3 (1.7B)",
    "Qwen/Qwen3-4B": "Qwen 3 (4B)",
    "Qwen/Qwen3-8B": "Qwen 3 (8B)",
    "Qwen/Qwen2.5-7B": "Qwen 2.5 (7B)",
    "Qwen/Qwen2.5-72B": "Qwen 2.5 (72B)",
    "Qwen/Qwen2-7B": "Qwen 2 (7B)",
    "Qwen/Qwen2-72B": "Qwen 2 (72B)",
    "Qwen/Qwen-7B": "Qwen 1 (7B)",

    # Llama Series
    "meta-llama/Llama-3.2-1B": "Llama 3.2 (1B)",
    "meta-llama/Llama-3.2-3B": "Llama 3.2 (3B)",
    "meta-llama/Llama-3.1-8B": "Llama 3.1 (8B)",
    "meta-llama/Llama-3.1-70B": "Llama 3.1 (70B)",
    "meta-llama/Llama-2-7b-hf": "Llama 2 (7B)",
    "meta-llama/Llama-2-13b-hf": "Llama 2 (13B)",
    "meta-llama/Llama-2-70b-hf": "Llama 2 (70B)",

    # Other Popular Models
    "openai-community/gpt2": "GPT-2",
    "google/gemma-2b": "Gemma (2B)",
    "google/gemma-7b": "Gemma (7B)",
    "mistralai/Mistral-7B-v0.1": "Mistral (7B)",
    "mistralai/Mixtral-8x7B-v0.1": "Mixtral (8x7B)",
    "deepseek-ai/deepseek-coder-6.7b-base": "DeepSeek Coder (6.7B)",
    "microsoft/phi-2": "Phi-2",
    "microsoft/phi-3-mini-4k-instruct": "Phi-3 Mini",
    "01-ai/Yi-6B": "Yi (6B)",
    "01-ai/Yi-34B": "Yi (34B)",
    "google-t5/t5-base": "T5 Base",
    "google-bert/bert-base-uncased": "BERT Base (uncased)",
    "google-bert/bert-base-cased": "BERT Base (cased)",
    "EleutherAI/gpt-neox-20b": "GPT-NeoX (20B)",
    "bigscience/bloom-560m": "BLOOM (560M)",
    "facebook/opt-350m": "OPT (350M)",
    "stabilityai/stablelm-base-alpha-7b": "StableLM (7B)",
}

# Cache of loaded tokenizers keyed by model ID, so each tokenizer is
# constructed at most once per process.
tokenizer_cache = {}

def load_tokenizer(model_id: str):
    """Return the tokenizer for ``model_id``, loading and caching it on first use.

    The fast tokenizer is tried first; if that fails for any reason, the slow
    implementation is tried as a fallback.

    Args:
        model_id: Hugging Face Hub model ID (or local path) to load.

    Returns:
        The tokenizer object returned by ``AutoTokenizer.from_pretrained``.

    Raises:
        Exception: The error from the *fast* load attempt, when both the fast
            and the slow attempts fail. Nothing is cached in that case.
    """
    if model_id in tokenizer_cache:
        return tokenizer_cache[model_id]

    # SECURITY NOTE(review): trust_remote_code=True allows the model repository
    # to run its own Python code in this process. Callers in this file pass
    # user-entered model IDs here, so this is worth revisiting; it is kept
    # because some listed tokenizers presumably depend on it.
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
            use_fast=True  # Use fast tokenizer when available
        )
    except Exception as fast_error:
        # Fallback to slow tokenizer if fast is not available
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_id,
                trust_remote_code=True,
                use_fast=False
            )
        except Exception:
            # Only ordinary errors are translated; KeyboardInterrupt/SystemExit
            # must propagate untouched. Report the first failure, as before.
            raise fast_error

    tokenizer_cache[model_id] = tokenizer
    return tokenizer

def tokenize_text(
    text: str,
    model_id: str,
    add_special_tokens: bool = True,
    show_special_tokens: bool = True,
    custom_model_id: Optional[str] = None
) -> Tuple[str, str, str, str, str]:
    """
    Tokenize text using the selected tokenizer.

    Args:
        text: The text to tokenize. ``None`` is treated as an empty string.
        model_id: Model ID chosen from the dropdown.
        add_special_tokens: Forwarded to ``tokenizer.encode``.
        show_special_tokens: When False, special tokens are skipped in the
            decoded verification text.
        custom_model_id: Optional model ID that overrides ``model_id`` when it
            is non-blank.

    Returns:
        Tuple of (tokens_json, token_ids, decoded_text, token_info_json, stats).
        Every code path returns exactly five strings. On failure the first
        element carries the error message and traceback and the rest are empty.
    """
    try:
        # Use custom model ID if provided
        custom = (custom_model_id or "").strip()
        actual_model_id = custom or model_id

        if not actual_model_id:
            # Five values, matching the success path and the error path.
            return "", "", "", "", "Please select or enter a tokenizer model."

        text = text or ""

        # Load tokenizer
        tokenizer = load_tokenizer(actual_model_id)

        # Tokenize
        encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
        tokens = tokenizer.convert_ids_to_tokens(encoded)

        # Decode
        decoded = tokenizer.decode(encoded, skip_special_tokens=not show_special_tokens)

        # Resolve the special-ID set once instead of once per token.
        special_ids = set(getattr(tokenizer, 'all_special_ids', None) or [])

        # Create detailed token information
        token_info = []
        for i, (token, token_id) in enumerate(zip(tokens, encoded)):
            # Try to get the actual string representation of the token;
            # fall back to the raw token when the tokenizer cannot convert it.
            try:
                token_str = tokenizer.convert_tokens_to_string([token])
            except Exception:
                token_str = token

            token_info.append({
                "index": i,
                "token": token,
                "token_id": token_id,
                "text": token_str,
                "is_special": token_id in special_ids
            })

        # Format outputs
        tokens_display = json.dumps(tokens, ensure_ascii=False, indent=2)
        token_ids_display = str(encoded)
        token_info_json = json.dumps(token_info, ensure_ascii=False, indent=2)

        # Ratios are undefined for empty text / zero tokens; show N/A rather
        # than failing the whole request with ZeroDivisionError.
        n_tokens = len(tokens)
        n_chars = len(text)
        tokens_per_char = f"{n_tokens / n_chars:.2f}" if n_chars else "N/A"
        chars_per_token = f"{n_chars / n_tokens:.2f}" if n_tokens else "N/A"

        vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else 'N/A'
        if hasattr(tokenizer, 'all_special_tokens'):
            special_tokens = ', '.join(tokenizer.all_special_tokens)
        else:
            special_tokens = 'N/A'

        # Statistics
        stats = "\n".join([
            "Statistics:",
            f"• Model: {actual_model_id}",
            f"• Number of tokens: {n_tokens}",
            f"• Number of characters: {n_chars}",
            f"• Tokens per character: {tokens_per_char}",
            f"• Characters per token: {chars_per_token}",
            f"• Vocabulary size: {vocab_size}",
            f"• Special tokens: {special_tokens}",
        ])

        return tokens_display, token_ids_display, decoded, token_info_json, stats

    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return error_msg, "", "", "", ""

def decode_tokens(
    token_ids_str: str,
    model_id: str,
    skip_special_tokens: bool = False,
    custom_model_id: Optional[str] = None
) -> Tuple[str, str, str]:
    """Decode token IDs back to text.

    Args:
        token_ids_str: Token IDs written as a bracketed list (``[1, 2, 3]``)
            or as bare comma- and/or whitespace-separated integers.
        model_id: Model ID chosen from the dropdown.
        skip_special_tokens: Forwarded to ``tokenizer.decode``.
        custom_model_id: Optional model ID that overrides ``model_id`` when it
            is non-blank.

    Returns:
        Tuple of (decoded_text, tokens_json, stats). Blank input yields three
        empty strings. On failure (including IDs that are not integers) the
        first element carries the error message and traceback.
    """
    try:
        # Use custom model ID if provided
        custom = (custom_model_id or "").strip()
        actual_model_id = custom or model_id

        if not actual_model_id:
            return "Please select or enter a tokenizer model.", "", ""

        # Parse token IDs
        token_ids_str = (token_ids_str or "").strip()
        if not token_ids_str:
            return "", "", ""

        # Accept one optional pair of surrounding brackets, then treat commas
        # and whitespace alike. int() rejects floats, strings and nested
        # lists up front, so only integers ever reach the tokenizer.
        if token_ids_str.startswith('[') and token_ids_str.endswith(']'):
            token_ids_str = token_ids_str[1:-1]
        token_ids = [int(piece) for piece in token_ids_str.replace(',', ' ').split()]

        # Load tokenizer and decode
        tokenizer = load_tokenizer(actual_model_id)
        decoded = tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        # Also show tokens
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        tokens_json = json.dumps(tokens, ensure_ascii=False, indent=2)

        # An empty ID list has no meaningful ratio; avoid ZeroDivisionError.
        n_tokens = len(tokens)
        chars_per_token = f"{len(decoded) / n_tokens:.2f}" if n_tokens else "N/A"

        # Statistics
        stats = "\n".join([
            "Statistics:",
            f"• Model: {actual_model_id}",
            f"• Token count: {n_tokens}",
            f"• Character count: {len(decoded)}",
            f"• Characters per token: {chars_per_token}",
            f"• Special tokens skipped: {'Yes' if skip_special_tokens else 'No'}",
        ])

        return decoded, tokens_json, stats

    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return error_msg, "", ""

def compare_tokenizers(
    text: str,
    model_ids: List[str],
    add_special_tokens: bool = True
) -> str:
    """Compare tokenization across multiple models.

    Args:
        text: The text to tokenize with every selected model. ``None`` is
            treated as an empty string.
        model_ids: Model IDs to compare.
        add_special_tokens: Forwarded to each tokenizer's ``encode``.

    Returns:
        A Markdown report with one section per model, ordered by ascending
        token count; models that failed to load or tokenize are listed last
        with their error message. A plain prompt string is returned when no
        model is selected.
    """
    if not model_ids:
        return "Please select at least one model to compare."

    text = text or ""
    preview_limit = 50  # Maximum number of tokens shown per model

    results = []

    for model_id in model_ids:
        try:
            tokenizer = load_tokenizer(model_id)
            encoded = tokenizer.encode(text, add_special_tokens=add_special_tokens)
            tokens = tokenizer.convert_ids_to_tokens(encoded)

            results.append({
                "model": model_id,
                "token_count": len(tokens),
                "tokens": tokens[:preview_limit],
            })
        except Exception as e:
            # A failure for one model must not abort the whole comparison.
            results.append({
                "model": model_id,
                "error": str(e)
            })

    # Sort by token count; entries without a count (errors) sink to the end.
    results.sort(key=lambda entry: entry.get("token_count", float('inf')))

    n_chars = len(text)

    # Format output
    parts = [
        "# Tokenizer Comparison\n\n",
        f"Input text length: {n_chars} characters\n\n",
    ]

    for result in results:
        parts.append(f"## {result['model']}\n")
        if "error" in result:
            parts.append(f"Error: {result['error']}\n\n")
            continue

        token_count = result['token_count']
        # The ratio is undefined for empty text; avoid ZeroDivisionError.
        ratio = f"{token_count / n_chars:.2f}" if n_chars else "N/A"
        parts.append(f"**Token count:** {token_count} (ratio: {ratio} tokens/char)\n\n")

        # Fenced so tokens such as <s> or <bos> are shown literally instead of
        # being interpreted as HTML/Markdown by the renderer.
        parts.append(f"**First tokens:**\n\n```text\n{result['tokens']}\n```\n\n")

        # Only mention truncation when tokens were actually cut off.
        if token_count > preview_limit:
            parts.append(f"*(showing first {preview_limit} tokens)*\n\n")

    return "".join(parts)

def analyze_vocabulary(model_id: str, custom_model_id: Optional[str] = None) -> str:
    """Analyze tokenizer vocabulary.

    Args:
        model_id: Model ID chosen from the dropdown.
        custom_model_id: Optional model ID that overrides ``model_id`` when it
            is non-blank.

    Returns:
        A Markdown report with the vocabulary size, tokenizer class, special
        tokens, common token settings and the 100 lowest-ID vocabulary
        entries. Token text is placed inside fenced code blocks so that
        tokens such as ``<s>`` are displayed literally. On failure the error
        message and traceback are returned instead.
    """
    try:
        custom = (custom_model_id or "").strip()
        actual_model_id = custom or model_id

        if not actual_model_id:
            return "Please select or enter a tokenizer model."

        tokenizer = load_tokenizer(actual_model_id)

        # Get vocabulary information
        vocab = tokenizer.get_vocab()
        vocab_size = tokenizer.vocab_size if hasattr(tokenizer, 'vocab_size') else len(vocab)

        # Get special tokens
        special_tokens = getattr(tokenizer, 'special_tokens_map', {})

        # Get some example tokens
        first_tokens = sorted(vocab.items(), key=lambda item: item[1])[:100]  # First 100 tokens

        def setting(attribute: str) -> str:
            """Return the tokenizer attribute for display, or 'None' if unset/missing."""
            value = getattr(tokenizer, attribute, None)
            return value if value else 'None'

        settings_lines = [
            f"• Padding Token: {setting('pad_token')}",
            f"• BOS Token: {setting('bos_token')}",
            f"• EOS Token: {setting('eos_token')}",
            f"• UNK Token: {setting('unk_token')}",
            f"• SEP Token: {setting('sep_token')}",
            f"• CLS Token: {setting('cls_token')}",
            f"• Mask Token: {setting('mask_token')}",
        ]

        token_lines = []
        for token, token_id in first_tokens:
            # Escape special characters for display. Vocabulary keys are
            # presumably str, but that is not guaranteed for every tokenizer
            # (e.g. bytes keys), so anything else is shown via repr().
            printable = isinstance(token, str) and token.isprintable()
            display_token = token if printable else repr(token)
            token_lines.append(f"{display_token} → {token_id}")

        # Each listing line carries a prefix or suffix around the token text,
        # so no line can consist solely of a closing fence.
        lines = [
            "# Tokenizer Vocabulary Analysis",
            "",
            f"- **Model:** {actual_model_id}",
            f"- **Vocabulary Size:** {vocab_size:,}",
            f"- **Tokenizer Type:** {tokenizer.__class__.__name__}",
            "",
            "## Special Tokens",
            "",
            "```json",
            json.dumps(special_tokens, ensure_ascii=False, indent=2),
            "```",
            "",
            "## Token Settings",
            "",
            "```text",
            *settings_lines,
            "```",
            "",
            "## First 100 Tokens in Vocabulary",
            "",
            "Token → ID",
            "",
            "```text",
            *token_lines,
            "```",
        ]

        return "\n".join(lines) + "\n"

    except Exception as e:
        return f"Error: {str(e)}\n{traceback.format_exc()}"

# Create Gradio interface
# The same model list and default selection are shared by every picker below.
_MODEL_CHOICES = list(TOKENIZER_OPTIONS.keys())
_DEFAULT_MODEL = "Qwen/Qwen3-0.6B"

with gr.Blocks(title="🤗 Tokenizer Playground", theme=gr.themes.Soft()) as app:
    gr.Markdown("""
    # 🤗 Tokenizer Playground

    A comprehensive tool for NLP researchers to experiment with various Hugging Face tokenizers.
    Supports popular models including **Qwen**, **Llama**, **Mistral**, **GPT**, and many more.

    ### Features:
    - 🔤 **Tokenize & Detokenize** text with any Hugging Face tokenizer
    - 📊 **Compare** tokenization across multiple models
    - 📖 **Analyze** vocabulary and special tokens
    - 🎯 **Support** for custom model IDs from Hugging Face Hub
    """)

    # --- Tokenize tab: text -> tokens, IDs, decoded text, per-token info, stats ---
    with gr.Tab("🔤 Tokenize"):
        with gr.Row():
            with gr.Column(scale=3):
                tokenize_input = gr.Textbox(
                    label="Input Text",
                    placeholder="Enter text to tokenize...",
                    lines=5,
                    max_lines=15,
                    autoscroll=False
                )
            with gr.Column(scale=1):
                tokenize_model = gr.Dropdown(
                    label="Select Tokenizer",
                    choices=_MODEL_CHOICES,
                    value=_DEFAULT_MODEL,
                    allow_custom_value=False
                )
                tokenize_custom_model = gr.Textbox(
                    label="Or Enter Custom Model ID",
                    placeholder="e.g., facebook/bart-base",
                    info="Override selection above with any HF model"
                )
                add_special = gr.Checkbox(label="Add Special Tokens", value=True)
                show_special = gr.Checkbox(label="Show Special Tokens in Decoded", value=True)
                tokenize_btn = gr.Button("Tokenize", variant="primary")

        with gr.Row():
            with gr.Column():
                tokens_output = gr.Textbox(label="Tokens", lines=10, max_lines=20, autoscroll=False, show_copy_button=True)
            with gr.Column():
                token_ids_output = gr.Textbox(label="Token IDs", lines=10, max_lines=20, autoscroll=False, show_copy_button=True)

        with gr.Row():
            with gr.Column():
                decoded_output = gr.Textbox(label="Decoded Text (Verification)", lines=5, max_lines=15, autoscroll=False, show_copy_button=True)
            with gr.Column():
                token_info_output = gr.Textbox(label="Detailed Token Information", lines=10, max_lines=20, autoscroll=False, show_copy_button=True)

        stats_output = gr.Textbox(label="Statistics", lines=7, max_lines=15, autoscroll=False)

        # The order of `outputs` must match the five values tokenize_text returns.
        tokenize_btn.click(
            fn=tokenize_text,
            inputs=[tokenize_input, tokenize_model, add_special, show_special, tokenize_custom_model],
            outputs=[tokens_output, token_ids_output, decoded_output, token_info_output, stats_output]
        )

    # --- Detokenize tab: token IDs -> text ---
    with gr.Tab("🔄 Detokenize"):
        with gr.Row():
            with gr.Column(scale=3):
                decode_input = gr.Textbox(
                    label="Token IDs",
                    placeholder="Enter token IDs as a list [101, 2023, ...] or space/comma separated",
                    lines=5,
                    max_lines=15,
                    autoscroll=False
                )
            with gr.Column(scale=1):
                decode_model = gr.Dropdown(
                    label="Select Tokenizer",
                    choices=_MODEL_CHOICES,
                    value=_DEFAULT_MODEL
                )
                decode_custom_model = gr.Textbox(
                    label="Or Enter Custom Model ID",
                    placeholder="e.g., facebook/bart-base"
                )
                skip_special = gr.Checkbox(label="Skip Special Tokens", value=False)
                decode_btn = gr.Button("Decode", variant="primary")

        decode_output = gr.Textbox(
            label="Decoded Text",
            lines=10,
            max_lines=20,
            interactive=False,
            show_copy_button=True,
            placeholder="Decoded text will appear here...",
            autoscroll=False
        )

        decode_stats = gr.Textbox(
            label="Statistics",
            lines=5,
            interactive=False
        )

        with gr.Accordion("Show Tokens", open=False):
            decode_tokens_output = gr.Textbox(
                label="Tokens",
                lines=10,
                max_lines=20,
                interactive=False,
                show_copy_button=True,
                autoscroll=False
            )

        # decode_tokens returns (decoded_text, tokens_json, stats), in that order.
        decode_btn.click(
            fn=decode_tokens,
            inputs=[decode_input, decode_model, skip_special, decode_custom_model],
            outputs=[decode_output, decode_tokens_output, decode_stats]
        )

    # --- Compare tab: one text, several tokenizers ---
    with gr.Tab("📊 Compare"):
        compare_input = gr.Textbox(
            label="Input Text",
            placeholder="Enter text to compare tokenization across models...",
            lines=5,
            max_lines=15,
            autoscroll=False
        )

        compare_models = gr.CheckboxGroup(
            label="Select Models to Compare",
            choices=_MODEL_CHOICES,
            value=[_DEFAULT_MODEL, "meta-llama/Llama-3.1-8B", "openai-community/gpt2"]
        )

        compare_add_special = gr.Checkbox(label="Add Special Tokens", value=True)
        compare_btn = gr.Button("Compare Tokenizers", variant="primary")

        compare_output = gr.Markdown()

        compare_btn.click(
            fn=compare_tokenizers,
            inputs=[compare_input, compare_models, compare_add_special],
            outputs=compare_output
        )

    # --- Vocabulary tab: inspect a tokenizer's vocabulary and special tokens ---
    with gr.Tab("📖 Vocabulary"):
        with gr.Row():
            vocab_model = gr.Dropdown(
                label="Select Tokenizer",
                choices=_MODEL_CHOICES,
                value=_DEFAULT_MODEL
            )
            vocab_custom_model = gr.Textbox(
                label="Or Enter Custom Model ID",
                placeholder="e.g., facebook/bart-base"
            )
            vocab_btn = gr.Button("Analyze Vocabulary", variant="primary")

        vocab_output = gr.Markdown()

        vocab_btn.click(
            fn=analyze_vocabulary,
            inputs=[vocab_model, vocab_custom_model],
            outputs=vocab_output
        )

    # --- About tab: static help text ---
    # Literal token examples such as <s> are wrapped in backticks; written
    # bare, Markdown would treat them as HTML tags instead of showing them.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ## About This Tool

        This tokenizer playground provides researchers and developers with an easy way to experiment
        with various tokenizers from the Hugging Face Model Hub.

        ### Supported Models

        **Qwen Series:** Qwen 3, Qwen 2.5, Qwen 2, Qwen 1 (various sizes)

        **Llama Series:** Llama 3.2, Llama 3.1, Llama 2 (various sizes)

        **Other Popular Models:** GPT-2, Gemma, Mistral, Mixtral, DeepSeek, Phi, Yi, T5, BERT, GPT-NeoX, BLOOM, OPT, StableLM

        ### Custom Models

        You can use any tokenizer from the Hugging Face Hub by entering its model ID in the "Custom Model ID" field.
        For example:
        - `facebook/bart-base`
        - `EleutherAI/gpt-j-6b`
        - `bigscience/bloom`

        ### Features Explanation

        - **Tokenize:** Convert text into tokens and token IDs
        - **Detokenize:** Convert token IDs back to text
        - **Compare:** See how different tokenizers handle the same text
        - **Vocabulary:** Explore tokenizer vocabulary and special tokens

        ### Tips

        1. Different tokenizers can produce very different token counts for the same text
        2. Special tokens (like `[CLS]`, `[SEP]`, `<s>`, `</s>`) are model-specific
        3. Subword tokenization (used by most modern models) allows handling of out-of-vocabulary words
        4. Token efficiency affects model performance and API costs

        ### Resources

        - [Hugging Face Tokenizers Documentation](https://huggingface.co/docs/transformers/main_classes/tokenizer)
        - [Understanding Tokenization](https://huggingface.co/docs/transformers/tokenizer_summary)
        - [Model Hub](https://huggingface.co/models)
        """)

# Launch the app
if __name__ == "__main__":
    app.launch()