# Spaces: gsaltintas / tokenizer-comparison (Sleeping)
#
# File size: 34,500 Bytes

from collections import Counter
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import (
    clean_token_display,
    get_normalization_methods,
    normalize_text,
    tokenize_w_tekken,
    tokenize_with_byt5,
    tokenize_with_hf,
    tokenize_with_tiktoken,
)

# Tokenizer names grouped by the backend that `tokenize` dispatches them to.
TIKTOKENS = ["gpt-4o", "gpt-2"]
HF = [
    "llama-3",
    "gemma-2",
    "qwen3",
    "mbert",
    "phi-3",
    "xglm",
    "bloom",
    "aya-expanse",
    "comma",
    "tokenmonster",
    "byt5",
]
available_tokenizers = TIKTOKENS + HF + ["tekken"]

# Tokenizers ticked when the UI first loads. Earlier revisions assigned
# ["xglm"] and then `available_tokenizers` before settling on "none selected";
# only the final value ever took effect, so only that one is kept.
pre_selected_tokenizers = []

# Every tokenization is appended to this file by `tokenize`; create it up front
# if it does not exist yet. `Path.touch` does not leave a file handle open the
# way a bare `open(OUT_FILE, "w")` does.
OUT_FILE = Path("paper-outs.txt")
if not OUT_FILE.exists():
    OUT_FILE.touch()

def tokenize(model, text):
    """Tokenize *text* with the backend matching *model* and log the outcome.

    The backend is picked from the model name: the tiktoken helper for the GPT
    names, the tekken helper for "tekken", the ByT5 helper for any name
    containing "byt5", and the Hugging Face helper for everything else.

    Each call appends three lines plus a blank separator to ``OUT_FILE``: the
    model name reported by the backend, the input text, and the token texts
    joined by commas.

    Returns:
        The result dict produced by the selected backend, unchanged.
    """
    if model in ("gpt-4", "gpt-2", "gpt-4o"):
        backend = tokenize_with_tiktoken
    elif model in ("tekken",):
        backend = tokenize_w_tekken
    elif "byt5" in model:
        backend = tokenize_with_byt5
    else:
        backend = tokenize_with_hf
    toks = backend(text, model)

    # UTF-8 is forced so non-ASCII token texts are written regardless of the
    # platform's default encoding.
    with open(OUT_FILE, "a", encoding="utf-8") as log:
        log.write(toks["model"] + "\n")
        log.write(f"Text: {text}\n")
        token_line = ",".join(str(entry["text"]) for entry in toks["tokens"])
        log.write(token_line + "\n")
        log.write("\n")
    return toks

def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with every selected model and build all UI outputs.

    Args:
        text: The text to tokenize.
        selected_models: Iterable of tokenizer names to run.
        show_details: When true, the detailed analysis markdown is generated;
            otherwise that slot is an empty string.

    Returns:
        A 6-tuple ``(efficiency markdown, tokenization HTML, token-id
        markdown, detailed markdown, efficiency chart, distribution chart)``.
        For blank input the first element is a prompt message, the strings are
        empty and both charts are ``None``.
    """
    if not text.strip():
        return ("Please enter some text to tokenize.", "", "", "", None, None)

    results = {name: tokenize(name, text) for name in selected_models}

    ranking_md, tokens_html, ids_md = generate_basic_comparison(results)
    if show_details:
        details_md = generate_detailed_analysis(results)
    else:
        details_md = ""

    return (
        ranking_md,
        tokens_html,
        ids_md,
        details_md,
        create_efficiency_chart(results),
        create_token_distribution_chart(results),
    )


def generate_basic_comparison(results):
    """Build the efficiency ranking, the token view and the token-id listing.

    Args:
        results: Mapping of model key to the result dict returned by
            ``tokenize``.

    Returns:
        A 3-tuple ``(ranking markdown, tokenization HTML, token-id markdown)``.
        With no results the first element is a placeholder message and the
        other two are empty strings.
    """
    if not results:
        return "No results to display.", "", ""

    def _rank_key(item):
        # Failed tokenizers go to the bottom of the list instead of competing
        # on token count, and a result without "token_count" no longer raises.
        result = item[1]
        return ("error" in result, result.get("token_count", 0))

    ranking_output = ["## 🏆 Efficiency Ranking (Fewer tokens = more efficient)"]
    for rank, (_, result) in enumerate(sorted(results.items(), key=_rank_key), 1):
        if "error" in result:
            ranking_output.append(
                f"{rank}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            ranking_output.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    # Interactive token view and per-tokenizer token-id listing.
    tokenization_html = generate_interactive_tokenization(results)
    token_ids_display = generate_token_ids_display(results)

    return "\n".join(ranking_output), tokenization_html, token_ids_display


# Static head of the token view: container opening tag, stylesheet, the
# floating info box and the highlight script. Duplicate CSS rules and the
# duplicate `highlightTokens` definition of the earlier revision are merged
# into the declarations that actually took effect.
_TOKENIZER_VIEW_HEAD = """
    <div id="tokenizer-container" class="tokenizer-container">
    <style>
    .tokenizer-container {
        display: flex;
        flex-wrap: wrap;
        justify-content: space-between;
        gap: 20px;
    }
    .tokenizer-section {
        margin-bottom: 20px;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 15px;
        background: white;
        flex-wrap: wrap;
        display: inline-block;
        justify-content: space-between;
    }
    .tokenizer-header {
        font-weight: bold;
        font-size: 18px;
        margin-bottom: 10px;
        color: #2c3e50;
    }
    .token-display {
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        line-height: 1.8;
        word-wrap: break-word;
    }
    .token {
        display: inline-block;
        margin: 2px;
        padding: 4px 8px;
        border-radius: 4px;
        border: 1px solid;
        cursor: pointer;
        transition: all 0.2s ease;
        position: relative;
        font-size: 14px;
        user-select: none;
    }
    .token:hover {
        transform: scale(1.05);
        z-index: 10;
        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    }
    .token.highlighted {
        background: #ff6b6b !important;
        border-color: #e55353 !important;
        color: white !important;
        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
        transform: scale(1.1) !important;
        z-index: 100 !important;
    }
    .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
    .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
    .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
    .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
    .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
    .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
    .token-subword {
        background: #fff8e1 !important;
        border-color: #ffc107 !important;
        border-style: dashed !important;
    }
    .token-stats {
        display: inline-block;
        margin-left: 10px;
        padding: 2px 6px;
        background: #f8f9fa;
        border-radius: 3px;
        font-size: 12px;
        color: #666;
    }
    /* Hidden until highlightTokens() shows it. */
    .highlight-info {
        position: fixed;
        top: 10px;
        right: 10px;
        background: #333;
        color: white;
        padding: 8px 12px;
        border-radius: 4px;
        font-size: 12px;
        display: none;
        z-index: 1000;
    }

    /* Multi-token span styles */
    .token-span-container {
        display: inline-flex;
        margin: 2px;
        cursor: pointer;
    }
    .token-multi-span {
        /* Distinctive background pattern for multi-token spans */
        background: repeating-linear-gradient(
            45deg,
            transparent,
            transparent 2px,
            rgba(0,0,0,0.1) 2px,
            rgba(0,0,0,0.1) 4px
        );
    }
    .token-multi-span.token-word {
        background: repeating-linear-gradient(45deg, #e8f5e8, #e8f5e8 4px, #d4edda 4px, #d4edda 8px);
    }
    .token-multi-span.token-number {
        background: repeating-linear-gradient(45deg, #f3e5f5, #f3e5f5 4px, #e1bee7 4px, #e1bee7 8px);
    }
    .token-multi-span.token-punctuation {
        background: repeating-linear-gradient(45deg, #ffebee, #ffebee 4px, #ffcdd2 4px, #ffcdd2 8px);
    }
    .token-span-part {
        margin: 0 !important;
        border-radius: 0 !important;
        position: relative;
        min-width: 20px;
        text-align: center;
        font-size: 11px;
        padding: 4px 6px;
        border: 1px dashed rgba(0,0,0,0.3) !important;
        pointer-events: none; /* Prevent individual box clicks */
    }
    .token-span-first {
        border-radius: 4px 0 0 4px !important;
    }
    .token-span-last {
        border-radius: 0 4px 4px 0 !important;
        border-right: 1px solid !important;
    }
    /* Connecting lines between boxes */
    .token-span-part:not(.token-span-last)::after {
        content: '';
        position: absolute;
        top: 0;
        right: -1px;
        width: 1px;
        height: 100%;
        background: rgba(0,0,0,0.3);
        z-index: 1;
    }
    /* Hover effect for entire multi-token span */
    .token-span-container:hover .token-span-part {
        transform: scale(1.05);
        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    }
    .token-span-container.highlighted .token-span-part {
        background: #ff6b6b !important;
        border-color: #e55353 !important;
        color: white !important;
        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
        transform: scale(1.1) !important;
        z-index: 100 !important;
    }
    /* Different patterns for different token types when multi-span */
    .token-multi-span.token-word .token-span-part {
        background: #e8f5e8;
        border-color: #4caf50;
        color: #2e7d32;
    }
    .token-multi-span.token-number .token-span-part {
        background: #f3e5f5;
        border-color: #9c27b0;
        color: #7b1fa2;
    }
    .token-multi-span.token-punctuation .token-span-part {
        background: #ffebee;
        border-color: #f44336;
        color: #c62828;
    }
    </style>

    <div class="highlight-info" id="highlight-info"></div>

    <script>
    function highlightTokens(targetText) {
        // Clear all highlights
        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
            element.classList.remove('highlighted');
        });

        // Highlight matching single tokens and multi-token spans
        let count = 0;
        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
            if (element.getAttribute('data-text') === targetText) {
                element.classList.add('highlighted');
                count++;
            }
        });

        // Show info
        const info = document.getElementById('highlight-info');
        if (info) {
            const displayText = targetText === ' ' ? '(space)' : targetText;
            info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
            info.style.display = 'block';
        }
    }

    function clearHighlights() {
        // Spans are cleared too, otherwise a highlighted multi-token span
        // stays lit after the pointer leaves it.
        document.querySelectorAll('.token, .token-span-container').forEach(function(element) {
            element.classList.remove('highlighted');
        });
        const info = document.getElementById('highlight-info');
        if (info) {
            info.style.display = 'none';
        }
    }
    </script>
    """


def _js_call_attr(func_name, *args):
    """Return ``func_name(args...)`` as text that is safe inside an HTML attribute.

    Arguments are encoded as JavaScript string literals with ``json.dumps`` and
    the whole call is then HTML-escaped, so quotes, backslashes, newlines and
    markup characters in token text cannot terminate the attribute or the
    string literal.
    """
    import json
    from html import escape

    call = f"{func_name}({', '.join(json.dumps(arg) for arg in args)})"
    return escape(call, quote=True)


def _render_token_html(model, position, token, token_text):
    """Render one token as a single box or as a multi-box span container.

    *token_text* is the already cleaned display text of *token*; *position* is
    the token's index in the tokenizer output. A token whose ``"id"`` is a list
    with more than one entry is drawn as one container holding a box per id.
    """
    from html import escape

    raw_id = token["id"]
    token_ids = raw_id if isinstance(raw_id, list) else [raw_id]
    token_type = token["type"]
    is_subword = token["is_subword"]

    token_class = f"token token-{token_type}"
    if is_subword:
        token_class += " token-subword"
    css_class = escape(token_class, quote=True)

    # Whitespace-only tokens are shown as a middle dot so they stay visible.
    display_text = token_text if token_text.strip() else "·"
    shown = escape(display_text).replace("\r", "\n")

    model_attr = escape(str(model), quote=True)
    text_attr = escape(token_text, quote=True)
    hover_attr = _js_call_attr("highlightTokens", token_text)

    if len(token_ids) <= 1:
        first_id = token_ids[0] if token_ids else ""
        title_attr = escape(
            f"Text: '{token_text}' | ID: {first_id} | Type: {token_type} "
            f"| Subword: {is_subword}",
            quote=True,
        )
        click_attr = _js_call_attr(
            "alert", f"Token: '{token_text}'\nID: {first_id}\nModel: {model}"
        )
        return (
            f'<span class="{css_class}" '
            f'id="token_{model_attr}_{position}" '
            f'data-text="{text_attr}" '
            f'data-id="{escape(str(first_id), quote=True)}" '
            f'data-position="{position}" '
            f'data-model="{model_attr}" '
            f'title="{title_attr}" '
            f'onmouseover="{hover_attr}" '
            f'onmouseout="clearHighlights()" '
            f'onclick="{click_attr}">{shown}</span>'
        )

    ids_csv = ",".join(map(str, token_ids))
    ids_attr = escape(ids_csv, quote=True)
    title_attr = escape(
        f"Text: '{token_text}' | IDs: [{ids_csv}] | Type: {token_type} "
        f"| Subword: {is_subword}",
        quote=True,
    )
    click_attr = _js_call_attr(
        "alert",
        f"Token: '{token_text}'\nIDs: [{ids_csv}]\nModel: {model}\n"
        f"Spans {len(token_ids)} token IDs",
    )
    pieces = [
        f'<span class="token-span-container" '
        f'id="span_{model_attr}_{position}_container" '
        f'data-text="{text_attr}" '
        f'data-ids="{ids_attr}" '
        f'data-position="{position}" '
        f'data-model="{model_attr}" '
        f'onmouseover="{hover_attr}" '
        f'onmouseout="clearHighlights()" '
        f'onclick="{click_attr}" '
        f'title="{title_attr}">'
    ]

    # One box per token id; only the first box carries the visible text, the
    # container handles hover and click for the whole group.
    last_index = len(token_ids) - 1
    for j, tid in enumerate(token_ids):
        if j == 0:
            part_suffix, content = " token-span-first", shown
        elif j == last_index:
            part_suffix, content = " token-span-last", ""
        else:
            part_suffix, content = " token-span-middle", ""
        pieces.append(
            f'<span class="{css_class} token-span-part{part_suffix}" '
            f'id="token_{model_attr}_{position}_{j}" '
            f'data-token-id="{escape(str(tid), quote=True)}">{content}</span>'
        )
    pieces.append("</span>")
    return "".join(pieces)


def generate_interactive_tokenization(results):
    """Generate the HTML token view with hover highlighting across tokenizers.

    Args:
        results: Mapping of model key to a tokenizer result dict. Results that
            contain an ``"error"`` key are rendered as an error box; all others
            must provide ``"model"``, ``"token_count"``, ``"encoding"``,
            ``"compression_ratio"`` and ``"tokens"`` (dicts with ``"text"``,
            ``"id"``, ``"type"`` and ``"is_subword"``).

    Returns:
        One HTML string containing styles, script and a section per tokenizer,
        or a short placeholder paragraph when *results* is empty.
    """
    from html import escape

    if not results:
        return "<p>No tokenization results to display.</p>"

    html_parts = [_TOKENIZER_VIEW_HEAD]

    for model, result in results.items():
        model_label = escape(str(result["model"]))

        if "error" in result:
            error_text = escape(str(result["error"]))
            html_parts.append(f"""
            <div class="tokenizer-section">
                <div class="tokenizer-header">{model_label} ❌</div>
                <div style="color: #d32f2f; font-style: italic;">Error: {error_text}</div>
            </div>
            """)
            continue

        tokens = result["tokens"]
        token_count = result["token_count"]
        encoding_label = escape(str(result["encoding"]))
        compression = result["compression_ratio"]
        html_parts.append(f"""
        <div class="tokenizer-section">
            <div class="tokenizer-header">
                {model_label}
                <span class="token-stats">
                    {token_count} tokens |
                    {encoding_label} |
                    {compression:.2f}x compression
                </span>
            </div>
            <div class="token-display">
        """)

        subword_count = 0
        for position, token in enumerate(tokens):
            token_text = clean_token_display(token["text"])
            if token_text == "<newline>":
                # Newline tokens become a line break rather than a token box.
                html_parts.append("<br>")
                continue
            if token["is_subword"]:
                subword_count += 1
            html_parts.append(_render_token_html(model, position, token, token_text))

        # The denominator is the number of tokens; guard against an empty
        # token list so the percentage never divides by zero.
        total_tokens = len(tokens)
        subword_pct = subword_count / total_tokens * 100 if total_tokens else 0.0
        html_parts.append(f"""
            </div>
            <div style="margin-top: 8px; font-size: 12px; color: #666;">
                Subwords: {subword_count}/{total_tokens}
                ({subword_pct:.1f}%)
            </div>
        </div>
        """)

    html_parts.append("</div>")
    return "".join(html_parts)


def generate_token_ids_display(results):
    """Render the token IDs of every tokenizer as a markdown report.

    Each successful result gets a heading, a vocabulary/encoding line, a fenced
    block with the IDs (ten per row) and a total/unique count. Results carrying
    an ``"error"`` key get a heading and the error message instead.

    Returns:
        The markdown text, or a placeholder sentence when *results* is empty.
    """
    if not results:
        return "No token IDs to display."

    report = ["## 🔢 Token IDs by Tokenizer"]

    for result in results.values():
        if "error" in result:
            report += [
                f"\n### {result['model']} ❌",
                f"Error: {result['error']}",
            ]
            continue

        report += [
            f"\n### {result['model']}",
            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}",
        ]

        # IDs are stringified once; both the listing and the unique count
        # work on these strings.
        id_strings = [str(entry["id"]) for entry in result["tokens"]]
        rows = [
            " ".join(id_strings[start : start + 10])
            for start in range(0, len(id_strings), 10)
        ]

        report += [
            "```",
            "\n".join(rows),
            "```",
            f"**Stats**: {len(id_strings)} total tokens, {len(set(id_strings))} unique IDs",
        ]

    return "\n".join(report)


def compare_with_normalization(
    text, selected_models, normalization_method, show_details=False
):
    """Tokenize the original text and, if requested, its normalized form.

    Args:
        text: The raw input text.
        selected_models: Iterable of tokenizer names to run.
        normalization_method: Name passed to ``normalize_text``; the value
            ``"none"`` skips the normalized pass.
        show_details: Accepted for signature compatibility; not used here.

    Returns:
        ``(original_results, normalized_results, normalized_text)`` where both
        result mappings are keyed by model name. ``normalized_results`` is
        empty when the method is ``"none"``.
    """
    normalized_text = normalize_text(text, normalization_method)
    print(
        "[DEBUG] Before normalization:", text, "\nAfter normalization:", normalized_text
    )

    original_results = {}
    normalized_results = {}

    for model in selected_models:
        original_results[model] = tokenize(model, text)
        if normalization_method != "none":
            # The normalized pass must run on the normalized text; tokenizing
            # `text` again would make both result sets identical.
            normalized_results[model] = tokenize(model, normalized_text)
    return original_results, normalized_results, normalized_text


def generate_detailed_analysis(results):
    """Build the markdown for the detailed cross-tokenizer analysis.

    The report has three parts: token texts shared by every successful
    tokenizer (at most 15 shown), the token-type counts per tokenizer, and the
    subword share per tokenizer. Results carrying an ``"error"`` key are
    ignored.

    Returns:
        The markdown text, or a hint when fewer than two results were given.
    """
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    usable = [result for result in results.values() if "error" not in result]
    output = ["## 🔍 Detailed Analysis"]

    # Tokens common to every successful tokenizer.
    if usable:
        common_tokens = set.intersection(
            *({token["text"] for token in result["tokens"]} for result in usable)
        )
        output.append(f"\n### Common Tokens ({len(common_tokens)})")
        if common_tokens:
            # Sorted so the same input always yields the same 15 tokens in the
            # same order; plain set iteration order varies between runs.
            shown = sorted(common_tokens, key=str)[:15]
            output.append(
                " ".join("`·`" if token == " " else f"`{token}`" for token in shown)
            )
        else:
            output.append("No common tokens found.")

    # Token type distribution.
    output.append("\n### Token Type Distribution")
    for result in usable:
        type_counts = Counter(token["type"] for token in result["tokens"])
        type_display = ", ".join(f"{kind}: {count}" for kind, count in type_counts.items())
        output.append(f"**{result['model']}**: {type_display}")

    # Subword analysis.
    output.append("\n### Subword Analysis")
    for result in usable:
        tokens = result["tokens"]
        subword_total = sum(1 for token in tokens if token["is_subword"])
        subword_ratio = subword_total / len(tokens) * 100 if tokens else 0
        output.append(
            f"**{result['model']}**: {subword_total} subwords ({subword_ratio:.1f}%)"
        )

    return "\n".join(output)


def create_efficiency_chart(results):
    """Build a bar chart of token counts per tokenizer.

    Results carrying an ``"error"`` key are left out.

    Returns:
        A plotly ``Figure`` with one bar per successful tokenizer, or ``None``
        when there are no results or none succeeded.
    """
    if not results:
        return None

    usable = [result for result in results.values() if "error" not in result]
    if not usable:
        return None

    # Only the name and the token count are plotted; the compression ratio was
    # previously collected here but never used.
    models = [result["model"] for result in usable]
    token_counts = [result["token_count"] for result in usable]

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )

    return fig


def create_token_distribution_chart(results):
    """Build a bar chart of token-type counts, colored by type, per tokenizer.

    Results carrying an ``"error"`` key are left out.

    Returns:
        A plotly figure, or ``None`` when there are no results or no
        successful result contributed any token.
    """
    if not results:
        return None

    rows = []
    for result in results.values():
        if "error" in result:
            continue
        per_type = Counter(entry["type"] for entry in result["tokens"])
        rows.extend(
            {
                "Tokenizer": result["model"],
                "Token Type": kind,
                "Count": amount,
            }
            for kind, amount in per_type.items()
        )

    if not rows:
        return None

    return px.bar(
        pd.DataFrame(rows),
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )


# Custom CSS handed to `gr.Blocks(css=...)` below: sets the app-wide font and
# gives `.token-display` elements a monospace font on a light padded
# background.
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""

# Create the Gradio interface: intro text, input column (sample picker and
# textbox), settings column (model and normalization tabs), then one row per
# output component.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool
    
    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
    
    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
    
    💡 **Try the sample texts** to see how tokenizers handle different challenges like:
    - Mixed languages and scripts
    - Programming code and JSON
    - Long compound words
    - Special characters and emojis
    - Technical terminology
    """)

    with gr.Row():
        with gr.Column(scale=2):
            # Sample texts dropdown. The first entry is the "no sample"
            # sentinel; most other entries are "label: text" and the handler
            # strips the label before filling the textbox.
            pre_choices = [
                "Custom text (enter below)",
                """
ᴾʸᵗʰᵒⁿ
ₚᵧₜₕₒₙ
P̲y̲t̲h̲o̲n̲
P̄ȳt̄h̄ōn̄
P̅y̅t̅h̅o̅n̅
ⓅⓎⓉⒽⓄⓃ
⒫⒴⒯⒣⒪⒩
🄿🅈🅃🄷🄾🄽
ⓅⓎⓉⒽⓄⓃ
Ｐｙｔｈｏｎ
Pʎʇɥou
Pyʇɥou
P̊ẙt̊h̊o̊n̊
Pëthøñ
P̶y̶t̶h̶o̶n̶
P̸y̸t̸h̸o̸n̸
P̷y̷t̷h̷o̷n̷
P̴y̴t̴h̴o̴n̴
𝒫𝓎𝓉𝒽𝑜𝓃
ℙ𝕪𝕥𝕙𝕠𝕟
                    """,
                "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
                "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
                "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
                "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
                "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56，价格为789美元。",
                "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
                "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
                "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
                "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
                "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
                "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
                "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
                "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
                "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
                "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
                "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
                "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
                'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
                "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
            ]
            sample_texts = gr.Dropdown(
                choices=pre_choices,
                value="Custom text (enter below)",
                label="Choose a sample text or enter your own",
                interactive=True,
            )

            # The textbox starts out filled with the second sample (the
            # stylized "Python" variants), even though the dropdown shows
            # the "Custom text" entry.
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here or select a sample above...",
                lines=4,
                value=pre_choices[1],
            )
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Models"):
                    model_selector = gr.CheckboxGroup(

                        choices=available_tokenizers,
                        value=pre_selected_tokenizers,
                        label="Select tokenizers to compare...",
                    )
                    show_details = gr.Checkbox(
                        label="Show detailed analysis", value=False
                    )

                with gr.TabItem("Normalization"):
                    # Choices are the first element of each entry returned
                    # by get_normalization_methods().
                    normalization_method = gr.Dropdown(
                        choices=[method[0] for method in get_normalization_methods()],
                        value="none",
                        label="Normalization Method",
                    )
                    show_normalization = gr.Checkbox(
                        label="Show normalized results", value=False
                    )
    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )

    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )
    with gr.Row():
        with gr.Column():
            # NOTE(review): created hidden, and no handler visible in this
            # file lists it as an output — confirm whether it is still needed.
            normalized_display = gr.HTML(
                label="Normalized Tokenization",
                value="<p>Enable normalization to see results...</p>",
                visible=False,
            )
    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )

    with gr.Row():
        with gr.Column():
            # NOTE(review): created with visible=False; the handlers in this
            # file only send it a value, never a visibility change — confirm.
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)

    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")
    # Function to update text input when sample is selected
    def update_text_from_sample(sample_choice):
        """Return the textbox update for the chosen dropdown entry.

        The "Custom text" entry leaves the textbox untouched. For any other
        entry the part after the first ": " is used; entries without that
        separator are used whole.
        """
        if sample_choice == "Custom text (enter below)":
            # An empty update keeps whatever the user has typed.
            return gr.update()

        _label, separator, remainder = sample_choice.partition(": ")
        chosen_text = remainder if separator else sample_choice
        return gr.update(value=chosen_text)

    # Update text input when sample is selected: the dropdown's value is passed
    # to update_text_from_sample and its return value is applied to text_input.
    sample_texts.change(
        fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
    )

    # Main comparison function
    def update_comparison_with_norm(text, models, details, norm_method, show_norm):
        """Recompute all outputs, optionally adding a normalized-text view.

        Args:
            text: Current content of the textbox.
            models: Selected tokenizer names.
            details: Whether the detailed analysis is requested.
            norm_method: Selected normalization method name.
            show_norm: Whether normalized results should be shown.

        Returns:
            Six values matching the handler's output components: efficiency
            markdown, tokenization HTML, token-id markdown, detailed markdown,
            efficiency chart and distribution chart.
        """
        from html import escape

        # The check must use the handler argument `norm_method`; comparing the
        # `normalization_method` Dropdown component to "none" is always False.
        # Blank text also takes the plain path, which returns the
        # "please enter text" message.
        if norm_method == "none" or not show_norm or not text.strip():
            return compare_tokenizers(text, models, details)

        # With normalization
        original_results, normalized_results, normalized_text = (
            compare_with_normalization(text, models, norm_method, details)
        )

        # Only the original ranking and token IDs are shown; the normalized
        # run contributes its token view.
        orig_eff, orig_html, orig_ids = generate_basic_comparison(original_results)
        _, norm_html, _ = generate_basic_comparison(normalized_results)

        # User text is escaped so markup characters in it cannot break the
        # heading.
        combined_html = (
            f"<h3>Normalized ({escape(str(norm_method))}) Text: "
            f"{escape(str(normalized_text))} </h3>{norm_html}\n"
            f"<h2>Original</h2>{orig_html}"
        )

        return (
            orig_eff,
            gr.update(value=combined_html, visible=True),
            orig_ids,
            "",
            None,
            None,
        )

    def update_comparison(text, models, details):
        """Run the plain comparison and return its six outputs unchanged."""
        outputs = compare_tokenizers(text, models, details)
        # Unpacking keeps the requirement that exactly six values come back.
        ranking, tokens_html, ids_md, details_md, count_fig, type_fig = outputs
        return ranking, tokens_html, ids_md, details_md, count_fig, type_fig

    # Auto-update on changes: every input component triggers the same handler
    # with the same inputs and outputs, so changing any of them refreshes all
    # six output components.
    for component in [
        text_input,
        model_selector,
        show_details,
        normalization_method,
        show_normalization,
    ]:
        component.change(
            fn=update_comparison_with_norm,
            inputs=[
                text_input,
                model_selector,
                show_details,
                normalization_method,
                show_normalization,
            ],
            outputs=[
                efficiency_output,
                tokenization_display,
                token_ids_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )

    # Static footer describing the tokenizer families and the app's features.
    gr.Markdown("""
    ---
    ### About the Models
    
    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
    - **Gemma-2**: Google's model with SentencePiece (though HuggingFace uses BPE)
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT/DistilBERT**: Google's models with WordPiece
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma (Common Pile)**: Common Pile's model with BPE
    - **Byt5**: Google's byte-level model 
    
    ### Features
    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()