Spaces:

gsaltintas
/

tokenizer-comparison

Running

App Files Files Community

Gül Sena Altıntaş committed on Jun 9

Commit

3a08f05

1 Parent(s): 14ea3d3

Added additional tokenizers

Browse files

Files changed (2) hide show

app.py +447 -92
requirements.txt +3 -1

app.py CHANGED Viewed

@@ -1,138 +1,493 @@
 import gradio as gr
 import tiktoken
 from transformers import AutoTokenizer
-import os
 # Model mappings
 MODEL_MAP = {
-    'llama-2': 'meta-llama/Llama-2-7b-hf',
-    'llama-3': 'meta-llama/Meta-Llama-3-8B',
-    'gemma-2': 'google/gemma-2-2b',
-    'qwen3': 'Qwen/Qwen2.5-0.5B',
-    'bert': 'bert-base-uncased'
 }
 def tokenize_with_tiktoken(text, model):
-    encoding = 'cl100k_base' if model == 'gpt-4' else 'gpt2'
     enc = tiktoken.get_encoding(encoding)
     tokens = enc.encode(text)
-    token_texts = [enc.decode([token]) for token in tokens]
     return {
-        'model': f'GPT-4' if model == 'gpt-4' else 'GPT-2',
-        'token_count': len(tokens),
-        'tokens': token_texts,
-        'token_ids': tokens.tolist()
     }
 def tokenize_with_hf(text, model):
     try:
-        model_name = MODEL_MAP.get(model, 'gpt2')
-        tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv('HF_TOKEN'))
         tokens = tokenizer.encode(text)
-        token_texts = [tokenizer.decode([token], skip_special_tokens=False) for token in tokens]
         return {
-            'model': model.upper(),
-            'token_count': len(tokens),
-            'tokens': token_texts,
-            'token_ids': tokens
         }
     except Exception as e:
         return {
-            'model': model.upper(),
-            'token_count': 0,
-            'tokens': [f"Error: {str(e)}"],
-            'token_ids': []
         }
-def compare_tokenizers(text, selected_models):
     if not text.strip():
-        return "Please enter some text to tokenize."
-    results = []
     for model in selected_models:
-        if model in ['gpt-4', 'gpt-2']:
-            result = tokenize_with_tiktoken(text, model)
         else:
-            result = tokenize_with_hf(text, model)
-        # Format output
-        tokens_display = ' | '.join([f'"{token}"' if token.strip() else '"·"' for token in result['tokens'][:20]])
-        if len(result['tokens']) > 20:
-            tokens_display += f" ... (+{len(result['tokens']) - 20} more)"
-        results.append(f"""
-**{result['model']}**
-- Token Count: **{result['token_count']}**
-- Tokens: {tokens_display}
-- Token IDs: {str(result['token_ids'][:10])}{'...' if len(result['token_ids']) > 10 else ''}
-        """)
-    return "\n\n---\n".join(results)
-# Create Gradio interface
 with gr.Blocks(
-    title="🔤 Tokenizer Comparison Tool",
-    theme=gr.themes.Soft()
 ) as demo:
     gr.Markdown("""
-    # 🔤 Tokenizer Comparison Tool
-    Compare how different LLM tokenizers split text into tokens. See the differences between GPT, LLaMA, Gemma, and other models.
-    """)
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
                 label="Text to tokenize",
-                placeholder="Hello world! This is a test with some subwords and punctuation.",
                 lines=4,
-                value="Hello world! This is a test with some subwords and punctuation."
             )
         with gr.Column(scale=1):
             model_selector = gr.CheckboxGroup(
-                choices=['gpt-4', 'gpt-2', 'llama-2', 'llama-3', 'gemma-2', 'qwen3', 'bert'],
-                value=['gpt-4', 'llama-3', 'gpt-2'],
-                label="Select tokenizers to compare"
             )
-    output = gr.Markdown(
-        label="Tokenization Results",
-        value="Enter text above to see tokenization results..."
-    )
-    # Auto-update on text or model change
-    text_input.change(
-        fn=compare_tokenizers,
-        inputs=[text_input, model_selector],
-        outputs=output
-    )
-    model_selector.change(
-        fn=compare_tokenizers,
-        inputs=[text_input, model_selector],
-        outputs=output
-    )
     gr.Markdown("""
-    ### Legend:
-    - **Token Count**: Number of tokens the model uses
-    - **Tokens**: The actual text pieces (subwords)
-    - **Token IDs**: Numerical IDs in the vocabulary
-    - **"·"**: Represents spaces/whitespace
-    ### Models:
-    - **GPT-4/GPT-2**: OpenAI tokenizers (tiktoken)
-    - **LLaMA**: Meta's models (SentencePiece)
-    - **Gemma**: Google's models
-    - **Qwen**: Alibaba's models
-    - **BERT**: Google's BERT tokenizer
     """)
 if __name__ == "__main__":
-    demo.launch()

+import json
+import os
+from collections import Counter
 import gradio as gr
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
 import tiktoken
 from transformers import AutoTokenizer
 # Model mappings
 MODEL_MAP = {
+    "llama-2": "meta-llama/Llama-2-7b-hf",
+    "llama-3": "meta-llama/Llama-3.2-1B",
+    "gemma-2": "google/gemma-2-2b",
+    "qwen3": "Qwen/Qwen3-0.6B",
+    "qwen2.5": "Qwen/Qwen2.5-0.5B",
+    "bert": "bert-base-uncased",
+    "bloom": "bigscience/bloom-560m",
+    "aya-expanse": "CohereForAI/aya-expanse-8b",
+    "comma": "common-pile/comma-v0.1-2tgpt2",
+    "byte-level": "google/byt5-small",
+    "tokenmonster": "alasdairforsythe/tokenmonster",
 }
+TOKENIZER_INFO = {
+    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
+    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
+    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
+    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
+    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
+    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
+    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
+    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
+    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
+    "aya-expanse": {
+        "name": "Aya Expanse",
+        "vocab_size": 256000,
+        "encoding": "SentencePiece",
+    },
+    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
+    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
+    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
+}
def get_token_type(token_text):
    """Classify a decoded token into a coarse category.

    Surrounding whitespace is ignored for every category except
    "whitespace" itself, so a word-start token such as " world" is reported
    as "word" instead of "mixed". Letters and digits are recognised with
    Unicode-aware checks, so non-ASCII words also count as "word".

    Args:
        token_text: The token's text.

    Returns:
        One of "whitespace", "word", "number", "punctuation", "special"
        (text wrapped in "<...>") or "mixed" (anything else, including the
        empty string).
    """
    import re

    if token_text.isspace():
        return "whitespace"

    # Classify on the visible part only; the leading space is just the
    # tokenizer's word-boundary encoding.
    core = token_text.strip()
    if core.isalpha():
        return "word"
    if core.isdecimal():
        return "number"
    if re.fullmatch(r"[^\w\s]+", core):
        return "punctuation"
    # Checked after punctuation on purpose: "<>" stays punctuation, while
    # "<s>" or "<|endoftext|>" (which contain word characters) land here.
    if core.startswith("<") and core.endswith(">"):
        return "special"
    return "mixed"
# Prefixes that mark "this token starts a new word": a literal space (what
# decode() yields for byte-level BPE tokens), the SentencePiece "▁" marker and
# the GPT-2 style "Ġ" marker (both only seen when raw vocabulary pieces are
# passed in rather than decoded text).
_WORD_START_MARKERS = (" ", "▁", "Ġ")


def is_subword(token_text, model, is_first):
    """Tell whether a token continues the previous word.

    Args:
        token_text: The token's text.
        model: UI key of the tokenizer; only "bert" is treated specially.
        is_first: True for the first token of the sequence, which can never
            be a continuation.

    Returns:
        For "bert": True when the text starts with the "##" continuation
        prefix. For every other model: True when the token is non-empty, is
        not the first token and does not start with a word-start marker.
    """
    if model == "bert":
        return token_text.startswith("##")
    if is_first or not token_text:
        return False
    # Accept every known word-start marker for every model. The previous
    # per-model list looked only for "▁" on llama-3/qwen3, whose decoded
    # tokens start with a plain space, so all of them were flagged.
    return not token_text.startswith(_WORD_START_MARKERS)
 def tokenize_with_tiktoken(text, model):
+    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
     enc = tiktoken.get_encoding(encoding)
     tokens = enc.encode(text)
+    token_data = []
+    current_pos = 0
+    for i, token_id in enumerate(tokens):
+        token_text = enc.decode([token_id])
+        token_type = get_token_type(token_text)
+        subword = is_subword(token_text, model, i == 0)
+        token_data.append(
+            {
+                "text": token_text,
+                "id": int(token_id),
+                "type": token_type,
+                "is_subword": subword,
+                "bytes": len(token_text.encode("utf-8")),
+                "position": i,
+            }
+        )
+        current_pos += len(token_text)
     return {
+        "model": TOKENIZER_INFO[model]["name"],
+        "token_count": len(tokens),
+        "tokens": token_data,
+        "compression_ratio": len(text) / len(tokens) if tokens else 0,
+        "encoding": TOKENIZER_INFO[model]["encoding"],
+        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
     }
 def tokenize_with_hf(text, model):
     try:
+        model_name = MODEL_MAP.get(model, "gpt2")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
+        )
         tokens = tokenizer.encode(text)
+        token_data = []
+        for i, token_id in enumerate(tokens):
+            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
         return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": len(tokens),
+            "tokens": token_data,
+            "compression_ratio": len(text) / len(tokens) if tokens else 0,
+            "encoding": TOKENIZER_INFO[model]["encoding"],
+            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
         }
     except Exception as e:
         return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": 0,
+            "tokens": [],
+            "compression_ratio": 0,
+            "encoding": "Error",
+            "vocab_size": 0,
+            "error": str(e),
         }
def compare_tokenizers(text, selected_models, show_details=False):
    """Run every selected tokenizer on *text* and build all UI outputs.

    Args:
        text: The text to tokenize; None, empty or whitespace-only input
            short-circuits with a prompt message.
        selected_models: Iterable of tokenizer keys; None is treated as an
            empty selection.
        show_details: When true, the detailed analysis text is generated;
            otherwise that slot is an empty string.

    Returns:
        A 4-tuple ``(basic_markdown, detailed_markdown, efficiency_chart,
        token_distribution_chart)``.
    """
    if not text or not text.strip():
        return "Please enter some text to tokenize.", "", None, None

    results = {}
    for model in selected_models or ():
        # The two OpenAI keys go through tiktoken; everything else is loaded
        # from the Hugging Face Hub.
        if model in ["gpt-4", "gpt-2"]:
            results[model] = tokenize_with_tiktoken(text, model)
        else:
            results[model] = tokenize_with_hf(text, model)

    # Generate outputs
    basic_output = generate_basic_comparison(results)
    detailed_output = generate_detailed_analysis(results) if show_details else ""
    efficiency_chart = create_efficiency_chart(results)
    token_distribution_chart = create_token_distribution_chart(results)
    return basic_output, detailed_output, efficiency_chart, token_distribution_chart
# Legend icon shown in front of a non-subword token, keyed by token type.
_TOKEN_TYPE_ICONS = {"word": "🔤", "number": "🔢", "punctuation": "❗"}


def _format_token_preview(token):
    """Render one token record as an inline-code chip with its legend icon."""
    token_text = token["text"]
    if token_text == " ":
        token_text = "·"  # Space indicator
    elif token_text.strip() == "":
        token_text = "⎵"  # Empty / other-whitespace token indicator

    # The subword marker takes precedence over the type icon.
    if token["is_subword"]:
        icon = "🔸"
    else:
        icon = _TOKEN_TYPE_ICONS.get(token["type"], "")
    return f"{icon}`{token_text}`"


def generate_basic_comparison(results):
    """Build the Markdown summary: efficiency ranking plus per-model details.

    Args:
        results: Mapping of tokenizer key to a result dict with "model",
            "token_count", "tokens", "compression_ratio", "encoding" and
            "vocab_size"; failed entries additionally carry "error".

    Returns:
        The Markdown text, or "No results to display." for an empty mapping.
    """
    if not results:
        return "No results to display."

    output = []

    # Efficiency ranking. Failed tokenizers report token_count 0, so sorting
    # on the count alone would rank them first; the (has_error, count) key
    # lists successful tokenizers by count and failures after them.
    sorted_models = sorted(
        results.items(),
        key=lambda item: ("error" in item[1], item[1]["token_count"]),
    )
    output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
    for rank, (_, result) in enumerate(sorted_models, start=1):
        if "error" in result:
            output.append(f"{rank}. **{result['model']}**: ❌ Error - {result['error']}")
        else:
            output.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    output.append("\n## 🔤 Tokenization Results")
    for result in results.values():
        if "error" in result:
            output.append(f"\n### ❌ {result['model']} - Error: {result['error']}")
            continue

        output.append(f"\n### {result['model']}")
        output.append(f"- **Tokens**: {result['token_count']}")
        output.append(f"- **Vocab Size**: {result['vocab_size']:,}")
        output.append(f"- **Encoding**: {result['encoding']}")
        output.append(f"- **Compression**: {result['compression_ratio']:.2f}x")

        # Only the first 20 tokens are previewed; the subword tally below
        # refers to that preview, not to the whole sequence.
        preview = result["tokens"][:20]
        tokens_display = [_format_token_preview(token) for token in preview]
        subword_count = sum(1 for token in preview if token["is_subword"])
        if len(result["tokens"]) > 20:
            tokens_display.append(f"... (+{len(result['tokens']) - 20} more)")

        output.append(f"- **Subwords**: {subword_count}/{len(preview)}")
        output.append(f"- **Tokens**: {' '.join(tokens_display)}")

    return "\n".join(output)
def generate_detailed_analysis(results):
    """Build the Markdown for the detailed cross-tokenizer analysis.

    Sections: token texts common to all successful tokenizers, the token
    type distribution per tokenizer, and the subword share per tokenizer.
    Entries carrying an "error" key are skipped in every section.

    Args:
        results: Mapping of tokenizer key to a result dict with "model" and
            "tokens" (records with "text", "type", "is_subword").

    Returns:
        The Markdown text, or a short notice when fewer than two results
        were supplied.
    """
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    successful = [result for result in results.values() if "error" not in result]

    output = ["## 🔍 Detailed Analysis"]

    # Find common tokens
    all_token_sets = [
        {token["text"] for token in result["tokens"]} for result in successful
    ]
    if all_token_sets:
        common_tokens = set.intersection(*all_token_sets)
        output.append(f"\n### Common Tokens ({len(common_tokens)})")
        if common_tokens:
            # Sort before truncating: set iteration order is arbitrary, so
            # an unsorted slice would show a different 15 tokens per run.
            common_display = [
                f"`{token}`" if token != " " else "`·`"
                for token in sorted(common_tokens)[:15]
            ]
            output.append(" ".join(common_display))
        else:
            output.append("No common tokens found.")

    # Token type distribution
    output.append("\n### Token Type Distribution")
    for result in successful:
        type_counts = Counter(token["type"] for token in result["tokens"])
        type_display = [f"{type_}: {count}" for type_, count in type_counts.items()]
        output.append(f"**{result['model']}**: {', '.join(type_display)}")

    # Subword analysis
    output.append("\n### Subword Analysis")
    for result in successful:
        subwords = [token for token in result["tokens"] if token["is_subword"]]
        subword_ratio = (
            len(subwords) / len(result["tokens"]) * 100 if result["tokens"] else 0
        )
        output.append(
            f"**{result['model']}**: {len(subwords)} subwords ({subword_ratio:.1f}%)"
        )

    return "\n".join(output)
def create_efficiency_chart(results):
    """Build a bar chart of token counts per successful tokenizer.

    Args:
        results: Mapping of tokenizer key to a result dict with "model" and
            "token_count"; entries carrying an "error" key are left out.

    Returns:
        A plotly Figure, or None when *results* is empty or every entry
        failed.
    """
    if not results:
        return None

    successful = [result for result in results.values() if "error" not in result]
    if not successful:
        return None

    models = [result["model"] for result in successful]
    token_counts = [result["token_count"] for result in successful]

    fig = go.Figure()

    # Add token count bars
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )

    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Build a bar chart of token-type counts, coloured by token type.

    Entries of *results* that carry an "error" key are ignored. Returns a
    plotly figure, or None when there is nothing to plot.
    """
    if not results:
        return None

    rows = []
    for outcome in results.values():
        if "error" in outcome:
            continue
        histogram = Counter(entry["type"] for entry in outcome["tokens"])
        rows.extend(
            {
                "Tokenizer": outcome["model"],
                "Token Type": kind,
                "Count": occurrences,
            }
            for kind, occurrences in histogram.items()
        )

    if not rows:
        return None

    return px.bar(
        pd.DataFrame(rows),
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
# Stylesheet injected into the Gradio page.
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""

# Build the Gradio interface. Components are created in the same order as
# before, because creation order inside the layout contexts decides the page
# layout.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool
    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
    """)

    # Input row: text on the left, tokenizer selection and options on the right.
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            tokenizer_choices = [
                "gpt-4",
                "gpt-2",
                "llama-2",
                "llama-3",
                "gemma-2",
                "qwen3",
                "qwen2.5",
                "bert",
                "bloom",
                "aya-expanse",
                "comma",
                "byte-level",
                "tokenmonster",
            ]
            model_selector = gr.CheckboxGroup(
                choices=tokenizer_choices,
                value=["gpt-4", "llama-3", "gpt-2"],
                label="Select tokenizers to compare",
            )
            show_details = gr.Checkbox(label="Show detailed analysis", value=False)

    # Textual results.
    with gr.Row():
        with gr.Column():
            basic_output = gr.Markdown(
                label="Comparison Results",
                value="Enter text above to see tokenization results...",
            )
    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)

    # Charts, side by side.
    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")

    # The detailed-analysis panel is shown only while the checkbox is ticked.
    def toggle_details(show_details):
        return gr.update(visible=show_details)

    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)

    # Thin wrapper around compare_tokenizers. It is kept as a named function
    # because the handler's name may be used by Gradio to label the endpoint.
    def update_comparison(text, models, details):
        return compare_tokenizers(text, models, details)

    # Recompute everything whenever any of the three inputs changes.
    comparison_inputs = [text_input, model_selector, show_details]
    comparison_outputs = [
        basic_output,
        detailed_output,
        efficiency_chart,
        distribution_chart,
    ]
    for trigger in (text_input, model_selector, show_details):
        trigger.change(
            fn=update_comparison,
            inputs=comparison_inputs,
            outputs=comparison_outputs,
        )

    gr.Markdown("""
    ---
    ### About the Models
    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece
    - **Gemma-2**: Google's model with SentencePiece
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT**: Google's BERT with WordPiece
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma AI**: Comma AI's model with BPE
    - **Byte-Level**: Byte-level BPE tokenizer
    - **TokenMonster**: Optimized tokenizer with BPE
    ### Features
    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)

if __name__ == "__main__":
    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,6 @@
 gradio
 tiktoken
 transformers
-torch

 gradio
 tiktoken
 transformers
+torch
+pandas
+plotly