Spaces:

gsaltintas
/

tokenizer-comparison

Running

App Files Files Community

Gül Sena Altıntaş commited on Jun 9

Commit

c02e89e

1 Parent(s): 3a08f05

Refactoring, and visual improvements

Browse files

Files changed (4) hide show

.gitignore +7 -0
app.py +245 -196
mappings.py +36 -0
utils.py +136 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+*.pyc
+*.pyo
+*.pyd
+*.pyw
+*.pyz
+*.pywz
+*.pyzw

app.py CHANGED Viewed

@@ -1,160 +1,16 @@
-import json
-import os
 from collections import Counter
 import gradio as gr
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
-import tiktoken
-from transformers import AutoTokenizer
-# Model mappings
-MODEL_MAP = {
-    "llama-2": "meta-llama/Llama-2-7b-hf",
-    "llama-3": "meta-llama/Llama-3.2-1B",
-    "gemma-2": "google/gemma-2-2b",
-    "qwen3": "Qwen/Qwen3-0.6B",
-    "qwen2.5": "Qwen/Qwen2.5-0.5B",
-    "bert": "bert-base-uncased",
-    "bloom": "bigscience/bloom-560m",
-    "aya-expanse": "CohereForAI/aya-expanse-8b",
-    "comma": "common-pile/comma-v0.1-2tgpt2",
-    "byte-level": "google/byt5-small",
-    "tokenmonster": "alasdairforsythe/tokenmonster",
-}
-TOKENIZER_INFO = {
-    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
-    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
-    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
-    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
-    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
-    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
-    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
-    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
-    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
-    "aya-expanse": {
-        "name": "Aya Expanse",
-        "vocab_size": 256000,
-        "encoding": "SentencePiece",
-    },
-    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
-    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
-    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
-}
-def get_token_type(token_text):
-    import re
-    if re.match(r"^\s+$", token_text):
-        return "whitespace"
-    elif re.match(r"^[a-zA-Z]+$", token_text):
-        return "word"
-    elif re.match(r"^\d+$", token_text):
-        return "number"
-    elif re.match(r"^[^\w\s]+$", token_text):
-        return "punctuation"
-    elif token_text.startswith("<") and token_text.endswith(">"):
-        return "special"
-    else:
-        return "mixed"
-def is_subword(token_text, model, is_first):
-    if model in ["llama-2", "llama-3", "qwen3"]:
-        return not token_text.startswith("▁") and not is_first
-    elif model == "bert":
-        return token_text.startswith("##")
-    else:  # BPE models
-        return not token_text.startswith(" ") and not is_first and len(token_text) > 0
-def tokenize_with_tiktoken(text, model):
-    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
-    enc = tiktoken.get_encoding(encoding)
-    tokens = enc.encode(text)
-    token_data = []
-    current_pos = 0
-    for i, token_id in enumerate(tokens):
-        token_text = enc.decode([token_id])
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
-        token_data.append(
-            {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
-            }
-        )
-        current_pos += len(token_text)
-    return {
-        "model": TOKENIZER_INFO[model]["name"],
-        "token_count": len(tokens),
-        "tokens": token_data,
-        "compression_ratio": len(text) / len(tokens) if tokens else 0,
-        "encoding": TOKENIZER_INFO[model]["encoding"],
-        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
-    }
-def tokenize_with_hf(text, model):
-    try:
-        model_name = MODEL_MAP.get(model, "gpt2")
-        tokenizer = AutoTokenizer.from_pretrained(
-            model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
-        )
-        tokens = tokenizer.encode(text)
-        token_data = []
-        for i, token_id in enumerate(tokens):
-            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
-            token_type = get_token_type(token_text)
-            subword = is_subword(token_text, model, i == 0)
-            token_data.append(
-                {
-                    "text": token_text,
-                    "id": int(token_id),
-                    "type": token_type,
-                    "is_subword": subword,
-                    "bytes": len(token_text.encode("utf-8")),
-                    "position": i,
-                }
-            )
-        return {
-            "model": TOKENIZER_INFO[model]["name"],
-            "token_count": len(tokens),
-            "tokens": token_data,
-            "compression_ratio": len(text) / len(tokens) if tokens else 0,
-            "encoding": TOKENIZER_INFO[model]["encoding"],
-            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
-        }
-    except Exception as e:
-        return {
-            "model": TOKENIZER_INFO[model]["name"],
-            "token_count": 0,
-            "tokens": [],
-            "compression_ratio": 0,
-            "encoding": "Error",
-            "vocab_size": 0,
-            "error": str(e),
-        }
 def compare_tokenizers(text, selected_models, show_details=False):
     if not text.strip():
-        return "Please enter some text to tokenize.", "", None, None
     results = {}
@@ -165,77 +21,252 @@ def compare_tokenizers(text, selected_models, show_details=False):
             results[model] = tokenize_with_hf(text, model)
     # Generate outputs
-    basic_output = generate_basic_comparison(results)
     detailed_output = generate_detailed_analysis(results) if show_details else ""
     efficiency_chart = create_efficiency_chart(results)
     token_distribution_chart = create_token_distribution_chart(results)
-    return basic_output, detailed_output, efficiency_chart, token_distribution_chart
 def generate_basic_comparison(results):
     if not results:
-        return "No results to display."
-    output = []
     # Efficiency ranking
     sorted_models = sorted(results.items(), key=lambda x: x[1]["token_count"])
-    output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
     for i, (model, result) in enumerate(sorted_models):
         if "error" in result:
-            output.append(
                 f"{i + 1}. **{result['model']}**: ❌ Error - {result['error']}"
             )
         else:
-            output.append(
                 f"{i + 1}. **{result['model']}**: {result['token_count']} tokens "
                 f"({result['compression_ratio']:.2f}x compression)"
             )
-    output.append("\n## 🔤 Tokenization Results")
     for model, result in results.items():
         if "error" in result:
-            output.append(f"\n### ❌ {result['model']} - Error: {result['error']}")
             continue
-        output.append(f"\n### {result['model']}")
-        output.append(f"- **Tokens**: {result['token_count']}")
-        output.append(f"- **Vocab Size**: {result['vocab_size']:,}")
-        output.append(f"- **Encoding**: {result['encoding']}")
-        output.append(f"- **Compression**: {result['compression_ratio']:.2f}x")
-        # Show first 20 tokens with visual indicators
-        tokens_display = []
         subword_count = 0
-        for token in result["tokens"][:20]:
             token_text = token["text"]
-            if token_text == " ":
-                token_text = "·"  # Space indicator
-            elif token_text.strip() == "":
-                token_text = "⎵"  # Empty token indicator
-            # Add type indicators
             if token["is_subword"]:
-                tokens_display.append(f"🔸`{token_text}`")
                 subword_count += 1
-            elif token["type"] == "word":
-                tokens_display.append(f"🔤`{token_text}`")
-            elif token["type"] == "number":
-                tokens_display.append(f"🔢`{token_text}`")
-            elif token["type"] == "punctuation":
-                tokens_display.append(f"❗`{token_text}`")
-            else:
-                tokens_display.append(f"`{token_text}`")
-        if len(result["tokens"]) > 20:
-            tokens_display.append(f"... (+{len(result['tokens']) - 20} more)")
-        output.append(f"- **Subwords**: {subword_count}/{len(result['tokens'][:20])}")
-        output.append(f"- **Tokens**: {' '.join(tokens_display)}")
     return "\n".join(output)
@@ -414,8 +445,10 @@ with gr.Blocks(
                     "bloom",
                     "aya-expanse",
                     "comma",
-                    "byte-level",
                     "tokenmonster",
                 ],
                 value=["gpt-4", "llama-3", "gpt-2"],
                 label="Select tokenizers to compare",
@@ -425,9 +458,22 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column():
-            basic_output = gr.Markdown(
-                label="Comparison Results",
-                value="Enter text above to see tokenization results...",
             )
     with gr.Row():
@@ -448,10 +494,10 @@ with gr.Blocks(
     # Main comparison function
     def update_comparison(text, models, details):
-        basic, detailed, eff_chart, dist_chart = compare_tokenizers(
-            text, models, details
         )
-        return basic, detailed, eff_chart, dist_chart
     # Auto-update on changes
     for component in [text_input, model_selector, show_details]:
@@ -459,7 +505,9 @@ with gr.Blocks(
             fn=update_comparison,
             inputs=[text_input, model_selector, show_details],
             outputs=[
-                basic_output,
                 detailed_output,
                 efficiency_chart,
                 distribution_chart,
@@ -474,12 +522,11 @@ with gr.Blocks(
     - **LLaMA-2/3**: Meta's models using SentencePiece
     - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
-    - **BERT**: Google's BERT with WordPiece
     - **BLOOM**: BigScience's multilingual model with BPE
     - **Aya Expanse**: Cohere's multilingual model with SentencePiece
-    - **Comma AI**: Comma AI's model with BPE
-    - **Byte-Level**: Byte-level BPE tokenizer
-    - **TokenMonster**: Optimized tokenizer with BPE
     ### Features
     - **Efficiency Ranking**: Compare token counts across models
@@ -491,3 +538,5 @@ with gr.Blocks(
 if __name__ == "__main__":
     demo.launch()

 from collections import Counter
 import gradio as gr
 import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
+from utils import tokenize_with_hf, tokenize_with_tiktoken
 def compare_tokenizers(text, selected_models, show_details=False):
     if not text.strip():
+        return "Please enter some text to tokenize.", "", "", "", None, None
     results = {}
             results[model] = tokenize_with_hf(text, model)
     # Generate outputs
+    efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
+        results
+    )
     detailed_output = generate_detailed_analysis(results) if show_details else ""
     efficiency_chart = create_efficiency_chart(results)
     token_distribution_chart = create_token_distribution_chart(results)
+    return (
+        efficiency_output,
+        tokenization_html,
+        token_ids_output,
+        detailed_output,
+        efficiency_chart,
+        token_distribution_chart,
+    )
 def generate_basic_comparison(results):
     if not results:
+        return "No results to display.", "", ""
     # Efficiency ranking
     sorted_models = sorted(results.items(), key=lambda x: x[1]["token_count"])
+    ranking_output = []
+    ranking_output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
     for i, (model, result) in enumerate(sorted_models):
         if "error" in result:
+            ranking_output.append(
                 f"{i + 1}. **{result['model']}**: ❌ Error - {result['error']}"
             )
         else:
+            ranking_output.append(
                 f"{i + 1}. **{result['model']}**: {result['token_count']} tokens "
                 f"({result['compression_ratio']:.2f}x compression)"
             )
+    # Generate interactive tokenization display
+    tokenization_html = generate_interactive_tokenization(results)
+    # Generate token ID tables
+    token_ids_display = generate_token_ids_display(results)
+    return "\n".join(ranking_output), tokenization_html, token_ids_display
+def generate_interactive_tokenization(results):
+    """Generate HTML with hover highlighting across tokenizers"""
+    if not results:
+        return "<p>No tokenization results to display.</p>"
+    html_parts = []
+    html_parts.append("""
+    <style>
+    .tokenizer-container {
+        margin-bottom: 20px;
+        border: 1px solid #e0e0e0;
+        border-radius: 8px;
+        padding: 15px;
+        background: white;
+    }
+    .tokenizer-header {
+        font-weight: bold;
+        font-size: 18px;
+        margin-bottom: 10px;
+        color: #2c3e50;
+    }
+    .token-display {
+        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
+        line-height: 1.8;
+        word-wrap: break-word;
+    }
+    .token {
+        display: inline-block;
+        margin: 2px;
+        padding: 4px 8px;
+        border-radius: 4px;
+        border: 1px solid;
+        cursor: pointer;
+        transition: all 0.2s ease;
+        position: relative;
+        font-size: 14px;
+    }
+    .token:hover {
+        transform: scale(1.1);
+        z-index: 10;
+        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
+    }
+    .token.highlighted {
+        background: #ff6b6b !important;
+        border-color: #e55353 !important;
+        color: white !important;
+        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
+    }
+    .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
+    .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
+    .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
+    .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
+    .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
+    .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
+    .token-subword {
+        background: #fff8e1 !important;
+        border-color: #ffc107 !important;
+        border-style: dashed !important;
+    }
+    .token-stats {
+        display: inline-block;
+        margin-left: 10px;
+        padding: 2px 6px;
+        background: #f8f9fa;
+        border-radius: 3px;
+        font-size: 12px;
+        color: #666;
+    }
+    </style>
+    <script>
+    function highlightToken(text, allTokenizers) {
+        // Remove existing highlights
+        document.querySelectorAll('.token').forEach(token => {
+            token.classList.remove('highlighted');
+        });
+        // Highlight tokens with same text across all tokenizers
+        document.querySelectorAll('.token').forEach(token => {
+            if (token.dataset.text === text) {
+                token.classList.add('highlighted');
+            }
+        });
+    }
+    function clearHighlights() {
+        document.querySelectorAll('.token').forEach(token => {
+            token.classList.remove('highlighted');
+        });
+    }
+    </script>
+    """)
     for model, result in results.items():
         if "error" in result:
+            html_parts.append(f"""
+            <div class="tokenizer-container">
+                <div class="tokenizer-header">{result["model"]} ❌</div>
+                <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
+            </div>
+            """)
             continue
+        html_parts.append(f"""
+        <div class="tokenizer-container">
+            <div class="tokenizer-header">
+                {result["model"]}
+                <span class="token-stats">
+                    {result["token_count"]} tokens |
+                    {result["encoding"]} |
+                    {result["compression_ratio"]:.2f}x compression
+                </span>
+            </div>
+            <div class="token-display">
+        """)
+        # Add tokens with hover functionality
         subword_count = 0
+        for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
+            display_text = (
+                token_text if token_text.strip() else "·"
+            )  # Show space as dot
+            # Determine token class
+            token_class = f"token token-{token['type']}"
             if token["is_subword"]:
+                token_class += " token-subword"
                 subword_count += 1
+            # Escape text for HTML
+            escaped_text = token_text.replace('"', "&quot;").replace("'", "&#39;")
+            escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
+            html_parts.append(f"""
+                <span class="{token_class}"
+                      data-text="{escaped_text}"
+                      data-id="{token["id"]}"
+                      data-position="{i}"
+                      title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
+                      onmouseover="highlightToken('{escaped_text}', true)"
+                      onmouseout="clearHighlights()">
+                    {escaped_display}
+                </span>
+            """)
+        html_parts.append(f"""
+            </div>
+            <div style="margin-top: 8px; font-size: 12px; color: #666;">
+                Subwords: {subword_count}/{len(result["tokens"])}
+                ({subword_count / len(result["tokens"]) * 100:.1f}%)
+            </div>
+        </div>
+        """)
+    return "".join(html_parts)
+def generate_token_ids_display(results):
+    """Generate a clean display of token IDs for each tokenizer"""
+    if not results:
+        return "No token IDs to display."
+    output = []
+    output.append("## 🔢 Token IDs by Tokenizer")
+    for model, result in results.items():
+        if "error" in result:
+            output.append(f"\n### {result['model']} ❌")
+            output.append(f"Error: {result['error']}")
+            continue
+        output.append(f"\n### {result['model']}")
+        output.append(
+            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
+        )
+        # Display token IDs in a readable format
+        token_ids = [str(token["id"]) for token in result["tokens"]]
+        # Group IDs for better readability (10 per line)
+        lines = []
+        for i in range(0, len(token_ids), 10):
+            line_ids = token_ids[i : i + 10]
+            lines.append(" ".join(line_ids))
+        output.append("```")
+        output.append("\n".join(lines))
+        output.append("```")
+        # Add some statistics
+        unique_ids = len(set(token_ids))
+        output.append(
+            f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
+        )
+        # Show ID ranges
+        id_values = [token["id"] for token in result["tokens"]]
+        if id_values:
+            output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")
     return "\n".join(output)
                     "bloom",
                     "aya-expanse",
                     "comma",
+                    "roberta",
+                    "distilbert",
                     "tokenmonster",
+                    "byt5",
                 ],
                 value=["gpt-4", "llama-3", "gpt-2"],
                 label="Select tokenizers to compare",
     with gr.Row():
         with gr.Column():
+            efficiency_output = gr.Markdown(
+                label="Efficiency Ranking",
+                value="Enter text above to see efficiency comparison...",
+            )
+    with gr.Row():
+        with gr.Column():
+            tokenization_display = gr.HTML(
+                label="Interactive Tokenization (Hover to highlight across tokenizers)",
+                value="<p>Enter text above to see interactive tokenization...</p>",
+            )
+    with gr.Row():
+        with gr.Column():
+            token_ids_output = gr.Markdown(
+                label="Token IDs", value="Token IDs will appear here..."
             )
     with gr.Row():
     # Main comparison function
     def update_comparison(text, models, details):
+        efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart = (
+            compare_tokenizers(text, models, details)
         )
+        return efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart
     # Auto-update on changes
     for component in [text_input, model_selector, show_details]:
             fn=update_comparison,
             inputs=[text_input, model_selector, show_details],
             outputs=[
+                efficiency_output,
+                tokenization_display,
+                token_ids_output,
                 detailed_output,
                 efficiency_chart,
                 distribution_chart,
     - **LLaMA-2/3**: Meta's models using SentencePiece
     - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
+    - **BERT/DistilBERT**: Google's models with WordPiece
+    - **RoBERTa**: Facebook's model with BPE
     - **BLOOM**: BigScience's multilingual model with BPE
     - **Aya Expanse**: Cohere's multilingual model with SentencePiece
+    - **Comma (Common Pile)**: Common Pile's model with BPE
     ### Features
     - **Efficiency Ranking**: Compare token counts across models
 if __name__ == "__main__":
     demo.launch()
+    demo.launch()
+    demo.launch()

mappings.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# Model mappings
+MODEL_MAP = {
+    "llama-2": "meta-llama/Llama-2-7b-hf",
+    "llama-3": "meta-llama/Llama-3.2-1B",
+    "gemma-2": "google/gemma-2-2b",
+    "qwen3": "Qwen/Qwen3-0.6B",
+    "qwen2.5": "Qwen/Qwen2.5-0.5B",
+    "bert": "bert-base-uncased",
+    "bloom": "bigscience/bloom-560m",
+    "aya-expanse": "CohereForAI/aya-expanse-8b",
+    "comma": "common-pile/comma-v0.1-2t",
+    "byte-level": "google/byt5-small",
+    "tokenmonster": "alasdairforsythe/tokenmonster",
+    "byt5": "google/byt5-small",
+}
+TOKENIZER_INFO = {
+    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
+    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
+    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
+    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
+    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
+    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
+    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
+    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
+    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
+    "aya-expanse": {
+        "name": "Aya Expanse",
+        "vocab_size": 256000,
+        "encoding": "SentencePiece",
+    },
+    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
+    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
+    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
+    "byt5": {"name": "Byt5", "vocab_size": 50000, "encoding": "BPE"},
+}

utils.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import os
+import re
+import tiktoken
+from transformers import AutoTokenizer
+from mappings import MODEL_MAP, TOKENIZER_INFO
+def get_token_type(token_text):
+    if re.match(r"^\s+$", token_text):
+        return "whitespace"
+    elif re.match(r"^[a-zA-Z]+$", token_text):
+        return "word"
+    elif re.match(r"^\d+$", token_text):
+        return "number"
+    elif re.match(r"^[^\w\s]+$", token_text):
+        return "punctuation"
+    elif token_text.startswith("<") and token_text.endswith(">"):
+        return "special"
+    else:
+        return "mixed"
+def is_subword(token_text, model, is_first):
+    if model in ["llama-2", "llama-3", "qwen3"]:
+        return not token_text.startswith("▁") and not is_first
+    elif model == "bert":
+        return token_text.startswith("##")
+    else:  # BPE models
+        return not token_text.startswith(" ") and not is_first and len(token_text) > 0
+def tokenize_with_tiktoken(text, model):
+    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
+    enc = tiktoken.get_encoding(encoding)
+    tokens = enc.encode(text)
+    token_data = []
+    current_pos = 0
+    for i, token_id in enumerate(tokens):
+        token_text = enc.decode([token_id])
+        token_type = get_token_type(token_text)
+        subword = is_subword(token_text, model, i == 0)
+        token_data.append(
+            {
+                "text": token_text,
+                "id": int(token_id),
+                "type": token_type,
+                "is_subword": subword,
+                "bytes": len(token_text.encode("utf-8")),
+                "position": i,
+            }
+        )
+        current_pos += len(token_text)
+    return {
+        "model": TOKENIZER_INFO[model]["name"],
+        "token_count": len(tokens),
+        "tokens": token_data,
+        "compression_ratio": len(text) / len(tokens) if tokens else 0,
+        "encoding": TOKENIZER_INFO[model]["encoding"],
+        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+    }
+def tokenize_with_hf(text, model):
+    try:
+        model_name = MODEL_MAP.get(model, "gpt2")
+        # Get token from environment
+        hf_token = os.getenv("HF_TOKEN")
+        if not hf_token:
+            return {
+                "model": TOKENIZER_INFO[model]["name"],
+                "token_count": 0,
+                "tokens": [],
+                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
+            }
+        print(f"DEBUG: Loading model {model_name} with token")
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name, token=hf_token, trust_remote_code=True
+        )
+        tokens = tokenizer.encode(text)
+        token_data = []
+        for i, token_id in enumerate(tokens):
+            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": len(tokens),
+            "tokens": token_data,
+            "compression_ratio": len(text) / len(tokens) if tokens else 0,
+            "encoding": TOKENIZER_INFO[model]["encoding"],
+            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
+        }
+    except Exception as e:
+        error_msg = str(e)
+        # Provide helpful error messages
+        if "gated repo" in error_msg.lower():
+            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
+        elif "401" in error_msg:
+            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
+        elif "not found" in error_msg.lower():
+            error_msg = (
+                f"Model {model_name} not found. It may have been moved or renamed."
+            )
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": 0,
+            "tokens": [],
+            "compression_ratio": 0,
+            "encoding": "Error",
+            "vocab_size": 0,
+            "error": error_msg,
+        }