Spaces:

gsaltintas
/

tokenizer-comparison

Sleeping

App Files Files Community

Gül Sena Altıntaş committed on Jun 9

Commit

452a924

1 Parent(s): c02e89e

Added sample texts

Browse files

Files changed (1) hide show

app.py +120 -37

app.py CHANGED Viewed

@@ -68,14 +68,17 @@ def generate_basic_comparison(results):
 def generate_interactive_tokenization(results):
-    """Generate HTML with hover highlighting across tokenizers"""
     if not results:
         return "<p>No tokenization results to display.</p>"
     html_parts = []
     html_parts.append("""
     <style>
-    .tokenizer-container {
         margin-bottom: 20px;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
@@ -103,9 +106,10 @@ def generate_interactive_tokenization(results):
         transition: all 0.2s ease;
         position: relative;
         font-size: 14px;
     }
     .token:hover {
-        transform: scale(1.1);
         z-index: 10;
         box-shadow: 0 2px 8px rgba(0,0,0,0.2);
     }
@@ -113,7 +117,9 @@ def generate_interactive_tokenization(results):
         background: #ff6b6b !important;
         border-color: #e55353 !important;
         color: white !important;
-        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
     }
     .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
     .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
@@ -135,35 +141,64 @@ def generate_interactive_tokenization(results):
         font-size: 12px;
         color: #666;
     }
     </style>
     <script>
-    function highlightToken(text, allTokenizers) {
-        // Remove existing highlights
-        document.querySelectorAll('.token').forEach(token => {
             token.classList.remove('highlighted');
         });
-        // Highlight tokens with same text across all tokenizers
-        document.querySelectorAll('.token').forEach(token => {
-            if (token.dataset.text === text) {
                 token.classList.add('highlighted');
             }
         });
     }
     function clearHighlights() {
-        document.querySelectorAll('.token').forEach(token => {
             token.classList.remove('highlighted');
         });
     }
     </script>
     """)
     for model, result in results.items():
         if "error" in result:
             html_parts.append(f"""
-            <div class="tokenizer-container">
                 <div class="tokenizer-header">{result["model"]} ❌</div>
                 <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
             </div>
@@ -171,7 +206,7 @@ def generate_interactive_tokenization(results):
             continue
         html_parts.append(f"""
-        <div class="tokenizer-container">
             <div class="tokenizer-header">
                 {result["model"]}
                 <span class="token-stats">
@@ -183,13 +218,11 @@ def generate_interactive_tokenization(results):
             <div class="token-display">
         """)
-        # Add tokens with hover functionality
         subword_count = 0
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
-            display_text = (
-                token_text if token_text.strip() else "·"
-            )  # Show space as dot
             # Determine token class
             token_class = f"token token-{token['type']}"
@@ -197,21 +230,31 @@ def generate_interactive_tokenization(results):
                 token_class += " token-subword"
                 subword_count += 1
-            # Escape text for HTML
-            escaped_text = token_text.replace('"', "&quot;").replace("'", "&#39;")
             escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
-            html_parts.append(f"""
-                <span class="{token_class}"
-                      data-text="{escaped_text}"
                       data-id="{token["id"]}"
                       data-position="{i}"
                       title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
-                      onmouseover="highlightToken('{escaped_text}', true)"
-                      onmouseout="clearHighlights()">
-                    {escaped_display}
-                </span>
-            """)
         html_parts.append(f"""
             </div>
@@ -222,6 +265,7 @@ def generate_interactive_tokenization(results):
         </div>
         """)
     return "".join(html_parts)
@@ -420,13 +464,44 @@ with gr.Blocks(
     Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
     **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
     """)
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
                 label="Text to tokenize",
-                placeholder="Enter your text here...",
                 lines=4,
                 value="Hello world! This is a test with some subwords and punctuation.",
             )
@@ -445,8 +520,6 @@ with gr.Blocks(
                     "bloom",
                     "aya-expanse",
                     "comma",
-                    "roberta",
-                    "distilbert",
                     "tokenmonster",
                     "byt5",
                 ],
@@ -486,11 +559,23 @@ with gr.Blocks(
         with gr.Column():
             distribution_chart = gr.Plot(label="Token Type Distribution")
-    # Update visibility of detailed analysis
-    def toggle_details(show_details):
-        return gr.update(visible=show_details)
-    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)
     # Main comparison function
     def update_comparison(text, models, details):
@@ -523,10 +608,10 @@ with gr.Blocks(
     - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
     - **BERT/DistilBERT**: Google's models with WordPiece
-    - **RoBERTa**: Facebook's model with BPE
     - **BLOOM**: BigScience's multilingual model with BPE
     - **Aya Expanse**: Cohere's multilingual model with SentencePiece
     - **Comma (Common Pile)**: Common Pile's model with BPE
     ### Features
     - **Efficiency Ranking**: Compare token counts across models
@@ -538,5 +623,3 @@ with gr.Blocks(
 if __name__ == "__main__":
     demo.launch()
-    demo.launch()
-    demo.launch()

 def generate_interactive_tokenization(results):
+    """Generate HTML with working hover highlighting across tokenizers"""
     if not results:
         return "<p>No tokenization results to display.</p>"
     html_parts = []
+    # Add styles first
     html_parts.append("""
+    <div id="tokenizer-container">
     <style>
+    .tokenizer-section {
         margin-bottom: 20px;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
         transition: all 0.2s ease;
         position: relative;
         font-size: 14px;
+        user-select: none;
     }
     .token:hover {
+        transform: scale(1.05);
         z-index: 10;
         box-shadow: 0 2px 8px rgba(0,0,0,0.2);
     }
         background: #ff6b6b !important;
         border-color: #e55353 !important;
         color: white !important;
+        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
+        transform: scale(1.1) !important;
+        z-index: 100 !important;
     }
     .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
     .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
         font-size: 12px;
         color: #666;
     }
+    .highlight-info {
+        position: fixed;
+        top: 10px;
+        right: 10px;
+        background: #333;
+        color: white;
+        padding: 8px 12px;
+        border-radius: 4px;
+        font-size: 12px;
+        display: none;
+        z-index: 1000;
+    }
     </style>
+    <div class="highlight-info" id="highlight-info"></div>
     <script>
+    function highlightTokens(targetText) {
+        // Clear all highlights
+        document.querySelectorAll('.token').forEach(function(token) {
             token.classList.remove('highlighted');
         });
+        // Highlight matching tokens
+        let count = 0;
+        document.querySelectorAll('.token').forEach(function(token) {
+            if (token.getAttribute('data-text') === targetText) {
                 token.classList.add('highlighted');
+                count++;
             }
         });
+        // Show info
+        const info = document.getElementById('highlight-info');
+        if (info) {
+            const displayText = targetText === ' ' ? '(space)' : targetText;
+            info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
+            info.style.display = 'block';
+        }
     }
     function clearHighlights() {
+        document.querySelectorAll('.token').forEach(function(token) {
             token.classList.remove('highlighted');
         });
+        const info = document.getElementById('highlight-info');
+        if (info) {
+            info.style.display = 'none';
+        }
     }
     </script>
     """)
+    # Generate tokenizer sections with inline event handlers
     for model, result in results.items():
         if "error" in result:
             html_parts.append(f"""
+            <div class="tokenizer-section">
                 <div class="tokenizer-header">{result["model"]} ❌</div>
                 <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
             </div>
             continue
         html_parts.append(f"""
+        <div class="tokenizer-section">
             <div class="tokenizer-header">
                 {result["model"]}
                 <span class="token-stats">
             <div class="token-display">
         """)
+        # Add tokens with inline event handlers
         subword_count = 0
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
+            display_text = token_text if token_text.strip() else "·"
             # Determine token class
             token_class = f"token token-{token['type']}"
                 token_class += " token-subword"
                 subword_count += 1
+            # Create unique identifier for this token occurrence
+            token_id = f"token_{model}_{i}"
+            # Escape text for HTML and JavaScript - be very careful with quotes
+            escaped_text = (
+                token_text.replace("\\", "\\\\")
+                .replace("'", "\\'")
+                .replace('"', '\\"')
+                .replace("\n", "\\n")
+                .replace("\r", "\\r")
+            )
             escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
+            # Use inline event handlers that definitely work in Gradio
+            html_parts.append(f"""<span class="{token_class}"
+                      id="{token_id}"
+                      data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
                       data-id="{token["id"]}"
                       data-position="{i}"
+                      data-model="{model}"
                       title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
+                      onmouseover="highlightTokens('{escaped_text}')"
+                      onmouseout="clearHighlights()"
+                      onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
         html_parts.append(f"""
             </div>
         </div>
         """)
+    html_parts.append("</div>")
     return "".join(html_parts)
     Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
     **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
+    💡 **Try the sample texts** to see how tokenizers handle different challenges like:
+    - Mixed languages and scripts
+    - Programming code and JSON
+    - Long compound words
+    - Special characters and emojis
+    - Technical terminology
     """)
     with gr.Row():
         with gr.Column(scale=2):
+            # Sample texts dropdown
+            sample_texts = gr.Dropdown(
+                choices=[
+                    "Custom text (enter below)",
+                    "Basic English: Hello world! How are you doing today?",
+                    "Programming code: def tokenize_text(input_str): return input_str.split()",
+                    "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
+                    "Numbers & symbols: The price is $123.45 (20% off) = $98.76 savings!",
+                    "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
+                    "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
+                    "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
+                    "Poetry: Roses are red, violets are blue, tokenizers split words, in ways quite new!",
+                    "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
+                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية",
+                    "Repetitive text: Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
+                    "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
+                    'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
+                    "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
+                ],
+                value="Custom text (enter below)",
+                label="Choose a sample text or enter your own",
+                interactive=True,
+            )
             text_input = gr.Textbox(
                 label="Text to tokenize",
+                placeholder="Enter your text here or select a sample above...",
                 lines=4,
                 value="Hello world! This is a test with some subwords and punctuation.",
             )
                     "bloom",
                     "aya-expanse",
                     "comma",
                     "tokenmonster",
                     "byt5",
                 ],
         with gr.Column():
             distribution_chart = gr.Plot(label="Token Type Distribution")
+    # Function to update text input when sample is selected
+    def update_text_from_sample(sample_choice):
+        if sample_choice == "Custom text (enter below)":
+            return gr.update()  # Don't change the text input
+        else:
+            # Extract the text after the colon
+            sample_text = (
+                sample_choice.split(": ", 1)[1]
+                if ": " in sample_choice
+                else sample_choice
+            )
+            return gr.update(value=sample_text)
+    # Update text input when sample is selected
+    sample_texts.change(
+        fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
+    )
     # Main comparison function
     def update_comparison(text, models, details):
     - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
     - **BERT/DistilBERT**: Google's models with WordPiece
     - **BLOOM**: BigScience's multilingual model with BPE
     - **Aya Expanse**: Cohere's multilingual model with SentencePiece
     - **Comma (Common Pile)**: Common Pile's model with BPE
+    - **Byt5**: Google's byte-level model
     ### Features
     - **Efficiency Ranking**: Compare token counts across models
 if __name__ == "__main__":
     demo.launch()