Commit 37a99cb by Gül Sena Altıntaş
Parent(s): 0c7d05e

Add normalization
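The commit wires Unicode normalization into the tokenizer comparison UI. For background, a minimal standalone sketch (standard-library `unicodedata` only; the sample string is an illustration, not taken from the app) of why normalization forms matter to tokenizers:

```python
import unicodedata

s = "café"                               # precomposed: 'é' is U+00E9
nfd = unicodedata.normalize("NFD", s)    # decomposed: 'e' + combining accent U+0301
print(len(s), len(nfd))                  # 4 5 -- same rendering, different code points
print(unicodedata.normalize("NFC", nfd) == s)  # True: NFC recomposes the pair

# The "strip_accents" mode added below drops combining marks (category Mn) after NFD:
print("".join(c for c in nfd if unicodedata.category(c) != "Mn"))  # cafe
```

Two visually identical strings can therefore produce different token sequences; the normalized-vs-original view added in this commit is meant to surface exactly that.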
app.py CHANGED
```diff
@@ -5,7 +5,12 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 
-from utils import
+from utils import (
+    get_normalization_methods,
+    normalize_text,
+    tokenize_with_hf,
+    tokenize_with_tiktoken,
+)
 
 
 def compare_tokenizers(text, selected_models, show_details=False):
@@ -315,6 +320,34 @@ def generate_token_ids_display(results):
     return "\n".join(output)
 
 
+def compare_with_normalization(
+    text, selected_models, normalization_method, show_details=False
+):
+    """Compare tokenizers with optional normalization"""
+    normalized_text = normalize_text(text, normalization_method)
+    print(
+        "[DEBUG] Before normalization:", text, "\nAfter normalization:", normalized_text
+    )
+
+    # Get both original and normalized results
+    original_results = {}
+    normalized_results = {}
+
+    for model in selected_models:
+        if model in ["gpt-4", "gpt-2"]:
+            original_results[model] = tokenize_with_tiktoken(text, model)
+            if normalization_method != "none":
+                normalized_results[model] = tokenize_with_tiktoken(
+                    normalized_text, model
+                )
+        else:
+            original_results[model] = tokenize_with_hf(text, model)
+            if normalization_method != "none":
+                normalized_results[model] = tokenize_with_hf(normalized_text, model)
+
+    return original_results, normalized_results, normalized_text
+
+
 def generate_detailed_analysis(results):
     if not results or len(results) < 2:
         return "Need at least 2 tokenizers for detailed analysis."
@@ -479,17 +512,22 @@ with gr.Blocks(
             sample_texts = gr.Dropdown(
                 choices=[
                     "Custom text (enter below)",
-                    "
-                    "
+                    "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
+                    "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
+                    "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
+                    "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
+                    "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
+                    "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
+                    "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
+                    "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
+                    "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
+                    "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
                     "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
-                    "Numbers & symbols: The price is $123.45 (20% off) = $98.76 savings!",
                     "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
                     "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
                     "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
-                    "Poetry: Roses are red, violets are blue, tokenizers split words, in ways quite new!",
                     "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
-                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学
-                    "Repetitive text: Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
+                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
                     "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
                     'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
                     "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
@@ -505,30 +543,41 @@ with gr.Blocks(
                 lines=4,
                 value="Hello world! This is a test with some subwords and punctuation.",
            )
-
         with gr.Column(scale=1):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with gr.Tabs():
+                with gr.TabItem("Models"):
+                    model_selector = gr.CheckboxGroup(
+                        choices=[
+                            "gpt-4",
+                            "gpt-2",
+                            "llama-2",
+                            "llama-3",
+                            "gemma-2",
+                            "qwen3",
+                            "qwen2.5",
+                            "bert",
+                            "bloom",
+                            "aya-expanse",
+                            "comma",
+                            "tokenmonster",
+                            "byt5",
+                        ],
+                        value=["gpt-4", "llama-3", "gpt-2"],
+                        label="Select tokenizers to compare",
+                    )
+                    show_details = gr.Checkbox(
+                        label="Show detailed analysis", value=False
+                    )
+
+                with gr.TabItem("Normalization"):
+                    normalization_method = gr.Dropdown(
+                        choices=[method[0] for method in get_normalization_methods()],
+                        value="none",
+                        label="Normalization Method",
+                    )
+                    show_normalization = gr.Checkbox(
+                        label="Show normalized results", value=False
+                    )
     with gr.Row():
         with gr.Column():
             efficiency_output = gr.Markdown(
@@ -542,7 +591,13 @@ with gr.Blocks(
                 label="Interactive Tokenization (Hover to highlight across tokenizers)",
                 value="<p>Enter text above to see interactive tokenization...</p>",
             )
-
+    with gr.Row():
+        with gr.Column():
+            normalized_display = gr.HTML(
+                label="Normalized Tokenization",
+                value="<p>Enable normalization to see results...</p>",
+                visible=False,
+            )
     with gr.Row():
         with gr.Column():
             token_ids_output = gr.Markdown(
@@ -578,6 +633,50 @@ with gr.Blocks(
             )
 
     # Main comparison function
+    def update_comparison_with_norm(text, models, details, norm_method, show_norm):
+        if norm_method == "none" or not show_norm:
+            # Original behavior
+            (
+                efficiency,
+                tokenization_html,
+                token_ids,
+                detailed,
+                eff_chart,
+                dist_chart,
+            ) = compare_tokenizers(text, models, details)
+            return (
+                efficiency,
+                tokenization_html,
+                token_ids,
+                detailed,
+                eff_chart,
+                dist_chart,
+            )
+        else:
+            # With normalization
+            original_results, normalized_results, normalized_text = (
+                compare_with_normalization(text, models, norm_method, details)
+            )
+
+            # Generate displays for both
+            orig_eff, orig_html, orig_ids = generate_basic_comparison(original_results)
+            norm_eff, norm_html, norm_ids = generate_basic_comparison(
+                normalized_results
+            )
+            print(normalized_text)
+
+            # Combine or show separately
+            combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
+
+            return (
+                orig_eff,
+                gr.update(value=combined_html, visible=True),
+                orig_ids,
+                "",
+                None,
+                None,
+            )
+
     def update_comparison(text, models, details):
         efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart = (
             compare_tokenizers(text, models, details)
@@ -585,10 +684,22 @@ with gr.Blocks(
         return efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart
 
     # Auto-update on changes
-    for component in [
+    for component in [
+        text_input,
+        model_selector,
+        show_details,
+        normalization_method,
+        show_normalization,
+    ]:
         component.change(
-            fn=
-            inputs=[
+            fn=update_comparison_with_norm,
+            inputs=[
+                text_input,
+                model_selector,
+                show_details,
+                normalization_method,
+                show_normalization,
+            ],
             outputs=[
                 efficiency_output,
                 tokenization_display,
@@ -604,7 +715,7 @@ with gr.Blocks(
     ### About the Models
 
     - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
-    - **LLaMA-2/3**: Meta's models using SentencePiece
+    - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
    - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
     - **BERT/DistilBERT**: Google's models with WordPiece
```
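For a rough standalone sense of what the new `compare_with_normalization` path reports, here is a sketch using `tiktoken` directly (the `gpt-4` branch above). The `cl100k_base` encoding choice is an assumption for illustration; the sample string is taken from the debug comment in utils.py below:

```python
import unicodedata

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # encoding behind the "gpt-4" option

text = "Héllò hôw are ü?"
stripped = "".join(
    c for c in unicodedata.normalize("NFD", text)
    if unicodedata.category(c) != "Mn"
)  # what the "strip_accents" method does

# Accented text typically needs more tokens than its ASCII-folded form;
# the app presents this as an efficiency difference between the two views.
print(len(enc.encode(text)), len(enc.encode(stripped)))
```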
utils.py CHANGED
```diff
@@ -1,5 +1,6 @@
 import os
 import re
+import unicodedata
 
 import tiktoken
 from transformers import AutoTokenizer
@@ -116,6 +117,7 @@ def tokenize_with_hf(text, model):
         )
         token_ids = encoding["input_ids"]
         tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
 
         for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
             token_type = get_token_type(token_text)
@@ -163,3 +165,45 @@ def tokenize_with_hf(text, model):
         "vocab_size": 0,
         "error": error_msg,
     }
+
+
+def normalize_text(text, method):
+    """Apply normalization method to text"""
+    if method == "none":
+        return text
+    elif method == "lowercase":
+        return text.lower()
+    elif method == "nfc":
+        return unicodedata.normalize("NFC", text)
+    elif method == "nfd":
+        return unicodedata.normalize("NFD", text)
+    elif method == "nfkc":
+        return unicodedata.normalize("NFKC", text)
+    elif method == "nfkd":
+        return unicodedata.normalize("NFKD", text)
+    elif method == "strip_accents":
+        return "".join(
+            c
+            for c in unicodedata.normalize("NFD", text)
+            if unicodedata.category(c) != "Mn"
+        )
+    elif method == "strip_punctuation":
+        return re.sub(r"[^\w\s]", "", text)
+    elif method == "whitespace_normalize":
+        return " ".join(text.split())
+    return text
+
+
+def get_normalization_methods():
+    """Return available normalization methods"""
+    return [
+        ("none", "No normalization"),
+        ("lowercase", "Lowercase"),
+        ("nfc", "Unicode NFC (Canonical)"),
+        ("nfd", "Unicode NFD (Decomposed)"),
+        ("nfkc", "Unicode NFKC (Compatible)"),
+        ("nfkd", "Unicode NFKD (Compatible Decomposed)"),
+        ("strip_accents", "Remove Accents"),
+        ("strip_punctuation", "Remove Punctuation"),
+        ("whitespace_normalize", "Normalize Whitespace"),
+    ]
```
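A usage sketch for the new helpers, assuming the Space's `utils.py` is importable; the sample string is illustrative, and the expected outputs in the comments follow directly from the code above:

```python
from utils import get_normalization_methods, normalize_text

for method, label in get_normalization_methods():
    print(f"{label:35s} {normalize_text('Héllo,  Wörld!', method)!r}")

# Representative results:
#   Remove Accents        -> 'Hello,  World!'  (combining marks dropped after NFD)
#   Remove Punctuation    -> 'Héllo  Wörld'    (\w keeps accented letters in Python 3)
#   Normalize Whitespace  -> 'Héllo, Wörld!'   (runs of spaces collapsed)
```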