Commit 37a99cb by Gül Sena Altıntaş
Parent(s): 0c7d05e

Add normalization
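The commit wires Unicode normalization into the tokenizer comparison UI. For background, a minimal standalone sketch (standard-library `unicodedata` only; the sample string is an illustration, not taken from the app) of why normalization forms matter to tokenizers:

```python
import unicodedata

s = "café"                               # precomposed: 'é' is U+00E9
nfd = unicodedata.normalize("NFD", s)    # decomposed: 'e' + combining accent U+0301
print(len(s), len(nfd))                  # 4 5 -- same rendering, different code points
print(unicodedata.normalize("NFC", nfd) == s)  # True: NFC recomposes the pair

# The "strip_accents" mode added below drops combining marks (category Mn) after NFD:
print("".join(c for c in nfd if unicodedata.category(c) != "Mn"))  # cafe
```

Two visually identical strings can therefore produce different token sequences; the normalized-vs-original view added in this commit is meant to surface exactly that.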
app.py CHANGED
```diff
@@ -5,7 +5,12 @@ import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 
-from utils import
+from utils import (
+    get_normalization_methods,
+    normalize_text,
+    tokenize_with_hf,
+    tokenize_with_tiktoken,
+)
 
 
 def compare_tokenizers(text, selected_models, show_details=False):
@@ -315,6 +320,34 @@ def generate_token_ids_display(results):
     return "\n".join(output)
 
 
+def compare_with_normalization(
+    text, selected_models, normalization_method, show_details=False
+):
+    """Compare tokenizers with optional normalization"""
+    normalized_text = normalize_text(text, normalization_method)
+    print(
+        "[DEBUG] Before normalization:", text, "\nAfter normalization:", normalized_text
+    )
+
+    # Get both original and normalized results
+    original_results = {}
+    normalized_results = {}
+
+    for model in selected_models:
+        if model in ["gpt-4", "gpt-2"]:
+            original_results[model] = tokenize_with_tiktoken(text, model)
+            if normalization_method != "none":
+                normalized_results[model] = tokenize_with_tiktoken(
+                    normalized_text, model
+                )
+        else:
+            original_results[model] = tokenize_with_hf(text, model)
+            if normalization_method != "none":
+                normalized_results[model] = tokenize_with_hf(normalized_text, model)
+
+    return original_results, normalized_results, normalized_text
+
+
 def generate_detailed_analysis(results):
     if not results or len(results) < 2:
         return "Need at least 2 tokenizers for detailed analysis."
@@ -479,17 +512,22 @@ with gr.Blocks(
             sample_texts = gr.Dropdown(
                 choices=[
                     "Custom text (enter below)",
-                    "
-                    "
+                    "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
+                    "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
+                    "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
+                    "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
+                    "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
+                    "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
+                    "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
+                    "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
+                    "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
+                    "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
                     "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
-                    "Numbers & symbols: The price is $123.45 (20% off) = $98.76 savings!",
                     "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
                     "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
                     "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
-                    "Poetry: Roses are red, violets are blue, tokenizers split words, in ways quite new!",
                     "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
-                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学
-                    "Repetitive text: Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
+                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
                     "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
                     'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
                     "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
@@ -505,30 +543,41 @@ with gr.Blocks(
                 lines=4,
                 value="Hello world! This is a test with some subwords and punctuation.",
            )
-
         with gr.Column(scale=1):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            with gr.Tabs():
+                with gr.TabItem("Models"):
+                    model_selector = gr.CheckboxGroup(
+                        choices=[
+                            "gpt-4",
+                            "gpt-2",
+                            "llama-2",
+                            "llama-3",
+                            "gemma-2",
+                            "qwen3",
+                            "qwen2.5",
+                            "bert",
+                            "bloom",
+                            "aya-expanse",
+                            "comma",
+                            "tokenmonster",
+                            "byt5",
+                        ],
+                        value=["gpt-4", "llama-3", "gpt-2"],
+                        label="Select tokenizers to compare",
+                    )
+                    show_details = gr.Checkbox(
+                        label="Show detailed analysis", value=False
+                    )
+
+                with gr.TabItem("Normalization"):
+                    normalization_method = gr.Dropdown(
+                        choices=[method[0] for method in get_normalization_methods()],
+                        value="none",
+                        label="Normalization Method",
+                    )
+                    show_normalization = gr.Checkbox(
+                        label="Show normalized results", value=False
+                    )
     with gr.Row():
         with gr.Column():
             efficiency_output = gr.Markdown(
@@ -542,7 +591,13 @@ with gr.Blocks(
                 label="Interactive Tokenization (Hover to highlight across tokenizers)",
                 value="<p>Enter text above to see interactive tokenization...</p>",
             )
-
+    with gr.Row():
+        with gr.Column():
+            normalized_display = gr.HTML(
+                label="Normalized Tokenization",
+                value="<p>Enable normalization to see results...</p>",
+                visible=False,
+            )
     with gr.Row():
         with gr.Column():
             token_ids_output = gr.Markdown(
@@ -578,6 +633,50 @@ with gr.Blocks(
             )
 
     # Main comparison function
+    def update_comparison_with_norm(text, models, details, norm_method, show_norm):
+        if norm_method == "none" or not show_norm:
+            # Original behavior
+            (
+                efficiency,
+                tokenization_html,
+                token_ids,
+                detailed,
+                eff_chart,
+                dist_chart,
+            ) = compare_tokenizers(text, models, details)
+            return (
+                efficiency,
+                tokenization_html,
+                token_ids,
+                detailed,
+                eff_chart,
+                dist_chart,
+            )
+        else:
+            # With normalization
+            original_results, normalized_results, normalized_text = (
+                compare_with_normalization(text, models, norm_method, details)
+            )
+
+            # Generate displays for both
+            orig_eff, orig_html, orig_ids = generate_basic_comparison(original_results)
+            norm_eff, norm_html, norm_ids = generate_basic_comparison(
+                normalized_results
+            )
+            print(normalized_text)
+
+            # Combine or show separately
+            combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
+
+            return (
+                orig_eff,
+                gr.update(value=combined_html, visible=True),
+                orig_ids,
+                "",
+                None,
+                None,
+            )
+
     def update_comparison(text, models, details):
         efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart = (
             compare_tokenizers(text, models, details)
@@ -585,10 +684,22 @@ with gr.Blocks(
         return efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart
 
     # Auto-update on changes
-    for component in [
+    for component in [
+        text_input,
+        model_selector,
+        show_details,
+        normalization_method,
+        show_normalization,
+    ]:
         component.change(
-            fn=
-            inputs=[
+            fn=update_comparison_with_norm,
+            inputs=[
+                text_input,
+                model_selector,
+                show_details,
+                normalization_method,
+                show_normalization,
+            ],
             outputs=[
                 efficiency_output,
                 tokenization_display,
@@ -604,7 +715,7 @@ with gr.Blocks(
     ### About the Models
 
     - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
-    - **LLaMA-2/3**: Meta's models using SentencePiece
+    - **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
    - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
     - **BERT/DistilBERT**: Google's models with WordPiece
```
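For a rough standalone sense of what the new `compare_with_normalization` path reports, here is a sketch using `tiktoken` directly (the `gpt-4` branch above). The `cl100k_base` encoding choice is an assumption for illustration; the sample string is taken from the debug comment in utils.py below:

```python
import unicodedata

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # encoding behind the "gpt-4" option

text = "Héllò hôw are ü?"
stripped = "".join(
    c for c in unicodedata.normalize("NFD", text)
    if unicodedata.category(c) != "Mn"
)  # what the "strip_accents" method does

# Accented text typically needs more tokens than its ASCII-folded form;
# the app presents this as an efficiency difference between the two views.
print(len(enc.encode(text)), len(enc.encode(stripped)))
```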
utils.py CHANGED
```diff
@@ -1,5 +1,6 @@
 import os
 import re
+import unicodedata
 
 import tiktoken
 from transformers import AutoTokenizer
@@ -116,6 +117,7 @@ def tokenize_with_hf(text, model):
         )
         token_ids = encoding["input_ids"]
         tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
 
         for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
             token_type = get_token_type(token_text)
@@ -163,3 +165,45 @@ def tokenize_with_hf(text, model):
         "vocab_size": 0,
         "error": error_msg,
     }
+
+
+def normalize_text(text, method):
+    """Apply normalization method to text"""
+    if method == "none":
+        return text
+    elif method == "lowercase":
+        return text.lower()
+    elif method == "nfc":
+        return unicodedata.normalize("NFC", text)
+    elif method == "nfd":
+        return unicodedata.normalize("NFD", text)
+    elif method == "nfkc":
+        return unicodedata.normalize("NFKC", text)
+    elif method == "nfkd":
+        return unicodedata.normalize("NFKD", text)
+    elif method == "strip_accents":
+        return "".join(
+            c
+            for c in unicodedata.normalize("NFD", text)
+            if unicodedata.category(c) != "Mn"
+        )
+    elif method == "strip_punctuation":
+        return re.sub(r"[^\w\s]", "", text)
+    elif method == "whitespace_normalize":
+        return " ".join(text.split())
+    return text
+
+
+def get_normalization_methods():
+    """Return available normalization methods"""
+    return [
+        ("none", "No normalization"),
+        ("lowercase", "Lowercase"),
+        ("nfc", "Unicode NFC (Canonical)"),
+        ("nfd", "Unicode NFD (Decomposed)"),
+        ("nfkc", "Unicode NFKC (Compatible)"),
+        ("nfkd", "Unicode NFKD (Compatible Decomposed)"),
+        ("strip_accents", "Remove Accents"),
+        ("strip_punctuation", "Remove Punctuation"),
+        ("whitespace_normalize", "Normalize Whitespace"),
+    ]
```
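A usage sketch for the new helpers, assuming the Space's `utils.py` is importable; the sample string is illustrative, and the expected outputs in the comments follow directly from the code above:

```python
from utils import get_normalization_methods, normalize_text

for method, label in get_normalization_methods():
    print(f"{label:35s} {normalize_text('Héllo,  Wörld!', method)!r}")

# Representative results:
#   Remove Accents        -> 'Hello,  World!'  (combining marks dropped after NFD)
#   Remove Punctuation    -> 'Héllo  Wörld'    (\w keeps accented letters in Python 3)
#   Normalize Whitespace  -> 'Héllo, Wörld!'   (runs of spaces collapsed)
```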