Spaces:
Sleeping
Sleeping
Gül Sena Altıntaş
committed on
Commit
·
452a924
1
Parent(s):
c02e89e
Added sample texts
Browse files
app.py
CHANGED
|
@@ -68,14 +68,17 @@ def generate_basic_comparison(results):
|
|
| 68 |
|
| 69 |
|
| 70 |
def generate_interactive_tokenization(results):
|
| 71 |
-
"""Generate HTML with hover highlighting across tokenizers"""
|
| 72 |
if not results:
|
| 73 |
return "<p>No tokenization results to display.</p>"
|
| 74 |
|
| 75 |
html_parts = []
|
|
|
|
|
|
|
| 76 |
html_parts.append("""
|
|
|
|
| 77 |
<style>
|
| 78 |
-
.tokenizer-
|
| 79 |
margin-bottom: 20px;
|
| 80 |
border: 1px solid #e0e0e0;
|
| 81 |
border-radius: 8px;
|
|
@@ -103,9 +106,10 @@ def generate_interactive_tokenization(results):
|
|
| 103 |
transition: all 0.2s ease;
|
| 104 |
position: relative;
|
| 105 |
font-size: 14px;
|
|
|
|
| 106 |
}
|
| 107 |
.token:hover {
|
| 108 |
-
transform: scale(1.
|
| 109 |
z-index: 10;
|
| 110 |
box-shadow: 0 2px 8px rgba(0,0,0,0.2);
|
| 111 |
}
|
|
@@ -113,7 +117,9 @@ def generate_interactive_tokenization(results):
|
|
| 113 |
background: #ff6b6b !important;
|
| 114 |
border-color: #e55353 !important;
|
| 115 |
color: white !important;
|
| 116 |
-
box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
|
|
|
|
|
|
|
| 117 |
}
|
| 118 |
.token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
|
| 119 |
.token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
|
|
@@ -135,35 +141,64 @@ def generate_interactive_tokenization(results):
|
|
| 135 |
font-size: 12px;
|
| 136 |
color: #666;
|
| 137 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
</style>
|
| 139 |
|
|
|
|
|
|
|
| 140 |
<script>
|
| 141 |
-
function
|
| 142 |
-
//
|
| 143 |
-
document.querySelectorAll('.token').forEach(token
|
| 144 |
token.classList.remove('highlighted');
|
| 145 |
});
|
| 146 |
|
| 147 |
-
// Highlight tokens
|
| 148 |
-
|
| 149 |
-
|
|
|
|
| 150 |
token.classList.add('highlighted');
|
|
|
|
| 151 |
}
|
| 152 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
}
|
| 154 |
|
| 155 |
function clearHighlights() {
|
| 156 |
-
document.querySelectorAll('.token').forEach(token
|
| 157 |
token.classList.remove('highlighted');
|
| 158 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
}
|
| 160 |
</script>
|
| 161 |
""")
|
| 162 |
|
|
|
|
| 163 |
for model, result in results.items():
|
| 164 |
if "error" in result:
|
| 165 |
html_parts.append(f"""
|
| 166 |
-
<div class="tokenizer-
|
| 167 |
<div class="tokenizer-header">{result["model"]} ❌</div>
|
| 168 |
<div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
|
| 169 |
</div>
|
|
@@ -171,7 +206,7 @@ def generate_interactive_tokenization(results):
|
|
| 171 |
continue
|
| 172 |
|
| 173 |
html_parts.append(f"""
|
| 174 |
-
<div class="tokenizer-
|
| 175 |
<div class="tokenizer-header">
|
| 176 |
{result["model"]}
|
| 177 |
<span class="token-stats">
|
|
@@ -183,13 +218,11 @@ def generate_interactive_tokenization(results):
|
|
| 183 |
<div class="token-display">
|
| 184 |
""")
|
| 185 |
|
| 186 |
-
# Add tokens with
|
| 187 |
subword_count = 0
|
| 188 |
for i, token in enumerate(result["tokens"]):
|
| 189 |
token_text = token["text"]
|
| 190 |
-
display_text = (
|
| 191 |
-
token_text if token_text.strip() else "·"
|
| 192 |
-
) # Show space as dot
|
| 193 |
|
| 194 |
# Determine token class
|
| 195 |
token_class = f"token token-{token['type']}"
|
|
@@ -197,21 +230,31 @@ def generate_interactive_tokenization(results):
|
|
| 197 |
token_class += " token-subword"
|
| 198 |
subword_count += 1
|
| 199 |
|
| 200 |
-
#
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
| 207 |
data-id="{token["id"]}"
|
| 208 |
data-position="{i}"
|
|
|
|
| 209 |
title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
|
| 210 |
-
onmouseover="
|
| 211 |
-
onmouseout="clearHighlights()"
|
| 212 |
-
|
| 213 |
-
</span>
|
| 214 |
-
""")
|
| 215 |
|
| 216 |
html_parts.append(f"""
|
| 217 |
</div>
|
|
@@ -222,6 +265,7 @@ def generate_interactive_tokenization(results):
|
|
| 222 |
</div>
|
| 223 |
""")
|
| 224 |
|
|
|
|
| 225 |
return "".join(html_parts)
|
| 226 |
|
| 227 |
|
|
@@ -420,13 +464,44 @@ with gr.Blocks(
|
|
| 420 |
Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
|
| 421 |
|
| 422 |
**Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 423 |
""")
|
| 424 |
|
| 425 |
with gr.Row():
|
| 426 |
with gr.Column(scale=2):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
text_input = gr.Textbox(
|
| 428 |
label="Text to tokenize",
|
| 429 |
-
placeholder="Enter your text here...",
|
| 430 |
lines=4,
|
| 431 |
value="Hello world! This is a test with some subwords and punctuation.",
|
| 432 |
)
|
|
@@ -445,8 +520,6 @@ with gr.Blocks(
|
|
| 445 |
"bloom",
|
| 446 |
"aya-expanse",
|
| 447 |
"comma",
|
| 448 |
-
"roberta",
|
| 449 |
-
"distilbert",
|
| 450 |
"tokenmonster",
|
| 451 |
"byt5",
|
| 452 |
],
|
|
@@ -486,11 +559,23 @@ with gr.Blocks(
|
|
| 486 |
with gr.Column():
|
| 487 |
distribution_chart = gr.Plot(label="Token Type Distribution")
|
| 488 |
|
| 489 |
-
#
|
| 490 |
-
def
|
| 491 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
| 494 |
|
| 495 |
# Main comparison function
|
| 496 |
def update_comparison(text, models, details):
|
|
@@ -523,10 +608,10 @@ with gr.Blocks(
|
|
| 523 |
- **Gemma-2**: Google's model with SentencePiece
|
| 524 |
- **Qwen3/2.5**: Alibaba's models with BPE
|
| 525 |
- **BERT/DistilBERT**: Google's models with WordPiece
|
| 526 |
-
- **RoBERTa**: Facebook's model with BPE
|
| 527 |
- **BLOOM**: BigScience's multilingual model with BPE
|
| 528 |
- **Aya Expanse**: Cohere's multilingual model with SentencePiece
|
| 529 |
- **Comma (Common Pile)**: Common Pile's model with BPE
|
|
|
|
| 530 |
|
| 531 |
### Features
|
| 532 |
- **Efficiency Ranking**: Compare token counts across models
|
|
@@ -538,5 +623,3 @@ with gr.Blocks(
|
|
| 538 |
|
| 539 |
if __name__ == "__main__":
|
| 540 |
demo.launch()
|
| 541 |
-
demo.launch()
|
| 542 |
-
demo.launch()
|
|
|
|
| 68 |
|
| 69 |
|
| 70 |
def generate_interactive_tokenization(results):
|
| 71 |
+
"""Generate HTML with working hover highlighting across tokenizers"""
|
| 72 |
if not results:
|
| 73 |
return "<p>No tokenization results to display.</p>"
|
| 74 |
|
| 75 |
html_parts = []
|
| 76 |
+
|
| 77 |
+
# Add styles first
|
| 78 |
html_parts.append("""
|
| 79 |
+
<div id="tokenizer-container">
|
| 80 |
<style>
|
| 81 |
+
.tokenizer-section {
|
| 82 |
margin-bottom: 20px;
|
| 83 |
border: 1px solid #e0e0e0;
|
| 84 |
border-radius: 8px;
|
|
|
|
| 106 |
transition: all 0.2s ease;
|
| 107 |
position: relative;
|
| 108 |
font-size: 14px;
|
| 109 |
+
user-select: none;
|
| 110 |
}
|
| 111 |
.token:hover {
|
| 112 |
+
transform: scale(1.05);
|
| 113 |
z-index: 10;
|
| 114 |
box-shadow: 0 2px 8px rgba(0,0,0,0.2);
|
| 115 |
}
|
|
|
|
| 117 |
background: #ff6b6b !important;
|
| 118 |
border-color: #e55353 !important;
|
| 119 |
color: white !important;
|
| 120 |
+
box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
|
| 121 |
+
transform: scale(1.1) !important;
|
| 122 |
+
z-index: 100 !important;
|
| 123 |
}
|
| 124 |
.token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
|
| 125 |
.token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
|
|
|
|
| 141 |
font-size: 12px;
|
| 142 |
color: #666;
|
| 143 |
}
|
| 144 |
+
.highlight-info {
|
| 145 |
+
position: fixed;
|
| 146 |
+
top: 10px;
|
| 147 |
+
right: 10px;
|
| 148 |
+
background: #333;
|
| 149 |
+
color: white;
|
| 150 |
+
padding: 8px 12px;
|
| 151 |
+
border-radius: 4px;
|
| 152 |
+
font-size: 12px;
|
| 153 |
+
display: none;
|
| 154 |
+
z-index: 1000;
|
| 155 |
+
}
|
| 156 |
</style>
|
| 157 |
|
| 158 |
+
<div class="highlight-info" id="highlight-info"></div>
|
| 159 |
+
|
| 160 |
<script>
|
| 161 |
+
function highlightTokens(targetText) {
|
| 162 |
+
// Clear all highlights
|
| 163 |
+
document.querySelectorAll('.token').forEach(function(token) {
|
| 164 |
token.classList.remove('highlighted');
|
| 165 |
});
|
| 166 |
|
| 167 |
+
// Highlight matching tokens
|
| 168 |
+
let count = 0;
|
| 169 |
+
document.querySelectorAll('.token').forEach(function(token) {
|
| 170 |
+
if (token.getAttribute('data-text') === targetText) {
|
| 171 |
token.classList.add('highlighted');
|
| 172 |
+
count++;
|
| 173 |
}
|
| 174 |
});
|
| 175 |
+
|
| 176 |
+
// Show info
|
| 177 |
+
const info = document.getElementById('highlight-info');
|
| 178 |
+
if (info) {
|
| 179 |
+
const displayText = targetText === ' ' ? '(space)' : targetText;
|
| 180 |
+
info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
|
| 181 |
+
info.style.display = 'block';
|
| 182 |
+
}
|
| 183 |
}
|
| 184 |
|
| 185 |
function clearHighlights() {
|
| 186 |
+
document.querySelectorAll('.token').forEach(function(token) {
|
| 187 |
token.classList.remove('highlighted');
|
| 188 |
});
|
| 189 |
+
const info = document.getElementById('highlight-info');
|
| 190 |
+
if (info) {
|
| 191 |
+
info.style.display = 'none';
|
| 192 |
+
}
|
| 193 |
}
|
| 194 |
</script>
|
| 195 |
""")
|
| 196 |
|
| 197 |
+
# Generate tokenizer sections with inline event handlers
|
| 198 |
for model, result in results.items():
|
| 199 |
if "error" in result:
|
| 200 |
html_parts.append(f"""
|
| 201 |
+
<div class="tokenizer-section">
|
| 202 |
<div class="tokenizer-header">{result["model"]} ❌</div>
|
| 203 |
<div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
|
| 204 |
</div>
|
|
|
|
| 206 |
continue
|
| 207 |
|
| 208 |
html_parts.append(f"""
|
| 209 |
+
<div class="tokenizer-section">
|
| 210 |
<div class="tokenizer-header">
|
| 211 |
{result["model"]}
|
| 212 |
<span class="token-stats">
|
|
|
|
| 218 |
<div class="token-display">
|
| 219 |
""")
|
| 220 |
|
| 221 |
+
# Add tokens with inline event handlers
|
| 222 |
subword_count = 0
|
| 223 |
for i, token in enumerate(result["tokens"]):
|
| 224 |
token_text = token["text"]
|
| 225 |
+
display_text = token_text if token_text.strip() else "·"
|
|
|
|
|
|
|
| 226 |
|
| 227 |
# Determine token class
|
| 228 |
token_class = f"token token-{token['type']}"
|
|
|
|
| 230 |
token_class += " token-subword"
|
| 231 |
subword_count += 1
|
| 232 |
|
| 233 |
+
# Create unique identifier for this token occurrence
|
| 234 |
+
token_id = f"token_{model}_{i}"
|
| 235 |
+
|
| 236 |
+
# Escape text for HTML and JavaScript - be very careful with quotes
|
| 237 |
+
escaped_text = (
|
| 238 |
+
token_text.replace("\\", "\\\\")
|
| 239 |
+
.replace("'", "\\'")
|
| 240 |
+
.replace('"', '\\"')
|
| 241 |
+
.replace("\n", "\\n")
|
| 242 |
+
.replace("\r", "\\r")
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
|
| 246 |
|
| 247 |
+
# Use inline event handlers that definitely work in Gradio
|
| 248 |
+
html_parts.append(f"""<span class="{token_class}"
|
| 249 |
+
id="{token_id}"
|
| 250 |
+
data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
|
| 251 |
data-id="{token["id"]}"
|
| 252 |
data-position="{i}"
|
| 253 |
+
data-model="{model}"
|
| 254 |
title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
|
| 255 |
+
onmouseover="highlightTokens('{escaped_text}')"
|
| 256 |
+
onmouseout="clearHighlights()"
|
| 257 |
+
onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
|
|
|
|
|
|
|
| 258 |
|
| 259 |
html_parts.append(f"""
|
| 260 |
</div>
|
|
|
|
| 265 |
</div>
|
| 266 |
""")
|
| 267 |
|
| 268 |
+
html_parts.append("</div>")
|
| 269 |
return "".join(html_parts)
|
| 270 |
|
| 271 |
|
|
|
|
| 464 |
Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
|
| 465 |
|
| 466 |
**Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
|
| 467 |
+
|
| 468 |
+
💡 **Try the sample texts** to see how tokenizers handle different challenges like:
|
| 469 |
+
- Mixed languages and scripts
|
| 470 |
+
- Programming code and JSON
|
| 471 |
+
- Long compound words
|
| 472 |
+
- Special characters and emojis
|
| 473 |
+
- Technical terminology
|
| 474 |
""")
|
| 475 |
|
| 476 |
with gr.Row():
|
| 477 |
with gr.Column(scale=2):
|
| 478 |
+
# Sample texts dropdown
|
| 479 |
+
sample_texts = gr.Dropdown(
|
| 480 |
+
choices=[
|
| 481 |
+
"Custom text (enter below)",
|
| 482 |
+
"Basic English: Hello world! How are you doing today?",
|
| 483 |
+
"Programming code: def tokenize_text(input_str): return input_str.split()",
|
| 484 |
+
"Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
|
| 485 |
+
"Numbers & symbols: The price is $123.45 (20% off) = $98.76 savings!",
|
| 486 |
+
"Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
|
| 487 |
+
"Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
|
| 488 |
+
"Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
|
| 489 |
+
"Poetry: Roses are red, violets are blue, tokenizers split words, in ways quite new!",
|
| 490 |
+
"Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
|
| 491 |
+
"Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية",
|
| 492 |
+
"Repetitive text: Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
|
| 493 |
+
"Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
|
| 494 |
+
'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
|
| 495 |
+
"Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
|
| 496 |
+
],
|
| 497 |
+
value="Custom text (enter below)",
|
| 498 |
+
label="Choose a sample text or enter your own",
|
| 499 |
+
interactive=True,
|
| 500 |
+
)
|
| 501 |
+
|
| 502 |
text_input = gr.Textbox(
|
| 503 |
label="Text to tokenize",
|
| 504 |
+
placeholder="Enter your text here or select a sample above...",
|
| 505 |
lines=4,
|
| 506 |
value="Hello world! This is a test with some subwords and punctuation.",
|
| 507 |
)
|
|
|
|
| 520 |
"bloom",
|
| 521 |
"aya-expanse",
|
| 522 |
"comma",
|
|
|
|
|
|
|
| 523 |
"tokenmonster",
|
| 524 |
"byt5",
|
| 525 |
],
|
|
|
|
| 559 |
with gr.Column():
|
| 560 |
distribution_chart = gr.Plot(label="Token Type Distribution")
|
| 561 |
|
| 562 |
+
# Function to update text input when sample is selected
|
| 563 |
+
def update_text_from_sample(sample_choice):
    """Turn a sample-dropdown selection into an update for the text box.

    Selecting "Custom text (enter below)" produces an empty update, so the
    text the user has already typed is left alone. Any other entry is
    treated as "<label>: <sample text>" and only the portion after the
    first ": " is pushed into the text box; an entry without ": " is
    passed through unchanged.
    """
    if sample_choice == "Custom text (enter below)":
        # Empty update: leave the current text box contents untouched.
        return gr.update()

    # Cut at the first ": " only -- the sample itself may contain more of them.
    _label, separator, remainder = sample_choice.partition(": ")
    chosen_text = remainder if separator else sample_choice
    return gr.update(value=chosen_text)
|
| 574 |
|
| 575 |
+
# Update text input when sample is selected
|
| 576 |
+
sample_texts.change(
|
| 577 |
+
fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
|
| 578 |
+
)
|
| 579 |
|
| 580 |
# Main comparison function
|
| 581 |
def update_comparison(text, models, details):
|
|
|
|
| 608 |
- **Gemma-2**: Google's model with SentencePiece
|
| 609 |
- **Qwen3/2.5**: Alibaba's models with BPE
|
| 610 |
- **BERT/DistilBERT**: Google's models with WordPiece
|
|
|
|
| 611 |
- **BLOOM**: BigScience's multilingual model with BPE
|
| 612 |
- **Aya Expanse**: Cohere's multilingual model with SentencePiece
|
| 613 |
- **Comma (Common Pile)**: Common Pile's model with BPE
|
| 614 |
+
- **ByT5**: Google's byte-level model
|
| 615 |
|
| 616 |
### Features
|
| 617 |
- **Efficiency Ranking**: Compare token counts across models
|
|
|
|
| 623 |
|
| 624 |
if __name__ == "__main__":
|
| 625 |
demo.launch()
|
|
|
|
|
|