# Tokenizer comparison app — Gül Sena Altıntaş
# (Hugging Face viewer metadata preserved as a comment: "Added additional
# tokenizers", commit 3a08f05, raw/history/blame, 16.3 kB)
import json
import os
import re
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tiktoken
from transformers import AutoTokenizer
# Model mappings
# Maps the short model keys used throughout the UI to Hugging Face hub repo
# ids passed to AutoTokenizer.from_pretrained. Keys absent here ("gpt-4",
# "gpt-2") are handled by tiktoken instead.
MODEL_MAP = {
    "llama-2": "meta-llama/Llama-2-7b-hf",
    "llama-3": "meta-llama/Llama-3.2-1B",
    "gemma-2": "google/gemma-2-2b",
    "qwen3": "Qwen/Qwen3-0.6B",
    "qwen2.5": "Qwen/Qwen2.5-0.5B",
    "bert": "bert-base-uncased",
    "bloom": "bigscience/bloom-560m",
    "aya-expanse": "CohereForAI/aya-expanse-8b",
    "comma": "common-pile/comma-v0.1-2tgpt2",
    "byte-level": "google/byt5-small",
    "tokenmonster": "alasdairforsythe/tokenmonster",
}
# Per-tokenizer display metadata: human-readable name, vocabulary size, and
# encoding-scheme label shown in the results panes.
# NOTE(review): these values are hard-coded for display only and may drift
# from the actual tokenizers loaded at runtime — verify against model cards.
TOKENIZER_INFO = {
    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
    "aya-expanse": {
        "name": "Aya Expanse",
        "vocab_size": 256000,
        "encoding": "SentencePiece",
    },
    # Empty "encoding" strings below render as blank in the UI.
    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
}
def get_token_type(token_text):
    """Classify a decoded token string into a coarse display category.

    Returns one of: "special", "whitespace", "word", "number",
    "punctuation", or "mixed".
    """
    # Special markers (<s>, <pad>, <|endoftext|>, ...) are checked FIRST: in
    # the previous ordering a purely-punctuation marker such as "<>" matched
    # the punctuation pattern before the special-token test was reached.
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    if re.fullmatch(r"\s+", token_text):
        return "whitespace"
    if re.fullmatch(r"[a-zA-Z]+", token_text):
        return "word"
    if re.fullmatch(r"\d+", token_text):
        return "number"
    if re.fullmatch(r"[^\w\s]+", token_text):
        return "punctuation"
    return "mixed"
def is_subword(token_text, model, is_first):
    """Heuristically decide whether a token continues a preceding word.

    SentencePiece-style tokenizers mark word STARTS with "▁"; BERT
    WordPiece marks CONTINUATIONS with "##"; byte-level BPE tokenizers
    mark word starts with a leading space. The first token of a text is
    never a subword.
    """
    # gemma-2 and aya-expanse are SentencePiece per TOKENIZER_INFO; the
    # previous list omitted them, so they fell into the BPE branch.
    if model in ["llama-2", "llama-3", "qwen3", "gemma-2", "aya-expanse"]:
        return not token_text.startswith("▁") and not is_first
    if model == "bert":
        return token_text.startswith("##")
    # BPE-style default: anything without a leading space that isn't the
    # first token (empty strings are treated as non-subwords).
    return not token_text.startswith(" ") and not is_first and len(token_text) > 0
def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with tiktoken and return a stats/token-detail dict.

    model: "gpt-4" selects the cl100k_base encoding; any other key falls
    back to the gpt2 encoding. The returned dict mirrors the shape produced
    by tokenize_with_hf so downstream rendering treats them uniformly.
    """
    # (Removed a dead `current_pos` accumulator that was never read.)
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)
    token_data = []
    for i, token_id in enumerate(tokens):
        token_text = enc.decode([token_id])
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": get_token_type(token_text),
                "is_subword": is_subword(token_text, model, i == 0),
                "bytes": len(token_text.encode("utf-8")),
                "position": i,
            }
        )
    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        # Characters per token; higher means more compact tokenization.
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }
# Cache of loaded HF tokenizers, keyed by hub repo id, so repeated
# comparisons don't re-instantiate them on every keystroke
# (AutoTokenizer.from_pretrained is comparatively expensive per call).
_HF_TOKENIZERS = {}


def tokenize_with_hf(text, model):
    """Tokenize *text* with a Hugging Face tokenizer and return stats.

    Unknown model keys fall back to "gpt2". On any failure this returns a
    result dict carrying an "error" key instead of raising, so the UI can
    render the failure inline.
    """
    try:
        model_name = MODEL_MAP.get(model, "gpt2")
        tokenizer = _HF_TOKENIZERS.get(model_name)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
            )
            _HF_TOKENIZERS[model_name] = tokenizer
        tokens = tokenizer.encode(text)
        token_data = []
        for i, token_id in enumerate(tokens):
            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": get_token_type(token_text),
                    "is_subword": is_subword(token_text, model, i == 0),
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(tokens),
            "tokens": token_data,
            "compression_ratio": len(text) / len(tokens) if tokens else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        # Deliberate best-effort: surface the error in the result rather
        # than crashing the whole comparison.
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": 0,
            "tokens": [],
            "compression_ratio": 0,
            "encoding": "Error",
            "vocab_size": 0,
            "error": str(e),
        }
def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with every selected model and build all UI outputs.

    Returns a 4-tuple: (basic markdown, detailed markdown, efficiency
    figure, token-distribution figure). Blank input short-circuits with a
    prompt message and empty charts.
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", None, None

    # gpt-4/gpt-2 go through tiktoken; everything else through HF.
    tiktoken_keys = {"gpt-4", "gpt-2"}
    results = {
        key: (
            tokenize_with_tiktoken(text, key)
            if key in tiktoken_keys
            else tokenize_with_hf(text, key)
        )
        for key in selected_models
    }

    detailed = generate_detailed_analysis(results) if show_details else ""
    return (
        generate_basic_comparison(results),
        detailed,
        create_efficiency_chart(results),
        create_token_distribution_chart(results),
    )
def generate_basic_comparison(results):
    """Render the efficiency ranking and per-model token previews as Markdown.

    *results* maps model keys to the dicts produced by the tokenize_* helpers;
    entries carrying an "error" key are shown as failures.
    """
    if not results:
        return "No results to display."

    lines = ["## 🏆 Efficiency Ranking (Fewer tokens = more efficient)"]

    # Rank models by token count, fewest (most efficient) first.
    ranked = sorted(results.items(), key=lambda item: item[1]["token_count"])
    for rank, (key, result) in enumerate(ranked, start=1):
        if "error" in result:
            lines.append(
                f"{rank}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            lines.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    lines.append("\n## 🔤 Tokenization Results")
    type_marker = {"word": "🔤", "number": "🔢", "punctuation": "❗"}
    for key, result in results.items():
        if "error" in result:
            lines.append(f"\n### ❌ {result['model']} - Error: {result['error']}")
            continue
        lines.append(f"\n### {result['model']}")
        lines.append(f"- **Tokens**: {result['token_count']}")
        lines.append(f"- **Vocab Size**: {result['vocab_size']:,}")
        lines.append(f"- **Encoding**: {result['encoding']}")
        lines.append(f"- **Compression**: {result['compression_ratio']:.2f}x")

        # Preview only the first 20 tokens, tagging each with a type marker.
        preview = result["tokens"][:20]
        shown = []
        subwords_shown = 0
        for token in preview:
            text = token["text"]
            if text == " ":
                text = "·"  # Space indicator
            elif text.strip() == "":
                text = "⎵"  # Empty token indicator
            if token["is_subword"]:
                shown.append(f"🔸`{text}`")
                subwords_shown += 1
            else:
                shown.append(f"{type_marker.get(token['type'], '')}`{text}`")
        if len(result["tokens"]) > 20:
            shown.append(f"... (+{len(result['tokens']) - 20} more)")
        # Note: the subword count covers only the previewed tokens.
        lines.append(f"- **Subwords**: {subwords_shown}/{len(preview)}")
        lines.append(f"- **Tokens**: {' '.join(shown)}")
    return "\n".join(lines)
def generate_detailed_analysis(results):
    """Render cross-tokenizer stats: shared tokens, type mix, subword ratios.

    Requires at least two result entries; errored entries are skipped.
    """
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    lines = ["## 🔍 Detailed Analysis"]

    # Tokens (by decoded text) produced by every non-errored tokenizer.
    token_sets = [
        {tok["text"] for tok in res["tokens"]}
        for res in results.values()
        if "error" not in res
    ]
    if token_sets:
        shared = set.intersection(*token_sets)
        lines.append(f"\n### Common Tokens ({len(shared)})")
        if shared:
            lines.append(
                " ".join(
                    "`·`" if tok == " " else f"`{tok}`"
                    for tok in list(shared)[:15]
                )
            )
        else:
            lines.append("No common tokens found.")

    lines.append("\n### Token Type Distribution")
    for res in results.values():
        if "error" in res:
            continue
        counts = Counter(tok["type"] for tok in res["tokens"])
        summary = ", ".join(f"{kind}: {n}" for kind, n in counts.items())
        lines.append(f"**{res['model']}**: {summary}")

    lines.append("\n### Subword Analysis")
    for res in results.values():
        if "error" in res:
            continue
        n_sub = sum(1 for tok in res["tokens"] if tok["is_subword"])
        pct = n_sub / len(res["tokens"]) * 100 if res["tokens"] else 0
        lines.append(f"**{res['model']}**: {n_sub} subwords ({pct:.1f}%)")

    return "\n".join(lines)
def create_efficiency_chart(results):
    """Return a Plotly bar chart of token counts per tokenizer.

    Errored entries are excluded; returns None when there is nothing to
    plot. (Removed the `compression_ratios` list the original collected
    but never used.)
    """
    if not results:
        return None

    models = []
    token_counts = []
    for result in results.values():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])
    if not models:
        return None

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Return a stacked bar chart of token-type counts per tokenizer.

    Errored entries are skipped; returns None when no data is available.
    """
    if not results:
        return None

    # One row per (tokenizer, token type) pair, long/tidy format for px.bar.
    rows = [
        {"Tokenizer": res["model"], "Token Type": kind, "Count": n}
        for res in results.values()
        if "error" not in res
        for kind, n in Counter(tok["type"] for tok in res["tokens"]).items()
    ]
    if not rows:
        return None

    return px.bar(
        pd.DataFrame(rows),
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
# Custom CSS for better styling
# Injected into gr.Blocks(css=...) below. NOTE(review): nothing in this file
# visibly assigns the .token-display class to an element — it may be targeted
# by rendered markdown; confirm before removing.
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
background: #f8f9fa;
padding: 8px;
border-radius: 4px;
font-size: 0.9em;
}
"""
# Create the Gradio interface
# Layout: header/legend, then a row with the text input (left) and model
# picker + details checkbox (right), then the markdown results pane, the
# detailed-analysis pane, and two plots side by side.
# NOTE(review): the source's indentation was mangled; the nesting below is
# reconstructed from the `with` structure — confirm show_details belongs in
# the right-hand column.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
# 🔤 Advanced Tokenizer Comparison Tool
Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
**Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
""")
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            # Choice keys must match MODEL_MAP / TOKENIZER_INFO keys.
            model_selector = gr.CheckboxGroup(
                choices=[
                    "gpt-4",
                    "gpt-2",
                    "llama-2",
                    "llama-3",
                    "gemma-2",
                    "qwen3",
                    "qwen2.5",
                    "bert",
                    "bloom",
                    "aya-expanse",
                    "comma",
                    "byte-level",
                    "tokenmonster",
                ],
                value=["gpt-4", "llama-3", "gpt-2"],
                label="Select tokenizers to compare",
            )
            show_details = gr.Checkbox(label="Show detailed analysis", value=False)
    with gr.Row():
        with gr.Column():
            basic_output = gr.Markdown(
                label="Comparison Results",
                value="Enter text above to see tokenization results...",
            )
    with gr.Row():
        with gr.Column():
            # Hidden until the show_details checkbox is ticked (see below).
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)
    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")

    # Update visibility of detailed analysis
    def toggle_details(show_details):
        # Only toggles visibility; content is produced by update_comparison.
        return gr.update(visible=show_details)

    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)

    # Main comparison function
    def update_comparison(text, models, details):
        # Thin wrapper so compare_tokenizers' 4-tuple maps onto the outputs.
        basic, detailed, eff_chart, dist_chart = compare_tokenizers(
            text, models, details
        )
        return basic, detailed, eff_chart, dist_chart

    # Auto-update on changes
    # Every input re-triggers the full comparison (no explicit submit button).
    for component in [text_input, model_selector, show_details]:
        component.change(
            fn=update_comparison,
            inputs=[text_input, model_selector, show_details],
            outputs=[
                basic_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )

    gr.Markdown("""
---
### About the Models
- **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
- **LLaMA-2/3**: Meta's models using SentencePiece
- **Gemma-2**: Google's model with SentencePiece
- **Qwen3/2.5**: Alibaba's models with BPE
- **BERT**: Google's BERT with WordPiece
- **BLOOM**: BigScience's multilingual model with BPE
- **Aya Expanse**: Cohere's multilingual model with SentencePiece
- **Comma AI**: Comma AI's model with BPE
- **Byte-Level**: Byte-level BPE tokenizer
- **TokenMonster**: Optimized tokenizer with BPE
### Features
- **Efficiency Ranking**: Compare token counts across models
- **Subword Analysis**: See how models handle subwords
- **Token Types**: Classification of word/number/punctuation tokens
- **Visual Charts**: Interactive plots for comparison
- **Detailed Analysis**: Common tokens and distribution stats
""")

if __name__ == "__main__":
    demo.launch()