# Tokenizer comparison app — Gül Sena Altıntaş
# (Hugging Face viewer metadata preserved as a comment: "Added additional
# tokenizers", commit 3a08f05, raw/history/blame, 16.3 kB)
import json
import os
import re
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tiktoken
from transformers import AutoTokenizer
# Model mappings
# Maps the short model keys used throughout the UI to Hugging Face hub repo
# ids passed to AutoTokenizer.from_pretrained. Keys absent here ("gpt-4",
# "gpt-2") are handled by tiktoken instead.
MODEL_MAP = {
    "llama-2": "meta-llama/Llama-2-7b-hf",
    "llama-3": "meta-llama/Llama-3.2-1B",
    "gemma-2": "google/gemma-2-2b",
    "qwen3": "Qwen/Qwen3-0.6B",
    "qwen2.5": "Qwen/Qwen2.5-0.5B",
    "bert": "bert-base-uncased",
    "bloom": "bigscience/bloom-560m",
    "aya-expanse": "CohereForAI/aya-expanse-8b",
    "comma": "common-pile/comma-v0.1-2tgpt2",
    "byte-level": "google/byt5-small",
    "tokenmonster": "alasdairforsythe/tokenmonster",
}
# Per-tokenizer display metadata: human-readable name, vocabulary size, and
# encoding-scheme label shown in the results panes.
# NOTE(review): these values are hard-coded for display only and may drift
# from the actual tokenizers loaded at runtime — verify against model cards.
TOKENIZER_INFO = {
    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
    "aya-expanse": {
        "name": "Aya Expanse",
        "vocab_size": 256000,
        "encoding": "SentencePiece",
    },
    # Empty "encoding" strings below render as blank in the UI.
    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
}
def get_token_type(token_text):
    """Classify a decoded token string into a coarse display category.

    Returns one of: "special", "whitespace", "word", "number",
    "punctuation", or "mixed".
    """
    # Special markers (<s>, <pad>, <|endoftext|>, ...) are checked FIRST: in
    # the previous ordering a purely-punctuation marker such as "<>" matched
    # the punctuation pattern before the special-token test was reached.
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    if re.fullmatch(r"\s+", token_text):
        return "whitespace"
    if re.fullmatch(r"[a-zA-Z]+", token_text):
        return "word"
    if re.fullmatch(r"\d+", token_text):
        return "number"
    if re.fullmatch(r"[^\w\s]+", token_text):
        return "punctuation"
    return "mixed"
def is_subword(token_text, model, is_first):
    """Heuristically decide whether a token continues a preceding word.

    SentencePiece-style tokenizers mark word STARTS with "▁"; BERT
    WordPiece marks CONTINUATIONS with "##"; byte-level BPE tokenizers
    mark word starts with a leading space. The first token of a text is
    never a subword.
    """
    # gemma-2 and aya-expanse are SentencePiece per TOKENIZER_INFO; the
    # previous list omitted them, so they fell into the BPE branch.
    if model in ["llama-2", "llama-3", "qwen3", "gemma-2", "aya-expanse"]:
        return not token_text.startswith("▁") and not is_first
    if model == "bert":
        return token_text.startswith("##")
    # BPE-style default: anything without a leading space that isn't the
    # first token (empty strings are treated as non-subwords).
    return not token_text.startswith(" ") and not is_first and len(token_text) > 0
def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with tiktoken and return a stats/token-detail dict.

    model: "gpt-4" selects the cl100k_base encoding; any other key falls
    back to the gpt2 encoding. The returned dict mirrors the shape produced
    by tokenize_with_hf so downstream rendering treats them uniformly.
    """
    # (Removed a dead `current_pos` accumulator that was never read.)
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)
    token_data = []
    for i, token_id in enumerate(tokens):
        token_text = enc.decode([token_id])
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": get_token_type(token_text),
                "is_subword": is_subword(token_text, model, i == 0),
                "bytes": len(token_text.encode("utf-8")),
                "position": i,
            }
        )
    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        # Characters per token; higher means more compact tokenization.
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }
# Cache of loaded HF tokenizers, keyed by hub repo id, so repeated
# comparisons don't re-instantiate them on every keystroke
# (AutoTokenizer.from_pretrained is comparatively expensive per call).
_HF_TOKENIZERS = {}


def tokenize_with_hf(text, model):
    """Tokenize *text* with a Hugging Face tokenizer and return stats.

    Unknown model keys fall back to "gpt2". On any failure this returns a
    result dict carrying an "error" key instead of raising, so the UI can
    render the failure inline.
    """
    try:
        model_name = MODEL_MAP.get(model, "gpt2")
        tokenizer = _HF_TOKENIZERS.get(model_name)
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
            )
            _HF_TOKENIZERS[model_name] = tokenizer
        tokens = tokenizer.encode(text)
        token_data = []
        for i, token_id in enumerate(tokens):
            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": get_token_type(token_text),
                    "is_subword": is_subword(token_text, model, i == 0),
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(tokens),
            "tokens": token_data,
            "compression_ratio": len(text) / len(tokens) if tokens else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        # Deliberate best-effort: surface the error in the result rather
        # than crashing the whole comparison.
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": 0,
            "tokens": [],
            "compression_ratio": 0,
            "encoding": "Error",
            "vocab_size": 0,
            "error": str(e),
        }
def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with every selected model and build all UI outputs.

    Returns a 4-tuple: (basic markdown, detailed markdown, efficiency
    figure, token-distribution figure). Blank input short-circuits with a
    prompt message and empty charts.
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", None, None

    # gpt-4/gpt-2 go through tiktoken; everything else through HF.
    tiktoken_keys = {"gpt-4", "gpt-2"}
    results = {
        key: (
            tokenize_with_tiktoken(text, key)
            if key in tiktoken_keys
            else tokenize_with_hf(text, key)
        )
        for key in selected_models
    }

    detailed = generate_detailed_analysis(results) if show_details else ""
    return (
        generate_basic_comparison(results),
        detailed,
        create_efficiency_chart(results),
        create_token_distribution_chart(results),
    )
def generate_basic_comparison(results):
    """Render the efficiency ranking and per-model token previews as Markdown.

    *results* maps model keys to the dicts produced by the tokenize_* helpers;
    entries carrying an "error" key are shown as failures.
    """
    if not results:
        return "No results to display."

    lines = ["## 🏆 Efficiency Ranking (Fewer tokens = more efficient)"]

    # Rank models by token count, fewest (most efficient) first.
    ranked = sorted(results.items(), key=lambda item: item[1]["token_count"])
    for rank, (key, result) in enumerate(ranked, start=1):
        if "error" in result:
            lines.append(
                f"{rank}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            lines.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    lines.append("\n## 🔤 Tokenization Results")
    type_marker = {"word": "🔤", "number": "🔢", "punctuation": "❗"}
    for key, result in results.items():
        if "error" in result:
            lines.append(f"\n### ❌ {result['model']} - Error: {result['error']}")
            continue
        lines.append(f"\n### {result['model']}")
        lines.append(f"- **Tokens**: {result['token_count']}")
        lines.append(f"- **Vocab Size**: {result['vocab_size']:,}")
        lines.append(f"- **Encoding**: {result['encoding']}")
        lines.append(f"- **Compression**: {result['compression_ratio']:.2f}x")

        # Preview only the first 20 tokens, tagging each with a type marker.
        preview = result["tokens"][:20]
        shown = []
        subwords_shown = 0
        for token in preview:
            text = token["text"]
            if text == " ":
                text = "·"  # Space indicator
            elif text.strip() == "":
                text = "⎵"  # Empty token indicator
            if token["is_subword"]:
                shown.append(f"🔸`{text}`")
                subwords_shown += 1
            else:
                shown.append(f"{type_marker.get(token['type'], '')}`{text}`")
        if len(result["tokens"]) > 20:
            shown.append(f"... (+{len(result['tokens']) - 20} more)")
        # Note: the subword count covers only the previewed tokens.
        lines.append(f"- **Subwords**: {subwords_shown}/{len(preview)}")
        lines.append(f"- **Tokens**: {' '.join(shown)}")
    return "\n".join(lines)
def generate_detailed_analysis(results):
    """Render cross-tokenizer stats: shared tokens, type mix, subword ratios.

    Requires at least two result entries; errored entries are skipped.
    """
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    lines = ["## 🔍 Detailed Analysis"]

    # Tokens (by decoded text) produced by every non-errored tokenizer.
    token_sets = [
        {tok["text"] for tok in res["tokens"]}
        for res in results.values()
        if "error" not in res
    ]
    if token_sets:
        shared = set.intersection(*token_sets)
        lines.append(f"\n### Common Tokens ({len(shared)})")
        if shared:
            lines.append(
                " ".join(
                    "`·`" if tok == " " else f"`{tok}`"
                    for tok in list(shared)[:15]
                )
            )
        else:
            lines.append("No common tokens found.")

    lines.append("\n### Token Type Distribution")
    for res in results.values():
        if "error" in res:
            continue
        counts = Counter(tok["type"] for tok in res["tokens"])
        summary = ", ".join(f"{kind}: {n}" for kind, n in counts.items())
        lines.append(f"**{res['model']}**: {summary}")

    lines.append("\n### Subword Analysis")
    for res in results.values():
        if "error" in res:
            continue
        n_sub = sum(1 for tok in res["tokens"] if tok["is_subword"])
        pct = n_sub / len(res["tokens"]) * 100 if res["tokens"] else 0
        lines.append(f"**{res['model']}**: {n_sub} subwords ({pct:.1f}%)")

    return "\n".join(lines)
def create_efficiency_chart(results):
    """Return a Plotly bar chart of token counts per tokenizer.

    Errored entries are excluded; returns None when there is nothing to
    plot. (Removed the `compression_ratios` list the original collected
    but never used.)
    """
    if not results:
        return None

    models = []
    token_counts = []
    for result in results.values():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])
    if not models:
        return None

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Return a stacked bar chart of token-type counts per tokenizer.

    Errored entries are skipped; returns None when no data is available.
    """
    if not results:
        return None

    # One row per (tokenizer, token type) pair, long/tidy format for px.bar.
    rows = [
        {"Tokenizer": res["model"], "Token Type": kind, "Count": n}
        for res in results.values()
        if "error" not in res
        for kind, n in Counter(tok["type"] for tok in res["tokens"]).items()
    ]
    if not rows:
        return None

    return px.bar(
        pd.DataFrame(rows),
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
# Custom CSS for better styling
# Injected into gr.Blocks(css=...) below. NOTE(review): nothing in this file
# visibly assigns the .token-display class to an element — it may be targeted
# by rendered markdown; confirm before removing.
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
background: #f8f9fa;
padding: 8px;
border-radius: 4px;
font-size: 0.9em;
}
"""
# Create the Gradio interface
# Layout: header/legend, then a row with the text input (left) and model
# picker + details checkbox (right), then the markdown results pane, the
# detailed-analysis pane, and two plots side by side.
# NOTE(review): the source's indentation was mangled; the nesting below is
# reconstructed from the `with` structure — confirm show_details belongs in
# the right-hand column.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
# 🔤 Advanced Tokenizer Comparison Tool
Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
**Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
""")
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            # Choice keys must match MODEL_MAP / TOKENIZER_INFO keys.
            model_selector = gr.CheckboxGroup(
                choices=[
                    "gpt-4",
                    "gpt-2",
                    "llama-2",
                    "llama-3",
                    "gemma-2",
                    "qwen3",
                    "qwen2.5",
                    "bert",
                    "bloom",
                    "aya-expanse",
                    "comma",
                    "byte-level",
                    "tokenmonster",
                ],
                value=["gpt-4", "llama-3", "gpt-2"],
                label="Select tokenizers to compare",
            )
            show_details = gr.Checkbox(label="Show detailed analysis", value=False)
    with gr.Row():
        with gr.Column():
            basic_output = gr.Markdown(
                label="Comparison Results",
                value="Enter text above to see tokenization results...",
            )
    with gr.Row():
        with gr.Column():
            # Hidden until the show_details checkbox is ticked (see below).
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)
    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")

    # Update visibility of detailed analysis
    def toggle_details(show_details):
        # Only toggles visibility; content is produced by update_comparison.
        return gr.update(visible=show_details)

    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)

    # Main comparison function
    def update_comparison(text, models, details):
        # Thin wrapper so compare_tokenizers' 4-tuple maps onto the outputs.
        basic, detailed, eff_chart, dist_chart = compare_tokenizers(
            text, models, details
        )
        return basic, detailed, eff_chart, dist_chart

    # Auto-update on changes
    # Every input re-triggers the full comparison (no explicit submit button).
    for component in [text_input, model_selector, show_details]:
        component.change(
            fn=update_comparison,
            inputs=[text_input, model_selector, show_details],
            outputs=[
                basic_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )

    gr.Markdown("""
---
### About the Models
- **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
- **LLaMA-2/3**: Meta's models using SentencePiece
- **Gemma-2**: Google's model with SentencePiece
- **Qwen3/2.5**: Alibaba's models with BPE
- **BERT**: Google's BERT with WordPiece
- **BLOOM**: BigScience's multilingual model with BPE
- **Aya Expanse**: Cohere's multilingual model with SentencePiece
- **Comma AI**: Comma AI's model with BPE
- **Byte-Level**: Byte-level BPE tokenizer
- **TokenMonster**: Optimized tokenizer with BPE
### Features
- **Efficiency Ranking**: Compare token counts across models
- **Subword Analysis**: See how models handle subwords
- **Token Types**: Classification of word/number/punctuation tokens
- **Visual Charts**: Interactive plots for comparison
- **Detailed Analysis**: Common tokens and distribution stats
""")

if __name__ == "__main__":
    demo.launch()