# Hugging Face Spaces listing header (scrape artifact, not code):
# Spaces: Sleeping
# Sleeping
import json
import os
import re
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import tiktoken
from transformers import AutoTokenizer
# Model mappings: UI model key -> Hugging Face Hub repo id handed to
# AutoTokenizer.from_pretrained() in tokenize_with_hf(). "gpt-4"/"gpt-2" are
# absent on purpose: those are tokenized via tiktoken instead, and
# tokenize_with_hf falls back to "gpt2" for any unknown key.
MODEL_MAP = {
    "llama-2": "meta-llama/Llama-2-7b-hf",  # presumably gated — HF_TOKEN is passed at load
    "llama-3": "meta-llama/Llama-3.2-1B",  # presumably gated — HF_TOKEN is passed at load
    "gemma-2": "google/gemma-2-2b",
    "qwen3": "Qwen/Qwen3-0.6B",
    "qwen2.5": "Qwen/Qwen2.5-0.5B",
    "bert": "bert-base-uncased",
    "bloom": "bigscience/bloom-560m",
    "aya-expanse": "CohereForAI/aya-expanse-8b",
    # NOTE(review): repo id looks off ("…-2tgpt2" suffix) — verify it exists on the Hub;
    # a bad id surfaces through tokenize_with_hf's error path.
    "comma": "common-pile/comma-v0.1-2tgpt2",
    "byte-level": "google/byt5-small",
    # NOTE(review): tokenmonster is likely not loadable via AutoTokenizer —
    # failures are caught and reported by tokenize_with_hf.
    "tokenmonster": "alasdairforsythe/tokenmonster",
}
# Display metadata per tokenizer key (keys mirror MODEL_MAP plus the two
# tiktoken models). Used only for labels in the rendered results:
# human-readable name, published vocab size, and an encoding-scheme label.
TOKENIZER_INFO = {
    "gpt-4": {"name": "GPT-4", "vocab_size": 100277, "encoding": "BPE"},
    "gpt-2": {"name": "GPT-2", "vocab_size": 50257, "encoding": "BPE"},
    "llama-2": {"name": "LLaMA-2", "vocab_size": 32000, "encoding": "SentencePiece"},
    # NOTE(review): LLaMA-3 ships a tiktoken-style BPE tokenizer; the
    # "SentencePiece" label here looks wrong — confirm before trusting it.
    "llama-3": {"name": "LLaMA-3", "vocab_size": 128000, "encoding": "SentencePiece"},
    "gemma-2": {"name": "Gemma-2", "vocab_size": 256000, "encoding": "SentencePiece"},
    "qwen3": {"name": "Qwen3", "vocab_size": 151936, "encoding": "BPE"},
    "qwen2.5": {"name": "Qwen2.5", "vocab_size": 151936, "encoding": "BPE"},
    "bert": {"name": "BERT", "vocab_size": 30522, "encoding": "WordPiece"},
    "bloom": {"name": "BLOOM", "vocab_size": 250680, "encoding": "BPE"},
    "aya-expanse": {
        "name": "Aya Expanse",
        "vocab_size": 256000,
        "encoding": "SentencePiece",
    },
    # Empty "encoding" values render as a blank field in the results output.
    "comma": {"name": "Comma AI", "vocab_size": 50257, "encoding": ""},
    "byte-level": {"name": "Byte-Level BPE", "vocab_size": 50000, "encoding": "BPE"},
    "tokenmonster": {"name": "TokenMonster", "vocab_size": 32000, "encoding": ""},
}
def get_token_type(token_text):
    """Classify a token's surface text.

    Returns one of: "whitespace", "word" (ASCII letters only), "number",
    "punctuation", "special" (angle-bracket markers such as "<s>" or
    "<|endoftext|>"), or "mixed" for anything else (including non-ASCII
    letters and the empty string).
    """
    # Check special markers first: the original order let punctuation-only
    # markers like "<>" be swallowed by the punctuation branch.
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    if re.match(r"^\s+$", token_text):
        return "whitespace"
    # NOTE: ASCII-only; accented/non-Latin letters fall through to "mixed".
    if re.match(r"^[a-zA-Z]+$", token_text):
        return "word"
    if re.match(r"^\d+$", token_text):
        return "number"
    if re.match(r"^[^\w\s]+$", token_text):
        return "punctuation"
    return "mixed"
def is_subword(token_text, model, is_first):
    """Heuristically report whether a token continues the previous word.

    token_text is expected to carry the tokenizer's word-boundary marker:
    "▁" (SentencePiece word start), "##" (WordPiece continuation), or a
    leading space (byte-level BPE word start). The first token of a text
    is never a subword, except for an explicit "##" WordPiece marker.
    """
    if model in ["llama-2", "llama-3"]:
        # SentencePiece: "▁" marks a word start; unmarked, non-first
        # tokens continue a word.
        # NOTE(review): this only works when callers pass the raw vocab
        # token — tokenizer.decode() strips the "▁" marker.
        return not token_text.startswith("▁") and not is_first
    elif model == "bert":
        # WordPiece continuation pieces are explicitly "##"-prefixed.
        return token_text.startswith("##")
    else:
        # Byte-level BPE (GPT-2/4, qwen, bloom, …): a leading space marks a
        # word start. qwen3 is declared BPE in TOKENIZER_INFO, so it belongs
        # here rather than in the SentencePiece branch above.
        return not token_text.startswith(" ") and not is_first and len(token_text) > 0
def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with OpenAI's tiktoken.

    "gpt-4" maps to the cl100k_base encoding; anything else uses the GPT-2
    encoding. Returns a result dict with display metadata from
    TOKENIZER_INFO, a per-token list (text, id, type, subword flag, UTF-8
    byte length, position), and a chars-per-token compression ratio.
    """
    encoding_name = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding_name)
    token_ids = enc.encode(text)

    # Removed the dead `current_pos` accumulator from the original loop —
    # it was incremented but never read.
    token_data = []
    for i, token_id in enumerate(token_ids):
        token_text = enc.decode([token_id])
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": get_token_type(token_text),
                "is_subword": is_subword(token_text, model, i == 0),
                "bytes": len(token_text.encode("utf-8")),
                "position": i,
            }
        )
    return {
        "model": TOKENIZER_INFO[model]["name"],
        "token_count": len(token_ids),
        "tokens": token_data,
        # Characters per token; 0 guards the empty-input case.
        "compression_ratio": len(text) / len(token_ids) if token_ids else 0,
        "encoding": TOKENIZER_INFO[model]["encoding"],
        "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
    }
def tokenize_with_hf(text, model):
    """Tokenize *text* with a Hugging Face tokenizer chosen via MODEL_MAP.

    Returns the same result-dict shape as tokenize_with_tiktoken. On any
    failure (gated repo, bad repo id, network error) it returns a stub dict
    carrying an "error" key instead of raising, so the UI can report it.
    """
    try:
        model_name = MODEL_MAP.get(model, "gpt2")
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
        )
        tokens = tokenizer.encode(text)
        token_data = []
        for i, token_id in enumerate(tokens):
            # Decoded text is what the UI displays...
            token_text = tokenizer.decode([token_id], skip_special_tokens=False)
            # ...but decode() strips subword markers ("▁", "Ġ", "##"-context),
            # which made is_subword() useless for SentencePiece models. Use the
            # raw vocab token for subword detection instead, mapping the
            # byte-level BPE space marker "Ġ" to a plain space so
            # is_subword()'s leading-space check still works.
            raw_token = tokenizer.convert_ids_to_tokens(token_id)
            marker_text = (
                raw_token.replace("Ġ", " ")
                if isinstance(raw_token, str)
                else token_text
            )
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": get_token_type(token_text),
                    "is_subword": is_subword(marker_text, model, i == 0),
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )
        return {
            "model": TOKENIZER_INFO[model]["name"],
            "token_count": len(tokens),
            "tokens": token_data,
            "compression_ratio": len(text) / len(tokens) if tokens else 0,
            "encoding": TOKENIZER_INFO[model]["encoding"],
            "vocab_size": TOKENIZER_INFO[model]["vocab_size"],
        }
    except Exception as e:
        # Defensive .get(): never raise a second error while reporting one
        # (e.g. if `model` is somehow missing from TOKENIZER_INFO).
        info = TOKENIZER_INFO.get(model, {})
        return {
            "model": info.get("name", model),
            "token_count": 0,
            "tokens": [],
            "compression_ratio": 0,
            "encoding": "Error",
            "vocab_size": 0,
            "error": str(e),
        }
def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with each selected model and build all UI outputs.

    Returns a 4-tuple: (markdown summary, markdown detail section or "",
    efficiency figure or None, token-type distribution figure or None).
    """
    # Guard clause: nothing to tokenize.
    if not text.strip():
        return "Please enter some text to tokenize.", "", None, None

    # gpt-4 / gpt-2 go through tiktoken; everything else through transformers.
    results = {
        model: (
            tokenize_with_tiktoken(text, model)
            if model in ("gpt-4", "gpt-2")
            else tokenize_with_hf(text, model)
        )
        for model in selected_models
    }

    detailed = generate_detailed_analysis(results) if show_details else ""
    return (
        generate_basic_comparison(results),
        detailed,
        create_efficiency_chart(results),
        create_token_distribution_chart(results),
    )
def generate_basic_comparison(results):
    """Render the main markdown report: efficiency ranking plus, per model,
    summary stats and the first 20 tokens with emoji type indicators."""
    if not results:
        return "No results to display."

    lines = ["## 🏆 Efficiency Ranking (Fewer tokens = more efficient)"]

    # Rank ascending by token count (errored entries report 0 and sort first).
    ranked = sorted(results.items(), key=lambda item: item[1]["token_count"])
    for rank, (_, info) in enumerate(ranked, start=1):
        if "error" in info:
            lines.append(
                f"{rank}. **{info['model']}**: ❌ Error - {info['error']}"
            )
        else:
            lines.append(
                f"{rank}. **{info['model']}**: {info['token_count']} tokens "
                f"({info['compression_ratio']:.2f}x compression)"
            )

    lines.append("\n## 🔤 Tokenization Results")
    for info in results.values():
        if "error" in info:
            lines.append(f"\n### ❌ {info['model']} - Error: {info['error']}")
            continue
        lines.append(f"\n### {info['model']}")
        lines.append(f"- **Tokens**: {info['token_count']}")
        lines.append(f"- **Vocab Size**: {info['vocab_size']:,}")
        lines.append(f"- **Encoding**: {info['encoding']}")
        lines.append(f"- **Compression**: {info['compression_ratio']:.2f}x")

        # Preview only the first 20 tokens; subword stats cover the same slice.
        head = info["tokens"][:20]
        shown = []
        n_subwords = 0
        for tok in head:
            text = tok["text"]
            if text == " ":
                text = "·"  # make a lone space visible
            elif not text.strip():
                text = "⎵"  # other whitespace-only / empty tokens
            # Indicator precedence: subword wins over the token-type emoji.
            if tok["is_subword"]:
                n_subwords += 1
                shown.append(f"🔸`{text}`")
            elif tok["type"] == "word":
                shown.append(f"🔤`{text}`")
            elif tok["type"] == "number":
                shown.append(f"🔢`{text}`")
            elif tok["type"] == "punctuation":
                shown.append(f"❗`{text}`")
            else:
                shown.append(f"`{text}`")
        if len(info["tokens"]) > 20:
            shown.append(f"... (+{len(info['tokens']) - 20} more)")
        lines.append(f"- **Subwords**: {n_subwords}/{len(head)}")
        lines.append(f"- **Tokens**: {' '.join(shown)}")
    return "\n".join(lines)
def generate_detailed_analysis(results):
    """Render the optional markdown detail section: tokens shared by every
    tokenizer, per-model token-type counts, and subword ratios."""
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    report = ["## 🔍 Detailed Analysis"]

    # Token texts produced by each non-errored tokenizer.
    token_sets = [
        {tok["text"] for tok in res["tokens"]}
        for res in results.values()
        if "error" not in res
    ]
    if token_sets:
        shared = set.intersection(*token_sets)
        report.append(f"\n### Common Tokens ({len(shared)})")
        if shared:
            # Show at most 15, rendering a lone space as the middle dot.
            report.append(
                " ".join(
                    "`·`" if tok == " " else f"`{tok}`"
                    for tok in list(shared)[:15]
                )
            )
        else:
            report.append("No common tokens found.")

    report.append("\n### Token Type Distribution")
    for res in results.values():
        if "error" in res:
            continue
        counts = Counter(tok["type"] for tok in res["tokens"])
        summary = ", ".join(f"{kind}: {num}" for kind, num in counts.items())
        report.append(f"**{res['model']}**: {summary}")

    report.append("\n### Subword Analysis")
    for res in results.values():
        if "error" in res:
            continue
        total = len(res["tokens"])
        n_sub = sum(1 for tok in res["tokens"] if tok["is_subword"])
        pct = (n_sub / total * 100) if total else 0
        report.append(f"**{res['model']}**: {n_sub} subwords ({pct:.1f}%)")
    return "\n".join(report)
def create_efficiency_chart(results):
    """Build a bar chart of token counts per tokenizer (lower = better).

    Returns a plotly Figure, or None when results is empty or every entry
    carries an "error" key.
    """
    if not results:
        return None

    # One bar per tokenizer that produced a valid result. The original also
    # collected compression ratios here, but never used them — removed.
    models = []
    token_counts = []
    for result in results.values():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])
    if not models:
        return None

    fig = go.Figure()
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,  # value labels on the bars
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Build a stacked bar chart of token-type counts per tokenizer.

    Returns a plotly Figure, or None when no model produced usable tokens.
    """
    if not results:
        return None

    # One row per (tokenizer, token type) pair, in plotly-express long form.
    rows = []
    for result in results.values():
        if "error" in result:
            continue
        type_counts = Counter(tok["type"] for tok in result["tokens"])
        for token_type, count in type_counts.items():
            rows.append(
                {
                    "Tokenizer": result["model"],
                    "Token Type": token_type,
                    "Count": count,
                }
            )
    if not rows:
        return None

    frame = pd.DataFrame(rows)
    return px.bar(
        frame,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
# Custom CSS injected into the Gradio app: sets the UI font and a monospace
# style for the .token-display class.
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""
# Create the Gradio interface. Layout: header, input row (text + model
# selection), results row, optional detail row, two chart rows, footer.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    # Header with the emoji legend used by generate_basic_comparison().
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool
    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
    """)
    # Input row: free text on the left, tokenizer selection + options right.
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            # Choice keys must match MODEL_MAP / TOKENIZER_INFO entries.
            model_selector = gr.CheckboxGroup(
                choices=[
                    "gpt-4",
                    "gpt-2",
                    "llama-2",
                    "llama-3",
                    "gemma-2",
                    "qwen3",
                    "qwen2.5",
                    "bert",
                    "bloom",
                    "aya-expanse",
                    "comma",
                    "byte-level",
                    "tokenmonster",
                ],
                value=["gpt-4", "llama-3", "gpt-2"],
                label="Select tokenizers to compare",
            )
            show_details = gr.Checkbox(label="Show detailed analysis", value=False)
    # Output areas: summary markdown, optional details, then two plots.
    with gr.Row():
        with gr.Column():
            basic_output = gr.Markdown(
                label="Comparison Results",
                value="Enter text above to see tokenization results...",
            )
    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)
    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")

    # Update visibility of detailed analysis
    def toggle_details(show_details):
        # Only toggles visibility; content comes from update_comparison below.
        return gr.update(visible=show_details)

    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)

    # Main comparison function
    def update_comparison(text, models, details):
        # Thin wrapper mapping the three UI inputs onto compare_tokenizers.
        basic, detailed, eff_chart, dist_chart = compare_tokenizers(
            text, models, details
        )
        return basic, detailed, eff_chart, dist_chart

    # Auto-update on changes: any input change re-runs the full comparison.
    for component in [text_input, model_selector, show_details]:
        component.change(
            fn=update_comparison,
            inputs=[text_input, model_selector, show_details],
            outputs=[
                basic_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )

    # Static footer describing the models and app features.
    gr.Markdown("""
    ---
    ### About the Models
    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece
    - **Gemma-2**: Google's model with SentencePiece
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT**: Google's BERT with WordPiece
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma AI**: Comma AI's model with BPE
    - **Byte-Level**: Byte-level BPE tokenizer
    - **TokenMonster**: Optimized tokenizer with BPE
    ### Features
    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)
if __name__ == "__main__":
    # Launch the Gradio server when run as a script (not on import).
    demo.launch()