import os

import gradio as gr
import tiktoken
from transformers import AutoTokenizer
# Model mappings: UI label -> Hugging Face checkpoint whose tokenizer is loaded
MODEL_MAP = {
    'llama-2': 'meta-llama/Llama-2-7b-hf',
    'llama-3': 'meta-llama/Meta-Llama-3-8B',
    'gemma-2': 'google/gemma-2-2b',
    'qwen3': 'Qwen/Qwen2.5-0.5B',  # note: labelled "qwen3" but points at a Qwen2.5 checkpoint
    'bert': 'bert-base-uncased'
}
def tokenize_with_tiktoken(text, model):
    # GPT-4 uses the cl100k_base encoding; GPT-2 uses the original gpt2 BPE.
    encoding = 'cl100k_base' if model == 'gpt-4' else 'gpt2'
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)
    token_texts = [enc.decode([token]) for token in tokens]
    return {
        'model': 'GPT-4' if model == 'gpt-4' else 'GPT-2',
        'token_count': len(tokens),
        'tokens': token_texts,
        'token_ids': tokens  # enc.encode() already returns a plain list of ints
    }
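
# Quick sanity check (illustrative; exact ids depend on the tiktoken encoding,
# but cl100k_base splits "Hello world" into two tokens):
#   tokenize_with_tiktoken("Hello world", "gpt-4")
#   # -> {'model': 'GPT-4', 'token_count': 2,
#   #     'tokens': ['Hello', ' world'], 'token_ids': [9906, 1917]}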
def tokenize_with_hf(text, model):
    try:
        model_name = MODEL_MAP.get(model, 'gpt2')
        # Gated repos (e.g. the Llama models) need an HF_TOKEN with access granted.
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv('HF_TOKEN'))
        tokens = tokenizer.encode(text)
        token_texts = [tokenizer.decode([token], skip_special_tokens=False) for token in tokens]
        return {
            'model': model.upper(),
            'token_count': len(tokens),
            'tokens': token_texts,
            'token_ids': tokens
        }
    except Exception as e:
        # Surface the failure in the UI instead of crashing the whole app.
        return {
            'model': model.upper(),
            'token_count': 0,
            'tokens': [f"Error: {e}"],
            'token_ids': []
        }
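
# Note: tokenizer.encode() adds model-specific special tokens by default
# (add_special_tokens=True), so e.g. BERT's counts include [CLS] and [SEP];
# pass add_special_tokens=False to count only the visible text pieces.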
def compare_tokenizers(text, selected_models):
    if not text.strip():
        return "Please enter some text to tokenize."
    results = []
    for model in selected_models:
        if model in ('gpt-4', 'gpt-2'):
            result = tokenize_with_tiktoken(text, model)
        else:
            result = tokenize_with_hf(text, model)
        # Format output: show at most 20 tokens, rendering whitespace-only tokens as "·".
        tokens_display = ' | '.join(
            f'"{token}"' if token.strip() else '"·"'
            for token in result['tokens'][:20]
        )
        if len(result['tokens']) > 20:
            tokens_display += f" ... (+{len(result['tokens']) - 20} more)"
        results.append(f"""
**{result['model']}**
- Token Count: **{result['token_count']}**
- Tokens: {tokens_display}
- Token IDs: {result['token_ids'][:10]}{'...' if len(result['token_ids']) > 10 else ''}
""")
    return "\n\n---\n".join(results)
# Create Gradio interface
with gr.Blocks(
    title="🔤 Tokenizer Comparison Tool",
    theme=gr.themes.Soft()
) as demo:
    gr.Markdown("""
    # 🔤 Tokenizer Comparison Tool

    Compare how different LLM tokenizers split text into tokens. See the differences between GPT, LLaMA, Gemma, and other models.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Hello world! This is a test with some subwords and punctuation.",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation."
            )
        with gr.Column(scale=1):
            model_selector = gr.CheckboxGroup(
                choices=['gpt-4', 'gpt-2', 'llama-2', 'llama-3', 'gemma-2', 'qwen3', 'bert'],
                value=['gpt-4', 'llama-3', 'gpt-2'],
                label="Select tokenizers to compare"
            )

    output = gr.Markdown(
        label="Tokenization Results",
        value="Enter text above to see tokenization results..."
    )

    # Auto-update on text or model change
    text_input.change(
        fn=compare_tokenizers,
        inputs=[text_input, model_selector],
        outputs=output
    )
    model_selector.change(
        fn=compare_tokenizers,
        inputs=[text_input, model_selector],
        outputs=output
    )
| gr.Markdown(""" | |
| ### Legend: | |
| - **Token Count**: Number of tokens the model uses | |
| - **Tokens**: The actual text pieces (subwords) | |
| - **Token IDs**: Numerical IDs in the vocabulary | |
| - **"·"**: Represents spaces/whitespace | |
| ### Models: | |
| - **GPT-4/GPT-2**: OpenAI tokenizers (tiktoken) | |
| - **LLaMA**: Meta's models (SentencePiece) | |
| - **Gemma**: Google's models | |
| - **Qwen**: Alibaba's models | |
| - **BERT**: Google's BERT tokenizer | |
| """) | |
if __name__ == "__main__":
    demo.launch()
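    # When running locally, demo.launch(share=True) would additionally create
    # a temporary public URL (a standard Gradio option; not needed on Spaces).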