"""Gradio playground for comparing Hugging Face tokenizers.

Type text, pick a model, and see how its tokenizer splits the text into
tokens, the corresponding token IDs, and the decoded round-trip.
"""

import gradio as gr
from transformers import AutoTokenizer

# Default tokenizer shown when the app starts.
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"

# Cache of loaded tokenizers to avoid re-downloading/re-instantiating on
# every request.
tokenizers = {}


def get_tokenizer(model_name):
    """Return a cached tokenizer for *model_name*, loading it on first use."""
    if model_name not in tokenizers:
        tokenizers[model_name] = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True
        )
    return tokenizers[model_name]


def tokenize_text(text, model_name):
    """Tokenize *text* with the tokenizer of *model_name*.

    Returns a dict with three parallel views of the text:
    - "Tokens": the surface token strings,
    - "Token IDs": the matching integer IDs (no special tokens, so the two
      lists stay aligned one-to-one — encode() would otherwise prepend/append
      BOS/EOS and make the lists differ in length),
    - "Decoded text": the IDs decoded back, to show the round-trip.
    """
    tokenizer = get_tokenizer(model_name)
    tokens = tokenizer.tokenize(text)
    # add_special_tokens=False keeps token_ids aligned with `tokens` above.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    decoded = tokenizer.decode(token_ids)
    return {
        "Tokens": tokens,
        "Token IDs": token_ids,
        "Decoded text": decoded,
    }


# List of popular tokenizers offered in the dropdown.
# NOTE: a missing comma here previously fused the two Llama-2 entries into
# one invalid string via implicit concatenation.
available_models = [
    "Qwen/Qwen2.5-0.5B",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-13b-hf",
    "ReliableAI/UCCIX-Llama2-13B",
]

# Gradio UI: model dropdown + text input -> JSON view of the tokenization.
with gr.Blocks(title="Tokenizer Playground") as demo:
    gr.Markdown("# 🧩 Tokenizer Playground")
    gr.Markdown("Explore how different Hugging Face tokenizers split your text into tokens!")

    with gr.Row():
        model_name = gr.Dropdown(
            available_models, value=DEFAULT_MODEL, label="Select a tokenizer model"
        )

    with gr.Row():
        text = gr.Textbox(label="Input text", placeholder="Type something...", lines=4)

    btn = gr.Button("Tokenize")

    with gr.Row():
        tokens_output = gr.JSON(label="Tokenizer output")

    btn.click(tokenize_text, inputs=[text, model_name], outputs=tokens_output)


demo.launch()