File size: 1,499 Bytes
fe1b559 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import gradio as gr
from transformers import AutoTokenizer
# Default tokenizer model pre-selected in the dropdown on page load.
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"
# Cache of already-loaded tokenizers, keyed by model name, so repeated
# requests for the same model don't re-download/re-construct it.
tokenizers = {}
def get_tokenizer(model_name):
    """Return the tokenizer for *model_name*, loading and caching it on first use."""
    cached = tokenizers.get(model_name)
    if cached is None:
        cached = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenizers[model_name] = cached
    return cached
def tokenize_text(text, model_name):
    """Tokenize *text* with the chosen model.

    Returns a dict with the surface tokens, their numeric ids, and the
    round-tripped (decoded) text, ready for display in a JSON widget.
    """
    tok = get_tokenizer(model_name)
    ids = tok.encode(text)
    return {
        "Tokens": tok.tokenize(text),
        "Token IDs": ids,
        "Decoded text": tok.decode(ids),
    }
# List of popular tokenizers offered in the UI dropdown.
available_models = [
    "Qwen/Qwen2.5-0.5B",
    # BUG FIX: a missing comma here previously caused implicit string
    # concatenation, merging the two Llama-2 entries into one bogus
    # model id and dropping "Llama-2-7b-hf" from the dropdown.
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-13b-hf",
    "ReliableAI/UCCIX-Llama2-13B",
]
# Gradio UI: dropdown to pick a tokenizer, textbox for input, JSON panel for output.
with gr.Blocks(title="Tokenizer Playground") as demo:
    gr.Markdown("# 🧩 Tokenizer Playground")
    gr.Markdown("Explore how different Hugging Face tokenizers split your text into tokens!")

    with gr.Row():
        model_selector = gr.Dropdown(
            available_models,
            value=DEFAULT_MODEL,
            label="Select a tokenizer model",
        )
    with gr.Row():
        input_box = gr.Textbox(
            label="Input text",
            placeholder="Type something...",
            lines=4,
        )
    tokenize_btn = gr.Button("Tokenize")
    with gr.Row():
        result_panel = gr.JSON(label="Tokenizer output")

    # Wire the button: run tokenize_text on the text + selected model.
    tokenize_btn.click(
        tokenize_text,
        inputs=[input_box, model_selector],
        outputs=result_panel,
    )

demo.launch()
|