|
|
import gradio as gr |
|
|
from transformers import AutoTokenizer |
|
|
|
|
|
|
|
|
# Model pre-selected in the dropdown when the app starts.
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"
|
|
|
|
|
|
|
|
# Process-wide cache of loaded tokenizers, keyed by model name, so each
# tokenizer is downloaded/instantiated at most once per session.
tokenizers = {}
|
|
|
|
|
def get_tokenizer(model_name):
    """Return a cached tokenizer for *model_name*, loading it on first use.

    Loaded tokenizers are stored in the module-level ``tokenizers`` dict so
    repeated requests for the same model are served from memory.
    """
    try:
        # Fast path: already loaded earlier in this session.
        return tokenizers[model_name]
    except KeyError:
        # First request for this model: fetch from the Hugging Face Hub.
        # NOTE(review): trust_remote_code=True executes model-repo code —
        # acceptable here only because the model list is hard-coded.
        loaded = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenizers[model_name] = loaded
        return loaded
|
|
|
|
|
def tokenize_text(text, model_name):
    """Tokenize *text* with the tokenizer for *model_name*.

    Returns a dict (rendered as JSON in the UI) containing the surface
    tokens, their numeric ids, and the round-tripped decoded string.
    """
    tok = get_tokenizer(model_name)

    # Three complementary views of the same input: surface pieces,
    # integer ids (may include special tokens), and the decoded round-trip.
    pieces = tok.tokenize(text)
    ids = tok.encode(text)
    roundtrip = tok.decode(ids)

    result = {}
    result["Tokens"] = pieces
    result["Token IDs"] = ids
    result["Decoded text"] = roundtrip
    return result
|
|
|
|
|
|
|
|
# Models offered in the dropdown. Keep one entry per line with a trailing
# comma — a missing comma here previously caused Python's implicit string
# concatenation to fuse two ids ("...Llama-2-7b-hf" + "...Llama-2-13b-hf")
# into a single invalid model name.
available_models = [
    "Qwen/Qwen2.5-0.5B",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-13b-hf",
    "ReliableAI/UCCIX-Llama2-13B",
]
|
|
|
|
|
|
|
|
# --- UI definition -------------------------------------------------------
# Layout is defined by nesting order inside the Blocks context manager, so
# the statement order below is load-bearing; do not reorder.
with gr.Blocks(title="Tokenizer Playground") as demo:
    gr.Markdown("# 🧩 Tokenizer Playground")
    gr.Markdown("Explore how different Hugging Face tokenizers split your text into tokens!")

    # Model picker — defaults to DEFAULT_MODEL.
    with gr.Row():
        model_name = gr.Dropdown(available_models, value=DEFAULT_MODEL, label="Select a tokenizer model")
    # Free-form text input to be tokenized.
    with gr.Row():
        text = gr.Textbox(label="Input text", placeholder="Type something...", lines=4)

    btn = gr.Button("Tokenize")

    # JSON view of tokenize_text's returned dict (tokens / ids / decoded).
    with gr.Row():
        tokens_output = gr.JSON(label="Tokenizer output")

    # Wire the button: tokenize_text(text, model_name) -> tokens_output.
    btn.click(tokenize_text, inputs=[text, model_name], outputs=tokens_output)

# Starts the local Gradio server (blocking call).
demo.launch()
|
|
|
|
|
|