File size: 1,499 Bytes
fe1b559
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import gradio as gr
from transformers import AutoTokenizer

# Default tokenizer shown pre-selected in the dropdown on startup.
DEFAULT_MODEL = "Qwen/Qwen2.5-0.5B"

# Cache of tokenizers to avoid reloading: maps model name -> loaded
# AutoTokenizer instance, populated lazily by get_tokenizer().
tokenizers = {}

def get_tokenizer(model_name):
    """Return the tokenizer for *model_name*, loading and caching it on first use.

    NOTE(review): trust_remote_code=True executes model-repo code on load —
    acceptable for a local playground, but worth confirming for deployment.
    """
    try:
        return tokenizers[model_name]
    except KeyError:
        loaded = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        tokenizers[model_name] = loaded
        return loaded

def tokenize_text(text, model_name):
    """Tokenize *text* with the selected model's tokenizer.

    Returns a dict with the surface tokens, their integer IDs, and the
    round-tripped (decoded) text, shaped for display in a gr.JSON widget.
    """
    tok = get_tokenizer(model_name)
    ids = tok.encode(text)
    return {
        "Tokens": tok.tokenize(text),
        "Token IDs": ids,
        "Decoded text": tok.decode(ids),
    }

# List of popular tokenizers offered in the dropdown.
# FIX: a comma was missing after "meta-llama/Llama-2-7b-hf", so Python's
# implicit string-literal concatenation merged it with the next entry into
# one bogus model name ("...7b-hfmeta-llama/Llama-2-13b-hf"), which both
# hid the 7b model and made the merged entry fail to load.
available_models = [
    "Qwen/Qwen2.5-0.5B",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Llama-2-13b-hf",
    "ReliableAI/UCCIX-Llama2-13B",
]

# Gradio UI: a single-page playground — pick a model, type text, press
# Tokenize, and inspect the tokenizer output as JSON.
with gr.Blocks(title="Tokenizer Playground") as demo:
    gr.Markdown("# 🧩 Tokenizer Playground")
    gr.Markdown("Explore how different Hugging Face tokenizers split your text into tokens!")

    # Model selector; DEFAULT_MODEL is pre-selected on load.
    with gr.Row():
        model_name = gr.Dropdown(available_models, value=DEFAULT_MODEL, label="Select a tokenizer model")
    # Free-form input text to tokenize.
    with gr.Row():
        text = gr.Textbox(label="Input text", placeholder="Type something...", lines=4)

    btn = gr.Button("Tokenize")

    # tokenize_text returns a dict, rendered directly by the JSON widget.
    with gr.Row():
        tokens_output = gr.JSON(label="Tokenizer output")

    # Wire the button: inputs must match tokenize_text(text, model_name) order.
    btn.click(tokenize_text, inputs=[text, model_name], outputs=tokens_output)

# Launches a local web server (blocking call).
demo.launch()