| import gradio as gr | |
| from transformers import AutoTokenizer | |
| import random | |
| import colorsys | |
| import html | |
def get_distinct_colors(n):
    """Return a list of `n` visually distinct hex color strings.

    Hues are spaced evenly around the HSV color wheel at fixed
    saturation (0.6) and value (0.7), so the palette stays readable
    as a background for white text.
    """
    def _hue_to_hex(hue):
        # Convert one evenly spaced hue to a "#rrggbb" string.
        r, g, b = colorsys.hsv_to_rgb(hue, 0.6, 0.7)
        return "#{:02x}{:02x}{:02x}".format(int(r * 255), int(g * 255), int(b * 255))

    return [_hue_to_hex(i / n) for i in range(n)]
def tokenize_text(hf_model_id, text, token=None):
    """Tokenize `text` with the tokenizer for `hf_model_id` and render colored HTML.

    Parameters:
        hf_model_id: Hugging Face Hub model ID whose tokenizer to load.
        text: the input text to tokenize.
        token: optional Hugging Face access token (needed for gated models).

    Returns:
        (token_count, html_string) on success, or
        ("Error: ...", "") if loading or tokenizing fails.
    """
    try:
        # FIX: the documented auth kwarg for from_pretrained is `token=`;
        # the previous `access_token=` is not a recognized parameter, so
        # gated models could not be accessed.
        tokenizer = AutoTokenizer.from_pretrained(hf_model_id, token=token)
        tokens = tokenizer.tokenize(text)
        token_count = len(tokens)
        colors = get_distinct_colors(token_count)
        colored_tokens = []
        # Renamed loop variable `token` -> `tok` so it no longer shadows
        # the `token` auth parameter.
        for i, tok in enumerate(tokens):
            # 'Ġ' is the byte-level BPE marker for a leading space; surface it
            # explicitly, then escape so tokens render as literal text.
            display_token = html.escape(tok.replace('Ġ', '<space>'))
            colored_tokens.append(f'<span style="background-color: {colors[i]}; color: white; padding: 2px 4px; border-radius: 3px; margin: 2px; display: inline-block;">{display_token}</span>')
        return token_count, "".join(colored_tokens)
    except Exception as e:
        # Best-effort error reporting: the message string lands in the first
        # (Number) output, which Gradio displays as text.
        return f"Error: {str(e)}", ""
# Wire up the Gradio UI: model ID, text, and optional auth token in;
# token count and colored token HTML out.
demo = gr.Interface(
    fn=tokenize_text,
    inputs=[
        gr.Textbox(
            label="Hugging Face Model ID",
            placeholder="unsloth/gemma-3-27b-it",
            value="unsloth/gemma-3-27b-it",
        ),
        gr.Textbox(label="Text to Tokenize", lines=5, placeholder="Enter your text here..."),
        gr.Textbox(label="HuggingFace Token (optional)", placeholder="hf_...", lines=1),
    ],
    outputs=[
        gr.Number(label="Token Count"),
        gr.HTML(label="Tokens", container=True, show_label=True),
    ],
    title="HuggingFace Tokenizer",
    description="Enter a HuggingFace model ID and text to see how it gets tokenized. Provide a huggingface token if the model is gated.",
    allow_flagging="never",
)

demo.launch()