# Spaces: Runtime error
| import gradio as gr | |
| import json | |
| import re | |
def tokenize_text(text: str, method: str = "wordpiece") -> str:
    """Tokenize ``text`` with a toy tokenizer and return a JSON report.

    Args:
        text: The string to tokenize.
        method: One of ``"whitespace"``, ``"character"``, ``"wordpiece"``
            or ``"bpe"``.

    Returns:
        A JSON string. On success it contains ``method``, ``token_count``,
        ``char_count``, ``tokens`` (at most the first 50) and
        ``avg_token_length`` (rounded to one decimal; ``##`` continuation
        markers are counted as part of a token's length). When ``text`` is
        empty or ``method`` is not recognised, it contains a single
        ``error`` key instead.
    """
    if not text:
        return json.dumps({"error": "No text provided"})

    if method == "whitespace":
        tokens = text.split()
    elif method == "character":
        tokens = list(text)
    elif method == "wordpiece":
        # Fixed-width approximation: runs of non-space (or space) characters
        # longer than 4 are cut into 4-character pieces, and every piece
        # after the first is marked with "##".
        tokens = []
        for chunk in re.findall(r"\S+|\s+", text):
            if len(chunk) <= 4:
                tokens.append(chunk)
            else:
                tokens.extend(
                    ("##" if start > 0 else "") + chunk[start:start + 4]
                    for start in range(0, len(chunk), 4)
                )
    elif method == "bpe":
        # Purely alphabetic words longer than 6 characters are split in
        # half; everything else (short words, digits, punctuation, spaces)
        # is kept as a single token.
        tokens = []
        for chunk in re.findall(r"\w+|\W", text):
            if chunk.isalpha() and len(chunk) > 6:
                mid = len(chunk) // 2
                tokens.append(chunk[:mid])
                tokens.append("##" + chunk[mid:])
            else:
                tokens.append(chunk)
    else:
        # Without this branch ``tokens`` would be unbound below and the
        # call would fail with UnboundLocalError instead of a JSON reply.
        return json.dumps({"error": f"Unknown method: {method}"})

    # ``tokens`` can be empty (e.g. whitespace-only text with the
    # "whitespace" method), so guard the average against division by zero.
    avg_length = (
        round(sum(len(t) for t in tokens) / len(tokens), 1) if tokens else 0
    )
    return json.dumps({
        "method": method,
        "token_count": len(tokens),
        "char_count": len(text),
        "tokens": tokens[:50],
        "avg_token_length": avg_length,
    }, indent=2)
def compare_methods(text: str) -> str:
    """Report how many tokens each tokenization method yields for ``text``.

    Returns a JSON string with ``char_count`` and a ``comparison`` mapping
    from method name ("whitespace", "character", "wordpiece", "bpe") to its
    ``token_count`` and ``tokens_per_char`` (rounded to three decimals, or
    0 when ``text`` is empty).
    """

    def by_whitespace(source):
        return source.split()

    def by_character(source):
        return list(source)

    def by_wordpiece(source):
        # Runs longer than 4 characters are cut into 4-character slices;
        # slices after the first carry a "##" prefix.
        pieces = []
        for run in re.findall(r"\S+|\s+", source):
            if len(run) > 4:
                for offset in range(0, len(run), 4):
                    marker = "##" if offset > 0 else ""
                    pieces.append(marker + run[offset:offset + 4])
            else:
                pieces.append(run)
        return pieces

    def by_bpe(source):
        # Alphabetic words longer than 6 characters are halved; any other
        # match is kept whole.
        pieces = []
        for run in re.findall(r"\w+|\W", source):
            if run.isalpha() and len(run) > 6:
                half = len(run) // 2
                pieces += [run[:half], "##" + run[half:]]
            else:
                pieces.append(run)
        return pieces

    splitters = {
        "whitespace": by_whitespace,
        "character": by_character,
        "wordpiece": by_wordpiece,
        "bpe": by_bpe,
    }

    summary = {}
    for label, split in splitters.items():
        count = len(split(text))
        if text:
            density = round(count / len(text), 3)
        else:
            density = 0
        summary[label] = {"token_count": count, "tokens_per_char": density}

    report = {"char_count": len(text), "comparison": summary}
    return json.dumps(report, indent=2)
# Gradio UI: two tabs, one per function defined in this file.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="dispatchAI Tokenizer Visualizer") as demo:
    gr.Markdown("# 🔤 dispatchAI Tokenizer Visualizer (MCP)")

    # Tab 1: tokenize a text with a single, user-selected method.
    with gr.Tab("Visualize"):
        v_text = gr.Textbox(label="Text", value="The quick brown fox jumps over the lazy dog.", lines=3)
        v_method = gr.Dropdown(["whitespace", "character", "wordpiece", "bpe"], value="wordpiece", label="Method")
        v_btn = gr.Button("Tokenize", variant="primary")
        v_out = gr.Textbox(label="Tokens (JSON)", lines=15)
        v_btn.click(fn=tokenize_text, inputs=[v_text, v_method], outputs=v_out)

    # Tab 2: token counts for the same text across all four methods.
    with gr.Tab("Compare"):
        c_text = gr.Textbox(label="Text", value="The quick brown fox.", lines=3)
        c_btn = gr.Button("Compare Methods", variant="primary")
        c_out = gr.Textbox(label="Comparison (JSON)", lines=12)
        c_btn.click(fn=compare_methods, inputs=c_text, outputs=c_out)

    gr.Markdown("---\n🚀 [dispatchAI](https://huggingface.co/dispatchAI)")

# Only start the server when this file is executed as a script, so that
# importing the module (e.g. to reuse or test the functions above) does not
# launch a web server as a side effect. ``demo`` stays a module-level name.
if __name__ == "__main__":
    # mcp_server=True asks Gradio to start its MCP server alongside the UI
    # (see the Gradio MCP guide for the required extras).
    demo.launch(mcp_server=True)