"""Gradio app that demonstrates four toy tokenization schemes.

The app exposes two callbacks, ``tokenize_text`` and ``compare_methods``, through
a two-tab Blocks UI and launches it with ``mcp_server=True``.

The "wordpiece" and "bpe" schemes here are fixed-rule approximations written
for illustration; they do not use a learned vocabulary or merge table.
"""

import json
import re

import gradio as gr


def _wordpiece_tokens(text):
    """Split every whitespace / non-whitespace run into pieces of at most 4 chars.

    Runs of 4 characters or fewer are kept whole. Longer runs are cut every
    4 characters and each piece after the first is prefixed with ``##``.
    Whitespace runs are tokens too, so they follow the same rule.
    """
    tokens = []
    for run in re.findall(r"\S+|\s+", text):
        if len(run) <= 4:
            tokens.append(run)
        else:
            tokens.extend(
                ("##" if start > 0 else "") + run[start:start + 4]
                for start in range(0, len(run), 4)
            )
    return tokens


def _bpe_tokens(text):
    """Split purely alphabetic words longer than 6 chars into two halves.

    The text is first cut into ``\\w+`` runs and single non-word characters.
    Only runs for which ``str.isalpha()`` is true and whose length exceeds 6
    are split (at ``len // 2``, second half prefixed with ``##``); every other
    piece is emitted unchanged.
    """
    tokens = []
    for piece in re.findall(r"\w+|\W", text):
        if piece.isalpha() and len(piece) > 6:
            mid = len(piece) // 2
            tokens.append(piece[:mid])
            tokens.append("##" + piece[mid:])
        else:
            tokens.append(piece)
    return tokens


# Method name -> callable returning the token list for a string.
# Insertion order is the order used by ``compare_methods`` and the dropdown.
_TOKENIZERS = {
    "whitespace": str.split,
    "character": list,
    "wordpiece": _wordpiece_tokens,
    "bpe": _bpe_tokens,
}


def tokenize_text(text: str, method: str = "wordpiece") -> str:
    """Tokenize text with one scheme and report the tokens with basic statistics.

    Args:
        text: The text to tokenize.
        method: Tokenization scheme; one of "whitespace", "character",
            "wordpiece" or "bpe".

    Returns:
        A JSON string. On success it holds ``method``, ``token_count``,
        ``char_count``, ``tokens`` (only the first 50 tokens) and
        ``avg_token_length`` (rounded to one decimal, 0 when there are no
        tokens). If ``text`` is empty or ``method`` is not recognised, it
        holds a single ``error`` key instead.
    """
    if not text:
        return json.dumps({"error": "No text provided"})

    tokenizer = _TOKENIZERS.get(method)
    if tokenizer is None:
        # Previously an unrecognised method fell through every branch and
        # crashed on the unbound ``tokens`` variable.
        choices = ", ".join(_TOKENIZERS)
        return json.dumps({"error": f"Unknown method '{method}'. Choose one of: {choices}"})

    tokens = tokenizer(text)
    total_length = sum(len(token) for token in tokens)
    return json.dumps({
        "method": method,
        "token_count": len(tokens),
        "char_count": len(text),
        # Cap the listing so very long inputs do not flood the output box;
        # token_count above still reflects the full token list.
        "tokens": tokens[:50],
        "avg_token_length": round(total_length / len(tokens), 1) if tokens else 0,
    }, indent=2)


def compare_methods(text: str) -> str:
    """Compare how many tokens each tokenization scheme produces for a text.

    Args:
        text: The text to tokenize with every available scheme.

    Returns:
        A JSON string with ``char_count`` and a ``comparison`` object that
        maps each method name to its ``token_count`` and ``tokens_per_char``
        (rounded to three decimals, 0 for empty text).
    """
    text = text or ""  # treat a missing value like empty text instead of crashing
    comparison = {}
    for name, tokenizer in _TOKENIZERS.items():
        count = len(tokenizer(text))
        comparison[name] = {
            "token_count": count,
            "tokens_per_char": round(count / len(text), 3) if text else 0,
        }
    return json.dumps({"char_count": len(text), "comparison": comparison}, indent=2)


# Emoji are written as escapes so the source stays pure ASCII and cannot be
# garbled again by a wrong-codepage round trip.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue"),
    title="dispatchAI Tokenizer Visualizer",
) as demo:
    gr.Markdown("# \U0001F524 dispatchAI Tokenizer Visualizer (MCP)")

    with gr.Tab("Visualize"):
        v_text = gr.Textbox(
            label="Text",
            value="The quick brown fox jumps over the lazy dog.",
            lines=3,
        )
        v_method = gr.Dropdown(
            ["whitespace", "character", "wordpiece", "bpe"],
            value="wordpiece",
            label="Method",
        )
        v_btn = gr.Button("Tokenize", variant="primary")
        v_out = gr.Textbox(label="Tokens (JSON)", lines=15)
        v_btn.click(fn=tokenize_text, inputs=[v_text, v_method], outputs=v_out)

    with gr.Tab("Compare"):
        c_text = gr.Textbox(label="Text", value="The quick brown fox.", lines=3)
        c_btn = gr.Button("Compare Methods", variant="primary")
        c_out = gr.Textbox(label="Comparison (JSON)", lines=12)
        c_btn.click(fn=compare_methods, inputs=c_text, outputs=c_out)

    # NOTE(review): footer placed below the tabs; the collapsed source does not
    # show whether it originally sat inside the "Compare" tab - confirm.
    gr.Markdown("---\n\U0001F680 [dispatchAI](https://huggingface.co/dispatchAI)")

# Launch stays at module level, as in the original, so the app starts as before.
demo.launch(mcp_server=True)