"""Gradio Space exposing four dispatchAI helper tools, also served over MCP.

The four public functions below are wired to UI tabs and, because the app is
launched with ``mcp_server=True``, are offered to MCP clients as tools. Each
returns a JSON string so the result can be shown in a textbox unchanged.
"""

import json

import gradio as gr

# Import all tool functions
# (Inlined for Space simplicity)

# Model catalog. RAM_MB is compared against the caller's RAM budget in
# recommend_model(); Speed_tps is reported back as "speed_tps".
MODELS = [
    {"Model": "SmolLM2-135M-Instruct-mobile", "Params": "135M", "Size_MB": 270, "RAM_MB": 400, "Task": "Chat", "Quant": "FP16", "Speed_tps": 25.5},
    {"Model": "SmolLM2-360M-Instruct-mobile", "Params": "360M", "Size_MB": 720, "RAM_MB": 700, "Task": "Chat", "Quant": "FP16", "Speed_tps": 21.0},
    {"Model": "Qwen2.5-0.5B-Instruct-mobile-int4", "Params": "500M", "Size_MB": 350, "RAM_MB": 550, "Task": "Chat", "Quant": "INT4", "Speed_tps": 20.0},
    {"Model": "Llama-3.2-1B-Instruct-Q4-mobile", "Params": "1B", "Size_MB": 700, "RAM_MB": 1100, "Task": "Chat", "Quant": "Q4", "Speed_tps": 18.2},
    {"Model": "Llama-3.2-3B-Instruct-Q5-mobile", "Params": "3B", "Size_MB": 2100, "RAM_MB": 2700, "Task": "Chat", "Quant": "Q5", "Speed_tps": 8.5},
    {"Model": "Gemma-2B-Arabic-mobile", "Params": "2B", "Size_MB": 5000, "RAM_MB": 5500, "Task": "Arabic", "Quant": "FP16", "Speed_tps": 8.0},
    {"Model": "Llama-3.2-1B-FunctionCall-mobile", "Params": "1B", "Size_MB": 2500, "RAM_MB": 3000, "Task": "Function Call", "Quant": "FP16", "Speed_tps": 12.0},
]

# Tokens per second, keyed by parameter-size label and then quantization.
LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q4_K_M": 32.0, "Q8_0": 28.2},
    "500M": {"FP16": 20.0, "Q4_K_M": 26.8, "INT4": 20.0},
    "1B": {"FP16": 12.0, "Q4_K_M": 18.2, "Q5_K_M": 17.5},
    "2B": {"FP16": 8.0, "Q5_K_M": 12.0, "Q4_K_M": 12.8},
    "3B": {"FP16": 5.5, "Q5_K_M": 8.5, "Q4_K_M": 9.0},
}

# Flat annual on-device cost used by calculate_savings().
# NOTE(review): the basis for 0.5 is not shown in this file - confirm.
DEVICE_ANNUAL_COST = 0.5

# Multipliers for the suffix of a parameter-size label such as "135M" or "1B".
_PARAM_UNITS = {"K": 1e3, "M": 1e6, "B": 1e9}

INTRO_MARKDOWN = """
# 🧰 dispatchAI MCP Tool Hub

**One Space, four tools.** Add to Claude Desktop / Cursor / any MCP client.

| Tool | Description |
|------|-------------|
| `recommend_model` | Find best model for your phone |
| `estimate_latency` | Predict inference speed |
| `calculate_savings` | Cloud vs on-device cost |
| `search_models` | Search dispatchAI catalog |
"""


def _param_count(label: str) -> float:
    """Convert a parameter-size label ("135M", "1B") into a plain number."""
    text = label.strip().upper()
    unit = _PARAM_UNITS.get(text[-1:])
    if unit is None:
        return float(text)
    return float(text[:-1]) * unit


def recommend_model(ram_mb: int, task: str = "Any") -> str:
    """Recommend the best dispatchAI mobile model for a given RAM budget and task.

    Of the catalog models whose RAM requirement fits the budget (and whose task
    matches, unless the task is "Any"), the one with the most parameters is
    chosen. Ties are broken by higher tokens/sec, then by smaller download size.

    Args:
        ram_mb: Available RAM in MB
        task: Task type (Chat, Arabic, Function Call, or Any); case-insensitive

    Returns:
        JSON with recommended model, or JSON with an "error" key if nothing fits
    """
    # A missing/empty task behaves like "Any" instead of matching nothing.
    wanted = (task or "Any").strip().lower()
    any_task = wanted in ("", "any")

    candidates = [
        m for m in MODELS
        if m["RAM_MB"] <= ram_mb and (any_task or m["Task"].lower() == wanted)
    ]
    if not candidates:
        # Name the task filter so the caller can tell a RAM problem from an
        # unknown or unavailable task.
        scope = "" if any_task else f" for task '{task}'"
        return json.dumps({"error": f"No models fit in {ram_mb}MB{scope}"})

    # Picking the smallest file would return the same tiny model for almost
    # every budget; prefer the most capable model that still fits.
    best = max(
        candidates,
        key=lambda m: (_param_count(m["Params"]), m["Speed_tps"], -m["Size_MB"]),
    )
    return json.dumps({
        "recommended": best["Model"],
        "url": f"https://huggingface.co/dispatchAI/{best['Model']}",
        "size_mb": best["Size_MB"],
        "ram_mb": best["RAM_MB"],
        "speed_tps": best["Speed_tps"],
    }, indent=2)


def estimate_latency(params: str, quant: str = "Q4_K_M") -> str:
    """Estimate inference latency on Snapdragon 865.

    Args:
        params: Model size (e.g., "135M", "1B", "3B")
        quant: Quantization (FP16, Q4_K_M, Q5_K_M, Q8_0, INT4)

    Returns:
        JSON with tokens/sec and ms/token, or JSON with an "error" key when the
        size or the size/quantization pair has no entry in the latency table
    """
    params = params.strip().upper()
    quant = quant.strip().upper()

    if params not in LATENCY_DB:
        return json.dumps({"error": f"Unknown: {params}. Valid: {list(LATENCY_DB.keys())}"})

    by_quant = LATENCY_DB[params]
    if quant not in by_quant:
        # Do not invent a number: report which quantizations have an entry.
        return json.dumps({"error": f"Unknown: {quant} for {params}. Valid: {list(by_quant.keys())}"})

    tps = by_quant[quant]
    return json.dumps({
        "params": params,
        "quant": quant,
        "tokens_per_sec": tps,
        "ms_per_token": round(1000 / tps, 0),
        "hardware": "Snapdragon 865",
    }, indent=2)


def calculate_savings(daily_queries: int, cloud_cost_per_1k: float) -> str:
    """Calculate savings from on-device vs cloud inference.

    Args:
        daily_queries: Queries per day
        cloud_cost_per_1k: Cloud cost per 1000 queries

    Returns:
        JSON with cost comparison, or JSON with an "error" key for negative input
    """
    if daily_queries < 0 or cloud_cost_per_1k < 0:
        return json.dumps({"error": "daily_queries and cloud_cost_per_1k must be non-negative"})

    annual_cloud = daily_queries * 365 * cloud_cost_per_1k / 1000
    annual_device = DEVICE_ANNUAL_COST
    # With no cloud spend there is nothing to take a percentage of.
    savings_pct = round((1 - annual_device / annual_cloud) * 100, 1) if annual_cloud > 0 else 0
    return json.dumps({
        "cloud_annual": round(annual_cloud, 2),
        "device_annual": round(annual_device, 2),
        "savings": round(annual_cloud - annual_device, 2),
        "savings_pct": savings_pct,
    }, indent=2)


def search_models(query: str) -> str:
    """Search dispatchAI models by keyword.

    The keyword is matched case-insensitively as a substring of each model's
    name, task, quantization and parameter-size label. An empty query matches
    every model.

    Args:
        query: Search term (e.g., "arabic", "1B", "fp16")

    Returns:
        JSON with matching models
    """
    q = (query or "").strip().lower()
    fields = ("Model", "Task", "Quant", "Params")
    matches = [m for m in MODELS if any(q in m[field].lower() for field in fields)]
    return json.dumps({"query": query, "matches": len(matches), "models": matches}, indent=2)


with gr.Blocks(title="dispatchAI MCP Hub") as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Tab("Recommend"):
        r_ram = gr.Slider(512, 8192, value=2048, label="RAM (MB)")
        r_task = gr.Dropdown(["Any", "Chat", "Arabic", "Function Call"], value="Any", label="Task")
        r_btn = gr.Button("Recommend")
        r_out = gr.Textbox(label="Result", lines=10)
        r_btn.click(fn=recommend_model, inputs=[r_ram, r_task], outputs=r_out)

    with gr.Tab("Latency"):
        l_p = gr.Dropdown(list(LATENCY_DB), value="1B", label="Params")
        l_q = gr.Dropdown(["FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4"], value="Q4_K_M", label="Quant")
        l_btn = gr.Button("Estimate")
        l_out = gr.Textbox(label="Result", lines=8)
        l_btn.click(fn=estimate_latency, inputs=[l_p, l_q], outputs=l_out)

    with gr.Tab("Cost"):
        c_dq = gr.Slider(100, 100000, value=10000, label="Daily Queries")
        c_cc = gr.Slider(0.1, 10, value=0.5, label="Cloud $/1K")
        c_btn = gr.Button("Calculate")
        c_out = gr.Textbox(label="Result", lines=8)
        c_btn.click(fn=calculate_savings, inputs=[c_dq, c_cc], outputs=c_out)

    with gr.Tab("Search"):
        s_q = gr.Textbox(label="Search Query", placeholder="arabic, coder, 1B...")
        s_btn = gr.Button("Search")
        s_out = gr.Textbox(label="Results", lines=10)
        s_btn.click(fn=search_models, inputs=s_q, outputs=s_out)

# `demo` stays at module level; only the blocking server start is guarded so
# importing this module does not launch the app.
if __name__ == "__main__":
    demo.launch(mcp_server=True)