Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import json | |
# Tool functions are defined inline below rather than imported,
# for Space simplicity.
# Catalog of dispatchAI mobile models. Every entry is a dict with the keys
# listed in _MODEL_FIELDS, in that order (the order is visible in the JSON
# emitted by search_models). Size_MB / RAM_MB are in megabytes; Speed_tps is
# presumably tokens per second, matching LATENCY_DB.
_MODEL_FIELDS = ("Model", "Params", "Size_MB", "RAM_MB", "Task", "Quant", "Speed_tps")
MODELS = [
    dict(zip(_MODEL_FIELDS, row))
    for row in (
        ("SmolLM2-135M-Instruct-mobile", "135M", 270, 400, "Chat", "FP16", 25.5),
        ("SmolLM2-360M-Instruct-mobile", "360M", 720, 700, "Chat", "FP16", 21.0),
        ("Qwen2.5-0.5B-Instruct-mobile-int4", "500M", 350, 550, "Chat", "INT4", 20.0),
        ("Llama-3.2-1B-Instruct-Q4-mobile", "1B", 700, 1100, "Chat", "Q4", 18.2),
        ("Llama-3.2-3B-Instruct-Q5-mobile", "3B", 2100, 2700, "Chat", "Q5", 8.5),
        ("Gemma-2B-Arabic-mobile", "2B", 5000, 5500, "Arabic", "FP16", 8.0),
        ("Llama-3.2-1B-FunctionCall-mobile", "1B", 2500, 3000, "Function Call", "FP16", 12.0),
    )
]
# Speed table used by estimate_latency: parameter-count label -> quantization
# label -> tokens per second. The keys also populate the "Params" dropdown in
# the UI.
LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q4_K_M": 32.0, "Q8_0": 28.2},
    # The catalog lists a 360M model (FP16, 21.0 tps); without this row
    # estimate_latency("360M") reported the size as unknown.
    "360M": {"FP16": 21.0},
    "500M": {"FP16": 20.0, "Q4_K_M": 26.8, "INT4": 20.0},
    "1B": {"FP16": 12.0, "Q4_K_M": 18.2, "Q5_K_M": 17.5},
    "2B": {"FP16": 8.0, "Q5_K_M": 12.0, "Q4_K_M": 12.8},
    "3B": {"FP16": 5.5, "Q5_K_M": 8.5, "Q4_K_M": 9.0},
}
def recommend_model(ram_mb: int, task: str = "Any") -> str:
    """Recommend the smallest dispatchAI mobile model that fits a RAM budget.

    Args:
        ram_mb: Available RAM in MB
        task: Task type (Any, Chat, Arabic, Function Call), matched case-insensitively

    Returns:
        JSON with the recommended model (name, URL, size, RAM, speed), or a
        JSON object with an "error" key when no catalog entry qualifies.
    """
    # A missing/blank task means "no task filter"; comparing lowercased text
    # lets clients send "chat" or "ANY" without getting an empty result.
    wanted = (task or "Any").strip().lower()
    any_task = wanted in ("", "any")

    candidates = [
        m for m in MODELS
        if m["RAM_MB"] <= ram_mb and (any_task or m["Task"].lower() == wanted)
    ]
    if not candidates:
        # Name the task when it was part of the filter, so the caller can tell
        # a RAM shortfall from an unknown or unavailable task.
        if any_task:
            return json.dumps({"error": f"No models fit in {ram_mb}MB"})
        return json.dumps({"error": f"No {task} models fit in {ram_mb}MB"})

    # "Best" here means smallest Size_MB; min() keeps the first catalog entry
    # on ties, exactly like a stable sort followed by [0].
    best = min(candidates, key=lambda m: m["Size_MB"])
    return json.dumps({
        "recommended": best["Model"],
        "url": f"https://huggingface.co/dispatchAI/{best['Model']}",
        "size_mb": best["Size_MB"],
        "ram_mb": best["RAM_MB"],
        "speed_tps": best["Speed_tps"],
    }, indent=2)
def estimate_latency(params: str, quant: str = "Q4_K_M") -> str:
    """Estimate inference latency on Snapdragon 865.

    Args:
        params: Model size (e.g., "135M", "1B", "3B"), case-insensitive
        quant: Quantization (FP16, Q4_K_M, Q5_K_M, Q8_0, INT4), case-insensitive

    Returns:
        JSON with tokens per second and milliseconds per token. If the size is
        not in the table, a JSON object with an "error" key. If the size is
        known but the quantization has no entry for it, a fallback of 10.0
        tokens/sec is used and a "note" key says so.
    """
    size_key = params.strip().upper()
    if size_key not in LATENCY_DB:
        return json.dumps({"error": f"Unknown: {size_key}. Valid: {list(LATENCY_DB.keys())}"})

    quant_key = quant.strip().upper()
    table = LATENCY_DB[size_key]
    tps = table.get(quant_key, 10.0)

    result = {
        "params": size_key,
        "quant": quant,
        "tokens_per_sec": tps,
        "ms_per_token": round(1000 / tps, 0),
        "hardware": "Snapdragon 865",
    }
    if quant_key not in table:
        # The fallback figure is not a table entry; say so instead of
        # presenting it as if it were.
        result["note"] = (
            f"No entry for {quant_key} at {size_key}; using fallback of 10.0 "
            f"tokens/sec. Available: {list(table)}"
        )
    return json.dumps(result, indent=2)
def calculate_savings(daily_queries: int, cloud_cost_per_1k: float, device_annual_cost: float = 0.5) -> str:
    """Calculate savings from on-device vs cloud inference.

    Args:
        daily_queries: Queries per day
        cloud_cost_per_1k: Cloud cost per 1000 queries
        device_annual_cost: Annual on-device cost, in the same currency (default 0.5)

    Returns:
        JSON with cost comparison (cloud_annual, device_annual, savings,
        savings_pct), or a JSON object with an "error" key if any input is
        negative. savings_pct is 0 when the annual cloud cost is zero.
    """
    if daily_queries < 0 or cloud_cost_per_1k < 0 or device_annual_cost < 0:
        return json.dumps({"error": "Inputs must be non-negative"})

    annual_cloud = daily_queries * 365 * cloud_cost_per_1k / 1000
    annual_device = device_annual_cost

    # Guard the ratio: with no cloud spend there is nothing to save against.
    if annual_cloud > 0:
        savings_pct = round((1 - annual_device / annual_cloud) * 100, 1)
    else:
        savings_pct = 0

    return json.dumps({
        "cloud_annual": round(annual_cloud, 2),
        "device_annual": round(annual_device, 2),
        "savings": round(annual_cloud - annual_device, 2),
        "savings_pct": savings_pct,
    }, indent=2)
def search_models(query: str) -> str:
    """Search dispatchAI models by keyword.

    Args:
        query: Search term (e.g., "arabic", "chat", "500M", "int4"), case-insensitive

    Returns:
        JSON with the original query, the number of matches, and the matching
        catalog entries. A blank query matches every model.
    """
    needle = query.strip().lower()
    # Params is searched too: a label such as "500M" does not appear in the
    # model name ("Qwen2.5-0.5B-...") and was previously unfindable.
    searched_fields = ("Model", "Params", "Task", "Quant")
    hits = [
        m for m in MODELS
        if any(needle in m[field].lower() for field in searched_fields)
    ]
    return json.dumps({"query": query, "matches": len(hits), "models": hits}, indent=2)
# Gradio UI: an intro panel followed by one tab per tool. Each tab wires its
# input components to the matching tool function and shows the returned JSON
# text in a textbox.
with gr.Blocks(title="dispatchAI MCP Hub") as demo:
    gr.Markdown("""
    # 🧰 dispatchAI MCP Tool Hub
    **One Space, four tools.** Add to Claude Desktop / Cursor / any MCP client.
    | Tool | Description |
    |------|-------------|
    | `recommend_model` | Find best model for your phone |
    | `estimate_latency` | Predict inference speed |
    | `calculate_savings` | Cloud vs on-device cost |
    | `search_models` | Search dispatchAI catalog |
    """)

    # recommend_model(ram_mb, task)
    with gr.Tab("Recommend"):
        r_ram = gr.Slider(minimum=512, maximum=8192, value=2048, label="RAM (MB)")
        r_task = gr.Dropdown(
            choices=["Any", "Chat", "Arabic", "Function Call"],
            value="Any",
            label="Task",
        )
        r_btn = gr.Button("Recommend")
        r_out = gr.Textbox(label="Result", lines=10)
        r_btn.click(fn=recommend_model, inputs=[r_ram, r_task], outputs=r_out)

    # estimate_latency(params, quant); size choices come from the latency table
    with gr.Tab("Latency"):
        l_p = gr.Dropdown(choices=list(LATENCY_DB.keys()), value="1B", label="Params")
        l_q = gr.Dropdown(
            choices=["FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4"],
            value="Q4_K_M",
            label="Quant",
        )
        l_btn = gr.Button("Estimate")
        l_out = gr.Textbox(label="Result", lines=8)
        l_btn.click(fn=estimate_latency, inputs=[l_p, l_q], outputs=l_out)

    # calculate_savings(daily_queries, cloud_cost_per_1k)
    with gr.Tab("Cost"):
        c_dq = gr.Slider(minimum=100, maximum=100000, value=10000, label="Daily Queries")
        c_cc = gr.Slider(minimum=0.1, maximum=10, value=0.5, label="Cloud $/1K")
        c_btn = gr.Button("Calculate")
        c_out = gr.Textbox(label="Result", lines=8)
        c_btn.click(fn=calculate_savings, inputs=[c_dq, c_cc], outputs=c_out)

    # search_models(query)
    with gr.Tab("Search"):
        s_q = gr.Textbox(label="Search Query", placeholder="arabic, coder, 1B...")
        s_btn = gr.Button("Search")
        s_out = gr.Textbox(label="Results", lines=10)
        s_btn.click(fn=search_models, inputs=s_q, outputs=s_out)

# Start the app with the MCP server option enabled.
demo.launch(mcp_server=True)