import json

import gradio as gr

# Baseline generation throughput in tokens/sec (consumed as `base_tps` in
# estimate_latency), estimated for Snapdragon 865 (Samsung S20 FE, 8GB RAM,
# llama.cpp 4 threads). Keyed by model parameter count, then quantization level.
LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q8_0": 28.2, "Q5_K_M": 30.1, "Q4_K_M": 32.0, "Q3_K_M": 33.5, "Q2_K": 35.0},
    "300M": {"FP16": 22.0, "Q8_0": 24.5, "Q5_K_M": 26.0, "Q4_K_M": 27.5, "Q3_K_M": 28.5, "Q2_K": 29.5},
    "360M": {"FP16": 21.0, "Q8_0": 23.5, "Q5_K_M": 25.0, "Q4_K_M": 26.5, "Q3_K_M": 27.5, "Q2_K": 28.5},
    "500M": {"FP16": 20.0, "Q8_0": 24.0, "Q5_K_M": 25.5, "Q4_K_M": 26.8, "Q3_K_M": 27.5, "Q2_K": 28.5},
    "600M": {"FP16": 18.0, "Q8_0": 21.0, "Q5_K_M": 22.5, "Q4_K_M": 23.8, "Q3_K_M": 24.5, "Q2_K": 25.5},
    "1B": {"FP16": 12.0, "Q8_0": 15.5, "Q5_K_M": 17.5, "Q4_K_M": 18.2, "Q3_K_M": 19.0, "Q2_K": 20.0},
    "1.1B": {"FP16": 11.5, "Q8_0": 14.8, "Q5_K_M": 17.0, "Q4_K_M": 17.7, "Q3_K_M": 18.5, "Q2_K": 19.5},
    "1.5B": {"FP16": 10.5, "Q8_0": 13.0, "Q5_K_M": 14.5, "Q4_K_M": 15.2, "Q3_K_M": 16.0, "Q2_K": 17.0},
    "1.7B": {"FP16": 9.0, "Q8_0": 11.5, "Q5_K_M": 13.0, "Q4_K_M": 13.8, "Q3_K_M": 14.5, "Q2_K": 15.5},
    "1.9B": {"FP16": 8.5, "Q8_0": 11.0, "Q5_K_M": 12.5, "Q4_K_M": 13.2, "Q3_K_M": 14.0, "Q2_K": 15.0},
    "2B": {"FP16": 8.0, "Q8_0": 10.5, "Q5_K_M": 12.0, "Q4_K_M": 12.8, "Q3_K_M": 13.5, "Q2_K": 14.5},
    "3B": {"FP16": 5.5, "Q8_0": 7.0, "Q5_K_M": 8.5, "Q4_K_M": 9.0, "Q3_K_M": 9.8, "Q2_K": 10.5},
    "3.8B": {"FP16": 4.5, "Q8_0": 6.0, "Q5_K_M": 7.5, "Q4_K_M": 8.2, "Q3_K_M": 8.8, "Q2_K": 9.5},
    "7B": {"FP16": 2.5, "Q8_0": 3.5, "Q5_K_M": 4.5, "Q4_K_M": 5.0, "Q3_K_M": 5.5, "Q2_K": 6.0},
}

# Quantization levels offered in the UI; every LATENCY_DB row has exactly these keys.
QUANT_LEVELS = ("FP16", "Q8_0", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K")

# Hardware profiles: "multiplier" scales the Snapdragon 865 baseline throughput.
HARDWARE = {
    "snapdragon_865": {"name": "Snapdragon 865 (Samsung S20 FE)", "multiplier": 1.0},
    "snapdragon_8_gen_2": {"name": "Snapdragon 8 Gen 2 (Galaxy S23)", "multiplier": 1.8},
    "snapdragon_8_gen_3": {"name": "Snapdragon 8 Gen 3 (Galaxy S24)", "multiplier": 2.2},
    "apple_a17": {"name": "Apple A17 Pro (iPhone 15 Pro)", "multiplier": 2.5},
    "apple_m2": {"name": "Apple M2 (MacBook)", "multiplier": 3.0},
    "snapdragon_778g": {"name": "Snapdragon 778G (Mid-range)", "multiplier": 0.7},
    "mediatek_dimensity_9200": {"name": "MediaTek Dimensity 9200", "multiplier": 1.6},
}

# RAM estimate tables (MB). BASE_RAM_MB is scaled by the per-quantization factor
# in RAM_MULTIPLIER, where FP16 is the 1.0 reference.
RAM_MULTIPLIER = {
    "FP16": 1.0, "Q8_0": 0.6, "Q5_K_M": 0.45, "Q4_K_M": 0.38, "Q3_K_M": 0.32, "Q2_K": 0.28,
}
BASE_RAM_MB = {
    "135M": 400, "300M": 600, "360M": 700, "500M": 1000, "600M": 1200,
    "1B": 2500, "1.1B": 2700, "1.5B": 3000, "1.7B": 3400, "1.9B": 3800,
    "2B": 5000, "3B": 6000, "3.8B": 7600, "7B": 14000,
}


def estimate_latency(params: str, quant: str = "Q4_K_M", hardware: str = "snapdragon_865",
                     prompt_tokens: int = 100, generate_tokens: int = 100) -> str:
    """Estimate on-device inference latency for a mobile LLM.

    Use this tool when a user asks "how fast will model X be on my phone" or
    "what's the latency of this model on mobile hardware". Returns tokens/sec,
    time per token, and total generation time.

    Args:
        params: Model parameter count as string - e.g., "135M", "500M", "1B", "1.5B", "3B", "7B"
        quant: Quantization level - one of: FP16, Q8_0, Q5_K_M, Q4_K_M, Q3_K_M, Q2_K
        hardware: Target hardware - one of: snapdragon_865, snapdragon_8_gen_2, snapdragon_8_gen_3, apple_a17, apple_m2, snapdragon_778g, mediatek_dimensity_9200
        prompt_tokens: Number of input tokens to process
        generate_tokens: Number of tokens to generate

    Returns:
        JSON string with latency estimates, or a JSON object with an "error" key for invalid input
    """
    # Normalise user-supplied identifiers so that e.g. " 1b ", "q4_k_m" and
    # "Snapdragon_865" resolve to the table keys.
    params = params.strip().upper()
    if params not in LATENCY_DB:
        return json.dumps({"error": f"Unknown param count: {params}. Valid: {list(LATENCY_DB.keys())}"})

    quant = quant.strip().upper()
    if quant not in LATENCY_DB[params]:
        return json.dumps({"error": f"Unknown quant: {quant}. Valid: {list(LATENCY_DB[params].keys())}"})

    hardware = hardware.strip().lower()
    if hardware not in HARDWARE:
        return json.dumps({"error": f"Unknown hardware: {hardware}. Valid: {list(HARDWARE.keys())}"})

    # Negative counts would silently yield negative durations; reject them.
    if prompt_tokens < 0 or generate_tokens < 0:
        return json.dumps({"error": "prompt_tokens and generate_tokens must be non-negative"})

    base_tps = LATENCY_DB[params][quant]
    hw_mult = HARDWARE[hardware]["multiplier"]
    actual_tps = base_tps * hw_mult

    # Prompt processing is ~2x faster than generation
    prompt_tps = actual_tps * 2.0
    prompt_time = prompt_tokens / prompt_tps
    gen_time = generate_tokens / actual_tps
    total_time = prompt_time + gen_time

    # RAM estimate. The .get() defaults only apply if the RAM tables ever drift
    # out of sync with LATENCY_DB; with the current tables every key is present.
    ram_mb = BASE_RAM_MB.get(params, 3000) * RAM_MULTIPLIER.get(quant, 0.5)

    return json.dumps({
        "model_params": params,
        "quantization": quant,
        "hardware": HARDWARE[hardware]["name"],
        "generation_speed_tps": round(actual_tps, 1),
        "prompt_speed_tps": round(prompt_tps, 1),
        "latency_ms_per_token": round(1000 / actual_tps, 0),
        "prompt_processing_time_s": round(prompt_time, 2),
        "generation_time_s": round(gen_time, 2),
        "total_inference_time_s": round(total_time, 2),
        "ram_required_mb": round(ram_mb),
        "suitable_for_realtime": actual_tps > 10,
        "suitable_for_phone": ram_mb < 4000 and actual_tps > 2,
    }, indent=2)


def list_supported_hardware() -> str:
    """List all supported hardware profiles for latency estimation.

    Returns:
        JSON string with hardware names and relative speed multipliers
    """
    return json.dumps({
        hw: {"name": v["name"], "speed_multiplier": v["multiplier"]}
        for hw, v in HARDWARE.items()
    }, indent=2)


# NOTE(review): list_supported_hardware is not attached to any event below, so
# it is presumably not reachable from the UI or as an MCP tool - confirm whether
# it should be wired up.
with gr.Blocks(title="dispatchAI Latency Estimator MCP") as demo:
    gr.Markdown("## ⚡ dispatchAI Latency Estimator (MCP Tool)")
    with gr.Row():
        params = gr.Dropdown(list(LATENCY_DB.keys()), value="1B", label="Model Params")
        quant = gr.Dropdown(list(QUANT_LEVELS), value="Q4_K_M", label="Quantization")
        hw = gr.Dropdown(list(HARDWARE.keys()), value="snapdragon_865", label="Hardware")
    with gr.Row():
        prompt_tok = gr.Slider(10, 1000, value=100, label="Prompt Tokens")
        gen_tok = gr.Slider(10, 500, value=100, label="Generate Tokens")
    btn = gr.Button("Estimate Latency", variant="primary")
    out = gr.Textbox(label="Estimate (JSON)", lines=15)
    btn.click(fn=estimate_latency, inputs=[params, quant, hw, prompt_tok, gen_tok], outputs=out)

demo.launch(mcp_server=True)