Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import json | |
# Baseline generation speed in tokens/sec, estimated for a Snapdragon 865
# (Samsung S20 FE, 8GB RAM, llama.cpp 4 threads). Despite the "latency" name,
# the values are throughputs (higher is faster): estimate_latency() reads them
# as its base tokens-per-second figure before applying the hardware multiplier.
# Outer key: model parameter count; inner key: quantization level.
_QUANT_LEVELS = ("FP16", "Q8_0", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K")

# One row per model size; speeds are listed in the same order as _QUANT_LEVELS.
_BASELINE_TPS_ROWS = (
    ("135M", (25.5, 28.2, 30.1, 32.0, 33.5, 35.0)),
    ("300M", (22.0, 24.5, 26.0, 27.5, 28.5, 29.5)),
    ("360M", (21.0, 23.5, 25.0, 26.5, 27.5, 28.5)),
    ("500M", (20.0, 24.0, 25.5, 26.8, 27.5, 28.5)),
    ("600M", (18.0, 21.0, 22.5, 23.8, 24.5, 25.5)),
    ("1B", (12.0, 15.5, 17.5, 18.2, 19.0, 20.0)),
    ("1.1B", (11.5, 14.8, 17.0, 17.7, 18.5, 19.5)),
    ("1.5B", (10.5, 13.0, 14.5, 15.2, 16.0, 17.0)),
    ("1.7B", (9.0, 11.5, 13.0, 13.8, 14.5, 15.5)),
    ("1.9B", (8.5, 11.0, 12.5, 13.2, 14.0, 15.0)),
    ("2B", (8.0, 10.5, 12.0, 12.8, 13.5, 14.5)),
    ("3B", (5.5, 7.0, 8.5, 9.0, 9.8, 10.5)),
    ("3.8B", (4.5, 6.0, 7.5, 8.2, 8.8, 9.5)),
    ("7B", (2.5, 3.5, 4.5, 5.0, 5.5, 6.0)),
)

LATENCY_DB = {
    size: dict(zip(_QUANT_LEVELS, speeds)) for size, speeds in _BASELINE_TPS_ROWS
}
# Hardware profiles: profile key -> display name and a speed multiplier that
# estimate_latency() applies to the baseline tokens/sec figure. The
# Snapdragon 865 profile has multiplier 1.0.
HARDWARE = {
    profile_key: {"name": display_name, "multiplier": speed_factor}
    for profile_key, display_name, speed_factor in (
        ("snapdragon_865", "Snapdragon 865 (Samsung S20 FE)", 1.0),
        ("snapdragon_8_gen_2", "Snapdragon 8 Gen 2 (Galaxy S23)", 1.8),
        ("snapdragon_8_gen_3", "Snapdragon 8 Gen 3 (Galaxy S24)", 2.2),
        ("apple_a17", "Apple A17 Pro (iPhone 15 Pro)", 2.5),
        ("apple_m2", "Apple M2 (MacBook)", 3.0),
        ("snapdragon_778g", "Snapdragon 778G (Mid-range)", 0.7),
        ("mediatek_dimensity_9200", "MediaTek Dimensity 9200", 1.6),
    )
}
# Fraction of the FP16 footprint used at each quantization level.
_RAM_QUANT_FACTOR = {
    "FP16": 1.0,
    "Q8_0": 0.6,
    "Q5_K_M": 0.45,
    "Q4_K_M": 0.38,
    "Q3_K_M": 0.32,
    "Q2_K": 0.28,
}

# Estimated FP16 memory footprint in MB, keyed by model parameter count.
_RAM_FP16_MB = {
    "135M": 400, "300M": 600, "360M": 700, "500M": 1000, "600M": 1200,
    "1B": 2500, "1.1B": 2700, "1.5B": 3000, "1.7B": 3400, "1.9B": 3800,
    "2B": 5000, "3B": 6000, "3.8B": 7600, "7B": 14000,
}


def estimate_latency(params: str, quant: str = "Q4_K_M", hardware: str = "snapdragon_865", prompt_tokens: int = 100, generate_tokens: int = 100) -> str:
    """Estimate on-device inference latency for a mobile LLM.

    Use this tool when a user asks "how fast will model X be on my phone" or
    "what's the latency of this model on mobile hardware". Returns tokens/sec,
    time per token, and total generation time.

    Args:
        params: Model parameter count as string - e.g., "135M", "500M", "1B", "1.5B", "3B", "7B"
        quant: Quantization level - one of: FP16, Q8_0, Q5_K_M, Q4_K_M, Q3_K_M, Q2_K
        hardware: Target hardware - one of: snapdragon_865, snapdragon_8_gen_2, snapdragon_8_gen_3, apple_a17, apple_m2, snapdragon_778g, mediatek_dimensity_9200
        prompt_tokens: Number of input tokens to process (must be >= 0)
        generate_tokens: Number of tokens to generate (must be >= 0)

    Returns:
        JSON string with latency estimates, or a JSON object with a single
        "error" key when an argument is not recognised or is out of range.
    """
    # Normalise every lookup key the same way so that " 1b ", "q4_k_m" and
    # "Snapdragon_865" all resolve. Model sizes and quant names are stored
    # upper-case; hardware profile keys are stored lower-case.
    params = params.strip().upper()
    if params not in LATENCY_DB:
        return json.dumps({"error": f"Unknown param count: {params}. Valid: {list(LATENCY_DB.keys())}"})

    quant = quant.strip().upper()
    if quant not in LATENCY_DB[params]:
        return json.dumps({"error": f"Unknown quant: {quant}. Valid: {list(LATENCY_DB[params].keys())}"})

    hardware = hardware.strip().lower()
    if hardware not in HARDWARE:
        return json.dumps({"error": f"Unknown hardware: {hardware}. Valid: {list(HARDWARE.keys())}"})

    # A negative count would otherwise yield a negative duration.
    if prompt_tokens < 0 or generate_tokens < 0:
        return json.dumps({"error": "prompt_tokens and generate_tokens must be non-negative"})

    profile = HARDWARE[hardware]
    # Baseline tokens/sec scaled by the relative speed of the target hardware.
    actual_tps = LATENCY_DB[params][quant] * profile["multiplier"]
    # Prompt processing is ~2x faster than generation
    prompt_tps = actual_tps * 2.0

    prompt_time = prompt_tokens / prompt_tps
    gen_time = generate_tokens / actual_tps
    total_time = prompt_time + gen_time

    # RAM estimate: FP16 footprint scaled by the quantization factor. The
    # fallbacks only apply if a size or quant is added to LATENCY_DB without a
    # matching entry in the RAM tables.
    ram_mb = _RAM_FP16_MB.get(params, 3000) * _RAM_QUANT_FACTOR.get(quant, 0.5)

    return json.dumps({
        "model_params": params,
        "quantization": quant,
        "hardware": profile["name"],
        "generation_speed_tps": round(actual_tps, 1),
        "prompt_speed_tps": round(prompt_tps, 1),
        "latency_ms_per_token": round(1000 / actual_tps, 0),
        "prompt_processing_time_s": round(prompt_time, 2),
        "generation_time_s": round(gen_time, 2),
        "total_inference_time_s": round(total_time, 2),
        "ram_required_mb": round(ram_mb),
        "suitable_for_realtime": actual_tps > 10,
        "suitable_for_phone": ram_mb < 4000 and actual_tps > 2,
    }, indent=2)
def list_supported_hardware() -> str:
    """List all supported hardware profiles for latency estimation.

    Returns:
        JSON string mapping each hardware key to its display name and its
        relative speed multiplier
    """
    catalog = {}
    for key, profile in HARDWARE.items():
        catalog[key] = {
            "name": profile["name"],
            "speed_multiplier": profile["multiplier"],
        }
    return json.dumps(catalog, indent=2)
# Gradio UI. The button handler doubles as the MCP tool entry point for
# estimate_latency when the app is launched with mcp_server=True.
with gr.Blocks(title="dispatchAI Latency Estimator MCP") as demo:
    gr.Markdown("## ⚡ dispatchAI Latency Estimator (MCP Tool)")
    with gr.Row():
        params = gr.Dropdown(list(LATENCY_DB.keys()), value="1B", label="Model Params")
        quant = gr.Dropdown(["FP16", "Q8_0", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"], value="Q4_K_M", label="Quantization")
        hw = gr.Dropdown(list(HARDWARE.keys()), value="snapdragon_865", label="Hardware")
    with gr.Row():
        # Token counts are whole numbers, so pin the slider step explicitly.
        prompt_tok = gr.Slider(10, 1000, value=100, step=1, label="Prompt Tokens")
        gen_tok = gr.Slider(10, 500, value=100, step=1, label="Generate Tokens")
    btn = gr.Button("Estimate Latency", variant="primary")
    out = gr.Textbox(label="Estimate (JSON)", lines=15)
    btn.click(fn=estimate_latency, inputs=[params, quant, hw, prompt_tok, gen_tok], outputs=out)
    # list_supported_hardware has no UI of its own; without an explicit
    # registration it is never bound to an endpoint and so is not served as an
    # API/MCP tool. gr.api registers it as a UI-less endpoint.
    gr.api(list_supported_hardware)

demo.launch(mcp_server=True)