3morixd's picture
Upload app.py with huggingface_hub
7cfa169 verified
Raw
History Blame Contribute Delete
6.29 kB
import gradio as gr
import json
# Baseline generation speed in tokens/sec, keyed by model size and then by
# quantization level; estimate_latency() reads an entry as its base throughput.
# Reference setup: Snapdragon 865 (Samsung S20 FE, 8GB RAM, llama.cpp 4 threads).
_QUANT_LEVELS = ("FP16", "Q8_0", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K")
_THROUGHPUT_ROWS = (
    # size,  FP16, Q8_0, Q5_K_M, Q4_K_M, Q3_K_M, Q2_K
    ("135M", 25.5, 28.2, 30.1, 32.0, 33.5, 35.0),
    ("300M", 22.0, 24.5, 26.0, 27.5, 28.5, 29.5),
    ("360M", 21.0, 23.5, 25.0, 26.5, 27.5, 28.5),
    ("500M", 20.0, 24.0, 25.5, 26.8, 27.5, 28.5),
    ("600M", 18.0, 21.0, 22.5, 23.8, 24.5, 25.5),
    ("1B", 12.0, 15.5, 17.5, 18.2, 19.0, 20.0),
    ("1.1B", 11.5, 14.8, 17.0, 17.7, 18.5, 19.5),
    ("1.5B", 10.5, 13.0, 14.5, 15.2, 16.0, 17.0),
    ("1.7B", 9.0, 11.5, 13.0, 13.8, 14.5, 15.5),
    ("1.9B", 8.5, 11.0, 12.5, 13.2, 14.0, 15.0),
    ("2B", 8.0, 10.5, 12.0, 12.8, 13.5, 14.5),
    ("3B", 5.5, 7.0, 8.5, 9.0, 9.8, 10.5),
    ("3.8B", 4.5, 6.0, 7.5, 8.2, 8.8, 9.5),
    ("7B", 2.5, 3.5, 4.5, 5.0, 5.5, 6.0),
)
# Expand each row into {quant: tokens_per_sec}, preserving row and column order.
LATENCY_DB = {
    size: dict(zip(_QUANT_LEVELS, speeds))
    for size, *speeds in _THROUGHPUT_ROWS
}
# Hardware profiles: each key maps to a display name and a speed multiplier.
# estimate_latency() scales the baseline tokens/sec by the multiplier, so
# snapdragon_865 (1.0) is the reference device.
HARDWARE = {
    key: {"name": label, "multiplier": factor}
    for key, label, factor in (
        ("snapdragon_865", "Snapdragon 865 (Samsung S20 FE)", 1.0),
        ("snapdragon_8_gen_2", "Snapdragon 8 Gen 2 (Galaxy S23)", 1.8),
        ("snapdragon_8_gen_3", "Snapdragon 8 Gen 3 (Galaxy S24)", 2.2),
        ("apple_a17", "Apple A17 Pro (iPhone 15 Pro)", 2.5),
        ("apple_m2", "Apple M2 (MacBook)", 3.0),
        ("snapdragon_778g", "Snapdragon 778G (Mid-range)", 0.7),
        ("mediatek_dimensity_9200", "MediaTek Dimensity 9200", 1.6),
    )
}
def estimate_latency(params: str, quant: str = "Q4_K_M", hardware: str = "snapdragon_865", prompt_tokens: int = 100, generate_tokens: int = 100) -> str:
    """Estimate on-device inference latency for a mobile LLM.

    Use this tool when a user asks "how fast will model X be on my phone" or
    "what's the latency of this model on mobile hardware". Returns tokens/sec,
    time per token, and total generation time.

    Args:
        params: Model parameter count as string - e.g., "135M", "500M", "1B", "1.5B", "3B", "7B"
        quant: Quantization level - one of: FP16, Q8_0, Q5_K_M, Q4_K_M, Q3_K_M, Q2_K
        hardware: Target hardware - one of: snapdragon_865, snapdragon_8_gen_2, snapdragon_8_gen_3, apple_a17, apple_m2, snapdragon_778g, mediatek_dimensity_9200
        prompt_tokens: Number of input tokens to process
        generate_tokens: Number of tokens to generate

    Returns:
        JSON string with latency estimates, or a JSON object with a single "error" key when an argument is invalid
    """
    # Normalize the lookup keys the same way for all three arguments: table
    # keys for sizes/quants are upper-case, hardware keys are lower-case.
    params = str(params).strip().upper()
    if params not in LATENCY_DB:
        return json.dumps({"error": f"Unknown param count: {params}. Valid: {list(LATENCY_DB.keys())}"})

    quant = str(quant).strip().upper()
    if quant not in LATENCY_DB[params]:
        return json.dumps({"error": f"Unknown quant: {quant}. Valid: {list(LATENCY_DB[params].keys())}"})

    hardware = str(hardware).strip().lower()
    if hardware not in HARDWARE:
        return json.dumps({"error": f"Unknown hardware: {hardware}. Valid: {list(HARDWARE.keys())}"})

    # Reject token counts that would yield negative, NaN, or infinite timings
    # (the chained comparison is False for NaN and for +/-inf).
    for label, count in (("prompt_tokens", prompt_tokens), ("generate_tokens", generate_tokens)):
        if not isinstance(count, (int, float)) or not 0 <= count < float("inf"):
            return json.dumps({"error": f"Invalid {label}: {count!r}. Must be a finite, non-negative number"})

    # Generation speed: baseline tokens/sec scaled by the hardware multiplier.
    base_tps = LATENCY_DB[params][quant]
    hw_mult = HARDWARE[hardware]["multiplier"]
    actual_tps = base_tps * hw_mult

    # Prompt processing is ~2x faster than generation
    prompt_tps = actual_tps * 2.0
    prompt_time = prompt_tokens / prompt_tps
    gen_time = generate_tokens / actual_tps
    total_time = prompt_time + gen_time

    # RAM estimate: a per-size base figure (MB) scaled by a per-quant factor.
    ram_mult = {"FP16": 1.0, "Q8_0": 0.6, "Q5_K_M": 0.45, "Q4_K_M": 0.38, "Q3_K_M": 0.32, "Q2_K": 0.28}
    base_ram = {"135M": 400, "300M": 600, "360M": 700, "500M": 1000, "600M": 1200,
                "1B": 2500, "1.1B": 2700, "1.5B": 3000, "1.7B": 3400, "1.9B": 3800,
                "2B": 5000, "3B": 6000, "3.8B": 7600, "7B": 14000}
    # The .get() fallbacks only matter if these tables drift out of sync with
    # LATENCY_DB; every currently validated key has an explicit entry.
    ram_mb = base_ram.get(params, 3000) * ram_mult.get(quant, 0.5)

    return json.dumps({
        "model_params": params,
        "quantization": quant,
        "hardware": HARDWARE[hardware]["name"],
        "generation_speed_tps": round(actual_tps, 1),
        "prompt_speed_tps": round(prompt_tps, 1),
        "latency_ms_per_token": round(1000 / actual_tps, 0),
        "prompt_processing_time_s": round(prompt_time, 2),
        "generation_time_s": round(gen_time, 2),
        "total_inference_time_s": round(total_time, 2),
        "ram_required_mb": round(ram_mb),
        # Thresholds: >10 tokens/sec counts as realtime; phone-suitable means
        # under 4000 MB of RAM and more than 2 tokens/sec.
        "suitable_for_realtime": actual_tps > 10,
        "suitable_for_phone": ram_mb < 4000 and actual_tps > 2,
    }, indent=2)
def list_supported_hardware() -> str:
    """Describe every hardware profile available for latency estimation.

    Returns:
        JSON string mapping each hardware key to its display name and relative speed multiplier
    """
    profiles = {}
    for key, spec in HARDWARE.items():
        # Re-key "multiplier" as "speed_multiplier" in the public listing.
        profiles[key] = {"name": spec["name"], "speed_multiplier": spec["multiplier"]}
    return json.dumps(profiles, indent=2)
# Gradio UI: three dropdowns and two sliders feed estimate_latency(), whose JSON
# result is shown in a textbox.
with gr.Blocks(title="dispatchAI Latency Estimator MCP") as demo:
    gr.Markdown("## ⚡ dispatchAI Latency Estimator (MCP Tool)")
    with gr.Row():
        params = gr.Dropdown(list(LATENCY_DB.keys()), value="1B", label="Model Params")
        quant = gr.Dropdown(["FP16", "Q8_0", "Q5_K_M", "Q4_K_M", "Q3_K_M", "Q2_K"], value="Q4_K_M", label="Quantization")
        hw = gr.Dropdown(list(HARDWARE.keys()), value="snapdragon_865", label="Hardware")
    with gr.Row():
        prompt_tok = gr.Slider(10, 1000, value=100, label="Prompt Tokens")
        gen_tok = gr.Slider(10, 500, value=100, label="Generate Tokens")
    btn = gr.Button("Estimate Latency", variant="primary")
    out = gr.Textbox(label="Estimate (JSON)", lines=15)
    # Input order must match estimate_latency's positional parameters.
    btn.click(fn=estimate_latency, inputs=[params, quant, hw, prompt_tok, gen_tok], outputs=out)

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse estimate_latency) does not launch a web server as a side effect.
if __name__ == "__main__":
    demo.launch(mcp_server=True)