mcp-hub / app.py
3morixd's picture
Upload app.py with huggingface_hub
c89b420 verified
Raw
History Blame Contribute Delete
6.11 kB
import gradio as gr
import json
# Tool functions are inlined below instead of being imported
# (kept in this one file for Space simplicity).
# Catalog of dispatchAI mobile models consulted by the tool functions below.
# Every row in _MODEL_ROWS supplies one value per name in _MODEL_COLUMNS, in
# the same order. Size_MB and RAM_MB are megabyte figures; Speed_tps is
# presumably tokens per second (TODO confirm the benchmark hardware).
_MODEL_COLUMNS = ("Model", "Params", "Size_MB", "RAM_MB", "Task", "Quant", "Speed_tps")
_MODEL_ROWS = (
    ("SmolLM2-135M-Instruct-mobile", "135M", 270, 400, "Chat", "FP16", 25.5),
    ("SmolLM2-360M-Instruct-mobile", "360M", 720, 700, "Chat", "FP16", 21.0),
    ("Qwen2.5-0.5B-Instruct-mobile-int4", "500M", 350, 550, "Chat", "INT4", 20.0),
    ("Llama-3.2-1B-Instruct-Q4-mobile", "1B", 700, 1100, "Chat", "Q4", 18.2),
    ("Llama-3.2-3B-Instruct-Q5-mobile", "3B", 2100, 2700, "Chat", "Q5", 8.5),
    ("Gemma-2B-Arabic-mobile", "2B", 5000, 5500, "Arabic", "FP16", 8.0),
    ("Llama-3.2-1B-FunctionCall-mobile", "1B", 2500, 3000, "Function Call", "FP16", 12.0),
)
# One dict per model, keyed by column name (key order follows _MODEL_COLUMNS,
# which is also the order the keys appear in when a model is dumped to JSON).
MODELS = [dict(zip(_MODEL_COLUMNS, row)) for row in _MODEL_ROWS]
# Throughput table read by estimate_latency, which reports these values as
# tokens per second on a Snapdragon 865.
# Outer key: model size label; inner key: quantization name.
# Only the sizes and quantizations listed here have an entry; for example
# there is no row for "360M" even though MODELS contains a 360M model.
LATENCY_DB = {
    "135M": dict(FP16=25.5, Q4_K_M=32.0, Q8_0=28.2),
    "500M": dict(FP16=20.0, Q4_K_M=26.8, INT4=20.0),
    "1B": dict(FP16=12.0, Q4_K_M=18.2, Q5_K_M=17.5),
    "2B": dict(FP16=8.0, Q5_K_M=12.0, Q4_K_M=12.8),
    "3B": dict(FP16=5.5, Q5_K_M=8.5, Q4_K_M=9.0),
}
def recommend_model(ram_mb: int, task: str = "Any") -> str:
    """Recommend the best dispatchAI mobile model for a given RAM budget and task.

    Candidates are the MODELS entries whose RAM_MB does not exceed ``ram_mb``
    and, unless ``task`` is "Any", whose Task equals ``task`` (compared
    case-insensitively, ignoring surrounding whitespace). Among the
    candidates, the one with the smallest Size_MB is returned.

    Args:
        ram_mb: Available RAM in MB
        task: Task type (Chat, Code, Math, Arabic, Function Call, Vision, Embedding, Any)

    Returns:
        JSON with recommended model, or a JSON object with a single "error"
        key when no model satisfies the constraints
    """
    # An empty or missing task is treated like "Any" so a blank tool argument
    # does not filter out every model.
    task_label = (task or "Any").strip()
    wanted = task_label.lower()

    candidates = [m for m in MODELS if m["RAM_MB"] <= ram_mb]
    if wanted != "any":
        candidates = [m for m in candidates if m["Task"].lower() == wanted]

    if not candidates:
        # Name the task in the message when a task filter was applied, so the
        # caller is not told that RAM alone was the problem.
        if wanted == "any":
            message = f"No models fit in {ram_mb}MB"
        else:
            message = f"No {task_label} models fit in {ram_mb}MB"
        return json.dumps({"error": message})

    # NOTE(review): "best" is the smallest download among the fitting models,
    # so a larger RAM budget never selects a larger model. Confirm this is the
    # intended ranking.
    best = min(candidates, key=lambda m: m["Size_MB"])
    return json.dumps({
        "recommended": best["Model"],
        "url": f"https://huggingface.co/dispatchAI/{best['Model']}",
        "size_mb": best["Size_MB"],
        "ram_mb": best["RAM_MB"],
        "speed_tps": best["Speed_tps"],
    }, indent=2)
def estimate_latency(params: str, quant: str = "Q4_K_M") -> str:
    """Estimate inference latency on Snapdragon 865.

    Both arguments are normalized (surrounding whitespace removed,
    upper-cased) before the LATENCY_DB lookup, and the normalized values are
    echoed back in the result.

    Args:
        params: Model size (e.g., "135M", "1B", "3B")
        quant: Quantization (FP16, Q4_K_M, Q5_K_M, Q8_0, INT4)

    Returns:
        JSON with the speed estimate (tokens per second and milliseconds per
        token). If the size is unknown, the JSON holds a single "error" key.
        If the size is known but has no entry for the quantization, a generic
        fallback speed is used and a "note" key says so.
    """
    params = params.strip().upper()
    if params not in LATENCY_DB:
        return json.dumps({"error": f"Unknown: {params}. Valid: {list(LATENCY_DB.keys())}"})

    quant = quant.strip().upper()
    measured = LATENCY_DB[params]
    # Generic speed reported when this size has no entry for the requested
    # quantization; it is not a measurement.
    fallback_tps = 10.0
    tps = measured.get(quant, fallback_tps)

    result = {
        "params": params,
        "quant": quant,
        "tokens_per_sec": tps,
        "ms_per_token": round(1000 / tps, 0),
        "hardware": "Snapdragon 865",
    }
    if quant not in measured:
        # Make the fallback visible instead of presenting it as table data.
        result["note"] = (
            f"No entry for {params} with {quant}; tokens_per_sec is a generic "
            f"fallback. Available quantizations: {list(measured)}"
        )
    return json.dumps(result, indent=2)
def calculate_savings(daily_queries: int, cloud_cost_per_1k: float) -> str:
    """Calculate savings from on-device vs cloud inference.

    The annual cloud cost is ``daily_queries * 365 * cloud_cost_per_1k / 1000``.
    The on-device cost is a fixed annual figure of 0.5.

    Args:
        daily_queries: Queries per day
        cloud_cost_per_1k: Cloud cost per 1000 queries

    Returns:
        JSON with cost comparison (cloud_annual, device_annual, savings,
        savings_pct), or a JSON object with a single "error" key when either
        input is negative
    """
    # Negative volumes or prices would yield negative "costs"; reject them
    # with an error object like the other tools return.
    if daily_queries < 0 or cloud_cost_per_1k < 0:
        return json.dumps({"error": "daily_queries and cloud_cost_per_1k must be non-negative"})

    annual_cloud = daily_queries * 365 * cloud_cost_per_1k / 1000
    # NOTE(review): flat on-device figure; where the 0.5 comes from is not
    # documented in this file.
    annual_device = 0.5

    # With no cloud spend a percentage is undefined; report 0.0 so the field
    # is always a float.
    if annual_cloud > 0:
        savings_pct = round((1 - annual_device / annual_cloud) * 100, 1)
    else:
        savings_pct = 0.0

    return json.dumps({
        "cloud_annual": round(annual_cloud, 2),
        "device_annual": round(annual_device, 2),
        "savings": round(annual_cloud - annual_device, 2),
        "savings_pct": savings_pct,
    }, indent=2)
def search_models(query: str) -> str:
    """Search dispatchAI models by keyword.

    The keyword is matched case-insensitively as a substring of each model's
    Model, Params, Task and Quant fields. Surrounding whitespace in the query
    is ignored. An empty query matches every model.

    Args:
        query: Search term (e.g., "arabic", "coder", "1B", "quantized")

    Returns:
        JSON with matching models: the original query, the number of matches
        and the matching catalog entries
    """
    needle = (query or "").strip().lower()
    # Params is searched too, so a size label such as "500M" finds a model
    # whose name spells the size differently (e.g. "0.5B").
    searched_fields = ("Model", "Params", "Task", "Quant")
    matches = [
        model
        for model in MODELS
        if any(needle in model[field].lower() for field in searched_fields)
    ]
    return json.dumps({"query": query, "matches": len(matches), "models": matches}, indent=2)
# Gradio UI: one tab per tool. Each tab collects the tool's arguments, calls
# the tool function on button click and shows the returned JSON string in a
# textbox.
with gr.Blocks(title="dispatchAI MCP Hub") as demo:
    gr.Markdown("""
# 🧰 dispatchAI MCP Tool Hub
**One Space, four tools.** Add to Claude Desktop / Cursor / any MCP client.
| Tool | Description |
|------|-------------|
| `recommend_model` | Find best model for your phone |
| `estimate_latency` | Predict inference speed |
| `calculate_savings` | Cloud vs on-device cost |
| `search_models` | Search dispatchAI catalog |
""")

    # recommend_model(ram_mb, task)
    with gr.Tab("Recommend"):
        r_ram = gr.Slider(512, 8192, value=2048, label="RAM (MB)")
        r_task = gr.Dropdown(["Any", "Chat", "Arabic", "Function Call"], value="Any", label="Task")
        r_btn = gr.Button("Recommend")
        r_out = gr.Textbox(label="Result", lines=10)
        r_btn.click(fn=recommend_model, inputs=[r_ram, r_task], outputs=r_out)

    # estimate_latency(params, quant); the size choices come from LATENCY_DB.
    with gr.Tab("Latency"):
        l_p = gr.Dropdown(list(LATENCY_DB.keys()), value="1B", label="Params")
        l_q = gr.Dropdown(["FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4"], value="Q4_K_M", label="Quant")
        l_btn = gr.Button("Estimate")
        l_out = gr.Textbox(label="Result", lines=8)
        l_btn.click(fn=estimate_latency, inputs=[l_p, l_q], outputs=l_out)

    # calculate_savings(daily_queries, cloud_cost_per_1k)
    with gr.Tab("Cost"):
        c_dq = gr.Slider(100, 100000, value=10000, label="Daily Queries")
        c_cc = gr.Slider(0.1, 10, value=0.5, label="Cloud $/1K")
        c_btn = gr.Button("Calculate")
        c_out = gr.Textbox(label="Result", lines=8)
        c_btn.click(fn=calculate_savings, inputs=[c_dq, c_cc], outputs=c_out)

    # search_models(query)
    with gr.Tab("Search"):
        s_q = gr.Textbox(label="Search Query", placeholder="arabic, coder, 1B...")
        s_btn = gr.Button("Search")
        s_out = gr.Textbox(label="Results", lines=10)
        s_btn.click(fn=search_models, inputs=s_q, outputs=s_out)

# Start the server only when this file is run as a script, so importing the
# module (for example to call the tool functions directly) does not launch
# it. mcp_server=True asks Gradio to also serve the app as an MCP server.
if __name__ == "__main__":
    demo.launch(mcp_server=True)