Spaces:

dispatchAI
/

mcp-hub

Runtime error

App Files Files Community

mcp-hub / app.py

3morixd

Upload app.py with huggingface_hub

c89b420 verified 1 day ago

Raw

History Blame Contribute Delete

6.11 kB

	import gradio as gr
	import json

	# Tool functions and their data tables are defined inline below
	# (rather than imported from separate modules) to keep the Space to one file.

	# Model catalog. Each row of _MODEL_ROWS lists its values in the order given
	# by _MODEL_COLUMNS; MODELS exposes the rows as dicts keyed by column name.
	# Size_MB and RAM_MB are in megabytes (per their names); RAM_MB is the value
	# recommend_model compares against the caller's RAM budget.
	_MODEL_COLUMNS = ("Model", "Params", "Size_MB", "RAM_MB", "Task", "Quant", "Speed_tps")
	_MODEL_ROWS = (
	    ("SmolLM2-135M-Instruct-mobile", "135M", 270, 400, "Chat", "FP16", 25.5),
	    ("SmolLM2-360M-Instruct-mobile", "360M", 720, 700, "Chat", "FP16", 21.0),
	    ("Qwen2.5-0.5B-Instruct-mobile-int4", "500M", 350, 550, "Chat", "INT4", 20.0),
	    ("Llama-3.2-1B-Instruct-Q4-mobile", "1B", 700, 1100, "Chat", "Q4", 18.2),
	    ("Llama-3.2-3B-Instruct-Q5-mobile", "3B", 2100, 2700, "Chat", "Q5", 8.5),
	    ("Gemma-2B-Arabic-mobile", "2B", 5000, 5500, "Arabic", "FP16", 8.0),
	    ("Llama-3.2-1B-FunctionCall-mobile", "1B", 2500, 3000, "Function Call", "FP16", 12.0),
	)
	MODELS = [dict(zip(_MODEL_COLUMNS, row)) for row in _MODEL_ROWS]

	# Speed lookup used by estimate_latency: parameter count -> quantization ->
	# the figure that function reports as "tokens_per_sec" (hardware label there
	# is "Snapdragon 865"). Not every quantization is listed for every size.
	LATENCY_DB = {
	    "135M": {
	        "FP16": 25.5,
	        "Q4_K_M": 32.0,
	        "Q8_0": 28.2,
	    },
	    "500M": {
	        "FP16": 20.0,
	        "Q4_K_M": 26.8,
	        "INT4": 20.0,
	    },
	    "1B": {
	        "FP16": 12.0,
	        "Q4_K_M": 18.2,
	        "Q5_K_M": 17.5,
	    },
	    "2B": {
	        "FP16": 8.0,
	        "Q5_K_M": 12.0,
	        "Q4_K_M": 12.8,
	    },
	    "3B": {
	        "FP16": 5.5,
	        "Q5_K_M": 8.5,
	        "Q4_K_M": 9.0,
	    },
	}

	def recommend_model(ram_mb: int, task: str = "Any") -> str:
	    """Recommend a dispatchAI mobile model for a given RAM budget and task.

	    Among catalog entries whose RAM_MB is at most ``ram_mb`` (and whose Task
	    matches, unless the task is "Any"), the entry with the smallest Size_MB
	    is returned.

	    Args:
	        ram_mb: Available RAM in MB
	        task: Task type as it appears in the catalog's Task field
	            (e.g. Chat, Arabic, Function Call), or Any. Matched
	            case-insensitively; empty/None is treated as Any.

	    Returns:
	        JSON with recommended model, or a JSON object with an "error" key
	        when no catalog entry satisfies the constraints
	    """
	    # Normalise so callers passing "chat" or " Chat " behave like the UI
	    # dropdown; the other tools in this file also normalise case.
	    label = (task or "Any").strip()
	    wanted = label.lower()
	    any_task = wanted in ("", "any")

	    candidates = [m for m in MODELS if m["RAM_MB"] <= ram_mb]
	    if not any_task:
	        candidates = [m for m in candidates if m["Task"].lower() == wanted]

	    if not candidates:
	        # Name the task when a task filter was applied, so the message does
	        # not blame the RAM budget alone for an unknown/unavailable task.
	        if any_task:
	            message = f"No models fit in {ram_mb}MB"
	        else:
	            message = f"No '{label}' models fit in {ram_mb}MB"
	        return json.dumps({"error": message})

	    # min() returns the first smallest entry, same as sorted(...)[0].
	    best = min(candidates, key=lambda m: m["Size_MB"])
	    return json.dumps({
	        "recommended": best["Model"],
	        "url": f"https://huggingface.co/dispatchAI/{best['Model']}",
	        "size_mb": best["Size_MB"],
	        "ram_mb": best["RAM_MB"],
	        "speed_tps": best["Speed_tps"],
	    }, indent=2)

	def estimate_latency(params: str, quant: str = "Q4_K_M") -> str:
	    """Estimate inference speed on Snapdragon 865 from the LATENCY_DB table.

	    Args:
	        params: Model size (e.g., "135M", "1B", "3B")
	        quant: Quantization (FP16, Q4_K_M, Q5_K_M, Q8_0, INT4)

	    Returns:
	        JSON with tokens per second and milliseconds per token. If the
	        table has no entry for the requested size/quantization pair, a
	        default of 10.0 tokens/sec is used and a "note" field says so.
	        Unknown sizes yield a JSON object with an "error" key.
	    """
	    fallback_tps = 10.0  # default used when the table lacks the pair

	    # Normalise both inputs the same way so lookups and the echoed values
	    # are consistent regardless of the caller's casing or padding.
	    params = params.strip().upper()
	    quant = quant.strip().upper()

	    if params not in LATENCY_DB:
	        return json.dumps({"error": f"Unknown: {params}. Valid: {list(LATENCY_DB.keys())}"})

	    by_quant = LATENCY_DB[params]
	    tps = by_quant.get(quant, fallback_tps)
	    result = {
	        "params": params,
	        "quant": quant,
	        "tokens_per_sec": tps,
	        "ms_per_token": round(1000 / tps, 0),
	        "hardware": "Snapdragon 865",
	    }
	    if quant not in by_quant:
	        # Make the substitution visible instead of presenting the default
	        # as if it were a table value.
	        result["note"] = (
	            f"No {quant} entry for {params}; using default "
	            f"{fallback_tps} tokens/sec. Available: {list(by_quant)}"
	        )
	    return json.dumps(result, indent=2)

	def calculate_savings(daily_queries: int, cloud_cost_per_1k: float) -> str:
	    """Compare a year of cloud inference spend with on-device inference.

	    Args:
	        daily_queries: Queries per day
	        cloud_cost_per_1k: Cloud cost per 1000 queries

	    Returns:
	        JSON with cost comparison (cloud_annual, device_annual, savings,
	        savings_pct)
	    """
	    device_cost = 0.5  # fixed annual on-device figure used by this tool
	    cloud_cost = daily_queries * 365 * cloud_cost_per_1k / 1000

	    if cloud_cost > 0:
	        pct_saved = round((1 - device_cost / cloud_cost) * 100, 1)
	    else:
	        # No positive cloud spend to compare against; avoids dividing by zero.
	        pct_saved = 0

	    report = {
	        "cloud_annual": round(cloud_cost, 2),
	        "device_annual": round(device_cost, 2),
	        "savings": round(cloud_cost - device_cost, 2),
	        "savings_pct": pct_saved,
	    }
	    return json.dumps(report, indent=2)

	def search_models(query: str) -> str:
	    """Search dispatchAI models by keyword.

	    Args:
	        query: Search term, matched case-insensitively as a substring of
	            each model's name, task, or quantization
	            (e.g., "arabic", "1B", "fp16")

	    Returns:
	        JSON with the original query, the match count, and matching models
	    """
	    needle = query.lower()
	    searched_fields = ("Model", "Task", "Quant")

	    hits = []
	    for entry in MODELS:
	        # A model matches as soon as any searched field contains the term.
	        if any(needle in entry[field].lower() for field in searched_fields):
	            hits.append(entry)

	    payload = {"query": query, "matches": len(hits), "models": hits}
	    return json.dumps(payload, indent=2)

	# Gradio UI: one tab per tool. Each tab wires its inputs to the matching
	# tool function and shows the returned JSON string in a textbox.
	with gr.Blocks(title="dispatchAI MCP Hub") as demo:
	    # Plain "|" is required for the Markdown table to render; a backslash
	    # before the pipe escapes it in Markdown and is also an invalid escape
	    # sequence in a non-raw Python string.
	    gr.Markdown("""
	    # 🧰 dispatchAI MCP Tool Hub

	    One Space, four tools. Add to Claude Desktop / Cursor / any MCP client.

	    | Tool | Description |
	    |------|-------------|
	    | `recommend_model` | Find best model for your phone |
	    | `estimate_latency` | Predict inference speed |
	    | `calculate_savings` | Cloud vs on-device cost |
	    | `search_models` | Search dispatchAI catalog |
	    """)

	    with gr.Tab("Recommend"):
	        r_ram = gr.Slider(512, 8192, value=2048, label="RAM (MB)")
	        # Choices mirror the Task values present in MODELS, plus "Any".
	        r_task = gr.Dropdown(["Any", "Chat", "Arabic", "Function Call"], value="Any", label="Task")
	        r_btn = gr.Button("Recommend")
	        r_out = gr.Textbox(label="Result", lines=10)
	        r_btn.click(fn=recommend_model, inputs=[r_ram, r_task], outputs=r_out)

	    with gr.Tab("Latency"):
	        # Sizes come straight from the lookup table so the two cannot drift.
	        l_p = gr.Dropdown(list(LATENCY_DB.keys()), value="1B", label="Params")
	        l_q = gr.Dropdown(["FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4"], value="Q4_K_M", label="Quant")
	        l_btn = gr.Button("Estimate")
	        l_out = gr.Textbox(label="Result", lines=8)
	        l_btn.click(fn=estimate_latency, inputs=[l_p, l_q], outputs=l_out)

	    with gr.Tab("Cost"):
	        c_dq = gr.Slider(100, 100000, value=10000, label="Daily Queries")
	        c_cc = gr.Slider(0.1, 10, value=0.5, label="Cloud $/1K")
	        c_btn = gr.Button("Calculate")
	        c_out = gr.Textbox(label="Result", lines=8)
	        c_btn.click(fn=calculate_savings, inputs=[c_dq, c_cc], outputs=c_out)

	    with gr.Tab("Search"):
	        s_q = gr.Textbox(label="Search Query", placeholder="arabic, coder, 1B...")
	        s_btn = gr.Button("Search")
	        s_out = gr.Textbox(label="Results", lines=10)
	        s_btn.click(fn=search_models, inputs=s_q, outputs=s_out)

	# mcp_server=True asks Gradio to also serve the app's functions as MCP tools.
	demo.launch(mcp_server=True)