import gradio as gr
import json

# Tool functions are defined inline in this file instead of being imported,
# to keep the Space simple.

# Catalog of dispatchAI mobile models, consumed by recommend_model() and
# search_models(). Every entry carries the same keys:
#   Model     - model name; recommend_model() appends it to
#               https://huggingface.co/dispatchAI/ to build the model URL
#   Params    - parameter-count label (e.g. "135M", "1B")
#   Size_MB   - model size; recommend_model() picks the smallest value
#   RAM_MB    - compared against the caller's RAM budget (MB)
#   Task      - task label matched by recommend_model() and search_models()
#   Quant     - quantization label, also matched by search_models()
#   Speed_tps - echoed back as "speed_tps" (presumably tokens/sec - confirm)
# NOTE(review): "360M" appears here but has no entry in LATENCY_DB, so
# estimate_latency("360M") reports it as unknown.
MODELS = [
    {"Model": "SmolLM2-135M-Instruct-mobile", "Params": "135M", "Size_MB": 270, "RAM_MB": 400, "Task": "Chat", "Quant": "FP16", "Speed_tps": 25.5},
    {"Model": "SmolLM2-360M-Instruct-mobile", "Params": "360M", "Size_MB": 720, "RAM_MB": 700, "Task": "Chat", "Quant": "FP16", "Speed_tps": 21.0},
    {"Model": "Qwen2.5-0.5B-Instruct-mobile-int4", "Params": "500M", "Size_MB": 350, "RAM_MB": 550, "Task": "Chat", "Quant": "INT4", "Speed_tps": 20.0},
    {"Model": "Llama-3.2-1B-Instruct-Q4-mobile", "Params": "1B", "Size_MB": 700, "RAM_MB": 1100, "Task": "Chat", "Quant": "Q4", "Speed_tps": 18.2},
    {"Model": "Llama-3.2-3B-Instruct-Q5-mobile", "Params": "3B", "Size_MB": 2100, "RAM_MB": 2700, "Task": "Chat", "Quant": "Q5", "Speed_tps": 8.5},
    {"Model": "Gemma-2B-Arabic-mobile", "Params": "2B", "Size_MB": 5000, "RAM_MB": 5500, "Task": "Arabic", "Quant": "FP16", "Speed_tps": 8.0},
    {"Model": "Llama-3.2-1B-FunctionCall-mobile", "Params": "1B", "Size_MB": 2500, "RAM_MB": 3000, "Task": "Function Call", "Quant": "FP16", "Speed_tps": 12.0},
]

# Throughput lookup used by estimate_latency():
#   parameter-count label -> quantization label -> speed value.
# estimate_latency() reports the value as "tokens_per_sec" and labels the
# hardware as "Snapdragon 865". Not every quantization is listed for every
# size; estimate_latency() falls back to 10.0 when a quantization is missing.
LATENCY_DB = {
    "135M": {"FP16": 25.5, "Q4_K_M": 32.0, "Q8_0": 28.2},
    "500M": {"FP16": 20.0, "Q4_K_M": 26.8, "INT4": 20.0},
    "1B": {"FP16": 12.0, "Q4_K_M": 18.2, "Q5_K_M": 17.5},
    "2B": {"FP16": 8.0, "Q5_K_M": 12.0, "Q4_K_M": 12.8},
    "3B": {"FP16": 5.5, "Q5_K_M": 8.5, "Q4_K_M": 9.0},
}

def recommend_model(ram_mb: int, task: str = "Any") -> str:
    """Recommend the smallest dispatchAI mobile model that fits a RAM budget and task.

    Args:
        ram_mb: Available RAM in MB
        task: Task type (Chat, Code, Math, Arabic, Function Call, Vision, Embedding, Any), case-insensitive

    Returns:
        JSON with the recommended model, or an "error" key when nothing matches
    """
    # A missing or blank task is treated like the default "Any".
    requested = (task or "Any").strip() or "Any"
    wanted = requested.casefold()
    any_task = wanted == "any"

    candidates = [
        m for m in MODELS
        if m["RAM_MB"] <= ram_mb and (any_task or m["Task"].casefold() == wanted)
    ]
    if not candidates:
        # Name the task in the message when the task filter was active, so a
        # task with no catalog entries is not reported as a pure RAM problem.
        if any_task:
            message = f"No models fit in {ram_mb}MB"
        else:
            message = f"No models for task {requested!r} fit in {ram_mb}MB"
        return json.dumps({"error": message})

    # NOTE(review): selection is by smallest Size_MB, so a larger RAM budget
    # never yields a larger model - confirm this is the intended ranking.
    # min() returns the first minimal entry, matching a stable sort's [0].
    best = min(candidates, key=lambda m: m["Size_MB"])
    return json.dumps({
        "recommended": best["Model"],
        "url": f"https://huggingface.co/dispatchAI/{best['Model']}",
        "size_mb": best["Size_MB"],
        "ram_mb": best["RAM_MB"],
        "speed_tps": best["Speed_tps"],
    }, indent=2)

def estimate_latency(params: str, quant: str = "Q4_K_M") -> str:
    """Estimate inference speed on Snapdragon 865.

    Args:
        params: Model size (e.g., "135M", "1B", "3B")
        quant: Quantization (FP16, Q4_K_M, Q5_K_M, Q8_0, INT4)

    Returns:
        JSON with tokens/sec and ms/token, plus a "note" when a default speed was used
    """
    # Speed reported when the table has no entry for the requested quantization.
    default_tps = 10.0

    params = params.strip().upper()
    if params not in LATENCY_DB:
        return json.dumps({"error": f"Unknown: {params}. Valid: {list(LATENCY_DB.keys())}"})

    known = LATENCY_DB[params]
    quant_key = quant.strip().upper()
    tps = known.get(quant_key, default_tps)

    result = {
        "params": params,
        "quant": quant,
        "tokens_per_sec": tps,
        "ms_per_token": round(1000 / tps, 0),
        "hardware": "Snapdragon 865",
    }
    if quant_key not in known:
        # Make the fallback visible so callers do not mistake the default
        # for a table value.
        result["note"] = (
            f"No data for {quant_key} at {params}; "
            f"using default {default_tps} tokens/sec. Known: {list(known)}"
        )
    return json.dumps(result, indent=2)

def calculate_savings(daily_queries: int, cloud_cost_per_1k: float) -> str:
    """Compare yearly cloud inference spend against a fixed on-device cost.

    Args:
        daily_queries: Queries per day
        cloud_cost_per_1k: Cloud cost per 1000 queries

    Returns:
        JSON with cloud_annual, device_annual, savings and savings_pct
    """
    # Flat on-device figure; it does not scale with query volume.
    device_yearly = 0.5
    cloud_yearly = daily_queries * 365 * cloud_cost_per_1k / 1000

    # The percentage is only computed for a positive cloud bill; otherwise 0.
    if cloud_yearly > 0:
        pct_saved = round((1 - device_yearly / cloud_yearly) * 100, 1)
    else:
        pct_saved = 0

    report = {}
    report["cloud_annual"] = round(cloud_yearly, 2)
    report["device_annual"] = round(device_yearly, 2)
    report["savings"] = round(cloud_yearly - device_yearly, 2)
    report["savings_pct"] = pct_saved
    return json.dumps(report, indent=2)

def search_models(query: str) -> str:
    """Search dispatchAI models by keyword.

    Args:
        query: Search term (e.g., "arabic", "1B", "500M", "int4"); a blank query lists every model

    Returns:
        JSON with matching models
    """
    # Case-insensitive substring match; surrounding whitespace is ignored.
    needle = query.strip().lower()
    # Params is searched too, so a size such as "500M" matches even when the
    # model name spells it differently (e.g. "0.5B").
    searched_fields = ("Model", "Task", "Quant", "Params")
    matches = [
        m for m in MODELS
        if any(needle in m[field].lower() for field in searched_fields)
    ]
    return json.dumps({"query": query, "matches": len(matches), "models": matches}, indent=2)

# Build the Gradio UI: an intro Markdown table followed by one tab per tool,
# each wiring a button click to the matching function defined above.
with gr.Blocks(title="dispatchAI MCP Hub") as demo:
    gr.Markdown("""
    # 🧰 dispatchAI MCP Tool Hub
    
    **One Space, four tools.** Add to Claude Desktop / Cursor / any MCP client.
    
    | Tool | Description |
    |------|-------------|
    | `recommend_model` | Find best model for your phone |
    | `estimate_latency` | Predict inference speed |
    | `calculate_savings` | Cloud vs on-device cost |
    | `search_models` | Search dispatchAI catalog |
    """)
    
    # Recommend tab: RAM slider + task dropdown -> recommend_model.
    # The dropdown offers "Any" plus the task labels present in MODELS.
    with gr.Tab("Recommend"):
        r_ram = gr.Slider(512, 8192, value=2048, label="RAM (MB)")
        r_task = gr.Dropdown(["Any", "Chat", "Arabic", "Function Call"], value="Any", label="Task")
        r_btn = gr.Button("Recommend")
        r_out = gr.Textbox(label="Result", lines=10)
        r_btn.click(fn=recommend_model, inputs=[r_ram, r_task], outputs=r_out)
    
    # Latency tab: parameter sizes come straight from LATENCY_DB's keys;
    # the quantization list is fixed here -> estimate_latency.
    with gr.Tab("Latency"):
        l_p = gr.Dropdown(list(LATENCY_DB.keys()), value="1B", label="Params")
        l_q = gr.Dropdown(["FP16", "Q4_K_M", "Q5_K_M", "Q8_0", "INT4"], value="Q4_K_M", label="Quant")
        l_btn = gr.Button("Estimate")
        l_out = gr.Textbox(label="Result", lines=8)
        l_btn.click(fn=estimate_latency, inputs=[l_p, l_q], outputs=l_out)
    
    # Cost tab: daily query volume + cloud price per 1K -> calculate_savings.
    with gr.Tab("Cost"):
        c_dq = gr.Slider(100, 100000, value=10000, label="Daily Queries")
        c_cc = gr.Slider(0.1, 10, value=0.5, label="Cloud $/1K")
        c_btn = gr.Button("Calculate")
        c_out = gr.Textbox(label="Result", lines=8)
        c_btn.click(fn=calculate_savings, inputs=[c_dq, c_cc], outputs=c_out)
    
    # Search tab: free-text query -> search_models.
    with gr.Tab("Search"):
        s_q = gr.Textbox(label="Search Query", placeholder="arabic, coder, 1B...")
        s_btn = gr.Button("Search")
        s_out = gr.Textbox(label="Results", lines=10)
        s_btn.click(fn=search_models, inputs=s_q, outputs=s_out)

# Launch runs at module import time (there is no __main__ guard).
# mcp_server=True presumably exposes the tool functions to MCP clients, as the
# intro text above advertises - confirm against the Gradio docs.
demo.launch(mcp_server=True)