# MiniMind / docker / serve.py
# Author: fariasultana — feat: Add Docker API server (commit 6282b51, verified)
#!/usr/bin/env python3
"""MiniMind Max2 API Server - Docker Edition"""
import ast
import json
import os
from typing import Any, Dict, List, Optional

from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
# FastAPI application object; interactive API docs are served at /docs
# (Swagger UI) and /redoc (ReDoc).
app = FastAPI(
    title="MiniMind Max2 API",
    description="Efficient edge-deployed LLM with MoE architecture (8 experts, 25% activation)",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)
# Fully permissive CORS: any origin, method, and header is accepted.
# NOTE(review): "*" is wide open — confirm this is intended beyond demos.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Model size variant served by this container, chosen via the MODEL_VARIANT
# environment variable (defaults to the smallest variant, "max2-nano").
MODEL_VARIANT = os.getenv("MODEL_VARIANT", "max2-nano")
class GenerateRequest(BaseModel):
    """Request body for POST /generate."""

    prompt: str  # user prompt to complete
    max_tokens: int = 100  # cap on generated tokens
    temperature: float = 0.7  # sampling temperature
    top_p: float = 0.95  # nucleus-sampling threshold
    thinking_mode: str = "interleaved"  # one of the modes listed by GET /capabilities
    show_thinking: bool = True  # include the <Thinking> trace in the response
class GenerateResponse(BaseModel):
    """Response body for POST /generate."""

    text: str  # generated completion text
    thinking: Optional[str] = None  # <Thinking> trace, or None when hidden/disabled
    tokens_generated: int  # whitespace-token count of `text`
    model: str  # serving variant (MODEL_VARIANT)
    active_params: str  # active parameter count for the variant, e.g. "125M"
class ToolCallRequest(BaseModel):
    """Request body for POST /tools/call."""

    tool: str  # tool name, e.g. "calculate" or "search"
    arguments: Dict[str, Any]  # tool-specific JSON arguments
@app.get("/")
async def root():
    """Top-level service descriptor: name, version, variant, docs URL."""
    descriptor = {
        "name": "MiniMind Max2",
        "version": "1.0.0",
    }
    descriptor["variant"] = MODEL_VARIANT
    descriptor["docs"] = "/docs"
    return descriptor
@app.get("/health")
async def health():
    """Liveness probe endpoint for container orchestration."""
    probe = {"status": "healthy"}
    probe["model"] = MODEL_VARIANT
    return probe
@app.get("/info")
async def info():
    """Describe the architecture, parameter budget, and features served here."""
    # Parameter budget per variant (total, with active subset in parens).
    variant_params = {
        "max2-nano": "500M (125M active)",
        "max2-lite": "1.5B (375M active)",
        "max2-pro": "3B (750M active)",
    }
    architecture = {
        "type": "Mixture of Experts + Grouped Query Attention",
        "experts": 8,
        "active_experts": 2,
        "activation_ratio": "25%",
        "gqa_ratio": "4:1",
    }
    return {
        "name": "MiniMind Max2",
        "variant": MODEL_VARIANT,
        "architecture": architecture,
        "parameters": variant_params.get(MODEL_VARIANT, "Unknown"),
        "capabilities": ["reasoning", "vision", "coding", "function-calling", "multilingual"],
        "export_formats": ["safetensors", "gguf", "onnx", "tflite", "qnn"],
    }
@app.post("/generate", response_model=GenerateResponse)
async def generate(request: GenerateRequest):
    """Return a canned demonstration completion for the given prompt.

    When show_thinking is set and thinking_mode is not "hidden", a mock
    <Thinking> trace is attached alongside the response text.
    """
    trace = None
    if request.show_thinking and request.thinking_mode != "hidden":
        trace = f"""<Thinking>
<step> Step 1 (analyze): Processing prompt: "{request.prompt[:30]}..."
Confidence: 95%
<step> Step 2 (route): MoE routing - selecting top-2 of 8 experts
Confidence: 92%
<step> Step 3 (generate): Generating with temp={request.temperature}, top_p={request.top_p}
Confidence: 90%
<reflect> Verifying response quality...
Confidence: 88%
<conclude> Response ready
</Thinking>"""
    answer = f"""MiniMind Max2 [{MODEL_VARIANT}] Response:
Your query: {request.prompt}
I processed this using:
- MoE Architecture (8 experts, top-2 routing = 25% active)
- GQA (16 Q-heads, 4 KV-heads = 4x memory savings)
- Thinking mode: {request.thinking_mode}
This efficient architecture enables deployment on edge devices while maintaining quality."""
    # Active-parameter count per variant; anything unrecognized reports as
    # the largest variant, matching the original fall-through.
    active = {"max2-nano": "125M", "max2-lite": "375M"}.get(MODEL_VARIANT, "750M")
    return GenerateResponse(
        text=answer,
        thinking=trace,
        tokens_generated=len(answer.split()),
        model=MODEL_VARIANT,
        active_params=active,
    )
def _safe_arith(expression: str) -> float:
    """Evaluate a pure-arithmetic expression without eval().

    Only numeric literals, the binary operators + - * / // % **, and unary
    +/- are permitted; names, calls, attribute access, and subscripts are
    rejected, closing the sandbox-escape hole eval() leaves open even with
    empty __builtins__.

    Raises ValueError for any expression outside that grammar (including
    syntax errors and division by zero).
    """
    binops = {
        ast.Add: lambda a, b: a + b,
        ast.Sub: lambda a, b: a - b,
        ast.Mult: lambda a, b: a * b,
        ast.Div: lambda a, b: a / b,
        ast.FloorDiv: lambda a, b: a // b,
        ast.Mod: lambda a, b: a % b,
        ast.Pow: lambda a, b: a ** b,
    }

    def walk(node):
        if isinstance(node, ast.Expression):
            return walk(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
            value = walk(node.operand)
            return -value if isinstance(node.op, ast.USub) else +value
        if isinstance(node, ast.BinOp):
            fn = binops.get(type(node.op))
            if fn is not None:
                return fn(walk(node.left), walk(node.right))
        raise ValueError("unsupported expression element")

    try:
        return walk(ast.parse(expression, mode="eval"))
    except (SyntaxError, ValueError, ZeroDivisionError) as exc:
        raise ValueError(f"invalid expression: {expression!r}") from exc


@app.post("/tools/call")
async def call_tool(request: ToolCallRequest):
    """Dispatch a named tool call with its JSON arguments.

    Raises HTTP 400 for an unknown tool name or an invalid calculator
    expression (previously an unhandled exception -> HTTP 500).
    """
    def _calculate(args):
        # SECURITY: the original eval()'d request input; even with
        # {"__builtins__": {}} that is escapable via attribute walks.
        # Restrict evaluation to pure arithmetic via the AST walker.
        try:
            return {"result": _safe_arith(args.get("expression", "0"))}
        except ValueError as exc:
            raise HTTPException(status_code=400, detail=str(exc))

    def _search(args):
        return {"results": [f"Result for: {args.get('query', '')}", "..."]}

    tool_handlers = {
        "calculate": _calculate,
        "search": _search,
    }
    if request.tool not in tool_handlers:
        raise HTTPException(status_code=400, detail=f"Unknown tool: {request.tool}")
    return tool_handlers[request.tool](request.arguments)
@app.get("/capabilities")
async def capabilities():
    """List supported feature groups, keyed by capability area."""
    feature_map = {}
    feature_map["thinking_modes"] = ["interleaved", "sequential", "hidden"]
    feature_map["reasoning"] = ["chain-of-thought", "self-reflection", "step-verification"]
    feature_map["vision"] = ["siglip-adapter", "image-captioning", "vqa"]
    feature_map["coding"] = ["completion", "fim", "refactor", "explain"]
    feature_map["agentic"] = ["function-calling", "tool-use", "multi-step"]
    feature_map["templates"] = ["jinja", "mdx-components"]
    feature_map["optimization"] = ["speculative-decoding", "npu-export"]
    return feature_map
if __name__ == "__main__":
    # Run the app under uvicorn, binding all interfaces on $PORT (default 8000).
    import uvicorn

    listen_port = int(os.getenv("PORT", 8000))
    print(f"Starting MiniMind Max2 API on port {listen_port}...")
    uvicorn.run(app, host="0.0.0.0", port=listen_port)