from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel, Field
from typing import List, Optional
import os

from llama_cpp import Llama

# Load the quantized model once at startup.
MODEL_PATH = "/app/models/Qwen2.5-3B-Instruct-Q4_K_M.gguf"
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_threads=4,
    chat_format="chatml",  # Qwen2.5 uses the ChatML prompt template
    verbose=False,
)

app = FastAPI(title="Qwen2.5-3B API", version="0.1.0")


class Message(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatRequest(BaseModel):
    model: str = Field(..., description="Model identifier (ignored, single model)")
    messages: List[Message] = Field(..., description="List of messages")
    max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
    stream: Optional[bool] = Field(False, description="Stream response (not supported)")


@app.post("/v1/chat/completions")
def chat_completion(req: ChatRequest):
    # Plain `def` so FastAPI runs the blocking llama.cpp call in its threadpool
    # instead of stalling the event loop.
    if req.stream:
        # create_chat_completion would return a generator here, which cannot be
        # serialized by JSONResponse, so reject streaming requests explicitly.
        raise HTTPException(status_code=400, detail="Streaming is not supported")
    # The requested model name is ignored; the single loaded model is used.
    try:
        result = llm.create_chat_completion(
            messages=[m.dict() for m in req.messages],
            max_tokens=req.max_tokens,
        )
        return JSONResponse(content=result)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))


@app.get("/healthz")
async def healthz():
    return {"status": "ok"}


# For HF Spaces compatibility: Spaces provides PORT (default 7860).
if __name__ == "__main__":
    import uvicorn

    port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)
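
# Example client call, kept as a comment sketch: it assumes the server is running
# locally on the default port 7860 and that the `requests` package is installed.
# The "model" field can be any string since the server ignores it; the response
# follows the OpenAI-style layout returned by create_chat_completion.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "model": "qwen2.5-3b",  # ignored by this server
#           "messages": [{"role": "user", "content": "Hello!"}],
#           "max_tokens": 64,
#       },
#   )
#   print(resp.json()["choices"][0]["message"]["content"])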