LLM_API / main.py
import os
from typing import List, Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from pydantic import BaseModel, Field

# Load the model at startup; fail fast if the GGUF file is missing.
MODEL_PATH = "/app/models/Qwen2.5-3B-Instruct-Q4_K_M.gguf"
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,
    n_threads=4,
    chat_format="chatml",
    hf_pretrained_model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
    verbose=False,
)

app = FastAPI(title="Qwen2.5-3B API", version="0.1.0")


class Message(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatRequest(BaseModel):
    model: str = Field(..., description="Model identifier (ignored, single model)")
    messages: List[Message] = Field(..., description="List of messages")
    max_tokens: Optional[int] = Field(None, description="Maximum tokens to generate")
    stream: Optional[bool] = Field(False, description="Stream response (not supported)")


@app.post("/v1/chat/completions")
async def chat_completion(req: ChatRequest):
    # The requested model name is ignored; the single loaded model is used.
    # Streaming would hand JSONResponse a generator it cannot serialize,
    # so reject it up front (it is documented above as unsupported).
    if req.stream:
        raise HTTPException(status_code=400, detail="Streaming is not supported")
    try:
        result = llm.create_chat_completion(
            messages=[m.model_dump() for m in req.messages],  # use m.dict() on Pydantic v1
            max_tokens=req.max_tokens,
        )
        return JSONResponse(content=result)
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
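

# Illustrative request (hypothetical host/port matching the defaults below;
# the "model" field is an arbitrary string, since this server ignores it):
#
#   curl -s http://localhost:7860/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "qwen2.5-3b", "messages": [{"role": "user", "content": "Hello"}], "max_tokens": 64}'
#
# The response body is the dict returned by llama-cpp-python's
# create_chat_completion, which mirrors the OpenAI chat-completions schema.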


@app.get("/healthz")
async def healthz():
    return {"status": "ok"}


# For HF Spaces compatibility
if __name__ == "__main__":
    import uvicorn

    port = int(os.getenv("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)
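
# Minimal client sketch (illustrative only; assumes the API is reachable at
# localhost:7860 and that the third-party `requests` package is installed):
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:7860/v1/chat/completions",
#       json={
#           "model": "qwen2.5-3b",  # ignored by this server
#           "messages": [{"role": "user", "content": "Say hi"}],
#           "max_tokens": 32,
#       },
#       timeout=120,
#   )
#   resp.raise_for_status()
#   print(resp.json()["choices"][0]["message"]["content"])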