"""Minimal FastAPI server exposing an OpenAI-style /v1/chat/completions endpoint
backed by a local GGUF build of Qwen2.5-3B-Instruct, served with llama-cpp-python."""

import os
from typing import List, Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from pydantic import BaseModel, Field

# Quantized GGUF model file; fail fast at startup if it is missing.
MODEL_PATH = "/app/models/Qwen2.5-3B-Instruct-Q4_K_M.gguf"
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,            # context window size (tokens)
    n_threads=4,           # CPU threads used for generation
    chat_format="chatml",  # Qwen2.5-Instruct uses the ChatML prompt template
    hf_pretrained_model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
    verbose=False,
)
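
# A quick manual smoke test of the loaded model (illustrative only, not part of
# the app; `create_chat_completion` returns an OpenAI-style response dict):
#
#     out = llm.create_chat_completion(
#         messages=[{"role": "user", "content": "Say hello in one sentence."}],
#         max_tokens=32,
#     )
#     print(out["choices"][0]["message"]["content"])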


app = FastAPI(title="Qwen2.5-3B API", version="0.1.0")


class Message(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatRequest(BaseModel):
    model: str = Field(..., description="Model identifier (ignored; this server hosts a single model)")
    messages: List[Message] = Field(..., description="Conversation messages, in order")
    max_tokens: Optional[int] = Field(None, description="Maximum number of tokens to generate")
    stream: Optional[bool] = Field(False, description="Streaming is not supported; must be false")


@app.post("/v1/chat/completions")
async def chat_completion(req: ChatRequest):
    """OpenAI-style chat completion endpoint (non-streaming only)."""
    if req.stream:
        # Streaming is declared unsupported in ChatRequest; reject it explicitly
        # rather than returning a generator that JSONResponse cannot serialize.
        raise HTTPException(status_code=400, detail="Streaming is not supported")
    try:
        result = llm.create_chat_completion(
            messages=[{"role": m.role, "content": m.content} for m in req.messages],
            max_tokens=req.max_tokens,
        )
        return JSONResponse(content=result)
    except Exception as e:
        # Inference failures are server-side errors, not client mistakes.
        raise HTTPException(status_code=500, detail=str(e))


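# Example request against a locally running instance (hypothetical host/port,
# assuming the default PORT of 7860; the "model" value can be any string since
# the field is ignored):
#
#     curl -s http://localhost:7860/v1/chat/completions \
#       -H "Content-Type: application/json" \
#       -d '{"model": "qwen2.5-3b", "messages": [{"role": "user", "content": "Hello"}]}'

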
@app.get("/healthz")
async def healthz():
    """Simple liveness check."""
    return {"status": "ok"}
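
# Liveness check example (hypothetical local run):
#
#     curl http://localhost:7860/healthz
#     {"status": "ok"}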


if __name__ == "__main__":
    import uvicorn

    # Bind to all interfaces; the port can be overridden via the PORT env var.
    port = int(os.getenv("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)
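
# To run without the __main__ block (assuming this module is saved as app.py):
#
#     uvicorn app:app --host 0.0.0.0 --port 7860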