# Server / app.py
# Source: omaryasserhassan's Hugging Face Space, commit 926ab67 (verified).
import os

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel, Field
# Application and model are created at import time: the server is not ready
# to accept traffic until the model file has been mapped into memory.
app = FastAPI()

# Configurable model path (override via env MODEL_PATH).
MODEL_PATH = os.getenv("MODEL_PATH", "/app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf")

# Load once at startup and share the single Llama instance across requests.
# NOTE(review): parameters below look tuned for a small CPU-only container
# (no GPU layers, reduced context and batch) — confirm against deployment.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,        # reduced context window (library default is larger) to cut memory
    n_gpu_layers=0,    # CPU-only inference
    n_batch=128,       # smaller batch size to lower peak memory
    use_mmap=True,     # memory-map the GGUF file instead of reading it fully
    verbose=False,
)
class GenerateRequest(BaseModel):
    """Request body for POST /generate.

    Field bounds reject nonsensical sampling parameters coming from
    untrusted HTTP input before they reach the model (e.g. a negative
    max_tokens or top_p outside (0, 1]). Defaults are unchanged, so all
    previously valid callers are unaffected.
    """

    # Text prompt fed verbatim to the model.
    prompt: str
    # Number of tokens to generate; must request at least one.
    max_tokens: int = Field(default=256, ge=1)
    # Sampling temperature; negative values are invalid.
    temperature: float = Field(default=0.7, ge=0.0)
    # Nucleus-sampling probability mass, constrained to (0, 1].
    top_p: float = Field(default=0.9, gt=0.0, le=1.0)
@app.get("/health")
def health():
    """Liveness probe — reports ok whenever the process is running."""
    return dict(ok=True)
@app.get("/ready")
def ready():
    """Readiness probe — reports whether the model file is present on disk."""
    model_present = os.path.exists(MODEL_PATH)
    return {"ready": model_present, "model_path": MODEL_PATH}
@app.post("/generate")
def generate(req: GenerateRequest):
    """Run a completion for the request's prompt and return the generated text.

    Passes the client-supplied sampling parameters straight to the shared
    Llama instance and stops generation at the "</s>" marker.
    """
    completion = llm(
        req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        top_p=req.top_p,
        stop=["</s>"],
    )
    generated = completion["choices"][0]["text"]
    return {"text": generated}