import os
import threading

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Configurable model path (override via env MODEL_PATH)
MODEL_PATH = os.getenv("MODEL_PATH", "/app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf")

# Load the model once at startup; reloading per request would be far too slow.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,        # smaller context window instead of 2048, to save RAM
    n_gpu_layers=0,    # CPU-only inference
    n_batch=128,       # reduced batch size, also to save RAM
    use_mmap=True,     # memory-map the weights rather than copying them
    verbose=False,
)

# A single Llama context is not safe for concurrent calls; serialize inference.
llm_lock = threading.Lock()

class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9

@app.get("/health")
def health():
    return {"ok": True}

@app.get("/ready")
def ready():
    return {"ready": os.path.exists(MODEL_PATH), "model_path": MODEL_PATH}

@app.post("/generate")
def generate(req: GenerateRequest):
    with llm_lock:
        out = llm(
            req.prompt,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
            top_p=req.top_p,
            stop=["</s>"],  # note: Llama 3 instruct models emit <|eot_id|> as their end marker
        )
    text = out["choices"][0]["text"]
    return {"text": text}
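For a quick smoke test, a client along these lines should work; this is a minimal sketch, assuming the server is started with uvicorn on its default port (e.g. `uvicorn main:app --port 8000`), and the prompt here is just an illustration:

import requests

# Hypothetical client for the server above; host, port, and prompt are assumptions.
resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Q: What is llama.cpp?\nA:", "max_tokens": 64},
    timeout=120,  # CPU generation can be slow, so allow a generous timeout
)
resp.raise_for_status()
print(resp.json()["text"])

Because `generate` is a plain `def`, FastAPI runs it in a threadpool, so concurrent requests are possible; the lock in the server is what keeps two of them from calling the single `Llama` context at the same time.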