import os

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Configurable model path (override via env MODEL_PATH)
MODEL_PATH = os.getenv("MODEL_PATH", "/app/models/Llama-3.2-3B-Instruct-Q4_K_M.gguf")

# Load the model once at startup rather than per request
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=1024,      # context window reduced from 2048 to save memory
    n_gpu_layers=0,  # CPU-only inference
    n_batch=128,     # smaller batch size, also to limit memory use
    use_mmap=True,   # memory-map the weights instead of copying them into RAM
    verbose=False,
)

class GenerateRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9

# Liveness probe: the process is up
@app.get("/health")
def health():
    return {"ok": True}

# Readiness probe: the model file is actually present on disk
@app.get("/ready")
def ready():
    return {"ready": os.path.exists(MODEL_PATH), "model_path": MODEL_PATH}

@app.post("/generate")
def generate(req: GenerateRequest):
    out = llm(
        req.prompt,
        max_tokens=req.max_tokens,
        temperature=req.temperature,
        top_p=req.top_p,
        stop=["</s>"],  # stop at the end-of-sequence marker
    )
    text = out["choices"][0]["text"]
    return {"text": text}
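
For reference, a minimal client sketch against these endpoints. The base URL and launch command are assumptions (a local `uvicorn app:app --port 8000`), not something the server code above pins down:

# client.py -- minimal sketch; assumes the API above is served locally on port 8000
import requests

BASE_URL = "http://localhost:8000"  # assumption: adjust to your deployment

# Probe liveness and readiness before sending work.
print(requests.get(f"{BASE_URL}/health").json())  # expect {"ok": true}
print(requests.get(f"{BASE_URL}/ready").json())   # expect {"ready": true, ...}

# Request a completion; the JSON fields mirror GenerateRequest.
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Write a haiku about llamas.", "max_tokens": 64},
    timeout=120,  # CPU-only generation can be slow
)
resp.raise_for_status()
print(resp.json()["text"])

Keeping /health and /ready separate lets an orchestrator distinguish "process is up" from "model is usable", which matters here because /ready checks that the GGUF file actually exists at MODEL_PATH.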