# Gemma4-API / main.py
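"""Minimal FastAPI service that serves a quantized Gemma GGUF checkpoint on
CPU via llama-cpp-python, exposing /health and /generate endpoints."""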
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from llama_cpp import Llama
import threading

MODEL_REPO = "bartowski/google_gemma-4-E2B-it-GGUF"
MODEL_FILE = "gemma-4-E2B-it-Q4_K_M.gguf"

app = FastAPI(title="Gemma 4 CPU API")

# Shared state: the single Llama instance, the last load failure (surfaced
# by /health), and a lock that serializes access to the model, which is not
# safe to call from multiple threads at once.
llm = None
load_error = None
lock = threading.Lock()

class GenerateRequest(BaseModel):
    prompt: str
    system: str | None = None
    max_tokens: int = Field(default=256, ge=1, le=1024)
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
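
# A request body such as
#   {"prompt": "Hi", "system": "Be terse.", "max_tokens": 64, "temperature": 0.2}
# validates against GenerateRequest; values outside the Field bounds are
# rejected by FastAPI with a 422 response.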

def load_model():
    """Download (or reuse the cached) GGUF file from the Hugging Face Hub
    and initialize the llama.cpp runtime. Llama.from_pretrained requires
    the huggingface-hub package."""
    global llm, load_error
    try:
        llm = Llama.from_pretrained(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            n_ctx=2048,
            n_threads=2,
            n_batch=64,
            # chat_format belongs here on the constructor;
            # create_chat_completion does not accept it.
            chat_format="gemma",
            verbose=True,
        )
        load_error = None
        print("MODEL LOADED OK")
    except Exception as e:
        llm = None
        load_error = f"{type(e).__name__}: {e}"
        print(f"MODEL LOAD FAILED: {load_error}")

# Note: on_event is deprecated in newer FastAPI releases in favor of
# lifespan handlers, but it still works.
@app.on_event("startup")
def startup():
    load_model()

@app.get("/health")
def health():
    return {
        "ok": llm is not None,
        "model_loaded": llm is not None,
        "error": load_error,
    }
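
# Example /health response after a failed load (shape follows the dict above):
#   {"ok": false, "model_loaded": false, "error": "ValueError: ..."}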

@app.post("/generate")
def generate(req: GenerateRequest):
    global llm
    if llm is None:
        # Retry the load under the lock so concurrent requests don't race.
        with lock:
            if llm is None:
                load_model()
    if llm is None:
        raise HTTPException(status_code=503, detail=f"Model not loaded: {load_error}")

    messages = []
    if req.system:
        # Gemma has no native system role; llama-cpp-python's "gemma" chat
        # format may ignore system messages.
        messages.append({"role": "system", "content": req.system})
    messages.append({"role": "user", "content": req.prompt})

    # Serialize inference: the single Llama instance is not thread-safe.
    with lock:
        out = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )
    return {"response": out["choices"][0]["message"]["content"]}
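
# ---------------------------------------------------------------------------
# Usage sketch (assumes this file is saved as main.py and that uvicorn is
# installed alongside fastapi, pydantic, llama-cpp-python, and
# huggingface-hub):
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   curl http://localhost:8000/health
#
#   curl -X POST http://localhost:8000/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Explain GGUF quantization in one sentence."}'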