from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from llama_cpp import Llama
import threading


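# Gemma 3n E2B instruction-tuned checkpoint in GGUF form, quantized to Q4_K_M
# (a 4-bit k-quant) to keep the memory footprint CPU-friendly. Verify the
# exact repo and file names on the Hugging Face Hub before deploying.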
MODEL_REPO = "bartowski/google_gemma-3n-E2B-it-GGUF"
MODEL_FILE = "google_gemma-3n-E2B-it-Q4_K_M.gguf"


app = FastAPI(title="Gemma 3n CPU API")
llm = None
load_error = None
# Llama instances are not thread-safe; this lock serializes model loading and
# inference across FastAPI's threadpool workers.
lock = threading.Lock()


class GenerateRequest(BaseModel):
    prompt: str
    system: str | None = None
    max_tokens: int = Field(default=256, ge=1, le=1024)
    temperature: float = Field(default=0.7, ge=0.0, le=2.0)
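# Note: the `str | None` union syntax requires Python 3.10+.
# Example request body:
#   {"prompt": "Explain GGUF in one sentence.", "max_tokens": 128}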


def load_model():
    global llm, load_error
    try:
        # from_pretrained fetches the GGUF from the Hugging Face Hub on first
        # run (cached locally afterwards), then loads it into memory.
        llm = Llama.from_pretrained(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            n_ctx=2048,            # context window in tokens
            n_threads=2,           # CPU threads used for inference
            n_batch=64,            # prompt-processing batch size
            chat_format="gemma",   # chat_format is a constructor option in
                                   # llama-cpp-python, not a per-call one
            verbose=True,
        )
        load_error = None
        print("MODEL LOADED OK")
    except Exception as e:
        llm = None
        load_error = f"{type(e).__name__}: {e}"
        print(f"MODEL LOAD FAILED: {load_error}")


@app.on_event("startup")
def startup():
    load_model()
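# Note: on_event hooks are deprecated in recent FastAPI releases. A lifespan
# handler is the modern equivalent; a minimal sketch (same behavior, assuming
# FastAPI >= 0.93):
#
#   from contextlib import asynccontextmanager
#
#   @asynccontextmanager
#   async def lifespan(app: FastAPI):
#       load_model()
#       yield
#
#   app = FastAPI(title="Gemma 3n CPU API", lifespan=lifespan)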


@app.get("/health")
def health():
    return {
        "ok": llm is not None,
        "model_loaded": llm is not None,
        "error": load_error,
    }
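# Example response after a failed load:
#   {"ok": false, "model_loaded": false, "error": "ValueError: ..."}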


@app.post("/generate")
def generate(req: GenerateRequest):
    # Lazily retry the load if startup failed; double-check under the lock so
    # concurrent requests don't kick off overlapping loads.
    if llm is None:
        with lock:
            if llm is None:
                load_model()

    if llm is None:
        raise HTTPException(status_code=503, detail=f"Model not loaded: {load_error}")

    # Gemma's chat template has no dedicated system turn; how a system message
    # is handled depends on the chat handler, so keep system prompts simple.
    messages = []
    if req.system:
        messages.append({"role": "system", "content": req.system})
    messages.append({"role": "user", "content": req.prompt})

    # Serialize inference: a Llama instance must not be used concurrently.
    with lock:
        out = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=req.temperature,
        )

    return {"response": out["choices"][0]["message"]["content"]}
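

# Minimal sketch of running and calling the server (assumes this file is
# saved as app.py; package names are the standard PyPI ones):
#
#   pip install fastapi uvicorn llama-cpp-python huggingface-hub
#   uvicorn app:app --host 0.0.0.0 --port 8000
#
#   curl -s http://localhost:8000/health
#   curl -s -X POST http://localhost:8000/generate \
#       -H "Content-Type: application/json" \
#       -d '{"prompt": "Write a haiku about CPUs."}'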