novaco-evildudes999 committed
Commit 70e1801 · verified · 1 Parent(s): 3b27ff5

Update main.py

Files changed (1)
  1. main.py +22 -6
main.py CHANGED
@@ -3,11 +3,12 @@ from pydantic import BaseModel, Field
 from llama_cpp import Llama
 import threading
 
-MODEL_REPO = "unsloth/gemma-4-E2B-it-GGUF"
+MODEL_REPO = "bartowski/google_gemma-4-E2B-it-GGUF"
 MODEL_FILE = "gemma-4-E2B-it-Q4_K_M.gguf"
 
 app = FastAPI(title="Gemma 4 CPU API")
 llm = None
+load_error = None
 lock = threading.Lock()
 
 class GenerateRequest(BaseModel):
@@ -16,9 +17,8 @@ class GenerateRequest(BaseModel):
     max_tokens: int = Field(default=256, ge=1, le=1024)
     temperature: float = Field(default=0.7, ge=0.0, le=2.0)
 
-@app.on_event("startup")
 def load_model():
-    global llm
+    global llm, load_error
     try:
         llm = Llama.from_pretrained(
             repo_id=MODEL_REPO,
@@ -28,19 +28,34 @@ def load_model():
             n_batch=64,
             verbose=True,
         )
+        load_error = None
         print("MODEL LOADED OK")
     except Exception as e:
         llm = None
-        print(f"MODEL LOAD FAILED: {type(e).__name__}: {e}")
+        load_error = f"{type(e).__name__}: {e}"
+        print(f"MODEL LOAD FAILED: {load_error}")
+
+@app.on_event("startup")
+def startup():
+    load_model()
 
 @app.get("/health")
 def health():
-    return {"ok": llm is not None, "model_loaded": llm is not None}
+    return {
+        "ok": llm is not None,
+        "model_loaded": llm is not None,
+        "error": load_error,
+    }
 
 @app.post("/generate")
 def generate(req: GenerateRequest):
+    global llm
+
+    if llm is None:
+        load_model()
+
     if llm is None:
-        raise HTTPException(status_code=503, detail="Model not loaded yet. Check logs.")
+        raise HTTPException(status_code=503, detail=f"Model not loaded: {load_error}")
 
     messages = []
     if req.system:
@@ -52,6 +67,7 @@ def generate(req: GenerateRequest):
         messages=messages,
         max_tokens=req.max_tokens,
         temperature=req.temperature,
+        chat_format="gemma",
     )
 
     return {"response": out["choices"][0]["message"]["content"]}
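
A note on the chat_format="gemma" argument this commit adds: in llama-cpp-python, chat_format is normally an argument to the Llama constructor (and Llama.from_pretrained forwards extra keyword arguments to it), not to create_chat_completion(), so passing it per call may raise a TypeError. A minimal sketch of the constructor-side placement; filename= and n_ctx= are assumptions, since those lines are elided from the diff:

# Sketch only: selecting the Gemma chat template at construction time.
# repo_id, n_batch, and verbose come from this commit; filename= and
# n_ctx= are assumed, because the diff elides those arguments.
llm = Llama.from_pretrained(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,  # assumed; not visible in the diff
    chat_format="gemma",  # forwarded to Llama.__init__, which owns template choice
    n_ctx=2048,           # assumed value; not visible in the diff
    n_batch=64,
    verbose=True,
)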
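Also worth flagging: the lazy reload added to /generate calls load_model() without taking the module-level lock, so two concurrent first requests could both trigger a load. A minimal sketch of serializing it with the lock the module already defines; the double-checked pattern is an assumption, not something this diff does:

# Sketch only: guard the lazy reload with the existing module-level lock.
# llm, lock, and load_model are the names from this commit.
def ensure_model_loaded():
    if llm is None:
        with lock:
            if llm is None:  # re-check once the lock is held to avoid duplicate loads
                load_model()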
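For completeness, a client-side sketch of the two endpoints as they stand after this commit. The base URL is assumed, and so is the "prompt" field name: GenerateRequest clearly has system, max_tokens, and temperature, but its remaining fields are elided from the diff.

import requests

BASE = "http://127.0.0.1:8000"  # assumed host/port; not specified in the commit

# /health now reports the captured load_error alongside the load flag
print(requests.get(f"{BASE}/health").json())

# "prompt" is a hypothetical field name; only system/max_tokens/temperature
# are visible in GenerateRequest in this diff
resp = requests.post(
    f"{BASE}/generate",
    json={
        "prompt": "Summarize what GGUF quantization does.",
        "system": "Answer in one sentence.",
        "max_tokens": 128,
        "temperature": 0.7,
    },
)
resp.raise_for_status()
print(resp.json()["response"])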