Update app.py
app.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import glob
 import json
 import psutil
+from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 from fastapi import FastAPI, Request, HTTPException
@@ -31,29 +32,108 @@ current_model: Optional[Llama] = None
 current_model_name: str = ""
 
 
+def _model_abs_path(model_name: str) -> Path:
+    # Always resolve relative to the app directory to avoid cwd surprises.
+    base_dir = Path(__file__).resolve().parent
+    return (base_dir / model_name).resolve()
+
+
+def _looks_like_pointer_file(path: Path) -> bool:
+    # If the GGUF file is a Git LFS pointer (or similar), llama.cpp will fail to load it.
+    try:
+        if not path.exists() or path.is_dir():
+            return False
+        # Read only the first 256 bytes; read_bytes() would pull a multi-GB model into RAM.
+        with path.open("rb") as f:
+            head = f.read(256)
+        if b"git-lfs" in head and b"oid sha256" in head:
+            return True
+        # Some pointer files are plain text starting with "version".
+        if head.startswith(b"version ") and b"sha256" in head:
+            return True
+        return False
+    except Exception:
+        return False
+
+
+def _try_load_model(
+    model_path: Path, *, n_ctx: int, n_threads: int, n_batch: int
+) -> Llama:
+    # Keep this tiny and explicit so we can retry with different params.
+    return Llama(
+        model_path=str(model_path),
+        n_ctx=n_ctx,
+        n_threads=n_threads,
+        n_batch=n_batch,
+        # mmap tends to be friendlier on low-memory CPU machines.
+        use_mmap=True,
+        verbose=False,
+    )
+
+
 def get_model(model_name: str) -> Llama:
     global current_model, current_model_name
 
     if not model_name:
         raise HTTPException(status_code=400, detail="No model selected")
-
-
+
+    model_path = _model_abs_path(model_name)
+    if not model_path.exists():
+        raise HTTPException(
+            status_code=404,
+            detail=f"Model file not found: {model_path.name}",
+        )
+    if _looks_like_pointer_file(model_path):
+        raise HTTPException(
+            status_code=500,
+            detail=(
+                "Model file looks like a pointer (not the real .gguf). "
+                "Re-upload the GGUF to the Space (so it is stored as the full binary), "
+                "then restart the Space."
+            ),
+        )
+    try:
+        size_mb = model_path.stat().st_size / (1024 * 1024)
+    except Exception:
+        size_mb = -1
 
     if current_model_name == model_name and current_model is not None:
         return current_model
 
-    print(f"Loading {
+    print(f"Loading {model_path.name} ({size_mb:.1f} MB)...")
     if current_model is not None:
         del current_model
 
     # --- PERFORMANCE TUNING (HF Free CPU) ---
-
-
-
-
-
-
-
+    # 4096 ctx can be too memory heavy on small Spaces; start safer, then tune up later.
+    threads = int(os.getenv("N_THREADS", "2"))
+    n_ctx = int(os.getenv("N_CTX", "2048"))
+    n_batch = int(os.getenv("N_BATCH", "256"))
+
+    try:
+        current_model = _try_load_model(
+            model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
+        )
+    except Exception as e:
+        # Retry with very conservative settings in case this is memory pressure.
+        print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
+        try:
+            current_model = _try_load_model(
+                model_path, n_ctx=1024, n_threads=threads, n_batch=64
+            )
+        except Exception as e2:
+            print(f"Model load retry failed: {e2}")
+            raise HTTPException(
+                status_code=500,
+                detail=(
+                    "Failed to load GGUF model. This is usually caused by: "
+                    "(1) model file not fully present inside the container, "
+                    "(2) not enough RAM for the chosen context size, or "
+                    "(3) llama-cpp-python too old for this GGUF. "
+                    f"Model: {model_path.name}"
+                ),
+            )
+
     current_model_name = model_name
     return current_model
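If the Space keeps failing to start after a change like this, it can help to sanity-check the GGUF outside the server first. Below is a minimal pre-flight sketch, under these assumptions: llama-cpp-python is installed, and the model file sits next to the script (the name model.gguf is hypothetical). It mirrors the commit's pointer-file heuristic and its conservative fallback parameters, plus one extra cheap check: a real GGUF starts with the magic bytes b"GGUF".

from pathlib import Path

from llama_cpp import Llama

# Hypothetical filename; substitute the .gguf actually uploaded to the Space.
MODEL = Path(__file__).resolve().parent / "model.gguf"


def looks_like_pointer(path: Path) -> bool:
    # Same heuristic as app.py: an LFS pointer is a tiny text file, not a binary.
    with path.open("rb") as f:
        head = f.read(256)
    if b"git-lfs" in head and b"oid sha256" in head:
        return True
    return head.startswith(b"version ") and b"sha256" in head


if not MODEL.exists():
    raise SystemExit(f"Missing model file: {MODEL}")
if looks_like_pointer(MODEL):
    raise SystemExit("File is a Git LFS pointer, not the real GGUF.")
with MODEL.open("rb") as f:
    if f.read(4) != b"GGUF":
        raise SystemExit("File does not start with the GGUF magic bytes.")

print(f"Size: {MODEL.stat().st_size / (1024 * 1024):.1f} MB")

# Same conservative settings as the retry path in app.py.
llm = Llama(model_path=str(MODEL), n_ctx=1024, n_threads=2, n_batch=64,
            use_mmap=True, verbose=False)
print(llm("Say hi.", max_tokens=8)["choices"][0]["text"])

If this loads locally but the Space still fails, memory pressure at the configured context size is the likeliest remaining cause.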
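On why n_ctx is the main memory knob here: the KV cache grows linearly with the context size. A rough back-of-the-envelope, assuming a 7B-class Llama-style model with full multi-head attention (grouped-query attention would shrink these figures) and f16 KV entries; the layer count and embedding width below are illustrative assumptions, not read from any particular model:

# Rough KV-cache estimate: 2 tensors (K and V) per layer, each n_ctx x n_embd.
# Assumed 7B-class shape: 32 layers, 4096 embedding dim, f16 (2 bytes/entry).
n_layer, n_embd, bytes_per = 32, 4096, 2

for n_ctx in (1024, 2048, 4096):
    kv_bytes = 2 * n_layer * n_ctx * n_embd * bytes_per
    print(f"n_ctx={n_ctx}: ~{kv_bytes / 2**30:.2f} GiB KV cache")
# ~0.5 GiB at 1024, ~1 GiB at 2048, ~2 GiB at 4096 -- all on top of the weights.

That roughly 2 GiB swing between 1024 and 4096 is why the commit drops the default to 2048 and retries at 1024 before giving up, and since N_CTX, N_THREADS, and N_BATCH are plain environment variables, they can be tuned from the Space's settings without another code change.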