Update app.py
app.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import glob
 import json
 import psutil
+from pathlib import Path
 from typing import Any, Dict, List, Optional
 
 from fastapi import FastAPI, Request, HTTPException
@@ -31,29 +32,108 @@ current_model: Optional[Llama] = None
 current_model_name: str = ""
 
 
+def _model_abs_path(model_name: str) -> Path:
+    # Always resolve relative to the app directory to avoid cwd surprises.
+    base_dir = Path(__file__).resolve().parent
+    return (base_dir / model_name).resolve()
+
+
+def _looks_like_pointer_file(path: Path) -> bool:
+    # If the GGUF file is a Git LFS pointer (or similar), llama.cpp will fail to load it.
+    try:
+        if not path.exists() or path.is_dir():
+            return False
+        # Read only the first 256 bytes; read_bytes() would pull a multi-GB model into RAM.
+        with path.open("rb") as f:
+            head = f.read(256)
+        if b"git-lfs" in head and b"oid sha256" in head:
+            return True
+        # Some pointer files are plain text starting with "version".
+        if head.startswith(b"version ") and b"sha256" in head:
+            return True
+        return False
+    except Exception:
+        return False
+
+
+def _try_load_model(
+    model_path: Path, *, n_ctx: int, n_threads: int, n_batch: int
+) -> Llama:
+    # Keep this tiny and explicit so we can retry with different params.
+    return Llama(
+        model_path=str(model_path),
+        n_ctx=n_ctx,
+        n_threads=n_threads,
+        n_batch=n_batch,
+        # mmap tends to be friendlier on low-memory CPU machines.
+        use_mmap=True,
+        verbose=False,
+    )
+
+
 def get_model(model_name: str) -> Llama:
     global current_model, current_model_name
 
     if not model_name:
         raise HTTPException(status_code=400, detail="No model selected")
-
-
+
+    model_path = _model_abs_path(model_name)
+    if not model_path.exists():
+        raise HTTPException(
+            status_code=404,
+            detail=f"Model file not found: {model_path.name}",
+        )
+    if _looks_like_pointer_file(model_path):
+        raise HTTPException(
+            status_code=500,
+            detail=(
+                "Model file looks like a pointer (not the real .gguf). "
+                "Re-upload the GGUF to the Space (so it is stored as the full binary), "
+                "then restart the Space."
+            ),
+        )
+    try:
+        size_mb = model_path.stat().st_size / (1024 * 1024)
+    except Exception:
+        size_mb = -1
 
     if current_model_name == model_name and current_model is not None:
         return current_model
 
-    print(f"Loading {
+    print(f"Loading {model_path.name} ({size_mb:.1f} MB)...")
     if current_model is not None:
         del current_model
 
     # --- PERFORMANCE TUNING (HF Free CPU) ---
-
-
-
-
-
-
-
+    # 4096 ctx can be too memory heavy on small Spaces; start safer, then tune up later.
+    threads = int(os.getenv("N_THREADS", "2"))
+    n_ctx = int(os.getenv("N_CTX", "2048"))
+    n_batch = int(os.getenv("N_BATCH", "256"))
+
+    try:
+        current_model = _try_load_model(
+            model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
+        )
+    except Exception as e:
+        # Retry with very conservative settings in case this is memory pressure.
+        print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
+        try:
+            current_model = _try_load_model(
+                model_path, n_ctx=1024, n_threads=threads, n_batch=64
+            )
+        except Exception as e2:
+            print(f"Model load retry failed: {e2}")
+            raise HTTPException(
+                status_code=500,
+                detail=(
+                    "Failed to load GGUF model. This is usually caused by: "
+                    "(1) model file not fully present inside the container, "
+                    "(2) not enough RAM for the chosen context size, or "
+                    "(3) llama-cpp-python too old for this GGUF. "
+                    f"Model: {model_path.name}"
+                ),
+            )
+
     current_model_name = model_name
     return current_model
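If the Space keeps failing to start after a change like this, it can help to sanity-check the GGUF outside the server first. Below is a minimal pre-flight sketch, under these assumptions: llama-cpp-python is installed, and the model file sits next to the script (the name model.gguf is hypothetical). It mirrors the commit's pointer-file heuristic and its conservative fallback parameters, plus one extra cheap check: a real GGUF starts with the magic bytes b"GGUF".

from pathlib import Path

from llama_cpp import Llama

# Hypothetical filename; substitute the .gguf actually uploaded to the Space.
MODEL = Path(__file__).resolve().parent / "model.gguf"


def looks_like_pointer(path: Path) -> bool:
    # Same heuristic as app.py: an LFS pointer is a tiny text file, not a binary.
    with path.open("rb") as f:
        head = f.read(256)
    if b"git-lfs" in head and b"oid sha256" in head:
        return True
    return head.startswith(b"version ") and b"sha256" in head


if not MODEL.exists():
    raise SystemExit(f"Missing model file: {MODEL}")
if looks_like_pointer(MODEL):
    raise SystemExit("File is a Git LFS pointer, not the real GGUF.")
with MODEL.open("rb") as f:
    if f.read(4) != b"GGUF":
        raise SystemExit("File does not start with the GGUF magic bytes.")

print(f"Size: {MODEL.stat().st_size / (1024 * 1024):.1f} MB")

# Same conservative settings as the retry path in app.py.
llm = Llama(model_path=str(MODEL), n_ctx=1024, n_threads=2, n_batch=64,
            use_mmap=True, verbose=False)
print(llm("Say hi.", max_tokens=8)["choices"][0]["text"])

If this loads locally but the Space still fails, memory pressure at the configured context size is the likeliest remaining cause.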
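On why n_ctx is the main memory knob here: the KV cache grows linearly with the context size. A rough back-of-the-envelope, assuming a 7B-class Llama-style model with full multi-head attention (grouped-query attention would shrink these figures) and f16 KV entries; the layer count and embedding width below are illustrative assumptions, not read from any particular model:

# Rough KV-cache estimate: 2 tensors (K and V) per layer, each n_ctx x n_embd.
# Assumed 7B-class shape: 32 layers, 4096 embedding dim, f16 (2 bytes/entry).
n_layer, n_embd, bytes_per = 32, 4096, 2

for n_ctx in (1024, 2048, 4096):
    kv_bytes = 2 * n_layer * n_ctx * n_embd * bytes_per
    print(f"n_ctx={n_ctx}: ~{kv_bytes / 2**30:.2f} GiB KV cache")
# ~0.5 GiB at 1024, ~1 GiB at 2048, ~2 GiB at 4096 -- all on top of the weights.

That roughly 2 GiB swing between 1024 and 4096 is why the commit drops the default to 2048 and retries at 1024 before giving up, and since N_CTX, N_THREADS, and N_BATCH are plain environment variables, they can be tuned from the Space's settings without another code change.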