fugthchat committed on
Commit
9a066c8
·
1 Parent(s): d67838f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -10
app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import glob
3
  import json
4
  import psutil
 
5
  from typing import Any, Dict, List, Optional
6
 
7
  from fastapi import FastAPI, Request, HTTPException
@@ -31,29 +32,106 @@ current_model: Optional[Llama] = None
31
  current_model_name: str = ""
32
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def get_model(model_name: str) -> Llama:
35
  global current_model, current_model_name
36
 
37
  if not model_name:
38
  raise HTTPException(status_code=400, detail="No model selected")
39
- if not os.path.exists(model_name):
40
- raise HTTPException(status_code=404, detail="Model file not found")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  if current_model_name == model_name and current_model is not None:
43
  return current_model
44
 
45
- print(f"Loading {model_name}...")
46
  if current_model is not None:
47
  del current_model
48
 
49
  # --- PERFORMANCE TUNING (HF Free CPU) ---
50
- current_model = Llama(
51
- model_path=model_name,
52
- n_ctx=4096,
53
- n_threads=2,
54
- n_batch=512,
55
- verbose=False,
56
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  current_model_name = model_name
58
  return current_model
59
 
 
2
  import glob
3
  import json
4
  import psutil
5
+ from pathlib import Path
6
  from typing import Any, Dict, List, Optional
7
 
8
  from fastapi import FastAPI, Request, HTTPException
 
32
  current_model_name: str = ""
33
 
34
 
35
+ def _model_abs_path(model_name: str) -> Path:
36
+ # Always resolve relative to the app directory to avoid cwd surprises.
37
+ base_dir = Path(__file__).resolve().parent
38
+ return (base_dir / model_name).resolve()
39
+
40
+
41
+ def _looks_like_pointer_file(path: Path) -> bool:
42
+ # If the GGUF file is a Git LFS pointer (or similar), llama.cpp will fail to load it.
43
+ try:
44
+ if not path.exists() or path.is_dir():
45
+ return False
46
+ head = path.read_bytes()[:256]
47
+ if b"git-lfs" in head and b"oid sha256" in head:
48
+ return True
49
+ # Some pointer files are plain text starting with "version".
50
+ if head.startswith(b"version ") and b"sha256" in head:
51
+ return True
52
+ return False
53
+ except Exception:
54
+ return False
55
+
56
+
57
def _try_load_model(
    model_path: Path, *, n_ctx: int, n_threads: int, n_batch: int
) -> Llama:
    """Build a Llama instance for *model_path* with the given limits.

    Kept as a thin, explicit wrapper so the caller can retry with more
    conservative parameters if construction fails.
    """
    llama_kwargs = {
        "model_path": str(model_path),
        "n_ctx": n_ctx,
        "n_threads": n_threads,
        "n_batch": n_batch,
        # mmap tends to be friendlier on low-memory CPU machines
        "use_mmap": True,
        "verbose": False,
    }
    return Llama(**llama_kwargs)
70
+
71
+
72
def get_model(model_name: str) -> Llama:
    """Return a loaded Llama for *model_name*, caching a single instance.

    Validates that the GGUF file exists and is not a Git LFS pointer stub,
    then (re)loads the model only when it differs from the cached one.
    Load parameters come from the N_THREADS / N_CTX / N_BATCH env vars,
    with a conservative retry if the first attempt fails (usually memory
    pressure on small Spaces).

    Raises:
        HTTPException: 400 for a missing name, 404 for a missing file,
            500 for a pointer stub or an unloadable model.
    """
    global current_model, current_model_name

    if not model_name:
        raise HTTPException(status_code=400, detail="No model selected")

    model_path = _model_abs_path(model_name)
    if not model_path.exists():
        raise HTTPException(
            status_code=404,
            detail=f"Model file not found: {model_path.name}",
        )
    if _looks_like_pointer_file(model_path):
        raise HTTPException(
            status_code=500,
            detail=(
                "Model file looks like a pointer (not the real .gguf). "
                "Re-upload the GGUF to the Space (so it is stored as the full binary), "
                "then restart the Space."
            ),
        )
    try:
        size_mb = model_path.stat().st_size / (1024 * 1024)
    except OSError:
        size_mb = -1  # logging only; never block loading on a stat failure

    if current_model_name == model_name and current_model is not None:
        return current_model

    print(f"Loading {model_path.name} ({size_mb:.1f} MB)...")
    # Drop the old model and clear the cache markers BEFORE loading.
    # BUG FIX: the previous `del current_model` unbound the module-level
    # global, so after a failed load any later read of `current_model`
    # raised NameError; `current_model_name` also stayed stale, making the
    # cache check above hit that unbound name. Assigning None releases the
    # old model just as well and keeps the name bound.
    current_model = None
    current_model_name = ""

    # --- PERFORMANCE TUNING (HF Free CPU) ---
    # 4096 ctx can be too memory heavy on small Spaces; start safer, then tune up later.
    threads = int(os.getenv("N_THREADS", "2"))
    n_ctx = int(os.getenv("N_CTX", "2048"))
    n_batch = int(os.getenv("N_BATCH", "256"))

    try:
        current_model = _try_load_model(
            model_path, n_ctx=n_ctx, n_threads=threads, n_batch=n_batch
        )
    except Exception as e:
        # Retry with very conservative settings in case this is memory pressure.
        print(f"Model load failed with N_CTX={n_ctx}, N_BATCH={n_batch}: {e}")
        try:
            current_model = _try_load_model(
                model_path, n_ctx=1024, n_threads=threads, n_batch=64
            )
        except Exception as e2:
            print(f"Model load retry failed: {e2}")
            # Chain the cause so the real llama.cpp error shows up in logs.
            raise HTTPException(
                status_code=500,
                detail=(
                    "Failed to load GGUF model. This is usually caused by: "
                    "(1) model file not fully present inside the container, "
                    "(2) not enough RAM for the chosen context size, or "
                    "(3) llama-cpp-python too old for this GGUF. "
                    f"Model: {model_path.name}"
                ),
            ) from e2

    current_model_name = model_name
    return current_model
137