lastmass committed on
Commit
27935bf
·
1 Parent(s): 575cde7

Fix llama.cpp CPU deployment on Spaces

Browse files
Files changed (3) hide show
  1. README.md +5 -0
  2. app.py +36 -22
  3. requirements.txt +0 -2
README.md CHANGED
@@ -50,6 +50,11 @@ The app is designed for **free CPU Spaces** on Hugging Face. It does not require
50
  a GPU. The GGUF model (~2.78 GB, Q4_K_M) is downloaded from the Hub at first
51
  launch and cached.
52
 
 
 
 
 
 
53
  - Set `DEMO_MODE=auto` (default) to allow a graceful scripted fallback if the
54
  model cannot load.
55
  - Set `DEMO_MODE=true` to skip model loading entirely (instant UI-only demo).
 
50
  a GPU. The GGUF model (~2.78 GB, Q4_K_M) is downloaded from the Hub at first
51
  launch and cached.
52
 
53
+ If you deploy on **ZeroGPU**, keep the CPU `llama-cpp-python` wheel. Do not use
54
+ the CUDA wheel URL (`llama-cpp-python/whl/cu124`) unless the Space image also
55
+ provides CUDA runtime libraries such as `libcudart.so.12`; otherwise model
56
+ loading can fail when the first button click triggers inference.
57
+
58
  - Set `DEMO_MODE=auto` (default) to allow a graceful scripted fallback if the
59
  model cannot load.
60
  - Set `DEMO_MODE=true` to skip model loading entirely (instant UI-only demo).
app.py CHANGED
@@ -119,6 +119,7 @@ ACTION_PRESETS = {
119
  # Game state
120
  # ---------------------------------------------------------------------------
121
 
 
122
  @dataclass
123
  class GameState:
124
  title: str = ""
@@ -148,6 +149,7 @@ class GameState:
148
  # Helpers
149
  # ---------------------------------------------------------------------------
150
 
 
151
  def normalize_text(value: str) -> str:
152
  return re.sub(r"\s+", " ", value or "").strip()
153
 
@@ -162,6 +164,7 @@ def strip_thinking(text: str) -> str:
162
  # Demo / fallback replies (no model needed)
163
  # ---------------------------------------------------------------------------
164
 
 
165
  def demo_reply(prompt: str, state: GameState, mode: str) -> str:
166
  unused = [c for c in state.clues if c not in state.used_clues]
167
  next_clue = unused[0] if unused else random.choice(state.clues)
@@ -211,17 +214,16 @@ def demo_reply(prompt: str, state: GameState, mode: str) -> str:
211
 
212
 
213
  # ---------------------------------------------------------------------------
214
- # Model loading — llama-cpp-python (GGUF) with ZeroGPU support
215
  # ---------------------------------------------------------------------------
216
-
217
- try:
218
- import spaces
219
- HAS_ZEROGPU = True
220
- except ImportError:
221
- HAS_ZEROGPU = False
222
 
223
  _llm_instance = None
224
 
 
225
  def get_llm():
226
  """Load the GGUF model. Raises RuntimeError when DEMO_MODE is forced."""
227
  global _llm_instance
@@ -237,15 +239,17 @@ def get_llm():
237
  repo_id=GGUF_REPO,
238
  filename=GGUF_FILE,
239
  n_ctx=2048,
240
- n_threads=4,
241
- n_gpu_layers=-1 if HAS_ZEROGPU else 0,
242
  verbose=True,
243
  )
244
  print("[Case Lantern] Model loaded successfully.")
245
  return _llm_instance
246
 
247
 
248
- def _call_model_inner(messages: List[Dict[str, str]], state: GameState, fallback_mode: str) -> str:
 
 
249
  if DEMO_MODE in {"1", "true", "yes", "on"}:
250
  return demo_reply(messages[-1]["content"], state, fallback_mode)
251
 
@@ -263,6 +267,7 @@ def _call_model_inner(messages: List[Dict[str, str]], state: GameState, fallback
263
  return strip_thinking(raw)
264
  except Exception as exc:
265
  import traceback
 
266
  traceback.print_exc()
267
  if DEMO_MODE == "off":
268
  raise
@@ -272,13 +277,7 @@ def _call_model_inner(messages: List[Dict[str, str]], state: GameState, fallback
272
  )
273
 
274
 
275
- # Wrap with @spaces.GPU when ZeroGPU is available
276
- if HAS_ZEROGPU:
277
- @spaces.GPU
278
- def call_model(messages, state, fallback_mode):
279
- return _call_model_inner(messages, state, fallback_mode)
280
- else:
281
- call_model = _call_model_inner
282
 
283
 
284
  # ---------------------------------------------------------------------------
@@ -325,7 +324,9 @@ def reveal_clue(state: GameState) -> Optional[str]:
325
  return clue
326
 
327
 
328
- def build_messages(state: GameState, instruction: str, mode: str) -> List[Dict[str, str]]:
 
 
329
  return [
330
  {"role": "system", "content": SYSTEM_PROMPT},
331
  {
@@ -371,10 +372,17 @@ def act(action, custom_action, chat, state):
371
  chat, state, context, status = new_case()
372
 
373
  if state.solved:
374
- chat.append({"role": "assistant", "content": "案件已经结案。点击 **新案件** 开始下一个挑战。"})
 
 
 
 
 
375
  return chat, state, state.public_context(), status_line(state), ""
376
 
377
- instruction = normalize_text(custom_action) or ACTION_PRESETS.get(action, ACTION_PRESETS["提示"])
 
 
378
  mode = "hint" if action == "提示" else "clue"
379
  state.turns += 1
380
  state.score = max(20, state.score - (6 if mode == "hint" else 4))
@@ -937,13 +945,19 @@ with gr.Blocks(
937
  # ---------------------------------------------------------------------------
938
  if __name__ == "__main__":
939
  launch_kwargs = {
940
- "share": os.getenv("GRADIO_SHARE", "false").lower() in {"1", "true", "yes", "on"},
 
941
  "theme": gr.themes.Base(
942
  primary_hue="rose",
943
  secondary_hue="teal",
944
  neutral_hue="slate",
945
  radius_size="lg",
946
- font=[gr.themes.GoogleFont("Inter"), "Noto Sans SC", "system-ui", "sans-serif"],
 
 
 
 
 
947
  ),
948
  "css": CUSTOM_CSS,
949
  "head": CUSTOM_HEAD,
 
119
  # Game state
120
  # ---------------------------------------------------------------------------
121
 
122
+
123
  @dataclass
124
  class GameState:
125
  title: str = ""
 
149
  # Helpers
150
  # ---------------------------------------------------------------------------
151
 
152
+
153
  def normalize_text(value: str) -> str:
154
  return re.sub(r"\s+", " ", value or "").strip()
155
 
 
164
  # Demo / fallback replies (no model needed)
165
  # ---------------------------------------------------------------------------
166
 
167
+
168
  def demo_reply(prompt: str, state: GameState, mode: str) -> str:
169
  unused = [c for c in state.clues if c not in state.used_clues]
170
  next_clue = unused[0] if unused else random.choice(state.clues)
 
214
 
215
 
216
  # ---------------------------------------------------------------------------
217
+ # Model loading — llama-cpp-python (GGUF) on CPU
218
  # ---------------------------------------------------------------------------
219
+ # Hugging Face ZeroGPU is designed primarily for PyTorch workloads. The CUDA
220
+ # wheel of llama-cpp-python requires system CUDA runtime libraries such as
221
+ # libcudart.so.12, which are not available in the normal Space container, so it can
222
+ # fail before inference starts. Use the CPU wheel for reliable Spaces startup.
 
 
223
 
224
  _llm_instance = None
225
 
226
+
227
  def get_llm():
228
  """Load the GGUF model. Raises RuntimeError when DEMO_MODE is forced."""
229
  global _llm_instance
 
239
  repo_id=GGUF_REPO,
240
  filename=GGUF_FILE,
241
  n_ctx=2048,
242
+ n_threads=int(os.getenv("LLAMA_THREADS", "4")),
243
+ n_gpu_layers=0,
244
  verbose=True,
245
  )
246
  print("[Case Lantern] Model loaded successfully.")
247
  return _llm_instance
248
 
249
 
250
+ def _call_model_inner(
251
+ messages: List[Dict[str, str]], state: GameState, fallback_mode: str
252
+ ) -> str:
253
  if DEMO_MODE in {"1", "true", "yes", "on"}:
254
  return demo_reply(messages[-1]["content"], state, fallback_mode)
255
 
 
267
  return strip_thinking(raw)
268
  except Exception as exc:
269
  import traceback
270
+
271
  traceback.print_exc()
272
  if DEMO_MODE == "off":
273
  raise
 
277
  )
278
 
279
 
280
+ call_model = _call_model_inner
 
 
 
 
 
 
281
 
282
 
283
  # ---------------------------------------------------------------------------
 
324
  return clue
325
 
326
 
327
+ def build_messages(
328
+ state: GameState, instruction: str, mode: str
329
+ ) -> List[Dict[str, str]]:
330
  return [
331
  {"role": "system", "content": SYSTEM_PROMPT},
332
  {
 
372
  chat, state, context, status = new_case()
373
 
374
  if state.solved:
375
+ chat.append(
376
+ {
377
+ "role": "assistant",
378
+ "content": "案件已经结案。点击 **新案件** 开始下一个挑战。",
379
+ }
380
+ )
381
  return chat, state, state.public_context(), status_line(state), ""
382
 
383
+ instruction = normalize_text(custom_action) or ACTION_PRESETS.get(
384
+ action, ACTION_PRESETS["提示"]
385
+ )
386
  mode = "hint" if action == "提示" else "clue"
387
  state.turns += 1
388
  state.score = max(20, state.score - (6 if mode == "hint" else 4))
 
945
  # ---------------------------------------------------------------------------
946
  if __name__ == "__main__":
947
  launch_kwargs = {
948
+ "share": os.getenv("GRADIO_SHARE", "false").lower()
949
+ in {"1", "true", "yes", "on"},
950
  "theme": gr.themes.Base(
951
  primary_hue="rose",
952
  secondary_hue="teal",
953
  neutral_hue="slate",
954
  radius_size="lg",
955
+ font=[
956
+ gr.themes.GoogleFont("Inter"),
957
+ "Noto Sans SC",
958
+ "system-ui",
959
+ "sans-serif",
960
+ ],
961
  ),
962
  "css": CUSTOM_CSS,
963
  "head": CUSTOM_HEAD,
requirements.txt CHANGED
@@ -1,4 +1,2 @@
1
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
2
  gradio==6.15.2
3
  llama-cpp-python==0.3.22
4
- spaces
 
 
1
  gradio==6.15.2
2
  llama-cpp-python==0.3.22