Spaces:

build-small-hackathon
/

Case-Lantern

Running

App Files Files Community

lastmass committed 5 days ago

Commit

27935bf

1 Parent(s): 575cde7

Fix llama.cpp CPU deployment on Spaces

Browse files

Files changed (3) hide show

README.md +5 -0
app.py +36 -22
requirements.txt +0 -2

README.md CHANGED Viewed

@@ -50,6 +50,11 @@ The app is designed for **free CPU Spaces** on Hugging Face. It does not require
 a GPU. The GGUF model (~2.78 GB, Q4_K_M) is downloaded from the Hub at first
 launch and cached.
 - Set `DEMO_MODE=auto` (default) to allow a graceful scripted fallback if the
   model cannot load.
 - Set `DEMO_MODE=true` to skip model loading entirely (instant UI-only demo).

 a GPU. The GGUF model (~2.78 GB, Q4_K_M) is downloaded from the Hub at first
 launch and cached.
+If you deploy on **ZeroGPU**, keep the CPU `llama-cpp-python` wheel. Do not use
+the CUDA wheel URL (`llama-cpp-python/whl/cu124`) unless the Space image also
+provides CUDA runtime libraries such as `libcudart.so.12`; otherwise model
+loading can fail when the first button click triggers inference.
 - Set `DEMO_MODE=auto` (default) to allow a graceful scripted fallback if the
   model cannot load.
 - Set `DEMO_MODE=true` to skip model loading entirely (instant UI-only demo).

app.py CHANGED Viewed

@@ -119,6 +119,7 @@ ACTION_PRESETS = {
 # Game state
 # ---------------------------------------------------------------------------
 @dataclass
 class GameState:
     title: str = ""
@@ -148,6 +149,7 @@ class GameState:
 # Helpers
 # ---------------------------------------------------------------------------
 def normalize_text(value: str) -> str:
     return re.sub(r"\s+", " ", value or "").strip()
@@ -162,6 +164,7 @@ def strip_thinking(text: str) -> str:
 # Demo / fallback replies (no model needed)
 # ---------------------------------------------------------------------------
 def demo_reply(prompt: str, state: GameState, mode: str) -> str:
     unused = [c for c in state.clues if c not in state.used_clues]
     next_clue = unused[0] if unused else random.choice(state.clues)
@@ -211,17 +214,16 @@ def demo_reply(prompt: str, state: GameState, mode: str) -> str:
 # ---------------------------------------------------------------------------
-# Model loading — llama-cpp-python (GGUF) with ZeroGPU support
 # ---------------------------------------------------------------------------
-try:
-    import spaces
-    HAS_ZEROGPU = True
-except ImportError:
-    HAS_ZEROGPU = False
 _llm_instance = None
 def get_llm():
     """Load the GGUF model.  Raises RuntimeError when DEMO_MODE is forced."""
     global _llm_instance
@@ -237,15 +239,17 @@ def get_llm():
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
         n_ctx=2048,
-        n_threads=4,
-        n_gpu_layers=-1 if HAS_ZEROGPU else 0,
         verbose=True,
     )
     print("[Case Lantern] Model loaded successfully.")
     return _llm_instance
-def _call_model_inner(messages: List[Dict[str, str]], state: GameState, fallback_mode: str) -> str:
     if DEMO_MODE in {"1", "true", "yes", "on"}:
         return demo_reply(messages[-1]["content"], state, fallback_mode)
@@ -263,6 +267,7 @@ def _call_model_inner(messages: List[Dict[str, str]], state: GameState, fallback
         return strip_thinking(raw)
     except Exception as exc:
         import traceback
         traceback.print_exc()
         if DEMO_MODE == "off":
             raise
@@ -272,13 +277,7 @@ def _call_model_inner(messages: List[Dict[str, str]], state: GameState, fallback
         )
-# Wrap with @spaces.GPU when ZeroGPU is available
-if HAS_ZEROGPU:
-    @spaces.GPU
-    def call_model(messages, state, fallback_mode):
-        return _call_model_inner(messages, state, fallback_mode)
-else:
-    call_model = _call_model_inner
 # ---------------------------------------------------------------------------
@@ -325,7 +324,9 @@ def reveal_clue(state: GameState) -> Optional[str]:
     return clue
-def build_messages(state: GameState, instruction: str, mode: str) -> List[Dict[str, str]]:
     return [
         {"role": "system", "content": SYSTEM_PROMPT},
         {
@@ -371,10 +372,17 @@ def act(action, custom_action, chat, state):
         chat, state, context, status = new_case()
     if state.solved:
-        chat.append({"role": "assistant", "content": "案件已经结案。点击 **新案件** 开始下一个挑战。"})
         return chat, state, state.public_context(), status_line(state), ""
-    instruction = normalize_text(custom_action) or ACTION_PRESETS.get(action, ACTION_PRESETS["提示"])
     mode = "hint" if action == "提示" else "clue"
     state.turns += 1
     state.score = max(20, state.score - (6 if mode == "hint" else 4))
@@ -937,13 +945,19 @@ with gr.Blocks(
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
     launch_kwargs = {
-        "share": os.getenv("GRADIO_SHARE", "false").lower() in {"1", "true", "yes", "on"},
         "theme": gr.themes.Base(
             primary_hue="rose",
             secondary_hue="teal",
             neutral_hue="slate",
             radius_size="lg",
-            font=[gr.themes.GoogleFont("Inter"), "Noto Sans SC", "system-ui", "sans-serif"],
         ),
         "css": CUSTOM_CSS,
         "head": CUSTOM_HEAD,

 # Game state
 # ---------------------------------------------------------------------------
 @dataclass
 class GameState:
     title: str = ""
 # Helpers
 # ---------------------------------------------------------------------------
 def normalize_text(value: str) -> str:
     return re.sub(r"\s+", " ", value or "").strip()
 # Demo / fallback replies (no model needed)
 # ---------------------------------------------------------------------------
 def demo_reply(prompt: str, state: GameState, mode: str) -> str:
     unused = [c for c in state.clues if c not in state.used_clues]
     next_clue = unused[0] if unused else random.choice(state.clues)
 # ---------------------------------------------------------------------------
+# Model loading — llama-cpp-python (GGUF) on CPU
 # ---------------------------------------------------------------------------
+# Hugging Face ZeroGPU is designed primarily for PyTorch workloads. The CUDA
+# wheel of llama-cpp-python needs system CUDA runtime libraries such as
+# libcudart.so.12; the normal Space container lacks them, so loading can fail
+# before inference starts. Use the CPU wheel for reliable Spaces startup.
 _llm_instance = None
 def get_llm():
     """Load the GGUF model.  Raises RuntimeError when DEMO_MODE is forced."""
     global _llm_instance
         repo_id=GGUF_REPO,
         filename=GGUF_FILE,
         n_ctx=2048,
+        n_threads=int(os.getenv("LLAMA_THREADS", "4")),
+        n_gpu_layers=0,
         verbose=True,
     )
     print("[Case Lantern] Model loaded successfully.")
     return _llm_instance
+def _call_model_inner(
+    messages: List[Dict[str, str]], state: GameState, fallback_mode: str
+) -> str:
     if DEMO_MODE in {"1", "true", "yes", "on"}:
         return demo_reply(messages[-1]["content"], state, fallback_mode)
         return strip_thinking(raw)
     except Exception as exc:
         import traceback
         traceback.print_exc()
         if DEMO_MODE == "off":
             raise
         )
+call_model = _call_model_inner
 # ---------------------------------------------------------------------------
     return clue
+def build_messages(
+    state: GameState, instruction: str, mode: str
+) -> List[Dict[str, str]]:
     return [
         {"role": "system", "content": SYSTEM_PROMPT},
         {
         chat, state, context, status = new_case()
     if state.solved:
+        chat.append(
+            {
+                "role": "assistant",
+                "content": "案件已经结案。点击 **新案件** 开始下一个挑战。",
+            }
+        )
         return chat, state, state.public_context(), status_line(state), ""
+    instruction = normalize_text(custom_action) or ACTION_PRESETS.get(
+        action, ACTION_PRESETS["提示"]
+    )
     mode = "hint" if action == "提示" else "clue"
     state.turns += 1
     state.score = max(20, state.score - (6 if mode == "hint" else 4))
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
     launch_kwargs = {
+        "share": os.getenv("GRADIO_SHARE", "false").lower()
+        in {"1", "true", "yes", "on"},
         "theme": gr.themes.Base(
             primary_hue="rose",
             secondary_hue="teal",
             neutral_hue="slate",
             radius_size="lg",
+            font=[
+                gr.themes.GoogleFont("Inter"),
+                "Noto Sans SC",
+                "system-ui",
+                "sans-serif",
+            ],
         ),
         "css": CUSTOM_CSS,
         "head": CUSTOM_HEAD,

requirements.txt CHANGED Viewed

@@ -1,4 +1,2 @@
---extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
 gradio==6.15.2
 llama-cpp-python==0.3.22
-spaces



1	gradio==6.15.2
2	llama-cpp-python==0.3.22