AlexWortega committed on
Commit
5f2fdbe
·
unverified ·
1 Parent(s): f9ff64c

Run on ZeroGPU: spaces @GPU + CUDA llama.cpp (n_gpu_layers=-1)

Browse files

User set the Space to ZeroGPU. Add the spaces package and decorate the rollout
with @spaces.GPU(duration=180); switch llama-cpp-python to the cu124 prebuilt
wheel and offload all layers (n_gpu_layers=-1). Preload libcudart/libcublas from
the nvidia-* pip packages at import (ZeroGPU doesn't have them on the loader
path). Load the model per-call (GPU is freed between requests). Slider capped at
30 frames to fit the GPU window; CPU path still works via the no-op gpu shim.

Files changed (2) hide show
  1. app.py +45 -13
  2. requirements.txt +4 -1
app.py CHANGED
@@ -11,12 +11,46 @@ continues with "Frame N: …\n obj_i: pos=(x,y), vel=(vx,vy), a=…, av=…".
11
  """
12
  from __future__ import annotations
13
 
 
14
  import io
15
  import json
 
16
  import re
17
  import time
18
  from pathlib import Path
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  import gradio as gr
21
  import matplotlib
22
 
@@ -147,23 +181,20 @@ HELD_OUT = {"pong", "bowling", "ramp_roll", "angry_birds", "hourglass", "newtons
147
  # -----------------------------------------------------------------------------
148
  # Model (lazy)
149
  # -----------------------------------------------------------------------------
150
- _LLM = None
151
-
152
-
153
  def get_llm(log=lambda s: None):
154
- global _LLM
155
- if _LLM is not None:
156
- return _LLM
157
  from huggingface_hub import hf_hub_download
158
  from llama_cpp import Llama
159
 
160
- log("Downloading model (≈216 MB, first run only)…")
161
  path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
162
- log("Loading LFM2-350M into llama.cpp…")
163
- _LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=N_THREADS,
164
- n_threads_batch=N_THREADS, verbose=False)
165
- log("Model ready.")
166
- return _LLM
167
 
168
 
169
  # -----------------------------------------------------------------------------
@@ -240,6 +271,7 @@ def render(header: dict, obj_map: dict[int, dict], bounds, title: str) -> Image.
240
  # -----------------------------------------------------------------------------
241
  # Simulation (streamed)
242
  # -----------------------------------------------------------------------------
 
243
  def simulate(scenario: str, n_frames: int, temperature: float):
244
  log_lines: list[str] = []
245
 
@@ -348,7 +380,7 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
348
  value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
349
  label="Scenario",
350
  )
351
- n_frames = gr.Slider(5, 60, value=15, step=1, label="Frames to predict")
352
  temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
353
  label="Temperature (0 = greedy)")
354
  run = gr.Button("▶ Simulate", variant="primary")
 
11
  """
12
  from __future__ import annotations
13
 
14
+ import glob
15
  import io
16
  import json
17
+ import os
18
  import re
19
  import time
20
  from pathlib import Path
21
 
22
+
23
+ def _preload_cuda() -> None:
24
+ """ZeroGPU: the CUDA build of llama-cpp-python needs libcudart/libcublas on
25
+ the loader path at import time, but they aren't there by default. The pip
26
+ nvidia-* packages ship the .so's; preload them globally so `import llama_cpp`
27
+ succeeds. No-op off GPU / when the packages are absent."""
28
+ import ctypes
29
+ try:
30
+ import nvidia # noqa: F401
31
+ base = os.path.dirname(nvidia.__file__)
32
+ except Exception:
33
+ return
34
+ for sub in ("cuda_runtime", "cublas"):
35
+ for so in sorted(glob.glob(os.path.join(base, sub, "lib", "*.so*"))):
36
+ try:
37
+ ctypes.CDLL(so, mode=ctypes.RTLD_GLOBAL)
38
+ except OSError:
39
+ pass
40
+
41
+
42
+ _preload_cuda()
43
+
44
# Resolve the `gpu` decorator: the real `spaces.GPU` when the `spaces` package
# is importable, otherwise an inert stand-in so `@gpu` / `@gpu(...)` still work.
try:
    import spaces

    gpu = spaces.GPU
except Exception:  # local / non-Spaces: make @gpu(...) a no-op
    def gpu(*args, **kwargs):
        """Stand-in decorator that returns the decorated function unchanged.

        Supports both the bare form (``@gpu``) and the parameterised form
        (``@gpu(duration=...)``); any arguments are ignored.
        """
        used_bare = not kwargs and len(args) == 1 and callable(args[0])
        if used_bare:
            # Called directly with the function to decorate: hand it back.
            return args[0]

        def _identity(func):
            return func

        # Called with options: return a decorator that changes nothing.
        return _identity
53
+
54
  import gradio as gr
55
  import matplotlib
56
 
 
181
  # -----------------------------------------------------------------------------
182
  # Model (lazy)
183
  # -----------------------------------------------------------------------------
 
 
 
184
  def get_llm(log=lambda s: None):
185
+ # Built fresh each call: ZeroGPU frees the GPU between requests, so a cached
186
+ # GPU-resident model would be stale. The GGUF stays disk-cached, so only the
187
+ # (fast) load repeats.
188
  from huggingface_hub import hf_hub_download
189
  from llama_cpp import Llama
190
 
191
+ log("Fetching model (≈216 MB, cached after first run)…")
192
  path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
193
+ log("Loading LFM2-350M into llama.cpp (GPU offload)…")
194
+ # n_gpu_layers=-1 offloads all layers to the GPU when a CUDA build + GPU are
195
+ # present (ZeroGPU, inside @spaces.GPU); harmless on a CPU build.
196
+ return Llama(model_path=path, n_ctx=N_CTX, n_gpu_layers=-1,
197
+ n_threads=N_THREADS, n_threads_batch=N_THREADS, verbose=False)
198
 
199
 
200
  # -----------------------------------------------------------------------------
 
271
  # -----------------------------------------------------------------------------
272
  # Simulation (streamed)
273
  # -----------------------------------------------------------------------------
274
+ @gpu(duration=180)
275
  def simulate(scenario: str, n_frames: int, temperature: float):
276
  log_lines: list[str] = []
277
 
 
380
  value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
381
  label="Scenario",
382
  )
383
+ n_frames = gr.Slider(5, 30, value=15, step=1, label="Frames to predict")
384
  temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
385
  label="Temperature (0 = greedy)")
386
  run = gr.Button("▶ Simulate", variant="primary")
requirements.txt CHANGED
@@ -1,6 +1,9 @@
1
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
2
  gradio==6.11.0
 
3
  llama-cpp-python==0.3.23
 
 
4
  huggingface_hub
5
  matplotlib
6
  pillow
 
1
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
2
  gradio==6.11.0
3
+ spaces
4
  llama-cpp-python==0.3.23
5
+ nvidia-cuda-runtime-cu12
6
+ nvidia-cublas-cu12
7
  huggingface_hub
8
  matplotlib
9
  pillow