Spaces:
Running on Zero
Running on Zero
Run on ZeroGPU: spaces @GPU + CUDA llama.cpp (n_gpu_layers=-1)
Browse files
The user set the Space to ZeroGPU. Add the `spaces` package and decorate the rollout
with `@spaces.GPU(duration=180)`; switch llama-cpp-python to the cu124 prebuilt
wheel and offload all layers (`n_gpu_layers=-1`). Preload libcudart/libcublas from
the nvidia-* pip packages at import time (ZeroGPU doesn't have them on the loader
path). Load the model per call (the GPU is freed between requests). The slider is capped at
30 frames to fit the GPU window; the CPU path still works via the no-op `gpu` shim.
- app.py +45 -13
- requirements.txt +4 -1
app.py
CHANGED
|
@@ -11,12 +11,46 @@ continues with "Frame N: …\n obj_i: pos=(x,y), vel=(vx,vy), a=…, av=…".
|
|
| 11 |
"""
|
| 12 |
from __future__ import annotations
|
| 13 |
|
|
|
|
| 14 |
import io
|
| 15 |
import json
|
|
|
|
| 16 |
import re
|
| 17 |
import time
|
| 18 |
from pathlib import Path
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
import gradio as gr
|
| 21 |
import matplotlib
|
| 22 |
|
|
@@ -147,23 +181,20 @@ HELD_OUT = {"pong", "bowling", "ramp_roll", "angry_birds", "hourglass", "newtons
|
|
| 147 |
# -----------------------------------------------------------------------------
|
| 148 |
# Model (lazy)
|
| 149 |
# -----------------------------------------------------------------------------
|
| 150 |
-
_LLM = None
|
| 151 |
-
|
| 152 |
-
|
| 153 |
def get_llm(log=lambda s: None):
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
from huggingface_hub import hf_hub_download
|
| 158 |
from llama_cpp import Llama
|
| 159 |
|
| 160 |
-
log("
|
| 161 |
path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
|
| 162 |
-
log("Loading LFM2-350M into llama.cpp…")
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
|
| 168 |
|
| 169 |
# -----------------------------------------------------------------------------
|
|
@@ -240,6 +271,7 @@ def render(header: dict, obj_map: dict[int, dict], bounds, title: str) -> Image.
|
|
| 240 |
# -----------------------------------------------------------------------------
|
| 241 |
# Simulation (streamed)
|
| 242 |
# -----------------------------------------------------------------------------
|
|
|
|
| 243 |
def simulate(scenario: str, n_frames: int, temperature: float):
|
| 244 |
log_lines: list[str] = []
|
| 245 |
|
|
@@ -348,7 +380,7 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
|
|
| 348 |
value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
|
| 349 |
label="Scenario",
|
| 350 |
)
|
| 351 |
-
n_frames = gr.Slider(5,
|
| 352 |
temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
|
| 353 |
label="Temperature (0 = greedy)")
|
| 354 |
run = gr.Button("▶ Simulate", variant="primary")
|
|
|
|
| 11 |
"""
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
+
import glob
|
| 15 |
import io
|
| 16 |
import json
|
| 17 |
+
import os
|
| 18 |
import re
|
| 19 |
import time
|
| 20 |
from pathlib import Path
|
| 21 |
|
| 22 |
+
|
| 23 |
+
def _preload_cuda() -> None:
|
| 24 |
+
"""ZeroGPU: the CUDA build of llama-cpp-python needs libcudart/libcublas on
|
| 25 |
+
the loader path at import time, but they aren't there by default. The pip
|
| 26 |
+
nvidia-* packages ship the .so's; preload them globally so `import llama_cpp`
|
| 27 |
+
succeeds. No-op off GPU / when the packages are absent."""
|
| 28 |
+
import ctypes
|
| 29 |
+
try:
|
| 30 |
+
import nvidia # noqa: F401
|
| 31 |
+
base = os.path.dirname(nvidia.__file__)
|
| 32 |
+
except Exception:
|
| 33 |
+
return
|
| 34 |
+
for sub in ("cuda_runtime", "cublas"):
|
| 35 |
+
for so in sorted(glob.glob(os.path.join(base, sub, "lib", "*.so*"))):
|
| 36 |
+
try:
|
| 37 |
+
ctypes.CDLL(so, mode=ctypes.RTLD_GLOBAL)
|
| 38 |
+
except OSError:
|
| 39 |
+
pass
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
_preload_cuda()
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
import spaces
|
| 46 |
+
|
| 47 |
+
gpu = spaces.GPU
|
| 48 |
+
except Exception: # local / non-Spaces: make @gpu(...) a no-op
|
| 49 |
+
def gpu(*args, **kwargs):
|
| 50 |
+
if len(args) == 1 and callable(args[0]) and not kwargs:
|
| 51 |
+
return args[0]
|
| 52 |
+
return lambda f: f
|
| 53 |
+
|
| 54 |
import gradio as gr
|
| 55 |
import matplotlib
|
| 56 |
|
|
|
|
| 181 |
# -----------------------------------------------------------------------------
|
| 182 |
# Model (lazy)
|
| 183 |
# -----------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
| 184 |
def get_llm(log=lambda s: None):
|
| 185 |
+
# Built fresh each call: ZeroGPU frees the GPU between requests, so a cached
|
| 186 |
+
# GPU-resident model would be stale. The GGUF stays disk-cached, so only the
|
| 187 |
+
# (fast) load repeats.
|
| 188 |
from huggingface_hub import hf_hub_download
|
| 189 |
from llama_cpp import Llama
|
| 190 |
|
| 191 |
+
log("Fetching model (≈216 MB, cached after first run)…")
|
| 192 |
path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
|
| 193 |
+
log("Loading LFM2-350M into llama.cpp (GPU offload)…")
|
| 194 |
+
# n_gpu_layers=-1 offloads all layers to the GPU when a CUDA build + GPU are
|
| 195 |
+
# present (ZeroGPU, inside @spaces.GPU); harmless on a CPU build.
|
| 196 |
+
return Llama(model_path=path, n_ctx=N_CTX, n_gpu_layers=-1,
|
| 197 |
+
n_threads=N_THREADS, n_threads_batch=N_THREADS, verbose=False)
|
| 198 |
|
| 199 |
|
| 200 |
# -----------------------------------------------------------------------------
|
|
|
|
| 271 |
# -----------------------------------------------------------------------------
|
| 272 |
# Simulation (streamed)
|
| 273 |
# -----------------------------------------------------------------------------
|
| 274 |
+
@gpu(duration=180)
|
| 275 |
def simulate(scenario: str, n_frames: int, temperature: float):
|
| 276 |
log_lines: list[str] = []
|
| 277 |
|
|
|
|
| 380 |
value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
|
| 381 |
label="Scenario",
|
| 382 |
)
|
| 383 |
+
n_frames = gr.Slider(5, 30, value=15, step=1, label="Frames to predict")
|
| 384 |
temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
|
| 385 |
label="Temperature (0 = greedy)")
|
| 386 |
run = gr.Button("▶ Simulate", variant="primary")
|
requirements.txt
CHANGED
|
@@ -1,6 +1,9 @@
|
|
| 1 |
-
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/
|
| 2 |
gradio==6.11.0
|
|
|
|
| 3 |
llama-cpp-python==0.3.23
|
|
|
|
|
|
|
| 4 |
huggingface_hub
|
| 5 |
matplotlib
|
| 6 |
pillow
|
|
|
|
| 1 |
+
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
|
| 2 |
gradio==6.11.0
|
| 3 |
+
spaces
|
| 4 |
llama-cpp-python==0.3.23
|
| 5 |
+
nvidia-cuda-runtime-cu12
|
| 6 |
+
nvidia-cublas-cu12
|
| 7 |
huggingface_hub
|
| 8 |
matplotlib
|
| 9 |
pillow
|