Spaces:

AlexWortega
/

physics-llm

Running on Zero

App Files Files Community

AlexWortega committed 10 days ago

Commit

5f2fdbe

unverified ·

1 Parent(s): f9ff64c

Run on ZeroGPU: spaces @GPU + CUDA llama.cpp (n_gpu_layers=-1)

Browse files

User set the Space to ZeroGPU. Add the spaces package and decorate the rollout
with @spaces.GPU(duration=180); switch llama-cpp-python to the cu124 prebuilt
wheel and offload all layers (n_gpu_layers=-1). Preload libcudart/libcublas from
the nvidia-* pip packages at import (ZeroGPU doesn't have them on the loader
path). Load the model per-call (GPU is freed between requests). Slider capped at
30 frames to fit the GPU window; CPU path still works via the no-op gpu shim.

Files changed (2) hide show

app.py +45 -13
requirements.txt +4 -1

app.py CHANGED Viewed

@@ -11,12 +11,46 @@ continues with "Frame N: …\n  obj_i: pos=(x,y), vel=(vx,vy), a=…, av=…".
 """
 from __future__ import annotations
 import io
 import json
 import re
 import time
 from pathlib import Path
 import gradio as gr
 import matplotlib
@@ -147,23 +181,20 @@ HELD_OUT = {"pong", "bowling", "ramp_roll", "angry_birds", "hourglass", "newtons
 # -----------------------------------------------------------------------------
 # Model (lazy)
 # -----------------------------------------------------------------------------
-_LLM = None
 def get_llm(log=lambda s: None):
-    global _LLM
-    if _LLM is not None:
-        return _LLM
     from huggingface_hub import hf_hub_download
     from llama_cpp import Llama
-    log("Downloading model (≈216 MB, first run only)…")
     path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
-    log("Loading LFM2-350M into llama.cpp…")
-    _LLM = Llama(model_path=path, n_ctx=N_CTX, n_threads=N_THREADS,
-                 n_threads_batch=N_THREADS, verbose=False)
-    log("Model ready.")
-    return _LLM
 # -----------------------------------------------------------------------------
@@ -240,6 +271,7 @@ def render(header: dict, obj_map: dict[int, dict], bounds, title: str) -> Image.
 # -----------------------------------------------------------------------------
 # Simulation (streamed)
 # -----------------------------------------------------------------------------
 def simulate(scenario: str, n_frames: int, temperature: float):
     log_lines: list[str] = []
@@ -348,7 +380,7 @@ with gr.Blocks(title="Physics LLM 🪀") as demo:
                 value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
                 label="Scenario",
             )
-            n_frames = gr.Slider(5, 60, value=15, step=1, label="Frames to predict")
             temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
                                     label="Temperature (0 = greedy)")
             run = gr.Button("▶ Simulate", variant="primary")

 """
 from __future__ import annotations
+import glob
 import io
 import json
+import os
 import re
 import time
 from pathlib import Path
+def _preload_cuda() -> None:
+    """Preload the CUDA runtime and cuBLAS shared libraries from the pip wheels.
+    The CUDA build of llama-cpp-python needs libcudart/libcublas on the loader
+    path when `import llama_cpp` runs, and on ZeroGPU they aren't there by
+    default. The pip nvidia-* packages ship the .so's, so each one is opened
+    with RTLD_GLOBAL to make its symbols visible to libraries loaded later.
+    Best effort: a no-op when the `nvidia` package is absent, and any single
+    library that fails to load is skipped."""
+    import ctypes
+    try:
+        import nvidia
+    except ImportError:
+        return
+    # Use __path__, not __file__: when `nvidia` resolves as a namespace package
+    # (no __init__.py) __file__ is None, and dirname(None) would raise and
+    # silently disable the preload. __path__ also covers wheels that were
+    # installed into more than one site directory.
+    roots = list(getattr(nvidia, "__path__", ()))
+    # Runtime first, then cuBLAS, so cuBLAS can resolve libcudart symbols.
+    for sub in ("cuda_runtime", "cublas"):
+        for base in roots:
+            for so in sorted(glob.glob(os.path.join(base, sub, "lib", "*.so*"))):
+                try:
+                    ctypes.CDLL(so, mode=ctypes.RTLD_GLOBAL)
+                except OSError:
+                    # Not loadable on this host (e.g. no driver): keep going.
+                    pass
+_preload_cuda()
+try:
+    import spaces
+    gpu = spaces.GPU
+except Exception:
+    # Local / non-Spaces run: stand in for spaces.GPU with a decorator that
+    # leaves the wrapped function untouched.
+    def gpu(*args, **kwargs):
+        """No-op replacement usable both as `@gpu` and as `@gpu(...)`."""
+        bare_use = not kwargs and len(args) == 1 and callable(args[0])
+        if not bare_use:
+            def passthrough(fn):
+                return fn
+            return passthrough
+        return args[0]
 import gradio as gr
 import matplotlib
 # -----------------------------------------------------------------------------
 # Model (lazy)
 # -----------------------------------------------------------------------------
 def get_llm(log=lambda s: None):
+    """Fetch the GGUF weights and return a newly constructed ``Llama``.
+    ``log`` is called with a short status string before the fetch and again
+    before the load; the default ignores it. Nothing is cached here: every
+    call runs ``hf_hub_download`` and builds a new ``Llama`` object."""
+    # Built fresh each call: ZeroGPU frees the GPU between requests, so a cached
+    # GPU-resident model would be stale. The GGUF stays disk-cached, so only the
+    # (fast) load repeats.
     from huggingface_hub import hf_hub_download
     from llama_cpp import Llama
+    log("Fetching model (≈216 MB, cached after first run)…")
     path = hf_hub_download(repo_id=GGUF_REPO, filename=GGUF_FILE)
+    log("Loading LFM2-350M into llama.cpp (GPU offload)…")
+    # n_gpu_layers=-1 offloads all layers to the GPU when a CUDA build + GPU are
+    # present (ZeroGPU, inside @spaces.GPU); harmless on a CPU build.
+    return Llama(model_path=path, n_ctx=N_CTX, n_gpu_layers=-1,
+                 n_threads=N_THREADS, n_threads_batch=N_THREADS, verbose=False)
 # -----------------------------------------------------------------------------
 # -----------------------------------------------------------------------------
 # Simulation (streamed)
 # -----------------------------------------------------------------------------
+@gpu(duration=180)
 def simulate(scenario: str, n_frames: int, temperature: float):
     log_lines: list[str] = []
                 value="bowling" if "bowling" in SCENARIOS else (sorted(SCENARIOS)[0] if SCENARIOS else None),
                 label="Scenario",
             )
+            n_frames = gr.Slider(5, 30, value=15, step=1, label="Frames to predict")
             temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.05,
                                     label="Temperature (0 = greedy)")
             run = gr.Button("▶ Simulate", variant="primary")

requirements.txt CHANGED Viewed

@@ -1,6 +1,9 @@
---extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 gradio==6.11.0
 llama-cpp-python==0.3.23
 huggingface_hub
 matplotlib
 pillow

+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
 gradio==6.11.0
+spaces
 llama-cpp-python==0.3.23
+nvidia-cuda-runtime-cu12
+nvidia-cublas-cu12
 huggingface_hub
 matplotlib
 pillow