Spaces:

vivekchakraverty
/

gdscript-assistant

Running on Zero

vivekchakraverty Claude Opus 4.8 committed 2 days ago

Commit

8df32ec

1 Parent(s): 043484b

ZeroGPU: keep model GPU-resident (canonical pattern)

The prior revision loaded the model on CPU and moved it to GPU inside the
@spaces.GPU function, gated on torch.cuda.is_available(). On ZeroGPU that
gate can read a stale False cached in the main process, so generation ran
on CPU and exceeded the 120s GPU budget, failing with 'GPU task aborted'.

Now spaces is imported before torch (in app.py and generate.py) and the
model is placed on cuda at import; spaces defers/snapshots it so the main
process stays CUDA-clean and every call reuses the GPU-resident model.
Generation runs directly on cuda (no is_available gate, no per-call
15GB transfer).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show

app.py +7 -0
generate.py +30 -29

app.py CHANGED Viewed

@@ -15,6 +15,13 @@ import os
 if not os.environ.get("HF_HOME") and os.path.isdir("/data") and os.access("/data", os.W_OK):
     os.environ["HF_HOME"] = "/data/huggingface"
 import gradio as gr
 import rag

 if not os.environ.get("HF_HOME") and os.path.isdir("/data") and os.access("/data", os.W_OK):
     os.environ["HF_HOME"] = "/data/huggingface"
+# Import spaces BEFORE any torch-importing library (gradio/rag/generate) so
+# ZeroGPU can patch CUDA and keep the model GPU-resident. No-op off-Space.
+try:
+    import spaces  # noqa: F401
+except Exception:
+    pass
 import gradio as gr
 import rag

generate.py CHANGED Viewed

@@ -1,18 +1,18 @@
 """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
-The model + tokenizer are loaded ONCE, at import time, into this module's
-globals on the CPU (see ``_load`` and the import-time call below). ZeroGPU
-forks the main process for each ``@spaces.GPU`` call, so every fork inherits
-the already-resident weights (copy-on-write) and only has to move them onto the
-freshly-allocated GPU — there is no per-request reload from disk. The previous
-version loaded the model lazily *inside* the GPU function (via ``lru_cache``),
-but because each ZeroGPU call runs in a fresh fork that cache never persisted,
-so the 7B model was re-read from disk on every request, which ate the GPU time
-budget and made cold calls fail.
-HF weights are cached under ``HF_HOME``; ``app.py`` points that at persistent
-storage (``/data``) when it is mounted, so the weights are not re-downloaded on
-every cold boot.
 Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
 the model (so rag/validate/app can be exercised without a GPU or the download).
@@ -24,11 +24,16 @@ import os
 MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
 STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
-# Optional ZeroGPU decorator — degrade to a no-op when running locally.
 try:
     import spaces
     GPU = spaces.GPU
 except Exception:                                  # not on a Space
     def GPU(*dargs, **dkwargs):
         def deco(fn):
             return fn
@@ -38,33 +43,33 @@ except Exception:                                  # not on a Space
         return deco
-# ── Weights live in module globals, loaded once on the CPU. ─────────────────
 _MODEL = None
 _TOKENIZER = None
 def _load() -> None:
-    """Load the tokenizer + model into the module globals on the CPU.
-    Idempotent and CUDA-free: we deliberately do NOT use ``device_map="auto"``
-    or move to CUDA here, because on ZeroGPU there is no GPU outside a
-    ``@spaces.GPU`` function and the main process must not initialise CUDA.
-    The per-call fork moves the model onto the GPU inside ``generate``.
     """
     global _MODEL, _TOKENIZER
     if STUB or _MODEL is not None:
         return
-    import torch
     from transformers import AutoModelForCausalLM, AutoTokenizer
     _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
     _MODEL = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,
     )
     _MODEL.eval()
 # Load at import so the Space boots with the weights resident — one disk read
-# for the whole Space lifetime, instead of one per request.
 _load()
@@ -87,15 +92,11 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
             "\tmove_and_slide()\n```\n"
         )
     import torch
-    _load()  # no-op when already resident; safety net for any fork edge case
     tok = _TOKENIZER
     model = _MODEL
-    # Move the pre-loaded CPU weights onto the GPU that ZeroGPU allocated for
-    # the duration of this call — cheap next to a disk reload.
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    model = model.to(device)
     text = _render(messages, tok)
-    inputs = tok([text], return_tensors="pt").to(device)
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,

 """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
+Canonical ZeroGPU setup: ``spaces`` is imported before torch, and the model is
+loaded once at import into module globals and placed on CUDA. ``spaces``
+intercepts the global ``.to("cuda")`` so the *main* process never initialises
+CUDA, and it snapshots the GPU-resident model so every ``@spaces.GPU`` call
+reuses it — there is no per-request disk reload and no per-request 15 GB
+CPU->GPU transfer. Generation runs entirely on the GPU inside the decorated
+function.
+The previous revision loaded the model on the CPU and moved it to the GPU
+*inside* the function while gating on ``torch.cuda.is_available()``. On ZeroGPU
+that gate can read a stale ``False`` cached in the main process, so generation
+silently fell back to the CPU and blew past the 120 s GPU budget ("GPU task
+aborted").
 Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
 the model (so rag/validate/app can be exercised without a GPU or the download).
 MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
 STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
+# Import spaces BEFORE torch so ZeroGPU can patch CUDA and defer/snapshot the
+# global model placement. Degrade to a no-op decorator + CPU when running
+# locally without the ``spaces`` package.
 try:
     import spaces
     GPU = spaces.GPU
+    ON_ZERO = True
 except Exception:                                  # not on a Space
+    ON_ZERO = False
     def GPU(*dargs, **dkwargs):
         def deco(fn):
             return fn
         return deco
+_DEVICE = "cuda" if ON_ZERO else "cpu"
 _MODEL = None
 _TOKENIZER = None
 def _load() -> None:
+    """Load the tokenizer + model once into the module globals, on ``_DEVICE``.
+    On ZeroGPU the ``.to("cuda")`` is intercepted by ``spaces`` (imported above,
+    before torch): the main process stays CUDA-clean and the model is made
+    GPU-resident / snapshotted for every ``@spaces.GPU`` call.
     """
     global _MODEL, _TOKENIZER
     if STUB or _MODEL is not None:
         return
     from transformers import AutoModelForCausalLM, AutoTokenizer
+    import torch
     _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
     _MODEL = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID, torch_dtype=torch.bfloat16,
     )
     _MODEL.eval()
+    _MODEL.to(_DEVICE)
 # Load at import so the Space boots with the weights resident — one disk read
+# for the whole Space lifetime, GPU-resident for every request.
 _load()
             "\tmove_and_slide()\n```\n"
         )
     import torch
+    _load()  # no-op once resident
     tok = _TOKENIZER
     model = _MODEL
     text = _render(messages, tok)
+    inputs = tok([text], return_tensors="pt").to(_DEVICE)
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,