Spaces:

vivekchakraverty
/

gdscript-assistant

Running on Zero

vivekchakraverty Claude Opus 4.8 committed 2 days ago

Commit

cccb7d5

1 Parent(s): 5fa56c1

ZeroGPU: load model on GPU inside @spaces.GPU (canonical), not at import

Loading the 15 GB model into the main process at import made ZeroGPU's
per-call fork heavy and entangled with the asyncio loop, so the GPU task
never executed: it timed out with 'GPU task aborted', and the in-function
[gen] log never appeared. Revert to the standard ZeroGPU pattern: keep the
main process model-free and load with device_map="cuda" inside the
@spaces.GPU function, cached per GPU worker.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (1) hide show

generate.py +34 -65

generate.py CHANGED Viewed

@@ -1,32 +1,30 @@
 """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
-Canonical ZeroGPU setup: ``spaces`` is imported before torch, and the model is
-loaded once at import into module globals and placed on CUDA. ``spaces``
-intercepts the global ``.to("cuda")`` so the *main* process never initialises
-CUDA, and it snapshots the GPU-resident model so every ``@spaces.GPU`` call
-reuses it — there is no per-request disk reload and no per-request 15 GB
-CPU->GPU transfer. Generation runs entirely on the GPU inside the decorated
-function.
-The previous revision loaded the model on the CPU and moved it to the GPU
-*inside* the function while gating on ``torch.cuda.is_available()``. On ZeroGPU
-that gate can read a stale ``False`` cached in the main process, so generation
-silently fell back to the CPU and blew past the 120 s GPU budget ("GPU task
-aborted").
-Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
-the model (so rag/validate/app can be exercised without a GPU or the download).
 """
 from __future__ import annotations
 import os
 MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
 STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
-# Import spaces BEFORE torch so ZeroGPU can patch CUDA and defer/snapshot the
-# global model placement. Degrade to a no-op decorator + CPU when running
-# locally without the ``spaces`` package.
 try:
     import spaces
     GPU = spaces.GPU
@@ -37,42 +35,24 @@ except Exception:                                  # not on a Space
     def GPU(*dargs, **dkwargs):
         def deco(fn):
             return fn
-        # support both @GPU and @GPU(duration=...)
         if dargs and callable(dargs[0]):
             return dargs[0]
         return deco
-_DEVICE = "cuda" if ON_ZERO else "cpu"
-_MODEL = None
-_TOKENIZER = None
-def _load() -> None:
-    """Load the tokenizer + model once into the module globals, on ``_DEVICE``.
-    On ZeroGPU the ``.to("cuda")`` is intercepted by ``spaces`` (imported above,
-    before torch): the main process stays CUDA-clean and the model is made
-    GPU-resident / snapshotted for every ``@spaces.GPU`` call.
-    """
-    global _MODEL, _TOKENIZER
-    if STUB or _MODEL is not None:
-        return
-    from transformers import AutoModelForCausalLM, AutoTokenizer
     import torch
-    _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
-    _MODEL = AutoModelForCausalLM.from_pretrained(
         MODEL_ID, torch_dtype=torch.bfloat16,
     )
-    _MODEL.eval()
-    # Do NOT move to CUDA here. On ZeroGPU there is no GPU outside a @spaces.GPU
-    # function, and touching CUDA in the main process caches is_available()=False
-    # for the fork. The model is moved to the GPU inside generate() (forced).
-# Load at import so the Space boots with the weights resident — one disk read
-# for the whole Space lifetime, GPU-resident for every request.
-_load()
 def _render(messages, tok) -> str:
@@ -93,34 +73,23 @@ def generate(messages: list[dict], max_new_tokens: int = 256,
             "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
             "\tmove_and_slide()\n```\n"
         )
-    import torch, time
-    _load()  # no-op once resident
-    tok = _TOKENIZER
-    model = _MODEL
-    # Inside @spaces.GPU the GPU IS allocated for this call. Force the move to
-    # CUDA unconditionally — do NOT gate on torch.cuda.is_available(), which can
-    # be a stale False cached in the main process and would silently push
-    # generation onto the CPU (then it blows the 120s budget -> GPU task aborted).
-    dev = _DEVICE
-    before = str(next(model.parameters()).device)
-    model = model.to(dev)
-    after = str(next(model.parameters()).device)
-    print(f"[gen] forced dev={dev} cuda_avail={torch.cuda.is_available()} before={before} after={after}", flush=True)
     text = _render(messages, tok)
     inputs = tok([text], return_tensors="pt").to(dev)
-    t0 = time.time()
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,
             do_sample=temperature > 0, temperature=max(temperature, 1e-4),
             top_p=0.95, pad_token_id=tok.eos_token_id,
         )
-    n_new = int(out.shape[-1] - inputs["input_ids"].shape[1])
-    print(f"[gen] generated {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)
     gen = out[0][inputs["input_ids"].shape[1]:]
     return tok.decode(gen, skip_special_tokens=True).strip()
 def warmup() -> None:
-    """Ensure the weights are resident. Already done at import; kept for the API."""
-    _load()

 """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
+Canonical ZeroGPU pattern: the main process stays light (no model in it) and
+the model is loaded **on the GPU inside** the ``@spaces.GPU`` function, where
+the GPU actually exists. ``device_map="cuda"`` (accelerate) puts every shard on
+the allocated GPU. An ``lru_cache`` keeps it resident for the life of each GPU
+worker.
+Why not load once at import? On ZeroGPU there is no GPU outside ``@spaces.GPU``,
+and ZeroGPU forks the main process for every GPU call. Loading the 15 GB model
+into the main process makes that fork heavy and tangled with gradio's asyncio
+loop, and the GPU task never runs (it just times out -> "GPU task aborted").
+Keeping the main process model-free is what makes the GPU call actually execute.
+Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without a GPU or
+the model download.
 """
 from __future__ import annotations
 import os
+from functools import lru_cache
 MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
 STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
+# Import spaces BEFORE torch so ZeroGPU can patch CUDA. Degrade to a no-op
+# decorator (CPU) when running locally without the package.
 try:
     import spaces
     GPU = spaces.GPU
     def GPU(*dargs, **dkwargs):
         def deco(fn):
             return fn
         if dargs and callable(dargs[0]):
             return dargs[0]
         return deco
+@lru_cache(maxsize=1)
+def _model_and_tokenizer():
+    """Load tokenizer + model. Called from inside ``generate`` so on ZeroGPU it
+    runs in the GPU worker where ``device_map="cuda"`` can place the weights."""
     import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    tok = AutoTokenizer.from_pretrained(MODEL_ID)
+    model = AutoModelForCausalLM.from_pretrained(
         MODEL_ID, torch_dtype=torch.bfloat16,
+        device_map=("cuda" if ON_ZERO else None),
     )
+    model.eval()
+    return model, tok
 def _render(messages, tok) -> str:
             "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
             "\tmove_and_slide()\n```\n"
         )
+    import torch
+    model, tok = _model_and_tokenizer()
+    dev = model.device
     text = _render(messages, tok)
     inputs = tok([text], return_tensors="pt").to(dev)
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,
             do_sample=temperature > 0, temperature=max(temperature, 1e-4),
             top_p=0.95, pad_token_id=tok.eos_token_id,
         )
     gen = out[0][inputs["input_ids"].shape[1]:]
     return tok.decode(gen, skip_special_tokens=True).strip()
 def warmup() -> None:
+    """No-op on ZeroGPU: the model can only be loaded inside @spaces.GPU (the
+    GPU does not exist in the main process)."""
+    if not ON_ZERO and not STUB:
+        _model_and_tokenizer()