vivekchakraverty Claude Opus 4.8 committed on
Commit
8df32ec
·
1 Parent(s): 043484b

ZeroGPU: keep model GPU-resident (canonical pattern)

Browse files

The prior revision loaded the model on CPU and moved it to GPU inside the
@spaces.GPU function, gated on torch.cuda.is_available(). On ZeroGPU that
gate can read a stale False cached in the main process, so generation ran
on CPU and exceeded the 120s GPU budget -> 'GPU task aborted'.

Now spaces is imported before torch (in app.py and generate.py) and the
model is placed on cuda at import; spaces defers/snapshots it so the main
process stays CUDA-clean and every call reuses the GPU-resident model.
Generation runs directly on cuda (no is_available gate, no per-call
15GB transfer).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +7 -0
  2. generate.py +30 -29
app.py CHANGED
@@ -15,6 +15,13 @@ import os
15
  if not os.environ.get("HF_HOME") and os.path.isdir("/data") and os.access("/data", os.W_OK):
16
  os.environ["HF_HOME"] = "/data/huggingface"
17
 
 
 
 
 
 
 
 
18
  import gradio as gr
19
 
20
  import rag
 
15
  if not os.environ.get("HF_HOME") and os.path.isdir("/data") and os.access("/data", os.W_OK):
16
  os.environ["HF_HOME"] = "/data/huggingface"
17
 
18
+ # Import spaces BEFORE any torch-importing library (gradio/rag/generate) so
19
+ # ZeroGPU can patch CUDA and keep the model GPU-resident. No-op off-Space.
20
+ try:
21
+ import spaces # noqa: F401
22
+ except Exception:
23
+ pass
24
+
25
  import gradio as gr
26
 
27
  import rag
generate.py CHANGED
@@ -1,18 +1,18 @@
1
  """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
2
 
3
- The model + tokenizer are loaded ONCE, at import time, into this module's
4
- globals on the CPU (see ``_load`` and the import-time call below). ZeroGPU
5
- forks the main process for each ``@spaces.GPU`` call, so every fork inherits
6
- the already-resident weights (copy-on-write) and only has to move them onto the
7
  freshly-allocated GPU — there is no per-request reload from disk. The previous
8
- version loaded the model lazily *inside* the GPU function (via ``lru_cache``),
9
- but because each ZeroGPU call runs in a fresh fork that cache never persisted,
10
- so the 7B model was re-read from disk on every request, which ate the GPU time
11
- budget and made cold calls fail.
12
-
13
- HF weights are cached under ``HF_HOME``; ``app.py`` points that at persistent
14
- storage (``/data``) when it is mounted, so the weights are not re-downloaded on
15
- every cold boot.
16
 
17
  Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
18
  the model (so rag/validate/app can be exercised without a GPU or the download).
@@ -24,11 +24,16 @@ import os
24
  MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
25
  STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
26
 
27
- # Optional ZeroGPU decorator — degrade to a no-op when running locally.
 
 
28
  try:
29
  import spaces
30
  GPU = spaces.GPU
 
31
  except Exception: # not on a Space
 
 
32
  def GPU(*dargs, **dkwargs):
33
  def deco(fn):
34
  return fn
@@ -38,33 +43,33 @@ except Exception: # not on a Space
38
  return deco
39
 
40
 
41
- # ── Weights live in module globals, loaded once on the CPU. ─────────────────
42
  _MODEL = None
43
  _TOKENIZER = None
44
 
45
 
46
  def _load() -> None:
47
- """Load the tokenizer + model into the module globals on the CPU.
48
 
49
- Idempotent and CUDA-free: we deliberately do NOT use ``device_map="auto"``
50
- or move to CUDA here, because on ZeroGPU there is no GPU outside a
51
- ``@spaces.GPU`` function and the main process must not initialise CUDA.
52
- The per-call fork moves the model onto the GPU inside ``generate``.
53
  """
54
  global _MODEL, _TOKENIZER
55
  if STUB or _MODEL is not None:
56
  return
57
- import torch
58
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
59
  _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
60
  _MODEL = AutoModelForCausalLM.from_pretrained(
61
- MODEL_ID, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,
62
  )
63
  _MODEL.eval()
 
64
 
65
 
66
  # Load at import so the Space boots with the weights resident — one disk read
67
- # for the whole Space lifetime, instead of one per request.
68
  _load()
69
 
70
 
@@ -87,15 +92,11 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
87
  "\tmove_and_slide()\n```\n"
88
  )
89
  import torch
90
- _load() # no-op when already resident; safety net for any fork edge case
91
  tok = _TOKENIZER
92
  model = _MODEL
93
- # Move the pre-loaded CPU weights onto the GPU that ZeroGPU allocated for
94
- # the duration of this call β€” cheap next to a disk reload.
95
- device = "cuda" if torch.cuda.is_available() else "cpu"
96
- model = model.to(device)
97
  text = _render(messages, tok)
98
- inputs = tok([text], return_tensors="pt").to(device)
99
  with torch.no_grad():
100
  out = model.generate(
101
  **inputs, max_new_tokens=max_new_tokens,
 
1
  """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
2
 
3
+ Canonical ZeroGPU setup: ``spaces`` is imported before torch, and the model is
4
+ loaded once at import into module globals and placed on CUDA. ``spaces``
5
+ intercepts the global ``.to("cuda")`` so the *main* process never initialises
6
+ CUDA, and it snapshots the GPU-resident model so every ``@spaces.GPU`` call
7
+ reuses it — there is no per-request disk reload and no per-request 15 GB
8
+ CPU->GPU transfer. Generation runs entirely on the GPU inside the decorated
9
+ function.
10
+
11
+ The previous revision loaded the model on the CPU and moved it to the GPU
12
+ *inside* the function while gating on ``torch.cuda.is_available()``. On ZeroGPU
13
+ that gate can read a stale ``False`` cached in the main process, so generation
14
+ silently fell back to the CPU and blew past the 120 s GPU budget ("GPU task
15
+ aborted").
16
 
17
  Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
18
  the model (so rag/validate/app can be exercised without a GPU or the download).
 
24
  MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
25
  STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
26
 
27
+ # Import spaces BEFORE torch so ZeroGPU can patch CUDA and defer/snapshot the
28
+ # global model placement. Degrade to a no-op decorator + CPU when running
29
+ # locally without the ``spaces`` package.
30
  try:
31
  import spaces
32
  GPU = spaces.GPU
33
+ ON_ZERO = True
34
  except Exception: # not on a Space
35
+ ON_ZERO = False
36
+
37
  def GPU(*dargs, **dkwargs):
38
  def deco(fn):
39
  return fn
 
43
  return deco
44
 
45
 
46
+ _DEVICE = "cuda" if ON_ZERO else "cpu"
47
  _MODEL = None
48
  _TOKENIZER = None
49
 
50
 
51
  def _load() -> None:
52
+ """Load the tokenizer + model once into the module globals, on ``_DEVICE``.
53
 
54
+ On ZeroGPU the ``.to("cuda")`` is intercepted by ``spaces`` (imported above,
55
+ before torch): the main process stays CUDA-clean and the model is made
56
+ GPU-resident / snapshotted for every ``@spaces.GPU`` call.
 
57
  """
58
  global _MODEL, _TOKENIZER
59
  if STUB or _MODEL is not None:
60
  return
 
61
  from transformers import AutoModelForCausalLM, AutoTokenizer
62
+ import torch
63
  _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
64
  _MODEL = AutoModelForCausalLM.from_pretrained(
65
+ MODEL_ID, torch_dtype=torch.bfloat16,
66
  )
67
  _MODEL.eval()
68
+ _MODEL.to(_DEVICE)
69
 
70
 
71
  # Load at import so the Space boots with the weights resident — one disk read
72
+ # for the whole Space lifetime, GPU-resident for every request.
73
  _load()
74
 
75
 
 
92
  "\tmove_and_slide()\n```\n"
93
  )
94
  import torch
95
+ _load() # no-op once resident
96
  tok = _TOKENIZER
97
  model = _MODEL
 
 
 
 
98
  text = _render(messages, tok)
99
+ inputs = tok([text], return_tensors="pt").to(_DEVICE)
100
  with torch.no_grad():
101
  out = model.generate(
102
  **inputs, max_new_tokens=max_new_tokens,