vivekchakraverty Claude Opus 4.8 committed on
Commit
cccb7d5
·
1 Parent(s): 5fa56c1

ZeroGPU: load model on GPU inside @spaces.GPU (canonical), not at import

Browse files

Loading the 15GB model into the main process at import made ZeroGPU's
per-call fork heavy and tangled with asyncio, so the GPU task never
executed: it timed out ('GPU task aborted') and the in-function [gen]
log never appeared. Revert to the standard ZeroGPU pattern: keep the
main process model-free and load with device_map=cuda inside the
@spaces.GPU function, cached per GPU worker.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (1) hide show
  1. generate.py +34 -65
generate.py CHANGED
@@ -1,32 +1,30 @@
1
  """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
2
 
3
- Canonical ZeroGPU setup: ``spaces`` is imported before torch, and the model is
4
- loaded once at import into module globals and placed on CUDA. ``spaces``
5
- intercepts the global ``.to("cuda")`` so the *main* process never initialises
6
- CUDA, and it snapshots the GPU-resident model so every ``@spaces.GPU`` call
7
- reuses it — there is no per-request disk reload and no per-request 15 GB
8
- CPU->GPU transfer. Generation runs entirely on the GPU inside the decorated
9
- function.
10
-
11
- The previous revision loaded the model on the CPU and moved it to the GPU
12
- *inside* the function while gating on ``torch.cuda.is_available()``. On ZeroGPU
13
- that gate can read a stale ``False`` cached in the main process, so generation
14
- silently fell back to the CPU and blew past the 120 s GPU budget ("GPU task
15
- aborted").
16
-
17
- Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
18
- the model (so rag/validate/app can be exercised without a GPU or the download).
19
  """
20
  from __future__ import annotations
21
 
22
  import os
 
23
 
24
  MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
25
  STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
26
 
27
- # Import spaces BEFORE torch so ZeroGPU can patch CUDA and defer/snapshot the
28
- # global model placement. Degrade to a no-op decorator + CPU when running
29
- # locally without the ``spaces`` package.
30
  try:
31
  import spaces
32
  GPU = spaces.GPU
@@ -37,42 +35,24 @@ except Exception: # not on a Space
37
  def GPU(*dargs, **dkwargs):
38
  def deco(fn):
39
  return fn
40
- # support both @GPU and @GPU(duration=...)
41
  if dargs and callable(dargs[0]):
42
  return dargs[0]
43
  return deco
44
 
45
 
46
- _DEVICE = "cuda" if ON_ZERO else "cpu"
47
- _MODEL = None
48
- _TOKENIZER = None
49
-
50
-
51
- def _load() -> None:
52
- """Load the tokenizer + model once into the module globals, on ``_DEVICE``.
53
-
54
- On ZeroGPU the ``.to("cuda")`` is intercepted by ``spaces`` (imported above,
55
- before torch): the main process stays CUDA-clean and the model is made
56
- GPU-resident / snapshotted for every ``@spaces.GPU`` call.
57
- """
58
- global _MODEL, _TOKENIZER
59
- if STUB or _MODEL is not None:
60
- return
61
- from transformers import AutoModelForCausalLM, AutoTokenizer
62
  import torch
63
- _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
64
- _MODEL = AutoModelForCausalLM.from_pretrained(
 
65
  MODEL_ID, torch_dtype=torch.bfloat16,
 
66
  )
67
- _MODEL.eval()
68
- # Do NOT move to CUDA here. On ZeroGPU there is no GPU outside a @spaces.GPU
69
- # function, and touching CUDA in the main process caches is_available()=False
70
- # for the fork. The model is moved to the GPU inside generate() (forced).
71
-
72
-
73
- # Load at import so the Space boots with the weights resident — one disk read
74
- # for the whole Space lifetime, GPU-resident for every request.
75
- _load()
76
 
77
 
78
  def _render(messages, tok) -> str:
@@ -93,34 +73,23 @@ def generate(messages: list[dict], max_new_tokens: int = 256,
93
  "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
94
  "\tmove_and_slide()\n```\n"
95
  )
96
- import torch, time
97
- _load() # no-op once resident
98
- tok = _TOKENIZER
99
- model = _MODEL
100
- # Inside @spaces.GPU the GPU IS allocated for this call. Force the move to
101
- # CUDA unconditionally — do NOT gate on torch.cuda.is_available(), which can
102
- # be a stale False cached in the main process and would silently push
103
- # generation onto the CPU (then it blows the 120s budget -> GPU task aborted).
104
- dev = _DEVICE
105
- before = str(next(model.parameters()).device)
106
- model = model.to(dev)
107
- after = str(next(model.parameters()).device)
108
- print(f"[gen] forced dev={dev} cuda_avail={torch.cuda.is_available()} before={before} after={after}", flush=True)
109
  text = _render(messages, tok)
110
  inputs = tok([text], return_tensors="pt").to(dev)
111
- t0 = time.time()
112
  with torch.no_grad():
113
  out = model.generate(
114
  **inputs, max_new_tokens=max_new_tokens,
115
  do_sample=temperature > 0, temperature=max(temperature, 1e-4),
116
  top_p=0.95, pad_token_id=tok.eos_token_id,
117
  )
118
- n_new = int(out.shape[-1] - inputs["input_ids"].shape[1])
119
- print(f"[gen] generated {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)
120
  gen = out[0][inputs["input_ids"].shape[1]:]
121
  return tok.decode(gen, skip_special_tokens=True).strip()
122
 
123
 
124
  def warmup() -> None:
125
- """Ensure the weights are resident. Already done at import; kept for the API."""
126
- _load()
 
 
 
1
  """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
2
 
3
+ Canonical ZeroGPU pattern: the main process stays light (no model in it) and
4
+ the model is loaded **on the GPU inside** the ``@spaces.GPU`` function, where
5
+ the GPU actually exists. ``device_map="cuda"`` (accelerate) puts every shard on
6
+ the allocated GPU. An ``lru_cache`` keeps it resident for the life of each GPU
7
+ worker.
8
+
9
+ Why not load once at import? On ZeroGPU there is no GPU outside ``@spaces.GPU``,
10
+ and ZeroGPU forks the main process for every GPU call. Loading the 15 GB model
11
+ into the main process makes that fork heavy and tangled with gradio's asyncio
12
+ loop, and the GPU task never runs (it just times out -> "GPU task aborted").
13
+ Keeping the main process model-free is what makes the GPU call actually execute.
14
+
15
+ Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without a GPU or
16
+ the model download.
 
 
17
  """
18
  from __future__ import annotations
19
 
20
  import os
21
+ from functools import lru_cache
22
 
23
  MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
24
  STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
25
 
26
+ # Import spaces BEFORE torch so ZeroGPU can patch CUDA. Degrade to a no-op
27
+ # decorator (CPU) when running locally without the package.
 
28
  try:
29
  import spaces
30
  GPU = spaces.GPU
 
35
  def GPU(*dargs, **dkwargs):
36
  def deco(fn):
37
  return fn
 
38
  if dargs and callable(dargs[0]):
39
  return dargs[0]
40
  return deco
41
 
42
 
43
@lru_cache(maxsize=1)
def _model_and_tokenizer():
    """Load and return the ``(model, tokenizer)`` pair for ``MODEL_ID``.

    The result is memoised by ``lru_cache``, so later calls in the same
    process reuse the first result instead of loading again. The heavy
    imports are kept local so importing this module does not pull in
    torch or transformers.

    When ``ON_ZERO`` is true the weights are requested with
    ``device_map="cuda"``; otherwise no device map is passed.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Only ask for GPU placement when running on ZeroGPU.
    placement = "cuda" if ON_ZERO else None
    llm = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        device_map=placement,
    )
    llm.eval()  # inference only
    return llm, tokenizer
 
 
 
 
 
 
 
56
 
57
 
58
  def _render(messages, tok) -> str:
 
73
  "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
74
  "\tmove_and_slide()\n```\n"
75
  )
76
+ import torch
77
+ model, tok = _model_and_tokenizer()
78
+ dev = model.device
 
 
 
 
 
 
 
 
 
 
79
  text = _render(messages, tok)
80
  inputs = tok([text], return_tensors="pt").to(dev)
 
81
  with torch.no_grad():
82
  out = model.generate(
83
  **inputs, max_new_tokens=max_new_tokens,
84
  do_sample=temperature > 0, temperature=max(temperature, 1e-4),
85
  top_p=0.95, pad_token_id=tok.eos_token_id,
86
  )
 
 
87
  gen = out[0][inputs["input_ids"].shape[1]:]
88
  return tok.decode(gen, skip_special_tokens=True).strip()
89
 
90
 
91
  def warmup() -> None:
92
+ """No-op on ZeroGPU: the model can only be loaded inside @spaces.GPU (the
93
+ GPU does not exist in the main process)."""
94
+ if not ON_ZERO and not STUB:
95
+ _model_and_tokenizer()