vivekchakraverty Claude Opus 4.8 committed on
Commit
043484b
·
1 Parent(s): 69036da

Load the LLM once at startup instead of per ZeroGPU call

Browse files

generate.py previously loaded Qwen2.5-Coder-7B lazily inside the
@spaces.GPU function via lru_cache. ZeroGPU forks a fresh process per
call, so that cache never persisted and the 7B model was re-read from
disk on every request, eating the GPU time budget and making cold calls
fail with an opaque error.

Now the tokenizer + model load once at import into module globals on the
CPU (main process, CUDA-free). Each ZeroGPU fork inherits the resident
weights (copy-on-write) and only moves them to the allocated GPU inside
generate(); no per-request disk reload.

app.py also routes HF_HOME to persistent storage (/data) when mounted,
so weights are not re-downloaded on cold boots (no-op otherwise).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show
  1. app.py +10 -0
  2. generate.py +50 -17
app.py CHANGED
@@ -5,6 +5,16 @@ optional 1x self-correct -> render answer + validation + sources.
5
  """
6
  from __future__ import annotations
7
 
 
 
 
 
 
 
 
 
 
 
8
  import gradio as gr
9
 
10
  import rag
 
5
  """
6
  from __future__ import annotations
7
 
8
+ import os
9
+
10
+ # Route the HF cache to persistent storage (mounted at /data when the Space has
11
+ # "persistent storage" enabled) BEFORE importing any HF library, so model and
12
+ # embedder weights survive restarts and are not re-downloaded on every cold
13
+ # boot. No-op when /data isn't present/writable (falls back to the default
14
+ # ephemeral cache).
15
+ if not os.environ.get("HF_HOME") and os.path.isdir("/data") and os.access("/data", os.W_OK):
16
+ os.environ["HF_HOME"] = "/data/huggingface"
17
+
18
  import gradio as gr
19
 
20
  import rag
generate.py CHANGED
@@ -1,16 +1,25 @@
1
  """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
2
 
3
- Only this module touches the GPU: the decorated ``generate`` runs under
4
- ``@spaces.GPU`` so ZeroGPU allocates an A100 slice on demand; retrieval and
5
- validation stay on CPU.
 
 
 
 
 
 
6
 
7
- Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading the
8
- model (so rag/validate/app can be exercised without a GPU or a 15 GB download).
 
 
 
 
9
  """
10
  from __future__ import annotations
11
 
12
  import os
13
- from functools import lru_cache
14
 
15
  MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
16
  STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
@@ -29,16 +38,34 @@ except Exception: # not on a Space
29
  return deco
30
 
31
 
32
- @lru_cache(maxsize=1)
33
- def _model_and_tokenizer():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  import torch
35
  from transformers import AutoModelForCausalLM, AutoTokenizer
36
- tok = AutoTokenizer.from_pretrained(MODEL_ID)
37
- model = AutoModelForCausalLM.from_pretrained(
38
- MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto",
39
  )
40
- model.eval()
41
- return model, tok
 
 
 
 
42
 
43
 
44
  def _render(messages, tok) -> str:
@@ -60,9 +87,15 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
60
  "\tmove_and_slide()\n```\n"
61
  )
62
  import torch
63
- model, tok = _model_and_tokenizer()
 
 
 
 
 
 
64
  text = _render(messages, tok)
65
- inputs = tok([text], return_tensors="pt").to(model.device)
66
  with torch.no_grad():
67
  out = model.generate(
68
  **inputs, max_new_tokens=max_new_tokens,
@@ -74,5 +107,5 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
74
 
75
 
76
  def warmup() -> None:
77
- if not STUB:
78
- _model_and_tokenizer()
 
1
  """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
2
 
3
+ The model + tokenizer are loaded ONCE, at import time, into this module's
4
+ globals on the CPU (see ``_load`` and the import-time call below). ZeroGPU
5
+ forks the main process for each ``@spaces.GPU`` call, so every fork inherits
6
+ the already-resident weights (copy-on-write) and only has to move them onto the
7
+ freshly-allocated GPU — there is no per-request reload from disk. The previous
8
+ version loaded the model lazily *inside* the GPU function (via ``lru_cache``),
9
+ but because each ZeroGPU call runs in a fresh fork that cache never persisted,
10
+ so the 7B model was re-read from disk on every request, which ate the GPU time
11
+ budget and made cold calls fail.
12
 
13
+ HF weights are cached under ``HF_HOME``; ``app.py`` points that at persistent
14
+ storage (``/data``) when it is mounted, so the weights are not re-downloaded on
15
+ every cold boot.
16
+
17
+ Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
18
+ the model (so rag/validate/app can be exercised without a GPU or the download).
19
  """
20
  from __future__ import annotations
21
 
22
  import os
 
23
 
24
  MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
25
  STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
 
38
  return deco
39
 
40
 
41
+ # ── Weights live in module globals, loaded once on the CPU. ─────────────────
42
+ _MODEL = None
43
+ _TOKENIZER = None
44
+
45
+
46
+ def _load() -> None:
47
+ """Load the tokenizer + model into the module globals on the CPU.
48
+
49
+ Idempotent and CUDA-free: we deliberately do NOT use ``device_map="auto"``
50
+ or move to CUDA here, because on ZeroGPU there is no GPU outside a
51
+ ``@spaces.GPU`` function and the main process must not initialise CUDA.
52
+ The per-call fork moves the model onto the GPU inside ``generate``.
53
+ """
54
+ global _MODEL, _TOKENIZER
55
+ if STUB or _MODEL is not None:
56
+ return
57
  import torch
58
  from transformers import AutoModelForCausalLM, AutoTokenizer
59
+ _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
60
+ _MODEL = AutoModelForCausalLM.from_pretrained(
61
+ MODEL_ID, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,
62
  )
63
+ _MODEL.eval()
64
+
65
+
66
+ # Load at import so the Space boots with the weights resident — one disk read
67
+ # for the whole Space lifetime, instead of one per request.
68
+ _load()
69
 
70
 
71
  def _render(messages, tok) -> str:
 
87
  "\tmove_and_slide()\n```\n"
88
  )
89
  import torch
90
+ _load() # no-op when already resident; safety net for any fork edge case
91
+ tok = _TOKENIZER
92
+ model = _MODEL
93
+ # Move the pre-loaded CPU weights onto the GPU that ZeroGPU allocated for
94
+ # the duration of this call — cheap next to a disk reload.
95
+ device = "cuda" if torch.cuda.is_available() else "cpu"
96
+ model = model.to(device)
97
  text = _render(messages, tok)
98
+ inputs = tok([text], return_tensors="pt").to(device)
99
  with torch.no_grad():
100
  out = model.generate(
101
  **inputs, max_new_tokens=max_new_tokens,
 
107
 
108
 
109
  def warmup() -> None:
110
+ """Ensure the weights are resident. Already done at import; kept for the API."""
111
+ _load()