Spaces:

vivekchakraverty
/

gdscript-assistant

Running on Zero

vivekchakraverty Claude Opus 4.8 committed 2 days ago

Commit

043484b

1 Parent(s): 69036da

Load the LLM once at startup instead of per ZeroGPU call

generate.py previously loaded Qwen2.5-Coder-7B lazily inside the
@spaces.GPU function via lru_cache. ZeroGPU forks a fresh process per
call, so that cache never persisted and the 7B model was re-read from
disk on every request, eating the GPU time budget and making cold calls
fail with an opaque error.

Now the tokenizer + model load once at import into module globals on the
CPU (main process, CUDA-free). Each ZeroGPU fork inherits the resident
weights (copy-on-write) and only moves them to the allocated GPU inside
generate(); no per-request disk reload.

app.py also routes HF_HOME to persistent storage (/data) when mounted,
so weights are not re-downloaded on cold boots (no-op otherwise).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show

app.py +10 -0
generate.py +50 -17

app.py CHANGED Viewed

@@ -5,6 +5,16 @@ optional 1x self-correct -> render answer + validation + sources.
 """
 from __future__ import annotations
 import gradio as gr
 import rag

 """
 from __future__ import annotations
+import os
+# Route the HF cache to persistent storage (mounted at /data when the Space has
+# "persistent storage" enabled) BEFORE importing any HF library, so model and
+# embedder weights survive restarts and are not re-downloaded on every cold
+# boot. No-op when /data isn't present/writable (falls back to the default
+# ephemeral cache).
+if not os.environ.get("HF_HOME") and os.path.isdir("/data") and os.access("/data", os.W_OK):
+    os.environ["HF_HOME"] = "/data/huggingface"
 import gradio as gr
 import rag

generate.py CHANGED Viewed

@@ -1,16 +1,25 @@
 """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
-Only this module touches the GPU: the decorated ``generate`` runs under
-``@spaces.GPU`` so ZeroGPU allocates an A100 slice on demand; retrieval and
-validation stay on CPU.
-Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading the
-model (so rag/validate/app can be exercised without a GPU or a 15 GB download).
 """
 from __future__ import annotations
 import os
-from functools import lru_cache
 MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
 STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
@@ -29,16 +38,34 @@ except Exception:                                  # not on a Space
         return deco
-@lru_cache(maxsize=1)
-def _model_and_tokenizer():
     import torch
     from transformers import AutoModelForCausalLM, AutoTokenizer
-    tok = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID, torch_dtype=torch.bfloat16, device_map="auto",
     )
-    model.eval()
-    return model, tok
 def _render(messages, tok) -> str:
@@ -60,9 +87,15 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
             "\tmove_and_slide()\n```\n"
         )
     import torch
-    model, tok = _model_and_tokenizer()
     text = _render(messages, tok)
-    inputs = tok([text], return_tensors="pt").to(model.device)
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,
@@ -74,5 +107,5 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
 def warmup() -> None:
-    if not STUB:
-        _model_and_tokenizer()

 """Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
+The model + tokenizer are loaded ONCE, at import time, into this module's
+globals on the CPU (see ``_load`` and the import-time call below). ZeroGPU
+forks the main process for each ``@spaces.GPU`` call, so every fork inherits
+the already-resident weights (copy-on-write) and only has to move them onto the
+freshly-allocated GPU — there is no per-request reload from disk. The previous
+version loaded the model lazily *inside* the GPU function (via ``lru_cache``),
+but because each ZeroGPU call runs in a fresh fork that cache never persisted,
+so the 7B model was re-read from disk on every request, which ate the GPU time
+budget and made cold calls fail.
+HF weights are cached under ``HF_HOME``; ``app.py`` points that at persistent
+storage (``/data``) when it is mounted, so the weights are not re-downloaded on
+every cold boot.
+Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
+the model (so rag/validate/app can be exercised without a GPU or the download).
 """
 from __future__ import annotations
 import os
 MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
 STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
         return deco
+# ── Weights live in module globals, loaded once on the CPU. ─────────────────
+_MODEL = None
+_TOKENIZER = None
+def _load() -> None:
+    """Load the tokenizer + model into the module globals on the CPU.
+    Idempotent and CUDA-free: we deliberately do NOT use ``device_map="auto"``
+    or move to CUDA here, because on ZeroGPU there is no GPU outside a
+    ``@spaces.GPU`` function and the main process must not initialise CUDA.
+    The per-call fork moves the model onto the GPU inside ``generate``.
+    """
+    global _MODEL, _TOKENIZER
+    if STUB or _MODEL is not None:
+        return
     import torch
     from transformers import AutoModelForCausalLM, AutoTokenizer
+    _TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
+    _MODEL = AutoModelForCausalLM.from_pretrained(
+        MODEL_ID, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,
     )
+    _MODEL.eval()
+# Load at import so the Space boots with the weights resident — one disk read
+# for the whole Space lifetime, instead of one per request.
+_load()
 def _render(messages, tok) -> str:
             "\tmove_and_slide()\n```\n"
         )
     import torch
+    _load()  # no-op when already resident; safety net for any fork edge case
+    tok = _TOKENIZER
+    model = _MODEL
+    # Move the pre-loaded CPU weights onto the GPU that ZeroGPU allocated for
+    # the duration of this call — cheap next to a disk reload.
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(device)
     text = _render(messages, tok)
+    inputs = tok([text], return_tensors="pt").to(device)
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,
 def warmup() -> None:
+    """Ensure the weights are resident. Already done at import; kept for the API."""
+    _load()