Spaces:
Running on Zero
Running on Zero
Commit ·
043484b
1
Parent(s): 69036da
Load the LLM once at startup instead of per ZeroGPU call
Browse files
generate.py previously loaded Qwen2.5-Coder-7B lazily inside the
@spaces.GPU function via lru_cache. ZeroGPU forks a fresh process per
call, so that cache never persisted and the 7B model was re-read from
disk on every request, eating the GPU time budget and making cold calls
fail with an opaque error.
Now the tokenizer + model load once at import into module globals on the
CPU (main process, CUDA-free). Each ZeroGPU fork inherits the resident
weights (copy-on-write) and only moves them to the allocated GPU inside
generate(); no per-request disk reload.
app.py also routes HF_HOME to persistent storage (/data) when mounted,
so weights are not re-downloaded on cold boots (no-op otherwise).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
- app.py +10 -0
- generate.py +50 -17
app.py
CHANGED
|
@@ -5,6 +5,16 @@ optional 1x self-correct -> render answer + validation + sources.
|
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import gradio as gr
|
| 9 |
|
| 10 |
import rag
|
|
|
|
| 5 |
"""
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Route the HF cache to persistent storage (mounted at /data when the Space has
|
| 11 |
+
# "persistent storage" enabled) BEFORE importing any HF library, so model and
|
| 12 |
+
# embedder weights survive restarts and are not re-downloaded on every cold
|
| 13 |
+
# boot. No-op when /data isn't present/writable (falls back to the default
|
| 14 |
+
# ephemeral cache).
|
| 15 |
+
if not os.environ.get("HF_HOME") and os.path.isdir("/data") and os.access("/data", os.W_OK):
|
| 16 |
+
os.environ["HF_HOME"] = "/data/huggingface"
|
| 17 |
+
|
| 18 |
import gradio as gr
|
| 19 |
|
| 20 |
import rag
|
generate.py
CHANGED
|
@@ -1,16 +1,25 @@
|
|
| 1 |
"""Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
from __future__ import annotations
|
| 11 |
|
| 12 |
import os
|
| 13 |
-
from functools import lru_cache
|
| 14 |
|
| 15 |
MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
|
| 16 |
STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
|
|
@@ -29,16 +38,34 @@ except Exception: # not on a Space
|
|
| 29 |
return deco
|
| 30 |
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
import torch
|
| 35 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
MODEL_ID, torch_dtype=torch.bfloat16,
|
| 39 |
)
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
def _render(messages, tok) -> str:
|
|
@@ -60,9 +87,15 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
|
|
| 60 |
"\tmove_and_slide()\n```\n"
|
| 61 |
)
|
| 62 |
import torch
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
text = _render(messages, tok)
|
| 65 |
-
inputs = tok([text], return_tensors="pt").to(
|
| 66 |
with torch.no_grad():
|
| 67 |
out = model.generate(
|
| 68 |
**inputs, max_new_tokens=max_new_tokens,
|
|
@@ -74,5 +107,5 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
|
|
| 74 |
|
| 75 |
|
| 76 |
def warmup() -> None:
|
| 77 |
-
|
| 78 |
-
|
|
|
|
| 1 |
"""Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
|
| 2 |
|
| 3 |
+
The model + tokenizer are loaded ONCE, at import time, into this module's
|
| 4 |
+
globals on the CPU (see ``_load`` and the import-time call below). ZeroGPU
|
| 5 |
+
forks the main process for each ``@spaces.GPU`` call, so every fork inherits
|
| 6 |
+
the already-resident weights (copy-on-write) and only has to move them onto the
|
| 7 |
+
freshly-allocated GPU — there is no per-request reload from disk. The previous
|
| 8 |
+
version loaded the model lazily *inside* the GPU function (via ``lru_cache``),
|
| 9 |
+
but because each ZeroGPU call runs in a fresh fork that cache never persisted,
|
| 10 |
+
so the 7B model was re-read from disk on every request, which ate the GPU time
|
| 11 |
+
budget and made cold calls fail.
|
| 12 |
|
| 13 |
+
HF weights are cached under ``HF_HOME``; ``app.py`` points that at persistent
|
| 14 |
+
storage (``/data``) when it is mounted, so the weights are not re-downloaded on
|
| 15 |
+
every cold boot.
|
| 16 |
+
|
| 17 |
+
Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without loading
|
| 18 |
+
the model (so rag/validate/app can be exercised without a GPU or the download).
|
| 19 |
"""
|
| 20 |
from __future__ import annotations
|
| 21 |
|
| 22 |
import os
|
|
|
|
| 23 |
|
| 24 |
MODEL_ID = os.environ.get("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
|
| 25 |
STUB = os.environ.get("GDRAG_STUB_LLM") == "1"
|
|
|
|
| 38 |
return deco
|
| 39 |
|
| 40 |
|
| 41 |
+
# ── Weights live in module globals, loaded once on the CPU. ─────────────────
|
| 42 |
+
_MODEL = None
|
| 43 |
+
_TOKENIZER = None
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _load() -> None:
|
| 47 |
+
"""Load the tokenizer + model into the module globals on the CPU.
|
| 48 |
+
|
| 49 |
+
Idempotent and CUDA-free: we deliberately do NOT use ``device_map="auto"``
|
| 50 |
+
or move to CUDA here, because on ZeroGPU there is no GPU outside a
|
| 51 |
+
``@spaces.GPU`` function and the main process must not initialise CUDA.
|
| 52 |
+
The per-call fork moves the model onto the GPU inside ``generate``.
|
| 53 |
+
"""
|
| 54 |
+
global _MODEL, _TOKENIZER
|
| 55 |
+
if STUB or _MODEL is not None:
|
| 56 |
+
return
|
| 57 |
import torch
|
| 58 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 59 |
+
_TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 60 |
+
_MODEL = AutoModelForCausalLM.from_pretrained(
|
| 61 |
+
MODEL_ID, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,
|
| 62 |
)
|
| 63 |
+
_MODEL.eval()
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Load at import so the Space boots with the weights resident — one disk read
|
| 67 |
+
# for the whole Space lifetime, instead of one per request.
|
| 68 |
+
_load()
|
| 69 |
|
| 70 |
|
| 71 |
def _render(messages, tok) -> str:
|
|
|
|
| 87 |
"\tmove_and_slide()\n```\n"
|
| 88 |
)
|
| 89 |
import torch
|
| 90 |
+
_load() # no-op when already resident; safety net for any fork edge case
|
| 91 |
+
tok = _TOKENIZER
|
| 92 |
+
model = _MODEL
|
| 93 |
+
# Move the pre-loaded CPU weights onto the GPU that ZeroGPU allocated for
|
| 94 |
+
# the duration of this call — cheap next to a disk reload.
|
| 95 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 96 |
+
model = model.to(device)
|
| 97 |
text = _render(messages, tok)
|
| 98 |
+
inputs = tok([text], return_tensors="pt").to(device)
|
| 99 |
with torch.no_grad():
|
| 100 |
out = model.generate(
|
| 101 |
**inputs, max_new_tokens=max_new_tokens,
|
|
|
|
| 107 |
|
| 108 |
|
| 109 |
def warmup() -> None:
|
| 110 |
+
"""Ensure the weights are resident. Already done at import; kept for the API."""
|
| 111 |
+
_load()
|