Spaces:
Running on Zero
perf(hf-space): pre-load model at module init (Option 3 refactor)
Browse files
Moves model load OUT of @spaces.GPU and into module-level startup
code. Per-call ZeroGPU quota cost drops from ~37s to ~10-25s, and
ZEROGPU_DURATION_SECONDS drops 60 → 45. Net effect: ~2.5x more
submissions per quota window vs the original 120s reservation.
The HuggingFace ZeroGPU docs' recommended pattern:
1. Module init (runs once at Space startup, on CPU, no GPU quota):
- Download model weights from HF Hub
- Deserialize into PyTorch state on CPU memory (~7.6GB for
Phi-4-mini-instruct)
- Tokenizer load
2. Inside @spaces.GPU (per request):
- model.to('cuda') — fast PCIe transfer of already-loaded weights
- tokenize
- generate
- decode
- (Implicit) GPU deallocated when function returns
vs the old "lazy load on first call" pattern which paid the full
download + deserialize cost on the first request after Space sleep,
inside the @spaces.GPU quota window.
Tradeoff:
+ ~2.5x more submissions per daily quota window
+ Predictable per-call latency (~15s warm, ~25s after long idle)
+ No "first call is dramatically slower" cliff
- Space cold-start (after deploy or sleep) takes ~30-60s longer
because the model loads at startup (one-time cost)
- ~7.6GB CPU RAM held continuously (well within Pro tier's
16GB-32GB envelope)
Changes:
app.py:
- Removed `_load_zerogpu_model()` function (lazy load)
- Added module-level model load inside `if _ZEROGPU_DEPS_AVAILABLE:`
block (NO device_map=auto — load to CPU)
- `_zerogpu_invoke()` now does explicit `_zerogpu_model.to('cuda')`
at the top and `.to('cuda')` on inputs (was `.to(model.device)`
which resolved to wherever device_map put it)
- ZEROGPU_DURATION_SECONDS default: 60 → 45 (per-call cost dropped)
- Updated module docstring with the new pattern + tradeoffs
test_diagnose.py:
- FakeModel now starts with device="cpu" and has .to() method
tracking transitions (mirrors real torch behavior)
- Removed monkeypatch of _load_zerogpu_model (function gone)
- Renamed test to ..._moves_model_and_inputs_to_cuda; asserts BOTH
the model device transition and the input device transition
Verification:
pytest test_diagnose.py 64 passed, 1 skipped (no test count change)
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
- app.py +50 -42
- test_diagnose.py +16 -5
|
@@ -190,13 +190,12 @@ ANTHROPIC_MODEL_ID = os.environ.get("MODEL_ID", "claude-opus-4-7")
|
|
| 190 |
HF_MODEL_ID = os.environ.get("HF_MODEL_ID", "google/gemma-2-9b-it")
|
| 191 |
ZEROGPU_MODEL_ID = os.environ.get("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")
|
| 192 |
# ZeroGPU reserves this many seconds from the Space owner's daily quota
|
| 193 |
-
# per request
|
| 194 |
-
#
|
| 195 |
-
#
|
| 196 |
-
#
|
| 197 |
-
#
|
| 198 |
-
|
| 199 |
-
ZEROGPU_DURATION_SECONDS = int(os.environ.get("ZEROGPU_DURATION_SECONDS", "60"))
|
| 200 |
MAX_DESCRIPTION_WORDS = int(os.environ.get("MAX_DESCRIPTION_WORDS", "5000"))
|
| 201 |
MIN_DESCRIPTION_WORDS = 200
|
| 202 |
|
|
@@ -357,48 +356,57 @@ def _call_huggingface(system_block: str, user_prompt: str) -> str:
|
|
| 357 |
return resp.choices[0].message.content
|
| 358 |
|
| 359 |
|
| 360 |
-
# ZeroGPU backend
|
| 361 |
-
#
|
| 362 |
-
#
|
| 363 |
-
#
|
| 364 |
-
#
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
_zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
|
| 388 |
_zerogpu_model = _AutoModelForCausalLM.from_pretrained(
|
| 389 |
ZEROGPU_MODEL_ID,
|
| 390 |
torch_dtype=_torch.bfloat16,
|
| 391 |
-
device_map
|
|
|
|
| 392 |
)
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
|
| 395 |
def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
|
| 396 |
-
"""Model invocation logic for the ZeroGPU backend.
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
messages = [
|
| 403 |
{"role": "system", "content": system_block},
|
| 404 |
{"role": "user", "content": user_prompt},
|
|
@@ -407,7 +415,7 @@ def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
|
|
| 407 |
messages,
|
| 408 |
return_tensors="pt",
|
| 409 |
add_generation_prompt=True,
|
| 410 |
-
).to(
|
| 411 |
outputs = _zerogpu_model.generate(
|
| 412 |
inputs,
|
| 413 |
max_new_tokens=2500,
|
|
|
|
| 190 |
HF_MODEL_ID = os.environ.get("HF_MODEL_ID", "google/gemma-2-9b-it")
|
| 191 |
ZEROGPU_MODEL_ID = os.environ.get("ZEROGPU_MODEL_ID", "microsoft/Phi-4-mini-instruct")
|
| 192 |
# ZeroGPU reserves this many seconds from the Space owner's daily quota
|
| 193 |
+
# per request. With the pre-load pattern below (model on CPU at module
|
| 194 |
+
# init, .to('cuda') + inference inside @spaces.GPU), per-call cost is
|
| 195 |
+
# only ~10-25s wall-clock. 45s gives generous margin while squeezing
|
| 196 |
+
# ~2.5x more submissions per quota window vs the original 120s.
|
| 197 |
+
# Pro-tier max is 120s; raise via env if you need bigger headroom.
|
| 198 |
+
ZEROGPU_DURATION_SECONDS = int(os.environ.get("ZEROGPU_DURATION_SECONDS", "45"))
|
|
|
|
| 199 |
MAX_DESCRIPTION_WORDS = int(os.environ.get("MAX_DESCRIPTION_WORDS", "5000"))
|
| 200 |
MIN_DESCRIPTION_WORDS = 200
|
| 201 |
|
|
|
|
| 356 |
return resp.choices[0].message.content
|
| 357 |
|
| 358 |
|
| 359 |
+
# ZeroGPU backend — pre-load pattern.
|
| 360 |
+
#
|
| 361 |
+
# Model is loaded onto CPU at Space startup (module init), NOT inside
|
| 362 |
+
# `@spaces.GPU`. This is the documented HuggingFace ZeroGPU pattern:
|
| 363 |
+
# - Module init runs once at Space startup, on CPU, with no GPU
|
| 364 |
+
# quota consumed. The expensive part — downloading ~7.6GB of
|
| 365 |
+
# safetensors and deserializing into PyTorch state — happens here.
|
| 366 |
+
# - Inside `@spaces.GPU`, all we do is `.to('cuda')` + tokenize +
|
| 367 |
+
# generate + decode. Wall-clock drops to ~10-15s warm, ~20-25s
|
| 368 |
+
# after Space restart (the .to('cuda') for 7.6GB takes a few
|
| 369 |
+
# seconds over PCIe).
|
| 370 |
+
#
|
| 371 |
+
# Why we deliberately do NOT pass `trust_remote_code=True`: Phi-4-mini-instruct's
|
| 372 |
+
# architecture is `phi3`, which transformers 4.46+ supports natively
|
| 373 |
+
# via `Phi3ForCausalLM` — no custom code download required. The custom
|
| 374 |
+
# modeling code that ships with the model on HF Hub (`modeling_phi3.py`)
|
| 375 |
+
# imports `LossKwargs` from `transformers.utils`, which was removed in
|
| 376 |
+
# transformers 4.57+ — loading WITH `trust_remote_code=True` fails
|
| 377 |
+
# with `ImportError: cannot import name 'LossKwargs' from
|
| 378 |
+
# 'transformers.utils'` and bricks the `@spaces.GPU` worker. The
|
| 379 |
+
# native path avoids the upstream pin-mismatch entirely.
|
| 380 |
+
#
|
| 381 |
+
# Tradeoff: ~30-60s slower Space cold-start (the one-time CPU load).
|
| 382 |
+
# Acceptable because Spaces only restart on deploy or after a long
|
| 383 |
+
# idle period. Worth it for the 2.5x quota efficiency.
|
| 384 |
+
|
| 385 |
+
if _ZEROGPU_DEPS_AVAILABLE:
|
| 386 |
_zerogpu_tokenizer = _AutoTokenizer.from_pretrained(ZEROGPU_MODEL_ID)
|
| 387 |
_zerogpu_model = _AutoModelForCausalLM.from_pretrained(
|
| 388 |
ZEROGPU_MODEL_ID,
|
| 389 |
torch_dtype=_torch.bfloat16,
|
| 390 |
+
# NO device_map — load to CPU; we move to GPU per-call inside
|
| 391 |
+
# @spaces.GPU. ZeroGPU has no GPU available at module load.
|
| 392 |
)
|
| 393 |
+
else:
|
| 394 |
+
_zerogpu_tokenizer = None
|
| 395 |
+
_zerogpu_model = None
|
| 396 |
|
| 397 |
|
| 398 |
def _zerogpu_invoke(system_block: str, user_prompt: str) -> str:
|
| 399 |
+
"""Model invocation logic for the ZeroGPU backend. Pre-loaded model
|
| 400 |
+
(on CPU) is moved to GPU on entry, then inference + decode. Reads
|
| 401 |
+
module-level globals (`_zerogpu_tokenizer`, `_zerogpu_model`) which
|
| 402 |
+
tests monkeypatch to fake the transformers types.
|
| 403 |
+
|
| 404 |
+
Separated from the `@spaces.GPU` decoration below so it can be
|
| 405 |
+
unit-tested without actually allocating a GPU."""
|
| 406 |
+
# Move pre-loaded model from CPU to the GPU that @spaces.GPU just
|
| 407 |
+
# allocated. Fast — just PCIe memory transfer of already-loaded
|
| 408 |
+
# weights, no download or deserialize.
|
| 409 |
+
_zerogpu_model.to("cuda")
|
| 410 |
messages = [
|
| 411 |
{"role": "system", "content": system_block},
|
| 412 |
{"role": "user", "content": user_prompt},
|
|
|
|
| 415 |
messages,
|
| 416 |
return_tensors="pt",
|
| 417 |
add_generation_prompt=True,
|
| 418 |
+
).to("cuda")
|
| 419 |
outputs = _zerogpu_model.generate(
|
| 420 |
inputs,
|
| 421 |
max_new_tokens=2500,
|
|
@@ -837,7 +837,12 @@ def _install_fake_zerogpu_model(monkeypatch, captured: dict, *,
|
|
| 837 |
return decoded_text
|
| 838 |
|
| 839 |
class _FakeModel:
|
| 840 |
-
device = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 841 |
|
| 842 |
def generate(self, inputs, **kwargs):
|
| 843 |
captured["generate_inputs"] = inputs
|
|
@@ -846,8 +851,8 @@ def _install_fake_zerogpu_model(monkeypatch, captured: dict, *,
|
|
| 846 |
|
| 847 |
monkeypatch.setattr(app_module, "_zerogpu_tokenizer", _FakeTokenizer())
|
| 848 |
monkeypatch.setattr(app_module, "_zerogpu_model", _FakeModel())
|
| 849 |
-
#
|
| 850 |
-
|
| 851 |
|
| 852 |
|
| 853 |
def test_zerogpu_invoke_builds_chat_template_with_system_and_user(monkeypatch):
|
|
@@ -863,11 +868,17 @@ def test_zerogpu_invoke_builds_chat_template_with_system_and_user(monkeypatch):
|
|
| 863 |
assert chat["kwargs"]["add_generation_prompt"] is True
|
| 864 |
|
| 865 |
|
| 866 |
-
def
|
|
|
|
|
|
|
|
|
|
| 867 |
captured = {}
|
| 868 |
_install_fake_zerogpu_model(monkeypatch, captured)
|
| 869 |
_zerogpu_invoke("sys", "usr")
|
| 870 |
-
|
|
|
|
|
|
|
|
|
|
| 871 |
|
| 872 |
|
| 873 |
def test_zerogpu_invoke_generate_call_shape(monkeypatch):
|
|
|
|
| 837 |
return decoded_text
|
| 838 |
|
| 839 |
class _FakeModel:
|
| 840 |
+
device = "cpu" # starts on CPU; _zerogpu_invoke moves to cuda
|
| 841 |
+
|
| 842 |
+
def to(self, device):
|
| 843 |
+
captured["model_moved_to_device"] = device
|
| 844 |
+
self.device = device
|
| 845 |
+
return self
|
| 846 |
|
| 847 |
def generate(self, inputs, **kwargs):
|
| 848 |
captured["generate_inputs"] = inputs
|
|
|
|
| 851 |
|
| 852 |
monkeypatch.setattr(app_module, "_zerogpu_tokenizer", _FakeTokenizer())
|
| 853 |
monkeypatch.setattr(app_module, "_zerogpu_model", _FakeModel())
|
| 854 |
+
# Note: no _load_zerogpu_model to patch — after the pre-load refactor
|
| 855 |
+
# (the module-level load in app.py), model load happens at module init, not lazily.
|
| 856 |
|
| 857 |
|
| 858 |
def test_zerogpu_invoke_builds_chat_template_with_system_and_user(monkeypatch):
|
|
|
|
| 868 |
assert chat["kwargs"]["add_generation_prompt"] is True
|
| 869 |
|
| 870 |
|
| 871 |
+
def test_zerogpu_invoke_moves_model_and_inputs_to_cuda(monkeypatch):
|
| 872 |
+
"""Post-refactor (pre-load pattern): the model lives on CPU at
|
| 873 |
+
module init, and _zerogpu_invoke must explicitly move it AND the
|
| 874 |
+
input tensors to cuda inside the @spaces.GPU context."""
|
| 875 |
captured = {}
|
| 876 |
_install_fake_zerogpu_model(monkeypatch, captured)
|
| 877 |
_zerogpu_invoke("sys", "usr")
|
| 878 |
+
# Model: moved CPU → cuda inside the invoke
|
| 879 |
+
assert captured["model_moved_to_device"] == "cuda"
|
| 880 |
+
# Inputs: tokenized then moved to cuda for inference
|
| 881 |
+
assert captured["inputs_moved_to_device"] == "cuda"
|
| 882 |
|
| 883 |
|
| 884 |
def test_zerogpu_invoke_generate_call_shape(monkeypatch):
|