Spaces:
Running on Zero
Running on Zero
File size: 4,353 Bytes
"""Generation with Qwen2.5-Coder-7B-Instruct on ZeroGPU.
Canonical ZeroGPU pattern: the main process stays light (no model in it) and
the model is loaded **on the GPU inside** the ``@spaces.GPU`` function, where
the GPU actually exists. ``device_map={"": 0}`` (accelerate) puts every shard on
the allocated GPU. An ``lru_cache`` keeps it resident for the life of each GPU
worker.
Why not load once at import? On ZeroGPU there is no GPU outside ``@spaces.GPU``,
and ZeroGPU forks the main process for every GPU call. Loading the 15 GB model
into the main process makes that fork heavy and tangled with gradio's asyncio
loop, and the GPU task never runs (it just times out -> "GPU task aborted").
Keeping the main process model-free is what makes the GPU call actually execute.
Local testing: set GDRAG_STUB_LLM=1 to return a canned answer without a GPU or
the model download.
"""
from __future__ import annotations
import os
from functools import lru_cache
# Model identifier to load; the GDRAG_LLM environment variable overrides the
# default.
MODEL_ID = os.getenv("GDRAG_LLM", "Qwen/Qwen2.5-Coder-7B-Instruct")
# Stub mode is enabled only when GDRAG_STUB_LLM is exactly the string "1".
STUB = os.getenv("GDRAG_STUB_LLM") == "1"
# Import spaces BEFORE torch so ZeroGPU can patch CUDA. Degrade to a no-op
# decorator (CPU) when running locally without the package.
try:
    import spaces

    GPU = spaces.GPU
    ON_ZERO = True
except Exception:  # not on a Space
    ON_ZERO = False

    def GPU(*dargs, **dkwargs):
        """Stand-in for ``spaces.GPU`` that leaves the function untouched.

        Works both bare (``@GPU``) and parameterized
        (``@GPU(duration=...)``); every argument is ignored.
        """
        used_bare = bool(dargs) and callable(dargs[0])
        if used_bare:
            # ``@GPU`` directly on a function: hand the function back as-is.
            return dargs[0]
        # ``@GPU(...)``: return a decorator that is the identity.
        return lambda fn: fn
@lru_cache(maxsize=1)
def _model_and_tokenizer():
    """Load and cache the ``(model, tokenizer)`` pair for ``MODEL_ID``.

    The heavy imports happen here rather than at module import, and
    ``lru_cache(maxsize=1)`` means the load runs once per process.

    When ``ON_ZERO`` is true the model is loaded 4-bit (NF4, double
    quantization, bfloat16 compute) with every module mapped to device 0.
    Otherwise it is loaded unquantized in bfloat16 with no device map.

    Returns:
        A ``(model, tokenizer)`` tuple; ``eval()`` has been called on the
        model.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    if not ON_ZERO:
        # Local / non-Space path: plain bfloat16 weights, default placement.
        load_kwargs = {"torch_dtype": torch.bfloat16}
    else:
        # ZeroGPU path: quantize to 4-bit directly onto the allocated GPU.
        load_kwargs = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            ),
            "device_map": {"": 0},
        }

    llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
    llm.eval()
    return llm, tokenizer
def _render(messages, tok) -> str:
    """Turn chat ``messages`` into one prompt string via the chat template.

    Calls ``tok.apply_chat_template`` with tokenization disabled and the
    generation prompt appended, and returns whatever that call returns.
    """
    template_options = {"tokenize": False, "add_generation_prompt": True}
    return tok.apply_chat_template(messages, **template_options)
@GPU(duration=180)
def generate(messages: list[dict], max_new_tokens: int = 512,
             temperature: float = 0.2) -> str:
    """Return the assistant reply generated for chat-format ``messages``.

    In stub mode (``STUB`` true) a fixed GDScript answer is returned and
    neither torch nor the model is touched. Otherwise the cached
    model/tokenizer pair is fetched, the chat template is rendered, and
    only the newly generated tokens (prompt excluded) are decoded.

    Args:
        messages: Chat messages handed to the tokenizer's chat template.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. Sampling is enabled only when
            this is greater than zero; the value passed to the model is
            floored at ``1e-4``.

    Returns:
        The decoded completion with special tokens skipped and surrounding
        whitespace stripped.
    """
    if STUB:
        canned = (
            "Here is a Godot 4 movement script:\n\n"
            "```gdscript\n"
            "extends CharacterBody2D\n\n"
            "@export var speed: float = 200.0\n\n"
            "func _physics_process(delta: float) -> void:\n"
            '\tvar dir := Input.get_vector("ui_left", "ui_right", "ui_up", "ui_down")\n'
            "\tvelocity = dir * speed\n"
            "\tmove_and_slide()\n"
            "```\n"
        )
        return canned

    # Heavy imports stay local so the stub path never needs torch.
    import torch
    import time

    llm, tokenizer = _model_and_tokenizer()
    dev = "cpu" if not ON_ZERO else "cuda"
    prompt = _render(messages, tokenizer)
    batch = tokenizer([prompt], return_tensors="pt").to(dev)
    print(f"[gen] dev={dev} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True)

    t0 = time.time()
    with torch.no_grad():
        sequences = llm.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            temperature=max(temperature, 1e-4),
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )

    # The output holds prompt + completion; measure and slice off the prompt.
    prompt_len = batch["input_ids"].shape[1]
    n_new = int(sequences.shape[-1] - prompt_len)
    print(f"[gen] done {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)

    completion = sequences[0][prompt_len:]
    return tokenizer.decode(completion, skip_special_tokens=True).strip()
def warmup() -> None:
    """Preload the model when running locally with a real model.

    Does nothing when ``ON_ZERO`` is true (on ZeroGPU the model can only be
    loaded inside the ``@spaces.GPU`` function) or when stub mode is on.
    """
    if ON_ZERO or STUB:
        return
    _model_and_tokenizer()
|