Spaces:
Running on Zero
Running on Zero
Commit ·
2709f63
1
Parent(s): cccb7d5
Load Qwen2.5-Coder-7B in 4-bit (nf4) inside the GPU worker
Browse files
Adds bitsandbytes; loads the 7B in 4-bit (~5GB) with device_map={'':0}
inside @spaces.GPU so it fits the ZeroGPU budget and generates fast.
Adds [gen] timing logs (GPU worker stdout reaches the run logs).
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
- generate.py +23 -9
- requirements.txt +1 -0
generate.py
CHANGED
|
@@ -42,15 +42,25 @@ except Exception: # not on a Space
|
|
| 42 |
|
| 43 |
@lru_cache(maxsize=1)
|
| 44 |
def _model_and_tokenizer():
|
| 45 |
-
"""Load tokenizer + model. Called from inside ``generate`` so on
|
| 46 |
-
runs in the GPU worker where
|
|
|
|
|
|
|
| 47 |
import torch
|
| 48 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 49 |
tok = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
model.eval()
|
| 55 |
return model, tok
|
| 56 |
|
|
@@ -73,17 +83,21 @@ def generate(messages: list[dict], max_new_tokens: int = 256,
|
|
| 73 |
"\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
|
| 74 |
"\tmove_and_slide()\n```\n"
|
| 75 |
)
|
| 76 |
-
import torch
|
| 77 |
model, tok = _model_and_tokenizer()
|
| 78 |
-
dev =
|
| 79 |
text = _render(messages, tok)
|
| 80 |
inputs = tok([text], return_tensors="pt").to(dev)
|
|
|
|
|
|
|
| 81 |
with torch.no_grad():
|
| 82 |
out = model.generate(
|
| 83 |
**inputs, max_new_tokens=max_new_tokens,
|
| 84 |
do_sample=temperature > 0, temperature=max(temperature, 1e-4),
|
| 85 |
top_p=0.95, pad_token_id=tok.eos_token_id,
|
| 86 |
)
|
|
|
|
|
|
|
| 87 |
gen = out[0][inputs["input_ids"].shape[1]:]
|
| 88 |
return tok.decode(gen, skip_special_tokens=True).strip()
|
| 89 |
|
|
|
|
| 42 |
|
| 43 |
@lru_cache(maxsize=1)
|
| 44 |
def _model_and_tokenizer():
|
| 45 |
+
"""Load tokenizer + 4-bit model. Called from inside ``generate`` so on
|
| 46 |
+
ZeroGPU it runs in the GPU worker where bitsandbytes can quantize onto the
|
| 47 |
+
allocated GPU (4-bit load requires a GPU). 4-bit shrinks the 7B from ~15 GB
|
| 48 |
+
to ~5 GB, so it loads and generates fast enough to fit the GPU budget."""
|
| 49 |
import torch
|
| 50 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
| 51 |
tok = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 52 |
+
if ON_ZERO:
|
| 53 |
+
quant = BitsAndBytesConfig(
|
| 54 |
+
load_in_4bit=True,
|
| 55 |
+
bnb_4bit_quant_type="nf4",
|
| 56 |
+
bnb_4bit_compute_dtype=torch.bfloat16,
|
| 57 |
+
bnb_4bit_use_double_quant=True,
|
| 58 |
+
)
|
| 59 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 60 |
+
MODEL_ID, quantization_config=quant, device_map={"": 0},
|
| 61 |
+
)
|
| 62 |
+
else:
|
| 63 |
+
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
|
| 64 |
model.eval()
|
| 65 |
return model, tok
|
| 66 |
|
|
|
|
| 83 |
"\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
|
| 84 |
"\tmove_and_slide()\n```\n"
|
| 85 |
)
|
| 86 |
+
import torch, time
|
| 87 |
model, tok = _model_and_tokenizer()
|
| 88 |
+
dev = "cuda" if ON_ZERO else "cpu"
|
| 89 |
text = _render(messages, tok)
|
| 90 |
inputs = tok([text], return_tensors="pt").to(dev)
|
| 91 |
+
print(f"[gen] dev={dev} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True)
|
| 92 |
+
t0 = time.time()
|
| 93 |
with torch.no_grad():
|
| 94 |
out = model.generate(
|
| 95 |
**inputs, max_new_tokens=max_new_tokens,
|
| 96 |
do_sample=temperature > 0, temperature=max(temperature, 1e-4),
|
| 97 |
top_p=0.95, pad_token_id=tok.eos_token_id,
|
| 98 |
)
|
| 99 |
+
n_new = int(out.shape[-1] - inputs["input_ids"].shape[1])
|
| 100 |
+
print(f"[gen] done {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)
|
| 101 |
gen = out[0][inputs["input_ids"].shape[1]:]
|
| 102 |
return tok.decode(gen, skip_special_tokens=True).strip()
|
| 103 |
|
requirements.txt
CHANGED
|
@@ -2,6 +2,7 @@ gradio>=4.44
|
|
| 2 |
spaces>=0.30
|
| 3 |
torch
|
| 4 |
transformers~=4.45 # satisfies BOTH jina remote code (4.x) AND Qwen2.5-Coder
|
|
|
|
| 5 |
sentence-transformers~=2.7
|
| 6 |
einops # required by jina remote code
|
| 7 |
accelerate # device_map model loading
|
|
|
|
| 2 |
spaces>=0.30
|
| 3 |
torch
|
| 4 |
transformers~=4.45 # satisfies BOTH jina remote code (4.x) AND Qwen2.5-Coder
|
| 5 |
+
bitsandbytes>=0.43 # 4-bit (nf4) quantization of the 7B for ZeroGPU
|
| 6 |
sentence-transformers~=2.7
|
| 7 |
einops # required by jina remote code
|
| 8 |
accelerate # device_map model loading
|