Spaces:

vivekchakraverty
/

gdscript-assistant

Running on Zero

vivekchakraverty Claude Opus 4.8 committed 1 day ago

Commit

2709f63

1 Parent(s): cccb7d5

Load Qwen2.5-Coder-7B in 4-bit (nf4) inside the GPU worker

Adds bitsandbytes; loads the 7B in 4-bit (~5GB) with device_map={'':0}
inside @spaces.GPU so it fits the ZeroGPU budget and generates fast.
Adds [gen] timing logs (GPU worker stdout reaches the run logs).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show

generate.py +23 -9
requirements.txt +1 -0

generate.py CHANGED Viewed

@@ -42,15 +42,25 @@ except Exception:                                  # not on a Space
 @lru_cache(maxsize=1)
 def _model_and_tokenizer():
-    """Load tokenizer + model. Called from inside ``generate`` so on ZeroGPU it
-    runs in the GPU worker where ``device_map="cuda"`` can place the weights."""
     import torch
-    from transformers import AutoModelForCausalLM, AutoTokenizer
     tok = AutoTokenizer.from_pretrained(MODEL_ID)
-    model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID, torch_dtype=torch.bfloat16,
-        device_map=("cuda" if ON_ZERO else None),
-    )
     model.eval()
     return model, tok
@@ -73,17 +83,21 @@ def generate(messages: list[dict], max_new_tokens: int = 256,
             "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
             "\tmove_and_slide()\n```\n"
         )
-    import torch
     model, tok = _model_and_tokenizer()
-    dev = model.device
     text = _render(messages, tok)
     inputs = tok([text], return_tensors="pt").to(dev)
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,
             do_sample=temperature > 0, temperature=max(temperature, 1e-4),
             top_p=0.95, pad_token_id=tok.eos_token_id,
         )
     gen = out[0][inputs["input_ids"].shape[1]:]
     return tok.decode(gen, skip_special_tokens=True).strip()

 @lru_cache(maxsize=1)
 def _model_and_tokenizer():
+    """Load tokenizer + 4-bit model. Called from inside ``generate`` so on
+    ZeroGPU it runs in the GPU worker where bitsandbytes can quantize onto the
+    allocated GPU (4-bit load requires a GPU). 4-bit shrinks the 7B from ~15 GB
+    to ~5 GB, so it loads and generates fast enough to fit the GPU budget."""
     import torch
+    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
     tok = AutoTokenizer.from_pretrained(MODEL_ID)
+    if ON_ZERO:
+        quant = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_quant_type="nf4",
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=True,
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID, quantization_config=quant, device_map={"": 0},
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
     model.eval()
     return model, tok
             "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
             "\tmove_and_slide()\n```\n"
         )
+    import torch, time
     model, tok = _model_and_tokenizer()
+    dev = "cuda" if ON_ZERO else "cpu"
     text = _render(messages, tok)
     inputs = tok([text], return_tensors="pt").to(dev)
+    print(f"[gen] dev={dev} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True)
+    t0 = time.time()
     with torch.no_grad():
         out = model.generate(
             **inputs, max_new_tokens=max_new_tokens,
             do_sample=temperature > 0, temperature=max(temperature, 1e-4),
             top_p=0.95, pad_token_id=tok.eos_token_id,
         )
+    n_new = int(out.shape[-1] - inputs["input_ids"].shape[1])
+    print(f"[gen] done {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)
     gen = out[0][inputs["input_ids"].shape[1]:]
     return tok.decode(gen, skip_special_tokens=True).strip()

requirements.txt CHANGED Viewed

@@ -2,6 +2,7 @@ gradio>=4.44
 spaces>=0.30
 torch
 transformers~=4.45        # satisfies BOTH jina remote code (4.x) AND Qwen2.5-Coder
 sentence-transformers~=2.7
 einops                    # required by jina remote code
 accelerate                # device_map model loading

 spaces>=0.30
 torch
 transformers~=4.45        # satisfies BOTH jina remote code (4.x) AND Qwen2.5-Coder
+bitsandbytes>=0.43        # 4-bit (nf4) quantization of the 7B for ZeroGPU
 sentence-transformers~=2.7
 einops                    # required by jina remote code
 accelerate                # device_map model loading