vivekchakraverty Claude Opus 4.8 committed on
Commit
2709f63
·
1 Parent(s): cccb7d5

Load Qwen2.5-Coder-7B in 4-bit (nf4) inside the GPU worker

Browse files

Adds bitsandbytes; loads the 7B model in 4-bit (~5 GB) with device_map={'':0}
inside @spaces.GPU so it fits the ZeroGPU budget and generates fast.
Adds [gen] timing logs (GPU worker stdout reaches the run logs).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>

Files changed (2) hide show
  1. generate.py +23 -9
  2. requirements.txt +1 -0
generate.py CHANGED
@@ -42,15 +42,25 @@ except Exception: # not on a Space
42
 
43
  @lru_cache(maxsize=1)
44
  def _model_and_tokenizer():
45
- """Load tokenizer + model. Called from inside ``generate`` so on ZeroGPU it
46
- runs in the GPU worker where ``device_map="cuda"`` can place the weights."""
 
 
47
  import torch
48
- from transformers import AutoModelForCausalLM, AutoTokenizer
49
  tok = AutoTokenizer.from_pretrained(MODEL_ID)
50
- model = AutoModelForCausalLM.from_pretrained(
51
- MODEL_ID, torch_dtype=torch.bfloat16,
52
- device_map=("cuda" if ON_ZERO else None),
53
- )
 
 
 
 
 
 
 
 
54
  model.eval()
55
  return model, tok
56
 
@@ -73,17 +83,21 @@ def generate(messages: list[dict], max_new_tokens: int = 256,
73
  "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
74
  "\tmove_and_slide()\n```\n"
75
  )
76
- import torch
77
  model, tok = _model_and_tokenizer()
78
- dev = model.device
79
  text = _render(messages, tok)
80
  inputs = tok([text], return_tensors="pt").to(dev)
 
 
81
  with torch.no_grad():
82
  out = model.generate(
83
  **inputs, max_new_tokens=max_new_tokens,
84
  do_sample=temperature > 0, temperature=max(temperature, 1e-4),
85
  top_p=0.95, pad_token_id=tok.eos_token_id,
86
  )
 
 
87
  gen = out[0][inputs["input_ids"].shape[1]:]
88
  return tok.decode(gen, skip_special_tokens=True).strip()
89
 
 
42
 
43
  @lru_cache(maxsize=1)
44
  def _model_and_tokenizer():
45
+ """Load tokenizer + 4-bit model. Called from inside ``generate`` so on
46
+ ZeroGPU it runs in the GPU worker where bitsandbytes can quantize onto the
47
+ allocated GPU (4-bit load requires a GPU). 4-bit shrinks the 7B from ~15 GB
48
+ to ~5 GB, so it loads and generates fast enough to fit the GPU budget."""
49
  import torch
50
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
51
  tok = AutoTokenizer.from_pretrained(MODEL_ID)
52
+ if ON_ZERO:
53
+ quant = BitsAndBytesConfig(
54
+ load_in_4bit=True,
55
+ bnb_4bit_quant_type="nf4",
56
+ bnb_4bit_compute_dtype=torch.bfloat16,
57
+ bnb_4bit_use_double_quant=True,
58
+ )
59
+ model = AutoModelForCausalLM.from_pretrained(
60
+ MODEL_ID, quantization_config=quant, device_map={"": 0},
61
+ )
62
+ else:
63
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.bfloat16)
64
  model.eval()
65
  return model, tok
66
 
 
83
  "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
84
  "\tmove_and_slide()\n```\n"
85
  )
86
+ import torch, time
87
  model, tok = _model_and_tokenizer()
88
+ dev = "cuda" if ON_ZERO else "cpu"
89
  text = _render(messages, tok)
90
  inputs = tok([text], return_tensors="pt").to(dev)
91
+ print(f"[gen] dev={dev} cuda_avail={torch.cuda.is_available()} generating max_new={max_new_tokens}", flush=True)
92
+ t0 = time.time()
93
  with torch.no_grad():
94
  out = model.generate(
95
  **inputs, max_new_tokens=max_new_tokens,
96
  do_sample=temperature > 0, temperature=max(temperature, 1e-4),
97
  top_p=0.95, pad_token_id=tok.eos_token_id,
98
  )
99
+ n_new = int(out.shape[-1] - inputs["input_ids"].shape[1])
100
+ print(f"[gen] done {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)
101
  gen = out[0][inputs["input_ids"].shape[1]:]
102
  return tok.decode(gen, skip_special_tokens=True).strip()
103
 
requirements.txt CHANGED
@@ -2,6 +2,7 @@ gradio>=4.44
2
  spaces>=0.30
3
  torch
4
  transformers~=4.45 # satisfies BOTH jina remote code (4.x) AND Qwen2.5-Coder
 
5
  sentence-transformers~=2.7
6
  einops # required by jina remote code
7
  accelerate # device_map model loading
 
2
  spaces>=0.30
3
  torch
4
  transformers~=4.45 # satisfies BOTH jina remote code (4.x) AND Qwen2.5-Coder
5
+ bitsandbytes>=0.43 # 4-bit (nf4) quantization of the 7B for ZeroGPU
6
  sentence-transformers~=2.7
7
  einops # required by jina remote code
8
  accelerate # device_map model loading