vivekchakraverty committed on
Commit
5ff14e5
·
1 Parent(s): 8df32ec

diag: log cuda availability + model device + gen timing; force model.to(cuda) in fn

Browse files
Files changed (1) hide show
  1. generate.py +15 -2
generate.py CHANGED
@@ -91,18 +91,31 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
91
  "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
92
  "\tmove_and_slide()\n```\n"
93
  )
94
- import torch
95
  _load() # no-op once resident
96
  tok = _TOKENIZER
97
  model = _MODEL
 
 
 
 
 
 
 
 
 
 
98
  text = _render(messages, tok)
99
- inputs = tok([text], return_tensors="pt").to(_DEVICE)
 
100
  with torch.no_grad():
101
  out = model.generate(
102
  **inputs, max_new_tokens=max_new_tokens,
103
  do_sample=temperature > 0, temperature=max(temperature, 1e-4),
104
  top_p=0.95, pad_token_id=tok.eos_token_id,
105
  )
 
 
106
  gen = out[0][inputs["input_ids"].shape[1]:]
107
  return tok.decode(gen, skip_special_tokens=True).strip()
108
 
 
91
  "\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
92
  "\tmove_and_slide()\n```\n"
93
  )
94
+ import torch, time
95
  _load() # no-op once resident
96
  tok = _TOKENIZER
97
  model = _MODEL
98
+ avail = torch.cuda.is_available()
99
+ before = str(next(model.parameters()).device)
100
+ # Inside @spaces.GPU the GPU is allocated for this call. Force the model
101
+ # onto it (no-op if already there); print device + timing so we can see in
102
+ # the logs whether generation actually ran on CUDA.
103
+ if avail:
104
+ model = model.to("cuda")
105
+ dev = "cuda" if avail else "cpu"
106
+ after = str(next(model.parameters()).device)
107
+ print(f"[gen] cuda_avail={avail} dev={dev} model_before={before} model_after={after}", flush=True)
108
  text = _render(messages, tok)
109
+ inputs = tok([text], return_tensors="pt").to(dev)
110
+ t0 = time.time()
111
  with torch.no_grad():
112
  out = model.generate(
113
  **inputs, max_new_tokens=max_new_tokens,
114
  do_sample=temperature > 0, temperature=max(temperature, 1e-4),
115
  top_p=0.95, pad_token_id=tok.eos_token_id,
116
  )
117
+ n_new = int(out.shape[-1] - inputs["input_ids"].shape[1])
118
+ print(f"[gen] generated {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)
119
  gen = out[0][inputs["input_ids"].shape[1]:]
120
  return tok.decode(gen, skip_special_tokens=True).strip()
121