Spaces:
Running on Zero
Running on Zero
Commit ·
5ff14e5
1
Parent(s): 8df32ec
diag: log CUDA availability, model device, and generation timing; force model.to("cuda") inside generate()
Browse files
- generate.py (+15 -2)
generate.py
CHANGED
|
@@ -91,18 +91,31 @@ def generate(messages: list[dict], max_new_tokens: int = 512,
|
|
| 91 |
"\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
|
| 92 |
"\tmove_and_slide()\n```\n"
|
| 93 |
)
|
| 94 |
-
import torch
|
| 95 |
_load() # no-op once resident
|
| 96 |
tok = _TOKENIZER
|
| 97 |
model = _MODEL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
text = _render(messages, tok)
|
| 99 |
-
inputs = tok([text], return_tensors="pt").to(
|
|
|
|
| 100 |
with torch.no_grad():
|
| 101 |
out = model.generate(
|
| 102 |
**inputs, max_new_tokens=max_new_tokens,
|
| 103 |
do_sample=temperature > 0, temperature=max(temperature, 1e-4),
|
| 104 |
top_p=0.95, pad_token_id=tok.eos_token_id,
|
| 105 |
)
|
|
|
|
|
|
|
| 106 |
gen = out[0][inputs["input_ids"].shape[1]:]
|
| 107 |
return tok.decode(gen, skip_special_tokens=True).strip()
|
| 108 |
|
|
|
|
| 91 |
"\"ui_up\", \"ui_down\")\n\tvelocity = dir * speed\n"
|
| 92 |
"\tmove_and_slide()\n```\n"
|
| 93 |
)
|
| 94 |
+
import torch, time
|
| 95 |
_load() # no-op once resident
|
| 96 |
tok = _TOKENIZER
|
| 97 |
model = _MODEL
|
| 98 |
+
avail = torch.cuda.is_available()
|
| 99 |
+
before = str(next(model.parameters()).device)
|
| 100 |
+
# Inside @spaces.GPU the GPU is allocated for this call. Force the model
|
| 101 |
+
# onto it (no-op if already there); print device + timing so we can see in
|
| 102 |
+
# the logs whether generation actually ran on CUDA.
|
| 103 |
+
if avail:
|
| 104 |
+
model = model.to("cuda")
|
| 105 |
+
dev = "cuda" if avail else "cpu"
|
| 106 |
+
after = str(next(model.parameters()).device)
|
| 107 |
+
print(f"[gen] cuda_avail={avail} dev={dev} model_before={before} model_after={after}", flush=True)
|
| 108 |
text = _render(messages, tok)
|
| 109 |
+
inputs = tok([text], return_tensors="pt").to(dev)
|
| 110 |
+
t0 = time.time()
|
| 111 |
with torch.no_grad():
|
| 112 |
out = model.generate(
|
| 113 |
**inputs, max_new_tokens=max_new_tokens,
|
| 114 |
do_sample=temperature > 0, temperature=max(temperature, 1e-4),
|
| 115 |
top_p=0.95, pad_token_id=tok.eos_token_id,
|
| 116 |
)
|
| 117 |
+
n_new = int(out.shape[-1] - inputs["input_ids"].shape[1])
|
| 118 |
+
print(f"[gen] generated {n_new} tokens in {time.time()-t0:.1f}s on {dev}", flush=True)
|
| 119 |
gen = out[0][inputs["input_ids"].shape[1]:]
|
| 120 |
return tok.decode(gen, skip_special_tokens=True).strip()
|
| 121 |
|