| |
"""
gen_local.py — TINY-model generation on Apple Silicon (MPS), purely to validate
the PIPELINE SHAPE before the venue. This does NOT run Laguna and does NOT do
speculative decoding — it proves the measure-generate-report loop works so the
same harness can be pointed at the real model on Prime Intellect.

What it measures (the same two numbers we care about at the venue):
  - TTFT (time to first token): wall-clock from submit to the first new token,
    measured with a separate 1-token generate call.
  - tokens/sec (decode throughput): (generated tokens - 1) / (total - TTFT),
    i.e. the first token is excluded because its cost is the TTFT.

JVM analogy: think of this as a JUnit smoke test against an in-memory stub —
it asserts the wiring is correct so the integration run against the real
service (vLLM + Laguna on CUDA) can't fail on plumbing.

Usage (Mac):
    uv run python scripts/gen_local.py --model sshleifer/tiny-gpt2 --max-new-tokens 64
    uv run python scripts/gen_local.py --model gpt2 --prompt "def quicksort(arr):"

At the venue you'd point --model at a small HF model first, then (on GPU) at
Laguna itself for a sanity generation BEFORE wiring up vLLM serving.
"""
| from __future__ import annotations |
|
|
| import argparse |
| import time |
|
|
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
|
|
def pick_device() -> str:
    """Return the name of the preferred torch device.

    Preference order is CUDA, then Apple MPS, then plain CPU. The MPS probe is
    only evaluated when CUDA is unavailable.

    Returns:
        One of ``"cuda"``, ``"mps"`` or ``"cpu"``.
    """
    if torch.cuda.is_available():
        backend = "cuda"
    elif torch.backends.mps.is_available():
        backend = "mps"
    else:
        backend = "cpu"
    return backend
|
|
|
|
def _sync(device: str) -> None:
    """Block until queued accelerator work on *device* has finished.

    CUDA and MPS execute kernels asynchronously with respect to the host, so a
    wall-clock timer must be bracketed by a barrier to measure real work.
    Any other device name (e.g. ``"cpu"``) needs no barrier and is a no-op.
    """
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "mps":
        torch.mps.synchronize()


def _timed_generate(model, inputs, device: str, **gen_kwargs):
    """Run ``model.generate`` once and return ``(output, elapsed_seconds)``.

    The device is synchronized before the timer starts and again before it
    stops so the elapsed time covers exactly this call's work.
    """
    _sync(device)
    start = time.perf_counter()
    with torch.no_grad():
        out = model.generate(**inputs, **gen_kwargs)
    _sync(device)
    return out, time.perf_counter() - start


def main() -> None:
    """Load a model, generate from a prompt, and print TTFT and decode tokens/sec.

    Steps: parse CLI arguments, load tokenizer and model onto the device chosen
    by ``pick_device()``, run an untimed warm-up, time a 1-token generation
    (TTFT), time the full generation, then print the generated text and the
    metrics. Exits with an argparse error if ``--max-new-tokens`` is below 1.
    """
    p = argparse.ArgumentParser(description="Tiny-model gen + TTFT/tokens-per-sec on MPS/CPU.")
    p.add_argument("--model", default="sshleifer/tiny-gpt2",
                   help="HF model id. Tiny by default; swap to gpt2 or (on GPU) Laguna.")
    p.add_argument("--prompt", default="def fibonacci(n):\n ",
                   help="Coding-style prompt (matches the hackathon track).")
    p.add_argument("--max-new-tokens", type=int, default=64)
    p.add_argument("--greedy", dest="greedy", action="store_true", default=True,
                   help="Greedy decode so output is deterministic (lossless baseline).")
    # Without an "off" switch the --greedy flag above could never change anything.
    p.add_argument("--no-greedy", dest="greedy", action="store_false",
                   help="Sample instead of greedy decoding (output is not deterministic).")
    args = p.parse_args()

    if args.max_new_tokens < 1:
        p.error("--max-new-tokens must be >= 1")

    device = pick_device()
    print(f"[gen_local] device={device} model={args.model}")

    tok = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForCausalLM.from_pretrained(args.model).to(device)
    model.eval()

    inputs = tok(args.prompt, return_tensors="pt").to(device)
    n_prompt = inputs["input_ids"].shape[1]

    # Shared by every generate call so warm-up, TTFT and the full run decode
    # the same way; only max_new_tokens differs between them.
    gen_kwargs = {"do_sample": not args.greedy, "pad_token_id": tok.eos_token_id}

    # Untimed warm-up so one-off first-call costs do not land in the TTFT number.
    _timed_generate(model, inputs, device, max_new_tokens=2, **gen_kwargs)

    # TTFT: wall-clock for producing exactly one new token.
    _, ttft = _timed_generate(model, inputs, device, max_new_tokens=1, **gen_kwargs)

    # Full generation: prompt processing plus every decoded token.
    out, total = _timed_generate(model, inputs, device,
                                 max_new_tokens=args.max_new_tokens, **gen_kwargs)

    new_tokens = out.shape[1] - n_prompt

    # Decode throughput excludes the first token, whose cost is the TTFT.
    # TTFT comes from a separate run, so timer noise can make total <= ttft on
    # very short generations; report 0.0 then instead of a meaningless huge rate.
    decode_time = total - ttft
    if new_tokens > 1 and decode_time > 0:
        tps = (new_tokens - 1) / decode_time
    else:
        tps = 0.0

    text = tok.decode(out[0][n_prompt:], skip_special_tokens=True)

    print("\n--- generation ---")
    print(text)
    print("\n--- metrics (PIPELINE-SHAPE ONLY; not Laguna numbers) ---")
    print(f"prompt_tokens : {n_prompt}")
    print(f"new_tokens : {new_tokens}")
    print(f"TTFT_s : {ttft:.4f}")
    print(f"total_s : {total:.4f}")
    print(f"decode_tokens_per_s: {tps:.2f}")
|
|
|
|
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|