#!/usr/bin/env python3 """ gen_local.py — TINY-model generation on Apple Silicon (MPS), purely to validate the PIPELINE SHAPE before the venue. This does NOT run Laguna and does NOT do speculative decoding — it proves the measure-generate-report loop works so the same harness can be pointed at the real model on Prime Intellect. What it measures (the same two numbers we care about at the venue): - TTFT (time to first token): wall-clock from submit to the first new token. - tokens/sec (decode throughput): generated tokens / (total - TTFT). JVM analogy: think of this as a JUnit smoke test against an in-memory stub — it asserts the wiring is correct so the integration run against the real service (vLLM + Laguna on CUDA) can't fail on plumbing. Usage (Mac): uv run python scripts/gen_local.py --model sshleifer/tiny-gpt2 --max-new-tokens 64 uv run python scripts/gen_local.py --model gpt2 --prompt "def quicksort(arr):" At the venue you'd point --model at a small HF model first, then (on GPU) at Laguna itself for a sanity generation BEFORE wiring up vLLM serving. """ from __future__ import annotations import argparse import time import torch from transformers import AutoModelForCausalLM, AutoTokenizer def pick_device() -> str: if torch.cuda.is_available(): return "cuda" if torch.backends.mps.is_available(): return "mps" return "cpu" def main() -> None: p = argparse.ArgumentParser(description="Tiny-model gen + TTFT/tokens-per-sec on MPS/CPU.") p.add_argument("--model", default="sshleifer/tiny-gpt2", help="HF model id. Tiny by default; swap to gpt2 or (on GPU) Laguna.") p.add_argument("--prompt", default="def fibonacci(n):\n ", help="Coding-style prompt (matches the hackathon track).") p.add_argument("--max-new-tokens", type=int, default=64) p.add_argument("--greedy", action="store_true", default=True, help="Greedy decode so output is deterministic (lossless baseline).") args = p.parse_args() device = pick_device() print(f"[gen_local] device={device} model={args.model}") tok = AutoTokenizer.from_pretrained(args.model) model = AutoModelForCausalLM.from_pretrained(args.model).to(device) model.eval() inputs = tok(args.prompt, return_tensors="pt").to(device) n_prompt = inputs["input_ids"].shape[1] # --- Warmup: first run triggers lazy kernel compilation on MPS; if we timed # it, TTFT would absorb the one-off compile cost and tokens/sec would be # garbage. Run one throwaway pass to warm the kernels, THEN measure. --- with torch.no_grad(): _ = model.generate(**inputs, max_new_tokens=2, do_sample=False, pad_token_id=tok.eos_token_id) if device == "mps": torch.mps.synchronize() # --- TTFT: generate exactly 1 token, time it (warmed) --- if device == "mps": torch.mps.synchronize() t0 = time.perf_counter() with torch.no_grad(): _ = model.generate(**inputs, max_new_tokens=1, do_sample=False, pad_token_id=tok.eos_token_id) if device == "mps": torch.mps.synchronize() ttft = time.perf_counter() - t0 # --- Full generation: time the whole thing, derive decode tokens/sec --- if device == "mps": torch.mps.synchronize() t1 = time.perf_counter() with torch.no_grad(): out = model.generate(**inputs, max_new_tokens=args.max_new_tokens, do_sample=False, pad_token_id=tok.eos_token_id) if device == "mps": torch.mps.synchronize() total = time.perf_counter() - t1 new_tokens = out.shape[1] - n_prompt # tokens/sec over the decode phase: exclude the first token (its time is TTFT). decode_time = max(total - ttft, 1e-9) tps = (new_tokens - 1) / decode_time if new_tokens > 1 else 0.0 text = tok.decode(out[0][n_prompt:], skip_special_tokens=True) print("\n--- generation ---") print(text) print("\n--- metrics (PIPELINE-SHAPE ONLY; not Laguna numbers) ---") print(f"prompt_tokens : {n_prompt}") print(f"new_tokens : {new_tokens}") print(f"TTFT_s : {ttft:.4f}") print(f"total_s : {total:.4f}") print(f"decode_tokens_per_s: {tps:.2f}") if __name__ == "__main__": main()