#!/usr/bin/env python3 """ measure.py — the benchmark harness. Hits an OpenAI-compatible endpoint (the one `vllm serve` exposes) and records the three demo numbers: tokens/sec (decode throughput) <- THE WIN TTFT (time to first token) <- should be ~unchanged with DFlash acceptance length tau <- WHY it's faster (read from vLLM metrics) Run it twice on the GPU host — once against the baseline server, once against the DFlash server — and diff the JSON. That diff IS the before/after table. This file is endpoint-driven, so it runs anywhere (including the Mac) AS LONG AS something is serving on --base-url. On the Mac you can point it at a local tiny-model OpenAI server to shape-test; on the GPU host you point it at vLLM. acceptance length tau: tau = mean(number of tokens committed per target forward pass). With a draft of gamma=7, tau ranges from 1 (everything rejected, +1 bonus) up to gamma+1=8 (all accepted + bonus). The DFlash card publishes per-position acceptance only (~70.7% at position 1, decaying to ~2% by position 7), NOT a tau figure -- measure tau on the GPU host (expect roughly 2-3). vLLM exposes accepted/draft counts in its metrics; we read them from /metrics (Prometheus) when present and otherwise estimate tau from the speedup. VERIFY AT ONBOARDING which metric names the vLLM build uses (e.g. vllm:spec_decode_num_accepted_tokens / _num_draft_tokens). Usage: python bench/measure.py --base-url http://localhost:8000 --model laguna \ --label dflash --out results/dflash.json --n 20 python bench/measure.py --base-url http://localhost:8000 --model laguna \ --label baseline --out results/baseline.json --n 20 Requires only stdlib + requests-free urllib, so no extra extra deps. """ from __future__ import annotations import argparse import json import os import time import urllib.request from statistics import mean PROMPTS = [ "Write a Python function that returns the nth Fibonacci number iteratively.", "Implement binary search over a sorted list in Python. Return the index or -1.", "Write a function to check if a string is a palindrome, ignoring case and spaces.", "Implement quicksort in Python.", "Write a function that merges two sorted lists into one sorted list.", ] def _post(url: str, payload: dict) -> dict: data = json.dumps(payload).encode() req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"}) with urllib.request.urlopen(req, timeout=600) as r: return json.loads(r.read().decode()) def _try_metrics(base_url: str) -> dict: """Best-effort read of vLLM Prometheus spec-decode counters.""" out = {} try: with urllib.request.urlopen(base_url.rstrip("/") + "/metrics", timeout=10) as r: text = r.read().decode() except Exception: return out for line in text.splitlines(): if line.startswith("#"): continue # VERIFY metric names at onboarding; these are the common vLLM ones. for key in ("spec_decode_num_accepted_tokens", "spec_decode_num_draft_tokens", "spec_decode_num_emitted_tokens"): if key in line: try: out[key] = float(line.split()[-1]) except ValueError: pass return out def measure_one(base_url: str, model: str, prompt: str, max_tokens: int) -> dict: url = base_url.rstrip("/") + "/v1/completions" # Greedy (temperature 0) so output is deterministic — this is what makes the # baseline-vs-DFlash output comparison a LOSSLESS check. payload = { "model": model, "prompt": prompt, "max_tokens": max_tokens, "temperature": 0.0, "stream": True, } data = json.dumps(payload).encode() req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"}) t0 = time.perf_counter() ttft = None n_tokens = 0 chunks = [] with urllib.request.urlopen(req, timeout=600) as r: for raw in r: line = raw.decode().strip() if not line or not line.startswith("data:"): continue body = line[len("data:"):].strip() if body == "[DONE]": break obj = json.loads(body) piece = obj.get("choices", [{}])[0].get("text", "") if piece: if ttft is None: ttft = time.perf_counter() - t0 n_tokens += 1 chunks.append(piece) total = time.perf_counter() - t0 decode_time = max(total - (ttft or 0.0), 1e-9) tps = (n_tokens - 1) / decode_time if n_tokens > 1 else 0.0 return { "ttft_s": ttft, "total_s": total, "new_tokens": n_tokens, "tokens_per_s": tps, "text": "".join(chunks), } def main() -> None: p = argparse.ArgumentParser(description="Benchmark tokens/sec, TTFT, acceptance length against a vLLM endpoint.") p.add_argument("--base-url", default="http://localhost:8000") p.add_argument("--model", default="laguna") p.add_argument("--label", required=True, help="baseline | dflash (used in the output).") p.add_argument("--n", type=int, default=20, help="Number of generations (cycles through the prompt set).") p.add_argument("--max-tokens", type=int, default=256) p.add_argument("--out", default=None, help="Write JSON here (e.g. results/dflash.json).") args = p.parse_args() before = _try_metrics(args.base_url) runs = [] for i in range(args.n): prompt = PROMPTS[i % len(PROMPTS)] runs.append(measure_one(args.base_url, args.model, prompt, args.max_tokens)) print(f" [{args.label}] run {i+1}/{args.n} " f"tps={runs[-1]['tokens_per_s']:.1f} ttft={runs[-1]['ttft_s']:.3f}s") after = _try_metrics(args.base_url) # acceptance length tau from metric deltas, if available. tau = None acc = after.get("spec_decode_num_accepted_tokens", 0) - before.get("spec_decode_num_accepted_tokens", 0) emitted = after.get("spec_decode_num_emitted_tokens", 0) - before.get("spec_decode_num_emitted_tokens", 0) draft = after.get("spec_decode_num_draft_tokens", 0) - before.get("spec_decode_num_draft_tokens", 0) # tau ~= total committed tokens / number of target verification passes. # accepted + 1 bonus per pass; passes ~= draft / gamma. Best-effort only. if draft > 0: passes = draft / NUM_SPECULATIVE_TOKENS # gamma committed = acc + passes # +1 bonus token per pass tau = committed / passes if passes > 0 else None summary = { "label": args.label, "model": args.model, "base_url": args.base_url, "n": args.n, "tokens_per_s_mean": mean(r["tokens_per_s"] for r in runs), "ttft_s_mean": mean(r["ttft_s"] for r in runs if r["ttft_s"] is not None), "acceptance_length_tau": tau, # None if metrics unavailable — read off /metrics manually then "spec_metrics_before": before, "spec_metrics_after": after, "runs": runs, } print(json.dumps({k: v for k, v in summary.items() if k != "runs"}, indent=2)) if args.out: os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True) with open(args.out, "w") as f: json.dump(summary, f, indent=2) print(f"[measure] wrote {args.out}") NUM_SPECULATIVE_TOKENS = 7 # gamma, per the DFlash card if __name__ == "__main__": main()