#!/usr/bin/env python3
"""
measure.py — the benchmark harness. Hits an OpenAI-compatible endpoint (the one
`vllm serve` exposes) and records the three demo numbers:

    tokens/sec   (decode throughput)   <- THE WIN
    TTFT         (time to first token) <- should be ~unchanged with DFlash
    acceptance length tau              <- WHY it's faster (read from vLLM metrics)

Run it twice on the GPU host — once against the baseline server, once against the
DFlash server — and diff the JSON. That diff IS the before/after table.

This file is endpoint-driven, so it runs anywhere (including the Mac) AS LONG AS
something is serving on --base-url. On the Mac you can point it at a local
tiny-model OpenAI server to shape-test; on the GPU host you point it at vLLM.

acceptance length tau:
  tau = mean(number of tokens committed per target forward pass).
  With a draft of gamma=7, tau ranges from 1 (everything rejected, +1 bonus)
  up to gamma+1=8 (all accepted + bonus). The DFlash card publishes per-position
  acceptance only (~70.7% at position 1, decaying to ~2% by position 7), NOT a
  tau figure -- measure tau on the GPU host (expect roughly 2-3). vLLM exposes
  accepted/draft token counters in its Prometheus metrics; we read them from
  /metrics when present and otherwise leave tau as null in the output (read it
  off /metrics manually in that case). VERIFY AT ONBOARDING which metric names
  the vLLM build uses (e.g. vllm:spec_decode_num_accepted_tokens /
  _num_draft_tokens).

Usage:
  python bench/measure.py --base-url http://localhost:8000 --model laguna \
      --label dflash --out results/dflash.json --n 20
  python bench/measure.py --base-url http://localhost:8000 --model laguna \
      --label baseline --out results/baseline.json --n 20

Requires only the standard library (urllib rather than requests), so there are
no extra dependencies.
"""
from __future__ import annotations

import argparse
import json
import os
import time
import urllib.request
from statistics import mean

# Fixed prompt set for the benchmark. main() cycles through it with
# `i % len(PROMPTS)`, so the same --n always issues the same sequence of
# prompts, which keeps baseline and DFlash runs comparable.
PROMPTS = [
    "Write a Python function that returns the nth Fibonacci number iteratively.",
    "Implement binary search over a sorted list in Python. Return the index or -1.",
    "Write a function to check if a string is a palindrome, ignoring case and spaces.",
    "Implement quicksort in Python.",
    "Write a function that merges two sorted lists into one sorted list.",
]


def _post(url: str, payload: dict) -> dict:
    """Send *payload* as a JSON request body to *url* and return the decoded JSON reply.

    The whole response is read in one go (no streaming). Errors raised by
    urllib or by JSON decoding propagate to the caller unchanged.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Generous timeout: a single long generation can take minutes.
    with urllib.request.urlopen(request, timeout=600) as response:
        raw_body = response.read()
    return json.loads(raw_body.decode())


def _try_metrics(base_url: str) -> dict:
    """Best-effort read of vLLM Prometheus spec-decode counters.

    Fetches ``<base_url>/metrics`` and parses the Prometheus text format.

    Returns a dict keyed by the bare counter name (no ``vllm:`` namespace, no
    ``_total`` suffix), e.g. ``{"spec_decode_num_accepted_tokens": 150.0}``.
    A counter reported under several label sets is summed. Counters that are
    absent from the scrape are simply missing from the dict, and any failure
    to fetch or decode the endpoint yields ``{}``.
    """
    # VERIFY metric names at onboarding; these are the common vLLM ones.
    wanted = (
        "spec_decode_num_accepted_tokens",
        "spec_decode_num_draft_tokens",
        "spec_decode_num_emitted_tokens",
    )
    try:
        with urllib.request.urlopen(base_url.rstrip("/") + "/metrics", timeout=10) as r:
            text = r.read().decode()
    except Exception:
        # Deliberately broad: metrics are optional, and a missing or broken
        # /metrics endpoint must never abort the benchmark itself.
        return {}

    out: dict = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        # A sample line is `name{labels} value [timestamp]` or `name value [timestamp]`.
        if "{" in line:
            name, _, rest = line.partition("{")
            # Split on the LAST brace so a "}" inside a label value is harmless.
            rest = rest.rpartition("}")[2]
        else:
            parts = line.split(None, 1)
            name = parts[0]
            rest = parts[1] if len(parts) > 1 else ""
        # Normalise to the bare name: drop the "vllm:" namespace and the
        # "_total" suffix that Prometheus clients append to counters.
        name = name.strip().rpartition(":")[2]
        if name.endswith("_total"):
            name = name[: -len("_total")]
        # Exact match only. A substring test would also pick up sibling series
        # such as "<name>_created" (a Unix timestamp) or "<name>_per_pos" and
        # let them overwrite the real counter value.
        if name not in wanted:
            continue
        fields = rest.split()
        if not fields:
            continue
        try:
            # The value is the first field; an optional timestamp may follow it.
            value = float(fields[0])
        except ValueError:
            continue
        out[name] = out.get(name, 0.0) + value
    return out


def measure_one(base_url: str, model: str, prompt: str, max_tokens: int) -> dict:
    """Stream one completion from ``<base_url>/v1/completions`` and time it.

    Returns a dict with:
      ttft_s        seconds until the first non-empty text chunk (None if the
                    stream produced no text at all)
      total_s       seconds for the whole request
      new_tokens    generated-token count: the server-reported
                    ``usage.completion_tokens`` when the stream carries it,
                    otherwise the number of non-empty chunks
      stream_chunks number of non-empty text chunks received
      tokens_per_s  decode throughput, ``(new_tokens - 1) / (total_s - ttft_s)``,
                    or 0.0 when fewer than two tokens were generated
      text          the concatenated generated text

    Network and JSON decoding errors propagate to the caller.
    """
    url = base_url.rstrip("/") + "/v1/completions"
    # Greedy (temperature 0) so output is deterministic — this is what makes the
    # baseline-vs-DFlash output comparison a LOSSLESS check.
    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stream": True,
        # Ask for a final usage chunk. One streamed chunk is not necessarily one
        # token: a server doing speculative decoding may emit several accepted
        # tokens per chunk, so counting chunks would undercount the very
        # throughput this harness exists to measure.
        "stream_options": {"include_usage": True},
    }
    data = json.dumps(payload).encode()
    req = urllib.request.Request(url, data=data,
                                 headers={"Content-Type": "application/json"})
    t0 = time.perf_counter()
    ttft = None
    n_chunks = 0
    reported_tokens = None
    chunks = []
    with urllib.request.urlopen(req, timeout=600) as r:
        for raw in r:
            line = raw.decode().strip()
            # Only SSE "data:" lines carry payload; skip blanks and other fields.
            if not line.startswith("data:"):
                continue
            body = line[len("data:"):].strip()
            if body == "[DONE]":
                break
            obj = json.loads(body)
            usage = obj.get("usage")
            if isinstance(usage, dict) and usage.get("completion_tokens") is not None:
                reported_tokens = int(usage["completion_tokens"])
            # The usage chunk arrives with "choices": [], so never index blindly.
            choices = obj.get("choices") or [{}]
            piece = choices[0].get("text") or ""
            if piece:
                if ttft is None:
                    ttft = time.perf_counter() - t0
                n_chunks += 1
                chunks.append(piece)
    total = time.perf_counter() - t0
    # Every non-empty chunk holds at least one token, so the chunk count is a
    # lower bound and the fallback when the server sends no usage block.
    n_tokens = n_chunks if reported_tokens is None else max(reported_tokens, n_chunks)
    decode_time = max(total - (ttft or 0.0), 1e-9)
    # The first token is attributed to TTFT (prefill), not to decode time.
    tps = (n_tokens - 1) / decode_time if n_tokens > 1 else 0.0
    return {
        "ttft_s": ttft,
        "total_s": total,
        "new_tokens": n_tokens,
        "stream_chunks": n_chunks,
        "tokens_per_s": tps,
        "text": "".join(chunks),
    }


def _acceptance_length(before: dict, after: dict, gamma: int) -> "float | None":
    """Estimate acceptance length tau from two spec-decode counter snapshots.

    tau ~= committed tokens / target verification passes, where
    passes ~= drafted / gamma and each pass commits its accepted draft tokens
    plus one bonus token. Best-effort only. Returns None when no draft tokens
    were recorded between the snapshots (metrics unavailable, or speculative
    decoding is off) or when gamma is not positive.
    """
    accepted = (after.get("spec_decode_num_accepted_tokens", 0)
                - before.get("spec_decode_num_accepted_tokens", 0))
    drafted = (after.get("spec_decode_num_draft_tokens", 0)
               - before.get("spec_decode_num_draft_tokens", 0))
    if gamma <= 0 or drafted <= 0:
        return None
    passes = drafted / gamma
    committed = accepted + passes  # +1 bonus token per pass
    return committed / passes


def main() -> None:
    """Run the benchmark from the command line.

    Issues --n streamed generations against --base-url (cycling through
    PROMPTS), snapshots the server's spec-decode counters before and after,
    prints a summary as JSON, and writes the full result (including per-run
    records) to --out when given.
    """
    p = argparse.ArgumentParser(description="Benchmark tokens/sec, TTFT, acceptance length against a vLLM endpoint.")
    p.add_argument("--base-url", default="http://localhost:8000")
    p.add_argument("--model", default="laguna")
    p.add_argument("--label", required=True, help="baseline | dflash (used in the output).")
    p.add_argument("--n", type=int, default=20, help="Number of generations (cycles through the prompt set).")
    p.add_argument("--max-tokens", type=int, default=256)
    p.add_argument("--out", default=None, help="Write JSON here (e.g. results/dflash.json).")
    p.add_argument("--num-speculative-tokens", type=int, default=NUM_SPECULATIVE_TOKENS,
                   help="Draft length gamma used to estimate tau from the metrics.")
    args = p.parse_args()
    # Fail fast with a usage error instead of a StatisticsError after the run.
    if args.n < 1:
        p.error("--n must be at least 1")
    if args.num_speculative_tokens < 1:
        p.error("--num-speculative-tokens must be at least 1")

    before = _try_metrics(args.base_url)
    runs = []
    for i in range(args.n):
        prompt = PROMPTS[i % len(PROMPTS)]
        run = measure_one(args.base_url, args.model, prompt, args.max_tokens)
        runs.append(run)
        # ttft_s is None when a generation returned no text; don't crash on it.
        ttft_txt = f"{run['ttft_s']:.3f}s" if run["ttft_s"] is not None else "n/a"
        print(f"  [{args.label}] run {i+1}/{args.n}  "
              f"tps={run['tokens_per_s']:.1f}  ttft={ttft_txt}")
    after = _try_metrics(args.base_url)

    # acceptance length tau from metric deltas, if available.
    tau = _acceptance_length(before, after, args.num_speculative_tokens)

    ttfts = [r["ttft_s"] for r in runs if r["ttft_s"] is not None]
    summary = {
        "label": args.label,
        "model": args.model,
        "base_url": args.base_url,
        "n": args.n,
        "num_speculative_tokens": args.num_speculative_tokens,
        "tokens_per_s_mean": mean(r["tokens_per_s"] for r in runs),
        # None when no run produced any text (mean() of nothing would raise).
        "ttft_s_mean": mean(ttfts) if ttfts else None,
        "acceptance_length_tau": tau,  # None if metrics unavailable — read off /metrics manually then
        "spec_metrics_before": before,
        "spec_metrics_after": after,
        "runs": runs,
    }
    print(json.dumps({k: v for k, v in summary.items() if k != "runs"}, indent=2))
    if args.out:
        os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
        with open(args.out, "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)
        print(f"[measure] wrote {args.out}")


# Draft length gamma used by main() when estimating tau from the metrics.
# It is defined below main(), which is fine: main() only looks the name up when
# it is called, and the call below runs after this assignment.
NUM_SPECULATIVE_TOKENS = 7  # gamma, per the DFlash card

# Script entry point: run the benchmark only when executed directly, not on import.
if __name__ == "__main__":
    main()