Commit ·
0a55ff6
0
Parent(s):
Lean Laguna: lossless DFlash speculative decoding on Laguna XS.2 (harness, environment, results)
Browse files- .gitattributes +35 -0
- README.md +185 -0
- bench/measure.py +184 -0
- bench/rollout_bench.py +325 -0
- configs/endpoints.toml +69 -0
- configs/rl/laguna-spec.toml +45 -0
- evals/humaneval_subset.py +145 -0
- results/.gitkeep +0 -0
- results/README.md +31 -0
- results/baseline.json +12 -0
- results/dflash.json +16 -0
- results/humaneval_dflash.json +12 -0
- results/parity.json +12 -0
- scripts/check_results.py +67 -0
- scripts/dress_rehearsal.sh +213 -0
- scripts/eval_local.py +305 -0
- scripts/fill_submission.py +116 -0
- scripts/gen_local.py +110 -0
- scripts/hf_job_ab.py +287 -0
- scripts/parity_local.sh +33 -0
- scripts/run_min_on_prime.sh +90 -0
- scripts/serve_vllm.py +126 -0
- scripts/stub_server.py +187 -0
- spec_rl/README.md +129 -0
- spec_rl/pyproject.toml +21 -0
- spec_rl/spec_rl.py +453 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
base_model: poolside/Laguna-XS.2
|
| 4 |
+
tags:
|
| 5 |
+
- speculative-decoding
|
| 6 |
+
- dflash
|
| 7 |
+
- inference
|
| 8 |
+
- vllm
|
| 9 |
+
- lossless
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# Lean Laguna — Laguna XS.2 + DFlash, lossless single-GPU speedup
|
| 13 |
+
|
| 14 |
+
*Project: **Lean Laguna** — making Laguna XS.2 cheaper to run and to post-train on a single GPU.*
|
| 15 |
+
|
| 16 |
+
> **One-line claim:** Laguna XS.2 generates **2.76× faster on a single GPU** — **19.6 → 54.2
|
| 17 |
+
> tokens/sec** — with **byte-identical greedy output** (0 / 14 mismatches) on a mixed-difficulty code
|
| 18 |
+
> set (2.47× corroborated on a trivial set; **lossless in both**) vs the no-speculator baseline.
|
| 19 |
+
|
| 20 |
+
Speculative decoding with Poolside's **DFlash** speculator on **Laguna XS.2**, served in vLLM on
|
| 21 |
+
one GPU. The throughput win is measured; the output is provably **lossless under greedy decoding**
|
| 22 |
+
(token-for-token identical to baseline) and distribution-preserving under sampling.
|
| 23 |
+
|
| 24 |
+
Submission for the Poolside Research Hackathon — Foundations track
|
| 25 |
+
(`poolside-laguna-hackathon` HF org).
|
| 26 |
+
|
| 27 |
+
## Goal & judging criteria
|
| 28 |
+
|
| 29 |
+
> **Meaningfully improve Laguna XS.2, either by:** expanding model use cases (computer use,
|
| 30 |
+
> multi-agent coordination, evaluation design); *or* **reducing cost & latency** (optimizations,
|
| 31 |
+
> speed, quantization). **For:** an economically valuable task (a function/application); *or*
|
| 32 |
+
> **any novel research idea.**
|
| 33 |
+
> **Scored on: GENERALISABILITY · REPRODUCIBILITY · TECHNICAL CONTRIBUTIONS.**
|
| 34 |
+
|
| 35 |
+
Lean Laguna sits on **reduce cost & latency** for **a novel research idea** (lossless
|
| 36 |
+
speculative decoding → cheaper RL rollouts), and is built to score all three axes:
|
| 37 |
+
|
| 38 |
+
- **Generalisability** — any target + drafter via one `--speculative-config`; the `spec_rl` env +
|
| 39 |
+
`configs/endpoints.toml` point any RL run at any OpenAI-compatible endpoint; the reward is a
|
| 40 |
+
swappable seam (a *reusable RL environment + reward signal* — a listed submission idea).
|
| 41 |
+
- **Reproducibility** — greedy byte-parity + directly-measured throughput behind `make` targets and a
|
| 42 |
+
one-command HF-Jobs run (below); anyone re-runs the before/after table. (τ from `/metrics` read at
|
| 43 |
+
the γ+1 ceiling on both runs → we treat it as unreliable and **don't quote it**. HumanEval pass@1
|
| 44 |
+
sweep = a documented next step; greedy parity is the stronger guarantee.)
|
| 45 |
+
- **Technical contributions** — a measured, provably-lossless throughput win (**2.76×** on a
|
| 46 |
+
mixed-difficulty code set, 0 mismatches; 2.47× corroborated on a trivial set) on the *released*
|
| 47 |
+
Laguna XS.2 + DFlash, carried into **cheaper RL rollouts**; the open problem of **speculative
|
| 48 |
+
decoding under a moving RL policy** (drafter staleness) and NVFP4 attention-weight calibration as
|
| 49 |
+
the posed research stretches.
|
| 50 |
+
|
| 51 |
+
### Cheaper RL rollouts — the generalisability + frontier story
|
| 52 |
+
|
| 53 |
+
The speedup is a *decode-time* property, so it carries into any RL trainer whose rollout phase is
|
| 54 |
+
OpenAI-compatible vLLM inference — e.g. **`verifiers`** envs (our `spec_rl`, or third-party Hub envs
|
| 55 |
+
like [`pandelis/zerolang-editing`](https://app.primeintellect.ai/dashboard/environments/pandelis/zerolang-editing)
|
| 56 |
+
— install + repoint `endpoints.toml`, zero code change) and **[OpenPipe ART](https://github.com/openpipe/art)**
|
| 57 |
+
(GRPO + LoRA, rollouts served via vLLM). Drop `--speculative-config` into the rollout server →
|
| 58 |
+
cheaper rollouts.
|
| 59 |
+
|
| 60 |
+
**The honest open problem:** in RL the policy moves every batch (e.g. ART's LoRA), so a drafter
|
| 61 |
+
trained on the *base* model drifts → acceptance τ decays → the speedup erodes across training. Within
|
| 62 |
+
a batch the policy is frozen, so the per-batch win is real; the frontier is keeping the drafter useful
|
| 63 |
+
as the policy moves (periodic drafter distillation, hidden-state-conditioned drafters, or measuring
|
| 64 |
+
and amortizing the re-sync cost). This is the "novel research idea" axis, stated plainly.
|
| 65 |
+
|
| 66 |
+
---
|
| 67 |
+
|
| 68 |
+
## Method
|
| 69 |
+
|
| 70 |
+
- **Target model:** `poolside/Laguna-XS.2` — 33.4B-total / 3B-active MoE, single GPU, FP8 native,
|
| 71 |
+
128K (→256K) context, Apache 2.0, built for agentic coding.
|
| 72 |
+
- **Draft model:** `poolside/Laguna-XS.2-speculator.dflash` — a 0.6B-parameter draft model
|
| 73 |
+
(block-diffusion-style speculative-decoding method).
|
| 74 |
+
- **How it works:** DFlash proposes **γ = 7** candidate tokens per round; Laguna XS.2 verifies all
|
| 75 |
+
7 in a **single forward pass** and commits the longest matching prefix plus one free bonus token.
|
| 76 |
+
Same output, fewer expensive target passes.
|
| 77 |
+
- **Why lossless:** under greedy decoding the target only commits tokens equal to its own argmax,
|
| 78 |
+
so the output is token-identical to the baseline. Under sampling, vLLM's rejection sampling
|
| 79 |
+
preserves the target's output distribution. **Decode-time property — independent of training.**
|
| 80 |
+
- **Regime:** the win lands at **low batch / memory-bound decode** — the single-GPU, single-agent
|
| 81 |
+
case. It shrinks (and can invert) at high batch / compute-bound. See the honesty note below.
|
| 82 |
+
|
| 83 |
+
### The exact vLLM flag
|
| 84 |
+
|
| 85 |
+
Baseline and DFlash differ by **one flag only** — that is the whole experiment:
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
--speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash","num_speculative_tokens":7,"method":"dflash"}'
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
Requires **vLLM ≥ 0.21.0** and `VLLM_USE_DEEP_GEMM=0`.
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## Results
|
| 96 |
+
|
| 97 |
+
Same prompts, same `max_tokens`, **temperature 0 (greedy)**, same single GPU,
|
| 98 |
+
`--tensor-parallel-size 1`. Only `--speculative-config` differs between the two servers.
|
| 99 |
+
|
| 100 |
+
Measured on an **H200**, vLLM 0.22.0, `--enforce-eager`, `--max-model-len 4096`, greedy. Two runs:
|
| 101 |
+
a **14-prompt mixed-difficulty** code set (trivial `fib`/`is_prime` → hard `lcs`/`dijkstra`/`LRUCache`)
|
| 102 |
+
plus a corroborating **20-prompt trivial** set.
|
| 103 |
+
|
| 104 |
+
| Metric | Baseline | + DFlash | Δ |
|
| 105 |
+
|---|---|---|---|
|
| 106 |
+
| tokens/sec — mixed-difficulty (N=14) | 19.6 | 54.2 | **2.76×** ↑ |
|
| 107 |
+
| tokens/sec — trivial (N=20) | 19.5 | 48.1 | **2.47×** ↑ |
|
| 108 |
+
| greedy parity | — | **identical** | **0 / 14 and 0 / 20 mismatches** ✓ |
|
| 109 |
+
| HumanEval pass@1 | not run† | not run† | — |
|
| 110 |
+
|
| 111 |
+
- **tokens/sec is the headline win** — directly measured wall-clock. The speedup *holds and is larger*
|
| 112 |
+
on the harder, more diverse set (**2.76×**) than on the trivial one (2.47×), and output is
|
| 113 |
+
byte-identical in **both**.
|
| 114 |
+
- **No acceptance-length (τ) claim — on purpose.** vLLM's `/metrics` τ pinned at *exactly* the γ+1
|
| 115 |
+
ceiling (8.0) on **both** runs, and per-prompt deltas didn't resolve a distribution — almost
|
| 116 |
+
certainly a metrics artifact, not true 100% acceptance. So we report only the directly-measured
|
| 117 |
+
speedup + parity and treat τ as unreliable. *The metric we can't trust, we don't quote.*
|
| 118 |
+
- **parity** = baseline vs DFlash greedy outputs are token-identical — the lossless proof.
|
| 119 |
+
- **†No TTFT or HumanEval-pass@1 row.** This MIN A/B measured throughput + byte-parity only; the
|
| 120 |
+
harness did not isolate true time-to-first-token, and a full HumanEval pass@1 sweep is a documented
|
| 121 |
+
next step. Byte-identical greedy output ⇒ identical pass@1 *by construction*, so parity is the
|
| 122 |
+
stronger guarantee here.
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
## How to reproduce
|
| 127 |
+
|
| 128 |
+
**The exact run that produced the numbers above** — one self-contained command on Hugging Face Jobs
|
| 129 |
+
(no ssh; serves baseline → measures → re-serves with DFlash → measures → byte-parity), funded by the
|
| 130 |
+
HF Jobs credit pool:
|
| 131 |
+
|
| 132 |
+
```bash
|
| 133 |
+
hf jobs uv run --flavor h200 --timeout 1500 --detach --secrets HF_TOKEN scripts/hf_job_ab.py
|
| 134 |
+
# then: hf jobs logs <id> → the [job] RESULT / BASELINE_JSON / DFLASH_JSON / PARITY_JSON lines
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
`scripts/hf_job_ab.py` pins the working vLLM env (Triton MoE + Torch sampler + FlashAttention, so no
|
| 138 |
+
CUDA toolkit is needed in the slim image — see `THE_JOURNEY.md` for *why*). Below is the equivalent
|
| 139 |
+
local two-server flow for any CUDA box with the released weights (vLLM ≥ 0.21.0):
|
| 140 |
+
|
| 141 |
+
```bash
|
| 142 |
+
# 1. Baseline server (speed floor)
|
| 143 |
+
python scripts/serve_vllm.py --mode baseline --run # serves on :8000
|
| 144 |
+
|
| 145 |
+
# 2. Benchmark baseline (separate shell)
|
| 146 |
+
python bench/measure.py --base-url http://localhost:8000 --model laguna \
|
| 147 |
+
--label baseline --n 20 --out results/baseline.json
|
| 148 |
+
|
| 149 |
+
# 3. DFlash server — same command + the one --speculative-config flag
|
| 150 |
+
python scripts/serve_vllm.py --mode dflash --run
|
| 151 |
+
python bench/measure.py --base-url http://localhost:8000 --model laguna \
|
| 152 |
+
--label dflash --n 20 --out results/dflash.json
|
| 153 |
+
|
| 154 |
+
# 4. Quality + lossless parity
|
| 155 |
+
python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
|
| 156 |
+
--n 25 --out results/humaneval_dflash.json
|
| 157 |
+
python evals/humaneval_subset.py --parity \
|
| 158 |
+
--base-url http://localhost:8000 --base-url-b http://localhost:8001 --model laguna --n 25
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
The results table above is the diff of `results/baseline.json` and `results/dflash.json` plus the
|
| 162 |
+
parity result. τ is read from vLLM's `/metrics` by the harness, but (as explained under Results) it is treated as unreliable and is not quoted.
|
| 163 |
+
|
| 164 |
+
---
|
| 165 |
+
|
| 166 |
+
## Honesty note — the low-batch regime
|
| 167 |
+
|
| 168 |
+
This is deliberately a **single-GPU, low-concurrency** result: one box, one agent, maximum
|
| 169 |
+
tokens/sec.
|
| 170 |
+
|
| 171 |
+
Speculative decoding helps **most at low batch size / memory-bound decode**, where each step
|
| 172 |
+
reloads the active weights to emit a single token and doing useful work for several tokens per
|
| 173 |
+
pass is a large win. It helps **less at high batch size / compute-bound decode** — once the GPU is
|
| 174 |
+
saturated, the matmuls dominate and the extra verify work for rejected drafts can slightly hurt.
|
| 175 |
+
At very high concurrency you would tune γ down or turn speculation off.
|
| 176 |
+
|
| 177 |
+
The reported speedup (τ and acceptance figures are deliberately not quoted) is for the low-batch single-GPU regime on
|
| 178 |
+
coding-style prompts. The lossless claim (greedy parity) holds regardless of regime — it is a
|
| 179 |
+
correctness property of the verification step, not a function of batch size.
|
| 180 |
+
|
| 181 |
+
---
|
| 182 |
+
|
| 183 |
+
## License
|
| 184 |
+
|
| 185 |
+
Apache 2.0, inheriting `poolside/Laguna-XS.2`.
|
bench/measure.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
measure.py — the benchmark harness. Hits an OpenAI-compatible endpoint (the one
|
| 4 |
+
`vllm serve` exposes) and records the three demo numbers:
|
| 5 |
+
|
| 6 |
+
tokens/sec (decode throughput) <- THE WIN
|
| 7 |
+
TTFT (time to first token) <- should be ~unchanged with DFlash
|
| 8 |
+
acceptance length tau <- WHY it's faster (read from vLLM metrics)
|
| 9 |
+
|
| 10 |
+
Run it twice at the venue — once against the baseline server, once against the
|
| 11 |
+
DFlash server — and diff the JSON. That diff IS the before/after table.
|
| 12 |
+
|
| 13 |
+
This file is endpoint-driven, so it runs anywhere (including the Mac) AS LONG AS
|
| 14 |
+
something is serving on --base-url. On the Mac you can point it at a local
|
| 15 |
+
tiny-model OpenAI server to shape-test; at the venue you point it at vLLM.
|
| 16 |
+
|
| 17 |
+
acceptance length tau:
|
| 18 |
+
tau = mean(number of tokens committed per target forward pass).
|
| 19 |
+
With a draft of gamma=7, tau ranges from 1 (everything rejected, +1 bonus)
|
| 20 |
+
up to gamma+1=8 (all accepted + bonus). The DFlash card publishes per-position
|
| 21 |
+
acceptance only (~70.7% at position 1, decaying to ~2% by position 7), NOT a
|
| 22 |
+
tau figure -- measure tau at the venue (expect roughly 2-3). vLLM exposes
|
| 23 |
+
accepted/draft counts in its metrics; we
|
| 24 |
+
read them from /metrics (Prometheus) when present; otherwise tau is reported as None, not estimated
|
| 25 |
+
from the speedup. VERIFY AT ONBOARDING which metric names the vLLM build uses
|
| 26 |
+
(e.g. vllm:spec_decode_num_accepted_tokens / _num_draft_tokens).
|
| 27 |
+
|
| 28 |
+
Usage:
|
| 29 |
+
python bench/measure.py --base-url http://localhost:8000 --model laguna \
|
| 30 |
+
--label dflash --out results/dflash.json --n 20
|
| 31 |
+
python bench/measure.py --base-url http://localhost:8000 --model laguna \
|
| 32 |
+
--label baseline --out results/baseline.json --n 20
|
| 33 |
+
|
| 34 |
+
Requires only the stdlib (urllib; no `requests`), so no extra venue deps.
|
| 35 |
+
"""
|
| 36 |
+
from __future__ import annotations
|
| 37 |
+
|
| 38 |
+
import argparse
|
| 39 |
+
import json
|
| 40 |
+
import os
|
| 41 |
+
import time
|
| 42 |
+
import urllib.request
|
| 43 |
+
from statistics import mean
|
| 44 |
+
|
| 45 |
+
PROMPTS = [
|
| 46 |
+
"Write a Python function that returns the nth Fibonacci number iteratively.",
|
| 47 |
+
"Implement binary search over a sorted list in Python. Return the index or -1.",
|
| 48 |
+
"Write a function to check if a string is a palindrome, ignoring case and spaces.",
|
| 49 |
+
"Implement quicksort in Python.",
|
| 50 |
+
"Write a function that merges two sorted lists into one sorted list.",
|
| 51 |
+
]
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def _post(url: str, payload: dict) -> dict:
|
| 55 |
+
data = json.dumps(payload).encode()
|
| 56 |
+
req = urllib.request.Request(url, data=data,
|
| 57 |
+
headers={"Content-Type": "application/json"})
|
| 58 |
+
with urllib.request.urlopen(req, timeout=600) as r:
|
| 59 |
+
return json.loads(r.read().decode())
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def _try_metrics(base_url: str) -> dict:
|
| 63 |
+
"""Best-effort read of vLLM Prometheus spec-decode counters."""
|
| 64 |
+
out = {}
|
| 65 |
+
try:
|
| 66 |
+
with urllib.request.urlopen(base_url.rstrip("/") + "/metrics", timeout=10) as r:
|
| 67 |
+
text = r.read().decode()
|
| 68 |
+
except Exception:
|
| 69 |
+
return out
|
| 70 |
+
for line in text.splitlines():
|
| 71 |
+
if line.startswith("#"):
|
| 72 |
+
continue
|
| 73 |
+
# VERIFY metric names at onboarding; these are the common vLLM ones.
|
| 74 |
+
for key in ("spec_decode_num_accepted_tokens",
|
| 75 |
+
"spec_decode_num_draft_tokens",
|
| 76 |
+
"spec_decode_num_emitted_tokens"):
|
| 77 |
+
if key in line:
|
| 78 |
+
try:
|
| 79 |
+
out[key] = float(line.split()[-1])
|
| 80 |
+
except ValueError:
|
| 81 |
+
pass
|
| 82 |
+
return out
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def measure_one(base_url: str, model: str, prompt: str, max_tokens: int) -> dict:
    """Stream one greedy completion and time it.

    Posts to ``<base_url>/v1/completions`` with ``temperature`` 0 and
    ``stream`` enabled, then reads the server-sent ``data:`` lines until
    ``[DONE]``.

    Token counting: the request asks for ``stream_options.include_usage``. When
    the server reports ``usage.completion_tokens`` that figure is used.
    Otherwise the count falls back to the number of non-empty text chunks. A
    chunk is not necessarily one token (a speculative-decoding server can
    commit several tokens per step and stream them together), so the fallback
    can under-count.
    NOTE(review): assumes the endpoint accepts or ignores ``stream_options``,
    as OpenAI-compatible servers are expected to; confirm for custom stubs.

    Returns a dict with:
        ttft_s        seconds until the first non-empty text chunk, or None if
                      no text was received
        total_s       seconds for the whole request
        new_tokens    completion token count (see above)
        tokens_per_s  (new_tokens - 1) / (total_s - ttft_s), or 0.0 when fewer
                      than two tokens were produced
        text          the concatenated completion text
    """
    url = base_url.rstrip("/") + "/v1/completions"
    # Greedy (temperature 0) so output is deterministic — this is what makes the
    # baseline-vs-DFlash output comparison a LOSSLESS check.
    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    data = json.dumps(payload).encode()
    req = urllib.request.Request(url, data=data,
                                 headers={"Content-Type": "application/json"})
    t0 = time.perf_counter()
    ttft = None
    n_chunks = 0
    usage_tokens = None
    chunks = []
    with urllib.request.urlopen(req, timeout=600) as r:
        for raw in r:
            line = raw.decode().strip()
            if not line.startswith("data:"):
                continue
            body = line[len("data:"):].strip()
            if body == "[DONE]":
                break
            obj = json.loads(body)
            usage = obj.get("usage") or {}
            if usage.get("completion_tokens") is not None:
                # Usage is cumulative, so the last reported value wins.
                usage_tokens = int(usage["completion_tokens"])
            # The usage-only chunk carries an empty "choices" list.
            choices = obj.get("choices") or [{}]
            piece = choices[0].get("text", "")
            if piece:
                if ttft is None:
                    ttft = time.perf_counter() - t0
                n_chunks += 1
                chunks.append(piece)
    total = time.perf_counter() - t0
    n_tokens = usage_tokens if usage_tokens is not None else n_chunks
    # Decode throughput excludes the first token, whose latency is TTFT.
    decode_time = max(total - (ttft or 0.0), 1e-9)
    tps = (n_tokens - 1) / decode_time if n_tokens > 1 else 0.0
    return {
        "ttft_s": ttft,
        "total_s": total,
        "new_tokens": n_tokens,
        "tokens_per_s": tps,
        "text": "".join(chunks),
    }
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def main() -> None:
    """Run the benchmark from the command line and print or write the summary.

    Reads the spec-decode counters before and after ``--n`` streamed
    generations (cycling through ``PROMPTS``), derives the acceptance length
    tau from the counter deltas when draft tokens were recorded, and prints a
    JSON summary. With ``--out`` the full summary, including per-run records,
    is also written to that path.

    ``tokens_per_s_mean`` and ``ttft_s_mean`` are ``None`` when there is
    nothing to average (no runs, or no run produced any text).
    """
    p = argparse.ArgumentParser(description="Benchmark tokens/sec, TTFT, acceptance length against a vLLM endpoint.")
    p.add_argument("--base-url", default="http://localhost:8000")
    p.add_argument("--model", default="laguna")
    p.add_argument("--label", required=True, help="baseline | dflash (used in the output).")
    p.add_argument("--n", type=int, default=20, help="Number of generations (cycles through the prompt set).")
    p.add_argument("--max-tokens", type=int, default=256)
    p.add_argument("--out", default=None, help="Write JSON here (e.g. results/dflash.json).")
    args = p.parse_args()

    before = _try_metrics(args.base_url)
    runs = []
    for i in range(args.n):
        prompt = PROMPTS[i % len(PROMPTS)]
        run = measure_one(args.base_url, args.model, prompt, args.max_tokens)
        runs.append(run)
        # ttft_s is None when the server returned no text; don't crash on it.
        ttft_txt = f"{run['ttft_s']:.3f}" if run["ttft_s"] is not None else "n/a"
        print(f" [{args.label}] run {i+1}/{args.n} "
              f"tps={run['tokens_per_s']:.1f} ttft={ttft_txt}s")
    after = _try_metrics(args.base_url)

    def _delta(key: str) -> float:
        """Return how much counter *key* grew during the benchmark (0 if absent)."""
        return after.get(key, 0) - before.get(key, 0)

    # acceptance length tau from metric deltas, if available.
    # tau ~= total committed tokens / number of target verification passes.
    # accepted + 1 bonus per pass; passes ~= draft / gamma. Best-effort only.
    tau = None
    acc = _delta("spec_decode_num_accepted_tokens")
    draft = _delta("spec_decode_num_draft_tokens")
    if draft > 0:
        passes = draft / NUM_SPECULATIVE_TOKENS  # gamma
        committed = acc + passes  # +1 bonus token per pass
        tau = committed / passes

    tps_values = [r["tokens_per_s"] for r in runs]
    ttft_values = [r["ttft_s"] for r in runs if r["ttft_s"] is not None]
    summary = {
        "label": args.label,
        "model": args.model,
        "base_url": args.base_url,
        "n": args.n,
        # statistics.mean raises on empty input, so guard both averages.
        "tokens_per_s_mean": mean(tps_values) if tps_values else None,
        "ttft_s_mean": mean(ttft_values) if ttft_values else None,
        "acceptance_length_tau": tau,  # None if metrics unavailable — read off /metrics manually then
        "spec_metrics_before": before,
        "spec_metrics_after": after,
        "runs": runs,
    }
    print(json.dumps({k: v for k, v in summary.items() if k != "runs"}, indent=2))
    if args.out:
        os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
        with open(args.out, "w") as f:
            json.dump(summary, f, indent=2)
        print(f"[measure] wrote {args.out}")
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
NUM_SPECULATIVE_TOKENS = 7 # gamma, per the DFlash card
|
| 182 |
+
|
| 183 |
+
if __name__ == "__main__":
|
| 184 |
+
main()
|
bench/rollout_bench.py
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
rollout_bench.py — the COMBINED-THESIS benchmark. It measures the same endpoint
|
| 4 |
+
that verifiers points its RL rollouts at (see configs/endpoints.toml), but frames
|
| 5 |
+
the numbers the way an RL post-training run cares about:
|
| 6 |
+
|
| 7 |
+
rollout throughput (completions/sec, tokens/sec) <- THE WIN
|
| 8 |
+
TTFT (time to first token) <- ~unchanged with DFlash
|
| 9 |
+
acceptance length tau <- WHY it's faster
|
| 10 |
+
projected $/run saved <- WHY it's CHEAPER
|
| 11 |
+
|
| 12 |
+
The thesis is "lossless DFlash speculative decoding makes RL post-training
|
| 13 |
+
cheaper." RL spends most of its wall-clock generating rollouts, so a faster
|
| 14 |
+
rollout endpoint — at IDENTICAL greedy output — buys the same reward curve for
|
| 15 |
+
fewer GPU-hours. This script measures that, live, against whatever is serving on
|
| 16 |
+
--base-url. It is a sibling of measure.py and reuses the same conventions:
|
| 17 |
+
stdlib urllib only, streaming /v1/completions, greedy decode, best-effort read of
|
| 18 |
+
vLLM /metrics. The ONE design rule: baseline vs DFlash is a one-flag swap on the
|
| 19 |
+
SERVER (serve_vllm.py --mode), never a change here — so the same command produces
|
| 20 |
+
both halves of the A/B.
|
| 21 |
+
|
| 22 |
+
Workload: an RL "rollout batch" = a fixed prompt set, replayed identically, with
|
| 23 |
+
--rollouts-per-example completions per prompt. The workload is deterministic
|
| 24 |
+
(temperature 0 by default) so the BASELINE and DFLASH runs do identical work and
|
| 25 |
+
the only thing that moves is speed.
|
| 26 |
+
|
| 27 |
+
acceptance length tau:
|
| 28 |
+
tau = mean tokens committed per target forward pass. With gamma=7 it ranges
|
| 29 |
+
from 1 (all drafts rejected, +1 bonus) to 8 (all accepted + bonus). tau is NOT
|
| 30 |
+
published in any Laguna/DFlash primary source — the model card gives per-position
|
| 31 |
+
acceptance rates only (position-1 ~70.7%, decaying to ~2% at position-7). So we
|
| 32 |
+
MEASURE it here from vLLM /metrics deltas. Expect roughly 2-3; never quote a
|
| 33 |
+
published figure. None is printed if /metrics is unavailable — read it off the
|
| 34 |
+
server's /metrics by hand then. VERIFY the exact metric names at onboarding.
|
| 35 |
+
|
| 36 |
+
Losslessness:
|
| 37 |
+
--assert-parity runs the deterministic (greedy) workload TWICE against the same
|
| 38 |
+
endpoint and asserts byte-identical completions. On a correct speculative-decoding
|
| 39 |
+
implementation greedy output is invariant, so two runs must match. (The
|
| 40 |
+
baseline-vs-DFlash cross-server parity check lives in evals/humaneval_subset.py
|
| 41 |
+
--parity; this in-run check guards against nondeterminism in the served config.)
|
| 42 |
+
|
| 43 |
+
This does NOT fabricate anything. Every number comes from live HTTP calls. If the
|
| 44 |
+
endpoint is down you get an error, not a made-up result.
|
| 45 |
+
|
| 46 |
+
Usage:
|
| 47 |
+
# measure a DFlash run and project savings at $3.50/GPU-hour
|
| 48 |
+
python bench/rollout_bench.py --base-url http://localhost:8000 --model laguna \\
|
| 49 |
+
--label dflash --prompts 8 --rollouts-per-example 8 --max-tokens 512 \\
|
| 50 |
+
--hourly-rate 3.50 --out results/rollout_dflash.json
|
| 51 |
+
|
| 52 |
+
# measure the baseline (re-serve with serve_vllm.py --mode baseline first)
|
| 53 |
+
python bench/rollout_bench.py --base-url http://localhost:8000 --model laguna \\
|
| 54 |
+
--label baseline --hourly-rate 3.50 --out results/rollout_baseline.json
|
| 55 |
+
|
| 56 |
+
# prove losslessness: two greedy runs against the same endpoint must be identical
|
| 57 |
+
python bench/rollout_bench.py --base-url http://localhost:8000 --model laguna \\
|
| 58 |
+
--label dflash --assert-parity
|
| 59 |
+
|
| 60 |
+
Requires only the stdlib (urllib), so no extra venue deps.
|
| 61 |
+
"""
|
| 62 |
+
from __future__ import annotations
|
| 63 |
+
|
| 64 |
+
import argparse
|
| 65 |
+
import json
|
| 66 |
+
import os
|
| 67 |
+
import time
|
| 68 |
+
import urllib.request
|
| 69 |
+
from statistics import mean
|
| 70 |
+
|
| 71 |
+
# Draft length gamma, per the DFlash model card. Used only to estimate the number
# of target verification passes when turning /metrics counters into tau
# (estimate_tau divides the draft-token delta by this value).
NUM_SPECULATIVE_TOKENS = 7

# The fixed rollout prompt set. Coding-style, matching the DFlash card's domain
# and measure.py's set, so tau and tokens/sec are comparable across the harness.
# main() takes a prefix of this list (--prompts), so the order is part of the
# workload definition: do not reorder or reword entries between A/B runs.
PROMPTS = [
    "Write a Python function that returns the nth Fibonacci number iteratively.",
    "Implement binary search over a sorted list in Python. Return the index or -1.",
    "Write a function to check if a string is a palindrome, ignoring case and spaces.",
    "Implement quicksort in Python.",
    "Write a function that merges two sorted lists into one sorted list.",
    "Write a Python function that returns the prime factors of an integer.",
    "Implement a function that reverses words in a sentence in place.",
    "Write a function that flattens an arbitrarily nested list of integers.",
]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# Spec-decode counters we read, written WITHOUT the "vllm:" namespace and without
# the Prometheus "_total" suffix. VERIFY metric names at onboarding; these are the
# common vLLM ones ("spec_decode_num_drafts" is the per-pass counter, when exposed).
_SPEC_COUNTER_KEYS = (
    "spec_decode_num_accepted_tokens",
    "spec_decode_num_draft_tokens",
    "spec_decode_num_emitted_tokens",
    "spec_decode_num_drafts",
)


def _parse_spec_counters(text: str) -> dict:
    """Extract the spec-decode counters from a Prometheus text exposition.

    Metric names are matched EXACTLY (after dropping the namespace prefix and the
    "_total" suffix). A substring match would let sibling series such as
    "..._per_pos_total" or "..._created" overwrite the real counter. Samples that
    share a name but differ in labels (e.g. one per engine) are summed.
    """
    totals: dict = {}
    for raw in text.splitlines():
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        head, brace, rest = line.partition("{")
        if brace:
            # name{labels} value [timestamp] -- the value follows the closing brace.
            name = head
            fields = rest.rpartition("}")[2].split()
        else:
            # name value [timestamp]
            parts = line.split()
            name, fields = parts[0], parts[1:]
        if not fields:
            continue
        name = name.strip().rpartition(":")[2]
        if name.endswith("_total"):
            name = name[: -len("_total")]
        if name not in _SPEC_COUNTER_KEYS:
            continue
        try:
            # fields[0] is the sample value; a trailing timestamp, if any, is ignored.
            value = float(fields[0])
        except ValueError:
            continue
        totals[name] = totals.get(name, 0.0) + value
    return totals


def _try_metrics(base_url: str) -> dict:
    """Best-effort read of vLLM Prometheus spec-decode counters. Empty if absent.

    Args:
        base_url: Endpoint root; "/metrics" is appended to it.

    Returns:
        A dict mapping counter name (no namespace, no "_total") to its value summed
        over all label sets. Returns {} when /metrics cannot be fetched or decoded.
    """
    try:
        with urllib.request.urlopen(base_url.rstrip("/") + "/metrics", timeout=10) as r:
            text = r.read().decode()
    except Exception:
        # Deliberately best-effort: a missing /metrics must not abort the benchmark.
        return {}
    return _parse_spec_counters(text)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def generate_one(base_url: str, model: str, prompt: str, max_tokens: int,
                 temperature: float) -> dict:
    """One streamed completion. Returns timing + the generated text.

    Args:
        base_url: Endpoint root; "/v1/completions" is appended to it.
        model: Served model name sent in the request.
        prompt: Prompt text.
        max_tokens: Cap on new tokens for this completion.
        temperature: Sampling temperature (0.0 => greedy).

    Returns:
        A dict with "ttft_s" (None if no text was streamed), "total_s",
        "new_tokens", "tokens_per_s" and "text".

    Token counting: the server is asked to append a usage record to the stream
    (stream_options.include_usage). When it reports completion_tokens that count
    is used. A streamed chunk is NOT guaranteed to be one token (a speculative
    step can commit several tokens in one chunk), so counting chunks undercounts.
    If no usage record arrives, the chunk count is the fallback.
    """
    url = base_url.rstrip("/") + "/v1/completions"
    payload = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature,  # 0.0 => greedy => deterministic => lossless-comparable
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    data = json.dumps(payload).encode()
    req = urllib.request.Request(url, data=data,
                                 headers={"Content-Type": "application/json"})
    t0 = time.perf_counter()
    ttft = None
    n_chunks = 0
    reported_tokens = None
    chunks = []
    with urllib.request.urlopen(req, timeout=600) as r:
        for raw in r:
            line = raw.decode().strip()
            if not line.startswith("data:"):
                continue  # blank separators and SSE comments
            body = line[len("data:"):].strip()
            if body == "[DONE]":
                break
            obj = json.loads(body)
            usage = obj.get("usage")
            if isinstance(usage, dict) and isinstance(usage.get("completion_tokens"), int):
                reported_tokens = usage["completion_tokens"]
            # The usage-only chunk carries an empty "choices" list; don't index into it.
            choices = obj.get("choices") or [{}]
            piece = choices[0].get("text", "")
            if piece:
                if ttft is None:
                    ttft = time.perf_counter() - t0
                n_chunks += 1
                chunks.append(piece)
    total = time.perf_counter() - t0
    n_tokens = reported_tokens if reported_tokens is not None else n_chunks
    # Decode rate excludes the first token: it arrives at ttft, the rest after it.
    decode_time = max(total - (ttft or 0.0), 1e-9)
    tps = (n_tokens - 1) / decode_time if n_tokens > 1 else 0.0
    return {
        "ttft_s": ttft,
        "total_s": total,
        "new_tokens": n_tokens,
        "tokens_per_s": tps,
        "text": "".join(chunks),
    }
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def run_rollout_batch(base_url: str, model: str, prompts: list[str],
                      rollouts_per_example: int, max_tokens: int,
                      temperature: float, label: str) -> list[dict]:
    """Replay the prompt set rollouts_per_example times — one RL rollout batch.

    Requests are issued sequentially, one generate_one() call at a time, in
    round-robin order over the prompts.

    Args:
        base_url: Endpoint root passed through to generate_one().
        model: Served model name.
        prompts: Prompt strings to replay.
        rollouts_per_example: Number of passes over the whole prompt list.
        max_tokens: Per-completion token cap.
        temperature: Sampling temperature.
        label: Tag used only in the progress lines.

    Returns:
        The generate_one() result dicts in issue order.
    """
    runs = []
    total = len(prompts) * rollouts_per_example
    for _ in range(rollouts_per_example):
        for prompt in prompts:
            res = generate_one(base_url, model, prompt, max_tokens, temperature)
            runs.append(res)
            # ttft_s is None when the completion streamed no text; formatting None
            # with ":.3f" would raise and abort the whole batch.
            ttft = res["ttft_s"]
            ttft_txt = f"{ttft:.3f}s" if ttft is not None else "n/a"
            print(f" [{label}] rollout {len(runs)}/{total} "
                  f"tps={res['tokens_per_s']:.1f} ttft={ttft_txt}")
    return runs
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def estimate_tau(before: dict, after: dict,
                 num_speculative_tokens: int | None = None) -> float | None:
    """tau from vLLM /metrics deltas. None if counters are unavailable.

    Committed tokens per target pass = accepted / passes + 1 (one bonus token per
    verification pass). The number of passes is taken from the
    "spec_decode_num_drafts" delta when both snapshots carry it. Otherwise it is
    approximated as draft_tokens / gamma, which assumes every pass drafted
    exactly gamma tokens.

    Args:
        before: Counter snapshot taken before the workload.
        after: Counter snapshot taken after the workload.
        num_speculative_tokens: Draft length gamma for the fallback estimate;
            defaults to the module-level NUM_SPECULATIVE_TOKENS.

    Returns:
        The estimated acceptance length, or None when the deltas are missing,
        zero, or negative (a counter going backwards means the server restarted
        between the two snapshots, so the delta is meaningless).
    """
    def delta(key: str) -> float:
        return after.get(key, 0) - before.get(key, 0)

    accepted = delta("spec_decode_num_accepted_tokens")
    if accepted < 0:
        return None

    passes = delta("spec_decode_num_drafts")
    if passes <= 0:
        gamma = (NUM_SPECULATIVE_TOKENS if num_speculative_tokens is None
                 else num_speculative_tokens)
        drafted = delta("spec_decode_num_draft_tokens")
        if gamma <= 0 or drafted <= 0:
            return None
        passes = drafted / gamma

    committed = accepted + passes  # +1 bonus token per verification pass
    return committed / passes
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def assert_parity(base_url: str, model: str, prompts: list[str], max_tokens: int) -> dict:
    """Run the GREEDY workload twice and require byte-identical completions.

    On correct speculative decoding, greedy output is invariant — two runs MUST
    match. A mismatch means the served config is nondeterministic (or broken), not
    lossless.

    Args:
        base_url: Endpoint root.
        model: Served model name.
        prompts: Prompts to replay once per run.
        max_tokens: Per-completion token cap.

    Returns:
        A dict with "parity_pairs", "identical", "mismatches" and "lossless".

    Raises:
        AssertionError: On any mismatch, so a CI/demo run fails loudly. It is
            raised explicitly rather than via an ``assert`` statement, because
            ``python -O`` strips asserts and would let a failing check pass.
    """
    print("[parity] greedy run A ...")
    a = run_rollout_batch(base_url, model, prompts, 1, max_tokens, 0.0, "parity-A")
    print("[parity] greedy run B ...")
    b = run_rollout_batch(base_url, model, prompts, 1, max_tokens, 0.0, "parity-B")
    mismatches = sum(1 for x, y in zip(a, b) if x["text"] != y["text"])
    identical = len(a) - mismatches
    result = {
        "parity_pairs": len(a),
        "identical": identical,
        "mismatches": mismatches,
        "lossless": mismatches == 0,
    }
    print(json.dumps(result, indent=2))
    if mismatches != 0:
        raise AssertionError(
            f"PARITY FAILED: {mismatches}/{len(a)} greedy completions differed across "
            f"two runs of the same endpoint — output is NOT deterministic/lossless."
        )
    print("[parity] PASS — greedy output is byte-identical across runs (lossless).")
    return result
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def _mean_or_none(values) -> float | None:
    """Mean of *values*, or None when there is nothing to average."""
    vals = list(values)
    return mean(vals) if vals else None


def _write_json(path: str, payload: dict) -> None:
    """Write *payload* as indented JSON to *path*, creating parent directories."""
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "w") as f:
        json.dump(payload, f, indent=2)
    print(f"[rollout_bench] wrote {path}")


def main() -> None:
    """CLI entry point: run the parity check or the throughput batch.

    With --assert-parity only the two-run greedy parity check runs. Otherwise one
    rollout batch is timed, bracketed by two /metrics snapshots, and a JSON
    summary is printed (and written to --out, with per-rollout detail, if given).
    Means that have no data points are reported as null instead of raising.
    """
    p = argparse.ArgumentParser(
        description="Rollout-throughput benchmark (completions/sec, tokens/sec, TTFT, "
                    "acceptance length tau, projected $/run) against an OpenAI-compatible "
                    "endpoint. Measures live; never fabricates.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    p.add_argument("--base-url", default="http://localhost:8000",
                   help="OpenAI-compatible endpoint root (vLLM serves /v1 and /metrics under it).")
    p.add_argument("--model", default="laguna",
                   help="Served model name/id (serve_vllm.py registers the alias 'laguna').")
    p.add_argument("--label", default="dflash",
                   help="Tag for the output: baseline | dflash. Just labels the JSON.")
    p.add_argument("--prompts", type=int, default=len(PROMPTS),
                   help="How many of the built-in prompts to use (1..%d)." % len(PROMPTS))
    p.add_argument("--rollouts-per-example", type=int, default=8,
                   help="Completions sampled per prompt — mirrors the RL config's group size.")
    p.add_argument("--max-tokens", type=int, default=512,
                   help="Max new tokens per completion. Match the RL sampling cap for honest $/run.")
    p.add_argument("--temperature", type=float, default=0.0,
                   help="0.0 = greedy/deterministic (the lossless-comparable workload). "
                        "Keep 0 for the A/B so baseline and DFlash do identical work.")
    p.add_argument("--hourly-rate", type=float, default=None,
                   help="GPU $/hour. If set, projects rollout-batch cost and (with --baseline-tps) savings.")
    p.add_argument("--baseline-tps", type=float, default=None,
                   help="Baseline tokens/sec from a prior --label baseline run. Lets this run project "
                        "the $ SAVED vs baseline for the same rollout workload.")
    p.add_argument("--assert-parity", action="store_true",
                   help="Run the greedy workload twice and assert byte-identical output (lossless check). "
                        "Exits nonzero on mismatch. Skips the throughput batch.")
    p.add_argument("--out", default=None, help="Write JSON summary here (e.g. results/rollout_dflash.json).")
    args = p.parse_args()

    # Clamp --prompts into 1..len(PROMPTS) rather than failing on an out-of-range value.
    prompts = PROMPTS[:max(1, min(args.prompts, len(PROMPTS)))]

    if args.assert_parity:
        result = assert_parity(args.base_url, args.model, prompts, args.max_tokens)
        if args.out:
            _write_json(args.out, {"label": args.label, "parity": result})
        return

    if args.rollouts_per_example < 1:
        # An empty batch has no throughput to report; fail at the CLI boundary.
        p.error("--rollouts-per-example must be >= 1")

    before = _try_metrics(args.base_url)
    t_start = time.perf_counter()
    runs = run_rollout_batch(args.base_url, args.model, prompts,
                             args.rollouts_per_example, args.max_tokens,
                             args.temperature, args.label)
    wall_s = time.perf_counter() - t_start
    after = _try_metrics(args.base_url)

    tau = estimate_tau(before, after)
    total_tokens = sum(r["new_tokens"] for r in runs)
    n_rollouts = len(runs)
    completions_per_s = n_rollouts / wall_s if wall_s > 0 else 0.0
    tokens_per_s_aggregate = total_tokens / wall_s if wall_s > 0 else 0.0

    summary = {
        "label": args.label,
        "model": args.model,
        "base_url": args.base_url,
        "prompts": len(prompts),
        "rollouts_per_example": args.rollouts_per_example,
        "n_rollouts": n_rollouts,
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "wall_s": wall_s,
        "completions_per_s": completions_per_s,  # rollout throughput — the headline
        "total_new_tokens": total_tokens,
        "tokens_per_s_aggregate": tokens_per_s_aggregate,
        "tokens_per_s_mean_per_rollout": _mean_or_none(r["tokens_per_s"] for r in runs),
        # null when no rollout streamed any text (every ttft_s is None)
        "ttft_s_mean": _mean_or_none(r["ttft_s"] for r in runs if r["ttft_s"] is not None),
        "acceptance_length_tau": tau,  # None if /metrics absent — read it off /metrics by hand then
        "spec_metrics_before": before,
        "spec_metrics_after": after,
    }

    # ---- projected $/run -------------------------------------------------
    # Cost of THIS rollout batch at the given GPU price. If a baseline tokens/sec
    # is supplied, also project what the SAME workload would have cost at baseline
    # speed, and the savings — the dollars-and-cents form of the thesis.
    if args.hourly_rate is not None:
        batch_cost = (wall_s / 3600.0) * args.hourly_rate
        cost = {"hourly_rate": args.hourly_rate, "batch_cost_usd": batch_cost}
        if args.baseline_tps and args.baseline_tps > 0 and total_tokens > 0:
            baseline_wall_s = total_tokens / args.baseline_tps
            baseline_cost = (baseline_wall_s / 3600.0) * args.hourly_rate
            cost.update({
                "baseline_tps_reference": args.baseline_tps,
                "projected_baseline_wall_s": baseline_wall_s,
                "projected_baseline_cost_usd": baseline_cost,
                "projected_savings_usd": baseline_cost - batch_cost,
                # baseline_tps > 0 is guaranteed by the enclosing condition
                "speedup_x": (tokens_per_s_aggregate / args.baseline_tps) or None,
            })
        summary["cost_projection"] = cost

    print(json.dumps(summary, indent=2))
    if args.out:
        # Persist per-rollout detail alongside the summary for later inspection.
        _write_json(args.out, {**summary, "runs": runs})


if __name__ == "__main__":
    main()
|
configs/endpoints.toml
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# endpoints.toml — verifiers / Prime inference endpoints for the Laguna hackathon.
|
| 2 |
+
#
|
| 3 |
+
# THIS FILE IS THE SEAM. It is the single place that decides WHERE a verifiers
|
| 4 |
+
# environment sends its rollout generations. Point it at the vanilla Laguna
|
| 5 |
+
# endpoint and rollouts run at baseline speed; point it at the vLLM+DFlash
|
| 6 |
+
# endpoint and the SAME rollouts run faster — at byte-identical greedy output.
|
| 7 |
+
# That swap (and nothing else) is the combined thesis: "lossless DFlash
|
| 8 |
+
# speculative decoding makes RL post-training cheaper" — same reward curve,
|
| 9 |
+
# higher rollout throughput, lower $/run.
|
| 10 |
+
#
|
| 11 |
+
# SCHEMA follows the Prime lab-cookbook (configs/endpoints.toml): an array of
|
| 12 |
+
# [[endpoint]] tables, each with:
|
| 13 |
+
# endpoint_id = "<alias>" # what you pass to `prime eval run -m <alias>`
|
| 14 |
+
# model = "<repo-id>"
|
| 15 |
+
# url = "<openai-compatible base url, ending in /v1>"
|
| 16 |
+
# key = "<ENV_VAR_NAME>" # the NAME of the env var holding the key, not the key itself
|
| 17 |
+
# type = "openai_chat_completions" # vLLM + Prime Inference are OpenAI-compatible
|
| 18 |
+
#
|
| 19 |
+
# How to use at the venue:
|
| 20 |
+
# 1. Serve the model on the GPU:
|
| 21 |
+
# python laguna-hack/scripts/serve_vllm.py --mode dflash --run
|
| 22 |
+
# (baseline is the SAME command with --mode baseline — one flag flips it.)
|
| 23 |
+
# 2. Run rollouts against the local DFlash server:
|
| 24 |
+
# prime eval run spec_rl -m local-dflash -n 128
|
| 25 |
+
# 3. For the BEFORE number, re-serve with --mode baseline and re-run with the
|
| 26 |
+
# same endpoint_id (url identical; only the server's spec-config differs),
|
| 27 |
+
# so the reward curve is a clean A/B on throughput alone.
|
| 28 |
+
#
|
| 29 |
+
# [verify at onboarding] Confirm the exact `type` string and whether `key` for a
|
| 30 |
+
# no-auth local vLLM should be an env-var name or a literal, against the venue's
|
| 31 |
+
# installed `prime`/`verifiers` version (the cookbook uses env-var NAMES like
|
| 32 |
+
# PRIME_API_KEY). Adjust if the CLI complains.
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# local-dflash (ACTIVE) — our own vLLM server with the DFlash speculator, on the
|
| 36 |
+
# OpenAI-compatible surface vLLM exposes at :8000. vLLM requires a non-empty key
|
| 37 |
+
# but does not authenticate it, so EMPTY is a placeholder.
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
[[endpoint]]
|
| 40 |
+
endpoint_id = "local-dflash"
|
| 41 |
+
model = "poolside/Laguna-XS.2"
|
| 42 |
+
url = "http://localhost:8000/v1"
|
| 43 |
+
key = "EMPTY"
|
| 44 |
+
type = "openai_chat_completions"
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# local-baseline (OPTIONAL) — a second vLLM server with NO speculator on :8001,
|
| 48 |
+
# for a side-by-side A/B without re-serving. Only if the GPU has room for two
|
| 49 |
+
# servers; on a single small GPU prefer re-serving on :8000 (flip --mode).
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
[[endpoint]]
|
| 52 |
+
endpoint_id = "local-baseline"
|
| 53 |
+
model = "poolside/Laguna-XS.2"
|
| 54 |
+
url = "http://localhost:8001/v1"
|
| 55 |
+
key = "EMPTY"
|
| 56 |
+
type = "openai_chat_completions"
|
| 57 |
+
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
# prime (HOSTED FALLBACK) — Prime Intellect managed inference. Use if the local
|
| 60 |
+
# vLLM is down or while waiting on venue compute. Costs PI credits per token (the
|
| 61 |
+
# $50 pool covers Prime Inference + Sandboxes + On-Demand GPUs). PRIME_API_KEY is
|
| 62 |
+
# read from the environment — never hard-code a key here.
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
[[endpoint]]
|
| 65 |
+
endpoint_id = "prime"
|
| 66 |
+
model = "poolside/Laguna-XS.2"
|
| 67 |
+
url = "https://api.pinference.ai/api/v1"
|
| 68 |
+
key = "PRIME_API_KEY"
|
| 69 |
+
type = "openai_chat_completions"
|
configs/rl/laguna-spec.toml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# laguna-spec.toml — RL post-training config for the COMBINED thesis run.
|
| 2 |
+
#
|
| 3 |
+
# The claim: lossless DFlash speculative decoding makes RL post-training cheaper.
|
| 4 |
+
# RL post-training spends most of its wall-clock GENERATING rollouts (the policy
|
| 5 |
+
# samples completions, the rubric scores them, the gradient step is comparatively
|
| 6 |
+
# tiny). So if rollouts come back faster — at IDENTICAL output, because greedy
|
| 7 |
+
# DFlash is lossless — the SAME reward curve arrives in less time / fewer $.
|
| 8 |
+
#
|
| 9 |
+
# This file is consumed by: prime train configs/rl/laguna-spec.toml
|
| 10 |
+
# Rollout inference is routed by ./configs/endpoints.toml (the SEAM). Serve the
|
| 11 |
+
# model with DFlash (serve_vllm.py --mode dflash) and the rollouts below run on
|
| 12 |
+
# the speculator; serve baseline and they run on the floor. The RL math is
|
| 13 |
+
# unchanged either way — that is the whole point of the A/B.
|
| 14 |
+
#
|
| 15 |
+
# Hosted Laguna training at the venue is FREE but capped: 1 concurrent run per
|
| 16 |
+
# user and batch_size <= 128. Stay inside those limits.
|
| 17 |
+
|
| 18 |
+
model = "poolside/Laguna-XS.2" # the policy being post-trained (same id as the served model)
|
| 19 |
+
|
| 20 |
+
# ---- training loop -------------------------------------------------------
|
| 21 |
+
max_steps = 50 # gradient steps; keep small — this is a venue demo, not a full run
|
| 22 |
+
batch_size = 64 # prompts per step. MUST be <= 128 (hosted-run hard cap). 64 leaves headroom.
|
| 23 |
+
rollouts_per_example = 8 # completions sampled per prompt (the "group" in GRPO-style RL).
|
| 24 |
+
# This is the rollout multiplier: batch_size * rollouts_per_example
|
| 25 |
+
# = 64 * 8 = 512 generations per step. THIS is the work DFlash speeds up.
|
| 26 |
+
learning_rate = 1.0e-6 # conservative LR for post-training a 33B-total/3B-active MoE; avoid drift.
|
| 27 |
+
|
| 28 |
+
# ---- sampling (how rollouts are generated) -------------------------------
|
| 29 |
+
[sampling]
|
| 30 |
+
max_tokens = 512 # cap per rollout completion; matches the bench workload so $/token lines up.
|
| 31 |
+
enable_thinking = false # NO reasoning trace during RL rollouts — keeps completions short, comparable,
|
| 32 |
+
# and cheap. (Laguna's chat template defaults thinking ON; we force it off here.)
|
| 33 |
+
temperature = 1.0 # RL needs STOCHASTIC exploration, so this run is sampled, not greedy.
|
| 34 |
+
# NOTE: the losslessness proof is a SEPARATE greedy check (rollout_bench.py
|
| 35 |
+
                              #       --assert-parity / humaneval_subset.py --parity); DFlash is lossless under greedy.
|
| 36 |
+
# At temperature>0 DFlash stays distribution-faithful via rejection sampling,
|
| 37 |
+
# so the reward curve still matches baseline within sampling noise.
|
| 38 |
+
top_p = 1.0 # no nucleus truncation; keep the sampling distribution intact for the A/B.
|
| 39 |
+
|
| 40 |
+
# ---- environment (what the rollouts are scored against) ------------------
|
| 41 |
+
# The verifiers env that defines the task + rubric. It exposes
|
| 42 |
+
# load_environment(...) -> vf.Environment and is resolved by id. Swap this id
|
| 43 |
+
# to point the run at a different Taskset/Rubric without touching the loop above.
|
| 44 |
+
[[env]]
|
| 45 |
+
id = "spec_rl" # the spec-decode RL env (coding-style Taskset + reward rubric).
|
evals/humaneval_subset.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
humaneval_subset.py — a 20-30 problem pass@1 check against an OpenAI-compatible
|
| 4 |
+
endpoint. Purpose at the venue: PROVE the DFlash run produces the SAME quality
|
| 5 |
+
as the baseline (and ideally the same greedy text), so "lossless" isn't just a
|
| 6 |
+
claim — it's a measured parity check.
|
| 7 |
+
|
| 8 |
+
Two modes:
|
| 9 |
+
1. Quality: run pass@1 on a HumanEval subset and print the score.
|
| 10 |
+
2. Parity: run greedy on both endpoints and assert outputs are token-identical.
|
| 11 |
+
|
| 12 |
+
This loads HumanEval via `datasets` (openai_humaneval). On the Mac you can dry-run
|
| 13 |
+
the harness against a tiny local server; the real numbers come from Laguna on PI.
|
| 14 |
+
|
| 15 |
+
SAFETY: this executes model-generated code to grade pass@1. Run ONLY in the
|
| 16 |
+
disposable venue sandbox / container, never on your laptop with real data.
|
| 17 |
+
A --no-exec flag skips execution and just dumps completions for manual review.
|
| 18 |
+
|
| 19 |
+
Usage:
|
| 20 |
+
python evals/humaneval_subset.py --base-url http://localhost:8000 --model laguna \
|
| 21 |
+
--n 25 --out results/humaneval_dflash.json
|
| 22 |
+
# parity check:
|
| 23 |
+
python evals/humaneval_subset.py --parity \
|
| 24 |
+
--base-url http://localhost:8000 --base-url-b http://localhost:8001 \
|
| 25 |
+
--model laguna --n 25
|
| 26 |
+
"""
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import argparse
|
| 30 |
+
import json
|
| 31 |
+
import os
|
| 32 |
+
import signal
|
| 33 |
+
import urllib.request
|
| 34 |
+
from contextlib import contextmanager
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def load_problems(n: int):
    """Return the first *n* HumanEval test problems (fewer if the split is smaller).

    The dataset id defaults to the namespaced "openai/openai_humaneval" because
    datasets >= 3 rejects the bare legacy name. Set HUMANEVAL_DATASET to override
    it when the venue image pins a different datasets version or mirror.
    """
    # Imported lazily so the module can be loaded without `datasets` installed.
    from datasets import load_dataset

    repo_id = os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    split = load_dataset(repo_id, split="test")
    count = min(n, len(split))
    return [split[idx] for idx in range(count)]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def complete(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy, non-streamed completion and return its text.

    POSTs to ``<base_url>/v1/completions`` with temperature 0 and the stop
    sequences used for HumanEval-style function bodies, then returns the text of
    the first choice.
    """
    request_body = json.dumps({
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,  # greedy => deterministic => lossless-comparable
        "stop": ["\nclass ", "\ndef ", "\n#", "\nif __name__"],
    }).encode()
    endpoint = base_url.rstrip("/") + "/v1/completions"
    request = urllib.request.Request(
        endpoint,
        data=request_body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read().decode())
    first_choice = reply["choices"][0]
    return first_choice["text"]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@contextmanager
def time_limit(seconds: float):
    """Raise TimeoutError inside the ``with`` body if it runs longer than *seconds*.

    Implemented with SIGALRM and the real-time interval timer, so it is only
    usable from the main thread on platforms that provide SIGALRM. Fractional
    seconds are accepted. On exit the timer is cancelled and the previously
    installed SIGALRM handler is restored, so using this context manager leaves
    no handler behind in the process.
    """
    def handler(signum, frame):
        raise TimeoutError("timed out")

    previous = signal.signal(signal.SIGALRM, handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Cancel first so a pending alarm cannot fire into the restored handler.
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, previous)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def passes(problem: dict, completion: str) -> bool:
    """Return True if prompt + completion passes the problem's own test.

    The program is the problem prompt, the model completion, the problem's test
    code and a final ``check(<entry_point>)`` call, executed under an 8-second
    time limit.

    Any failure counts as "did not pass": ordinary exceptions, the timeout, and
    SystemExit. SystemExit is not an Exception subclass, so generated code that
    calls ``exit()`` or ``sys.exit()`` would otherwise terminate the whole eval
    run. KeyboardInterrupt is deliberately left to propagate.
    """
    program = (
        problem["prompt"] + completion + "\n" + problem["test"]
        + f"\ncheck({problem['entry_point']})\n"
    )
    try:
        with time_limit(8):
            ns: dict = {}
            # SECURITY: this executes untrusted, model-generated code. Run only in
            # the disposable sandbox described in the module docstring.
            exec(program, ns)  # noqa: S102 — sandbox only
        return True
    except (Exception, SystemExit):
        return False
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def run_quality(args) -> None:
    """Generate a completion per problem, grade pass@1, print and optionally save.

    With --no-exec nothing is executed, so nothing is graded: each result's
    "passed" and the overall "pass_at_1" are recorded as null rather than as
    False / 0.0, which would read as a measured failure. The completions are
    still dumped for manual review.
    """
    problems = load_problems(args.n)
    results = []
    n_pass = 0
    for i, prob in enumerate(problems):
        comp = complete(args.base_url, args.model, prob["prompt"], args.max_tokens)
        ok = None if args.no_exec else passes(prob, comp)
        n_pass += int(bool(ok))
        results.append({"task_id": prob["task_id"], "passed": ok, "completion": comp})
        print(f" [{i+1}/{len(problems)}] {prob['task_id']}: {'PASS' if ok else ('?' if args.no_exec else 'fail')}")
    if args.no_exec:
        score = None  # ungraded run: there is no score to report
    else:
        score = n_pass / len(problems) if problems else 0.0
    out = {"model": args.model, "base_url": args.base_url, "n": len(problems),
           "pass_at_1": score, "no_exec": args.no_exec, "results": results}
    print(json.dumps({k: v for k, v in out.items() if k != "results"}, indent=2))
    if args.out:
        os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
        with open(args.out, "w") as f:
            json.dump(out, f, indent=2)
        shown = "n/a" if score is None else f"{score:.3f}"
        print(f"[humaneval] wrote {args.out} pass@1={shown}")
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def run_parity(args) -> None:
    """Greedy outputs from baseline (A) and DFlash (B) must be identical.

    Each problem is completed once on --base-url (A) and once on --base-url-b (B)
    and the returned completion strings are compared for exact equality. The
    summary is printed and, when --out is given, also written there.

    Raises:
        SystemExit: With a nonzero status when any pair differs, so scripts and
            CI that gate on this check actually fail on a mismatch.
    """
    problems = load_problems(args.n)
    mismatches = 0
    for i, prob in enumerate(problems):
        a = complete(args.base_url, args.model, prob["prompt"], args.max_tokens)
        b = complete(args.base_url_b, args.model, prob["prompt"], args.max_tokens)
        same = a == b
        mismatches += int(not same)
        print(f" [{i+1}/{len(problems)}] {prob['task_id']}: {'IDENTICAL' if same else 'MISMATCH'}")
    n = len(problems)
    summary = {"parity_pairs": n, "identical": n - mismatches,
               "mismatches": mismatches,
               "lossless": mismatches == 0}
    print(json.dumps(summary, indent=2))
    out_path = getattr(args, "out", None)
    if out_path:
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
        with open(out_path, "w") as f:
            json.dump(summary, f, indent=2)
        print(f"[humaneval] wrote {out_path}")
    if mismatches:
        raise SystemExit(
            f"PARITY FAILED: {mismatches}/{n} greedy completions differ between "
            f"{args.base_url} and {args.base_url_b}"
        )
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def main() -> None:
    """Parse the command line and dispatch to the parity or the quality run."""
    parser = argparse.ArgumentParser(
        description="HumanEval subset pass@1 + baseline/DFlash greedy parity check.",
    )
    # Endpoints and model.
    parser.add_argument("--base-url", default="http://localhost:8000")
    parser.add_argument(
        "--base-url-b",
        default="http://localhost:8001",
        help="DFlash endpoint for --parity.",
    )
    parser.add_argument("--model", default="laguna")
    # Workload size.
    parser.add_argument("--n", type=int, default=25)
    parser.add_argument("--max-tokens", type=int, default=512)
    # Mode switches and output.
    parser.add_argument(
        "--no-exec",
        action="store_true",
        help="Skip code execution; dump completions only.",
    )
    parser.add_argument(
        "--parity",
        action="store_true",
        help="Compare two endpoints' greedy outputs.",
    )
    parser.add_argument("--out", default=None)
    args = parser.parse_args()

    runner = run_parity if args.parity else run_quality
    runner(args)


if __name__ == "__main__":
    main()
|
results/.gitkeep
ADDED
|
File without changes
|
results/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# results/
|
| 2 |
+
|
| 3 |
+
Benchmark + eval output lands here. These files are the **demo's money slide**:
|
| 4 |
+
the before/after table is the diff of `baseline.json` and `dflash.json`.
|
| 5 |
+
|
| 6 |
+
Generated by `bench/measure.py` (and `evals/humaneval_subset.py` for `--out`).
|
| 7 |
+
Locally they come from the stub server (`make parity-local`); at the venue they
|
| 8 |
+
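come from real vLLM + Laguna. Locally regenerated JSON is meant to stay out of git
(`.gitignore`); the reference results committed in this directory (`baseline.json`,
`dflash.json`, `parity.json`, `humaneval_dflash.json`) are tracked alongside this
README and `.gitkeep`.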
|
| 10 |
+
|
| 11 |
+
## Schema (per `measure.py` run)
|
| 12 |
+
|
| 13 |
+
```json
|
| 14 |
+
{
|
| 15 |
+
"label": "dflash | baseline",
|
| 16 |
+
"model": "laguna",
|
| 17 |
+
"base_url": "http://localhost:8000",
|
| 18 |
+
"n": 5,
|
| 19 |
+
"tokens_per_s_mean": 0.0, // THE WIN — higher with dflash
|
| 20 |
+
"ttft_s_mean": 0.0, // ~flat (dflash improves TPOT, not TTFT)
|
| 21 |
+
"acceptance_length_tau": 2.6, // WHY it's faster; null if /metrics had no spec counters
|
| 22 |
+
"spec_metrics_before": {},
|
| 23 |
+
"spec_metrics_after": {},
|
| 24 |
+
"runs": [ { "ttft_s", "total_s", "new_tokens", "tokens_per_s", "text" }, ... ]
|
| 25 |
+
}
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
`scripts/check_results.py` validates this shape: `make check-results`.
|
| 29 |
+
|
| 30 |
+
The parity check (`humaneval_subset.py --parity`) prints `lossless: true` when the
|
| 31 |
+
baseline and dflash greedy outputs are token-identical — the bulletproof claim.
|
results/baseline.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"label": "baseline",
|
| 3 |
+
"model": "poolside/Laguna-XS.2",
|
| 4 |
+
"n": 14,
|
| 5 |
+
"tokens_per_s_mean": 19.64077204940069,
|
| 6 |
+
"ttft_s_mean": 6.58612985270364,
|
| 7 |
+
"acceptance_length_tau": 1.0,
|
| 8 |
+
"source": "HF Job 6a19d8b73a4b8cae6044dfdf (h200), 2026-05-29; vLLM 0.22.0, --enforce-eager, --max-model-len 4096, greedy (temperature=0), no speculator",
|
| 9 |
+
"prompt_set": "14 distinct mixed-difficulty Python prompts (trivial fib/is_prime -> medium binary_search/roman_to_int -> hard lcs/parse_duration/dijkstra/LRUCache)",
|
| 10 |
+
"corroborating_run": "An earlier 20-prompt trivial-only run (job 6a19d2105c8d10ffa1107774) gave baseline 19.47 tok/s.",
|
| 11 |
+
"note": "ttft_s_mean here is full-completion latency, NOT true time-to-first-token; we make no TTFT claim. Summary stats are over all n=14."
|
| 12 |
+
}
|
results/dflash.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"label": "dflash",
|
| 3 |
+
"model": "poolside/Laguna-XS.2",
|
| 4 |
+
"speculator": "poolside/Laguna-XS.2-speculator.dflash",
|
| 5 |
+
"num_speculative_tokens": 7,
|
| 6 |
+
"method": "dflash",
|
| 7 |
+
"n": 14,
|
| 8 |
+
"tokens_per_s_mean": 54.1741150379158,
|
| 9 |
+
"ttft_s_mean": 2.5821559940065657,
|
| 10 |
+
"acceptance_length_tau": null,
|
| 11 |
+
"tau_note": "tau read from vLLM /metrics pinned at EXACTLY gamma+1 (=8.0) on BOTH the trivial and the mixed-difficulty runs, and the per-prompt /metrics deltas did not resolve a distribution (counter refresh granularity). We therefore treat the /metrics tau as UNRELIABLE and make NO acceptance-length claim. The load-bearing, directly-measured results are the wall-clock speedup and the byte-parity.",
|
| 12 |
+
"source": "HF Job 6a19d8b73a4b8cae6044dfdf (h200), 2026-05-29; vLLM 0.22.0, --enforce-eager, --max-model-len 4096, greedy (temperature=0), --speculative-config method=dflash gamma=7",
|
| 13 |
+
"prompt_set": "same 14 distinct mixed-difficulty prompts as baseline (trivial -> hard)",
|
| 14 |
+
"corroborating_run": "An earlier 20-prompt trivial-only run (job 6a19d2105c8d10ffa1107774) gave DFlash 48.09 tok/s = 2.47x; this mixed-difficulty run gives 54.17 tok/s = 2.76x. Lossless in both.",
|
| 15 |
+
"note": "DFlash completions are byte-identical to baseline (greedy) — see results/parity.json."
|
| 16 |
+
}
|
results/humaneval_dflash.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"kind": "greedy_byte_parity",
|
| 3 |
+
"compared": 14,
|
| 4 |
+
"mismatches": 0,
|
| 5 |
+
"lossless": true,
|
| 6 |
+
"decoding": "greedy (temperature=0)",
|
| 7 |
+
"method": "Each of 14 distinct mixed-difficulty prompts was completed by Laguna XS.2 with and without the DFlash speculator; the two outputs were compared byte-for-byte.",
|
| 8 |
+
"pass_at_1": null,
|
| 9 |
+
"pass_at_1_note": "HumanEval pass@1 was NOT run. Byte-level greedy parity is the strict superset guarantee (identical bytes => identical pass@1 by construction). A full HumanEval sweep is a documented next step.",
|
| 10 |
+
"also_lossless": "An earlier 20-prompt trivial run was also lossless (0 mismatches out of 20).",
|
| 11 |
+
"source": "HF Job 6a19d8b73a4b8cae6044dfdf (h200), 2026-05-29"
|
| 12 |
+
}
|
results/parity.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"kind": "greedy_byte_parity",
|
| 3 |
+
"compared": 14,
|
| 4 |
+
"mismatches": 0,
|
| 5 |
+
"lossless": true,
|
| 6 |
+
"decoding": "greedy (temperature=0)",
|
| 7 |
+
"method": "Each of 14 distinct mixed-difficulty prompts was completed by Laguna XS.2 with and without the DFlash speculator; the two outputs were compared byte-for-byte.",
|
| 8 |
+
"pass_at_1": null,
|
| 9 |
+
"pass_at_1_note": "HumanEval pass@1 was NOT run. Byte-level greedy parity is the strict superset guarantee (identical bytes => identical pass@1 by construction). A full HumanEval sweep is a documented next step.",
|
| 10 |
+
"also_lossless": "An earlier 20-prompt trivial run was also lossless (0 mismatches out of 20).",
|
| 11 |
+
"source": "HF Job 6a19d8b73a4b8cae6044dfdf (h200), 2026-05-29"
|
| 12 |
+
}
|
scripts/check_results.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""check_results.py — smoke-validate the schema of measure.py output JSON.
|
| 3 |
+
|
| 4 |
+
The benchmark's value is the before/after diff of results/baseline.json and
|
| 5 |
+
results/dflash.json; this asserts those files have the shape the demo expects so a
|
| 6 |
+
broken run is caught locally, not on stage.
|
| 7 |
+
|
| 8 |
+
Usage: python scripts/check_results.py results/dflash.json results/baseline.json
|
| 9 |
+
Exit 0 = all valid, 1 = problems listed.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import sys
|
| 15 |
+
|
| 16 |
+
# Top-level keys every result file must carry, mapped to the accepted JSON type(s).
REQUIRED = {
    "label": str,
    "model": str,
    "n": int,
    "tokens_per_s_mean": (int, float),
    "ttft_s_mean": (int, float),
    "runs": list,
}
# Keys every entry of "runs" must carry.
RUN_KEYS = {"ttft_s", "total_s", "new_tokens", "tokens_per_s", "text"}


def _has_type(value: object, typ) -> bool:
    """Return True if *value* is an instance of *typ*, never accepting a bool."""
    # bool subclasses int, so a stray JSON true/false would otherwise pass
    # as a valid count or mean.
    if isinstance(value, bool):
        return False
    return isinstance(value, typ)


def check(path: str) -> list[str]:
    """Validate one result file and return a list of human-readable problems.

    An empty list means the file has the expected shape. The function never
    raises for bad input: unreadable files, invalid JSON/encoding and a
    non-object top level are each reported as a single problem string.

    Args:
        path: Path to the JSON file to validate.

    Returns:
        One message per problem found, each prefixed with *path*.
    """
    try:
        # Context manager so the handle is closed even when parsing fails.
        with open(path, encoding="utf-8") as fh:
            obj = json.load(fh)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return [f"{path}: cannot read/parse ({e})"]
    if not isinstance(obj, dict):
        return [f"{path}: top-level JSON must be an object, got {type(obj).__name__}"]

    problems: list[str] = []
    for key, typ in REQUIRED.items():
        if key not in obj:
            problems.append(f"{path}: missing key '{key}'")
        elif not _has_type(obj[key], typ):
            problems.append(f"{path}: key '{key}' has wrong type {type(obj[key]).__name__}")

    # A missing or wrongly-typed "runs" is already reported by the loop above;
    # only inspect the entries when it really is a list.
    runs = obj.get("runs")
    if isinstance(runs, list):
        if not runs:
            problems.append(f"{path}: 'runs' is empty")
        for i, run in enumerate(runs):
            if not isinstance(run, dict):
                problems.append(f"{path}: run[{i}] is not an object ({type(run).__name__})")
                continue
            missing = RUN_KEYS - set(run)
            if missing:
                problems.append(f"{path}: run[{i}] missing keys {sorted(missing)}")
    return problems
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def main(paths: list[str]) -> int:
    """Validate every file in *paths* and print a report.

    Returns 2 (after printing the module docstring as usage) when no paths are
    given, 1 when any file has problems, and 0 when all files are valid.
    """
    if len(paths) == 0:
        print(__doc__)
        return 2

    # Collect problems across all files first, then report which files were seen.
    found: list[str] = []
    for target in paths:
        found.extend(check(target))
    for target in paths:
        print(f"checked {target}")

    if not found:
        print("\nOK: all result files have the expected schema.")
        return 0

    print("\nFAIL:")
    for issue in found:
        print(" -", issue)
    return 1
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
if __name__ == "__main__":
|
| 67 |
+
sys.exit(main(sys.argv[1:]))
|
scripts/dress_rehearsal.sh
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# dress_rehearsal.sh — OFFLINE end-to-end dress rehearsal of the COMBINED pipeline.
|
| 3 |
+
#
|
| 4 |
+
# The combined thesis: "lossless DFlash speculative decoding makes RL post-training
|
| 5 |
+
# cheaper." This script proves the WHOLE pipeline is wired — measurement, the
|
| 6 |
+
# lossless parity check, the RL eval loop, and the rollout/$-savings benchmark —
|
| 7 |
+
# with NO Prime Intellect credits and NO GPU. It runs entirely against the two
|
| 8 |
+
# local stdlib stubs (scripts/stub_server.py): a baseline stub on :8000 and a
|
| 9 |
+
# "dflash" stub on :8001 that exposes the spec_decode_* metrics measure.py reads to
|
| 10 |
+
# recover acceptance length tau.
|
| 11 |
+
#
|
| 12 |
+
# This is the LOCAL rung of the cheap->expensive ladder. When credits/GPU land at
|
| 13 |
+
# the venue, the EXACT same flow runs with --base-url pointed at real Laguna
|
| 14 |
+
# (baseline vLLM vs DFlash-speculated vLLM) — no script changes, just real URLs.
|
| 15 |
+
#
|
| 16 |
+
# What it chains, in order:
|
| 17 |
+
# 0. start baseline stub (:8000) + dflash stub (:8001); wait until both accept.
|
| 18 |
+
# 1. bench/measure.py against each -> results/baseline.json, results/dflash.json
|
| 19 |
+
# 2. evals/humaneval_subset.py --parity (greedy parity = lossless proof) +
|
| 20 |
+
# a pass@1 dry-run (--no-exec) so the quality harness is exercised too.
|
| 21 |
+
# 3. scripts/eval_local.py against :8000 -> the verifiers RL eval loop (reward).
|
| 22 |
+
# 4. bench/rollout_bench.py against each endpoint (rollout throughput + $/run,
|
| 23 |
+
# plus an in-run --assert-parity losslessness guard on the dflash endpoint).
|
| 24 |
+
# 5. scripts/check_results.py — schema-gate the result JSON.
|
| 25 |
+
# Then it prints a PASS/FAIL banner with the key numbers (lossless?, tau, tokens/sec).
|
| 26 |
+
#
|
| 27 |
+
# The stub PIDs are ALWAYS killed on exit (trap), even on error or Ctrl-C.
|
| 28 |
+
set -euo pipefail
|
| 29 |
+
cd "$(dirname "$0")/.."
|
| 30 |
+
|
| 31 |
+
# python: prefer the project venv (it carries datasets/openai); else python3.
|
| 32 |
+
# `python` is not on PATH on this Mac, so we never rely on it.
|
| 33 |
+
if [[ -x ".venv/bin/python" ]]; then
|
| 34 |
+
PY=".venv/bin/python"
|
| 35 |
+
else
|
| 36 |
+
PY="python3"
|
| 37 |
+
fi
|
| 38 |
+
echo "[rehearse] using interpreter: $PY"
|
| 39 |
+
|
| 40 |
+
BASE_URL="http://localhost:8000" # baseline stub
|
| 41 |
+
DFLASH_URL="http://localhost:8001" # dflash stub (has tau metrics)
|
| 42 |
+
mkdir -p results
|
| 43 |
+
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
# 0. Start both stubs in the background; ALWAYS kill them on exit.
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
"$PY" scripts/stub_server.py --port 8000 & BASE_PID=$!
|
| 48 |
+
"$PY" scripts/stub_server.py --port 8001 --spec & DFLASH_PID=$!
|
| 49 |
+
# Kill both stub servers and reap them. Safe to run more than once: failures
# from already-dead PIDs are swallowed so the handler never fails the script.
cleanup() {
  kill "$BASE_PID" "$DFLASH_PID" 2>/dev/null || true
  wait "$BASE_PID" "$DFLASH_PID" 2>/dev/null || true
}
# EXIT performs the cleanup. INT/TERM must *exit* (which then fires the EXIT
# trap): a handler that merely returns lets the script resume at the next stage
# after Ctrl-C, with the stubs already dead.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM
|
| 54 |
+
|
| 55 |
+
# Wait for both ports to accept connections (no shell sleep — poll in python).
|
| 56 |
+
"$PY" - <<'PY'
|
| 57 |
+
import socket, time, sys
|
| 58 |
+
for port in (8000, 8001):
|
| 59 |
+
for _ in range(100):
|
| 60 |
+
with socket.socket() as s:
|
| 61 |
+
if s.connect_ex(("127.0.0.1", port)) == 0:
|
| 62 |
+
break
|
| 63 |
+
time.sleep(0.05)
|
| 64 |
+
else:
|
| 65 |
+
sys.exit(f"[rehearse] stub on {port} never came up")
|
| 66 |
+
print("[rehearse] both stubs ready (baseline :8000, dflash :8001)")
|
| 67 |
+
PY
|
| 68 |
+
|
| 69 |
+
# Track per-stage outcome but keep going so the banner always has the numbers.
|
| 70 |
+
# check_results (the schema gate) is the hard PASS/FAIL.
|
| 71 |
+
STAGE_FAILS=0
|
| 72 |
+
# stage "<name>" <cmd...>
# Print a banner, run <cmd...>, and report OK/FAILED. A failing command only
# increments STAGE_FAILS; stage itself always returns 0 so `set -e` does not
# abort the rehearsal part-way.
stage() {
  local label="$1"
  shift
  local rule="==================================================================="
  printf '\n%s\n[rehearse] STAGE: %s\n%s\n' "$rule" "$label" "$rule"
  if ! "$@"; then
    echo "[rehearse] STAGE FAILED: $label"
    STAGE_FAILS=$((STAGE_FAILS + 1))
    return 0
  fi
  echo "[rehearse] STAGE OK: $label"
}
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# 1. Measurement: tokens/sec, TTFT, tau — baseline (:8000) and dflash (:8001).
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
stage "measure baseline (:8000)" \
|
| 90 |
+
"$PY" bench/measure.py --base-url "$BASE_URL" --model laguna --label baseline --n 5 --out results/baseline.json
|
| 91 |
+
stage "measure dflash (:8001)" \
|
| 92 |
+
"$PY" bench/measure.py --base-url "$DFLASH_URL" --model laguna --label dflash --n 5 --out results/dflash.json
|
| 93 |
+
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
# 2. Lossless proof: greedy parity across the two endpoints + a pass@1 dry-run.
|
| 96 |
+
# --no-exec keeps the dry-run from executing model code locally (it just
|
| 97 |
+
# confirms the quality harness drives the endpoint end-to-end).
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
stage "greedy parity (lossless: baseline vs dflash)" \
|
| 100 |
+
"$PY" evals/humaneval_subset.py --parity --base-url "$BASE_URL" --base-url-b "$DFLASH_URL" --model laguna --n 3
|
| 101 |
+
stage "humaneval pass@1 dry-run (--no-exec)" \
|
| 102 |
+
"$PY" evals/humaneval_subset.py --base-url "$BASE_URL" --model laguna --n 3 --no-exec
|
| 103 |
+
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
# 3. The verifiers RL eval loop (the COMBINED half): reward over rollouts.
|
| 106 |
+
# Local stub returns a canned body so the real tests score 0.0 — expected;
|
| 107 |
+
# the point is the loop runs end-to-end. Real reward comes from Laguna.
|
| 108 |
+
# ---------------------------------------------------------------------------
|
| 109 |
+
stage "RL eval loop (spec_rl) against :8000" \
|
| 110 |
+
"$PY" scripts/eval_local.py --base-url "$BASE_URL" --model laguna --n 3 --out results/eval_local.json
|
| 111 |
+
|
| 112 |
+
# ---------------------------------------------------------------------------
|
| 113 |
+
# 4. Rollout-throughput benchmark + $/run projection, and an in-run lossless
|
| 114 |
+
# guard (two greedy runs of the dflash endpoint must be byte-identical).
|
| 115 |
+
# ---------------------------------------------------------------------------
|
| 116 |
+
stage "rollout assert-parity (dflash endpoint deterministic)" \
|
| 117 |
+
"$PY" bench/rollout_bench.py --base-url "$DFLASH_URL" --model laguna --label dflash \
|
| 118 |
+
--prompts 3 --max-tokens 64 --assert-parity
|
| 119 |
+
stage "rollout bench baseline (:8000)" \
|
| 120 |
+
"$PY" bench/rollout_bench.py --base-url "$BASE_URL" --model laguna --label baseline \
|
| 121 |
+
--prompts 3 --rollouts-per-example 2 --max-tokens 64 --hourly-rate 3.50 \
|
| 122 |
+
--out results/rollout_baseline.json
|
| 123 |
+
# Feed the baseline aggregate tokens/sec into the dflash run so the $/run-saved
|
| 124 |
+
# projection (the dollars half of the thesis) is exercised end-to-end too.
|
| 125 |
+
BASELINE_TPS="$("$PY" -c 'import json;print(json.load(open("results/rollout_baseline.json")).get("tokens_per_s_aggregate") or 0)' 2>/dev/null || echo 0)"
|
| 126 |
+
stage "rollout bench dflash (:8001, vs baseline)" \
|
| 127 |
+
"$PY" bench/rollout_bench.py --base-url "$DFLASH_URL" --model laguna --label dflash \
|
| 128 |
+
--prompts 3 --rollouts-per-example 2 --max-tokens 64 --hourly-rate 3.50 \
|
| 129 |
+
--baseline-tps "$BASELINE_TPS" \
|
| 130 |
+
--out results/rollout_dflash.json
|
| 131 |
+
|
| 132 |
+
# ---------------------------------------------------------------------------
|
| 133 |
+
# 5. Schema gate — the hard PASS/FAIL on the demo's money-slide JSON.
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
# stage() always returns 0 (a failure only bumps STAGE_FAILS), so the previous
# `stage ... || CHECK_RC=$?` could never record a failure and the banner's
# "schema gate" field always read OK. Capture the checker's own exit status
# inside the staged command instead; stage still counts the failure as before.
CHECK_RC=0
check_schema() {
  "$PY" scripts/check_results.py results/dflash.json results/baseline.json || CHECK_RC=$?
  return "$CHECK_RC"
}
stage "check results schema" check_schema
|
| 138 |
+
|
| 139 |
+
# ---------------------------------------------------------------------------
|
| 140 |
+
# Banner — pull the headline numbers straight out of the result JSON.
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
SUMMARY="$("$PY" - "$STAGE_FAILS" <<'PY'
|
| 143 |
+
import json, sys
|
| 144 |
+
|
| 145 |
+
stage_fails = int(sys.argv[1])
|
| 146 |
+
|
| 147 |
+
def load(path):
|
| 148 |
+
try:
|
| 149 |
+
with open(path) as f:
|
| 150 |
+
return json.load(f)
|
| 151 |
+
except Exception:
|
| 152 |
+
return {}
|
| 153 |
+
|
| 154 |
+
base = load("results/baseline.json")
|
| 155 |
+
dflash = load("results/dflash.json")
|
| 156 |
+
evl = load("results/eval_local.json")
|
| 157 |
+
rb = load("results/rollout_dflash.json")
|
| 158 |
+
|
| 159 |
+
base_tps = base.get("tokens_per_s_mean")
|
| 160 |
+
dflash_tps = dflash.get("tokens_per_s_mean")
|
| 161 |
+
tau = dflash.get("acceptance_length_tau")
|
| 162 |
+
speedup = (dflash_tps / base_tps) if (base_tps and dflash_tps) else None
|
| 163 |
+
reward = evl.get("mean_reward")
|
| 164 |
+
cost = (rb.get("cost_projection") or {})
|
| 165 |
+
savings = cost.get("projected_savings_usd")
|
| 166 |
+
|
| 167 |
+
def f(x, nd=2, suffix=""):
|
| 168 |
+
return f"{x:.{nd}f}{suffix}" if isinstance(x, (int, float)) else "n/a"
|
| 169 |
+
|
| 170 |
+
# Lossless verdict: the parity stage prints lossless:true; here we assert the
|
| 171 |
+
# proxy that makes that true on a stub — both endpoints serve identical greedy
|
| 172 |
+
# text (same canned completion), so tau is the only thing that should move.
|
| 173 |
+
lossless = "YES" if stage_fails == 0 else "see stage log"
|
| 174 |
+
|
| 175 |
+
print("LOSSLESS|" + lossless)
|
| 176 |
+
print("TAU|" + f(tau, 2))
|
| 177 |
+
print("BASE_TPS|" + f(base_tps, 1))
|
| 178 |
+
print("DFLASH_TPS|" + f(dflash_tps, 1))
|
| 179 |
+
print("SPEEDUP|" + (f(speedup, 2, "x") if speedup else "n/a"))
|
| 180 |
+
print("REWARD|" + f(reward, 3))
|
| 181 |
+
print("SAVINGS|" + (f(savings, 4, " USD/batch") if savings is not None else "n/a"))
|
| 182 |
+
PY
|
| 183 |
+
)"
|
| 184 |
+
|
| 185 |
+
# get KEY — print the value part of the "KEY|value" line in $SUMMARY.
get() {
  local key="$1"
  echo "$SUMMARY" | grep "^${key}|" | cut -d'|' -f2-
}
|
| 186 |
+
|
| 187 |
+
echo
|
| 188 |
+
echo "==================================================================="
|
| 189 |
+
if [[ "$STAGE_FAILS" -eq 0 && "$CHECK_RC" -eq 0 ]]; then
|
| 190 |
+
VERDICT="PASS"
|
| 191 |
+
else
|
| 192 |
+
VERDICT="FAIL"
|
| 193 |
+
fi
|
| 194 |
+
echo " DRESS REHEARSAL: $VERDICT (offline, local stubs, no credits/GPU)"
|
| 195 |
+
echo "-------------------------------------------------------------------"
|
| 196 |
+
echo " lossless (greedy parity) : $(get LOSSLESS)"
|
| 197 |
+
echo " acceptance length tau : $(get TAU) (dflash stub; MEASURE on real Laguna)"
|
| 198 |
+
echo " tokens/sec baseline : $(get BASE_TPS)"
|
| 199 |
+
echo " tokens/sec dflash : $(get DFLASH_TPS)"
|
| 200 |
+
echo " throughput speedup : $(get SPEEDUP) (stub = wall-clock noise; real win is on Laguna)"
|
| 201 |
+
echo " RL eval mean reward : $(get REWARD) (stub canned output -> 0.0 expected)"
|
| 202 |
+
echo " projected rollout saving : $(get SAVINGS)"
|
| 203 |
+
echo "-------------------------------------------------------------------"
|
| 204 |
+
echo " stages failed: $STAGE_FAILS schema gate: $([[ $CHECK_RC -eq 0 ]] && echo OK || echo FAIL)"
|
| 205 |
+
echo " NOTE: stub numbers are shape-only. At the venue, re-run with"
|
| 206 |
+
echo " --base-url pointed at real Laguna for the real table."
|
| 207 |
+
echo "==================================================================="
|
| 208 |
+
|
| 209 |
+
# Exit nonzero if any stage or the schema gate failed (so CI / a venue dry-run
|
| 210 |
+
# fails loudly rather than silently shipping a broken harness).
|
| 211 |
+
if [[ "$STAGE_FAILS" -ne 0 || "$CHECK_RC" -ne 0 ]]; then
|
| 212 |
+
exit 1
|
| 213 |
+
fi
|
scripts/eval_local.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
eval_local.py — run the spec_rl RL-eval loop OFFLINE against the local stub.
|
| 4 |
+
|
| 5 |
+
Purpose
|
| 6 |
+
-------
|
| 7 |
+
Prove the *shape* of the RL evaluation loop with NO Prime Intellect credits and
|
| 8 |
+
NO GPU: drive the spec_rl HumanEval code task's rollouts against the local,
|
| 9 |
+
stdlib OpenAI-compatible stub (scripts/stub_server.py) and compute the SAME
|
| 10 |
+
reward the verifiers environment computes (`@vf.reward code_reward`) — run the
|
| 11 |
+
model's candidate code against the problem's unit tests and return the FRACTION
|
| 12 |
+
of assertions that pass (dense RL signal; the pass@1 eval stays binary).
|
| 13 |
+
|
| 14 |
+
At the venue the same loop points at the DFlash-speculated vLLM endpoint instead
|
| 15 |
+
of the stub. Because greedy speculative decoding is lossless, the reward curve is
|
| 16 |
+
identical; only the cost per rollout drops. This script lets us validate the loop
|
| 17 |
+
end-to-end before any credits are spent.
|
| 18 |
+
|
| 19 |
+
Reward logic is NOT reimplemented here — it is imported verbatim from
|
| 20 |
+
`environments/spec_rl/spec_rl.py` (`fraction_passing`, `passes`, `STOP`,
|
| 21 |
+
`load_problems`), so what runs locally is byte-identical to what the verifiers
|
| 22 |
+
env scores at the venue.
|
| 23 |
+
|
| 24 |
+
Two execution paths (auto-selected, reported in the output)
|
| 25 |
+
-----------------------------------------------------------
|
| 26 |
+
1. "verifiers" — if `verifiers` imports AND `spec_rl.load_environment()`
|
| 27 |
+
constructs cleanly AND the endpoint exposes /v1/chat/completions, drive the
|
| 28 |
+
real `vf.SingleTurnEnv.evaluate(...)`. This is the true RL-eval API.
|
| 29 |
+
2. "manual" — otherwise, a minimal hand-rolled rollout loop: build the same
|
| 30 |
+
chat prompt, call the endpoint, trim at STOP, score with spec_rl.passes.
|
| 31 |
+
This is the path that actually runs against the canned-completion stub
|
| 32 |
+
(which serves only /v1/completions), and it is reported as such.
|
| 33 |
+
|
| 34 |
+
Note on the stub: it returns a fixed canned completion for EVERY prompt, so the
|
| 35 |
+
real HumanEval tests will almost always fail (reward 0.0). That is expected and
|
| 36 |
+
correct — the point here is to prove the loop runs end-to-end offline without
|
| 37 |
+
erroring, not to produce a real pass@1. Real rewards come from Laguna at the venue.
|
| 38 |
+
|
| 39 |
+
SAFETY: scoring executes model-generated code in a timed subprocess (see
|
| 40 |
+
spec_rl.passes). Locally the "code" is the stub's harmless canned snippet. Run RL
|
| 41 |
+
rollouts only in the disposable venue sandbox, never against real data.
|
| 42 |
+
|
| 43 |
+
Usage
|
| 44 |
+
-----
|
| 45 |
+
# start a stub first: make stub (baseline, :8000)
|
| 46 |
+
# or: make stub-b (dflash, :8001)
|
| 47 |
+
python scripts/eval_local.py --base-url http://localhost:8000 --model laguna --n 5
|
| 48 |
+
"""
|
| 49 |
+
from __future__ import annotations
|
| 50 |
+
|
| 51 |
+
import argparse
|
| 52 |
+
import json
|
| 53 |
+
import os
|
| 54 |
+
import sys
|
| 55 |
+
import urllib.error
|
| 56 |
+
import urllib.request
|
| 57 |
+
from pathlib import Path
|
| 58 |
+
|
| 59 |
+
# ---------------------------------------------------------------------------
# Import the spec_rl env module so reward logic is shared, not duplicated.
# spec_rl.py may live in either of two places, tried in this order:
#   1. the sibling tree  <parent-of-repo>/environments/spec_rl/  (original layout)
#   2. the in-repo copy  <repo>/spec_rl/
# The first directory that actually contains spec_rl.py is put on sys.path; if
# neither does, the original sibling path is used so the resulting ImportError
# is unchanged. spec_rl is import-safe even when `verifiers` is absent (its vf
# import is guarded), so this works on the Mac with no GPU and no verifiers.
# ---------------------------------------------------------------------------
_HERE = Path(__file__).resolve()
_REPO = _HERE.parents[1]  # .../laguna-hack
_GPU_HW = _HERE.parents[2]  # .../gpu_and_inference_hw
_SPEC_RL_CANDIDATES = (
    _GPU_HW / "environments" / "spec_rl",
    _REPO / "spec_rl",
)
_SPEC_RL_DIR = next(
    (d for d in _SPEC_RL_CANDIDATES if (d / "spec_rl.py").is_file()),
    _SPEC_RL_CANDIDATES[0],
)
if str(_SPEC_RL_DIR) not in sys.path:
    sys.path.insert(0, str(_SPEC_RL_DIR))

import spec_rl  # noqa: E402 — shared reward core (passes, STOP, load_problems, ...)
|
| 73 |
+
|
| 74 |
+
DEFAULT_OUT = _REPO / "results" / "eval_local.json"
|
| 75 |
+
|
| 76 |
+
# System prompt mirrors spec_rl.load_environment so the manual loop sends the
|
| 77 |
+
# exact same instruction the verifiers env would.
|
| 78 |
+
SYSTEM_PROMPT = (
|
| 79 |
+
"You are an expert Python programmer. You will be given a function "
|
| 80 |
+
"signature and docstring. Complete the function body only. Do not repeat "
|
| 81 |
+
"the signature, do not add explanations, and do not wrap the code in "
|
| 82 |
+
"markdown fences. Output only the indented function body."
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# Endpoint helpers (stdlib urllib only — matches the rest of the harness).
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
def _post_json(url: str, payload: dict, timeout: int = 600) -> dict:
    """POST *payload* as a JSON body to *url* and return the decoded JSON reply.

    Errors from ``urllib.request.urlopen`` and from JSON decoding propagate to
    the caller unchanged.
    """
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    return json.loads(raw.decode())
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _endpoint_has_chat(base_url: str) -> bool:
    """Report whether *base_url* serves /v1/chat/completions.

    Sends a one-token probe with a 10 s timeout. A successful reply, or any
    HTTP error other than 404, counts as "route exists"; a 404 or any other
    failure (connection refused, timeout, bad JSON, ...) counts as "no chat".
    """
    chat_url = base_url.rstrip("/") + "/v1/chat/completions"
    ping = dict(
        model="probe",
        messages=[{"role": "user", "content": "ping"}],
        max_tokens=1,
        temperature=0.0,
    )
    try:
        _post_json(chat_url, ping, timeout=10)
    except urllib.error.HTTPError as err:
        # The server answered: only a 404 says the route itself is absent.
        return err.code != 404
    except Exception:
        return False
    return True
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
def complete_chat(base_url: str, model: str, user_content: str, max_tokens: int) -> str:
    """Request one greedy (temperature 0) chat completion and return its text.

    The request carries SYSTEM_PROMPT as the system message, *user_content* as
    the user message, and spec_rl.STOP as stop sequences. An empty or null
    message content is returned as "".
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    body = {
        "model": model,
        "messages": messages,
        "max_tokens": max_tokens,
        # greedy => deterministic => lossless-comparable
        "temperature": 0.0,
        "stop": spec_rl.STOP,
    }
    reply = _post_json(base_url.rstrip("/") + "/v1/chat/completions", body)
    content = reply["choices"][0]["message"]["content"]
    return content or ""
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def complete_text(base_url: str, model: str, prompt: str, max_tokens: int) -> str:
    """Request one greedy (temperature 0) text completion and return its text.

    Posts *prompt* to /v1/completions with spec_rl.STOP as stop sequences. An
    empty or null completion text is returned as "".
    """
    body = {
        "model": model,
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": 0.0,
        "stop": spec_rl.STOP,
    }
    reply = _post_json(base_url.rstrip("/") + "/v1/completions", body)
    generated = reply["choices"][0]["text"]
    return generated or ""
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def _trim_at_stop(text: str) -> str:
    """Truncate *text* before any spec_rl.STOP sequence it contains.

    The stop sequences are applied one after another, each to the text left by
    the previous cut, so the result ends before the earliest surviving match.
    """
    for marker in spec_rl.STOP:
        cut = text.find(marker)
        text = text if cut == -1 else text[:cut]
    return text
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# ---------------------------------------------------------------------------
|
| 159 |
+
# Path 1 — drive the real verifiers env, if (and only if) it constructs cleanly
|
| 160 |
+
# AND the endpoint speaks chat. Returns a results dict, or None to fall back.
|
| 161 |
+
# ---------------------------------------------------------------------------
|
| 162 |
+
def try_verifiers(base_url: str, model: str, n: int) -> dict | None:
    """Run the eval through the real verifiers environment when that is possible.

    Args:
        base_url: Root URL of the OpenAI-compatible endpoint (without ``/v1``).
        model: Model name passed to the endpoint.
        n: Number of examples to load and evaluate.

    Returns:
        ``{"driver": "verifiers", "mean_reward": float, "per_example": [...]}``,
        or ``None`` when the caller should fall back to the manual rollout
        loop: ``verifiers`` or ``openai`` cannot be imported, the environment
        does not construct, the endpoint has no chat route, or the evaluation
        output yields no rewards.
    """
    try:
        import verifiers as vf  # noqa: F401 — availability probe only
    except Exception:
        return None
    # load_environment() builds a vf.SingleTurnEnv. In some verifiers versions
    # the symbols spec_rl references (e.g. vf.Dataset) may not exist; guard the
    # whole construction so a mismatch falls back to the manual loop instead of
    # crashing the eval.
    try:
        env = spec_rl.load_environment(num_examples=n)
    except Exception as e:  # AttributeError/ImportError/etc. -> manual fallback
        print(f"[eval_local] verifiers env did not construct ({type(e).__name__}: {e});"
              " falling back to manual rollout loop.")
        return None
    if not _endpoint_has_chat(base_url):
        print("[eval_local] endpoint has no /v1/chat/completions (the local stub "
              "serves only /v1/completions); using manual rollout loop instead.")
        return None
    try:
        from openai import OpenAI  # type: ignore
    except Exception:
        print("[eval_local] openai client not available; using manual rollout loop.")
        return None

    client = OpenAI(base_url=base_url.rstrip("/") + "/v1", api_key="EMPTY")
    out = env.evaluate(client=client, model=model, num_examples=n, save_results=False)

    # Normalize verifiers' GenerateOutputs into our flat per-example shape.
    rewards = list(getattr(out, "reward", []) or [])
    if not rewards:
        # Nothing recoverable from the output object. Reporting that as
        # mean_reward 0.0 would be indistinguishable from a genuine zero score,
        # so hand over to the manual loop instead.
        # NOTE(review): presumably a verifiers version whose output exposes
        # rewards under a different attribute — confirm against the installed API.
        print("[eval_local] verifiers evaluate() returned no rewards; "
              "using manual rollout loop.")
        return None
    completions = list(getattr(out, "completion", []) or [])
    infos = list(getattr(out, "info", []) or [])
    per_example = []
    for i, r in enumerate(rewards):
        # Tolerate info/completion lists shorter than the reward list.
        info = (infos[i] if i < len(infos) else None) or {}
        per_example.append({
            "index": i,
            "task_id": info.get("task_id", f"example_{i}"),
            "score": float(r),
            "completion": completions[i] if i < len(completions) else "",
        })
    mean = sum(p["score"] for p in per_example) / len(per_example)
    return {"driver": "verifiers", "mean_reward": mean, "per_example": per_example}
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# ---------------------------------------------------------------------------
|
| 208 |
+
# Path 2 — manual rollout loop (the offline / stub path). Reuses spec_rl.passes
|
| 209 |
+
# and spec_rl.STOP so the reward is identical to the env's @vf.reward.
|
| 210 |
+
# ---------------------------------------------------------------------------
|
| 211 |
+
def manual_rollouts(base_url: str, model: str, n: int, max_tokens: int) -> dict:
    """Generate and score ``n`` problems by hand, without the verifiers env.

    The transport is chosen once: chat when ``_endpoint_has_chat`` says the
    endpoint supports it, plain completions otherwise. Every raw completion is
    cut with ``_trim_at_stop`` and scored with ``spec_rl.fraction_passing``.

    Returns:
        A dict with ``driver`` ("manual"), ``transport``, ``mean_reward`` and
        ``per_example`` (index, task_id, score, trimmed completion per problem).
    """
    problems = spec_rl.load_problems(n)
    use_chat = _endpoint_has_chat(base_url)
    transport = "chat" if use_chat else "completions"
    total = len(problems)
    print(f"[eval_local] manual loop: {total} examples via /v1/{transport} "
          f"at {base_url} (model={model})")

    # On the completions (stub) path the server ignores the prompt and returns
    # a canned body, so the bare code prompt is sent either way.
    generate = complete_chat if use_chat else complete_text

    records = []
    for position, prob in enumerate(problems, start=1):
        completion = _trim_at_stop(generate(base_url, model, prob["prompt"], max_tokens))

        # The scored problem is rebuilt from the dataset's own fields; nothing
        # echoed back by the model is trusted.
        scored_problem = {field: prob[field] for field in ("prompt", "test", "entry_point")}
        reward = spec_rl.fraction_passing(scored_problem, completion)

        records.append({
            "index": position - 1,
            "task_id": prob["task_id"],
            "score": reward,
            "completion": completion,
        })
        print(f" [{position}/{total}] {prob['task_id']}: "
              f"reward={reward:.3f}")

    if records:
        mean = sum(rec["score"] for rec in records) / len(records)
    else:
        mean = 0.0
    return {
        "driver": "manual",
        "transport": transport,
        "mean_reward": mean,
        "per_example": records,
    }
|
| 253 |
+
|
| 254 |
+
|
| 255 |
+
def main() -> int:
    """CLI entry point: run the eval, print a summary and write it as JSON.

    The verifiers path is tried first unless ``--force-manual`` is given; when
    it yields nothing the manual rollout loop is used. Always returns 0.
    """
    parser = argparse.ArgumentParser(
        description="Run the spec_rl RL-eval loop offline against the local stub "
                    "(or any OpenAI-compatible endpoint) and compute the reward."
    )
    parser.add_argument("--base-url", default="http://localhost:8000",
                        help="OpenAI-compatible endpoint (stub :8000 / dflash stub :8001 / vLLM).")
    parser.add_argument("--model", default="laguna")
    parser.add_argument("--n", type=int, default=5,
                        help="Number of HumanEval problems (rollouts).")
    parser.add_argument("--max-tokens", type=int, default=512)
    parser.add_argument("--out", default=str(DEFAULT_OUT),
                        help="Where to write the small JSON summary.")
    parser.add_argument("--force-manual", action="store_true",
                        help="Skip the verifiers path; always use the manual rollout loop.")
    args = parser.parse_args()

    result = None if args.force_manual else try_verifiers(args.base_url, args.model, args.n)
    if result is None:
        result = manual_rollouts(args.base_url, args.model, args.n, args.max_tokens)

    rows = result["per_example"]
    summary = {
        "base_url": args.base_url,
        "model": args.model,
        "n": len(rows),
        "driver": result["driver"],
        # Only the manual driver records a transport; the verifiers path
        # requires a chat endpoint, hence the default.
        "transport": result.get("transport", "chat"),
        "mean_reward": result["mean_reward"],
        "scores": [row["score"] for row in rows],
        "per_example": [{"task_id": row["task_id"], "score": row["score"]} for row in rows],
    }

    # The console gets everything except the per-example rows.
    headline = {key: value for key, value in summary.items() if key != "per_example"}
    print(json.dumps(headline, indent=2))
    print(f"[eval_local] driver={summary['driver']} "
          f"mean_reward={summary['mean_reward']:.3f} n={summary['n']}")

    destination = Path(args.out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(json.dumps(summary, indent=2))
    print(f"[eval_local] wrote {destination}")
    return 0
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
if __name__ == "__main__":
|
| 305 |
+
raise SystemExit(main())
|
scripts/fill_submission.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""fill_submission.py — turn measured results into ready-to-paste submission numbers.
|
| 3 |
+
|
| 4 |
+
Reads the before/after benchmark JSONs (and, if given, the HumanEval/parity JSON),
|
| 5 |
+
computes the headline figures (speedup, tau, TTFT delta, pass@1, parity verdict),
|
| 6 |
+
and PRINTS:
|
| 7 |
+
* a warning if the data is STILL STUB (shape-only) — so you never submit fake numbers,
|
| 8 |
+
* the values to drop into MODEL_CARD.md / RESULTS.html,
|
| 9 |
+
* a filled one-line claim for the demo.
|
| 10 |
+
|
| 11 |
+
It does NOT edit files — paste the numbers yourself, so nothing is silently overwritten.
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
python scripts/fill_submission.py \
|
| 15 |
+
--baseline results/baseline.json --dflash results/dflash.json \
|
| 16 |
+
[--humaneval results/humaneval_dflash.json]
|
| 17 |
+
"""
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import argparse
|
| 21 |
+
import json
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from typing import Any
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _load(path: str) -> dict[str, Any]:
|
| 27 |
+
return json.loads(Path(path).read_text())
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _looks_stub(obj: dict[str, Any]) -> bool:
|
| 31 |
+
"""Heuristic: the dress-rehearsal stub stamps a tell-tale completion string."""
|
| 32 |
+
for r in obj.get("runs", []) or []:
|
| 33 |
+
if "stub completion" in str(r.get("text", "")).lower():
|
| 34 |
+
return True
|
| 35 |
+
return obj.get("base_url", "").endswith((":8000", ":8001")) and bool(
|
| 36 |
+
[r for r in obj.get("runs", []) or [] if "stub" in str(r.get("text", "")).lower()]
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _g(obj: dict[str, Any], *keys: str, default: Any = None) -> Any:
|
| 41 |
+
for k in keys:
|
| 42 |
+
if k in obj:
|
| 43 |
+
return obj[k]
|
| 44 |
+
return default
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def _as_float(value: Any) -> float:
    """Coerce a JSON scalar to float; null or non-numeric values become 0.0."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return 0.0


def main() -> int:
    """Print the headline numbers and a one-line claim from the result JSONs.

    Returns:
        3 when a baseline/dflash results file is missing, 2 when the data looks
        like stub (dress-rehearsal) output, 0 otherwise. Nothing is written to
        disk; everything is printed.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--baseline", default="results/baseline.json")
    ap.add_argument("--dflash", default="results/dflash.json")
    ap.add_argument("--humaneval", default=None,
                    help="optional pass@1 / parity JSON from humaneval_subset.py")
    args = ap.parse_args()

    for p in (args.baseline, args.dflash):
        if not Path(p).exists():
            print(f"no results yet at {p} — run the A/B (scripts/hf_job_ab.py) or 'make rehearse' first.")
            return 3

    base = _load(args.baseline)
    dfl = _load(args.dflash)

    stub = _looks_stub(base) or _looks_stub(dfl)
    if stub:
        print("=" * 64)
        print(" ⚠️ STUB DATA DETECTED — do NOT submit these numbers.")
        print(" These are shape-only dress-rehearsal results. Re-run measure.py")
        print(" against the real Laguna+DFlash vLLM endpoint, then re-run this.")
        print("=" * 64)

    # A metric stored as JSON null is treated as "not measured" (0.0) instead
    # of crashing float().
    b_tps = _as_float(_g(base, "tokens_per_s_mean", default=0.0))
    d_tps = _as_float(_g(dfl, "tokens_per_s_mean", default=0.0))
    b_ttft = _as_float(_g(base, "ttft_s_mean", default=0.0)) * 1000  # ms
    d_ttft = _as_float(_g(dfl, "ttft_s_mean", default=0.0)) * 1000  # ms
    tau = _g(dfl, "acceptance_length_tau")
    speedup = (d_tps / b_tps) if b_tps else 0.0

    # optional quality / parity
    pass1 = parity = lossless = None
    if args.humaneval and Path(args.humaneval).exists():
        he = _load(args.humaneval)
        pass1 = _g(he, "pass_at_1", "pass@1", "pass1")
        lossless = _g(he, "lossless")
        parity = _g(he, "mismatches", "token_mismatches")
    verified_lossless = lossless is True or parity == 0

    def fmt(x, nd=1, suffix=""):
        return f"{x:.{nd}f}{suffix}" if isinstance(x, (int, float)) else "—"

    if verified_lossless:
        parity_text = "LOSSLESS ✓ (0 mismatches)"
    elif parity is not None:
        parity_text = f"{parity} mismatches ⚠️"
    else:
        parity_text = "— (run --parity)"

    speedup_text = fmt(speedup, 2, "x")
    tau_text = fmt(tau, 2) if tau is not None else "— (read from /metrics)"
    pass1_text = pass1 if pass1 is not None else "— (run humaneval_subset.py)"

    print("\n--- HEADLINE (paste into MODEL_CARD.md + RESULTS.html) ---")
    print(f" baseline tokens/sec : {fmt(b_tps)}")
    print(f" dflash tokens/sec : {fmt(d_tps)}")
    print(f" speedup : {speedup_text}")
    print(f" acceptance length tau: {tau_text}")
    print(f" TTFT baseline / dflash (ms): {fmt(b_ttft)} / {fmt(d_ttft)} (expect ~equal)")
    print(f" HumanEval pass@1 : {pass1_text}")
    print(" greedy parity : " + parity_text)

    print("\n--- ONE-LINE CLAIM (demo opener) ---")
    if b_tps and d_tps:
        tau_clause = f", tau={fmt(tau, 2)}" if tau is not None else ""
        # "byte-identical output" is asserted only when the parity data proves
        # it; an unverified or failed parity check must not appear as a claim.
        parity_clause = " with byte-identical output" if verified_lossless else ""
        print(f' "Lean Laguna: DFlash makes Laguna XS.2 generate {speedup_text} faster '
              f'on one GPU ({fmt(b_tps)} -> {fmt(d_tps)} tok/s{tau_clause})'
              f'{parity_clause}."')
        if not verified_lossless:
            print(" (parity not verified — 'byte-identical output' left out of the claim)")
    else:
        print(" (fill once real tokens/sec are present)")

    if stub:
        print("\n[fill_submission] refusing to call this submittable: STUB data.")
        return 2
    return 0
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
if __name__ == "__main__":
|
| 116 |
+
raise SystemExit(main())
|
scripts/gen_local.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
gen_local.py — TINY-model generation on Apple Silicon (MPS), purely to validate
|
| 4 |
+
the PIPELINE SHAPE before the venue. This does NOT run Laguna and does NOT do
|
| 5 |
+
speculative decoding — it proves the measure-generate-report loop works so the
|
| 6 |
+
same harness can be pointed at the real model on Prime Intellect.
|
| 7 |
+
|
| 8 |
+
What it measures (the same two numbers we care about at the venue):
|
| 9 |
+
- TTFT (time to first token): wall-clock from submit to the first new token.
|
| 10 |
+
- tokens/sec (decode throughput): generated tokens / (total - TTFT).
|
| 11 |
+
|
| 12 |
+
JVM analogy: think of this as a JUnit smoke test against an in-memory stub —
|
| 13 |
+
it asserts the wiring is correct so the integration run against the real
|
| 14 |
+
service (vLLM + Laguna on CUDA) can't fail on plumbing.
|
| 15 |
+
|
| 16 |
+
Usage (Mac):
|
| 17 |
+
uv run python scripts/gen_local.py --model sshleifer/tiny-gpt2 --max-new-tokens 64
|
| 18 |
+
uv run python scripts/gen_local.py --model gpt2 --prompt "def quicksort(arr):"
|
| 19 |
+
|
| 20 |
+
At the venue you'd point --model at a small HF model first, then (on GPU) at
|
| 21 |
+
Laguna itself for a sanity generation BEFORE wiring up vLLM serving.
|
| 22 |
+
"""
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import argparse
|
| 26 |
+
import time
|
| 27 |
+
|
| 28 |
+
import torch
|
| 29 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def pick_device() -> str:
    """Return the torch device name to run on: "cuda", else "mps", else "cpu".

    CUDA is preferred when available; the MPS backend is consulted only when
    CUDA is not.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "mps" if torch.backends.mps.is_available() else "cpu"
    return device
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _sync_device(device: str) -> None:
    """Wait for queued accelerator work on ``device`` before a timer is read.

    Both MPS and CUDA run kernels asynchronously, so a timestamp taken without
    synchronizing can miss work that is still in flight. CPU needs nothing.
    """
    if device == "mps":
        torch.mps.synchronize()
    elif device == "cuda":
        torch.cuda.synchronize()


def main() -> None:
    """Load a small causal LM, generate greedily and print TTFT / decode speed.

    Three greedy generations are run on the same prompt: a 2-token warmup that
    is not timed, a 1-token run whose wall time is reported as TTFT, and the
    full ``--max-new-tokens`` run. Decode throughput is
    ``(new_tokens - 1) / (total - TTFT)``.
    """
    p = argparse.ArgumentParser(description="Tiny-model gen + TTFT/tokens-per-sec on MPS/CPU.")
    p.add_argument("--model", default="sshleifer/tiny-gpt2",
                   help="HF model id. Tiny by default; swap to gpt2 or (on GPU) Laguna.")
    p.add_argument("--prompt", default="def fibonacci(n):\n ",
                   help="Coding-style prompt (matches the hackathon track).")
    p.add_argument("--max-new-tokens", type=int, default=64)
    # NOTE: store_true with default=True means this flag is always True;
    # decoding below is unconditionally greedy (do_sample=False).
    p.add_argument("--greedy", action="store_true", default=True,
                   help="Greedy decode so output is deterministic (lossless baseline).")
    args = p.parse_args()

    device = pick_device()
    print(f"[gen_local] device={device} model={args.model}")

    tok = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForCausalLM.from_pretrained(args.model).to(device)
    model.eval()

    inputs = tok(args.prompt, return_tensors="pt").to(device)
    n_prompt = inputs["input_ids"].shape[1]

    def timed_generate(max_new_tokens: int):
        """Run one greedy generation and return (output ids, elapsed seconds)."""
        _sync_device(device)
        started = time.perf_counter()
        with torch.no_grad():
            generated = model.generate(**inputs, max_new_tokens=max_new_tokens,
                                       do_sample=False, pad_token_id=tok.eos_token_id)
        _sync_device(device)
        return generated, time.perf_counter() - started

    # Warmup: the first run triggers lazy kernel compilation on MPS; timing it
    # would fold that one-off cost into TTFT and distort tokens/sec.
    timed_generate(2)

    # TTFT: time the generation of exactly one new token (already warmed).
    _, ttft = timed_generate(1)

    # Full generation: time the whole thing, then derive decode tokens/sec.
    out, total = timed_generate(args.max_new_tokens)

    new_tokens = out.shape[1] - n_prompt
    # tokens/sec over the decode phase: exclude the first token (its time is TTFT).
    decode_time = max(total - ttft, 1e-9)
    tps = (new_tokens - 1) / decode_time if new_tokens > 1 else 0.0

    text = tok.decode(out[0][n_prompt:], skip_special_tokens=True)

    print("\n--- generation ---")
    print(text)
    print("\n--- metrics (PIPELINE-SHAPE ONLY; not Laguna numbers) ---")
    print(f"prompt_tokens : {n_prompt}")
    print(f"new_tokens : {new_tokens}")
    print(f"TTFT_s : {ttft:.4f}")
    print(f"total_s : {total:.4f}")
    print(f"decode_tokens_per_s: {tps:.2f}")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
if __name__ == "__main__":
|
| 110 |
+
main()
|
scripts/hf_job_ab.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# /// script
|
| 2 |
+
# requires-python = ">=3.10"
|
| 3 |
+
# dependencies = ["vllm>=0.21", "huggingface_hub>=0.25"]
|
| 4 |
+
# ///
|
| 5 |
+
"""hf_job_ab.py — the real Lean Laguna MIN A/B, as a self-contained HF Jobs run.
|
| 6 |
+
|
| 7 |
+
Runs ON Hugging Face Jobs (a GPU batch job, no ssh, auto-stops when done). It:
|
| 8 |
+
1. serves Laguna XS.2 baseline in vLLM, measures tokens/sec + TTFT over N prompts,
|
| 9 |
+
2. re-serves with the DFlash speculator (one --speculative-config), measures again + reads
|
| 10 |
+
acceptance length tau from /metrics,
|
| 11 |
+
3. greedy-parity-checks baseline vs DFlash outputs (must be byte-identical),
|
| 12 |
+
4. writes results/{baseline,dflash}.json + parity, and uploads them to an HF dataset repo
|
| 13 |
+
so the orchestrator can fetch them without ssh.
|
| 14 |
+
|
| 15 |
+
Submit with:
|
| 16 |
+
hf jobs uv run --flavor rtx-pro-6000 --timeout 1800 \
|
| 17 |
+
--secrets HF_TOKEN --env RESULTS_REPO=art87able/lean-laguna-results scripts/hf_job_ab.py
|
| 18 |
+
|
| 19 |
+
Everything is MEASURED — no fabricated numbers. A hard wall-clock budget bounds the spend.
|
| 20 |
+
"""
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import json
|
| 24 |
+
import os
|
| 25 |
+
import subprocess
|
| 26 |
+
import sys
|
| 27 |
+
import time
|
| 28 |
+
import urllib.request
|
| 29 |
+
|
| 30 |
+
MODEL = os.environ.get("MODEL", "poolside/Laguna-XS.2")
|
| 31 |
+
SPECULATOR = os.environ.get("SPECULATOR", "poolside/Laguna-XS.2-speculator.dflash")
|
| 32 |
+
GAMMA = int(os.environ.get("GAMMA", "7"))
|
| 33 |
+
N = int(os.environ.get("N", "0")) # 0 => use the full curated prompt set
|
| 34 |
+
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "256"))
|
| 35 |
+
BUDGET_S = int(os.environ.get("BUDGET_S", "1500")) # hard wall-clock cap (credit guard)
|
| 36 |
+
RESULTS_REPO = os.environ.get("RESULTS_REPO", "") # HF dataset repo to upload results to
|
| 37 |
+
PORT = 8000
|
| 38 |
+
STOP = ["\nclass ", "\ndef ", "\n#", "\nif __name__"]
|
| 39 |
+
T0 = time.time()
|
| 40 |
+
# A mixed-difficulty set so acceptance length tau is measured across EASY -> HARD, not just
|
| 41 |
+
# trivial canonical functions (which pin tau at the gamma+1 ceiling and over-state the win).
|
| 42 |
+
PROMPTS = [
|
| 43 |
+
# --- trivial canonical (high acceptance: the ceiling case) ---
|
| 44 |
+
"def fib(n):\n \"\"\"Return the n-th Fibonacci number.\"\"\"\n",
|
| 45 |
+
"def is_prime(n):\n \"\"\"Return True iff n is prime.\"\"\"\n",
|
| 46 |
+
"def factorial(n):\n \"\"\"Return n! (n factorial).\"\"\"\n",
|
| 47 |
+
"def reverse_words(s):\n \"\"\"Reverse the order of words in s.\"\"\"\n",
|
| 48 |
+
# --- medium ---
|
| 49 |
+
"def binary_search(arr, target):\n \"\"\"Return the index of target in sorted arr, else -1.\"\"\"\n",
|
| 50 |
+
"def merge_sorted(a, b):\n \"\"\"Merge two sorted lists into one sorted list.\"\"\"\n",
|
| 51 |
+
"def is_balanced(s):\n \"\"\"Return True iff the brackets ()[]{} in s are balanced.\"\"\"\n",
|
| 52 |
+
"def roman_to_int(s):\n \"\"\"Convert a Roman numeral string to an integer.\"\"\"\n",
|
| 53 |
+
"def flatten(nested):\n \"\"\"Flatten an arbitrarily nested list of ints into a flat list.\"\"\"\n",
|
| 54 |
+
# --- harder / branchy / rare-token (acceptance should drop here) ---
|
| 55 |
+
"def lcs(a, b):\n \"\"\"Return the length of the longest common subsequence of strings a and b.\"\"\"\n",
|
| 56 |
+
"def parse_duration(s):\n \"\"\"Parse strings like '1h30m', '45s', '2d' into total seconds. Raise ValueError on bad input.\"\"\"\n",
|
| 57 |
+
"def group_anagrams(words):\n \"\"\"Group words that are anagrams of each other into a list of lists.\"\"\"\n",
|
| 58 |
+
"class LRUCache:\n \"\"\"A fixed-capacity LRU cache with get(key) and put(key, value).\"\"\"\n",
|
| 59 |
+
"def dijkstra(graph, start):\n \"\"\"graph: dict node -> list of (neighbor, weight). Return dict of shortest distances from start.\"\"\"\n",
|
| 60 |
+
]
|
| 61 |
+
if N <= 0:
|
| 62 |
+
N = len(PROMPTS)
|
| 63 |
+
PROMPTS = (PROMPTS * ((N // len(PROMPTS)) + 1))[:N] # repeat only if a larger N is forced
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def budget_left() -> float:
    """Return the seconds still available out of BUDGET_S, counted from T0."""
    elapsed = time.time() - T0
    return BUDGET_S - elapsed
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def serve(dflash: bool) -> subprocess.Popen:
    """Start the vLLM OpenAI-compatible API server for MODEL as a subprocess.

    Args:
        dflash: When True, add a ``--speculative-config`` that points at
            SPECULATOR with GAMMA speculative tokens and method "dflash".

    Returns:
        The ``subprocess.Popen`` handle of the launched server.
    """
    # Author's notes on the overrides: Laguna is an unquantized bf16 MoE, and
    # the slim uv image ships only pip CUDA runtime wheels (no nvcc/toolkit).
    # vLLM/FlashInfer JIT-compile kernels on first use and die with
    # "Could not find nvcc", so every FlashInfer JIT path is switched off in
    # favour of prebuilt alternatives:
    #   - MoE       -> Triton fused-MoE
    #   - sampler   -> torch top-k/top-p
    #   - attention -> FLASH_ATTN (overridable through the environment)
    overrides = {
        "VLLM_USE_DEEP_GEMM": "0",
        "VLLM_USE_FLASHINFER_MOE_FP16": "0",
        "VLLM_USE_FLASHINFER_MOE_FP8": "0",
        "VLLM_USE_FLASHINFER_SAMPLER": "0",
        "VLLM_ATTENTION_BACKEND": os.environ.get("VLLM_ATTENTION_BACKEND", "FLASH_ATTN"),
    }
    env = dict(os.environ)
    env.update(overrides)

    cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--model", MODEL,
        "--port", str(PORT),
        "--tensor-parallel-size", "1",
        "--trust-remote-code",  # per the author, Laguna's custom MoE arch needs it
        "--enforce-eager",  # skips CUDA-graph capture for a leaner, faster start
        "--gpu-memory-utilization", "0.9",
        "--max-model-len", os.environ.get("SPECRL_MAX_LEN", "4096"),
    ]
    # Author's sizing note: the base model loads in bf16 at ~62 GiB, fits a
    # 96GB-class GPU, and earlier failures were the nvcc/JIT issue, not OOM.
    if dflash:
        speculative = {"model": SPECULATOR, "num_speculative_tokens": GAMMA, "method": "dflash"}
        cmd.extend(["--speculative-config", json.dumps(speculative)])

    print(f"[job] serving {'DFlash' if dflash else 'baseline'}: {' '.join(cmd)}", flush=True)
    return subprocess.Popen(cmd, env=env)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def wait_health(proc: subprocess.Popen, timeout: int = 900) -> None:
    """Block until the server's ``/health`` endpoint answers.

    Args:
        proc: Handle of the server process; polled so an early exit is noticed.
        timeout: Maximum number of seconds to keep probing.

    Raises:
        RuntimeError: The server process exited before becoming healthy.
        TimeoutError: No successful health probe within ``timeout`` seconds.
    """
    url = f"http://localhost:{PORT}/health"
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    started = time.monotonic()
    while time.monotonic() - started < timeout:
        if proc.poll() is not None:
            raise RuntimeError("vLLM server exited during startup (check logs above)")
        try:
            # The context manager closes the response so repeated probes do
            # not leave sockets open.
            with urllib.request.urlopen(url, timeout=5):
                pass
        except Exception:
            # Any failure (refused, timeout, HTTP error) just means
            # "not ready yet"; retry after a pause.
            time.sleep(5)
            continue
        print("[job] server healthy", flush=True)
        return
    raise TimeoutError("server did not become healthy in time")
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def _post(path: str, payload: dict) -> dict:
    """POST ``payload`` as JSON to ``path`` on the local server; return the decoded reply.

    The request goes to ``http://localhost:PORT`` with a 300-second timeout.
    """
    body = json.dumps(payload).encode()
    request = urllib.request.Request(
        f"http://localhost:{PORT}{path}",
        data=body,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=300) as response:
        raw = response.read()
    return json.loads(raw.decode())
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def complete(prompt: str) -> tuple[str, float, float]:
    """Request one greedy completion and time the whole round trip.

    Returns:
        ``(text, tokens_per_second, elapsed_seconds)``. The token count comes
        from the response's ``usage.completion_tokens``; when that is missing
        or zero, a whitespace word count of the text is used instead.
    """
    started = time.time()
    obj = _post("/v1/completions", {
        "model": MODEL,
        "prompt": prompt,
        "max_tokens": MAX_TOKENS,
        "temperature": 0.0,
        "stop": STOP,
    })
    elapsed = time.time() - started

    choice = obj["choices"][0]
    text = choice.get("text", "") or ""
    usage = obj.get("usage") or {}
    ntok = usage.get("completion_tokens") or len(text.split())
    rate = ntok / elapsed if elapsed else 0.0
    return text, rate, elapsed
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def tau_from_metrics() -> float | None:
|
| 136 |
+
try:
|
| 137 |
+
with urllib.request.urlopen(f"http://localhost:{PORT}/metrics", timeout=10) as r:
|
| 138 |
+
body = r.read().decode()
|
| 139 |
+
except Exception:
|
| 140 |
+
return None
|
| 141 |
+
acc = draft = None
|
| 142 |
+
for line in body.splitlines():
|
| 143 |
+
if line.startswith("vllm:spec_decode_num_accepted_tokens"):
|
| 144 |
+
acc = float(line.split()[-1])
|
| 145 |
+
elif line.startswith("vllm:spec_decode_num_draft_tokens"):
|
| 146 |
+
draft = float(line.split()[-1])
|
| 147 |
+
if acc is not None and draft and draft > 0:
|
| 148 |
+
passes = draft / GAMMA
|
| 149 |
+
return (acc + passes) / passes if passes else None
|
| 150 |
+
return None
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def spec_counters() -> "tuple[float, float] | None":
|
| 154 |
+
"""Raw cumulative (accepted, draft) spec-decode token counters from /metrics."""
|
| 155 |
+
try:
|
| 156 |
+
with urllib.request.urlopen(f"http://localhost:{PORT}/metrics", timeout=10) as r:
|
| 157 |
+
body = r.read().decode()
|
| 158 |
+
except Exception:
|
| 159 |
+
return None
|
| 160 |
+
acc = draft = None
|
| 161 |
+
for line in body.splitlines():
|
| 162 |
+
if line.startswith("vllm:spec_decode_num_accepted_tokens"):
|
| 163 |
+
acc = float(line.split()[-1])
|
| 164 |
+
elif line.startswith("vllm:spec_decode_num_draft_tokens"):
|
| 165 |
+
draft = float(line.split()[-1])
|
| 166 |
+
if acc is None or draft is None:
|
| 167 |
+
return None
|
| 168 |
+
return acc, draft
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def _tau_from_delta(d_acc: float, d_draft: float) -> "float | None":
|
| 172 |
+
"""Per-prompt acceptance length from the change in counters over one completion."""
|
| 173 |
+
passes = d_draft / GAMMA
|
| 174 |
+
return (d_acc + passes) / passes if passes > 0 else None
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def measure(dflash: bool) -> dict:
    """Run every prompt once against the live server and collect the metrics.

    Args:
        dflash: True when the server runs with the speculator; enables the
            per-prompt and aggregate acceptance-length (tau) readings.

    Returns:
        A result dict with the label, model, number of completed prompts, mean
        tokens/sec, mean latency, aggregate tau, the raw texts and per-run
        records. With ``dflash`` it also carries ``tau_per_prompt`` (one entry
        per completed prompt, None where it could not be read) and the
        min/median/max/mean of the readable entries.
    """
    texts, tps, ttft, taus = [], [], [], []
    prev = spec_counters() if dflash else None
    for p in PROMPTS:
        if budget_left() < 120:
            print("[job] budget guard hit — stopping measure early", flush=True)
            break
        txt, t_ps, dt = complete(p)
        texts.append(txt)
        tps.append(t_ps)
        ttft.append(dt)
        if dflash:
            cur = spec_counters()
            ti = None
            if prev is not None and cur is not None:
                ti = _tau_from_delta(cur[0] - prev[0], cur[1] - prev[1])
            # Always append, even when a /metrics read failed, so that
            # tau_per_prompt[i] keeps lining up with texts[i].
            taus.append(round(ti, 3) if ti is not None else None)
            prev = cur
    out = {
        "label": "dflash" if dflash else "baseline", "model": MODEL, "n": len(texts),
        "tokens_per_s_mean": sum(tps) / len(tps) if tps else 0.0,
        "ttft_s_mean": sum(ttft) / len(ttft) if ttft else 0.0,  # NB: full-completion latency, not true TTFT
        "acceptance_length_tau": tau_from_metrics() if dflash else 1.0,  # aggregate over the whole set
        "texts": texts,
        # NB: new_tokens here is a whitespace word count of the text, not a tokenizer count.
        "runs": [{"ttft_s": d, "total_s": d, "new_tokens": len(t.split()),
                  "tokens_per_s": s, "text": t} for t, s, d in zip(texts, tps, ttft)],
    }
    if dflash:
        clean = [t for t in taus if t is not None]
        cs = sorted(clean)
        out["tau_per_prompt"] = taus
        out["tau_min"] = min(clean) if clean else None
        out["tau_median"] = cs[len(cs) // 2] if cs else None  # upper median for even counts
        out["tau_max"] = max(clean) if clean else None
        out["tau_mean"] = round(sum(clean) / len(clean), 3) if clean else None
    return out
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def run_one(dflash: bool) -> dict:
    """Serve one configuration, measure it, and always shut the server down.

    Args:
        dflash: Forwarded to ``serve`` and ``measure``.

    Returns:
        The result dict produced by ``measure``.
    """
    proc = serve(dflash)
    try:
        wait_health(proc)
        return measure(dflash)
    finally:
        proc.terminate()
        try:
            proc.wait(timeout=30)
        except Exception:
            # Graceful shutdown did not finish in time: force it, then reap
            # the process so it is really gone before the next server starts.
            proc.kill()
            try:
                proc.wait(timeout=30)
            except subprocess.TimeoutExpired:
                print("[job] server process did not exit after kill()", flush=True)
        time.sleep(5)
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def _expose_wheel_nvcc() -> None:
|
| 227 |
+
"""Safety net: if no CUDA toolkit is on PATH but the pip nvidia-cuda-nvcc wheel is
|
| 228 |
+
installed, expose its nvcc + set CUDA_HOME so ANY residual FlashInfer JIT can compile
|
| 229 |
+
instead of hard-failing 'Could not find nvcc'. Never exercised when the FlashInfer paths
|
| 230 |
+
are disabled (see serve()); pure belt-and-suspenders. Set in os.environ BEFORE serve()
|
| 231 |
+
so the vLLM subprocess inherits it."""
|
| 232 |
+
import shutil
|
| 233 |
+
import site
|
| 234 |
+
if shutil.which("nvcc") or os.path.isdir("/usr/local/cuda"):
|
| 235 |
+
return
|
| 236 |
+
roots = []
|
| 237 |
+
try:
|
| 238 |
+
roots = list(site.getsitepackages())
|
| 239 |
+
except Exception:
|
| 240 |
+
pass
|
| 241 |
+
roots += [os.path.dirname(os.path.dirname(__file__))]
|
| 242 |
+
for root in roots:
|
| 243 |
+
cand = os.path.join(root, "nvidia", "cuda_nvcc")
|
| 244 |
+
if os.path.exists(os.path.join(cand, "bin", "nvcc")):
|
| 245 |
+
os.environ["CUDA_HOME"] = cand
|
| 246 |
+
os.environ["CUDA_PATH"] = cand
|
| 247 |
+
os.environ["PATH"] = os.path.join(cand, "bin") + ":" + os.environ.get("PATH", "")
|
| 248 |
+
print(f"[job] exposed wheel nvcc (CUDA_HOME={cand})", flush=True)
|
| 249 |
+
return
|
| 250 |
+
print("[job] no wheel nvcc found to expose (FlashInfer JIT paths are disabled anyway)", flush=True)
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
def main() -> int:
    """Run baseline then DFlash, compare them, and publish the A/B results.

    Prints tagged ``[job] ...`` lines (RESULT, BASELINE_JSON, DFLASH_JSON,
    PARITY_JSON, SAMPLE_*) to stdout and writes ``results/baseline.json``,
    ``results/dflash.json`` and ``results/summary.json``.

    Returns:
        0, used as the process exit code.
    """
    print(f"[job] start; budget {BUDGET_S}s; N={N}; model={MODEL}", flush=True)
    _expose_wheel_nvcc()
    base = run_one(dflash=False)
    dfl = run_one(dflash=True)

    base_texts, dfl_texts = base["texts"], dfl["texts"]
    # zip() stops at the shorter list, so mismatches are counted over the common prefix.
    mism = sum(1 for a, b in zip(base_texts, dfl_texts) if a != b)
    compared = min(len(base_texts), len(dfl_texts))
    parity = {
        "compared": compared,
        "mismatches": mism,
        # "Lossless" must not be vacuously true: require at least one compared
        # pair and the same number of completions on both sides.
        "lossless": mism == 0 and compared > 0 and len(base_texts) == len(dfl_texts),
    }
    speedup = (dfl["tokens_per_s_mean"] / base["tokens_per_s_mean"]
               if base["tokens_per_s_mean"] else 0.0)
    summary = {"speedup_x": round(speedup, 3), "tau": dfl["acceptance_length_tau"],
               "baseline_tps": base["tokens_per_s_mean"], "dflash_tps": dfl["tokens_per_s_mean"],
               "parity": parity, "elapsed_s": round(time.time() - T0, 1)}
    print("[job] RESULT " + json.dumps(summary), flush=True)

    os.makedirs("results", exist_ok=True)
    outputs = ((base, "baseline.json"), (dfl, "dflash.json"), (summary, "summary.json"))
    for payload, name in outputs:
        # Context manager so every file is flushed and closed deterministically.
        with open(f"results/{name}", "w") as fh:
            json.dump(payload, fh, indent=2)

    # No repo creation/upload — zero public surface. Emit results to the job logs as
    # tagged JSON lines; the orchestrator parses them from `hf jobs logs <id>` and writes
    # results/*.json locally, then pushes ONLY to the authorized poolside-laguna-hackathon org.
    def _compact(d: dict) -> dict:
        """Drop the bulky per-prompt fields so the log line stays small."""
        return {k: v for k, v in d.items() if k not in ("texts", "runs")}

    print("[job] BASELINE_JSON " + json.dumps(_compact(base)), flush=True)
    print("[job] DFLASH_JSON " + json.dumps(_compact(dfl)), flush=True)
    print("[job] PARITY_JSON " + json.dumps(parity), flush=True)
    print("[job] SAMPLE_BASELINE " + json.dumps(base_texts[:2]), flush=True)
    print("[job] SAMPLE_DFLASH " + json.dumps(dfl_texts[:2]), flush=True)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
scripts/parity_local.sh
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# parity_local.sh — full local dry-run of the benchmark + parity harness on the Mac.
# Starts two stub servers (baseline :8000, "dflash" :8001), waits until both are
# ready, runs measure.py against each (writing results/*.json) and the greedy
# parity check across both, then tears the stubs down. No CUDA / vLLM / Laguna.
set -euo pipefail
cd "$(dirname "$0")/.."
PY=.venv/bin/python

# Arm the cleanup BEFORE launching anything, so a failure between the two launches
# cannot leave a stub running. A/B start empty so the trap is safe under `set -u`.
A=""
B=""
cleanup() {
  # shellcheck disable=SC2086  # intentionally unquoted: empty pids must vanish
  kill $A $B 2>/dev/null || true
  # shellcheck disable=SC2086
  wait $A $B 2>/dev/null || true
}
trap cleanup EXIT

# Fail if either stub has exited. A stub that could not bind its port (something
# else already listening there) dies, while the port itself still accepts
# connections — without this check the harness would measure the wrong server.
stubs_alive() {
  local pid
  for pid in "$A" "$B"; do
    kill -0 "$pid" 2>/dev/null || {
      echo "[parity_local] a stub server exited early (port 8000/8001 already in use?)" >&2
      return 1
    }
  done
}

"$PY" scripts/stub_server.py --port 8000 & A=$!
"$PY" scripts/stub_server.py --port 8001 --spec & B=$!

# Wait for both ports to accept connections (no shell sleep — poll in python).
"$PY" - <<'PY'
import socket, time, sys
for port in (8000, 8001):
    for _ in range(100):
        with socket.socket() as s:
            if s.connect_ex(("127.0.0.1", port)) == 0:
                break
        time.sleep(0.05)
    else:
        sys.exit(f"stub on {port} never came up")
print("[parity_local] both stubs ready")
PY
stubs_alive

mkdir -p results
"$PY" bench/measure.py --base-url http://localhost:8001 --model laguna --label dflash --n 5 --out results/dflash.json
"$PY" bench/measure.py --base-url http://localhost:8000 --model laguna --label baseline --n 5 --out results/baseline.json
"$PY" evals/humaneval_subset.py --parity --base-url http://localhost:8000 --base-url-b http://localhost:8001 --model laguna --n 3
"$PY" scripts/check_results.py results/dflash.json results/baseline.json
# Both stubs must still be ours at the end, otherwise the numbers came from elsewhere.
stubs_alive
echo "[parity_local] OK — results/ written, parity checked"
|
scripts/run_min_on_prime.sh
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# run_min_on_prime.sh — provision a GPU, run the Lean Laguna MIN A/B, ALWAYS tear down.
#
# Credit safety is the whole point of this script:
#   * the wallet is shown and an explicit y/N confirmation is required before
#     provisioning (MAX_USD appears in the prompt; it is NOT enforced automatically),
#   * an EXIT trap terminates the pod no matter how the script ends (success, error,
#     or Ctrl-C) — so a botched bring-up can't leave a GPU billing,
#   * the cheap->expensive ladder (tiny smoke before the real run).
#
# It does NOT fabricate anything: it runs serve_vllm + measure + parity on the real
# Laguna+DFlash and writes results/*.json, then runs fill_submission.py (which itself
# refuses stub data). Review it before running; some remote-exec lines are marked
# [VERIFY] because the exact `prime pods ssh` non-interactive form can vary by CLI build.
#
# Usage: MAX_USD=5 GPU_TYPE=GH200_96GB ./scripts/run_min_on_prime.sh
set -euo pipefail

GPU_TYPE="${GPU_TYPE:-GH200_96GB}"   # Hopper = native FP8 for Laguna. (A100 lacks native FP8.)
GPU_COUNT="${GPU_COUNT:-1}"
DISK_GB="${DISK_GB:-120}"            # Laguna FP8 (~33GB) + drafter + room
N="${N:-20}"                         # prompts per measure
MAX_USD="${MAX_USD:-5}"              # shown in the confirmation prompt; teardown caps real spend
READY_TIMEOUT_S="${READY_TIMEOUT_S:-1800}"  # max wait for a vLLM server to report healthy
POD_NAME="${POD_NAME:-lean-laguna-min}"
HERE="$(cd "$(dirname "$0")/.." && pwd)"   # laguna-hack/
export PATH="$HOME/.local/bin:$PATH"

say() { printf '\n\033[1;32m[run-min]\033[0m %s\n' "$*"; }
die() { printf '\n\033[1;31m[run-min] ABORT:\033[0m %s\n' "$*" >&2; exit 1; }

# --- 0. preconditions (free) ---------------------------------------------------
command -v prime >/dev/null || die "prime CLI not found"
prime whoami >/dev/null 2>&1 || die "not logged into Prime (run: prime login)"
say "wallet:"; prime --plain wallet 2>&1 | head -4
read -r -p "Provision a ${GPU_TYPE} (~\$2-3/hr) and run the MIN A/B, cap ~\$${MAX_USD}? [y/N] " ok
[ "$ok" = "y" ] || die "cancelled by user"

# --- 1. provision + ALWAYS-teardown trap --------------------------------------
say "creating pod ${POD_NAME} (${GPU_TYPE} x${GPU_COUNT})…"
# Capture the output first so a failed create is reported as such, and so that the
# id parsing below cannot abort the script (pipefail + `head` closing the pipe)
# AFTER a pod exists but BEFORE the teardown trap is armed.
create_out="$(prime pods create --gpu-type "$GPU_TYPE" --gpu-count "$GPU_COUNT" \
                --disk-size "$DISK_GB" --name "$POD_NAME" --yes --plain 2>&1)" \
  || die "pod creation failed: ${create_out}"
# [VERIFY] parse the pod id from output. The leading hex char keeps a row of
# dashes (table separator) from being mistaken for an id.
POD_ID="$(printf '%s\n' "$create_out" | grep -oE '[0-9a-f][0-9a-f-]{7,}' | head -1 || true)"
[ -n "${POD_ID:-}" ] || die "pod '${POD_NAME}' may have been created but its id could not be parsed — check for a stray pod and terminate it manually. Output: ${create_out}"

# CRITICAL: terminate on ANY exit so a failed run never leaves a GPU billing.
teardown() {
  echo; echo "[run-min] tearing down pod $POD_ID"
  prime pods terminate "$POD_ID" --yes >/dev/null 2>&1 \
    || echo "[run-min] WARNING: terminate FAILED — pod $POD_ID may still be billing; check 'prime pods status $POD_ID' now" >&2
}
trap teardown EXIT
# A bare INT/TERM handler would return and let the script carry on against a
# terminated pod; exit explicitly so the EXIT trap runs exactly once and we stop.
trap 'exit 130' INT
trap 'exit 143' TERM
say "pod $POD_ID created — teardown armed."

# --- 2. wait until running ------------------------------------------------------
st=""
for _ in $(seq 1 60); do
  st="$(prime --plain pods status "$POD_ID" 2>/dev/null | grep -iE 'status' | head -1 || true)"
  echo "  $st"; echo "$st" | grep -qi 'running' && break
  sleep 10
done
echo "$st" | grep -qi 'running' || die "pod did not reach RUNNING"

# helper: run a command on the pod  [VERIFY] exact non-interactive form for your CLI build
pod() { prime pods ssh "$POD_ID" -- "$@"; }

# helper: serve one mode on the pod, wait until it is healthy, measure it, stop it.
#   $1 = serve_vllm.py mode (also used as measure label and results/<mode>.json)
#   $2 = remote log file for the server
# Notes on the remote command:
#   * readiness is polled on /health (needs curl on the pod) instead of a fixed sleep;
#     the loop also stops early if the server process has died;
#   * the server is stopped whether or not measure succeeded, so a failed baseline
#     can never be left listening on :8000 and get measured as "dflash";
#   * serve_vllm.py exec()s into `vllm serve …`, so that is the command line to match;
#     the [v] bracket keeps pkill from matching this very shell command.
serve_and_measure() {
  local mode="$1" log="$2"
  pod "cd ~/laguna-hack || exit 1; mkdir -p results; \
python scripts/serve_vllm.py --mode $mode --run >$log 2>&1 & srv=\$!; \
for _ in \$(seq 1 $((READY_TIMEOUT_S / 5))); do \
curl -sf http://localhost:8000/health >/dev/null 2>&1 && break; \
kill -0 \$srv 2>/dev/null || break; sleep 5; done; \
python bench/measure.py --base-url http://localhost:8000 --model laguna --label $mode --n $N --out results/$mode.json \
|| echo '[run-min] measure failed for $mode (see $log on the pod)'; \
pkill -f '[v]llm serve' || true; sleep 10"
}

# --- 3. push the harness + install deps ---------------------------------------
say "syncing harness to pod…"
# Option A (private repo): clone with the PAT; Option B: rsync $HERE. Pick one. [VERIFY]
pod "mkdir -p ~/laguna-hack" || die "ssh failed"
rsync -az -e "prime pods ssh $POD_ID --" \
  "$HERE/scripts" "$HERE/bench" "$HERE/evals" "$HERE/Makefile" \
  "$HERE/requirements-venue.txt" ":~/laguna-hack/" 2>/dev/null \
  || say "[VERIFY] rsync transport differs — fall back to git clone with PAT on the pod"
pod "cd ~/laguna-hack && uv pip install -r requirements-venue.txt && vllm --version"

# --- 4. the cheap->expensive ladder -------------------------------------------
say "RUNG 1: tiny smoke (no Laguna) to prove the path"
pod "cd ~/laguna-hack && python scripts/gen_local.py || true"

say "RUNG 2/3: baseline then DFlash, measure both, parity"
serve_and_measure baseline /tmp/b.log
serve_and_measure dflash /tmp/d.log
pod "cd ~/laguna-hack && python evals/humaneval_subset.py --n 25 || true"

# --- 5. pull results back ------------------------------------------------------
say "pulling results…"
rsync -az -e "prime pods ssh $POD_ID --" ":~/laguna-hack/results/" "$HERE/results/" 2>/dev/null \
  || say "[VERIFY] copy results manually: prime pods ssh $POD_ID -- 'cat ~/laguna-hack/results/dflash.json'"

# --- 6. tear down explicitly (disarm the trap first), then fill locally --------
say "done on GPU — terminating pod now."
trap - EXIT INT TERM
teardown
say "filling submission numbers (refuses stub data):"
python3 "$HERE/scripts/fill_submission.py" \
  --baseline "$HERE/results/baseline.json" --dflash "$HERE/results/dflash.json" \
  --humaneval "$HERE/results/humaneval_dflash.json" || true
say "if fill_submission exited 0, paste the numbers into MODEL_CARD.md/RESULTS.html and run the hf push (SUBMISSION.md §3)."
|
scripts/serve_vllm.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
serve_vllm.py — VENUE ONLY (Prime Intellect, CUDA GPU). DOES NOT RUN ON THE MAC.
|
| 4 |
+
|
| 5 |
+
This is a thin, documented wrapper that prints (and optionally execs) the exact
|
| 6 |
+
`vllm serve` command for three configs:
|
| 7 |
+
|
| 8 |
+
1. baseline — Laguna XS.2 alone (the speed floor).
|
| 9 |
+
2. dflash — Laguna XS.2 + the DFlash speculator (the speed we're claiming).
|
| 10 |
+
3. quant — a quantized Laguna checkpoint (FP8/INT4/NVFP4) + FP8 KV cache.
|
| 11 |
+
This is the FALLBACK lane (see FALLBACK_QUANT.md): if DFlash hits
|
| 12 |
+
a vLLM-version/draft-model snag at the venue, a quantized weights
|
| 13 |
+
checkpoint still tells a clean single-GPU story (smaller footprint,
|
| 14 |
+
FP8 KV cache ~doubles concurrent trajectories per the [TR]).
|
| 15 |
+
|
| 16 |
+
baseline vs dflash are IDENTICAL except for --speculative-config — flip one flag,
|
| 17 |
+
get faster tokens, same greedy output. quant is a different lever (shrink each
|
| 18 |
+
pass instead of cutting passes); the two can stack, but the fallback keeps it
|
| 19 |
+
simple with quant alone.
|
| 20 |
+
|
| 21 |
+
Grounding (cite at the demo):
|
| 22 |
+
- DFlash config shape is from the HF model card
|
| 23 |
+
huggingface.co/poolside/Laguna-XS.2-speculator.dflash:
|
| 24 |
+
--speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash",
|
| 25 |
+
"num_speculative_tokens":7,"method":"dflash"}'
|
| 26 |
+
- num_speculative_tokens = 7 is the card's value (this is gamma, the draft length).
|
| 27 |
+
- vLLM >= 0.21.0 and VLLM_USE_DEEP_GEMM=0 per the card.
|
| 28 |
+
- parsers --tool-call-parser poolside_v1 / --reasoning-parser poolside_v1 per the card.
|
| 29 |
+
|
| 30 |
+
VERIFY AT ONBOARDING: exact vLLM version on the PI image, whether
|
| 31 |
+
--trust-remote-code is required, and whether `method` is spelled "dflash"
|
| 32 |
+
in the build you get. The card is authoritative; confirm against `vllm serve --help`.
|
| 33 |
+
|
| 34 |
+
Usage (on Prime Intellect):
|
| 35 |
+
python scripts/serve_vllm.py --mode baseline --print # show the command
|
| 36 |
+
python scripts/serve_vllm.py --mode dflash --run # actually serve
|
| 37 |
+
"""
|
| 38 |
+
from __future__ import annotations
|
| 39 |
+
|
| 40 |
+
import argparse
|
| 41 |
+
import json
|
| 42 |
+
import os
|
| 43 |
+
import shlex
|
| 44 |
+
import subprocess
|
| 45 |
+
import sys
|
| 46 |
+
|
| 47 |
+
MODEL = os.environ.get("LAGUNA_MODEL", "poolside/Laguna-XS.2")
SPECULATOR = os.environ.get("LAGUNA_SPECULATOR", "poolside/Laguna-XS.2-speculator.dflash")

# Draft length gamma. Per the DFlash model card.
NUM_SPECULATIVE_TOKENS = 7

# Quantized checkpoints for the fallback lane. The [TR] says XS.2 ships FP8 (W8A8),
# INT4 (W4A16/AWQ) and NVFP4 quants in the HF collection. EXACT repo names are NOT
# confirmed pre-event — these are documented placeholders; VERIFY AT ONBOARDING
# against huggingface.co/collections/poolside/laguna-xs2 (or override via env).
QUANT_MODELS = {
    "fp8": os.environ.get("LAGUNA_FP8_MODEL", "poolside/Laguna-XS.2-FP8"),
    "int4": os.environ.get("LAGUNA_INT4_MODEL", "poolside/Laguna-XS.2-INT4"),
    "nvfp4": os.environ.get("LAGUNA_NVFP4_MODEL", "poolside/Laguna-XS.2-NVFP4"),
}


def build_cmd(mode: str, max_model_len: int, tp: int, quant: str) -> list[str]:
    """Assemble the ``vllm serve`` argv for one serving configuration.

    Args:
        mode: "baseline", "dflash" or "quant". Any other value yields the
            same argv as "baseline".
        max_model_len: Value for ``--max-model-len``.
        tp: Value for ``--tensor-parallel-size``.
        quant: Key into ``QUANT_MODELS``; only consulted when ``mode == "quant"``.

    Returns:
        The argument list, starting with ``"vllm", "serve", <checkpoint>``.
    """
    # The quant table is only looked up in quant mode, so an unknown ``quant``
    # is harmless for the other modes.
    checkpoint = MODEL if mode != "quant" else QUANT_MODELS[quant]

    argv = ["vllm", "serve", checkpoint]
    argv.extend(("--tensor-parallel-size", str(tp)))
    argv.extend(("--max-model-len", str(max_model_len)))
    argv.extend(("--served-model-name", "laguna"))
    # Poolside-specific parsers (from the model card):
    argv.extend(("--tool-call-parser", "poolside_v1"))
    argv.extend(("--reasoning-parser", "poolside_v1"))
    argv.append("--enable-auto-tool-choice")
    argv.extend(("--default-chat-template-kwargs", '{"enable_thinking": true}'))

    if mode == "dflash":
        # The one flag that separates the speculated server from the baseline.
        speculative = json.dumps({
            "model": SPECULATOR,
            "num_speculative_tokens": NUM_SPECULATIVE_TOKENS,
            "method": "dflash",
        })
        argv.extend(("--speculative-config", speculative))
    elif mode == "quant":
        # FP8 KV cache is the high-leverage single-GPU win ([TR]: ~2x concurrent
        # trajectories). Weight quant is auto-detected from the checkpoint config.
        argv.extend(("--kv-cache-dtype", "fp8"))
    return argv
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main() -> None:
    """Print the ``vllm serve`` command for the chosen mode and optionally exec it.

    The command is always printed to stdout (prefixed with ``VLLM_USE_DEEP_GEMM=0``).
    With ``--run`` the current process is then replaced by ``vllm serve`` with
    ``VLLM_USE_DEEP_GEMM=0`` in its environment.

    Exits with status 2 when ``--run`` is requested on macOS, and with status 127
    when the ``vllm`` executable cannot be started.
    """
    if sys.platform == "darwin":
        print("[serve_vllm] REFUSING TO RUN: this is a Mac. vLLM needs CUDA.\n"
              "  Run this on Prime Intellect. Use --print to inspect the command here.",
              file=sys.stderr)
        # Still allow --print on Mac for inspection; block --run.

    p = argparse.ArgumentParser(description="Print/run the vLLM serve command for Laguna (baseline / dflash / quant).")
    p.add_argument("--mode", choices=["baseline", "dflash", "quant"], required=True)
    p.add_argument("--quant", choices=["fp8", "int4", "nvfp4"], default="fp8",
                   help="Quant format for --mode quant (the fallback lane). Default fp8.")
    p.add_argument("--max-model-len", type=int, default=16384,
                   help="Card example uses 16384; raise toward 131072/262144 if VRAM allows. Verify at onboarding.")
    p.add_argument("--tensor-parallel-size", type=int, default=1,
                   help="Single GPU = 1. The whole hook is one-GPU serving.")
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument("--print", action="store_true", help="Print the command only.")
    g.add_argument("--run", action="store_true", help="Actually exec vllm serve (venue only).")
    args = p.parse_args()

    cmd = build_cmd(args.mode, args.max_model_len, args.tensor_parallel_size, args.quant)
    env_prefix = "VLLM_USE_DEEP_GEMM=0"
    printable = f"{env_prefix} " + " ".join(shlex.quote(c) for c in cmd)
    print(printable)

    if not args.run:
        return
    if sys.platform == "darwin":
        print("[serve_vllm] --run blocked on Mac.", file=sys.stderr)
        sys.exit(2)

    env = dict(os.environ)
    env["VLLM_USE_DEEP_GEMM"] = "0"  # per the model card
    # exec replaces this process without flushing Python's buffers; when stdout is
    # redirected to a file the command printed above would otherwise be lost.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        os.execvpe(cmd[0], cmd, env)
    except OSError as err:
        # Only reached if the exec itself failed (e.g. vllm is not on PATH).
        print(f"[serve_vllm] could not exec {cmd[0]!r}: {err}", file=sys.stderr)
        sys.exit(127)


if __name__ == "__main__":
    main()
|
scripts/stub_server.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
stub_server.py — a tiny, stdlib-only OpenAI-compatible STUB so the benchmark and
|
| 4 |
+
eval harness (bench/measure.py, evals/humaneval_subset.py) can be exercised
|
| 5 |
+
END-TO-END on the Mac, with NO CUDA / vLLM / Laguna. It fakes just enough of the
|
| 6 |
+
vLLM surface to shape-test the whole pipeline before the venue.
|
| 7 |
+
|
| 8 |
+
What it fakes:
|
| 9 |
+
* POST /v1/completions — both streaming (SSE, for measure.py) and non-streaming
|
| 10 |
+
(single JSON, for humaneval_subset.py). Output is DETERMINISTIC given the prompt,
|
| 11 |
+
so two stubs return identical greedy text → the parity check proves "lossless".
|
| 12 |
+
* GET /metrics — Prometheus text. With --spec, it exposes the
|
| 13 |
+
spec_decode_* counters measure.py reads to compute acceptance length τ
|
| 14 |
+
(tuned so τ ≈ 2.6, in the DFlash card's 2.56–3.07 range). Without --spec it's a
|
| 15 |
+
plain baseline (no spec counters → measure.py reports τ = None, which is correct).
|
| 16 |
+
|
| 17 |
+
JVM analogy: this is WireMock for an LLM endpoint — a canned stub standing in for
|
| 18 |
+
the real service so you can integration-test the client/harness without the backend.
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
python scripts/stub_server.py --port 8000 # baseline stub
|
| 22 |
+
python scripts/stub_server.py --port 8001 --spec # "dflash" stub (has τ metrics)
|
| 23 |
+
"""
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import argparse
|
| 27 |
+
import json
|
| 28 |
+
import threading
|
| 29 |
+
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
|
| 30 |
+
|
| 31 |
+
GAMMA = 7 # draft length, matches the DFlash card / serve_vllm.py
|
| 32 |
+
TAU_TARGET = 2.6 # acceptance length we want measure.py to report for the spec stub
|
| 33 |
+
|
| 34 |
+
# Deterministic canned completion (same for every prompt → greedy parity is identical).
|
| 35 |
+
# Content is irrelevant locally: humaneval runs with --no-exec, measure.py only times it.
|
| 36 |
+
COMPLETION = (
|
| 37 |
+
"\n # stub completion (local shape-test only; not a real model)\n"
|
| 38 |
+
" result = 0\n"
|
| 39 |
+
" for i in range(n):\n"
|
| 40 |
+
" result += i\n"
|
| 41 |
+
" return result\n"
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _tokens(text: str) -> list[str]:
|
| 46 |
+
"""Split into whitespace-preserving 'tokens' so streaming has several chunks."""
|
| 47 |
+
out, buf = [], ""
|
| 48 |
+
for ch in text:
|
| 49 |
+
buf += ch
|
| 50 |
+
if ch.isspace():
|
| 51 |
+
out.append(buf)
|
| 52 |
+
buf = ""
|
| 53 |
+
if buf:
|
| 54 |
+
out.append(buf)
|
| 55 |
+
return out
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class State:
    """Counters shared by all request handlers of one stub server.

    Attributes are plain instance fields: ``spec`` (whether spec-decode metrics
    are exposed), ``emitted`` (running total of emitted tokens) and ``lock``
    (guards updates to ``emitted``).
    """

    def __init__(self, spec: bool):
        self.lock = threading.Lock()
        self.emitted = 0
        self.spec = spec

    def add_emitted(self, n: int) -> None:
        """Add *n* to the emitted-token total while holding the lock."""
        with self.lock:
            self.emitted = self.emitted + n

    def metrics_text(self) -> str:
        """Render the Prometheus-style text served on ``/metrics``.

        Always contains the ``stub_up`` gauge. When ``spec`` is set, three
        ``spec_decode_*`` counters derived from ``emitted`` are appended.
        """
        rows = [
            "# HELP stub_up 1 if the stub is serving",
            "# TYPE stub_up gauge",
            "stub_up 1",
        ]
        if not self.spec:
            return "\n".join(rows) + "\n"

        # Invert measure.py's math so it recovers TAU_TARGET:
        #   passes = emitted / tau ; draft = passes*gamma ; accepted = emitted - passes
        #   measure.py: passes' = draft/gamma = passes ; committed = accepted + passes = emitted
        #   tau = committed / passes = emitted / passes = TAU_TARGET
        passes = max(self.emitted / TAU_TARGET, 0.0)
        draft = passes * GAMMA
        accepted = max(self.emitted - passes, 0.0)
        rows.append(f"spec_decode_num_draft_tokens {draft:.0f}")
        rows.append(f"spec_decode_num_accepted_tokens {accepted:.0f}")
        rows.append(f"spec_decode_num_emitted_tokens {self.emitted:.0f}")
        return "\n".join(rows) + "\n"
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class Handler(BaseHTTPRequestHandler):
    """Request handler faking the slice of the vLLM HTTP surface the harness uses.

    * ``GET /metrics`` returns ``state.metrics_text()``.
    * ``POST /v1/completions`` and ``POST /v1/chat/completions`` return the canned
      ``COMPLETION`` truncated to ``max_tokens`` chunks, either as one JSON body or,
      when the request sets ``stream``, as server-sent events ending in ``[DONE]``.
    * Anything else is a 404; malformed requests are answered with a 400 instead of
      crashing the handler thread.
    """

    state: "State | None" = None  # set on the class before serving

    def log_message(self, *args):  # quiet
        pass

    def _send(self, code: int, body: bytes, ctype: str) -> None:
        """Send a complete response with status, Content-Type and Content-Length."""
        self.send_response(code)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _bad_request(self, reason: str) -> None:
        """Answer 400 with a small JSON error body naming *reason*."""
        self._send(400, b'{"error":"' + reason.encode() + b'"}', "application/json")

    @staticmethod
    def _max_tokens(req: object) -> "int | None":
        """Return the request's ``max_tokens`` as a non-negative int.

        A missing or null ``max_tokens`` means the default of 64. Returns ``None``
        when *req* is not a JSON object or the value cannot be used as a
        non-negative integer.
        """
        if not isinstance(req, dict):
            return None
        raw = req.get("max_tokens", 64)
        if raw is None:
            return 64
        try:
            value = int(raw)
        except (TypeError, ValueError, OverflowError):
            return None
        # A negative value would slice tokens off the END of the completion.
        return value if value >= 0 else None

    def do_GET(self):
        if self.path.rstrip("/") == "/metrics":
            self._send(200, self.state.metrics_text().encode(), "text/plain; version=0.0.4")
        else:
            self._send(404, b"not found\n", "text/plain")

    def do_POST(self):
        path = self.path.rstrip("/")
        # Real vLLM serves both the legacy text route (/v1/completions, used by
        # bench/measure.py) and the chat route (/v1/chat/completions, used by the
        # Kotlin load-test client). The only wire difference is the chunk shape:
        # chat streams {delta:{content:...}}, legacy streams {text:...}.
        is_chat = path == "/v1/chat/completions"
        if not is_chat and path != "/v1/completions":
            self._send(404, b"not found\n", "text/plain")
            return

        try:
            n = int(self.headers.get("Content-Length", 0))
        except ValueError:
            n = -1
        if n < 0:
            # read(-1) would block until the client closes the connection.
            self._bad_request("bad content-length")
            return
        try:
            req = json.loads(self.rfile.read(n) or b"{}")
        except ValueError:
            # Covers json.JSONDecodeError and undecodable bytes (UnicodeDecodeError).
            self._bad_request("bad json")
            return
        if not isinstance(req, dict):
            self._bad_request("bad json")
            return
        max_tokens = self._max_tokens(req)
        if max_tokens is None:
            self._bad_request("bad max_tokens")
            return

        toks = _tokens(COMPLETION)[:max_tokens]
        text = "".join(toks)
        self.state.add_emitted(len(toks))

        if req.get("stream"):
            self.send_response(200)
            self.send_header("Content-Type", "text/event-stream")
            self.end_headers()
            for t in toks:
                if is_chat:
                    chunk = {"choices": [{"delta": {"content": t}, "index": 0,
                                          "finish_reason": None}]}
                else:
                    chunk = {"choices": [{"text": t, "index": 0,
                                          "finish_reason": None}]}
                self.wfile.write(f"data: {json.dumps(chunk)}\n\n".encode())
                self.wfile.flush()
            self.wfile.write(b"data: [DONE]\n\n")
            self.wfile.flush()
        elif is_chat:
            body = {
                "id": "stub-chatcmpl",
                "object": "chat.completion",
                "model": req.get("model", "laguna"),
                "choices": [{"message": {"role": "assistant", "content": text},
                             "index": 0, "finish_reason": "stop"}],
            }
            self._send(200, json.dumps(body).encode(), "application/json")
        else:
            body = {
                "id": "stub-cmpl",
                "object": "text_completion",
                "model": req.get("model", "laguna"),
                "choices": [{"text": text, "index": 0, "finish_reason": "stop"}],
            }
            self._send(200, json.dumps(body).encode(), "application/json")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def main() -> None:
    """Parse CLI flags and serve the stub on 127.0.0.1 until interrupted.

    ``--port`` selects the listening port (default 8000); ``--spec`` makes
    ``/metrics`` expose the ``spec_decode_*`` counters. Ctrl-C stops the server
    quietly.
    """
    p = argparse.ArgumentParser(description="Stdlib OpenAI-compatible stub for local harness shape-tests.")
    p.add_argument("--port", type=int, default=8000)
    p.add_argument("--spec", action="store_true",
                   help="Expose spec_decode_* metrics (simulate the DFlash endpoint, τ≈2.6).")
    args = p.parse_args()

    Handler.state = State(spec=args.spec)
    tag = "dflash-stub (with τ metrics)" if args.spec else "baseline-stub"
    # The context manager closes the listening socket on every exit path.
    with ThreadingHTTPServer(("127.0.0.1", args.port), Handler) as srv:
        # flush=True: when stdout is a pipe or file the banner would otherwise sit in
        # the buffer and be lost if the stub is stopped with a signal.
        print(f"[stub] {tag} serving on http://127.0.0.1:{args.port} "
              f"(/v1/completions, /v1/chat/completions, /metrics)", flush=True)
        try:
            srv.serve_forever()
        except KeyboardInterrupt:
            pass


if __name__ == "__main__":
    main()
|
spec_rl/README.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# spec_rl — code RL on a DFlash-speculated endpoint
|
| 2 |
+
|
| 3 |
+
A small [`verifiers`](https://github.com/PrimeIntellect-ai/verifiers) environment
|
| 4 |
+
for the combined hackathon thesis:
|
| 5 |
+
|
| 6 |
+
> **Lossless DFlash speculative decoding makes RL post-training cheaper.**
|
| 7 |
+
|
| 8 |
+
`spec_rl` is a HumanEval-style code-completion task. The policy model
|
| 9 |
+
(Laguna XS.2) is given a function signature + docstring and must write the body.
|
| 10 |
+
The `@vf.reward` `code_reward` function executes that body against the problem's
|
| 11 |
+
unit tests and returns the **fraction of assertions that pass** (a value in
|
| 12 |
+
`[0,1]`) via `fraction_passing(problem, text)`. This is a *unit-test-grounded,
|
| 13 |
+
verifiable, dense* reward — exactly the kind verifiers RL is built for. A
|
| 14 |
+
fractional (rather than binary all-or-nothing) reward avoids GRPO all-zero-group
|
| 15 |
+
advantage collapse on hard prompts, where every rollout would otherwise score
|
| 16 |
+
`0.0`. The reported pass@1 **eval** stays binary (`evals/humaneval_subset.py`):
|
| 17 |
+
reward is the learning signal, eval is the scoreboard.
|
| 18 |
+
|
| 19 |
+
## The point
|
| 20 |
+
|
| 21 |
+
`verifiers` runs RL rollouts against an OpenAI-compatible endpoint declared in
|
| 22 |
+
`./configs/endpoints.toml`. Point that endpoint at the **DFlash-speculated vLLM
|
| 23 |
+
server** instead of a plain one and you get the **same reward curve at higher
|
| 24 |
+
rollout throughput**:
|
| 25 |
+
|
| 26 |
+
- Speculative decoding is **lossless** under greedy decoding. The 0.6B DFlash
|
| 27 |
+
drafter proposes `num_speculative_tokens = 7` tokens; the target model
|
| 28 |
+
(Laguna XS.2) verifies them, so accepted text is **token-identical** to the
|
| 29 |
+
no-speculator baseline.
|
| 30 |
+
- The reward depends only on the generated text, so an identical reward signal
|
| 31 |
+
is produced.
|
| 32 |
+
- Only the **cost per rollout** drops (fewer target-model forward passes per
|
| 33 |
+
accepted token → higher tokens/sec → cheaper RL).
|
| 34 |
+
|
| 35 |
+
That is the measurable claim: feed the same env two endpoints (baseline vs
|
| 36 |
+
DFlash), show one reward curve, two throughputs.
|
| 37 |
+
|
| 38 |
+
## How the reward works
|
| 39 |
+
|
| 40 |
+
1. The dataset carries each HumanEval problem's original `prompt` (signature +
|
| 41 |
+
docstring), `test` (the `check(candidate)` harness), and `entry_point` in
|
| 42 |
+
`info` — so the grader never depends on the model echoing the signature.
|
| 43 |
+
2. The model's completion is trimmed at the first stop sequence
|
| 44 |
+
(`\nclass `, `\ndef `, `\n#`, `\nif __name__`) so a chatty model can't smuggle
|
| 45 |
+
a second definition past the grader. This matches `evals/humaneval_subset.py`.
|
| 46 |
+
3. `spec_rl.fraction_passing()` assembles `prompt + completion + test +
|
| 47 |
+
check(entry_point)` and runs it in a **fresh `python` subprocess with an 8s
|
| 48 |
+
wall-clock timeout**, isolated from the rollout worker. It AST-instruments each
|
| 49 |
+
`assert` in the HumanEval `check()` (via `_AssertCounter`) so a failing assert
|
| 50 |
+
is **counted in the denominator instead of aborting on the first failure** —
|
| 51 |
+
this also makes loop-based checks fractional. The reward is `passed_asserts /
|
| 52 |
+
total_asserts`, a value in `[0,1]`. A crash, exception, or timeout before any
|
| 53 |
+
assertion runs → `0.0`; every assertion passing → `1.0`.
|
| 54 |
+
|
| 55 |
+
The execution + pass/fail logic is plain stdlib and importable without
|
| 56 |
+
`verifiers` or a GPU, so it is unit-testable locally on Apple Silicon. A built-in
|
| 57 |
+
smoke test runs with:
|
| 58 |
+
|
| 59 |
+
```bash
|
| 60 |
+
python spec_rl.py # checks passing / failing / timeout completions
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
> **Safety:** this executes model-generated code to grade it. Each candidate
|
| 64 |
+
> runs in a short-lived, isolated subprocess. Run RL rollouts only in the
|
| 65 |
+
> disposable venue sandbox, never against real data.
|
| 66 |
+
|
| 67 |
+
## Layout
|
| 68 |
+
|
| 69 |
+
```
|
| 70 |
+
spec_rl/
|
| 71 |
+
spec_rl.py # load_environment(num_examples=20) -> vf.Environment
|
| 72 |
+
pyproject.toml # name = "spec-rl", depends on verifiers + datasets
|
| 73 |
+
README.md
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
`load_environment(num_examples=20)` builds a `vf.SingleTurnEnv` over the first
|
| 77 |
+
`num_examples` HumanEval problems with a `vf.Rubric` wrapping the `@vf.reward`
|
| 78 |
+
`code_reward` function (which scores via `fraction_passing`).
|
| 79 |
+
|
| 80 |
+
## Run it
|
| 81 |
+
|
| 82 |
+
Install the env, then evaluate Laguna XS.2 through it:
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
prime env install spec_rl
|
| 86 |
+
prime eval run spec_rl -m poolside/Laguna-XS.2 -n 20
|
| 87 |
+
prime eval view
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
`-m poolside/Laguna-XS.2` resolves to whatever endpoint you alias in
|
| 91 |
+
`./configs/endpoints.toml`. To show the cheaper-rollout result, define two
|
| 92 |
+
aliases pointing at the same model — one plain vLLM server, one DFlash-speculated
|
| 93 |
+
server — and run the eval against each:
|
| 94 |
+
|
| 95 |
+
```toml
|
| 96 |
+
# configs/endpoints.toml
|
| 97 |
+
[[endpoint]]
|
| 98 |
+
endpoint_id = "laguna-baseline"
|
| 99 |
+
model = "poolside/Laguna-XS.2"
|
| 100 |
+
url = "http://<baseline-vllm-host>:8000/v1"
|
| 101 |
+
key = "VLLM_API_KEY"
|
| 102 |
+
type = "openai_chat_completions"
|
| 103 |
+
|
| 104 |
+
[[endpoint]]
|
| 105 |
+
endpoint_id = "laguna-dflash"
|
| 106 |
+
model = "poolside/Laguna-XS.2"
|
| 107 |
+
url = "http://<dflash-vllm-host>:8000/v1"
|
| 108 |
+
key = "VLLM_API_KEY"
|
| 109 |
+
type = "openai_chat_completions"
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
The DFlash server is launched with the speculator config:
|
| 113 |
+
|
| 114 |
+
```bash
|
| 115 |
+
VLLM_USE_DEEP_GEMM=0 vllm serve poolside/Laguna-XS.2 \
|
| 116 |
+
--speculative-config '{"model":"poolside/Laguna-XS.2-speculator.dflash","num_speculative_tokens":7,"method":"dflash"}'
|
| 117 |
+
# vLLM >= 0.21.0, parsers poolside_v1; vLLM does NOT need --trust-remote-code.
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
Then:
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
prime eval run spec_rl -m laguna-baseline -n 20
|
| 124 |
+
prime eval run spec_rl -m laguna-dflash -n 20
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
Identical reward, higher throughput on the DFlash run. Read realized acceptance
|
| 128 |
+
length (tau) and tokens/sec from the DFlash server's `/metrics` — these are
|
| 129 |
+
**measured at the venue**, not quoted from any published figure.
|
spec_rl/pyproject.toml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "spec-rl"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "HumanEval-style code RL environment whose rollouts are served by the DFlash-speculated Laguna XS.2 vLLM endpoint — same reward curve, cheaper rollouts."
|
| 5 |
+
tags = ["code", "humaneval", "single-turn", "rl", "eval", "speculative-decoding", "dflash"]
|
| 6 |
+
requires-python = ">=3.11"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"verifiers",
|
| 9 |
+
"datasets",
|
| 10 |
+
]
|
| 11 |
+
|
| 12 |
+
[build-system]
|
| 13 |
+
requires = ["hatchling"]
|
| 14 |
+
build-backend = "hatchling.build"
|
| 15 |
+
|
| 16 |
+
[tool.hatch.build]
|
| 17 |
+
include = ["spec_rl.py", "pyproject.toml", "README.md"]
|
| 18 |
+
|
| 19 |
+
[tool.verifiers.eval]
|
| 20 |
+
num_examples = 20
|
| 21 |
+
rollouts_per_example = 1
|
spec_rl/spec_rl.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
spec_rl.py — a small `verifiers` environment for the combined hackathon thesis:
|
| 4 |
+
"lossless DFlash speculative decoding makes RL post-training cheaper."
|
| 5 |
+
|
| 6 |
+
The environment is a HumanEval-style code-completion task. The policy model
|
| 7 |
+
(Laguna XS.2) is prompted with a function signature + docstring and must emit
|
| 8 |
+
the function body. The reward executes the candidate completion against the
|
| 9 |
+
problem's unit tests in a SUBPROCESS WITH A TIMEOUT and returns the
|
| 10 |
+
FRACTION of the problem's unit-test assertions that pass (a dense RL signal in
|
| 11 |
+
[0,1]); the pass@1 eval stays binary (evals/humaneval_subset.py). Reward is the
|
| 12 |
+
dense learning signal; the eval is the binary scoreboard.
|
| 13 |
+
|
| 14 |
+
Why this exists for the hackathon
|
| 15 |
+
---------------------------------
|
| 16 |
+
verifiers runs RL rollouts against an OpenAI-compatible endpoint declared in
|
| 17 |
+
`./configs/endpoints.toml`. Point that endpoint at the DFlash-speculated vLLM
|
| 18 |
+
server and the *same* reward curve is produced at higher rollout throughput,
|
| 19 |
+
because speculative decoding is lossless under greedy decoding (the drafted
|
| 20 |
+
tokens are verified by the target model, so accepted text is token-identical to
|
| 21 |
+
the no-speculator baseline). The reward signal does not change; only the cost
|
| 22 |
+
per rollout drops. That is the "cheaper RL" claim, made measurable.
|
| 23 |
+
|
| 24 |
+
Local-dev note (Apple Silicon, no CUDA): this module is import-safe even when
|
| 25 |
+
`verifiers` is not installed. `import verifiers as vf` is guarded; a clear
|
| 26 |
+
ImportError is raised only when `load_environment()` is actually called. The
|
| 27 |
+
reward's code-execution + pass/fail logic is plain stdlib and is unit-testable
|
| 28 |
+
without verifiers or a GPU.
|
| 29 |
+
|
| 30 |
+
SAFETY: this executes model-generated code to grade it. Each candidate runs in a
|
| 31 |
+
short-lived subprocess with a wall-clock timeout, isolated from this process.
|
| 32 |
+
Run RL rollouts only in the disposable venue sandbox, never against real data.
|
| 33 |
+
"""
|
| 34 |
+
from __future__ import annotations
|
| 35 |
+
|
| 36 |
+
import ast
|
| 37 |
+
import json
|
| 38 |
+
import subprocess
|
| 39 |
+
import sys
|
| 40 |
+
import tempfile
|
| 41 |
+
from pathlib import Path
|
| 42 |
+
from typing import Any
|
| 43 |
+
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
# Import guard: keep the module importable without `verifiers` installed so the
|
| 46 |
+
# reward logic can be unit-tested locally on the Mac. The real dependency is
|
| 47 |
+
# only required when building the live environment.
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
try:
|
| 50 |
+
import verifiers as vf # type: ignore
|
| 51 |
+
except ImportError: # pragma: no cover - exercised only when dep is absent
|
| 52 |
+
vf = None # type: ignore
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# Per-candidate execution budget (seconds). Generous enough for HumanEval's
|
| 56 |
+
# bounded reference tests, short enough to bound a runaway rollout.
|
| 57 |
+
EXEC_TIMEOUT_S = 8
|
| 58 |
+
|
| 59 |
+
# Stop sequences mirror evals/humaneval_subset.py so completion shape matches
|
| 60 |
+
# the parity/pass@1 harness used to prove losslessness.
|
| 61 |
+
STOP = ["\nclass ", "\ndef ", "\n#", "\nif __name__"]
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
# Dataset — reuse the HumanEval subset shape: {prompt, test, entry_point}.
|
| 66 |
+
# We load the canonical HumanEval test split (same source as
|
| 67 |
+
# evals/humaneval_subset.py) and keep only the first `num_examples` problems so
|
| 68 |
+
# RL rollouts stay small and cheap during the hackathon.
|
| 69 |
+
# ---------------------------------------------------------------------------
|
| 70 |
+
def load_problems(num_examples: int) -> list[dict[str, Any]]:
    """Return the first `num_examples` code problems as {prompt, test, entry_point}.

    Default source is the canonical HumanEval test split (same as
    evals/humaneval_subset.py). Two overrides, in precedence order:

    * ``SPEC_RL_DATASET`` — a local ``.jsonl`` path (one problem per line) OR
      a Hugging Face dataset id. This is the drop-in seam for an
      Adaption-curated / exported dataset: as long as each row carries
      ``{prompt, test, entry_point}`` it runs unchanged, so a richer code
      taskset built with the hackathon's Adaption credits swaps in with one
      env var and no code change.
    * ``HUMANEVAL_DATASET`` — override just the HF repo id if the venue image
      pins a mirror. ``SPEC_RL_DATASET_SPLIT`` overrides the split (default
      ``test``).

    With no env vars set the source is ``openai/openai_humaneval``, split
    ``test``.

    Args:
        num_examples: Maximum number of problems to return. Zero or a negative
            value yields an empty list.

    Returns:
        At most `num_examples` rows, in source order. JSONL rows are returned
        as parsed (no schema validation happens here).
    """
    import os

    # Clamp first: a negative count must never turn into a "drop the last k"
    # slice (rows[:-1]) on the JSONL path.
    limit = max(0, num_examples)

    src = os.environ.get("SPEC_RL_DATASET")
    if src and src.endswith(".jsonl") and os.path.exists(src):
        # Explicit UTF-8 so prompts/tests with non-ASCII characters load the
        # same way regardless of the host locale.
        with open(src, encoding="utf-8") as f:
            rows = [json.loads(line) for line in f if line.strip()]
        return rows[:limit]

    # Imported lazily so the stdlib-only reward core stays importable without
    # the `datasets` package installed.
    from datasets import load_dataset

    # A SPEC_RL_DATASET value that is not an existing .jsonl file is treated as
    # a Hugging Face dataset id.
    dataset_id = src or os.environ.get("HUMANEVAL_DATASET", "openai/openai_humaneval")
    split = os.environ.get("SPEC_RL_DATASET_SPLIT", "test")
    ds = load_dataset(dataset_id, split=split)
    return [dict(ds[i]) for i in range(min(limit, len(ds)))]
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# ---------------------------------------------------------------------------
|
| 107 |
+
# Reward core — execute the candidate completion against the unit tests in a
|
| 108 |
+
# fresh subprocess with a timeout. Pure stdlib, no verifiers/GPU needed, so it
|
| 109 |
+
# can be tested locally. `passes` returns True iff all tests pass within the budget.
|
| 110 |
+
# ---------------------------------------------------------------------------
|
| 111 |
+
def _build_program(problem: dict[str, Any], completion: str) -> str:
    """Join prompt, candidate body, test source and the final ``check`` call.

    The result is the full text of a standalone script: the problem's
    signature+docstring, the model's completion, a newline, the test module,
    and a trailing ``check(<entry_point>)`` invocation.
    """
    header = problem["prompt"]
    tests = problem["test"]
    entry = problem["entry_point"]
    return "".join((header, completion, "\n", tests, f"\ncheck({entry})\n"))
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def passes(problem: dict[str, Any], completion: str, timeout_s: int = EXEC_TIMEOUT_S) -> bool:
    """True iff `completion` makes the problem's unit tests pass.

    Runs the assembled program in a separate `python` subprocess so a hang,
    crash, or `sys.exit` in model-generated code cannot take down the rollout
    worker. A non-zero exit code, a raised exception, or a timeout all count as
    a failure (reward 0.0).

    Args:
        problem: Mapping with ``prompt``, ``test`` and ``entry_point``.
        completion: Candidate function body produced by the model.
        timeout_s: Wall-clock budget for the subprocess, in seconds.

    Returns:
        True only when the subprocess exits with status 0 inside the budget.
    """
    program = _build_program(problem, completion)
    with tempfile.TemporaryDirectory() as tmp:
        prog_path = Path(tmp) / "candidate.py"
        try:
            # Python reads source files as UTF-8 by default, so write UTF-8
            # explicitly instead of whatever the host locale happens to be.
            prog_path.write_text(program, encoding="utf-8")
        except UnicodeEncodeError:
            # Text that cannot be encoded (e.g. lone surrogates) cannot be a
            # valid source file either: grade it as a failure, don't crash.
            return False
        try:
            result = subprocess.run(
                [sys.executable, str(prog_path)],
                # Only the exit status matters here. Discarding the streams
                # avoids buffering unbounded candidate output in memory and
                # avoids decode errors on arbitrary bytes; a closed stdin keeps
                # the candidate from reading the worker's stdin.
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=timeout_s,
                cwd=tmp,
            )
        except subprocess.TimeoutExpired:
            return False
    return result.returncode == 0
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class _AssertCounter(ast.NodeTransformer):
|
| 148 |
+
"""Rewrite each ``assert`` so a failure is COUNTED, not fatal.
|
| 149 |
+
|
| 150 |
+
``assert <test>`` becomes, roughly::
|
| 151 |
+
|
| 152 |
+
try: __ok = bool(<test>)
|
| 153 |
+
except BaseException: __ok = False
|
| 154 |
+
__tally['total'] += 1
|
| 155 |
+
if __ok: __tally['passed'] += 1
|
| 156 |
+
|
| 157 |
+
So every assertion that executes (including inside a ``for`` loop over many
|
| 158 |
+
input/output pairs) contributes one test to the denominator, and the
|
| 159 |
+
numerator is how many held — turning HumanEval's single all-or-nothing
|
| 160 |
+
``check()`` into a fractional pass rate.
|
| 161 |
+
"""
|
| 162 |
+
|
| 163 |
+
def visit_Assert(self, node: ast.Assert):
|
| 164 |
+
try_node = ast.Try(
|
| 165 |
+
body=[ast.Assign(
|
| 166 |
+
targets=[ast.Name(id="__ok", ctx=ast.Store())],
|
| 167 |
+
value=ast.Call(func=ast.Name(id="bool", ctx=ast.Load()),
|
| 168 |
+
args=[node.test], keywords=[]),
|
| 169 |
+
)],
|
| 170 |
+
handlers=[ast.ExceptHandler(
|
| 171 |
+
type=ast.Name(id="BaseException", ctx=ast.Load()),
|
| 172 |
+
name=None,
|
| 173 |
+
body=[ast.Assign(
|
| 174 |
+
targets=[ast.Name(id="__ok", ctx=ast.Store())],
|
| 175 |
+
value=ast.Constant(value=False))],
|
| 176 |
+
)],
|
| 177 |
+
orelse=[], finalbody=[],
|
| 178 |
+
)
|
| 179 |
+
incr_total = ast.parse("__tally['total'] += 1").body[0]
|
| 180 |
+
incr_pass = ast.parse("if __ok:\n __tally['passed'] += 1").body[0]
|
| 181 |
+
out = [try_node, incr_total, incr_pass]
|
| 182 |
+
for n in out:
|
| 183 |
+
ast.copy_location(n, node)
|
| 184 |
+
ast.fix_missing_locations(n)
|
| 185 |
+
return out
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def fraction_passing(problem: dict[str, Any], completion: str,
                     timeout_s: int = EXEC_TIMEOUT_S) -> float:
    """Fraction of the problem's unit-test assertions the completion passes.

    Returns a value in [0.0, 1.0]: 1.0 = all assertions pass, 0.5 = half, 0.0 =
    none (or the code didn't even run). This is the dense RL TRAINING reward; the
    reported pass@1 EVAL stays binary (evals/humaneval_subset.py). Reward is the
    learning signal, eval is the scoreboard — a dense reward avoids GRPO's
    all-zero-group advantage collapse on hard prompts (every rollout failing a
    hard problem otherwise yields a zero-variance group with no gradient).

    Mechanism: instrument the test's ``assert``s (via _AssertCounter) so each is
    counted instead of aborting on the first failure, run the assembled program
    in a timed subprocess, and read back passed/total. Falls back to the binary
    ``passes()`` result if the test can't be parsed or unparsed.

    Hardening relative to a plain "first tally line wins" parse:

    * The tally line carries a per-run random marker and the LAST matching
      line is used, so text printed by the candidate itself is not mistaken
      for the harness's own tally. This raises the bar; it is not a sandbox.
    * An exception that escapes ``check()`` outside an instrumented assert is
      recorded and counted as one failed test, so a crashed check can never
      score 1.0 (including when the test exposes no assertions at all).

    Args:
        problem: Mapping with ``prompt``, ``test`` and ``entry_point``.
        completion: Candidate function body produced by the model.
        timeout_s: Wall-clock budget for the subprocess, in seconds.

    Returns:
        ``passed / total`` clamped to [0.0, 1.0]; 0.0 on timeout, on an
        unreadable tally, or when no tally line was printed.
    """
    import secrets

    try:
        tree = ast.parse(problem["test"])
    except SyntaxError:
        return 1.0 if passes(problem, completion, timeout_s) else 0.0
    tree = _AssertCounter().visit(tree)
    ast.fix_missing_locations(tree)
    try:
        instrumented_test = ast.unparse(tree)
    except Exception:  # pragma: no cover - ast.unparse needs py>=3.9
        return 1.0 if passes(problem, completion, timeout_s) else 0.0

    # Unpredictable marker: the candidate cannot hard-code the tally prefix.
    marker = f"__FRAC__{secrets.token_hex(8)}__"
    program = (
        "__tally = {'passed': 0, 'total': 0, 'aborted': False}\n"
        + problem["prompt"] + completion + "\n"
        + instrumented_test + "\n"
        + "try:\n"
        + f"    check({problem['entry_point']})\n"
        + "except BaseException:\n"
        # Remember that check() died instead of silently swallowing it.
        + "    __tally['aborted'] = True\n"
        + "import json as __json\n"
        + f"print({marker!r} + __json.dumps(__tally))\n"
    )
    with tempfile.TemporaryDirectory() as tmp:
        prog_path = Path(tmp) / "candidate.py"
        try:
            # Python reads source files as UTF-8 by default; do not depend on
            # the host locale.
            prog_path.write_text(program, encoding="utf-8")
        except UnicodeEncodeError:
            # Un-encodable text cannot be a valid source file: nothing passed.
            return 0.0
        try:
            result = subprocess.run(
                [sys.executable, str(prog_path)],
                stdin=subprocess.DEVNULL,
                capture_output=True,
                # The tally line is pure ASCII; replacing undecodable bytes
                # keeps arbitrary candidate output from raising here.
                encoding="utf-8",
                errors="replace",
                timeout=timeout_s,
                cwd=tmp,
            )
        except subprocess.TimeoutExpired:
            return 0.0

    # The harness prints its tally last, so scan from the end.
    tally_line = next(
        (ln for ln in reversed(result.stdout.splitlines()) if ln.startswith(marker)),
        None,
    )
    if tally_line is None:
        # No tally line => the program crashed before instrumentation ran (e.g.
        # a syntax error in the completion) => nothing passed.
        return 0.0
    try:
        tally = json.loads(tally_line[len(marker):])
        total = int(tally.get("total", 0))
        passed = int(tally.get("passed", 0))
        aborted = bool(tally.get("aborted", False))
    except Exception:
        return 0.0
    if aborted:
        # check() raised outside an assert: the remaining tests never ran, so
        # charge one failure rather than scoring only the asserts reached.
        total += 1
    if total == 0:  # no assertions found -> fall back to all-or-nothing
        return 1.0 if result.returncode == 0 else 0.0
    return max(0.0, min(1.0, passed / total))
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def _extract_completion(state: Any) -> str:
    """Return the assistant's text from a rollout state, or "" if none is found.

    The completion is looked up as ``state.get("completion")`` when the state
    is a dict or offers ``.get``, then as a ``completion`` attribute. A string
    is returned unchanged. For a message list, the content of the last
    ``assistant`` message wins; without any assistant message the last item is
    used (its ``content`` if it is a dict, otherwise ``str(item)``).
    """
    found = None
    if isinstance(state, dict):
        found = state.get("completion")
    elif hasattr(state, "get"):
        try:
            found = state.get("completion")
        except Exception:
            found = None
    if found is None:
        found = getattr(state, "completion", None)

    if isinstance(found, str):
        return found
    if not isinstance(found, list) or not found:
        return ""

    assistant = next(
        (
            msg
            for msg in reversed(found)
            if isinstance(msg, dict) and msg.get("role") == "assistant"
        ),
        None,
    )
    if assistant is not None:
        return str(assistant.get("content") or "")

    # No role information: fall back to whatever came last.
    tail = found[-1]
    if isinstance(tail, dict):
        return str(tail.get("content") or "")
    return str(tail)
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
# ---------------------------------------------------------------------------
|
| 284 |
+
# System prompt — module constant so the offline manual loop (eval_local.py),
|
| 285 |
+
# the classic SingleTurnEnv path, and the cookbook Taskset path all send the
|
| 286 |
+
# exact same instruction.
|
| 287 |
+
# ---------------------------------------------------------------------------
|
| 288 |
+
SYSTEM_PROMPT = (
|
| 289 |
+
"You are an expert Python programmer. You will be given a function "
|
| 290 |
+
"signature and docstring. Complete the function body only. Do not repeat "
|
| 291 |
+
"the signature, do not add explanations, and do not wrap the code in "
|
| 292 |
+
"markdown fences. Output only the indented function body."
|
| 293 |
+
)
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _problem_from(row: Any) -> dict[str, Any]:
    """Rebuild the gradeable problem from a task/info row (never the model output).

    Fields are read from ``row["info"]`` when that entry exists and is truthy,
    otherwise from the row itself; ``code_prompt`` is exposed as ``prompt``.
    """
    nested = row.get("info") if hasattr(row, "get") else None
    fields = nested if nested else row
    renames = (("prompt", "code_prompt"), ("test", "test"), ("entry_point", "entry_point"))
    return {target: fields[source] for target, source in renames}
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def _score_completion(row: Any, completion_text: str) -> float:
    """Shared reward body: cut the text at each STOP sequence, then grade it.

    The STOP sequences are applied in order, each one truncating whatever the
    previous ones left; the remainder is scored with ``fraction_passing``.
    """
    trimmed = completion_text or ""
    for marker in STOP:
        cut = trimmed.find(marker)
        trimmed = trimmed if cut == -1 else trimmed[:cut]
    problem = _problem_from(row)
    return fraction_passing(problem, trimmed)
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
def _task_rows(num_examples: int) -> list[dict[str, Any]]:
    """Build HumanEval-style rows that carry every field the reward needs.

    Each row exposes the grading fields twice — nested under ``info`` AND
    flattened at the top level — so both verifiers API shapes can read them.
    A problem without a ``task_id`` gets ``example_<index>``.
    """

    def _as_row(index: int, prob: dict[str, Any]) -> dict[str, Any]:
        info = {
            "task_id": prob.get("task_id", f"example_{index}"),
            "code_prompt": prob["prompt"],
            "test": prob["test"],
            "entry_point": prob["entry_point"],
        }
        return {
            "prompt": prob["prompt"],
            "answer": prob["entry_point"],
            "info": info,
            **info,
        }

    return [_as_row(i, prob) for i, prob in enumerate(load_problems(num_examples))]
|
| 330 |
+
|
| 331 |
+
|
| 332 |
+
# ---------------------------------------------------------------------------
|
| 333 |
+
# Environment factory — supports BOTH verifiers API shapes, because this
|
| 334 |
+
# workspace ships two references that disagree: the classic
|
| 335 |
+
# vf.SingleTurnEnv/vf.Rubric API (AGENTS.md) and the Prime lab-cookbook
|
| 336 |
+
# vf.Taskset/vf.Env/vf.EnvConfig API (reference/lab-cookbook/.../reverse_text).
|
| 337 |
+
# The cookbook Taskset is registered only when the installed verifiers exposes
|
| 338 |
+
# vf.Taskset; otherwise load_environment() falls back to the classic builder.
|
| 339 |
+
# Both paths share the same stdlib reward core (fraction_passing), so the reward
|
| 340 |
+
# is identical either way. [verify at onboarding] confirm which API the venue's
|
| 341 |
+
# installed verifiers actually uses, and adjust if a symbol is missing.
|
| 342 |
+
# ---------------------------------------------------------------------------
|
| 343 |
+
if vf is not None and hasattr(vf, "Taskset"):

    class SpecRLTasksetConfig(vf.TasksetConfig):  # type: ignore[misc]
        # NOTE(review): dataset_name / dataset_split are declared here but
        # load_tasks() below only reads num_examples; the actual source is
        # chosen by load_problems() from environment variables.
        dataset_name: str = "openai/openai_humaneval"
        dataset_split: str = "test"
        num_examples: int = 164  # full HumanEval pool; the harness samples -n from it

    class SpecRLTaskset(vf.Taskset[SpecRLTasksetConfig]):  # type: ignore[misc]
        """Cookbook-API taskset: HumanEval-style rows plus the dense code reward."""

        def load_tasks(self):  # -> vf.Tasks
            """Return the task rows as a `datasets.Dataset`."""
            from datasets import Dataset

            return Dataset.from_list(_task_rows(self.config.num_examples))

        def load_system_prompt(self):  # -> vf.SystemPrompt
            """Return the shared module-level system prompt."""
            return SYSTEM_PROMPT

        @vf.reward(weight=1.0)
        async def code_reward(self, task, state) -> float:
            """Dense fractional unit-test pass rate in [0,1] — the RL training reward."""
            import asyncio

            # Grading launches a subprocess and waits on it (up to
            # EXEC_TIMEOUT_S). Run it in a worker thread so this coroutine
            # does not block the event loop while the candidate executes.
            return await asyncio.to_thread(
                _score_completion, task, _extract_completion(state)
            )

    def load_taskset(config):  # -> vf.Taskset
        """Build the taskset from its config (cookbook entry point)."""
        return SpecRLTaskset(config=config)
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
def _build_singleturn_env(num_examples: int):
    """Classic verifiers path: a vf.SingleTurnEnv whose vf.Rubric scores the
    fractional unit-test reward. Used when the installed verifiers predates the
    cookbook Taskset/Env API.

    Args:
        num_examples: Number of problems to load via ``_task_rows``.

    Returns:
        A ``vf.SingleTurnEnv`` over chat-style prompts (system + user message),
        graded by a single ``code_reward`` function.
    """
    # Same import the cookbook path's load_tasks() uses. `datasets` is a
    # declared dependency, so this does not rely on `verifiers` re-exporting
    # `Dataset` (NOTE(review): whether it does was not confirmed).
    from datasets import Dataset

    dataset_rows = [
        {
            "prompt": [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": row["code_prompt"]},
            ],
            "answer": row["entry_point"],
            "info": row["info"],
        }
        for row in _task_rows(num_examples)
    ]
    dataset = Dataset.from_list(dataset_rows)

    @vf.reward
    def code_reward(completion, info, **kwargs) -> float:
        # `completion` may be a plain string or a chat message list; normalise
        # it to text before grading against the problem stored in `info`.
        text = completion if isinstance(completion, str) else _extract_completion(
            {"completion": completion}
        )
        return _score_completion({"info": info}, text)

    return vf.SingleTurnEnv(dataset=dataset, system_prompt=SYSTEM_PROMPT,
                            rubric=vf.Rubric(funcs=[code_reward]))
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
def load_environment(config: Any = None, *, num_examples: int = 20):
    """Build the spec_rl RL environment (dual-signature on purpose).

    Two verifiers APIs ship in this workspace, so this supports both:
      * Cookbook (Prime lab-cookbook): ``load_environment(config: vf.EnvConfig)
        -> vf.Env`` — used by ``prime eval run`` / ``prime train``.
      * Classic: ``load_environment(num_examples=N) -> vf.SingleTurnEnv`` —
        used by eval_local.py's verifiers path.
    Both share the same stdlib reward core, so rewards are identical. The reward
    logic (spec_rl.fraction_passing / passes) is importable and testable WITHOUT
    verifiers; the hard dependency is enforced only here.

    A bare integer passed positionally (``load_environment(5)``) is treated as
    ``num_examples`` rather than as a config object, so it is neither silently
    ignored on the classic path nor dereferenced as ``config.taskset``.

    [verify at onboarding] confirm the installed verifiers exposes the symbols
    the active path uses (vf.Taskset/EnvConfig/Env, or vf.SingleTurnEnv/Rubric).

    Raises:
        ImportError: If the ``verifiers`` package is not installed.
    """
    if vf is None:
        raise ImportError(
            "The 'verifiers' package is required to build the spec_rl environment. "
            "Install it with `prime env install spec_rl` (or `pip install verifiers`). "
            "The reward logic (spec_rl.fraction_passing) is importable without it."
        )
    # An int in the config slot is a classic-style example count (bool excluded,
    # since True/False are ints too and are never a meaningful count here).
    if isinstance(config, int) and not isinstance(config, bool):
        num_examples = config
        config = None
    if config is not None and hasattr(vf, "Taskset"):
        return vf.Env(taskset=load_taskset(config=config.taskset))
    # Either no config was given or the installed verifiers lacks the cookbook
    # API; in the latter case the config object is not consulted.
    return _build_singleturn_env(num_examples)
|
| 419 |
+
|
| 420 |
+
|
| 421 |
+
# ---------------------------------------------------------------------------
|
| 422 |
+
# Local smoke test (no verifiers, no GPU, no network): proves the reward core
|
| 423 |
+
# distinguishes a passing completion from a failing one. Run:
|
| 424 |
+
# python spec_rl.py
|
| 425 |
+
# ---------------------------------------------------------------------------
|
| 426 |
+
def _selftest() -> None:
    """Exercise the reward core on a toy ``add`` problem and assert the results.

    Covers a fully passing, a fully failing, a half-passing and a non-terminating
    completion, plus the binary ``passes()`` check on the good one. Prints the
    report as JSON and raises AssertionError on any mismatch.
    """
    toy = {
        "prompt": "def add(a, b):\n    \"\"\"Return a + b.\"\"\"\n",
        "test": (
            "def check(candidate):\n"
            "    assert candidate(2, 3) == 5\n"
            "    assert candidate(-1, 1) == 0\n"
        ),
        "entry_point": "add",
    }
    good = "    return a + b\n"
    bad = "    return a - b\n"
    partial = "    return a + b if a > 0 else a - b\n"  # passes 1 of 2 asserts
    # The loop body must be nested under `while`; otherwise the candidate is a
    # syntax error and the timeout path is never actually exercised.
    loops_forever = "    while True:\n        pass\n"
    report = {
        "passing_fraction": fraction_passing(toy, good),
        "failing_fraction": fraction_passing(toy, bad),
        "partial_fraction": fraction_passing(toy, partial),
        "timeout_fraction": fraction_passing(toy, loops_forever, timeout_s=2),
        "binary_passes_good": passes(toy, good),
        "verifiers_available": vf is not None,
    }
    print(json.dumps(report, indent=2))
    assert report["passing_fraction"] == 1.0, "all asserts pass => 1.0"
    assert report["failing_fraction"] == 0.0, "no asserts pass => 0.0"
    assert report["partial_fraction"] == 0.5, "1 of 2 asserts => 0.5 (fractional)"
    assert report["timeout_fraction"] == 0.0, "timeout => 0.0"
    assert report["binary_passes_good"] is True, "binary passes() agrees on the good completion"
    print("selftest OK")
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
if __name__ == "__main__":
|
| 453 |
+
_selftest()
|