#!/usr/bin/env bash
# parity_local.sh — full local dry-run of the benchmark + parity harness on the Mac.
# Starts two stub servers (baseline :8000, "dflash" :8001), waits until both are
# ready, runs measure.py against each (writing results/*.json) and the greedy
# parity check across both, then tears the stubs down. No CUDA / vLLM / Laguna.
#
# Usage: run from anywhere; the script changes to the repository root (the parent
# of this script's directory) and expects a virtualenv at .venv/ there.
# Exit status: non-zero if the interpreter is missing, a stub port is already
# taken, a stub never becomes ready, or any harness step fails.
set -euo pipefail
cd "$(dirname "$0")/.."

PY=.venv/bin/python

if [[ ! -x "$PY" ]]; then
  printf '[parity_local] %s not found or not executable (create the venv first)\n' "$PY" >&2
  exit 1
fi

# PIDs of the background stubs. Initialised empty so the EXIT trap is safe under
# `set -u` even when it fires before either stub has been launched.
A=''
B=''

# Stop and reap whichever stubs were started. Runs on every exit path; the
# script's original exit status is preserved because nothing here calls `exit`.
cleanup() {
  local pid
  for pid in "$A" "$B"; do
    if [[ -n "$pid" ]]; then
      # Best-effort: the stub may already have exited on its own.
      kill "$pid" 2>/dev/null || true
    fi
  done
  for pid in "$A" "$B"; do
    if [[ -n "$pid" ]]; then
      # Reap the stub so its port is released before this script returns;
      # `wait` reports the stub's signal status, which is not an error here.
      wait "$pid" 2>/dev/null || true
    fi
  done
}
# Installed before anything is launched so no exit path can leak a stub.
trap cleanup EXIT

# Pre-flight: refuse to run if something already accepts connections on a stub
# port. Otherwise the readiness probe below would succeed against that foreign
# process (our own stub would most likely fail to bind) and the harness would
# silently measure the wrong server.
"$PY" - <<'PY'
import socket, sys

for port in (8000, 8001):
    with socket.socket() as s:
        if s.connect_ex(("127.0.0.1", port)) == 0:
            sys.exit(f"[parity_local] port {port} is already in use")
PY

"$PY" scripts/stub_server.py --port 8000 &
A=$!
"$PY" scripts/stub_server.py --port 8001 --spec &
B=$!

# Wait for both ports to accept connections (no shell sleep — poll in python).
# Up to 100 attempts, 0.05 s apart, per port; the for/else fires only when the
# loop finished without ever connecting.
"$PY" - <<'PY'
import socket, time, sys

for port in (8000, 8001):
    for _ in range(100):
        with socket.socket() as s:
            if s.connect_ex(("127.0.0.1", port)) == 0:
                break
        time.sleep(0.05)
    else:
        sys.exit(f"stub on {port} never came up")
print("[parity_local] both stubs ready")
PY

mkdir -p results

"$PY" bench/measure.py --base-url http://localhost:8001 --model laguna \
  --label dflash --n 5 --out results/dflash.json
"$PY" bench/measure.py --base-url http://localhost:8000 --model laguna \
  --label baseline --n 5 --out results/baseline.json
"$PY" evals/humaneval_subset.py --parity --base-url http://localhost:8000 \
  --base-url-b http://localhost:8001 --model laguna --n 3
"$PY" scripts/check_results.py results/dflash.json results/baseline.json

echo "[parity_local] OK — results/ written, parity checked"