#!/usr/bin/env bash
# parity_local.sh — full local dry-run of the benchmark + parity harness on the Mac.
#
# Starts two stub servers (baseline :8000, "dflash" :8001), waits until both are
# ready, runs measure.py against each (writing results/*.json) and the greedy
# parity check across both, then tears the stubs down. No CUDA / vLLM / Laguna.
#
# Usage: run from anywhere; the script changes to the parent directory of the
#        directory containing this file and expects .venv/bin/python there.
# Exit:  non-zero as soon as any step fails (set -e); stubs are always stopped.
set -euo pipefail

cd "$(dirname "$0")/.."

PY=.venv/bin/python

# PIDs of the two stub servers; empty until each one has been spawned so the
# cleanup handler is safe to run at any point (also under `set -u`).
A=''
B=''

# Stop whichever stubs were started and reap them so their ports are released
# before this script returns. Best-effort: a stub may already have exited.
cleanup() {
  local pid
  for pid in "$A" "$B"; do
    [[ -n "$pid" ]] || continue
    kill "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true
  done
}
# Install the handler BEFORE spawning anything so no exit path leaks a server.
trap cleanup EXIT

"$PY" scripts/stub_server.py --port 8000 &
A=$!
"$PY" scripts/stub_server.py --port 8001 --spec &
B=$!

# Wait for both ports to accept connections (no shell sleep — poll in python).
# Each port gets up to 100 attempts, 0.05 s apart; the for/else aborts only
# when every attempt for a port failed.
"$PY" - <<'PY'
import socket, time, sys
for port in (8000, 8001):
    for _ in range(100):
        with socket.socket() as s:
            if s.connect_ex(("127.0.0.1", port)) == 0:
                break
        time.sleep(0.05)
    else:
        sys.exit(f"stub on {port} never came up")
print("[parity_local] both stubs ready")
PY

# An accepting port only proves that *something* is listening. If one of our
# stubs died at startup (e.g. the port was already taken by another process),
# fail here instead of benchmarking an unrelated server.
for pid in "$A" "$B"; do
  if ! kill -0 "$pid" 2>/dev/null; then
    echo "[parity_local] stub server (pid $pid) exited during startup" >&2
    exit 1
  fi
done

mkdir -p results

"$PY" bench/measure.py \
  --base-url http://localhost:8001 --model laguna \
  --label dflash --n 5 --out results/dflash.json
"$PY" bench/measure.py \
  --base-url http://localhost:8000 --model laguna \
  --label baseline --n 5 --out results/baseline.json

"$PY" evals/humaneval_subset.py --parity \
  --base-url http://localhost:8000 --base-url-b http://localhost:8001 \
  --model laguna --n 3

"$PY" scripts/check_results.py results/dflash.json results/baseline.json

echo "[parity_local] OK — results/ written, parity checked"