#!/usr/bin/env bash
# parity_local.sh — full local dry-run of the benchmark + parity harness on the Mac.
# Starts two stub servers (baseline :8000, "dflash" :8001), waits until both are
# ready, runs measure.py against each (writing results/*.json) and the greedy
# parity check across both, then tears the stubs down. No CUDA / vLLM / Laguna.
set -euo pipefail
# Run from the repository root (the parent of this script's directory) so the
# relative paths below (.venv/, scripts/, bench/, evals/, results/) resolve.
cd "$(dirname "$0")/.."
PY=.venv/bin/python

# PIDs of the two stub servers; empty until the corresponding stub is started.
A='' B=''

# Stop whichever stubs were started and reap them, so the ports are released
# before this script returns. Best-effort by design: a stub that has already
# exited is not an error, hence the silenced kill/wait.
cleanup() {
  local pid
  for pid in "$A" "$B"; do
    [[ -n "$pid" ]] || continue
    kill "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true
  done
}
# Install the trap BEFORE launching anything so an early exit cannot orphan a stub.
trap cleanup EXIT

# Refuse to run if something is already listening on a stub port: our stub would
# fail to bind in the background and every later step would silently talk to
# the foreign server instead. (If this bash lacks /dev/tcp support the probe
# simply fails and the script proceeds without the check.)
for port in 8000 8001; do
  if (exec 3<>"/dev/tcp/127.0.0.1/${port}") 2>/dev/null; then
    printf '[parity_local] port %s is already in use; stop that server first\n' "$port" >&2
    exit 1
  fi
done

# Baseline stub on :8000, "dflash" stub (--spec) on :8001.
"$PY" scripts/stub_server.py --port 8000 &       A=$!
"$PY" scripts/stub_server.py --port 8001 --spec & B=$!

# Block until both stub ports accept TCP connections. The polling is done in
# Python rather than with a shell sleep loop: up to 100 probes per port,
# 0.05 s apart, aborting with a message if a port never opens.
"$PY" - <<'PY'
import socket
import sys
import time


def accepting(port):
    """Return True if a TCP connect to 127.0.0.1:port succeeds right now."""
    probe = socket.socket()
    try:
        return probe.connect_ex(("127.0.0.1", port)) == 0
    finally:
        probe.close()


for port in (8000, 8001):
    failures = 0
    while not accepting(port):
        failures += 1
        time.sleep(0.05)
        if failures == 100:
            sys.exit(f"stub on {port} never came up")
print("[parity_local] both stubs ready")
PY

mkdir -p results

# measure LABEL PORT — run bench/measure.py against the stub on PORT and write
# its output to results/LABEL.json.
measure() {
  "$PY" bench/measure.py --base-url "http://localhost:$2" --model laguna --label "$1" --n 5 --out "results/$1.json"
}

measure dflash   8001
measure baseline 8000

# Greedy parity check across both stubs (baseline as A, "dflash" as B).
"$PY" evals/humaneval_subset.py --parity \
  --base-url http://localhost:8000 \
  --base-url-b http://localhost:8001 \
  --model laguna --n 3

# Hand both result files to check_results.py; under `set -e` a non-zero exit
# from it aborts the script before the OK line is printed.
"$PY" scripts/check_results.py results/dflash.json results/baseline.json
printf '%s\n' "[parity_local] OK — results/ written, parity checked"