Pete Dunn
router stabilization pass
60aec0a
#!/usr/bin/env bash
#
# Release-gate configuration.
#
# Apart from ROOT_DIR, BACKEND_DIR and ROUTER_CANARY_ARTIFACTS_REFRESHED, each
# setting below can be overridden from the environment; an unset or empty value
# falls back to the default given here.
set -euo pipefail

# Repository root: two directories above the directory containing this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BACKEND_DIR="${ROOT_DIR}/backend"

# Eval inputs and report locations.
: "${OUT_DIR:=${ROOT_DIR}/docs/evals/release_gate}"
: "${CASES_PATH:=${ROOT_DIR}/docs/evals/unified_kb_eval150_cases.json}"
: "${OUT_JSON:=${OUT_DIR}/unified_kb_eval150_release_gate.json}"
: "${OUT_MD:=${OUT_DIR}/unified_kb_eval150_release_gate.md}"
: "${TREND_FILE:=${ROOT_DIR}/docs/evals/unified_kb_eval150_trend.json}"
: "${BASELINE_JSON:=${ROOT_DIR}/docs/evals/release_gate_baseline.json}"

# Scoring thresholds and latency budgets (milliseconds).
: "${PASS_THRESHOLD:=75}"
: "${FACT_THRESHOLD:=50}"
: "${P95_BUDGET_MS:=10000}"
: "${P99_BUDGET_MS:=15000}"

# Semantic grader settings.
: "${SEMANTIC_TIMEOUT_S:=8}"
: "${SEMANTIC_POLICY:=hard_edge_or_fail}"

# Optional case limit; empty means no --limit flag is passed to the eval.
: "${EVAL_LIMIT:=}"

# Per-gate output artifacts.
: "${LATENCY_SUMMARY_JSON:=${OUT_DIR}/latency_summary_release_gate.json}"
: "${PREFLIGHT_JSON:=${OUT_DIR}/preflight_release_gate.json}"
: "${DEP_POLICY_JSON:=${OUT_DIR}/dependency_policy_release_gate.json}"

# Router canary A/B inputs and outputs.
: "${ROUTER_CANARY_SHARD_DIR:=${ROOT_DIR}/docs/testing/router_canary_ab_shards}"
: "${ROUTER_CANARY_GATE_JSON:=${OUT_DIR}/router_canary_ab_release_gate.json}"
: "${ROUTER_CANARY_GATE_MD:=${OUT_DIR}/router_canary_ab_release_gate.md}"

# Set to 1 once the canary artifacts have been refreshed during this run.
ROUTER_CANARY_ARTIFACTS_REFRESHED=0
#######################################
# Regenerate the router canary A/B gate artifacts, at most once per run.
#
# The gate script is always launched from BACKEND_DIR (inside a subshell, so
# the caller's working directory is untouched). This matters because the
# function is also installed as the EXIT trap and may therefore fire before
# the main script has changed into BACKEND_DIR, where the relative
# scripts/ path would otherwise not resolve.
#
# Globals:
#   ROUTER_CANARY_ARTIFACTS_REFRESHED (read/written) - once-only guard
#   ROUTER_CANARY_SHARD_DIR (read) - directory probed for scored shard CSVs
#   ROUTER_CANARY_GATE_JSON, ROUTER_CANARY_GATE_MD (read) - output paths
#   BACKEND_DIR (read) - working directory for the gate script
# Arguments: none
# Outputs:   progress / SKIP lines on stdout
# Returns:   0 when already refreshed or when no shards exist; otherwise the
#            exit status of the gate script (or of the failed cd).
#######################################
refresh_router_canary_artifacts() {
  # Once-only guard: both the explicit gate step and the EXIT trap call this.
  if [[ "${ROUTER_CANARY_ARTIFACTS_REFRESHED:-0}" == "1" ]]; then
    return 0
  fi

  echo "== Router canary A/B artifact-state refresh =="
  local refresh_status=0
  if compgen -G "${ROUTER_CANARY_SHARD_DIR}/router_canary_ab_eval_shard_*_scored.csv" > /dev/null; then
    # `|| refresh_status=$?` captures the failure without tripping `set -e`.
    (
      cd "${BACKEND_DIR}" &&
        python3 scripts/router_canary_ab_release_gate.py \
          --shard-dir "${ROUTER_CANARY_SHARD_DIR}" \
          --out-json "${ROUTER_CANARY_GATE_JSON}" \
          --out-md "${ROUTER_CANARY_GATE_MD}"
    ) || refresh_status=$?
  else
    echo "ROUTER_CANARY_GATE: SKIP (no scored shard CSVs found under ${ROUTER_CANARY_SHARD_DIR})"
  fi

  # Mark as done even on failure so the EXIT trap does not run the gate again.
  ROUTER_CANARY_ARTIFACTS_REFRESHED=1
  return "${refresh_status}"
}
# Run the canary artifact refresh on every exit path, success or failure.
trap 'refresh_router_canary_artifacts' EXIT

mkdir -p "${OUT_DIR}"

# Optional local env file: while allexport is on, every variable it assigns is
# exported so the Python gates launched below can see it.
if [ -f "${ROOT_DIR}/.env.codex" ]; then
  set -o allexport
  # shellcheck source=/dev/null
  . "${ROOT_DIR}/.env.codex"
  set +o allexport
fi

# All gate scripts below are addressed relative to the backend directory.
cd "${BACKEND_DIR}"
# Gate 1: environment/config preflight; writes its report to PREFLIGHT_JSON.
echo "== Preflight env/config gate =="
python3 scripts/preflight_env_check.py \
  --require-openai \
  --out "${PREFLIGHT_JSON}"

# Gate 2: dependency policy; writes its report to DEP_POLICY_JSON.
echo "== Dependency policy gate =="
python3 scripts/check_dependency_policy.py \
  --out "${DEP_POLICY_JSON}"
# Gate 3: auth settings. Exits 2 when auth is required but not enabled.
echo "== Auth/env gate =="
python3 - <<'PY'
import os
import sys

from app.auth import load_auth_settings

settings = load_auth_settings()

# Auth that is required but not enabled is a hard failure.
auth_misconfigured = settings.required and not settings.enabled
if auth_misconfigured:
    reasons = list(settings.config_details or [])
    details = " ".join(reasons) or "missing/invalid auth settings"
    print(f"AUTH_GATE: FAIL ({details})")
    sys.exit(2)

print("AUTH_GATE: PASS")
PY
# Gate 4: byte-compile the core entry points; a failing python3 aborts the
# script under `set -e` before the PASS line is printed.
echo "== Startup/compile gate =="
python3 -m py_compile \
  app/main.py \
  app/knowledgebase/core.py \
  app/router_rag/core.py \
  app/routers/router_core.py
echo "COMPILE_GATE: PASS"

# Gate 5: router RAG smoke run; its JSON output is kept as a gate artifact.
echo "== Router RAG smoke gate =="
python3 scripts/router_rag_smoke.py --json \
  > "${OUT_DIR}/router_rag_smoke_release_gate.json"
echo "SMOKE_GATE: PASS"
# Refuse to run the eval without a real OpenAI key. The key is upper-cased
# first so placeholder detection is case-insensitive.
KEY="${OPENAI_API_KEY:-}"
KEY_UPPER="$(printf '%s' "${KEY}" | tr '[:lower:]' '[:upper:]')"

openai_key_usable=1
if [[ -z "${KEY}" ]]; then
  openai_key_usable=0
fi
case "${KEY_UPPER}" in
  YOUR_KEY* | "<YOUR_OPENAI_API_KEY>" | "REPLACE_ME")
    openai_key_usable=0
    ;;
esac

if [[ "${openai_key_usable}" == "0" ]]; then
  echo "ERROR: OPENAI_API_KEY missing or placeholder. Release gate requires OpenAI-enabled eval."
  exit 2
fi
# Gate 6: unified KB eval. The command line is assembled as an array so that
# every path and setting is passed as exactly one argument.
echo "== Unified KB eval150 gate =="
eval_cmd=(python3 scripts/unified_kb_eval150.py --with-openai --semantic-grader)
eval_cmd+=(--semantic-policy "${SEMANTIC_POLICY}")
eval_cmd+=(--semantic-timeout-s "${SEMANTIC_TIMEOUT_S}")
eval_cmd+=(--pass-threshold "${PASS_THRESHOLD}")
eval_cmd+=(--fact-threshold "${FACT_THRESHOLD}")
eval_cmd+=(--strict-exit)
eval_cmd+=(--cases "${CASES_PATH}")
eval_cmd+=(--out "${OUT_JSON}")
eval_cmd+=(--md-out "${OUT_MD}")
eval_cmd+=(--trend-file "${TREND_FILE}")
eval_cmd+=(--print-fails)

# --limit is only passed when EVAL_LIMIT is non-empty.
[[ -z "${EVAL_LIMIT}" ]] || eval_cmd+=(--limit "${EVAL_LIMIT}")

"${eval_cmd[@]}"
# Gate 7: latency budgets, evaluated against the eval JSON written above.
echo "== Latency gate =="
latency_cmd=(
  python3 scripts/eval_latency_alert.py
  --eval-json "${OUT_JSON}"
  --p95-budget-ms "${P95_BUDGET_MS}"
  --p99-budget-ms "${P99_BUDGET_MS}"
  --out "${LATENCY_SUMMARY_JSON}"
)
"${latency_cmd[@]}"
# Gate 8: compare the eval result with the checked-in baseline thresholds.
# Exit codes from the embedded script: 5 = an input file is missing,
# 6 = at least one metric is outside its baseline bound.
echo "== Baseline regression gate =="
export RELEASE_GATE_OUT_JSON="${OUT_JSON}"
export RELEASE_GATE_BASELINE_JSON="${BASELINE_JSON}"
python3 - <<'PY'
import json
import os
import sys
from pathlib import Path


def path_from_env(var_name):
    """Return the whitespace-stripped value of an environment variable as a Path."""
    return Path(str(os.environ.get(var_name, "")).strip())


def load_json(path):
    """Parse a UTF-8 encoded JSON file."""
    return json.loads(path.read_text(encoding="utf-8"))


out_path = path_from_env("RELEASE_GATE_OUT_JSON")
baseline_path = path_from_env("RELEASE_GATE_BASELINE_JSON")

# Both inputs are mandatory; the eval output is checked first.
for label, path in (("eval output", out_path), ("baseline file", baseline_path)):
    if not path.exists():
        print(f"BASELINE_GATE: FAIL (missing {label}: {path})")
        sys.exit(5)

obj = load_json(out_path)
base = load_json(baseline_path)

# Observed metrics; a missing or null entry counts as zero.
failed = int(obj.get("failed", 0) or 0)
pass_rate = float(obj.get("pass_rate", 0.0) or 0.0)
avg_latency = float(obj.get("avg_latency_ms", 0.0) or 0.0)

# Baseline bounds, with built-in defaults for keys the baseline omits.
min_pass_rate = float(base.get("min_pass_rate", 90.0))
max_failed_cases = int(base.get("max_failed_cases", 15))
max_avg_latency_ms = float(base.get("max_avg_latency_ms", 10000.0))

errors = []
if pass_rate < min_pass_rate:
    errors.append(f"pass_rate {pass_rate:.2f}% < baseline min {min_pass_rate:.2f}%")
if failed > max_failed_cases:
    errors.append(f"failed_cases {failed} > baseline max {max_failed_cases}")
if avg_latency > max_avg_latency_ms:
    errors.append(f"avg_latency_ms {avg_latency:.2f} > baseline max {max_avg_latency_ms:.2f}")

if errors:
    print("BASELINE_GATE: FAIL")
    print("\n".join(f"- {e}" for e in errors))
    sys.exit(6)

print("BASELINE_GATE: PASS")
PY
# Gate 9: router canary A/B. refresh_router_canary_artifacts already probes
# for scored shard CSVs and prints the SKIP line itself, so it is called
# unconditionally rather than repeating that probe here. Calling it also marks
# the refresh as done, which keeps the EXIT trap from printing a second
# refresh/SKIP block after the final summary when no shards exist.
# A non-zero return aborts the script under `set -e`.
echo "== Router canary A/B release gate =="
refresh_router_canary_artifacts
# Final summary: blank separator line, overall verdict, then report locations.
printf '%s\n' \
  "" \
  "RELEASE_GATE: PASS" \
  "Eval JSON: ${OUT_JSON}" \
  "Eval MD: ${OUT_MD}"