#!/usr/bin/env bash
#
# Release gate for the unified KB eval150 suite.
#
# Runs, in order, from the backend directory:
#   1. preflight env/config check
#   2. dependency policy check
#   3. auth settings check
#   4. py_compile of the core entry modules
#   5. router RAG smoke script
#   6. unified KB eval150 (OpenAI-enabled, semantic grader, strict exit)
#   7. latency budget check against the eval output
#   8. baseline regression check against the baseline JSON
#   9. router canary A/B gate (skipped when no scored shard CSVs exist)
#
# Any failing step aborts the script (set -e). On exit, the router canary
# artifacts are refreshed once on a best-effort basis if that has not
# already happened during the run.
#
# Every path and threshold below can be overridden through the environment
# variable of the same name.
set -euo pipefail

ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BACKEND_DIR="${ROOT_DIR}/backend"

# Output locations and input fixtures.
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/docs/evals/release_gate}"
CASES_PATH="${CASES_PATH:-${ROOT_DIR}/docs/evals/unified_kb_eval150_cases.json}"
OUT_JSON="${OUT_JSON:-${OUT_DIR}/unified_kb_eval150_release_gate.json}"
OUT_MD="${OUT_MD:-${OUT_DIR}/unified_kb_eval150_release_gate.md}"
TREND_FILE="${TREND_FILE:-${ROOT_DIR}/docs/evals/unified_kb_eval150_trend.json}"
BASELINE_JSON="${BASELINE_JSON:-${ROOT_DIR}/docs/evals/release_gate_baseline.json}"

# Eval thresholds and budgets, passed through to the eval/latency scripts.
PASS_THRESHOLD="${PASS_THRESHOLD:-75}"
FACT_THRESHOLD="${FACT_THRESHOLD:-50}"
P95_BUDGET_MS="${P95_BUDGET_MS:-10000}"
P99_BUDGET_MS="${P99_BUDGET_MS:-15000}"
SEMANTIC_TIMEOUT_S="${SEMANTIC_TIMEOUT_S:-8}"
SEMANTIC_POLICY="${SEMANTIC_POLICY:-hard_edge_or_fail}"
# Optional: when non-empty, forwarded to the eval script as --limit.
EVAL_LIMIT="${EVAL_LIMIT:-}"

# Per-gate report files.
LATENCY_SUMMARY_JSON="${LATENCY_SUMMARY_JSON:-${OUT_DIR}/latency_summary_release_gate.json}"
PREFLIGHT_JSON="${PREFLIGHT_JSON:-${OUT_DIR}/preflight_release_gate.json}"
DEP_POLICY_JSON="${DEP_POLICY_JSON:-${OUT_DIR}/dependency_policy_release_gate.json}"

# Router canary A/B inputs and outputs.
ROUTER_CANARY_SHARD_DIR="${ROUTER_CANARY_SHARD_DIR:-${ROOT_DIR}/docs/testing/router_canary_ab_shards}"
ROUTER_CANARY_GATE_JSON="${ROUTER_CANARY_GATE_JSON:-${OUT_DIR}/router_canary_ab_release_gate.json}"
ROUTER_CANARY_GATE_MD="${ROUTER_CANARY_GATE_MD:-${OUT_DIR}/router_canary_ab_release_gate.md}"
# Set to 1 once the canary gate has run or been skipped, so the EXIT trap
# does not repeat the work (or the SKIP message).
ROUTER_CANARY_ARTIFACTS_REFRESHED=0

#######################################
# Report whether any scored router canary shard CSV exists.
# Globals:   ROUTER_CANARY_SHARD_DIR (read)
# Returns:   0 if at least one matching file exists, non-zero otherwise.
#######################################
router_canary_shards_present() {
  compgen -G "${ROUTER_CANARY_SHARD_DIR}/router_canary_ab_eval_shard_*_scored.csv" > /dev/null
}

#######################################
# Run the router canary A/B release gate script once per invocation of
# this file; later calls are no-ops.
# Globals:   ROUTER_CANARY_ARTIFACTS_REFRESHED (read, written)
#            ROUTER_CANARY_SHARD_DIR, ROUTER_CANARY_GATE_JSON,
#            ROUTER_CANARY_GATE_MD, BACKEND_DIR (read)
# Outputs:   progress / SKIP messages on stdout
# Returns:   0 when skipped, already done, or the gate script succeeded;
#            otherwise the gate script's (or the cd's) exit status.
#######################################
refresh_router_canary_artifacts() {
  if [[ "${ROUTER_CANARY_ARTIFACTS_REFRESHED}" == "1" ]]; then
    return 0
  fi

  echo "== Router canary A/B artifact-state refresh =="
  local refresh_status=0
  if router_canary_shards_present; then
    # The gate script is addressed relative to the backend directory. Run it
    # in a subshell that cds there first, so it also resolves when this
    # function fires from the EXIT trap before the main flow has changed
    # directory.
    (
      cd "${BACKEND_DIR}" &&
        python3 scripts/router_canary_ab_release_gate.py \
          --shard-dir "${ROUTER_CANARY_SHARD_DIR}" \
          --out-json "${ROUTER_CANARY_GATE_JSON}" \
          --out-md "${ROUTER_CANARY_GATE_MD}"
    ) || refresh_status=$?
  else
    echo "ROUTER_CANARY_GATE: SKIP (no scored shard CSVs found under ${ROUTER_CANARY_SHARD_DIR})"
  fi

  ROUTER_CANARY_ARTIFACTS_REFRESHED=1
  return "${refresh_status}"
}

# Best-effort refresh on every exit path. The `|| true` keeps a refresh
# failure inside the trap from replacing the exit status of whichever gate
# actually failed.
trap 'refresh_router_canary_artifacts || true' EXIT

mkdir -p "${OUT_DIR}"

# Optional local environment file; `set -a` exports everything it defines.
if [[ -f "${ROOT_DIR}/.env.codex" ]]; then
  set -a
  # shellcheck source=/dev/null
  source "${ROOT_DIR}/.env.codex"
  set +a
fi

# All scripts below are addressed relative to the backend directory.
cd "${BACKEND_DIR}"

echo "== Preflight env/config gate =="
python3 scripts/preflight_env_check.py --require-openai --out "${PREFLIGHT_JSON}"

echo "== Dependency policy gate =="
python3 scripts/check_dependency_policy.py --out "${DEP_POLICY_JSON}"

echo "== Auth/env gate =="
python3 - <<'PY'
import sys
from app.auth import load_auth_settings

s = load_auth_settings()
if s.required and not s.enabled:
    details = " ".join(list(s.config_details or [])) or "missing/invalid auth settings"
    print(f"AUTH_GATE: FAIL ({details})")
    sys.exit(2)
print("AUTH_GATE: PASS")
PY

echo "== Startup/compile gate =="
python3 -m py_compile \
  app/main.py \
  app/knowledgebase/core.py \
  app/router_rag/core.py \
  app/routers/router_core.py
echo "COMPILE_GATE: PASS"

echo "== Router RAG smoke gate =="
python3 scripts/router_rag_smoke.py --json > "${OUT_DIR}/router_rag_smoke_release_gate.json"
echo "SMOKE_GATE: PASS"

# Reject a missing key and the common placeholder values (case-insensitive).
# `tr` is used for upper-casing rather than ${KEY^^} so this also works on
# bash versions older than 4.
KEY="${OPENAI_API_KEY:-}"
KEY_UPPER="$(printf '%s' "${KEY}" | tr '[:lower:]' '[:upper:]')"
if [[ -z "${KEY}" || "${KEY_UPPER}" == YOUR_KEY* || "${KEY_UPPER}" == "REPLACE_ME" ]]; then
  echo "ERROR: OPENAI_API_KEY missing or placeholder. Release gate requires OpenAI-enabled eval."
  exit 2
fi

echo "== Unified KB eval150 gate =="
# Build the command as an array so the optional --limit can be appended
# without any string splicing.
eval_cmd=(
  python3 scripts/unified_kb_eval150.py
  --with-openai
  --semantic-grader
  --semantic-policy "${SEMANTIC_POLICY}"
  --semantic-timeout-s "${SEMANTIC_TIMEOUT_S}"
  --pass-threshold "${PASS_THRESHOLD}"
  --fact-threshold "${FACT_THRESHOLD}"
  --strict-exit
  --cases "${CASES_PATH}"
  --out "${OUT_JSON}"
  --md-out "${OUT_MD}"
  --trend-file "${TREND_FILE}"
  --print-fails
)
if [[ -n "${EVAL_LIMIT}" ]]; then
  eval_cmd+=(--limit "${EVAL_LIMIT}")
fi
"${eval_cmd[@]}"

echo "== Latency gate =="
python3 scripts/eval_latency_alert.py \
  --eval-json "${OUT_JSON}" \
  --p95-budget-ms "${P95_BUDGET_MS}" \
  --p99-budget-ms "${P99_BUDGET_MS}" \
  --out "${LATENCY_SUMMARY_JSON}"

echo "== Baseline regression gate =="
# The heredoc below is quoted, so the paths are handed over via the
# environment instead of shell interpolation.
export RELEASE_GATE_OUT_JSON="${OUT_JSON}"
export RELEASE_GATE_BASELINE_JSON="${BASELINE_JSON}"
python3 - <<'PY'
import json
import os
import sys
from pathlib import Path

out_path = Path(str(os.environ.get("RELEASE_GATE_OUT_JSON", "")).strip())
baseline_path = Path(str(os.environ.get("RELEASE_GATE_BASELINE_JSON", "")).strip())
if not out_path.exists():
    print(f"BASELINE_GATE: FAIL (missing eval output: {out_path})")
    sys.exit(5)
if not baseline_path.exists():
    print(f"BASELINE_GATE: FAIL (missing baseline file: {baseline_path})")
    sys.exit(5)

obj = json.loads(out_path.read_text(encoding="utf-8"))
base = json.loads(baseline_path.read_text(encoding="utf-8"))

failed = int(obj.get("failed", 0) or 0)
pass_rate = float(obj.get("pass_rate", 0.0) or 0.0)
avg_latency = float(obj.get("avg_latency_ms", 0.0) or 0.0)

errors = []
if pass_rate < float(base.get("min_pass_rate", 90.0)):
    errors.append(f"pass_rate {pass_rate:.2f}% < baseline min {float(base.get('min_pass_rate', 90.0)):.2f}%")
if failed > int(base.get("max_failed_cases", 15)):
    errors.append(f"failed_cases {failed} > baseline max {int(base.get('max_failed_cases', 15))}")
if avg_latency > float(base.get("max_avg_latency_ms", 10000.0)):
    errors.append(
        f"avg_latency_ms {avg_latency:.2f} > baseline max "
        f"{float(base.get('max_avg_latency_ms', 10000.0)):.2f}"
    )

if errors:
    print("BASELINE_GATE: FAIL")
    for e in errors:
        print(f"- {e}")
    sys.exit(6)
print("BASELINE_GATE: PASS")
PY

echo "== Router canary A/B release gate =="
if router_canary_shards_present; then
  # A non-zero return here aborts the release gate via set -e.
  refresh_router_canary_artifacts
else
  echo "ROUTER_CANARY_GATE: SKIP (no scored shard CSVs found under ${ROUTER_CANARY_SHARD_DIR})"
  # Already reported as skipped; stop the EXIT trap from printing it again.
  ROUTER_CANARY_ARTIFACTS_REFRESHED=1
fi

echo ""
echo "RELEASE_GATE: PASS"
echo "Eval JSON: ${OUT_JSON}"
echo "Eval MD: ${OUT_MD}"