#!/usr/bin/env bash
# Release gate driver: configuration section.
#
# Every setting below except ROOT_DIR, BACKEND_DIR and
# ROUTER_CANARY_ARTIFACTS_REFRESHED can be overridden through an environment
# variable of the same name; an unset OR empty override falls back to the
# default shown here (`:-` expansion).
set -euo pipefail

# Repository root: two directories above the directory containing this script.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
BACKEND_DIR="${ROOT_DIR}/backend"

# Eval inputs and report locations.
OUT_DIR="${OUT_DIR:-${ROOT_DIR}/docs/evals/release_gate}"
CASES_PATH="${CASES_PATH:-${ROOT_DIR}/docs/evals/unified_kb_eval150_cases.json}"
OUT_JSON="${OUT_JSON:-${OUT_DIR}/unified_kb_eval150_release_gate.json}"
OUT_MD="${OUT_MD:-${OUT_DIR}/unified_kb_eval150_release_gate.md}"
TREND_FILE="${TREND_FILE:-${ROOT_DIR}/docs/evals/unified_kb_eval150_trend.json}"
BASELINE_JSON="${BASELINE_JSON:-${ROOT_DIR}/docs/evals/release_gate_baseline.json}"

# Values forwarded verbatim to the eval and latency scripts as CLI flags.
PASS_THRESHOLD="${PASS_THRESHOLD:-75}"
FACT_THRESHOLD="${FACT_THRESHOLD:-50}"
P95_BUDGET_MS="${P95_BUDGET_MS:-10000}"
P99_BUDGET_MS="${P99_BUDGET_MS:-15000}"
SEMANTIC_TIMEOUT_S="${SEMANTIC_TIMEOUT_S:-8}"
SEMANTIC_POLICY="${SEMANTIC_POLICY:-hard_edge_or_fail}"
# Empty means "no --limit flag is passed to the eval".
EVAL_LIMIT="${EVAL_LIMIT:-}"

# Per-gate report files.
LATENCY_SUMMARY_JSON="${LATENCY_SUMMARY_JSON:-${OUT_DIR}/latency_summary_release_gate.json}"
PREFLIGHT_JSON="${PREFLIGHT_JSON:-${OUT_DIR}/preflight_release_gate.json}"
DEP_POLICY_JSON="${DEP_POLICY_JSON:-${OUT_DIR}/dependency_policy_release_gate.json}"

# Router canary A/B gate inputs and outputs.
ROUTER_CANARY_SHARD_DIR="${ROUTER_CANARY_SHARD_DIR:-${ROOT_DIR}/docs/testing/router_canary_ab_shards}"
ROUTER_CANARY_GATE_JSON="${ROUTER_CANARY_GATE_JSON:-${OUT_DIR}/router_canary_ab_release_gate.json}"
ROUTER_CANARY_GATE_MD="${ROUTER_CANARY_GATE_MD:-${OUT_DIR}/router_canary_ab_release_gate.md}"
# Once-per-run latch for the canary refresh: 0 = not attempted yet, 1 = done.
ROUTER_CANARY_ARTIFACTS_REFRESHED=0
#######################################
# Regenerate the router canary A/B gate artifacts, at most once per run.
#
# Safe to call both from the main flow and from an EXIT trap: the first call
# does the work, every later call returns 0 immediately.
#
# Globals:
#   ROUTER_CANARY_ARTIFACTS_REFRESHED (read/written) once-per-run latch
#   ROUTER_CANARY_SHARD_DIR           (read) directory searched for
#                                     router_canary_ab_eval_shard_*_scored.csv
#   ROUTER_CANARY_GATE_JSON           (read) passed as --out-json
#   ROUTER_CANARY_GATE_MD             (read) passed as --out-md
#   BACKEND_DIR                       (read, optional) directory the gate
#                                     script path is resolved against; falls
#                                     back to the current directory
# Outputs:
#   Banner plus either the gate script's own output or a SKIP line on stdout.
# Returns:
#   0 when skipped, already refreshed, or the gate script succeeded;
#   otherwise the gate script's (or the failed cd's) exit status.
#######################################
refresh_router_canary_artifacts() {
  if [[ "${ROUTER_CANARY_ARTIFACTS_REFRESHED}" == "1" ]]; then
    return 0
  fi
  # Latch before doing any work: if the shell is interrupted while the gate
  # script is running, the EXIT trap must not launch a second refresh.
  ROUTER_CANARY_ARTIFACTS_REFRESHED=1

  echo "== Router canary A/B artifact-state refresh =="
  local refresh_status=0
  if compgen -G "${ROUTER_CANARY_SHARD_DIR}/router_canary_ab_eval_shard_*_scored.csv" > /dev/null; then
    # The script path is relative, and this function can fire from the EXIT
    # trap before the main flow has changed into the backend directory, so
    # resolve it explicitly. The subshell keeps the caller's cwd untouched.
    (
      cd "${BACKEND_DIR:-.}" &&
        python3 scripts/router_canary_ab_release_gate.py \
          --shard-dir "${ROUTER_CANARY_SHARD_DIR}" \
          --out-json "${ROUTER_CANARY_GATE_JSON}" \
          --out-md "${ROUTER_CANARY_GATE_MD}"
    ) || refresh_status=$?
  else
    echo "ROUTER_CANARY_GATE: SKIP (no scored shard CSVs found under ${ROUTER_CANARY_SHARD_DIR})"
  fi
  return "${refresh_status}"
}
# Refresh the canary artifacts on every exit path, including gate failures.
# The function keeps its own once-per-run flag, so a run that already
# refreshed them in the main flow does not do it again here.
trap refresh_router_canary_artifacts EXIT

mkdir -p "${OUT_DIR}"

# Optional local env file. `set -a` marks every variable the file assigns for
# export so the Python gates launched below inherit them.
if [[ -f "${ROOT_DIR}/.env.codex" ]]; then
  set -a
  # shellcheck source=/dev/null
  source "${ROOT_DIR}/.env.codex"
  set +a
fi

# All gate scripts below are addressed relative to the backend directory.
cd "${BACKEND_DIR}"
# Gates 1-2: environment/config preflight and dependency policy.
# Script paths are relative, so the current directory must be the backend
# directory. With `set -e` active, a non-zero exit from either script aborts
# the run.
echo "== Preflight env/config gate =="
python3 scripts/preflight_env_check.py --require-openai --out "${PREFLIGHT_JSON}"

echo "== Dependency policy gate =="
python3 scripts/check_dependency_policy.py --out "${DEP_POLICY_JSON}"
# Gate 3: auth configuration. Loads the application's auth settings and fails
# with exit status 2 when auth is marked required but is not enabled.
# The heredoc delimiter is quoted, so the shell expands nothing inside it.
echo "== Auth/env gate =="
python3 - <<'PY'
import sys
from app.auth import load_auth_settings


s = load_auth_settings()
if s.required and not s.enabled:
    # Report whatever detail strings the settings object carries; fall back to
    # a generic reason when there are none.
    details = " ".join(list(s.config_details or [])) or "missing/invalid auth settings"
    print(f"AUTH_GATE: FAIL ({details})")
    sys.exit(2)
print("AUTH_GATE: PASS")
PY
# Gate 4: byte-compile the listed entry modules. The PASS line is only reached
# when py_compile exits 0, because `set -e` aborts the run otherwise.
echo "== Startup/compile gate =="
python3 -m py_compile \
  app/main.py \
  app/knowledgebase/core.py \
  app/router_rag/core.py \
  app/routers/router_core.py
echo "COMPILE_GATE: PASS"

# Gate 5: router RAG smoke run; its stdout (requested as JSON via --json) is
# stored next to the other release-gate reports.
echo "== Router RAG smoke gate =="
python3 scripts/router_rag_smoke.py --json > "${OUT_DIR}/router_rag_smoke_release_gate.json"
echo "SMOKE_GATE: PASS"
# Refuse to start the OpenAI-backed eval without a usable API key.
# The key is upper-cased once so the placeholder checks are case-insensitive:
#   - empty / unset
#   - anything starting with YOUR_KEY
#   - the literal <YOUR_OPENAI_API_KEY>
#   - the literal REPLACE_ME
KEY="${OPENAI_API_KEY:-}"
KEY_UPPER="$(printf '%s' "${KEY}" | tr '[:lower:]' '[:upper:]')"
if [[ -z "${KEY}" ||
  "${KEY_UPPER}" == YOUR_KEY* ||
  "${KEY_UPPER}" == "<YOUR_OPENAI_API_KEY>" ||
  "${KEY_UPPER}" == "REPLACE_ME" ]]; then
  echo "ERROR: OPENAI_API_KEY missing or placeholder. Release gate requires OpenAI-enabled eval."
  exit 2
fi
# Gate 6: unified KB eval. The command is assembled as an array so every
# argument stays a single word regardless of spaces in paths or values.
echo "== Unified KB eval150 gate =="
eval_cmd=(
  python3 scripts/unified_kb_eval150.py
  --with-openai
  --semantic-grader
  --semantic-policy "${SEMANTIC_POLICY}"
  --semantic-timeout-s "${SEMANTIC_TIMEOUT_S}"
  --pass-threshold "${PASS_THRESHOLD}"
  --fact-threshold "${FACT_THRESHOLD}"
  --strict-exit
  --cases "${CASES_PATH}"
  --out "${OUT_JSON}"
  --md-out "${OUT_MD}"
  --trend-file "${TREND_FILE}"
  --print-fails
)
# --limit is only passed when EVAL_LIMIT is non-empty.
if [[ -n "${EVAL_LIMIT}" ]]; then
  eval_cmd+=(--limit "${EVAL_LIMIT}")
fi
"${eval_cmd[@]}"
# Gate 7: latency budgets, evaluated by eval_latency_alert.py against the eval
# JSON produced by the previous gate. The summary is written to
# LATENCY_SUMMARY_JSON.
echo "== Latency gate =="
python3 scripts/eval_latency_alert.py \
  --eval-json "${OUT_JSON}" \
  --p95-budget-ms "${P95_BUDGET_MS}" \
  --p99-budget-ms "${P99_BUDGET_MS}" \
  --out "${LATENCY_SUMMARY_JSON}"
#######################################
# Compare the eval summary against the committed baseline thresholds.
#
# Globals (environment, read by the embedded Python):
#   RELEASE_GATE_OUT_JSON       path of the eval result JSON
#   RELEASE_GATE_BASELINE_JSON  path of the baseline thresholds JSON
# Outputs:
#   "BASELINE_GATE: PASS", or "BASELINE_GATE: FAIL" plus the reasons, on stdout.
# Returns:
#   0 on pass;
#   5 when an input file is missing, unreadable, not a JSON object, or carries
#     an unusable threshold value;
#   6 when at least one metric regresses past its threshold.
#######################################
run_baseline_regression_gate() {
  python3 - <<'PY'
import json
import os
import sys
from pathlib import Path


def load_json_object(env_name, label):
    """Return the JSON object stored at the path in env_name, or exit 5."""
    path = Path(str(os.environ.get(env_name, "")).strip())
    # is_file(), not exists(): an empty variable becomes Path("."), which
    # exists (it is the current directory) but cannot be read as a file.
    if not path.is_file():
        print(f"BASELINE_GATE: FAIL (missing {label}: {path})")
        sys.exit(5)
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both JSON and UTF-8 decoding failures.
        print(f"BASELINE_GATE: FAIL (unreadable {label}: {path}: {exc})")
        sys.exit(5)
    if not isinstance(data, dict):
        print(f"BASELINE_GATE: FAIL ({label} is not a JSON object: {path})")
        sys.exit(5)
    return data


obj = load_json_object("RELEASE_GATE_OUT_JSON", "eval output")
base = load_json_object("RELEASE_GATE_BASELINE_JSON", "baseline file")


def threshold(key, default, cast):
    """Read one baseline threshold; a missing or null entry uses the default.

    The check is `is None` rather than truthiness so that an explicit 0
    (for example max_failed_cases: 0) is honoured.
    """
    raw = base.get(key)
    if raw is None:
        return cast(default)
    try:
        return cast(raw)
    except (TypeError, ValueError):
        print(f"BASELINE_GATE: FAIL (invalid baseline value for {key}: {raw!r})")
        sys.exit(5)


# Missing or null metrics in the eval output are treated as 0.
failed = int(obj.get("failed", 0) or 0)
pass_rate = float(obj.get("pass_rate", 0.0) or 0.0)
avg_latency = float(obj.get("avg_latency_ms", 0.0) or 0.0)

min_pass_rate = threshold("min_pass_rate", 90.0, float)
max_failed_cases = threshold("max_failed_cases", 15, int)
max_avg_latency_ms = threshold("max_avg_latency_ms", 10000.0, float)

errors = []
if pass_rate < min_pass_rate:
    errors.append(f"pass_rate {pass_rate:.2f}% < baseline min {min_pass_rate:.2f}%")
if failed > max_failed_cases:
    errors.append(f"failed_cases {failed} > baseline max {max_failed_cases}")
if avg_latency > max_avg_latency_ms:
    errors.append(
        f"avg_latency_ms {avg_latency:.2f} > baseline max {max_avg_latency_ms:.2f}"
    )

if errors:
    print("BASELINE_GATE: FAIL")
    for e in errors:
        print(f"- {e}")
    sys.exit(6)
print("BASELINE_GATE: PASS")
PY
}

echo "== Baseline regression gate =="
# The heredoc is quoted (no shell expansion), so the paths are handed to
# Python through the environment instead.
export RELEASE_GATE_OUT_JSON="${OUT_JSON}"
export RELEASE_GATE_BASELINE_JSON="${BASELINE_JSON}"
run_baseline_regression_gate
# Gate 9: router canary A/B. refresh_router_canary_artifacts does its own
# shard discovery: it runs the gate script when scored shard CSVs exist and
# prints the ROUTER_CANARY_GATE: SKIP line when they do not. Calling it
# unconditionally also sets its once-per-run flag in the skip case, so the
# EXIT trap has nothing left to print after the final summary below.
# With `set -e` active, a non-zero return aborts the run here.
echo "== Router canary A/B release gate =="
refresh_router_canary_artifacts

# Reached only when every gate above succeeded.
echo ""
echo "RELEASE_GATE: PASS"
echo "Eval JSON: ${OUT_JSON}"
echo "Eval MD: ${OUT_MD}"