File size: 3,918 Bytes
#!/usr/bin/env python3
"""Run all 8 verification gates and report pass/fail.
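Gates (this script automates only 4, 5, 6, and 8; gates 1, 2, 3, and 7 are printed as manual follow-ups):
1. Retrieval quality (model comparison)
2. Threshold calibration
3. RAGAS evaluation (requires clinician QA pairs)
4. Patient isolation
5. Safety layers
6. Memory + latency (this script currently runs only an import check for this gate)
7. Clinical review (manual — checklist)
8. E2E pipeline
The full test suite is also run as an extra automated gate, and the script exits with status 1 if any automated gate fails.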
Usage:
python scripts/evaluation/run_verification_gates.py
"""
import logging
import subprocess
import sys
from pathlib import Path
# Emit "<LEVEL>: <message>" lines at INFO and above so gate results read
# like a plain console report.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Three directory levels above this file; gate commands use it as their
# working directory so relative paths such as "tests/..." resolve from there.
BACKEND_DIR = Path(__file__).parent.parent.parent
def run_gate(name: str, command: list[str], timeout: int = 600) -> bool:
    """Run one gate command in a subprocess and report whether it passed.

    The command is executed without a shell, with ``BACKEND_DIR`` as its
    working directory and with stdout/stderr captured as text. A gate passes
    only when the process exits with return code 0.

    On failure the tail of both captured streams is logged. pytest prints its
    failure report on stdout, so logging stderr alone would leave the
    pytest-based gates without any diagnostic.

    Args:
        name: Label shown in the log banner for this gate.
        command: Argument list passed to ``subprocess.run``.
        timeout: Seconds to wait before the run is abandoned.

    Returns:
        True if the command exited with code 0; False on a non-zero exit,
        on timeout, or if launching/reading the command raised an exception.
    """
    # Tracebacks and pytest summaries put the decisive lines last, so keep
    # the tail of each stream rather than the head.
    tail_chars = 500

    logger.info(f"\n{'=' * 60}")
    logger.info(f"GATE: {name}")
    logger.info(f"{'=' * 60}")

    try:
        result = subprocess.run(  # noqa: S603
            command,
            capture_output=True,
            text=True,
            cwd=str(BACKEND_DIR),
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        logger.info(" TIMEOUT")
        return False
    except Exception as e:
        # Top-level boundary: a gate that cannot even be launched (missing
        # executable, undecodable output, ...) counts as failed, not a crash.
        logger.info(f" ERROR: {e}")
        return False

    if result.returncode == 0:
        logger.info(" PASS")
        return True

    logger.info(" FAIL")
    if result.stdout:
        logger.info(f" Output: {result.stdout[-tail_chars:]}")
    if result.stderr:
        logger.info(f" Error: {result.stderr[-tail_chars:]}")
    return False
def main():
    """Run each automated gate in order, log a summary, and exit 1 on failure.

    Gates are executed sequentially through ``run_gate`` and their results are
    stored under the label used in the summary. After the summary, the gates
    that need human verification are listed as reminders. If any automated
    gate failed, the process exits with status 1.
    """
    pytest_base = [sys.executable, "-m", "pytest"]
    verbose_flags = ["-v", "--tb=short"]

    # Each entry: (summary label, banner title, command). Order is the
    # execution order and the order of the summary lines.
    schedule = (
        (
            "4. Patient Isolation",
            "Patient Isolation",
            [*pytest_base, "tests/test_patient_isolation.py", *verbose_flags],
        ),
        (
            "5. Safety Layers",
            "Safety Layers",
            [
                *pytest_base,
                "tests/test_rag_safety.py",
                "tests/test_rag_degradation.py",
                *verbose_flags,
            ],
        ),
        (
            # Import check: can the listed service modules be imported?
            "6. Import Check",
            "Import Check",
            [
                sys.executable,
                "-c",
                "from app.services.rag import RAGService; from app.services.rag_safety import filter_by_relevance; from app.services.chat_summary import extract_clinical_sentences; print('All imports OK')",
            ],
        ),
        (
            "8. E2E Pipeline",
            "E2E Pipeline",
            [*pytest_base, "tests/test_pipeline_rag_e2e.py", *verbose_flags],
        ),
        (
            "Full Test Suite",
            "Full Test Suite",
            [
                *pytest_base,
                "tests/",
                "--ignore=tests/test_rag_integration.py",
                "--ignore=tests/test_patient_isolation.py",
                "-q",
            ],
        ),
    )
    outcomes = {label: run_gate(title, argv) for label, title, argv in schedule}

    # Summary report.
    rule = "=" * 60
    logger.info(f"\n{rule}")
    logger.info("VERIFICATION GATE SUMMARY")
    logger.info(f"{rule}")
    for label, ok in outcomes.items():
        verdict = "PASS" if ok else "FAIL"
        logger.info(f" {label}: {verdict}")

    manual_notes = (
        "\nManual gates (require human verification):",
        " 1. Retrieval Quality: Run scripts/evaluation/run_model_comparison.py",
        " 2. Threshold Calibration: Run scripts/evaluation/run_threshold_calibration.py",
        " 3. RAGAS Evaluation: Run scripts/evaluation/run_ragas_evaluation.py (needs clinician QA)",
        " 7. Clinical Review: Clinician must review knowledge base content",
    )
    for note in manual_notes:
        logger.info(note)

    if not all(outcomes.values()):
        logger.info("\nSome gates FAILED — fix before deployment")
        sys.exit(1)
    logger.info("\nAll automated gates PASSED")
# Script entry point: run the gates only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|