depscreen / scripts /evaluation /run_verification_gates.py
halsabbah's picture
deploy: sync code from GitHub main
36b2bff verified
#!/usr/bin/env python3
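"""Run the automated verification gates and report pass/fail.

Of the 8 verification gates, this script executes the automated ones and
prints reminders for the ones that need a human. It exits with status 1 if
any automated gate fails.

Gates:
1. Retrieval quality (model comparison) — manual, reminder only
2. Threshold calibration — manual, reminder only
3. RAGAS evaluation (requires clinician QA pairs) — manual, reminder only
4. Patient isolation
5. Safety layers
6. Memory + latency (this script currently runs an import check under this number)
7. Clinical review (manual — checklist)
8. E2E pipeline

The full test suite is also run as an additional, unnumbered gate.

Usage:
python scripts/evaluation/run_verification_gates.py
"""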
import logging
import subprocess
import sys
from pathlib import Path
# Configure logging for this script: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
# Third ancestor directory of this file (the parent of the parent of the
# directory containing it); run_gate uses it as the working directory for
# every gate command.
BACKEND_DIR = Path(__file__).parent.parent.parent
def run_gate(name: str, command: list[str], timeout: int = 600) -> bool:
    """Run one verification gate as a subprocess and report whether it passed.

    The command runs with ``BACKEND_DIR`` as its working directory and its
    output is captured rather than streamed. A gate passes only when the
    process exits with status 0.

    Args:
        name: Label logged in the gate banner.
        command: Argument vector handed to ``subprocess.run`` (no shell).
        timeout: Seconds to wait before the gate is reported as timed out.

    Returns:
        True if the command exited with status 0. False on a non-zero exit,
        on timeout, or on any error raised while launching the process.
    """
    banner = "=" * 60
    logger.info(f"\n{banner}")
    logger.info(f"GATE: {name}")
    logger.info(f"{banner}")

    try:
        result = subprocess.run(  # noqa: S603
            command,
            capture_output=True,
            text=True,
            # Undecodable bytes in the child's output must not turn a finished
            # gate into an ERROR; substitute them instead of raising.
            errors="replace",
            cwd=str(BACKEND_DIR),
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        logger.info(" TIMEOUT")
        return False
    except Exception as e:
        # Deliberately broad: a gate that cannot even be launched is reported
        # as failed so the remaining gates still run.
        logger.info(f" ERROR: {e}")
        return False

    if result.returncode == 0:
        logger.info(" PASS")
        return True

    logger.info(" FAIL")
    # Test runners such as pytest write their failure report to stdout, so
    # both streams are shown. The tail is logged because the final exception
    # line / summary sits at the end of the output.
    for label, stream in (("Output", result.stdout), ("Error", result.stderr)):
        if stream:
            logger.info(f" {label}: {stream[-500:]}")
    return False
def main():
    """Run each automated gate in order, log a summary, and exit 1 on any failure.

    Gates are executed sequentially through ``run_gate``. After the summary,
    reminders for the gates that need human verification are logged. The
    function returns normally when every automated gate passed and calls
    ``sys.exit(1)`` otherwise.
    """

    def pytest_command(*pytest_args: str) -> list[str]:
        # Invoke pytest through the current interpreter.
        return [sys.executable, "-m", "pytest", *pytest_args]

    # Adjacent literals concatenate into the single "-c" program string.
    import_probe = (
        "from app.services.rag import RAGService; "
        "from app.services.rag_safety import filter_by_relevance; "
        "from app.services.chat_summary import extract_clinical_sentences; "
        "print('All imports OK')"
    )

    # (summary key, banner label, command) for every automated gate, in run order.
    gate_specs = (
        # Gate 4: Patient isolation tests
        (
            "4. Patient Isolation",
            "Patient Isolation",
            pytest_command("tests/test_patient_isolation.py", "-v", "--tb=short"),
        ),
        # Gate 5: Safety layer tests
        (
            "5. Safety Layers",
            "Safety Layers",
            pytest_command(
                "tests/test_rag_safety.py",
                "tests/test_rag_degradation.py",
                "-v",
                "--tb=short",
            ),
        ),
        # Gate 6: Import check (can we import everything?)
        (
            "6. Import Check",
            "Import Check",
            [sys.executable, "-c", import_probe],
        ),
        # Gate 8: E2E pipeline tests
        (
            "8. E2E Pipeline",
            "E2E Pipeline",
            pytest_command("tests/test_pipeline_rag_e2e.py", "-v", "--tb=short"),
        ),
        # Gate: Full test suite
        (
            "Full Test Suite",
            "Full Test Suite",
            pytest_command(
                "tests/",
                "--ignore=tests/test_rag_integration.py",
                "--ignore=tests/test_patient_isolation.py",
                "-q",
            ),
        ),
    )

    # Dict insertion order keeps the summary in the same order the gates ran.
    outcomes = {key: run_gate(label, cmd) for key, label, cmd in gate_specs}

    # Report
    rule = "=" * 60
    logger.info(f"\n{rule}")
    logger.info("VERIFICATION GATE SUMMARY")
    logger.info(f"{rule}")
    for key, ok in outcomes.items():
        verdict = "PASS" if ok else "FAIL"
        logger.info(f" {key}: {verdict}")

    manual_reminders = (
        "\nManual gates (require human verification):",
        " 1. Retrieval Quality: Run scripts/evaluation/run_model_comparison.py",
        " 2. Threshold Calibration: Run scripts/evaluation/run_threshold_calibration.py",
        " 3. RAGAS Evaluation: Run scripts/evaluation/run_ragas_evaluation.py (needs clinician QA)",
        " 7. Clinical Review: Clinician must review knowledge base content",
    )
    for reminder in manual_reminders:
        logger.info(reminder)

    if all(outcomes.values()):
        logger.info("\nAll automated gates PASSED")
        return

    logger.info("\nSome gates FAILED — fix before deployment")
    sys.exit(1)
# Run the gates only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()