File size: 3,918 Bytes
36b2bff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python3
"""Run all 8 verification gates and report pass/fail.

Gates:
1. Retrieval quality (model comparison)
2. Threshold calibration
3. RAGAS evaluation (requires clinician QA pairs)
4. Patient isolation
5. Safety layers
6. Memory + latency
7. Clinical review (manual — checklist)
8. E2E pipeline

Usage:
    python scripts/evaluation/run_verification_gates.py
"""

import logging
import subprocess
import sys
from pathlib import Path

# Configure root logging at import time so every gate message is printed as
# "<LEVEL>: <message>".
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# Three directories above this file; used as the working directory for every
# gate command, so relative paths such as "tests/..." resolve against it.
BACKEND_DIR = Path(__file__).parent.parent.parent


def run_gate(name: str, command: list[str], timeout: int = 600) -> bool:
    """Run one gate command in a subprocess and report whether it passed.

    The command runs with ``BACKEND_DIR`` as its working directory and with
    stdout/stderr captured. A gate passes only when the process exits with
    status 0. On failure the tail of both captured streams is logged so the
    cause is visible in the report.

    Args:
        name: Human-readable gate name shown in the log banner.
        command: Argument vector passed to ``subprocess.run`` (no shell).
        timeout: Maximum run time in seconds before the gate is failed.

    Returns:
        True if the command exited with status 0; False on a non-zero exit,
        a timeout, or any error raised while running the command.
    """
    # Number of trailing characters of each captured stream to show on failure.
    tail_chars = 2000
    banner = "=" * 60

    logger.info("\n%s", banner)
    logger.info("GATE: %s", name)
    logger.info("%s", banner)

    try:
        result = subprocess.run(  # noqa: S603
            command,
            capture_output=True,
            text=True,
            # Never let undecodable bytes in the child's output turn a real
            # gate result into an ERROR.
            errors="replace",
            cwd=str(BACKEND_DIR),
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        logger.info("  TIMEOUT")
        return False
    except Exception as e:
        # Best-effort boundary: a gate that cannot even be launched is
        # reported as failed rather than aborting the remaining gates.
        logger.info("  ERROR: %s", e)
        return False

    if result.returncode == 0:
        logger.info("  PASS")
        return True

    logger.info("  FAIL")
    # pytest writes its failure report to stdout, not stderr, so both streams
    # are shown. The tail is kept because the summary / final traceback lines
    # are at the end of the output.
    if result.stdout:
        logger.info("  Output: %s", result.stdout[-tail_chars:])
    if result.stderr:
        logger.info("  Error: %s", result.stderr[-tail_chars:])
    return False


def main():
    """Run every automated gate in order, log a summary, and exit 1 on any failure."""
    pytest_cmd = [sys.executable, "-m", "pytest"]
    verbose = ["-v", "--tb=short"]

    # (summary label, banner name, command), listed in execution order.
    plan = [
        # Gate 4: Patient isolation tests
        (
            "4. Patient Isolation",
            "Patient Isolation",
            [*pytest_cmd, "tests/test_patient_isolation.py", *verbose],
        ),
        # Gate 5: Safety layer tests
        (
            "5. Safety Layers",
            "Safety Layers",
            [*pytest_cmd, "tests/test_rag_safety.py", "tests/test_rag_degradation.py", *verbose],
        ),
        # Gate 6: Import check (can we import everything?)
        (
            "6. Import Check",
            "Import Check",
            [
                sys.executable,
                "-c",
                "from app.services.rag import RAGService; from app.services.rag_safety import filter_by_relevance; from app.services.chat_summary import extract_clinical_sentences; print('All imports OK')",
            ],
        ),
        # Gate 8: E2E pipeline tests
        (
            "8. E2E Pipeline",
            "E2E Pipeline",
            [*pytest_cmd, "tests/test_pipeline_rag_e2e.py", *verbose],
        ),
        # Full test suite, skipping the two files excluded via --ignore
        (
            "Full Test Suite",
            "Full Test Suite",
            [
                *pytest_cmd,
                "tests/",
                "--ignore=tests/test_rag_integration.py",
                "--ignore=tests/test_patient_isolation.py",
                "-q",
            ],
        ),
    ]

    gates = {}
    for label, title, argv in plan:
        gates[label] = run_gate(title, argv)

    # Report
    rule = "=" * 60
    logger.info(f"\n{rule}")
    logger.info("VERIFICATION GATE SUMMARY")
    logger.info(f"{rule}")

    for label, ok in gates.items():
        verdict = "PASS" if ok else "FAIL"
        logger.info(f"  {label}: {verdict}")

    # Gates that this script cannot run itself; printed as reminders only.
    manual_notes = (
        "\nManual gates (require human verification):",
        "  1. Retrieval Quality: Run scripts/evaluation/run_model_comparison.py",
        "  2. Threshold Calibration: Run scripts/evaluation/run_threshold_calibration.py",
        "  3. RAGAS Evaluation: Run scripts/evaluation/run_ragas_evaluation.py (needs clinician QA)",
        "  7. Clinical Review: Clinician must review knowledge base content",
    )
    for note in manual_notes:
        logger.info(note)

    if not all(gates.values()):
        logger.info("\nSome gates FAILED — fix before deployment")
        sys.exit(1)

    logger.info("\nAll automated gates PASSED")


# Run the gates only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()