Spaces:

halsabbah
/

depscreen

Runtime error

File size: 3,918 Bytes

36b2bff

#!/usr/bin/env python3
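"""Run the automated verification gates and report pass/fail.

There are 8 verification gates. This script executes the automated ones and
prints reminders for the ones that need a human to run or review them.

Gates:
1. Retrieval quality (model comparison) — manual, separate script
2. Threshold calibration — manual, separate script
3. RAGAS evaluation (requires clinician QA pairs) — manual, separate script
4. Patient isolation — automated (pytest)
5. Safety layers — automated (pytest)
6. Memory + latency — this script currently runs only an import check under this number
7. Clinical review (manual — checklist)
8. E2E pipeline — automated (pytest)

The full test suite is also run as an additional, unnumbered gate.
The process exits with status 1 if any automated gate fails.

Usage:
    python scripts/evaluation/run_verification_gates.py
"""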

import logging
import subprocess
import sys
from pathlib import Path

# Gate results are reported through logging as bare "LEVEL: message" lines.
logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

# Three directory levels above this file; every gate command is executed with
# this directory as its working directory (see run_gate).
BACKEND_DIR = Path(__file__).parent.parent.parent


def run_gate(name: str, command: list[str], timeout: int = 600) -> bool:
    """Run one gate command and report whether it passed.

    The command is executed with ``BACKEND_DIR`` as its working directory and
    with stdout and stderr captured as text. A gate passes only when the
    process exits with status 0; a non-zero exit, a timeout, or a failure to
    launch the process all count as a failed gate.

    On failure the exit code and the tail of both captured streams are logged.
    Both streams are reported because pytest writes its failure report to
    stdout, and the tail is used because the end of a traceback or test
    summary is the part that explains the failure.

    Args:
        name: Human-readable gate name shown in the log banner.
        command: Argument vector handed to ``subprocess.run`` (no shell).
        timeout: Maximum run time in seconds before the gate is failed.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    # Upper bound on how much of each captured stream is echoed on failure.
    max_chars = 2000

    logger.info(f"\n{'=' * 60}")
    logger.info(f"GATE: {name}")
    logger.info(f"{'=' * 60}")

    # Keep the try body to the single call that can raise.
    try:
        result = subprocess.run(command, capture_output=True, text=True, cwd=str(BACKEND_DIR), timeout=timeout)  # noqa: S603
    except subprocess.TimeoutExpired:
        logger.info("  TIMEOUT")
        return False
    except Exception as e:
        # Deliberately broad: any launch problem (e.g. a missing executable or
        # working directory) must fail this gate without aborting the others.
        logger.info(f"  ERROR: {e}")
        return False

    if result.returncode == 0:
        logger.info("  PASS")
        return True

    logger.info("  FAIL")
    logger.info(f"  Exit code: {result.returncode}")
    for label, captured in (("Output", result.stdout), ("Error", result.stderr)):
        text = (captured or "").strip()
        if text:
            logger.info(f"  {label}: {text[-max_chars:]}")
    return False


def main():
    """Run every automated gate in order, log a summary, and exit 1 on any failure.

    Each gate is a subprocess started with the current interpreter via
    ``run_gate``. After the automated gates finish, a pass/fail line is logged
    per gate, followed by reminders for the gates that need a human.
    """
    python = sys.executable

    # (summary label, banner title, command) — executed in the listed order.
    plan = [
        # Patient isolation tests.
        (
            "4. Patient Isolation",
            "Patient Isolation",
            [python, "-m", "pytest", "tests/test_patient_isolation.py", "-v", "--tb=short"],
        ),
        # Safety layer tests.
        (
            "5. Safety Layers",
            "Safety Layers",
            [python, "-m", "pytest", "tests/test_rag_safety.py", "tests/test_rag_degradation.py", "-v", "--tb=short"],
        ),
        # Smoke check that the service modules can be imported.
        (
            "6. Import Check",
            "Import Check",
            [
                python,
                "-c",
                "from app.services.rag import RAGService; from app.services.rag_safety import filter_by_relevance; from app.services.chat_summary import extract_clinical_sentences; print('All imports OK')",
            ],
        ),
        # End-to-end pipeline tests.
        (
            "8. E2E Pipeline",
            "E2E Pipeline",
            [python, "-m", "pytest", "tests/test_pipeline_rag_e2e.py", "-v", "--tb=short"],
        ),
        # Everything under tests/ except the two files excluded below.
        (
            "Full Test Suite",
            "Full Test Suite",
            [
                python,
                "-m",
                "pytest",
                "tests/",
                "--ignore=tests/test_rag_integration.py",
                "--ignore=tests/test_patient_isolation.py",
                "-q",
            ],
        ),
    ]

    gates = {}
    for label, title, argv in plan:
        gates[label] = run_gate(title, argv)

    # Summary of the automated gates.
    logger.info(f"\n{'=' * 60}")
    logger.info("VERIFICATION GATE SUMMARY")
    logger.info(f"{'=' * 60}")

    for gate_name, passed in gates.items():
        status = "PASS" if passed else "FAIL"
        logger.info(f"  {gate_name}: {status}")
    all_pass = all(gates.values())

    # Reminders for the gates this script does not execute.
    manual_notes = (
        "\nManual gates (require human verification):",
        "  1. Retrieval Quality: Run scripts/evaluation/run_model_comparison.py",
        "  2. Threshold Calibration: Run scripts/evaluation/run_threshold_calibration.py",
        "  3. RAGAS Evaluation: Run scripts/evaluation/run_ragas_evaluation.py (needs clinician QA)",
        "  7. Clinical Review: Clinician must review knowledge base content",
    )
    for note in manual_notes:
        logger.info(note)

    if not all_pass:
        logger.info("\nSome gates FAILED — fix before deployment")
        sys.exit(1)
    logger.info("\nAll automated gates PASSED")


# Run the gates only when executed as a script, not when imported;
# main() exits with status 1 if any automated gate failed.
if __name__ == "__main__":
    main()