bshepp committed on
Commit
393ff7f
·
1 Parent(s): c28dd56

Add validation framework for MedQA, MTSamples, and PMC Case Reports

Browse files

Three-dataset validation suite that evaluates the CDS pipeline against
external clinical benchmarks:

- MedQA: 1273 USMLE-style questions, measures diagnostic accuracy
(top-1, top-3, mentioned-anywhere)
- MTSamples: ~5000 medical transcriptions, measures parse robustness
and field extraction completeness across specialties
- PMC Case Reports: Published case reports from PubMed, measures
real-world diagnostic accuracy against gold-standard diagnoses

Architecture:
- validation/base.py: Core framework (runner, fuzzy matching, scoring)
- validation/harness_medqa.py: MedQA fetcher + harness
- validation/harness_mtsamples.py: MTSamples fetcher + harness
- validation/harness_pmc.py: PMC Case Reports fetcher + harness
- validation/run_validation.py: Unified CLI runner

Uses direct Orchestrator calls (no server needed). Tested end-to-end:
3 MedQA cases, 66.7% top-1 accuracy, 100% parse success rate.

.gitignore CHANGED
@@ -39,6 +39,10 @@ out/
39
  # Test outputs
40
  results.json
41
 
 
 
 
 
42
  # Models (too large for git)
43
  models/*.bin
44
  models/*.pt
 
39
  # Test outputs
40
  results.json
41
 
42
+ # Validation datasets (downloaded on demand) and results
43
+ src/backend/validation/data/
44
+ src/backend/validation/results/
45
+
46
  # Models (too large for git)
47
  models/*.bin
48
  models/*.pt
src/backend/validation/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validation framework for the Clinical Decision Support Agent.
3
+
4
+ Validates the CDS pipeline against three external clinical datasets:
5
+ - MedQA (USMLE-style questions) β€” diagnostic accuracy
6
+ - MTSamples (medical transcriptions) β€” parse robustness
7
+ - PMC Case Reports (published cases) β€” real-world diagnostic accuracy
8
+ """
src/backend/validation/base.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base classes and utilities for the validation framework.
3
+
4
+ Provides:
5
+ - ValidationCase: a single test case with input + ground truth
6
+ - ValidationResult: scored result for a single case
7
+ - ValidationSummary: aggregate metrics for a dataset
8
+ - run_cds_pipeline(): runs a case through the orchestrator directly
9
+ - fuzzy_match(): soft string matching for diagnosis comparison
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import json
15
+ import re
16
+ import time
17
+ from dataclasses import dataclass, field
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from typing import Any, Dict, List, Optional
21
+
22
+ # ── CDS pipeline imports ──
23
+ import sys
24
+
25
+ # Ensure the backend app is importable
26
+ BACKEND_DIR = Path(__file__).resolve().parent.parent
27
+ if str(BACKEND_DIR) not in sys.path:
28
+ sys.path.insert(0, str(BACKEND_DIR))
29
+
30
+ from app.agent.orchestrator import Orchestrator
31
+ from app.models.schemas import CaseSubmission, CDSReport, AgentState
32
+
33
+
34
+ # ──────────────────────────────────────────────
35
+ # Data classes
36
+ # ──────────────────────────────────────────────
37
+
38
+ @dataclass
39
+ class ValidationCase:
40
+ """A single validation test case."""
41
+ case_id: str
42
+ source_dataset: str # "medqa", "mtsamples", "pmc"
43
+ input_text: str # Clinical text fed to the pipeline
44
+ ground_truth: Dict[str, Any] # Dataset-specific ground truth
45
+ metadata: Dict[str, Any] = field(default_factory=dict)
46
+
47
+
48
+ @dataclass
49
+ class ValidationResult:
50
+ """Result of running one case through the pipeline + scoring."""
51
+ case_id: str
52
+ source_dataset: str
53
+ success: bool # Pipeline completed without crash
54
+ scores: Dict[str, float] # Metric name β†’ score (0.0–1.0)
55
+ pipeline_time_ms: int = 0
56
+ step_results: Dict[str, str] = field(default_factory=dict) # step_id β†’ status
57
+ report_summary: Optional[str] = None
58
+ error: Optional[str] = None
59
+ details: Dict[str, Any] = field(default_factory=dict) # Extra scoring info
60
+
61
+
62
+ @dataclass
63
+ class ValidationSummary:
64
+ """Aggregate metrics for a dataset validation run."""
65
+ dataset: str
66
+ total_cases: int
67
+ successful_cases: int
68
+ failed_cases: int
69
+ metrics: Dict[str, float] # Metric name β†’ average score
70
+ per_case: List[ValidationResult]
71
+ run_duration_sec: float
72
+ timestamp: str = ""
73
+
74
+ def __post_init__(self):
75
+ if not self.timestamp:
76
+ self.timestamp = datetime.now(timezone.utc).isoformat()
77
+
78
+
79
+ # ──────────────────────────────────────────────
80
+ # Pipeline runner
81
+ # ──────────────────────────────────────────────
82
+
83
async def run_cds_pipeline(
    patient_text: str,
    include_drug_check: bool = True,
    include_guidelines: bool = True,
    timeout_sec: int = 180,
) -> tuple[Optional[AgentState], Optional[CDSReport], Optional[str]]:
    """
    Run a single case through the CDS pipeline directly (no HTTP server needed).

    Args:
        patient_text: Raw clinical text submitted as the case.
        include_drug_check: Whether the orchestrator runs the drug interaction step.
        include_guidelines: Whether the orchestrator retrieves guidelines.
        timeout_sec: Hard wall-clock limit for the whole pipeline run.

    Returns:
        (state, report, error) -- error is None on success.
    """
    case = CaseSubmission(
        patient_text=patient_text,
        include_drug_check=include_drug_check,
        include_guidelines=include_guidelines,
    )
    orchestrator = Orchestrator()

    async def _consume() -> None:
        # Drain the step-update stream; we only care about the final state.
        async for _step_update in orchestrator.run(case):
            pass

    try:
        # BUG FIX: timeout_sec was accepted but never applied, so the
        # asyncio.TimeoutError branch below was unreachable. wait_for makes
        # the timeout real and cancels the pipeline when it fires.
        await asyncio.wait_for(_consume(), timeout=timeout_sec)
        return orchestrator.state, orchestrator.get_result(), None
    except asyncio.TimeoutError:
        return orchestrator.state, None, f"Pipeline timed out after {timeout_sec}s"
    except Exception as e:
        return orchestrator.state, None, str(e)
111
+
112
+
113
+ # ──────────────────────────────────────────────
114
+ # Fuzzy string matching for diagnosis comparison
115
+ # ──────────────────────────────────────────────
116
+
117
def normalize_text(text: str) -> str:
    """Lowercase *text*, replace punctuation with spaces, and collapse whitespace.

    BUG FIX: the original stripped *before* removing punctuation, so input
    like "abc!" normalized to "abc " (trailing space), which broke the exact
    substring comparisons done by fuzzy_match(). Stripping last fixes that.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', ' ', text)  # punctuation -> space
    text = re.sub(r'\s+', ' ', text)      # collapse runs of whitespace
    return text.strip()
123
+
124
+
125
def fuzzy_match(candidate: str, target: str, threshold: float = 0.6) -> bool:
    """
    Decide whether *candidate* is a soft match for *target*.

    Matching is token-overlap based (Jaccard-like over the smaller token set)
    rather than edit distance -- medical terms are long and we care about
    semantic overlap, not typos.

    Args:
        candidate: Text from the pipeline output.
        target: Ground truth text.
        threshold: Minimum token overlap ratio (0.0-1.0).
    """
    norm_candidate = normalize_text(candidate)
    norm_target = normalize_text(target)

    target_tokens = set(norm_target.split())
    if not target_tokens:
        return False

    # Containment in either direction counts as a full match.
    if norm_target in norm_candidate or norm_candidate in norm_target:
        return True

    # Otherwise score by token overlap relative to the smaller set.
    candidate_tokens = set(norm_candidate.split())
    smaller = min(len(candidate_tokens), len(target_tokens))
    if smaller == 0:
        return False

    shared = len(candidate_tokens & target_tokens)
    return shared / smaller >= threshold
156
+
157
+
158
def diagnosis_in_differential(
    target_diagnosis: str,
    report: CDSReport,
    top_n: Optional[int] = None,
) -> tuple[bool, int]:
    """
    Look for *target_diagnosis* in the report's differential.

    Args:
        target_diagnosis: Ground-truth diagnosis string.
        report: Generated CDS report.
        top_n: When truthy, only the first N differential entries count as
            ranked hits.

    Returns:
        (found, rank) -- rank is the 0-indexed position in the differential,
        len(differential) when only found in the free-text body, or -1 when
        absent entirely.
    """
    candidates = report.differential_diagnosis
    if top_n:
        candidates = candidates[:top_n]

    for rank, entry in enumerate(candidates):
        if fuzzy_match(entry.diagnosis, target_diagnosis):
            return True, rank

    # Fall back to scanning the narrative sections of the report
    # (patient_summary, guideline_recommendations, next steps).
    narrative = " ".join([
        report.patient_summary or "",
        " ".join(report.guideline_recommendations),
        " ".join(step.action for step in report.suggested_next_steps),
    ])
    if fuzzy_match(narrative, target_diagnosis, threshold=0.3):
        # Mentioned somewhere, but not as a ranked differential entry.
        return True, len(candidates)

    return False, -1
187
+
188
+
189
+ # ──────────────────────────────────────────────
190
+ # I/O utilities
191
+ # ──────────────────────────────────────────────
192
+
193
+ DATA_DIR = Path(__file__).resolve().parent / "data"
194
+
195
+
196
+ def ensure_data_dir():
197
+ """Create the data directory if it doesn't exist."""
198
+ DATA_DIR.mkdir(parents=True, exist_ok=True)
199
+
200
+
201
def save_results(summary: ValidationSummary, filename: Optional[str] = None) -> Path:
    """
    Serialize a ValidationSummary to JSON under validation/results/.

    Args:
        summary: Aggregate run results to persist.
        filename: Target file name; defaults to "<dataset>_<UTC timestamp>.json".

    Returns:
        Path of the written JSON file.
    """
    results_dir = Path(__file__).resolve().parent / "results"
    results_dir.mkdir(parents=True, exist_ok=True)

    if filename is None:
        ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        filename = f"{summary.dataset}_{ts}.json"

    path = results_dir / filename

    # Flatten to plain dicts; per_case holds ValidationResult dataclasses.
    data = {
        "dataset": summary.dataset,
        "total_cases": summary.total_cases,
        "successful_cases": summary.successful_cases,
        "failed_cases": summary.failed_cases,
        "metrics": summary.metrics,
        "run_duration_sec": summary.run_duration_sec,
        "timestamp": summary.timestamp,
        "per_case": [
            {
                "case_id": r.case_id,
                "success": r.success,
                "scores": r.scores,
                "pipeline_time_ms": r.pipeline_time_ms,
                "step_results": r.step_results,
                "report_summary": r.report_summary,
                "error": r.error,
                "details": r.details,
            }
            for r in summary.per_case
        ],
    }

    # default=str: last-resort stringification for anything non-JSON-native
    # that ends up in `details` (e.g. Paths or datetimes).
    path.write_text(json.dumps(data, indent=2, default=str))
    return path
238
+
239
+
240
def print_summary(summary: ValidationSummary):
    """Pretty-print a ValidationSummary to the console."""
    bar = '=' * 60
    print(f"\n{bar}")
    print(f" Validation Results: {summary.dataset.upper()}")
    print(bar)
    print(f" Total cases: {summary.total_cases}")
    print(f" Successful: {summary.successful_cases}")
    print(f" Failed: {summary.failed_cases}")
    print(f" Duration: {summary.run_duration_sec:.1f}s")
    print("\n Metrics:")
    for name, value in sorted(summary.metrics.items()):
        # Timing metrics render as whole milliseconds; other floats as percentages.
        if "time" in name and isinstance(value, (int, float)):
            rendered = f"{value:.0f}ms"
        elif isinstance(value, float):
            rendered = f"{value:.1%}"
        else:
            rendered = str(value)
        print(f" {name:30s} {rendered}")
    print(f"{bar}\n")
src/backend/validation/harness_medqa.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MedQA dataset fetcher and validation harness.
3
+
4
+ Downloads MedQA USMLE 4-option questions and evaluates the CDS pipeline's
5
+ ability to arrive at the correct diagnosis / answer.
6
+
7
+ Source: https://github.com/jind11/MedQA
8
+ Format: JSONL with {question, options: {A, B, C, D}, answer_idx, answer}
9
+
10
+ Metrics:
11
+ - top1_accuracy: Correct answer matches #1 differential diagnosis
12
+ - top3_accuracy: Correct answer in top 3 differential diagnoses
13
+ - mentioned_accuracy: Correct answer mentioned anywhere in report
14
+ - parse_success_rate: Pipeline completed without crashing
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import json
20
+ import random
21
+ import time
22
+ from pathlib import Path
23
+ from typing import List, Optional
24
+
25
+ import httpx
26
+
27
+ from validation.base import (
28
+ DATA_DIR,
29
+ ValidationCase,
30
+ ValidationResult,
31
+ ValidationSummary,
32
+ diagnosis_in_differential,
33
+ ensure_data_dir,
34
+ fuzzy_match,
35
+ normalize_text,
36
+ print_summary,
37
+ run_cds_pipeline,
38
+ save_results,
39
+ )
40
+
41
+
42
+ # ──────────────────────────────────────────────
43
+ # Data fetching
44
+ # ──────────────────────────────────────────────
45
+
46
+ # HuggingFace direct download (JSONL)
47
+ MEDQA_JSONL_URL = "https://huggingface.co/datasets/GBaker/MedQA-USMLE-4-options/resolve/main/phrases_no_exclude_test.jsonl"
48
+
49
+
50
+ async def fetch_medqa(max_cases: int = 50, seed: int = 42) -> List[ValidationCase]:
51
+ """
52
+ Download MedQA test set and convert to ValidationCase objects.
53
+
54
+ Args:
55
+ max_cases: Maximum number of cases to sample
56
+ seed: Random seed for reproducible sampling
57
+ """
58
+ ensure_data_dir()
59
+ cache_path = DATA_DIR / "medqa_test.jsonl"
60
+
61
+ # Try to load from cache
62
+ if cache_path.exists():
63
+ print(f" Loading MedQA from cache: {cache_path}")
64
+ raw_cases = _load_jsonl(cache_path)
65
+ else:
66
+ print(f" Downloading MedQA test set...")
67
+ raw_cases = await _download_medqa_jsonl(cache_path)
68
+
69
+ if not raw_cases:
70
+ raise RuntimeError("Failed to fetch MedQA data. Check network connection.")
71
+
72
+ # Sample
73
+ random.seed(seed)
74
+ if len(raw_cases) > max_cases:
75
+ raw_cases = random.sample(raw_cases, max_cases)
76
+
77
+ # Convert to ValidationCase
78
+ cases = []
79
+ for i, item in enumerate(raw_cases):
80
+ question = item.get("question", "")
81
+ options = item.get("options", item.get("answer_choices", {}))
82
+ answer_idx = item.get("answer_idx", item.get("answer", ""))
83
+ answer_text = item.get("answer", "")
84
+
85
+ # Handle different formats
86
+ if isinstance(options, dict):
87
+ if answer_idx in options:
88
+ answer_text = options[answer_idx]
89
+ elif isinstance(options, list):
90
+ # Some formats have options as a list
91
+ idx = ord(answer_idx) - ord('A') if isinstance(answer_idx, str) and len(answer_idx) == 1 else 0
92
+ if idx < len(options):
93
+ answer_text = options[idx]
94
+
95
+ # Build clinical vignette (question only, not the options)
96
+ # This simulates what a clinician would present
97
+ clinical_text = _extract_vignette(question)
98
+
99
+ cases.append(ValidationCase(
100
+ case_id=f"medqa_{i:04d}",
101
+ source_dataset="medqa",
102
+ input_text=clinical_text,
103
+ ground_truth={
104
+ "correct_answer": answer_text,
105
+ "answer_idx": answer_idx,
106
+ "options": options,
107
+ "full_question": question,
108
+ },
109
+ ))
110
+
111
+ print(f" Loaded {len(cases)} MedQA cases")
112
+ return cases
113
+
114
+
115
async def _download_medqa_jsonl(cache_path: Path) -> List[dict]:
    """Fetch the MedQA test JSONL from its HuggingFace mirror and cache it.

    Returns the parsed records, or an empty list when the download fails
    (the caller decides whether that is fatal).
    """
    async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
        try:
            response = await client.get(MEDQA_JSONL_URL)
            response.raise_for_status()

            records = [
                json.loads(line)
                for line in response.text.strip().split('\n')
                if line.strip()
            ]

            # Persist a normalized copy for subsequent runs.
            cache_path.write_text('\n'.join(json.dumps(rec) for rec in records))
            print(f" Cached {len(records)} MedQA cases to {cache_path}")
            return records

        except Exception as e:
            print(f" Warning: Failed to download MedQA: {e}")
            return []
133
+
134
+
135
+ def _load_jsonl(path: Path) -> List[dict]:
136
+ """Load JSONL file."""
137
+ cases = []
138
+ for line in path.read_text(encoding="utf-8").strip().split('\n'):
139
+ if line.strip():
140
+ cases.append(json.loads(line))
141
+ return cases
142
+
143
+
144
+ def _extract_vignette(question: str) -> str:
145
+ """
146
+ Extract the clinical vignette from a USMLE question.
147
+
148
+ USMLE questions typically end with "Which of the following..." or
149
+ "What is the most likely diagnosis?". We strip the question stem
150
+ to leave just the clinical narrative.
151
+ """
152
+ # Common question stems
153
+ stems = [
154
+ r"which of the following",
155
+ r"what is the most likely",
156
+ r"what is the best next step",
157
+ r"what is the most appropriate",
158
+ r"what is the diagnosis",
159
+ r"the most likely diagnosis is",
160
+ r"this patient most likely has",
161
+ r"what would be the next step",
162
+ ]
163
+
164
+ text = question.strip()
165
+ for stem in stems:
166
+ import re
167
+ # Find the last sentence that starts a question
168
+ pattern = re.compile(rf'\.?\s*[A-Z].*{stem}.*[\?\.]?\s*$', re.IGNORECASE)
169
+ match = pattern.search(text)
170
+ if match:
171
+ # Return everything before the question stem sentence
172
+ vignette = text[:match.start()].strip()
173
+ if len(vignette) > 50: # Sanity check
174
+ return vignette
175
+
176
+ # Fallback: return the full text
177
+ return text
178
+
179
+
180
+ # ──────────────────────────────────────────────
181
+ # Validation harness
182
+ # ──────────────────────────────────────────────
183
+
184
+ async def validate_medqa(
185
+ cases: List[ValidationCase],
186
+ include_drug_check: bool = False,
187
+ include_guidelines: bool = True,
188
+ delay_between_cases: float = 2.0,
189
+ ) -> ValidationSummary:
190
+ """
191
+ Run MedQA cases through the CDS pipeline and score results.
192
+
193
+ Args:
194
+ cases: List of MedQA ValidationCases
195
+ include_drug_check: Whether to run drug interaction check (slower)
196
+ include_guidelines: Whether to include guideline retrieval
197
+ delay_between_cases: Seconds to wait between cases (rate limiting)
198
+ """
199
+ results: List[ValidationResult] = []
200
+ start_time = time.time()
201
+
202
+ for i, case in enumerate(cases):
203
+ print(f"\n [{i+1}/{len(cases)}] {case.case_id}: ", end="", flush=True)
204
+
205
+ case_start = time.monotonic()
206
+
207
+ state, report, error = await run_cds_pipeline(
208
+ patient_text=case.input_text,
209
+ include_drug_check=include_drug_check,
210
+ include_guidelines=include_guidelines,
211
+ )
212
+
213
+ elapsed_ms = int((time.monotonic() - case_start) * 1000)
214
+
215
+ # Build step results
216
+ step_results = {}
217
+ if state:
218
+ step_results = {s.step_id: s.status.value for s in state.steps}
219
+
220
+ # Score
221
+ scores = {}
222
+ details = {}
223
+ correct_answer = case.ground_truth["correct_answer"]
224
+
225
+ if report:
226
+ # Top-1 accuracy
227
+ found_top1, rank = diagnosis_in_differential(correct_answer, report, top_n=1)
228
+ scores["top1_accuracy"] = 1.0 if found_top1 else 0.0
229
+
230
+ # Top-3 accuracy
231
+ found_top3, rank3 = diagnosis_in_differential(correct_answer, report, top_n=3)
232
+ scores["top3_accuracy"] = 1.0 if found_top3 else 0.0
233
+
234
+ # Mentioned anywhere
235
+ found_any, rank_any = diagnosis_in_differential(correct_answer, report)
236
+ scores["mentioned_accuracy"] = 1.0 if found_any else 0.0
237
+
238
+ # Parse success
239
+ scores["parse_success"] = 1.0
240
+
241
+ details = {
242
+ "correct_answer": correct_answer,
243
+ "top_diagnosis": report.differential_diagnosis[0].diagnosis if report.differential_diagnosis else "NONE",
244
+ "num_diagnoses": len(report.differential_diagnosis),
245
+ "found_at_rank": rank_any if found_any else -1,
246
+ }
247
+
248
+ status_icon = "βœ“" if found_top3 else "βœ—"
249
+ print(f"{status_icon} top1={'Y' if found_top1 else 'N'} top3={'Y' if found_top3 else 'N'} ({elapsed_ms}ms)")
250
+ else:
251
+ scores = {
252
+ "top1_accuracy": 0.0,
253
+ "top3_accuracy": 0.0,
254
+ "mentioned_accuracy": 0.0,
255
+ "parse_success": 0.0,
256
+ }
257
+ details = {"correct_answer": correct_answer, "error": error}
258
+ print(f"βœ— FAILED: {error[:80] if error else 'unknown'}")
259
+
260
+ results.append(ValidationResult(
261
+ case_id=case.case_id,
262
+ source_dataset="medqa",
263
+ success=report is not None,
264
+ scores=scores,
265
+ pipeline_time_ms=elapsed_ms,
266
+ step_results=step_results,
267
+ report_summary=report.patient_summary[:200] if report else None,
268
+ error=error,
269
+ details=details,
270
+ ))
271
+
272
+ # Rate limit
273
+ if i < len(cases) - 1:
274
+ await asyncio.sleep(delay_between_cases)
275
+
276
+ # Aggregate
277
+ total = len(results)
278
+ successful = sum(1 for r in results if r.success)
279
+
280
+ # Average each metric across successful cases only
281
+ metric_names = ["top1_accuracy", "top3_accuracy", "mentioned_accuracy", "parse_success"]
282
+ metrics = {}
283
+ for m in metric_names:
284
+ values = [r.scores.get(m, 0.0) for r in results]
285
+ metrics[m] = sum(values) / len(values) if values else 0.0
286
+
287
+ # Average pipeline time
288
+ times = [r.pipeline_time_ms for r in results if r.success]
289
+ metrics["avg_pipeline_time_ms"] = sum(times) / len(times) if times else 0
290
+
291
+ summary = ValidationSummary(
292
+ dataset="medqa",
293
+ total_cases=total,
294
+ successful_cases=successful,
295
+ failed_cases=total - successful,
296
+ metrics=metrics,
297
+ per_case=results,
298
+ run_duration_sec=time.time() - start_time,
299
+ )
300
+
301
+ return summary
302
+
303
+
304
+ # ──────────────────────────────────────────────
305
+ # Standalone runner
306
+ # ──────────────────────────────────────────────
307
+
308
async def main():
    """CLI entry point for running the MedQA harness standalone."""
    import argparse

    arg_parser = argparse.ArgumentParser(description="MedQA Validation")
    arg_parser.add_argument("--max-cases", type=int, default=10, help="Number of cases to evaluate")
    arg_parser.add_argument("--seed", type=int, default=42, help="Random seed")
    arg_parser.add_argument("--include-drugs", action="store_true", help="Include drug interaction check")
    arg_parser.add_argument("--delay", type=float, default=2.0, help="Delay between cases (seconds)")
    opts = arg_parser.parse_args()

    print("MedQA Validation Harness")
    print("=" * 40)

    # Fetch (or load cached) cases, run them, then report and persist.
    medqa_cases = await fetch_medqa(max_cases=opts.max_cases, seed=opts.seed)
    summary = await validate_medqa(
        medqa_cases,
        include_drug_check=opts.include_drugs,
        delay_between_cases=opts.delay,
    )

    print_summary(summary)
    out_path = save_results(summary)
    print(f"Results saved to: {out_path}")
332
+
333
+
334
+ if __name__ == "__main__":
335
+ asyncio.run(main())
src/backend/validation/harness_mtsamples.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MTSamples dataset fetcher and validation harness.
3
+
4
+ Downloads medical transcription samples and evaluates the CDS pipeline's
5
+ ability to parse diverse clinical note formats and reason across specialties.
6
+
7
+ Source: https://mtsamples.com (via GitHub mirrors)
8
+ Format: CSV with columns: description, medical_specialty, sample_name, transcription, keywords
9
+
10
+ Metrics:
11
+ - parse_success_rate: Pipeline completed without crashing
12
+ - field_completeness: How many structured fields were extracted
13
+ - specialty_alignment: System reasoning aligns with correct specialty
14
+ - has_differential: Report includes at least one diagnosis
15
+ - has_recommendations: Report includes next steps
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import asyncio
20
+ import csv
21
+ import io
22
+ import json
23
+ import random
24
+ import re
25
+ import time
26
+ from pathlib import Path
27
+ from typing import List, Optional
28
+
29
+ import httpx
30
+
31
+ from validation.base import (
32
+ DATA_DIR,
33
+ ValidationCase,
34
+ ValidationResult,
35
+ ValidationSummary,
36
+ ensure_data_dir,
37
+ fuzzy_match,
38
+ normalize_text,
39
+ print_summary,
40
+ run_cds_pipeline,
41
+ save_results,
42
+ )
43
+
44
+
45
+ # ──────────────────────────────────────────────
46
+ # Data fetching
47
+ # ──────────────────────────────────────────────
48
+
49
+ MTSAMPLES_URL = "https://raw.githubusercontent.com/socd06/medical-nlp/master/data/mtsamples.csv"
50
+ MTSAMPLES_FALLBACK_URL = "https://raw.githubusercontent.com/Abonia1/Clinical-NLP-on-MTSamples/master/mtsamples.csv"
51
+
52
+ # Specialties most relevant to CDS
53
+ RELEVANT_SPECIALTIES = {
54
+ "Cardiovascular / Pulmonary",
55
+ "Gastroenterology",
56
+ "General Medicine",
57
+ "Neurology",
58
+ "Orthopedic",
59
+ "Urology",
60
+ "Nephrology",
61
+ "Endocrinology",
62
+ "Hematology - Oncology",
63
+ "Obstetrics / Gynecology",
64
+ "Emergency Room Reports",
65
+ "Consult - History and Phy.",
66
+ "Discharge Summary",
67
+ "SOAP / Chart / Progress Notes",
68
+ "Internal Medicine",
69
+ }
70
+
71
+
72
+ async def fetch_mtsamples(
73
+ max_cases: int = 30,
74
+ seed: int = 42,
75
+ specialties: Optional[set] = None,
76
+ min_length: int = 200,
77
+ ) -> List[ValidationCase]:
78
+ """
79
+ Download MTSamples and convert to ValidationCase objects.
80
+
81
+ Args:
82
+ max_cases: Maximum number of cases to sample
83
+ seed: Random seed for reproducible sampling
84
+ specialties: Filter to these specialties (None = use RELEVANT_SPECIALTIES)
85
+ min_length: Minimum transcription length to include
86
+ """
87
+ ensure_data_dir()
88
+ cache_path = DATA_DIR / "mtsamples.csv"
89
+
90
+ if cache_path.exists():
91
+ print(f" Loading MTSamples from cache: {cache_path}")
92
+ raw_text = cache_path.read_text(encoding="utf-8")
93
+ else:
94
+ print(f" Downloading MTSamples...")
95
+ raw_text = await _download_mtsamples(cache_path)
96
+
97
+ if not raw_text:
98
+ raise RuntimeError("Failed to fetch MTSamples data.")
99
+
100
+ # Parse CSV
101
+ reader = csv.DictReader(io.StringIO(raw_text))
102
+ rows = list(reader)
103
+
104
+ # Filter
105
+ target_specialties = specialties or RELEVANT_SPECIALTIES
106
+ filtered = []
107
+ for row in rows:
108
+ specialty = row.get("medical_specialty", "").strip()
109
+ transcription = row.get("transcription", "").strip()
110
+ if not transcription or len(transcription) < min_length:
111
+ continue
112
+ if specialty in target_specialties:
113
+ filtered.append(row)
114
+
115
+ # Sample
116
+ random.seed(seed)
117
+ if len(filtered) > max_cases:
118
+ # Stratified sample: try to get cases from diverse specialties
119
+ by_specialty = {}
120
+ for row in filtered:
121
+ sp = row.get("medical_specialty", "Other")
122
+ by_specialty.setdefault(sp, []).append(row)
123
+
124
+ sampled = []
125
+ per_specialty = max(1, max_cases // len(by_specialty))
126
+ for sp, sp_rows in by_specialty.items():
127
+ sampled.extend(random.sample(sp_rows, min(per_specialty, len(sp_rows))))
128
+
129
+ # Fill remaining slots randomly
130
+ remaining = [r for r in filtered if r not in sampled]
131
+ if len(sampled) < max_cases and remaining:
132
+ sampled.extend(random.sample(remaining, min(max_cases - len(sampled), len(remaining))))
133
+
134
+ filtered = sampled[:max_cases]
135
+
136
+ # Convert to ValidationCase
137
+ cases = []
138
+ for i, row in enumerate(filtered):
139
+ transcription = row.get("transcription", "").strip()
140
+ specialty = row.get("medical_specialty", "Unknown").strip()
141
+ description = row.get("description", "").strip()
142
+ keywords = row.get("keywords", "").strip()
143
+
144
+ cases.append(ValidationCase(
145
+ case_id=f"mts_{i:04d}",
146
+ source_dataset="mtsamples",
147
+ input_text=transcription,
148
+ ground_truth={
149
+ "specialty": specialty,
150
+ "description": description,
151
+ "keywords": keywords,
152
+ },
153
+ metadata={
154
+ "sample_name": row.get("sample_name", ""),
155
+ "text_length": len(transcription),
156
+ },
157
+ ))
158
+
159
+ print(f" Loaded {len(cases)} MTSamples cases across {len(set(c.ground_truth['specialty'] for c in cases))} specialties")
160
+ return cases
161
+
162
+
163
async def _download_mtsamples(cache_path: Path) -> str:
    """Download the MTSamples CSV, trying each mirror in turn.

    Caches the raw CSV text at *cache_path*. Returns "" when every mirror
    fails (the caller treats that as fatal).
    """
    async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
        for url in (MTSAMPLES_URL, MTSAMPLES_FALLBACK_URL):
            try:
                response = await client.get(url)
                response.raise_for_status()
            except Exception as e:
                print(f" Warning: Failed to download from {url}: {e}")
                continue
            cache_path.write_text(response.text, encoding="utf-8")
            print(f" Cached MTSamples ({len(response.text)} bytes) to {cache_path}")
            return response.text
    return ""
177
+
178
+
179
+ # ──────────────────────────────────────────────
180
+ # Scoring helpers
181
+ # ──────────────────────────────────────────────
182
+
183
+ SPECIALTY_KEYWORDS = {
184
+ "Cardiovascular / Pulmonary": ["cardiac", "heart", "coronary", "pulmonary", "lung", "chest", "hypertension", "arrhythmia"],
185
+ "Gastroenterology": ["gastro", "liver", "hepat", "colon", "bowel", "gi ", "abdominal", "pancrea"],
186
+ "General Medicine": ["general", "medicine", "primary", "routine"],
187
+ "Neurology": ["neuro", "brain", "seizure", "stroke", "headache", "neuropathy", "ms "],
188
+ "Orthopedic": ["ortho", "fracture", "bone", "joint", "knee", "hip", "shoulder", "spine"],
189
+ "Urology": ["urol", "kidney", "bladder", "prostate", "renal", "urinary"],
190
+ "Nephrology": ["renal", "kidney", "dialysis", "nephr", "creatinine"],
191
+ "Endocrinology": ["diabet", "thyroid", "endocrin", "insulin", "glucose", "adrenal"],
192
+ "Hematology - Oncology": ["cancer", "tumor", "leukemia", "lymphoma", "anemia", "oncol"],
193
+ "Obstetrics / Gynecology": ["pregnan", "obstet", "gynecol", "uterus", "ovarian", "menstrual"],
194
+ "Emergency Room Reports": ["emergency", "trauma", "acute", "er ", "ed "],
195
+ "Internal Medicine": ["internal", "medicine"],
196
+ }
197
+
198
+
199
def check_specialty_alignment(report_text: str, target_specialty: str) -> bool:
    """Return True when the report text contains at least one keyword
    associated with *target_specialty*."""
    expected = SPECIALTY_KEYWORDS.get(target_specialty, [])
    if not expected:
        # No keyword list for this specialty: nothing to check, assume aligned.
        return True

    lowered = report_text.lower()
    return any(keyword in lowered for keyword in expected)
208
+
209
+
210
def score_field_completeness(state) -> float:
    """Fraction (0.0-1.0) of structured PatientProfile fields populated by parsing."""
    if not state or not state.patient_profile:
        return 0.0

    p = state.patient_profile
    # One boolean per tracked field; the score is the populated fraction.
    checks = (
        p.age is not None,
        p.gender.value != "unknown",
        bool(p.chief_complaint),
        bool(p.history_of_present_illness),
        len(p.past_medical_history) > 0,
        len(p.current_medications) > 0,
        len(p.allergies) > 0,
        len(p.lab_results) > 0,
        p.vital_signs is not None,
        bool(p.social_history),
        bool(p.family_history),
    )
    return sum(checks) / len(checks)
230
+
231
+
232
+ # ──────────────────────────────────────────────
233
+ # Validation harness
234
+ # ──────────────────────────────────────────────
235
+
236
async def validate_mtsamples(
    cases: List[ValidationCase],
    include_drug_check: bool = True,
    include_guidelines: bool = True,
    delay_between_cases: float = 2.0,
) -> ValidationSummary:
    """
    Run MTSamples cases through the CDS pipeline and score results.

    Args:
        cases: MTSamples transcriptions wrapped as ValidationCase objects;
            ground_truth carries the source "specialty" label.
        include_drug_check: Forwarded to the pipeline; enables the drug
            interaction step.
        include_guidelines: Forwarded to the pipeline; enables guideline
            retrieval.
        delay_between_cases: Sleep (seconds) between cases — rate-limits
            calls to upstream services used by the pipeline.

    Returns:
        ValidationSummary with per-case results and dataset-level averages
        for parse robustness metrics (parse_success, field_completeness,
        specialty_alignment, ...).
    """
    results: List[ValidationResult] = []
    start_time = time.time()

    for i, case in enumerate(cases):
        specialty = case.ground_truth.get("specialty", "?")
        # Progress line; end="" so the per-case verdict is appended later.
        print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty}): ", end="", flush=True)

        # monotonic() for per-case wall time (immune to clock adjustments).
        case_start = time.monotonic()

        state, report, error = await run_cds_pipeline(
            patient_text=case.input_text,
            include_drug_check=include_drug_check,
            include_guidelines=include_guidelines,
        )

        elapsed_ms = int((time.monotonic() - case_start) * 1000)

        # Per-step status map (step_id -> status string) when a state exists,
        # even for partially failed runs.
        step_results = {}
        if state:
            step_results = {s.step_id: s.status.value for s in state.steps}

        # Score
        scores = {}
        details = {}

        # Parse success: the parser produced a patient profile, regardless of
        # whether downstream steps (and therefore `report`) succeeded.
        scores["parse_success"] = 1.0 if (state and state.patient_profile) else 0.0

        # Field completeness: fraction of structured fields extracted.
        scores["field_completeness"] = score_field_completeness(state)

        if report:
            # Has differential
            scores["has_differential"] = 1.0 if len(report.differential_diagnosis) > 0 else 0.0

            # Has recommendations
            scores["has_recommendations"] = 1.0 if len(report.suggested_next_steps) > 0 else 0.0

            # Has guideline recommendations
            scores["has_guidelines"] = 1.0 if len(report.guideline_recommendations) > 0 else 0.0

            # Specialty alignment: keyword check over all free text the
            # report produced (summary, diagnoses, guidelines, next steps).
            full_report_text = " ".join([
                report.patient_summary or "",
                " ".join(d.diagnosis for d in report.differential_diagnosis),
                " ".join(report.guideline_recommendations),
                " ".join(a.action for a in report.suggested_next_steps),
            ])
            scores["specialty_alignment"] = 1.0 if check_specialty_alignment(
                full_report_text, specialty
            ) else 0.0

            # Conflict detection worked (if applicable)
            if state and state.conflict_detection:
                scores["conflict_detection_ran"] = 1.0
            else:
                scores["conflict_detection_ran"] = 0.0

            details = {
                "specialty": specialty,
                "num_diagnoses": len(report.differential_diagnosis),
                "num_recommendations": len(report.suggested_next_steps),
                "field_completeness": scores["field_completeness"],
                "num_conflicts": len(report.conflicts) if report.conflicts else 0,
            }

            print(f"✓ fields={scores['field_completeness']:.0%} dx={len(report.differential_diagnosis)} ({elapsed_ms}ms)")
        else:
            # Pipeline produced no report: zero every report-derived metric
            # (parse_success / field_completeness above may still be nonzero).
            scores.update({
                "has_differential": 0.0,
                "has_recommendations": 0.0,
                "has_guidelines": 0.0,
                "specialty_alignment": 0.0,
                "conflict_detection_ran": 0.0,
            })
            details = {"specialty": specialty, "error": error}
            print(f"✗ FAILED: {error[:80] if error else 'unknown'}")

        results.append(ValidationResult(
            case_id=case.case_id,
            source_dataset="mtsamples",
            success=report is not None,
            scores=scores,
            pipeline_time_ms=elapsed_ms,
            step_results=step_results,
            report_summary=report.patient_summary[:200] if report else None,
            error=error,
            details=details,
        ))

        # Throttle between cases (skipped after the last one).
        if i < len(cases) - 1:
            await asyncio.sleep(delay_between_cases)

    # Aggregate: mean of each metric over ALL cases (failures count as 0.0).
    total = len(results)
    successful = sum(1 for r in results if r.success)

    metric_names = [
        "parse_success", "field_completeness", "has_differential",
        "has_recommendations", "has_guidelines", "specialty_alignment",
        "conflict_detection_ran",
    ]
    metrics = {}
    for m in metric_names:
        values = [r.scores.get(m, 0.0) for r in results]
        metrics[m] = sum(values) / len(values) if values else 0.0

    # Average runtime over successful cases only.
    times = [r.pipeline_time_ms for r in results if r.success]
    metrics["avg_pipeline_time_ms"] = sum(times) / len(times) if times else 0

    summary = ValidationSummary(
        dataset="mtsamples",
        total_cases=total,
        successful_cases=successful,
        failed_cases=total - successful,
        metrics=metrics,
        per_case=results,
        run_duration_sec=time.time() - start_time,
    )

    return summary
367
+
368
+
369
+ # ──────────────────────────────────────────────
370
+ # Standalone runner
371
+ # ──────────────────────────────────────────────
372
+
373
async def main():
    """Command-line entry point: fetch MTSamples cases, validate, save results."""
    import argparse

    cli = argparse.ArgumentParser(description="MTSamples Validation")
    cli.add_argument("--max-cases", type=int, default=10, help="Number of cases to evaluate")
    cli.add_argument("--seed", type=int, default=42, help="Random seed")
    cli.add_argument("--no-drugs", action="store_true", help="Skip drug interaction check")
    cli.add_argument("--no-guidelines", action="store_true", help="Skip guideline retrieval")
    cli.add_argument("--delay", type=float, default=2.0, help="Delay between cases (seconds)")
    opts = cli.parse_args()

    print("MTSamples Validation Harness")
    print("=" * 40)

    # Fetch (cache-aware), run the harness, then persist and display results.
    selected = await fetch_mtsamples(max_cases=opts.max_cases, seed=opts.seed)
    outcome = await validate_mtsamples(
        selected,
        include_drug_check=not opts.no_drugs,
        include_guidelines=not opts.no_guidelines,
        delay_between_cases=opts.delay,
    )

    print_summary(outcome)
    saved_to = save_results(outcome)
    print(f"Results saved to: {saved_to}")


if __name__ == "__main__":
    asyncio.run(main())
src/backend/validation/harness_pmc.py ADDED
@@ -0,0 +1,470 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PMC Case Reports fetcher and validation harness.
3
+
4
+ Fetches published clinical case reports from PubMed Central and evaluates
5
+ the CDS pipeline's diagnostic accuracy against gold-standard diagnoses.
6
+
7
+ Source: NCBI PubMed / PubMed Central (E-utilities API)
8
+ Format: XML abstracts with case presentations and final diagnoses
9
+
10
+ Metrics:
11
+ - diagnostic_accuracy: Correct diagnosis appears in differential
12
+ - top3_accuracy: Correct diagnosis in top 3
13
+ - parse_success_rate: Pipeline completed without crashing
14
+ - has_recommendations: Report includes actionable next steps
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import asyncio
19
+ import json
20
+ import random
21
+ import re
22
+ import time
23
+ import xml.etree.ElementTree as ET
24
+ from pathlib import Path
25
+ from typing import List, Optional, Tuple
26
+
27
+ import httpx
28
+
29
+ from validation.base import (
30
+ DATA_DIR,
31
+ ValidationCase,
32
+ ValidationResult,
33
+ ValidationSummary,
34
+ diagnosis_in_differential,
35
+ ensure_data_dir,
36
+ fuzzy_match,
37
+ normalize_text,
38
+ print_summary,
39
+ run_cds_pipeline,
40
+ save_results,
41
+ )
42
+
43
+
44
+ # ──────────────────────────────────────────────
45
+ # NCBI E-utilities configuration
46
+ # ──────────────────────────────────────────────
47
+
48
# Base URL for NCBI Entrez E-utilities; esearch finds PMIDs, efetch
# retrieves the full records.
EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
ESEARCH_URL = f"{EUTILS_BASE}/esearch.fcgi"
EFETCH_URL = f"{EUTILS_BASE}/efetch.fcgi"

# Curated search queries for case reports with clear diagnoses.
# Each tuple: (search_query, expected_specialty).
# The [Title] filters keep results to articles whose title names both the
# "case report" framing and the condition — the quoted condition term also
# doubles as the fallback gold diagnosis in _extract_case_and_diagnosis().
CASE_REPORT_QUERIES = [
    ('"case report"[Title] AND "myocardial infarction"[Title] AND diagnosis', "Cardiology"),
    ('"case report"[Title] AND "pneumonia"[Title] AND diagnosis', "Pulmonology"),
    ('"case report"[Title] AND "diabetic ketoacidosis"[Title]', "Endocrinology"),
    ('"case report"[Title] AND "stroke"[Title] AND diagnosis', "Neurology"),
    ('"case report"[Title] AND "appendicitis"[Title] AND diagnosis', "Surgery"),
    ('"case report"[Title] AND "pulmonary embolism"[Title]', "Pulmonology"),
    ('"case report"[Title] AND "sepsis"[Title] AND management', "Critical Care"),
    ('"case report"[Title] AND "heart failure"[Title] AND management', "Cardiology"),
    ('"case report"[Title] AND "pancreatitis"[Title] AND diagnosis', "Gastroenterology"),
    ('"case report"[Title] AND "meningitis"[Title] AND diagnosis', "Neurology/ID"),
    ('"case report"[Title] AND "urinary tract infection"[Title]', "Urology/ID"),
    ('"case report"[Title] AND "thyroid"[Title] AND "nodule"', "Endocrinology"),
    ('"case report"[Title] AND "deep vein thrombosis"[Title]', "Hematology"),
    ('"case report"[Title] AND "anaphylaxis"[Title]', "Allergy/EM"),
    ('"case report"[Title] AND "renal failure"[Title] AND acute', "Nephrology"),
    ('"case report"[Title] AND "liver cirrhosis"[Title]', "Hepatology"),
    ('"case report"[Title] AND "asthma"[Title] AND exacerbation', "Pulmonology"),
    ('"case report"[Title] AND "seizure"[Title] AND diagnosis', "Neurology"),
    ('"case report"[Title] AND "hypoglycemia"[Title]', "Endocrinology"),
    ('"case report"[Title] AND "gastrointestinal bleeding"[Title]', "Gastroenterology"),
]
76
+
77
+
78
async def fetch_pmc_cases(
    max_cases: int = 20,
    seed: int = 42,
) -> List[ValidationCase]:
    """
    Fetch case reports from PubMed and convert to ValidationCase objects.

    Uses PubMed E-utilities to search for case reports with clear diagnoses,
    then extracts the clinical presentation and diagnosis from abstracts.
    Results are cached under DATA_DIR/pmc_cases.json; a sufficiently large
    cache is sampled instead of re-fetching.

    Args:
        max_cases: Maximum number of cases to fetch
        seed: Random seed for reproducible selection

    Returns:
        Up to ``max_cases`` ValidationCase objects (fewer if queries fail
        or abstracts are unusable).
    """
    ensure_data_dir()
    cache_path = DATA_DIR / "pmc_cases.json"

    if cache_path.exists():
        print(f" Loading PMC cases from cache: {cache_path}")
        cached = json.loads(cache_path.read_text(encoding="utf-8"))
        cases = [ValidationCase(**c) for c in cached]
        if len(cases) >= max_cases:
            # Seeded sample so repeat runs pick the same subset.
            random.seed(seed)
            return random.sample(cases, min(max_cases, len(cases)))
        # Fall through to fetch more if cache is insufficient
        # NOTE(review): the undersized cache is later *replaced* by the
        # newly fetched set rather than merged with it — confirm intended.

    print(f" Fetching case reports from PubMed...")
    cases = await _fetch_from_pubmed(max_cases, seed)

    if cases:
        # Cache: serialize the dataclass fields explicitly so the JSON
        # round-trips back through ValidationCase(**c) above.
        cached_data = [
            {
                "case_id": c.case_id,
                "source_dataset": c.source_dataset,
                "input_text": c.input_text,
                "ground_truth": c.ground_truth,
                "metadata": c.metadata,
            }
            for c in cases
        ]
        cache_path.write_text(json.dumps(cached_data, indent=2), encoding="utf-8")
        print(f" Cached {len(cases)} PMC cases to {cache_path}")

    print(f" Loaded {len(cases)} PMC case reports")
    return cases
124
+
125
+
126
async def _fetch_from_pubmed(max_cases: int, seed: int) -> List[ValidationCase]:
    """Fetch case reports via PubMed E-utilities.

    Samples up to ``max_cases`` queries from CASE_REPORT_QUERIES (seeded for
    reproducibility), takes the first usable article per query, and converts
    each into a ValidationCase. Network or parse failures for one query are
    logged and skipped, never fatal.
    """
    cases = []
    random.seed(seed)
    # One query ≈ one case, since only the first PMID per query is used.
    queries = random.sample(CASE_REPORT_QUERIES, min(max_cases, len(CASE_REPORT_QUERIES)))

    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        for query_text, specialty in queries:
            if len(cases) >= max_cases:
                break

            try:
                # Step 1: Search for PMIDs
                pmids = await _esearch(client, query_text, retmax=3)
                if not pmids:
                    continue

                # Step 2: Fetch abstracts
                for pmid in pmids[:1]:  # Take first result per query
                    abstract_data = await _efetch_abstract(client, pmid)
                    if not abstract_data:
                        continue

                    title, abstract = abstract_data

                    # Step 3: Extract case presentation and diagnosis
                    presentation, diagnosis = _extract_case_and_diagnosis(title, abstract, query_text)
                    if not presentation or not diagnosis:
                        continue

                    cases.append(ValidationCase(
                        case_id=f"pmc_{pmid}",
                        source_dataset="pmc",
                        input_text=presentation,
                        ground_truth={
                            "diagnosis": diagnosis,
                            "specialty": specialty,
                            "title": title,
                        },
                        metadata={
                            "pmid": pmid,
                            # Kept for debugging/audit; not fed to the pipeline.
                            "full_abstract": abstract,
                        },
                    ))

                    if len(cases) >= max_cases:
                        break

                # NCBI rate limit: max 3 requests/second without API key
                await asyncio.sleep(0.5)

            except Exception as e:
                # Best-effort fetch: report and move on to the next query.
                print(f" Warning: Query failed '{query_text[:40]}...': {e}")
                continue

    return cases
182
+
183
+
184
async def _esearch(client: httpx.AsyncClient, query: str, retmax: int = 3) -> List[str]:
    """Run a PubMed esearch query and return the matching PMIDs (possibly empty)."""
    response = await client.get(
        ESEARCH_URL,
        params={
            "db": "pubmed",
            "term": query,
            "retmax": retmax,
            "retmode": "json",
            "sort": "relevance",
        },
    )
    response.raise_for_status()
    payload = response.json()
    # esearch JSON shape: {"esearchresult": {"idlist": ["12345", ...]}}
    return payload.get("esearchresult", {}).get("idlist", [])
197
+
198
+
199
async def _efetch_abstract(client: httpx.AsyncClient, pmid: str) -> Optional[Tuple[str, str]]:
    """Fetch the title and abstract for a PMID.

    Args:
        client: Shared async HTTP client.
        pmid: PubMed identifier to fetch.

    Returns:
        ``(title, abstract)`` on success, or ``None`` when the record has no
        usable abstract (under 100 characters) or the XML fails to parse.
    """
    params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml",
    }
    r = await client.get(EFETCH_URL, params=params)
    r.raise_for_status()

    try:
        root = ET.fromstring(r.text)

        # Title: itertext() flattens inline markup (<i>, <sup>, ...) that a
        # plain .text read would silently drop.
        title_el = root.find(".//ArticleTitle")
        title = "".join(title_el.itertext()).strip() if title_el is not None else ""

        # Abstract: structured abstracts carry one AbstractText per section,
        # each optionally tagged with a Label (e.g. "CASE PRESENTATION").
        abstract_parts = []
        for abs_text in root.findall(".//AbstractText"):
            label = abs_text.get("Label", "")
            # itertext() collects text at any nesting depth, unlike a
            # one-level text/tail walk.
            full_text = "".join(abs_text.itertext()).strip()
            if label:
                abstract_parts.append(f"{label}: {full_text}")
            else:
                abstract_parts.append(full_text)

        abstract = " ".join(abstract_parts)

        # Too-short abstracts cannot contain a usable case presentation.
        if len(abstract) < 100:
            return None

        return title, abstract

    except ET.ParseError:
        return None
239
+
240
+
241
+ def _extract_case_and_diagnosis(
242
+ title: str, abstract: str, search_query: str
243
+ ) -> Tuple[Optional[str], Optional[str]]:
244
+ """
245
+ Extract the clinical presentation and final diagnosis from a case report abstract.
246
+
247
+ Strategy:
248
+ 1. Try structured abstract sections (CASE PRESENTATION, DIAGNOSIS, etc.)
249
+ 2. Extract diagnosis from the title (common pattern: "A case of [diagnosis]")
250
+ 3. Fall back to using the search condition as the expected diagnosis
251
+ """
252
+ # Try to extract diagnosis from title
253
+ diagnosis = None
254
+ title_patterns = [
255
+ r"case (?:report )?of (.+?)(?:\.|:|$)",
256
+ r"presenting (?:as|with) (.+?)(?:\.|:|$)",
257
+ r"diagnosed (?:as|with) (.+?)(?:\.|:|$)",
258
+ r"rare case of (.+?)(?:\.|:|$)",
259
+ r"unusual (?:case|presentation) of (.+?)(?:\.|:|$)",
260
+ # Pattern: "Diagnosis Name: A Case Report"
261
+ r"^(.+?):\s*[Aa]\s*[Cc]ase\s*[Rr]eport",
262
+ # Pattern: "Diagnosis Name - Case Report"
263
+ r"^(.+?)\s*[-–—]\s*[Cc]ase\s*[Rr]eport",
264
+ # Pattern: "Case of Diagnosis Name"
265
+ r"[Cc]ase\s+of\s+(.+?)(?:\.|:|,|$)",
266
+ ]
267
+ for pattern in title_patterns:
268
+ match = re.search(pattern, title, re.IGNORECASE)
269
+ if match:
270
+ diagnosis = match.group(1).strip()
271
+ break
272
+
273
+ if not diagnosis:
274
+ # Extract from search query
275
+ # queries look like: '"case report"[Title] AND "myocardial infarction"[Title]'
276
+ # Find all quoted terms and pick the one that isn't "case report"
277
+ matches = re.findall(r'"([^"]+)"', search_query)
278
+ for m in matches:
279
+ if m.lower() != "case report":
280
+ diagnosis = m
281
+ break
282
+
283
+ if not diagnosis:
284
+ return None, None
285
+
286
+ # Clean diagnosis text
287
+ diagnosis = diagnosis.strip().rstrip('.')
288
+
289
+ # Extract clinical presentation
290
+ # For structured abstracts, look for specific sections
291
+ presentation_sections = ["CASE PRESENTATION", "CASE REPORT", "CASE", "CLINICAL PRESENTATION", "HISTORY"]
292
+ conclusion_sections = ["CONCLUSION", "DISCUSSION", "OUTCOME", "DIAGNOSIS", "RESULTS"]
293
+
294
+ # Try to split abstract into presentation vs conclusion
295
+ presentation = abstract
296
+
297
+ # Look for section boundaries in structured abstracts
298
+ for cs in conclusion_sections:
299
+ pattern = re.compile(rf'\b{cs}\b[:\s]', re.IGNORECASE)
300
+ match = pattern.search(abstract)
301
+ if match:
302
+ # Everything before the conclusion is the presentation
303
+ candidate = abstract[:match.start()].strip()
304
+ if len(candidate) > 100:
305
+ presentation = candidate
306
+ break
307
+
308
+ # Clean up
309
+ presentation = presentation.strip()
310
+ if len(presentation) < 50:
311
+ presentation = abstract # Use full abstract if extraction is too short
312
+
313
+ return presentation, diagnosis
314
+
315
+
316
+ # ──────────────────────────────────────────────
317
+ # Validation harness
318
+ # ──────────────────────────────────────────────
319
+
320
async def validate_pmc(
    cases: List[ValidationCase],
    include_drug_check: bool = True,
    include_guidelines: bool = True,
    delay_between_cases: float = 2.0,
) -> ValidationSummary:
    """
    Run PMC case reports through the CDS pipeline and score results.

    Args:
        cases: PMC case reports; ground_truth carries "diagnosis" (gold
            label), "specialty", and "title".
        include_drug_check: Forwarded to the pipeline.
        include_guidelines: Forwarded to the pipeline.
        delay_between_cases: Sleep (seconds) between cases to avoid
            hammering rate-limited upstream services.

    Returns:
        ValidationSummary with per-case results and dataset-level averages
        (diagnostic_accuracy, top3/top1 accuracy, parse_success,
        has_recommendations, avg_pipeline_time_ms).
    """
    results: List[ValidationResult] = []
    start_time = time.time()

    for i, case in enumerate(cases):
        dx = case.ground_truth.get("diagnosis", "?")
        specialty = case.ground_truth.get("specialty", "?")
        # Progress line; end="" so the per-case verdict is appended later.
        print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty} — {dx[:40]}): ", end="", flush=True)

        # monotonic() for per-case wall time (immune to clock adjustments).
        case_start = time.monotonic()

        state, report, error = await run_cds_pipeline(
            patient_text=case.input_text,
            include_drug_check=include_drug_check,
            include_guidelines=include_guidelines,
        )

        elapsed_ms = int((time.monotonic() - case_start) * 1000)

        # Per-step status map (step_id -> status string), even on failure.
        step_results = {}
        if state:
            step_results = {s.step_id: s.status.value for s in state.steps}

        scores = {}
        details = {}
        target_diagnosis = case.ground_truth["diagnosis"]

        if report:
            # Diagnostic accuracy (anywhere in differential)
            found_any, rank_any = diagnosis_in_differential(target_diagnosis, report)
            scores["diagnostic_accuracy"] = 1.0 if found_any else 0.0

            # Top-3 accuracy
            found_top3, rank3 = diagnosis_in_differential(target_diagnosis, report, top_n=3)
            scores["top3_accuracy"] = 1.0 if found_top3 else 0.0

            # Top-1 accuracy
            found_top1, rank1 = diagnosis_in_differential(target_diagnosis, report, top_n=1)
            scores["top1_accuracy"] = 1.0 if found_top1 else 0.0

            # Parse success: a report implies the pipeline completed.
            scores["parse_success"] = 1.0

            # Has recommendations
            scores["has_recommendations"] = 1.0 if len(report.suggested_next_steps) > 0 else 0.0

            details = {
                "target_diagnosis": target_diagnosis,
                "top_diagnosis": report.differential_diagnosis[0].diagnosis if report.differential_diagnosis else "NONE",
                "num_diagnoses": len(report.differential_diagnosis),
                # -1 sentinel when the gold diagnosis never appeared.
                "found_at_rank": rank_any if found_any else -1,
                "all_diagnoses": [d.diagnosis for d in report.differential_diagnosis[:5]],
            }

            icon = "✓" if found_any else "✗"
            top_dx = report.differential_diagnosis[0].diagnosis if report.differential_diagnosis else "NONE"
            print(f"{icon} top1={'Y' if found_top1 else 'N'} diag={'Y' if found_any else 'N'} | top: {top_dx[:30]} ({elapsed_ms}ms)")
        else:
            # No report produced: every metric zeroes out for this case.
            scores = {
                "diagnostic_accuracy": 0.0,
                "top3_accuracy": 0.0,
                "top1_accuracy": 0.0,
                "parse_success": 0.0,
                "has_recommendations": 0.0,
            }
            details = {"target_diagnosis": target_diagnosis, "error": error}
            print(f"✗ FAILED: {error[:80] if error else 'unknown'}")

        results.append(ValidationResult(
            case_id=case.case_id,
            source_dataset="pmc",
            success=report is not None,
            scores=scores,
            pipeline_time_ms=elapsed_ms,
            step_results=step_results,
            report_summary=report.patient_summary[:200] if report else None,
            error=error,
            details=details,
        ))

        # Throttle between cases (skipped after the last one).
        if i < len(cases) - 1:
            await asyncio.sleep(delay_between_cases)

    # Aggregate: mean of each metric over ALL cases (failures count as 0.0).
    total = len(results)
    successful = sum(1 for r in results if r.success)

    metric_names = ["diagnostic_accuracy", "top3_accuracy", "top1_accuracy", "parse_success", "has_recommendations"]
    metrics = {}
    for m in metric_names:
        values = [r.scores.get(m, 0.0) for r in results]
        metrics[m] = sum(values) / len(values) if values else 0.0

    # Average runtime over successful cases only.
    times = [r.pipeline_time_ms for r in results if r.success]
    metrics["avg_pipeline_time_ms"] = sum(times) / len(times) if times else 0

    summary = ValidationSummary(
        dataset="pmc",
        total_cases=total,
        successful_cases=successful,
        failed_cases=total - successful,
        metrics=metrics,
        per_case=results,
        run_duration_sec=time.time() - start_time,
    )

    return summary
435
+
436
+
437
+ # ──────────────────────────────────────────────
438
+ # Standalone runner
439
+ # ──────────────────────────────────────────────
440
+
441
async def main():
    """Command-line entry point: fetch PMC case reports, validate, save results."""
    import argparse

    cli = argparse.ArgumentParser(description="PMC Case Reports Validation")
    cli.add_argument("--max-cases", type=int, default=10, help="Number of cases to evaluate")
    cli.add_argument("--seed", type=int, default=42, help="Random seed")
    cli.add_argument("--no-drugs", action="store_true", help="Skip drug interaction check")
    cli.add_argument("--no-guidelines", action="store_true", help="Skip guideline retrieval")
    cli.add_argument("--delay", type=float, default=2.0, help="Delay between cases (seconds)")
    opts = cli.parse_args()

    print("PMC Case Reports Validation Harness")
    print("=" * 40)

    # Fetch (cache-aware), run the harness, then persist and display results.
    selected = await fetch_pmc_cases(max_cases=opts.max_cases, seed=opts.seed)
    outcome = await validate_pmc(
        selected,
        include_drug_check=not opts.no_drugs,
        include_guidelines=not opts.no_guidelines,
        delay_between_cases=opts.delay,
    )

    print_summary(outcome)
    saved_to = save_results(outcome)
    print(f"Results saved to: {saved_to}")


if __name__ == "__main__":
    asyncio.run(main())
src/backend/validation/run_validation.py ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified validation runner for the Clinical Decision Support Agent.
3
+
4
+ Runs all three dataset validations (MedQA, MTSamples, PMC Case Reports)
5
+ and produces a combined summary report.
6
+
7
+ Usage:
8
+ # From src/backend directory:
9
+ python -m validation.run_validation --all --max-cases 10
10
+ python -m validation.run_validation --medqa --max-cases 20
11
+ python -m validation.run_validation --mtsamples --max-cases 15
12
+ python -m validation.run_validation --pmc --max-cases 10
13
+
14
+ # Fetch data only (no pipeline execution):
15
+ python -m validation.run_validation --fetch-only
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import asyncio
20
+ import json
21
+ import sys
22
+ import time
23
+ from datetime import datetime, timezone
24
+ from pathlib import Path
25
+
26
+ # Ensure backend is importable
27
+ BACKEND_DIR = Path(__file__).resolve().parent.parent
28
+ if str(BACKEND_DIR) not in sys.path:
29
+ sys.path.insert(0, str(BACKEND_DIR))
30
+
31
+ from validation.base import (
32
+ ValidationSummary,
33
+ print_summary,
34
+ save_results,
35
+ )
36
+ from validation.harness_medqa import fetch_medqa, validate_medqa
37
+ from validation.harness_mtsamples import fetch_mtsamples, validate_mtsamples
38
+ from validation.harness_pmc import fetch_pmc_cases, validate_pmc
39
+
40
+
41
async def run_all_validations(
    run_medqa: bool = True,
    run_mtsamples: bool = True,
    run_pmc: bool = True,
    max_cases: int = 10,
    seed: int = 42,
    include_drug_check: bool = True,
    include_guidelines: bool = True,
    delay: float = 2.0,
    fetch_only: bool = False,
) -> dict:
    """
    Run validation against selected datasets.

    Args:
        run_medqa / run_mtsamples / run_pmc: Which harnesses to execute.
        max_cases: Case budget applied to each selected dataset.
        seed: Random seed forwarded to each fetcher for reproducibility.
        include_drug_check: Forwarded to the pipeline.
        include_guidelines: Forwarded to the pipeline.
        delay: Per-case throttle (seconds) forwarded to each harness.
        fetch_only: Download/caches data only; the pipeline never runs and
            no summaries are produced.

    Returns:
        dict of {dataset_name: ValidationSummary} — empty in fetch-only mode.
    """
    results = {}
    start = time.time()

    # ── MedQA ──
    if run_medqa:
        print("\n" + "=" * 60)
        print(" DATASET 1: MedQA (USMLE-style diagnostic accuracy)")
        print("=" * 60)

        cases = await fetch_medqa(max_cases=max_cases, seed=seed)

        if fetch_only:
            print(f" Fetched {len(cases)} MedQA cases (fetch-only mode)")
        else:
            summary = await validate_medqa(
                cases,
                include_drug_check=include_drug_check,
                include_guidelines=include_guidelines,
                delay_between_cases=delay,
            )
            # Per-dataset results are printed and saved immediately so a
            # later dataset failing doesn't lose earlier output.
            print_summary(summary)
            save_results(summary)
            results["medqa"] = summary

    # ── MTSamples ──
    if run_mtsamples:
        print("\n" + "=" * 60)
        print(" DATASET 2: MTSamples (clinical note parsing robustness)")
        print("=" * 60)

        cases = await fetch_mtsamples(max_cases=max_cases, seed=seed)

        if fetch_only:
            print(f" Fetched {len(cases)} MTSamples cases (fetch-only mode)")
        else:
            summary = await validate_mtsamples(
                cases,
                include_drug_check=include_drug_check,
                include_guidelines=include_guidelines,
                delay_between_cases=delay,
            )
            print_summary(summary)
            save_results(summary)
            results["mtsamples"] = summary

    # ── PMC Case Reports ──
    if run_pmc:
        print("\n" + "=" * 60)
        print(" DATASET 3: PMC Case Reports (real-world diagnostic accuracy)")
        print("=" * 60)

        cases = await fetch_pmc_cases(max_cases=max_cases, seed=seed)

        if fetch_only:
            print(f" Fetched {len(cases)} PMC cases (fetch-only mode)")
        else:
            summary = await validate_pmc(
                cases,
                include_drug_check=include_drug_check,
                include_guidelines=include_guidelines,
                delay_between_cases=delay,
            )
            print_summary(summary)
            save_results(summary)
            results["pmc"] = summary

    # ── Combined Summary ──
    total_duration = time.time() - start

    if results and not fetch_only:
        _print_combined_summary(results, total_duration)
        _save_combined_report(results, total_duration)

    return results
131
+
132
+
133
+ def _print_combined_summary(results: dict, total_duration: float):
134
+ """Print a combined summary across all datasets."""
135
+ print("\n" + "=" * 70)
136
+ print(" COMBINED VALIDATION REPORT")
137
+ print("=" * 70)
138
+
139
+ # Header
140
+ print(f"\n {'Dataset':<15} {'Cases':>6} {'Success':>8} {'Key Metric':>25} {'Value':>8}")
141
+ print(f" {'-'*15} {'-'*6} {'-'*8} {'-'*25} {'-'*8}")
142
+
143
+ for name, summary in results.items():
144
+ # Pick the most important metric for each dataset
145
+ if name == "medqa":
146
+ key_metric = "top3_accuracy"
147
+ elif name == "mtsamples":
148
+ key_metric = "parse_success"
149
+ elif name == "pmc":
150
+ key_metric = "diagnostic_accuracy"
151
+ else:
152
+ key_metric = list(summary.metrics.keys())[0] if summary.metrics else "N/A"
153
+
154
+ value = summary.metrics.get(key_metric, 0.0)
155
+ print(
156
+ f" {name:<15} {summary.total_cases:>6} "
157
+ f"{summary.successful_cases:>8} "
158
+ f"{key_metric:>25} {value:>7.1%}"
159
+ )
160
+
161
+ # All metrics
162
+ print(f"\n {'─' * 66}")
163
+ for name, summary in results.items():
164
+ print(f"\n {name.upper()} metrics:")
165
+ for metric, value in sorted(summary.metrics.items()):
166
+ if "time" in metric and isinstance(value, (int, float)):
167
+ print(f" {metric:<35} {value:.0f}ms")
168
+ elif isinstance(value, float):
169
+ print(f" {metric:<35} {value:.1%}")
170
+
171
+ # Totals
172
+ total_cases = sum(s.total_cases for s in results.values())
173
+ total_success = sum(s.successful_cases for s in results.values())
174
+ print(f"\n Total cases: {total_cases}")
175
+ print(f" Total success: {total_success}")
176
+ print(f" Total duration: {total_duration:.1f}s ({total_duration/60:.1f}min)")
177
+ print(f" Timestamp: {datetime.now(timezone.utc).isoformat()}")
178
+ print("=" * 70)
179
+
180
+
181
def _save_combined_report(results: dict, total_duration: float):
    """Write the combined multi-dataset report to results/combined_<ts>.json
    next to this module, creating the directory if needed."""
    out_dir = Path(__file__).resolve().parent / "results"
    out_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    out_path = out_dir / f"combined_{stamp}.json"

    payload = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "total_duration_sec": total_duration,
        # Only aggregate numbers are persisted here; per-case detail lives in
        # each dataset's own results file.
        "datasets": {
            name: {
                "total_cases": summary.total_cases,
                "successful_cases": summary.successful_cases,
                "failed_cases": summary.failed_cases,
                "metrics": summary.metrics,
                "run_duration_sec": summary.run_duration_sec,
            }
            for name, summary in results.items()
        },
    }

    # default=str keeps any non-JSON-native metric values serializable.
    out_path.write_text(json.dumps(payload, indent=2, default=str))
    print(f"\n Combined report saved to: {out_path}")
206
+
207
+
208
def main():
    """Parse command-line arguments and launch the requested validations."""
    import argparse

    ap = argparse.ArgumentParser(
        description="CDS Agent Validation Suite",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python -m validation.run_validation --all --max-cases 10
  python -m validation.run_validation --medqa --max-cases 50
  python -m validation.run_validation --fetch-only
  python -m validation.run_validation --medqa --pmc --max-cases 20 --no-drugs
""",
    )

    # Dataset selection flags are all identical booleans, so register
    # them table-driven instead of one call per flag.
    dataset_group = ap.add_argument_group("Datasets")
    for flag, help_text in (
        ("--all", "Run all three datasets"),
        ("--medqa", "Run MedQA validation"),
        ("--mtsamples", "Run MTSamples validation"),
        ("--pmc", "Run PMC Case Reports validation"),
    ):
        dataset_group.add_argument(flag, action="store_true", help=help_text)

    cfg_group = ap.add_argument_group("Configuration")
    cfg_group.add_argument("--max-cases", type=int, default=10, help="Cases per dataset (default: 10)")
    cfg_group.add_argument("--seed", type=int, default=42, help="Random seed (default: 42)")
    cfg_group.add_argument("--delay", type=float, default=2.0, help="Delay between cases in seconds (default: 2.0)")
    cfg_group.add_argument("--no-drugs", action="store_true", help="Skip drug interaction checks")
    cfg_group.add_argument("--no-guidelines", action="store_true", help="Skip guideline retrieval")
    cfg_group.add_argument("--fetch-only", action="store_true", help="Only download data, don't run pipeline")

    args = ap.parse_args()

    # With no dataset flag at all, behave as if --all had been passed.
    if not (args.all or args.medqa or args.mtsamples or args.pmc):
        args.all = True

    run_medqa = args.all or args.medqa
    run_mtsamples = args.all or args.mtsamples
    run_pmc = args.all or args.pmc

    print("╔════════════════════════════════════════════════════════╗")
    print("║   Clinical Decision Support Agent — Validation Suite   ║")
    print("╚════════════════════════════════════════════════════════╝")

    # Assemble the selected-dataset label the same way the banner prints it:
    # each enabled name followed by a trailing space.
    selected = "".join(
        label
        for enabled, label in (
            (run_medqa, "MedQA "),
            (run_mtsamples, "MTSamples "),
            (run_pmc, "PMC "),
        )
        if enabled
    )
    print(f"\n Datasets: {selected}")
    print(f" Cases/dataset: {args.max_cases}")
    print(f" Drug check: {'Yes' if not args.no_drugs else 'No'}")
    print(f" Guidelines: {'Yes' if not args.no_guidelines else 'No'}")
    print(f" Fetch only: {'Yes' if args.fetch_only else 'No'}")

    asyncio.run(run_all_validations(
        run_medqa=run_medqa,
        run_mtsamples=run_mtsamples,
        run_pmc=run_pmc,
        max_cases=args.max_cases,
        seed=args.seed,
        include_drug_check=not args.no_drugs,
        include_guidelines=not args.no_guidelines,
        delay=args.delay,
        fetch_only=args.fetch_only,
    ))
270
+
271
+
272
# Script entry point: run the CLI only when executed directly
# (e.g. `python -m validation.run_validation`), not on import.
if __name__ == "__main__":
    main()