bshepp
committed on
Commit
·
3d02eb2
1
Parent(s):
8aed835
Add incremental checkpoint saves, --resume flag, fix enum case-sensitivity, add HF_TOKEN to template
Browse files- src/backend/.env.template +4 -0
- src/backend/app/models/schemas.py +11 -1
- src/backend/validation/base.py +76 -16
- src/backend/validation/harness_medqa.py +24 -2
- src/backend/validation/harness_mtsamples.py +23 -2
- src/backend/validation/harness_pmc.py +23 -2
- src/backend/validation/run_validation.py +7 -0
src/backend/.env.template
CHANGED
|
@@ -17,6 +17,10 @@ MEDGEMMA_MODEL_ID=google/medgemma-27b-text-it
|
|
| 17 |
# OpenFDA (no key required for basic use, add for higher rate limits)
|
| 18 |
# OPENFDA_API_KEY=
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# --- RAG Configuration ---
|
| 21 |
CHROMA_PERSIST_DIR=./data/chroma
|
| 22 |
EMBEDDING_MODEL=all-MiniLM-L6-v2
|
|
|
|
| 17 |
# OpenFDA (no key required for basic use, add for higher rate limits)
|
| 18 |
# OPENFDA_API_KEY=
|
| 19 |
|
| 20 |
+
# HuggingFace token (for downloading datasets without rate limits)
|
| 21 |
+
# Get yours at: https://huggingface.co/settings/tokens
|
| 22 |
+
# HF_TOKEN=hf_your_token_here
|
| 23 |
+
|
| 24 |
# --- RAG Configuration ---
|
| 25 |
CHROMA_PERSIST_DIR=./data/chroma
|
| 26 |
EMBEDDING_MODEL=all-MiniLM-L6-v2
|
src/backend/app/models/schemas.py
CHANGED
|
@@ -9,7 +9,7 @@ from __future__ import annotations
|
|
| 9 |
from datetime import date, datetime
|
| 10 |
from enum import Enum
|
| 11 |
from typing import List, Optional
|
| 12 |
-
from pydantic import BaseModel, Field
|
| 13 |
|
| 14 |
|
| 15 |
# ──────────────────────────────────────────────
|
|
@@ -172,6 +172,16 @@ class ClinicalConflict(BaseModel):
|
|
| 172 |
"""A single detected conflict between guidelines and patient data."""
|
| 173 |
conflict_type: ConflictType = Field(..., description="Category of the conflict")
|
| 174 |
severity: Severity = Field(..., description="Potential clinical impact")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
guideline_source: str = Field(..., description="Which guideline flagged this")
|
| 176 |
guideline_text: str = Field(..., description="What the guideline recommends")
|
| 177 |
patient_data: str = Field(..., description="Relevant patient data that conflicts")
|
|
|
|
| 9 |
from datetime import date, datetime
|
| 10 |
from enum import Enum
|
| 11 |
from typing import List, Optional
|
| 12 |
+
from pydantic import BaseModel, Field, field_validator
|
| 13 |
|
| 14 |
|
| 15 |
# ──────────────────────────────────────────────
|
|
|
|
| 172 |
"""A single detected conflict between guidelines and patient data."""
|
| 173 |
conflict_type: ConflictType = Field(..., description="Category of the conflict")
|
| 174 |
severity: Severity = Field(..., description="Potential clinical impact")
|
| 175 |
+
|
| 176 |
+
@field_validator("conflict_type", mode="before")
|
| 177 |
+
@classmethod
|
| 178 |
+
def _normalise_conflict_type(cls, v: str) -> str:
|
| 179 |
+
return v.lower() if isinstance(v, str) else v
|
| 180 |
+
|
| 181 |
+
@field_validator("severity", mode="before")
|
| 182 |
+
@classmethod
|
| 183 |
+
def _normalise_severity(cls, v: str) -> str:
|
| 184 |
+
return v.lower() if isinstance(v, str) else v
|
| 185 |
guideline_source: str = Field(..., description="Which guideline flagged this")
|
| 186 |
guideline_text: str = Field(..., description="What the guideline recommends")
|
| 187 |
patient_data: str = Field(..., description="Relevant patient data that conflicts")
|
src/backend/validation/base.py
CHANGED
|
@@ -191,6 +191,7 @@ def diagnosis_in_differential(
|
|
| 191 |
# ──────────────────────────────────────────────
|
| 192 |
|
| 193 |
DATA_DIR = Path(__file__).resolve().parent / "data"
|
|
|
|
| 194 |
|
| 195 |
|
| 196 |
def ensure_data_dir():
|
|
@@ -198,16 +199,87 @@ def ensure_data_dir():
|
|
| 198 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 199 |
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
def save_results(summary: ValidationSummary, filename: str = None):
|
| 202 |
"""Save validation results to JSON."""
|
| 203 |
-
|
| 204 |
-
results_dir.mkdir(parents=True, exist_ok=True)
|
| 205 |
|
| 206 |
if filename is None:
|
| 207 |
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
|
| 208 |
filename = f"{summary.dataset}_{ts}.json"
|
| 209 |
|
| 210 |
-
path =
|
| 211 |
|
| 212 |
# Convert to serializable dict
|
| 213 |
data = {
|
|
@@ -218,19 +290,7 @@ def save_results(summary: ValidationSummary, filename: str = None):
|
|
| 218 |
"metrics": summary.metrics,
|
| 219 |
"run_duration_sec": summary.run_duration_sec,
|
| 220 |
"timestamp": summary.timestamp,
|
| 221 |
-
"per_case": [
|
| 222 |
-
{
|
| 223 |
-
"case_id": r.case_id,
|
| 224 |
-
"success": r.success,
|
| 225 |
-
"scores": r.scores,
|
| 226 |
-
"pipeline_time_ms": r.pipeline_time_ms,
|
| 227 |
-
"step_results": r.step_results,
|
| 228 |
-
"report_summary": r.report_summary,
|
| 229 |
-
"error": r.error,
|
| 230 |
-
"details": r.details,
|
| 231 |
-
}
|
| 232 |
-
for r in summary.per_case
|
| 233 |
-
],
|
| 234 |
}
|
| 235 |
|
| 236 |
path.write_text(json.dumps(data, indent=2, default=str))
|
|
|
|
| 191 |
# ──────────────────────────────────────────────
|
| 192 |
|
| 193 |
DATA_DIR = Path(__file__).resolve().parent / "data"
|
| 194 |
+
RESULTS_DIR = Path(__file__).resolve().parent / "results"
|
| 195 |
|
| 196 |
|
| 197 |
def ensure_data_dir():
|
|
|
|
| 199 |
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 200 |
|
| 201 |
|
| 202 |
+
def _result_to_dict(r: ValidationResult) -> dict:
|
| 203 |
+
"""Convert a ValidationResult to a serialisable dict."""
|
| 204 |
+
return {
|
| 205 |
+
"case_id": r.case_id,
|
| 206 |
+
"source_dataset": r.source_dataset,
|
| 207 |
+
"success": r.success,
|
| 208 |
+
"scores": r.scores,
|
| 209 |
+
"pipeline_time_ms": r.pipeline_time_ms,
|
| 210 |
+
"step_results": r.step_results,
|
| 211 |
+
"report_summary": r.report_summary,
|
| 212 |
+
"error": r.error,
|
| 213 |
+
"details": r.details,
|
| 214 |
+
}
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
# ──────────────────────────────────────────────
|
| 218 |
+
# Incremental checkpoint (JSONL)
|
| 219 |
+
# ──────────────────────────────────────────────
|
| 220 |
+
|
| 221 |
+
def checkpoint_path(dataset: str) -> Path:
|
| 222 |
+
"""Return the path to the checkpoint JSONL for *dataset*."""
|
| 223 |
+
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 224 |
+
return RESULTS_DIR / f"{dataset}_checkpoint.jsonl"
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def save_incremental(result: ValidationResult, dataset: str) -> None:
|
| 228 |
+
"""Append a single case result to the checkpoint JSONL file."""
|
| 229 |
+
path = checkpoint_path(dataset)
|
| 230 |
+
with open(path, "a", encoding="utf-8") as f:
|
| 231 |
+
f.write(json.dumps(_result_to_dict(result), default=str) + "\n")
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
def load_checkpoint(dataset: str) -> List[ValidationResult]:
|
| 235 |
+
"""
|
| 236 |
+
Load previously-completed results from the checkpoint file.
|
| 237 |
+
|
| 238 |
+
Returns a list of ValidationResult objects (may be empty).
|
| 239 |
+
"""
|
| 240 |
+
path = checkpoint_path(dataset)
|
| 241 |
+
if not path.exists():
|
| 242 |
+
return []
|
| 243 |
+
|
| 244 |
+
results: List[ValidationResult] = []
|
| 245 |
+
for line in path.read_text(encoding="utf-8").strip().split("\n"):
|
| 246 |
+
if not line.strip():
|
| 247 |
+
continue
|
| 248 |
+
d = json.loads(line)
|
| 249 |
+
results.append(ValidationResult(
|
| 250 |
+
case_id=d["case_id"],
|
| 251 |
+
source_dataset=d.get("source_dataset", dataset),
|
| 252 |
+
success=d["success"],
|
| 253 |
+
scores=d["scores"],
|
| 254 |
+
pipeline_time_ms=d.get("pipeline_time_ms", 0),
|
| 255 |
+
step_results=d.get("step_results", {}),
|
| 256 |
+
report_summary=d.get("report_summary"),
|
| 257 |
+
error=d.get("error"),
|
| 258 |
+
details=d.get("details", {}),
|
| 259 |
+
))
|
| 260 |
+
return results
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
def clear_checkpoint(dataset: str) -> None:
|
| 264 |
+
"""Delete checkpoint file for a fresh run."""
|
| 265 |
+
path = checkpoint_path(dataset)
|
| 266 |
+
if path.exists():
|
| 267 |
+
path.unlink()
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
# ──────────────────────────────────────────────
|
| 271 |
+
# Final results save
|
| 272 |
+
# ──────────────────────────────────────────────
|
| 273 |
+
|
| 274 |
def save_results(summary: ValidationSummary, filename: str = None):
|
| 275 |
"""Save validation results to JSON."""
|
| 276 |
+
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 277 |
|
| 278 |
if filename is None:
|
| 279 |
ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
|
| 280 |
filename = f"{summary.dataset}_{ts}.json"
|
| 281 |
|
| 282 |
+
path = RESULTS_DIR / filename
|
| 283 |
|
| 284 |
# Convert to serializable dict
|
| 285 |
data = {
|
|
|
|
| 290 |
"metrics": summary.metrics,
|
| 291 |
"run_duration_sec": summary.run_duration_sec,
|
| 292 |
"timestamp": summary.timestamp,
|
| 293 |
+
"per_case": [_result_to_dict(r) for r in summary.per_case],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
}
|
| 295 |
|
| 296 |
path.write_text(json.dumps(data, indent=2, default=str))
|
src/backend/validation/harness_medqa.py
CHANGED
|
@@ -29,12 +29,15 @@ from validation.base import (
|
|
| 29 |
ValidationCase,
|
| 30 |
ValidationResult,
|
| 31 |
ValidationSummary,
|
|
|
|
| 32 |
diagnosis_in_differential,
|
| 33 |
ensure_data_dir,
|
| 34 |
fuzzy_match,
|
|
|
|
| 35 |
normalize_text,
|
| 36 |
print_summary,
|
| 37 |
run_cds_pipeline,
|
|
|
|
| 38 |
save_results,
|
| 39 |
)
|
| 40 |
|
|
@@ -186,6 +189,7 @@ async def validate_medqa(
|
|
| 186 |
include_drug_check: bool = False,
|
| 187 |
include_guidelines: bool = True,
|
| 188 |
delay_between_cases: float = 2.0,
|
|
|
|
| 189 |
) -> ValidationSummary:
|
| 190 |
"""
|
| 191 |
Run MedQA cases through the CDS pipeline and score results.
|
|
@@ -195,11 +199,27 @@ async def validate_medqa(
|
|
| 195 |
include_drug_check: Whether to run drug interaction check (slower)
|
| 196 |
include_guidelines: Whether to include guideline retrieval
|
| 197 |
delay_between_cases: Seconds to wait between cases (rate limiting)
|
|
|
|
| 198 |
"""
|
| 199 |
results: List[ValidationResult] = []
|
| 200 |
start_time = time.time()
|
| 201 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
for i, case in enumerate(cases):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
print(f"\n [{i+1}/{len(cases)}] {case.case_id}: ", end="", flush=True)
|
| 204 |
|
| 205 |
case_start = time.monotonic()
|
|
@@ -257,7 +277,7 @@ async def validate_medqa(
|
|
| 257 |
details = {"correct_answer": correct_answer, "error": error}
|
| 258 |
print(f"✗ FAILED: {error[:80] if error else 'unknown'}")
|
| 259 |
|
| 260 |
-
|
| 261 |
case_id=case.case_id,
|
| 262 |
source_dataset="medqa",
|
| 263 |
success=report is not None,
|
|
@@ -267,7 +287,9 @@ async def validate_medqa(
|
|
| 267 |
report_summary=report.patient_summary[:200] if report else None,
|
| 268 |
error=error,
|
| 269 |
details=details,
|
| 270 |
-
)
|
|
|
|
|
|
|
| 271 |
|
| 272 |
# Rate limit
|
| 273 |
if i < len(cases) - 1:
|
|
|
|
| 29 |
ValidationCase,
|
| 30 |
ValidationResult,
|
| 31 |
ValidationSummary,
|
| 32 |
+
clear_checkpoint,
|
| 33 |
diagnosis_in_differential,
|
| 34 |
ensure_data_dir,
|
| 35 |
fuzzy_match,
|
| 36 |
+
load_checkpoint,
|
| 37 |
normalize_text,
|
| 38 |
print_summary,
|
| 39 |
run_cds_pipeline,
|
| 40 |
+
save_incremental,
|
| 41 |
save_results,
|
| 42 |
)
|
| 43 |
|
|
|
|
| 189 |
include_drug_check: bool = False,
|
| 190 |
include_guidelines: bool = True,
|
| 191 |
delay_between_cases: float = 2.0,
|
| 192 |
+
resume: bool = False,
|
| 193 |
) -> ValidationSummary:
|
| 194 |
"""
|
| 195 |
Run MedQA cases through the CDS pipeline and score results.
|
|
|
|
| 199 |
include_drug_check: Whether to run drug interaction check (slower)
|
| 200 |
include_guidelines: Whether to include guideline retrieval
|
| 201 |
delay_between_cases: Seconds to wait between cases (rate limiting)
|
| 202 |
+
resume: If True, skip cases already in checkpoint and continue
|
| 203 |
"""
|
| 204 |
results: List[ValidationResult] = []
|
| 205 |
start_time = time.time()
|
| 206 |
|
| 207 |
+
# Resume support: load completed cases from checkpoint
|
| 208 |
+
completed_ids: set = set()
|
| 209 |
+
if resume:
|
| 210 |
+
prior = load_checkpoint("medqa")
|
| 211 |
+
if prior:
|
| 212 |
+
results.extend(prior)
|
| 213 |
+
completed_ids = {r.case_id for r in prior}
|
| 214 |
+
print(f" Resuming: {len(prior)} cases loaded from checkpoint, {len(cases) - len(completed_ids)} remaining")
|
| 215 |
+
else:
|
| 216 |
+
clear_checkpoint("medqa")
|
| 217 |
+
|
| 218 |
for i, case in enumerate(cases):
|
| 219 |
+
if case.case_id in completed_ids:
|
| 220 |
+
print(f"\n [{i+1}/{len(cases)}] {case.case_id}: (cached) skipped")
|
| 221 |
+
continue
|
| 222 |
+
|
| 223 |
print(f"\n [{i+1}/{len(cases)}] {case.case_id}: ", end="", flush=True)
|
| 224 |
|
| 225 |
case_start = time.monotonic()
|
|
|
|
| 277 |
details = {"correct_answer": correct_answer, "error": error}
|
| 278 |
print(f"✗ FAILED: {error[:80] if error else 'unknown'}")
|
| 279 |
|
| 280 |
+
result = ValidationResult(
|
| 281 |
case_id=case.case_id,
|
| 282 |
source_dataset="medqa",
|
| 283 |
success=report is not None,
|
|
|
|
| 287 |
report_summary=report.patient_summary[:200] if report else None,
|
| 288 |
error=error,
|
| 289 |
details=details,
|
| 290 |
+
)
|
| 291 |
+
results.append(result)
|
| 292 |
+
save_incremental(result, "medqa") # checkpoint after every case
|
| 293 |
|
| 294 |
# Rate limit
|
| 295 |
if i < len(cases) - 1:
|
src/backend/validation/harness_mtsamples.py
CHANGED
|
@@ -33,11 +33,14 @@ from validation.base import (
|
|
| 33 |
ValidationCase,
|
| 34 |
ValidationResult,
|
| 35 |
ValidationSummary,
|
|
|
|
| 36 |
ensure_data_dir,
|
| 37 |
fuzzy_match,
|
|
|
|
| 38 |
normalize_text,
|
| 39 |
print_summary,
|
| 40 |
run_cds_pipeline,
|
|
|
|
| 41 |
save_results,
|
| 42 |
)
|
| 43 |
|
|
@@ -238,6 +241,7 @@ async def validate_mtsamples(
|
|
| 238 |
include_drug_check: bool = True,
|
| 239 |
include_guidelines: bool = True,
|
| 240 |
delay_between_cases: float = 2.0,
|
|
|
|
| 241 |
) -> ValidationSummary:
|
| 242 |
"""
|
| 243 |
Run MTSamples cases through the CDS pipeline and score results.
|
|
@@ -245,8 +249,23 @@ async def validate_mtsamples(
|
|
| 245 |
results: List[ValidationResult] = []
|
| 246 |
start_time = time.time()
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
for i, case in enumerate(cases):
|
| 249 |
specialty = case.ground_truth.get("specialty", "?")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty}): ", end="", flush=True)
|
| 251 |
|
| 252 |
case_start = time.monotonic()
|
|
@@ -321,7 +340,7 @@ async def validate_mtsamples(
|
|
| 321 |
details = {"specialty": specialty, "error": error}
|
| 322 |
print(f"✗ FAILED: {error[:80] if error else 'unknown'}")
|
| 323 |
|
| 324 |
-
|
| 325 |
case_id=case.case_id,
|
| 326 |
source_dataset="mtsamples",
|
| 327 |
success=report is not None,
|
|
@@ -331,7 +350,9 @@ async def validate_mtsamples(
|
|
| 331 |
report_summary=report.patient_summary[:200] if report else None,
|
| 332 |
error=error,
|
| 333 |
details=details,
|
| 334 |
-
)
|
|
|
|
|
|
|
| 335 |
|
| 336 |
if i < len(cases) - 1:
|
| 337 |
await asyncio.sleep(delay_between_cases)
|
|
|
|
| 33 |
ValidationCase,
|
| 34 |
ValidationResult,
|
| 35 |
ValidationSummary,
|
| 36 |
+
clear_checkpoint,
|
| 37 |
ensure_data_dir,
|
| 38 |
fuzzy_match,
|
| 39 |
+
load_checkpoint,
|
| 40 |
normalize_text,
|
| 41 |
print_summary,
|
| 42 |
run_cds_pipeline,
|
| 43 |
+
save_incremental,
|
| 44 |
save_results,
|
| 45 |
)
|
| 46 |
|
|
|
|
| 241 |
include_drug_check: bool = True,
|
| 242 |
include_guidelines: bool = True,
|
| 243 |
delay_between_cases: float = 2.0,
|
| 244 |
+
resume: bool = False,
|
| 245 |
) -> ValidationSummary:
|
| 246 |
"""
|
| 247 |
Run MTSamples cases through the CDS pipeline and score results.
|
|
|
|
| 249 |
results: List[ValidationResult] = []
|
| 250 |
start_time = time.time()
|
| 251 |
|
| 252 |
+
# Resume support
|
| 253 |
+
completed_ids: set = set()
|
| 254 |
+
if resume:
|
| 255 |
+
prior = load_checkpoint("mtsamples")
|
| 256 |
+
if prior:
|
| 257 |
+
results.extend(prior)
|
| 258 |
+
completed_ids = {r.case_id for r in prior}
|
| 259 |
+
print(f" Resuming: {len(prior)} cases loaded from checkpoint, {len(cases) - len(completed_ids)} remaining")
|
| 260 |
+
else:
|
| 261 |
+
clear_checkpoint("mtsamples")
|
| 262 |
+
|
| 263 |
for i, case in enumerate(cases):
|
| 264 |
specialty = case.ground_truth.get("specialty", "?")
|
| 265 |
+
if case.case_id in completed_ids:
|
| 266 |
+
print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty}): (cached) skipped")
|
| 267 |
+
continue
|
| 268 |
+
|
| 269 |
print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty}): ", end="", flush=True)
|
| 270 |
|
| 271 |
case_start = time.monotonic()
|
|
|
|
| 340 |
details = {"specialty": specialty, "error": error}
|
| 341 |
print(f"✗ FAILED: {error[:80] if error else 'unknown'}")
|
| 342 |
|
| 343 |
+
result = ValidationResult(
|
| 344 |
case_id=case.case_id,
|
| 345 |
source_dataset="mtsamples",
|
| 346 |
success=report is not None,
|
|
|
|
| 350 |
report_summary=report.patient_summary[:200] if report else None,
|
| 351 |
error=error,
|
| 352 |
details=details,
|
| 353 |
+
)
|
| 354 |
+
results.append(result)
|
| 355 |
+
save_incremental(result, "mtsamples") # checkpoint after every case
|
| 356 |
|
| 357 |
if i < len(cases) - 1:
|
| 358 |
await asyncio.sleep(delay_between_cases)
|
src/backend/validation/harness_pmc.py
CHANGED
|
@@ -31,12 +31,15 @@ from validation.base import (
|
|
| 31 |
ValidationCase,
|
| 32 |
ValidationResult,
|
| 33 |
ValidationSummary,
|
|
|
|
| 34 |
diagnosis_in_differential,
|
| 35 |
ensure_data_dir,
|
| 36 |
fuzzy_match,
|
|
|
|
| 37 |
normalize_text,
|
| 38 |
print_summary,
|
| 39 |
run_cds_pipeline,
|
|
|
|
| 40 |
save_results,
|
| 41 |
)
|
| 42 |
|
|
@@ -322,6 +325,7 @@ async def validate_pmc(
|
|
| 322 |
include_drug_check: bool = True,
|
| 323 |
include_guidelines: bool = True,
|
| 324 |
delay_between_cases: float = 2.0,
|
|
|
|
| 325 |
) -> ValidationSummary:
|
| 326 |
"""
|
| 327 |
Run PMC case reports through the CDS pipeline and score results.
|
|
@@ -329,9 +333,24 @@ async def validate_pmc(
|
|
| 329 |
results: List[ValidationResult] = []
|
| 330 |
start_time = time.time()
|
| 331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 332 |
for i, case in enumerate(cases):
|
| 333 |
dx = case.ground_truth.get("diagnosis", "?")
|
| 334 |
specialty = case.ground_truth.get("specialty", "?")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty} — {dx[:40]}): ", end="", flush=True)
|
| 336 |
|
| 337 |
case_start = time.monotonic()
|
|
@@ -393,7 +412,7 @@ async def validate_pmc(
|
|
| 393 |
details = {"target_diagnosis": target_diagnosis, "error": error}
|
| 394 |
print(f"✗ FAILED: {error[:80] if error else 'unknown'}")
|
| 395 |
|
| 396 |
-
|
| 397 |
case_id=case.case_id,
|
| 398 |
source_dataset="pmc",
|
| 399 |
success=report is not None,
|
|
@@ -403,7 +422,9 @@ async def validate_pmc(
|
|
| 403 |
report_summary=report.patient_summary[:200] if report else None,
|
| 404 |
error=error,
|
| 405 |
details=details,
|
| 406 |
-
)
|
|
|
|
|
|
|
| 407 |
|
| 408 |
if i < len(cases) - 1:
|
| 409 |
await asyncio.sleep(delay_between_cases)
|
|
|
|
| 31 |
ValidationCase,
|
| 32 |
ValidationResult,
|
| 33 |
ValidationSummary,
|
| 34 |
+
clear_checkpoint,
|
| 35 |
diagnosis_in_differential,
|
| 36 |
ensure_data_dir,
|
| 37 |
fuzzy_match,
|
| 38 |
+
load_checkpoint,
|
| 39 |
normalize_text,
|
| 40 |
print_summary,
|
| 41 |
run_cds_pipeline,
|
| 42 |
+
save_incremental,
|
| 43 |
save_results,
|
| 44 |
)
|
| 45 |
|
|
|
|
| 325 |
include_drug_check: bool = True,
|
| 326 |
include_guidelines: bool = True,
|
| 327 |
delay_between_cases: float = 2.0,
|
| 328 |
+
resume: bool = False,
|
| 329 |
) -> ValidationSummary:
|
| 330 |
"""
|
| 331 |
Run PMC case reports through the CDS pipeline and score results.
|
|
|
|
| 333 |
results: List[ValidationResult] = []
|
| 334 |
start_time = time.time()
|
| 335 |
|
| 336 |
+
# Resume support
|
| 337 |
+
completed_ids: set = set()
|
| 338 |
+
if resume:
|
| 339 |
+
prior = load_checkpoint("pmc")
|
| 340 |
+
if prior:
|
| 341 |
+
results.extend(prior)
|
| 342 |
+
completed_ids = {r.case_id for r in prior}
|
| 343 |
+
print(f" Resuming: {len(prior)} cases loaded from checkpoint, {len(cases) - len(completed_ids)} remaining")
|
| 344 |
+
else:
|
| 345 |
+
clear_checkpoint("pmc")
|
| 346 |
+
|
| 347 |
for i, case in enumerate(cases):
|
| 348 |
dx = case.ground_truth.get("diagnosis", "?")
|
| 349 |
specialty = case.ground_truth.get("specialty", "?")
|
| 350 |
+
if case.case_id in completed_ids:
|
| 351 |
+
print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty}): (cached) skipped")
|
| 352 |
+
continue
|
| 353 |
+
|
| 354 |
print(f"\n [{i+1}/{len(cases)}] {case.case_id} ({specialty} — {dx[:40]}): ", end="", flush=True)
|
| 355 |
|
| 356 |
case_start = time.monotonic()
|
|
|
|
| 412 |
details = {"target_diagnosis": target_diagnosis, "error": error}
|
| 413 |
print(f"✗ FAILED: {error[:80] if error else 'unknown'}")
|
| 414 |
|
| 415 |
+
result = ValidationResult(
|
| 416 |
case_id=case.case_id,
|
| 417 |
source_dataset="pmc",
|
| 418 |
success=report is not None,
|
|
|
|
| 422 |
report_summary=report.patient_summary[:200] if report else None,
|
| 423 |
error=error,
|
| 424 |
details=details,
|
| 425 |
+
)
|
| 426 |
+
results.append(result)
|
| 427 |
+
save_incremental(result, "pmc") # checkpoint after every case
|
| 428 |
|
| 429 |
if i < len(cases) - 1:
|
| 430 |
await asyncio.sleep(delay_between_cases)
|
src/backend/validation/run_validation.py
CHANGED
|
@@ -48,6 +48,7 @@ async def run_all_validations(
|
|
| 48 |
include_guidelines: bool = True,
|
| 49 |
delay: float = 2.0,
|
| 50 |
fetch_only: bool = False,
|
|
|
|
| 51 |
) -> dict:
|
| 52 |
"""
|
| 53 |
Run validation against selected datasets.
|
|
@@ -73,6 +74,7 @@ async def run_all_validations(
|
|
| 73 |
include_drug_check=include_drug_check,
|
| 74 |
include_guidelines=include_guidelines,
|
| 75 |
delay_between_cases=delay,
|
|
|
|
| 76 |
)
|
| 77 |
print_summary(summary)
|
| 78 |
save_results(summary)
|
|
@@ -94,6 +96,7 @@ async def run_all_validations(
|
|
| 94 |
include_drug_check=include_drug_check,
|
| 95 |
include_guidelines=include_guidelines,
|
| 96 |
delay_between_cases=delay,
|
|
|
|
| 97 |
)
|
| 98 |
print_summary(summary)
|
| 99 |
save_results(summary)
|
|
@@ -115,6 +118,7 @@ async def run_all_validations(
|
|
| 115 |
include_drug_check=include_drug_check,
|
| 116 |
include_guidelines=include_guidelines,
|
| 117 |
delay_between_cases=delay,
|
|
|
|
| 118 |
)
|
| 119 |
print_summary(summary)
|
| 120 |
save_results(summary)
|
|
@@ -235,6 +239,7 @@ Examples:
|
|
| 235 |
config_group.add_argument("--delay", type=float, default=2.0, help="Delay between cases in seconds (default: 2.0)")
|
| 236 |
config_group.add_argument("--no-drugs", action="store_true", help="Skip drug interaction checks")
|
| 237 |
config_group.add_argument("--no-guidelines", action="store_true", help="Skip guideline retrieval")
|
|
|
|
| 238 |
config_group.add_argument("--fetch-only", action="store_true", help="Only download data, don't run pipeline")
|
| 239 |
|
| 240 |
args = parser.parse_args()
|
|
@@ -254,6 +259,7 @@ Examples:
|
|
| 254 |
print(f" Cases/dataset: {args.max_cases}")
|
| 255 |
print(f" Drug check: {'Yes' if not args.no_drugs else 'No'}")
|
| 256 |
print(f" Guidelines: {'Yes' if not args.no_guidelines else 'No'}")
|
|
|
|
| 257 |
print(f" Fetch only: {'Yes' if args.fetch_only else 'No'}")
|
| 258 |
|
| 259 |
asyncio.run(run_all_validations(
|
|
@@ -266,6 +272,7 @@ Examples:
|
|
| 266 |
include_guidelines=not args.no_guidelines,
|
| 267 |
delay=args.delay,
|
| 268 |
fetch_only=args.fetch_only,
|
|
|
|
| 269 |
))
|
| 270 |
|
| 271 |
|
|
|
|
| 48 |
include_guidelines: bool = True,
|
| 49 |
delay: float = 2.0,
|
| 50 |
fetch_only: bool = False,
|
| 51 |
+
resume: bool = False,
|
| 52 |
) -> dict:
|
| 53 |
"""
|
| 54 |
Run validation against selected datasets.
|
|
|
|
| 74 |
include_drug_check=include_drug_check,
|
| 75 |
include_guidelines=include_guidelines,
|
| 76 |
delay_between_cases=delay,
|
| 77 |
+
resume=resume,
|
| 78 |
)
|
| 79 |
print_summary(summary)
|
| 80 |
save_results(summary)
|
|
|
|
| 96 |
include_drug_check=include_drug_check,
|
| 97 |
include_guidelines=include_guidelines,
|
| 98 |
delay_between_cases=delay,
|
| 99 |
+
resume=resume,
|
| 100 |
)
|
| 101 |
print_summary(summary)
|
| 102 |
save_results(summary)
|
|
|
|
| 118 |
include_drug_check=include_drug_check,
|
| 119 |
include_guidelines=include_guidelines,
|
| 120 |
delay_between_cases=delay,
|
| 121 |
+
resume=resume,
|
| 122 |
)
|
| 123 |
print_summary(summary)
|
| 124 |
save_results(summary)
|
|
|
|
| 239 |
config_group.add_argument("--delay", type=float, default=2.0, help="Delay between cases in seconds (default: 2.0)")
|
| 240 |
config_group.add_argument("--no-drugs", action="store_true", help="Skip drug interaction checks")
|
| 241 |
config_group.add_argument("--no-guidelines", action="store_true", help="Skip guideline retrieval")
|
| 242 |
+
config_group.add_argument("--resume", action="store_true", help="Resume from checkpoint (skip already-completed cases)")
|
| 243 |
config_group.add_argument("--fetch-only", action="store_true", help="Only download data, don't run pipeline")
|
| 244 |
|
| 245 |
args = parser.parse_args()
|
|
|
|
| 259 |
print(f" Cases/dataset: {args.max_cases}")
|
| 260 |
print(f" Drug check: {'Yes' if not args.no_drugs else 'No'}")
|
| 261 |
print(f" Guidelines: {'Yes' if not args.no_guidelines else 'No'}")
|
| 262 |
+
print(f" Resume: {'Yes' if args.resume else 'No'}")
|
| 263 |
print(f" Fetch only: {'Yes' if args.fetch_only else 'No'}")
|
| 264 |
|
| 265 |
asyncio.run(run_all_validations(
|
|
|
|
| 272 |
include_guidelines=not args.no_guidelines,
|
| 273 |
delay=args.delay,
|
| 274 |
fetch_only=args.fetch_only,
|
| 275 |
+
resume=args.resume,
|
| 276 |
))
|
| 277 |
|
| 278 |
|