feat(data): add split_cascade report mode + MIMIC-CXR builder with CheXpert oracle labels

- New report_mode "split_cascade": findings task unchanged (image+CheXpert),
impression task takes GT findings text as prompt context (findings→impression
summarisation). Wired through iu_xray_builder, dataset_resolver, config docs.
- New data/mimic_cxr_builder.py: parses the pre-split MIMIC layout, bakes the
14 GT CheXpert labels (oracle, from *chexpert*.csv) into structured_findings
as "Predicted Findings: ..." — the RaDialog image + abnormality-guidance
setup. Supports all 3 report_mode / image_mode axes; optional VQA attach.
- Implement build_instruct_json() (was NotImplementedError) as a thin delegate.
- dataset_resolver: auto-build MIMIC JSON (mode-suffixed cache) when missing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (5) hide show

configs/train_config.yaml +35 -8
data/dataset.py +31 -45
data/iu_xray_builder.py +51 -5
data/mimic_cxr_builder.py +391 -0
utils/dataset_resolver.py +56 -6

configs/train_config.yaml CHANGED Viewed

@@ -9,11 +9,21 @@ data:
   dataset_name: "IU-Xray"
   # How findings and impression are turned into training samples.
-  #   "split"  → 2 separate tasks: task=findings and task=impression
-  #              (independent losses; pre-existing behaviour)
-  #   "merged" → 1 task: task=report, target = "Findings: ...\n\nImpression: ..."
-  #              Model generates the full report autoregressively in one pass;
-  #              impression is conditioned on the findings it just emitted.
   # When switching modes, the on-disk instruct JSON must be rebuilt (delete it
   # to trigger auto_build, or rerun iu_xray_builder.py with --report_mode).
   report_mode: "split"
@@ -43,11 +53,28 @@ data:
   max_images_per_sample: 2     # only used when image_mode == "multi_image_merged"
   # --- MIMIC-CXR paths (used when dataset_name == "MIMIC-CXR") ---
-  mimic_cxr_root:    "/path/to/physionet.org/files/mimic-cxr-jpg/2.0.0"
-  mimic_cxr_reports: "/path/to/physionet.org/files/mimic-cxr/2.0.0/reports"
-  vqa_data_path:     "/path/to/mimic-ext-cxr-qba"
   instruct_json:     "data/data_files/mimic_cxr_instruct_unified.json"
   # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
   # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
   # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)

   dataset_name: "IU-Xray"
   # How findings and impression are turned into training samples.
+  #   "split"         → 2 separate tasks: task=findings and task=impression
+  #                     (independent losses; pre-existing behaviour). Both take
+  #                     image (+ CheXpert label) as input.
+  #   "merged"        → 1 task: task=report, target =
+  #                     "Findings: ...\n\nImpression: ...". Model generates the
+  #                     full report autoregressively; impression is conditioned
+  #                     on the findings it just emitted.
+  #   "split_cascade" → like "split" (2 separate tasks) but the impression
+  #                     sample's prompt context is the GROUND-TRUTH findings
+  #                     text (findings→impression summarisation) instead of the
+  #                     CheXpert label. findings task is unchanged. Studies
+  #                     without a findings section emit no impression sample.
+  #                     Train + eval are teacher-forced (impression sees GT
+  #                     findings); a true cascade eval feeding the model's own
+  #                     generated findings is not implemented yet.
   # When switching modes, the on-disk instruct JSON must be rebuilt (delete it
   # to trigger auto_build, or rerun iu_xray_builder.py with --report_mode).
   report_mode: "split"
   max_images_per_sample: 2     # only used when image_mode == "multi_image_merged"
   # --- MIMIC-CXR paths (used when dataset_name == "MIMIC-CXR") ---
+  # Layout expected: {mimic_cxr_root}/{train,valid,test}/pNN/pXXXX/sYYYY/*.jpg
+  # plus the report sYYYY.txt sitting in the same study dir (reports are NOT a
+  # separate tree in this pre-split layout).
+  mimic_cxr_root:    "/path/to/MIMIC-CXR"
   instruct_json:     "data/data_files/mimic_cxr_instruct_unified.json"
+  # RaDialog abnormality guidance: the 14 CheXpert labels (oracle / GT) are
+  # read from this CSV and baked into the prompt as
+  # "Predicted Findings: ...". If left null the builder auto-discovers any
+  # *chexpert*.csv under mimic_cxr_root; if none is found, structured_findings
+  # is null and abnormality guidance is DISABLED (the builder prints a loud
+  # warning but the build does not fail).
+  mimic_chexpert_csv:     null
+  # How CheXpert -1.0 (uncertain) is mapped: "ignore" (only 1.0 positive,
+  # default, matches the classifier head) | "positive" (treat -1.0 as positive).
+  mimic_uncertain_policy: "ignore"
+  # Optional VQA pairs dir with {train,valid,test}.json. null → skip VQA.
+  mimic_vqa_root:         null
+  # Auto-build the unified JSON (with CheXpert labels) when the cached
+  # report_mode/image_mode-suffixed file is missing. Set false to require a
+  # pre-built file (built via `python -m data.mimic_cxr_builder ...`).
+  mimic_auto_build:       true
   # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
   # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
   # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)

data/dataset.py CHANGED Viewed

@@ -284,52 +284,38 @@ class CXRInstructDataset(Dataset):
 def build_instruct_json(
     mimic_cxr_root:  str,
-    mimic_reports_root: str,
-    vqa_data_root:   str,
     output_path:     str,
-    task_weights:    Optional[Dict[str, float]] = None,
-):
     """
-    Build the unified instruction JSON from raw data sources.
-    Run this once after downloading data.
-    TODO: implement after downloading data:
-      1. Parse MIMIC-CXR splits (train/validate/test)
-      2. For each study: extract findings and impression from report txt files
-      3. For VQA: parse MIMIC-Ext-CXR-QBA JSON files
-      4. Write unified JSON: list of dicts with standardized format
-    Expected output format (one entry per sample):
-    [
-        {
-            "image_path":          "files/p10/p10000032/s50414267/02aa804e.jpg",
-            "task":                "findings",
-            "target":              "The lungs are clear...",
-            "question":            null,
-            "structured_findings": "Predicted Findings: No Finding",
-            "split":               "train",
-            "study_id":            "s50414267",
-            "subject_id":          "p10000032"
-        },
-        {
-            "image_path":          "files/p10/p10000032/s50414267/02aa804e.jpg",
-            "task":                "impression",
-            "target":              "No acute cardiopulmonary process.",
-            ...
-        },
-        {
-            "image_path":          "files/p10/p10000032/s50414267/02aa804e.jpg",
-            "task":                "vqa",
-            "target":              "Yes, there is mild pleural effusion.",
-            "question":            "Is there pleural effusion in this X-ray?",
-            ...
-        },
-        ...
-    ]
     """
-    raise NotImplementedError(
-        "TODO: Implement build_instruct_json() after downloading:\n"
-        "  - MIMIC-CXR: physionet.org/content/mimic-cxr/2.1.0\n"
-        "  - MIMIC-CXR-JPG: physionet.org/content/mimic-cxr-jpg/2.0.0\n"
-        "  - MIMIC-Ext-CXR-QBA: physionet.org/content/mimic-ext-cxr-qba/1.0.0\n"
     )

 def build_instruct_json(
     mimic_cxr_root:  str,
     output_path:     str,
+    chexpert_csv:    Optional[str] = None,
+    vqa_data_root:   Optional[str] = None,
+    report_mode:     str = "split",
+    image_mode:      str = "all_views_split",
+    uncertain_policy: str = "ignore",
+) -> str:
     """
+    Build the unified MIMIC-CXR instruction JSON.
+    Thin delegate to `data.mimic_cxr_builder.build_mimic_cxr_instruct_json`,
+    which walks the pre-split MIMIC layout (train/valid/test), parses
+    findings/impression from the report .txt files, and bakes the 14 CheXpert
+    labels (oracle, from `*chexpert*.csv`) into `structured_findings` as
+    "Predicted Findings: ..." — the RaDialog image + abnormality-guidance
+    setup. `report_mode` / `image_mode` mirror the IU builder.
+    Args (each is forwarded unchanged to the builder):
+        mimic_cxr_root:   passed as `mimic_root` (folder with the split dirs).
+        output_path:      where the JSON is written.
+        chexpert_csv:     optional explicit CheXpert label CSV path.
+        vqa_data_root:    passed as `vqa_root`; None skips the VQA attach.
+        report_mode:      "split" | "merged" | "split_cascade".
+        image_mode:       "all_views_split" | "frontal_only_split" |
+                          "multi_image_merged".
+        uncertain_policy: how CheXpert -1.0 (uncertain) is treated.
+    Returns:
+        Whatever the builder returns (the output JSON path as a string).
+    Output entries match the shared schema, e.g.:
+        {"image_path": "train/p10/p10000032/s50414267/02aa804e.jpg",
+         "task": "findings", "target": "The lungs are clear...",
+         "question": null,
+         "structured_findings": "Predicted Findings: No Finding",
+         "split": "train", "study_id": "s50414267",
+         "subject_id": "p10000032"}
     """
+    from .mimic_cxr_builder import build_mimic_cxr_instruct_json
+    return build_mimic_cxr_instruct_json(
+        mimic_root       = mimic_cxr_root,
+        output_path      = output_path,
+        chexpert_csv     = chexpert_csv,
+        vqa_root         = vqa_data_root,
+        report_mode      = report_mode,
+        image_mode       = image_mode,
+        uncertain_policy = uncertain_policy,
     )

data/iu_xray_builder.py CHANGED Viewed

@@ -75,7 +75,7 @@ def build_iu_xray_instruct_json(
     test_ratio:   float = 0.15,
     seed:         int   = 42,
     image_suffix: str   = ".png",
-    report_mode:  str   = "split",                   # "split" | "merged"
     image_mode:   str   = "all_views_split",         # "all_views_split" | "frontal_only_split" | "multi_image_merged"
 ) -> str:
     """
@@ -88,12 +88,23 @@ def build_iu_xray_instruct_json(
                                 "Findings: ...\n\nImpression: ...". Use when training a
                                 single full-report generation task. Samples with only
                                 findings are dropped (no impression to anchor on).
     Returns:
         Absolute path to output JSON.
     """
-    assert report_mode in ("split", "merged"), \
-        f"report_mode must be 'split' or 'merged', got {report_mode!r}"
     assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
         f"image_mode must be one of all_views_split/frontal_only_split/multi_image_merged, got {image_mode!r}"
     assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, \
@@ -175,6 +186,7 @@ def build_iu_xray_instruct_json(
     samples: List[Dict] = []
     skipped_merged_no_impression = 0
     def _per_study_image_groups(report_imgs):
         """
@@ -222,6 +234,36 @@ def build_iu_xray_instruct_json(
                     "split":               split,
                     "report_id":           report["report_id"],
                 })
             else:  # "split"
                 for task_name, text in (
                     ("findings",   report["findings"]),
@@ -259,6 +301,8 @@ def build_iu_xray_instruct_json(
     print(f"  skipped no_image : {skipped_no_image}")
     if report_mode == "merged":
         print(f"  skipped no_impr  : {skipped_merged_no_impression}")
     print(f"  by split         : {by_split}")
     print(f"  by task          : {by_task}")
@@ -281,9 +325,11 @@ def _parse_args():
     p.add_argument("--seed",        type=int,   default=42)
     p.add_argument("--image_suffix", type=str,  default=".png")
     p.add_argument("--report_mode", type=str,   default="split",
-                   choices=["split", "merged"],
                    help="split: 2 samples/img (findings + impression). "
-                        "merged: 1 sample/img with combined target.")
     p.add_argument("--image_mode",  type=str,   default="all_views_split",
                    choices=["all_views_split", "frontal_only_split", "multi_image_merged"],
                    help="all_views_split: 1 sample per image. "

     test_ratio:   float = 0.15,
     seed:         int   = 42,
     image_suffix: str   = ".png",
+    report_mode:  str   = "split",                   # "split" | "merged" | "split_cascade"
     image_mode:   str   = "all_views_split",         # "all_views_split" | "frontal_only_split" | "multi_image_merged"
 ) -> str:
     """
                                 "Findings: ...\n\nImpression: ...". Use when training a
                                 single full-report generation task. Samples with only
                                 findings are dropped (no impression to anchor on).
+                     "split_cascade" → like "split" (2 separate tasks) BUT the
+                                impression sample carries the ground-truth findings
+                                text as its prompt context (in `structured_findings`,
+                                formatted "Findings: ...") instead of CheXpert
+                                labels. Impression thus learns findings→impression
+                                summarisation while still seeing the image. Only
+                                studies with BOTH findings and impression emit an
+                                impression sample (findings is its required input).
+                                NOTE: eval is teacher-forced (impression gets GT
+                                findings); a true cascade eval that feeds the
+                                model's own generated findings is future work.
     Returns:
         Absolute path to output JSON.
     """
+    assert report_mode in ("split", "merged", "split_cascade"), \
+        f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
     assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
         f"image_mode must be one of all_views_split/frontal_only_split/multi_image_merged, got {image_mode!r}"
     assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, \
     samples: List[Dict] = []
     skipped_merged_no_impression = 0
+    skipped_cascade_no_findings  = 0
     def _per_study_image_groups(report_imgs):
         """
                     "split":               split,
                     "report_id":           report["report_id"],
                 })
+            elif report_mode == "split_cascade":
+                # findings sample: identical to "split".
+                if report["findings"] is not None:
+                    samples.append({
+                        **path_fields,
+                        "task":                "findings",
+                        "target":              report["findings"],
+                        "question":            None,
+                        "structured_findings": None,
+                        "split":               split,
+                        "report_id":           report["report_id"],
+                    })
+                # impression sample: needs findings as its prompt context, so
+                # only emit when BOTH sections exist. The GT findings ride in
+                # `structured_findings` (same plumbing CheXpert labels use) so
+                # train (dataset.py) and eval (evaluate.py) pick it up with no
+                # other code changes.
+                if report["impression"] is not None:
+                    if report["findings"] is None:
+                        skipped_cascade_no_findings += 1
+                    else:
+                        samples.append({
+                            **path_fields,
+                            "task":                "impression",
+                            "target":              report["impression"],
+                            "question":            None,
+                            "structured_findings": f"Findings: {report['findings'].strip()}",
+                            "split":               split,
+                            "report_id":           report["report_id"],
+                        })
             else:  # "split"
                 for task_name, text in (
                     ("findings",   report["findings"]),
     print(f"  skipped no_image : {skipped_no_image}")
     if report_mode == "merged":
         print(f"  skipped no_impr  : {skipped_merged_no_impression}")
+    if report_mode == "split_cascade":
+        print(f"  skipped impr w/o findings : {skipped_cascade_no_findings}")
     print(f"  by split         : {by_split}")
     print(f"  by task          : {by_task}")
     p.add_argument("--seed",        type=int,   default=42)
     p.add_argument("--image_suffix", type=str,  default=".png")
     p.add_argument("--report_mode", type=str,   default="split",
+                   choices=["split", "merged", "split_cascade"],
                    help="split: 2 samples/img (findings + impression). "
+                        "merged: 1 sample/img with combined target. "
+                        "split_cascade: like split, but impression sample's "
+                        "prompt context = GT findings text (findings→impression).")
     p.add_argument("--image_mode",  type=str,   default="all_views_split",
                    choices=["all_views_split", "frontal_only_split", "multi_image_merged"],
                    help="all_views_split: 1 sample per image. "

data/mimic_cxr_builder.py ADDED Viewed

	@@ -0,0 +1,391 @@

+"""
+mimic_cxr_builder.py
+--------------------
+Parses the (pre-split) MIMIC-CXR layout into a unified instruction JSON
+compatible with `CXRInstructDataset` — the MIMIC counterpart of
+`iu_xray_builder.py`. Same JSON schema, same `report_mode` / `image_mode`
+axes, so everything downstream (dataset, resolver, evaluate) is unchanged.
+Expected on-disk layout (the custom MIMIC-CXR.zip used by the notebook,
+NOT the raw PhysioNet tree):
+    {mimic_root}/
+      ├── train/pNN/pXXXXXXXX/sYYYYYYYY/<dicom>.jpg + sYYYYYYYY.txt
+      ├── valid/pNN/...
+      └── test /pNN/...
+    {anywhere under mimic_root}/  *chexpert*.csv   (optional, auto-discovered)
+RaDialog-style abnormality guidance
+-----------------------------------
+The 14 CheXpert labels are read from `mimic-cxr-2.0.0-chexpert.csv`
+(CheXpert-labeler output on the ground-truth reports; auto-discovery also
+accepts a `*chexbert*.csv`) and baked into the prompt as
+`structured_findings`:
+    "Predicted Findings: Cardiomegaly, Pleural Effusion"
+    "Predicted Findings: No Finding"           (when no positive label)
+This is the *oracle* setting — GT labels, no trained image classifier and
+no model change. The CheXpert classifier module stays unused; the existing
+`structured_findings` prompt plumbing carries the string through train
+(dataset.py) and eval (evaluate.py) untouched.
+VQA
+---
+VQA pairs live in a separate dataset and are attached by passing
+`vqa_root` (mirrors the notebook). Omit it to build findings/impression
+only.
+"""
+import argparse
+import csv
+import glob
+import json
+import re
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+# ─── Report parsing (same regex the notebook validated on MIMIC) ────────────
+# Each pattern captures the text after its "FINDINGS:" / "IMPRESSION:" header,
+# lazily, up to the next line that looks like a "<3+ letters/spaces>:" section
+# header, or to the end of the string. re.S lets "." span newlines.
+# NOTE(review): re.I also applies to the [A-Z ] class in the lookahead, so a
+# lowercase line such as " as follows: ..." ends the capture too — confirm this
+# is acceptable for the reports being parsed.
+_FINDINGS_RE   = re.compile(r"FINDINGS\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
+_IMPRESSION_RE = re.compile(r"IMPRESSION\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
+# 14 CheXpert columns, in the canonical order used by the classifier head.
+# The names are matched case-insensitively against the CSV header, and the
+# prompt string lists positives in this order.
+CHEXPERT_LABELS = [
+    "No Finding",
+    "Enlarged Cardiomediastinum",
+    "Cardiomegaly",
+    "Lung Opacity",
+    "Lung Lesion",
+    "Edema",
+    "Consolidation",
+    "Pneumonia",
+    "Atelectasis",
+    "Pneumothorax",
+    "Pleural Effusion",
+    "Pleural Other",
+    "Fracture",
+    "Support Devices",
+]
+def _clean(txt: str) -> str:
+    """Collapse every whitespace run to one space and trim; falsy input → ""."""
+    if not txt:
+        return ""
+    collapsed = re.sub(r"\s+", " ", txt)
+    return collapsed.strip()
+def _parse_report(txt_path: Path) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Read one report file and return (findings, impression).
+    Each element is the whitespace-normalised section text, or None when the
+    corresponding section header is not found. Undecodable bytes are dropped.
+    """
+    raw = txt_path.read_text(errors="ignore")
+    findings_match   = _FINDINGS_RE.search(raw)
+    impression_match = _IMPRESSION_RE.search(raw)
+    findings = None
+    if findings_match:
+        findings = _clean(findings_match.group(1))
+    impression = None
+    if impression_match:
+        impression = _clean(impression_match.group(1))
+    return findings, impression
+# ─── CheXpert CSV → "Predicted Findings: ..." string ────────────────────────
+def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
+    """
+    Resolve the CheXpert label CSV.
+    Args:
+        mimic_root: root that is searched recursively when `explicit` is falsy.
+        explicit:   user-supplied CSV path, or None/"" to auto-discover.
+    Returns:
+        The explicit path when given; otherwise the first (sorted) file under
+        `mimic_root` matching "*chexpert*.csv", then "*chexbert*.csv"; None
+        when auto-discovery finds nothing.
+    Raises:
+        FileNotFoundError: `explicit` was given but is not an existing file.
+            Returning None here would silently switch the labels off and make
+            the caller report that no CSV path had been passed.
+    """
+    if explicit:
+        p = Path(explicit)
+        if not p.is_file():
+            raise FileNotFoundError(
+                f"CheXpert CSV not found at explicitly given path: {p}"
+            )
+        return p
+    # Auto-discover anything that looks like the CheXpert label CSV.
+    for pat in ("*chexpert*.csv", "*chexbert*.csv"):
+        hits = sorted(glob.glob(str(mimic_root / "**" / pat), recursive=True))
+        if hits:
+            return Path(hits[0])
+    return None
+def _load_chexpert_map(
+    csv_path: Path,
+    uncertain_policy: str = "ignore",   # "ignore" → only 1.0 positive | "positive" → -1.0 also positive
+) -> Dict[Tuple[str, str], str]:
+    """
+    Return {(subject_id, study_id): "Predicted Findings: A, B"} where the ids
+    are the bare integers as strings (CSV stores them without the p/s prefix).
+    Args:
+        csv_path:         CheXpert label CSV with subject_id / study_id columns
+                          plus (a subset of) the CHEXPERT_LABELS columns.
+        uncertain_policy: "ignore" counts only 1 / 1.0 as positive;
+                          "positive" also counts -1 / -1.0.
+    Raises:
+        ValueError: unknown `uncertain_policy`, missing id columns, or no
+            CheXpert label column at all (which would otherwise label every
+            study "No Finding").
+    """
+    if uncertain_policy not in ("ignore", "positive"):
+        raise ValueError(
+            f"uncertain_policy must be 'ignore' or 'positive', got {uncertain_policy!r}"
+        )
+    pos_threshold = {"1", "1.0"}
+    if uncertain_policy == "positive":
+        pos_threshold = pos_threshold | {"-1", "-1.0"}
+    out: Dict[Tuple[str, str], str] = {}
+    # utf-8-sig strips a leading BOM, which would otherwise be glued to the
+    # first header name and break the subject_id lookup.
+    with open(csv_path, newline="", encoding="utf-8-sig") as f:
+        reader = csv.DictReader(f)
+        # tolerate case / spacing variations in the header
+        col = {c.lower().strip(): c for c in reader.fieldnames or []}
+        subj_c  = col.get("subject_id")
+        study_c = col.get("study_id")
+        if subj_c is None or study_c is None:
+            raise ValueError(
+                f"{csv_path} missing subject_id/study_id columns "
+                f"(have: {reader.fieldnames})"
+            )
+        label_cols = [(name, col[name.lower()]) for name in CHEXPERT_LABELS
+                      if name.lower() in col]
+        if not label_cols:
+            raise ValueError(
+                f"{csv_path} has none of the CheXpert label columns "
+                f"(have: {reader.fieldnames})"
+            )
+        for row in reader:
+            # Drop an optional p/s prefix and a float-style ".0" tail so the
+            # key is the bare integer id.
+            subj  = str(row[subj_c]).strip().lstrip("p").split(".")[0]
+            study = str(row[study_c]).strip().lstrip("s").split(".")[0]
+            positives = [
+                name for name, c in label_cols
+                if str(row.get(c, "")).strip() in pos_threshold
+            ]
+            # "No Finding" alone is reported as such; otherwise list the
+            # genuine positives (drop a redundant "No Finding" if any
+            # pathology is also positive).
+            real = [p for p in positives if p != "No Finding"]
+            txt = ", ".join(real) if real else "No Finding"
+            out[(subj, study)] = f"Predicted Findings: {txt}"
+    return out
+# ─── Main builder ───────────────────────────────────────────────────────────
+def build_mimic_cxr_instruct_json(
+    mimic_root:   str,
+    output_path:  str,
+    chexpert_csv: Optional[str] = None,
+    vqa_root:     Optional[str] = None,
+    report_mode:  str = "split",                  # "split" | "merged" | "split_cascade"
+    image_mode:   str = "all_views_split",        # "all_views_split" | "frontal_only_split" | "multi_image_merged"
+    uncertain_policy: str = "ignore",             # how CheXpert -1.0 (uncertain) is treated
+) -> str:
+    """
+    Build the unified MIMIC-CXR instruction JSON.
+    report_mode mirrors iu_xray_builder:
+      "split"         → findings + impression samples; BOTH carry the CheXpert
+                        "Predicted Findings: ..." string in structured_findings
+                        (RaDialog: image + 14 labels → text).
+      "merged"        → one task=report sample, target "Findings: ...\n\n
+                        Impression: ...", carries the CheXpert string.
+      "split_cascade" → findings sample carries the CheXpert string; the
+                        impression sample instead carries "Findings: <GT
+                        findings>" as context (findings→impression). Same
+                        convention as the IU builder.
+    image_mode mirrors iu_xray_builder (all_views_split / frontal_only_split /
+    multi_image_merged). NOTE: frontal_only_split here keeps the FIRST image
+    of the study — this MIMIC layout has no metadata.csv to read ViewPosition
+    from. Swap in a ViewPosition lookup if you add that CSV.
+    Args:
+        mimic_root:       folder holding the train/ valid/ test/ split dirs.
+        output_path:      JSON file to write (parent dirs are created).
+        chexpert_csv:     explicit label CSV; when falsy one is searched for
+                          under mimic_root. Without a CSV, structured_findings
+                          is None on CheXpert-carrying samples and a warning
+                          is printed.
+        vqa_root:         optional folder with {train,valid,test}.json rows.
+                          Each row needs "image_path" and "question" and may
+                          carry "answer", "study_id", "subject_id". Rows whose
+                          image was not indexed are dropped and counted.
+        uncertain_policy: forwarded to the CheXpert CSV loader.
+    Returns:
+        `output_path` as a string, exactly as given (it is not resolved to an
+        absolute path).
+    Raises:
+        FileNotFoundError: none of train/valid/test exists under mimic_root.
+    """
+    assert report_mode in ("split", "merged", "split_cascade"), \
+        f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
+    assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
+        f"image_mode invalid: {image_mode!r}"
+    from .dataset import format_merged_report   # local import to avoid cycle
+    mimic_root  = Path(mimic_root)
+    output_path = Path(output_path)
+    # split dir name → split label written into the JSON
+    split_dirs = {
+        "train": "train",
+        "valid": "validate",
+        "test":  "test",
+    }
+    present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
+    if not present:
+        raise FileNotFoundError(
+            f"No train/valid/test subdirs under {mimic_root}. "
+            f"Expected the pre-split MIMIC-CXR layout."
+        )
+    # ── CheXpert labels ───────────────────────────────────────────────────
+    csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
+    if csv_path is not None:
+        chexpert_map = _load_chexpert_map(csv_path, uncertain_policy)
+        print(f"[mimic_cxr_builder] CheXpert CSV: {csv_path} "
+              f"({len(chexpert_map):,} studies, uncertain={uncertain_policy})")
+    else:
+        chexpert_map = {}
+        print("[mimic_cxr_builder] WARNING: no *chexpert*.csv found under "
+              f"{mimic_root} and none passed via --chexpert_csv. "
+              "structured_findings will be null (RaDialog abnormality "
+              "guidance DISABLED). Add the CSV to enable it.")
+    # ── Pass 1: index studies ─────────────────────────────────────────────
+    samples: List[Dict] = []
+    image_index: Dict[str, str] = {}     # subject-relative path → split label
+    n_studies = n_missing_report = n_no_chexpert = 0
+    skipped_merged_no_impression = skipped_cascade_no_findings = 0
+    def _structured_for(subj: str, study: str) -> Optional[str]:
+        return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
+    def _image_groups(study_dir: Path, split_sub: str, subj: str, study: str):
+        """Yield path_fields dicts honouring image_mode (same rules as IU)."""
+        imgs = sorted(study_dir.glob("*.jpg"))
+        if not imgs:
+            return
+        def _rel(img: Path) -> str:
+            return f"{split_sub}/{img.parent.parent.parent.name}/{subj}/{study}/{img.name}"
+        rels = [_rel(im) for im in imgs]
+        for r in rels:
+            image_index[r] = split_dirs[split_sub]
+        if image_mode == "all_views_split":
+            for r in rels:
+                yield {"image_path": r, "image_paths": None}
+        elif image_mode == "frontal_only_split":
+            yield {"image_path": rels[0], "image_paths": None}
+        else:  # multi_image_merged
+            yield {"image_path": None, "image_paths": rels}
+    # Every directory level is sorted so the sample order (and therefore the
+    # written JSON) does not depend on filesystem enumeration order.
+    for split_sub, split_dir in present.items():
+        for p_dir in sorted(split_dir.glob("p*")):
+            for pat_dir in sorted(p_dir.glob("p*")):
+                for study_dir in sorted(pat_dir.glob("s*")):
+                    jpgs = list(study_dir.glob("*.jpg"))
+                    if not jpgs:
+                        continue
+                    n_studies += 1
+                    subj, study = pat_dir.name, study_dir.name
+                    # Sorted so the report choice is stable if a study dir
+                    # ever holds more than one .txt.
+                    txts = sorted(study_dir.glob("*.txt"))
+                    if not txts:
+                        n_missing_report += 1
+                        continue
+                    findings, impression = _parse_report(txts[0])
+                    structured = _structured_for(subj, study)
+                    if structured is None:
+                        n_no_chexpert += 1
+                    split_label = split_dirs[split_sub]
+                    for path_fields in _image_groups(study_dir, split_sub, subj, study):
+                        base = {
+                            **path_fields,
+                            "question":   None,
+                            "split":      split_label,
+                            "study_id":   study,
+                            "subject_id": subj,
+                        }
+                        if report_mode == "merged":
+                            target = format_merged_report(findings, impression)
+                            if target is None:
+                                skipped_merged_no_impression += 1
+                                continue
+                            samples.append({**base, "task": "report",
+                                             "target": target,
+                                             "structured_findings": structured})
+                        elif report_mode == "split_cascade":
+                            if findings:
+                                samples.append({**base, "task": "findings",
+                                                 "target": findings,
+                                                 "structured_findings": structured})
+                            if impression:
+                                if not findings:
+                                    skipped_cascade_no_findings += 1
+                                else:
+                                    samples.append({**base, "task": "impression",
+                                                     "target": impression,
+                                                     "structured_findings":
+                                                         f"Findings: {findings}"})
+                        else:  # "split"
+                            if findings:
+                                samples.append({**base, "task": "findings",
+                                                 "target": findings,
+                                                 "structured_findings": structured})
+                            if impression:
+                                samples.append({**base, "task": "impression",
+                                                 "target": impression,
+                                                 "structured_findings": structured})
+    # ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
+    n_vqa = n_vqa_dropped = 0
+    if vqa_root:
+        vqa_root = Path(vqa_root)
+        # Map every "/"-aligned tail of each indexed path to that path once,
+        # so a VQA row resolves with one dict lookup instead of a scan over
+        # all images. setdefault keeps the first-indexed image on a tie, and
+        # an empty image_path no longer matches anything.
+        tail_index: Dict[str, str] = {}
+        for key in image_index:
+            parts = key.split("/")
+            for start in range(len(parts)):
+                tail_index.setdefault("/".join(parts[start:]), key)
+        # NOTE(review): the VQA sample's split comes from the VQA file name,
+        # not from the split dir the image was indexed under (image_index
+        # values are never read) — confirm the two always agree.
+        for fname, split_label in (("train", "train"),
+                                   ("valid", "validate"),
+                                   ("test",  "test")):
+            vqa_file = vqa_root / f"{fname}.json"
+            if not vqa_file.is_file():
+                continue
+            with open(vqa_file, encoding="utf-8") as fh:
+                rows = json.load(fh)
+            for row in rows:
+                sub_rel = str(row["image_path"]).lstrip("/")
+                if sub_rel.startswith("files/"):
+                    sub_rel = sub_rel[len("files/"):]
+                # match against the indexed image whose path tail equals sub_rel
+                hit = tail_index.get(sub_rel)
+                if hit is None:
+                    n_vqa_dropped += 1
+                    continue
+                ans = row.get("answer", [])
+                answer = (", ".join(map(str, ans)) if isinstance(ans, list)
+                          else str(ans)) or "No."
+                samples.append({
+                    "image_path": hit, "image_paths": None,
+                    "task": "vqa", "target": answer,
+                    "question": row["question"],
+                    "structured_findings": None,
+                    "split": split_label,
+                    "study_id": row.get("study_id"),
+                    "subject_id": row.get("subject_id"),
+                })
+                n_vqa += 1
+    # ── Write ─────────────────────────────────────────────────────────────
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(samples, f, ensure_ascii=False)
+    by_split, by_task = {}, {}
+    for s in samples:
+        by_split[s["split"]] = by_split.get(s["split"], 0) + 1
+        by_task[s["task"]]   = by_task.get(s["task"], 0) + 1
+    print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
+    print(f"  report_mode      : {report_mode}")
+    print(f"  image_mode       : {image_mode}")
+    print(f"  studies indexed  : {n_studies:,}")
+    print(f"  missing report   : {n_missing_report:,}")
+    print(f"  studies w/o chexpert label : {n_no_chexpert:,}")
+    if report_mode == "merged":
+        print(f"  skipped no_impr  : {skipped_merged_no_impression:,}")
+    if report_mode == "split_cascade":
+        print(f"  skipped impr w/o findings : {skipped_cascade_no_findings:,}")
+    if vqa_root:
+        print(f"  vqa added/dropped: {n_vqa:,} / {n_vqa_dropped:,}")
+    print(f"  by split         : {by_split}")
+    print(f"  by task          : {by_task}")
+    return str(output_path)
+# ─── CLI ────────────────────────────────────────────────────────────────────
+def _parse_args():
+    """Parse the command-line options of the MIMIC-CXR JSON builder."""
+    parser = argparse.ArgumentParser(description="Build MIMIC-CXR unified instruction JSON")
+    # Required locations.
+    parser.add_argument(
+        "--mimic_root",
+        required=True,
+        help="Folder containing train/ valid/ test/ subdirs",
+    )
+    parser.add_argument("--output", required=True, help="Output JSON path")
+    # Optional inputs.
+    parser.add_argument(
+        "--chexpert_csv",
+        default=None,
+        help="Path to mimic-cxr-2.0.0-chexpert.csv "
+             "(auto-discovered under --mimic_root if omitted)",
+    )
+    parser.add_argument(
+        "--vqa_root",
+        default=None,
+        help="Folder with {train,valid,test}.json VQA pairs (optional)",
+    )
+    # Build modes.
+    parser.add_argument(
+        "--report_mode",
+        default="split",
+        choices=["split", "merged", "split_cascade"],
+    )
+    parser.add_argument(
+        "--image_mode",
+        default="all_views_split",
+        choices=["all_views_split", "frontal_only_split", "multi_image_merged"],
+    )
+    parser.add_argument(
+        "--uncertain_policy",
+        default="ignore",
+        choices=["ignore", "positive"],
+        help="CheXpert -1.0 (uncertain): ignore (default) or treat as positive.",
+    )
+    return parser.parse_args()
+if __name__ == "__main__":
+    # CLI entry point: forward the parsed options to the builder.
+    cli = _parse_args()
+    build_kwargs = {
+        "mimic_root":       cli.mimic_root,
+        "output_path":      cli.output,
+        "chexpert_csv":     cli.chexpert_csv,
+        "vqa_root":         cli.vqa_root,
+        "report_mode":      cli.report_mode,
+        "image_mode":       cli.image_mode,
+        "uncertain_policy": cli.uncertain_policy,
+    }
+    build_mimic_cxr_instruct_json(**build_kwargs)

utils/dataset_resolver.py CHANGED Viewed

@@ -36,7 +36,7 @@ class DatasetSpec:
     instruct_json: str           # passed to CXRInstructDataset
     tasks:        List[str]      # which tasks exist in this dataset
     task_weights: Dict[str, float]  # normalized over `tasks`
-    report_mode:  str = "split"       # "split" | "merged"
     image_mode:   str = "all_views_split"  # "all_views_split" | "frontal_only_split" | "multi_image_merged"
     max_images:   int = 1             # >1 only when image_mode == multi_image_merged
@@ -50,16 +50,20 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
     missing and `iu_xray.auto_build == true`.
     The choice of which tasks are "available" depends on `data.report_mode`:
-      "split"  → findings, impression (+ vqa for MIMIC)
-      "merged" → report (+ vqa for MIMIC)
     """
     name = _get(train_cfg.data, "dataset_name", "MIMIC-CXR")
     report_mode = _get(train_cfg.data, "report_mode", "split")
     image_mode  = _get(train_cfg.data, "image_mode",  "all_views_split")
     max_images  = int(_get(train_cfg.data, "max_images_per_sample", 2))
-    if report_mode not in ("split", "merged"):
         raise ValueError(
-            f"data.report_mode must be 'split' or 'merged', got {report_mode!r}"
         )
     if image_mode not in ("all_views_split", "frontal_only_split", "multi_image_merged"):
         raise ValueError(
@@ -105,7 +109,9 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
         else:
             available = ["findings", "impression", "vqa"]
         image_root    = train_cfg.data.mimic_cxr_root
-        instruct_json = train_cfg.data.instruct_json
     else:  # IU-Xray
         # IU has no VQA.
@@ -184,6 +190,50 @@ def _ensure_iu_json_exists(iu_cfg,
     return str(out)
 # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
 def resolve_run_id(

     instruct_json: str           # passed to CXRInstructDataset
     tasks:        List[str]      # which tasks exist in this dataset
     task_weights: Dict[str, float]  # normalized over `tasks`
+    report_mode:  str = "split"       # "split" | "merged" | "split_cascade"
     image_mode:   str = "all_views_split"  # "all_views_split" | "frontal_only_split" | "multi_image_merged"
     max_images:   int = 1             # >1 only when image_mode == multi_image_merged
     missing and `iu_xray.auto_build == true`.
     The choice of which tasks are "available" depends on `data.report_mode`:
+      "split"         → findings, impression (+ vqa for MIMIC)
+      "merged"        → report (+ vqa for MIMIC)
+      "split_cascade" → findings, impression (+ vqa for MIMIC); same task set
+                        and weights as "split" — only the data builder differs
+                        (impression sample carries GT findings as context).
     """
     name = _get(train_cfg.data, "dataset_name", "MIMIC-CXR")
     report_mode = _get(train_cfg.data, "report_mode", "split")
     image_mode  = _get(train_cfg.data, "image_mode",  "all_views_split")
     max_images  = int(_get(train_cfg.data, "max_images_per_sample", 2))
+    if report_mode not in ("split", "merged", "split_cascade"):
         raise ValueError(
+            f"data.report_mode must be 'split', 'merged', or 'split_cascade', "
+            f"got {report_mode!r}"
         )
     if image_mode not in ("all_views_split", "frontal_only_split", "multi_image_merged"):
         raise ValueError(
         else:
             available = ["findings", "impression", "vqa"]
         image_root    = train_cfg.data.mimic_cxr_root
+        instruct_json = _ensure_mimic_json_exists(
+            train_cfg.data, report_mode, image_mode
+        )
     else:  # IU-Xray
         # IU has no VQA.
     return str(out)
+def _ensure_mimic_json_exists(data_cfg,
+                              report_mode: str = "split",
+                              image_mode:  str = "all_views_split") -> str:
+    """
+    Return the path of the MIMIC-CXR unified JSON, building it if missing.
+    The configured `data.instruct_json` path is suffixed with both
+    report_mode and image_mode (<stem>__split__all_views_split.json) so each
+    mode combination gets its own cache file. A non-default
+    `data.mimic_uncertain_policy` appends a further `__uncertain_<policy>`
+    suffix: the policy is forwarded to the builder, so a JSON built under one
+    policy must not be reused for another. The default "ignore" policy keeps
+    the original file name, so existing caches stay valid.
+    Auto-build (default on) reads `*chexpert*.csv` to bake the 14 oracle
+    labels into structured_findings. Set `data.mimic_auto_build: false` to
+    require a pre-built file instead.
+    NOTE(review): `mimic_vqa_root` / `mimic_chexpert_csv` are also forwarded
+    to the builder but are not part of the cache name; confirm whether they
+    should be.
+    Raises:
+        FileNotFoundError: the JSON is missing and auto-build is disabled.
+        ValueError: a build is required but `data.mimic_cxr_root` is unset.
+    """
+    base = Path(_get(data_cfg, "instruct_json",
+                     "data/data_files/mimic_cxr_instruct_unified.json"))
+    uncertain_policy = str(_get(data_cfg, "mimic_uncertain_policy", "ignore"))
+    mode_suffix = f"__{report_mode}__{image_mode}"
+    if uncertain_policy != "ignore":
+        # Keep the legacy name for the default policy; only non-default
+        # policies get a distinct cache so they never shadow each other.
+        mode_suffix += f"__uncertain_{uncertain_policy}"
+    out  = base.with_name(f"{base.stem}{mode_suffix}{base.suffix}")
+    if out.is_file():
+        return str(out)
+    mimic_root = _get(data_cfg, "mimic_cxr_root")
+    if not bool(_get(data_cfg, "mimic_auto_build", True)):
+        raise FileNotFoundError(
+            f"MIMIC instruct JSON not found at {out} and "
+            f"data.mimic_auto_build=false. Run: python -m data.mimic_cxr_builder "
+            f"--mimic_root {mimic_root} --output {out} "
+            f"--report_mode {report_mode} --image_mode {image_mode} "
+            f"--uncertain_policy {uncertain_policy}"
+        )
+    if not mimic_root:
+        # str(None) would otherwise reach the builder as the literal path "None".
+        raise ValueError(
+            f"data.mimic_cxr_root must be set to auto-build the MIMIC "
+            f"instruct JSON at {out}, got {mimic_root!r}"
+        )
+    from data.mimic_cxr_builder import build_mimic_cxr_instruct_json
+    print(f"[dataset_resolver] MIMIC JSON not found → auto-building "
+          f"(report_mode={report_mode}, image_mode={image_mode}) …")
+    build_mimic_cxr_instruct_json(
+        mimic_root       = str(mimic_root),
+        output_path      = str(out),
+        chexpert_csv     = _get(data_cfg, "mimic_chexpert_csv"),
+        vqa_root         = _get(data_cfg, "mimic_vqa_root"),
+        report_mode      = report_mode,
+        image_mode       = image_mode,
+        uncertain_policy = uncertain_policy,
+    )
+    return str(out)
 # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
 def resolve_run_id(