convitom Claude Sonnet 4.6 committed on
Commit
02426e6
·
1 Parent(s): 78b85ff

feat(data): add split_cascade report mode + MIMIC-CXR builder with CheXpert oracle labels

Browse files

- New report_mode "split_cascade": findings task unchanged (image+CheXpert),
impression task takes GT findings text as prompt context (findings→impression
summarisation). Wired through iu_xray_builder, dataset_resolver, config docs.
- New data/mimic_cxr_builder.py: parses the pre-split MIMIC layout, bakes the
14 GT CheXpert labels (oracle, from *chexpert*.csv) into structured_findings
as "Predicted Findings: ..." — the RaDialog image + abnormality-guidance
setup. Supports all 3 report_mode / image_mode axes; optional VQA attach.
- Implement build_instruct_json() (was NotImplementedError) as a thin delegate.
- dataset_resolver: auto-build MIMIC JSON (mode-suffixed cache) when missing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

configs/train_config.yaml CHANGED
@@ -9,11 +9,21 @@ data:
9
  dataset_name: "IU-Xray"
10
 
11
  # How findings and impression are turned into training samples.
12
- # "split" → 2 separate tasks: task=findings and task=impression
13
- # (independent losses; pre-existing behaviour)
14
- # "merged" 1 task: task=report, target = "Findings: ...\n\nImpression: ..."
15
- # Model generates the full report autoregressively in one pass;
16
- # impression is conditioned on the findings it just emitted.
 
 
 
 
 
 
 
 
 
 
17
  # When switching modes, the on-disk instruct JSON must be rebuilt (delete it
18
  # to trigger auto_build, or rerun iu_xray_builder.py with --report_mode).
19
  report_mode: "split"
@@ -43,11 +53,28 @@ data:
43
  max_images_per_sample: 2 # only used when image_mode == "multi_image_merged"
44
 
45
  # --- MIMIC-CXR paths (used when dataset_name == "MIMIC-CXR") ---
46
- mimic_cxr_root: "/path/to/physionet.org/files/mimic-cxr-jpg/2.0.0"
47
- mimic_cxr_reports: "/path/to/physionet.org/files/mimic-cxr/2.0.0/reports"
48
- vqa_data_path: "/path/to/mimic-ext-cxr-qba"
 
49
  instruct_json: "data/data_files/mimic_cxr_instruct_unified.json"
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
52
  # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
53
  # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
 
9
  dataset_name: "IU-Xray"
10
 
11
  # How findings and impression are turned into training samples.
12
+ # "split" → 2 separate tasks: task=findings and task=impression
13
+ # (independent losses; pre-existing behaviour). Both take
14
+ # image (+ CheXpert label) as input.
15
+ # "merged" → 1 task: task=report, target =
16
+ # "Findings: ...\n\nImpression: ...". Model generates the
17
+ # full report autoregressively; impression is conditioned
18
+ # on the findings it just emitted.
19
+ # "split_cascade" → like "split" (2 separate tasks) but the impression
20
+ # sample's prompt context is the GROUND-TRUTH findings
21
+ # text (findings→impression summarisation) instead of the
22
+ # CheXpert label. findings task is unchanged. Studies
23
+ # without a findings section emit no impression sample.
24
+ # Train + eval are teacher-forced (impression sees GT
25
+ # findings); a true cascade eval feeding the model's own
26
+ # generated findings is not implemented yet.
27
  # When switching modes, the on-disk instruct JSON must be rebuilt (delete it
28
  # to trigger auto_build, or rerun iu_xray_builder.py with --report_mode).
29
  report_mode: "split"
 
53
  max_images_per_sample: 2 # only used when image_mode == "multi_image_merged"
54
 
55
  # --- MIMIC-CXR paths (used when dataset_name == "MIMIC-CXR") ---
56
+ # Layout expected: {mimic_cxr_root}/{train,valid,test}/pNN/pXXXX/sYYYY/*.jpg
57
+ # plus the report sYYYY.txt sitting in the same study dir (reports are NOT a
58
+ # separate tree in this pre-split layout).
59
+ mimic_cxr_root: "/path/to/MIMIC-CXR"
60
  instruct_json: "data/data_files/mimic_cxr_instruct_unified.json"
61
 
62
+ # RaDialog abnormality guidance: the 14 CheXpert labels (oracle / GT) are
63
+ # read from this CSV and baked into the prompt as
64
+ # "Predicted Findings: ...". If left null the builder auto-discovers any
65
+ # *chexpert*.csv under mimic_cxr_root; if none is found, structured_findings
66
+ # is null and abnormality guidance is DISABLED (the builder prints a warning).
67
+ mimic_chexpert_csv: null
68
+ # How CheXpert -1.0 (uncertain) is mapped: "ignore" (only 1.0 positive,
69
+ # default, matches the classifier head) | "positive" (treat -1.0 as positive).
70
+ mimic_uncertain_policy: "ignore"
71
+ # Optional VQA pairs dir with {train,valid,test}.json. null → skip VQA.
72
+ mimic_vqa_root: null
73
+ # Auto-build the unified JSON (with CheXpert labels) when the cached
74
+ # report_mode/image_mode-suffixed file is missing. Set false to require a
75
+ # pre-built file (built via `python -m data.mimic_cxr_builder ...`).
76
+ mimic_auto_build: true
77
+
78
  # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
79
  # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
80
  # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
data/dataset.py CHANGED
@@ -284,52 +284,38 @@ class CXRInstructDataset(Dataset):
284
 
285
  def build_instruct_json(
286
  mimic_cxr_root: str,
287
- mimic_reports_root: str,
288
- vqa_data_root: str,
289
  output_path: str,
290
- task_weights: Optional[Dict[str, float]] = None,
291
- ):
 
 
 
 
292
  """
293
- Build the unified instruction JSON from raw data sources.
294
- Run this once after downloading data.
295
-
296
- TODO: implement after downloading data:
297
- 1. Parse MIMIC-CXR splits (train/validate/test)
298
- 2. For each study: extract findings and impression from report txt files
299
- 3. For VQA: parse MIMIC-Ext-CXR-QBA JSON files
300
- 4. Write unified JSON: list of dicts with standardized format
301
-
302
- Expected output format (one entry per sample):
303
- [
304
- {
305
- "image_path": "files/p10/p10000032/s50414267/02aa804e.jpg",
306
- "task": "findings",
307
- "target": "The lungs are clear...",
308
- "question": null,
309
- "structured_findings": "Predicted Findings: No Finding",
310
- "split": "train",
311
- "study_id": "s50414267",
312
- "subject_id": "p10000032"
313
- },
314
- {
315
- "image_path": "files/p10/p10000032/s50414267/02aa804e.jpg",
316
- "task": "impression",
317
- "target": "No acute cardiopulmonary process.",
318
- ...
319
- },
320
- {
321
- "image_path": "files/p10/p10000032/s50414267/02aa804e.jpg",
322
- "task": "vqa",
323
- "target": "Yes, there is mild pleural effusion.",
324
- "question": "Is there pleural effusion in this X-ray?",
325
- ...
326
- },
327
- ...
328
- ]
329
  """
330
- raise NotImplementedError(
331
- "TODO: Implement build_instruct_json() after downloading:\n"
332
- " - MIMIC-CXR: physionet.org/content/mimic-cxr/2.1.0\n"
333
- " - MIMIC-CXR-JPG: physionet.org/content/mimic-cxr-jpg/2.0.0\n"
334
- " - MIMIC-Ext-CXR-QBA: physionet.org/content/mimic-ext-cxr-qba/1.0.0\n"
 
 
 
 
335
  )
 
284
 
285
def build_instruct_json(
    mimic_cxr_root: str,
    output_path: str,
    chexpert_csv: Optional[str] = None,
    vqa_data_root: Optional[str] = None,
    report_mode: str = "split",
    image_mode: str = "all_views_split",
    uncertain_policy: str = "ignore",
) -> str:
    """
    Build the unified MIMIC-CXR instruction JSON.

    This is a thin delegate: every argument is forwarded to
    `data.mimic_cxr_builder.build_mimic_cxr_instruct_json` and that
    function's return value is returned unchanged.

    Args:
        mimic_cxr_root: forwarded as the builder's `mimic_root`.
        output_path: where the JSON is written (forwarded as-is).
        chexpert_csv: optional CheXpert label CSV (forwarded as-is).
        vqa_data_root: forwarded as the builder's `vqa_root`.
        report_mode: forwarded as-is (default "split").
        image_mode: forwarded as-is (default "all_views_split").
        uncertain_policy: forwarded as-is (default "ignore").

    Returns:
        The string returned by the builder (the output JSON path).

    Example of one output entry in the shared schema:
        {"image_path": "train/p10/p10000032/s50414267/02aa804e.jpg",
         "task": "findings", "target": "The lungs are clear...",
         "question": null,
         "structured_findings": "Predicted Findings: No Finding",
         "split": "train", "study_id": "s50414267",
         "subject_id": "p10000032"}
    """
    # Function-scope import: the builder module is only loaded when a build
    # is actually requested.
    from .mimic_cxr_builder import build_mimic_cxr_instruct_json

    # Map this function's parameter names onto the builder's keyword names.
    builder_kwargs = {
        "mimic_root": mimic_cxr_root,
        "output_path": output_path,
        "chexpert_csv": chexpert_csv,
        "vqa_root": vqa_data_root,
        "report_mode": report_mode,
        "image_mode": image_mode,
        "uncertain_policy": uncertain_policy,
    }
    return build_mimic_cxr_instruct_json(**builder_kwargs)
data/iu_xray_builder.py CHANGED
@@ -75,7 +75,7 @@ def build_iu_xray_instruct_json(
75
  test_ratio: float = 0.15,
76
  seed: int = 42,
77
  image_suffix: str = ".png",
78
- report_mode: str = "split", # "split" | "merged"
79
  image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
80
  ) -> str:
81
  """
@@ -88,12 +88,23 @@ def build_iu_xray_instruct_json(
88
  "Findings: ...\n\nImpression: ...". Use when training a
89
  single full-report generation task. Samples with only
90
  findings are dropped (no impression to anchor on).
 
 
 
 
 
 
 
 
 
 
 
91
 
92
  Returns:
93
  Absolute path to output JSON.
94
  """
95
- assert report_mode in ("split", "merged"), \
96
- f"report_mode must be 'split' or 'merged', got {report_mode!r}"
97
  assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
98
  f"image_mode must be one of all_views_split/frontal_only_split/multi_image_merged, got {image_mode!r}"
99
  assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, \
@@ -175,6 +186,7 @@ def build_iu_xray_instruct_json(
175
 
176
  samples: List[Dict] = []
177
  skipped_merged_no_impression = 0
 
178
 
179
  def _per_study_image_groups(report_imgs):
180
  """
@@ -222,6 +234,36 @@ def build_iu_xray_instruct_json(
222
  "split": split,
223
  "report_id": report["report_id"],
224
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  else: # "split"
226
  for task_name, text in (
227
  ("findings", report["findings"]),
@@ -259,6 +301,8 @@ def build_iu_xray_instruct_json(
259
  print(f" skipped no_image : {skipped_no_image}")
260
  if report_mode == "merged":
261
  print(f" skipped no_impr : {skipped_merged_no_impression}")
 
 
262
  print(f" by split : {by_split}")
263
  print(f" by task : {by_task}")
264
 
@@ -281,9 +325,11 @@ def _parse_args():
281
  p.add_argument("--seed", type=int, default=42)
282
  p.add_argument("--image_suffix", type=str, default=".png")
283
  p.add_argument("--report_mode", type=str, default="split",
284
- choices=["split", "merged"],
285
  help="split: 2 samples/img (findings + impression). "
286
- "merged: 1 sample/img with combined target.")
 
 
287
  p.add_argument("--image_mode", type=str, default="all_views_split",
288
  choices=["all_views_split", "frontal_only_split", "multi_image_merged"],
289
  help="all_views_split: 1 sample per image. "
 
75
  test_ratio: float = 0.15,
76
  seed: int = 42,
77
  image_suffix: str = ".png",
78
+ report_mode: str = "split", # "split" | "merged" | "split_cascade"
79
  image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
80
  ) -> str:
81
  """
 
88
  "Findings: ...\n\nImpression: ...". Use when training a
89
  single full-report generation task. Samples with only
90
  findings are dropped (no impression to anchor on).
91
+ "split_cascade" → like "split" (2 separate tasks) BUT the
92
+ impression sample carries the ground-truth findings
93
+ text as its prompt context (in `structured_findings`,
94
+ formatted "Findings: ...") instead of CheXpert
95
+ labels. Impression thus learns findings→impression
96
+ summarisation while still seeing the image. Only
97
+ studies with BOTH findings and impression emit an
98
+ impression sample (findings is its required input).
99
+ NOTE: eval is teacher-forced (impression gets GT
100
+ findings); a true cascade eval that feeds the
101
+ model's own generated findings is future work.
102
 
103
  Returns:
104
  Absolute path to output JSON.
105
  """
106
+ assert report_mode in ("split", "merged", "split_cascade"), \
107
+ f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
108
  assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
109
  f"image_mode must be one of all_views_split/frontal_only_split/multi_image_merged, got {image_mode!r}"
110
  assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, \
 
186
 
187
  samples: List[Dict] = []
188
  skipped_merged_no_impression = 0
189
+ skipped_cascade_no_findings = 0
190
 
191
  def _per_study_image_groups(report_imgs):
192
  """
 
234
  "split": split,
235
  "report_id": report["report_id"],
236
  })
237
+ elif report_mode == "split_cascade":
238
+ # findings sample: identical to "split".
239
+ if report["findings"] is not None:
240
+ samples.append({
241
+ **path_fields,
242
+ "task": "findings",
243
+ "target": report["findings"],
244
+ "question": None,
245
+ "structured_findings": None,
246
+ "split": split,
247
+ "report_id": report["report_id"],
248
+ })
249
+ # impression sample: needs findings as its prompt context, so
250
+ # only emit when BOTH sections exist. The GT findings ride in
251
+ # `structured_findings` (same plumbing CheXpert labels use) so
252
+ # train (dataset.py) and eval (evaluate.py) pick it up with no
253
+ # other code changes.
254
+ if report["impression"] is not None:
255
+ if report["findings"] is None:
256
+ skipped_cascade_no_findings += 1
257
+ else:
258
+ samples.append({
259
+ **path_fields,
260
+ "task": "impression",
261
+ "target": report["impression"],
262
+ "question": None,
263
+ "structured_findings": f"Findings: {report['findings'].strip()}",
264
+ "split": split,
265
+ "report_id": report["report_id"],
266
+ })
267
  else: # "split"
268
  for task_name, text in (
269
  ("findings", report["findings"]),
 
301
  print(f" skipped no_image : {skipped_no_image}")
302
  if report_mode == "merged":
303
  print(f" skipped no_impr : {skipped_merged_no_impression}")
304
+ if report_mode == "split_cascade":
305
+ print(f" skipped impr w/o findings : {skipped_cascade_no_findings}")
306
  print(f" by split : {by_split}")
307
  print(f" by task : {by_task}")
308
 
 
325
  p.add_argument("--seed", type=int, default=42)
326
  p.add_argument("--image_suffix", type=str, default=".png")
327
  p.add_argument("--report_mode", type=str, default="split",
328
+ choices=["split", "merged", "split_cascade"],
329
  help="split: 2 samples/img (findings + impression). "
330
+ "merged: 1 sample/img with combined target. "
331
+ "split_cascade: like split, but impression sample's "
332
+ "prompt context = GT findings text (findings→impression).")
333
  p.add_argument("--image_mode", type=str, default="all_views_split",
334
  choices=["all_views_split", "frontal_only_split", "multi_image_merged"],
335
  help="all_views_split: 1 sample per image. "
data/mimic_cxr_builder.py ADDED
@@ -0,0 +1,391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ mimic_cxr_builder.py
3
+ --------------------
4
+ Parses the (pre-split) MIMIC-CXR layout into a unified instruction JSON
5
+ compatible with `CXRInstructDataset` — the MIMIC counterpart of
6
+ `iu_xray_builder.py`. Same JSON schema, same `report_mode` / `image_mode`
7
+ axes, so everything downstream (dataset, resolver, evaluate) is unchanged.
8
+
9
+ Expected on-disk layout (the custom MIMIC-CXR.zip used by the notebook,
10
+ NOT the raw PhysioNet tree):
11
+
12
+ {mimic_root}/
13
+ ├── train/pNN/pXXXXXXXX/sYYYYYYYY/<dicom>.jpg + sYYYYYYYY.txt
14
+ ├── valid/pNN/...
15
+ └── test /pNN/...
16
+ {anywhere under mimic_root}/ *chexpert*.csv (optional, auto-discovered)
17
+
18
+ RaDialog-style abnormality guidance
19
+ -----------------------------------
20
+ The 14 CheXpert labels are read from `mimic-cxr-2.0.0-chexpert.csv`
21
+ (CheXbert run on the ground-truth reports) and baked into the prompt as
22
+ `structured_findings`:
23
+
24
+ "Predicted Findings: Cardiomegaly, Pleural Effusion"
25
+ "Predicted Findings: No Finding" (when no positive label)
26
+
27
+ This is the *oracle* setting — GT labels, no trained image classifier and
28
+ no model change. The CheXpert classifier module stays unused; the existing
29
+ `structured_findings` prompt plumbing carries the string through train
30
+ (dataset.py) and eval (evaluate.py) untouched.
31
+
32
+ VQA
33
+ ---
34
+ VQA pairs live in a separate dataset and are attached by passing
35
+ `vqa_root` (mirrors the notebook). Omit it to build findings/impression
36
+ only.
37
+ """
38
+
39
+ import argparse
40
+ import csv
41
+ import glob
42
+ import json
43
+ import re
44
+ from pathlib import Path
45
+ from typing import Dict, List, Optional, Tuple
46
+
47
+
48
+ # ─── Report parsing (same regex the notebook validated on MIMIC) ────────────
49
+
50
+ _FINDINGS_RE = re.compile(r"FINDINGS\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
51
+ _IMPRESSION_RE = re.compile(r"IMPRESSION\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
52
+
53
+ # 14 CheXpert columns, in the canonical order used by the classifier head.
54
# Order is significant (it is the order in which positive labels are listed
# in the generated prompt string), so do not sort or reorder this list.
CHEXPERT_LABELS: List[str] = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]
70
+
71
+
72
+ def _clean(txt: str) -> str:
73
+ return re.sub(r"\s+", " ", txt).strip() if txt else ""
74
+
75
+
76
def _parse_report(txt_path: Path) -> Tuple[Optional[str], Optional[str]]:
    """Extract the FINDINGS and IMPRESSION sections from one report file.

    Args:
        txt_path: path of the report ``.txt`` file to read.

    Returns:
        ``(findings, impression)``. Each element is the whitespace-normalised
        section text, or ``None`` when the section header does not occur in
        the file. A header with no text after it yields ``""``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding; undecodable bytes are dropped, not raised.
    report_text = txt_path.read_text(encoding="utf-8", errors="ignore")
    findings_match = _FINDINGS_RE.search(report_text)
    impression_match = _IMPRESSION_RE.search(report_text)
    return (
        _clean(findings_match.group(1)) if findings_match else None,
        _clean(impression_match.group(1)) if impression_match else None,
    )
84
+
85
+
86
+ # ─── CheXpert CSV → "Predicted Findings: ..." string ────────────────────────
87
+
88
+ def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
89
+ if explicit:
90
+ p = Path(explicit)
91
+ return p if p.is_file() else None
92
+ # Auto-discover anything that looks like the CheXpert label CSV.
93
+ for pat in ("*chexpert*.csv", "*chexbert*.csv"):
94
+ hits = sorted(glob.glob(str(mimic_root / "**" / pat), recursive=True))
95
+ if hits:
96
+ return Path(hits[0])
97
+ return None
98
+
99
+
100
def _load_chexpert_map(
    csv_path: Path,
    uncertain_policy: str = "ignore",  # "ignore" → only 1.0 positive | "positive" → -1.0 also positive
) -> Dict[Tuple[str, str], str]:
    """
    Read the CheXpert label CSV into a per-study prompt string.

    Args:
        csv_path: CSV with `subject_id` and `study_id` columns plus any of the
            `CHEXPERT_LABELS` columns (header matching ignores case and
            surrounding spaces).
        uncertain_policy: "ignore" counts only 1.0 as positive; "positive"
            also counts -1.0 (uncertain) as positive.

    Returns:
        {(subject_id, study_id): "Predicted Findings: A, B"} where the ids are
        strings with any leading p/s prefix and any ".0" tail removed. A row
        without positive pathologies maps to "Predicted Findings: No Finding".

    Raises:
        ValueError: unknown `uncertain_policy`, or the CSV lacks the
            subject_id/study_id columns.
    """
    if uncertain_policy not in ("ignore", "positive"):
        # Previously an unknown value silently behaved like "ignore".
        raise ValueError(
            f"uncertain_policy must be 'ignore' or 'positive', got {uncertain_policy!r}"
        )
    positive_values = {1.0, -1.0} if uncertain_policy == "positive" else {1.0}

    def _is_positive(cell) -> bool:
        # Compare numerically so "1", "1.0" and "1.00" are all recognised;
        # blanks and non-numeric cells count as not positive.
        try:
            return float(str(cell).strip()) in positive_values
        except ValueError:
            return False

    out: Dict[Tuple[str, str], str] = {}
    # utf-8-sig drops a leading BOM, which would otherwise corrupt the first
    # header name and make the subject_id lookup below fail.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        # tolerate case / spacing variations in the header
        col = {c.lower().strip(): c for c in reader.fieldnames or []}
        subj_c = col.get("subject_id")
        study_c = col.get("study_id")
        if subj_c is None or study_c is None:
            raise ValueError(
                f"{csv_path} missing subject_id/study_id columns "
                f"(have: {reader.fieldnames})"
            )
        label_cols = [(name, col[name.lower()]) for name in CHEXPERT_LABELS
                      if name.lower() in col]

        for row in reader:
            subj = str(row.get(subj_c) or "").strip().lstrip("p").split(".")[0]
            study = str(row.get(study_c) or "").strip().lstrip("s").split(".")[0]
            if not subj or not study:
                # Truncated / blank row: nothing to key the labels on.
                continue
            positives = [name for name, c in label_cols if _is_positive(row.get(c, ""))]
            # "No Finding" alone is reported as such; otherwise list the
            # genuine positives (drop a redundant "No Finding" if any
            # pathology is also positive).
            real = [p for p in positives if p != "No Finding"]
            txt = ", ".join(real) if real else "No Finding"
            out[(subj, study)] = f"Predicted Findings: {txt}"
    return out
144
+
145
+
146
+ # ─── Main builder ───────────────────────────────────────────────────────────
147
+
148
def build_mimic_cxr_instruct_json(
    mimic_root: str,
    output_path: str,
    chexpert_csv: Optional[str] = None,
    vqa_root: Optional[str] = None,
    report_mode: str = "split",           # "split" | "merged" | "split_cascade"
    image_mode: str = "all_views_split",  # "all_views_split" | "frontal_only_split" | "multi_image_merged"
    uncertain_policy: str = "ignore",     # how CheXpert -1.0 (uncertain) is treated
) -> str:
    """
    Build the unified MIMIC-CXR instruction JSON.

    report_mode:
      "split"         → one findings and one impression sample per image
                        group; both carry the CheXpert string in
                        `structured_findings`.
      "merged"        → one task=report sample whose target is whatever
                        `format_merged_report(findings, impression)` returns;
                        skipped when that is None. Carries the CheXpert string.
      "split_cascade" → findings sample carries the CheXpert string; the
                        impression sample carries "Findings: <GT findings>"
                        instead and is only emitted when findings exist.

    image_mode:
      "all_views_split"    → one sample group per image.
      "frontal_only_split" → only the FIRST image (sorted by name) of the
                             study; no ViewPosition metadata is consulted.
      "multi_image_merged" → one group holding every image of the study.

    Args:
        mimic_root: folder containing train/ valid/ test/ sub-directories.
        output_path: JSON file to write (parent directories are created).
        chexpert_csv: explicit label CSV; auto-discovered under `mimic_root`
            when omitted.
        vqa_root: optional folder with {train,valid,test}.json VQA rows.
        uncertain_policy: passed to the CheXpert CSV loader.

    Returns:
        The absolute output path as a string.

    Raises:
        AssertionError: invalid `report_mode` / `image_mode`.
        FileNotFoundError: none of train/valid/test exists under `mimic_root`.
    """
    assert report_mode in ("split", "merged", "split_cascade"), \
        f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
    assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
        f"image_mode invalid: {image_mode!r}"

    from .dataset import format_merged_report  # local import to avoid cycle

    mimic_root = Path(mimic_root)
    output_path = Path(output_path)

    # split dir name → split label written into the JSON
    split_dirs = {
        "train": "train",
        "valid": "validate",
        "test": "test",
    }
    present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
    if not present:
        raise FileNotFoundError(
            f"No train/valid/test subdirs under {mimic_root}. "
            f"Expected the pre-split MIMIC-CXR layout."
        )

    # ── CheXpert labels ───────────────────────────────────────────────────
    csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
    if csv_path is not None:
        chexpert_map = _load_chexpert_map(csv_path, uncertain_policy)
        print(f"[mimic_cxr_builder] CheXpert CSV: {csv_path} "
              f"({len(chexpert_map):,} studies, uncertain={uncertain_policy})")
    else:
        chexpert_map = {}
        print("[mimic_cxr_builder] WARNING: no *chexpert*.csv found under "
              f"{mimic_root} and none passed via --chexpert_csv. "
              "structured_findings will be null (RaDialog abnormality "
              "guidance DISABLED). Add the CSV to enable it.")

    # ── Pass 1: index studies ─────────────────────────────────────────────
    samples: List[Dict] = []
    image_index: Dict[str, str] = {}  # root-relative image path → split label
    n_studies = n_missing_report = n_no_chexpert = 0
    skipped_merged_no_impression = skipped_cascade_no_findings = 0

    def _structured_for(subj: str, study: str) -> Optional[str]:
        """Look up the CheXpert prompt string by the bare (prefix-less) ids."""
        return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))

    def _path_groups(rels: List[str]) -> List[Dict]:
        """Turn a study's image paths into path_fields dicts per image_mode."""
        if image_mode == "all_views_split":
            return [{"image_path": r, "image_paths": None} for r in rels]
        if image_mode == "frontal_only_split":
            return [{"image_path": rels[0], "image_paths": None}]
        return [{"image_path": None, "image_paths": rels}]  # multi_image_merged

    # Every directory level is sorted so the emitted sample order (and the
    # report file picked below) does not depend on filesystem listing order.
    for split_sub, split_dir in present.items():
        split_label = split_dirs[split_sub]
        for p_dir in sorted(split_dir.glob("p*")):
            for pat_dir in sorted(p_dir.glob("p*")):
                for study_dir in sorted(pat_dir.glob("s*")):
                    imgs = sorted(study_dir.glob("*.jpg"))
                    if not imgs:
                        continue
                    n_studies += 1
                    subj, study = pat_dir.name, study_dir.name
                    txts = sorted(study_dir.glob("*.txt"))
                    if not txts:
                        n_missing_report += 1
                        continue
                    findings, impression = _parse_report(txts[0])
                    structured = _structured_for(subj, study)
                    if structured is None:
                        n_no_chexpert += 1

                    rels = [
                        f"{split_sub}/{p_dir.name}/{subj}/{study}/{img.name}"
                        for img in imgs
                    ]
                    for r in rels:
                        image_index[r] = split_label

                    for path_fields in _path_groups(rels):
                        base = {
                            **path_fields,
                            "question": None,
                            "split": split_label,
                            "study_id": study,
                            "subject_id": subj,
                        }
                        if report_mode == "merged":
                            target = format_merged_report(findings, impression)
                            if target is None:
                                skipped_merged_no_impression += 1
                                continue
                            samples.append({**base, "task": "report",
                                            "target": target,
                                            "structured_findings": structured})
                        elif report_mode == "split_cascade":
                            if findings:
                                samples.append({**base, "task": "findings",
                                                "target": findings,
                                                "structured_findings": structured})
                            if impression:
                                if not findings:
                                    skipped_cascade_no_findings += 1
                                else:
                                    samples.append({**base, "task": "impression",
                                                    "target": impression,
                                                    "structured_findings":
                                                        f"Findings: {findings}"})
                        else:  # "split"
                            if findings:
                                samples.append({**base, "task": "findings",
                                                "target": findings,
                                                "structured_findings": structured})
                            if impression:
                                samples.append({**base, "task": "impression",
                                                "target": impression,
                                                "structured_findings": structured})

    # ── Pass 2: optional VQA attach ───────────────────────────────────────
    n_vqa = n_vqa_dropped = 0
    if vqa_root:
        vqa_root = Path(vqa_root)

        # Map every trailing path-component suffix ("x.jpg", "s1/x.jpg", ...)
        # to the first indexed image that ends with it, so each VQA row is
        # resolved with one dict lookup instead of a scan over all images.
        # Matching is on whole path components only.
        suffix_index: Dict[str, str] = {}
        for rel in image_index:
            parts = rel.split("/")
            for start in range(len(parts)):
                suffix_index.setdefault("/".join(parts[start:]), rel)

        # NOTE(review): the split written for a VQA sample comes from the VQA
        # file name, not from the split directory of the matched image —
        # confirm the two always agree, otherwise images can cross splits.
        for fname, split_label in (("train", "train"),
                                   ("valid", "validate"),
                                   ("test", "test")):
            vqa_file = vqa_root / f"{fname}.json"
            if not vqa_file.is_file():
                continue
            with open(vqa_file, encoding="utf-8") as fh:
                rows = json.load(fh)
            for row in rows:
                sub_rel = str(row["image_path"]).lstrip("/")
                if sub_rel.startswith("files/"):
                    sub_rel = sub_rel[len("files/"):]
                hit = suffix_index.get(sub_rel)
                if hit is None:
                    n_vqa_dropped += 1
                    continue
                ans = row.get("answer", [])
                answer = (", ".join(map(str, ans)) if isinstance(ans, list)
                          else str(ans)) or "No."
                samples.append({
                    "image_path": hit, "image_paths": None,
                    "task": "vqa", "target": answer,
                    "question": row["question"],
                    "structured_findings": None,
                    "split": split_label,
                    "study_id": row.get("study_id"),
                    "subject_id": row.get("subject_id"),
                })
                n_vqa += 1

    # ── Write ─────────────────────────────────────────────────────────────
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False)

    by_split, by_task = {}, {}
    for s in samples:
        by_split[s["split"]] = by_split.get(s["split"], 0) + 1
        by_task[s["task"]] = by_task.get(s["task"], 0) + 1

    print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
    print(f" report_mode : {report_mode}")
    print(f" image_mode : {image_mode}")
    print(f" studies indexed : {n_studies:,}")
    print(f" missing report : {n_missing_report:,}")
    print(f" studies w/o chexpert label : {n_no_chexpert:,}")
    if report_mode == "merged":
        print(f" skipped no_impr : {skipped_merged_no_impression:,}")
    if report_mode == "split_cascade":
        print(f" skipped impr w/o findings : {skipped_cascade_no_findings:,}")
    if vqa_root:
        print(f" vqa added/dropped: {n_vqa:,} / {n_vqa_dropped:,}")
    print(f" by split : {by_split}")
    print(f" by task : {by_task}")
    # resolve() so the returned path is absolute, as documented.
    return str(output_path.resolve())
357
+
358
+
359
+ # ─── CLI ────────────────────────────────────────────────────────────────────
360
+
361
+ def _parse_args():
362
+ p = argparse.ArgumentParser(description="Build MIMIC-CXR unified instruction JSON")
363
+ p.add_argument("--mimic_root", required=True,
364
+ help="Folder containing train/ valid/ test/ subdirs")
365
+ p.add_argument("--output", required=True, help="Output JSON path")
366
+ p.add_argument("--chexpert_csv", default=None,
367
+ help="Path to mimic-cxr-2.0.0-chexpert.csv "
368
+ "(auto-discovered under --mimic_root if omitted)")
369
+ p.add_argument("--vqa_root", default=None,
370
+ help="Folder with {train,valid,test}.json VQA pairs (optional)")
371
+ p.add_argument("--report_mode", default="split",
372
+ choices=["split", "merged", "split_cascade"])
373
+ p.add_argument("--image_mode", default="all_views_split",
374
+ choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
375
+ p.add_argument("--uncertain_policy", default="ignore",
376
+ choices=["ignore", "positive"],
377
+ help="CheXpert -1.0 (uncertain): ignore (default) or treat as positive.")
378
+ return p.parse_args()
379
+
380
+
381
if __name__ == "__main__":
    # Script entry point: read the CLI options and run one build with them.
    cli = _parse_args()
    build_mimic_cxr_instruct_json(
        mimic_root=cli.mimic_root,
        output_path=cli.output,  # the CLI flag is --output
        chexpert_csv=cli.chexpert_csv,
        vqa_root=cli.vqa_root,
        report_mode=cli.report_mode,
        image_mode=cli.image_mode,
        uncertain_policy=cli.uncertain_policy,
    )
utils/dataset_resolver.py CHANGED
@@ -36,7 +36,7 @@ class DatasetSpec:
36
  instruct_json: str # passed to CXRInstructDataset
37
  tasks: List[str] # which tasks exist in this dataset
38
  task_weights: Dict[str, float] # normalized over `tasks`
39
- report_mode: str = "split" # "split" | "merged"
40
  image_mode: str = "all_views_split" # "all_views_split" | "frontal_only_split" | "multi_image_merged"
41
  max_images: int = 1 # >1 only when image_mode == multi_image_merged
42
 
@@ -50,16 +50,20 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
50
  missing and `iu_xray.auto_build == true`.
51
 
52
  The choice of which tasks are "available" depends on `data.report_mode`:
53
- "split" → findings, impression (+ vqa for MIMIC)
54
- "merged" → report (+ vqa for MIMIC)
 
 
 
55
  """
56
  name = _get(train_cfg.data, "dataset_name", "MIMIC-CXR")
57
  report_mode = _get(train_cfg.data, "report_mode", "split")
58
  image_mode = _get(train_cfg.data, "image_mode", "all_views_split")
59
  max_images = int(_get(train_cfg.data, "max_images_per_sample", 2))
60
- if report_mode not in ("split", "merged"):
61
  raise ValueError(
62
- f"data.report_mode must be 'split' or 'merged', got {report_mode!r}"
 
63
  )
64
  if image_mode not in ("all_views_split", "frontal_only_split", "multi_image_merged"):
65
  raise ValueError(
@@ -105,7 +109,9 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
105
  else:
106
  available = ["findings", "impression", "vqa"]
107
  image_root = train_cfg.data.mimic_cxr_root
108
- instruct_json = train_cfg.data.instruct_json
 
 
109
 
110
  else: # IU-Xray
111
  # IU has no VQA.
@@ -184,6 +190,50 @@ def _ensure_iu_json_exists(iu_cfg,
184
  return str(out)
185
 
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
188
 
189
  def resolve_run_id(
 
36
  instruct_json: str # passed to CXRInstructDataset
37
  tasks: List[str] # which tasks exist in this dataset
38
  task_weights: Dict[str, float] # normalized over `tasks`
39
+ report_mode: str = "split" # "split" | "merged" | "split_cascade"
40
  image_mode: str = "all_views_split" # "all_views_split" | "frontal_only_split" | "multi_image_merged"
41
  max_images: int = 1 # >1 only when image_mode == multi_image_merged
42
 
 
50
  missing and `iu_xray.auto_build == true`.
51
 
52
  The choice of which tasks are "available" depends on `data.report_mode`:
53
+ "split" → findings, impression (+ vqa for MIMIC)
54
+ "merged" → report (+ vqa for MIMIC)
55
+ "split_cascade" → findings, impression (+ vqa for MIMIC); same task set
56
+ and weights as "split" — only the data builder differs
57
+ (impression sample carries GT findings as context).
58
  """
59
  name = _get(train_cfg.data, "dataset_name", "MIMIC-CXR")
60
  report_mode = _get(train_cfg.data, "report_mode", "split")
61
  image_mode = _get(train_cfg.data, "image_mode", "all_views_split")
62
  max_images = int(_get(train_cfg.data, "max_images_per_sample", 2))
63
+ if report_mode not in ("split", "merged", "split_cascade"):
64
  raise ValueError(
65
+ f"data.report_mode must be 'split', 'merged', or 'split_cascade', "
66
+ f"got {report_mode!r}"
67
  )
68
  if image_mode not in ("all_views_split", "frontal_only_split", "multi_image_merged"):
69
  raise ValueError(
 
109
  else:
110
  available = ["findings", "impression", "vqa"]
111
  image_root = train_cfg.data.mimic_cxr_root
112
+ instruct_json = _ensure_mimic_json_exists(
113
+ train_cfg.data, report_mode, image_mode
114
+ )
115
 
116
  else: # IU-Xray
117
  # IU has no VQA.
 
190
  return str(out)
191
 
192
 
193
def _ensure_mimic_json_exists(data_cfg,
                              report_mode: str = "split",
                              image_mode: str = "all_views_split") -> str:
    """
    Return the path of the mode-specific MIMIC-CXR unified JSON, building it
    first if it does not exist yet.

    The configured `data.instruct_json` path is suffixed with both
    report_mode and image_mode (mimic_..._instruct__split__all_views_split.json)
    so each of the mode combinations gets its own cache and the RaDialog
    CheXpert-guided JSON never collides with one built under other settings.

    Auto-build (default on) reads `*chexpert*.csv` to bake the 14 oracle
    labels into structured_findings. Set `data.mimic_auto_build: false` to
    require a pre-built file instead.

    Args:
        data_cfg: the `data` section of the training config; read through
            the module's `_get` helper.
        report_mode: report mode used in the cache-file suffix and forwarded
            to the builder.
        image_mode: image mode used in the cache-file suffix and forwarded
            to the builder.

    Returns:
        The mode-suffixed JSON path as a string.

    Raises:
        FileNotFoundError: the JSON is missing and `data.mimic_auto_build`
            is false.
        ValueError: the JSON is missing, auto-build is enabled, but
            `data.mimic_cxr_root` is not set.
    """
    base = Path(_get(data_cfg, "instruct_json",
                     "data/data_files/mimic_cxr_instruct_unified.json"))
    out = base.with_name(f"{base.stem}__{report_mode}__{image_mode}{base.suffix}")
    if out.is_file():
        # Cache hit for this (report_mode, image_mode) combination.
        return str(out)

    mimic_root = _get(data_cfg, "mimic_cxr_root")

    if not bool(_get(data_cfg, "mimic_auto_build", True)):
        raise FileNotFoundError(
            f"MIMIC instruct JSON not found at {out} and "
            f"data.mimic_auto_build=false. Run: python -m data.mimic_cxr_builder "
            f"--mimic_root {mimic_root} --output {out} "
            f"--report_mode {report_mode} --image_mode {image_mode}"
        )

    # Fail fast instead of handing the builder the literal string "None":
    # str(None) would otherwise be used as the dataset root directory.
    # NOTE(review): assumes `_get` returns None for an absent key when no
    # default is given, as the optional chexpert/vqa lookups below suggest.
    if mimic_root is None:
        raise ValueError(
            f"Cannot auto-build MIMIC instruct JSON at {out}: "
            f"data.mimic_cxr_root is not set."
        )

    # Imported lazily so the builder module is only loaded when a build is
    # actually required.
    from data.mimic_cxr_builder import build_mimic_cxr_instruct_json
    print(f"[dataset_resolver] MIMIC JSON not found → auto-building "
          f"(report_mode={report_mode}, image_mode={image_mode}) …")
    build_mimic_cxr_instruct_json(
        mimic_root=str(mimic_root),
        output_path=str(out),
        chexpert_csv=_get(data_cfg, "mimic_chexpert_csv"),
        vqa_root=_get(data_cfg, "mimic_vqa_root"),
        report_mode=report_mode,
        image_mode=image_mode,
        uncertain_policy=str(_get(data_cfg, "mimic_uncertain_policy", "ignore")),
    )
    return str(out)
235
+
236
+
237
  # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
238
 
239
  def resolve_run_id(