feat(data): add split_cascade report mode + MIMIC-CXR builder with CheXpert oracle labels
- New report_mode "split_cascade": findings task unchanged (image+CheXpert),
impression task takes GT findings text as prompt context (findings→impression
summarisation). Wired through iu_xray_builder, dataset_resolver, config docs.
- New data/mimic_cxr_builder.py: parses the pre-split MIMIC layout, bakes the
14 GT CheXpert labels (oracle, from *chexpert*.csv) into structured_findings
as "Predicted Findings: ..." — the RaDialog image + abnormality-guidance
setup. Supports all 3 report_mode / image_mode axes; optional VQA attach.
- Implement build_instruct_json() (was NotImplementedError) as a thin delegate.
- dataset_resolver: auto-build MIMIC JSON (mode-suffixed cache) when missing.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- configs/train_config.yaml +35 -8
- data/dataset.py +31 -45
- data/iu_xray_builder.py +51 -5
- data/mimic_cxr_builder.py +391 -0
- utils/dataset_resolver.py +56 -6
|
@@ -9,11 +9,21 @@ data:
|
|
| 9 |
dataset_name: "IU-Xray"
|
| 10 |
|
| 11 |
# How findings and impression are turned into training samples.
|
| 12 |
-
# "split"
|
| 13 |
-
#
|
| 14 |
-
#
|
| 15 |
-
#
|
| 16 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# When switching modes, the on-disk instruct JSON must be rebuilt (delete it
|
| 18 |
# to trigger auto_build, or rerun iu_xray_builder.py with --report_mode).
|
| 19 |
report_mode: "split"
|
|
@@ -43,11 +53,28 @@ data:
|
|
| 43 |
max_images_per_sample: 2 # only used when image_mode == "multi_image_merged"
|
| 44 |
|
| 45 |
# --- MIMIC-CXR paths (used when dataset_name == "MIMIC-CXR") ---
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
| 49 |
instruct_json: "data/data_files/mimic_cxr_instruct_unified.json"
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
# --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
|
| 52 |
# On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
|
| 53 |
# On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
|
|
|
|
| 9 |
dataset_name: "IU-Xray"
|
| 10 |
|
| 11 |
# How findings and impression are turned into training samples.
|
| 12 |
+
# "split" → 2 separate tasks: task=findings and task=impression
|
| 13 |
+
# (independent losses; pre-existing behaviour). Both take
|
| 14 |
+
# image (+ CheXpert label) as input.
|
| 15 |
+
# "merged" → 1 task: task=report, target =
|
| 16 |
+
# "Findings: ...\n\nImpression: ...". Model generates the
|
| 17 |
+
# full report autoregressively; impression is conditioned
|
| 18 |
+
# on the findings it just emitted.
|
| 19 |
+
# "split_cascade" → like "split" (2 separate tasks) but the impression
|
| 20 |
+
# sample's prompt context is the GROUND-TRUTH findings
|
| 21 |
+
# text (findings→impression summarisation) instead of the
|
| 22 |
+
# CheXpert label. findings task is unchanged. Studies
|
| 23 |
+
# without a findings section emit no impression sample.
|
| 24 |
+
# Train + eval are teacher-forced (impression sees GT
|
| 25 |
+
# findings); a true cascade eval feeding the model's own
|
| 26 |
+
# generated findings is not implemented yet.
|
| 27 |
# When switching modes, the on-disk instruct JSON must be rebuilt (delete it
|
| 28 |
# to trigger auto_build, or rerun iu_xray_builder.py with --report_mode).
|
| 29 |
report_mode: "split"
|
|
|
|
| 53 |
max_images_per_sample: 2 # only used when image_mode == "multi_image_merged"
|
| 54 |
|
| 55 |
# --- MIMIC-CXR paths (used when dataset_name == "MIMIC-CXR") ---
|
| 56 |
+
# Layout expected: {mimic_cxr_root}/{train,valid,test}/pNN/pXXXX/sYYYY/*.jpg
|
| 57 |
+
# plus the report sYYYY.txt sitting in the same study dir (reports are NOT a
|
| 58 |
+
# separate tree in this pre-split layout).
|
| 59 |
+
mimic_cxr_root: "/path/to/MIMIC-CXR"
|
| 60 |
instruct_json: "data/data_files/mimic_cxr_instruct_unified.json"
|
| 61 |
|
| 62 |
+
# RaDialog abnormality guidance: the 14 CheXpert labels (oracle / GT) are
|
| 63 |
+
# read from this CSV and baked into the prompt as
|
| 64 |
+
# "Predicted Findings: ...". If left null the builder auto-discovers any
|
| 65 |
+
# *chexpert*.csv under mimic_cxr_root; if none is found, structured_findings
|
| 66 |
+
# is null and abnormality guidance is DISABLED (the builder prints a warning).
|
| 67 |
+
mimic_chexpert_csv: null
|
| 68 |
+
# How CheXpert -1.0 (uncertain) is mapped: "ignore" (only 1.0 positive,
|
| 69 |
+
# default, matches the classifier head) | "positive" (treat -1.0 as positive).
|
| 70 |
+
mimic_uncertain_policy: "ignore"
|
| 71 |
+
# Optional VQA pairs dir with {train,valid,test}.json. null → skip VQA.
|
| 72 |
+
mimic_vqa_root: null
|
| 73 |
+
# Auto-build the unified JSON (with CheXpert labels) when the cached
|
| 74 |
+
# report_mode/image_mode-suffixed file is missing. Set false to require a
|
| 75 |
+
# pre-built file (built via `python -m data.mimic_cxr_builder ...`).
|
| 76 |
+
mimic_auto_build: true
|
| 77 |
+
|
| 78 |
# --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
|
| 79 |
# On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
|
| 80 |
# On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
|
|
@@ -284,52 +284,38 @@ class CXRInstructDataset(Dataset):
|
|
| 284 |
|
| 285 |
def build_instruct_json(
|
| 286 |
mimic_cxr_root: str,
|
| 287 |
-
mimic_reports_root: str,
|
| 288 |
-
vqa_data_root: str,
|
| 289 |
output_path: str,
|
| 290 |
-
|
| 291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
"""
|
| 293 |
-
Build the unified instruction JSON
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
"structured_findings": "Predicted Findings: No Finding",
|
| 310 |
-
"split": "train",
|
| 311 |
-
"study_id": "s50414267",
|
| 312 |
-
"subject_id": "p10000032"
|
| 313 |
-
},
|
| 314 |
-
{
|
| 315 |
-
"image_path": "files/p10/p10000032/s50414267/02aa804e.jpg",
|
| 316 |
-
"task": "impression",
|
| 317 |
-
"target": "No acute cardiopulmonary process.",
|
| 318 |
-
...
|
| 319 |
-
},
|
| 320 |
-
{
|
| 321 |
-
"image_path": "files/p10/p10000032/s50414267/02aa804e.jpg",
|
| 322 |
-
"task": "vqa",
|
| 323 |
-
"target": "Yes, there is mild pleural effusion.",
|
| 324 |
-
"question": "Is there pleural effusion in this X-ray?",
|
| 325 |
-
...
|
| 326 |
-
},
|
| 327 |
-
...
|
| 328 |
-
]
|
| 329 |
"""
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
)
|
|
|
|
| 284 |
|
| 285 |
def build_instruct_json(
    mimic_cxr_root: str,
    output_path: str,
    chexpert_csv: Optional[str] = None,
    vqa_data_root: Optional[str] = None,
    report_mode: str = "split",
    image_mode: str = "all_views_split",
    uncertain_policy: str = "ignore",
) -> str:
    """
    Build the unified MIMIC-CXR instruction JSON.

    Thin delegate to `data.mimic_cxr_builder.build_mimic_cxr_instruct_json`,
    which walks the pre-split MIMIC layout (train/valid/test), parses
    findings/impression from the report .txt files, and bakes the 14 CheXpert
    labels (oracle, from `*chexpert*.csv`) into `structured_findings` as
    "Predicted Findings: ..." — the RaDialog image + abnormality-guidance
    setup. `report_mode` / `image_mode` mirror the IU builder.

    Args:
        mimic_cxr_root:   Folder holding the train/ valid/ test/ subdirs.
        output_path:      Where the JSON is written.
        chexpert_csv:     Explicit CheXpert label CSV; None lets the builder
                          auto-discover one under `mimic_cxr_root`.
        vqa_data_root:    Optional folder with {train,valid,test}.json VQA
                          pairs; None skips VQA samples.
        report_mode:      "split" | "merged" | "split_cascade".
        image_mode:       "all_views_split" | "frontal_only_split" |
                          "multi_image_merged".
        uncertain_policy: How CheXpert -1.0 is treated ("ignore" | "positive").

    Returns:
        Whatever the builder returns: the output JSON path as a string.

    Output entries match the shared schema, e.g.:
        {"image_path": "train/p10/p10000032/s50414267/02aa804e.jpg",
         "task": "findings", "target": "The lungs are clear...",
         "question": null,
         "structured_findings": "Predicted Findings: No Finding",
         "split": "train", "study_id": "s50414267",
         "subject_id": "p10000032"}
    """
    # Imported lazily: the builder module itself imports from `.dataset`
    # inside its entry point, so a top-level import here could be circular.
    from .mimic_cxr_builder import build_mimic_cxr_instruct_json

    # Note the two renamed keywords: mimic_cxr_root → mimic_root and
    # vqa_data_root → vqa_root.
    return build_mimic_cxr_instruct_json(
        mimic_root=mimic_cxr_root,
        output_path=output_path,
        chexpert_csv=chexpert_csv,
        vqa_root=vqa_data_root,
        report_mode=report_mode,
        image_mode=image_mode,
        uncertain_policy=uncertain_policy,
    )
|
|
@@ -75,7 +75,7 @@ def build_iu_xray_instruct_json(
|
|
| 75 |
test_ratio: float = 0.15,
|
| 76 |
seed: int = 42,
|
| 77 |
image_suffix: str = ".png",
|
| 78 |
-
report_mode: str = "split", # "split" | "merged"
|
| 79 |
image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
|
| 80 |
) -> str:
|
| 81 |
"""
|
|
@@ -88,12 +88,23 @@ def build_iu_xray_instruct_json(
|
|
| 88 |
"Findings: ...\n\nImpression: ...". Use when training a
|
| 89 |
single full-report generation task. Samples with only
|
| 90 |
findings are dropped (no impression to anchor on).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
Returns:
|
| 93 |
Absolute path to output JSON.
|
| 94 |
"""
|
| 95 |
-
assert report_mode in ("split", "merged"), \
|
| 96 |
-
f"report_mode must be 'split' or '
|
| 97 |
assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
|
| 98 |
f"image_mode must be one of all_views_split/frontal_only_split/multi_image_merged, got {image_mode!r}"
|
| 99 |
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, \
|
|
@@ -175,6 +186,7 @@ def build_iu_xray_instruct_json(
|
|
| 175 |
|
| 176 |
samples: List[Dict] = []
|
| 177 |
skipped_merged_no_impression = 0
|
|
|
|
| 178 |
|
| 179 |
def _per_study_image_groups(report_imgs):
|
| 180 |
"""
|
|
@@ -222,6 +234,36 @@ def build_iu_xray_instruct_json(
|
|
| 222 |
"split": split,
|
| 223 |
"report_id": report["report_id"],
|
| 224 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
else: # "split"
|
| 226 |
for task_name, text in (
|
| 227 |
("findings", report["findings"]),
|
|
@@ -259,6 +301,8 @@ def build_iu_xray_instruct_json(
|
|
| 259 |
print(f" skipped no_image : {skipped_no_image}")
|
| 260 |
if report_mode == "merged":
|
| 261 |
print(f" skipped no_impr : {skipped_merged_no_impression}")
|
|
|
|
|
|
|
| 262 |
print(f" by split : {by_split}")
|
| 263 |
print(f" by task : {by_task}")
|
| 264 |
|
|
@@ -281,9 +325,11 @@ def _parse_args():
|
|
| 281 |
p.add_argument("--seed", type=int, default=42)
|
| 282 |
p.add_argument("--image_suffix", type=str, default=".png")
|
| 283 |
p.add_argument("--report_mode", type=str, default="split",
|
| 284 |
-
choices=["split", "merged"],
|
| 285 |
help="split: 2 samples/img (findings + impression). "
|
| 286 |
-
"merged: 1 sample/img with combined target."
|
|
|
|
|
|
|
| 287 |
p.add_argument("--image_mode", type=str, default="all_views_split",
|
| 288 |
choices=["all_views_split", "frontal_only_split", "multi_image_merged"],
|
| 289 |
help="all_views_split: 1 sample per image. "
|
|
|
|
| 75 |
test_ratio: float = 0.15,
|
| 76 |
seed: int = 42,
|
| 77 |
image_suffix: str = ".png",
|
| 78 |
+
report_mode: str = "split", # "split" | "merged" | "split_cascade"
|
| 79 |
image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
|
| 80 |
) -> str:
|
| 81 |
"""
|
|
|
|
| 88 |
"Findings: ...\n\nImpression: ...". Use when training a
|
| 89 |
single full-report generation task. Samples with only
|
| 90 |
findings are dropped (no impression to anchor on).
|
| 91 |
+
"split_cascade" → like "split" (2 separate tasks) BUT the
|
| 92 |
+
impression sample carries the ground-truth findings
|
| 93 |
+
text as its prompt context (in `structured_findings`,
|
| 94 |
+
formatted "Findings: ...") instead of CheXpert
|
| 95 |
+
labels. Impression thus learns findings→impression
|
| 96 |
+
summarisation while still seeing the image. Only
|
| 97 |
+
studies with BOTH findings and impression emit an
|
| 98 |
+
impression sample (findings is its required input).
|
| 99 |
+
NOTE: eval is teacher-forced (impression gets GT
|
| 100 |
+
findings); a true cascade eval that feeds the
|
| 101 |
+
model's own generated findings is future work.
|
| 102 |
|
| 103 |
Returns:
|
| 104 |
Absolute path to output JSON.
|
| 105 |
"""
|
| 106 |
+
assert report_mode in ("split", "merged", "split_cascade"), \
|
| 107 |
+
f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
|
| 108 |
assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
|
| 109 |
f"image_mode must be one of all_views_split/frontal_only_split/multi_image_merged, got {image_mode!r}"
|
| 110 |
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, \
|
|
|
|
| 186 |
|
| 187 |
samples: List[Dict] = []
|
| 188 |
skipped_merged_no_impression = 0
|
| 189 |
+
skipped_cascade_no_findings = 0
|
| 190 |
|
| 191 |
def _per_study_image_groups(report_imgs):
|
| 192 |
"""
|
|
|
|
| 234 |
"split": split,
|
| 235 |
"report_id": report["report_id"],
|
| 236 |
})
|
| 237 |
+
elif report_mode == "split_cascade":
|
| 238 |
+
# findings sample: identical to "split".
|
| 239 |
+
if report["findings"] is not None:
|
| 240 |
+
samples.append({
|
| 241 |
+
**path_fields,
|
| 242 |
+
"task": "findings",
|
| 243 |
+
"target": report["findings"],
|
| 244 |
+
"question": None,
|
| 245 |
+
"structured_findings": None,
|
| 246 |
+
"split": split,
|
| 247 |
+
"report_id": report["report_id"],
|
| 248 |
+
})
|
| 249 |
+
# impression sample: needs findings as its prompt context, so
|
| 250 |
+
# only emit when BOTH sections exist. The GT findings ride in
|
| 251 |
+
# `structured_findings` (same plumbing CheXpert labels use) so
|
| 252 |
+
# train (dataset.py) and eval (evaluate.py) pick it up with no
|
| 253 |
+
# other code changes.
|
| 254 |
+
if report["impression"] is not None:
|
| 255 |
+
if report["findings"] is None:
|
| 256 |
+
skipped_cascade_no_findings += 1
|
| 257 |
+
else:
|
| 258 |
+
samples.append({
|
| 259 |
+
**path_fields,
|
| 260 |
+
"task": "impression",
|
| 261 |
+
"target": report["impression"],
|
| 262 |
+
"question": None,
|
| 263 |
+
"structured_findings": f"Findings: {report['findings'].strip()}",
|
| 264 |
+
"split": split,
|
| 265 |
+
"report_id": report["report_id"],
|
| 266 |
+
})
|
| 267 |
else: # "split"
|
| 268 |
for task_name, text in (
|
| 269 |
("findings", report["findings"]),
|
|
|
|
| 301 |
print(f" skipped no_image : {skipped_no_image}")
|
| 302 |
if report_mode == "merged":
|
| 303 |
print(f" skipped no_impr : {skipped_merged_no_impression}")
|
| 304 |
+
if report_mode == "split_cascade":
|
| 305 |
+
print(f" skipped impr w/o findings : {skipped_cascade_no_findings}")
|
| 306 |
print(f" by split : {by_split}")
|
| 307 |
print(f" by task : {by_task}")
|
| 308 |
|
|
|
|
| 325 |
p.add_argument("--seed", type=int, default=42)
|
| 326 |
p.add_argument("--image_suffix", type=str, default=".png")
|
| 327 |
p.add_argument("--report_mode", type=str, default="split",
|
| 328 |
+
choices=["split", "merged", "split_cascade"],
|
| 329 |
help="split: 2 samples/img (findings + impression). "
|
| 330 |
+
"merged: 1 sample/img with combined target. "
|
| 331 |
+
"split_cascade: like split, but impression sample's "
|
| 332 |
+
"prompt context = GT findings text (findings→impression).")
|
| 333 |
p.add_argument("--image_mode", type=str, default="all_views_split",
|
| 334 |
choices=["all_views_split", "frontal_only_split", "multi_image_merged"],
|
| 335 |
help="all_views_split: 1 sample per image. "
|
|
@@ -0,0 +1,391 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
mimic_cxr_builder.py
|
| 3 |
+
--------------------
|
| 4 |
+
Parses the (pre-split) MIMIC-CXR layout into a unified instruction JSON
|
| 5 |
+
compatible with `CXRInstructDataset` — the MIMIC counterpart of
|
| 6 |
+
`iu_xray_builder.py`. Same JSON schema, same `report_mode` / `image_mode`
|
| 7 |
+
axes, so everything downstream (dataset, resolver, evaluate) is unchanged.
|
| 8 |
+
|
| 9 |
+
Expected on-disk layout (the custom MIMIC-CXR.zip used by the notebook,
|
| 10 |
+
NOT the raw PhysioNet tree):
|
| 11 |
+
|
| 12 |
+
{mimic_root}/
|
| 13 |
+
├── train/pNN/pXXXXXXXX/sYYYYYYYY/<dicom>.jpg + sYYYYYYYY.txt
|
| 14 |
+
├── valid/pNN/...
|
| 15 |
+
└── test /pNN/...
|
| 16 |
+
{anywhere under mimic_root}/ *chexpert*.csv (optional, auto-discovered)
|
| 17 |
+
|
| 18 |
+
RaDialog-style abnormality guidance
|
| 19 |
+
-----------------------------------
|
| 20 |
+
The 14 CheXpert labels are read from `mimic-cxr-2.0.0-chexpert.csv`
|
| 21 |
+
(CheXpert labeler output on the ground-truth reports) and baked into the prompt as
|
| 22 |
+
`structured_findings`:
|
| 23 |
+
|
| 24 |
+
"Predicted Findings: Cardiomegaly, Pleural Effusion"
|
| 25 |
+
"Predicted Findings: No Finding" (when no positive label)
|
| 26 |
+
|
| 27 |
+
This is the *oracle* setting — GT labels, no trained image classifier and
|
| 28 |
+
no model change. The CheXpert classifier module stays unused; the existing
|
| 29 |
+
`structured_findings` prompt plumbing carries the string through train
|
| 30 |
+
(dataset.py) and eval (evaluate.py) untouched.
|
| 31 |
+
|
| 32 |
+
VQA
|
| 33 |
+
---
|
| 34 |
+
VQA pairs live in a separate dataset and are attached by passing
|
| 35 |
+
`vqa_root` (mirrors the notebook). Omit it to build findings/impression
|
| 36 |
+
only.
|
| 37 |
+
"""
|
| 38 |
+
|
| 39 |
+
import argparse
|
| 40 |
+
import csv
|
| 41 |
+
import glob
|
| 42 |
+
import json
|
| 43 |
+
import re
|
| 44 |
+
from pathlib import Path
|
| 45 |
+
from typing import Dict, List, Optional, Tuple
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ─── Report parsing (same regex the notebook validated on MIMIC) ────────────
|
| 49 |
+
|
| 50 |
+
# Each pattern captures the text after its section header up to the next
# line that looks like "<3+ letters/spaces>:" or the end of the report.
# NOTE: re.I applies to the whole pattern, so the `[A-Z ]{3,}` header
# lookahead also matches lower/mixed-case lines ending in ":" (e.g. a wrapped
# line starting "Note:"), which will end the captured section early.
_FINDINGS_RE = re.compile(r"FINDINGS\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
_IMPRESSION_RE = re.compile(r"IMPRESSION\s*:\s*(.*?)(?=\n\s*[A-Z ]{3,}\s*:|\Z)", re.S | re.I)
|
| 52 |
+
|
| 53 |
+
# 14 CheXpert columns, in the canonical order used by the classifier head.
# The order also fixes the order in which positive labels are listed in the
# generated "Predicted Findings: ..." string.
CHEXPERT_LABELS = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def _clean(txt: Optional[str]) -> str:
    """Collapse every whitespace run in *txt* to a single space and trim it.

    Returns "" when *txt* is None or empty.
    """
    if not txt:
        return ""
    return re.sub(r"\s+", " ", txt).strip()
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _parse_report(txt_path: Path) -> Tuple[Optional[str], Optional[str]]:
    """Extract the FINDINGS and IMPRESSION sections from one report file.

    Args:
        txt_path: Path to the study's free-text report.

    Returns:
        (findings, impression). Each is whitespace-normalised text, or None
        when the section header is absent or the section body is empty.
    """
    # Explicit encoding so parsing does not depend on the platform's locale
    # default; undecodable bytes are dropped rather than raising.
    raw = txt_path.read_text(encoding="utf-8", errors="ignore")

    def _section(pattern) -> Optional[str]:
        match = pattern.search(raw)
        if match is None:
            return None
        # An empty section ("IMPRESSION:" followed by nothing) is treated the
        # same as a missing one, so callers never see "" as a target.
        return _clean(match.group(1)) or None

    return _section(_FINDINGS_RE), _section(_IMPRESSION_RE)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ─── CheXpert CSV → "Predicted Findings: ..." string ────────────────────────
|
| 87 |
+
|
| 88 |
+
def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
    """Locate the CheXpert label CSV.

    Args:
        mimic_root: Dataset root searched recursively when no explicit path
                    is given.
        explicit:   User-supplied CSV path. When set, it is the only
                    candidate: no auto-discovery fallback happens.

    Returns:
        The CSV path, or None when `explicit` is not an existing file or
        when auto-discovery finds nothing.
    """
    if explicit:
        candidate = Path(explicit)
        return candidate if candidate.is_file() else None

    # Escape the root so glob metacharacters in the directory name
    # ("[", "]", "*", "?") are matched literally instead of as a pattern.
    root = Path(glob.escape(str(mimic_root)))

    # Auto-discover anything that looks like the CheXpert label CSV; the
    # "chexpert" spelling takes priority over "chexbert".
    for pattern in ("*chexpert*.csv", "*chexbert*.csv"):
        hits = sorted(
            h for h in glob.glob(str(root / "**" / pattern), recursive=True)
            if Path(h).is_file()
        )
        if hits:
            return Path(hits[0])
    return None
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def _load_chexpert_map(
    csv_path: Path,
    uncertain_policy: str = "ignore",  # "ignore" → only 1.0 positive | "positive" → -1.0 also positive
    labels: Optional[List[str]] = None,
) -> Dict[Tuple[str, str], str]:
    """
    Return {(subject_id, study_id): "Predicted Findings: A, B"} where the ids
    are the bare integers as strings (any leading p/s prefix and any trailing
    ".0" are stripped).

    Args:
        csv_path:         CheXpert label CSV with subject_id / study_id
                          columns plus one column per label.
        uncertain_policy: "ignore" counts only 1.0 as positive; "positive"
                          also counts -1.0 (uncertain).
        labels:           Label names to read, in output order. Defaults to
                          CHEXPERT_LABELS.

    Raises:
        ValueError: unknown `uncertain_policy`, missing id columns, or none
                    of the label columns present in the CSV.
    """
    if uncertain_policy not in ("ignore", "positive"):
        raise ValueError(
            f"uncertain_policy must be 'ignore' or 'positive', got {uncertain_policy!r}"
        )
    positive_values = {1.0, -1.0} if uncertain_policy == "positive" else {1.0}
    wanted = CHEXPERT_LABELS if labels is None else labels

    def _is_positive(cell) -> bool:
        # Compare numerically so "1", "1.0" and "1.00" all count; blank or
        # non-numeric cells are simply not positive.
        try:
            return float(cell) in positive_values
        except (TypeError, ValueError):
            return False

    def _bare_id(cell, prefix: str) -> str:
        # "p10000032" / "10000032.0" / " 10000032 " → "10000032"
        return str(cell or "").strip().lstrip(prefix).split(".")[0]

    out: Dict[Tuple[str, str], str] = {}
    # utf-8-sig strips a leading BOM, which would otherwise become part of
    # the first header name and hide the subject_id column.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        # tolerate case / spacing variations in the header
        col = {c.lower().strip(): c for c in reader.fieldnames or []}
        subj_c = col.get("subject_id")
        study_c = col.get("study_id")
        if subj_c is None or study_c is None:
            raise ValueError(
                f"{csv_path} missing subject_id/study_id columns "
                f"(have: {reader.fieldnames})"
            )

        label_cols = [(name, col[name.lower()]) for name in wanted
                      if name.lower() in col]
        if not label_cols:
            # Without this check every study would silently be labelled
            # "No Finding".
            raise ValueError(
                f"{csv_path} has none of the expected label columns "
                f"(have: {reader.fieldnames})"
            )
        missing = [name for name in wanted if name.lower() not in col]
        if missing:
            print(f"[mimic_cxr_builder] WARNING: {csv_path} lacks label "
                  f"columns {missing}; they are treated as not positive.")

        for row in reader:
            subj = _bare_id(row.get(subj_c), "p")
            study = _bare_id(row.get(study_c), "s")
            if not subj or not study:
                continue  # malformed / short row: nothing to key on
            # "No Finding" alone is reported as such; otherwise list the
            # genuine positives (drop a redundant "No Finding" if any
            # pathology is also positive).
            real = [name for name, c in label_cols
                    if name != "No Finding" and _is_positive(row.get(c))]
            txt = ", ".join(real) if real else "No Finding"
            out[(subj, study)] = f"Predicted Findings: {txt}"
    return out
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# ─── Main builder ───────────────────────────────────────────────────────────
|
| 147 |
+
|
| 148 |
+
def build_mimic_cxr_instruct_json(
    mimic_root: str,
    output_path: str,
    chexpert_csv: Optional[str] = None,
    vqa_root: Optional[str] = None,
    report_mode: str = "split",           # "split" | "merged" | "split_cascade"
    image_mode: str = "all_views_split",  # "all_views_split" | "frontal_only_split" | "multi_image_merged"
    uncertain_policy: str = "ignore",     # how CheXpert -1.0 (uncertain) is treated
) -> str:
    """
    Build the unified MIMIC-CXR instruction JSON.

    report_mode mirrors iu_xray_builder:
      "split"         → findings + impression samples; BOTH carry the CheXpert
                        "Predicted Findings: ..." string in structured_findings
                        (RaDialog: image + 14 labels → text).
      "merged"        → one task=report sample whose target combines findings
                        and impression; carries the CheXpert string.
      "split_cascade" → findings sample carries the CheXpert string; the
                        impression sample instead carries "Findings: <GT
                        findings>" as context (findings→impression). Same
                        convention as the IU builder.

    image_mode mirrors iu_xray_builder (all_views_split / frontal_only_split /
    multi_image_merged). NOTE: frontal_only_split here keeps the FIRST image
    of the study (by sorted file name) — this MIMIC layout has no metadata.csv
    to read ViewPosition from. Swap in a ViewPosition lookup if you add that
    CSV.

    Directories and files are visited in sorted order so that repeated builds
    over the same tree produce the same JSON.

    Args:
        mimic_root:       Folder containing train/ valid/ test/ subdirs.
        output_path:      Destination JSON file (parent dirs are created).
        chexpert_csv:     Explicit label CSV; None → auto-discover under
                          mimic_root. If no CSV is available,
                          structured_findings is None and a warning is printed.
        vqa_root:         Optional folder with {train,valid,test}.json VQA
                          pairs; rows whose image is not indexed are dropped.
        uncertain_policy: Forwarded to the CheXpert CSV loader.

    Returns:
        The output path as a string (as given; it is not resolved).

    Raises:
        AssertionError:    invalid report_mode / image_mode.
        FileNotFoundError: none of train/valid/test exists under mimic_root.
    """
    assert report_mode in ("split", "merged", "split_cascade"), \
        f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
    assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
        f"image_mode invalid: {image_mode!r}"

    from .dataset import format_merged_report  # local import to avoid cycle

    mimic_root = Path(mimic_root)
    output_path = Path(output_path)

    # split dir name → split label written into the JSON
    split_dirs = {
        "train": "train",
        "valid": "validate",
        "test": "test",
    }
    present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
    if not present:
        raise FileNotFoundError(
            f"No train/valid/test subdirs under {mimic_root}. "
            f"Expected the pre-split MIMIC-CXR layout."
        )

    # ── CheXpert labels ───────────────────────────────────────────────────
    csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
    if csv_path is not None:
        chexpert_map = _load_chexpert_map(csv_path, uncertain_policy)
        print(f"[mimic_cxr_builder] CheXpert CSV: {csv_path} "
              f"({len(chexpert_map):,} studies, uncertain={uncertain_policy})")
    else:
        chexpert_map = {}
        if chexpert_csv:
            # An explicit path disables auto-discovery, so say what actually
            # went wrong instead of claiming nothing was passed.
            print(f"[mimic_cxr_builder] WARNING: chexpert_csv={chexpert_csv!r} "
                  "is not an existing file. "
                  "structured_findings will be null (RaDialog abnormality "
                  "guidance DISABLED). Fix the path to enable it.")
        else:
            print("[mimic_cxr_builder] WARNING: no *chexpert*.csv found under "
                  f"{mimic_root} and none passed via --chexpert_csv. "
                  "structured_findings will be null (RaDialog abnormality "
                  "guidance DISABLED). Add the CSV to enable it.")

    # ── Pass 1: index studies ─────────────────────────────────────────────
    samples: List[Dict] = []
    image_index: Dict[str, str] = {}  # "<split>/pNN/pX/sY/img.jpg" → split label
    tail_index: Dict[str, str] = {}   # "pNN/pX/sY/img.jpg" → first indexed path with that tail
    n_studies = n_missing_report = n_no_chexpert = 0
    skipped_merged_no_impression = skipped_cascade_no_findings = 0

    def _image_groups(rels: List[str]) -> List[Dict]:
        """Return path_fields dicts honouring image_mode (same rules as IU)."""
        if image_mode == "all_views_split":
            return [{"image_path": r, "image_paths": None} for r in rels]
        if image_mode == "frontal_only_split":
            return [{"image_path": rels[0], "image_paths": None}]
        return [{"image_path": None, "image_paths": rels}]  # multi_image_merged

    def _emit(base: Dict, task: str, target: str, structured_findings: Optional[str]) -> None:
        samples.append({**base, "task": task, "target": target,
                        "structured_findings": structured_findings})

    for split_sub, split_dir in present.items():
        split_label = split_dirs[split_sub]
        for p_dir in sorted(split_dir.glob("p*")):
            for pat_dir in sorted(p_dir.glob("p*")):
                for study_dir in sorted(pat_dir.glob("s*")):
                    imgs = sorted(study_dir.glob("*.jpg"))
                    if not imgs:
                        continue
                    n_studies += 1
                    subj, study = pat_dir.name, study_dir.name
                    txts = sorted(study_dir.glob("*.txt"))
                    if not txts:
                        n_missing_report += 1
                        continue
                    findings, impression = _parse_report(txts[0])
                    # CSV ids are stored bare, directory names carry p/s.
                    structured = chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
                    if structured is None:
                        n_no_chexpert += 1

                    # Only studies that have a report are indexed, so VQA rows
                    # can only attach to images that also produced samples.
                    rels = []
                    for img in imgs:
                        tail = f"{p_dir.name}/{subj}/{study}/{img.name}"
                        rel = f"{split_sub}/{tail}"
                        image_index[rel] = split_label
                        tail_index.setdefault(tail, rel)
                        rels.append(rel)

                    for path_fields in _image_groups(rels):
                        base = {
                            **path_fields,
                            "question": None,
                            "split": split_label,
                            "study_id": study,
                            "subject_id": subj,
                        }
                        if report_mode == "merged":
                            target = format_merged_report(findings, impression)
                            if target is None:
                                skipped_merged_no_impression += 1
                                continue
                            _emit(base, "report", target, structured)
                        elif report_mode == "split_cascade":
                            if findings:
                                _emit(base, "findings", findings, structured)
                            if impression:
                                if not findings:
                                    # impression needs GT findings as context
                                    skipped_cascade_no_findings += 1
                                else:
                                    _emit(base, "impression", impression,
                                          f"Findings: {findings}")
                        else:  # "split"
                            if findings:
                                _emit(base, "findings", findings, structured)
                            if impression:
                                _emit(base, "impression", impression, structured)

    # ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
    n_vqa = n_vqa_dropped = 0
    if vqa_root:
        vqa_root = Path(vqa_root)
        for fname, split_label in (("train", "train"),
                                   ("valid", "validate"),
                                   ("test", "test")):
            vqa_file = vqa_root / f"{fname}.json"
            if not vqa_file.is_file():
                continue
            with open(vqa_file, encoding="utf-8") as fh:
                rows = json.load(fh)
            for row in rows:
                sub_rel = str(row["image_path"]).lstrip("/")
                if sub_rel.startswith("files/"):
                    sub_rel = sub_rel[len("files/"):]

                n_sep = sub_rel.count("/")
                if not sub_rel:
                    hit = None  # an empty path would suffix-match every image
                elif n_sep == 3:
                    # "pNN/pX/sY/img.jpg": O(1) lookup instead of scanning
                    # every indexed image for every VQA row.
                    hit = tail_index.get(sub_rel)
                elif n_sep == 4:
                    hit = sub_rel if sub_rel in image_index else None
                else:
                    # Unusual shape (e.g. bare file name): fall back to
                    # matching any indexed image whose tail equals sub_rel.
                    hit = next((k for k in image_index if k.endswith(sub_rel)), None)

                if hit is None:
                    n_vqa_dropped += 1
                    continue
                ans = row.get("answer", [])
                answer = (", ".join(map(str, ans)) if isinstance(ans, list)
                          else str(ans)) or "No."
                # NOTE(review): the split label comes from the VQA file name,
                # not from the split the matched image was indexed under.
                samples.append({
                    "image_path": hit, "image_paths": None,
                    "task": "vqa", "target": answer,
                    "question": row["question"],
                    "structured_findings": None,
                    "split": split_label,
                    "study_id": row.get("study_id"),
                    "subject_id": row.get("subject_id"),
                })
                n_vqa += 1

    # ── Write ─────────────────────────────────────────────────────────────
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False)

    by_split, by_task = {}, {}
    for s in samples:
        by_split[s["split"]] = by_split.get(s["split"], 0) + 1
        by_task[s["task"]] = by_task.get(s["task"], 0) + 1

    print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
    print(f" report_mode : {report_mode}")
    print(f" image_mode : {image_mode}")
    print(f" studies indexed : {n_studies:,}")
    print(f" missing report : {n_missing_report:,}")
    print(f" studies w/o chexpert label : {n_no_chexpert:,}")
    if report_mode == "merged":
        print(f" skipped no_impr : {skipped_merged_no_impression:,}")
    if report_mode == "split_cascade":
        print(f" skipped impr w/o findings : {skipped_cascade_no_findings:,}")
    if vqa_root:
        print(f" vqa added/dropped: {n_vqa:,} / {n_vqa_dropped:,}")
    print(f" by split : {by_split}")
    print(f" by task : {by_task}")
    return str(output_path)
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
# ─── CLI ────────────────────────────────────────────────────────────────────
|
| 360 |
+
|
| 361 |
+
def _parse_args():
|
| 362 |
+
p = argparse.ArgumentParser(description="Build MIMIC-CXR unified instruction JSON")
|
| 363 |
+
p.add_argument("--mimic_root", required=True,
|
| 364 |
+
help="Folder containing train/ valid/ test/ subdirs")
|
| 365 |
+
p.add_argument("--output", required=True, help="Output JSON path")
|
| 366 |
+
p.add_argument("--chexpert_csv", default=None,
|
| 367 |
+
help="Path to mimic-cxr-2.0.0-chexpert.csv "
|
| 368 |
+
"(auto-discovered under --mimic_root if omitted)")
|
| 369 |
+
p.add_argument("--vqa_root", default=None,
|
| 370 |
+
help="Folder with {train,valid,test}.json VQA pairs (optional)")
|
| 371 |
+
p.add_argument("--report_mode", default="split",
|
| 372 |
+
choices=["split", "merged", "split_cascade"])
|
| 373 |
+
p.add_argument("--image_mode", default="all_views_split",
|
| 374 |
+
choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
|
| 375 |
+
p.add_argument("--uncertain_policy", default="ignore",
|
| 376 |
+
choices=["ignore", "positive"],
|
| 377 |
+
help="CheXpert -1.0 (uncertain): ignore (default) or treat as positive.")
|
| 378 |
+
return p.parse_args()
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
if __name__ == "__main__":
    # Script entry point: parse the CLI flags and forward them to the builder.
    cli = _parse_args()
    builder_kwargs = {
        "mimic_root": cli.mimic_root,
        # The CLI flag is --output; the builder's keyword is output_path.
        "output_path": cli.output,
        "chexpert_csv": cli.chexpert_csv,
        "vqa_root": cli.vqa_root,
        "report_mode": cli.report_mode,
        "image_mode": cli.image_mode,
        "uncertain_policy": cli.uncertain_policy,
    }
    build_mimic_cxr_instruct_json(**builder_kwargs)
|
|
@@ -36,7 +36,7 @@ class DatasetSpec:
|
|
| 36 |
instruct_json: str # passed to CXRInstructDataset
|
| 37 |
tasks: List[str] # which tasks exist in this dataset
|
| 38 |
task_weights: Dict[str, float] # normalized over `tasks`
|
| 39 |
-
report_mode: str = "split" # "split" | "merged"
|
| 40 |
image_mode: str = "all_views_split" # "all_views_split" | "frontal_only_split" | "multi_image_merged"
|
| 41 |
max_images: int = 1 # >1 only when image_mode == multi_image_merged
|
| 42 |
|
|
@@ -50,16 +50,20 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
|
|
| 50 |
missing and `iu_xray.auto_build == true`.
|
| 51 |
|
| 52 |
The choice of which tasks are "available" depends on `data.report_mode`:
|
| 53 |
-
"split"
|
| 54 |
-
"merged"
|
|
|
|
|
|
|
|
|
|
| 55 |
"""
|
| 56 |
name = _get(train_cfg.data, "dataset_name", "MIMIC-CXR")
|
| 57 |
report_mode = _get(train_cfg.data, "report_mode", "split")
|
| 58 |
image_mode = _get(train_cfg.data, "image_mode", "all_views_split")
|
| 59 |
max_images = int(_get(train_cfg.data, "max_images_per_sample", 2))
|
| 60 |
-
if report_mode not in ("split", "merged"):
|
| 61 |
raise ValueError(
|
| 62 |
-
f"data.report_mode must be 'split'
|
|
|
|
| 63 |
)
|
| 64 |
if image_mode not in ("all_views_split", "frontal_only_split", "multi_image_merged"):
|
| 65 |
raise ValueError(
|
|
@@ -105,7 +109,9 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
|
|
| 105 |
else:
|
| 106 |
available = ["findings", "impression", "vqa"]
|
| 107 |
image_root = train_cfg.data.mimic_cxr_root
|
| 108 |
-
instruct_json =
|
|
|
|
|
|
|
| 109 |
|
| 110 |
else: # IU-Xray
|
| 111 |
# IU has no VQA.
|
|
@@ -184,6 +190,50 @@ def _ensure_iu_json_exists(iu_cfg,
|
|
| 184 |
return str(out)
|
| 185 |
|
| 186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
# ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
|
| 188 |
|
| 189 |
def resolve_run_id(
|
|
|
|
| 36 |
instruct_json: str # passed to CXRInstructDataset
|
| 37 |
tasks: List[str] # which tasks exist in this dataset
|
| 38 |
task_weights: Dict[str, float] # normalized over `tasks`
|
| 39 |
+
report_mode: str = "split" # "split" | "merged" | "split_cascade"
|
| 40 |
image_mode: str = "all_views_split" # "all_views_split" | "frontal_only_split" | "multi_image_merged"
|
| 41 |
max_images: int = 1 # >1 only when image_mode == multi_image_merged
|
| 42 |
|
|
|
|
| 50 |
missing and `iu_xray.auto_build == true`.
|
| 51 |
|
| 52 |
The choice of which tasks are "available" depends on `data.report_mode`:
|
| 53 |
+
"split" → findings, impression (+ vqa for MIMIC)
|
| 54 |
+
"merged" → report (+ vqa for MIMIC)
|
| 55 |
+
"split_cascade" → findings, impression (+ vqa for MIMIC); same task set
|
| 56 |
+
and weights as "split" — only the data builder differs
|
| 57 |
+
(impression sample carries GT findings as context).
|
| 58 |
"""
|
| 59 |
name = _get(train_cfg.data, "dataset_name", "MIMIC-CXR")
|
| 60 |
report_mode = _get(train_cfg.data, "report_mode", "split")
|
| 61 |
image_mode = _get(train_cfg.data, "image_mode", "all_views_split")
|
| 62 |
max_images = int(_get(train_cfg.data, "max_images_per_sample", 2))
|
| 63 |
+
if report_mode not in ("split", "merged", "split_cascade"):
|
| 64 |
raise ValueError(
|
| 65 |
+
f"data.report_mode must be 'split', 'merged', or 'split_cascade', "
|
| 66 |
+
f"got {report_mode!r}"
|
| 67 |
)
|
| 68 |
if image_mode not in ("all_views_split", "frontal_only_split", "multi_image_merged"):
|
| 69 |
raise ValueError(
|
|
|
|
| 109 |
else:
|
| 110 |
available = ["findings", "impression", "vqa"]
|
| 111 |
image_root = train_cfg.data.mimic_cxr_root
|
| 112 |
+
instruct_json = _ensure_mimic_json_exists(
|
| 113 |
+
train_cfg.data, report_mode, image_mode
|
| 114 |
+
)
|
| 115 |
|
| 116 |
else: # IU-Xray
|
| 117 |
# IU has no VQA.
|
|
|
|
| 190 |
return str(out)
|
| 191 |
|
| 192 |
|
| 193 |
+
def _ensure_mimic_json_exists(data_cfg,
                              report_mode: str = "split",
                              image_mode: str = "all_views_split") -> str:
    """
    Return the path of the MIMIC-CXR unified JSON, building it if missing.

    The configured `data.instruct_json` path is suffixed with both
    report_mode and image_mode (mimic_..._instruct__split__all_views_split.json)
    so each report_mode / image_mode combination gets its own cache file.

    NOTE: only report_mode and image_mode are encoded in the cache file name.
    Changing `data.mimic_uncertain_policy`, `data.mimic_chexpert_csv` or
    `data.mimic_vqa_root` does NOT invalidate an existing file; delete the
    cached JSON to force a rebuild with the new settings.

    Auto-build is on by default. Set `data.mimic_auto_build: false` to
    require a pre-built file instead.

    Args:
        data_cfg: the `data` section of the training config (read via `_get`).
        report_mode: report mode forwarded to the builder and used in the
            cache file name.
        image_mode: image mode forwarded to the builder and used in the
            cache file name.

    Returns:
        The mode-suffixed JSON path as a string.

    Raises:
        FileNotFoundError: the JSON is missing and auto-build is disabled.
        ValueError: the JSON must be built but `data.mimic_cxr_root` is unset.
    """
    base = Path(_get(data_cfg, "instruct_json",
                     "data/data_files/mimic_cxr_instruct_unified.json"))
    out = base.with_name(f"{base.stem}__{report_mode}__{image_mode}{base.suffix}")
    if out.is_file():
        # Cache hit: nothing to build, so the dataset root is not needed.
        return str(out)

    mimic_root = _get(data_cfg, "mimic_cxr_root")

    if not bool(_get(data_cfg, "mimic_auto_build", True)):
        raise FileNotFoundError(
            f"MIMIC instruct JSON not found at {out} and "
            f"data.mimic_auto_build=false. Run: python -m data.mimic_cxr_builder "
            f"--mimic_root {mimic_root} --output {out} "
            f"--report_mode {report_mode} --image_mode {image_mode}"
        )

    # Fail fast instead of handing the literal string "None" (or an empty
    # path) to the builder as the dataset root.
    if mimic_root is None or not str(mimic_root).strip():
        raise ValueError(
            f"data.mimic_cxr_root must be set to auto-build the MIMIC instruct "
            f"JSON at {out} (got {mimic_root!r})"
        )

    # Imported lazily so the builder is only loaded when a build is needed.
    from data.mimic_cxr_builder import build_mimic_cxr_instruct_json
    print(f"[dataset_resolver] MIMIC JSON not found → auto-building "
          f"(report_mode={report_mode}, image_mode={image_mode}) …")
    build_mimic_cxr_instruct_json(
        mimic_root       = str(mimic_root),
        output_path      = str(out),
        chexpert_csv     = _get(data_cfg, "mimic_chexpert_csv"),
        vqa_root         = _get(data_cfg, "mimic_vqa_root"),
        report_mode      = report_mode,
        image_mode       = image_mode,
        uncertain_policy = str(_get(data_cfg, "mimic_uncertain_policy", "ignore")),
    )
    return str(out)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
# ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
|
| 238 |
|
| 239 |
def resolve_run_id(
|