convitom committed on
Commit
b961b41
·
1 Parent(s): 0a99045
configs/train_config.yaml CHANGED
@@ -5,7 +5,15 @@
5
  # ── Data ─────────────────────────────────────
6
  data:
7
  # Pick which dataset to train on.
8
- # Supported: "MIMIC-CXR" (all 3 tasks) | "IU-Xray" (findings + impression only)
 
 
 
 
 
 
 
 
9
  dataset_name: "IU-Xray"
10
 
11
  # How findings and impression are turned into training samples.
@@ -74,6 +82,36 @@ data:
74
  # pre-built file (built via `python -m data.mimic_cxr_builder ...`).
75
  mimic_auto_build: true
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
78
  # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
79
  # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
@@ -102,16 +140,17 @@ data:
102
  tasks:
103
  findings_generation:
104
  enabled: true
105
- weight: 0.4 # used when report_mode = split
106
  impression_generation:
107
  enabled: true
108
- weight: 0.2 # used when report_mode = split
 
109
  report_generation:
110
  enabled: true
111
  weight: 0.6 # used when report_mode = merged
112
  vqa:
113
  enabled: true
114
- weight: 0.4
115
 
116
  # ── Training ─────────────────────────────────
117
  training:
 
5
  # ── Data ─────────────────────────────────────
6
  data:
7
  # Pick which dataset to train on.
8
+ # Supported:
9
+ # "MIMIC-CXR" — pre-split layout {root}/{train,valid,test}/pXX/...
10
+ # all 3 tasks (findings, impression, vqa)
11
+ # "MIMIC-CXR_resized" — same data filtered+resized to tar shards on HF
12
+ # (hieu3636/cxr-vlm-data/MIMIC-CXR_resized/). After
13
+ # extraction the layout matches the raw PhysioNet
14
+ # tree ({root}/files/pXX/pXXXX/sYYYY/*.jpg); splits
15
+ # come from manifest_{train,val,test}.csv (NOT mimic-cxr-2.0.0-split.csv). All 3 tasks.
16
+ # "IU-Xray" — findings + impression only (no VQA)
17
  dataset_name: "IU-Xray"
18
 
19
  # How findings and impression are turned into training samples.
 
82
  # pre-built file (built via `python -m data.mimic_cxr_builder ...`).
83
  mimic_auto_build: true
84
 
85
+ # --- MIMIC-CXR_resized paths (used when dataset_name == "MIMIC-CXR_resized") ---
86
+ # Filtered + resized subset of MIMIC-CXR distributed via HF as tar shards
87
+ # (hieu3636/cxr-vlm-data/MIMIC-CXR_resized/) + a "subset_bundle" with the
88
+ # manifest CSVs and VQA JSON files. This dataset is MANIFEST-DRIVEN:
89
+ #
90
+ # manifest_{train,val,test}.csv — one row per image. Contains the split
91
+ # label, image_relpath, report_relpath, has_vqa, and 14 chex_*
92
+ # columns (the CheXpert labels). The val/test pool was redistributed
93
+ # from the original train split (subset is small), so the official
94
+ # PhysioNet mimic-cxr-2.0.0-split.csv is NOT used.
95
+ # vqa/{vqa.json, vqa_val.json, vqa_test.json} — VQA pairs filtered to
96
+ # only the images present in this resized subset.
97
+ #
98
+ # After extracting the tar shards, the on-disk layout (under `root`) is:
99
+ # {root}/files/pXX/pXXXXXXXX/sYYYYYYYY/<dicom>.jpg
100
+ # {root}/files/pXX/pXXXXXXXX/sYYYYYYYY.txt (reports alongside)
101
+ mimic_cxr_resized:
102
+ root: "D:/USTH/KLTN/subset_bundle" # extracted-tar root (parent of files/)
103
+ manifest_dir: null # null → same as `root`. Folder containing
104
+ # manifest_{train,val,test}.csv.
105
+ vqa_dir: null # null → use `{root}/vqa`. Folder containing
106
+ # vqa.json / vqa_val.json / vqa_test.json. Set
107
+ # to "" to disable VQA.
108
+ reports_root: null # null → auto-probe `{root}` then `{root}/reports`.
109
+ # Set explicitly if reports live somewhere else
110
+ # (e.g. when reports are bundled inside tars vs.
111
+ # a sibling `reports/` dir like subset_bundle/).
112
+ instruct_json: "data/data_files/mimic_cxr_resized_instruct.json"
113
+ auto_build: true # build JSON automatically if missing
114
+
115
  # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
116
  # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
117
  # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
 
140
  tasks:
141
  findings_generation:
142
  enabled: true
143
+ weight: 0.30 # used when report_mode = split
144
  impression_generation:
145
  enabled: true
146
+ weight: 0.20 # used when report_mode = split (lower:
147
+ # impression is conditioned on findings)
148
  report_generation:
149
  enabled: true
150
  weight: 0.6 # used when report_mode = merged
151
  vqa:
152
  enabled: true
153
+ weight: 0.50 # boosted so VQA ≈ RRG (findings+impression)
154
 
155
  # ── Training ─────────────────────────────────
156
  training:
data/count_img.py → count_img.py RENAMED
@@ -1,76 +1,76 @@
1
- import os
2
- import json
3
-
4
- def get_local_images(root_dir):
5
- """
6
- Lấy toàn bộ đường dẫn ảnh local (dạng pxx/...)
7
- """
8
- local_images = set()
9
-
10
- for p_folder in os.listdir(root_dir):
11
- if not p_folder.startswith("p1"): # chỉ p10 -> p19
12
- continue
13
-
14
- p_path = os.path.join(root_dir, p_folder)
15
-
16
- for root, _, files in os.walk(p_path):
17
- for file in files:
18
- if file.endswith(".jpg"):
19
- full_path = os.path.join(root, file)
20
-
21
- # convert về dạng giống VQA: p10/.../xxx.jpg
22
- rel_path = os.path.relpath(full_path, root_dir)
23
- rel_path = rel_path.replace("\\", "/")
24
-
25
- local_images.add(rel_path)
26
-
27
- return local_images
28
-
29
-
30
- def get_vqa_images(vqa_json_path):
31
- """
32
- Lấy toàn bộ image_path từ file VQA json
33
- """
34
- with open(vqa_json_path, "r", encoding="utf-8") as f:
35
- data = json.load(f)
36
-
37
- vqa_images = set()
38
-
39
- for item in data:
40
- if "image_path" in item:
41
- vqa_images.add(item["image_path"])
42
-
43
- return vqa_images
44
-
45
-
46
- def main(root_dir, vqa_json_path):
47
- print("Đang quét ảnh local...")
48
- local_images = get_local_images(root_dir)
49
- print(f"Số ảnh local: {len(local_images)}")
50
-
51
- print("Đang đọc VQA json...")
52
- vqa_images = get_vqa_images(vqa_json_path)
53
- print(f"Số ảnh trong VQA: {len(vqa_images)}")
54
-
55
- # intersection
56
- matched = local_images & vqa_images
57
-
58
- print("\n===== KẾT QUẢ =====")
59
- print(f"Số ảnh trùng: {len(matched)}")
60
- print(f"Tỷ lệ cover VQA: {len(matched) / len(vqa_images):.4f}")
61
-
62
- # nếu muốn lưu danh sách
63
- with open("matched_images.txt", "w") as f:
64
- for path in matched:
65
- f.write(path + "\n")
66
-
67
- print("Đã lưu danh sách vào matched_images.txt")
68
-
69
-
70
- if __name__ == "__main__":
71
- x = "train"
72
- y = "valid"
73
- root_dir = r"D:\USTH\KLTN\data\{x}".format(x=x) # ví dụ: D:/mimic-cxr
74
- vqa_json = r"D:\USTH\KLTN\data\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\MIMIC-Ext-MIMIC-CXR-VQA\dataset\{y}.json".format(y=y) # ví dụ: D:/vqa/train.json
75
-
76
  main(root_dir, vqa_json)
 
1
+ import os
2
+ import json
3
+
4
def get_local_images(root_dir):
    """
    Collect the relative paths of all local ``.jpg`` images (``pXX/...`` form).

    Only top-level entries of ``root_dir`` whose name starts with ``"p1"``
    (i.e. p10 .. p19) are scanned; each is walked recursively.

    Args:
        root_dir: Directory that contains the ``pXX`` folders.

    Returns:
        A set of paths relative to ``root_dir`` using forward slashes,
        e.g. ``"p10/.../xxx.jpg"``, so they can be compared with the
        ``image_path`` values stored in the VQA JSON.
    """
    found = set()

    for entry in os.listdir(root_dir):
        # Restrict the scan to the p10 -> p19 folders.
        if entry.startswith("p1"):
            top = os.path.join(root_dir, entry)
            for dirpath, _dirnames, filenames in os.walk(top):
                for name in filenames:
                    if not name.endswith(".jpg"):
                        continue
                    # Normalise to the VQA-style form: p10/.../xxx.jpg
                    rel = os.path.relpath(os.path.join(dirpath, name), root_dir)
                    found.add(rel.replace("\\", "/"))

    return found
28
+
29
+
30
def get_vqa_images(vqa_json_path):
    """
    Collect every ``image_path`` value found in a VQA JSON file.

    Args:
        vqa_json_path: Path to a UTF-8 JSON file whose top level is an
            iterable of records; records without an ``"image_path"`` key
            are ignored.

    Returns:
        A set with the distinct ``image_path`` values.
    """
    with open(vqa_json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    return {rec["image_path"] for rec in records if "image_path" in rec}
44
+
45
+
46
def main(root_dir, vqa_json_path):
    """
    Report how many images referenced by a VQA JSON exist under ``root_dir``.

    Prints the local image count, the VQA image count, the size of their
    intersection and the fraction of the VQA set that is covered, then
    writes the matched relative paths (one per line, sorted) to
    ``matched_images.txt`` in the current working directory.

    Args:
        root_dir: Directory scanned by ``get_local_images``.
        vqa_json_path: VQA JSON file read by ``get_vqa_images``.
    """
    print("Đang quét ảnh local...")
    local_images = get_local_images(root_dir)
    print(f"Số ảnh local: {len(local_images)}")

    print("Đang đọc VQA json...")
    vqa_images = get_vqa_images(vqa_json_path)
    print(f"Số ảnh trong VQA: {len(vqa_images)}")

    # Images that exist locally AND are referenced by the VQA file.
    matched = local_images & vqa_images

    # An empty VQA set would otherwise raise ZeroDivisionError.
    coverage = len(matched) / len(vqa_images) if vqa_images else 0.0

    print("\n===== KẾT QUẢ =====")
    print(f"Số ảnh trùng: {len(matched)}")
    print(f"Tỷ lệ cover VQA: {coverage:.4f}")

    # Persist the list; sorted so repeated runs produce identical files, and
    # with an explicit encoding so the output does not depend on the locale.
    with open("matched_images.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{path}\n" for path in sorted(matched))

    print("Đã lưu danh sách vào matched_images.txt")
68
+
69
+
70
if __name__ == "__main__":
    # Split folder scanned for images and split name of the VQA JSON.
    # NOTE(review): the image split is "train" while the VQA split is
    # "valid" — confirm this cross-split comparison is intended.
    image_split = "train"
    vqa_split = "valid"

    # e.g. D:/mimic-cxr
    root_dir = r"D:\USTH\KLTN\data\{x}".format(x=image_split)
    # e.g. D:/vqa/train.json
    vqa_json = r"D:\USTH\KLTN\data\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\MIMIC-Ext-MIMIC-CXR-VQA\dataset\{y}.json".format(y=vqa_split)

    main(root_dir, vqa_json)
data/dataset.py CHANGED
@@ -161,6 +161,53 @@ class CXRInstructDataset(Dataset):
161
  def __len__(self) -> int:
162
  return len(self.samples)
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
165
  sample = self.samples[idx]
166
 
 
161
  def __len__(self) -> int:
162
  return len(self.samples)
163
 
164
def get_per_sample_weights(self) -> Optional[List[float]]:
    """
    Build per-sample weights for `torch.utils.data.WeightedRandomSampler`
    so that, in expectation, each task occupies its configured share of
    drawn training samples — regardless of how many samples of each task
    exist in the JSON.

    Math:
        For task t with N_t samples and configured weight w_t, every
        sample of t gets the weight `w_t / N_t`, so the total weight of
        task t is `N_t * (w_t / N_t) = w_t`. The sampler normalises the
        weights, so the probability of drawing a sample of task t is
        `w_t / sum(w_u for tasks u present in the data)`. This equals
        `w_t` only when the weights of the present tasks sum to 1.

    Tasks with weight 0 (e.g. VQA on IU-Xray) get weight 0 → never drawn.
    Tasks present in the data but absent from `self.task_weights` also get
    weight 0; a warning naming them is printed so the misconfiguration is
    visible instead of silently dropping those samples.

    Returns:
        list of floats of length len(self.samples), or None if this is a
        single-task dataset (`self.task != "mixed"`) — in that case every
        sample is the same task, so weighted sampling is unnecessary.
    """
    if self.task != "mixed":
        return None

    # Count samples per task that actually appear in this dataset.
    counts: Dict[str, int] = {}
    for s in self.samples:
        counts[s["task"]] = counts.get(s["task"], 0) + 1

    # Make the "task present but not configured" case loud.
    unconfigured = sorted(t for t in counts if t not in self.task_weights)
    if unconfigured:
        print(f"[CXRInstructDataset] WARNING: tasks present in the data but "
              f"missing from task_weights get weight 0 and are never drawn: "
              f"{unconfigured}")

    # Per-sample weight = w_task / N_task. Tasks not in task_weights → 0.
    weights = [
        float(self.task_weights.get(s["task"], 0.0)) / counts[s["task"]]
        for s in self.samples
    ]

    # Effective (normalised) per-task probabilities, printed once so the
    # actual mix during training is visible in logs.
    eff = {t: float(self.task_weights.get(t, 0.0)) for t in counts}
    total = sum(eff.values())
    if counts and total <= 0:
        print("[CXRInstructDataset] WARNING: all per-sample weights are 0; "
              "a weighted sampler cannot draw from this dataset.")
    eff_sum = total or 1.0  # avoid dividing by zero in the log line
    eff = {t: round(v / eff_sum, 4) for t, v in eff.items()}
    print(f"[CXRInstructDataset] WeightedRandomSampler effective task mix: "
          f"{eff} (counts: {counts})")
    return weights
210
+
211
  def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
212
  sample = self.samples[idx]
213
 
data/mimic_cxr_builder.py CHANGED
@@ -94,6 +94,45 @@ def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optiona
94
  return None
95
 
96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def _load_chexpert_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
98
  """
99
  Return {(subject_id, study_id): <PNU string>} where the ids are the bare
@@ -150,6 +189,9 @@ def build_mimic_cxr_instruct_json(
150
  vqa_root: Optional[str] = None,
151
  report_mode: str = "split", # "split" | "merged" | "split_cascade"
152
  image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
 
 
 
153
  ) -> str:
154
  """
155
  Build the unified MIMIC-CXR instruction JSON.
@@ -170,30 +212,70 @@ def build_mimic_cxr_instruct_json(
170
  of the study — this MIMIC layout has no metadata.csv to read ViewPosition
171
  from. Swap in a ViewPosition lookup if you add that CSV.
172
 
 
 
 
 
 
 
 
 
 
 
173
  Returns the absolute output path.
174
  """
175
  assert report_mode in ("split", "merged", "split_cascade"), \
176
  f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
177
  assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
178
  f"image_mode invalid: {image_mode!r}"
 
 
179
 
180
  from .dataset import format_merged_report # local import to avoid cycle
181
 
182
  mimic_root = Path(mimic_root)
183
  output_path = Path(output_path)
184
 
185
- # split dir name split label written into the JSON
186
- split_dirs = {
187
- "train": "train",
188
- "valid": "validate",
189
- "test": "test",
190
- }
191
- present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
192
- if not present:
193
- raise FileNotFoundError(
194
- f"No train/valid/test subdirs under {mimic_root}. "
195
- f"Expected the pre-split MIMIC-CXR layout."
196
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
  # ── CheXpert labels ───────────────────────────────────────────────────
199
  csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
@@ -211,21 +293,15 @@ def build_mimic_cxr_instruct_json(
211
  # ── Pass 1: index studies ─────────────────────────────────────────────
212
  samples: List[Dict] = []
213
  # sub_rel ("pXX/pXXXX/sYYYY/img.jpg") → full stored image_path
214
- # ("{split}/pXX/pXXXX/sYYYY/img.jpg"). O(1) VQA lookup.
 
215
  image_index: Dict[str, str] = {}
216
- n_studies = n_missing_report = n_no_chexpert = 0
217
  skipped_merged_no_impression = skipped_cascade_no_findings = 0
218
 
219
  def _structured_for(subj: str, study: str) -> Optional[str]:
220
  return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
221
 
222
- def _rels_for(study_dir: Path, split_sub: str, subj: str, study: str) -> List[str]:
223
- """Split-prefixed relative image paths for one study, sorted."""
224
- return [
225
- f"{split_sub}/{im.parent.parent.parent.name}/{subj}/{study}/{im.name}"
226
- for im in sorted(study_dir.glob("*.jpg"))
227
- ]
228
-
229
  def _image_groups(rels: List[str]):
230
  """Yield path_fields dicts honouring image_mode (same rules as IU)."""
231
  if image_mode == "all_views_split":
@@ -236,67 +312,120 @@ def build_mimic_cxr_instruct_json(
236
  else: # multi_image_merged
237
  yield {"image_path": None, "image_paths": rels}
238
 
239
- for split_sub, split_dir in present.items():
240
- for p_dir in sorted(split_dir.glob("p*")):
241
- for pat_dir in p_dir.glob("p*"):
242
- for study_dir in pat_dir.glob("s*"):
243
- subj, study = pat_dir.name, study_dir.name
244
- rels = _rels_for(study_dir, split_sub, subj, study)
245
- if not rels:
246
- continue
247
- n_studies += 1
248
- # Index EVERY image up front — a VQA row may reference a
249
- # study that has images but no findings/impression report.
250
- for r in rels:
251
- image_index[r.split("/", 1)[1]] = r
252
- txts = list(study_dir.glob("*.txt"))
253
- if not txts:
254
- n_missing_report += 1
255
- continue
256
- findings, impression = _parse_report(txts[0])
257
- structured = _structured_for(subj, study)
258
- if structured is None:
259
- n_no_chexpert += 1
260
- split_label = split_dirs[split_sub]
261
-
262
- for path_fields in _image_groups(rels):
263
- base = {
264
- **path_fields,
265
- "question": None,
266
- "split": split_label,
267
- "study_id": study,
268
- "subject_id": subj,
269
- }
270
- if report_mode == "merged":
271
- target = format_merged_report(findings, impression)
272
- if target is None:
273
- skipped_merged_no_impression += 1
274
  continue
275
- samples.append({**base, "task": "report",
276
- "target": target,
277
- "structured_findings": structured})
278
- elif report_mode == "split_cascade":
279
- if findings:
280
- samples.append({**base, "task": "findings",
281
- "target": findings,
282
- "structured_findings": structured})
283
- if impression:
284
- if not findings:
285
- skipped_cascade_no_findings += 1
286
- else:
287
- samples.append({**base, "task": "impression",
288
- "target": impression,
289
- "structured_findings":
290
- f"Findings: {findings}"})
291
- else: # "split"
292
- if findings:
293
- samples.append({**base, "task": "findings",
294
- "target": findings,
295
- "structured_findings": structured})
296
- if impression:
297
- samples.append({**base, "task": "impression",
298
- "target": impression,
299
- "structured_findings": structured})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
 
301
  # ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
302
  n_vqa = n_vqa_dropped = 0
@@ -346,11 +475,14 @@ def build_mimic_cxr_instruct_json(
346
  by_task[s["task"]] = by_task.get(s["task"], 0) + 1
347
 
348
  print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
 
349
  print(f" report_mode : {report_mode}")
350
  print(f" image_mode : {image_mode}")
351
  print(f" studies indexed : {n_studies:,}")
352
  print(f" missing report : {n_missing_report:,}")
353
  print(f" studies w/o chexpert label : {n_no_chexpert:,}")
 
 
354
  if report_mode == "merged":
355
  print(f" skipped no_impr : {skipped_merged_no_impression:,}")
356
  if report_mode == "split_cascade":
@@ -378,6 +510,17 @@ def _parse_args():
378
  choices=["split", "merged", "split_cascade"])
379
  p.add_argument("--image_mode", default="all_views_split",
380
  choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
 
 
 
 
 
 
 
 
 
 
 
381
  return p.parse_args()
382
 
383
 
@@ -390,4 +533,7 @@ if __name__ == "__main__":
390
  vqa_root = a.vqa_root,
391
  report_mode = a.report_mode,
392
  image_mode = a.image_mode,
 
 
 
393
  )
 
94
  return None
95
 
96
 
97
def _discover_split_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
    """Find the split CSV used by the "files" layout.

    If `explicit` is given it is the only candidate: it is returned when it
    is an existing file, otherwise None (no fallback search is done).
    Without `explicit`, `mimic_root` is searched recursively for any
    `*split*.csv` (e.g. mimic-cxr-2.0.0-split.csv) and the first match in
    sorted order is returned, or None when nothing matches.
    """
    if explicit:
        candidate = Path(explicit)
        if candidate.is_file():
            return candidate
        return None

    pattern = str(mimic_root / "**" / "*split*.csv")
    matches = sorted(glob.glob(pattern, recursive=True))
    if not matches:
        return None
    return Path(matches[0])
105
+
106
+
107
def _load_split_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
    """
    Return {(subject_id, study_id): "train"|"validate"|"test"} read from a
    split CSV such as mimic-cxr-2.0.0-split.csv.

    IDs are stored without the p/s prefix and without any ".0" float
    suffix. Column names are matched case-insensitively. 'valid' is
    accepted as an alias for 'validate'; rows with any other split value,
    or with a missing cell, are skipped.

    Raises:
        ValueError: if the subject_id / study_id / split columns are absent.
    """
    out: Dict[Tuple[str, str], str] = {}
    # utf-8-sig drops a leading UTF-8 BOM (common in CSVs saved by Excel),
    # which would otherwise be glued to the first header name and make the
    # column lookup fail. It also removes the dependency on the platform
    # default encoding.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        col = {c.lower().strip(): c for c in reader.fieldnames or []}
        subj_c = col.get("subject_id")
        study_c = col.get("study_id")
        split_c = col.get("split")
        if not (subj_c and study_c and split_c):
            raise ValueError(
                f"{csv_path} missing subject_id/study_id/split columns "
                f"(have: {reader.fieldnames})"
            )
        for row in reader:
            raw_subj, raw_study, raw_split = row[subj_c], row[study_c], row[split_c]
            # Short rows come back with None cells; they carry no usable key.
            if raw_subj is None or raw_study is None or raw_split is None:
                continue
            subj = str(raw_subj).strip().lstrip("p").split(".")[0]
            study = str(raw_study).strip().lstrip("s").split(".")[0]
            sp = str(raw_split).strip().lower()
            if sp == "valid":
                sp = "validate"
            if sp in ("train", "validate", "test"):
                out[(subj, study)] = sp
    return out
134
+
135
+
136
  def _load_chexpert_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
137
  """
138
  Return {(subject_id, study_id): <PNU string>} where the ids are the bare
 
189
  vqa_root: Optional[str] = None,
190
  report_mode: str = "split", # "split" | "merged" | "split_cascade"
191
  image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
192
+ layout: str = "presplit", # "presplit" | "files"
193
+ split_csv: Optional[str] = None, # required for layout="files"
194
+ reports_root: Optional[str] = None, # for layout="files"; None → reports alongside images
195
  ) -> str:
196
  """
197
  Build the unified MIMIC-CXR instruction JSON.
 
212
  of the study — this MIMIC layout has no metadata.csv to read ViewPosition
213
  from. Swap in a ViewPosition lookup if you add that CSV.
214
 
215
+ layout selects which on-disk tree to walk:
216
+ "presplit" — {root}/{train,valid,test}/pXX/pXXXX/sYYYY/{*.jpg + *.txt}
217
+ The custom MIMIC-CXR.zip used by the notebook. Default.
218
+ "files" — {root}/files/pXX/pXXXX/sYYYY/*.jpg (raw PhysioNet tree).
219
+ Used by MIMIC-CXR_resized after extracting tar shards.
220
+ Requires `split_csv` (or auto-discovers *split*.csv) to
221
+ assign train/validate/test. Reports are read from
222
+ `reports_root` (separate tree, e.g. mimic-cxr-reports/)
223
+ or from the study dir if reports_root is None.
224
+
225
  Returns the absolute output path.
226
  """
227
  assert report_mode in ("split", "merged", "split_cascade"), \
228
  f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
229
  assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
230
  f"image_mode invalid: {image_mode!r}"
231
+ assert layout in ("presplit", "files"), \
232
+ f"layout must be 'presplit' or 'files', got {layout!r}"
233
 
234
  from .dataset import format_merged_report # local import to avoid cycle
235
 
236
  mimic_root = Path(mimic_root)
237
  output_path = Path(output_path)
238
 
239
+ # ── Locate study dirs + split assignment ────────────────────────────────
240
+ # Two layouts produce the same downstream shape: each entry is
241
+ # (study_dir, subject_dir_name, study_dir_name, split_label, image_rel_prefix)
242
+ # where image_rel_prefix is the leading path component used when building
243
+ # the JSON-stored relative image path. presplit prefixes with the split
244
+ # dir name ("train/..."), files prefixes with "files/...".
245
+ if layout == "presplit":
246
+ split_dirs = {"train": "train", "valid": "validate", "test": "test"}
247
+ present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
248
+ if not present:
249
+ raise FileNotFoundError(
250
+ f"No train/valid/test subdirs under {mimic_root}. "
251
+ f"Expected the pre-split MIMIC-CXR layout."
252
+ )
253
+ split_map = None # not needed — split comes from dir name
254
+ else: # "files"
255
+ files_dir = mimic_root / "files"
256
+ if not files_dir.is_dir():
257
+ raise FileNotFoundError(
258
+ f"Expected {files_dir} for layout='files'. After extracting "
259
+ f"the MIMIC-CXR_resized tars the layout should be "
260
+ f"{{root}}/files/pXX/pXXXX/sYYYY/*.jpg."
261
+ )
262
+ sp_path = _discover_split_csv(mimic_root, split_csv)
263
+ if sp_path is None:
264
+ raise FileNotFoundError(
265
+ f"Could not find a split CSV under {mimic_root} and none "
266
+ f"passed via --split_csv. layout='files' needs "
267
+ f"mimic-cxr-2.0.0-split.csv to assign train/validate/test."
268
+ )
269
+ split_map = _load_split_map(sp_path)
270
+ print(f"[mimic_cxr_builder] split CSV: {sp_path} "
271
+ f"({len(split_map):,} (subj,study) entries)")
272
+ reports_root_p = Path(reports_root) if reports_root else None
273
+ if reports_root_p is not None and not reports_root_p.is_dir():
274
+ raise FileNotFoundError(
275
+ f"reports_root={reports_root_p} does not exist. Either point "
276
+ f"to the extracted mimic-cxr-reports tree (with a `files/` "
277
+ f"subdir inside it) or leave it null to look alongside images."
278
+ )
279
 
280
  # ── CheXpert labels ───────────────────────────────────────────────────
281
  csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
 
293
  # ── Pass 1: index studies ─────────────────────────────────────────────
294
  samples: List[Dict] = []
295
  # sub_rel ("pXX/pXXXX/sYYYY/img.jpg") → full stored image_path
296
+ # ("{split}/pXX/pXXXX/sYYYY/img.jpg" or "files/pXX/pXXXX/sYYYY/img.jpg").
297
+ # O(1) VQA lookup.
298
  image_index: Dict[str, str] = {}
299
+ n_studies = n_missing_report = n_no_chexpert = n_no_split = 0
300
  skipped_merged_no_impression = skipped_cascade_no_findings = 0
301
 
302
  def _structured_for(subj: str, study: str) -> Optional[str]:
303
  return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
304
 
 
 
 
 
 
 
 
305
  def _image_groups(rels: List[str]):
306
  """Yield path_fields dicts honouring image_mode (same rules as IU)."""
307
  if image_mode == "all_views_split":
 
312
  else: # multi_image_merged
313
  yield {"image_path": None, "image_paths": rels}
314
 
315
+ def _iter_studies():
316
+ """
317
+ Yield (study_dir, p_dir_name, subj, study, rels, report_path, split_label)
318
+ for every valid study in either layout.
319
+ rels = list of JSON-relative image paths (split-prefixed or
320
+ "files/"-prefixed depending on layout).
321
+ report_path = Path to the report .txt (may not exist; caller handles).
322
+ split_label = "train"/"validate"/"test" or None when unresolved.
323
+ """
324
+ if layout == "presplit":
325
+ for split_sub, split_dir in present.items():
326
+ for p_dir in sorted(split_dir.glob("p*")):
327
+ for pat_dir in p_dir.glob("p*"):
328
+ for study_dir in pat_dir.glob("s*"):
329
+ subj, study = pat_dir.name, study_dir.name
330
+ rels = [
331
+ f"{split_sub}/{p_dir.name}/{subj}/{study}/{im.name}"
332
+ for im in sorted(study_dir.glob("*.jpg"))
333
+ ]
334
+ if not rels:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  continue
336
+ txts = list(study_dir.glob("*.txt"))
337
+ report_path = txts[0] if txts else None
338
+ yield (study_dir, p_dir.name, subj, study, rels,
339
+ report_path, split_dirs[split_sub])
340
+ else: # "files"
341
+ files_dir = mimic_root / "files"
342
+ for p_dir in sorted(files_dir.glob("p*")):
343
+ for pat_dir in p_dir.glob("p*"):
344
+ for study_dir in pat_dir.glob("s*"):
345
+ subj, study = pat_dir.name, study_dir.name
346
+ rels = [
347
+ f"files/{p_dir.name}/{subj}/{study}/{im.name}"
348
+ for im in sorted(study_dir.glob("*.jpg"))
349
+ ]
350
+ if not rels:
351
+ continue
352
+ # Report lookup: separate tree if reports_root is set,
353
+ # else alongside images (parent dir holds sYYYY.txt
354
+ # per PhysioNet convention OR inside study dir).
355
+ if reports_root_p is not None:
356
+ report_path = (reports_root_p / "files" /
357
+ p_dir.name / subj / f"{study}.txt")
358
+ else:
359
+ # Try both: study_dir/*.txt then parent/{study}.txt
360
+ cand = list(study_dir.glob("*.txt"))
361
+ if cand:
362
+ report_path = cand[0]
363
+ else:
364
+ report_path = pat_dir / f"{study}.txt"
365
+ split_label = split_map.get(
366
+ (subj.lstrip("p"), study.lstrip("s"))
367
+ ) if split_map else None
368
+ yield (study_dir, p_dir.name, subj, study, rels,
369
+ report_path, split_label)
370
+
371
+ for (study_dir, p_dir_name, subj, study, rels,
372
+ report_path, split_label) in _iter_studies():
373
+ n_studies += 1
374
+ # Index EVERY image up front — a VQA row may reference a study
375
+ # that has images but no findings/impression report.
376
+ for r in rels:
377
+ image_index[r.split("/", 1)[1]] = r
378
+ # Studies missing from split CSV (files layout) are skipped —
379
+ # emitting them would silently dump into "train".
380
+ if split_label is None:
381
+ n_no_split += 1
382
+ continue
383
+ if report_path is None or not Path(report_path).is_file():
384
+ n_missing_report += 1
385
+ continue
386
+ findings, impression = _parse_report(Path(report_path))
387
+ structured = _structured_for(subj, study)
388
+ if structured is None:
389
+ n_no_chexpert += 1
390
+
391
+ for path_fields in _image_groups(rels):
392
+ base = {
393
+ **path_fields,
394
+ "question": None,
395
+ "split": split_label,
396
+ "study_id": study,
397
+ "subject_id": subj,
398
+ }
399
+ if report_mode == "merged":
400
+ target = format_merged_report(findings, impression)
401
+ if target is None:
402
+ skipped_merged_no_impression += 1
403
+ continue
404
+ samples.append({**base, "task": "report",
405
+ "target": target,
406
+ "structured_findings": structured})
407
+ elif report_mode == "split_cascade":
408
+ if findings:
409
+ samples.append({**base, "task": "findings",
410
+ "target": findings,
411
+ "structured_findings": structured})
412
+ if impression:
413
+ if not findings:
414
+ skipped_cascade_no_findings += 1
415
+ else:
416
+ samples.append({**base, "task": "impression",
417
+ "target": impression,
418
+ "structured_findings":
419
+ f"Findings: {findings}"})
420
+ else: # "split"
421
+ if findings:
422
+ samples.append({**base, "task": "findings",
423
+ "target": findings,
424
+ "structured_findings": structured})
425
+ if impression:
426
+ samples.append({**base, "task": "impression",
427
+ "target": impression,
428
+ "structured_findings": structured})
429
 
430
  # ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
431
  n_vqa = n_vqa_dropped = 0
 
475
  by_task[s["task"]] = by_task.get(s["task"], 0) + 1
476
 
477
  print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
478
+ print(f" layout : {layout}")
479
  print(f" report_mode : {report_mode}")
480
  print(f" image_mode : {image_mode}")
481
  print(f" studies indexed : {n_studies:,}")
482
  print(f" missing report : {n_missing_report:,}")
483
  print(f" studies w/o chexpert label : {n_no_chexpert:,}")
484
+ if layout == "files":
485
+ print(f" studies w/o split-CSV entry (skipped) : {n_no_split:,}")
486
  if report_mode == "merged":
487
  print(f" skipped no_impr : {skipped_merged_no_impression:,}")
488
  if report_mode == "split_cascade":
 
510
  choices=["split", "merged", "split_cascade"])
511
  p.add_argument("--image_mode", default="all_views_split",
512
  choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
513
+ p.add_argument("--layout", default="presplit",
514
+ choices=["presplit", "files"],
515
+ help="presplit: {root}/{train,valid,test}/pXX/... (custom MIMIC-CXR.zip). "
516
+ "files: {root}/files/pXX/... (raw PhysioNet tree, used by "
517
+ "MIMIC-CXR_resized after tar extraction). Requires --split_csv.")
518
+ p.add_argument("--split_csv", default=None,
519
+ help="mimic-cxr-2.0.0-split.csv (auto-discovered under --mimic_root "
520
+ "if omitted). Required for layout='files'.")
521
+ p.add_argument("--reports_root", default=None,
522
+ help="Root of the mimic-cxr-reports tree (separate from images). "
523
+ "Used when layout='files' and reports are NOT in the image tars.")
524
  return p.parse_args()
525
 
526
 
 
533
  vqa_root = a.vqa_root,
534
  report_mode = a.report_mode,
535
  image_mode = a.image_mode,
536
+ layout = a.layout,
537
+ split_csv = a.split_csv,
538
+ reports_root = a.reports_root,
539
  )
data/mimic_cxr_resized_builder.py ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ mimic_cxr_resized_builder.py
3
+ ----------------------------
4
+ Build the unified instruction JSON for the MIMIC-CXR_resized dataset
5
+ (`hieu3636/cxr-vlm-data/MIMIC-CXR_resized/`) — a filtered + resized subset
6
+ of MIMIC-CXR that ships with its own manifest CSVs.
7
+
8
+ Why a separate builder?
9
+ MIMIC-CXR_resized is manifest-driven, not directory-walking like
10
+ `mimic_cxr_builder.py`:
11
+ - Splits come from THREE manifest CSVs (manifest_train.csv,
12
+ manifest_val.csv, manifest_test.csv) — NOT from PhysioNet's
13
+ mimic-cxr-2.0.0-split.csv (the user redistributed val/test from
14
+ the original train pool to balance sizes).
15
+ - The 14 CheXpert labels are baked into the manifest as `chex_*`
16
+ columns; no separate chexpert.csv lookup is needed.
17
+ - Each manifest row = ONE image (one DICOM). Multi-view studies
18
+ appear as multiple rows sharing (subject_id, study_id).
19
+ - Image + report paths are stored verbatim in `image_relpath` /
20
+ `report_relpath`, relative to the extracted-tar root. Reports
21
+ live inside the same `files/` tree at patient-dir level
22
+ (e.g. files/p10/p10000032/s50414267.txt), NOT inside study dirs.
23
+
24
+ VQA
25
+ ---
26
+ 3 JSON files (`vqa.json` for train, `vqa_val.json`, `vqa_test.json`) sit
27
+ under a `vqa/` sibling dir. Each row references `image_path` exactly the
28
+ same as `image_relpath` in the manifest, so we look it up in an
29
+ image-index built during the manifest pass. Missing-image VQA rows are
30
+ dropped (the resized subset has fewer images than the full MIMIC).
31
+
32
+ Output JSON schema is identical to the other two builders so downstream
33
+ (`CXRInstructDataset`, evaluation) is unchanged.
34
+ """
35
+
36
+ import argparse
37
+ import csv
38
+ import json
39
+ from pathlib import Path
40
+ from typing import Dict, List, Optional, Tuple
41
+
42
+ from .mimic_cxr_builder import _parse_report # reuse the same FINDINGS/IMPRESSION regex
43
+
44
+
45
+ # Manifest column name → CheXpert PATHOLOGIES name mapping is direct:
46
+ # manifest col "chex_Atelectasis" ↔ PATHOLOGIES "Atelectasis", etc.
47
+ # We resolve the canonical 14-name list at runtime to stay in sync with
48
+ # `model.chexpert_classifier.PATHOLOGIES` (single source of truth).
49
+
50
+
51
+ # ─── PNU builder from a single manifest row ─────────────────────────────────
52
+
53
def _row_to_pnu(row: Dict[str, str]) -> str:
    """
    Render the `chex_*` columns of one manifest row as the PNU
    structured-findings string (Positive/Negative/Uncertain Abnormalities).

    One column is read per name in `PATHOLOGIES` (column "chex_<name>").
    Cell text is stripped and matched literally, following the same
    U-MultiClass convention as `mimic_cxr_builder._load_chexpert_map`:
        "1"  / "1.0"    → positive
        "0"  / "0.0"    → negative
        "-1" / "-1.0"   → uncertain
        anything else   → negative (blank / NaN / missing column;
                          META-CXR default)
    """
    # Imported at call time, exactly like the original; the pathology list
    # and the PNU formatter are owned by `model.chexpert_classifier`.
    from model.chexpert_classifier import (
        PATHOLOGIES, buckets_to_pnu,
        CLASS_NEGATIVE, CLASS_POSITIVE, CLASS_UNCERTAIN,
    )

    # Accepted spellings of each label, keyed by the stripped cell text.
    text_to_class = dict.fromkeys(("1", "1.0"), CLASS_POSITIVE)
    text_to_class.update(dict.fromkeys(("0", "0.0"), CLASS_NEGATIVE))
    text_to_class.update(dict.fromkeys(("-1", "-1.0"), CLASS_UNCERTAIN))

    def classify(pathology):
        cell = str(row.get(f"chex_{pathology}", "")).strip()
        return text_to_class.get(cell, CLASS_NEGATIVE)

    return buckets_to_pnu({pathology: classify(pathology) for pathology in PATHOLOGIES})
77
+
78
+
79
+ # ─── Helpers ────────────────────────────────────────────────────────────────
80
+
81
# (file name, output split label) pairs, in processing order.
# The files on disk say "val"; the label emitted into the JSON is "validate",
# which is what the rest of the pipeline expects.
_MANIFEST_FILES = (
    ("manifest_train.csv", "train"),
    ("manifest_val.csv", "validate"),
    ("manifest_test.csv", "test"),
)
# Same pairing for the VQA JSON files (`vqa.json` is the train split).
_VQA_FILES = (
    ("vqa.json", "train"),
    ("vqa_val.json", "validate"),
    ("vqa_test.json", "test"),
)
93
+
94
+
95
def _group_manifest_by_study(csv_path: Path) -> Dict[Tuple[str, str], List[Dict[str, str]]]:
    """
    Read one manifest CSV and bucket its rows per study.

    The bucket key is the whitespace-stripped (subject_id, study_id) pair, so
    every view of a multi-view study lands in the same list. Callers rely on
    this for the `multi_image_merged` image_mode and to emit a single
    structured_findings string per study.

    Rows keep their file order inside a bucket; buckets keep first-seen order.
    A row without a `subject_id` or `study_id` column raises KeyError.
    """
    studies: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
    with open(csv_path, encoding="utf-8", newline="") as handle:
        for record in csv.DictReader(handle):
            subject = str(record["subject_id"]).strip()
            study = str(record["study_id"]).strip()
            study_key = (subject, study)
            if study_key not in studies:
                studies[study_key] = []
            studies[study_key].append(record)
    return studies
110
+
111
+
112
def _image_groups(rels: List[str], image_mode: str):
    """
    Yield the image path fields for one study according to `image_mode`.

    Args:
        rels:       image paths of the study, in manifest order.
        image_mode: "all_views_split"    → one dict per image
                    "frontal_only_split" → a single dict for the first
                                           listed image
                    anything else        → treated as "multi_image_merged":
                                           one dict carrying the whole list

    Yields:
        Dicts with exactly the keys "image_path" and "image_paths"; the one
        that does not apply to the mode is None.

    An empty `rels` yields nothing in every mode. Previously
    "frontal_only_split" raised IndexError and "multi_image_merged" produced
    a group with no images.
    """
    if not rels:
        return

    if image_mode == "all_views_split":
        for rel in rels:
            yield {"image_path": rel, "image_paths": None}
    elif image_mode == "frontal_only_split":
        # NOTE(review): "frontal" is simply the first listed image; no view
        # position is checked here — confirm the manifest lists the frontal
        # view first.
        yield {"image_path": rels[0], "image_paths": None}
    else:  # multi_image_merged
        yield {"image_path": None, "image_paths": rels}
121
+
122
+
123
+ # ─── Main builder ───────────────────────────────────────────────────────────
124
+
125
def _resolve_reports_root(root: Path, manifest_paths, max_probes: int = 64) -> Path:
    """
    Choose the directory that manifest `report_relpath` values resolve against.

    Two layouts are probed, in this order:
        (a) `{root}`          — reports extracted next to the images
        (b) `{root}/reports`  — reports shipped as a separate bundle

    Testing only for a `files/` directory cannot tell the two apart:
    `{root}/files` also holds the images named by `image_relpath`, so it
    exists in both layouts and (b) would never be selected. Instead, up to
    `max_probes` distinct report paths are read from the manifests and the
    first candidate that actually contains one of them wins.

    If no manifest yields a usable path, or none of the sampled reports exist
    anywhere, fall back to the directory-existence check and finally to `root`.
    """
    candidates = (root, root / "reports")

    probes: List[str] = []
    for csv_path in manifest_paths:
        if len(probes) >= max_probes:
            break
        csv_path = Path(csv_path)
        if not csv_path.is_file():
            continue
        with open(csv_path, encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            if "report_relpath" not in (reader.fieldnames or ()):
                continue
            for row in reader:
                # DictReader fills short rows with None, hence the `or ""`.
                rel = (row.get("report_relpath") or "").strip()
                if rel and rel not in probes:
                    probes.append(rel)
                    if len(probes) >= max_probes:
                        break

    for cand in candidates:
        if any((cand / rel).is_file() for rel in probes):
            return cand

    # Nothing to probe with (or no sampled report found): legacy heuristic.
    for cand in candidates:
        if (cand / "files").is_dir():
            return cand
    return root


def build_mimic_cxr_resized_instruct_json(
    root: str,
    manifest_dir: Optional[str],
    output_path: str,
    vqa_dir: Optional[str] = None,
    reports_root: Optional[str] = None,
    report_mode: str = "split",           # "split" | "merged" | "split_cascade"
    image_mode: str = "all_views_split",  # "all_views_split" | "frontal_only_split" | "multi_image_merged"
) -> str:
    """
    Build the unified MIMIC-CXR_resized instruction JSON.

    Pass 1 walks the manifest CSVs listed in `_MANIFEST_FILES`, groups rows
    per study and emits findings / impression / merged-report samples.
    Pass 2 (optional) appends VQA samples whose image appears in a manifest.
    The sample list is then written to `output_path` and a summary printed.

    Args:
        root:         directory containing the extracted tar shards, so
                      `{root}/{image_relpath}` resolves to an image. The
                      manifest stores image_relpath like
                      "files/p19/p19855745/s59502026/<dicom>.jpg".
        manifest_dir: directory containing manifest_{train,val,test}.csv.
                      None/empty → `root`. Missing manifests are skipped
                      with a message.
        output_path:  where to write the JSON (parent dirs are created).
        vqa_dir:      directory containing vqa.json / vqa_val.json /
                      vqa_test.json. None/empty → VQA skipped; individual
                      missing files are skipped with a message.
        reports_root: directory that `report_relpath` resolves against. If
                      None, `{root}` and then `{root}/reports` are probed
                      with report paths sampled from the manifests. Set it
                      explicitly to skip the probe.
        report_mode:  "split"         → separate findings / impression samples
                      "merged"        → one "report" sample built by
                                        `format_merged_report`; skipped when
                                        that returns None
                      "split_cascade" → like "split", but the impression
                                        sample carries "Findings: <findings>"
                                        as its structured_findings and is
                                        skipped when there are no findings
        image_mode:   "all_views_split" | "frontal_only_split" |
                      "multi_image_merged" (see `_image_groups`).

    Returns:
        `output_path` as a string (as given; it is not made absolute).

    Raises:
        AssertionError: if `report_mode` or `image_mode` is not a known value.
    """
    assert report_mode in ("split", "merged", "split_cascade"), \
        f"report_mode invalid: {report_mode!r}"
    assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
        f"image_mode invalid: {image_mode!r}"

    from .dataset import format_merged_report  # local import to avoid cycle

    root = Path(root)
    manifest_dir = Path(manifest_dir) if manifest_dir else root
    output_path = Path(output_path)

    # ── Resolve reports_root (auto-probe if not set) ────────────────────────
    # The manifest stores `report_relpath` like
    # "files/p19/p19855745/s59502026.txt", relative to whichever directory
    # holds the reports tree: `{root}` or `{root}/reports`.
    if reports_root is not None:
        reports_root_p = Path(reports_root)
    else:
        reports_root_p = _resolve_reports_root(
            root, [manifest_dir / fname for fname, _ in _MANIFEST_FILES]
        )
    print(f"[mimic_cxr_resized_builder] reports_root resolved → {reports_root_p}")

    # ── Pass 1: walk the 3 manifest CSVs ────────────────────────────────────
    samples: List[Dict] = []
    # image_relpath → (subject_id, study_id) of the manifest study owning it.
    # Used by the VQA pass both as the "image is known" filter and to find
    # the study when the VQA row's own ids do not match a manifest key.
    image_index: Dict[str, Tuple[str, str]] = {}
    pnu_by_study: Dict[Tuple[str, str], str] = {}  # (subj, study) → PNU string

    n_studies = n_missing_report = 0
    skipped_merged_no_impression = skipped_cascade_no_findings = 0

    for fname, split_label in _MANIFEST_FILES:
        csv_path = manifest_dir / fname
        if not csv_path.is_file():
            print(f"[mimic_cxr_resized_builder] manifest missing: {csv_path} — skipping {split_label}")
            continue
        grouped = _group_manifest_by_study(csv_path)
        print(f"[mimic_cxr_resized_builder] {fname}: "
              f"{sum(len(v) for v in grouped.values()):,} rows / "
              f"{len(grouped):,} studies")

        for (subj, study), rows in grouped.items():
            n_studies += 1

            # All views of the same study share report + CheXpert labels,
            # so the first row is representative.
            first = rows[0]
            rels = [r["image_relpath"] for r in rows]

            # Index every image (incl. studies with no report) so a VQA row
            # that references this image can still be picked up below.
            for r in rels:
                image_index[r] = (subj, study)

            structured = _row_to_pnu(first)
            pnu_by_study[(subj, study)] = structured  # cached for VQA reuse

            # `or ""`: DictReader yields None for fields missing from a short row.
            report_rel = (first.get("report_relpath") or "").strip()
            if not report_rel:
                n_missing_report += 1
                continue
            report_path = reports_root_p / report_rel
            if not report_path.is_file():
                n_missing_report += 1
                continue
            findings, impression = _parse_report(report_path)

            # Output JSON uses the same subject/study id format as the
            # legacy MIMIC builder ("pXXXX" / "sYYYY") so downstream eval
            # (which compares subject_id strings) keeps working unchanged.
            subj_str = f"p{subj}" if not subj.startswith("p") else subj
            study_str = f"s{study}" if not study.startswith("s") else study

            for path_fields in _image_groups(rels, image_mode):
                base = {
                    **path_fields,
                    "question": None,
                    "split": split_label,
                    "study_id": study_str,
                    "subject_id": subj_str,
                }
                if report_mode == "merged":
                    target = format_merged_report(findings, impression)
                    if target is None:
                        skipped_merged_no_impression += 1
                        continue
                    samples.append({**base, "task": "report",
                                    "target": target,
                                    "structured_findings": structured})
                elif report_mode == "split_cascade":
                    if findings:
                        samples.append({**base, "task": "findings",
                                        "target": findings,
                                        "structured_findings": structured})
                    if impression:
                        if not findings:
                            skipped_cascade_no_findings += 1
                        else:
                            # Cascade: the impression is conditioned on the
                            # findings text instead of the PNU string.
                            samples.append({**base, "task": "impression",
                                            "target": impression,
                                            "structured_findings":
                                                f"Findings: {findings}"})
                else:  # "split"
                    if findings:
                        samples.append({**base, "task": "findings",
                                        "target": findings,
                                        "structured_findings": structured})
                    if impression:
                        samples.append({**base, "task": "impression",
                                        "target": impression,
                                        "structured_findings": structured})

    # ── Pass 2: optional VQA attach ─────────────────────────────────────────
    n_vqa = n_vqa_dropped = 0
    if vqa_dir:
        vqa_dir = Path(vqa_dir)
        for fname, split_label in _VQA_FILES:
            vqa_file = vqa_dir / fname
            if not vqa_file.is_file():
                print(f"[mimic_cxr_resized_builder] VQA missing: {vqa_file} — skipping {split_label}")
                continue
            # Close the handle deterministically instead of leaking it.
            with open(vqa_file, encoding="utf-8") as f:
                vqa_rows = json.load(f)
            for row in vqa_rows:
                img_path = str(row.get("image_path", "")).lstrip("/")
                manifest_key = image_index.get(img_path)
                if manifest_key is None:
                    # Image is not part of the resized subset.
                    n_vqa_dropped += 1
                    continue
                ans = row.get("answer", [])
                answer = (", ".join(map(str, ans)) if isinstance(ans, list)
                          else str(ans)) or "No."
                subj = str(row.get("subject_id", "")).strip()
                study = str(row.get("study_id", "")).strip()
                # Abnormality-guided VQA: reuse the manifest's CheXpert PNU
                # for this study (same context as findings/impression). If
                # the row's ids are absent or formatted differently from the
                # manifest, fall back to the study that owns the image so
                # the PNU context is not silently lost.
                if (subj, study) not in pnu_by_study:
                    subj, study = manifest_key
                structured = pnu_by_study.get((subj, study))
                subj_str = f"p{subj}" if subj and not subj.startswith("p") else subj
                study_str = f"s{study}" if study and not study.startswith("s") else study
                samples.append({
                    "image_path": img_path, "image_paths": None,
                    "task": "vqa", "target": answer,
                    "question": row["question"],
                    "structured_findings": structured,
                    "split": split_label,
                    "study_id": study_str,
                    "subject_id": subj_str,
                })
                n_vqa += 1

    # ── Write ───────────────────────────────────────────────────────────────
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False)

    by_split, by_task = {}, {}
    for s in samples:
        by_split[s["split"]] = by_split.get(s["split"], 0) + 1
        by_task[s["task"]] = by_task.get(s["task"], 0) + 1

    print(f"[mimic_cxr_resized_builder] wrote {len(samples):,} samples → {output_path}")
    print(f" root : {root}")
    print(f" manifest_dir : {manifest_dir}")
    print(f" vqa_dir : {vqa_dir if vqa_dir else '(disabled)'}")
    print(f" report_mode : {report_mode}")
    print(f" image_mode : {image_mode}")
    print(f" studies indexed : {n_studies:,}")
    print(f" missing report : {n_missing_report:,}")
    if report_mode == "merged":
        print(f" skipped no_impr : {skipped_merged_no_impression:,}")
    if report_mode == "split_cascade":
        print(f" skipped impr w/o findings : {skipped_cascade_no_findings:,}")
    if vqa_dir:
        print(f" vqa added/dropped : {n_vqa:,} / {n_vqa_dropped:,}")
    print(f" by split : {by_split}")
    print(f" by task : {by_task}")
    return str(output_path)
342
+
343
+
344
+ # ─── CLI ────────────────────────────────────────────────────────────────────
345
+
346
def _parse_args():
    """
    Parse the command-line options of the MIMIC-CXR_resized builder.

    `--root` and `--output` are required; the remaining options default to
    None (paths) or to "split" / "all_views_split" (modes).

    Returns:
        The `argparse.Namespace` produced from `sys.argv`.
    """
    parser = argparse.ArgumentParser(description="Build MIMIC-CXR_resized unified instruction JSON")

    # (flag, add_argument keyword arguments), registered in this order so
    # the --help listing keeps its layout.
    option_table = (
        ("--root", {
            "required": True,
            "help": "Root containing files/pXX/... after extracting tar shards.",
        }),
        ("--manifest_dir", {
            "default": None,
            "help": "Folder with manifest_{train,val,test}.csv (defaults to --root).",
        }),
        ("--output", {
            "required": True,
            "help": "Output JSON path.",
        }),
        ("--vqa_dir", {
            "default": None,
            "help": "Folder with vqa.json / vqa_val.json / vqa_test.json. Omit to skip VQA.",
        }),
        ("--reports_root", {
            "default": None,
            "help": "Directory that report_relpath resolves against. "
                    "Omit to auto-probe `{root}` then `{root}/reports`.",
        }),
        ("--report_mode", {
            "default": "split",
            "choices": ["split", "merged", "split_cascade"],
        }),
        ("--image_mode", {
            "default": "all_views_split",
            "choices": ["all_views_split", "frontal_only_split", "multi_image_merged"],
        }),
    )
    for flag, options in option_table:
        parser.add_argument(flag, **options)

    return parser.parse_args()
363
+
364
+
365
if __name__ == "__main__":
    # Script entry point: forward the parsed CLI options to the builder.
    # Note that the `--output` flag feeds the `output_path` parameter.
    cli = _parse_args()
    build_mimic_cxr_resized_instruct_json(
        cli.root,
        cli.manifest_dir,
        cli.output,
        vqa_dir=cli.vqa_dir,
        reports_root=cli.reports_root,
        report_mode=cli.report_mode,
        image_mode=cli.image_mode,
    )
data/distri-IU-Xray.py → distri-IU-Xray.py RENAMED
@@ -1,12 +1,12 @@
1
- import os, glob
2
- from xml.etree import ElementTree as ET
3
-
4
- xml_dir = "D:/USTH/KLTN/data/IU-Xray/labels/ecgen-radiology/"
5
- counts = {}
6
- for f in glob.glob(xml_dir + "*.xml"):
7
- tree = ET.parse(f)
8
- n = len(tree.findall(".//parentImage"))
9
- counts[n] = counts.get(n, 0) + 1
10
-
11
- for k, v in sorted(counts.items()):
12
  print(f"{k} ảnh/report: {v} reports")
 
1
import os, glob
from xml.etree import ElementTree as ET

# Tally IU-Xray report XMLs by how many <parentImage> elements each contains,
# then print one line per distinct image count in ascending order.
xml_dir = "D:/USTH/KLTN/data/IU-Xray/labels/ecgen-radiology/"
counts = {}
for xml_file in glob.glob(xml_dir + "*.xml"):
    image_nodes = ET.parse(xml_file).findall(".//parentImage")
    num_images = len(image_nodes)
    if num_images not in counts:
        counts[num_images] = 0
    counts[num_images] += 1

for num_images in sorted(counts):
    print(f"{num_images} ảnh/report: {counts[num_images]} reports")
data/img_stat.py → img_stat.py RENAMED
File without changes
data/rezip.py → rezip.py RENAMED
@@ -1,12 +1,12 @@
1
- import zipfile
2
- import os
3
-
4
- zipf = zipfile.ZipFile(r"D:\USTH\KLTN\cxr-vlm-code.zip", 'w', zipfile.ZIP_DEFLATED)
5
-
6
- for root, dirs, files in os.walk('KLTN'):
7
- for file in files:
8
- filepath = os.path.join(root, file)
9
- arcname = os.path.relpath(filepath, 'KLTN')
10
- zipf.write(filepath, arcname)
11
-
12
  zipf.close()
 
1
import zipfile
import os

# Re-create the code archive from the contents of the local `KLTN` directory
# (resolved relative to the current working directory). Entry names inside
# the archive are relative to `KLTN`.
zip_path = r"D:\USTH\KLTN\cxr-vlm-code.zip"
zip_abspath = os.path.normcase(os.path.abspath(zip_path))

# `with` closes the archive (writing its central directory) even if a write
# fails part-way; the original only closed it on the success path.
with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk('KLTN'):
        for file in files:
            filepath = os.path.join(root, file)
            # The archive path is itself inside a `KLTN` directory; if that
            # is the directory being walked, do not add the half-written
            # archive to itself.
            if os.path.normcase(os.path.abspath(filepath)) == zip_abspath:
                continue
            arcname = os.path.relpath(filepath, 'KLTN')
            zipf.write(filepath, arcname)
scripts/cxrvlm_colab_train.ipynb CHANGED
@@ -5,32 +5,7 @@
5
  "metadata": {
6
  "id": "cell-0"
7
  },
8
- "source": [
9
- "# CXR-VLM — Kaggle Training Notebook (consolidated)\n",
10
- "\n",
11
- "Trains the 2-stage CXR-VLM (Vicuna-7B + BioViL-T fallback to timm ViT + LoRA) on a Kaggle **T4** GPU.\n",
12
- "\n",
13
- "Supports **two datasets**, selected by `DATASET_NAME` in section 0:\n",
14
- "- **`MIMIC-CXR`** — full 3 tasks (findings, impression, VQA).\n",
15
- "- **`IU-Xray`** — 2 tasks only (findings, impression). Much lighter dataset (~7.5k images).\n",
16
- "\n",
17
- "### Before you run\n",
18
- "\n",
19
- "Attach Kaggle Datasets via `+ Add Input`:\n",
20
- "\n",
21
- "| Dataset slug | Contents | When needed |\n",
22
- "|---|---|---|\n",
23
- "| `cxr-vlm-code` | entire `D:\\USTH\\KLTN` folder (configs/, data/*.py, model/, training/, evaluation/, utils/, requirements.txt) | **always** |\n",
24
- "| `cxr-vlm-data` | holds **both** datasets: `MIMIC-CXR/{train,valid,test}/p*/...` + `MIMIC-Ext-MIMIC-CXR-VQA/...` and/or `IU-Xray/images/` + `IU-Xray/labels/` | **always** |\n",
25
- "\n",
26
- "**Settings (right panel):**\n",
27
- "- Accelerator: **T4 x2** (only GPU 0 will be used)\n",
28
- "- Persistence: **Variables and Files**\n",
29
- "- Internet: **On**\n",
30
- "\n",
31
- "**Kaggle Secrets** (Add-ons → Secrets):\n",
32
- "- `HF_TOKEN` — HuggingFace token with write access to the runs repo."
33
- ],
34
  "id": "cell-0"
35
  },
36
  {
@@ -54,27 +29,9 @@
54
  },
55
  "outputId": "d6e7ebbd-4f1b-483b-f20f-0df3997a60b7"
56
  },
57
- "source": [
58
- "# ── Platform + dataset selectors ──────────────────────────────────\n",
59
- "# PLATFORM drives storage paths and how secrets are read.\n",
60
- "# Supported: 'kaggle' | 'colab' | 'lightning' | 'gcp' | 'local'\n",
61
- "PLATFORM = 'colab'\n",
62
- "DATASET_NAME = 'IU-Xray' # 'MIMIC-CXR' | 'IU-Xray'\n",
63
- "\n",
64
- "assert PLATFORM in ('kaggle', 'colab', 'lightning', 'gcp', 'local')\n",
65
- "assert DATASET_NAME in ('MIMIC-CXR', 'IU-Xray')\n",
66
- "print(f'PLATFORM = {PLATFORM} | DATASET_NAME = {DATASET_NAME}')\n"
67
- ],
68
  "execution_count": null,
69
- "outputs": [
70
- {
71
- "output_type": "stream",
72
- "name": "stdout",
73
- "text": [
74
- "PLATFORM = colab | DATASET_NAME = IU-Xray\n"
75
- ]
76
- }
77
- ],
78
  "id": "cell-select"
79
  },
80
  {
@@ -119,120 +76,9 @@
119
  },
120
  "outputId": "f6195a48-56b2-4052-8367-c8ec14c48a05"
121
  },
122
- "source": [
123
- "# ── Per-platform storage + source-of-truth ─────────────────────────\n",
124
- "# Kaggle : code + data come from attached Kaggle datasets (pre-mounted).\n",
125
- "# Others : pull code (folder) + data (single zip) from HF Hub dataset repos.\n",
126
- "#\n",
127
- "# Required HF repos:\n",
128
- "# <HF_USER>/cxr-vlm-code — project source (flat folder)\n",
129
- "# <HF_USER>/cxr-vlm-data — contains IU-Xray.zip (one zip per dataset)\n",
130
- "\n",
131
- "HF_USER = 'hieu3636' # <<< EDIT ME\n",
132
- "\n",
133
- "if PLATFORM == 'kaggle':\n",
134
- " INPUT_ROOT = Path('/kaggle/input')\n",
135
- " WORK = Path('/kaggle/working')\n",
136
- " def find_dataset(slug, required=True):\n",
137
- " for cand in [INPUT_ROOT / slug, *INPUT_ROOT.rglob(slug)]:\n",
138
- " if cand.is_dir():\n",
139
- " return cand\n",
140
- " if required:\n",
141
- " raise FileNotFoundError(f'Dataset {slug!r} not attached')\n",
142
- " return None\n",
143
- " CODE_SRC = find_dataset('cxr-vlm-code')\n",
144
- " DATA_SRC = find_dataset('cxr-vlm-data')\n",
145
- "\n",
146
- "else:\n",
147
- " # ── Non-Kaggle: resolve WORK, then pull from HF ──\n",
148
- " if PLATFORM == 'colab':\n",
149
- " from google.colab import userdata\n",
150
- " os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n",
151
- " WORK = Path('/content')\n",
152
- " elif PLATFORM == 'lightning':\n",
153
- " WORK = Path('/teamspace/studios/this_studio')\n",
154
- " elif PLATFORM == 'gcp':\n",
155
- " WORK = Path('/workspace')\n",
156
- " else:\n",
157
- " WORK = Path.home() / 'cxr-vlm-work'\n",
158
- " WORK.mkdir(parents=True, exist_ok=True)\n",
159
- "\n",
160
- " assert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing — set it via platform secrets UI.'\n",
161
- "\n",
162
- " try:\n",
163
- " from huggingface_hub import snapshot_download, hf_hub_download\n",
164
- " except ImportError:\n",
165
- " !pip install -q huggingface_hub\n",
166
- " from huggingface_hub import snapshot_download, hf_hub_download\n",
167
- "\n",
168
- " # 1) Code: flat folder, few hundred files → snapshot_download ok\n",
169
- " print(f'Pulling code from HF (user: {HF_USER}) …')\n",
170
- " CODE_SRC = Path(snapshot_download(\n",
171
- " repo_id = f'{HF_USER}/cxr-vlm-code',\n",
172
- " repo_type = 'model',\n",
173
- " token = os.environ['HF_TOKEN'],\n",
174
- " local_dir = str(WORK / 'cxr-vlm-code'),\n",
175
- " ))\n",
176
- "\n",
177
- " # 2) Data: single zip per dataset (avoids per-file rate limits)\n",
178
- " import zipfile\n",
179
- " DATA_SRC = WORK / 'data'\n",
180
- " DATA_SRC.mkdir(parents=True, exist_ok=True)\n",
181
- "\n",
182
- " zip_name = f'{DATASET_NAME}.zip' # 'IU-Xray.zip' | 'MIMIC-CXR.zip'\n",
183
- " marker = DATA_SRC / DATASET_NAME # DATA_SRC/IU-Xray after unzip\n",
184
- "\n",
185
- " if not marker.exists():\n",
186
- " print(f'Pulling {zip_name} from HF …')\n",
187
- " zpath = hf_hub_download(\n",
188
- " repo_id = f'{HF_USER}/cxr-vlm-data',\n",
189
- " filename = zip_name,\n",
190
- " repo_type = 'dataset',\n",
191
- " token = os.environ['HF_TOKEN'],\n",
192
- " local_dir = str(DATA_SRC),\n",
193
- " )\n",
194
- " print(f' unzipping → {DATA_SRC}')\n",
195
- " with zipfile.ZipFile(zpath) as zf:\n",
196
- " zf.extractall(DATA_SRC)\n",
197
- " try:\n",
198
- " os.remove(zpath) # free disk\n",
199
- " except OSError:\n",
200
- " pass\n",
201
- " else:\n",
202
- " print(f'{marker} already present — skipping download.')\n",
203
- " print(f'Contents of {DATA_SRC}: {sorted(os.listdir(DATA_SRC))}')\n",
204
- "\n",
205
- "# ── Common: copy code into writable PROJECT dir ────────────────────\n",
206
- "PROJECT = WORK / 'cxr_vlm'\n",
207
- "if CODE_SRC.resolve() != PROJECT.resolve() and not PROJECT.exists():\n",
208
- " shutil.copytree(CODE_SRC, PROJECT)\n",
209
- "\n",
210
- "os.chdir(PROJECT)\n",
211
- "sys.path.insert(0, str(PROJECT))\n",
212
- "print('PLATFORM :', PLATFORM)\n",
213
- "print('CODE_SRC :', CODE_SRC)\n",
214
- "print('DATA_SRC :', DATA_SRC)\n",
215
- "print('PROJECT :', PROJECT)\n",
216
- "print('WORK :', WORK)\n"
217
- ],
218
  "execution_count": null,
219
- "outputs": [
220
- {
221
- "output_type": "stream",
222
- "name": "stdout",
223
- "text": [
224
- "Pulling code from HF (user: hieu3636) …\n",
225
- "Pulling IU-Xray.zip from HF …\n",
226
- " unzipping → /content/data\n",
227
- "Contents of /content/data: ['.cache', 'IU-Xray']\n",
228
- "PLATFORM : colab\n",
229
- "CODE_SRC : /content/cxr-vlm-code\n",
230
- "DATA_SRC : /content/data\n",
231
- "PROJECT : /content/cxr_vlm\n",
232
- "WORK : /content\n"
233
- ]
234
- }
235
- ],
236
  "id": "cell-paths"
237
  },
238
  {
@@ -336,29 +182,7 @@
336
  "metadata": {
337
  "id": "cell-data-md"
338
  },
339
- "source": [
340
- "## 2. Locate data on Kaggle\n",
341
- "\n",
342
- "Both datasets live under the single `cxr-vlm-data` slug. Expected layouts:\n",
343
- "\n",
344
- "**MIMIC-CXR**:\n",
345
- "```\n",
346
- "DATA_SRC/\n",
347
- "├── MIMIC-CXR/ (or at root)\n",
348
- "│ ├── train/p10/pXXXXXX/sYYYYY/*.jpg + sYYYYY.txt\n",
349
- "│ ├── valid/p10/...\n",
350
- "│ └── test/p10/...\n",
351
- "└── .../MIMIC-Ext-MIMIC-CXR-VQA/dataset/{train,valid,test}.json\n",
352
- "```\n",
353
- "\n",
354
- "**IU-Xray** (added alongside MIMIC under the same slug):\n",
355
- "```\n",
356
- "DATA_SRC/\n",
357
- "└── IU-Xray/\n",
358
- " ├── images/ # CXR*_IM-*-*.png (~7.5k files)\n",
359
- " └── labels/ # {1..3999}.xml (~3.9k files, flat — no ecgen-radiology subfolder)\n",
360
- "```"
361
- ],
362
  "id": "cell-data-md"
363
  },
364
  {
@@ -370,104 +194,9 @@
370
  },
371
  "outputId": "53c15833-f9bb-4457-95d6-3fa83f4dc909"
372
  },
373
- "source": [
374
- "def find_split_parent(root: Path) -> Path:\n",
375
- " for cand in [root, root / 'MIMIC-CXR', root / 'data' / 'MIMIC-CXR']:\n",
376
- " if (cand / 'train').exists() and (cand / 'valid').exists() and (cand / 'test').exists():\n",
377
- " return cand\n",
378
- " for p in root.rglob('train'):\n",
379
- " if p.is_dir() and (p.parent / 'valid').exists() and (p.parent / 'test').exists():\n",
380
- " return p.parent\n",
381
- " raise FileNotFoundError('Could not find train/ valid/ test/ under ' + str(root))\n",
382
- "\n",
383
- "\n",
384
- "def find_iu_dirs(root: Path):\n",
385
- " \"\"\"Locate IU-Xray `images/` and `labels/` (flat XMLs) under `root`.\n",
386
- "\n",
387
- " Resolution order:\n",
388
- " 1. `{root}/IU-Xray/{images,labels}` — canonical layout.\n",
389
- " 2. Any nested `IU-Xray` folder that contains both.\n",
390
- " 3. Fallback: any folder containing CXR*.png (images) and\n",
391
- " any folder containing *.xml — whichever comes first.\n",
392
- "\n",
393
- " The labels subfolder is treated as a flat directory of XMLs (we no\n",
394
- " longer require the legacy `ecgen-radiology/` subfolder).\n",
395
- " \"\"\"\n",
396
- " # Canonical + nested\n",
397
- " for cand in [root / 'IU-Xray', *root.rglob('IU-Xray')]:\n",
398
- " if not cand.is_dir():\n",
399
- " continue\n",
400
- " imgs = cand / 'images'\n",
401
- " lbls = cand / 'labels'\n",
402
- " if imgs.is_dir() and lbls.is_dir() and any(lbls.glob('*.xml')):\n",
403
- " return imgs, lbls\n",
404
- " # Legacy: labels/ecgen-radiology/*.xml\n",
405
- " legacy = lbls / 'ecgen-radiology'\n",
406
- " if imgs.is_dir() and legacy.is_dir() and any(legacy.glob('*.xml')):\n",
407
- " return imgs, legacy\n",
408
- "\n",
409
- " # Fallback: any images/ with CXR*.png + any folder with XML\n",
410
- " img_dir = lbl_dir = None\n",
411
- " for cand in [root / 'images', *root.rglob('images')]:\n",
412
- " if cand.is_dir() and any(cand.glob('CXR*.png')):\n",
413
- " img_dir = cand; break\n",
414
- " for cand in [root / 'labels', *root.rglob('labels')]:\n",
415
- " if cand.is_dir() and any(cand.glob('*.xml')):\n",
416
- " lbl_dir = cand; break\n",
417
- " if lbl_dir is None:\n",
418
- " # very last resort — any ecgen-radiology folder with XMLs\n",
419
- " for cand in root.rglob('ecgen-radiology'):\n",
420
- " if cand.is_dir() and any(cand.glob('*.xml')):\n",
421
- " lbl_dir = cand; break\n",
422
- " return img_dir, lbl_dir\n",
423
- "\n",
424
- "\n",
425
- "# Filled in below depending on DATASET_NAME\n",
426
- "CXR_ROOT = None # MIMIC-CXR root (with train/valid/test subdirs)\n",
427
- "SPLIT_DIRS = None # MIMIC only\n",
428
- "VQA_ROOT = None # MIMIC only\n",
429
- "IU_IMAGES_DIR = None # IU-Xray only\n",
430
- "IU_LABELS_DIR = None # IU-Xray only\n",
431
- "\n",
432
- "if DATASET_NAME == 'MIMIC-CXR':\n",
433
- " CXR_ROOT = find_split_parent(DATA_SRC)\n",
434
- " print('MIMIC-CXR root:', CXR_ROOT)\n",
435
- "\n",
436
- " SPLIT_DIRS = {\n",
437
- " 'train' : ('train', CXR_ROOT / 'train'),\n",
438
- " 'validate': ('valid', CXR_ROOT / 'valid'),\n",
439
- " 'test' : ('test', CXR_ROOT / 'test'),\n",
440
- " }\n",
441
- " for s, (sub, d) in SPLIT_DIRS.items():\n",
442
- " assert d.exists(), f'Missing split dir: {d}'\n",
443
- " print(f' {s:<9s} → {d}')\n",
444
- "\n",
445
- " for p in DATA_SRC.rglob('MIMIC-Ext-MIMIC-CXR-VQA'):\n",
446
- " cand = p / 'dataset'\n",
447
- " if cand.exists() and (cand / 'train.json').exists():\n",
448
- " VQA_ROOT = cand\n",
449
- " break\n",
450
- " assert VQA_ROOT is not None, 'VQA dataset folder not found under ' + str(DATA_SRC)\n",
451
- " print('VQA root:', VQA_ROOT)\n",
452
- "\n",
453
- "else: # IU-Xray\n",
454
- " IU_IMAGES_DIR, IU_LABELS_DIR = find_iu_dirs(DATA_SRC)\n",
455
- " assert IU_IMAGES_DIR is not None, f'IU images/ not found under {DATA_SRC}'\n",
456
- " assert IU_LABELS_DIR is not None, f'IU labels/ (with *.xml) not found under {DATA_SRC}'\n",
457
- " print('IU images dir:', IU_IMAGES_DIR, '→', len(list(IU_IMAGES_DIR.glob('*.png'))), 'PNGs')\n",
458
- " print('IU labels dir:', IU_LABELS_DIR, '→', len(list(IU_LABELS_DIR.glob('*.xml'))), 'XMLs')"
459
- ],
460
  "execution_count": null,
461
- "outputs": [
462
- {
463
- "output_type": "stream",
464
- "name": "stdout",
465
- "text": [
466
- "IU images dir: /content/data/IU-Xray/images → 1841 PNGs\n",
467
- "IU labels dir: /content/data/IU-Xray/labels → 3955 XMLs\n"
468
- ]
469
- }
470
- ],
471
  "id": "cell-find-data-mimic"
472
  },
473
  {
@@ -475,7 +204,7 @@
475
  "metadata": {
476
  "id": "cell-json-md"
477
  },
478
- "source": "## 3. Build the unified instruction JSON\n\n- **MIMIC-CXR**: auto-built by `utils.dataset_resolver` → `data.mimic_cxr_builder` the first time `train.py`/`evaluate.py` runs. It parses findings/impression, bakes the 14 GT CheXpert labels as the **PNU** string (`Positive/Negative/Uncertain Abnormalities`) into `structured_findings`, and attaches abnormality-guided VQA. The inline cells below are **no-ops for MIMIC**.\n- **IU-Xray**: built by `data.iu_xray_builder` in the cell below (resolver would also do it lazily).\n\nEither way the JSON shares one schema (`image_path`, `task`, `target`, `question`, `structured_findings`, `split`, ...) so `CXRInstructDataset` loads it unchanged.",
479
  "id": "cell-json-md"
480
  },
481
  {
@@ -487,7 +216,7 @@
487
  },
488
  "outputId": "3b965273-ac82-41a7-8ead-895094e0a8b1"
489
  },
490
- "source": "# MIMIC-CXR: the unified JSON (PNU CheXpert + abnormality-guided VQA) is now\n# auto-built by utils.dataset_resolver via data.mimic_cxr_builder when\n# train.py / evaluate.py run. The old inline parse/build/save cells are kept\n# ONLY for IU-Xray; for MIMIC they are intentional no-ops.\nif DATASET_NAME == 'MIMIC-CXR':\n print('MIMIC-CXR: JSON build handled by the resolver '\n '(data.mimic_cxr_builder, PNU + VQA) — skipping inline parse cell.')\nelse:\n print('IU-Xray: skipping MIMIC indexing cell.')\n",
491
  "execution_count": null,
492
  "outputs": [],
493
  "id": "cell-parse"
@@ -501,7 +230,7 @@
501
  },
502
  "outputId": "91e11a3e-7b4e-4457-c32a-70a17fb2ef2a"
503
  },
504
- "source": "if DATASET_NAME == 'MIMIC-CXR':\n print('MIMIC-CXR: findings/impression built by the resolver — skipping.')\nelse:\n samples = None\n print('IU-Xray: skipping MIMIC report parsing cell.')\n",
505
  "execution_count": null,
506
  "outputs": [],
507
  "id": "cell-build-findings"
@@ -515,7 +244,7 @@
515
  },
516
  "outputId": "fc5fd7bc-cb80-49aa-da79-0097ed038b5d"
517
  },
518
- "source": "if DATASET_NAME == 'MIMIC-CXR':\n print('MIMIC-CXR: VQA attached by data.mimic_cxr_builder (resolver), '\n 'with the same PNU CheXpert context — skipping inline VQA cell.')\nelse:\n print('IU-Xray: skipping MIMIC VQA cell.')\n",
519
  "execution_count": null,
520
  "outputs": [],
521
  "id": "cell-build-vqa"
@@ -529,7 +258,7 @@
529
  },
530
  "outputId": "b4d6589b-19be-4eb8-ba25-d533248439b9"
531
  },
532
- "source": "if DATASET_NAME == 'MIMIC-CXR':\n print('MIMIC-CXR: image-existence filtering done inside the resolver '\n 'builder — skipping.')\nelse:\n print('IU-Xray: skipping.')\n",
533
  "execution_count": null,
534
  "outputs": [],
535
  "id": "cell-filter"
@@ -543,7 +272,7 @@
543
  },
544
  "outputId": "b6d95196-0383-4a50-8424-9ef95eb7b34e"
545
  },
546
- "source": "out_dir = PROJECT / 'data' / 'data_files'\nout_dir.mkdir(parents=True, exist_ok=True)\n\nif DATASET_NAME == 'MIMIC-CXR':\n # Base path only — the resolver appends __{report_mode}__{image_mode}\n # and builds it (PNU CheXpert + abnormality-guided VQA) via\n # data.mimic_cxr_builder the first time train.py / evaluate.py runs.\n mimic_json_path = out_dir / 'mimic_cxr_instruct_unified.json'\n print('MIMIC-CXR: instruct JSON auto-built by resolver →',\n f'{mimic_json_path.stem}__<report_mode>__<image_mode>.json')\nelse:\n # Build IU-Xray JSON here so the notebook shows a nice summary log\n # (the resolver would also do this lazily).\n from data.iu_xray_builder import build_iu_xray_instruct_json\n iu_json_path = out_dir / 'iu_xray_instruct.json'\n build_iu_xray_instruct_json(\n images_dir = str(IU_IMAGES_DIR),\n labels_dir = str(IU_LABELS_DIR),\n output_path = str(iu_json_path),\n train_ratio = 0.70, val_ratio = 0.15, test_ratio = 0.15, seed = 42,\n )\n",
547
  "execution_count": null,
548
  "outputs": [],
549
  "id": "cell-save-json"
@@ -553,7 +282,7 @@
553
  "metadata": {
554
  "id": "cell-cfg-md"
555
  },
556
- "source": "## 4. Patch configs for the Kaggle/Colab environment\n\n- Sets `data.dataset_name`, `report_mode`, `image_mode`.\n- **MIMIC-CXR**: sets `mimic_cxr_root`, the `instruct_json` base path, auto-discovers the **CheXpert CSV** (`mimic_chexpert_csv`) and the **VQA** dir (`mimic_vqa_root`), and turns on `mimic_auto_build` so the resolver builds the PNU+VQA JSON on first run.\n- **IU-Xray**: points `iu_xray.images_dir/labels_dir/instruct_json` at the mount.\n- `training.output_root` under `WORK/ckpt` (Persistence keeps it).\n- **4-bit QLoRA**; WandB off; HF hub on — edit `hf_hub.repo_id` to your repo.\n\n⚠️ If \"CheXpert CSV: NOT FOUND\" prints, add `mimic-cxr-2.0.0-chexpert.csv` to the data so PNU abnormality guidance is active (training still runs without it, just no PNU).",
557
  "id": "cell-cfg-md"
558
  },
559
  {
@@ -565,7 +294,7 @@
565
  },
566
  "outputId": "80ddabe3-bc8b-4d14-94e2-26ff9e64970c"
567
  },
568
- "source": "from omegaconf import OmegaConf\n\ntrain_cfg = OmegaConf.load(PROJECT / 'configs' / 'train_config.yaml')\nmodel_cfg = OmegaConf.load(PROJECT / 'configs' / 'model_config.yaml')\n\n# ── dataset selector ──\ntrain_cfg.data.dataset_name = DATASET_NAME\n\n# ── training-scheme switches (thesis ablations) ──\n# report_mode: 'split' → 2 tasks (findings + impression separately)\n# 'merged' → 1 task (full report \"Findings: ...\\n\\nImpression: ...\")\n# 'split_cascade' → split, but impression's context = GT findings\n# image_mode : 'all_views_split' | 'frontal_only_split' | 'multi_image_merged'\ntrain_cfg.data.report_mode = 'split'\ntrain_cfg.data.image_mode = 'all_views_split'\ntrain_cfg.data.max_images_per_sample = 2 # only used in multi_image_merged\n\n# ── dataset-specific paths ──\nif DATASET_NAME == 'MIMIC-CXR':\n train_cfg.data.mimic_cxr_root = str(CXR_ROOT)\n # Base path; the resolver suffixes __{report_mode}__{image_mode} and\n # auto-builds (PNU CheXpert + VQA) via data.mimic_cxr_builder.\n train_cfg.data.instruct_json = str(mimic_json_path)\n train_cfg.data.mimic_auto_build = True\n\n # RaDialog / U-MultiClass abnormality guidance: locate the CheXpert\n # label CSV so the builder can bake the PNU structured_findings string.\n _cx = (sorted(DATA_SRC.rglob('*chexpert*.csv'))\n or sorted(DATA_SRC.rglob('*chexbert*.csv')))\n train_cfg.data.mimic_chexpert_csv = str(_cx[0]) if _cx else None\n print('CheXpert CSV :', train_cfg.data.mimic_chexpert_csv\n or 'NOT FOUND — PNU abnormality guidance DISABLED!')\n\n # VQA pairs ({train,valid,test}.json) → abnormality-guided VQA.\n train_cfg.data.mimic_vqa_root = str(VQA_ROOT) if VQA_ROOT is not None else None\n print('VQA root :', train_cfg.data.mimic_vqa_root or '(none — VQA skipped)')\nelse: # IU-Xray\n train_cfg.data.iu_xray.images_dir = str(IU_IMAGES_DIR)\n train_cfg.data.iu_xray.labels_dir = str(IU_LABELS_DIR)\n train_cfg.data.iu_xray.instruct_json = str(iu_json_path)\n train_cfg.data.iu_xray.auto_build = 
True\n\ntrain_cfg.data.train_split = 'train'\ntrain_cfg.data.val_split = 'validate'\ntrain_cfg.data.test_split = 'test'\n\n# ── checkpoint root (Persistence keeps /content/ckpt/) ──\nCKPT_ROOT = WORK / 'ckpt'\ntrain_cfg.training.output_root = str(CKPT_ROOT)\n\n# ── batching ──\ntrain_cfg.training.per_device_train_batch_size = 4\ntrain_cfg.training.per_device_eval_batch_size = 4\ntrain_cfg.training.gradient_accumulation_steps = 4\ntrain_cfg.training.fp16 = False\ntrain_cfg.training.bf16 = True\ntrain_cfg.training.dataloader_num_workers = 8\n\ntrain_cfg.stage2.num_epoch = 5\n\n# ── wandb off ──\ntrain_cfg.wandb.enabled = False\n\n# ── HuggingFace Hub run tracking ──\ntrain_cfg.hf_hub.enabled = True\ntrain_cfg.hf_hub.repo_id = 'hieu3636/cxr-vlm-runs' # <<< EDIT ME\ntrain_cfg.hf_hub.token_env = 'HF_TOKEN'\ntrain_cfg.hf_hub.private = True\ntrain_cfg.hf_hub.run_state_file = str(CKPT_ROOT / 'run_id.txt')\n\n# ── 4-bit QLoRA ──\nmodel_cfg.llm.load_in_8bit = False\nmodel_cfg.llm.load_in_4bit = True\n# Oracle PNU path does NOT use the CheXpert classifier module (labels come\n# from the GT csv baked into the prompt). Keep it disabled until you wire\n# the learned classifier for realistic inference.\nmodel_cfg.chexpert_classifier.enabled = False\n\nOmegaConf.save(train_cfg, PROJECT / 'configs' / 'train_config.yaml')\nOmegaConf.save(model_cfg, PROJECT / 'configs' / 'model_config.yaml')\n\nprint('--- train_cfg.data ---'); print(OmegaConf.to_yaml(train_cfg.data))\nprint('--- train_cfg.training ---');print(OmegaConf.to_yaml(train_cfg.training))\nprint('--- train_cfg.hf_hub ---'); print(OmegaConf.to_yaml(train_cfg.hf_hub))\nprint('--- model_cfg.llm ---'); print(OmegaConf.to_yaml(model_cfg.llm))\n",
569
  "execution_count": null,
570
  "outputs": [],
571
  "id": "cell-cfg"
@@ -591,29 +320,9 @@
591
  },
592
  "outputId": "8a2ce693-94fc-4425-f62c-679614d6dab5"
593
  },
594
- "source": [
595
- "# HF_TOKEN setup. On non-Kaggle platforms it's already set inside cell-paths\n",
596
- "# (needed to pull code + data). Here we only handle the Kaggle path.\n",
597
- "try:\n",
598
- " if PLATFORM == 'kaggle':\n",
599
- " from kaggle_secrets import UserSecretsClient\n",
600
- " os.environ['HF_TOKEN'] = UserSecretsClient().get_secret('HF_TOKEN')\n",
601
- " # Other platforms: already populated in cell-paths\n",
602
- " assert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing'\n",
603
- " print('HF_TOKEN loaded ✓')\n",
604
- "except Exception as e:\n",
605
- " print('No HF_TOKEN — Vicuna-7B download may rate-limit and hub upload will be disabled:', e)\n"
606
- ],
607
  "execution_count": null,
608
- "outputs": [
609
- {
610
- "output_type": "stream",
611
- "name": "stdout",
612
- "text": [
613
- "HF_TOKEN loaded ✓\n"
614
- ]
615
- }
616
- ],
617
  "id": "cell-hf-token"
618
  },
619
  {
@@ -668,24 +377,7 @@
668
  "metadata": {
669
  "id": "cell-mode-md"
670
  },
671
- "source": [
672
- "## 5b. Resume controller\n",
673
- "\n",
674
- "Single switch. No more \"which stage\" — `train.py` auto-detects which stage\n",
675
- "to continue from by inspecting checkpoints on disk.\n",
676
- "\n",
677
- "| MODE | What happens |\n",
678
- "|---------------------|--------------|\n",
679
- "| `'fresh'` | Allocate a brand-new `{DATASET}_run_N+1` folder. Train both stages from scratch. |\n",
680
- "| `'resume'` | Reuse latest matching `{DATASET}_run_N` (or `EXPLICIT_RUN_ID`). Auto-detect: stage 1 mid-checkpoint, stage 1 done → stage 2 fresh, stage 2 mid-checkpoint, or both done. |\n",
681
- "\n",
682
- "`EXPLICIT_RUN_ID` is optional (set to `None` to auto-pick the latest run on\n",
683
- "disk or HF Hub that matches the current dataset prefix).\n",
684
- "\n",
685
- "When `MODE='resume'` on a fresh VM the train cell will pull the previous\n",
686
- "run's checkpoints from HF before training. The `--mode resume` flag in\n",
687
- "`train.py` does the auto-detect — no further action needed in the notebook."
688
- ],
689
  "id": "cell-resume-md"
690
  },
691
  {
 
5
  "metadata": {
6
  "id": "cell-0"
7
  },
8
+ "source": "# CXR-VLM — Kaggle / Colab Training Notebook (consolidated)\n\nTrains the 2-stage CXR-VLM (Vicuna-7B + BioViL-T, with fallback to a timm ViT, + LoRA) on a Kaggle **T4** or Colab **A100 / L4** GPU.\n\nSupports **three datasets**, selected by `DATASET_NAME` in section 0:\n- **`MIMIC-CXR_resized`** *(default)* — filtered + resized subset of MIMIC-CXR, distributed as tar shards. Manifest-driven (`manifest_{train,val,test}.csv` + `vqa/*.json`, reports inside the tars). 3 tasks (findings, impression, VQA). Lighter than full MIMIC, balanced val/test.\n- **`MIMIC-CXR`** — full pre-split MIMIC-CXR (3 tasks). Heavy; needs the original `train/valid/test` tree + chexpert.csv + VQA pairs.\n- **`IU-Xray`** — 2 tasks only (findings, impression). ~7.5k images, fastest sanity run.\n\n### Source-of-truth\n\nAll platforms (kaggle / colab / lightning / gcp / local) pull code + data from **HuggingFace Hub** — no Kaggle dataset attach is needed anymore. Only these two repos are required:\n\n| Repo | Contents |\n|---|---|\n| `<HF_USER>/cxr-vlm-code` | project source (configs/, data/*.py, model/, training/, evaluation/, utils/, requirements.txt) |\n| `<HF_USER>/cxr-vlm-data` | tar shards under `MIMIC-CXR_resized/` **or** `MIMIC-CXR.zip` **or** `IU-Xray.zip` |\n\n### Settings\n\n- **Kaggle**: accelerator **T4 x2** (only GPU 0 used); Persistence: **Variables and Files**; Internet: **On**\n- **Colab**: any GPU (A100 recommended); enable Files (persisted under `/content`)\n\n### Secrets\n\n- `HF_TOKEN` — HuggingFace token with **read** access to the code + data repos above and **write** access to the runs repo (`hf_hub.repo_id` in config). Read from Kaggle Secrets (Add-ons → Secrets) or Colab userdata (🔑 sidebar); on lightning / gcp / local, export `HF_TOKEN` in the environment before running the notebook.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "id": "cell-0"
10
  },
11
  {
 
29
  },
30
  "outputId": "d6e7ebbd-4f1b-483b-f20f-0df3997a60b7"
31
  },
32
+ "source": "# ── Platform + dataset selectors ──────────────────────────────────\n# PLATFORM drives storage paths and how secrets are read.\n# Supported: 'kaggle' | 'colab' | 'lightning' | 'gcp' | 'local'\nPLATFORM = 'colab'\nDATASET_NAME = 'MIMIC-CXR_resized' # 'MIMIC-CXR' | 'MIMIC-CXR_resized' | 'IU-Xray'\n\nassert PLATFORM in ('kaggle', 'colab', 'lightning', 'gcp', 'local')\nassert DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized', 'IU-Xray')\nprint(f'PLATFORM = {PLATFORM} | DATASET_NAME = {DATASET_NAME}')",
 
 
 
 
 
 
 
 
 
 
33
  "execution_count": null,
34
+ "outputs": [],
 
 
 
 
 
 
 
 
35
  "id": "cell-select"
36
  },
37
  {
 
76
  },
77
  "outputId": "f6195a48-56b2-4052-8367-c8ec14c48a05"
78
  },
79
+ "source": "# ── Per-platform storage + source-of-truth ─────────────────────────\n# All platforms (kaggle / colab / lightning / gcp / local) pull code +\n# data from HF Hub. The only platform-specific bit is:\n# * WORK : where to land outputs (persisted dirs differ per host)\n# * TOKEN : how HF_TOKEN reaches os.environ (secrets API differs)\n#\n# Required HF repos:\n# <HF_USER>/cxr-vlm-code — project source (flat folder)\n# <HF_USER>/cxr-vlm-data — per-dataset payloads:\n# MIMIC-CXR_resized/ (tar shards + manifests + vqa)\n# MIMIC-CXR.zip (single zip)\n# IU-Xray.zip (single zip)\n\nHF_USER = 'hieu3636' # <<< EDIT ME\n\n# ── 1) WORK dir + HF_TOKEN bootstrap (platform-specific) ───────────\nif PLATFORM == 'kaggle':\n from kaggle_secrets import UserSecretsClient\n os.environ['HF_TOKEN'] = UserSecretsClient().get_secret('HF_TOKEN')\n WORK = Path('/kaggle/working')\nelif PLATFORM == 'colab':\n from google.colab import userdata\n os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n WORK = Path('/content')\nelif PLATFORM == 'lightning':\n WORK = Path('/teamspace/studios/this_studio')\nelif PLATFORM == 'gcp':\n WORK = Path('/workspace')\nelse: # 'local'\n WORK = Path.home() / 'cxr-vlm-work'\nWORK.mkdir(parents=True, exist_ok=True)\n\nassert os.environ.get('HF_TOKEN'), \\\n 'HF_TOKEN missing — set it via the platform secrets UI before re-running.'\n\ntry:\n from huggingface_hub import snapshot_download, hf_hub_download, HfApi\nexcept ImportError:\n !pip install -q huggingface_hub\n from huggingface_hub import snapshot_download, hf_hub_download, HfApi\n\n# ── 2) Code: flat folder, few hundred files → snapshot_download ──\nprint(f'Pulling code from HF (user: {HF_USER}) …')\nCODE_SRC = Path(snapshot_download(\n repo_id = f'{HF_USER}/cxr-vlm-code',\n repo_type = 'model',\n token = os.environ['HF_TOKEN'],\n local_dir = str(WORK / 'cxr-vlm-code'),\n))\n\n# ── 3) Data: layout depends on DATASET_NAME ──\nDATA_SRC = WORK / 'data'\nDATA_SRC.mkdir(parents=True, exist_ok=True)\n\nif 
DATASET_NAME == 'MIMIC-CXR_resized':\n # Tar-sharded payload. Reports + images live INSIDE the tars under\n # `files/pXX/pXXXX/{sYYYY/*.jpg, sYYYY.txt}` so extracting all shards\n # gives one unified tree. We download manifests + vqa + SHARDS.txt\n # first (small, ~tens of MB), then each *.tar one at a time →\n # extract → delete (saves disk).\n # Final on-disk layout:\n # DATA_SRC/MIMIC-CXR_resized/\n # ├── manifest_{train,val,test}.csv\n # ├── vqa/ {vqa.json, vqa_val.json, vqa_test.json}\n # ├── SHARDS.txt + _manifest.json\n # └── files/pXX/pXXXX/ ← from tars\n # ├── sYYYY.txt (report)\n # └── sYYYY/<dicom>.jpg (images)\n import tarfile\n mr_dir = DATA_SRC / 'MIMIC-CXR_resized'\n mr_dir.mkdir(parents=True, exist_ok=True)\n files_dir = mr_dir / 'files'\n\n # Marker: if files/ already has shards extracted AND manifests exist,\n # skip everything. Lets the cell be re-run safely.\n manifests_present = all(\n (mr_dir / f).is_file() for f in ('manifest_train.csv', 'manifest_val.csv', 'manifest_test.csv')\n )\n if manifests_present and files_dir.is_dir() and any(files_dir.glob('p*')):\n print(f'{mr_dir} already populated — skipping download.')\n else:\n api = HfApi(token=os.environ['HF_TOKEN'])\n all_files = api.list_repo_files(\n repo_id=f'{HF_USER}/cxr-vlm-data', repo_type='dataset')\n mr_files = [f for f in all_files if f.startswith('MIMIC-CXR_resized/')]\n tar_files = sorted(f for f in mr_files if f.endswith('.tar'))\n meta_files = [f for f in mr_files if not f.endswith('.tar')]\n print(f'MIMIC-CXR_resized on HF: {len(tar_files)} tar shards + {len(meta_files)} metadata files')\n\n # 3a) Pull metadata (manifests, vqa, SHARDS.txt, _manifest.json)\n # in one snapshot (small; few MB).\n print(f' downloading manifests + vqa + SHARDS.txt …')\n snapshot_download(\n repo_id = f'{HF_USER}/cxr-vlm-data',\n repo_type = 'dataset',\n allow_patterns = ['MIMIC-CXR_resized/*.csv',\n 'MIMIC-CXR_resized/*.json',\n 'MIMIC-CXR_resized/*.txt',\n 'MIMIC-CXR_resized/vqa/**'],\n token = 
os.environ['HF_TOKEN'],\n local_dir = str(DATA_SRC),\n )\n\n # 3b) Sequentially fetch + extract + delete each image tar to\n # minimise peak disk usage (each shard ~2 GB). Reports come\n # out alongside images — both land under mr_dir/files/.\n print(f' downloading + extracting {len(tar_files)} tar shards …')\n for i, tf in enumerate(tar_files, 1):\n print(f' [{i}/{len(tar_files)}] {tf}')\n tar_path = Path(hf_hub_download(\n repo_id=f'{HF_USER}/cxr-vlm-data', repo_type='dataset',\n filename=tf, token=os.environ['HF_TOKEN'],\n local_dir=str(DATA_SRC),\n ))\n with tarfile.open(tar_path) as t:\n # Extract into mr_dir so member paths like\n # \"files/p10/.../*.jpg\" + \"files/p10/.../*.txt\" land at\n # mr_dir/files/p10/…\n t.extractall(mr_dir)\n tar_path.unlink(missing_ok=True)\n print(f' done. {mr_dir} ready.')\n\nelse:\n # MIMIC-CXR / IU-Xray: single zip per dataset (legacy path)\n import zipfile\n zip_name = f'{DATASET_NAME}.zip' # 'IU-Xray.zip' | 'MIMIC-CXR.zip'\n marker = DATA_SRC / DATASET_NAME # DATA_SRC/IU-Xray after unzip\n\n if not marker.exists():\n print(f'Pulling {zip_name} from HF …')\n zpath = hf_hub_download(\n repo_id = f'{HF_USER}/cxr-vlm-data',\n filename = zip_name,\n repo_type = 'dataset',\n token = os.environ['HF_TOKEN'],\n local_dir = str(DATA_SRC),\n )\n print(f' unzipping → {DATA_SRC}')\n with zipfile.ZipFile(zpath) as zf:\n zf.extractall(DATA_SRC)\n try:\n os.remove(zpath) # free disk\n except OSError:\n pass\n else:\n print(f'{marker} already present — skipping download.')\n\nprint(f'Contents of {DATA_SRC}: {sorted(os.listdir(DATA_SRC))}')\n\n# ── Common: copy code into writable PROJECT dir ────────────────────\nPROJECT = WORK / 'cxr_vlm'\nif CODE_SRC.resolve() != PROJECT.resolve() and not PROJECT.exists():\n shutil.copytree(CODE_SRC, PROJECT)\n\nos.chdir(PROJECT)\nsys.path.insert(0, str(PROJECT))\nprint('PLATFORM :', PLATFORM)\nprint('CODE_SRC :', CODE_SRC)\nprint('DATA_SRC :', DATA_SRC)\nprint('PROJECT :', PROJECT)\nprint('WORK :', WORK)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  "execution_count": null,
81
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  "id": "cell-paths"
83
  },
84
  {
 
182
  "metadata": {
183
  "id": "cell-data-md"
184
  },
185
+ "source": "## 2. Locate data\n\nAll datasets are pulled from a single `<HF_USER>/cxr-vlm-data` HF repo into `DATA_SRC` (on every platform, Kaggle included). Expected layouts after download + extraction:\n\n**MIMIC-CXR_resized** *(default)*:\n```\nDATA_SRC/\n└── MIMIC-CXR_resized/\n ├── manifest_train.csv ← drives split + chex_* + has_vqa\n ├── manifest_val.csv\n ├── manifest_test.csv\n ├── vqa/\n │ ├── vqa.json ← train VQA pairs\n │ ├── vqa_val.json\n │ └── vqa_test.json\n ├── files/ ← extracted from tar shards\n │ └── pXX/pXXXXXXXX/\n │ ├── sYYYYYYYY.txt ← report (alongside, at patient dir)\n │ └── sYYYYYYYY/<dicom>.jpg\n ├── SHARDS.txt\n └── _manifest.json\n```\n\n**MIMIC-CXR** (legacy pre-split):\n```\nDATA_SRC/\n├── MIMIC-CXR/{train,valid,test}/p10/pXXXXXX/sYYYYY/*.jpg + sYYYYY.txt\n└── .../MIMIC-Ext-MIMIC-CXR-VQA/dataset/{train,valid,test}.json\n```\n\n**IU-Xray**:\n```\nDATA_SRC/\n└── IU-Xray/\n ├── images/ # CXR*_IM-*-*.png (~7.5k files)\n └── labels/ # {1..3999}.xml (~3.9k files, flat — no ecgen-radiology subfolder)\n```",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  "id": "cell-data-md"
187
  },
188
  {
 
194
  },
195
  "outputId": "53c15833-f9bb-4457-95d6-3fa83f4dc909"
196
  },
197
+ "source": "def find_split_parent(root: Path) -> Path:\n for cand in [root, root / 'MIMIC-CXR', root / 'data' / 'MIMIC-CXR']:\n if (cand / 'train').exists() and (cand / 'valid').exists() and (cand / 'test').exists():\n return cand\n for p in root.rglob('train'):\n if p.is_dir() and (p.parent / 'valid').exists() and (p.parent / 'test').exists():\n return p.parent\n raise FileNotFoundError('Could not find train/ valid/ test/ under ' + str(root))\n\n\ndef find_mimic_resized_root(root: Path) -> Path:\n \"\"\"Find the MIMIC-CXR_resized payload — folder with manifest_*.csv + files/.\"\"\"\n for cand in [root / 'MIMIC-CXR_resized', root, *root.rglob('MIMIC-CXR_resized')]:\n if (cand / 'manifest_train.csv').is_file():\n return cand\n raise FileNotFoundError(\n f'Could not find MIMIC-CXR_resized payload under {root}. '\n f'Expected manifest_train.csv (alongside manifest_val.csv / manifest_test.csv).'\n )\n\n\ndef find_iu_dirs(root: Path):\n \"\"\"Locate IU-Xray `images/` and `labels/` (flat XMLs) under `root`.\n\n Resolution order:\n 1. `{root}/IU-Xray/{images,labels}` — canonical layout.\n 2. Any nested `IU-Xray` folder that contains both.\n 3. 
Fallback: any folder containing CXR*.png (images) and\n any folder containing *.xml — whichever comes first.\n\n The labels subfolder is treated as a flat directory of XMLs (we no\n longer require the legacy `ecgen-radiology/` subfolder).\n \"\"\"\n # Canonical + nested\n for cand in [root / 'IU-Xray', *root.rglob('IU-Xray')]:\n if not cand.is_dir():\n continue\n imgs = cand / 'images'\n lbls = cand / 'labels'\n if imgs.is_dir() and lbls.is_dir() and any(lbls.glob('*.xml')):\n return imgs, lbls\n # Legacy: labels/ecgen-radiology/*.xml\n legacy = lbls / 'ecgen-radiology'\n if imgs.is_dir() and legacy.is_dir() and any(legacy.glob('*.xml')):\n return imgs, legacy\n\n # Fallback: any images/ with CXR*.png + any folder with XML\n img_dir = lbl_dir = None\n for cand in [root / 'images', *root.rglob('images')]:\n if cand.is_dir() and any(cand.glob('CXR*.png')):\n img_dir = cand; break\n for cand in [root / 'labels', *root.rglob('labels')]:\n if cand.is_dir() and any(cand.glob('*.xml')):\n lbl_dir = cand; break\n if lbl_dir is None:\n # very last resort — any ecgen-radiology folder with XMLs\n for cand in root.rglob('ecgen-radiology'):\n if cand.is_dir() and any(cand.glob('*.xml')):\n lbl_dir = cand; break\n return img_dir, lbl_dir\n\n\n# Filled in below depending on DATASET_NAME\nCXR_ROOT = None # MIMIC-CXR root (with train/valid/test subdirs)\nSPLIT_DIRS = None # MIMIC only\nVQA_ROOT = None # MIMIC only\nMR_ROOT = None # MIMIC-CXR_resized root (manifests + files/ + vqa/)\nIU_IMAGES_DIR = None # IU-Xray only\nIU_LABELS_DIR = None # IU-Xray only\n\nif DATASET_NAME == 'MIMIC-CXR':\n CXR_ROOT = find_split_parent(DATA_SRC)\n print('MIMIC-CXR root:', CXR_ROOT)\n\n SPLIT_DIRS = {\n 'train' : ('train', CXR_ROOT / 'train'),\n 'validate': ('valid', CXR_ROOT / 'valid'),\n 'test' : ('test', CXR_ROOT / 'test'),\n }\n for s, (sub, d) in SPLIT_DIRS.items():\n assert d.exists(), f'Missing split dir: {d}'\n print(f' {s:<9s} → {d}')\n\n for p in 
DATA_SRC.rglob('MIMIC-Ext-MIMIC-CXR-VQA'):\n cand = p / 'dataset'\n if cand.exists() and (cand / 'train.json').exists():\n VQA_ROOT = cand\n break\n assert VQA_ROOT is not None, 'VQA dataset folder not found under ' + str(DATA_SRC)\n print('VQA root:', VQA_ROOT)\n\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n MR_ROOT = find_mimic_resized_root(DATA_SRC)\n print('MIMIC-CXR_resized root:', MR_ROOT)\n # Sanity: 3 manifest CSVs, files/ (images+reports), vqa/\n for cf in ('manifest_train.csv', 'manifest_val.csv', 'manifest_test.csv'):\n f = MR_ROOT / cf\n print(f' {cf}: {\"OK\" if f.is_file() else \"MISSING\"}')\n for sub in ('files', 'vqa'):\n d = MR_ROOT / sub\n print(f' {sub:<5s}: {\"OK\" if d.is_dir() else \"MISSING\"} ({d})')\n # Spot-check one report (.txt) sits at patient-dir level inside files/\n txt_hits = list((MR_ROOT / 'files').glob('p*/p*/s*.txt')) if (MR_ROOT / 'files').is_dir() else []\n print(f' reports inside files/ : {len(txt_hits):,} found (sample: {txt_hits[0] if txt_hits else \"—\"})')\n\nelse: # IU-Xray\n IU_IMAGES_DIR, IU_LABELS_DIR = find_iu_dirs(DATA_SRC)\n assert IU_IMAGES_DIR is not None, f'IU images/ not found under {DATA_SRC}'\n assert IU_LABELS_DIR is not None, f'IU labels/ (with *.xml) not found under {DATA_SRC}'\n print('IU images dir:', IU_IMAGES_DIR, '→', len(list(IU_IMAGES_DIR.glob('*.png'))), 'PNGs')\n print('IU labels dir:', IU_LABELS_DIR, '→', len(list(IU_LABELS_DIR.glob('*.xml'))), 'XMLs')",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  "execution_count": null,
199
+ "outputs": [],
 
 
 
 
 
 
 
 
 
200
  "id": "cell-find-data-mimic"
201
  },
202
  {
 
204
  "metadata": {
205
  "id": "cell-json-md"
206
  },
207
+ "source": "## 3. Build the unified instruction JSON\n\n- **MIMIC-CXR_resized**: auto-built by `utils.dataset_resolver` → `data.mimic_cxr_resized_builder` the first time `train.py` / `evaluate.py` runs. It reads `manifest_{train,val,test}.csv` (which carry the split label, image/report relpath, and the 14 CheXpert `chex_*` columns → PNU `Positive/Negative/Uncertain Abnormalities` string), parses findings + impression from each report, and attaches abnormality-guided VQA from `vqa/{vqa,vqa_val,vqa_test}.json`. The inline cells below are **no-ops** for this dataset.\n- **MIMIC-CXR** (full, pre-split): auto-built by `data.mimic_cxr_builder` (CheXpert.csv-based). Inline cells are no-ops here too.\n- **IU-Xray**: built by `data.iu_xray_builder` in the cell below (resolver would also do it lazily; we build here just to get a summary log).\n\nAll three paths produce the same JSON schema (`image_path`, `task`, `target`, `question`, `structured_findings`, `split`, ...) so `CXRInstructDataset` loads them unchanged.",
208
  "id": "cell-json-md"
209
  },
210
  {
 
216
  },
217
  "outputId": "3b965273-ac82-41a7-8ead-895094e0a8b1"
218
  },
219
+ "source": "# MIMIC-CXR and MIMIC-CXR_resized: the unified JSON is built lazily by\n# utils.dataset_resolver (→ data.mimic_cxr_builder for MIMIC-CXR, or\n# → data.mimic_cxr_resized_builder for MIMIC-CXR_resized) when train.py /\n# evaluate.py first run. The old inline parse/build cells are no-ops for\n# both; IU-Xray still gets a friendly inline build below for the log.\nif DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: JSON build handled by the resolver — skipping inline parse cell.')\nelse:\n print('IU-Xray: skipping MIMIC indexing cell.')",
220
  "execution_count": null,
221
  "outputs": [],
222
  "id": "cell-parse"
 
230
  },
231
  "outputId": "91e11a3e-7b4e-4457-c32a-70a17fb2ef2a"
232
  },
233
+ "source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: findings/impression built by the resolver — skipping.')\nelse:\n samples = None\n print('IU-Xray: skipping MIMIC report parsing cell.')",
234
  "execution_count": null,
235
  "outputs": [],
236
  "id": "cell-build-findings"
 
244
  },
245
  "outputId": "fc5fd7bc-cb80-49aa-da79-0097ed038b5d"
246
  },
247
+ "source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: VQA attached by the resolver builder '\n '(with the same PNU CheXpert context) — skipping inline VQA cell.')\nelse:\n print('IU-Xray: skipping MIMIC VQA cell.')",
248
  "execution_count": null,
249
  "outputs": [],
250
  "id": "cell-build-vqa"
 
258
  },
259
  "outputId": "b4d6589b-19be-4eb8-ba25-d533248439b9"
260
  },
261
+ "source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: image-existence filtering handled inside the resolver '\n 'builder — skipping.')\nelse:\n print('IU-Xray: skipping.')",
262
  "execution_count": null,
263
  "outputs": [],
264
  "id": "cell-filter"
 
272
  },
273
  "outputId": "b6d95196-0383-4a50-8424-9ef95eb7b34e"
274
  },
275
+ "source": "out_dir = PROJECT / 'data' / 'data_files'\nout_dir.mkdir(parents=True, exist_ok=True)\n\nif DATASET_NAME == 'MIMIC-CXR':\n # Base path only — the resolver appends __{report_mode}__{image_mode}\n # and builds it (PNU CheXpert + abnormality-guided VQA) via\n # data.mimic_cxr_builder the first time train.py / evaluate.py runs.\n mimic_json_path = out_dir / 'mimic_cxr_instruct_unified.json'\n print('MIMIC-CXR: instruct JSON auto-built by resolver →',\n f'{mimic_json_path.stem}__<report_mode>__<image_mode>.json')\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n # Same lazy-build story but via data.mimic_cxr_resized_builder.\n mr_json_path = out_dir / 'mimic_cxr_resized_instruct.json'\n print('MIMIC-CXR_resized: instruct JSON auto-built by resolver →',\n f'{mr_json_path.stem}__<report_mode>__<image_mode>.json')\nelse:\n # Build IU-Xray JSON here so the notebook shows a nice summary log\n # (the resolver would also do this lazily).\n from data.iu_xray_builder import build_iu_xray_instruct_json\n iu_json_path = out_dir / 'iu_xray_instruct.json'\n build_iu_xray_instruct_json(\n images_dir = str(IU_IMAGES_DIR),\n labels_dir = str(IU_LABELS_DIR),\n output_path = str(iu_json_path),\n train_ratio = 0.70, val_ratio = 0.15, test_ratio = 0.15, seed = 42,\n )",
276
  "execution_count": null,
277
  "outputs": [],
278
  "id": "cell-save-json"
 
282
  "metadata": {
283
  "id": "cell-cfg-md"
284
  },
285
+ "source": "## 4. Patch configs for the Kaggle/Colab environment\n\n- Sets `data.dataset_name`, `report_mode`, `image_mode`.\n- **MIMIC-CXR_resized** *(default)*: sets `mimic_cxr_resized.root` (the payload holding the manifests, `files/` with images + reports, and `vqa/`), the `instruct_json` base path, and turns on `auto_build`. `manifest_dir` / `vqa_dir` / `reports_root` are left null so the resolver falls back to its defaults: `{root}/` for the manifests, `{root}/vqa/` for VQA, and for reports it probes `{root}/` first, then `{root}/reports/`. The builder reads `chex_*` columns directly — no separate CheXpert CSV is needed.\n- **MIMIC-CXR**: sets `mimic_cxr_root`, the `instruct_json` base path, auto-discovers the **CheXpert CSV** (`mimic_chexpert_csv`) and the **VQA** dir (`mimic_vqa_root`), and turns on `mimic_auto_build`.\n- **IU-Xray**: points `iu_xray.images_dir/labels_dir/instruct_json` at the extracted data and turns on `iu_xray.auto_build`.\n- `tasks.*.weight` is left at the config defaults (findings 0.30 / impression 0.20 / vqa 0.50). `WeightedRandomSampler` in `CXRTrainer._get_train_sampler` enforces the mix at train time — see `data/dataset.py:get_per_sample_weights`.\n- `training.output_root` is set to `WORK/ckpt` (Persistence keeps it).\n- **4-bit QLoRA**; WandB off; HF hub on — edit `hf_hub.repo_id` to your repo.\n\n⚠️ MIMIC-CXR (full) path: if \"CheXpert CSV: NOT FOUND\" prints, add `mimic-cxr-2.0.0-chexpert.csv` to the data so PNU abnormality guidance is active (training still runs without it, just no PNU). For MIMIC-CXR_resized this is N/A — labels are baked into the manifest.",
286
  "id": "cell-cfg-md"
287
  },
288
  {
 
294
  },
295
  "outputId": "80ddabe3-bc8b-4d14-94e2-26ff9e64970c"
296
  },
297
+ "source": "from omegaconf import OmegaConf\n\ntrain_cfg = OmegaConf.load(PROJECT / 'configs' / 'train_config.yaml')\nmodel_cfg = OmegaConf.load(PROJECT / 'configs' / 'model_config.yaml')\n\n# ── dataset selector ──\ntrain_cfg.data.dataset_name = DATASET_NAME\n\n# ── training-scheme switches (thesis ablations) ──\n# report_mode: 'split' → 2 tasks (findings + impression separately)\n# 'merged' → 1 task (full report \"Findings: ...\\n\\nImpression: ...\")\n# 'split_cascade' → split, but impression's context = GT findings\n# image_mode : 'all_views_split' | 'frontal_only_split' | 'multi_image_merged'\ntrain_cfg.data.report_mode = 'split'\ntrain_cfg.data.image_mode = 'all_views_split'\ntrain_cfg.data.max_images_per_sample = 2 # only used in multi_image_merged\n\n# ── dataset-specific paths ──\nif DATASET_NAME == 'MIMIC-CXR':\n train_cfg.data.mimic_cxr_root = str(CXR_ROOT)\n # Base path; the resolver suffixes __{report_mode}__{image_mode} and\n # auto-builds (PNU CheXpert + VQA) via data.mimic_cxr_builder.\n train_cfg.data.instruct_json = str(mimic_json_path)\n train_cfg.data.mimic_auto_build = True\n\n # RaDialog / U-MultiClass abnormality guidance: locate the CheXpert\n # label CSV so the builder can bake the PNU structured_findings string.\n _cx = (sorted(DATA_SRC.rglob('*chexpert*.csv'))\n or sorted(DATA_SRC.rglob('*chexbert*.csv')))\n train_cfg.data.mimic_chexpert_csv = str(_cx[0]) if _cx else None\n print('CheXpert CSV :', train_cfg.data.mimic_chexpert_csv\n or 'NOT FOUND — PNU abnormality guidance DISABLED!')\n\n # VQA pairs ({train,valid,test}.json) → abnormality-guided VQA.\n train_cfg.data.mimic_vqa_root = str(VQA_ROOT) if VQA_ROOT is not None else None\n print('VQA root :', train_cfg.data.mimic_vqa_root or '(none — VQA skipped)')\n\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n # The MIMIC-CXR_resized builder is manifest-driven: it reads\n # `manifest_{train,val,test}.csv` for split + the 14 chex_* labels\n # (PNU bucketed directly from the CSV, no separate 
chexpert.csv needed),\n # uses `report_relpath` from the manifest to find each .txt, and pulls\n # VQA from `vqa/{vqa,vqa_val,vqa_test}.json`.\n train_cfg.data.mimic_cxr_resized.root = str(MR_ROOT)\n train_cfg.data.mimic_cxr_resized.manifest_dir = None # null → defaults to root\n train_cfg.data.mimic_cxr_resized.vqa_dir = None # null → {root}/vqa\n train_cfg.data.mimic_cxr_resized.reports_root = None # null → auto-probe {root} then {root}/reports\n train_cfg.data.mimic_cxr_resized.instruct_json = str(mr_json_path)\n train_cfg.data.mimic_cxr_resized.auto_build = True\n\nelse: # IU-Xray\n train_cfg.data.iu_xray.images_dir = str(IU_IMAGES_DIR)\n train_cfg.data.iu_xray.labels_dir = str(IU_LABELS_DIR)\n train_cfg.data.iu_xray.instruct_json = str(iu_json_path)\n train_cfg.data.iu_xray.auto_build = True\n\ntrain_cfg.data.train_split = 'train'\ntrain_cfg.data.val_split = 'validate'\ntrain_cfg.data.test_split = 'test'\n\n# ── checkpoint root (Persistence keeps /content/ckpt/) ──\nCKPT_ROOT = WORK / 'ckpt'\ntrain_cfg.training.output_root = str(CKPT_ROOT)\n\n# ── batching ──\ntrain_cfg.training.per_device_train_batch_size = 4\ntrain_cfg.training.per_device_eval_batch_size = 4\ntrain_cfg.training.gradient_accumulation_steps = 4\ntrain_cfg.training.fp16 = False\ntrain_cfg.training.bf16 = True\ntrain_cfg.training.dataloader_num_workers = 8\n\ntrain_cfg.stage2.num_epoch = 5\n\n# ── task weights (sampling ratio enforced by WeightedRandomSampler) ──\n# Defaults in train_config.yaml: 0.30 / 0.20 / 0.50 (RRG ≈ VQA, impression\n# lower because in split_cascade mode it sees GT findings as input).\n# Resolver auto-renormalizes and drops vqa for IU-Xray. 
Override here only\n# if you want to experiment per-run, e.g.:\n# train_cfg.tasks.findings_generation.weight = 0.30\n# train_cfg.tasks.impression_generation.weight = 0.20\n# train_cfg.tasks.vqa.weight = 0.50\n\n# ── wandb off ──\ntrain_cfg.wandb.enabled = False\n\n# ── HuggingFace Hub run tracking ──\ntrain_cfg.hf_hub.enabled = True\ntrain_cfg.hf_hub.repo_id = 'hieu3636/cxr-vlm-runs' # <<< EDIT ME\ntrain_cfg.hf_hub.token_env = 'HF_TOKEN'\ntrain_cfg.hf_hub.private = True\ntrain_cfg.hf_hub.run_state_file = str(CKPT_ROOT / 'run_id.txt')\n\n# ── 4-bit QLoRA ──\nmodel_cfg.llm.load_in_8bit = False\nmodel_cfg.llm.load_in_4bit = True\n# Oracle PNU path does NOT use the CheXpert classifier module (labels come\n# from the GT csv/manifest baked into the prompt). Keep it disabled until\n# you wire the learned classifier for realistic inference.\nmodel_cfg.chexpert_classifier.enabled = False\n\nOmegaConf.save(train_cfg, PROJECT / 'configs' / 'train_config.yaml')\nOmegaConf.save(model_cfg, PROJECT / 'configs' / 'model_config.yaml')\n\nprint('--- train_cfg.data ---'); print(OmegaConf.to_yaml(train_cfg.data))\nprint('--- train_cfg.tasks ---'); print(OmegaConf.to_yaml(train_cfg.tasks))\nprint('--- train_cfg.training ---');print(OmegaConf.to_yaml(train_cfg.training))\nprint('--- train_cfg.hf_hub ---'); print(OmegaConf.to_yaml(train_cfg.hf_hub))\nprint('--- model_cfg.llm ---'); print(OmegaConf.to_yaml(model_cfg.llm))",
298
  "execution_count": null,
299
  "outputs": [],
300
  "id": "cell-cfg"
 
320
  },
321
  "outputId": "8a2ce693-94fc-4425-f62c-679614d6dab5"
322
  },
323
+ "source": "# HF_TOKEN was already loaded in cell-paths (uniformly across all platforms).\n# This cell is now just a confirmation + reminder.\nassert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing — re-run cell-paths.'\nprint('HF_TOKEN loaded ✓')",
 
 
 
 
 
 
 
 
 
 
 
 
324
  "execution_count": null,
325
+ "outputs": [],
 
 
 
 
 
 
 
 
326
  "id": "cell-hf-token"
327
  },
328
  {
 
377
  "metadata": {
378
  "id": "cell-mode-md"
379
  },
380
+ "source": "## 5b. Resume controller\n\nSingle switch. No more \"which stage\" — `train.py` auto-detects which stage to continue from by inspecting checkpoints on disk.\n\n| MODE | What happens |\n|------------|--------------|\n| `'fresh'` | Allocate a brand-new `{DATASET}_run_N+1` folder. Train both stages from scratch. |\n| `'resume'` | Reuse latest matching `{DATASET}_run_N` (or `EXPLICIT_RUN_ID`). Auto-detect from local disk: stage 1 mid-checkpoint, stage 1 done → stage 2 fresh, stage 2 mid-checkpoint, or both done. |\n\n`EXPLICIT_RUN_ID` is optional (set to `None` to auto-pick the latest run on disk or HF Hub that matches the current dataset prefix).\n\n### Fresh-VM resume\n\nIf your Colab/Kaggle VM was reset and the local `ckpt/{run_id}/` is gone (persistence lost or switching machines), the train cell will **auto-pull** the previous run's `stage{1,2}/last/` + `stage1/best/` (= stage1 final) from HF Hub into the canonical local layout before training, so `detect_resume_point` can pick up where you left off. `timing.json` is also pulled so the session-count + cumulative-time keeps incrementing.\n\n`run_id` resolution order (when `MODE='resume'`): `EXPLICIT_RUN_ID` > local `run_id.txt` > latest `{DATASET}_run_*` on HF Hub.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  "id": "cell-resume-md"
382
  },
383
  {
training/train.py CHANGED
@@ -44,7 +44,7 @@ from model.rad_dino import BioViLTEncoder
44
  from data import CXRInstructDataset, CXRDataCollator
45
  from utils.logger import setup_logger
46
  from utils.checkpoint import save_checkpoint, load_checkpoint
47
- from utils.hf_uploader import build_tracker_from_cfg, pull_last_for_resume
48
  from utils.dataset_resolver import (
49
  resolve_dataset_spec,
50
  resolve_run_id,
@@ -263,6 +263,37 @@ def get_trainer(
263
  model = self.model
264
  load_checkpoint(model, resume_from_checkpoint)
265
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
  return CXRTrainer(
267
  model = model,
268
  args = training_args,
@@ -649,6 +680,25 @@ def main():
649
  args.resume_from = local_resume
650
  logger.info(f"Will resume from pulled checkpoint: {local_resume} (stage{args.stage})")
651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  # ── Compute per-stage output dirs under {output_root}/{run_id}/ ──
653
  stage1_out = stage_dir(output_root, run_id,
654
  str(train_cfg.stage1.get("subdir", "stage1_projection")))
@@ -724,6 +774,16 @@ def main():
724
  "resumed": bool(args.resume_from),
725
  "resume_from": args.resume_from,
726
  })
 
 
 
 
 
 
 
 
 
 
727
 
728
  # Build model
729
  logger.info("Building CXR VLM...")
 
44
  from data import CXRInstructDataset, CXRDataCollator
45
  from utils.logger import setup_logger
46
  from utils.checkpoint import save_checkpoint, load_checkpoint
47
+ from utils.hf_uploader import build_tracker_from_cfg, pull_last_for_resume, hydrate_run_dir_from_hf
48
  from utils.dataset_resolver import (
49
  resolve_dataset_spec,
50
  resolve_run_id,
 
263
  model = self.model
264
  load_checkpoint(model, resume_from_checkpoint)
265
 
266
+ def _get_train_sampler(self, *args, **kwargs):
267
+ """
268
+ Use `WeightedRandomSampler` when the train dataset is mixed-task
269
+ and exposes per-sample weights — this is what makes the configured
270
+ `tasks.*.weight` ratios actually control batch composition.
271
+ Falls back to HF's default (RandomSampler / DistributedSampler)
272
+ when the dataset exposes no per-sample weights (e.g. single-task).
273
+
274
+ Notes:
275
+ * Eval is unaffected — HF's `_get_eval_sampler` returns a
276
+ `SequentialSampler` by default, so the weighted sampling only
277
+ applies to training.
278
+ * `replacement=True` is required for true oversampling — without
279
+ it you can't draw more samples of a rare-but-upweighted task
280
+ than physically exist. Tradeoff: a small fraction of samples
281
+ in a numerous-but-downweighted task may never appear in a
282
+ given epoch. Acceptable across multiple epochs.
283
+ """
284
+ ds = self.train_dataset
285
+ getter = getattr(ds, "get_per_sample_weights", None)
286
+ if getter is not None:
287
+ weights = getter()
288
+ if weights is not None:
289
+ from torch.utils.data import WeightedRandomSampler
290
+ return WeightedRandomSampler(
291
+ weights = weights,
292
+ num_samples = len(ds),
293
+ replacement = True,
294
+ )
295
+ return super()._get_train_sampler(*args, **kwargs)
296
+
297
  return CXRTrainer(
298
  model = model,
299
  args = training_args,
 
680
  args.resume_from = local_resume
681
  logger.info(f"Will resume from pulled checkpoint: {local_resume} (stage{args.stage})")
682
 
683
+ # ── Fresh-VM resume: hydrate from HF before detect_resume_point ──
684
+ # When `--mode resume` is set but the local run dir is empty (Colab
685
+ # persistence lost, switching machines), pull configs + last/best
686
+ # checkpoints from HF Hub into the canonical local layout so the
687
+ # detector finds them. No-op if local already has artifacts or HF
688
+ # tracking is disabled.
689
+ if args.mode == "resume" and hf_repo_id and hf_token:
690
+ try:
691
+ hydrate_run_dir_from_hf(
692
+ repo_id = hf_repo_id,
693
+ token = hf_token,
694
+ run_id = run_id,
695
+ output_root = output_root,
696
+ stage1_subdir = str(train_cfg.stage1.get("subdir", "stage1_projection")),
697
+ stage2_subdir = str(train_cfg.stage2.get("subdir", "stage2_instruct")),
698
+ )
699
+ except Exception as e:
700
+ logger.warning(f"[resume hydrate] {type(e).__name__}: {e}")
701
+
702
  # ── Compute per-stage output dirs under {output_root}/{run_id}/ ──
703
  stage1_out = stage_dir(output_root, run_id,
704
  str(train_cfg.stage1.get("subdir", "stage1_projection")))
 
774
  "resumed": bool(args.resume_from),
775
  "resume_from": args.resume_from,
776
  })
777
+ # Snapshot the resolved config + run_meta.json to HF so the run is
778
+ # self-describing on the hub (you can answer "what config did
779
+ # {run_id} actually use?" without pulling the whole checkpoint).
780
+ # `save_run_config` writes these into {run_dir}/configs/ +
781
+ # {run_dir}/run_meta.json a few lines above.
782
+ rd = run_dir(output_root, run_id)
783
+ if (rd / "configs").is_dir():
784
+ tracker.upload_folder(str(rd / "configs"), "configs")
785
+ if (rd / "run_meta.json").is_file():
786
+ tracker.upload_file(str(rd / "run_meta.json"), "run_meta.json")
787
 
788
  # Build model
789
  logger.info("Building CXR VLM...")
data/upload_to_hf_2.py → upload_to_hf_2.py RENAMED
File without changes
utils/dataset_resolver.py CHANGED
@@ -25,7 +25,7 @@ from pathlib import Path
25
  from typing import Dict, List, Optional
26
 
27
 
28
- SUPPORTED_DATASETS = ("MIMIC-CXR", "IU-Xray")
29
 
30
 
31
  @dataclass
@@ -113,6 +113,21 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
113
  train_cfg.data, report_mode, image_mode
114
  )
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  else: # IU-Xray
117
  # IU has no VQA.
118
  available = ["report"] if report_mode == "merged" else ["findings", "impression"]
@@ -233,6 +248,63 @@ def _ensure_mimic_json_exists(data_cfg,
233
  return str(out)
234
 
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
237
 
238
  def resolve_run_id(
 
25
  from typing import Dict, List, Optional
26
 
27
 
28
+ SUPPORTED_DATASETS = ("MIMIC-CXR", "MIMIC-CXR_resized", "IU-Xray")
29
 
30
 
31
  @dataclass
 
113
  train_cfg.data, report_mode, image_mode
114
  )
115
 
116
+ elif name == "MIMIC-CXR_resized":
117
+ # Same semantic dataset as MIMIC-CXR (all 3 tasks), but manifest-driven:
118
+ # splits and CheXpert labels come from manifest_{train,val,test}.csv
119
+ # (no mimic-cxr-2.0.0-split.csv, no pre-split dir structure). Built by
120
+ # the dedicated data.mimic_cxr_resized_builder, not the MIMIC-CXR one.
121
+ if report_mode == "merged":
122
+ available = ["report", "vqa"]
123
+ else:
124
+ available = ["findings", "impression", "vqa"]
125
+ mr = train_cfg.data.mimic_cxr_resized
126
+ image_root = mr.root
127
+ instruct_json = _ensure_mimic_resized_json_exists(
128
+ mr, report_mode, image_mode
129
+ )
130
+
131
  else: # IU-Xray
132
  # IU has no VQA.
133
  available = ["report"] if report_mode == "merged" else ["findings", "impression"]
 
248
  return str(out)
249
 
250
 
251
+ def _ensure_mimic_resized_json_exists(mr_cfg,
252
+ report_mode: str = "split",
253
+ image_mode: str = "all_views_split") -> str:
254
+ """
255
+ Build the MIMIC-CXR_resized unified JSON if missing.
256
+
257
+ This dataset is **manifest-driven**, not directory-walking:
258
+ - 3 manifest CSVs (manifest_{train,val,test}.csv) carry every row's
259
+ split label, image/report relative path, and the 14 CheXpert
260
+ labels as chex_* columns. No separate *split*.csv or *chexpert*.csv
261
+ is read.
262
+ - VQA is read from `vqa_dir/{vqa.json, vqa_val.json, vqa_test.json}`.
263
+
264
+ The cache path is suffixed with report_mode+image_mode (same convention
265
+ as the other two builders) so each mode combination gets its own cache.
266
+ """
267
+ base = Path(_get(mr_cfg, "instruct_json",
268
+ "data/data_files/mimic_cxr_resized_instruct.json"))
269
+ out = base.with_name(f"{base.stem}__{report_mode}__{image_mode}{base.suffix}")
270
+ if out.is_file():
271
+ return str(out)
272
+
273
+ if not bool(_get(mr_cfg, "auto_build", True)):
274
+ raise FileNotFoundError(
275
+ f"MIMIC-CXR_resized instruct JSON not found at {out} and "
276
+ f"auto_build=false. Run: python -m data.mimic_cxr_resized_builder "
277
+ f"--root {_get(mr_cfg, 'root')} --output {out} "
278
+ f"--report_mode {report_mode} --image_mode {image_mode}"
279
+ )
280
+
281
+ from data.mimic_cxr_resized_builder import build_mimic_cxr_resized_instruct_json
282
+ print(f"[dataset_resolver] MIMIC-CXR_resized JSON not found → auto-building "
283
+ f"(report_mode={report_mode}, image_mode={image_mode}) …")
284
+ root_path = str(_get(mr_cfg, "root"))
285
+ # Convention defaults: manifest CSVs sit at `root`, VQA at `{root}/vqa`.
286
+ # Either can be overridden in config; an explicit empty string for
287
+ # vqa_dir disables VQA entirely.
288
+ manifest_dir = _get(mr_cfg, "manifest_dir") or root_path
289
+ vqa_dir_cfg = _get(mr_cfg, "vqa_dir")
290
+ if vqa_dir_cfg is None:
291
+ vqa_dir = str(Path(root_path) / "vqa")
292
+ elif vqa_dir_cfg == "":
293
+ vqa_dir = None # explicit opt-out
294
+ else:
295
+ vqa_dir = str(vqa_dir_cfg)
296
+ build_mimic_cxr_resized_instruct_json(
297
+ root = root_path,
298
+ manifest_dir = manifest_dir,
299
+ output_path = str(out),
300
+ vqa_dir = vqa_dir,
301
+ reports_root = _get(mr_cfg, "reports_root"),
302
+ report_mode = report_mode,
303
+ image_mode = image_mode,
304
+ )
305
+ return str(out)
306
+
307
+
308
  # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
309
 
310
  def resolve_run_id(
utils/hf_uploader.py CHANGED
@@ -288,6 +288,157 @@ def pull_last_for_resume(
288
  return str(last_dir)
289
 
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  def build_tracker_from_cfg(train_cfg, resuming: bool = False, explicit_run_id: Optional[str] = None):
292
  """Convenience factory from OmegaConf DictConfig."""
293
  hf = getattr(train_cfg, "hf_hub", None)
 
288
  return str(last_dir)
289
 
290
 
291
+ def hydrate_run_dir_from_hf(
292
+ repo_id: str,
293
+ token: Optional[str],
294
+ run_id: str,
295
+ output_root: str,
296
+ stage1_subdir: str = "stage1_projection",
297
+ stage2_subdir: str = "stage2_instruct",
298
+ ) -> bool:
299
+ """
300
+ Repopulate a local run dir from HF artifacts so `detect_resume_point`
301
+ can find checkpoints after a fresh-VM resume (persistence lost / new host).
302
+
303
+ HF layout (uploaded by HFBestLastCallback + end-of-stage saves):
304
+ {run_id}/configs/ (YAML snapshots)
305
+ {run_id}/run_meta.json
306
+ {run_id}/timing.json
307
+ {run_id}/stage1/last/ + stage1/best/ (best/ = stage1 final, renamed `checkpoint_*`)
308
+ {run_id}/stage2/last/ + stage2/best/
309
+
310
+ Local layout `detect_resume_point` expects:
311
+ {output_root}/{run_id}/stage1_projection/stage1_final_* ← stage1 done
312
+ {output_root}/{run_id}/stage1_projection/checkpoint-N/... ← stage1 mid
313
+ {output_root}/{run_id}/stage2_instruct/stage2_final_* ← stage2 done
314
+ {output_root}/{run_id}/stage2_instruct/checkpoint-N/... ← stage2 mid
315
+
316
+ Mapping rules:
317
+ * `stage2/last/` → `stage2_instruct/checkpoint-1/` (placeholder N=1;
318
+ Trainer reads the real global_step from trainer_state.json inside).
319
+ * `stage1/best/` → `stage1_projection/stage1_final_*` (rename files
320
+ from `checkpoint_*` to `stage1_final_*` so save_checkpoint conventions
321
+ line up with what the rest of the pipeline expects).
322
+ * `stage1/last/` → `stage1_projection/checkpoint-1/` (only if no
323
+ stage1_final placed — i.e. stage 1 hadn't finished yet on HF).
324
+
325
+ Returns True if at least one artifact was placed, False otherwise.
326
+ """
327
+ if not HF_AVAILABLE:
328
+ print("[hydrate_run_dir_from_hf] huggingface_hub not installed — skip")
329
+ return False
330
+ from huggingface_hub import snapshot_download
331
+ import shutil
332
+
333
+ token = token or os.environ.get("HF_TOKEN")
334
+ output_root = Path(output_root)
335
+ staging = output_root / "_hf_pull"
336
+ dst_root = output_root / run_id
337
+
338
+ # Skip if local already has any final/checkpoint — we're not on a fresh VM.
339
+ s1_local = dst_root / stage1_subdir
340
+ s2_local = dst_root / stage2_subdir
341
+ def _has_ckpt(d: Path) -> bool:
342
+ return d.is_dir() and any(d.glob("checkpoint-*"))
343
+ if (
344
+ (s1_local / "stage1_final_projection.pt").exists()
345
+ or (s2_local / "stage2_final_projection.pt").exists()
346
+ or _has_ckpt(s1_local)
347
+ or _has_ckpt(s2_local)
348
+ ):
349
+ print(f"[hydrate_run_dir_from_hf] local {dst_root} already populated — skip pull")
350
+ return False
351
+
352
+ # Pull the run's relevant files (configs + meta + last/best, skip
353
+ # training_log.jsonl which can be large).
354
+ staging.mkdir(parents=True, exist_ok=True)
355
+ try:
356
+ snapshot_download(
357
+ repo_id = repo_id,
358
+ repo_type = "model",
359
+ token = token,
360
+ allow_patterns = [
361
+ f"{run_id}/configs/**",
362
+ f"{run_id}/run_meta.json",
363
+ f"{run_id}/timing.json",
364
+ f"{run_id}/meta.json",
365
+ f"{run_id}/stage1/last/**",
366
+ f"{run_id}/stage1/best/**",
367
+ f"{run_id}/stage2/last/**",
368
+ f"{run_id}/stage2/best/**",
369
+ ],
370
+ local_dir = str(staging),
371
+ )
372
+ except Exception as e:
373
+ print(f"[hydrate_run_dir_from_hf] snapshot_download failed: {e}")
374
+ return False
375
+
376
+ src_root = staging / run_id
377
+ if not src_root.is_dir():
378
+ print(f"[hydrate_run_dir_from_hf] HF has no '{run_id}/' folder")
379
+ shutil.rmtree(staging, ignore_errors=True)
380
+ return False
381
+
382
+ dst_root.mkdir(parents=True, exist_ok=True)
383
+ placed_any = False
384
+
385
+ # configs/, run_meta.json, timing.json, meta.json: straight copy
386
+ for sub in ("configs",):
387
+ s = src_root / sub
388
+ if s.is_dir():
389
+ shutil.copytree(s, dst_root / sub, dirs_exist_ok=True)
390
+ placed_any = True
391
+ for f in ("run_meta.json", "timing.json", "meta.json"):
392
+ s = src_root / f
393
+ if s.is_file():
394
+ shutil.copy2(s, dst_root / f)
395
+ placed_any = True
396
+
397
+ # Stage 2 last → checkpoint-1
398
+ s2_last_src = src_root / "stage2" / "last"
399
+ if s2_last_src.is_dir() and any(s2_last_src.iterdir()):
400
+ dst = dst_root / stage2_subdir / "checkpoint-1"
401
+ dst.mkdir(parents=True, exist_ok=True)
402
+ shutil.copytree(s2_last_src, dst, dirs_exist_ok=True)
403
+ placed_any = True
404
+ print(f"[hydrate_run_dir_from_hf] stage2 mid-resume placed at {dst}")
405
+
406
+ # Stage 1 best (= final) → stage1_final_*
407
+ s1_best_src = src_root / "stage1" / "best"
408
+ if s1_best_src.is_dir() and (s1_best_src / "checkpoint_projection.pt").exists():
409
+ dst_s1 = dst_root / stage1_subdir
410
+ dst_s1.mkdir(parents=True, exist_ok=True)
411
+ for entry in s1_best_src.iterdir():
412
+ # Rename "checkpoint_*" → "stage1_final_*"
413
+ new_name = entry.name.replace("checkpoint_", "stage1_final_", 1) \
414
+ if entry.name.startswith("checkpoint_") else entry.name
415
+ if entry.is_file():
416
+ shutil.copy2(entry, dst_s1 / new_name)
417
+ elif entry.is_dir():
418
+ shutil.copytree(entry, dst_s1 / new_name, dirs_exist_ok=True)
419
+ placed_any = True
420
+ print(f"[hydrate_run_dir_from_hf] stage1 final placed at {dst_s1}")
421
+
422
+ # Stage 1 last → checkpoint-1 (ONLY if stage1 didn't finish yet)
423
+ if not (dst_root / stage1_subdir / "stage1_final_projection.pt").exists():
424
+ s1_last_src = src_root / "stage1" / "last"
425
+ if s1_last_src.is_dir() and any(s1_last_src.iterdir()):
426
+ dst = dst_root / stage1_subdir / "checkpoint-1"
427
+ dst.mkdir(parents=True, exist_ok=True)
428
+ shutil.copytree(s1_last_src, dst, dirs_exist_ok=True)
429
+ placed_any = True
430
+ print(f"[hydrate_run_dir_from_hf] stage1 mid-resume placed at {dst}")
431
+
432
+ # Cleanup staging
433
+ shutil.rmtree(staging, ignore_errors=True)
434
+
435
+ if placed_any:
436
+ print(f"[hydrate_run_dir_from_hf] hydrated {dst_root} from HF")
437
+ else:
438
+ print(f"[hydrate_run_dir_from_hf] nothing usable on HF for {run_id}")
439
+ return placed_any
440
+
441
+
442
  def build_tracker_from_cfg(train_cfg, resuming: bool = False, explicit_run_id: Optional[str] = None):
443
  """Convenience factory from OmegaConf DictConfig."""
444
  hf = getattr(train_cfg, "hf_hub", None)