f

Files changed (13) hide show

configs/train_config.yaml +43 -4
data/count_img.py → count_img.py +75 -75
data/dataset.py +47 -0
data/mimic_cxr_builder.py +227 -81
data/mimic_cxr_resized_builder.py +375 -0
data/distri-IU-Xray.py → distri-IU-Xray.py +11 -11
data/img_stat.py → img_stat.py +0 -0
data/rezip.py → rezip.py +11 -11
scripts/cxrvlm_colab_train.ipynb +19 -327
training/train.py +61 -1
data/upload_to_hf_2.py → upload_to_hf_2.py +0 -0
utils/dataset_resolver.py +73 -1
utils/hf_uploader.py +151 -0

configs/train_config.yaml CHANGED Viewed

@@ -5,7 +5,15 @@
 # ── Data ─────────────────────────────────────
 data:
   # Pick which dataset to train on.
-  # Supported: "MIMIC-CXR" (all 3 tasks) | "IU-Xray" (findings + impression only)
   dataset_name: "IU-Xray"
   # How findings and impression are turned into training samples.
@@ -74,6 +82,36 @@ data:
   # pre-built file (built via `python -m data.mimic_cxr_builder ...`).
   mimic_auto_build:       true
   # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
   # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
   # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
@@ -102,16 +140,17 @@ data:
 tasks:
   findings_generation:
     enabled: true
-    weight: 0.4                        # used when report_mode = split
   impression_generation:
     enabled: true
-    weight: 0.2                        # used when report_mode = split
   report_generation:
     enabled: true
     weight: 0.6                        # used when report_mode = merged
   vqa:
     enabled: true
-    weight: 0.4
 # ── Training ─────────────────────────────────
 training:

 # ── Data ─────────────────────────────────────
 data:
   # Pick which dataset to train on.
+  # Supported:
+  #   "MIMIC-CXR"          — pre-split layout {root}/{train,valid,test}/pXX/...
+  #                          all 3 tasks (findings, impression, vqa)
+  #   "MIMIC-CXR_resized"  — same data filtered+resized to tar shards on HF
+  #                          (hieu3636/cxr-vlm-data/MIMIC-CXR_resized/). After
+  #                          extraction the layout matches the raw PhysioNet
+  #                          tree ({root}/files/pXX/pXXXX/sYYYY/*.jpg); splits
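+  #                          come from manifest_{train,val,test}.csv (the
+  #                          official mimic-cxr-2.0.0-split.csv is not used;
+  #                          see the mimic_cxr_resized section). All 3 tasks.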
+  #   "IU-Xray"            — findings + impression only (no VQA)
   dataset_name: "IU-Xray"
   # How findings and impression are turned into training samples.
   # pre-built file (built via `python -m data.mimic_cxr_builder ...`).
   mimic_auto_build:       true
+  # --- MIMIC-CXR_resized paths (used when dataset_name == "MIMIC-CXR_resized") ---
+  # Filtered + resized subset of MIMIC-CXR distributed via HF as tar shards
+  # (hieu3636/cxr-vlm-data/MIMIC-CXR_resized/) + a "subset_bundle" with the
+  # manifest CSVs and VQA JSON files. This dataset is MANIFEST-DRIVEN:
+  #
+  #   manifest_{train,val,test}.csv  — one row per image. Contains the split
+  #       label, image_relpath, report_relpath, has_vqa, and 14 chex_*
+  #       columns (the CheXpert labels). The val/test pool was redistributed
+  #       from the original train split (subset is small), so the official
+  #       PhysioNet mimic-cxr-2.0.0-split.csv is NOT used.
+  #   vqa/{vqa.json, vqa_val.json, vqa_test.json}  — VQA pairs filtered to
+  #       only the images present in this resized subset.
+  #
+  # After extracting the tar shards, the on-disk layout (under `root`) is:
+  #     {root}/files/pXX/pXXXXXXXX/sYYYYYYYY/<dicom>.jpg
+  #     {root}/files/pXX/pXXXXXXXX/sYYYYYYYY.txt        (reports alongside)
+  mimic_cxr_resized:
+    root:           "D:/USTH/KLTN/subset_bundle"   # extracted-tar root (parent of files/)
+    manifest_dir:   null   # null → same as `root`. Folder containing
+                           # manifest_{train,val,test}.csv.
+    vqa_dir:        null   # null → use `{root}/vqa`. Folder containing
+                           # vqa.json / vqa_val.json / vqa_test.json. Set
+                           # to "" to disable VQA.
+    reports_root:   null   # null → auto-probe `{root}` then `{root}/reports`.
+                           # Set explicitly if reports live somewhere else
+                           # (e.g. when reports are bundled inside tars vs.
+                           # a sibling `reports/` dir like subset_bundle/).
+    instruct_json:  "data/data_files/mimic_cxr_resized_instruct.json"
+    auto_build:     true   # build JSON automatically if missing
   # --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
   # On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
   # On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
 tasks:
   findings_generation:
     enabled: true
+    weight: 0.30                       # used when report_mode = split
   impression_generation:
     enabled: true
+    weight: 0.20                       # used when report_mode = split (lower:
+                                       # impression is conditioned on findings)
   report_generation:
     enabled: true
     weight: 0.6                        # used when report_mode = merged
   vqa:
     enabled: true
+    weight: 0.50                       # boosted so VQA ≈ RRG (findings+impression)
 # ── Training ─────────────────────────────────
 training:

data/count_img.py → count_img.py RENAMED Viewed

@@ -1,76 +1,76 @@
-import os
-import json
-def get_local_images(root_dir):
-    """
-    Lấy toàn bộ đường dẫn ảnh local (dạng pxx/...)
-    """
-    local_images = set()
-    for p_folder in os.listdir(root_dir):
-        if not p_folder.startswith("p1"):  # chỉ p10 -> p19
-            continue
-        p_path = os.path.join(root_dir, p_folder)
-        for root, _, files in os.walk(p_path):
-            for file in files:
-                if file.endswith(".jpg"):
-                    full_path = os.path.join(root, file)
-                    # convert về dạng giống VQA: p10/.../xxx.jpg
-                    rel_path = os.path.relpath(full_path, root_dir)
-                    rel_path = rel_path.replace("\\", "/")
-                    local_images.add(rel_path)
-    return local_images
-def get_vqa_images(vqa_json_path):
-    """
-    Lấy toàn bộ image_path từ file VQA json
-    """
-    with open(vqa_json_path, "r", encoding="utf-8") as f:
-        data = json.load(f)
-    vqa_images = set()
-    for item in data:
-        if "image_path" in item:
-            vqa_images.add(item["image_path"])
-    return vqa_images
-def main(root_dir, vqa_json_path):
-    print("Đang quét ảnh local...")
-    local_images = get_local_images(root_dir)
-    print(f"Số ảnh local: {len(local_images)}")
-    print("Đang đọc VQA json...")
-    vqa_images = get_vqa_images(vqa_json_path)
-    print(f"Số ảnh trong VQA: {len(vqa_images)}")
-    # intersection
-    matched = local_images & vqa_images
-    print("\n===== KẾT QUẢ =====")
-    print(f"Số ảnh trùng: {len(matched)}")
-    print(f"Tỷ lệ cover VQA: {len(matched) / len(vqa_images):.4f}")
-    # nếu muốn lưu danh sách
-    with open("matched_images.txt", "w") as f:
-        for path in matched:
-            f.write(path + "\n")
-    print("Đã lưu danh sách vào matched_images.txt")
-if __name__ == "__main__":
-    x = "train"
-    y = "valid"
-    root_dir = r"D:\USTH\KLTN\data\{x}".format(x=x)     # ví dụ: D:/mimic-cxr
-    vqa_json = r"D:\USTH\KLTN\data\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\MIMIC-Ext-MIMIC-CXR-VQA\dataset\{y}.json".format(y=y)         # ví dụ: D:/vqa/train.json
     main(root_dir, vqa_json)

+import os
+import json
+def get_local_images(root_dir):
+    """
+    Collect the relative paths of all local .jpg images (form: pXX/...).
+
+    Only the entries directly under `root_dir` whose name starts with "p1"
+    are walked (recursively). Each image path is made relative to
+    `root_dir` and normalized to forward slashes.
+
+    Returns:
+        set of relative path strings, e.g. "p10/.../xxx.jpg".
+    """
+    local_images = set()
+    for p_folder in os.listdir(root_dir):
+        # Intended to keep only p10 -> p19. NOTE(review): the prefix test
+        # would also accept names such as "p1" or "p100" if they exist.
+        if not p_folder.startswith("p1"):
+            continue
+        p_path = os.path.join(root_dir, p_folder)
+        for root, _, files in os.walk(p_path):
+            for file in files:
+                # Case-sensitive match: ".JPG" / ".jpeg" files are ignored.
+                if file.endswith(".jpg"):
+                    full_path = os.path.join(root, file)
+                    # Convert to the same form as the VQA paths: p10/.../xxx.jpg
+                    rel_path = os.path.relpath(full_path, root_dir)
+                    # Windows separators -> "/" so paths compare equal to the JSON.
+                    rel_path = rel_path.replace("\\", "/")
+                    local_images.add(rel_path)
+    return local_images
+def get_vqa_images(vqa_json_path):
+    """
+    Collect every distinct `image_path` value from a VQA json file.
+
+    The file is read as UTF-8 and iterated item by item; items without an
+    "image_path" key are skipped.
+    (Assumes the JSON top level is a list of dicts — TODO confirm.)
+
+    Returns:
+        set of image_path values.
+    """
+    with open(vqa_json_path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    vqa_images = set()
+    for item in data:
+        if "image_path" in item:
+            vqa_images.add(item["image_path"])
+    return vqa_images
+def main(root_dir, vqa_json_path):
+    print("Đang quét ảnh local...")
+    local_images = get_local_images(root_dir)
+    print(f"Số ảnh local: {len(local_images)}")
+    print("Đang đọc VQA json...")
+    vqa_images = get_vqa_images(vqa_json_path)
+    print(f"Số ảnh trong VQA: {len(vqa_images)}")
+    # intersection
+    matched = local_images & vqa_images
+    print("\n===== KẾT QUẢ =====")
+    print(f"Số ảnh trùng: {len(matched)}")
+    print(f"Tỷ lệ cover VQA: {len(matched) / len(vqa_images):.4f}")
+    # nếu muốn lưu danh sách
+    with open("matched_images.txt", "w") as f:
+        for path in matched:
+            f.write(path + "\n")
+    print("Đã lưu danh sách vào matched_images.txt")
+if __name__ == "__main__":
+    # Split names substituted into the hard-coded local paths below.
+    # NOTE(review): images are scanned from the "train" folder while the VQA
+    # file is "valid" — confirm this pairing is intended.
+    x = "train"
+    y = "valid"
+    root_dir = r"D:\USTH\KLTN\data\{x}".format(x=x)     # e.g. D:/mimic-cxr
+    vqa_json = r"D:\USTH\KLTN\data\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\MIMIC-Ext-MIMIC-CXR-VQA\dataset\{y}.json".format(y=y)         # e.g. D:/vqa/train.json
     main(root_dir, vqa_json)

data/dataset.py CHANGED Viewed

@@ -161,6 +161,53 @@ class CXRInstructDataset(Dataset):
     def __len__(self) -> int:
         return len(self.samples)
     def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
         sample = self.samples[idx]

     def __len__(self) -> int:
         return len(self.samples)
+    def get_per_sample_weights(self) -> Optional[List[float]]:
+        """
+        Build per-sample weights for `torch.utils.data.WeightedRandomSampler`
+        so that, in expectation, each task occupies its configured fraction of
+        drawn training samples — regardless of how many samples of each task
+        exist in the JSON.
+
+        Math:
+            For task t with N_t samples in the JSON and configured weight w_t,
+            every sample of t gets the weight `w_t / N_t`, so the total weight
+            of task t is `N_t * (w_t / N_t) = w_t`. The draw probability of a
+            task is therefore proportional to w_t; it equals w_t exactly only
+            when the weights of the tasks present in this dataset sum to 1
+            (assuming the sampler normalizes weights — see the logged mix).
+
+        Tasks with weight 0 (e.g. VQA on IU-Xray) get weight 0 → never drawn.
+        Tasks present in the JSON but absent from `self.task_weights` also get
+        weight 0. No error is raised for them: the printed task mix (where
+        such tasks show up as 0.0) is the only signal of a misconfiguration.
+
+        Returns:
+            list of floats of length len(self.samples), or None if this is a
+            single-task dataset (`self.task != "mixed"`) — in that case every
+            sample is the same task, so weighted sampling is unnecessary and
+            the default uniform `RandomSampler` is correct.
+        """
+        if self.task != "mixed":
+            return None
+        # Count samples per task that actually appear in this dataset.
+        counts: Dict[str, int] = {}
+        for s in self.samples:
+            counts[s["task"]] = counts.get(s["task"], 0) + 1
+        # Per-sample weight = w_task / N_task. Tasks not in task_weights → 0.
+        # counts[...] is always >= 1 here because it was built from the same
+        # samples, so the division cannot hit zero.
+        weights = [
+            float(self.task_weights.get(s["task"], 0.0)) / counts[s["task"]]
+            for s in self.samples
+        ]
+        # Sanity: print effective per-task probabilities once so the actual
+        # mix during training is visible in logs (helps catch misconfigured
+        # weights vs. JSON-task-set mismatch).
+        eff = {t: float(self.task_weights.get(t, 0.0)) for t in counts}
+        # `or 1.0` keeps the normalization below safe when every present task
+        # has weight 0 (the mix is then reported as all zeros).
+        eff_sum = sum(eff.values()) or 1.0
+        # Renormalize over the tasks that are actually present, rounded for
+        # logging only — the returned weights are not rounded or normalized.
+        eff = {t: round(v / eff_sum, 4) for t, v in eff.items()}
+        print(f"[CXRInstructDataset] WeightedRandomSampler effective task mix: "
+              f"{eff}  (counts: {counts})")
+        return weights
     def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
         sample = self.samples[idx]

data/mimic_cxr_builder.py CHANGED Viewed

@@ -94,6 +94,45 @@ def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optiona
     return None
 def _load_chexpert_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
     """
     Return {(subject_id, study_id): <PNU string>} where the ids are the bare
@@ -150,6 +189,9 @@ def build_mimic_cxr_instruct_json(
     vqa_root:     Optional[str] = None,
     report_mode:  str = "split",                  # "split" | "merged" | "split_cascade"
     image_mode:   str = "all_views_split",        # "all_views_split" | "frontal_only_split" | "multi_image_merged"
 ) -> str:
     """
     Build the unified MIMIC-CXR instruction JSON.
@@ -170,30 +212,70 @@ def build_mimic_cxr_instruct_json(
     of the study — this MIMIC layout has no metadata.csv to read ViewPosition
     from. Swap in a ViewPosition lookup if you add that CSV.
     Returns the absolute output path.
     """
     assert report_mode in ("split", "merged", "split_cascade"), \
         f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
     assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
         f"image_mode invalid: {image_mode!r}"
     from .dataset import format_merged_report   # local import to avoid cycle
     mimic_root  = Path(mimic_root)
     output_path = Path(output_path)
-    # split dir name → split label written into the JSON
-    split_dirs = {
-        "train": "train",
-        "valid": "validate",
-        "test":  "test",
-    }
-    present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
-    if not present:
-        raise FileNotFoundError(
-            f"No train/valid/test subdirs under {mimic_root}. "
-            f"Expected the pre-split MIMIC-CXR layout."
-        )
     # ── CheXpert labels ───────────────────────────────────────────────────
     csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
@@ -211,21 +293,15 @@ def build_mimic_cxr_instruct_json(
     # ── Pass 1: index studies ─────────────────────────────────────────────
     samples: List[Dict] = []
     # sub_rel ("pXX/pXXXX/sYYYY/img.jpg") → full stored image_path
-    # ("{split}/pXX/pXXXX/sYYYY/img.jpg"). O(1) VQA lookup.
     image_index: Dict[str, str] = {}
-    n_studies = n_missing_report = n_no_chexpert = 0
     skipped_merged_no_impression = skipped_cascade_no_findings = 0
     def _structured_for(subj: str, study: str) -> Optional[str]:
         return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
-    def _rels_for(study_dir: Path, split_sub: str, subj: str, study: str) -> List[str]:
-        """Split-prefixed relative image paths for one study, sorted."""
-        return [
-            f"{split_sub}/{im.parent.parent.parent.name}/{subj}/{study}/{im.name}"
-            for im in sorted(study_dir.glob("*.jpg"))
-        ]
     def _image_groups(rels: List[str]):
         """Yield path_fields dicts honouring image_mode (same rules as IU)."""
         if image_mode == "all_views_split":
@@ -236,67 +312,120 @@ def build_mimic_cxr_instruct_json(
         else:  # multi_image_merged
             yield {"image_path": None, "image_paths": rels}
-    for split_sub, split_dir in present.items():
-        for p_dir in sorted(split_dir.glob("p*")):
-            for pat_dir in p_dir.glob("p*"):
-                for study_dir in pat_dir.glob("s*"):
-                    subj, study = pat_dir.name, study_dir.name
-                    rels = _rels_for(study_dir, split_sub, subj, study)
-                    if not rels:
-                        continue
-                    n_studies += 1
-                    # Index EVERY image up front — a VQA row may reference a
-                    # study that has images but no findings/impression report.
-                    for r in rels:
-                        image_index[r.split("/", 1)[1]] = r
-                    txts = list(study_dir.glob("*.txt"))
-                    if not txts:
-                        n_missing_report += 1
-                        continue
-                    findings, impression = _parse_report(txts[0])
-                    structured = _structured_for(subj, study)
-                    if structured is None:
-                        n_no_chexpert += 1
-                    split_label = split_dirs[split_sub]
-                    for path_fields in _image_groups(rels):
-                        base = {
-                            **path_fields,
-                            "question":   None,
-                            "split":      split_label,
-                            "study_id":   study,
-                            "subject_id": subj,
-                        }
-                        if report_mode == "merged":
-                            target = format_merged_report(findings, impression)
-                            if target is None:
-                                skipped_merged_no_impression += 1
                                 continue
-                            samples.append({**base, "task": "report",
-                                             "target": target,
-                                             "structured_findings": structured})
-                        elif report_mode == "split_cascade":
-                            if findings:
-                                samples.append({**base, "task": "findings",
-                                                 "target": findings,
-                                                 "structured_findings": structured})
-                            if impression:
-                                if not findings:
-                                    skipped_cascade_no_findings += 1
-                                else:
-                                    samples.append({**base, "task": "impression",
-                                                     "target": impression,
-                                                     "structured_findings":
-                                                         f"Findings: {findings}"})
-                        else:  # "split"
-                            if findings:
-                                samples.append({**base, "task": "findings",
-                                                 "target": findings,
-                                                 "structured_findings": structured})
-                            if impression:
-                                samples.append({**base, "task": "impression",
-                                                 "target": impression,
-                                                 "structured_findings": structured})
     # ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
     n_vqa = n_vqa_dropped = 0
@@ -346,11 +475,14 @@ def build_mimic_cxr_instruct_json(
         by_task[s["task"]]   = by_task.get(s["task"], 0) + 1
     print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
     print(f"  report_mode      : {report_mode}")
     print(f"  image_mode       : {image_mode}")
     print(f"  studies indexed  : {n_studies:,}")
     print(f"  missing report   : {n_missing_report:,}")
     print(f"  studies w/o chexpert label : {n_no_chexpert:,}")
     if report_mode == "merged":
         print(f"  skipped no_impr  : {skipped_merged_no_impression:,}")
     if report_mode == "split_cascade":
@@ -378,6 +510,17 @@ def _parse_args():
                    choices=["split", "merged", "split_cascade"])
     p.add_argument("--image_mode",   default="all_views_split",
                    choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
     return p.parse_args()
@@ -390,4 +533,7 @@ if __name__ == "__main__":
         vqa_root     = a.vqa_root,
         report_mode  = a.report_mode,
         image_mode   = a.image_mode,
     )

     return None
+def _discover_split_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
+    """Locate mimic-cxr-2.0.0-split.csv (or any *split*.csv) under `mimic_root`.
+    Used by the "files" layout to assign train/validate/test per study.
+
+    Args:
+        mimic_root: directory searched recursively when `explicit` is falsy.
+        explicit:   optional path to the split CSV. When given it is the only
+                    candidate: if it is not an existing file the function
+                    returns None and does NOT fall back to auto-discovery.
+
+    Returns:
+        Path to the CSV, or None when nothing was found. With auto-discovery
+        the first match in sorted order wins if several files match.
+    """
+    if explicit:
+        p = Path(explicit)
+        return p if p.is_file() else None
+    # Recursive search; sorted so the pick is deterministic across runs.
+    hits = sorted(glob.glob(str(mimic_root / "**" / "*split*.csv"), recursive=True))
+    return Path(hits[0]) if hits else None
+def _load_split_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
+    """
+    Return {(subject_id, study_id): "train"|"validate"|"test"} from
+    mimic-cxr-2.0.0-split.csv. IDs stored without the p/s prefix to match
+    the chexpert map convention. Tolerates 'valid' as alias for 'validate'.
+
+    Column names are matched case-insensitively after stripping whitespace.
+    Rows whose split value is not one of train/validate/test (after the
+    alias mapping) are skipped. If the same (subject_id, study_id) pair
+    appears on several rows, the last row wins.
+
+    Raises:
+        ValueError: if the CSV lacks a subject_id, study_id or split column.
+    """
+    out: Dict[Tuple[str, str], str] = {}
+    with open(csv_path, newline="") as f:
+        reader = csv.DictReader(f)
+        # Lower-cased/stripped header name → header name as written in the file.
+        col = {c.lower().strip(): c for c in reader.fieldnames or []}
+        subj_c  = col.get("subject_id")
+        study_c = col.get("study_id")
+        split_c = col.get("split")
+        if not (subj_c and study_c and split_c):
+            raise ValueError(
+                f"{csv_path} missing subject_id/study_id/split columns "
+                f"(have: {reader.fieldnames})"
+            )
+        for row in reader:
+            # Normalize IDs: drop surrounding whitespace, any leading "p"/"s"
+            # prefix, and anything after a "." (e.g. "123.0" → "123").
+            subj  = str(row[subj_c]).strip().lstrip("p").split(".")[0]
+            study = str(row[study_c]).strip().lstrip("s").split(".")[0]
+            sp    = str(row[split_c]).strip().lower()
+            if sp == "valid":
+                sp = "validate"
+            # Unknown split labels are dropped rather than stored.
+            if sp in ("train", "validate", "test"):
+                out[(subj, study)] = sp
+    return out
 def _load_chexpert_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
     """
     Return {(subject_id, study_id): <PNU string>} where the ids are the bare
     vqa_root:     Optional[str] = None,
     report_mode:  str = "split",                  # "split" | "merged" | "split_cascade"
     image_mode:   str = "all_views_split",        # "all_views_split" | "frontal_only_split" | "multi_image_merged"
+    layout:       str = "presplit",               # "presplit" | "files"
+    split_csv:    Optional[str] = None,           # required for layout="files"
+    reports_root: Optional[str] = None,           # for layout="files"; None → reports alongside images
 ) -> str:
     """
     Build the unified MIMIC-CXR instruction JSON.
     of the study — this MIMIC layout has no metadata.csv to read ViewPosition
     from. Swap in a ViewPosition lookup if you add that CSV.
+    layout selects which on-disk tree to walk:
+      "presplit" — {root}/{train,valid,test}/pXX/pXXXX/sYYYY/{*.jpg + *.txt}
+                   The custom MIMIC-CXR.zip used by the notebook. Default.
+      "files"    — {root}/files/pXX/pXXXX/sYYYY/*.jpg (raw PhysioNet tree).
+                   Used by MIMIC-CXR_resized after extracting tar shards.
+                   Requires `split_csv` (or auto-discovers *split*.csv) to
+                   assign train/validate/test. Reports are read from
+                   `reports_root` (separate tree, e.g. mimic-cxr-reports/)
+                   or from the study dir if reports_root is None.
     Returns the absolute output path.
     """
     assert report_mode in ("split", "merged", "split_cascade"), \
         f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
     assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
         f"image_mode invalid: {image_mode!r}"
+    assert layout in ("presplit", "files"), \
+        f"layout must be 'presplit' or 'files', got {layout!r}"
     from .dataset import format_merged_report   # local import to avoid cycle
     mimic_root  = Path(mimic_root)
     output_path = Path(output_path)
+    # ── Locate study dirs + split assignment ────────────────────────────────
+    # Two layouts produce the same downstream shape: each entry is
+    #   (study_dir, subject_dir_name, study_dir_name, split_label, image_rel_prefix)
+    # where image_rel_prefix is the leading path component used when building
+    # the JSON-stored relative image path. presplit prefixes with the split
+    # dir name ("train/..."), files prefixes with "files/...".
+    if layout == "presplit":
+        split_dirs = {"train": "train", "valid": "validate", "test": "test"}
+        present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
+        if not present:
+            raise FileNotFoundError(
+                f"No train/valid/test subdirs under {mimic_root}. "
+                f"Expected the pre-split MIMIC-CXR layout."
+            )
+        split_map = None     # not needed — split comes from dir name
+    else:  # "files"
+        files_dir = mimic_root / "files"
+        if not files_dir.is_dir():
+            raise FileNotFoundError(
+                f"Expected {files_dir} for layout='files'. After extracting "
+                f"the MIMIC-CXR_resized tars the layout should be "
+                f"{{root}}/files/pXX/pXXXX/sYYYY/*.jpg."
+            )
+        sp_path = _discover_split_csv(mimic_root, split_csv)
+        if sp_path is None:
+            raise FileNotFoundError(
+                f"Could not find a split CSV under {mimic_root} and none "
+                f"passed via --split_csv. layout='files' needs "
+                f"mimic-cxr-2.0.0-split.csv to assign train/validate/test."
+            )
+        split_map = _load_split_map(sp_path)
+        print(f"[mimic_cxr_builder] split CSV: {sp_path} "
+              f"({len(split_map):,} (subj,study) entries)")
+        reports_root_p = Path(reports_root) if reports_root else None
+        if reports_root_p is not None and not reports_root_p.is_dir():
+            raise FileNotFoundError(
+                f"reports_root={reports_root_p} does not exist. Either point "
+                f"to the extracted mimic-cxr-reports tree (with a `files/` "
+                f"subdir inside it) or leave it null to look alongside images."
+            )
     # ── CheXpert labels ───────────────────────────────────────────────────
     csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
     # ── Pass 1: index studies ─────────────────────────────────────────────
     samples: List[Dict] = []
     # sub_rel ("pXX/pXXXX/sYYYY/img.jpg") → full stored image_path
+    # ("{split}/pXX/pXXXX/sYYYY/img.jpg"  or  "files/pXX/pXXXX/sYYYY/img.jpg").
+    # O(1) VQA lookup.
     image_index: Dict[str, str] = {}
+    n_studies = n_missing_report = n_no_chexpert = n_no_split = 0
     skipped_merged_no_impression = skipped_cascade_no_findings = 0
     def _structured_for(subj: str, study: str) -> Optional[str]:
         return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
     def _image_groups(rels: List[str]):
         """Yield path_fields dicts honouring image_mode (same rules as IU)."""
         if image_mode == "all_views_split":
         else:  # multi_image_merged
             yield {"image_path": None, "image_paths": rels}
+    def _iter_studies():
+        """
+        Yield (study_dir, p_dir_name, subj, study, rels, report_path, split_label)
+        for every valid study in either layout.
+            rels = list of JSON-relative image paths (split-prefixed or
+                   "files/"-prefixed depending on layout).
+            report_path = Path to the report .txt (may not exist; caller handles).
+            split_label = "train"/"validate"/"test" or None when unresolved.
+        """
+        if layout == "presplit":
+            for split_sub, split_dir in present.items():
+                for p_dir in sorted(split_dir.glob("p*")):
+                    for pat_dir in p_dir.glob("p*"):
+                        for study_dir in pat_dir.glob("s*"):
+                            subj, study = pat_dir.name, study_dir.name
+                            rels = [
+                                f"{split_sub}/{p_dir.name}/{subj}/{study}/{im.name}"
+                                for im in sorted(study_dir.glob("*.jpg"))
+                            ]
+                            if not rels:
                                 continue
+                            txts = list(study_dir.glob("*.txt"))
+                            report_path = txts[0] if txts else None
+                            yield (study_dir, p_dir.name, subj, study, rels,
+                                   report_path, split_dirs[split_sub])
+        else:  # "files"
+            files_dir = mimic_root / "files"
+            for p_dir in sorted(files_dir.glob("p*")):
+                for pat_dir in p_dir.glob("p*"):
+                    for study_dir in pat_dir.glob("s*"):
+                        subj, study = pat_dir.name, study_dir.name
+                        rels = [
+                            f"files/{p_dir.name}/{subj}/{study}/{im.name}"
+                            for im in sorted(study_dir.glob("*.jpg"))
+                        ]
+                        if not rels:
+                            continue
+                        # Report lookup: separate tree if reports_root is set,
+                        # else alongside images (parent dir holds sYYYY.txt
+                        # per PhysioNet convention OR inside study dir).
+                        if reports_root_p is not None:
+                            report_path = (reports_root_p / "files" /
+                                           p_dir.name / subj / f"{study}.txt")
+                        else:
+                            # Try both: study_dir/*.txt then parent/{study}.txt
+                            cand = list(study_dir.glob("*.txt"))
+                            if cand:
+                                report_path = cand[0]
+                            else:
+                                report_path = pat_dir / f"{study}.txt"
+                        split_label = split_map.get(
+                            (subj.lstrip("p"), study.lstrip("s"))
+                        ) if split_map else None
+                        yield (study_dir, p_dir.name, subj, study, rels,
+                               report_path, split_label)
+    for (study_dir, p_dir_name, subj, study, rels,
+         report_path, split_label) in _iter_studies():
+        n_studies += 1
+        # Index EVERY image up front — a VQA row may reference a study
+        # that has images but no findings/impression report.
+        for r in rels:
+            image_index[r.split("/", 1)[1]] = r
+        # Studies missing from split CSV (files layout) are skipped —
+        # emitting them would silently dump into "train".
+        if split_label is None:
+            n_no_split += 1
+            continue
+        if report_path is None or not Path(report_path).is_file():
+            n_missing_report += 1
+            continue
+        findings, impression = _parse_report(Path(report_path))
+        structured = _structured_for(subj, study)
+        if structured is None:
+            n_no_chexpert += 1
+        for path_fields in _image_groups(rels):
+            base = {
+                **path_fields,
+                "question":   None,
+                "split":      split_label,
+                "study_id":   study,
+                "subject_id": subj,
+            }
+            if report_mode == "merged":
+                target = format_merged_report(findings, impression)
+                if target is None:
+                    skipped_merged_no_impression += 1
+                    continue
+                samples.append({**base, "task": "report",
+                                "target": target,
+                                "structured_findings": structured})
+            elif report_mode == "split_cascade":
+                if findings:
+                    samples.append({**base, "task": "findings",
+                                    "target": findings,
+                                    "structured_findings": structured})
+                if impression:
+                    if not findings:
+                        skipped_cascade_no_findings += 1
+                    else:
+                        samples.append({**base, "task": "impression",
+                                        "target": impression,
+                                        "structured_findings":
+                                            f"Findings: {findings}"})
+            else:  # "split"
+                if findings:
+                    samples.append({**base, "task": "findings",
+                                    "target": findings,
+                                    "structured_findings": structured})
+                if impression:
+                    samples.append({**base, "task": "impression",
+                                    "target": impression,
+                                    "structured_findings": structured})
     # ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
     n_vqa = n_vqa_dropped = 0
         by_task[s["task"]]   = by_task.get(s["task"], 0) + 1
     print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
+    print(f"  layout           : {layout}")
     print(f"  report_mode      : {report_mode}")
     print(f"  image_mode       : {image_mode}")
     print(f"  studies indexed  : {n_studies:,}")
     print(f"  missing report   : {n_missing_report:,}")
     print(f"  studies w/o chexpert label : {n_no_chexpert:,}")
+    if layout == "files":
+        print(f"  studies w/o split-CSV entry (skipped) : {n_no_split:,}")
     if report_mode == "merged":
         print(f"  skipped no_impr  : {skipped_merged_no_impression:,}")
     if report_mode == "split_cascade":
                    choices=["split", "merged", "split_cascade"])
     p.add_argument("--image_mode",   default="all_views_split",
                    choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
+    p.add_argument("--layout",       default="presplit",
+                   choices=["presplit", "files"],
+                   help="presplit: {root}/{train,valid,test}/pXX/... (custom MIMIC-CXR.zip). "
+                        "files: {root}/files/pXX/... (raw PhysioNet tree, used by "
+                        "MIMIC-CXR_resized after tar extraction). Requires --split_csv.")
+    p.add_argument("--split_csv",    default=None,
+                   help="mimic-cxr-2.0.0-split.csv (auto-discovered under --mimic_root "
+                        "if omitted). Required for layout='files'.")
+    p.add_argument("--reports_root", default=None,
+                   help="Root of the mimic-cxr-reports tree (separate from images). "
+                        "Used when layout='files' and reports are NOT in the image tars.")
     return p.parse_args()
         vqa_root     = a.vqa_root,
         report_mode  = a.report_mode,
         image_mode   = a.image_mode,
+        layout       = a.layout,
+        split_csv    = a.split_csv,
+        reports_root = a.reports_root,
     )

data/mimic_cxr_resized_builder.py ADDED Viewed

	@@ -0,0 +1,375 @@

+"""
+mimic_cxr_resized_builder.py
+----------------------------
+Build the unified instruction JSON for the MIMIC-CXR_resized dataset
+(`hieu3636/cxr-vlm-data/MIMIC-CXR_resized/`) — a filtered + resized subset
+of MIMIC-CXR that ships with its own manifest CSVs.
+Why a separate builder?
+  MIMIC-CXR_resized is manifest-driven, not directory-walking like
+  `mimic_cxr_builder.py`:
+    - Splits come from THREE manifest CSVs (manifest_train.csv,
+      manifest_val.csv, manifest_test.csv) — NOT from PhysioNet's
+      mimic-cxr-2.0.0-split.csv (the user redistributed val/test from
+      the original train pool to balance sizes).
+    - The 14 CheXpert labels are baked into the manifest as `chex_*`
+      columns; no separate chexpert.csv lookup is needed.
+    - Each manifest row = ONE image (one DICOM). Multi-view studies
+      appear as multiple rows sharing (subject_id, study_id).
+    - Image + report paths are stored verbatim in `image_relpath` /
+      `report_relpath`, relative to the extracted-tar root. Reports
+      live inside the same `files/` tree at patient-dir level
+      (e.g. files/p10/p10000032/s50414267.txt), NOT inside study dirs.
+VQA
+---
+3 JSON files (`vqa.json` for train, `vqa_val.json`, `vqa_test.json`) sit
+under a `vqa/` sibling dir. Each row references `image_path` exactly the
+same as `image_relpath` in the manifest, so we look it up in an
+image-index built during the manifest pass. Missing-image VQA rows are
+dropped (the resized subset has fewer images than the full MIMIC).
+Output JSON schema is identical to the other two builders so downstream
+(`CXRInstructDataset`, evaluation) is unchanged.
+"""
+import argparse
+import csv
+import json
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+from .mimic_cxr_builder import _parse_report   # reuse the same FINDINGS/IMPRESSION regex
+# Manifest column name → CheXpert PATHOLOGIES name mapping is direct:
+#   manifest col "chex_Atelectasis" ↔ PATHOLOGIES "Atelectasis", etc.
+# We resolve the canonical 14-name list at runtime to stay in sync with
+# `model.chexpert_classifier.PATHOLOGIES` (single source of truth).
+# ─── PNU builder from a single manifest row ─────────────────────────────────
+def _row_to_pnu(row: Dict[str, str]) -> str:
+    """
+    Render the `chex_*` columns of one manifest row as a PNU
+    (Positive/Negative/Uncertain) structured-findings string.
+    Each pathology in `PATHOLOGIES` is looked up under the column
+    `chex_{name}`; the stripped cell text is classified as:
+        "1" / "1.0"    → positive
+        "0" / "0.0"    → negative
+        "-1" / "-1.0"  → uncertain
+        anything else  → negative (missing column, blank, NaN, ...)
+    Args:
+        row: one manifest row keyed by column name.
+    Returns:
+        Whatever `buckets_to_pnu` builds from the per-pathology classes.
+    """
+    from model.chexpert_classifier import (
+        CLASS_NEGATIVE, CLASS_POSITIVE, CLASS_UNCERTAIN,
+        PATHOLOGIES, buckets_to_pnu,
+    )
+    # The integer and the ".0" float spelling of a label share one class.
+    label_class = {}
+    for token, cls in (("1", CLASS_POSITIVE),
+                       ("0", CLASS_NEGATIVE),
+                       ("-1", CLASS_UNCERTAIN)):
+        label_class[token] = cls
+        label_class[f"{token}.0"] = cls
+    def _classify(pathology):
+        cell = str(row.get(f"chex_{pathology}", "")).strip()
+        return label_class.get(cell, CLASS_NEGATIVE)
+    return buckets_to_pnu({p: _classify(p) for p in PATHOLOGIES})
+# ─── Helpers ────────────────────────────────────────────────────────────────
+# (file name, output split label) pairs; the builder walks them in this order.
+# File names use "val" (3-letter) while emitted samples are tagged "validate" (the label the rest of the pipeline expects).
+_MANIFEST_FILES = (
+    ("manifest_train.csv", "train"),
+    ("manifest_val.csv",   "validate"),
+    ("manifest_test.csv",  "test"),
+)
+_VQA_FILES = (  # one VQA JSON per split, tagged with the same labels as the manifests
+    ("vqa.json",     "train"),
+    ("vqa_val.json", "validate"),
+    ("vqa_test.json","test"),
+)
+def _group_manifest_by_study(csv_path: Path) -> Dict[Tuple[str, str], List[Dict[str, str]]]:
+    """
+    Read one manifest CSV and bucket its rows per study.
+    The bucket key is `(subject_id, study_id)`, both taken from the row and
+    whitespace-stripped. Rows keep their file order inside a bucket and
+    buckets keep first-seen order, so a multi-view study ends up as one
+    list of rows (used for `multi_image_merged` and for deriving a single
+    structured_findings per study).
+    Args:
+        csv_path: manifest file with at least `subject_id` and `study_id`
+                  columns (a missing column raises KeyError).
+    Returns:
+        Dict mapping `(subject_id, study_id)` to the list of raw row dicts.
+    """
+    with open(csv_path, encoding="utf-8", newline="") as handle:
+        records = list(csv.DictReader(handle))
+    by_study: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
+    for record in records:
+        subject = str(record["subject_id"]).strip()
+        study = str(record["study_id"]).strip()
+        if (subject, study) not in by_study:
+            by_study[(subject, study)] = []
+        by_study[(subject, study)].append(record)
+    return by_study
+def _image_groups(rels: List[str], image_mode: str):
+    """
+    Yield the image path fields for one study according to `image_mode`.
+    Args:
+        rels:       image paths of a single study.
+        image_mode: "all_views_split"    → one dict per image (`image_path` set),
+                    "frontal_only_split" → a single dict for the first image,
+                    anything else        → one dict holding every path in
+                                           `image_paths` (multi_image_merged).
+    Yields:
+        Dicts with exactly the keys `image_path` and `image_paths`; the one
+        that does not apply is None.
+    """
+    if image_mode == "all_views_split":
+        for r in rels:
+            yield {"image_path": r, "image_paths": None}
+    elif image_mode == "frontal_only_split":
+        # NOTE(review): the first path is taken as the frontal view; this
+        # presumes the caller lists the frontal image first — confirm.
+        # An empty study yields nothing (as in all_views_split) instead of
+        # raising IndexError on rels[0].
+        if rels:
+            yield {"image_path": rels[0], "image_paths": None}
+    else:  # multi_image_merged
+        yield {"image_path": None, "image_paths": rels}
+# ─── Main builder ───────────────────────────────────────────────────────────
+def _probe_reports_root(root: Path, manifest_dir: Path, max_probes: int = 32) -> Path:
+    """
+    Choose the directory that manifest `report_relpath` values resolve against.
+    Candidates, in priority order: `root` (reports bundled beside the images)
+    then `root / "reports"` (reports shipped as a separate bundle). Up to
+    `max_probes` non-empty `report_relpath` values per manifest are tested and
+    the first candidate under which one of them is an existing file wins.
+    Testing only for a `files/` directory is not enough: the extracted image
+    tree already creates `{root}/files`, which would always shadow a
+    `{root}/reports` bundle.
+    If no sampled report exists anywhere, fall back to the first candidate
+    that contains a `files/` directory, and finally to `root`.
+    """
+    candidates = [root, root / "reports"]
+    for fname, _ in _MANIFEST_FILES:
+        csv_path = manifest_dir / fname
+        if not csv_path.is_file():
+            continue
+        probes = 0
+        with open(csv_path, encoding="utf-8", newline="") as f:
+            for row in csv.DictReader(f):
+                rel = (row.get("report_relpath") or "").strip()
+                if not rel:
+                    continue
+                for cand in candidates:
+                    if (cand / rel).is_file():
+                        return cand
+                probes += 1
+                if probes >= max_probes:
+                    break
+    for cand in candidates:
+        if (cand / "files").is_dir():
+            return cand
+    return root
+def build_mimic_cxr_resized_instruct_json(
+    root:          str,
+    manifest_dir:  Optional[str],
+    output_path:   str,
+    vqa_dir:       Optional[str] = None,
+    reports_root:  Optional[str] = None,
+    report_mode:   str = "split",                  # "split" | "merged" | "split_cascade"
+    image_mode:    str = "all_views_split",        # "all_views_split" | "frontal_only_split" | "multi_image_merged"
+) -> str:
+    """
+    Build the unified MIMIC-CXR_resized instruction JSON.
+    Pass 1 walks the three manifest CSVs, groups rows per study and emits
+    report samples (task "report", or "findings" / "impression") for every
+    study whose report file exists. Pass 2 optionally appends VQA samples
+    whose image was seen in a manifest. The sample list is written to
+    `output_path` and a summary is printed.
+    Args:
+        root:         directory containing the extracted tar shards, so
+                      `{root}/{image_relpath}` resolves to an image.
+                      The manifest stores image_relpath like
+                      "files/p19/p19855745/s59502026/{dicom_id}.jpg".
+        manifest_dir: directory containing manifest_{train,val,test}.csv.
+                      If None → defaults to `root`.
+        output_path:  where to write the JSON (parent dirs are created).
+        vqa_dir:      directory containing vqa.json / vqa_val.json /
+                      vqa_test.json. If None or files missing → VQA skipped
+                      (only report samples emitted).
+        reports_root: directory that `report_relpath` resolves against. If
+                      None, `{root}` then `{root}/reports` are probed with
+                      real report paths taken from the manifests and the
+                      first one holding an existing report is used. Set
+                      explicitly to skip the probe.
+        report_mode:  "split" | "merged" | "split_cascade" (see other builders).
+        image_mode:   "all_views_split" | "frontal_only_split" |
+                      "multi_image_merged" (see other builders).
+    Returns:
+        `output_path` as a string (as given; it is not resolved to an
+        absolute path).
+    Raises:
+        AssertionError: if `report_mode` or `image_mode` is not a known value.
+    """
+    assert report_mode in ("split", "merged", "split_cascade"), \
+        f"report_mode invalid: {report_mode!r}"
+    assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
+        f"image_mode invalid: {image_mode!r}"
+    from .dataset import format_merged_report   # local import to avoid cycle
+    root         = Path(root)
+    manifest_dir = Path(manifest_dir) if manifest_dir else root
+    output_path  = Path(output_path)
+    # ── Resolve reports_root (auto-probe if not set) ────────────────────────
+    # The manifest stores `report_relpath` like
+    # "files/p19/p19855745/s59502026.txt" — relative to whichever directory
+    # actually holds the reports tree. Two common layouts:
+    #   (a) reports bundled into the tars → `{root}/files/.../.txt` exists
+    #   (b) reports kept as a separate sibling → `{root}/reports/files/.../.txt`
+    # The probe checks real report paths, because `{root}/files` exists in
+    # both layouts as soon as the images are extracted.
+    if reports_root is not None:
+        reports_root_p = Path(reports_root)
+    else:
+        reports_root_p = _probe_reports_root(root, manifest_dir)
+    print(f"[mimic_cxr_resized_builder] reports_root resolved → {reports_root_p}")
+    # ── Pass 1: walk the 3 manifest CSVs ────────────────────────────────────
+    samples: List[Dict]            = []
+    image_index: Dict[str, str]    = {}  # image_relpath → image_relpath (identity; used for VQA lookup)
+    pnu_by_study: Dict[Tuple[str, str], str] = {}  # (subj, study) → PNU string (for VQA reuse)
+    n_studies = n_missing_report = 0
+    skipped_merged_no_impression = skipped_cascade_no_findings = 0
+    for fname, split_label in _MANIFEST_FILES:
+        csv_path = manifest_dir / fname
+        if not csv_path.is_file():
+            print(f"[mimic_cxr_resized_builder] manifest missing: {csv_path} — skipping {split_label}")
+            continue
+        grouped = _group_manifest_by_study(csv_path)
+        print(f"[mimic_cxr_resized_builder] {fname}: "
+              f"{sum(len(v) for v in grouped.values()):,} rows / "
+              f"{len(grouped):,} studies")
+        for (subj, study), rows in grouped.items():
+            n_studies += 1
+            # The first row of a study supplies its report path and CheXpert
+            # labels (all views are expected to share them).
+            first = rows[0]
+            rels  = [r["image_relpath"] for r in rows]
+            # Index every image (incl. studies with no report yet) so a VQA
+            # row that references this image can still be picked up below.
+            for r in rels:
+                image_index[r] = r
+            structured = _row_to_pnu(first)
+            pnu_by_study[(subj, study)] = structured  # cached for VQA reuse
+            # `or ""`: csv.DictReader fills short rows with None, which has
+            # no .strip().
+            report_rel = (first.get("report_relpath") or "").strip()
+            if not report_rel:
+                n_missing_report += 1
+                continue
+            report_path = reports_root_p / report_rel
+            if not report_path.is_file():
+                n_missing_report += 1
+                continue
+            findings, impression = _parse_report(report_path)
+            # Output JSON uses the same subject/study id format as the
+            # legacy MIMIC builder ("pXXXX" / "sYYYY") so downstream eval
+            # (which compares subject_id strings) keeps working unchanged.
+            subj_str  = f"p{subj}" if not subj.startswith("p") else subj
+            study_str = f"s{study}" if not study.startswith("s") else study
+            for path_fields in _image_groups(rels, image_mode):
+                base = {
+                    **path_fields,
+                    "question":   None,
+                    "split":      split_label,
+                    "study_id":   study_str,
+                    "subject_id": subj_str,
+                }
+                if report_mode == "merged":
+                    target = format_merged_report(findings, impression)
+                    if target is None:
+                        skipped_merged_no_impression += 1
+                        continue
+                    samples.append({**base, "task": "report",
+                                    "target": target,
+                                    "structured_findings": structured})
+                elif report_mode == "split_cascade":
+                    if findings:
+                        samples.append({**base, "task": "findings",
+                                        "target": findings,
+                                        "structured_findings": structured})
+                    if impression:
+                        # Cascade conditions the impression on the findings
+                        # text, so an impression without findings is skipped.
+                        if not findings:
+                            skipped_cascade_no_findings += 1
+                        else:
+                            samples.append({**base, "task": "impression",
+                                            "target": impression,
+                                            "structured_findings":
+                                                f"Findings: {findings}"})
+                else:  # "split"
+                    if findings:
+                        samples.append({**base, "task": "findings",
+                                        "target": findings,
+                                        "structured_findings": structured})
+                    if impression:
+                        samples.append({**base, "task": "impression",
+                                        "target": impression,
+                                        "structured_findings": structured})
+    # ── Pass 2: optional VQA attach ─────────────────────────────────────────
+    n_vqa = n_vqa_dropped = 0
+    if vqa_dir:
+        vqa_dir = Path(vqa_dir)
+        for fname, split_label in _VQA_FILES:
+            vqa_file = vqa_dir / fname
+            if not vqa_file.is_file():
+                print(f"[mimic_cxr_resized_builder] VQA missing: {vqa_file} — skipping {split_label}")
+                continue
+            # Context manager so the handle is closed right after parsing.
+            with open(vqa_file, encoding="utf-8") as f:
+                vqa_rows = json.load(f)
+            for row in vqa_rows:
+                img_path = str(row.get("image_path", "")).lstrip("/")
+                if img_path not in image_index:
+                    n_vqa_dropped += 1
+                    continue
+                ans = row.get("answer", [])
+                answer = (", ".join(map(str, ans)) if isinstance(ans, list)
+                          else str(ans)) or "No."
+                subj  = str(row.get("subject_id", "")).strip()
+                study = str(row.get("study_id",   "")).strip()
+                # Abnormality-guided VQA: reuse the manifest's CheXpert PNU
+                # for this study (same context as findings/impression).
+                # None when the row's (subject_id, study_id) pair matches no
+                # manifest study.
+                structured = pnu_by_study.get((subj, study))
+                subj_str  = f"p{subj}" if subj and not subj.startswith("p") else subj
+                study_str = f"s{study}" if study and not study.startswith("s") else study
+                samples.append({
+                    "image_path": img_path, "image_paths": None,
+                    "task": "vqa", "target": answer,
+                    "question": row["question"],
+                    "structured_findings": structured,
+                    "split": split_label,
+                    "study_id":  study_str,
+                    "subject_id": subj_str,
+                })
+                n_vqa += 1
+    # ── Write ───────────────────────────────────────────────────────────────
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(samples, f, ensure_ascii=False)
+    by_split, by_task = {}, {}
+    for s in samples:
+        by_split[s["split"]] = by_split.get(s["split"], 0) + 1
+        by_task[s["task"]]   = by_task.get(s["task"], 0) + 1
+    print(f"[mimic_cxr_resized_builder] wrote {len(samples):,} samples → {output_path}")
+    print(f"  root              : {root}")
+    print(f"  manifest_dir      : {manifest_dir}")
+    print(f"  vqa_dir           : {vqa_dir if vqa_dir else '(disabled)'}")
+    print(f"  report_mode       : {report_mode}")
+    print(f"  image_mode        : {image_mode}")
+    print(f"  studies indexed   : {n_studies:,}")
+    print(f"  missing report    : {n_missing_report:,}")
+    if report_mode == "merged":
+        print(f"  skipped no_impr   : {skipped_merged_no_impression:,}")
+    if report_mode == "split_cascade":
+        print(f"  skipped impr w/o findings : {skipped_cascade_no_findings:,}")
+    if vqa_dir:
+        print(f"  vqa added/dropped : {n_vqa:,} / {n_vqa_dropped:,}")
+    print(f"  by split          : {by_split}")
+    print(f"  by task           : {by_task}")
+    return str(output_path)
+# ─── CLI ────────────────────────────────────────────────────────────────────
+def _parse_args():
+    """
+    Parse the builder's command-line options from `sys.argv`.
+    `--root` and `--output` are required; every other option is optional.
+    Returns:
+        The `argparse.Namespace` with attributes root, manifest_dir, output,
+        vqa_dir, reports_root, report_mode and image_mode.
+    """
+    report_modes = ["split", "merged", "split_cascade"]
+    image_modes = ["all_views_split", "frontal_only_split", "multi_image_merged"]
+    parser = argparse.ArgumentParser(
+        description="Build MIMIC-CXR_resized unified instruction JSON",
+    )
+    # Input / output locations.
+    parser.add_argument(
+        "--root", required=True,
+        help="Root containing files/pXX/... after extracting tar shards.",
+    )
+    parser.add_argument(
+        "--manifest_dir", default=None,
+        help="Folder with manifest_{train,val,test}.csv (defaults to --root).",
+    )
+    parser.add_argument("--output", required=True, help="Output JSON path.")
+    parser.add_argument(
+        "--vqa_dir", default=None,
+        help="Folder with vqa.json / vqa_val.json / vqa_test.json. Omit to skip VQA.",
+    )
+    parser.add_argument(
+        "--reports_root", default=None,
+        help="Directory that report_relpath resolves against. "
+             "Omit to auto-probe `{root}` then `{root}/reports`.",
+    )
+    # Sample-shaping modes.
+    parser.add_argument("--report_mode", default="split", choices=report_modes)
+    parser.add_argument("--image_mode", default="all_views_split", choices=image_modes)
+    return parser.parse_args()
+if __name__ == "__main__":
+    # Script entry point: forward every CLI option to the builder unchanged.
+    cli = _parse_args()
+    build_mimic_cxr_resized_instruct_json(
+        cli.root,
+        cli.manifest_dir,
+        cli.output,
+        vqa_dir=cli.vqa_dir,
+        reports_root=cli.reports_root,
+        report_mode=cli.report_mode,
+        image_mode=cli.image_mode,
+    )

data/distri-IU-Xray.py → distri-IU-Xray.py RENAMED Viewed

@@ -1,12 +1,12 @@
-import os, glob
-from xml.etree import ElementTree as ET
-xml_dir = "D:/USTH/KLTN/data/IU-Xray/labels/ecgen-radiology/"
-counts = {}
-for f in glob.glob(xml_dir + "*.xml"):
-    tree = ET.parse(f)
-    n = len(tree.findall(".//parentImage"))
-    counts[n] = counts.get(n, 0) + 1
-for k, v in sorted(counts.items()):
     print(f"{k} ảnh/report: {v} reports")

+import os, glob  # NOTE(review): `os` is not used anywhere in this script
+from xml.etree import ElementTree as ET
+xml_dir = "D:/USTH/KLTN/data/IU-Xray/labels/ecgen-radiology/"  # hard-coded local folder; the trailing "/" is required by the glob below
+counts = {}  # parentImage count per report → number of reports with that count
+for f in glob.glob(xml_dir + "*.xml"):
+    tree = ET.parse(f)
+    n = len(tree.findall(".//parentImage"))  # parentImage elements at any depth of this report
+    counts[n] = counts.get(n, 0) + 1
+for k, v in sorted(counts.items()):  # histogram, ascending by images per report
     print(f"{k} ảnh/report: {v} reports")

data/img_stat.py → img_stat.py RENAMED Viewed

File without changes

data/rezip.py → rezip.py RENAMED Viewed

@@ -1,12 +1,12 @@
-import zipfile
-import os
-zipf = zipfile.ZipFile(r"D:\USTH\KLTN\cxr-vlm-code.zip", 'w', zipfile.ZIP_DEFLATED)
-for root, dirs, files in os.walk('KLTN'):
-    for file in files:
-        filepath = os.path.join(root, file)
-        arcname = os.path.relpath(filepath, 'KLTN')
-        zipf.write(filepath, arcname)
 zipf.close()

+import zipfile
+import os
+zipf = zipfile.ZipFile(r"D:\USTH\KLTN\cxr-vlm-code.zip", 'w', zipfile.ZIP_DEFLATED)  # NOTE(review): no `with` block, so an exception in the loop skips close()
+for root, dirs, files in os.walk('KLTN'):  # relative path: walked from the current working directory
+    for file in files:
+        filepath = os.path.join(root, file)
+        arcname = os.path.relpath(filepath, 'KLTN')  # archive entries are stored relative to KLTN/
+        zipf.write(filepath, arcname)
 zipf.close()

scripts/cxrvlm_colab_train.ipynb CHANGED Viewed

@@ -5,32 +5,7 @@
    "metadata": {
     "id": "cell-0"
    },
-   "source": [
-    "# CXR-VLM — Kaggle Training Notebook (consolidated)\n",
-    "\n",
-    "Trains the 2-stage CXR-VLM (Vicuna-7B + BioViL-T fallback to timm ViT + LoRA) on a Kaggle **T4** GPU.\n",
-    "\n",
-    "Supports **two datasets**, selected by `DATASET_NAME` in section 0:\n",
-    "- **`MIMIC-CXR`** — full 3 tasks (findings, impression, VQA).\n",
-    "- **`IU-Xray`**   — 2 tasks only (findings, impression). Much lighter dataset (~7.5k images).\n",
-    "\n",
-    "### Before you run\n",
-    "\n",
-    "Attach Kaggle Datasets via `+ Add Input`:\n",
-    "\n",
-    "| Dataset slug | Contents | When needed |\n",
-    "|---|---|---|\n",
-    "| `cxr-vlm-code` | entire `D:\\USTH\\KLTN` folder (configs/, data/*.py, model/, training/, evaluation/, utils/, requirements.txt) | **always** |\n",
-    "| `cxr-vlm-data` | holds **both** datasets: `MIMIC-CXR/{train,valid,test}/p*/...` + `MIMIC-Ext-MIMIC-CXR-VQA/...` and/or `IU-Xray/images/` + `IU-Xray/labels/` | **always** |\n",
-    "\n",
-    "**Settings (right panel):**\n",
-    "- Accelerator: **T4 x2** (only GPU 0 will be used)\n",
-    "- Persistence: **Variables and Files**\n",
-    "- Internet: **On**\n",
-    "\n",
-    "**Kaggle Secrets** (Add-ons → Secrets):\n",
-    "- `HF_TOKEN` — HuggingFace token with write access to the runs repo."
-   ],
    "id": "cell-0"
   },
   {
@@ -54,27 +29,9 @@
     },
     "outputId": "d6e7ebbd-4f1b-483b-f20f-0df3997a60b7"
    },
-   "source": [
-    "# ── Platform + dataset selectors ──────────────────────────────────\n",
-    "# PLATFORM drives storage paths and how secrets are read.\n",
-    "# Supported: 'kaggle' | 'colab' | 'lightning' | 'gcp' | 'local'\n",
-    "PLATFORM     = 'colab'\n",
-    "DATASET_NAME = 'IU-Xray'     # 'MIMIC-CXR' | 'IU-Xray'\n",
-    "\n",
-    "assert PLATFORM     in ('kaggle', 'colab', 'lightning', 'gcp', 'local')\n",
-    "assert DATASET_NAME in ('MIMIC-CXR', 'IU-Xray')\n",
-    "print(f'PLATFORM = {PLATFORM} | DATASET_NAME = {DATASET_NAME}')\n"
-   ],
    "execution_count": null,
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "PLATFORM = colab | DATASET_NAME = IU-Xray\n"
-     ]
-    }
-   ],
    "id": "cell-select"
   },
   {
@@ -119,120 +76,9 @@
     },
     "outputId": "f6195a48-56b2-4052-8367-c8ec14c48a05"
    },
-   "source": [
-    "# ── Per-platform storage + source-of-truth ─────────────────────────\n",
-    "# Kaggle : code + data come from attached Kaggle datasets (pre-mounted).\n",
-    "# Others : pull code (folder) + data (single zip) from HF Hub dataset repos.\n",
-    "#\n",
-    "# Required HF repos:\n",
-    "#   <HF_USER>/cxr-vlm-code   — project source (flat folder)\n",
-    "#   <HF_USER>/cxr-vlm-data   — contains IU-Xray.zip (one zip per dataset)\n",
-    "\n",
-    "HF_USER = 'hieu3636'   # <<< EDIT ME\n",
-    "\n",
-    "if PLATFORM == 'kaggle':\n",
-    "    INPUT_ROOT = Path('/kaggle/input')\n",
-    "    WORK       = Path('/kaggle/working')\n",
-    "    def find_dataset(slug, required=True):\n",
-    "        for cand in [INPUT_ROOT / slug, *INPUT_ROOT.rglob(slug)]:\n",
-    "            if cand.is_dir():\n",
-    "                return cand\n",
-    "        if required:\n",
-    "            raise FileNotFoundError(f'Dataset {slug!r} not attached')\n",
-    "        return None\n",
-    "    CODE_SRC = find_dataset('cxr-vlm-code')\n",
-    "    DATA_SRC = find_dataset('cxr-vlm-data')\n",
-    "\n",
-    "else:\n",
-    "    # ── Non-Kaggle: resolve WORK, then pull from HF ──\n",
-    "    if PLATFORM == 'colab':\n",
-    "        from google.colab import userdata\n",
-    "        os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n",
-    "        WORK = Path('/content')\n",
-    "    elif PLATFORM == 'lightning':\n",
-    "        WORK = Path('/teamspace/studios/this_studio')\n",
-    "    elif PLATFORM == 'gcp':\n",
-    "        WORK = Path('/workspace')\n",
-    "    else:\n",
-    "        WORK = Path.home() / 'cxr-vlm-work'\n",
-    "    WORK.mkdir(parents=True, exist_ok=True)\n",
-    "\n",
-    "    assert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing — set it via platform secrets UI.'\n",
-    "\n",
-    "    try:\n",
-    "        from huggingface_hub import snapshot_download, hf_hub_download\n",
-    "    except ImportError:\n",
-    "        !pip install -q huggingface_hub\n",
-    "        from huggingface_hub import snapshot_download, hf_hub_download\n",
-    "\n",
-    "    # 1) Code: flat folder, few hundred files → snapshot_download ok\n",
-    "    print(f'Pulling code from HF (user: {HF_USER}) …')\n",
-    "    CODE_SRC = Path(snapshot_download(\n",
-    "        repo_id   = f'{HF_USER}/cxr-vlm-code',\n",
-    "        repo_type = 'model',\n",
-    "        token     = os.environ['HF_TOKEN'],\n",
-    "        local_dir = str(WORK / 'cxr-vlm-code'),\n",
-    "    ))\n",
-    "\n",
-    "    # 2) Data: single zip per dataset (avoids per-file rate limits)\n",
-    "    import zipfile\n",
-    "    DATA_SRC = WORK / 'data'\n",
-    "    DATA_SRC.mkdir(parents=True, exist_ok=True)\n",
-    "\n",
-    "    zip_name = f'{DATASET_NAME}.zip'           # 'IU-Xray.zip' | 'MIMIC-CXR.zip'\n",
-    "    marker   = DATA_SRC / DATASET_NAME          # DATA_SRC/IU-Xray after unzip\n",
-    "\n",
-    "    if not marker.exists():\n",
-    "        print(f'Pulling {zip_name} from HF …')\n",
-    "        zpath = hf_hub_download(\n",
-    "            repo_id   = f'{HF_USER}/cxr-vlm-data',\n",
-    "            filename  = zip_name,\n",
-    "            repo_type = 'dataset',\n",
-    "            token     = os.environ['HF_TOKEN'],\n",
-    "            local_dir = str(DATA_SRC),\n",
-    "        )\n",
-    "        print(f'  unzipping → {DATA_SRC}')\n",
-    "        with zipfile.ZipFile(zpath) as zf:\n",
-    "            zf.extractall(DATA_SRC)\n",
-    "        try:\n",
-    "            os.remove(zpath)  # free disk\n",
-    "        except OSError:\n",
-    "            pass\n",
-    "    else:\n",
-    "        print(f'{marker} already present — skipping download.')\n",
-    "    print(f'Contents of {DATA_SRC}: {sorted(os.listdir(DATA_SRC))}')\n",
-    "\n",
-    "# ── Common: copy code into writable PROJECT dir ────────────────────\n",
-    "PROJECT = WORK / 'cxr_vlm'\n",
-    "if CODE_SRC.resolve() != PROJECT.resolve() and not PROJECT.exists():\n",
-    "    shutil.copytree(CODE_SRC, PROJECT)\n",
-    "\n",
-    "os.chdir(PROJECT)\n",
-    "sys.path.insert(0, str(PROJECT))\n",
-    "print('PLATFORM :', PLATFORM)\n",
-    "print('CODE_SRC :', CODE_SRC)\n",
-    "print('DATA_SRC :', DATA_SRC)\n",
-    "print('PROJECT  :', PROJECT)\n",
-    "print('WORK     :', WORK)\n"
-   ],
    "execution_count": null,
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "Pulling code from HF (user: hieu3636) …\n",
-      "Pulling IU-Xray.zip from HF …\n",
-      "  unzipping → /content/data\n",
-      "Contents of /content/data: ['.cache', 'IU-Xray']\n",
-      "PLATFORM : colab\n",
-      "CODE_SRC : /content/cxr-vlm-code\n",
-      "DATA_SRC : /content/data\n",
-      "PROJECT  : /content/cxr_vlm\n",
-      "WORK     : /content\n"
-     ]
-    }
-   ],
    "id": "cell-paths"
   },
   {
@@ -336,29 +182,7 @@
    "metadata": {
     "id": "cell-data-md"
    },
-   "source": [
-    "## 2. Locate data on Kaggle\n",
-    "\n",
-    "Both datasets live under the single `cxr-vlm-data` slug. Expected layouts:\n",
-    "\n",
-    "**MIMIC-CXR**:\n",
-    "```\n",
-    "DATA_SRC/\n",
-    "├── MIMIC-CXR/ (or at root)\n",
-    "│   ├── train/p10/pXXXXXX/sYYYYY/*.jpg + sYYYYY.txt\n",
-    "│   ├── valid/p10/...\n",
-    "│   └── test/p10/...\n",
-    "└── .../MIMIC-Ext-MIMIC-CXR-VQA/dataset/{train,valid,test}.json\n",
-    "```\n",
-    "\n",
-    "**IU-Xray** (added alongside MIMIC under the same slug):\n",
-    "```\n",
-    "DATA_SRC/\n",
-    "└── IU-Xray/\n",
-    "    ├── images/        # CXR*_IM-*-*.png (~7.5k files)\n",
-    "    └── labels/        # {1..3999}.xml   (~3.9k files, flat — no ecgen-radiology subfolder)\n",
-    "```"
-   ],
    "id": "cell-data-md"
   },
   {
@@ -370,104 +194,9 @@
     },
     "outputId": "53c15833-f9bb-4457-95d6-3fa83f4dc909"
    },
-   "source": [
-    "def find_split_parent(root: Path) -> Path:\n",
-    "    for cand in [root, root / 'MIMIC-CXR', root / 'data' / 'MIMIC-CXR']:\n",
-    "        if (cand / 'train').exists() and (cand / 'valid').exists() and (cand / 'test').exists():\n",
-    "            return cand\n",
-    "    for p in root.rglob('train'):\n",
-    "        if p.is_dir() and (p.parent / 'valid').exists() and (p.parent / 'test').exists():\n",
-    "            return p.parent\n",
-    "    raise FileNotFoundError('Could not find train/ valid/ test/ under ' + str(root))\n",
-    "\n",
-    "\n",
-    "def find_iu_dirs(root: Path):\n",
-    "    \"\"\"Locate IU-Xray `images/` and `labels/` (flat XMLs) under `root`.\n",
-    "\n",
-    "    Resolution order:\n",
-    "      1. `{root}/IU-Xray/{images,labels}` — canonical layout.\n",
-    "      2. Any nested `IU-Xray` folder that contains both.\n",
-    "      3. Fallback: the first `images` folder containing CXR*.png and\n",
-    "         the first `labels` (else `ecgen-radiology`) folder containing *.xml.\n",
-    "\n",
-    "    The labels subfolder is treated as a flat directory of XMLs (we no\n",
-    "    longer require the legacy `ecgen-radiology/` subfolder).\n",
-    "    \"\"\"\n",
-    "    # Canonical + nested\n",
-    "    for cand in [root / 'IU-Xray', *root.rglob('IU-Xray')]:\n",
-    "        if not cand.is_dir():\n",
-    "            continue\n",
-    "        imgs = cand / 'images'\n",
-    "        lbls = cand / 'labels'\n",
-    "        if imgs.is_dir() and lbls.is_dir() and any(lbls.glob('*.xml')):\n",
-    "            return imgs, lbls\n",
-    "        # Legacy: labels/ecgen-radiology/*.xml\n",
-    "        legacy = lbls / 'ecgen-radiology'\n",
-    "        if imgs.is_dir() and legacy.is_dir() and any(legacy.glob('*.xml')):\n",
-    "            return imgs, legacy\n",
-    "\n",
-    "    # Fallback: any images/ with CXR*.png + any folder with XML\n",
-    "    img_dir = lbl_dir = None\n",
-    "    for cand in [root / 'images', *root.rglob('images')]:\n",
-    "        if cand.is_dir() and any(cand.glob('CXR*.png')):\n",
-    "            img_dir = cand; break\n",
-    "    for cand in [root / 'labels', *root.rglob('labels')]:\n",
-    "        if cand.is_dir() and any(cand.glob('*.xml')):\n",
-    "            lbl_dir = cand; break\n",
-    "    if lbl_dir is None:\n",
-    "        # very last resort — any ecgen-radiology folder with XMLs\n",
-    "        for cand in root.rglob('ecgen-radiology'):\n",
-    "            if cand.is_dir() and any(cand.glob('*.xml')):\n",
-    "                lbl_dir = cand; break\n",
-    "    return img_dir, lbl_dir\n",
-    "\n",
-    "\n",
-    "# Filled in below depending on DATASET_NAME\n",
-    "CXR_ROOT      = None                  # MIMIC-CXR root (with train/valid/test subdirs)\n",
-    "SPLIT_DIRS    = None                  # MIMIC only\n",
-    "VQA_ROOT      = None                  # MIMIC only\n",
-    "IU_IMAGES_DIR = None                  # IU-Xray only\n",
-    "IU_LABELS_DIR = None                  # IU-Xray only\n",
-    "\n",
-    "if DATASET_NAME == 'MIMIC-CXR':\n",
-    "    CXR_ROOT = find_split_parent(DATA_SRC)\n",
-    "    print('MIMIC-CXR root:', CXR_ROOT)\n",
-    "\n",
-    "    SPLIT_DIRS = {\n",
-    "        'train'   : ('train', CXR_ROOT / 'train'),\n",
-    "        'validate': ('valid', CXR_ROOT / 'valid'),\n",
-    "        'test'    : ('test',  CXR_ROOT / 'test'),\n",
-    "    }\n",
-    "    for s, (sub, d) in SPLIT_DIRS.items():\n",
-    "        assert d.exists(), f'Missing split dir: {d}'\n",
-    "        print(f'  {s:<9s} → {d}')\n",
-    "\n",
-    "    for p in DATA_SRC.rglob('MIMIC-Ext-MIMIC-CXR-VQA'):\n",
-    "        cand = p / 'dataset'\n",
-    "        if cand.exists() and (cand / 'train.json').exists():\n",
-    "            VQA_ROOT = cand\n",
-    "            break\n",
-    "    assert VQA_ROOT is not None, 'VQA dataset folder not found under ' + str(DATA_SRC)\n",
-    "    print('VQA root:', VQA_ROOT)\n",
-    "\n",
-    "else:   # IU-Xray\n",
-    "    IU_IMAGES_DIR, IU_LABELS_DIR = find_iu_dirs(DATA_SRC)\n",
-    "    assert IU_IMAGES_DIR is not None, f'IU images/ not found under {DATA_SRC}'\n",
-    "    assert IU_LABELS_DIR is not None, f'IU labels/ (with *.xml) not found under {DATA_SRC}'\n",
-    "    print('IU images dir:', IU_IMAGES_DIR, '→', len(list(IU_IMAGES_DIR.glob('*.png'))), 'PNGs')\n",
-    "    print('IU labels dir:', IU_LABELS_DIR, '→', len(list(IU_LABELS_DIR.glob('*.xml'))), 'XMLs')"
-   ],
    "execution_count": null,
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "IU images dir: /content/data/IU-Xray/images → 1841 PNGs\n",
-      "IU labels dir: /content/data/IU-Xray/labels → 3955 XMLs\n"
-     ]
-    }
-   ],
    "id": "cell-find-data-mimic"
   },
   {
@@ -475,7 +204,7 @@
    "metadata": {
     "id": "cell-json-md"
    },
-   "source": "## 3. Build the unified instruction JSON\n\n- **MIMIC-CXR**: auto-built by `utils.dataset_resolver` → `data.mimic_cxr_builder` the first time `train.py`/`evaluate.py` runs. It parses findings/impression, bakes the 14 GT CheXpert labels as the **PNU** string (`Positive/Negative/Uncertain Abnormalities`) into `structured_findings`, and attaches abnormality-guided VQA. The inline cells below are **no-ops for MIMIC**.\n- **IU-Xray**: built by `data.iu_xray_builder` in the cell below (resolver would also do it lazily).\n\nEither way the JSON shares one schema (`image_path`, `task`, `target`, `question`, `structured_findings`, `split`, ...) so `CXRInstructDataset` loads it unchanged.",
    "id": "cell-json-md"
   },
   {
@@ -487,7 +216,7 @@
     },
     "outputId": "3b965273-ac82-41a7-8ead-895094e0a8b1"
    },
-   "source": "# MIMIC-CXR: the unified JSON (PNU CheXpert + abnormality-guided VQA) is now\n# auto-built by utils.dataset_resolver via data.mimic_cxr_builder when\n# train.py / evaluate.py run. The old inline parse/build/save cells are kept\n# ONLY for IU-Xray; for MIMIC they are intentional no-ops.\nif DATASET_NAME == 'MIMIC-CXR':\n    print('MIMIC-CXR: JSON build handled by the resolver '\n          '(data.mimic_cxr_builder, PNU + VQA) — skipping inline parse cell.')\nelse:\n    print('IU-Xray: skipping MIMIC indexing cell.')\n",
    "execution_count": null,
    "outputs": [],
    "id": "cell-parse"
@@ -501,7 +230,7 @@
     },
     "outputId": "91e11a3e-7b4e-4457-c32a-70a17fb2ef2a"
    },
-   "source": "if DATASET_NAME == 'MIMIC-CXR':\n    print('MIMIC-CXR: findings/impression built by the resolver — skipping.')\nelse:\n    samples = None\n    print('IU-Xray: skipping MIMIC report parsing cell.')\n",
    "execution_count": null,
    "outputs": [],
    "id": "cell-build-findings"
@@ -515,7 +244,7 @@
     },
     "outputId": "fc5fd7bc-cb80-49aa-da79-0097ed038b5d"
    },
-   "source": "if DATASET_NAME == 'MIMIC-CXR':\n    print('MIMIC-CXR: VQA attached by data.mimic_cxr_builder (resolver), '\n          'with the same PNU CheXpert context — skipping inline VQA cell.')\nelse:\n    print('IU-Xray: skipping MIMIC VQA cell.')\n",
    "execution_count": null,
    "outputs": [],
    "id": "cell-build-vqa"
@@ -529,7 +258,7 @@
     },
     "outputId": "b4d6589b-19be-4eb8-ba25-d533248439b9"
    },
-   "source": "if DATASET_NAME == 'MIMIC-CXR':\n    print('MIMIC-CXR: image-existence filtering done inside the resolver '\n          'builder — skipping.')\nelse:\n    print('IU-Xray: skipping.')\n",
    "execution_count": null,
    "outputs": [],
    "id": "cell-filter"
@@ -543,7 +272,7 @@
     },
     "outputId": "b6d95196-0383-4a50-8424-9ef95eb7b34e"
    },
-   "source": "out_dir = PROJECT / 'data' / 'data_files'\nout_dir.mkdir(parents=True, exist_ok=True)\n\nif DATASET_NAME == 'MIMIC-CXR':\n    # Base path only — the resolver appends __{report_mode}__{image_mode}\n    # and builds it (PNU CheXpert + abnormality-guided VQA) via\n    # data.mimic_cxr_builder the first time train.py / evaluate.py runs.\n    mimic_json_path = out_dir / 'mimic_cxr_instruct_unified.json'\n    print('MIMIC-CXR: instruct JSON auto-built by resolver →',\n          f'{mimic_json_path.stem}__<report_mode>__<image_mode>.json')\nelse:\n    # Build IU-Xray JSON here so the notebook shows a nice summary log\n    # (the resolver would also do this lazily).\n    from data.iu_xray_builder import build_iu_xray_instruct_json\n    iu_json_path = out_dir / 'iu_xray_instruct.json'\n    build_iu_xray_instruct_json(\n        images_dir  = str(IU_IMAGES_DIR),\n        labels_dir  = str(IU_LABELS_DIR),\n        output_path = str(iu_json_path),\n        train_ratio = 0.70, val_ratio = 0.15, test_ratio = 0.15, seed = 42,\n    )\n",
    "execution_count": null,
    "outputs": [],
    "id": "cell-save-json"
@@ -553,7 +282,7 @@
    "metadata": {
     "id": "cell-cfg-md"
    },
-   "source": "## 4. Patch configs for the Kaggle/Colab environment\n\n- Sets `data.dataset_name`, `report_mode`, `image_mode`.\n- **MIMIC-CXR**: sets `mimic_cxr_root`, the `instruct_json` base path, auto-discovers the **CheXpert CSV** (`mimic_chexpert_csv`) and the **VQA** dir (`mimic_vqa_root`), and turns on `mimic_auto_build` so the resolver builds the PNU+VQA JSON on first run.\n- **IU-Xray**: points `iu_xray.images_dir/labels_dir/instruct_json` at the mount.\n- `training.output_root` under `WORK/ckpt` (Persistence keeps it).\n- **4-bit QLoRA**; WandB off; HF hub on — edit `hf_hub.repo_id` to your repo.\n\n⚠️ If \"CheXpert CSV: NOT FOUND\" prints, add `mimic-cxr-2.0.0-chexpert.csv` to the data so PNU abnormality guidance is active (training still runs without it, just no PNU).",
    "id": "cell-cfg-md"
   },
   {
@@ -565,7 +294,7 @@
     },
     "outputId": "80ddabe3-bc8b-4d14-94e2-26ff9e64970c"
    },
-   "source": "from omegaconf import OmegaConf\n\ntrain_cfg = OmegaConf.load(PROJECT / 'configs' / 'train_config.yaml')\nmodel_cfg = OmegaConf.load(PROJECT / 'configs' / 'model_config.yaml')\n\n# ── dataset selector ──\ntrain_cfg.data.dataset_name = DATASET_NAME\n\n# ── training-scheme switches (thesis ablations) ──\n#   report_mode: 'split'         → 2 tasks (findings + impression separately)\n#                'merged'        → 1 task (full report \"Findings: ...\\n\\nImpression: ...\")\n#                'split_cascade' → split, but impression's context = GT findings\n#   image_mode : 'all_views_split' | 'frontal_only_split' | 'multi_image_merged'\ntrain_cfg.data.report_mode             = 'split'\ntrain_cfg.data.image_mode              = 'all_views_split'\ntrain_cfg.data.max_images_per_sample   = 2          # only used in multi_image_merged\n\n# ── dataset-specific paths ──\nif DATASET_NAME == 'MIMIC-CXR':\n    train_cfg.data.mimic_cxr_root = str(CXR_ROOT)\n    # Base path; the resolver suffixes __{report_mode}__{image_mode} and\n    # auto-builds (PNU CheXpert + VQA) via data.mimic_cxr_builder.\n    train_cfg.data.instruct_json  = str(mimic_json_path)\n    train_cfg.data.mimic_auto_build = True\n\n    # RaDialog / U-MultiClass abnormality guidance: locate the CheXpert\n    # label CSV so the builder can bake the PNU structured_findings string.\n    _cx = (sorted(DATA_SRC.rglob('*chexpert*.csv'))\n           or sorted(DATA_SRC.rglob('*chexbert*.csv')))\n    train_cfg.data.mimic_chexpert_csv = str(_cx[0]) if _cx else None\n    print('CheXpert CSV :', train_cfg.data.mimic_chexpert_csv\n          or 'NOT FOUND — PNU abnormality guidance DISABLED!')\n\n    # VQA pairs ({train,valid,test}.json) → abnormality-guided VQA.\n    train_cfg.data.mimic_vqa_root = str(VQA_ROOT) if VQA_ROOT is not None else None\n    print('VQA root     :', train_cfg.data.mimic_vqa_root or '(none — VQA skipped)')\nelse:  # IU-Xray\n    train_cfg.data.iu_xray.images_dir    = str(IU_IMAGES_DIR)\n 
   train_cfg.data.iu_xray.labels_dir    = str(IU_LABELS_DIR)\n    train_cfg.data.iu_xray.instruct_json = str(iu_json_path)\n    train_cfg.data.iu_xray.auto_build    = True\n\ntrain_cfg.data.train_split = 'train'\ntrain_cfg.data.val_split   = 'validate'\ntrain_cfg.data.test_split  = 'test'\n\n# ── checkpoint root (Persistence keeps /content/ckpt/) ──\nCKPT_ROOT = WORK / 'ckpt'\ntrain_cfg.training.output_root = str(CKPT_ROOT)\n\n# ── batching ──\ntrain_cfg.training.per_device_train_batch_size = 4\ntrain_cfg.training.per_device_eval_batch_size  = 4\ntrain_cfg.training.gradient_accumulation_steps = 4\ntrain_cfg.training.fp16                        = False\ntrain_cfg.training.bf16                        = True\ntrain_cfg.training.dataloader_num_workers      = 8\n\ntrain_cfg.stage2.num_epoch                     = 5\n\n# ── wandb off ──\ntrain_cfg.wandb.enabled = False\n\n# ── HuggingFace Hub run tracking ──\ntrain_cfg.hf_hub.enabled        = True\ntrain_cfg.hf_hub.repo_id        = 'hieu3636/cxr-vlm-runs'   # <<< EDIT ME\ntrain_cfg.hf_hub.token_env      = 'HF_TOKEN'\ntrain_cfg.hf_hub.private        = True\ntrain_cfg.hf_hub.run_state_file = str(CKPT_ROOT / 'run_id.txt')\n\n# ── 4-bit QLoRA ──\nmodel_cfg.llm.load_in_8bit = False\nmodel_cfg.llm.load_in_4bit = True\n# Oracle PNU path does NOT use the CheXpert classifier module (labels come\n# from the GT csv baked into the prompt). Keep it disabled until you wire\n# the learned classifier for realistic inference.\nmodel_cfg.chexpert_classifier.enabled = False\n\nOmegaConf.save(train_cfg, PROJECT / 'configs' / 'train_config.yaml')\nOmegaConf.save(model_cfg, PROJECT / 'configs' / 'model_config.yaml')\n\nprint('--- train_cfg.data ---');    print(OmegaConf.to_yaml(train_cfg.data))\nprint('--- train_cfg.training ---');print(OmegaConf.to_yaml(train_cfg.training))\nprint('--- train_cfg.hf_hub ---');  print(OmegaConf.to_yaml(train_cfg.hf_hub))\nprint('--- model_cfg.llm ---');     print(OmegaConf.to_yaml(model_cfg.llm))\n",
    "execution_count": null,
    "outputs": [],
    "id": "cell-cfg"
@@ -591,29 +320,9 @@
     },
     "outputId": "8a2ce693-94fc-4425-f62c-679614d6dab5"
    },
-   "source": [
-    "# HF_TOKEN setup. On non-Kaggle platforms it's already set inside cell-paths\n",
-    "# (needed to pull code + data). Here we only handle the Kaggle path.\n",
-    "try:\n",
-    "    if PLATFORM == 'kaggle':\n",
-    "        from kaggle_secrets import UserSecretsClient\n",
-    "        os.environ['HF_TOKEN'] = UserSecretsClient().get_secret('HF_TOKEN')\n",
-    "    # Other platforms: already populated in cell-paths\n",
-    "    assert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing'\n",
-    "    print('HF_TOKEN loaded ✓')\n",
-    "except Exception as e:\n",
-    "    print('No HF_TOKEN — Vicuna-7B download may rate-limit and hub upload will be disabled:', e)\n"
-   ],
    "execution_count": null,
-   "outputs": [
-    {
-     "output_type": "stream",
-     "name": "stdout",
-     "text": [
-      "HF_TOKEN loaded ✓\n"
-     ]
-    }
-   ],
    "id": "cell-hf-token"
   },
   {
@@ -668,24 +377,7 @@
    "metadata": {
     "id": "cell-mode-md"
    },
-   "source": [
-    "## 5b. Resume controller\n",
-    "\n",
-    "Single switch. No more \"which stage\" — `train.py` auto-detects which stage\n",
-    "to continue from by inspecting checkpoints on disk.\n",
-    "\n",
-    "| MODE                | What happens |\n",
-    "|---------------------|--------------|\n",
-    "| `'fresh'`           | Allocate a brand-new `{DATASET}_run_N+1` folder. Train both stages from scratch. |\n",
-    "| `'resume'`          | Reuse latest matching `{DATASET}_run_N` (or `EXPLICIT_RUN_ID`). Auto-detect: stage 1 mid-checkpoint, stage 1 done → stage 2 fresh, stage 2 mid-checkpoint, or both done. |\n",
-    "\n",
-    "`EXPLICIT_RUN_ID` is optional (set to `None` to auto-pick the latest run on\n",
-    "disk or HF Hub that matches the current dataset prefix).\n",
-    "\n",
-    "When `MODE='resume'` on a fresh VM the train cell will pull the previous\n",
-    "run's checkpoints from HF before training. The `--mode resume` flag in\n",
-    "`train.py` does the auto-detect — no further action needed in the notebook."
-   ],
    "id": "cell-resume-md"
   },
   {

    "metadata": {
     "id": "cell-0"
    },
+   "source": "# CXR-VLM — Kaggle / Colab Training Notebook (consolidated)\n\nTrains the 2-stage CXR-VLM (Vicuna-7B + BioViL-T encoder, falling back to a timm ViT, + LoRA) on a Kaggle **T4** or Colab **A100 / L4** GPU.\n\nSupports **three datasets**, selected by `DATASET_NAME` in section 0:\n- **`MIMIC-CXR_resized`** *(default)* — filtered + resized subset of MIMIC-CXR, distributed as tar shards. Manifest-driven (`manifest_{train,val,test}.csv` + `vqa/*.json`, reports inside the tars). 3 tasks (findings, impression, VQA). Lighter than full MIMIC, balanced val/test.\n- **`MIMIC-CXR`** — full pre-split MIMIC-CXR (3 tasks). Heavy; needs the original `train/valid/test` tree + chexpert.csv + VQA pairs.\n- **`IU-Xray`** — 2 tasks only (findings, impression). ~7.5k images, fastest sanity run.\n\n### Source-of-truth\n\nAll platforms (kaggle / colab / lightning / gcp / local) pull code + data from **HuggingFace Hub** — no Kaggle dataset attach is needed anymore. Two repos are required:\n\n| Repo | Contents |\n|---|---|\n| `<HF_USER>/cxr-vlm-code` | project source (configs/, data/*.py, model/, training/, evaluation/, utils/, requirements.txt) |\n| `<HF_USER>/cxr-vlm-data` | tar shards under `MIMIC-CXR_resized/` **or** `MIMIC-CXR.zip` **or** `IU-Xray.zip` |\n\n### Settings\n\n- **Kaggle**: accelerator **T4 x2** (only GPU 0 used); Persistence: **Variables and Files**; Internet: **On**\n- **Colab**: any GPU (A100 recommended); enable Files (persisted under `/content`)\n\n### Secrets\n\n- `HF_TOKEN` — HuggingFace token with **write** access to the runs repo (`hf_hub.repo_id` in config); the same token is also used to pull the code and data repos above, so it needs read access to them. Read from Kaggle Secrets (Add-ons → Secrets) or Colab userdata (🔑 sidebar).",
    "id": "cell-0"
   },
   {
     },
     "outputId": "d6e7ebbd-4f1b-483b-f20f-0df3997a60b7"
    },
+   "source": "# ── Platform + dataset selectors ──────────────────────────────────\n# PLATFORM drives storage paths and how secrets are read.\n# Supported: 'kaggle' | 'colab' | 'lightning' | 'gcp' | 'local'\nPLATFORM     = 'colab'\nDATASET_NAME = 'MIMIC-CXR_resized'   # 'MIMIC-CXR' | 'MIMIC-CXR_resized' | 'IU-Xray'\n\nassert PLATFORM     in ('kaggle', 'colab', 'lightning', 'gcp', 'local')\nassert DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized', 'IU-Xray')\nprint(f'PLATFORM = {PLATFORM} | DATASET_NAME = {DATASET_NAME}')",
    "execution_count": null,
+   "outputs": [],
    "id": "cell-select"
   },
   {
     },
     "outputId": "f6195a48-56b2-4052-8367-c8ec14c48a05"
    },
+   "source": "# ── Per-platform storage + source-of-truth ─────────────────────────\n# All platforms (kaggle / colab / lightning / gcp / local) pull code +\n# data from HF Hub. The only platform-specific bit is:\n#   * WORK  : where to land outputs (persisted dirs differ per host)\n#   * TOKEN : how HF_TOKEN reaches os.environ (secrets API differs)\n#\n# Required HF repos:\n#   <HF_USER>/cxr-vlm-code   — project source (flat folder)\n#   <HF_USER>/cxr-vlm-data   — per-dataset payloads:\n#                                MIMIC-CXR_resized/   (tar shards + manifests + vqa)\n#                                MIMIC-CXR.zip        (single zip)\n#                                IU-Xray.zip          (single zip)\n\nHF_USER = 'hieu3636'   # <<< EDIT ME\n\n# ── 1) WORK dir + HF_TOKEN bootstrap (platform-specific) ───────────\nif PLATFORM == 'kaggle':\n    from kaggle_secrets import UserSecretsClient\n    os.environ['HF_TOKEN'] = UserSecretsClient().get_secret('HF_TOKEN')\n    WORK = Path('/kaggle/working')\nelif PLATFORM == 'colab':\n    from google.colab import userdata\n    os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n    WORK = Path('/content')\nelif PLATFORM == 'lightning':\n    WORK = Path('/teamspace/studios/this_studio')\nelif PLATFORM == 'gcp':\n    WORK = Path('/workspace')\nelse:  # 'local'\n    WORK = Path.home() / 'cxr-vlm-work'\nWORK.mkdir(parents=True, exist_ok=True)\n\nassert os.environ.get('HF_TOKEN'), \\\n    'HF_TOKEN missing — set it via the platform secrets UI before re-running.'\n\ntry:\n    from huggingface_hub import snapshot_download, hf_hub_download, HfApi\nexcept ImportError:\n    !pip install -q huggingface_hub\n    from huggingface_hub import snapshot_download, hf_hub_download, HfApi\n\n# ── 2) Code: flat folder, few hundred files → snapshot_download ──\nprint(f'Pulling code from HF (user: {HF_USER}) …')\nCODE_SRC = Path(snapshot_download(\n    repo_id   = f'{HF_USER}/cxr-vlm-code',\n    repo_type = 'model',\n    token     = 
os.environ['HF_TOKEN'],\n    local_dir = str(WORK / 'cxr-vlm-code'),\n))\n\n# ── 3) Data: layout depends on DATASET_NAME ──\nDATA_SRC = WORK / 'data'\nDATA_SRC.mkdir(parents=True, exist_ok=True)\n\nif DATASET_NAME == 'MIMIC-CXR_resized':\n    # Tar-sharded payload. Reports + images live INSIDE the tars under\n    # `files/pXX/pXXXX/{sYYYY/*.jpg, sYYYY.txt}` so extracting all shards\n    # gives one unified tree. We download manifests + vqa + SHARDS.txt\n    # first (small, ~tens of MB), then each *.tar one at a time →\n    # extract → delete (saves disk).\n    # Final on-disk layout:\n    #   DATA_SRC/MIMIC-CXR_resized/\n    #     ├── manifest_{train,val,test}.csv\n    #     ├── vqa/ {vqa.json, vqa_val.json, vqa_test.json}\n    #     ├── SHARDS.txt + _manifest.json\n    #     └── files/pXX/pXXXX/                         ← from tars\n    #           ├── sYYYY.txt                          (report)\n    #           └── sYYYY/<dicom>.jpg                  (images)\n    import tarfile\n    mr_dir = DATA_SRC / 'MIMIC-CXR_resized'\n    mr_dir.mkdir(parents=True, exist_ok=True)\n    files_dir = mr_dir / 'files'\n\n    # Marker: if files/ already has shards extracted AND manifests exist,\n    # skip everything. 
Lets the cell be re-run safely.\n    manifests_present = all(\n        (mr_dir / f).is_file() for f in ('manifest_train.csv', 'manifest_val.csv', 'manifest_test.csv')\n    )\n    if manifests_present and files_dir.is_dir() and any(files_dir.glob('p*')):\n        print(f'{mr_dir} already populated — skipping download.')\n    else:\n        api = HfApi(token=os.environ['HF_TOKEN'])\n        all_files = api.list_repo_files(\n            repo_id=f'{HF_USER}/cxr-vlm-data', repo_type='dataset')\n        mr_files = [f for f in all_files if f.startswith('MIMIC-CXR_resized/')]\n        tar_files = sorted(f for f in mr_files if f.endswith('.tar'))\n        meta_files = [f for f in mr_files if not f.endswith('.tar')]\n        print(f'MIMIC-CXR_resized on HF: {len(tar_files)} tar shards + {len(meta_files)} metadata files')\n\n        # 3a) Pull metadata (manifests, vqa, SHARDS.txt, _manifest.json)\n        #     in one snapshot (small; few MB).\n        print(f'  downloading manifests + vqa + SHARDS.txt …')\n        snapshot_download(\n            repo_id        = f'{HF_USER}/cxr-vlm-data',\n            repo_type      = 'dataset',\n            allow_patterns = ['MIMIC-CXR_resized/*.csv',\n                              'MIMIC-CXR_resized/*.json',\n                              'MIMIC-CXR_resized/*.txt',\n                              'MIMIC-CXR_resized/vqa/**'],\n            token          = os.environ['HF_TOKEN'],\n            local_dir      = str(DATA_SRC),\n        )\n\n        # 3b) Sequentially fetch + extract + delete each image tar to\n        #     minimise peak disk usage (each shard ~2 GB). 
Reports come\n        #     out alongside images — both land under mr_dir/files/.\n        print(f'  downloading + extracting {len(tar_files)} tar shards …')\n        for i, tf in enumerate(tar_files, 1):\n            print(f'    [{i}/{len(tar_files)}] {tf}')\n            tar_path = Path(hf_hub_download(\n                repo_id=f'{HF_USER}/cxr-vlm-data', repo_type='dataset',\n                filename=tf, token=os.environ['HF_TOKEN'],\n                local_dir=str(DATA_SRC),\n            ))\n            with tarfile.open(tar_path) as t:\n                # Extract into mr_dir so member paths like\n                # \"files/p10/.../*.jpg\" + \"files/p10/.../*.txt\" land at\n                # mr_dir/files/p10/…\n                t.extractall(mr_dir)\n            tar_path.unlink(missing_ok=True)\n        print(f'  done. {mr_dir} ready.')\n\nelse:\n    # MIMIC-CXR / IU-Xray: single zip per dataset (legacy path)\n    import zipfile\n    zip_name = f'{DATASET_NAME}.zip'            # 'IU-Xray.zip' | 'MIMIC-CXR.zip'\n    marker   = DATA_SRC / DATASET_NAME           # DATA_SRC/IU-Xray after unzip\n\n    if not marker.exists():\n        print(f'Pulling {zip_name} from HF …')\n        zpath = hf_hub_download(\n            repo_id   = f'{HF_USER}/cxr-vlm-data',\n            filename  = zip_name,\n            repo_type = 'dataset',\n            token     = os.environ['HF_TOKEN'],\n            local_dir = str(DATA_SRC),\n        )\n        print(f'  unzipping → {DATA_SRC}')\n        with zipfile.ZipFile(zpath) as zf:\n            zf.extractall(DATA_SRC)\n        try:\n            os.remove(zpath)  # free disk\n        except OSError:\n            pass\n    else:\n        print(f'{marker} already present — skipping download.')\n\nprint(f'Contents of {DATA_SRC}: {sorted(os.listdir(DATA_SRC))}')\n\n# ── Common: copy code into writable PROJECT dir ────────────────────\nPROJECT = WORK / 'cxr_vlm'\nif CODE_SRC.resolve() != PROJECT.resolve() and not PROJECT.exists():\n    
shutil.copytree(CODE_SRC, PROJECT)\n\nos.chdir(PROJECT)\nsys.path.insert(0, str(PROJECT))\nprint('PLATFORM :', PLATFORM)\nprint('CODE_SRC :', CODE_SRC)\nprint('DATA_SRC :', DATA_SRC)\nprint('PROJECT  :', PROJECT)\nprint('WORK     :', WORK)",
    "execution_count": null,
+   "outputs": [],
    "id": "cell-paths"
   },
   {
    "metadata": {
     "id": "cell-data-md"
    },
+   "source": "## 2. Locate data\n\nAll datasets are pulled from the single `<HF_USER>/cxr-vlm-data` HF Hub repo (on every platform, including Kaggle) and extracted under `DATA_SRC`. Expected layouts:\n\n**MIMIC-CXR_resized** *(default)*:\n```\nDATA_SRC/\n└── MIMIC-CXR_resized/\n    ├── manifest_train.csv          ← drives split + chex_* + has_vqa\n    ├── manifest_val.csv\n    ├── manifest_test.csv\n    ├── vqa/\n    │   ├── vqa.json                ← train VQA pairs\n    │   ├── vqa_val.json\n    │   └── vqa_test.json\n    ├── files/                      ← extracted from tar shards\n    │   └── pXX/pXXXXXXXX/\n    │       ├── sYYYYYYYY.txt       ← report (alongside, at patient dir)\n    │       └── sYYYYYYYY/<dicom>.jpg\n    ├── SHARDS.txt\n    └── _manifest.json\n```\n\n**MIMIC-CXR** (legacy pre-split):\n```\nDATA_SRC/\n├── MIMIC-CXR/{train,valid,test}/p10/pXXXXXX/sYYYYY/*.jpg + sYYYYY.txt\n└── .../MIMIC-Ext-MIMIC-CXR-VQA/dataset/{train,valid,test}.json\n```\n\n**IU-Xray**:\n```\nDATA_SRC/\n└── IU-Xray/\n    ├── images/        # CXR*_IM-*-*.png (~7.5k files)\n    └── labels/        # {1..3999}.xml   (~3.9k files, flat — no ecgen-radiology subfolder)\n```",
    "id": "cell-data-md"
   },
   {
     },
     "outputId": "53c15833-f9bb-4457-95d6-3fa83f4dc909"
    },
+   "source": "def find_split_parent(root: Path) -> Path:\n    for cand in [root, root / 'MIMIC-CXR', root / 'data' / 'MIMIC-CXR']:\n        if (cand / 'train').exists() and (cand / 'valid').exists() and (cand / 'test').exists():\n            return cand\n    for p in root.rglob('train'):\n        if p.is_dir() and (p.parent / 'valid').exists() and (p.parent / 'test').exists():\n            return p.parent\n    raise FileNotFoundError('Could not find train/ valid/ test/ under ' + str(root))\n\n\ndef find_mimic_resized_root(root: Path) -> Path:\n    \"\"\"Find the MIMIC-CXR_resized payload — folder with manifest_*.csv + files/.\"\"\"\n    for cand in [root / 'MIMIC-CXR_resized', root, *root.rglob('MIMIC-CXR_resized')]:\n        if (cand / 'manifest_train.csv').is_file():\n            return cand\n    raise FileNotFoundError(\n        f'Could not find MIMIC-CXR_resized payload under {root}. '\n        f'Expected manifest_train.csv (alongside manifest_val.csv / manifest_test.csv).'\n    )\n\n\ndef find_iu_dirs(root: Path):\n    \"\"\"Locate IU-Xray `images/` and `labels/` (flat XMLs) under `root`.\n\n    Resolution order:\n      1. `{root}/IU-Xray/{images,labels}` — canonical layout.\n      2. Any nested `IU-Xray` folder that contains both.\n      3. 
Fallback: any folder containing CXR*.png (images) and\n         any folder containing *.xml — whichever comes first.\n\n    The labels subfolder is treated as a flat directory of XMLs (we no\n    longer require the legacy `ecgen-radiology/` subfolder).\n    \"\"\"\n    # Canonical + nested\n    for cand in [root / 'IU-Xray', *root.rglob('IU-Xray')]:\n        if not cand.is_dir():\n            continue\n        imgs = cand / 'images'\n        lbls = cand / 'labels'\n        if imgs.is_dir() and lbls.is_dir() and any(lbls.glob('*.xml')):\n            return imgs, lbls\n        # Legacy: labels/ecgen-radiology/*.xml\n        legacy = lbls / 'ecgen-radiology'\n        if imgs.is_dir() and legacy.is_dir() and any(legacy.glob('*.xml')):\n            return imgs, legacy\n\n    # Fallback: any images/ with CXR*.png + any folder with XML\n    img_dir = lbl_dir = None\n    for cand in [root / 'images', *root.rglob('images')]:\n        if cand.is_dir() and any(cand.glob('CXR*.png')):\n            img_dir = cand; break\n    for cand in [root / 'labels', *root.rglob('labels')]:\n        if cand.is_dir() and any(cand.glob('*.xml')):\n            lbl_dir = cand; break\n    if lbl_dir is None:\n        # very last resort — any ecgen-radiology folder with XMLs\n        for cand in root.rglob('ecgen-radiology'):\n            if cand.is_dir() and any(cand.glob('*.xml')):\n                lbl_dir = cand; break\n    return img_dir, lbl_dir\n\n\n# Filled in below depending on DATASET_NAME\nCXR_ROOT      = None                  # MIMIC-CXR root (with train/valid/test subdirs)\nSPLIT_DIRS    = None                  # MIMIC only\nVQA_ROOT      = None                  # MIMIC only\nMR_ROOT       = None                  # MIMIC-CXR_resized root (manifests + files/ + vqa/)\nIU_IMAGES_DIR = None                  # IU-Xray only\nIU_LABELS_DIR = None                  # IU-Xray only\n\nif DATASET_NAME == 'MIMIC-CXR':\n    CXR_ROOT = find_split_parent(DATA_SRC)\n    print('MIMIC-CXR root:', 
CXR_ROOT)\n\n    SPLIT_DIRS = {\n        'train'   : ('train', CXR_ROOT / 'train'),\n        'validate': ('valid', CXR_ROOT / 'valid'),\n        'test'    : ('test',  CXR_ROOT / 'test'),\n    }\n    for s, (sub, d) in SPLIT_DIRS.items():\n        assert d.exists(), f'Missing split dir: {d}'\n        print(f'  {s:<9s} → {d}')\n\n    for p in DATA_SRC.rglob('MIMIC-Ext-MIMIC-CXR-VQA'):\n        cand = p / 'dataset'\n        if cand.exists() and (cand / 'train.json').exists():\n            VQA_ROOT = cand\n            break\n    assert VQA_ROOT is not None, 'VQA dataset folder not found under ' + str(DATA_SRC)\n    print('VQA root:', VQA_ROOT)\n\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n    MR_ROOT = find_mimic_resized_root(DATA_SRC)\n    print('MIMIC-CXR_resized root:', MR_ROOT)\n    # Sanity: 3 manifest CSVs, files/ (images+reports), vqa/\n    for cf in ('manifest_train.csv', 'manifest_val.csv', 'manifest_test.csv'):\n        f = MR_ROOT / cf\n        print(f'  {cf}: {\"OK\" if f.is_file() else \"MISSING\"}')\n    for sub in ('files', 'vqa'):\n        d = MR_ROOT / sub\n        print(f'  {sub:<5s}: {\"OK\" if d.is_dir() else \"MISSING\"}  ({d})')\n    # Spot-check one report (.txt) sits at patient-dir level inside files/\n    txt_hits = list((MR_ROOT / 'files').glob('p*/p*/s*.txt')) if (MR_ROOT / 'files').is_dir() else []\n    print(f'  reports inside files/ : {len(txt_hits):,} found (sample: {txt_hits[0] if txt_hits else \"—\"})')\n\nelse:   # IU-Xray\n    IU_IMAGES_DIR, IU_LABELS_DIR = find_iu_dirs(DATA_SRC)\n    assert IU_IMAGES_DIR is not None, f'IU images/ not found under {DATA_SRC}'\n    assert IU_LABELS_DIR is not None, f'IU labels/ (with *.xml) not found under {DATA_SRC}'\n    print('IU images dir:', IU_IMAGES_DIR, '→', len(list(IU_IMAGES_DIR.glob('*.png'))), 'PNGs')\n    print('IU labels dir:', IU_LABELS_DIR, '→', len(list(IU_LABELS_DIR.glob('*.xml'))), 'XMLs')",
    "execution_count": null,
+   "outputs": [],
    "id": "cell-find-data-mimic"
   },
   {
    "metadata": {
     "id": "cell-json-md"
    },
+   "source": "## 3. Build the unified instruction JSON\n\n- **MIMIC-CXR_resized**: auto-built by `utils.dataset_resolver` → `data.mimic_cxr_resized_builder` the first time `train.py` / `evaluate.py` runs. It reads `manifest_{train,val,test}.csv` (which carry the split label, image/report relpath, and the 14 CheXpert `chex_*` columns → PNU `Positive/Negative/Uncertain Abnormalities` string), parses findings + impression from each report, and attaches abnormality-guided VQA from `vqa/{vqa,vqa_val,vqa_test}.json`. The inline cells below are **no-ops** for this dataset.\n- **MIMIC-CXR** (full, pre-split): auto-built by `data.mimic_cxr_builder` (CheXpert.csv-based). Inline cells are no-ops here too.\n- **IU-Xray**: built by `data.iu_xray_builder` in the cell below (resolver would also do it lazily; we build here just to get a summary log).\n\nAll three paths produce the same JSON schema (`image_path`, `task`, `target`, `question`, `structured_findings`, `split`, …) so `CXRInstructDataset` loads them unchanged.",
    "id": "cell-json-md"
   },
   {
     },
     "outputId": "3b965273-ac82-41a7-8ead-895094e0a8b1"
    },
+   "source": "# MIMIC-CXR and MIMIC-CXR_resized: the unified JSON is built lazily by\n# utils.dataset_resolver (→ data.mimic_cxr_builder for MIMIC-CXR, or\n# → data.mimic_cxr_resized_builder for MIMIC-CXR_resized) when train.py /\n# evaluate.py first run. The old inline parse/build cells are no-ops for\n# both; IU-Xray still gets a friendly inline build below for the log.\nif DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n    print(f'{DATASET_NAME}: JSON build handled by the resolver — skipping inline parse cell.')\nelse:\n    print('IU-Xray: skipping MIMIC indexing cell.')",
    "execution_count": null,
    "outputs": [],
    "id": "cell-parse"
     },
     "outputId": "91e11a3e-7b4e-4457-c32a-70a17fb2ef2a"
    },
+   "source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n    print(f'{DATASET_NAME}: findings/impression built by the resolver — skipping.')\nelse:\n    samples = None\n    print('IU-Xray: skipping MIMIC report parsing cell.')",
    "execution_count": null,
    "outputs": [],
    "id": "cell-build-findings"
     },
     "outputId": "fc5fd7bc-cb80-49aa-da79-0097ed038b5d"
    },
+   "source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n    print(f'{DATASET_NAME}: VQA attached by the resolver builder '\n          '(with the same PNU CheXpert context) — skipping inline VQA cell.')\nelse:\n    print('IU-Xray: skipping MIMIC VQA cell.')",
    "execution_count": null,
    "outputs": [],
    "id": "cell-build-vqa"
     },
     "outputId": "b4d6589b-19be-4eb8-ba25-d533248439b9"
    },
+   "source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n    print(f'{DATASET_NAME}: image-existence filtering handled inside the resolver '\n          'builder — skipping.')\nelse:\n    print('IU-Xray: skipping.')",
    "execution_count": null,
    "outputs": [],
    "id": "cell-filter"
     },
     "outputId": "b6d95196-0383-4a50-8424-9ef95eb7b34e"
    },
+   "source": "out_dir = PROJECT / 'data' / 'data_files'\nout_dir.mkdir(parents=True, exist_ok=True)\n\nif DATASET_NAME == 'MIMIC-CXR':\n    # Base path only — the resolver appends __{report_mode}__{image_mode}\n    # and builds it (PNU CheXpert + abnormality-guided VQA) via\n    # data.mimic_cxr_builder the first time train.py / evaluate.py runs.\n    mimic_json_path = out_dir / 'mimic_cxr_instruct_unified.json'\n    print('MIMIC-CXR: instruct JSON auto-built by resolver →',\n          f'{mimic_json_path.stem}__<report_mode>__<image_mode>.json')\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n    # Same lazy-build story but via data.mimic_cxr_resized_builder.\n    mr_json_path = out_dir / 'mimic_cxr_resized_instruct.json'\n    print('MIMIC-CXR_resized: instruct JSON auto-built by resolver →',\n          f'{mr_json_path.stem}__<report_mode>__<image_mode>.json')\nelse:\n    # Build IU-Xray JSON here so the notebook shows a nice summary log\n    # (the resolver would also do this lazily).\n    from data.iu_xray_builder import build_iu_xray_instruct_json\n    iu_json_path = out_dir / 'iu_xray_instruct.json'\n    build_iu_xray_instruct_json(\n        images_dir  = str(IU_IMAGES_DIR),\n        labels_dir  = str(IU_LABELS_DIR),\n        output_path = str(iu_json_path),\n        train_ratio = 0.70, val_ratio = 0.15, test_ratio = 0.15, seed = 42,\n    )",
    "execution_count": null,
    "outputs": [],
    "id": "cell-save-json"
    "metadata": {
     "id": "cell-cfg-md"
    },
+   "source": "## 4. Patch configs for the Kaggle/Colab environment\n\n- Sets `data.dataset_name`, `report_mode`, `image_mode`.\n- **MIMIC-CXR_resized** *(default)*: sets `mimic_cxr_resized.root` (the manifests + `files/` + `vqa/` payload; reports sit inside `files/`). `manifest_dir` / `vqa_dir` / `reports_root` are left null so the resolver falls back to its defaults: manifests in `{root}/`, VQA in `{root}/vqa/`, and reports auto-probed at `{root}` first, then `{root}/reports/`. The builder reads `chex_*` columns directly — no separate CheXpert CSV is needed.\n- **MIMIC-CXR**: sets `mimic_cxr_root`, the `instruct_json` base path, auto-discovers the **CheXpert CSV** (`mimic_chexpert_csv`) and the **VQA** dir (`mimic_vqa_root`), and turns on `mimic_auto_build`.\n- **IU-Xray**: points `iu_xray.images_dir/labels_dir/instruct_json` at the mount.\n- `tasks.*.weight` is left at the config defaults (findings 0.30 / impression 0.20 / vqa 0.50). `WeightedRandomSampler` in `CXRTrainer._get_train_sampler` enforces the mix at train time — see `data/dataset.py:get_per_sample_weights`.\n- `training.output_root` under `WORK/ckpt` (Persistence keeps it).\n- **4-bit QLoRA**; WandB off; HF hub on — edit `hf_hub.repo_id` to your repo.\n\n⚠️ MIMIC-CXR (full) path: if \"CheXpert CSV : NOT FOUND\" prints, add `mimic-cxr-2.0.0-chexpert.csv` to the data so PNU abnormality guidance is active (training still runs without it, just no PNU). For MIMIC-CXR_resized this is N/A — labels are baked into the manifest.",
    "id": "cell-cfg-md"
   },
   {
     },
     "outputId": "80ddabe3-bc8b-4d14-94e2-26ff9e64970c"
    },
+   "source": "from omegaconf import OmegaConf\n\ntrain_cfg = OmegaConf.load(PROJECT / 'configs' / 'train_config.yaml')\nmodel_cfg = OmegaConf.load(PROJECT / 'configs' / 'model_config.yaml')\n\n# ── dataset selector ──\ntrain_cfg.data.dataset_name = DATASET_NAME\n\n# ── training-scheme switches (thesis ablations) ──\n#   report_mode: 'split'         → 2 tasks (findings + impression separately)\n#                'merged'        → 1 task (full report \"Findings: ...\\n\\nImpression: ...\")\n#                'split_cascade' → split, but impression's context = GT findings\n#   image_mode : 'all_views_split' | 'frontal_only_split' | 'multi_image_merged'\ntrain_cfg.data.report_mode             = 'split'\ntrain_cfg.data.image_mode              = 'all_views_split'\ntrain_cfg.data.max_images_per_sample   = 2          # only used in multi_image_merged\n\n# ── dataset-specific paths ──\nif DATASET_NAME == 'MIMIC-CXR':\n    train_cfg.data.mimic_cxr_root = str(CXR_ROOT)\n    # Base path; the resolver suffixes __{report_mode}__{image_mode} and\n    # auto-builds (PNU CheXpert + VQA) via data.mimic_cxr_builder.\n    train_cfg.data.instruct_json  = str(mimic_json_path)\n    train_cfg.data.mimic_auto_build = True\n\n    # RaDialog / U-MultiClass abnormality guidance: locate the CheXpert\n    # label CSV so the builder can bake the PNU structured_findings string.\n    _cx = (sorted(DATA_SRC.rglob('*chexpert*.csv'))\n           or sorted(DATA_SRC.rglob('*chexbert*.csv')))\n    train_cfg.data.mimic_chexpert_csv = str(_cx[0]) if _cx else None\n    print('CheXpert CSV :', train_cfg.data.mimic_chexpert_csv\n          or 'NOT FOUND — PNU abnormality guidance DISABLED!')\n\n    # VQA pairs ({train,valid,test}.json) → abnormality-guided VQA.\n    train_cfg.data.mimic_vqa_root = str(VQA_ROOT) if VQA_ROOT is not None else None\n    print('VQA root     :', train_cfg.data.mimic_vqa_root or '(none — VQA skipped)')\n\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n    # The MIMIC-CXR_resized builder 
is manifest-driven: it reads\n    # `manifest_{train,val,test}.csv` for split + the 14 chex_* labels\n    # (PNU bucketed directly from the CSV, no separate chexpert.csv needed),\n    # uses `report_relpath` from the manifest to find each .txt, and pulls\n    # VQA from `vqa/{vqa,vqa_val,vqa_test}.json`.\n    train_cfg.data.mimic_cxr_resized.root         = str(MR_ROOT)\n    train_cfg.data.mimic_cxr_resized.manifest_dir = None   # null → defaults to root\n    train_cfg.data.mimic_cxr_resized.vqa_dir      = None   # null → {root}/vqa\n    train_cfg.data.mimic_cxr_resized.reports_root = None   # null → auto-probe {root} then {root}/reports\n    train_cfg.data.mimic_cxr_resized.instruct_json = str(mr_json_path)\n    train_cfg.data.mimic_cxr_resized.auto_build   = True\n\nelse:  # IU-Xray\n    train_cfg.data.iu_xray.images_dir    = str(IU_IMAGES_DIR)\n    train_cfg.data.iu_xray.labels_dir    = str(IU_LABELS_DIR)\n    train_cfg.data.iu_xray.instruct_json = str(iu_json_path)\n    train_cfg.data.iu_xray.auto_build    = True\n\ntrain_cfg.data.train_split = 'train'\ntrain_cfg.data.val_split   = 'validate'\ntrain_cfg.data.test_split  = 'test'\n\n# ── checkpoint root (Persistence keeps /content/ckpt/) ──\nCKPT_ROOT = WORK / 'ckpt'\ntrain_cfg.training.output_root = str(CKPT_ROOT)\n\n# ── batching ──\ntrain_cfg.training.per_device_train_batch_size = 4\ntrain_cfg.training.per_device_eval_batch_size  = 4\ntrain_cfg.training.gradient_accumulation_steps = 4\ntrain_cfg.training.fp16                        = False\ntrain_cfg.training.bf16                        = True\ntrain_cfg.training.dataloader_num_workers      = 8\n\ntrain_cfg.stage2.num_epoch                     = 5\n\n# ── task weights (sampling ratio enforced by WeightedRandomSampler) ──\n# Defaults in train_config.yaml: 0.30 / 0.20 / 0.50 (RRG ≈ VQA, impression\n# lower because in split_cascade mode it sees GT findings as input).\n# Resolver auto-renormalizes and drops vqa for IU-Xray. 
Override here only\n# if you want to experiment per-run, e.g.:\n#   train_cfg.tasks.findings_generation.weight   = 0.30\n#   train_cfg.tasks.impression_generation.weight = 0.20\n#   train_cfg.tasks.vqa.weight                   = 0.50\n\n# ── wandb off ──\ntrain_cfg.wandb.enabled = False\n\n# ── HuggingFace Hub run tracking ──\ntrain_cfg.hf_hub.enabled        = True\ntrain_cfg.hf_hub.repo_id        = 'hieu3636/cxr-vlm-runs'   # <<< EDIT ME\ntrain_cfg.hf_hub.token_env      = 'HF_TOKEN'\ntrain_cfg.hf_hub.private        = True\ntrain_cfg.hf_hub.run_state_file = str(CKPT_ROOT / 'run_id.txt')\n\n# ── 4-bit QLoRA ──\nmodel_cfg.llm.load_in_8bit = False\nmodel_cfg.llm.load_in_4bit = True\n# Oracle PNU path does NOT use the CheXpert classifier module (labels come\n# from the GT csv/manifest baked into the prompt). Keep it disabled until\n# you wire the learned classifier for realistic inference.\nmodel_cfg.chexpert_classifier.enabled = False\n\nOmegaConf.save(train_cfg, PROJECT / 'configs' / 'train_config.yaml')\nOmegaConf.save(model_cfg, PROJECT / 'configs' / 'model_config.yaml')\n\nprint('--- train_cfg.data ---');    print(OmegaConf.to_yaml(train_cfg.data))\nprint('--- train_cfg.tasks ---');   print(OmegaConf.to_yaml(train_cfg.tasks))\nprint('--- train_cfg.training ---');print(OmegaConf.to_yaml(train_cfg.training))\nprint('--- train_cfg.hf_hub ---');  print(OmegaConf.to_yaml(train_cfg.hf_hub))\nprint('--- model_cfg.llm ---');     print(OmegaConf.to_yaml(model_cfg.llm))",
    "execution_count": null,
    "outputs": [],
    "id": "cell-cfg"
     },
     "outputId": "8a2ce693-94fc-4425-f62c-679614d6dab5"
    },
+   "source": "# HF_TOKEN was already loaded in cell-paths (uniformly across all platforms).\n# This cell is now just a confirmation + reminder.\nassert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing — re-run cell-paths.'\nprint('HF_TOKEN loaded ✓')",
    "execution_count": null,
+   "outputs": [],
    "id": "cell-hf-token"
   },
   {
    "metadata": {
     "id": "cell-mode-md"
    },
+   "source": "## 5b. Resume controller\n\nSingle switch. No more \"which stage\" — `train.py` auto-detects which stage to continue from by inspecting checkpoints on disk.\n\n| MODE       | What happens |\n|------------|--------------|\n| `'fresh'`  | Allocate a brand-new `{DATASET}_run_N+1` folder. Train both stages from scratch. |\n| `'resume'` | Reuse the latest matching `{DATASET}_run_N` (or `EXPLICIT_RUN_ID`). Auto-detect from local disk: stage 1 mid-checkpoint, stage 1 done → stage 2 fresh, stage 2 mid-checkpoint, or both done. |\n\n`EXPLICIT_RUN_ID` is optional (set to `None` to auto-pick the latest run on disk or HF Hub that matches the current dataset prefix).\n\n### Fresh-VM resume\n\nIf your Colab/Kaggle VM was reset and the local `ckpt/{run_id}/` is gone (persistence lost or switching machines), the train cell will **auto-pull** the previous run's `stage{1,2}/last/` + `stage1/best/` (= stage1 final) from HF Hub into the canonical local layout before training, so `detect_resume_point` can pick up where you left off. `timing.json` is also pulled so the session count and cumulative time keep incrementing.\n\n`run_id` resolution order (when `MODE='resume'`): `EXPLICIT_RUN_ID` > local `run_id.txt` > latest `{DATASET}_run_*` on HF Hub.",
    "id": "cell-resume-md"
   },
   {

training/train.py CHANGED Viewed

@@ -44,7 +44,7 @@ from model.rad_dino import BioViLTEncoder
 from data import CXRInstructDataset, CXRDataCollator
 from utils.logger import setup_logger
 from utils.checkpoint import save_checkpoint, load_checkpoint
-from utils.hf_uploader import build_tracker_from_cfg, pull_last_for_resume
 from utils.dataset_resolver import (
     resolve_dataset_spec,
     resolve_run_id,
@@ -263,6 +263,37 @@ def get_trainer(
                 model = self.model
             load_checkpoint(model, resume_from_checkpoint)
     return CXRTrainer(
         model           = model,
         args            = training_args,
@@ -649,6 +680,25 @@ def main():
         args.resume_from = local_resume
         logger.info(f"Will resume from pulled checkpoint: {local_resume} (stage{args.stage})")
     # ── Compute per-stage output dirs under {output_root}/{run_id}/ ──
     stage1_out = stage_dir(output_root, run_id,
                            str(train_cfg.stage1.get("subdir", "stage1_projection")))
@@ -724,6 +774,16 @@ def main():
             "resumed":      bool(args.resume_from),
             "resume_from":  args.resume_from,
         })
     # Build model
     logger.info("Building CXR VLM...")

 from data import CXRInstructDataset, CXRDataCollator
 from utils.logger import setup_logger
 from utils.checkpoint import save_checkpoint, load_checkpoint
+from utils.hf_uploader import build_tracker_from_cfg, pull_last_for_resume, hydrate_run_dir_from_hf
 from utils.dataset_resolver import (
     resolve_dataset_spec,
     resolve_run_id,
                 model = self.model
             load_checkpoint(model, resume_from_checkpoint)
+        def _get_train_sampler(self, *args, **kwargs):
+            """
+            Use `WeightedRandomSampler` when the train dataset is mixed-task
+            and exposes per-sample weights — this is what makes the configured
+            `tasks.*.weight` ratios actually control batch composition.
+            Falls back to HF's default (RandomSampler / DistributedSampler)
+            for single-task or eval-time datasets.
+            Notes:
+              * Eval is unaffected — HF's `_get_eval_sampler` returns a
+                `SequentialSampler` by default, so weighted reweighting only
+                applies to training.
+              * `replacement=True` is required for true oversampling — without
+                it you can't draw more samples of a rare-but-upweighted task
+                than physically exist. Tradeoff: a small fraction of samples
+                in a numerous-but-downweighted task may never appear in a
+                given epoch. Acceptable across multiple epochs.
+            """
+            ds = self.train_dataset
+            getter = getattr(ds, "get_per_sample_weights", None)
+            if getter is not None:
+                weights = getter()
+                if weights is not None:
+                    from torch.utils.data import WeightedRandomSampler
+                    return WeightedRandomSampler(
+                        weights     = weights,
+                        num_samples = len(ds),
+                        replacement = True,
+                    )
+            return super()._get_train_sampler(*args, **kwargs)
     return CXRTrainer(
         model           = model,
         args            = training_args,
         args.resume_from = local_resume
         logger.info(f"Will resume from pulled checkpoint: {local_resume} (stage{args.stage})")
+    # ── Fresh-VM resume: hydrate from HF before detect_resume_point ──
+    # When `--mode resume` is set but the local run dir is empty (Colab
+    # persistence lost, switching machines), pull configs + last/best
+    # checkpoints from HF Hub into the canonical local layout so the
+    # detector finds them. No-op if local already has artifacts or HF
+    # tracking is disabled.
+    if args.mode == "resume" and hf_repo_id and hf_token:
+        try:
+            hydrate_run_dir_from_hf(
+                repo_id       = hf_repo_id,
+                token         = hf_token,
+                run_id        = run_id,
+                output_root   = output_root,
+                stage1_subdir = str(train_cfg.stage1.get("subdir", "stage1_projection")),
+                stage2_subdir = str(train_cfg.stage2.get("subdir", "stage2_instruct")),
+            )
+        except Exception as e:
+            logger.warning(f"[resume hydrate] {type(e).__name__}: {e}")
     # ── Compute per-stage output dirs under {output_root}/{run_id}/ ──
     stage1_out = stage_dir(output_root, run_id,
                            str(train_cfg.stage1.get("subdir", "stage1_projection")))
             "resumed":      bool(args.resume_from),
             "resume_from":  args.resume_from,
         })
+        # Snapshot the resolved config + run_meta.json to HF so the run is
+        # self-describing on the hub (you can answer "what config did
+        # {run_id} actually use?" without pulling the whole checkpoint).
+        # `save_run_config` writes these into {run_dir}/configs/ +
+        # {run_dir}/run_meta.json a few lines above.
+        rd = run_dir(output_root, run_id)
+        if (rd / "configs").is_dir():
+            tracker.upload_folder(str(rd / "configs"), "configs")
+        if (rd / "run_meta.json").is_file():
+            tracker.upload_file(str(rd / "run_meta.json"), "run_meta.json")
     # Build model
     logger.info("Building CXR VLM...")

data/upload_to_hf_2.py → upload_to_hf_2.py RENAMED Viewed

File without changes

utils/dataset_resolver.py CHANGED Viewed

@@ -25,7 +25,7 @@ from pathlib import Path
 from typing import Dict, List, Optional
-SUPPORTED_DATASETS = ("MIMIC-CXR", "IU-Xray")
 @dataclass
@@ -113,6 +113,21 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
             train_cfg.data, report_mode, image_mode
         )
     else:  # IU-Xray
         # IU has no VQA.
         available = ["report"] if report_mode == "merged" else ["findings", "impression"]
@@ -233,6 +248,63 @@ def _ensure_mimic_json_exists(data_cfg,
     return str(out)
 # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
 def resolve_run_id(

 from typing import Dict, List, Optional
+SUPPORTED_DATASETS = ("MIMIC-CXR", "MIMIC-CXR_resized", "IU-Xray")
 @dataclass
             train_cfg.data, report_mode, image_mode
         )
+    elif name == "MIMIC-CXR_resized":
+        # Same semantic dataset as MIMIC-CXR (all 3 tasks), but it is
+        # manifest-driven: split membership and CheXpert labels come from
+        # manifest_{train,val,test}.csv rather than a pre-split directory
+        # tree or a separate split CSV. It is built by its own module,
+        # data.mimic_cxr_resized_builder, via the helper called below.
+        if report_mode == "merged":
+            available = ["report", "vqa"]
+        else:
+            available = ["findings", "impression", "vqa"]
+        mr = train_cfg.data.mimic_cxr_resized
+        image_root    = mr.root
+        instruct_json = _ensure_mimic_resized_json_exists(
+            mr, report_mode, image_mode
+        )
     else:  # IU-Xray
         # IU has no VQA.
         available = ["report"] if report_mode == "merged" else ["findings", "impression"]
     return str(out)
+def _ensure_mimic_resized_json_exists(mr_cfg,
+                                      report_mode: str = "split",
+                                      image_mode:  str = "all_views_split") -> str:
+    """
+    Build the MIMIC-CXR_resized unified JSON if missing.
+    This dataset is **manifest-driven**, not directory-walking:
+        - 3 manifest CSVs (manifest_{train,val,test}.csv) carry every row's
+          split label, image/report relative path, and the 14 CheXpert
+          labels as chex_* columns. No separate *split*.csv or *chexpert*.csv
+          is read.
+        - VQA is read from `vqa_dir/{vqa.json, vqa_val.json, vqa_test.json}`.
+    The cache path is suffixed with report_mode+image_mode (same convention
+    as the other two builders) so each mode combination gets its own cache.
+    """
+    base = Path(_get(mr_cfg, "instruct_json",
+                     "data/data_files/mimic_cxr_resized_instruct.json"))
+    out  = base.with_name(f"{base.stem}__{report_mode}__{image_mode}{base.suffix}")
+    if out.is_file():
+        return str(out)
+    if not bool(_get(mr_cfg, "auto_build", True)):
+        raise FileNotFoundError(
+            f"MIMIC-CXR_resized instruct JSON not found at {out} and "
+            f"auto_build=false. Run: python -m data.mimic_cxr_resized_builder "
+            f"--root {_get(mr_cfg, 'root')} --output {out} "
+            f"--report_mode {report_mode} --image_mode {image_mode}"
+        )
+    from data.mimic_cxr_resized_builder import build_mimic_cxr_resized_instruct_json
+    print(f"[dataset_resolver] MIMIC-CXR_resized JSON not found → auto-building "
+          f"(report_mode={report_mode}, image_mode={image_mode}) …")
+    root_path = str(_get(mr_cfg, "root"))
+    # Convention defaults: manifest CSVs sit at `root`, VQA at `{root}/vqa`.
+    # Either can be overridden in config; an explicit empty string for
+    # vqa_dir disables VQA entirely.
+    manifest_dir = _get(mr_cfg, "manifest_dir") or root_path
+    vqa_dir_cfg  = _get(mr_cfg, "vqa_dir")
+    if vqa_dir_cfg is None:
+        vqa_dir = str(Path(root_path) / "vqa")
+    elif vqa_dir_cfg == "":
+        vqa_dir = None     # explicit opt-out
+    else:
+        vqa_dir = str(vqa_dir_cfg)
+    build_mimic_cxr_resized_instruct_json(
+        root         = root_path,
+        manifest_dir = manifest_dir,
+        output_path  = str(out),
+        vqa_dir      = vqa_dir,
+        reports_root = _get(mr_cfg, "reports_root"),
+        report_mode  = report_mode,
+        image_mode   = image_mode,
+    )
+    return str(out)
 # ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
 def resolve_run_id(

utils/hf_uploader.py CHANGED Viewed

@@ -288,6 +288,157 @@ def pull_last_for_resume(
     return str(last_dir)
 def build_tracker_from_cfg(train_cfg, resuming: bool = False, explicit_run_id: Optional[str] = None):
     """Convenience factory from OmegaConf DictConfig."""
     hf = getattr(train_cfg, "hf_hub", None)

     return str(last_dir)
+def hydrate_run_dir_from_hf(
+    repo_id:       str,
+    token:         Optional[str],
+    run_id:        str,
+    output_root:   str,
+    stage1_subdir: str = "stage1_projection",
+    stage2_subdir: str = "stage2_instruct",
+) -> bool:
+    """
+    Repopulate a local run dir from HF artifacts so `detect_resume_point`
+    can find checkpoints after a fresh-VM resume (persistence lost / new host).
+    HF layout (uploaded by HFBestLastCallback + end-of-stage saves):
+        {run_id}/configs/                         (YAML snapshots)
+        {run_id}/run_meta.json
+        {run_id}/timing.json
+        {run_id}/stage1/last/  +  stage1/best/    (best/ = stage1 final, renamed `checkpoint_*`)
+        {run_id}/stage2/last/  +  stage2/best/
+    Local layout `detect_resume_point` expects:
+        {output_root}/{run_id}/stage1_projection/stage1_final_*    ← stage1 done
+        {output_root}/{run_id}/stage1_projection/checkpoint-N/...  ← stage1 mid
+        {output_root}/{run_id}/stage2_instruct/stage2_final_*      ← stage2 done
+        {output_root}/{run_id}/stage2_instruct/checkpoint-N/...    ← stage2 mid
+    Mapping rules:
+      * `stage2/last/`  → `stage2_instruct/checkpoint-1/`  (placeholder N=1;
+        Trainer reads the real global_step from trainer_state.json inside).
+      * `stage1/best/`  → `stage1_projection/stage1_final_*`  (rename files
+        from `checkpoint_*` to `stage1_final_*` so save_checkpoint conventions
+        line up with what the rest of the pipeline expects).
+      * `stage1/last/`  → `stage1_projection/checkpoint-1/`  (only if no
+        stage1_final placed — i.e. stage 1 hadn't finished yet on HF).
+    Returns True if at least one artifact was placed, False otherwise.
+    """
+    if not HF_AVAILABLE:
+        print("[hydrate_run_dir_from_hf] huggingface_hub not installed — skip")
+        return False
+    from huggingface_hub import snapshot_download
+    import shutil
+    token = token or os.environ.get("HF_TOKEN")
+    output_root = Path(output_root)
+    staging     = output_root / "_hf_pull"
+    dst_root    = output_root / run_id
+    # Skip if local already has any final/checkpoint — we're not on a fresh VM.
+    s1_local = dst_root / stage1_subdir
+    s2_local = dst_root / stage2_subdir
+    def _has_ckpt(d: Path) -> bool:
+        return d.is_dir() and any(d.glob("checkpoint-*"))
+    if (
+        (s1_local / "stage1_final_projection.pt").exists()
+        or (s2_local / "stage2_final_projection.pt").exists()
+        or _has_ckpt(s1_local)
+        or _has_ckpt(s2_local)
+    ):
+        print(f"[hydrate_run_dir_from_hf] local {dst_root} already populated — skip pull")
+        return False
+    # Pull the run's relevant files (configs + meta + last/best, skip
+    # training_log.jsonl which can be large).
+    staging.mkdir(parents=True, exist_ok=True)
+    try:
+        snapshot_download(
+            repo_id        = repo_id,
+            repo_type      = "model",
+            token          = token,
+            allow_patterns = [
+                f"{run_id}/configs/**",
+                f"{run_id}/run_meta.json",
+                f"{run_id}/timing.json",
+                f"{run_id}/meta.json",
+                f"{run_id}/stage1/last/**",
+                f"{run_id}/stage1/best/**",
+                f"{run_id}/stage2/last/**",
+                f"{run_id}/stage2/best/**",
+            ],
+            local_dir      = str(staging),
+        )
+    except Exception as e:
+        print(f"[hydrate_run_dir_from_hf] snapshot_download failed: {e}")
+        return False
+    src_root = staging / run_id
+    if not src_root.is_dir():
+        print(f"[hydrate_run_dir_from_hf] HF has no '{run_id}/' folder")
+        shutil.rmtree(staging, ignore_errors=True)
+        return False
+    dst_root.mkdir(parents=True, exist_ok=True)
+    placed_any = False
+    # configs/, run_meta.json, timing.json, meta.json: straight copy
+    for sub in ("configs",):
+        s = src_root / sub
+        if s.is_dir():
+            shutil.copytree(s, dst_root / sub, dirs_exist_ok=True)
+            placed_any = True
+    for f in ("run_meta.json", "timing.json", "meta.json"):
+        s = src_root / f
+        if s.is_file():
+            shutil.copy2(s, dst_root / f)
+            placed_any = True
+    # Stage 2 last → checkpoint-1
+    s2_last_src = src_root / "stage2" / "last"
+    if s2_last_src.is_dir() and any(s2_last_src.iterdir()):
+        dst = dst_root / stage2_subdir / "checkpoint-1"
+        dst.mkdir(parents=True, exist_ok=True)
+        shutil.copytree(s2_last_src, dst, dirs_exist_ok=True)
+        placed_any = True
+        print(f"[hydrate_run_dir_from_hf] stage2 mid-resume placed at {dst}")
+    # Stage 1 best (= final) → stage1_final_*
+    s1_best_src = src_root / "stage1" / "best"
+    if s1_best_src.is_dir() and (s1_best_src / "checkpoint_projection.pt").exists():
+        dst_s1 = dst_root / stage1_subdir
+        dst_s1.mkdir(parents=True, exist_ok=True)
+        for entry in s1_best_src.iterdir():
+            # Rename "checkpoint_*" → "stage1_final_*"
+            new_name = entry.name.replace("checkpoint_", "stage1_final_", 1) \
+                       if entry.name.startswith("checkpoint_") else entry.name
+            if entry.is_file():
+                shutil.copy2(entry, dst_s1 / new_name)
+            elif entry.is_dir():
+                shutil.copytree(entry, dst_s1 / new_name, dirs_exist_ok=True)
+        placed_any = True
+        print(f"[hydrate_run_dir_from_hf] stage1 final placed at {dst_s1}")
+    # Stage 1 last → checkpoint-1 (ONLY if stage1 didn't finish yet)
+    if not (dst_root / stage1_subdir / "stage1_final_projection.pt").exists():
+        s1_last_src = src_root / "stage1" / "last"
+        if s1_last_src.is_dir() and any(s1_last_src.iterdir()):
+            dst = dst_root / stage1_subdir / "checkpoint-1"
+            dst.mkdir(parents=True, exist_ok=True)
+            shutil.copytree(s1_last_src, dst, dirs_exist_ok=True)
+            placed_any = True
+            print(f"[hydrate_run_dir_from_hf] stage1 mid-resume placed at {dst}")
+    # Cleanup staging
+    shutil.rmtree(staging, ignore_errors=True)
+    if placed_any:
+        print(f"[hydrate_run_dir_from_hf] hydrated {dst_root} from HF")
+    else:
+        print(f"[hydrate_run_dir_from_hf] nothing usable on HF for {run_id}")
+    return placed_any
 def build_tracker_from_cfg(train_cfg, resuming: bool = False, explicit_run_id: Optional[str] = None):
     """Convenience factory from OmegaConf DictConfig."""
     hf = getattr(train_cfg, "hf_hub", None)