convitom committed on
Commit ·
b961b41
1
Parent(s): 0a99045
- configs/train_config.yaml +43 -4
- data/count_img.py → count_img.py +75 -75
- data/dataset.py +47 -0
- data/mimic_cxr_builder.py +227 -81
- data/mimic_cxr_resized_builder.py +375 -0
- data/distri-IU-Xray.py → distri-IU-Xray.py +11 -11
- data/img_stat.py → img_stat.py +0 -0
- data/rezip.py → rezip.py +11 -11
- scripts/cxrvlm_colab_train.ipynb +19 -327
- training/train.py +61 -1
- data/upload_to_hf_2.py → upload_to_hf_2.py +0 -0
- utils/dataset_resolver.py +73 -1
- utils/hf_uploader.py +151 -0
configs/train_config.yaml
CHANGED
|
@@ -5,7 +5,15 @@
|
|
| 5 |
# ── Data ─────────────────────────────────────
|
| 6 |
data:
|
| 7 |
# Pick which dataset to train on.
|
| 8 |
-
# Supported:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
dataset_name: "IU-Xray"
|
| 10 |
|
| 11 |
# How findings and impression are turned into training samples.
|
|
@@ -74,6 +82,36 @@ data:
|
|
| 74 |
# pre-built file (built via `python -m data.mimic_cxr_builder ...`).
|
| 75 |
mimic_auto_build: true
|
| 76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
# --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
|
| 78 |
# On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
|
| 79 |
# On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
|
|
@@ -102,16 +140,17 @@ data:
|
|
| 102 |
tasks:
|
| 103 |
findings_generation:
|
| 104 |
enabled: true
|
| 105 |
-
weight: 0.
|
| 106 |
impression_generation:
|
| 107 |
enabled: true
|
| 108 |
-
weight: 0.
|
|
|
|
| 109 |
report_generation:
|
| 110 |
enabled: true
|
| 111 |
weight: 0.6 # used when report_mode = merged
|
| 112 |
vqa:
|
| 113 |
enabled: true
|
| 114 |
-
weight: 0.
|
| 115 |
|
| 116 |
# ── Training ─────────────────────────────────
|
| 117 |
training:
|
|
|
|
| 5 |
# ── Data ─────────────────────────────────────
|
| 6 |
data:
|
| 7 |
# Pick which dataset to train on.
|
| 8 |
+
# Supported:
|
| 9 |
+
# "MIMIC-CXR" — pre-split layout {root}/{train,valid,test}/pXX/...
|
| 10 |
+
# all 3 tasks (findings, impression, vqa)
|
| 11 |
+
# "MIMIC-CXR_resized" — same data filtered+resized to tar shards on HF
|
| 12 |
+
# (hieu3636/cxr-vlm-data/MIMIC-CXR_resized/). After
|
| 13 |
+
# extraction the layout matches the raw PhysioNet
|
| 14 |
+
# tree ({root}/files/pXX/pXXXX/sYYYY/*.jpg); splits
|
| 15 |
+
# come from manifest_{train,val,test}.csv (the official mimic-cxr-2.0.0-split.csv is NOT used). All 3 tasks.
|
| 16 |
+
# "IU-Xray" — findings + impression only (no VQA)
|
| 17 |
dataset_name: "IU-Xray"
|
| 18 |
|
| 19 |
# How findings and impression are turned into training samples.
|
|
|
|
| 82 |
# pre-built file (built via `python -m data.mimic_cxr_builder ...`).
|
| 83 |
mimic_auto_build: true
|
| 84 |
|
| 85 |
+
# --- MIMIC-CXR_resized paths (used when dataset_name == "MIMIC-CXR_resized") ---
|
| 86 |
+
# Filtered + resized subset of MIMIC-CXR distributed via HF as tar shards
|
| 87 |
+
# (hieu3636/cxr-vlm-data/MIMIC-CXR_resized/) + a "subset_bundle" with the
|
| 88 |
+
# manifest CSVs and VQA JSON files. This dataset is MANIFEST-DRIVEN:
|
| 89 |
+
#
|
| 90 |
+
# manifest_{train,val,test}.csv — one row per image. Contains the split
|
| 91 |
+
# label, image_relpath, report_relpath, has_vqa, and 14 chex_*
|
| 92 |
+
# columns (the CheXpert labels). The val/test pool was redistributed
|
| 93 |
+
# from the original train split (subset is small), so the official
|
| 94 |
+
# PhysioNet mimic-cxr-2.0.0-split.csv is NOT used.
|
| 95 |
+
# vqa/{vqa.json, vqa_val.json, vqa_test.json} — VQA pairs filtered to
|
| 96 |
+
# only the images present in this resized subset.
|
| 97 |
+
#
|
| 98 |
+
# After extracting the tar shards, the on-disk layout (under `root`) is:
|
| 99 |
+
# {root}/files/pXX/pXXXXXXXX/sYYYYYYYY/<dicom>.jpg
|
| 100 |
+
# {root}/files/pXX/pXXXXXXXX/sYYYYYYYY.txt (reports alongside)
|
| 101 |
+
mimic_cxr_resized:
|
| 102 |
+
root: "D:/USTH/KLTN/subset_bundle" # extracted-tar root (parent of files/)
|
| 103 |
+
manifest_dir: null # null → same as `root`. Folder containing
|
| 104 |
+
# manifest_{train,val,test}.csv.
|
| 105 |
+
vqa_dir: null # null → use `{root}/vqa`. Folder containing
|
| 106 |
+
# vqa.json / vqa_val.json / vqa_test.json. Set
|
| 107 |
+
# to "" to disable VQA.
|
| 108 |
+
reports_root: null # null → auto-probe `{root}` then `{root}/reports`.
|
| 109 |
+
# Set explicitly if reports live somewhere else
|
| 110 |
+
# (e.g. when reports are bundled inside tars vs.
|
| 111 |
+
# a sibling `reports/` dir like subset_bundle/).
|
| 112 |
+
instruct_json: "data/data_files/mimic_cxr_resized_instruct.json"
|
| 113 |
+
auto_build: true # build JSON automatically if missing
|
| 114 |
+
|
| 115 |
# --- IU X-ray paths (used when dataset_name == "IU-Xray") ---
|
| 116 |
# On local Windows the defaults below match D:\USTH\KLTN\data\IU-Xray\...
|
| 117 |
# On Kaggle set these to the mounted dataset (e.g. /kaggle/input/vlm-cxr-data/...)
|
|
|
|
| 140 |
tasks:
|
| 141 |
findings_generation:
|
| 142 |
enabled: true
|
| 143 |
+
weight: 0.30 # used when report_mode = split
|
| 144 |
impression_generation:
|
| 145 |
enabled: true
|
| 146 |
+
weight: 0.20 # used when report_mode = split (lower:
|
| 147 |
+
# impression is conditioned on findings)
|
| 148 |
report_generation:
|
| 149 |
enabled: true
|
| 150 |
weight: 0.6 # used when report_mode = merged
|
| 151 |
vqa:
|
| 152 |
enabled: true
|
| 153 |
+
weight: 0.50 # boosted so VQA ≈ RRG (findings+impression)
|
| 154 |
|
| 155 |
# ── Training ─────────────────────────────────
|
| 156 |
training:
|
data/count_img.py → count_img.py
RENAMED
|
@@ -1,76 +1,76 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import json
|
| 3 |
-
|
| 4 |
-
def get_local_images(root_dir):
|
| 5 |
-
"""
|
| 6 |
-
Lấy toàn bộ đường dẫn ảnh local (dạng pxx/...)
|
| 7 |
-
"""
|
| 8 |
-
local_images = set()
|
| 9 |
-
|
| 10 |
-
for p_folder in os.listdir(root_dir):
|
| 11 |
-
if not p_folder.startswith("p1"): # chỉ p10 -> p19
|
| 12 |
-
continue
|
| 13 |
-
|
| 14 |
-
p_path = os.path.join(root_dir, p_folder)
|
| 15 |
-
|
| 16 |
-
for root, _, files in os.walk(p_path):
|
| 17 |
-
for file in files:
|
| 18 |
-
if file.endswith(".jpg"):
|
| 19 |
-
full_path = os.path.join(root, file)
|
| 20 |
-
|
| 21 |
-
# convert về dạng giống VQA: p10/.../xxx.jpg
|
| 22 |
-
rel_path = os.path.relpath(full_path, root_dir)
|
| 23 |
-
rel_path = rel_path.replace("\\", "/")
|
| 24 |
-
|
| 25 |
-
local_images.add(rel_path)
|
| 26 |
-
|
| 27 |
-
return local_images
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def get_vqa_images(vqa_json_path):
|
| 31 |
-
"""
|
| 32 |
-
Lấy toàn bộ image_path từ file VQA json
|
| 33 |
-
"""
|
| 34 |
-
with open(vqa_json_path, "r", encoding="utf-8") as f:
|
| 35 |
-
data = json.load(f)
|
| 36 |
-
|
| 37 |
-
vqa_images = set()
|
| 38 |
-
|
| 39 |
-
for item in data:
|
| 40 |
-
if "image_path" in item:
|
| 41 |
-
vqa_images.add(item["image_path"])
|
| 42 |
-
|
| 43 |
-
return vqa_images
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
def main(root_dir, vqa_json_path):
|
| 47 |
-
print("Đang quét ảnh local...")
|
| 48 |
-
local_images = get_local_images(root_dir)
|
| 49 |
-
print(f"Số ảnh local: {len(local_images)}")
|
| 50 |
-
|
| 51 |
-
print("Đang đọc VQA json...")
|
| 52 |
-
vqa_images = get_vqa_images(vqa_json_path)
|
| 53 |
-
print(f"Số ảnh trong VQA: {len(vqa_images)}")
|
| 54 |
-
|
| 55 |
-
# intersection
|
| 56 |
-
matched = local_images & vqa_images
|
| 57 |
-
|
| 58 |
-
print("\n===== KẾT QUẢ =====")
|
| 59 |
-
print(f"Số ảnh trùng: {len(matched)}")
|
| 60 |
-
print(f"Tỷ lệ cover VQA: {len(matched) / len(vqa_images):.4f}")
|
| 61 |
-
|
| 62 |
-
# nếu muốn lưu danh sách
|
| 63 |
-
with open("matched_images.txt", "w") as f:
|
| 64 |
-
for path in matched:
|
| 65 |
-
f.write(path + "\n")
|
| 66 |
-
|
| 67 |
-
print("Đã lưu danh sách vào matched_images.txt")
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
if __name__ == "__main__":
|
| 71 |
-
x = "train"
|
| 72 |
-
y = "valid"
|
| 73 |
-
root_dir = r"D:\USTH\KLTN\data\{x}".format(x=x) # ví dụ: D:/mimic-cxr
|
| 74 |
-
vqa_json = r"D:\USTH\KLTN\data\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\MIMIC-Ext-MIMIC-CXR-VQA\dataset\{y}.json".format(y=y) # ví dụ: D:/vqa/train.json
|
| 75 |
-
|
| 76 |
main(root_dir, vqa_json)
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
def get_local_images(root_dir):
    """
    Collect every local ``.jpg`` image found under the ``p1*`` entries of
    ``root_dir``.

    Only direct children of ``root_dir`` whose name starts with ``"p1"`` are
    walked (the original note describes these as the p10 -> p19 folders).

    Returns:
        set of image paths relative to ``root_dir``, written with forward
        slashes (``p10/.../xxx.jpg``) so they can be compared with the
        ``image_path`` values of the VQA JSON.
    """
    found = set()
    selected = (entry for entry in os.listdir(root_dir) if entry.startswith("p1"))

    for entry in selected:
        for dirpath, _subdirs, filenames in os.walk(os.path.join(root_dir, entry)):
            # Normalise Windows separators so the result matches the VQA style.
            found.update(
                os.path.relpath(os.path.join(dirpath, fname), root_dir).replace("\\", "/")
                for fname in filenames
                if fname.endswith(".jpg")
            )

    return found
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def get_vqa_images(vqa_json_path):
    """
    Read a VQA JSON file and return the set of ``image_path`` values it holds.

    The file is expected to decode to an iterable of items; items without an
    ``"image_path"`` key are ignored.
    """
    with open(vqa_json_path, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    return {record["image_path"] for record in records if "image_path" in record}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def main(root_dir, vqa_json_path):
    """
    Report how many images referenced by a VQA JSON exist locally.

    Scans ``root_dir`` for local images, loads the image paths referenced by
    ``vqa_json_path``, prints the size of both sets, their intersection and
    the fraction of VQA images that are covered, then writes the matched
    paths (one per line) to ``matched_images.txt`` in the current directory.

    Args:
        root_dir: directory passed to ``get_local_images``.
        vqa_json_path: JSON file passed to ``get_vqa_images``.
    """
    print("Đang quét ảnh local...")
    local_images = get_local_images(root_dir)
    print(f"Số ảnh local: {len(local_images)}")

    print("Đang đọc VQA json...")
    vqa_images = get_vqa_images(vqa_json_path)
    print(f"Số ảnh trong VQA: {len(vqa_images)}")

    # Images present both on disk and in the VQA file.
    matched = local_images & vqa_images

    # A VQA file without any image_path would otherwise divide by zero.
    coverage = len(matched) / len(vqa_images) if vqa_images else 0.0

    print("\n===== KẾT QUẢ =====")
    print(f"Số ảnh trùng: {len(matched)}")
    print(f"Tỷ lệ cover VQA: {coverage:.4f}")

    # Save the list; sorted so the file is identical from run to run
    # (set iteration order is not stable across interpreter runs).
    with open("matched_images.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{path}\n" for path in sorted(matched))

    print("Đã lưu danh sách vào matched_images.txt")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
if __name__ == "__main__":
    import argparse  # local import: only needed when run as a script

    # Defaults reproduce the previously hard-coded locations, so running the
    # script without arguments behaves exactly as before.
    # NOTE(review): the image split ("train") and the VQA split ("valid")
    # differ here — confirm this is intentional.
    x = "train"
    y = "valid"
    default_root_dir = r"D:\USTH\KLTN\data\{x}".format(x=x)  # e.g. D:/mimic-cxr
    default_vqa_json = r"D:\USTH\KLTN\data\mimic-ext-mimic-cxr-vqa-a-complex-diverse-and-large-scale-visual-question-answering-dataset-for-chest-x-ray-images-1.0.0\MIMIC-Ext-MIMIC-CXR-VQA\dataset\{y}.json".format(y=y)  # e.g. D:/vqa/train.json

    parser = argparse.ArgumentParser(
        description="Count how many images referenced by a VQA JSON exist locally."
    )
    parser.add_argument("--root_dir", default=default_root_dir,
                        help="Directory containing the pXX image folders.")
    parser.add_argument("--vqa_json", default=default_vqa_json,
                        help="Path to the VQA JSON file.")
    args = parser.parse_args()

    root_dir = args.root_dir
    vqa_json = args.vqa_json

    main(root_dir, vqa_json)
|
data/dataset.py
CHANGED
|
@@ -161,6 +161,53 @@ class CXRInstructDataset(Dataset):
|
|
| 161 |
def __len__(self) -> int:
|
| 162 |
return len(self.samples)
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
|
| 165 |
sample = self.samples[idx]
|
| 166 |
|
|
|
|
| 161 |
def __len__(self) -> int:
|
| 162 |
return len(self.samples)
|
| 163 |
|
| 164 |
+
def get_per_sample_weights(self) -> Optional[List[float]]:
    """
    Compute one sampling weight per entry of `self.samples`, intended for
    `torch.utils.data.WeightedRandomSampler`.

    Every sample of task t receives `w_t / N_t`, where `w_t` is the weight
    configured in `self.task_weights` and `N_t` is the number of samples of
    that task in this dataset. Summed over a task, the weights therefore add
    up to `w_t`, so the chance of drawing *some* sample of task t is
    proportional to `w_t` no matter how many samples the task has.

    A task whose configured weight is 0, or that appears in the samples but
    not in `self.task_weights`, gets weight 0 for all its samples. The
    normalised task mix and the per-task counts are printed so such a
    mismatch is visible in the logs.

    Returns:
        A list of floats with one entry per sample, or None when
        `self.task != "mixed"` (single-task dataset: no re-weighting is
        produced and the caller can keep uniform sampling).
    """
    # Single-task dataset: nothing to balance.
    if self.task != "mixed":
        return None

    # N_t — number of samples per task actually present.
    counts: Dict[str, int] = {}
    for sample in self.samples:
        task_name = sample["task"]
        counts[task_name] = counts.get(task_name, 0) + 1

    # w_t — configured weight per present task (missing → 0).
    configured = {
        task_name: float(self.task_weights.get(task_name, 0.0))
        for task_name in counts
    }

    # Per-sample weight = w_t / N_t.
    weights = [
        configured[sample["task"]] / counts[sample["task"]]
        for sample in self.samples
    ]

    # Log the normalised task mix; `or 1.0` avoids dividing by zero when
    # every configured weight is 0.
    total = sum(configured.values()) or 1.0
    eff = {task_name: round(w / total, 4) for task_name, w in configured.items()}
    print(f"[CXRInstructDataset] WeightedRandomSampler effective task mix: "
          f"{eff} (counts: {counts})")
    return weights
|
| 210 |
+
|
| 211 |
def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
|
| 212 |
sample = self.samples[idx]
|
| 213 |
|
data/mimic_cxr_builder.py
CHANGED
|
@@ -94,6 +94,45 @@ def _discover_chexpert_csv(mimic_root: Path, explicit: Optional[str]) -> Optiona
|
|
| 94 |
return None
|
| 95 |
|
| 96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
def _load_chexpert_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
|
| 98 |
"""
|
| 99 |
Return {(subject_id, study_id): <PNU string>} where the ids are the bare
|
|
@@ -150,6 +189,9 @@ def build_mimic_cxr_instruct_json(
|
|
| 150 |
vqa_root: Optional[str] = None,
|
| 151 |
report_mode: str = "split", # "split" | "merged" | "split_cascade"
|
| 152 |
image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
|
|
|
|
|
|
|
|
|
|
| 153 |
) -> str:
|
| 154 |
"""
|
| 155 |
Build the unified MIMIC-CXR instruction JSON.
|
|
@@ -170,30 +212,70 @@ def build_mimic_cxr_instruct_json(
|
|
| 170 |
of the study — this MIMIC layout has no metadata.csv to read ViewPosition
|
| 171 |
from. Swap in a ViewPosition lookup if you add that CSV.
|
| 172 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
Returns the absolute output path.
|
| 174 |
"""
|
| 175 |
assert report_mode in ("split", "merged", "split_cascade"), \
|
| 176 |
f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
|
| 177 |
assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
|
| 178 |
f"image_mode invalid: {image_mode!r}"
|
|
|
|
|
|
|
| 179 |
|
| 180 |
from .dataset import format_merged_report # local import to avoid cycle
|
| 181 |
|
| 182 |
mimic_root = Path(mimic_root)
|
| 183 |
output_path = Path(output_path)
|
| 184 |
|
| 185 |
-
#
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
# ── CheXpert labels ───────────────────────────────────────────────────
|
| 199 |
csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
|
|
@@ -211,21 +293,15 @@ def build_mimic_cxr_instruct_json(
|
|
| 211 |
# ── Pass 1: index studies ─────────────────────────────────────────────
|
| 212 |
samples: List[Dict] = []
|
| 213 |
# sub_rel ("pXX/pXXXX/sYYYY/img.jpg") → full stored image_path
|
| 214 |
-
# ("{split}/pXX/pXXXX/sYYYY/img.jpg"
|
|
|
|
| 215 |
image_index: Dict[str, str] = {}
|
| 216 |
-
n_studies = n_missing_report = n_no_chexpert = 0
|
| 217 |
skipped_merged_no_impression = skipped_cascade_no_findings = 0
|
| 218 |
|
| 219 |
def _structured_for(subj: str, study: str) -> Optional[str]:
|
| 220 |
return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
|
| 221 |
|
| 222 |
-
def _rels_for(study_dir: Path, split_sub: str, subj: str, study: str) -> List[str]:
|
| 223 |
-
"""Split-prefixed relative image paths for one study, sorted."""
|
| 224 |
-
return [
|
| 225 |
-
f"{split_sub}/{im.parent.parent.parent.name}/{subj}/{study}/{im.name}"
|
| 226 |
-
for im in sorted(study_dir.glob("*.jpg"))
|
| 227 |
-
]
|
| 228 |
-
|
| 229 |
def _image_groups(rels: List[str]):
|
| 230 |
"""Yield path_fields dicts honouring image_mode (same rules as IU)."""
|
| 231 |
if image_mode == "all_views_split":
|
|
@@ -236,67 +312,120 @@ def build_mimic_cxr_instruct_json(
|
|
| 236 |
else: # multi_image_merged
|
| 237 |
yield {"image_path": None, "image_paths": rels}
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
n_no_chexpert += 1
|
| 260 |
-
split_label = split_dirs[split_sub]
|
| 261 |
-
|
| 262 |
-
for path_fields in _image_groups(rels):
|
| 263 |
-
base = {
|
| 264 |
-
**path_fields,
|
| 265 |
-
"question": None,
|
| 266 |
-
"split": split_label,
|
| 267 |
-
"study_id": study,
|
| 268 |
-
"subject_id": subj,
|
| 269 |
-
}
|
| 270 |
-
if report_mode == "merged":
|
| 271 |
-
target = format_merged_report(findings, impression)
|
| 272 |
-
if target is None:
|
| 273 |
-
skipped_merged_no_impression += 1
|
| 274 |
continue
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
|
| 301 |
# ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
|
| 302 |
n_vqa = n_vqa_dropped = 0
|
|
@@ -346,11 +475,14 @@ def build_mimic_cxr_instruct_json(
|
|
| 346 |
by_task[s["task"]] = by_task.get(s["task"], 0) + 1
|
| 347 |
|
| 348 |
print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
|
|
|
|
| 349 |
print(f" report_mode : {report_mode}")
|
| 350 |
print(f" image_mode : {image_mode}")
|
| 351 |
print(f" studies indexed : {n_studies:,}")
|
| 352 |
print(f" missing report : {n_missing_report:,}")
|
| 353 |
print(f" studies w/o chexpert label : {n_no_chexpert:,}")
|
|
|
|
|
|
|
| 354 |
if report_mode == "merged":
|
| 355 |
print(f" skipped no_impr : {skipped_merged_no_impression:,}")
|
| 356 |
if report_mode == "split_cascade":
|
|
@@ -378,6 +510,17 @@ def _parse_args():
|
|
| 378 |
choices=["split", "merged", "split_cascade"])
|
| 379 |
p.add_argument("--image_mode", default="all_views_split",
|
| 380 |
choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
return p.parse_args()
|
| 382 |
|
| 383 |
|
|
@@ -390,4 +533,7 @@ if __name__ == "__main__":
|
|
| 390 |
vqa_root = a.vqa_root,
|
| 391 |
report_mode = a.report_mode,
|
| 392 |
image_mode = a.image_mode,
|
|
|
|
|
|
|
|
|
|
| 393 |
)
|
|
|
|
| 94 |
return None
|
| 95 |
|
| 96 |
|
| 97 |
+
def _discover_split_csv(mimic_root: Path, explicit: Optional[str]) -> Optional[Path]:
    """Locate mimic-cxr-2.0.0-split.csv (or any *split*.csv) under `mimic_root`.

    Used by the "files" layout to assign train/validate/test per study.

    Args:
        mimic_root: directory searched recursively when `explicit` is falsy.
        explicit:   path supplied by the caller; when given it is the only
                    candidate considered (no search is performed).

    Returns:
        The path of the split CSV, or None when `explicit` is not an existing
        file or when the search finds nothing. With several matches the first
        one in sorted (lexicographic) order is returned.
    """
    if explicit:
        candidate = Path(explicit)
        return candidate if candidate.is_file() else None

    # Escape the root so glob metacharacters in the directory name
    # ("[", "]", "*", "?") are matched literally instead of as a pattern.
    pattern = str(Path(glob.escape(str(mimic_root))) / "**" / "*split*.csv")
    hits = sorted(glob.glob(pattern, recursive=True))
    return Path(hits[0]) if hits else None
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _load_split_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
    """
    Return {(subject_id, study_id): "train"|"validate"|"test"} from
    mimic-cxr-2.0.0-split.csv.

    IDs are stored without the p/s prefix to match the chexpert map
    convention, and anything after a "." is dropped (so "123.0" becomes
    "123"). Column names are matched case-insensitively, ignoring
    surrounding whitespace. 'valid' is tolerated as an alias for 'validate';
    rows with any other split value are skipped.

    Raises:
        ValueError: if the header lacks subject_id, study_id or split.
    """
    valid_splits = ("train", "validate", "test")
    out: Dict[Tuple[str, str], str] = {}

    # utf-8-sig drops a leading byte-order mark if present; otherwise it
    # behaves like utf-8. Without it a BOM would be glued to the first header
    # name and the column lookup below would fail. An explicit encoding also
    # makes parsing independent of the platform's locale default.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        # normalised header name → header name as written in the file
        col = {c.lower().strip(): c for c in reader.fieldnames or []}
        subj_c = col.get("subject_id")
        study_c = col.get("study_id")
        split_c = col.get("split")
        if not (subj_c and study_c and split_c):
            raise ValueError(
                f"{csv_path} missing subject_id/study_id/split columns "
                f"(have: {reader.fieldnames})"
            )
        for row in reader:
            subj = str(row[subj_c]).strip().lstrip("p").split(".")[0]
            study = str(row[study_c]).strip().lstrip("s").split(".")[0]
            sp = str(row[split_c]).strip().lower()
            if sp == "valid":
                sp = "validate"
            if sp in valid_splits:
                out[(subj, study)] = sp
    return out
|
| 134 |
+
|
| 135 |
+
|
| 136 |
def _load_chexpert_map(csv_path: Path) -> Dict[Tuple[str, str], str]:
|
| 137 |
"""
|
| 138 |
Return {(subject_id, study_id): <PNU string>} where the ids are the bare
|
|
|
|
| 189 |
vqa_root: Optional[str] = None,
|
| 190 |
report_mode: str = "split", # "split" | "merged" | "split_cascade"
|
| 191 |
image_mode: str = "all_views_split", # "all_views_split" | "frontal_only_split" | "multi_image_merged"
|
| 192 |
+
layout: str = "presplit", # "presplit" | "files"
|
| 193 |
+
split_csv: Optional[str] = None, # required for layout="files"
|
| 194 |
+
reports_root: Optional[str] = None, # for layout="files"; None → reports alongside images
|
| 195 |
) -> str:
|
| 196 |
"""
|
| 197 |
Build the unified MIMIC-CXR instruction JSON.
|
|
|
|
| 212 |
of the study — this MIMIC layout has no metadata.csv to read ViewPosition
|
| 213 |
from. Swap in a ViewPosition lookup if you add that CSV.
|
| 214 |
|
| 215 |
+
layout selects which on-disk tree to walk:
|
| 216 |
+
"presplit" — {root}/{train,valid,test}/pXX/pXXXX/sYYYY/{*.jpg + *.txt}
|
| 217 |
+
The custom MIMIC-CXR.zip used by the notebook. Default.
|
| 218 |
+
"files" — {root}/files/pXX/pXXXX/sYYYY/*.jpg (raw PhysioNet tree).
|
| 219 |
+
Used by MIMIC-CXR_resized after extracting tar shards.
|
| 220 |
+
Requires `split_csv` (or auto-discovers *split*.csv) to
|
| 221 |
+
assign train/validate/test. Reports are read from
|
| 222 |
+
`reports_root` (separate tree, e.g. mimic-cxr-reports/)
|
| 223 |
+
or from the study dir if reports_root is None.
|
| 224 |
+
|
| 225 |
Returns the absolute output path.
|
| 226 |
"""
|
| 227 |
assert report_mode in ("split", "merged", "split_cascade"), \
|
| 228 |
f"report_mode must be 'split', 'merged', or 'split_cascade', got {report_mode!r}"
|
| 229 |
assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
|
| 230 |
f"image_mode invalid: {image_mode!r}"
|
| 231 |
+
assert layout in ("presplit", "files"), \
|
| 232 |
+
f"layout must be 'presplit' or 'files', got {layout!r}"
|
| 233 |
|
| 234 |
from .dataset import format_merged_report # local import to avoid cycle
|
| 235 |
|
| 236 |
mimic_root = Path(mimic_root)
|
| 237 |
output_path = Path(output_path)
|
| 238 |
|
| 239 |
+
# ── Locate study dirs + split assignment ────────────────────────────────
|
| 240 |
+
# Two layouts produce the same downstream shape: each entry is
|
| 241 |
+
# (study_dir, subject_dir_name, study_dir_name, split_label, image_rel_prefix)
|
| 242 |
+
# where image_rel_prefix is the leading path component used when building
|
| 243 |
+
# the JSON-stored relative image path. presplit prefixes with the split
|
| 244 |
+
# dir name ("train/..."), files prefixes with "files/...".
|
| 245 |
+
if layout == "presplit":
|
| 246 |
+
split_dirs = {"train": "train", "valid": "validate", "test": "test"}
|
| 247 |
+
present = {sub: mimic_root / sub for sub in split_dirs if (mimic_root / sub).is_dir()}
|
| 248 |
+
if not present:
|
| 249 |
+
raise FileNotFoundError(
|
| 250 |
+
f"No train/valid/test subdirs under {mimic_root}. "
|
| 251 |
+
f"Expected the pre-split MIMIC-CXR layout."
|
| 252 |
+
)
|
| 253 |
+
split_map = None # not needed — split comes from dir name
|
| 254 |
+
else: # "files"
|
| 255 |
+
files_dir = mimic_root / "files"
|
| 256 |
+
if not files_dir.is_dir():
|
| 257 |
+
raise FileNotFoundError(
|
| 258 |
+
f"Expected {files_dir} for layout='files'. After extracting "
|
| 259 |
+
f"the MIMIC-CXR_resized tars the layout should be "
|
| 260 |
+
f"{{root}}/files/pXX/pXXXX/sYYYY/*.jpg."
|
| 261 |
+
)
|
| 262 |
+
sp_path = _discover_split_csv(mimic_root, split_csv)
|
| 263 |
+
if sp_path is None:
|
| 264 |
+
raise FileNotFoundError(
|
| 265 |
+
f"Could not find a split CSV under {mimic_root} and none "
|
| 266 |
+
f"passed via --split_csv. layout='files' needs "
|
| 267 |
+
f"mimic-cxr-2.0.0-split.csv to assign train/validate/test."
|
| 268 |
+
)
|
| 269 |
+
split_map = _load_split_map(sp_path)
|
| 270 |
+
print(f"[mimic_cxr_builder] split CSV: {sp_path} "
|
| 271 |
+
f"({len(split_map):,} (subj,study) entries)")
|
| 272 |
+
reports_root_p = Path(reports_root) if reports_root else None
|
| 273 |
+
if reports_root_p is not None and not reports_root_p.is_dir():
|
| 274 |
+
raise FileNotFoundError(
|
| 275 |
+
f"reports_root={reports_root_p} does not exist. Either point "
|
| 276 |
+
f"to the extracted mimic-cxr-reports tree (with a `files/` "
|
| 277 |
+
f"subdir inside it) or leave it null to look alongside images."
|
| 278 |
+
)
|
| 279 |
|
| 280 |
# ── CheXpert labels ───────────────────────────────────────────────────
|
| 281 |
csv_path = _discover_chexpert_csv(mimic_root, chexpert_csv)
|
|
|
|
| 293 |
# ── Pass 1: index studies ─────────────────────────────────────────────
|
| 294 |
samples: List[Dict] = []
|
| 295 |
# sub_rel ("pXX/pXXXX/sYYYY/img.jpg") → full stored image_path
|
| 296 |
+
# ("{split}/pXX/pXXXX/sYYYY/img.jpg" or "files/pXX/pXXXX/sYYYY/img.jpg").
|
| 297 |
+
# O(1) VQA lookup.
|
| 298 |
image_index: Dict[str, str] = {}
|
| 299 |
+
n_studies = n_missing_report = n_no_chexpert = n_no_split = 0
|
| 300 |
skipped_merged_no_impression = skipped_cascade_no_findings = 0
|
| 301 |
|
| 302 |
def _structured_for(subj: str, study: str) -> Optional[str]:
|
| 303 |
return chexpert_map.get((subj.lstrip("p"), study.lstrip("s")))
|
| 304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
def _image_groups(rels: List[str]):
|
| 306 |
"""Yield path_fields dicts honouring image_mode (same rules as IU)."""
|
| 307 |
if image_mode == "all_views_split":
|
|
|
|
| 312 |
else: # multi_image_merged
|
| 313 |
yield {"image_path": None, "image_paths": rels}
|
| 314 |
|
| 315 |
+
def _iter_studies():
|
| 316 |
+
"""
|
| 317 |
+
Yield (study_dir, p_dir_name, subj, study, rels, report_path, split_label)
|
| 318 |
+
for every valid study in either layout.
|
| 319 |
+
rels = list of JSON-relative image paths (split-prefixed or
|
| 320 |
+
"files/"-prefixed depending on layout).
|
| 321 |
+
report_path = Path to the report .txt (may not exist; caller handles).
|
| 322 |
+
split_label = "train"/"validate"/"test" or None when unresolved.
|
| 323 |
+
"""
|
| 324 |
+
if layout == "presplit":
|
| 325 |
+
for split_sub, split_dir in present.items():
|
| 326 |
+
for p_dir in sorted(split_dir.glob("p*")):
|
| 327 |
+
for pat_dir in p_dir.glob("p*"):
|
| 328 |
+
for study_dir in pat_dir.glob("s*"):
|
| 329 |
+
subj, study = pat_dir.name, study_dir.name
|
| 330 |
+
rels = [
|
| 331 |
+
f"{split_sub}/{p_dir.name}/{subj}/{study}/{im.name}"
|
| 332 |
+
for im in sorted(study_dir.glob("*.jpg"))
|
| 333 |
+
]
|
| 334 |
+
if not rels:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
continue
|
| 336 |
+
txts = list(study_dir.glob("*.txt"))
|
| 337 |
+
report_path = txts[0] if txts else None
|
| 338 |
+
yield (study_dir, p_dir.name, subj, study, rels,
|
| 339 |
+
report_path, split_dirs[split_sub])
|
| 340 |
+
else: # "files"
|
| 341 |
+
files_dir = mimic_root / "files"
|
| 342 |
+
for p_dir in sorted(files_dir.glob("p*")):
|
| 343 |
+
for pat_dir in p_dir.glob("p*"):
|
| 344 |
+
for study_dir in pat_dir.glob("s*"):
|
| 345 |
+
subj, study = pat_dir.name, study_dir.name
|
| 346 |
+
rels = [
|
| 347 |
+
f"files/{p_dir.name}/{subj}/{study}/{im.name}"
|
| 348 |
+
for im in sorted(study_dir.glob("*.jpg"))
|
| 349 |
+
]
|
| 350 |
+
if not rels:
|
| 351 |
+
continue
|
| 352 |
+
# Report lookup: separate tree if reports_root is set,
|
| 353 |
+
# else alongside images (parent dir holds sYYYY.txt
|
| 354 |
+
# per PhysioNet convention OR inside study dir).
|
| 355 |
+
if reports_root_p is not None:
|
| 356 |
+
report_path = (reports_root_p / "files" /
|
| 357 |
+
p_dir.name / subj / f"{study}.txt")
|
| 358 |
+
else:
|
| 359 |
+
# Try both: study_dir/*.txt then parent/{study}.txt
|
| 360 |
+
cand = list(study_dir.glob("*.txt"))
|
| 361 |
+
if cand:
|
| 362 |
+
report_path = cand[0]
|
| 363 |
+
else:
|
| 364 |
+
report_path = pat_dir / f"{study}.txt"
|
| 365 |
+
split_label = split_map.get(
|
| 366 |
+
(subj.lstrip("p"), study.lstrip("s"))
|
| 367 |
+
) if split_map else None
|
| 368 |
+
yield (study_dir, p_dir.name, subj, study, rels,
|
| 369 |
+
report_path, split_label)
|
| 370 |
+
|
| 371 |
+
for (study_dir, p_dir_name, subj, study, rels,
|
| 372 |
+
report_path, split_label) in _iter_studies():
|
| 373 |
+
n_studies += 1
|
| 374 |
+
# Index EVERY image up front — a VQA row may reference a study
|
| 375 |
+
# that has images but no findings/impression report.
|
| 376 |
+
for r in rels:
|
| 377 |
+
image_index[r.split("/", 1)[1]] = r
|
| 378 |
+
# Studies missing from split CSV (files layout) are skipped —
|
| 379 |
+
# emitting them would silently dump into "train".
|
| 380 |
+
if split_label is None:
|
| 381 |
+
n_no_split += 1
|
| 382 |
+
continue
|
| 383 |
+
if report_path is None or not Path(report_path).is_file():
|
| 384 |
+
n_missing_report += 1
|
| 385 |
+
continue
|
| 386 |
+
findings, impression = _parse_report(Path(report_path))
|
| 387 |
+
structured = _structured_for(subj, study)
|
| 388 |
+
if structured is None:
|
| 389 |
+
n_no_chexpert += 1
|
| 390 |
+
|
| 391 |
+
for path_fields in _image_groups(rels):
|
| 392 |
+
base = {
|
| 393 |
+
**path_fields,
|
| 394 |
+
"question": None,
|
| 395 |
+
"split": split_label,
|
| 396 |
+
"study_id": study,
|
| 397 |
+
"subject_id": subj,
|
| 398 |
+
}
|
| 399 |
+
if report_mode == "merged":
|
| 400 |
+
target = format_merged_report(findings, impression)
|
| 401 |
+
if target is None:
|
| 402 |
+
skipped_merged_no_impression += 1
|
| 403 |
+
continue
|
| 404 |
+
samples.append({**base, "task": "report",
|
| 405 |
+
"target": target,
|
| 406 |
+
"structured_findings": structured})
|
| 407 |
+
elif report_mode == "split_cascade":
|
| 408 |
+
if findings:
|
| 409 |
+
samples.append({**base, "task": "findings",
|
| 410 |
+
"target": findings,
|
| 411 |
+
"structured_findings": structured})
|
| 412 |
+
if impression:
|
| 413 |
+
if not findings:
|
| 414 |
+
skipped_cascade_no_findings += 1
|
| 415 |
+
else:
|
| 416 |
+
samples.append({**base, "task": "impression",
|
| 417 |
+
"target": impression,
|
| 418 |
+
"structured_findings":
|
| 419 |
+
f"Findings: {findings}"})
|
| 420 |
+
else: # "split"
|
| 421 |
+
if findings:
|
| 422 |
+
samples.append({**base, "task": "findings",
|
| 423 |
+
"target": findings,
|
| 424 |
+
"structured_findings": structured})
|
| 425 |
+
if impression:
|
| 426 |
+
samples.append({**base, "task": "impression",
|
| 427 |
+
"target": impression,
|
| 428 |
+
"structured_findings": structured})
|
| 429 |
|
| 430 |
# ── Pass 2: optional VQA attach (mirrors the notebook) ────────────────
|
| 431 |
n_vqa = n_vqa_dropped = 0
|
|
|
|
| 475 |
by_task[s["task"]] = by_task.get(s["task"], 0) + 1
|
| 476 |
|
| 477 |
print(f"[mimic_cxr_builder] wrote {len(samples):,} samples → {output_path}")
|
| 478 |
+
print(f" layout : {layout}")
|
| 479 |
print(f" report_mode : {report_mode}")
|
| 480 |
print(f" image_mode : {image_mode}")
|
| 481 |
print(f" studies indexed : {n_studies:,}")
|
| 482 |
print(f" missing report : {n_missing_report:,}")
|
| 483 |
print(f" studies w/o chexpert label : {n_no_chexpert:,}")
|
| 484 |
+
if layout == "files":
|
| 485 |
+
print(f" studies w/o split-CSV entry (skipped) : {n_no_split:,}")
|
| 486 |
if report_mode == "merged":
|
| 487 |
print(f" skipped no_impr : {skipped_merged_no_impression:,}")
|
| 488 |
if report_mode == "split_cascade":
|
|
|
|
| 510 |
choices=["split", "merged", "split_cascade"])
|
| 511 |
p.add_argument("--image_mode", default="all_views_split",
|
| 512 |
choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
|
| 513 |
+
p.add_argument("--layout", default="presplit",
|
| 514 |
+
choices=["presplit", "files"],
|
| 515 |
+
help="presplit: {root}/{train,valid,test}/pXX/... (custom MIMIC-CXR.zip). "
|
| 516 |
+
"files: {root}/files/pXX/... (raw PhysioNet tree, used by "
|
| 517 |
+
"MIMIC-CXR_resized after tar extraction). Requires --split_csv.")
|
| 518 |
+
p.add_argument("--split_csv", default=None,
|
| 519 |
+
help="mimic-cxr-2.0.0-split.csv (auto-discovered under --mimic_root "
|
| 520 |
+
"if omitted). Required for layout='files'.")
|
| 521 |
+
p.add_argument("--reports_root", default=None,
|
| 522 |
+
help="Root of the mimic-cxr-reports tree (separate from images). "
|
| 523 |
+
"Used when layout='files' and reports are NOT in the image tars.")
|
| 524 |
return p.parse_args()
|
| 525 |
|
| 526 |
|
|
|
|
| 533 |
vqa_root = a.vqa_root,
|
| 534 |
report_mode = a.report_mode,
|
| 535 |
image_mode = a.image_mode,
|
| 536 |
+
layout = a.layout,
|
| 537 |
+
split_csv = a.split_csv,
|
| 538 |
+
reports_root = a.reports_root,
|
| 539 |
)
|
data/mimic_cxr_resized_builder.py
ADDED
|
@@ -0,0 +1,375 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
mimic_cxr_resized_builder.py
|
| 3 |
+
----------------------------
|
| 4 |
+
Build the unified instruction JSON for the MIMIC-CXR_resized dataset
|
| 5 |
+
(`hieu3636/cxr-vlm-data/MIMIC-CXR_resized/`) — a filtered + resized subset
|
| 6 |
+
of MIMIC-CXR that ships with its own manifest CSVs.
|
| 7 |
+
|
| 8 |
+
Why a separate builder?
|
| 9 |
+
MIMIC-CXR_resized is manifest-driven, not directory-walking like
|
| 10 |
+
`mimic_cxr_builder.py`:
|
| 11 |
+
- Splits come from THREE manifest CSVs (manifest_train.csv,
|
| 12 |
+
manifest_val.csv, manifest_test.csv) — NOT from PhysioNet's
|
| 13 |
+
mimic-cxr-2.0.0-split.csv (the user redistributed val/test from
|
| 14 |
+
the original train pool to balance sizes).
|
| 15 |
+
- The 14 CheXpert labels are baked into the manifest as `chex_*`
|
| 16 |
+
columns; no separate chexpert.csv lookup is needed.
|
| 17 |
+
- Each manifest row = ONE image (one DICOM). Multi-view studies
|
| 18 |
+
appear as multiple rows sharing (subject_id, study_id).
|
| 19 |
+
- Image + report paths are stored verbatim in `image_relpath` /
|
| 20 |
+
`report_relpath`, relative to the extracted-tar root. Reports
|
| 21 |
+
live inside the same `files/` tree at patient-dir level
|
| 22 |
+
(e.g. files/p10/p10000032/s50414267.txt), NOT inside study dirs.
|
| 23 |
+
|
| 24 |
+
VQA
|
| 25 |
+
---
|
| 26 |
+
3 JSON files (`vqa.json` for train, `vqa_val.json`, `vqa_test.json`) sit
|
| 27 |
+
under a `vqa/` sibling dir. Each row references `image_path` exactly the
|
| 28 |
+
same as `image_relpath` in the manifest, so we look it up in an
|
| 29 |
+
image-index built during the manifest pass. Missing-image VQA rows are
|
| 30 |
+
dropped (the resized subset has fewer images than the full MIMIC).
|
| 31 |
+
|
| 32 |
+
Output JSON schema is identical to the other two builders so downstream
|
| 33 |
+
(`CXRInstructDataset`, evaluation) is unchanged.
|
| 34 |
+
"""
|
| 35 |
+
|
| 36 |
+
import argparse
|
| 37 |
+
import csv
|
| 38 |
+
import json
|
| 39 |
+
from pathlib import Path
|
| 40 |
+
from typing import Dict, List, Optional, Tuple
|
| 41 |
+
|
| 42 |
+
from .mimic_cxr_builder import _parse_report # reuse the same FINDINGS/IMPRESSION regex
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Manifest column name → CheXpert PATHOLOGIES name mapping is direct:
|
| 46 |
+
# manifest col "chex_Atelectasis" ↔ PATHOLOGIES "Atelectasis", etc.
|
| 47 |
+
# We resolve the canonical 14-name list at runtime to stay in sync with
|
| 48 |
+
# `model.chexpert_classifier.PATHOLOGIES` (single source of truth).
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# ─── PNU builder from a single manifest row ─────────────────────────────────
|
| 52 |
+
|
| 53 |
+
def _row_to_pnu(row: Dict[str, str]) -> str:
    """
    Render the `chex_*` columns of one manifest row as a PNU string.

    For every name in `PATHOLOGIES` the cell `chex_<name>` is read,
    stringified and stripped, then classified:
        "1"  / "1.0"   → CLASS_POSITIVE
        "0"  / "0.0"   → CLASS_NEGATIVE
        "-1" / "-1.0"  → CLASS_UNCERTAIN
        anything else (blank, missing column, "nan", ...) → CLASS_NEGATIVE

    The resulting name → class mapping is handed to `buckets_to_pnu` and
    its return value is returned unchanged.
    """
    # Imported at call time so the label list always comes from
    # `model.chexpert_classifier` rather than a copy kept in this module.
    from model.chexpert_classifier import (
        PATHOLOGIES, buckets_to_pnu,
        CLASS_NEGATIVE, CLASS_POSITIVE, CLASS_UNCERTAIN,
    )

    # Each label value may be written as an int ("1") or a float ("1.0").
    class_by_text = {}
    for cls, text in ((CLASS_POSITIVE, "1"),
                      (CLASS_NEGATIVE, "0"),
                      (CLASS_UNCERTAIN, "-1")):
        class_by_text[text] = cls
        class_by_text[text + ".0"] = cls

    classes = {
        name: class_by_text.get(str(row.get(f"chex_{name}", "")).strip(),
                                CLASS_NEGATIVE)
        for name in PATHOLOGIES
    }
    return buckets_to_pnu(classes)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
# ─── Helpers ────────────────────────────────────────────────────────────────
|
| 80 |
+
|
| 81 |
+
# Manifest split-label ↔ output split-label.
|
| 82 |
+
# Manifest uses "val" (3-letter); the rest of the pipeline expects "validate".
|
| 83 |
+
# (manifest CSV file name, split label) pairs. The builder looks each file up
# under `manifest_dir` and writes the label into every sample's "split" field.
_MANIFEST_FILES = (
    ("manifest_train.csv", "train"),
    ("manifest_val.csv", "validate"),
    ("manifest_test.csv", "test"),
)
# (VQA JSON file name, split label) pairs, looked up under `vqa_dir`; the
# labels match the ones used for the manifests above.
_VQA_FILES = (
    ("vqa.json", "train"),
    ("vqa_val.json", "validate"),
    ("vqa_test.json","test"),
)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def _group_manifest_by_study(csv_path: Path) -> Dict[Tuple[str, str], List[Dict[str, str]]]:
|
| 96 |
+
"""
|
| 97 |
+
Parse one manifest CSV and group rows by (subject_id, study_id) so that
|
| 98 |
+
multi-view studies end up as a single bucket — needed for the
|
| 99 |
+
`multi_image_merged` image_mode and to keep one structured_findings per
|
| 100 |
+
study (all views of a study share the same CheXpert labels).
|
| 101 |
+
"""
|
| 102 |
+
grouped: Dict[Tuple[str, str], List[Dict[str, str]]] = {}
|
| 103 |
+
with open(csv_path, encoding="utf-8", newline="") as f:
|
| 104 |
+
reader = csv.DictReader(f)
|
| 105 |
+
for row in reader:
|
| 106 |
+
key = (str(row["subject_id"]).strip(),
|
| 107 |
+
str(row["study_id"]).strip())
|
| 108 |
+
grouped.setdefault(key, []).append(row)
|
| 109 |
+
return grouped
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _image_groups(rels: List[str], image_mode: str):
|
| 113 |
+
"""Yield path_fields dicts honouring image_mode (mirrors the other builders)."""
|
| 114 |
+
if image_mode == "all_views_split":
|
| 115 |
+
for r in rels:
|
| 116 |
+
yield {"image_path": r, "image_paths": None}
|
| 117 |
+
elif image_mode == "frontal_only_split":
|
| 118 |
+
yield {"image_path": rels[0], "image_paths": None}
|
| 119 |
+
else: # multi_image_merged
|
| 120 |
+
yield {"image_path": None, "image_paths": rels}
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ─── Main builder ───────────────────────────────────────────────────────────
|
| 124 |
+
|
| 125 |
+
def _resolve_reports_root(root: Path, manifest_dir: Path, max_probe: int = 32) -> Path:
    """Pick the directory that the manifests' `report_relpath` values resolve against.

    Candidates are `root` and `root / "reports"`, tried in that order. Up to
    `max_probe` non-empty `report_relpath` values are sampled from each
    manifest CSV that exists, and the first candidate under which at least
    one sampled report file exists wins. If no sampled report is found
    anywhere (or no manifest is readable), fall back to the first candidate
    containing a `files/` directory, and finally to `root`.
    """
    candidates = [root, root / "reports"]

    probes: List[str] = []
    for fname, _split in _MANIFEST_FILES:
        csv_path = manifest_dir / fname
        if not csv_path.is_file():
            continue
        taken = 0
        with open(csv_path, encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                rel = (row.get("report_relpath") or "").strip()
                if not rel:
                    continue
                probes.append(rel)
                taken += 1
                if taken >= max_probe:
                    break

    # Probe for real report files. A bare `files/` directory check is not
    # enough: images live under `{root}/files` too, so `root` would always
    # win even when the reports sit under `{root}/reports/files`.
    for cand in candidates:
        if any((cand / rel).is_file() for rel in probes):
            return cand

    for cand in candidates:
        if (cand / "files").is_dir():
            return cand
    return root


def build_mimic_cxr_resized_instruct_json(
    root: str,
    manifest_dir: Optional[str],
    output_path: str,
    vqa_dir: Optional[str] = None,
    reports_root: Optional[str] = None,
    report_mode: str = "split",           # "split" | "merged" | "split_cascade"
    image_mode: str = "all_views_split",  # "all_views_split" | "frontal_only_split" | "multi_image_merged"
) -> str:
    """
    Build the unified MIMIC-CXR_resized instruction JSON.

    Pass 1 walks the manifest CSVs listed in `_MANIFEST_FILES`, groups rows
    per study, parses each study's report and emits findings / impression /
    merged-report samples according to `report_mode` and `image_mode`.
    Pass 2 (optional) appends VQA samples whose `image_path` was seen in a
    manifest. The sample list is written to `output_path` as one JSON array
    and a summary is printed.

    Args:
        root:         directory containing the extracted tar shards, so
                      `{root}/{image_relpath}` resolves to an image.
        manifest_dir: directory containing manifest_{train,val,test}.csv.
                      If None/empty → defaults to `root`.
        output_path:  where to write the JSON (parent dirs are created).
        vqa_dir:      directory containing vqa.json / vqa_val.json /
                      vqa_test.json. If None or files missing → VQA skipped.
        reports_root: directory that `report_relpath` resolves against. If
                      None, `{root}` and `{root}/reports` are probed for
                      report files named in the manifests and the first
                      one with a hit is used.
        report_mode:  "split" | "merged" | "split_cascade".
        image_mode:   "all_views_split" | "frontal_only_split" |
                      "multi_image_merged".

    Returns:
        `output_path` as a string (as passed in; it is not resolved to an
        absolute path).

    Raises:
        AssertionError: if `report_mode` or `image_mode` is not one of the
                        accepted values.
    """
    assert report_mode in ("split", "merged", "split_cascade"), \
        f"report_mode invalid: {report_mode!r}"
    assert image_mode in ("all_views_split", "frontal_only_split", "multi_image_merged"), \
        f"image_mode invalid: {image_mode!r}"

    from .dataset import format_merged_report   # local import to avoid cycle

    root = Path(root)
    manifest_dir = Path(manifest_dir) if manifest_dir else root
    output_path = Path(output_path)

    # ── Resolve reports_root (auto-probe if not set) ────────────────────────
    if reports_root is not None:
        reports_root_p = Path(reports_root)
    else:
        reports_root_p = _resolve_reports_root(root, manifest_dir)
    print(f"[mimic_cxr_resized_builder] reports_root resolved → {reports_root_p}")

    # ── Pass 1: walk the 3 manifest CSVs ────────────────────────────────────
    samples: List[Dict] = []
    known_images = set()                            # every image_relpath seen (VQA lookup)
    pnu_by_study: Dict[Tuple[str, str], str] = {}   # (subj, study) → PNU string (VQA reuse)

    n_studies = n_missing_report = 0
    skipped_merged_no_impression = skipped_cascade_no_findings = 0

    for fname, split_label in _MANIFEST_FILES:
        csv_path = manifest_dir / fname
        if not csv_path.is_file():
            print(f"[mimic_cxr_resized_builder] manifest missing: {csv_path} — skipping {split_label}")
            continue
        grouped = _group_manifest_by_study(csv_path)
        print(f"[mimic_cxr_resized_builder] {fname}: "
              f"{sum(len(v) for v in grouped.values()):,} rows / "
              f"{len(grouped):,} studies")

        for (subj, study), rows in grouped.items():
            n_studies += 1

            # The first row of the study supplies the report path and the
            # CheXpert labels for all of its views.
            first = rows[0]
            rels = [r["image_relpath"] for r in rows]

            # Index every image (incl. studies with no report) so a VQA row
            # that references this image can still be picked up below.
            known_images.update(rels)

            structured = _row_to_pnu(first)
            pnu_by_study[(subj, study)] = structured

            # `or ""` covers short CSV rows, where DictReader fills the
            # missing cell with None.
            report_rel = (first.get("report_relpath") or "").strip()
            if not report_rel:
                n_missing_report += 1
                continue
            report_path = reports_root_p / report_rel
            if not report_path.is_file():
                n_missing_report += 1
                continue
            findings, impression = _parse_report(report_path)

            # Ids are emitted with a "p" / "s" prefix (added when absent).
            subj_str = f"p{subj}" if not subj.startswith("p") else subj
            study_str = f"s{study}" if not study.startswith("s") else study

            for path_fields in _image_groups(rels, image_mode):
                base = {
                    **path_fields,
                    "question": None,
                    "split": split_label,
                    "study_id": study_str,
                    "subject_id": subj_str,
                }
                if report_mode == "merged":
                    target = format_merged_report(findings, impression)
                    if target is None:
                        skipped_merged_no_impression += 1
                        continue
                    samples.append({**base, "task": "report",
                                    "target": target,
                                    "structured_findings": structured})
                elif report_mode == "split_cascade":
                    if findings:
                        samples.append({**base, "task": "findings",
                                        "target": findings,
                                        "structured_findings": structured})
                    if impression:
                        if not findings:
                            skipped_cascade_no_findings += 1
                        else:
                            # Cascade: the impression sample is conditioned
                            # on the findings text instead of the PNU string.
                            samples.append({**base, "task": "impression",
                                            "target": impression,
                                            "structured_findings":
                                                f"Findings: {findings}"})
                else:  # "split"
                    if findings:
                        samples.append({**base, "task": "findings",
                                        "target": findings,
                                        "structured_findings": structured})
                    if impression:
                        samples.append({**base, "task": "impression",
                                        "target": impression,
                                        "structured_findings": structured})

    # ── Pass 2: optional VQA attach ─────────────────────────────────────────
    n_vqa = n_vqa_dropped = 0
    if vqa_dir:
        vqa_dir = Path(vqa_dir)
        for fname, split_label in _VQA_FILES:
            vqa_file = vqa_dir / fname
            if not vqa_file.is_file():
                print(f"[mimic_cxr_resized_builder] VQA missing: {vqa_file} — skipping {split_label}")
                continue
            # Context manager so the handle is closed even if parsing fails.
            with open(vqa_file, encoding="utf-8") as f:
                vqa_rows = json.load(f)
            for row in vqa_rows:
                img_path = str(row.get("image_path", "")).lstrip("/")
                if img_path not in known_images:
                    n_vqa_dropped += 1
                    continue
                ans = row.get("answer", [])
                answer = (", ".join(map(str, ans)) if isinstance(ans, list)
                          else str(ans)) or "No."
                subj = str(row.get("subject_id", "")).strip()
                study = str(row.get("study_id", "")).strip()
                # Reuse the study's PNU string from the manifest pass; None
                # when the (subject_id, study_id) pair was not seen there.
                structured = pnu_by_study.get((subj, study))
                subj_str = f"p{subj}" if subj and not subj.startswith("p") else subj
                study_str = f"s{study}" if study and not study.startswith("s") else study
                samples.append({
                    "image_path": img_path, "image_paths": None,
                    "task": "vqa", "target": answer,
                    "question": row["question"],
                    "structured_findings": structured,
                    "split": split_label,
                    "study_id": study_str,
                    "subject_id": subj_str,
                })
                n_vqa += 1

    # ── Write ───────────────────────────────────────────────────────────────
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(samples, f, ensure_ascii=False)

    by_split, by_task = {}, {}
    for s in samples:
        by_split[s["split"]] = by_split.get(s["split"], 0) + 1
        by_task[s["task"]] = by_task.get(s["task"], 0) + 1

    print(f"[mimic_cxr_resized_builder] wrote {len(samples):,} samples → {output_path}")
    print(f" root : {root}")
    print(f" manifest_dir : {manifest_dir}")
    print(f" vqa_dir : {vqa_dir if vqa_dir else '(disabled)'}")
    print(f" report_mode : {report_mode}")
    print(f" image_mode : {image_mode}")
    print(f" studies indexed : {n_studies:,}")
    print(f" missing report : {n_missing_report:,}")
    if report_mode == "merged":
        print(f" skipped no_impr : {skipped_merged_no_impression:,}")
    if report_mode == "split_cascade":
        print(f" skipped impr w/o findings : {skipped_cascade_no_findings:,}")
    if vqa_dir:
        print(f" vqa added/dropped : {n_vqa:,} / {n_vqa_dropped:,}")
    print(f" by split : {by_split}")
    print(f" by task : {by_task}")
    return str(output_path)
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
# ─── CLI ────────────────────────────────────────────────────────────────────
|
| 345 |
+
|
| 346 |
+
def _parse_args():
|
| 347 |
+
p = argparse.ArgumentParser(description="Build MIMIC-CXR_resized unified instruction JSON")
|
| 348 |
+
p.add_argument("--root", required=True,
|
| 349 |
+
help="Root containing files/pXX/... after extracting tar shards.")
|
| 350 |
+
p.add_argument("--manifest_dir", default=None,
|
| 351 |
+
help="Folder with manifest_{train,val,test}.csv (defaults to --root).")
|
| 352 |
+
p.add_argument("--output", required=True, help="Output JSON path.")
|
| 353 |
+
p.add_argument("--vqa_dir", default=None,
|
| 354 |
+
help="Folder with vqa.json / vqa_val.json / vqa_test.json. Omit to skip VQA.")
|
| 355 |
+
p.add_argument("--reports_root", default=None,
|
| 356 |
+
help="Directory that report_relpath resolves against. "
|
| 357 |
+
"Omit to auto-probe `{root}` then `{root}/reports`.")
|
| 358 |
+
p.add_argument("--report_mode", default="split",
|
| 359 |
+
choices=["split", "merged", "split_cascade"])
|
| 360 |
+
p.add_argument("--image_mode", default="all_views_split",
|
| 361 |
+
choices=["all_views_split", "frontal_only_split", "multi_image_merged"])
|
| 362 |
+
return p.parse_args()
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
if __name__ == "__main__":
    # CLI entry point: parse the flags and forward each one to the builder.
    # Only `--output` needs renaming (to the `output_path` parameter).
    cli = _parse_args()
    builder_kwargs = {
        "root": cli.root,
        "manifest_dir": cli.manifest_dir,
        "output_path": cli.output,
        "vqa_dir": cli.vqa_dir,
        "reports_root": cli.reports_root,
        "report_mode": cli.report_mode,
        "image_mode": cli.image_mode,
    }
    build_mimic_cxr_resized_instruct_json(**builder_kwargs)
|
data/distri-IU-Xray.py → distri-IU-Xray.py
RENAMED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
import os, glob
|
| 2 |
-
from xml.etree import ElementTree as ET
|
| 3 |
-
|
| 4 |
-
xml_dir = "D:/USTH/KLTN/data/IU-Xray/labels/ecgen-radiology/"
|
| 5 |
-
counts = {}
|
| 6 |
-
for f in glob.glob(xml_dir + "*.xml"):
|
| 7 |
-
tree = ET.parse(f)
|
| 8 |
-
n = len(tree.findall(".//parentImage"))
|
| 9 |
-
counts[n] = counts.get(n, 0) + 1
|
| 10 |
-
|
| 11 |
-
for k, v in sorted(counts.items()):
|
| 12 |
print(f"{k} ảnh/report: {v} reports")
|
|
|
|
| 1 |
+
import os, glob
|
| 2 |
+
from xml.etree import ElementTree as ET
|
| 3 |
+
|
| 4 |
+
# Histogram of "images per report" for the IU-Xray label XMLs: for every
# *.xml file directly under `xml_dir`, count its <parentImage> elements and
# tally how many reports have that count.
xml_dir = "D:/USTH/KLTN/data/IU-Xray/labels/ecgen-radiology/"
counts = {}
for xml_path in glob.glob(xml_dir + "*.xml"):
    images_in_report = len(ET.parse(xml_path).findall(".//parentImage"))
    if images_in_report in counts:
        counts[images_in_report] += 1
    else:
        counts[images_in_report] = 1

# Print one line per distinct image count, in ascending order.
for images_in_report in sorted(counts):
    print(f"{images_in_report} ảnh/report: {counts[images_in_report]} reports")
|
data/img_stat.py → img_stat.py
RENAMED
|
File without changes
|
data/rezip.py → rezip.py
RENAMED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
import zipfile
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
-
zipf = zipfile.ZipFile(r"D:\USTH\KLTN\cxr-vlm-code.zip", 'w', zipfile.ZIP_DEFLATED)
|
| 5 |
-
|
| 6 |
-
for root, dirs, files in os.walk('KLTN'):
|
| 7 |
-
for file in files:
|
| 8 |
-
filepath = os.path.join(root, file)
|
| 9 |
-
arcname = os.path.relpath(filepath, 'KLTN')
|
| 10 |
-
zipf.write(filepath, arcname)
|
| 11 |
-
|
| 12 |
zipf.close()
|
|
|
|
| 1 |
+
import zipfile
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
# Pack every file under the relative directory 'KLTN' into one deflated zip,
# storing each entry with its path relative to 'KLTN'.
archive_path = r"D:\USTH\KLTN\cxr-vlm-code.zip"
archive_abs = os.path.abspath(archive_path)

# `with` closes (and finalizes) the archive even if the walk raises.
with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, files in os.walk('KLTN'):
        for file in files:
            filepath = os.path.join(root, file)
            # The output zip may sit inside the walked tree (e.g. when run
            # from the parent of KLTN); never add the archive to itself.
            if os.path.abspath(filepath) == archive_abs:
                continue
            arcname = os.path.relpath(filepath, 'KLTN')
            zipf.write(filepath, arcname)
|
scripts/cxrvlm_colab_train.ipynb
CHANGED
|
@@ -5,32 +5,7 @@
|
|
| 5 |
"metadata": {
|
| 6 |
"id": "cell-0"
|
| 7 |
},
|
| 8 |
-
"source":
|
| 9 |
-
"# CXR-VLM — Kaggle Training Notebook (consolidated)\n",
|
| 10 |
-
"\n",
|
| 11 |
-
"Trains the 2-stage CXR-VLM (Vicuna-7B + BioViL-T fallback to timm ViT + LoRA) on a Kaggle **T4** GPU.\n",
|
| 12 |
-
"\n",
|
| 13 |
-
"Supports **two datasets**, selected by `DATASET_NAME` in section 0:\n",
|
| 14 |
-
"- **`MIMIC-CXR`** — full 3 tasks (findings, impression, VQA).\n",
|
| 15 |
-
"- **`IU-Xray`** — 2 tasks only (findings, impression). Much lighter dataset (~7.5k images).\n",
|
| 16 |
-
"\n",
|
| 17 |
-
"### Before you run\n",
|
| 18 |
-
"\n",
|
| 19 |
-
"Attach Kaggle Datasets via `+ Add Input`:\n",
|
| 20 |
-
"\n",
|
| 21 |
-
"| Dataset slug | Contents | When needed |\n",
|
| 22 |
-
"|---|---|---|\n",
|
| 23 |
-
"| `cxr-vlm-code` | entire `D:\\USTH\\KLTN` folder (configs/, data/*.py, model/, training/, evaluation/, utils/, requirements.txt) | **always** |\n",
|
| 24 |
-
"| `cxr-vlm-data` | holds **both** datasets: `MIMIC-CXR/{train,valid,test}/p*/...` + `MIMIC-Ext-MIMIC-CXR-VQA/...` and/or `IU-Xray/images/` + `IU-Xray/labels/` | **always** |\n",
|
| 25 |
-
"\n",
|
| 26 |
-
"**Settings (right panel):**\n",
|
| 27 |
-
"- Accelerator: **T4 x2** (only GPU 0 will be used)\n",
|
| 28 |
-
"- Persistence: **Variables and Files**\n",
|
| 29 |
-
"- Internet: **On**\n",
|
| 30 |
-
"\n",
|
| 31 |
-
"**Kaggle Secrets** (Add-ons → Secrets):\n",
|
| 32 |
-
"- `HF_TOKEN` — HuggingFace token with write access to the runs repo."
|
| 33 |
-
],
|
| 34 |
"id": "cell-0"
|
| 35 |
},
|
| 36 |
{
|
|
@@ -54,27 +29,9 @@
|
|
| 54 |
},
|
| 55 |
"outputId": "d6e7ebbd-4f1b-483b-f20f-0df3997a60b7"
|
| 56 |
},
|
| 57 |
-
"source":
|
| 58 |
-
"# ── Platform + dataset selectors ──────────────────────────────────\n",
|
| 59 |
-
"# PLATFORM drives storage paths and how secrets are read.\n",
|
| 60 |
-
"# Supported: 'kaggle' | 'colab' | 'lightning' | 'gcp' | 'local'\n",
|
| 61 |
-
"PLATFORM = 'colab'\n",
|
| 62 |
-
"DATASET_NAME = 'IU-Xray' # 'MIMIC-CXR' | 'IU-Xray'\n",
|
| 63 |
-
"\n",
|
| 64 |
-
"assert PLATFORM in ('kaggle', 'colab', 'lightning', 'gcp', 'local')\n",
|
| 65 |
-
"assert DATASET_NAME in ('MIMIC-CXR', 'IU-Xray')\n",
|
| 66 |
-
"print(f'PLATFORM = {PLATFORM} | DATASET_NAME = {DATASET_NAME}')\n"
|
| 67 |
-
],
|
| 68 |
"execution_count": null,
|
| 69 |
-
"outputs": [
|
| 70 |
-
{
|
| 71 |
-
"output_type": "stream",
|
| 72 |
-
"name": "stdout",
|
| 73 |
-
"text": [
|
| 74 |
-
"PLATFORM = colab | DATASET_NAME = IU-Xray\n"
|
| 75 |
-
]
|
| 76 |
-
}
|
| 77 |
-
],
|
| 78 |
"id": "cell-select"
|
| 79 |
},
|
| 80 |
{
|
|
@@ -119,120 +76,9 @@
|
|
| 119 |
},
|
| 120 |
"outputId": "f6195a48-56b2-4052-8367-c8ec14c48a05"
|
| 121 |
},
|
| 122 |
-
"source": [
|
| 123 |
-
"# ── Per-platform storage + source-of-truth ─────────────────────────\n",
|
| 124 |
-
"# Kaggle : code + data come from attached Kaggle datasets (pre-mounted).\n",
|
| 125 |
-
"# Others : pull code (folder) + data (single zip) from HF Hub dataset repos.\n",
|
| 126 |
-
"#\n",
|
| 127 |
-
"# Required HF repos:\n",
|
| 128 |
-
"# <HF_USER>/cxr-vlm-code — project source (flat folder)\n",
|
| 129 |
-
"# <HF_USER>/cxr-vlm-data — contains IU-Xray.zip (one zip per dataset)\n",
|
| 130 |
-
"\n",
|
| 131 |
-
"HF_USER = 'hieu3636' # <<< EDIT ME\n",
|
| 132 |
-
"\n",
|
| 133 |
-
"if PLATFORM == 'kaggle':\n",
|
| 134 |
-
" INPUT_ROOT = Path('/kaggle/input')\n",
|
| 135 |
-
" WORK = Path('/kaggle/working')\n",
|
| 136 |
-
" def find_dataset(slug, required=True):\n",
|
| 137 |
-
" for cand in [INPUT_ROOT / slug, *INPUT_ROOT.rglob(slug)]:\n",
|
| 138 |
-
" if cand.is_dir():\n",
|
| 139 |
-
" return cand\n",
|
| 140 |
-
" if required:\n",
|
| 141 |
-
" raise FileNotFoundError(f'Dataset {slug!r} not attached')\n",
|
| 142 |
-
" return None\n",
|
| 143 |
-
" CODE_SRC = find_dataset('cxr-vlm-code')\n",
|
| 144 |
-
" DATA_SRC = find_dataset('cxr-vlm-data')\n",
|
| 145 |
-
"\n",
|
| 146 |
-
"else:\n",
|
| 147 |
-
" # ── Non-Kaggle: resolve WORK, then pull from HF ──\n",
|
| 148 |
-
" if PLATFORM == 'colab':\n",
|
| 149 |
-
" from google.colab import userdata\n",
|
| 150 |
-
" os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n",
|
| 151 |
-
" WORK = Path('/content')\n",
|
| 152 |
-
" elif PLATFORM == 'lightning':\n",
|
| 153 |
-
" WORK = Path('/teamspace/studios/this_studio')\n",
|
| 154 |
-
" elif PLATFORM == 'gcp':\n",
|
| 155 |
-
" WORK = Path('/workspace')\n",
|
| 156 |
-
" else:\n",
|
| 157 |
-
" WORK = Path.home() / 'cxr-vlm-work'\n",
|
| 158 |
-
" WORK.mkdir(parents=True, exist_ok=True)\n",
|
| 159 |
-
"\n",
|
| 160 |
-
" assert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing — set it via platform secrets UI.'\n",
|
| 161 |
-
"\n",
|
| 162 |
-
" try:\n",
|
| 163 |
-
" from huggingface_hub import snapshot_download, hf_hub_download\n",
|
| 164 |
-
" except ImportError:\n",
|
| 165 |
-
" !pip install -q huggingface_hub\n",
|
| 166 |
-
" from huggingface_hub import snapshot_download, hf_hub_download\n",
|
| 167 |
-
"\n",
|
| 168 |
-
" # 1) Code: flat folder, few hundred files → snapshot_download ok\n",
|
| 169 |
-
" print(f'Pulling code from HF (user: {HF_USER}) …')\n",
|
| 170 |
-
" CODE_SRC = Path(snapshot_download(\n",
|
| 171 |
-
" repo_id = f'{HF_USER}/cxr-vlm-code',\n",
|
| 172 |
-
" repo_type = 'model',\n",
|
| 173 |
-
" token = os.environ['HF_TOKEN'],\n",
|
| 174 |
-
" local_dir = str(WORK / 'cxr-vlm-code'),\n",
|
| 175 |
-
" ))\n",
|
| 176 |
-
"\n",
|
| 177 |
-
" # 2) Data: single zip per dataset (avoids per-file rate limits)\n",
|
| 178 |
-
" import zipfile\n",
|
| 179 |
-
" DATA_SRC = WORK / 'data'\n",
|
| 180 |
-
" DATA_SRC.mkdir(parents=True, exist_ok=True)\n",
|
| 181 |
-
"\n",
|
| 182 |
-
" zip_name = f'{DATASET_NAME}.zip' # 'IU-Xray.zip' | 'MIMIC-CXR.zip'\n",
|
| 183 |
-
" marker = DATA_SRC / DATASET_NAME # DATA_SRC/IU-Xray after unzip\n",
|
| 184 |
-
"\n",
|
| 185 |
-
" if not marker.exists():\n",
|
| 186 |
-
" print(f'Pulling {zip_name} from HF …')\n",
|
| 187 |
-
" zpath = hf_hub_download(\n",
|
| 188 |
-
" repo_id = f'{HF_USER}/cxr-vlm-data',\n",
|
| 189 |
-
" filename = zip_name,\n",
|
| 190 |
-
" repo_type = 'dataset',\n",
|
| 191 |
-
" token = os.environ['HF_TOKEN'],\n",
|
| 192 |
-
" local_dir = str(DATA_SRC),\n",
|
| 193 |
-
" )\n",
|
| 194 |
-
" print(f' unzipping → {DATA_SRC}')\n",
|
| 195 |
-
" with zipfile.ZipFile(zpath) as zf:\n",
|
| 196 |
-
" zf.extractall(DATA_SRC)\n",
|
| 197 |
-
" try:\n",
|
| 198 |
-
" os.remove(zpath) # free disk\n",
|
| 199 |
-
" except OSError:\n",
|
| 200 |
-
" pass\n",
|
| 201 |
-
" else:\n",
|
| 202 |
-
" print(f'{marker} already present — skipping download.')\n",
|
| 203 |
-
" print(f'Contents of {DATA_SRC}: {sorted(os.listdir(DATA_SRC))}')\n",
|
| 204 |
-
"\n",
|
| 205 |
-
"# ── Common: copy code into writable PROJECT dir ────────────────────\n",
|
| 206 |
-
"PROJECT = WORK / 'cxr_vlm'\n",
|
| 207 |
-
"if CODE_SRC.resolve() != PROJECT.resolve() and not PROJECT.exists():\n",
|
| 208 |
-
" shutil.copytree(CODE_SRC, PROJECT)\n",
|
| 209 |
-
"\n",
|
| 210 |
-
"os.chdir(PROJECT)\n",
|
| 211 |
-
"sys.path.insert(0, str(PROJECT))\n",
|
| 212 |
-
"print('PLATFORM :', PLATFORM)\n",
|
| 213 |
-
"print('CODE_SRC :', CODE_SRC)\n",
|
| 214 |
-
"print('DATA_SRC :', DATA_SRC)\n",
|
| 215 |
-
"print('PROJECT :', PROJECT)\n",
|
| 216 |
-
"print('WORK :', WORK)\n"
|
| 217 |
-
],
|
| 218 |
"execution_count": null,
|
| 219 |
-
"outputs": [
|
| 220 |
-
{
|
| 221 |
-
"output_type": "stream",
|
| 222 |
-
"name": "stdout",
|
| 223 |
-
"text": [
|
| 224 |
-
"Pulling code from HF (user: hieu3636) …\n",
|
| 225 |
-
"Pulling IU-Xray.zip from HF …\n",
|
| 226 |
-
" unzipping → /content/data\n",
|
| 227 |
-
"Contents of /content/data: ['.cache', 'IU-Xray']\n",
|
| 228 |
-
"PLATFORM : colab\n",
|
| 229 |
-
"CODE_SRC : /content/cxr-vlm-code\n",
|
| 230 |
-
"DATA_SRC : /content/data\n",
|
| 231 |
-
"PROJECT : /content/cxr_vlm\n",
|
| 232 |
-
"WORK : /content\n"
|
| 233 |
-
]
|
| 234 |
-
}
|
| 235 |
-
],
|
| 236 |
"id": "cell-paths"
|
| 237 |
},
|
| 238 |
{
|
|
@@ -336,29 +182,7 @@
|
|
| 336 |
"metadata": {
|
| 337 |
"id": "cell-data-md"
|
| 338 |
},
|
| 339 |
-
"source":
|
| 340 |
-
"## 2. Locate data on Kaggle\n",
|
| 341 |
-
"\n",
|
| 342 |
-
"Both datasets live under the single `cxr-vlm-data` slug. Expected layouts:\n",
|
| 343 |
-
"\n",
|
| 344 |
-
"**MIMIC-CXR**:\n",
|
| 345 |
-
"```\n",
|
| 346 |
-
"DATA_SRC/\n",
|
| 347 |
-
"├── MIMIC-CXR/ (or at root)\n",
|
| 348 |
-
"│ ├── train/p10/pXXXXXX/sYYYYY/*.jpg + sYYYYY.txt\n",
|
| 349 |
-
"│ ├── valid/p10/...\n",
|
| 350 |
-
"│ └── test/p10/...\n",
|
| 351 |
-
"└── .../MIMIC-Ext-MIMIC-CXR-VQA/dataset/{train,valid,test}.json\n",
|
| 352 |
-
"```\n",
|
| 353 |
-
"\n",
|
| 354 |
-
"**IU-Xray** (added alongside MIMIC under the same slug):\n",
|
| 355 |
-
"```\n",
|
| 356 |
-
"DATA_SRC/\n",
|
| 357 |
-
"└── IU-Xray/\n",
|
| 358 |
-
" ├── images/ # CXR*_IM-*-*.png (~7.5k files)\n",
|
| 359 |
-
" └── labels/ # {1..3999}.xml (~3.9k files, flat — no ecgen-radiology subfolder)\n",
|
| 360 |
-
"```"
|
| 361 |
-
],
|
| 362 |
"id": "cell-data-md"
|
| 363 |
},
|
| 364 |
{
|
|
@@ -370,104 +194,9 @@
|
|
| 370 |
},
|
| 371 |
"outputId": "53c15833-f9bb-4457-95d6-3fa83f4dc909"
|
| 372 |
},
|
| 373 |
-
"source": [
|
| 374 |
-
"def find_split_parent(root: Path) -> Path:\n",
|
| 375 |
-
" for cand in [root, root / 'MIMIC-CXR', root / 'data' / 'MIMIC-CXR']:\n",
|
| 376 |
-
" if (cand / 'train').exists() and (cand / 'valid').exists() and (cand / 'test').exists():\n",
|
| 377 |
-
" return cand\n",
|
| 378 |
-
" for p in root.rglob('train'):\n",
|
| 379 |
-
" if p.is_dir() and (p.parent / 'valid').exists() and (p.parent / 'test').exists():\n",
|
| 380 |
-
" return p.parent\n",
|
| 381 |
-
" raise FileNotFoundError('Could not find train/ valid/ test/ under ' + str(root))\n",
|
| 382 |
-
"\n",
|
| 383 |
-
"\n",
|
| 384 |
-
"def find_iu_dirs(root: Path):\n",
|
| 385 |
-
" \"\"\"Locate IU-Xray `images/` and `labels/` (flat XMLs) under `root`.\n",
|
| 386 |
-
"\n",
|
| 387 |
-
" Resolution order:\n",
|
| 388 |
-
" 1. `{root}/IU-Xray/{images,labels}` — canonical layout.\n",
|
| 389 |
-
" 2. Any nested `IU-Xray` folder that contains both.\n",
|
| 390 |
-
" 3. Fallback: any folder containing CXR*.png (images) and\n",
|
| 391 |
-
" any folder containing *.xml — whichever comes first.\n",
|
| 392 |
-
"\n",
|
| 393 |
-
" The labels subfolder is treated as a flat directory of XMLs (we no\n",
|
| 394 |
-
" longer require the legacy `ecgen-radiology/` subfolder).\n",
|
| 395 |
-
" \"\"\"\n",
|
| 396 |
-
" # Canonical + nested\n",
|
| 397 |
-
" for cand in [root / 'IU-Xray', *root.rglob('IU-Xray')]:\n",
|
| 398 |
-
" if not cand.is_dir():\n",
|
| 399 |
-
" continue\n",
|
| 400 |
-
" imgs = cand / 'images'\n",
|
| 401 |
-
" lbls = cand / 'labels'\n",
|
| 402 |
-
" if imgs.is_dir() and lbls.is_dir() and any(lbls.glob('*.xml')):\n",
|
| 403 |
-
" return imgs, lbls\n",
|
| 404 |
-
" # Legacy: labels/ecgen-radiology/*.xml\n",
|
| 405 |
-
" legacy = lbls / 'ecgen-radiology'\n",
|
| 406 |
-
" if imgs.is_dir() and legacy.is_dir() and any(legacy.glob('*.xml')):\n",
|
| 407 |
-
" return imgs, legacy\n",
|
| 408 |
-
"\n",
|
| 409 |
-
" # Fallback: any images/ with CXR*.png + any folder with XML\n",
|
| 410 |
-
" img_dir = lbl_dir = None\n",
|
| 411 |
-
" for cand in [root / 'images', *root.rglob('images')]:\n",
|
| 412 |
-
" if cand.is_dir() and any(cand.glob('CXR*.png')):\n",
|
| 413 |
-
" img_dir = cand; break\n",
|
| 414 |
-
" for cand in [root / 'labels', *root.rglob('labels')]:\n",
|
| 415 |
-
" if cand.is_dir() and any(cand.glob('*.xml')):\n",
|
| 416 |
-
" lbl_dir = cand; break\n",
|
| 417 |
-
" if lbl_dir is None:\n",
|
| 418 |
-
" # very last resort — any ecgen-radiology folder with XMLs\n",
|
| 419 |
-
" for cand in root.rglob('ecgen-radiology'):\n",
|
| 420 |
-
" if cand.is_dir() and any(cand.glob('*.xml')):\n",
|
| 421 |
-
" lbl_dir = cand; break\n",
|
| 422 |
-
" return img_dir, lbl_dir\n",
|
| 423 |
-
"\n",
|
| 424 |
-
"\n",
|
| 425 |
-
"# Filled in below depending on DATASET_NAME\n",
|
| 426 |
-
"CXR_ROOT = None # MIMIC-CXR root (with train/valid/test subdirs)\n",
|
| 427 |
-
"SPLIT_DIRS = None # MIMIC only\n",
|
| 428 |
-
"VQA_ROOT = None # MIMIC only\n",
|
| 429 |
-
"IU_IMAGES_DIR = None # IU-Xray only\n",
|
| 430 |
-
"IU_LABELS_DIR = None # IU-Xray only\n",
|
| 431 |
-
"\n",
|
| 432 |
-
"if DATASET_NAME == 'MIMIC-CXR':\n",
|
| 433 |
-
" CXR_ROOT = find_split_parent(DATA_SRC)\n",
|
| 434 |
-
" print('MIMIC-CXR root:', CXR_ROOT)\n",
|
| 435 |
-
"\n",
|
| 436 |
-
" SPLIT_DIRS = {\n",
|
| 437 |
-
" 'train' : ('train', CXR_ROOT / 'train'),\n",
|
| 438 |
-
" 'validate': ('valid', CXR_ROOT / 'valid'),\n",
|
| 439 |
-
" 'test' : ('test', CXR_ROOT / 'test'),\n",
|
| 440 |
-
" }\n",
|
| 441 |
-
" for s, (sub, d) in SPLIT_DIRS.items():\n",
|
| 442 |
-
" assert d.exists(), f'Missing split dir: {d}'\n",
|
| 443 |
-
" print(f' {s:<9s} → {d}')\n",
|
| 444 |
-
"\n",
|
| 445 |
-
" for p in DATA_SRC.rglob('MIMIC-Ext-MIMIC-CXR-VQA'):\n",
|
| 446 |
-
" cand = p / 'dataset'\n",
|
| 447 |
-
" if cand.exists() and (cand / 'train.json').exists():\n",
|
| 448 |
-
" VQA_ROOT = cand\n",
|
| 449 |
-
" break\n",
|
| 450 |
-
" assert VQA_ROOT is not None, 'VQA dataset folder not found under ' + str(DATA_SRC)\n",
|
| 451 |
-
" print('VQA root:', VQA_ROOT)\n",
|
| 452 |
-
"\n",
|
| 453 |
-
"else: # IU-Xray\n",
|
| 454 |
-
" IU_IMAGES_DIR, IU_LABELS_DIR = find_iu_dirs(DATA_SRC)\n",
|
| 455 |
-
" assert IU_IMAGES_DIR is not None, f'IU images/ not found under {DATA_SRC}'\n",
|
| 456 |
-
" assert IU_LABELS_DIR is not None, f'IU labels/ (with *.xml) not found under {DATA_SRC}'\n",
|
| 457 |
-
" print('IU images dir:', IU_IMAGES_DIR, '→', len(list(IU_IMAGES_DIR.glob('*.png'))), 'PNGs')\n",
|
| 458 |
-
" print('IU labels dir:', IU_LABELS_DIR, '→', len(list(IU_LABELS_DIR.glob('*.xml'))), 'XMLs')"
|
| 459 |
-
],
|
| 460 |
"execution_count": null,
|
| 461 |
-
"outputs": [
|
| 462 |
-
{
|
| 463 |
-
"output_type": "stream",
|
| 464 |
-
"name": "stdout",
|
| 465 |
-
"text": [
|
| 466 |
-
"IU images dir: /content/data/IU-Xray/images → 1841 PNGs\n",
|
| 467 |
-
"IU labels dir: /content/data/IU-Xray/labels → 3955 XMLs\n"
|
| 468 |
-
]
|
| 469 |
-
}
|
| 470 |
-
],
|
| 471 |
"id": "cell-find-data-mimic"
|
| 472 |
},
|
| 473 |
{
|
|
@@ -475,7 +204,7 @@
|
|
| 475 |
"metadata": {
|
| 476 |
"id": "cell-json-md"
|
| 477 |
},
|
| 478 |
-
"source": "## 3. Build the unified instruction JSON\n\n- **MIMIC-
|
| 479 |
"id": "cell-json-md"
|
| 480 |
},
|
| 481 |
{
|
|
@@ -487,7 +216,7 @@
|
|
| 487 |
},
|
| 488 |
"outputId": "3b965273-ac82-41a7-8ead-895094e0a8b1"
|
| 489 |
},
|
| 490 |
-
"source": "# MIMIC-CXR: the unified JSON
|
| 491 |
"execution_count": null,
|
| 492 |
"outputs": [],
|
| 493 |
"id": "cell-parse"
|
|
@@ -501,7 +230,7 @@
|
|
| 501 |
},
|
| 502 |
"outputId": "91e11a3e-7b4e-4457-c32a-70a17fb2ef2a"
|
| 503 |
},
|
| 504 |
-
"source": "if DATASET_NAME
|
| 505 |
"execution_count": null,
|
| 506 |
"outputs": [],
|
| 507 |
"id": "cell-build-findings"
|
|
@@ -515,7 +244,7 @@
|
|
| 515 |
},
|
| 516 |
"outputId": "fc5fd7bc-cb80-49aa-da79-0097ed038b5d"
|
| 517 |
},
|
| 518 |
-
"source": "if DATASET_NAME
|
| 519 |
"execution_count": null,
|
| 520 |
"outputs": [],
|
| 521 |
"id": "cell-build-vqa"
|
|
@@ -529,7 +258,7 @@
|
|
| 529 |
},
|
| 530 |
"outputId": "b4d6589b-19be-4eb8-ba25-d533248439b9"
|
| 531 |
},
|
| 532 |
-
"source": "if DATASET_NAME
|
| 533 |
"execution_count": null,
|
| 534 |
"outputs": [],
|
| 535 |
"id": "cell-filter"
|
|
@@ -543,7 +272,7 @@
|
|
| 543 |
},
|
| 544 |
"outputId": "b6d95196-0383-4a50-8424-9ef95eb7b34e"
|
| 545 |
},
|
| 546 |
-
"source": "out_dir = PROJECT / 'data' / 'data_files'\nout_dir.mkdir(parents=True, exist_ok=True)\n\nif DATASET_NAME == 'MIMIC-CXR':\n # Base path only — the resolver appends __{report_mode}__{image_mode}\n # and builds it (PNU CheXpert + abnormality-guided VQA) via\n # data.mimic_cxr_builder the first time train.py / evaluate.py runs.\n mimic_json_path = out_dir / 'mimic_cxr_instruct_unified.json'\n print('MIMIC-CXR: instruct JSON auto-built by resolver →',\n f'{mimic_json_path.stem}__<report_mode>__<image_mode>.json')\nelse:\n # Build IU-Xray JSON here so the notebook shows a nice summary log\n # (the resolver would also do this lazily).\n from data.iu_xray_builder import build_iu_xray_instruct_json\n iu_json_path = out_dir / 'iu_xray_instruct.json'\n build_iu_xray_instruct_json(\n images_dir = str(IU_IMAGES_DIR),\n labels_dir = str(IU_LABELS_DIR),\n output_path = str(iu_json_path),\n train_ratio = 0.70, val_ratio = 0.15, test_ratio = 0.15, seed = 42,\n )
|
| 547 |
"execution_count": null,
|
| 548 |
"outputs": [],
|
| 549 |
"id": "cell-save-json"
|
|
@@ -553,7 +282,7 @@
|
|
| 553 |
"metadata": {
|
| 554 |
"id": "cell-cfg-md"
|
| 555 |
},
|
| 556 |
-
"source": "## 4. Patch configs for the Kaggle/Colab environment\n\n- Sets `data.dataset_name`, `report_mode`, `image_mode`.\n- **MIMIC-CXR**: sets `mimic_cxr_root`, the `instruct_json` base path, auto-discovers the **CheXpert CSV** (`mimic_chexpert_csv`) and the **VQA** dir (`mimic_vqa_root`), and turns on `mimic_auto_build`
|
| 557 |
"id": "cell-cfg-md"
|
| 558 |
},
|
| 559 |
{
|
|
@@ -565,7 +294,7 @@
|
|
| 565 |
},
|
| 566 |
"outputId": "80ddabe3-bc8b-4d14-94e2-26ff9e64970c"
|
| 567 |
},
|
| 568 |
-
"source": "from omegaconf import OmegaConf\n\ntrain_cfg = OmegaConf.load(PROJECT / 'configs' / 'train_config.yaml')\nmodel_cfg = OmegaConf.load(PROJECT / 'configs' / 'model_config.yaml')\n\n# ── dataset selector ──\ntrain_cfg.data.dataset_name = DATASET_NAME\n\n# ── training-scheme switches (thesis ablations) ──\n# report_mode: 'split' → 2 tasks (findings + impression separately)\n# 'merged' → 1 task (full report \"Findings: ...\\n\\nImpression: ...\")\n# 'split_cascade' → split, but impression's context = GT findings\n# image_mode : 'all_views_split' | 'frontal_only_split' | 'multi_image_merged'\ntrain_cfg.data.report_mode = 'split'\ntrain_cfg.data.image_mode = 'all_views_split'\ntrain_cfg.data.max_images_per_sample = 2 # only used in multi_image_merged\n\n# ── dataset-specific paths ──\nif DATASET_NAME == 'MIMIC-CXR':\n train_cfg.data.mimic_cxr_root = str(CXR_ROOT)\n # Base path; the resolver suffixes __{report_mode}__{image_mode} and\n # auto-builds (PNU CheXpert + VQA) via data.mimic_cxr_builder.\n train_cfg.data.instruct_json = str(mimic_json_path)\n train_cfg.data.mimic_auto_build = True\n\n # RaDialog / U-MultiClass abnormality guidance: locate the CheXpert\n # label CSV so the builder can bake the PNU structured_findings string.\n _cx = (sorted(DATA_SRC.rglob('*chexpert*.csv'))\n or sorted(DATA_SRC.rglob('*chexbert*.csv')))\n train_cfg.data.mimic_chexpert_csv = str(_cx[0]) if _cx else None\n print('CheXpert CSV :', train_cfg.data.mimic_chexpert_csv\n or 'NOT FOUND — PNU abnormality guidance DISABLED!')\n\n # VQA pairs ({train,valid,test}.json) → abnormality-guided VQA.\n train_cfg.data.mimic_vqa_root = str(VQA_ROOT) if VQA_ROOT is not None else None\n print('VQA root :', train_cfg.data.mimic_vqa_root or '(none — VQA skipped)')\nelse: # IU-Xray\n train_cfg.data.iu_xray.images_dir = str(IU_IMAGES_DIR)\n train_cfg.data.iu_xray.labels_dir = str(IU_LABELS_DIR)\n train_cfg.data.iu_xray.instruct_json = str(iu_json_path)\n train_cfg.data.iu_xray.auto_build = 
True\n\ntrain_cfg.data.train_split = 'train'\ntrain_cfg.data.val_split = 'validate'\ntrain_cfg.data.test_split = 'test'\n\n# ── checkpoint root (Persistence keeps /content/ckpt/) ──\nCKPT_ROOT = WORK / 'ckpt'\ntrain_cfg.training.output_root = str(CKPT_ROOT)\n\n# ── batching ──\ntrain_cfg.training.per_device_train_batch_size = 4\ntrain_cfg.training.per_device_eval_batch_size = 4\ntrain_cfg.training.gradient_accumulation_steps = 4\ntrain_cfg.training.fp16 = False\ntrain_cfg.training.bf16 = True\ntrain_cfg.training.dataloader_num_workers = 8\n\ntrain_cfg.stage2.num_epoch = 5\n\n# ── wandb off ──\ntrain_cfg.wandb.enabled = False\n\n# ── HuggingFace Hub run tracking ──\ntrain_cfg.hf_hub.enabled = True\ntrain_cfg.hf_hub.repo_id = 'hieu3636/cxr-vlm-runs' # <<< EDIT ME\ntrain_cfg.hf_hub.token_env = 'HF_TOKEN'\ntrain_cfg.hf_hub.private = True\ntrain_cfg.hf_hub.run_state_file = str(CKPT_ROOT / 'run_id.txt')\n\n# ── 4-bit QLoRA ──\nmodel_cfg.llm.load_in_8bit = False\nmodel_cfg.llm.load_in_4bit = True\n# Oracle PNU path does NOT use the CheXpert classifier module (labels come\n# from the GT csv baked into the prompt). Keep it disabled until you wire
|
| 569 |
"execution_count": null,
|
| 570 |
"outputs": [],
|
| 571 |
"id": "cell-cfg"
|
|
@@ -591,29 +320,9 @@
|
|
| 591 |
},
|
| 592 |
"outputId": "8a2ce693-94fc-4425-f62c-679614d6dab5"
|
| 593 |
},
|
| 594 |
-
"source":
|
| 595 |
-
"# HF_TOKEN setup. On non-Kaggle platforms it's already set inside cell-paths\n",
|
| 596 |
-
"# (needed to pull code + data). Here we only handle the Kaggle path.\n",
|
| 597 |
-
"try:\n",
|
| 598 |
-
" if PLATFORM == 'kaggle':\n",
|
| 599 |
-
" from kaggle_secrets import UserSecretsClient\n",
|
| 600 |
-
" os.environ['HF_TOKEN'] = UserSecretsClient().get_secret('HF_TOKEN')\n",
|
| 601 |
-
" # Other platforms: already populated in cell-paths\n",
|
| 602 |
-
" assert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing'\n",
|
| 603 |
-
" print('HF_TOKEN loaded ✓')\n",
|
| 604 |
-
"except Exception as e:\n",
|
| 605 |
-
" print('No HF_TOKEN — Vicuna-7B download may rate-limit and hub upload will be disabled:', e)\n"
|
| 606 |
-
],
|
| 607 |
"execution_count": null,
|
| 608 |
-
"outputs": [
|
| 609 |
-
{
|
| 610 |
-
"output_type": "stream",
|
| 611 |
-
"name": "stdout",
|
| 612 |
-
"text": [
|
| 613 |
-
"HF_TOKEN loaded ✓\n"
|
| 614 |
-
]
|
| 615 |
-
}
|
| 616 |
-
],
|
| 617 |
"id": "cell-hf-token"
|
| 618 |
},
|
| 619 |
{
|
|
@@ -668,24 +377,7 @@
|
|
| 668 |
"metadata": {
|
| 669 |
"id": "cell-mode-md"
|
| 670 |
},
|
| 671 |
-
"source":
|
| 672 |
-
"## 5b. Resume controller\n",
|
| 673 |
-
"\n",
|
| 674 |
-
"Single switch. No more \"which stage\" — `train.py` auto-detects which stage\n",
|
| 675 |
-
"to continue from by inspecting checkpoints on disk.\n",
|
| 676 |
-
"\n",
|
| 677 |
-
"| MODE | What happens |\n",
|
| 678 |
-
"|---------------------|--------------|\n",
|
| 679 |
-
"| `'fresh'` | Allocate a brand-new `{DATASET}_run_N+1` folder. Train both stages from scratch. |\n",
|
| 680 |
-
"| `'resume'` | Reuse latest matching `{DATASET}_run_N` (or `EXPLICIT_RUN_ID`). Auto-detect: stage 1 mid-checkpoint, stage 1 done → stage 2 fresh, stage 2 mid-checkpoint, or both done. |\n",
|
| 681 |
-
"\n",
|
| 682 |
-
"`EXPLICIT_RUN_ID` is optional (set to `None` to auto-pick the latest run on\n",
|
| 683 |
-
"disk or HF Hub that matches the current dataset prefix).\n",
|
| 684 |
-
"\n",
|
| 685 |
-
"When `MODE='resume'` on a fresh VM the train cell will pull the previous\n",
|
| 686 |
-
"run's checkpoints from HF before training. The `--mode resume` flag in\n",
|
| 687 |
-
"`train.py` does the auto-detect — no further action needed in the notebook."
|
| 688 |
-
],
|
| 689 |
"id": "cell-resume-md"
|
| 690 |
},
|
| 691 |
{
|
|
|
|
| 5 |
"metadata": {
|
| 6 |
"id": "cell-0"
|
| 7 |
},
|
| 8 |
+
"source": "# CXR-VLM — Kaggle / Colab Training Notebook (consolidated)\n\nTrains the 2-stage CXR-VLM (Vicuna-7B + BioViL-T fallback to timm ViT + LoRA) on a Kaggle **T4** or Colab **A100 / L4** GPU.\n\nSupports **three datasets**, selected by `DATASET_NAME` in section 0:\n- **`MIMIC-CXR_resized`** *(default)* — filtered + resized subset of MIMIC-CXR, distributed as tar shards. Manifest-driven (`manifest_{train,val,test}.csv` + `vqa/*.json`, reports inside the tars). 3 tasks (findings, impression, VQA). Lighter than full MIMIC, balanced val/test.\n- **`MIMIC-CXR`** — full pre-split MIMIC-CXR (3 tasks). Heavy; needs the original `train/valid/test` tree + chexpert.csv + VQA pairs.\n- **`IU-Xray`** — 2 tasks only (findings, impression). ~7.5k images, fastest sanity run.\n\n### Source-of-truth\n\nAll platforms (kaggle / colab / lightning / gcp / local) pull code + data from **HuggingFace Hub** — no Kaggle dataset attach is needed anymore. Just:\n\n| Repo | Contents |\n|---|---|\n| `<HF_USER>/cxr-vlm-code` | project source (configs/, data/*.py, model/, training/, evaluation/, utils/, requirements.txt) |\n| `<HF_USER>/cxr-vlm-data` | tar shards under `MIMIC-CXR_resized/` **or** `MIMIC-CXR.zip` **or** `IU-Xray.zip` |\n\n### Settings\n\n- **Kaggle**: accelerator **T4 x2** (only GPU 0 used); Persistence: **Variables and Files**; Internet: **On**\n- **Colab**: any GPU (A100 recommended); enable Files (persisted under `/content`)\n\n### Secrets\n\n- `HF_TOKEN` — HuggingFace token with **write** access to the runs repo (`hf_hub.repo_id` in config). Read from Kaggle Secrets (Add-ons → Secrets) or Colab userdata (🔑 sidebar).",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"id": "cell-0"
|
| 10 |
},
|
| 11 |
{
|
|
|
|
| 29 |
},
|
| 30 |
"outputId": "d6e7ebbd-4f1b-483b-f20f-0df3997a60b7"
|
| 31 |
},
|
| 32 |
+
"source": "# ── Platform + dataset selectors ──────────────────────────────────\n# PLATFORM drives storage paths and how secrets are read.\n# Supported: 'kaggle' | 'colab' | 'lightning' | 'gcp' | 'local'\nPLATFORM = 'colab'\nDATASET_NAME = 'MIMIC-CXR_resized' # 'MIMIC-CXR' | 'MIMIC-CXR_resized' | 'IU-Xray'\n\nassert PLATFORM in ('kaggle', 'colab', 'lightning', 'gcp', 'local')\nassert DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized', 'IU-Xray')\nprint(f'PLATFORM = {PLATFORM} | DATASET_NAME = {DATASET_NAME}')",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
"execution_count": null,
|
| 34 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
"id": "cell-select"
|
| 36 |
},
|
| 37 |
{
|
|
|
|
| 76 |
},
|
| 77 |
"outputId": "f6195a48-56b2-4052-8367-c8ec14c48a05"
|
| 78 |
},
|
| 79 |
+
"source": "# ── Per-platform storage + source-of-truth ─────────────────────────\n# All platforms (kaggle / colab / lightning / gcp / local) pull code +\n# data from HF Hub. The only platform-specific bit is:\n# * WORK : where to land outputs (persisted dirs differ per host)\n# * TOKEN : how HF_TOKEN reaches os.environ (secrets API differs)\n#\n# Required HF repos:\n# <HF_USER>/cxr-vlm-code — project source (flat folder)\n# <HF_USER>/cxr-vlm-data — per-dataset payloads:\n# MIMIC-CXR_resized/ (tar shards + manifests + vqa)\n# MIMIC-CXR.zip (single zip)\n# IU-Xray.zip (single zip)\n\nHF_USER = 'hieu3636' # <<< EDIT ME\n\n# ── 1) WORK dir + HF_TOKEN bootstrap (platform-specific) ───────────\nif PLATFORM == 'kaggle':\n from kaggle_secrets import UserSecretsClient\n os.environ['HF_TOKEN'] = UserSecretsClient().get_secret('HF_TOKEN')\n WORK = Path('/kaggle/working')\nelif PLATFORM == 'colab':\n from google.colab import userdata\n os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')\n WORK = Path('/content')\nelif PLATFORM == 'lightning':\n WORK = Path('/teamspace/studios/this_studio')\nelif PLATFORM == 'gcp':\n WORK = Path('/workspace')\nelse: # 'local'\n WORK = Path.home() / 'cxr-vlm-work'\nWORK.mkdir(parents=True, exist_ok=True)\n\nassert os.environ.get('HF_TOKEN'), \\\n 'HF_TOKEN missing — set it via the platform secrets UI before re-running.'\n\ntry:\n from huggingface_hub import snapshot_download, hf_hub_download, HfApi\nexcept ImportError:\n !pip install -q huggingface_hub\n from huggingface_hub import snapshot_download, hf_hub_download, HfApi\n\n# ── 2) Code: flat folder, few hundred files → snapshot_download ──\nprint(f'Pulling code from HF (user: {HF_USER}) …')\nCODE_SRC = Path(snapshot_download(\n repo_id = f'{HF_USER}/cxr-vlm-code',\n repo_type = 'model',\n token = os.environ['HF_TOKEN'],\n local_dir = str(WORK / 'cxr-vlm-code'),\n))\n\n# ── 3) Data: layout depends on DATASET_NAME ──\nDATA_SRC = WORK / 'data'\nDATA_SRC.mkdir(parents=True, exist_ok=True)\n\nif 
DATASET_NAME == 'MIMIC-CXR_resized':\n # Tar-sharded payload. Reports + images live INSIDE the tars under\n # `files/pXX/pXXXX/{sYYYY/*.jpg, sYYYY.txt}` so extracting all shards\n # gives one unified tree. We download manifests + vqa + SHARDS.txt\n # first (small, ~tens of MB), then each *.tar one at a time →\n # extract → delete (saves disk).\n # Final on-disk layout:\n # DATA_SRC/MIMIC-CXR_resized/\n # ├── manifest_{train,val,test}.csv\n # ├── vqa/ {vqa.json, vqa_val.json, vqa_test.json}\n # ├── SHARDS.txt + _manifest.json\n # └── files/pXX/pXXXX/ ← from tars\n # ├── sYYYY.txt (report)\n # └── sYYYY/<dicom>.jpg (images)\n import tarfile\n mr_dir = DATA_SRC / 'MIMIC-CXR_resized'\n mr_dir.mkdir(parents=True, exist_ok=True)\n files_dir = mr_dir / 'files'\n\n # Marker: if files/ already has shards extracted AND manifests exist,\n # skip everything. Lets the cell be re-run safely.\n manifests_present = all(\n (mr_dir / f).is_file() for f in ('manifest_train.csv', 'manifest_val.csv', 'manifest_test.csv')\n )\n if manifests_present and files_dir.is_dir() and any(files_dir.glob('p*')):\n print(f'{mr_dir} already populated — skipping download.')\n else:\n api = HfApi(token=os.environ['HF_TOKEN'])\n all_files = api.list_repo_files(\n repo_id=f'{HF_USER}/cxr-vlm-data', repo_type='dataset')\n mr_files = [f for f in all_files if f.startswith('MIMIC-CXR_resized/')]\n tar_files = sorted(f for f in mr_files if f.endswith('.tar'))\n meta_files = [f for f in mr_files if not f.endswith('.tar')]\n print(f'MIMIC-CXR_resized on HF: {len(tar_files)} tar shards + {len(meta_files)} metadata files')\n\n # 3a) Pull metadata (manifests, vqa, SHARDS.txt, _manifest.json)\n # in one snapshot (small; few MB).\n print(f' downloading manifests + vqa + SHARDS.txt …')\n snapshot_download(\n repo_id = f'{HF_USER}/cxr-vlm-data',\n repo_type = 'dataset',\n allow_patterns = ['MIMIC-CXR_resized/*.csv',\n 'MIMIC-CXR_resized/*.json',\n 'MIMIC-CXR_resized/*.txt',\n 'MIMIC-CXR_resized/vqa/**'],\n token = 
os.environ['HF_TOKEN'],\n local_dir = str(DATA_SRC),\n )\n\n # 3b) Sequentially fetch + extract + delete each image tar to\n # minimise peak disk usage (each shard ~2 GB). Reports come\n # out alongside images — both land under mr_dir/files/.\n print(f' downloading + extracting {len(tar_files)} tar shards …')\n for i, tf in enumerate(tar_files, 1):\n print(f' [{i}/{len(tar_files)}] {tf}')\n tar_path = Path(hf_hub_download(\n repo_id=f'{HF_USER}/cxr-vlm-data', repo_type='dataset',\n filename=tf, token=os.environ['HF_TOKEN'],\n local_dir=str(DATA_SRC),\n ))\n with tarfile.open(tar_path) as t:\n # Extract into mr_dir so member paths like\n # \"files/p10/.../*.jpg\" + \"files/p10/.../*.txt\" land at\n # mr_dir/files/p10/…\n t.extractall(mr_dir)\n tar_path.unlink(missing_ok=True)\n print(f' done. {mr_dir} ready.')\n\nelse:\n # MIMIC-CXR / IU-Xray: single zip per dataset (legacy path)\n import zipfile\n zip_name = f'{DATASET_NAME}.zip' # 'IU-Xray.zip' | 'MIMIC-CXR.zip'\n marker = DATA_SRC / DATASET_NAME # DATA_SRC/IU-Xray after unzip\n\n if not marker.exists():\n print(f'Pulling {zip_name} from HF …')\n zpath = hf_hub_download(\n repo_id = f'{HF_USER}/cxr-vlm-data',\n filename = zip_name,\n repo_type = 'dataset',\n token = os.environ['HF_TOKEN'],\n local_dir = str(DATA_SRC),\n )\n print(f' unzipping → {DATA_SRC}')\n with zipfile.ZipFile(zpath) as zf:\n zf.extractall(DATA_SRC)\n try:\n os.remove(zpath) # free disk\n except OSError:\n pass\n else:\n print(f'{marker} already present — skipping download.')\n\nprint(f'Contents of {DATA_SRC}: {sorted(os.listdir(DATA_SRC))}')\n\n# ── Common: copy code into writable PROJECT dir ────────────────────\nPROJECT = WORK / 'cxr_vlm'\nif CODE_SRC.resolve() != PROJECT.resolve() and not PROJECT.exists():\n shutil.copytree(CODE_SRC, PROJECT)\n\nos.chdir(PROJECT)\nsys.path.insert(0, str(PROJECT))\nprint('PLATFORM :', PLATFORM)\nprint('CODE_SRC :', CODE_SRC)\nprint('DATA_SRC :', DATA_SRC)\nprint('PROJECT :', PROJECT)\nprint('WORK :', WORK)",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
"execution_count": null,
|
| 81 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
"id": "cell-paths"
|
| 83 |
},
|
| 84 |
{
|
|
|
|
| 182 |
"metadata": {
|
| 183 |
"id": "cell-data-md"
|
| 184 |
},
|
| 185 |
+
"source": "## 2. Locate data\n\nAll datasets live under a single `cxr-vlm-data` slug (Kaggle) or HF repo (others). Expected layouts:\n\n**MIMIC-CXR_resized** *(default)*:\n```\nDATA_SRC/\n└── MIMIC-CXR_resized/\n ├── manifest_train.csv ← drives split + chex_* + has_vqa\n ├── manifest_val.csv\n ├── manifest_test.csv\n ├── vqa/\n │ ├── vqa.json ← train VQA pairs\n │ ├── vqa_val.json\n │ └── vqa_test.json\n ├── files/ ← extracted from tar shards\n │ └── pXX/pXXXXXXXX/\n │ ├── sYYYYYYYY.txt ← report (alongside, at patient dir)\n │ └── sYYYYYYYY/<dicom>.jpg\n ├── SHARDS.txt\n └── _manifest.json\n```\n\n**MIMIC-CXR** (legacy pre-split):\n```\nDATA_SRC/\n├── MIMIC-CXR/{train,valid,test}/p10/pXXXXXX/sYYYYY/*.jpg + sYYYYY.txt\n└── .../MIMIC-Ext-MIMIC-CXR-VQA/dataset/{train,valid,test}.json\n```\n\n**IU-Xray**:\n```\nDATA_SRC/\n└── IU-Xray/\n ├── images/ # CXR*_IM-*-*.png (~7.5k files)\n └── labels/ # {1..3999}.xml (~3.9k files, flat — no ecgen-radiology subfolder)\n```",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
"id": "cell-data-md"
|
| 187 |
},
|
| 188 |
{
|
|
|
|
| 194 |
},
|
| 195 |
"outputId": "53c15833-f9bb-4457-95d6-3fa83f4dc909"
|
| 196 |
},
|
| 197 |
+
"source": "def find_split_parent(root: Path) -> Path:\n for cand in [root, root / 'MIMIC-CXR', root / 'data' / 'MIMIC-CXR']:\n if (cand / 'train').exists() and (cand / 'valid').exists() and (cand / 'test').exists():\n return cand\n for p in root.rglob('train'):\n if p.is_dir() and (p.parent / 'valid').exists() and (p.parent / 'test').exists():\n return p.parent\n raise FileNotFoundError('Could not find train/ valid/ test/ under ' + str(root))\n\n\ndef find_mimic_resized_root(root: Path) -> Path:\n \"\"\"Find the MIMIC-CXR_resized payload — folder with manifest_*.csv + files/.\"\"\"\n for cand in [root / 'MIMIC-CXR_resized', root, *root.rglob('MIMIC-CXR_resized')]:\n if (cand / 'manifest_train.csv').is_file():\n return cand\n raise FileNotFoundError(\n f'Could not find MIMIC-CXR_resized payload under {root}. '\n f'Expected manifest_train.csv (alongside manifest_val.csv / manifest_test.csv).'\n )\n\n\ndef find_iu_dirs(root: Path):\n \"\"\"Locate IU-Xray `images/` and `labels/` (flat XMLs) under `root`.\n\n Resolution order:\n 1. `{root}/IU-Xray/{images,labels}` — canonical layout.\n 2. Any nested `IU-Xray` folder that contains both.\n 3. 
Fallback: any folder containing CXR*.png (images) and\n any folder containing *.xml — whichever comes first.\n\n The labels subfolder is treated as a flat directory of XMLs (we no\n longer require the legacy `ecgen-radiology/` subfolder).\n \"\"\"\n # Canonical + nested\n for cand in [root / 'IU-Xray', *root.rglob('IU-Xray')]:\n if not cand.is_dir():\n continue\n imgs = cand / 'images'\n lbls = cand / 'labels'\n if imgs.is_dir() and lbls.is_dir() and any(lbls.glob('*.xml')):\n return imgs, lbls\n # Legacy: labels/ecgen-radiology/*.xml\n legacy = lbls / 'ecgen-radiology'\n if imgs.is_dir() and legacy.is_dir() and any(legacy.glob('*.xml')):\n return imgs, legacy\n\n # Fallback: any images/ with CXR*.png + any folder with XML\n img_dir = lbl_dir = None\n for cand in [root / 'images', *root.rglob('images')]:\n if cand.is_dir() and any(cand.glob('CXR*.png')):\n img_dir = cand; break\n for cand in [root / 'labels', *root.rglob('labels')]:\n if cand.is_dir() and any(cand.glob('*.xml')):\n lbl_dir = cand; break\n if lbl_dir is None:\n # very last resort — any ecgen-radiology folder with XMLs\n for cand in root.rglob('ecgen-radiology'):\n if cand.is_dir() and any(cand.glob('*.xml')):\n lbl_dir = cand; break\n return img_dir, lbl_dir\n\n\n# Filled in below depending on DATASET_NAME\nCXR_ROOT = None # MIMIC-CXR root (with train/valid/test subdirs)\nSPLIT_DIRS = None # MIMIC only\nVQA_ROOT = None # MIMIC only\nMR_ROOT = None # MIMIC-CXR_resized root (manifests + files/ + vqa/)\nIU_IMAGES_DIR = None # IU-Xray only\nIU_LABELS_DIR = None # IU-Xray only\n\nif DATASET_NAME == 'MIMIC-CXR':\n CXR_ROOT = find_split_parent(DATA_SRC)\n print('MIMIC-CXR root:', CXR_ROOT)\n\n SPLIT_DIRS = {\n 'train' : ('train', CXR_ROOT / 'train'),\n 'validate': ('valid', CXR_ROOT / 'valid'),\n 'test' : ('test', CXR_ROOT / 'test'),\n }\n for s, (sub, d) in SPLIT_DIRS.items():\n assert d.exists(), f'Missing split dir: {d}'\n print(f' {s:<9s} → {d}')\n\n for p in 
DATA_SRC.rglob('MIMIC-Ext-MIMIC-CXR-VQA'):\n cand = p / 'dataset'\n if cand.exists() and (cand / 'train.json').exists():\n VQA_ROOT = cand\n break\n assert VQA_ROOT is not None, 'VQA dataset folder not found under ' + str(DATA_SRC)\n print('VQA root:', VQA_ROOT)\n\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n MR_ROOT = find_mimic_resized_root(DATA_SRC)\n print('MIMIC-CXR_resized root:', MR_ROOT)\n # Sanity: 3 manifest CSVs, files/ (images+reports), vqa/\n for cf in ('manifest_train.csv', 'manifest_val.csv', 'manifest_test.csv'):\n f = MR_ROOT / cf\n print(f' {cf}: {\"OK\" if f.is_file() else \"MISSING\"}')\n for sub in ('files', 'vqa'):\n d = MR_ROOT / sub\n print(f' {sub:<5s}: {\"OK\" if d.is_dir() else \"MISSING\"} ({d})')\n # Spot-check one report (.txt) sits at patient-dir level inside files/\n txt_hits = list((MR_ROOT / 'files').glob('p*/p*/s*.txt')) if (MR_ROOT / 'files').is_dir() else []\n print(f' reports inside files/ : {len(txt_hits):,} found (sample: {txt_hits[0] if txt_hits else \"—\"})')\n\nelse: # IU-Xray\n IU_IMAGES_DIR, IU_LABELS_DIR = find_iu_dirs(DATA_SRC)\n assert IU_IMAGES_DIR is not None, f'IU images/ not found under {DATA_SRC}'\n assert IU_LABELS_DIR is not None, f'IU labels/ (with *.xml) not found under {DATA_SRC}'\n print('IU images dir:', IU_IMAGES_DIR, '→', len(list(IU_IMAGES_DIR.glob('*.png'))), 'PNGs')\n print('IU labels dir:', IU_LABELS_DIR, '→', len(list(IU_LABELS_DIR.glob('*.xml'))), 'XMLs')",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
"execution_count": null,
|
| 199 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
"id": "cell-find-data-mimic"
|
| 201 |
},
|
| 202 |
{
|
|
|
|
| 204 |
"metadata": {
|
| 205 |
"id": "cell-json-md"
|
| 206 |
},
|
| 207 |
+
"source": "## 3. Build the unified instruction JSON\n\n- **MIMIC-CXR_resized**: auto-built by `utils.dataset_resolver` → `data.mimic_cxr_resized_builder` the first time `train.py` / `evaluate.py` runs. It reads `manifest_{train,val,test}.csv` (which carry the split label, image/report relpath, and the 14 CheXpert `chex_*` columns → PNU `Positive/Negative/Uncertain Abnormalities` string), parses findings + impression from each report, and attaches abnormality-guided VQA from `vqa/{vqa,vqa_val,vqa_test}.json`. The inline cells below are **no-ops** for this dataset.\n- **MIMIC-CXR** (full, pre-split): auto-built by `data.mimic_cxr_builder` (CheXpert.csv-based). Inline cells are no-ops here too.\n- **IU-Xray**: built by `data.iu_xray_builder` in the cell below (resolver would also do it lazily; we build here just to get a summary log).\n\nAll three paths produce the same JSON schema (`image_path`, `task`, `target`, `question`, `structured_findings`, `split`, …) so `CXRInstructDataset` loads them unchanged.",
|
| 208 |
"id": "cell-json-md"
|
| 209 |
},
|
| 210 |
{
|
|
|
|
| 216 |
},
|
| 217 |
"outputId": "3b965273-ac82-41a7-8ead-895094e0a8b1"
|
| 218 |
},
|
| 219 |
+
"source": "# MIMIC-CXR and MIMIC-CXR_resized: the unified JSON is built lazily by\n# utils.dataset_resolver (→ data.mimic_cxr_builder for MIMIC-CXR, or\n# → data.mimic_cxr_resized_builder for MIMIC-CXR_resized) when train.py /\n# evaluate.py first run. The old inline parse/build cells are no-ops for\n# both; IU-Xray still gets a friendly inline build below for the log.\nif DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: JSON build handled by the resolver — skipping inline parse cell.')\nelse:\n print('IU-Xray: skipping MIMIC indexing cell.')",
|
| 220 |
"execution_count": null,
|
| 221 |
"outputs": [],
|
| 222 |
"id": "cell-parse"
|
|
|
|
| 230 |
},
|
| 231 |
"outputId": "91e11a3e-7b4e-4457-c32a-70a17fb2ef2a"
|
| 232 |
},
|
| 233 |
+
"source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: findings/impression built by the resolver — skipping.')\nelse:\n samples = None\n print('IU-Xray: skipping MIMIC report parsing cell.')",
|
| 234 |
"execution_count": null,
|
| 235 |
"outputs": [],
|
| 236 |
"id": "cell-build-findings"
|
|
|
|
| 244 |
},
|
| 245 |
"outputId": "fc5fd7bc-cb80-49aa-da79-0097ed038b5d"
|
| 246 |
},
|
| 247 |
+
"source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: VQA attached by the resolver builder '\n '(with the same PNU CheXpert context) — skipping inline VQA cell.')\nelse:\n print('IU-Xray: skipping MIMIC VQA cell.')",
|
| 248 |
"execution_count": null,
|
| 249 |
"outputs": [],
|
| 250 |
"id": "cell-build-vqa"
|
|
|
|
| 258 |
},
|
| 259 |
"outputId": "b4d6589b-19be-4eb8-ba25-d533248439b9"
|
| 260 |
},
|
| 261 |
+
"source": "if DATASET_NAME in ('MIMIC-CXR', 'MIMIC-CXR_resized'):\n print(f'{DATASET_NAME}: image-existence filtering handled inside the resolver '\n 'builder — skipping.')\nelse:\n print('IU-Xray: skipping.')",
|
| 262 |
"execution_count": null,
|
| 263 |
"outputs": [],
|
| 264 |
"id": "cell-filter"
|
|
|
|
| 272 |
},
|
| 273 |
"outputId": "b6d95196-0383-4a50-8424-9ef95eb7b34e"
|
| 274 |
},
|
| 275 |
+
"source": "out_dir = PROJECT / 'data' / 'data_files'\nout_dir.mkdir(parents=True, exist_ok=True)\n\nif DATASET_NAME == 'MIMIC-CXR':\n # Base path only — the resolver appends __{report_mode}__{image_mode}\n # and builds it (PNU CheXpert + abnormality-guided VQA) via\n # data.mimic_cxr_builder the first time train.py / evaluate.py runs.\n mimic_json_path = out_dir / 'mimic_cxr_instruct_unified.json'\n print('MIMIC-CXR: instruct JSON auto-built by resolver →',\n f'{mimic_json_path.stem}__<report_mode>__<image_mode>.json')\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n # Same lazy-build story but via data.mimic_cxr_resized_builder.\n mr_json_path = out_dir / 'mimic_cxr_resized_instruct.json'\n print('MIMIC-CXR_resized: instruct JSON auto-built by resolver →',\n f'{mr_json_path.stem}__<report_mode>__<image_mode>.json')\nelse:\n # Build IU-Xray JSON here so the notebook shows a nice summary log\n # (the resolver would also do this lazily).\n from data.iu_xray_builder import build_iu_xray_instruct_json\n iu_json_path = out_dir / 'iu_xray_instruct.json'\n build_iu_xray_instruct_json(\n images_dir = str(IU_IMAGES_DIR),\n labels_dir = str(IU_LABELS_DIR),\n output_path = str(iu_json_path),\n train_ratio = 0.70, val_ratio = 0.15, test_ratio = 0.15, seed = 42,\n )",
|
| 276 |
"execution_count": null,
|
| 277 |
"outputs": [],
|
| 278 |
"id": "cell-save-json"
|
|
|
|
| 282 |
"metadata": {
|
| 283 |
"id": "cell-cfg-md"
|
| 284 |
},
|
| 285 |
+
"source": "## 4. Patch configs for the Kaggle/Colab environment\n\n- Sets `data.dataset_name`, `report_mode`, `image_mode`.\n- **MIMIC-CXR_resized** *(default)*: sets `mimic_cxr_resized.root` (the manifest+files+vqa+reports payload). `manifest_dir` / `vqa_dir` / `reports_root` are left null so the resolver auto-detects from `{root}/`, `{root}/vqa/`, `{root}/reports/`. The builder reads `chex_*` columns directly — no separate CheXpert CSV is needed.\n- **MIMIC-CXR**: sets `mimic_cxr_root`, the `instruct_json` base path, auto-discovers the **CheXpert CSV** (`mimic_chexpert_csv`) and the **VQA** dir (`mimic_vqa_root`), and turns on `mimic_auto_build`.\n- **IU-Xray**: points `iu_xray.images_dir/labels_dir/instruct_json` at the mount.\n- `tasks.*.weight` is left at the config defaults (findings 0.30 / impression 0.20 / vqa 0.50). `WeightedRandomSampler` in `CXRTrainer._get_train_sampler` enforces the mix at train time — see `data/dataset.py:get_per_sample_weights`.\n- `training.output_root` under `WORK/ckpt` (Persistence keeps it).\n- **4-bit QLoRA**; WandB off; HF hub on — edit `hf_hub.repo_id` to your repo.\n\n⚠️ MIMIC-CXR (full) path: if \"CheXpert CSV: NOT FOUND\" prints, add `mimic-cxr-2.0.0-chexpert.csv` to the data so PNU abnormality guidance is active (training still runs without it, just no PNU). For MIMIC-CXR_resized this is N/A — labels are baked into the manifest.",
|
| 286 |
"id": "cell-cfg-md"
|
| 287 |
},
|
| 288 |
{
|
|
|
|
| 294 |
},
|
| 295 |
"outputId": "80ddabe3-bc8b-4d14-94e2-26ff9e64970c"
|
| 296 |
},
|
| 297 |
+
"source": "from omegaconf import OmegaConf\n\ntrain_cfg = OmegaConf.load(PROJECT / 'configs' / 'train_config.yaml')\nmodel_cfg = OmegaConf.load(PROJECT / 'configs' / 'model_config.yaml')\n\n# ── dataset selector ──\ntrain_cfg.data.dataset_name = DATASET_NAME\n\n# ── training-scheme switches (thesis ablations) ──\n# report_mode: 'split' → 2 tasks (findings + impression separately)\n# 'merged' → 1 task (full report \"Findings: ...\\n\\nImpression: ...\")\n# 'split_cascade' → split, but impression's context = GT findings\n# image_mode : 'all_views_split' | 'frontal_only_split' | 'multi_image_merged'\ntrain_cfg.data.report_mode = 'split'\ntrain_cfg.data.image_mode = 'all_views_split'\ntrain_cfg.data.max_images_per_sample = 2 # only used in multi_image_merged\n\n# ── dataset-specific paths ──\nif DATASET_NAME == 'MIMIC-CXR':\n train_cfg.data.mimic_cxr_root = str(CXR_ROOT)\n # Base path; the resolver suffixes __{report_mode}__{image_mode} and\n # auto-builds (PNU CheXpert + VQA) via data.mimic_cxr_builder.\n train_cfg.data.instruct_json = str(mimic_json_path)\n train_cfg.data.mimic_auto_build = True\n\n # RaDialog / U-MultiClass abnormality guidance: locate the CheXpert\n # label CSV so the builder can bake the PNU structured_findings string.\n _cx = (sorted(DATA_SRC.rglob('*chexpert*.csv'))\n or sorted(DATA_SRC.rglob('*chexbert*.csv')))\n train_cfg.data.mimic_chexpert_csv = str(_cx[0]) if _cx else None\n print('CheXpert CSV :', train_cfg.data.mimic_chexpert_csv\n or 'NOT FOUND — PNU abnormality guidance DISABLED!')\n\n # VQA pairs ({train,valid,test}.json) → abnormality-guided VQA.\n train_cfg.data.mimic_vqa_root = str(VQA_ROOT) if VQA_ROOT is not None else None\n print('VQA root :', train_cfg.data.mimic_vqa_root or '(none — VQA skipped)')\n\nelif DATASET_NAME == 'MIMIC-CXR_resized':\n # The MIMIC-CXR_resized builder is manifest-driven: it reads\n # `manifest_{train,val,test}.csv` for split + the 14 chex_* labels\n # (PNU bucketed directly from the CSV, no separate 
chexpert.csv needed),\n # uses `report_relpath` from the manifest to find each .txt, and pulls\n # VQA from `vqa/{vqa,vqa_val,vqa_test}.json`.\n train_cfg.data.mimic_cxr_resized.root = str(MR_ROOT)\n train_cfg.data.mimic_cxr_resized.manifest_dir = None # null → defaults to root\n train_cfg.data.mimic_cxr_resized.vqa_dir = None # null → {root}/vqa\n train_cfg.data.mimic_cxr_resized.reports_root = None # null → auto-probe {root} then {root}/reports\n train_cfg.data.mimic_cxr_resized.instruct_json = str(mr_json_path)\n train_cfg.data.mimic_cxr_resized.auto_build = True\n\nelse: # IU-Xray\n train_cfg.data.iu_xray.images_dir = str(IU_IMAGES_DIR)\n train_cfg.data.iu_xray.labels_dir = str(IU_LABELS_DIR)\n train_cfg.data.iu_xray.instruct_json = str(iu_json_path)\n train_cfg.data.iu_xray.auto_build = True\n\ntrain_cfg.data.train_split = 'train'\ntrain_cfg.data.val_split = 'validate'\ntrain_cfg.data.test_split = 'test'\n\n# ── checkpoint root (Persistence keeps /content/ckpt/) ──\nCKPT_ROOT = WORK / 'ckpt'\ntrain_cfg.training.output_root = str(CKPT_ROOT)\n\n# ── batching ──\ntrain_cfg.training.per_device_train_batch_size = 4\ntrain_cfg.training.per_device_eval_batch_size = 4\ntrain_cfg.training.gradient_accumulation_steps = 4\ntrain_cfg.training.fp16 = False\ntrain_cfg.training.bf16 = True\ntrain_cfg.training.dataloader_num_workers = 8\n\ntrain_cfg.stage2.num_epoch = 5\n\n# ── task weights (sampling ratio enforced by WeightedRandomSampler) ──\n# Defaults in train_config.yaml: 0.30 / 0.20 / 0.50 (RRG ≈ VQA, impression\n# lower because in split_cascade mode it sees GT findings as input).\n# Resolver auto-renormalizes and drops vqa for IU-Xray. 
Override here only\n# if you want to experiment per-run, e.g.:\n# train_cfg.tasks.findings_generation.weight = 0.30\n# train_cfg.tasks.impression_generation.weight = 0.20\n# train_cfg.tasks.vqa.weight = 0.50\n\n# ── wandb off ──\ntrain_cfg.wandb.enabled = False\n\n# ── HuggingFace Hub run tracking ──\ntrain_cfg.hf_hub.enabled = True\ntrain_cfg.hf_hub.repo_id = 'hieu3636/cxr-vlm-runs' # <<< EDIT ME\ntrain_cfg.hf_hub.token_env = 'HF_TOKEN'\ntrain_cfg.hf_hub.private = True\ntrain_cfg.hf_hub.run_state_file = str(CKPT_ROOT / 'run_id.txt')\n\n# ── 4-bit QLoRA ──\nmodel_cfg.llm.load_in_8bit = False\nmodel_cfg.llm.load_in_4bit = True\n# Oracle PNU path does NOT use the CheXpert classifier module (labels come\n# from the GT csv/manifest baked into the prompt). Keep it disabled until\n# you wire the learned classifier for realistic inference.\nmodel_cfg.chexpert_classifier.enabled = False\n\nOmegaConf.save(train_cfg, PROJECT / 'configs' / 'train_config.yaml')\nOmegaConf.save(model_cfg, PROJECT / 'configs' / 'model_config.yaml')\n\nprint('--- train_cfg.data ---'); print(OmegaConf.to_yaml(train_cfg.data))\nprint('--- train_cfg.tasks ---'); print(OmegaConf.to_yaml(train_cfg.tasks))\nprint('--- train_cfg.training ---');print(OmegaConf.to_yaml(train_cfg.training))\nprint('--- train_cfg.hf_hub ---'); print(OmegaConf.to_yaml(train_cfg.hf_hub))\nprint('--- model_cfg.llm ---'); print(OmegaConf.to_yaml(model_cfg.llm))",
|
| 298 |
"execution_count": null,
|
| 299 |
"outputs": [],
|
| 300 |
"id": "cell-cfg"
|
|
|
|
| 320 |
},
|
| 321 |
"outputId": "8a2ce693-94fc-4425-f62c-679614d6dab5"
|
| 322 |
},
|
| 323 |
+
"source": "# HF_TOKEN was already loaded in cell-paths (uniformly across all platforms).\n# This cell is now just a confirmation + reminder.\nassert os.environ.get('HF_TOKEN'), 'HF_TOKEN missing — re-run cell-paths.'\nprint('HF_TOKEN loaded ✓')",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
"execution_count": null,
|
| 325 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
"id": "cell-hf-token"
|
| 327 |
},
|
| 328 |
{
|
|
|
|
| 377 |
"metadata": {
|
| 378 |
"id": "cell-mode-md"
|
| 379 |
},
|
| 380 |
+
"source": "## 5b. Resume controller\n\nSingle switch. No more \"which stage\" — `train.py` auto-detects which stage to continue from by inspecting checkpoints on disk.\n\n| MODE | What happens |\n|------------|--------------|\n| `'fresh'` | Allocate a brand-new `{DATASET}_run_N+1` folder. Train both stages from scratch. |\n| `'resume'` | Reuse latest matching `{DATASET}_run_N` (or `EXPLICIT_RUN_ID`). Auto-detect from local disk: stage 1 mid-checkpoint, stage 1 done → stage 2 fresh, stage 2 mid-checkpoint, or both done. |\n\n`EXPLICIT_RUN_ID` is optional (set to `None` to auto-pick the latest run on disk or HF Hub that matches the current dataset prefix).\n\n### Fresh-VM resume\n\nIf your Colab/Kaggle VM was reset and the local `ckpt/{run_id}/` is gone (persistence lost or switching machines), the train cell will **auto-pull** the previous run's `stage{1,2}/last/` + `stage1/best/` (= stage1 final) from HF Hub into the canonical local layout before training, so `detect_resume_point` can pick up where you left off. `timing.json` is also pulled so the session-count + cumulative-time keeps incrementing.\n\n`run_id` resolution order (when `MODE='resume'`): `EXPLICIT_RUN_ID` > local `run_id.txt` > latest `{DATASET}_run_*` on HF Hub.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
"id": "cell-resume-md"
|
| 382 |
},
|
| 383 |
{
|
training/train.py
CHANGED
|
@@ -44,7 +44,7 @@ from model.rad_dino import BioViLTEncoder
|
|
| 44 |
from data import CXRInstructDataset, CXRDataCollator
|
| 45 |
from utils.logger import setup_logger
|
| 46 |
from utils.checkpoint import save_checkpoint, load_checkpoint
|
| 47 |
-
from utils.hf_uploader import build_tracker_from_cfg, pull_last_for_resume
|
| 48 |
from utils.dataset_resolver import (
|
| 49 |
resolve_dataset_spec,
|
| 50 |
resolve_run_id,
|
|
@@ -263,6 +263,37 @@ def get_trainer(
|
|
| 263 |
model = self.model
|
| 264 |
load_checkpoint(model, resume_from_checkpoint)
|
| 265 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
return CXRTrainer(
|
| 267 |
model = model,
|
| 268 |
args = training_args,
|
|
@@ -649,6 +680,25 @@ def main():
|
|
| 649 |
args.resume_from = local_resume
|
| 650 |
logger.info(f"Will resume from pulled checkpoint: {local_resume} (stage{args.stage})")
|
| 651 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 652 |
# ── Compute per-stage output dirs under {output_root}/{run_id}/ ──
|
| 653 |
stage1_out = stage_dir(output_root, run_id,
|
| 654 |
str(train_cfg.stage1.get("subdir", "stage1_projection")))
|
|
@@ -724,6 +774,16 @@ def main():
|
|
| 724 |
"resumed": bool(args.resume_from),
|
| 725 |
"resume_from": args.resume_from,
|
| 726 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 727 |
|
| 728 |
# Build model
|
| 729 |
logger.info("Building CXR VLM...")
|
|
|
|
| 44 |
from data import CXRInstructDataset, CXRDataCollator
|
| 45 |
from utils.logger import setup_logger
|
| 46 |
from utils.checkpoint import save_checkpoint, load_checkpoint
|
| 47 |
+
from utils.hf_uploader import build_tracker_from_cfg, pull_last_for_resume, hydrate_run_dir_from_hf
|
| 48 |
from utils.dataset_resolver import (
|
| 49 |
resolve_dataset_spec,
|
| 50 |
resolve_run_id,
|
|
|
|
| 263 |
model = self.model
|
| 264 |
load_checkpoint(model, resume_from_checkpoint)
|
| 265 |
|
| 266 |
+
def _get_train_sampler(self, *args, **kwargs):
    """
    Pick the sampler used to draw training examples.

    If `self.train_dataset` exposes a `get_per_sample_weights()` method
    and that method returns something other than None, training draws
    from a `WeightedRandomSampler` built on those weights, so the
    per-sample weights decide how often each example is seen. In every
    other case (no such method, or it returns None) the parent class's
    `_get_train_sampler` is used unchanged, with all positional and
    keyword arguments forwarded as received.

    Notes:
        * Only the train sampler is overridden here; nothing in this
          method touches evaluation.
        * Sampling is done with replacement and one "epoch" draws
          `len(train_dataset)` indices. With replacement, an
          up-weighted example can be drawn more than once per epoch,
          and a down-weighted one may not be drawn at all in a given
          epoch.
    """
    dataset = self.train_dataset

    # Datasets without the hook (or the hook returning None) opt out of
    # weighted sampling and get the parent's default behaviour.
    weight_fn = getattr(dataset, "get_per_sample_weights", None)
    sample_weights = weight_fn() if weight_fn is not None else None
    if sample_weights is None:
        return super()._get_train_sampler(*args, **kwargs)

    # Imported lazily, only on the weighted path.
    from torch.utils.data import WeightedRandomSampler

    return WeightedRandomSampler(
        weights=sample_weights,
        num_samples=len(dataset),
        replacement=True,
    )
|
| 296 |
+
|
| 297 |
return CXRTrainer(
|
| 298 |
model = model,
|
| 299 |
args = training_args,
|
|
|
|
| 680 |
args.resume_from = local_resume
|
| 681 |
logger.info(f"Will resume from pulled checkpoint: {local_resume} (stage{args.stage})")
|
| 682 |
|
| 683 |
+
# ── Fresh-VM resume: hydrate from HF before detect_resume_point ──
|
| 684 |
+
# When `--mode resume` is set but the local run dir is empty (Colab
|
| 685 |
+
# persistence lost, switching machines), pull configs + last/best
|
| 686 |
+
# checkpoints from HF Hub into the canonical local layout so the
|
| 687 |
+
# detector finds them. No-op if local already has artifacts or HF
|
| 688 |
+
# tracking is disabled.
|
| 689 |
+
if args.mode == "resume" and hf_repo_id and hf_token:
|
| 690 |
+
try:
|
| 691 |
+
hydrate_run_dir_from_hf(
|
| 692 |
+
repo_id = hf_repo_id,
|
| 693 |
+
token = hf_token,
|
| 694 |
+
run_id = run_id,
|
| 695 |
+
output_root = output_root,
|
| 696 |
+
stage1_subdir = str(train_cfg.stage1.get("subdir", "stage1_projection")),
|
| 697 |
+
stage2_subdir = str(train_cfg.stage2.get("subdir", "stage2_instruct")),
|
| 698 |
+
)
|
| 699 |
+
except Exception as e:
|
| 700 |
+
logger.warning(f"[resume hydrate] {type(e).__name__}: {e}")
|
| 701 |
+
|
| 702 |
# ── Compute per-stage output dirs under {output_root}/{run_id}/ ──
|
| 703 |
stage1_out = stage_dir(output_root, run_id,
|
| 704 |
str(train_cfg.stage1.get("subdir", "stage1_projection")))
|
|
|
|
| 774 |
"resumed": bool(args.resume_from),
|
| 775 |
"resume_from": args.resume_from,
|
| 776 |
})
|
| 777 |
+
# Snapshot the resolved config + run_meta.json to HF so the run is
|
| 778 |
+
# self-describing on the hub (you can answer "what config did
|
| 779 |
+
# {run_id} actually use?" without pulling the whole checkpoint).
|
| 780 |
+
# `save_run_config` writes these into {run_dir}/configs/ +
|
| 781 |
+
# {run_dir}/run_meta.json a few lines above.
|
| 782 |
+
rd = run_dir(output_root, run_id)
|
| 783 |
+
if (rd / "configs").is_dir():
|
| 784 |
+
tracker.upload_folder(str(rd / "configs"), "configs")
|
| 785 |
+
if (rd / "run_meta.json").is_file():
|
| 786 |
+
tracker.upload_file(str(rd / "run_meta.json"), "run_meta.json")
|
| 787 |
|
| 788 |
# Build model
|
| 789 |
logger.info("Building CXR VLM...")
|
data/upload_to_hf_2.py → upload_to_hf_2.py
RENAMED
|
File without changes
|
utils/dataset_resolver.py
CHANGED
|
@@ -25,7 +25,7 @@ from pathlib import Path
|
|
| 25 |
from typing import Dict, List, Optional
|
| 26 |
|
| 27 |
|
| 28 |
-
SUPPORTED_DATASETS = ("MIMIC-CXR", "IU-Xray")
|
| 29 |
|
| 30 |
|
| 31 |
@dataclass
|
|
@@ -113,6 +113,21 @@ def resolve_dataset_spec(train_cfg) -> DatasetSpec:
|
|
| 113 |
train_cfg.data, report_mode, image_mode
|
| 114 |
)
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
else: # IU-Xray
|
| 117 |
# IU has no VQA.
|
| 118 |
available = ["report"] if report_mode == "merged" else ["findings", "impression"]
|
|
@@ -233,6 +248,63 @@ def _ensure_mimic_json_exists(data_cfg,
|
|
| 233 |
return str(out)
|
| 234 |
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
# ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
|
| 237 |
|
| 238 |
def resolve_run_id(
|
|
|
|
| 25 |
from typing import Dict, List, Optional
|
| 26 |
|
| 27 |
|
| 28 |
+
SUPPORTED_DATASETS = ("MIMIC-CXR", "MIMIC-CXR_resized", "IU-Xray")
|
| 29 |
|
| 30 |
|
| 31 |
@dataclass
|
|
|
|
| 113 |
train_cfg.data, report_mode, image_mode
|
| 114 |
)
|
| 115 |
|
| 116 |
+
elif name == "MIMIC-CXR_resized":
|
| 117 |
+
# Same semantic dataset as MIMIC-CXR (all 3 tasks), but manifest-driven:
|
| 118 |
+
# splits, image/report relpaths and the chex_* labels come from
|
| 119 |
+
# manifest_{train,val,test}.csv (by default under `root`) instead of a
|
| 120 |
+
# pre-split dir structure. Built by data.mimic_cxr_resized_builder.
|
| 121 |
+
if report_mode == "merged":
|
| 122 |
+
available = ["report", "vqa"]
|
| 123 |
+
else:
|
| 124 |
+
available = ["findings", "impression", "vqa"]
|
| 125 |
+
mr = train_cfg.data.mimic_cxr_resized
|
| 126 |
+
image_root = mr.root
|
| 127 |
+
instruct_json = _ensure_mimic_resized_json_exists(
|
| 128 |
+
mr, report_mode, image_mode
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
else: # IU-Xray
|
| 132 |
# IU has no VQA.
|
| 133 |
available = ["report"] if report_mode == "merged" else ["findings", "impression"]
|
|
|
|
| 248 |
return str(out)
|
| 249 |
|
| 250 |
|
| 251 |
+
def _ensure_mimic_resized_json_exists(mr_cfg,
                                      report_mode: str = "split",
                                      image_mode: str = "all_views_split") -> str:
    """
    Return the path of the MIMIC-CXR_resized instruct JSON, building it
    first when it is not on disk yet.

    The cache file name is derived from the configured `instruct_json`
    by appending `__{report_mode}__{image_mode}` to its stem, so every
    mode combination is cached in its own file.

    Resolution of the builder inputs from `mr_cfg`:
      - `root`         : passed through as a string.
      - `manifest_dir` : falls back to `root` when unset/empty.
      - `vqa_dir`      : None → `{root}/vqa`; empty string → None
                         (explicit opt-out); anything else → used as-is.
      - `reports_root` : forwarded untouched.

    Raises:
        FileNotFoundError: the cache file is missing and `auto_build`
            is falsy. The message contains the CLI command to build it.

    NOTE(review): what the builder reads from `manifest_dir` / `vqa_dir`
    is defined in data.mimic_cxr_resized_builder, not here — check that
    module for the exact file names.
    """
    template = Path(_get(mr_cfg, "instruct_json",
                         "data/data_files/mimic_cxr_resized_instruct.json"))
    cache_name = f"{template.stem}__{report_mode}__{image_mode}{template.suffix}"
    cache_path = template.with_name(cache_name)

    # Cache hit: nothing to build.
    if cache_path.is_file():
        return str(cache_path)

    auto_build = bool(_get(mr_cfg, "auto_build", True))
    if not auto_build:
        raise FileNotFoundError(
            f"MIMIC-CXR_resized instruct JSON not found at {cache_path} and "
            f"auto_build=false. Run: python -m data.mimic_cxr_resized_builder "
            f"--root {_get(mr_cfg, 'root')} --output {cache_path} "
            f"--report_mode {report_mode} --image_mode {image_mode}"
        )

    # Lazy import: the builder is only needed on a cache miss.
    from data.mimic_cxr_resized_builder import build_mimic_cxr_resized_instruct_json
    print(f"[dataset_resolver] MIMIC-CXR_resized JSON not found → auto-building "
          f"(report_mode={report_mode}, image_mode={image_mode}) …")

    dataset_root = str(_get(mr_cfg, "root"))
    manifests = _get(mr_cfg, "manifest_dir") or dataset_root

    # Three-way switch on vqa_dir: "" disables VQA, None means "use the
    # conventional {root}/vqa", any other value is an explicit override.
    vqa_setting = _get(mr_cfg, "vqa_dir")
    if vqa_setting == "":
        vqa_location = None
    elif vqa_setting is None:
        vqa_location = str(Path(dataset_root) / "vqa")
    else:
        vqa_location = str(vqa_setting)

    build_mimic_cxr_resized_instruct_json(
        root=dataset_root,
        manifest_dir=manifests,
        output_path=str(cache_path),
        vqa_dir=vqa_location,
        reports_root=_get(mr_cfg, "reports_root"),
        report_mode=report_mode,
        image_mode=image_mode,
    )
    return str(cache_path)
|
| 306 |
+
|
| 307 |
+
|
| 308 |
# ─── Run ID resolution (dataset-prefixed) ───────────────────────────────────
|
| 309 |
|
| 310 |
def resolve_run_id(
|
utils/hf_uploader.py
CHANGED
|
@@ -288,6 +288,157 @@ def pull_last_for_resume(
|
|
| 288 |
return str(last_dir)
|
| 289 |
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
def build_tracker_from_cfg(train_cfg, resuming: bool = False, explicit_run_id: Optional[str] = None):
|
| 292 |
"""Convenience factory from OmegaConf DictConfig."""
|
| 293 |
hf = getattr(train_cfg, "hf_hub", None)
|
|
|
|
| 288 |
return str(last_dir)
|
| 289 |
|
| 290 |
|
| 291 |
+
def hydrate_run_dir_from_hf(
    repo_id: str,
    token: Optional[str],
    run_id: str,
    output_root: str,
    stage1_subdir: str = "stage1_projection",
    stage2_subdir: str = "stage2_instruct",
) -> bool:
    """
    Repopulate a local run dir from HF Hub artifacts so a resume can find
    checkpoints on a fresh VM (persistence lost / new host).

    Files pulled from the Hub repo (everything else under `{run_id}/` is
    skipped):
        {run_id}/configs/**
        {run_id}/run_meta.json, timing.json, meta.json
        {run_id}/stage1/last/**, {run_id}/stage1/best/**
        {run_id}/stage2/last/**

    `stage2/best/` is deliberately NOT pulled: no mapping rule below
    consumes it, so downloading it only cost bandwidth and disk.

    Mapping into `{output_root}/{run_id}/`:
      * configs/ and the three JSON files       → copied as-is.
      * `stage2/last/` → `{stage2_subdir}/checkpoint-1/`
        (the step number 1 is a placeholder directory name).
      * `stage1/best/` → `{stage1_subdir}/`, with every entry named
        `checkpoint_*` renamed to `stage1_final_*`. Only done when
        `checkpoint_projection.pt` is present in `stage1/best/`.
      * `stage1/last/` → `{stage1_subdir}/checkpoint-1/`, only when no
        `stage1_final_projection.pt` ended up in place.

    Args:
        repo_id: HF Hub model repo holding the run folders.
        token: HF token; falls back to the `HF_TOKEN` environment variable.
        run_id: name of the run folder, on the Hub and locally.
        output_root: local root under which `{run_id}/` lives.
        stage1_subdir: local sub-directory name for stage 1.
        stage2_subdir: local sub-directory name for stage 2.

    Returns:
        True if at least one file or directory was placed locally.
        False if `huggingface_hub` is unavailable, the local run dir
        already holds a final/checkpoint, the download failed, or the
        Hub has no `{run_id}/` folder.

    The temporary `{output_root}/_hf_pull` staging directory is removed
    on every exit path after it has been created, including download
    failures and errors raised while copying.
    """
    if not HF_AVAILABLE:
        print("[hydrate_run_dir_from_hf] huggingface_hub not installed — skip")
        return False
    from huggingface_hub import snapshot_download
    import shutil

    token = token or os.environ.get("HF_TOKEN")
    output_root = Path(output_root)
    staging = output_root / "_hf_pull"
    dst_root = output_root / run_id

    # Skip if local already has any final/checkpoint — we're not on a fresh VM.
    s1_local = dst_root / stage1_subdir
    s2_local = dst_root / stage2_subdir

    def _has_ckpt(d: Path) -> bool:
        return d.is_dir() and any(d.glob("checkpoint-*"))

    if (
        (s1_local / "stage1_final_projection.pt").exists()
        or (s2_local / "stage2_final_projection.pt").exists()
        or _has_ckpt(s1_local)
        or _has_ckpt(s2_local)
    ):
        print(f"[hydrate_run_dir_from_hf] local {dst_root} already populated — skip pull")
        return False

    placed_any = False
    staging.mkdir(parents=True, exist_ok=True)
    try:
        # Pull only the files the mapping below consumes.
        try:
            snapshot_download(
                repo_id=repo_id,
                repo_type="model",
                token=token,
                allow_patterns=[
                    f"{run_id}/configs/**",
                    f"{run_id}/run_meta.json",
                    f"{run_id}/timing.json",
                    f"{run_id}/meta.json",
                    f"{run_id}/stage1/last/**",
                    f"{run_id}/stage1/best/**",
                    f"{run_id}/stage2/last/**",
                ],
                local_dir=str(staging),
            )
        except Exception as e:
            # Best-effort: a failed pull must not abort the caller.
            print(f"[hydrate_run_dir_from_hf] snapshot_download failed: {e}")
            return False

        src_root = staging / run_id
        if not src_root.is_dir():
            print(f"[hydrate_run_dir_from_hf] HF has no '{run_id}/' folder")
            return False

        dst_root.mkdir(parents=True, exist_ok=True)

        # configs/, run_meta.json, timing.json, meta.json: straight copy
        configs_src = src_root / "configs"
        if configs_src.is_dir():
            shutil.copytree(configs_src, dst_root / "configs", dirs_exist_ok=True)
            placed_any = True
        for fname in ("run_meta.json", "timing.json", "meta.json"):
            meta_src = src_root / fname
            if meta_src.is_file():
                shutil.copy2(meta_src, dst_root / fname)
                placed_any = True

        # Stage 2 last → checkpoint-1
        s2_last_src = src_root / "stage2" / "last"
        if s2_last_src.is_dir() and any(s2_last_src.iterdir()):
            dst = s2_local / "checkpoint-1"
            dst.mkdir(parents=True, exist_ok=True)
            shutil.copytree(s2_last_src, dst, dirs_exist_ok=True)
            placed_any = True
            print(f"[hydrate_run_dir_from_hf] stage2 mid-resume placed at {dst}")

        # Stage 1 best (= final) → stage1_final_*
        s1_best_src = src_root / "stage1" / "best"
        if s1_best_src.is_dir() and (s1_best_src / "checkpoint_projection.pt").exists():
            s1_local.mkdir(parents=True, exist_ok=True)
            for entry in s1_best_src.iterdir():
                # Rename "checkpoint_*" → "stage1_final_*"; other names are kept.
                if entry.name.startswith("checkpoint_"):
                    new_name = entry.name.replace("checkpoint_", "stage1_final_", 1)
                else:
                    new_name = entry.name
                if entry.is_file():
                    shutil.copy2(entry, s1_local / new_name)
                elif entry.is_dir():
                    shutil.copytree(entry, s1_local / new_name, dirs_exist_ok=True)
            placed_any = True
            print(f"[hydrate_run_dir_from_hf] stage1 final placed at {s1_local}")

        # Stage 1 last → checkpoint-1 (ONLY if stage1 didn't finish yet)
        if not (s1_local / "stage1_final_projection.pt").exists():
            s1_last_src = src_root / "stage1" / "last"
            if s1_last_src.is_dir() and any(s1_last_src.iterdir()):
                dst = s1_local / "checkpoint-1"
                dst.mkdir(parents=True, exist_ok=True)
                shutil.copytree(s1_last_src, dst, dirs_exist_ok=True)
                placed_any = True
                print(f"[hydrate_run_dir_from_hf] stage1 mid-resume placed at {dst}")
    finally:
        # Always drop the staging area, also on download/copy failure, so a
        # failed attempt never leaves a stray `_hf_pull` under output_root.
        shutil.rmtree(staging, ignore_errors=True)

    if placed_any:
        print(f"[hydrate_run_dir_from_hf] hydrated {dst_root} from HF")
    else:
        print(f"[hydrate_run_dir_from_hf] nothing usable on HF for {run_id}")
    return placed_any
|
| 440 |
+
|
| 441 |
+
|
| 442 |
def build_tracker_from_cfg(train_cfg, resuming: bool = False, explicit_run_id: Optional[str] = None):
|
| 443 |
"""Convenience factory from OmegaConf DictConfig."""
|
| 444 |
hf = getattr(train_cfg, "hf_hub", None)
|