convitom committed on
Commit
c369576
·
1 Parent(s): 28b13fc
.claude/settings.local.json CHANGED
@@ -32,7 +32,9 @@
32
  "Bash(where magick *)",
33
  "Bash(where inkscape *)",
34
  "Bash(python -c \"import cairosvg; cairosvg.svg2png\\(url='pipeline_diagram.svg', write_to='pipeline_diagram.png', output_width=2800\\)\")",
35
- "Bash(python scripts/_patch_notebook.py)"
 
 
36
  ]
37
  }
38
  }
 
32
  "Bash(where magick *)",
33
  "Bash(where inkscape *)",
34
  "Bash(python -c \"import cairosvg; cairosvg.svg2png\\(url='pipeline_diagram.svg', write_to='pipeline_diagram.png', output_width=2800\\)\")",
35
+ "Bash(python scripts/_patch_notebook.py)",
36
+ "Bash(python scripts/_patch_notebook2.py)",
37
+ "Bash(python scripts/_verify_nb.py)"
38
  ]
39
  }
40
  }
.claude/worktrees/keen-galileo-ce978c ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 28b13fcded5614d6ab4088a8d8f6a20b87060306
.claude/worktrees/nice-kapitsa-0be5ac ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 28b13fcded5614d6ab4088a8d8f6a20b87060306
count_img_on_HF.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import HfApi
2
+ from huggingface_hub.hf_api import RepoFile
3
+
4
+ # ── Sửa 2 dòng này theo nhu cầu ──────────────────────────────────────────────
5
+ REPO_ID = "hieu3636/cxr-vlm-data" # tên repo HuggingFace
6
+ FOLDER = "IU-Xray_2"
7
+ # folder cần đếm, ví dụ "files/p10" hoặc None để scan toàn bộ
8
+ # ─────────────────────────────────────────────────────────────────────────────
9
+
10
+ IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".tif", ".webp", ".dcm"}
11
+
12
+
13
def fmt_size(n_bytes: float) -> str:
    """Format a byte count as a human-readable string with 1024-based units.

    Args:
        n_bytes: Size in bytes. Negative values are scaled by their
            magnitude and keep their sign.

    Returns:
        The scaled value with two decimals followed by a unit from
        ``B`` up to ``PB``, e.g. ``"1.50 KB"``.
    """
    size = n_bytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        # Compare on magnitude so negative sizes are scaled too instead of
        # always being reported in bytes.
        if abs(size) < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Anything that is still >= 1024 TB is reported in petabytes.
    return f"{size:.2f} PB"
19
+
20
+
21
def main() -> None:
    """Count image files in a Hugging Face dataset repo and print a summary.

    Lists ``REPO_ID`` (as a dataset repo) recursively under ``FOLDER`` (or
    the whole repo when ``FOLDER`` is falsy), keeps the files whose extension
    is in ``IMAGE_EXTENSIONS``, and prints the total image count, the total
    size, and a per-extension breakdown sorted by descending count.
    """
    from collections import Counter
    from pathlib import PurePosixPath

    api = HfApi()

    print(f"Repo : {REPO_ID}")
    print(f"Folder : {FOLDER or '(toàn bộ repo)'}")
    print("Đang lấy danh sách file...\n")

    total_images = 0
    total_bytes = 0
    ext_counts: Counter[str] = Counter()

    # NOTE: `expand=True` is intentionally not passed. File size is part of
    # the default tree listing; expanding only adds last-commit/security
    # info and makes the server return much smaller pages.
    tree = api.list_repo_tree(
        repo_id=REPO_ID,
        repo_type="dataset",
        path_in_repo=FOLDER or "",
        recursive=True,
    )
    for item in tree:
        # Skip anything that is not a file entry (e.g. folders).
        if not isinstance(item, RepoFile):
            continue
        # Take the suffix of the final path component only, so dots in
        # directory names or dot-only file names are not mistaken for an
        # image extension.
        ext = PurePosixPath(item.path).suffix.lower()
        if ext not in IMAGE_EXTENSIONS:
            continue

        total_images += 1
        total_bytes += item.size or 0  # size may be missing -> count as 0
        ext_counts[ext] += 1

        # Progress heartbeat every 1000 counted images.
        if total_images % 1000 == 0:
            print(f" ... đã đếm {total_images:,} ảnh", flush=True)

    print(f"\nTổng số ảnh : {total_images:,}")
    print(f"Tổng dung lượng: {fmt_size(total_bytes)}")
    if ext_counts:
        print("\nTheo đuôi file:")
        # most_common() sorts by descending count, ties in insertion order.
        for ext, cnt in ext_counts.most_common():
            print(f" {ext:10s} {cnt:,}")
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()
evaluation/metrics.py CHANGED
@@ -251,7 +251,9 @@ def evaluate_all(
251
  """
252
  results = {}
253
 
254
- if task in ("findings", "impression"):
 
 
255
  results.update(compute_bleu(hypotheses, references))
256
  results.update(compute_rouge(hypotheses, references))
257
  results.update(compute_bertscore(hypotheses, references, device=device))
 
251
  """
252
  results = {}
253
 
254
+ # "report" is the merged-mode task (full Findings + Impression in one
255
+ # target). Same NLG/clinical metrics apply as for findings/impression.
256
+ if task in ("findings", "impression", "report"):
257
  results.update(compute_bleu(hypotheses, references))
258
  results.update(compute_rouge(hypotheses, references))
259
  results.update(compute_bertscore(hypotheses, references, device=device))
scripts/cxrvlm_colab_train.ipynb CHANGED
@@ -1138,6 +1138,44 @@
1138
  "from datetime import datetime as _dt, timezone as _tz\n",
1139
  "from pathlib import Path as _Path\n",
1140
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1141
  "_resume_args = \"\"\n",
1142
  "_is_resume = False\n",
1143
  "if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 1:\n",
@@ -1147,6 +1185,12 @@
1147
  "else:\n",
1148
  " print(\"▶ STAGE 1 fresh run\")\n",
1149
  "\n",
 
 
 
 
 
 
1150
  "# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
1151
  "_t0_stage1 = _time.time()\n",
1152
  "_iso_start_stage1 = _dt.now(_tz.utc).isoformat(timespec=\"seconds\")\n",
@@ -1187,6 +1231,9 @@
1187
  " })\n",
1188
  " _timing_path.write_text(_json.dumps(_t, indent=2))\n",
1189
  "\n",
 
 
 
1190
  " def _fmt(sec):\n",
1191
  " h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
1192
  " print()\n",
@@ -1233,6 +1280,44 @@
1233
  "from datetime import datetime as _dt, timezone as _tz\n",
1234
  "from pathlib import Path as _Path\n",
1235
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1236
  "_resume_args = \"\"\n",
1237
  "_is_resume = False\n",
1238
  "if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 2:\n",
@@ -1243,7 +1328,24 @@
1243
  " _resume_args = f'--run_id \"{RESUME_RUN_ID}\"'\n",
1244
  " print(\"▶ STAGE 2 fresh start, pinned to run_id\", RESUME_RUN_ID)\n",
1245
  "else:\n",
1246
- " print(\"▶ STAGE 2 fresh (same session as stage1)\")\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1247
  "\n",
1248
  "# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
1249
  "_t0_stage2 = _time.time()\n",
@@ -1285,6 +1387,9 @@
1285
  " })\n",
1286
  " _timing_path.write_text(_json.dumps(_t, indent=2))\n",
1287
  "\n",
 
 
 
1288
  " def _fmt(sec):\n",
1289
  " h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
1290
  " _total = _t[\"stage1_elapsed_sec\"] + _t[\"stage2_elapsed_sec\"]\n",
@@ -1444,11 +1549,58 @@
1444
  "import json as _json\n",
1445
  "from pathlib import Path as _Path\n",
1446
  "\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1447
  "_run_id_file = CKPT_ROOT / \"run_id.txt\"\n",
1448
  "assert _run_id_file.exists(), \"No run_id.txt — train at least one stage first.\"\n",
1449
  "_run_id = _run_id_file.read_text().strip()\n",
 
 
 
 
 
 
1450
  "_timing_path = CKPT_ROOT / _run_id / \"timing.json\"\n",
1451
- "assert _timing_path.exists(), f\"No timing.json under {_timing_path.parent} — was the stage cell run via the wrapped version?\"\n",
 
 
 
1452
  "\n",
1453
  "_t = _json.loads(_timing_path.read_text())\n",
1454
  "\n",
 
1138
  "from datetime import datetime as _dt, timezone as _tz\n",
1139
  "from pathlib import Path as _Path\n",
1140
  "\n",
1141
+ "def _pull_timing_from_hf(run_id, ckpt_root, repo_id, token):\n",
1142
+ " # Pull timing.json from HF Hub for this run if not present locally.\n",
1143
+ " local = ckpt_root / run_id / \"timing.json\"\n",
1144
+ " if local.exists() or not repo_id or not token:\n",
1145
+ " return\n",
1146
+ " try:\n",
1147
+ " from huggingface_hub import hf_hub_download\n",
1148
+ " hf_hub_download(\n",
1149
+ " repo_id = repo_id,\n",
1150
+ " repo_type = \"model\",\n",
1151
+ " filename = f\"{run_id}/timing.json\",\n",
1152
+ " token = token,\n",
1153
+ " local_dir = str(ckpt_root),\n",
1154
+ " )\n",
1155
+ " print(f\"[TIMING] pulled previous timing.json from HF → {local}\")\n",
1156
+ " except Exception as e:\n",
1157
+ " # First run for this run_id → no remote file yet. That's fine.\n",
1158
+ " pass\n",
1159
+ "\n",
1160
+ "def _push_timing_to_hf(run_id, ckpt_root, repo_id, token):\n",
1161
+ " # Upload local timing.json to HF Hub under {run_id}/timing.json.\n",
1162
+ " local = ckpt_root / run_id / \"timing.json\"\n",
1163
+ " if not local.exists() or not repo_id or not token:\n",
1164
+ " return\n",
1165
+ " try:\n",
1166
+ " from huggingface_hub import HfApi\n",
1167
+ " HfApi(token=token).upload_file(\n",
1168
+ " path_or_fileobj = str(local),\n",
1169
+ " path_in_repo = f\"{run_id}/timing.json\",\n",
1170
+ " repo_id = repo_id,\n",
1171
+ " repo_type = \"model\",\n",
1172
+ " commit_message = f\"timing.json @ {run_id}\",\n",
1173
+ " )\n",
1174
+ " print(f\"[TIMING] uploaded timing.json to HF → {repo_id}/{run_id}/timing.json\")\n",
1175
+ " except Exception as e:\n",
1176
+ " print(f\"[TIMING] upload failed (non-fatal): {e}\")\n",
1177
+ "\n",
1178
+ "\n",
1179
  "_resume_args = \"\"\n",
1180
  "_is_resume = False\n",
1181
  "if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 1:\n",
 
1185
  "else:\n",
1186
  " print(\"▶ STAGE 1 fresh run\")\n",
1187
  "\n",
1188
+ "# ─── Pre-pull timing.json from HF if resuming (best-effort) ────────────────\n",
1189
+ "_hf_repo = getattr(train_cfg.hf_hub, \"repo_id\", None) if train_cfg.hf_hub.enabled else None\n",
1190
+ "_hf_token = os.environ.get(\"HF_TOKEN\")\n",
1191
+ "if _is_resume and \"RESUME_RUN_ID\" in dir() and RESUME_RUN_ID:\n",
1192
+ " _pull_timing_from_hf(RESUME_RUN_ID, CKPT_ROOT, _hf_repo, _hf_token)\n",
1193
+ "\n",
1194
  "# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
1195
  "_t0_stage1 = _time.time()\n",
1196
  "_iso_start_stage1 = _dt.now(_tz.utc).isoformat(timespec=\"seconds\")\n",
 
1231
  " })\n",
1232
  " _timing_path.write_text(_json.dumps(_t, indent=2))\n",
1233
  "\n",
1234
+ " # ─── Push to HF Hub so the timer survives a fresh VM ─────────────────\n",
1235
+ " _push_timing_to_hf(_run_id_now, CKPT_ROOT, _hf_repo, _hf_token)\n",
1236
+ "\n",
1237
  " def _fmt(sec):\n",
1238
  " h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
1239
  " print()\n",
 
1280
  "from datetime import datetime as _dt, timezone as _tz\n",
1281
  "from pathlib import Path as _Path\n",
1282
  "\n",
1283
+ "def _pull_timing_from_hf(run_id, ckpt_root, repo_id, token):\n",
1284
+ " # Pull timing.json from HF Hub for this run if not present locally.\n",
1285
+ " local = ckpt_root / run_id / \"timing.json\"\n",
1286
+ " if local.exists() or not repo_id or not token:\n",
1287
+ " return\n",
1288
+ " try:\n",
1289
+ " from huggingface_hub import hf_hub_download\n",
1290
+ " hf_hub_download(\n",
1291
+ " repo_id = repo_id,\n",
1292
+ " repo_type = \"model\",\n",
1293
+ " filename = f\"{run_id}/timing.json\",\n",
1294
+ " token = token,\n",
1295
+ " local_dir = str(ckpt_root),\n",
1296
+ " )\n",
1297
+ " print(f\"[TIMING] pulled previous timing.json from HF → {local}\")\n",
1298
+ " except Exception as e:\n",
1299
+ " # First run for this run_id → no remote file yet. That's fine.\n",
1300
+ " pass\n",
1301
+ "\n",
1302
+ "def _push_timing_to_hf(run_id, ckpt_root, repo_id, token):\n",
1303
+ " # Upload local timing.json to HF Hub under {run_id}/timing.json.\n",
1304
+ " local = ckpt_root / run_id / \"timing.json\"\n",
1305
+ " if not local.exists() or not repo_id or not token:\n",
1306
+ " return\n",
1307
+ " try:\n",
1308
+ " from huggingface_hub import HfApi\n",
1309
+ " HfApi(token=token).upload_file(\n",
1310
+ " path_or_fileobj = str(local),\n",
1311
+ " path_in_repo = f\"{run_id}/timing.json\",\n",
1312
+ " repo_id = repo_id,\n",
1313
+ " repo_type = \"model\",\n",
1314
+ " commit_message = f\"timing.json @ {run_id}\",\n",
1315
+ " )\n",
1316
+ " print(f\"[TIMING] uploaded timing.json to HF → {repo_id}/{run_id}/timing.json\")\n",
1317
+ " except Exception as e:\n",
1318
+ " print(f\"[TIMING] upload failed (non-fatal): {e}\")\n",
1319
+ "\n",
1320
+ "\n",
1321
  "_resume_args = \"\"\n",
1322
  "_is_resume = False\n",
1323
  "if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 2:\n",
 
1328
  " _resume_args = f'--run_id \"{RESUME_RUN_ID}\"'\n",
1329
  " print(\"▶ STAGE 2 fresh start, pinned to run_id\", RESUME_RUN_ID)\n",
1330
  "else:\n",
1331
+ " # ─── FIX: pin stage 2 to the run_id stage 1 just wrote ────────────────\n",
1332
+ " # Without this, train.py treats stage 2 as a brand-new launch and\n",
1333
+ " # allocates a NEW run_N folder, splitting stage1/stage2 across two runs.\n",
1334
+ " _state_file = CKPT_ROOT / \"run_id.txt\"\n",
1335
+ " if _state_file.exists():\n",
1336
+ " _pinned = _state_file.read_text().strip()\n",
1337
+ " _resume_args = f'--run_id \"{_pinned}\"'\n",
1338
+ " print(f\"▶ STAGE 2 fresh, auto-pinned to run_id from state file: {_pinned}\")\n",
1339
+ " else:\n",
1340
+ " print(\"▶ STAGE 2 fresh (no state file — train.py will allocate a new run_id)\")\n",
1341
+ "\n",
1342
+ "# ─── Pre-pull timing.json from HF (in case of fresh VM) ───────────────────\n",
1343
+ "_hf_repo = getattr(train_cfg.hf_hub, \"repo_id\", None) if train_cfg.hf_hub.enabled else None\n",
1344
+ "_hf_token = os.environ.get(\"HF_TOKEN\")\n",
1345
+ "# Best guess at run_id BEFORE training (may be missing if stage 1 wasn't run here)\n",
1346
+ "_pre_state = CKPT_ROOT / \"run_id.txt\"\n",
1347
+ "if _pre_state.exists():\n",
1348
+ " _pull_timing_from_hf(_pre_state.read_text().strip(), CKPT_ROOT, _hf_repo, _hf_token)\n",
1349
  "\n",
1350
  "# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
1351
  "_t0_stage2 = _time.time()\n",
 
1387
  " })\n",
1388
  " _timing_path.write_text(_json.dumps(_t, indent=2))\n",
1389
  "\n",
1390
+ " # ─── Push to HF Hub ──────────────────────────────────────────────────\n",
1391
+ " _push_timing_to_hf(_run_id_now, CKPT_ROOT, _hf_repo, _hf_token)\n",
1392
+ "\n",
1393
  " def _fmt(sec):\n",
1394
  " h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
1395
  " _total = _t[\"stage1_elapsed_sec\"] + _t[\"stage2_elapsed_sec\"]\n",
 
1549
  "import json as _json\n",
1550
  "from pathlib import Path as _Path\n",
1551
  "\n",
1552
+ "def _pull_timing_from_hf(run_id, ckpt_root, repo_id, token):\n",
1553
+ " # Pull timing.json from HF Hub for this run if not present locally.\n",
1554
+ " local = ckpt_root / run_id / \"timing.json\"\n",
1555
+ " if local.exists() or not repo_id or not token:\n",
1556
+ " return\n",
1557
+ " try:\n",
1558
+ " from huggingface_hub import hf_hub_download\n",
1559
+ " hf_hub_download(\n",
1560
+ " repo_id = repo_id,\n",
1561
+ " repo_type = \"model\",\n",
1562
+ " filename = f\"{run_id}/timing.json\",\n",
1563
+ " token = token,\n",
1564
+ " local_dir = str(ckpt_root),\n",
1565
+ " )\n",
1566
+ " print(f\"[TIMING] pulled previous timing.json from HF → {local}\")\n",
1567
+ " except Exception as e:\n",
1568
+ " # First run for this run_id → no remote file yet. That's fine.\n",
1569
+ " pass\n",
1570
+ "\n",
1571
+ "def _push_timing_to_hf(run_id, ckpt_root, repo_id, token):\n",
1572
+ " # Upload local timing.json to HF Hub under {run_id}/timing.json.\n",
1573
+ " local = ckpt_root / run_id / \"timing.json\"\n",
1574
+ " if not local.exists() or not repo_id or not token:\n",
1575
+ " return\n",
1576
+ " try:\n",
1577
+ " from huggingface_hub import HfApi\n",
1578
+ " HfApi(token=token).upload_file(\n",
1579
+ " path_or_fileobj = str(local),\n",
1580
+ " path_in_repo = f\"{run_id}/timing.json\",\n",
1581
+ " repo_id = repo_id,\n",
1582
+ " repo_type = \"model\",\n",
1583
+ " commit_message = f\"timing.json @ {run_id}\",\n",
1584
+ " )\n",
1585
+ " print(f\"[TIMING] uploaded timing.json to HF → {repo_id}/{run_id}/timing.json\")\n",
1586
+ " except Exception as e:\n",
1587
+ " print(f\"[TIMING] upload failed (non-fatal): {e}\")\n",
1588
+ "\n",
1589
+ "\n",
1590
  "_run_id_file = CKPT_ROOT / \"run_id.txt\"\n",
1591
  "assert _run_id_file.exists(), \"No run_id.txt — train at least one stage first.\"\n",
1592
  "_run_id = _run_id_file.read_text().strip()\n",
1593
+ "\n",
1594
+ "# Pull the latest timing.json from HF in case we're on a fresh VM.\n",
1595
+ "_hf_repo = getattr(train_cfg.hf_hub, \"repo_id\", None) if train_cfg.hf_hub.enabled else None\n",
1596
+ "_hf_token = os.environ.get(\"HF_TOKEN\")\n",
1597
+ "_pull_timing_from_hf(_run_id, CKPT_ROOT, _hf_repo, _hf_token)\n",
1598
+ "\n",
1599
  "_timing_path = CKPT_ROOT / _run_id / \"timing.json\"\n",
1600
+ "assert _timing_path.exists(), (\n",
1601
+ " f\"No timing.json under {_timing_path.parent} (also not on HF). \"\n",
1602
+ " f\"Was the stage cell run via the wrapped version?\"\n",
1603
+ ")\n",
1604
  "\n",
1605
  "_t = _json.loads(_timing_path.read_text())\n",
1606
  "\n",