convitom committed on
Commit ·
c369576
1
Parent(s): 28b13fc
- .claude/settings.local.json +3 -1
- .claude/worktrees/keen-galileo-ce978c +1 -0
- .claude/worktrees/nice-kapitsa-0be5ac +1 -0
- count_img_on_HF.py +62 -0
- evaluation/metrics.py +3 -1
- scripts/cxrvlm_colab_train.ipynb +154 -2
.claude/settings.local.json
CHANGED
|
@@ -32,7 +32,9 @@
|
|
| 32 |
"Bash(where magick *)",
|
| 33 |
"Bash(where inkscape *)",
|
| 34 |
"Bash(python -c \"import cairosvg; cairosvg.svg2png\\(url='pipeline_diagram.svg', write_to='pipeline_diagram.png', output_width=2800\\)\")",
|
| 35 |
-
"Bash(python scripts/_patch_notebook.py)"
|
|
|
|
|
|
|
| 36 |
]
|
| 37 |
}
|
| 38 |
}
|
|
|
|
| 32 |
"Bash(where magick *)",
|
| 33 |
"Bash(where inkscape *)",
|
| 34 |
"Bash(python -c \"import cairosvg; cairosvg.svg2png\\(url='pipeline_diagram.svg', write_to='pipeline_diagram.png', output_width=2800\\)\")",
|
| 35 |
+
"Bash(python scripts/_patch_notebook.py)",
|
| 36 |
+
"Bash(python scripts/_patch_notebook2.py)",
|
| 37 |
+
"Bash(python scripts/_verify_nb.py)"
|
| 38 |
]
|
| 39 |
}
|
| 40 |
}
|
.claude/worktrees/keen-galileo-ce978c
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 28b13fcded5614d6ab4088a8d8f6a20b87060306
|
.claude/worktrees/nice-kapitsa-0be5ac
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 28b13fcded5614d6ab4088a8d8f6a20b87060306
|
count_img_on_HF.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from huggingface_hub import HfApi
|
| 2 |
+
from huggingface_hub.hf_api import RepoFile
|
| 3 |
+
|
| 4 |
+
# ── Edit these two settings as needed ────────────────────────────────────────
|
| 5 |
+
REPO_ID = "hieu3636/cxr-vlm-data" # tên repo HuggingFace
|
| 6 |
+
FOLDER = "IU-Xray_2"
|
| 7 |
+
# folder to count, e.g. "files/p10", or None to scan the whole repo
|
| 8 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 9 |
+
|
| 10 |
+
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".tif", ".webp", ".dcm"}
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def fmt_size(n_bytes: float) -> str:
    """Format a byte count as a human-readable string with 1024-based units.

    Args:
        n_bytes: Size in bytes. Negative values are scaled by their magnitude
            and keep their sign.

    Returns:
        The value with two decimals followed by a unit from ``B`` up to
        ``PB``, e.g. ``"1.50 KB"``.
    """
    value = n_bytes
    for unit in ("B", "KB", "MB", "GB", "TB"):
        # Compare on magnitude so negative sizes (e.g. deltas) are scaled too
        # instead of always being reported in bytes.
        if abs(value) < 1024:
            return f"{value:.2f} {unit}"
        value /= 1024
    # Anything at or beyond 1024 TB is reported in PB, however large.
    return f"{value:.2f} PB"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main() -> None:
    """Count image files in the configured Hugging Face dataset repo.

    Recursively lists ``FOLDER`` (or the whole repo when ``FOLDER`` is falsy)
    of the dataset ``REPO_ID``, keeps the files whose extension is in
    ``IMAGE_EXTENSIONS``, and prints the number of images, their combined
    size, and a per-extension breakdown sorted by descending count.
    """
    api = HfApi()

    print(f"Repo : {REPO_ID}")
    print(f"Folder : {FOLDER or '(toàn bộ repo)'}")
    print("Đang lấy danh sách file...\n")

    total_images = 0
    total_bytes = 0
    ext_counts: dict[str, int] = {}

    # NOTE: `expand=True` is intentionally not passed. File sizes are part of
    # the plain tree listing; `expand` only adds last-commit / security-scan
    # details and makes the server return 50 entries per page instead of 1000,
    # which slows large scans considerably.
    for item in api.list_repo_tree(
        repo_id=REPO_ID,
        repo_type="dataset",
        path_in_repo=FOLDER or "",
        recursive=True,
    ):
        # The tree also yields folder entries; only files are counted.
        if not isinstance(item, RepoFile):
            continue
        name = item.path
        # Lower-cased text after the last dot, with the dot; "" when there is none.
        ext = ("." + name.rsplit(".", 1)[-1].lower()) if "." in name else ""
        if ext not in IMAGE_EXTENSIONS:
            continue
        total_images += 1
        total_bytes += item.size or 0  # treat a missing size as 0 bytes
        ext_counts[ext] = ext_counts.get(ext, 0) + 1

        # Progress heartbeat every 1000 images so long scans show activity.
        if total_images % 1000 == 0:
            print(f" ... đã đếm {total_images:,} ảnh", flush=True)

    print(f"\nTổng số ảnh : {total_images:,}")
    print(f"Tổng dung lượng: {fmt_size(total_bytes)}")
    if ext_counts:
        print("\nTheo đuôi file:")
        # Most frequent extension first.
        for ext, cnt in sorted(ext_counts.items(), key=lambda x: -x[1]):
            print(f" {ext:10s} {cnt:,}")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
if __name__ == "__main__":
|
| 62 |
+
main()
|
evaluation/metrics.py
CHANGED
|
@@ -251,7 +251,9 @@ def evaluate_all(
|
|
| 251 |
"""
|
| 252 |
results = {}
|
| 253 |
|
| 254 |
-
|
|
|
|
|
|
|
| 255 |
results.update(compute_bleu(hypotheses, references))
|
| 256 |
results.update(compute_rouge(hypotheses, references))
|
| 257 |
results.update(compute_bertscore(hypotheses, references, device=device))
|
|
|
|
| 251 |
"""
|
| 252 |
results = {}
|
| 253 |
|
| 254 |
+
# "report" is the merged-mode task (full Findings + Impression in one
|
| 255 |
+
# target). Same NLG/clinical metrics apply as for findings/impression.
|
| 256 |
+
if task in ("findings", "impression", "report"):
|
| 257 |
results.update(compute_bleu(hypotheses, references))
|
| 258 |
results.update(compute_rouge(hypotheses, references))
|
| 259 |
results.update(compute_bertscore(hypotheses, references, device=device))
|
scripts/cxrvlm_colab_train.ipynb
CHANGED
|
@@ -1138,6 +1138,44 @@
|
|
| 1138 |
"from datetime import datetime as _dt, timezone as _tz\n",
|
| 1139 |
"from pathlib import Path as _Path\n",
|
| 1140 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1141 |
"_resume_args = \"\"\n",
|
| 1142 |
"_is_resume = False\n",
|
| 1143 |
"if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 1:\n",
|
|
@@ -1147,6 +1185,12 @@
|
|
| 1147 |
"else:\n",
|
| 1148 |
" print(\"▶ STAGE 1 fresh run\")\n",
|
| 1149 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1150 |
"# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
|
| 1151 |
"_t0_stage1 = _time.time()\n",
|
| 1152 |
"_iso_start_stage1 = _dt.now(_tz.utc).isoformat(timespec=\"seconds\")\n",
|
|
@@ -1187,6 +1231,9 @@
|
|
| 1187 |
" })\n",
|
| 1188 |
" _timing_path.write_text(_json.dumps(_t, indent=2))\n",
|
| 1189 |
"\n",
|
|
|
|
|
|
|
|
|
|
| 1190 |
" def _fmt(sec):\n",
|
| 1191 |
" h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
|
| 1192 |
" print()\n",
|
|
@@ -1233,6 +1280,44 @@
|
|
| 1233 |
"from datetime import datetime as _dt, timezone as _tz\n",
|
| 1234 |
"from pathlib import Path as _Path\n",
|
| 1235 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1236 |
"_resume_args = \"\"\n",
|
| 1237 |
"_is_resume = False\n",
|
| 1238 |
"if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 2:\n",
|
|
@@ -1243,7 +1328,24 @@
|
|
| 1243 |
" _resume_args = f'--run_id \"{RESUME_RUN_ID}\"'\n",
|
| 1244 |
" print(\"▶ STAGE 2 fresh start, pinned to run_id\", RESUME_RUN_ID)\n",
|
| 1245 |
"else:\n",
|
| 1246 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1247 |
"\n",
|
| 1248 |
"# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
|
| 1249 |
"_t0_stage2 = _time.time()\n",
|
|
@@ -1285,6 +1387,9 @@
|
|
| 1285 |
" })\n",
|
| 1286 |
" _timing_path.write_text(_json.dumps(_t, indent=2))\n",
|
| 1287 |
"\n",
|
|
|
|
|
|
|
|
|
|
| 1288 |
" def _fmt(sec):\n",
|
| 1289 |
" h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
|
| 1290 |
" _total = _t[\"stage1_elapsed_sec\"] + _t[\"stage2_elapsed_sec\"]\n",
|
|
@@ -1444,11 +1549,58 @@
|
|
| 1444 |
"import json as _json\n",
|
| 1445 |
"from pathlib import Path as _Path\n",
|
| 1446 |
"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1447 |
"_run_id_file = CKPT_ROOT / \"run_id.txt\"\n",
|
| 1448 |
"assert _run_id_file.exists(), \"No run_id.txt — train at least one stage first.\"\n",
|
| 1449 |
"_run_id = _run_id_file.read_text().strip()\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1450 |
"_timing_path = CKPT_ROOT / _run_id / \"timing.json\"\n",
|
| 1451 |
-
"assert _timing_path.exists(),
|
|
|
|
|
|
|
|
|
|
| 1452 |
"\n",
|
| 1453 |
"_t = _json.loads(_timing_path.read_text())\n",
|
| 1454 |
"\n",
|
|
|
|
| 1138 |
"from datetime import datetime as _dt, timezone as _tz\n",
|
| 1139 |
"from pathlib import Path as _Path\n",
|
| 1140 |
"\n",
|
| 1141 |
+
"def _pull_timing_from_hf(run_id, ckpt_root, repo_id, token):\n",
|
| 1142 |
+
" # Pull timing.json from HF Hub for this run if not present locally.\n",
|
| 1143 |
+
" local = ckpt_root / run_id / \"timing.json\"\n",
|
| 1144 |
+
" if local.exists() or not repo_id or not token:\n",
|
| 1145 |
+
" return\n",
|
| 1146 |
+
" try:\n",
|
| 1147 |
+
" from huggingface_hub import hf_hub_download\n",
|
| 1148 |
+
" hf_hub_download(\n",
|
| 1149 |
+
" repo_id = repo_id,\n",
|
| 1150 |
+
" repo_type = \"model\",\n",
|
| 1151 |
+
" filename = f\"{run_id}/timing.json\",\n",
|
| 1152 |
+
" token = token,\n",
|
| 1153 |
+
" local_dir = str(ckpt_root),\n",
|
| 1154 |
+
" )\n",
|
| 1155 |
+
" print(f\"[TIMING] pulled previous timing.json from HF → {local}\")\n",
|
| 1156 |
+
" except Exception as e:\n",
|
| 1157 |
+
" # First run for this run_id → no remote file yet. That's fine.\n",
|
| 1158 |
+
" pass\n",
|
| 1159 |
+
"\n",
|
| 1160 |
+
"def _push_timing_to_hf(run_id, ckpt_root, repo_id, token):\n",
|
| 1161 |
+
" # Upload local timing.json to HF Hub under {run_id}/timing.json.\n",
|
| 1162 |
+
" local = ckpt_root / run_id / \"timing.json\"\n",
|
| 1163 |
+
" if not local.exists() or not repo_id or not token:\n",
|
| 1164 |
+
" return\n",
|
| 1165 |
+
" try:\n",
|
| 1166 |
+
" from huggingface_hub import HfApi\n",
|
| 1167 |
+
" HfApi(token=token).upload_file(\n",
|
| 1168 |
+
" path_or_fileobj = str(local),\n",
|
| 1169 |
+
" path_in_repo = f\"{run_id}/timing.json\",\n",
|
| 1170 |
+
" repo_id = repo_id,\n",
|
| 1171 |
+
" repo_type = \"model\",\n",
|
| 1172 |
+
" commit_message = f\"timing.json @ {run_id}\",\n",
|
| 1173 |
+
" )\n",
|
| 1174 |
+
" print(f\"[TIMING] uploaded timing.json to HF → {repo_id}/{run_id}/timing.json\")\n",
|
| 1175 |
+
" except Exception as e:\n",
|
| 1176 |
+
" print(f\"[TIMING] upload failed (non-fatal): {e}\")\n",
|
| 1177 |
+
"\n",
|
| 1178 |
+
"\n",
|
| 1179 |
"_resume_args = \"\"\n",
|
| 1180 |
"_is_resume = False\n",
|
| 1181 |
"if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 1:\n",
|
|
|
|
| 1185 |
"else:\n",
|
| 1186 |
" print(\"▶ STAGE 1 fresh run\")\n",
|
| 1187 |
"\n",
|
| 1188 |
+
"# ─── Pre-pull timing.json from HF if resuming (best-effort) ────────────────\n",
|
| 1189 |
+
"_hf_repo = getattr(train_cfg.hf_hub, \"repo_id\", None) if train_cfg.hf_hub.enabled else None\n",
|
| 1190 |
+
"_hf_token = os.environ.get(\"HF_TOKEN\")\n",
|
| 1191 |
+
"if _is_resume and \"RESUME_RUN_ID\" in dir() and RESUME_RUN_ID:\n",
|
| 1192 |
+
" _pull_timing_from_hf(RESUME_RUN_ID, CKPT_ROOT, _hf_repo, _hf_token)\n",
|
| 1193 |
+
"\n",
|
| 1194 |
"# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
|
| 1195 |
"_t0_stage1 = _time.time()\n",
|
| 1196 |
"_iso_start_stage1 = _dt.now(_tz.utc).isoformat(timespec=\"seconds\")\n",
|
|
|
|
| 1231 |
" })\n",
|
| 1232 |
" _timing_path.write_text(_json.dumps(_t, indent=2))\n",
|
| 1233 |
"\n",
|
| 1234 |
+
" # ─── Push to HF Hub so the timer survives a fresh VM ─────────────────\n",
|
| 1235 |
+
" _push_timing_to_hf(_run_id_now, CKPT_ROOT, _hf_repo, _hf_token)\n",
|
| 1236 |
+
"\n",
|
| 1237 |
" def _fmt(sec):\n",
|
| 1238 |
" h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
|
| 1239 |
" print()\n",
|
|
|
|
| 1280 |
"from datetime import datetime as _dt, timezone as _tz\n",
|
| 1281 |
"from pathlib import Path as _Path\n",
|
| 1282 |
"\n",
|
| 1283 |
+
"def _pull_timing_from_hf(run_id, ckpt_root, repo_id, token):\n",
|
| 1284 |
+
" # Pull timing.json from HF Hub for this run if not present locally.\n",
|
| 1285 |
+
" local = ckpt_root / run_id / \"timing.json\"\n",
|
| 1286 |
+
" if local.exists() or not repo_id or not token:\n",
|
| 1287 |
+
" return\n",
|
| 1288 |
+
" try:\n",
|
| 1289 |
+
" from huggingface_hub import hf_hub_download\n",
|
| 1290 |
+
" hf_hub_download(\n",
|
| 1291 |
+
" repo_id = repo_id,\n",
|
| 1292 |
+
" repo_type = \"model\",\n",
|
| 1293 |
+
" filename = f\"{run_id}/timing.json\",\n",
|
| 1294 |
+
" token = token,\n",
|
| 1295 |
+
" local_dir = str(ckpt_root),\n",
|
| 1296 |
+
" )\n",
|
| 1297 |
+
" print(f\"[TIMING] pulled previous timing.json from HF → {local}\")\n",
|
| 1298 |
+
" except Exception as e:\n",
|
| 1299 |
+
" # First run for this run_id → no remote file yet. That's fine.\n",
|
| 1300 |
+
" pass\n",
|
| 1301 |
+
"\n",
|
| 1302 |
+
"def _push_timing_to_hf(run_id, ckpt_root, repo_id, token):\n",
|
| 1303 |
+
" # Upload local timing.json to HF Hub under {run_id}/timing.json.\n",
|
| 1304 |
+
" local = ckpt_root / run_id / \"timing.json\"\n",
|
| 1305 |
+
" if not local.exists() or not repo_id or not token:\n",
|
| 1306 |
+
" return\n",
|
| 1307 |
+
" try:\n",
|
| 1308 |
+
" from huggingface_hub import HfApi\n",
|
| 1309 |
+
" HfApi(token=token).upload_file(\n",
|
| 1310 |
+
" path_or_fileobj = str(local),\n",
|
| 1311 |
+
" path_in_repo = f\"{run_id}/timing.json\",\n",
|
| 1312 |
+
" repo_id = repo_id,\n",
|
| 1313 |
+
" repo_type = \"model\",\n",
|
| 1314 |
+
" commit_message = f\"timing.json @ {run_id}\",\n",
|
| 1315 |
+
" )\n",
|
| 1316 |
+
" print(f\"[TIMING] uploaded timing.json to HF → {repo_id}/{run_id}/timing.json\")\n",
|
| 1317 |
+
" except Exception as e:\n",
|
| 1318 |
+
" print(f\"[TIMING] upload failed (non-fatal): {e}\")\n",
|
| 1319 |
+
"\n",
|
| 1320 |
+
"\n",
|
| 1321 |
"_resume_args = \"\"\n",
|
| 1322 |
"_is_resume = False\n",
|
| 1323 |
"if \"RESUME_FROM\" in dir() and RESUME_FROM and RESUME_STAGE == 2:\n",
|
|
|
|
| 1328 |
" _resume_args = f'--run_id \"{RESUME_RUN_ID}\"'\n",
|
| 1329 |
" print(\"▶ STAGE 2 fresh start, pinned to run_id\", RESUME_RUN_ID)\n",
|
| 1330 |
"else:\n",
|
| 1331 |
+
" # ─── FIX: pin stage 2 to the run_id stage 1 just wrote ────────────────\n",
|
| 1332 |
+
" # Without this, train.py treats stage 2 as a brand-new launch and\n",
|
| 1333 |
+
" # allocates a NEW run_N folder, splitting stage1/stage2 across two runs.\n",
|
| 1334 |
+
" _state_file = CKPT_ROOT / \"run_id.txt\"\n",
|
| 1335 |
+
" if _state_file.exists():\n",
|
| 1336 |
+
" _pinned = _state_file.read_text().strip()\n",
|
| 1337 |
+
" _resume_args = f'--run_id \"{_pinned}\"'\n",
|
| 1338 |
+
" print(f\"▶ STAGE 2 fresh, auto-pinned to run_id from state file: {_pinned}\")\n",
|
| 1339 |
+
" else:\n",
|
| 1340 |
+
" print(\"▶ STAGE 2 fresh (no state file — train.py will allocate a new run_id)\")\n",
|
| 1341 |
+
"\n",
|
| 1342 |
+
"# ─── Pre-pull timing.json from HF (in case of fresh VM) ───────────────────\n",
|
| 1343 |
+
"_hf_repo = getattr(train_cfg.hf_hub, \"repo_id\", None) if train_cfg.hf_hub.enabled else None\n",
|
| 1344 |
+
"_hf_token = os.environ.get(\"HF_TOKEN\")\n",
|
| 1345 |
+
"# Best guess at run_id BEFORE training (may be missing if stage 1 wasn't run here)\n",
|
| 1346 |
+
"_pre_state = CKPT_ROOT / \"run_id.txt\"\n",
|
| 1347 |
+
"if _pre_state.exists():\n",
|
| 1348 |
+
" _pull_timing_from_hf(_pre_state.read_text().strip(), CKPT_ROOT, _hf_repo, _hf_token)\n",
|
| 1349 |
"\n",
|
| 1350 |
"# ─── Start-of-stage timer ──────────────────────────────────────────────────\n",
|
| 1351 |
"_t0_stage2 = _time.time()\n",
|
|
|
|
| 1387 |
" })\n",
|
| 1388 |
" _timing_path.write_text(_json.dumps(_t, indent=2))\n",
|
| 1389 |
"\n",
|
| 1390 |
+
" # ─── Push to HF Hub ──────────────────────────────────────────────────\n",
|
| 1391 |
+
" _push_timing_to_hf(_run_id_now, CKPT_ROOT, _hf_repo, _hf_token)\n",
|
| 1392 |
+
"\n",
|
| 1393 |
" def _fmt(sec):\n",
|
| 1394 |
" h, r = divmod(int(sec), 3600); m, s = divmod(r, 60); return f\"{h:d}h {m:02d}m {s:02d}s\"\n",
|
| 1395 |
" _total = _t[\"stage1_elapsed_sec\"] + _t[\"stage2_elapsed_sec\"]\n",
|
|
|
|
| 1549 |
"import json as _json\n",
|
| 1550 |
"from pathlib import Path as _Path\n",
|
| 1551 |
"\n",
|
| 1552 |
+
"def _pull_timing_from_hf(run_id, ckpt_root, repo_id, token):\n",
|
| 1553 |
+
" # Pull timing.json from HF Hub for this run if not present locally.\n",
|
| 1554 |
+
" local = ckpt_root / run_id / \"timing.json\"\n",
|
| 1555 |
+
" if local.exists() or not repo_id or not token:\n",
|
| 1556 |
+
" return\n",
|
| 1557 |
+
" try:\n",
|
| 1558 |
+
" from huggingface_hub import hf_hub_download\n",
|
| 1559 |
+
" hf_hub_download(\n",
|
| 1560 |
+
" repo_id = repo_id,\n",
|
| 1561 |
+
" repo_type = \"model\",\n",
|
| 1562 |
+
" filename = f\"{run_id}/timing.json\",\n",
|
| 1563 |
+
" token = token,\n",
|
| 1564 |
+
" local_dir = str(ckpt_root),\n",
|
| 1565 |
+
" )\n",
|
| 1566 |
+
" print(f\"[TIMING] pulled previous timing.json from HF → {local}\")\n",
|
| 1567 |
+
" except Exception as e:\n",
|
| 1568 |
+
" # First run for this run_id → no remote file yet. That's fine.\n",
|
| 1569 |
+
" pass\n",
|
| 1570 |
+
"\n",
|
| 1571 |
+
"def _push_timing_to_hf(run_id, ckpt_root, repo_id, token):\n",
|
| 1572 |
+
" # Upload local timing.json to HF Hub under {run_id}/timing.json.\n",
|
| 1573 |
+
" local = ckpt_root / run_id / \"timing.json\"\n",
|
| 1574 |
+
" if not local.exists() or not repo_id or not token:\n",
|
| 1575 |
+
" return\n",
|
| 1576 |
+
" try:\n",
|
| 1577 |
+
" from huggingface_hub import HfApi\n",
|
| 1578 |
+
" HfApi(token=token).upload_file(\n",
|
| 1579 |
+
" path_or_fileobj = str(local),\n",
|
| 1580 |
+
" path_in_repo = f\"{run_id}/timing.json\",\n",
|
| 1581 |
+
" repo_id = repo_id,\n",
|
| 1582 |
+
" repo_type = \"model\",\n",
|
| 1583 |
+
" commit_message = f\"timing.json @ {run_id}\",\n",
|
| 1584 |
+
" )\n",
|
| 1585 |
+
" print(f\"[TIMING] uploaded timing.json to HF → {repo_id}/{run_id}/timing.json\")\n",
|
| 1586 |
+
" except Exception as e:\n",
|
| 1587 |
+
" print(f\"[TIMING] upload failed (non-fatal): {e}\")\n",
|
| 1588 |
+
"\n",
|
| 1589 |
+
"\n",
|
| 1590 |
"_run_id_file = CKPT_ROOT / \"run_id.txt\"\n",
|
| 1591 |
"assert _run_id_file.exists(), \"No run_id.txt — train at least one stage first.\"\n",
|
| 1592 |
"_run_id = _run_id_file.read_text().strip()\n",
|
| 1593 |
+
"\n",
|
| 1594 |
+
"# Pull the latest timing.json from HF in case we're on a fresh VM.\n",
|
| 1595 |
+
"_hf_repo = getattr(train_cfg.hf_hub, \"repo_id\", None) if train_cfg.hf_hub.enabled else None\n",
|
| 1596 |
+
"_hf_token = os.environ.get(\"HF_TOKEN\")\n",
|
| 1597 |
+
"_pull_timing_from_hf(_run_id, CKPT_ROOT, _hf_repo, _hf_token)\n",
|
| 1598 |
+
"\n",
|
| 1599 |
"_timing_path = CKPT_ROOT / _run_id / \"timing.json\"\n",
|
| 1600 |
+
"assert _timing_path.exists(), (\n",
|
| 1601 |
+
" f\"No timing.json under {_timing_path.parent} (also not on HF). \"\n",
|
| 1602 |
+
" f\"Was the stage cell run via the wrapped version?\"\n",
|
| 1603 |
+
")\n",
|
| 1604 |
"\n",
|
| 1605 |
"_t = _json.loads(_timing_path.read_text())\n",
|
| 1606 |
"\n",
|