| """Rebuild a queue manifest for ONLY the 768-res datasets (fives/refuge2/idridd), |
| injecting CPU thread caps to fix the dataloader thread-oversubscription that starved |
| the GPU (epoch ~750s -> ~180s). Reuses the original hires manifest's jobs verbatim |
| (img_size 768, batch 4, num_workers 8, PCI_BUS_ID) and only: |
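| (1) keeps the 3 R=768 datasets' jobs, skipping any whose expected_output file already exists, |
| (2) appends OMP/MKL/OPENBLAS/NUMEXPR_NUM_THREADS=8 and VECLIB_MAXIMUM_THREADS=8 to each export line |
| (right after CUDA_VISIBLE_DEVICES=${GPU}), |
| (3) prefixes each kept job id with rc_. |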
| The already-finished cvc/kvasir/busi (512/384) cells are untouched. |
| Run ON a100: TS=$(date -u +%Y%m%dT%H%M%SZ) python3 scripts/gen_hires_768_recap_manifest.py |
| """ |
| import json, os, glob |
|
|
# Original hires manifest whose jobs are reused for the recap run.
SRC = os.path.expanduser("~/.aris_queue/runs/hires_20260610T021920Z/manifest.json")
# Job-id prefixes of the three datasets that are kept.
DS768_PREFIXES = ("hr_fives_", "hr_refuge2_", "hr_idridd_segmentation_")
# Thread-cap environment assignments added to each job's export line.
CAP = ("OMP_NUM_THREADS=8 MKL_NUM_THREADS=8 OPENBLAS_NUM_THREADS=8 "
       "NUMEXPR_NUM_THREADS=8 VECLIB_MAXIMUM_THREADS=8")
# Anchor text every kept job command must contain, and its capped replacement.
OLD = "export CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${GPU}"
NEW = OLD + " " + CAP


def select_recap_jobs(src_jobs, cwd="."):
    """Return capped copies of the jobs that still have to be (re)run.

    A job is kept when its id starts with one of ``DS768_PREFIXES`` and its
    ``expected_output`` file does not exist under *cwd*.  Each kept job is
    copied, its id is prefixed with ``rc_`` and every occurrence of ``OLD``
    in its command is replaced by ``NEW``.  The input dicts are not modified.

    Args:
        src_jobs: iterable of job dicts with "id", "cmd" and
            "expected_output" keys.
        cwd: directory against which "expected_output" is resolved.

    Returns:
        A list of new job dicts, in the order of *src_jobs*.

    Raises:
        SystemExit: if a kept job's command does not contain ``OLD``.
    """
    kept = []
    for job in src_jobs:
        if not job["id"].startswith(DS768_PREFIXES):
            continue
        # Output already on disk: the job is treated as finished and skipped.
        if os.path.isfile(os.path.join(cwd, job["expected_output"])):
            continue
        cmd = job["cmd"]
        if OLD not in cmd:
            raise SystemExit("export anchor not found in: " + job["id"])
        # Unpacking keeps the original key order while overriding id and cmd.
        kept.append({**job, "id": "rc_" + job["id"], "cmd": cmd.replace(OLD, NEW)})
    return kept


def build_recap_manifest(manifest):
    """Build the recap manifest dict from the source *manifest* dict.

    Source jobs are the top-level "jobs" list when it is present and
    non-empty, otherwise the jobs of the first phase.  All other top-level
    keys are carried over; "project" is overridden and "phases" is replaced
    by a single "rc768" phase holding the selected jobs.

    Raises:
        SystemExit: if no job is left to run, or a kept job lacks the export
            anchor (see ``select_recap_jobs``).
    """
    # Drop "jobs" as well as "phases": the output defines its own single
    # phase, and copying a flat source "jobs" list would leave the complete
    # unfiltered, uncapped job set in the new manifest next to it.
    top = {k: v for k, v in manifest.items() if k not in ("phases", "jobs")}
    src_jobs = manifest.get("jobs") or manifest["phases"][0]["jobs"]

    # A missing/null cwd falls back to the current directory; "~" is expanded
    # so the finished-output check looks in the real directory.
    cwd = os.path.expanduser(top.get("cwd") or ".")
    jobs = select_recap_jobs(src_jobs, cwd)
    if not jobs:
        raise SystemExit("no 768 jobs to (re)run")

    recap = dict(top)
    recap["project"] = "baselines_hires_768_recap"
    recap["phases"] = [{"name": "rc768", "depends_on": [], "jobs": jobs}]
    return recap


def main():
    """Read ``SRC``, write the recap manifest to a new run dir, print a summary.

    The run directory is ``~/.aris_queue/runs/hires768_<TS>``, where ``TS``
    comes from the environment; a ``logs`` sub-directory is created too.

    Raises:
        SystemExit: if ``TS`` is unset or empty, or if manifest building fails.
    """
    # Validate TS before doing any work; an empty value would otherwise
    # produce a run directory named just "hires768_".
    ts = os.environ.get("TS")
    if not ts:
        raise SystemExit("TS is not set; run as: TS=$(date -u +%Y%m%dT%H%M%SZ) python3 <this script>")

    with open(SRC, encoding="utf-8") as fh:
        manifest = json.load(fh)
    recap = build_recap_manifest(manifest)
    jobs = recap["phases"][0]["jobs"]

    run = "hires768_" + ts
    run_dir = os.path.expanduser("~/.aris_queue/runs/" + run)
    os.makedirs(os.path.join(run_dir, "logs"), exist_ok=True)
    # Context manager guarantees the manifest is flushed and closed.
    with open(os.path.join(run_dir, "manifest.json"), "w", encoding="utf-8") as fh:
        json.dump(recap, fh, indent=2)

    print("RUN=" + run, "jobs=" + str(len(jobs)))
    print("sample cmd:", jobs[0]["cmd"][:160])


if __name__ == "__main__":
    main()
|
|