"""Build the resolution-aligned RE-TRAIN manifest.

Covers the 4 fully-conv framework methods on the 6 resolution-mismatched
datasets, at a per-dataset higher img_size (matching ~nnU-Net's working
resolution). Reuses each arch's original command, swaps in --img_size R +
smaller batch for the bigger inputs, and injects CUDA_DEVICE_ORDER=PCI_BUS_ID
(else jobs land on busy L40s). 4 archs x 6 datasets x 3 seeds = 72 jobs.

Run as a script with the TS environment variable set; the manifest is written
to ~/.aris_queue/runs/hires_<TS>/manifest.json.
"""
import glob
import json
import os
import re

# One row per dataset: (dataset, protocol, img_size, batch_size).
DS = [
    ("cvc_clinicdb", "official", 384, 16),
    ("kvasir_seg", "official", 512, 8),
    ("busi", "fold01", 512, 8),
    ("fives", "official", 768, 4),
    ("refuge2", "official", 768, 4),
    ("idridd_segmentation", "fold01", 768, 4),
]
ARCHS = ["unet", "unetpp", "deeplabv3plus", "attention_unet"]

# Earlier queue runs whose seed-0 commands serve as templates.
_BASE_MANIFEST_GLOB = "~/.aris_queue/runs/20260605T13*/manifest.json"

_DATASET_ARCH_RE = re.compile(r"--dataset (\S+).*?--arch (\S+)")
# The (?!\d) lookahead keeps "--seed 0" / "--batch_size 16" from matching the
# prefix of a longer number such as "--seed 01" or "--batch_size 160".
_SEED0_RE = re.compile(r"--seed 0(?!\d)")
_BATCH16_RE = re.compile(r"--batch_size 16(?!\d)")


def manifest_jobs(manifest):
    """Return the job list of a queue manifest.

    Uses the top-level "jobs" entry when it is non-empty, otherwise the "jobs"
    of the first phase. A manifest with no (or an empty) "phases" entry yields
    an empty list instead of raising.
    """
    top_level = manifest.get("jobs")
    if top_level:
        return top_level
    phases = manifest.get("phases") or [{}]
    return phases[0].get("jobs", [])


def collect_base_cmds(manifests):
    """Map (dataset, arch) to the first seed-0 command found in *manifests*.

    Jobs whose command lacks a "--dataset X ... --arch Y" pair or is not a
    seed-0 run are ignored; later duplicates of a pair never overwrite the
    first one seen.
    """
    base = {}
    for manifest in manifests:
        for job in manifest_jobs(manifest):
            cmd = job.get("cmd", "")
            match = _DATASET_ARCH_RE.search(cmd)
            if match and _SEED0_RE.search(cmd):
                base.setdefault((match.group(1), match.group(2)), cmd)
    return base


def retarget_cmd(base_cmd, img_size, batch_size, seed):
    """Rewrite a seed-0 base command for a higher-resolution run.

    Injects CUDA_DEVICE_ORDER=PCI_BUS_ID next to the CUDA_VISIBLE_DEVICES
    export, replaces "--batch_size 16" with *batch_size*, and turns "--seed 0"
    into "--img_size <img_size> --seed <seed>". Commands whose batch size is
    not 16 keep their batch size unchanged.
    """
    cmd = base_cmd.replace(
        "export CUDA_VISIBLE_DEVICES=${GPU}",
        "export CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${GPU}",
    )
    cmd = _BATCH16_RE.sub("--batch_size %d" % batch_size, cmd)
    return _SEED0_RE.sub("--img_size %d --seed %d" % (img_size, seed), cmd)


def build_jobs(base):
    """Expand DS x ARCHS x seeds (0, 1, 2) into queue job dicts.

    Returns (jobs, missing), where *missing* lists every (dataset, arch) pair
    that has no base command in *base*; such pairs contribute no jobs.
    """
    jobs, missing = [], []
    for ds, proto, img_size, batch_size in DS:
        for arch in ARCHS:
            base_cmd = base.get((ds, arch))
            if not base_cmd:
                missing.append((ds, arch))
                continue
            for seed in (0, 1, 2):
                jobs.append({
                    "id": "hr_%s_%s_s%d" % (ds, arch, seed),
                    "cmd": retarget_cmd(base_cmd, img_size, batch_size, seed),
                    "expected_output": "results/baselines/%s_%s/%s/seed%d/metrics.json"
                                       % (ds, proto, arch, seed),
                })
    return jobs, missing


def main():
    """Read the base manifests, build the hi-res jobs and write the manifest.

    Exits with a message when any (dataset, arch) pair has no base command or
    when the TS environment variable is not set.
    """
    manifests = []
    for path in sorted(glob.glob(os.path.expanduser(_BASE_MANIFEST_GLOB))):
        with open(path) as fh:
            manifests.append(json.load(fh))

    jobs, missing = build_jobs(collect_base_cmds(manifests))
    if missing:
        raise SystemExit("missing base cmds: %s" % missing)

    manifest = {
        "project": "baselines_hires",
        "cwd": "/home/wzhang/LSC/Code/NPJ",
        "conda": "seggen",
        "ssh": "a100",
        "gpus": [4, 5],
        "jobs_per_gpu": 2,
        "max_parallel": 4,
        "gpu_free_threshold_mib": 40000,
        "oom_retry": {"delay": 300, "max_attempts": 3},
        "phases": [{"name": "hr", "depends_on": [], "jobs": jobs}],
    }

    ts = os.environ.get("TS")
    if ts is None:
        raise SystemExit("TS environment variable is not set (needed for the run name)")
    run = "hires_" + ts
    run_dir = os.path.expanduser("~/.aris_queue/runs/" + run)
    os.makedirs(run_dir + "/logs", exist_ok=True)
    # Close the file explicitly so the manifest is flushed before we report.
    with open(run_dir + "/manifest.json", "w") as fh:
        json.dump(manifest, fh, indent=2)
    print("RUN=" + run, "jobs=" + str(len(jobs)))


if __name__ == "__main__":
    main()
|
|