| """Re-run manifest for the 10 framework PanNuke fold02/03 jobs that died on a |
| transient GPU4 uncorrectable-ECC burst. Excludes attention_unet (those 2 are |
| finishing in the original queue). Lower concurrency (jobs_per_gpu=2).""" |
| import json, glob, re, os |
|
|
def load_fold01_cmds(manifest_paths):
    """Collect one PanNuke fold01 / seed-0 command line per architecture.

    Each path is read as a JSON manifest. Jobs are taken from the top-level
    "jobs" list when it is present and non-empty, otherwise from the "jobs"
    list of the first entry in "phases". A job qualifies when its "cmd"
    contains "pannuke", "--protocol fold01" and "--seed 0"; the architecture
    name is whatever follows "--arch" in that command.

    Args:
        manifest_paths: Iterable of manifest.json paths, visited in order.

    Returns:
        Dict mapping architecture name to the first qualifying command seen
        for it (later duplicates are ignored).
    """
    cmds = {}
    for path in manifest_paths:
        # Close every manifest right after parsing instead of leaving the
        # handle for the garbage collector.
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
        # A manifest with a missing or empty "phases" list contributes no
        # jobs rather than raising IndexError on phases[0].
        phases = data.get("phases") or [{}]
        for job in data.get("jobs") or phases[0].get("jobs", []):
            cmd = job.get("cmd", "")
            if "pannuke" in cmd and "--protocol fold01" in cmd and "--seed 0" in cmd:
                arch = re.search(r"--arch (\S+)", cmd)
                if arch:
                    # setdefault keeps the first command found per arch.
                    cmds.setdefault(arch.group(1), cmd)
    return cmds


# Template commands, keyed by arch, harvested from the earlier queue runs.
base = load_fold01_cmds(
    sorted(glob.glob(os.path.expanduser("~/.aris_queue/runs/20260605T13*/manifest.json")))
)
|
|
# Architectures to re-run; attention_unet is left out on purpose (see the
# module docstring).
archs = ["unet", "unetpp", "deeplabv3plus", "transunet", "swinunet"]

# One job per (fold, arch): reuse the fold01 command with only the protocol
# flag swapped, and point expected_output at that fold's metrics file.
jobs = [
    {
        "id": f"fw_{fold}_{arch}",
        "cmd": base[arch].replace("--protocol fold01", f"--protocol {fold}"),
        "expected_output": f"results/baselines/pannuke_semantic_{fold}/{arch}/seed0/metrics.json",
    }
    for fold in ("fold02", "fold03")
    for arch in archs
]
|
|
# Single phase with no dependencies that carries every re-run job.
fw_phase = {"name": "fw", "depends_on": [], "jobs": jobs}

# Queue manifest written out below; key order is kept as in the literal so
# the serialized JSON is unchanged.
manifest = {
    "project": "pannuke_fw_rerun",
    "cwd": "/home/wzhang/LSC/Code/NPJ",
    "conda": "seggen",
    "ssh": "a100",
    "gpus": [4, 5],
    "jobs_per_gpu": 2,
    "max_parallel": 4,
    "gpu_free_threshold_mib": 60000,
    "oom_retry": {"delay": 240, "max_attempts": 3},
    "phases": [fw_phase],
}
|
|
# The run name is "pannuke_fwre_<TS>". TS must be set in the environment; if
# it is missing, the KeyError is raised here, before any directory is created.
RUN = "pannuke_fwre_" + os.environ["TS"]
rd = os.path.expanduser("~/.aris_queue/runs/" + RUN)
os.makedirs(os.path.join(rd, "logs"), exist_ok=True)

# Write through a context manager so the manifest is flushed and closed
# before the script reports the run, instead of relying on garbage
# collection of an anonymous file object.
with open(os.path.join(rd, "manifest.json"), "w", encoding="utf-8") as manifest_file:
    json.dump(manifest, manifest_file, indent=2)

print("RUN=" + RUN, "jobs=" + str(len(jobs)))
|
|