# GenSeg-Baselines/code/scripts/gen_hires_512_manifest.py
# (Uploaded to the Hugging Face Hub by MaybeRichard via huggingface_hub, commit 057ec4b.)
"""Rebuild the 768 recap manifest at R=512 for fives/refuge2/idridd (user chose to
drop 768->512: still high-res & resolution-fair, 300ep/3seed protocol intact, but
~2.3x less compute -> ~6-8 days instead of ~12-18). Reuses the capped 768 manifest's
36 jobs (thread caps already injected; cv2.setNumThreads(1) is in the code) and only:
--img_size 768 -> 512, --batch_size 4 -> 8 (batch 8 matches finished kvasir/busi@512).
expected_output paths unchanged (metrics.json gets written at whatever res trained).
Run ON a100: TS=$(date -u +%Y%m%dT%H%M%SZ) python3 scripts/gen_hires_512_manifest.py
"""
import json
import os
import re
# Capped 768 manifest whose jobs are re-targeted to 512.
SRC = os.path.expanduser("~/.aris_queue/runs/hires768_20260611T004855Z/manifest.json")

# The trailing \b makes the value match as a whole number. A plain substring
# test would also accept "--batch_size 48" and rewrite it to "--batch_size 88".
_IMG_SIZE_768 = re.compile(r"--img_size 768\b")
_BATCH_SIZE_4 = re.compile(r"--batch_size 4\b")


def _retarget_job(job):
    """Return a shallow copy of *job* switched from 768/batch 4 to 512/batch 8.

    The id prefix ``rc_hr_`` is renamed to ``r512_``; every other field of the
    job (e.g. its expected output path) is carried over unchanged.

    Raises:
        SystemExit: if the command lacks ``--img_size 768`` or
            ``--batch_size 4``, so an unexpected job is never passed through
            silently at the wrong resolution.
    """
    cmd, img_hits = _IMG_SIZE_768.subn("--img_size 512", job["cmd"])
    cmd, batch_hits = _BATCH_SIZE_4.subn("--batch_size 8", cmd)
    if not img_hits or not batch_hits:
        raise SystemExit("anchor not found in " + job["id"])
    retargeted = dict(job)
    retargeted["id"] = job["id"].replace("rc_hr_", "r512_")
    retargeted["cmd"] = cmd
    return retargeted


def build_manifest(man):
    """Build the 512 manifest dict from the loaded 768 manifest *man*.

    Jobs are taken from the top-level ``jobs`` list when it is non-empty,
    otherwise from the first phase. The input dict is not modified.

    Returns:
        A new manifest dict with all top-level settings of *man* except
        ``phases``/``jobs``, the project renamed, the GPU free-memory gate
        lowered, and a single ``r512`` phase holding the rewritten jobs.

    Raises:
        SystemExit: if there are no jobs, or a job misses a rewrite anchor.
    """
    src_jobs = man.get("jobs") or man["phases"][0]["jobs"]
    jobs = [_retarget_job(job) for job in src_jobs]
    if not jobs:
        raise SystemExit("no jobs")
    # Drop both job containers: keeping a flat top-level "jobs" list would
    # carry the stale 768 jobs into the new manifest next to the r512 phase.
    man2 = {k: v for k, v in man.items() if k not in ("phases", "jobs")}
    man2["project"] = "baselines_hires_512"
    # GPU5 hosts a non-campaign DDIM job (~43GB); lower the free-mem gate so the queue can
    # still place light 512 jobs (~16GB) in GPU5's spare room instead of pinning to GPU4.
    man2["gpu_free_threshold_mib"] = 30000
    man2["phases"] = [{"name": "r512", "depends_on": [], "jobs": jobs}]
    return man2


def main():
    """Read the 768 manifest, write the 512 manifest into a new run directory.

    The run directory is ``~/.aris_queue/runs/hires512_<TS>``, where ``TS``
    comes from the environment; its ``logs`` subdirectory is created as well.

    Raises:
        SystemExit: if ``TS`` is unset or empty, or the manifest is unusable.
    """
    # Check TS first so a missing variable fails before any work is done.
    ts = os.environ.get("TS")
    if not ts:
        raise SystemExit(
            "TS is not set; run as: "
            "TS=$(date -u +%Y%m%dT%H%M%SZ) python3 scripts/gen_hires_512_manifest.py"
        )
    with open(SRC, encoding="utf-8") as fh:
        man = json.load(fh)
    man2 = build_manifest(man)

    run = "hires512_" + ts
    run_dir = os.path.expanduser("~/.aris_queue/runs/" + run)
    os.makedirs(os.path.join(run_dir, "logs"), exist_ok=True)
    # The context manager guarantees the manifest is flushed and closed.
    with open(os.path.join(run_dir, "manifest.json"), "w", encoding="utf-8") as fh:
        json.dump(man2, fh, indent=2)

    jobs = man2["phases"][0]["jobs"]
    print("RUN=" + run, "jobs=" + str(len(jobs)))
    print("sample:", jobs[0]["cmd"][:200])


if __name__ == "__main__":
    main()