GenSeg-Baselines / code /scripts /h800_run_unified512.py
MaybeRichard's picture
code: complete eval pipeline (7 metrics + per-class + Wilcoxon) + Swin-UNet/TransUNet networks; remove backups/obsolete
1a18f22 verified
Raw
History Blame Contribute Delete
3.87 kB
"""8-GPU pool runner for the UNIFIED-512 conv retrain on h800 (L20Y x8).
Mirrors the baseline grid: 4 conv archs x the existing (dataset,protocol,seed) cells,
all at img_size 512. 1 job per GPU, resumable (skips cells whose metrics.json exists),
per-job log. occupy.py auto-yields. Run detached; tail /tmp/unified512_runner.log.
"""
import os
import shlex
import subprocess
import sys
import time
# --- Filesystem layout -------------------------------------------------------
# Source tree holding framework/train.py and framework/test.py.
CODE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Code"
# Root directory passed to the scripts as --data_root.
DATA = "/data/temp/NPJ-ACM/Data"
# Working directory of every job; results land in WORK/results/unified512/...
WORK = "/data/temp/NPJ-ACM/work"
# Interpreter used to launch the training / testing scripts.
PY = "/data/temp/miniconda3/envs/seggen/bin/python"
# One log file per job is written here.
LOGD = f"{WORK}/logs_unified512"
# Exported as both http_proxy and https_proxy for each job.
PROXY = "http://10.140.15.68:3128"

# --- Run configuration -------------------------------------------------------
NGPU = 8        # size of the GPU pool (one job per GPU)
IMG = 512       # value passed as --img_size
BATCH = 8       # value passed as --batch_size
EPOCHS = 300    # value passed as --epochs

ARCHS = [
    "unet",
    "unetpp",
    "deeplabv3plus",
    "attention_unet",
]

# Each entry is (dataset, protocol, [seeds]); every entry is crossed with ARCHS.
CELLS = [
    ("acdc_png",            "official", [0, 1, 2]),
    ("busi",                "fold01",   [0, 1, 2]),
    ("cvc_clinicdb",        "official", [0, 1, 2]),
    ("fives",               "official", [0, 1, 2]),
    ("idridd_segmentation", "fold01",   [0, 1, 2]),
    ("kvasir_seg",          "official", [0, 1, 2]),
    ("medsegdb_isic2018",   "holdout",  [0, 1, 2]),
    ("medsegdb_kits19",     "fold01",   [0, 1, 2]),
    ("refuge2",             "official", [0, 1, 2]),
    ("pannuke_semantic",    "fold01",   [0, 1, 2]),
    ("pannuke_semantic",    "fold02",   [0]),
    ("pannuke_semantic",    "fold03",   [0]),
]
# Make sure the log directory and the working directory exist before any job
# is launched (same creation order as before: logs first, then WORK).
for _needed_dir in (LOGD, WORK):
    os.makedirs(_needed_dir, exist_ok=True)

# Expand the grid: every (dataset, protocol) cell x every architecture x every
# seed of that cell.  "out" is the metrics file whose presence marks the job as
# finished; "tag" names the per-job log file and appears in runner messages.
jobs = [
    {
        "ds": dataset,
        "proto": protocol,
        "arch": arch,
        "seed": seed,
        "out": f"{WORK}/results/unified512/{dataset}_{protocol}/{arch}/seed{seed}/metrics.json",
        "tag": f"{dataset}_{protocol}_{arch}_s{seed}",
    }
    for dataset, protocol, seeds in CELLS
    for arch in ARCHS
    for seed in seeds
]
def make_cmd(j, gpu):
    """Build the bash command line that trains and then tests one job.

    Every interpolated path, URL and job field is passed through
    ``shlex.quote`` so that a value containing whitespace or shell
    metacharacters stays a single word when the string is run with
    ``bash -lc``.  Values made only of shell-safe characters are emitted
    unchanged, so the command for the current configuration is identical to
    the unquoted form.

    Args:
        j: Job dict; the keys "ds", "proto", "arch" and "seed" are read.
        gpu: GPU index exported as CUDA_VISIBLE_DEVICES for this job.

    Returns:
        One shell string: environment exports, ``cd`` into WORK, the train.py
        invocation and the test.py invocation, chained with ``&&`` so each
        step only runs if the previous one exited with status 0.
    """
    q = shlex.quote
    dataset = q(str(j["ds"]))
    protocol = q(str(j["proto"]))
    arch = q(str(j["arch"]))
    seed = q(str(j["seed"]))
    python = q(PY)
    data_root = q(DATA)
    proxy = q(PROXY)
    # Quote the joined script path as a whole so a space in CODE cannot split it.
    train_script = q(f"{CODE}/framework/train.py")
    test_script = q(f"{CODE}/framework/test.py")
    return (
        f"export CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={q(str(gpu))} "
        f"OMP_NUM_THREADS=8 MKL_NUM_THREADS=8 OPENBLAS_NUM_THREADS=8 "
        f"https_proxy={proxy} http_proxy={proxy} && cd {q(WORK)} && "
        f"{python} {train_script} --data_root {data_root} --dataset {dataset} --protocol {protocol} "
        f"--arch {arch} --img_size {IMG} --batch_size {BATCH} --num_workers 8 --amp bf16 "
        f"--exp_name unified512 --seed {seed} --no-visualize --encoder resnet50 --encoder_weights imagenet "
        f"--epochs {EPOCHS} && "
        f"{python} {test_script} --data_root {data_root} --dataset {dataset} --protocol {protocol} "
        f"--arch {arch} --img_size {IMG} --exp_name unified512 --seed {seed} --encoder resnet50"
    )
# Resume support: only jobs whose metrics file is missing still need to run.
pending = [job for job in jobs if not os.path.isfile(job["out"])]
print(f"[runner] total={len(jobs)} done={len(jobs)-len(pending)} pending={len(pending)}", flush=True)

running = {}                # gpu index -> (process, job, start time, log file)
free = list(range(NGPU))    # idle GPU indices, kept in ascending order
done = ok = fail = 0        # finished / succeeded / failed counters
i = 0                       # index of the next pending job to launch

while running or i < len(pending):
    # Hand one pending job to every idle GPU, lowest index first.
    while free and i < len(pending):
        gpu = free.pop(0)
        job = pending[i]
        i += 1
        log_file = open(f"{LOGD}/{job['tag']}.log", "w")
        proc = subprocess.Popen(
            ["bash", "-lc", make_cmd(job, gpu)],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )
        running[gpu] = (proc, job, time.time(), log_file)
        print(f"[launch] gpu{gpu} {job['tag']}", flush=True)

    # Polling interval between checks of the running jobs.
    time.sleep(20)

    # Iterate over a snapshot of the keys because entries are deleted below.
    for gpu in list(running):
        proc, job, started, log_file = running[gpu]
        if proc.poll() is None:
            continue  # still running

        log_file.close()
        done += 1
        # Success is defined by the metrics file existing, not by the exit code.
        succeeded = os.path.isfile(job["out"])
        if succeeded:
            ok += 1
        else:
            fail += 1
        elapsed_min = (time.time() - started) / 60
        print(f"[finish] gpu{gpu} {job['tag']} rc={proc.returncode} ok={succeeded} "
              f"{elapsed_min:.0f}min ({done}/{len(pending)} done, {fail} failed)", flush=True)

        # Return the GPU to the pool, keeping the pool sorted.
        del running[gpu]
        free.append(gpu)
        free.sort()

print(f"[runner] ALL DONE. ok={ok} fail={fail} of {len(pending)}", flush=True)
print("UNIFIED512_RUNNER_DONE", flush=True)