"""8-GPU pool runner for the UNIFIED-512 conv retrain on h800 (L20Y x8).

Mirrors the baseline grid: 4 conv archs x the existing (dataset,protocol,seed) cells,
all at img_size 512. 1 job per GPU, resumable (skips cells whose metrics.json exists),
per-job log. occupy.py auto-yields. Run detached; tail /tmp/unified512_runner.log.
"""
import os
import subprocess
import sys
import time

CODE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Code"
DATA = "/data/temp/NPJ-ACM/Data"
WORK = "/data/temp/NPJ-ACM/work"  # CWD; results -> WORK/results/unified512/...
PY = "/data/temp/miniconda3/envs/seggen/bin/python"
LOGD = WORK + "/logs_unified512"
PROXY = "http://10.140.15.68:3128"
NGPU, IMG, BATCH, EPOCHS = 8, 512, 8, 300

ARCHS = ["unet", "unetpp", "deeplabv3plus", "attention_unet"]
CELLS = [  # (dataset, protocol, [seeds]) -- matches existing baseline structure
    ("acdc_png", "official", [0, 1, 2]),
    ("busi", "fold01", [0, 1, 2]),
    ("cvc_clinicdb", "official", [0, 1, 2]),
    ("fives", "official", [0, 1, 2]),
    ("idridd_segmentation", "fold01", [0, 1, 2]),
    ("kvasir_seg", "official", [0, 1, 2]),
    ("medsegdb_isic2018", "holdout", [0, 1, 2]),
    ("medsegdb_kits19", "fold01", [0, 1, 2]),
    ("refuge2", "official", [0, 1, 2]),
    ("pannuke_semantic", "fold01", [0, 1, 2]),
    ("pannuke_semantic", "fold02", [0]),
    ("pannuke_semantic", "fold03", [0]),
]


def _build_jobs() -> list:
    """Expand CELLS x ARCHS x seeds into the flat, ordered list of job dicts.

    Each job dict carries the keys ``ds``, ``proto``, ``arch``, ``seed``,
    ``out`` (path of the metrics.json used as the completion marker) and
    ``tag`` (used for the per-job log file name and for progress messages).
    """
    built = []
    for ds, proto, seeds in CELLS:
        for arch in ARCHS:
            for s in seeds:
                out = f"{WORK}/results/unified512/{ds}_{proto}/{arch}/seed{s}/metrics.json"
                built.append({"ds": ds, "proto": proto, "arch": arch, "seed": s,
                              "out": out, "tag": f"{ds}_{proto}_{arch}_s{s}"})
    return built


# Pure data: building the grid touches neither the filesystem nor any process.
jobs = _build_jobs()


def make_cmd(j: dict, gpu: int) -> str:
    """Return the shell command line that trains and then tests job *j* on *gpu*.

    The command exports the GPU / thread-count / proxy environment, changes
    into WORK, runs ``train.py`` and, only if that succeeds (``&&``), runs
    ``test.py`` with the matching dataset/protocol/arch/seed.

    Args:
        j: a job dict as produced for ``jobs`` (reads ``ds``, ``proto``,
            ``arch`` and ``seed``).
        gpu: the GPU index written into ``CUDA_VISIBLE_DEVICES``.

    Returns:
        A single string intended to be passed to ``bash -lc``.
    """
    return (
        f"export CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={gpu} "
        f"OMP_NUM_THREADS=8 MKL_NUM_THREADS=8 OPENBLAS_NUM_THREADS=8 "
        f"https_proxy={PROXY} http_proxy={PROXY} && cd {WORK} && "
        f"{PY} {CODE}/framework/train.py --data_root {DATA} --dataset {j['ds']} --protocol {j['proto']} "
        f"--arch {j['arch']} --img_size {IMG} --batch_size {BATCH} --num_workers 8 --amp bf16 "
        f"--exp_name unified512 --seed {j['seed']} --no-visualize --encoder resnet50 --encoder_weights imagenet "
        f"--epochs {EPOCHS} && "
        f"{PY} {CODE}/framework/test.py --data_root {DATA} --dataset {j['ds']} --protocol {j['proto']} "
        f"--arch {j['arch']} --img_size {IMG} --exp_name unified512 --seed {j['seed']} --encoder resnet50"
    )


def _launch(j: dict, gpu: int) -> tuple:
    """Start job *j* on *gpu*; return ``(Popen, job, start_time, log_file)``.

    The per-job log is opened in ``"w"`` mode, so a re-run of the same cell
    overwrites its previous log. If the process cannot be spawned, the log
    handle is closed before the exception propagates, so it is not leaked.
    """
    lf = open(f"{LOGD}/{j['tag']}.log", "w")
    try:
        p = subprocess.Popen(["bash", "-lc", make_cmd(j, gpu)],
                             stdout=lf, stderr=subprocess.STDOUT)
    except BaseException:
        lf.close()
        raise
    return (p, j, time.time(), lf)


def main() -> None:
    """Run every pending job, at most one per GPU, until all have finished.

    A job counts as already done (and is skipped) when its ``out`` file
    exists. After a job's process exits, success is likewise judged only by
    the presence of that file; the return code is reported but not used.
    """
    os.makedirs(LOGD, exist_ok=True)
    os.makedirs(WORK, exist_ok=True)

    pending = [j for j in jobs if not os.path.isfile(j["out"])]
    print(f"[runner] total={len(jobs)} done={len(jobs)-len(pending)} pending={len(pending)}", flush=True)

    running = {}  # gpu -> (Popen, job, start, log file handle)
    free = list(range(NGPU))
    done = ok = fail = 0
    i = 0
    try:
        while i < len(pending) or running:
            # Fill every idle GPU, lowest index first.
            while free and i < len(pending):
                gpu = free.pop(0)
                j = pending[i]
                i += 1
                running[gpu] = _launch(j, gpu)
                print(f"[launch] gpu{gpu} {j['tag']}", flush=True)
            time.sleep(20)
            # Iterate over a snapshot: finished entries are deleted from `running`.
            for gpu, (p, j, st, lf) in list(running.items()):
                if p.poll() is None:
                    continue  # still running
                lf.close()
                done += 1
                okj = os.path.isfile(j["out"])
                ok += okj
                fail += (not okj)
                mins = (time.time() - st) / 60
                print(f"[finish] gpu{gpu} {j['tag']} rc={p.returncode} ok={okj} "
                      f"{mins:.0f}min ({done}/{len(pending)} done, {fail} failed)", flush=True)
                del running[gpu]
                free.append(gpu)
                free.sort()
    finally:
        # Only non-empty if the loop was interrupted; release the runner's log
        # handles. The child processes themselves are left untouched.
        for _p, _j, _st, lf in running.values():
            lf.close()

    print(f"[runner] ALL DONE. ok={ok} fail={fail} of {len(pending)}", flush=True)
    print("UNIFIED512_RUNNER_DONE", flush=True)


if __name__ == "__main__":
    main()