code: complete eval pipeline (7 metrics + per-class + Wilcoxon) + Swin-UNet/TransUNet networks; remove backups/obsolete
"""8-GPU pool runner for the UNIFIED-512 conv retrain on h800 (L20Y x8).

Mirrors the baseline grid: 4 conv archs x the existing (dataset, protocol, seed) cells,
all at img_size 512. Runs 1 job per GPU, is resumable (skips cells whose metrics.json
already exists), and writes one log per job. occupy.py auto-yields. Run detached and
tail /tmp/unified512_runner.log.
"""
| import os, sys, subprocess, time | |
# --- Filesystem layout and interpreter (absolute paths on the training host) ---
CODE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Code"
DATA = "/data/temp/NPJ-ACM/Data"
WORK = "/data/temp/NPJ-ACM/work"  # CWD; results -> WORK/results/unified512/...
PY = "/data/temp/miniconda3/envs/seggen/bin/python"
LOGD = f"{WORK}/logs_unified512"  # one log file per job is written here
PROXY = "http://10.140.15.68:3128"  # exported as http_proxy / https_proxy for every job

# --- Run configuration ---
NGPU = 8      # number of GPU slots in the pool (one job per slot)
IMG = 512     # passed as --img_size
BATCH = 8     # passed as --batch_size
EPOCHS = 300  # passed as --epochs

# Architectures trained for every cell of the grid.
ARCHS = [
    "unet",
    "unetpp",
    "deeplabv3plus",
    "attention_unet",
]

# (dataset, protocol, [seeds]) -- matches existing baseline structure
CELLS = [
    ("acdc_png", "official", [0, 1, 2]),
    ("busi", "fold01", [0, 1, 2]),
    ("cvc_clinicdb", "official", [0, 1, 2]),
    ("fives", "official", [0, 1, 2]),
    ("idridd_segmentation", "fold01", [0, 1, 2]),
    ("kvasir_seg", "official", [0, 1, 2]),
    ("medsegdb_isic2018", "holdout", [0, 1, 2]),
    ("medsegdb_kits19", "fold01", [0, 1, 2]),
    ("refuge2", "official", [0, 1, 2]),
    ("pannuke_semantic", "fold01", [0, 1, 2]),
    ("pannuke_semantic", "fold02", [0]),
    ("pannuke_semantic", "fold03", [0]),
]
# Make sure the per-job log directory and the working directory exist up front.
os.makedirs(LOGD, exist_ok=True)
os.makedirs(WORK, exist_ok=True)

# Expand the grid into one job record per (cell, architecture, seed).
# "out" is the metrics.json path whose existence marks the job as finished;
# "tag" is a unique label used for the log file name and progress messages.
jobs = [
    {
        "ds": ds,
        "proto": proto,
        "arch": arch,
        "seed": s,
        "out": f"{WORK}/results/unified512/{ds}_{proto}/{arch}/seed{s}/metrics.json",
        "tag": f"{ds}_{proto}_{arch}_s{s}",
    }
    for ds, proto, seeds in CELLS
    for arch in ARCHS
    for s in seeds
]
def make_cmd(j, gpu):
    """Build the shell command that trains and then tests one job on one GPU.

    Args:
        j: Job record; the keys "ds", "proto", "arch" and "seed" are read.
        gpu: GPU index exported as CUDA_VISIBLE_DEVICES for the job.

    Returns:
        A single shell command string of four steps chained with ``&&``:
        export the environment, ``cd`` into WORK, run train.py, run test.py.
    """
    # Environment exported once for both the train and the test step.
    env = " ".join([
        "export",
        "CUDA_DEVICE_ORDER=PCI_BUS_ID",
        f"CUDA_VISIBLE_DEVICES={gpu}",
        "OMP_NUM_THREADS=8",
        "MKL_NUM_THREADS=8",
        "OPENBLAS_NUM_THREADS=8",
        f"https_proxy={PROXY}",
        f"http_proxy={PROXY}",
    ])

    # Arguments that identify the cell; identical for train.py and test.py.
    cell = (
        f"--data_root {DATA} --dataset {j['ds']} --protocol {j['proto']} "
        f"--arch {j['arch']} --img_size {IMG}"
    )

    train = (
        f"{PY} {CODE}/framework/train.py {cell} "
        f"--batch_size {BATCH} --num_workers 8 --amp bf16 "
        f"--exp_name unified512 --seed {j['seed']} --no-visualize "
        f"--encoder resnet50 --encoder_weights imagenet --epochs {EPOCHS}"
    )
    test = (
        f"{PY} {CODE}/framework/test.py {cell} "
        f"--exp_name unified512 --seed {j['seed']} --encoder resnet50"
    )

    # '&&' ensures the test step only runs when training exited successfully.
    return " && ".join([env, f"cd {WORK}", train, test])
# Resume support: only cells without a metrics.json on disk still need to run.
pending = [job for job in jobs if not os.path.isfile(job["out"])]
n_jobs, n_pending = len(jobs), len(pending)
print(f"[runner] total={n_jobs} done={n_jobs - n_pending} pending={n_pending}", flush=True)

running = {}  # gpu -> (Popen, job, start time, open log file)
free = list(range(NGPU))  # idle GPU ids, kept sorted so the lowest id is reused first
done = ok = fail = 0
i = 0  # index into `pending` of the next job to launch

while running or i < n_pending:
    # Launch phase: hand the next queued job to every idle GPU.
    while free and i < n_pending:
        gpu = free.pop(0)
        job = pending[i]
        i += 1
        # The log handle stays open for the lifetime of the child process;
        # it is closed in the reap phase below.
        log_fh = open(f"{LOGD}/{job['tag']}.log", "w")
        proc = subprocess.Popen(
            ["bash", "-lc", make_cmd(job, gpu)],
            stdout=log_fh,
            stderr=subprocess.STDOUT,
        )
        running[gpu] = (proc, job, time.time(), log_fh)
        print(f"[launch] gpu{gpu} {job['tag']}", flush=True)

    # Poll interval between scheduling rounds.
    time.sleep(20)

    # Reap phase: iterate over a snapshot because entries are deleted in the loop.
    for gpu, (proc, job, started, log_fh) in list(running.items()):
        if proc.poll() is None:
            continue  # still running
        log_fh.close()
        done += 1
        # Success is judged by the metrics file existing, not by the exit code.
        succeeded = os.path.isfile(job["out"])
        if succeeded:
            ok += 1
        else:
            fail += 1
        minutes = (time.time() - started) / 60
        print(f"[finish] gpu{gpu} {job['tag']} rc={proc.returncode} ok={succeeded} "
              f"{minutes:.0f}min ({done}/{n_pending} done, {fail} failed)", flush=True)
        del running[gpu]
        free.append(gpu)
        free.sort()

print(f"[runner] ALL DONE. ok={ok} fail={fail} of {n_pending}", flush=True)
print("UNIFIED512_RUNNER_DONE", flush=True)