GenSeg-Baselines / code /scripts /h800_run_unified512.py

code: complete eval pipeline (7 metrics + per-class + Wilcoxon) + Swin-UNet/TransUNet networks; remove backups/obsolete

1a18f22 verified 15 days ago

3.87 kB

	"""8-GPU pool runner for the UNIFIED-512 conv retrain on h800 (L20Y x8).
	Mirrors the baseline grid: 4 conv archs x the existing (dataset,protocol,seed) cells,
	all at img_size 512. 1 job per GPU, resumable (skips cells whose metrics.json exists),
	per-job log. occupy.py auto-yields. Run detached; tail /tmp/unified512_runner.log.
	"""
	import os, sys, subprocess, time

	# --- Filesystem layout -------------------------------------------------
	# Code checkout; framework/train.py and framework/test.py are run from here.
	CODE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Code"
	# Passed to both scripts as --data_root.
	DATA = "/data/temp/NPJ-ACM/Data"
	# CWD of every job; results -> WORK/results/unified512/...
	WORK = "/data/temp/NPJ-ACM/work"
	# Python interpreter used to launch train.py / test.py.
	PY = "/data/temp/miniconda3/envs/seggen/bin/python"
	# One <tag>.log file per job is written here.
	LOGD = f"{WORK}/logs_unified512"
	# Exported as both http_proxy and https_proxy for every job.
	PROXY = "http://10.140.15.68:3128"

	# --- Run configuration -------------------------------------------------
	NGPU = 8      # size of the GPU pool (GPU indices 0..NGPU-1)
	IMG = 512     # --img_size for train and test
	BATCH = 8     # --batch_size for train
	EPOCHS = 300  # --epochs for train
	ARCHS = ["unet", "unetpp", "deeplabv3plus", "attention_unet"]

	# (dataset, protocol, number of seeds); seeds are always 0..n-1.
	# Per the original note, this matches the existing baseline structure.
	_CELL_SPECS = (
	    ("acdc_png", "official", 3),
	    ("busi", "fold01", 3),
	    ("cvc_clinicdb", "official", 3),
	    ("fives", "official", 3),
	    ("idridd_segmentation", "fold01", 3),
	    ("kvasir_seg", "official", 3),
	    ("medsegdb_isic2018", "holdout", 3),
	    ("medsegdb_kits19", "fold01", 3),
	    ("refuge2", "official", 3),
	    ("pannuke_semantic", "fold01", 3),
	    ("pannuke_semantic", "fold02", 1),
	    ("pannuke_semantic", "fold03", 1),
	)
	# (dataset, protocol, [seeds]) -- the form the job builder iterates over.
	CELLS = [(ds, proto, list(range(n))) for ds, proto, n in _CELL_SPECS]
	# Make sure the log directory and the working directory exist before any
	# job is launched (same order as before: LOGD, then WORK).
	for _dir in (LOGD, WORK):
	    os.makedirs(_dir, exist_ok=True)

	# Expand the grid into one job dict per (cell, arch, seed).
	#   "out" is the metrics.json path used to decide whether a job is done;
	#   "tag" is the unique name used for the job's log file and status lines.
	jobs = [
	    {
	        "ds": ds,
	        "proto": proto,
	        "arch": arch,
	        "seed": seed,
	        "out": f"{WORK}/results/unified512/{ds}_{proto}/{arch}/seed{seed}/metrics.json",
	        "tag": f"{ds}_{proto}_{arch}_s{seed}",
	    }
	    for ds, proto, seeds in CELLS
	    for arch in ARCHS
	    for seed in seeds
	]

	def make_cmd(j, gpu):
	    """Build the shell command line that trains and then tests one job.

	    The command exports the GPU/thread/proxy environment, changes into WORK,
	    runs framework/train.py and, only if training exits successfully
	    (the steps are chained with ``&&``), runs framework/test.py.

	    Args:
	        j: job dict with the keys "ds", "proto", "arch" and "seed".
	        gpu: GPU index exported as CUDA_VISIBLE_DEVICES for this job.

	    Returns:
	        A single string intended to be passed to ``bash -lc``.
	    """
	    ds, proto, arch, seed = j["ds"], j["proto"], j["arch"], j["seed"]

	    # Environment: pin the job to one GPU, cap CPU threads, set the proxies.
	    env = " ".join([
	        "export CUDA_DEVICE_ORDER=PCI_BUS_ID",
	        f"CUDA_VISIBLE_DEVICES={gpu}",
	        "OMP_NUM_THREADS=8",
	        "MKL_NUM_THREADS=8",
	        "OPENBLAS_NUM_THREADS=8",
	        f"https_proxy={PROXY}",
	        f"http_proxy={PROXY}",
	    ])

	    train = " ".join([
	        f"{PY} {CODE}/framework/train.py",
	        f"--data_root {DATA} --dataset {ds} --protocol {proto}",
	        f"--arch {arch} --img_size {IMG} --batch_size {BATCH} --num_workers 8 --amp bf16",
	        f"--exp_name unified512 --seed {seed} --no-visualize",
	        "--encoder resnet50 --encoder_weights imagenet",
	        f"--epochs {EPOCHS}",
	    ])

	    test = " ".join([
	        f"{PY} {CODE}/framework/test.py",
	        f"--data_root {DATA} --dataset {ds} --protocol {proto}",
	        f"--arch {arch} --img_size {IMG} --exp_name unified512 --seed {seed}",
	        "--encoder resnet50",
	    ])

	    # Each step runs only if the previous one succeeded.
	    return " && ".join([env, f"cd {WORK}", train, test])

	# Resumable: a job whose metrics.json already exists is treated as done.
	pending = [job for job in jobs if not os.path.isfile(job["out"])]
	n_total, n_pending = len(jobs), len(pending)
	print(f"[runner] total={n_total} done={n_total - n_pending} pending={n_pending}", flush=True)

	active = {}                     # gpu -> (Popen, job, launch time, open log file)
	idle_gpus = list(range(NGPU))   # kept sorted, so the lowest free index is used first
	backlog = pending[::-1]         # reversed copy: pop() yields jobs in their original order
	n_finished = n_ok = n_fail = 0

	while backlog or active:
	    # Launch phase: put one queued job on every idle GPU.
	    while idle_gpus and backlog:
	        gpu = idle_gpus.pop(0)
	        job = backlog.pop()
	        # The log file stays open for the child's lifetime; closed when reaped.
	        log_file = open(f"{LOGD}/{job['tag']}.log", "w")
	        proc = subprocess.Popen(
	            ["bash", "-lc", make_cmd(job, gpu)],
	            stdout=log_file,
	            stderr=subprocess.STDOUT,
	        )
	        active[gpu] = (proc, job, time.time(), log_file)
	        print(f"[launch] gpu{gpu} {job['tag']}", flush=True)

	    # Poll interval between scheduling passes.
	    # NOTE(review): indentation was flattened in the reviewed copy; this sleep is
	    # placed at pass level (once per launch/reap cycle) -- confirm.
	    time.sleep(20)

	    # Reap phase: iterate over a snapshot because entries are deleted below.
	    for gpu, (proc, job, started, log_file) in list(active.items()):
	        if proc.poll() is None:
	            continue  # still running
	        log_file.close()
	        n_finished += 1
	        # Success is judged by the metrics file, not by the exit code.
	        succeeded = os.path.isfile(job["out"])
	        if succeeded:
	            n_ok += 1
	        else:
	            n_fail += 1
	        minutes = (time.time() - started) / 60
	        print(
	            f"[finish] gpu{gpu} {job['tag']} rc={proc.returncode} ok={succeeded} "
	            f"{minutes:.0f}min ({n_finished}/{n_pending} done, {n_fail} failed)",
	            flush=True,
	        )
	        del active[gpu]
	        idle_gpus.append(gpu)
	        idle_gpus.sort()

	print(f"[runner] ALL DONE. ok={n_ok} fail={n_fail} of {n_pending}", flush=True)
	print("UNIFIED512_RUNNER_DONE", flush=True)