# code: complete eval pipeline (7 metrics + per-class + Wilcoxon) + Swin-UNet/TransUNet networks; remove backups/obsolete
# (revision 1a18f22, verified)
"""Phase 1 of the unified-512 re-evaluation on h800: re-score SwinUNet/TransUNet at eval_size 512.

These models are resolution-locked (224/256), so we DON'T retrain: we load their HF-curated
best-seed weights, run at native input size, resize predictions and ground truth to 512, and
write metrics.json into the unified512 results tree.
12 cells x {swinunet, transunet} = 24 evals, scheduled on an 8-GPU pool.
"""
import glob
import os
import shlex
import shutil
import subprocess
import time
# Host-specific locations for this re-evaluation run.
CODE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Code"
DATA = "/data/temp/NPJ-ACM/Data"
WORK = "/data/temp/NPJ-ACM/work"
# Interpreter used to launch every evaluation subprocess.
PY = "/data/temp/miniconda3/envs/seggen/bin/python"
HFW = WORK + "/hf_weights/weights/framework"  # <cell>/{swinunet,transunet}.pth
RES = WORK + "/results/unified512"  # eval_at_res writes here (out_root=results rel to WORK)
# One log file per job is written here; create the directory up front.
LOGD = WORK + "/logs_eval512"
os.makedirs(LOGD, exist_ok=True)
# Exported as http_proxy/https_proxy in each job's environment.
PROXY = "http://10.140.15.68:3128"
# Protocol suffixes that a cell directory name may end with.
PROTOS = ["official", "holdout", "fold01", "fold02", "fold03"]
# Size of the GPU pool; at most one job runs per GPU index.
NGPU = 8
def split_cell(cell, protos=None):
    """Split a cell directory name into ``(dataset, protocol)``.

    A cell name has the form ``<dataset>_<protocol>``. The protocol is matched
    as a suffix, so the dataset part may itself contain underscores.

    Args:
        cell: Cell name, e.g. ``"some_dataset_fold01"``.
        protos: Protocol suffixes to recognise. Defaults to the module-level
            ``PROTOS`` list.

    Returns:
        The tuple ``(dataset, protocol)``.

    Raises:
        ValueError: If ``cell`` does not end in ``_<protocol>`` for any known
            protocol, or if the dataset part would be empty.
    """
    if protos is None:
        protos = PROTOS
    for proto in protos:
        suffix = "_" + proto
        # Require a non-empty dataset part: a bare "_official" is not a cell.
        if cell.endswith(suffix) and len(cell) > len(suffix):
            return cell[:-len(suffix)], proto
    raise ValueError("bad cell " + cell)
# Enumerate every (cell, arch) pair that has a curated weight file on disk.
jobs = []
for cell_dir in sorted(glob.glob(HFW + "/*")):
    # Weights live in per-cell directories; a stray file next to them is not a
    # cell and must not abort the whole run through split_cell().
    if not os.path.isdir(cell_dir):
        continue
    cell = os.path.basename(cell_dir)
    ds, proto = split_cell(cell)
    for arch in ("swinunet", "transunet"):
        w = f"{cell_dir}/{arch}.pth"
        if not os.path.isfile(w):
            continue
        out = f"{RES}/{cell}/{arch}/seed0"
        jobs.append({
            "ds": ds,
            "proto": proto,
            "arch": arch,
            "w": w,
            "out": out,
            "tag": f"{cell}_{arch}",
            "mj": f"{out}/metrics.json",
        })

# Resume support: a job whose metrics.json already exists is treated as done.
pending = [j for j in jobs if not os.path.isfile(j["mj"])]
print(f"[eval512] total={len(jobs)} done={len(jobs)-len(pending)} pending={len(pending)}", flush=True)
def make_cmd(j, gpu):
    """Stage the weight file for job ``j`` and return its shell command line.

    Side effect: creates ``j["out"]`` and copies ``j["w"]`` to
    ``<out>/best.pth`` before the command string is built.

    Args:
        j: Job dict with the keys ``"ds"``, ``"proto"``, ``"arch"``, ``"w"``
            and ``"out"``.
        gpu: GPU index exported as ``CUDA_VISIBLE_DEVICES`` for the job.

    Returns:
        A single shell command string: environment exports, ``cd`` into
        ``WORK``, then the ``eval_at_res.py`` invocation.
    """
    enc = "R50-ViT-B_16" if j["arch"] == "transunet" else "resnet50"
    # place the HF weight as best.pth where eval_at_res.py expects it
    os.makedirs(j["out"], exist_ok=True)
    shutil.copy(j["w"], j["out"] + "/best.pth")

    # Quote every interpolated value so that a path or dataset name containing
    # spaces or shell metacharacters cannot split or alter the command.
    q = shlex.quote
    proxy = q(PROXY)
    script = q(CODE + "/framework/eval_at_res.py")
    return (
        f"export CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES={q(str(gpu))} "
        f"OMP_NUM_THREADS=8 MKL_NUM_THREADS=8 OPENBLAS_NUM_THREADS=8 "
        f"https_proxy={proxy} http_proxy={proxy} && cd {q(WORK)} && "
        f"{q(PY)} {script} --data_root {q(DATA)} --dataset {q(j['ds'])} "
        f"--protocol {q(j['proto'])} --arch {q(j['arch'])} --seed 0 --eval_size 512 "
        f"--exp_name unified512 --encoder {q(enc)}"
    )
def _launch(job, gpu):
    """Start the eval subprocess for ``job`` on ``gpu``; return ``(process, log_file)``.

    The per-job log file receives both stdout and stderr of the child. If the
    launch itself fails, the log file is closed before the error propagates.
    """
    log_file = open(f"{LOGD}/{job['tag']}.log", "w")
    try:
        proc = subprocess.Popen(
            ["bash", "-lc", make_cmd(job, gpu)],
            stdout=log_file,
            stderr=subprocess.STDOUT,
        )
    except BaseException:
        log_file.close()
        raise
    return proc, log_file


# Seconds to wait between scheduling passes.
_POLL_SECONDS = 8

# GPU pool scheduler: `running` maps gpu index -> (process, job, log file),
# `free` holds the idle gpu indices, `i` is the next index into `pending`.
running = {}
free = list(range(NGPU))
i = 0
ok = fail = 0
try:
    while i < len(pending) or running:
        # Fill every idle GPU with the next pending job.
        while free and i < len(pending):
            gpu = free.pop(0)
            j = pending[i]
            i += 1
            p, lf = _launch(j, gpu)
            running[gpu] = (p, j, lf)
            print(f"[launch] gpu{gpu} {j['tag']}", flush=True)
        # NOTE(review): indentation was lost in the stored copy of this file;
        # the sleep is placed at outer-loop level as the poll interval - confirm.
        time.sleep(_POLL_SECONDS)
        # Iterate over a snapshot because finished entries are deleted below.
        for gpu, (p, j, lf) in list(running.items()):
            if p.poll() is None:
                continue
            lf.close()
            # Success is judged by the presence of metrics.json, not by the
            # return code, which is only reported.
            okj = os.path.isfile(j["mj"])
            ok += okj
            fail += not okj
            print(f"[finish] gpu{gpu} {j['tag']} rc={p.returncode} ok={okj}", flush=True)
            del running[gpu]
            free.append(gpu)
            free.sort()
finally:
    # Release log handles still open if the loop was aborted by an error.
    for _p, _j, _lf in running.values():
        _lf.close()
print(f"[eval512] ALL DONE ok={ok} fail={fail}", flush=True)
print("SWIN_TRANSUNET_EVAL512_DONE", flush=True)