"""8-GPU pool runner for the UNIFIED-512 conv retrain on h800 (L20Y x8).
Mirrors the baseline grid: 4 conv archs x the existing (dataset,protocol,seed) cells,
all at img_size 512. 1 job per GPU, resumable (skips cells whose metrics.json exists),
per-job log. occupy.py auto-yields. Run detached; tail /tmp/unified512_runner.log.
"""
import os, sys, subprocess, time

# --- Filesystem layout ------------------------------------------------------
# CODE holds framework/train.py and framework/test.py; DATA is passed to both
# scripts as --data_root.
CODE = "/mnt/tidal-alsh-share2/dataset/qinshengqian/research/c3/NPJ-ACM/Code"
DATA = "/data/temp/NPJ-ACM/Data"
# Jobs are started with WORK as their CWD; results -> WORK/results/unified512/...
WORK = "/data/temp/NPJ-ACM/work"
# Interpreter used to launch the train/test scripts.
PY = "/data/temp/miniconda3/envs/seggen/bin/python"
# One log file per job is written into this directory.
LOGD = f"{WORK}/logs_unified512"
# Exported as both http_proxy and https_proxy for every job.
PROXY = "http://10.140.15.68:3128"

# --- Run settings -----------------------------------------------------------
NGPU = 8        # number of GPU slots; one job runs per slot
IMG = 512       # forwarded as --img_size
BATCH = 8       # forwarded as --batch_size (training only)
EPOCHS = 300    # forwarded as --epochs (training only)

ARCHS = [
    "unet",
    "unetpp",
    "deeplabv3plus",
    "attention_unet",
]

# Each entry is (dataset, protocol, [seeds]) -- matches existing baseline structure.
CELLS = [
    ("acdc_png", "official", [0, 1, 2]),
    ("busi", "fold01", [0, 1, 2]),
    ("cvc_clinicdb", "official", [0, 1, 2]),
    ("fives", "official", [0, 1, 2]),
    ("idridd_segmentation", "fold01", [0, 1, 2]),
    ("kvasir_seg", "official", [0, 1, 2]),
    ("medsegdb_isic2018", "holdout", [0, 1, 2]),
    ("medsegdb_kits19", "fold01", [0, 1, 2]),
    ("refuge2", "official", [0, 1, 2]),
    ("pannuke_semantic", "fold01", [0, 1, 2]),
    ("pannuke_semantic", "fold02", [0]),
    ("pannuke_semantic", "fold03", [0]),
]
# Make sure the log and working directories exist before anything is launched.
os.makedirs(LOGD, exist_ok=True)
os.makedirs(WORK, exist_ok=True)

# Expand the grid into one job record per (cell, arch, seed). Ordering is
# cell-major, then architecture, then seed. "out" is the metrics file whose
# presence marks the job as finished; "tag" names the per-job log file.
jobs = [
    {
        "ds": dataset,
        "proto": protocol,
        "arch": arch_name,
        "seed": seed,
        "out": f"{WORK}/results/unified512/{dataset}_{protocol}/{arch_name}/seed{seed}/metrics.json",
        "tag": f"{dataset}_{protocol}_{arch_name}_s{seed}",
    }
    for dataset, protocol, seed_list in CELLS
    for arch_name in ARCHS
    for seed in seed_list
]

def make_cmd(j, gpu):
    """Build the bash command line that trains and then tests one job.

    Args:
        j: job record with the keys "ds", "proto", "arch" and "seed".
        gpu: GPU index exposed to the job through CUDA_VISIBLE_DEVICES.

    Returns:
        A single shell string of four stages chained with ``&&``: export the
        environment, ``cd`` into WORK, run train.py, run test.py. Because of
        the ``&&`` chaining, a later stage only runs if the previous one
        exited successfully.
    """
    env_stage = " ".join([
        "export",
        "CUDA_DEVICE_ORDER=PCI_BUS_ID",
        f"CUDA_VISIBLE_DEVICES={gpu}",
        "OMP_NUM_THREADS=8",
        "MKL_NUM_THREADS=8",
        "OPENBLAS_NUM_THREADS=8",
        f"https_proxy={PROXY}",
        f"http_proxy={PROXY}",
    ])
    cd_stage = f"cd {WORK}"

    # Arguments that train.py and test.py have in common, in the order both use.
    cell_args = f"--data_root {DATA} --dataset {j['ds']} --protocol {j['proto']} --arch {j['arch']} --img_size {IMG}"

    train_stage = " ".join([
        f"{PY} {CODE}/framework/train.py",
        cell_args,
        f"--batch_size {BATCH} --num_workers 8 --amp bf16",
        f"--exp_name unified512 --seed {j['seed']} --no-visualize --encoder resnet50 --encoder_weights imagenet",
        f"--epochs {EPOCHS}",
    ])
    test_stage = " ".join([
        f"{PY} {CODE}/framework/test.py",
        cell_args,
        f"--exp_name unified512 --seed {j['seed']} --encoder resnet50",
    ])
    return " && ".join([env_stage, cd_stage, train_stage, test_stage])

# Resume support: a cell counts as finished once its metrics.json exists, so
# only the remaining cells are scheduled.
pending = [j for j in jobs if not os.path.isfile(j["out"])]
print(f"[runner] total={len(jobs)} done={len(jobs)-len(pending)} pending={len(pending)}", flush=True)

running = {}   # gpu -> (Popen, job, start time)
free = list(range(NGPU))   # idle GPU indices, kept sorted so the lowest index is reused first
done = ok = fail = 0
i = 0   # index of the next entry in `pending` to launch
while i < len(pending) or running:
    # Launch phase: hand every idle GPU the next pending job.
    while free and i < len(pending):
        gpu = free.pop(0)
        j = pending[i]; i += 1
        # The child process receives its own copy of the log descriptor, so the
        # runner does not have to keep the file open for the lifetime of the job.
        # The `with` block also closes the handle if Popen itself raises.
        with open(f"{LOGD}/{j['tag']}.log", "w") as lf:
            p = subprocess.Popen(["bash", "-lc", make_cmd(j, gpu)], stdout=lf, stderr=subprocess.STDOUT)
        running[gpu] = (p, j, time.time())
        print(f"[launch] gpu{gpu} {j['tag']}", flush=True)
    time.sleep(20)   # poll interval between launch/reap rounds
    # Reap phase: iterate over a snapshot because finished entries are removed.
    for gpu, (p, j, st) in list(running.items()):
        if p.poll() is None:
            continue   # still running
        done += 1
        # Success is judged by the presence of the metrics file; the exit code
        # is only reported in the log line.
        okj = os.path.isfile(j["out"])
        ok += okj; fail += (not okj)
        mins = (time.time() - st) / 60
        print(f"[finish] gpu{gpu} {j['tag']} rc={p.returncode} ok={okj} "
              f"{mins:.0f}min ({done}/{len(pending)} done, {fail} failed)", flush=True)
        # Return the GPU to the idle pool, keeping the pool sorted.
        del running[gpu]; free.append(gpu); free.sort()
print(f"[runner] ALL DONE. ok={ok} fail={fail} of {len(pending)}", flush=True)
print("UNIFIED512_RUNNER_DONE", flush=True)