"""Aggregate per-seed metrics.json into paper-style result tables (mean±SD). Scans results//**/seed*/metrics.json, groups by (dataset, protocol, arch), reports mean±SD over seeds (over folds for CV datasets). Emits: - summary.csv : full per-(dataset,method) detail, every metric (raw data export) - summary.md : the main Dice table, methods×datasets (quick read) - summary.tex : the main Dice table as booktabs LaTeX (paper-ready) - summary.html : full paper-style report (main tables, per-class, significance, setup) python framework/report/aggregate.py --exp_name baselines [--out_root results] """ from __future__ import annotations import os import json import glob import argparse import warnings from collections import defaultdict import numpy as np # per-image Dice vectors can have all-NaN positions (empty masks across seeds); # np.nanmean warns harmlessly on those — silence it for clean console/report runs. warnings.filterwarnings("ignore", message="Mean of empty slice") # (key, label, is_percent, higher_is_better) METRICS = [ ("dice", "Dice", True, True), ("iou", "IoU", True, True), ("hd95", "HD95", False, False), ("assd", "ASSD", False, False), ("sensitivity", "Sens", True, True), ("specificity", "Spec", True, True), ("precision", "Prec", True, True), ] def load_runs(out_root, exp_name): runs = [] for path in glob.glob(os.path.join(out_root, exp_name, "**", "seed*", "metrics.json"), recursive=True): try: with open(path) as f: runs.append(json.load(f)) except Exception: pass return runs _PROTO_LABEL = { ("idridd_segmentation", "fold01"): "official", ("busi", "fold01"): "single-split", ("medsegdb_kits19", "fold01"): "single-split", ("pannuke_semantic", "fold01"): "single-split", } _CV_DATASETS = {"pannuke_semantic"} def _proto_label(dataset, protocol): return _PROTO_LABEL.get((dataset, protocol), protocol) def _agg_over(items, key): vals = np.array([it.get("metrics", {}).get(f"{key}_mean", np.nan) for it in items], np.float64) vals = vals[~np.isnan(vals)] return (float(vals.mean()), float(vals.std())) if vals.size else (float("nan"), float("nan")) def summarize(runs): by_da = defaultdict(lambda: defaultdict(list)) for d in runs: by_da[(d.get("dataset"), d.get("arch"))][d.get("protocol")].append(d) rows = [] for (dataset, arch), proto_map in sorted(by_da.items()): protos = sorted(p for p in proto_map if p is not None) row = {"dataset": dataset, "arch": arch} if dataset in _CV_DATASETS and len(protos) > 1: row["protocol"] = f"{len(protos)}-fold" row["n_seeds"] = len(protos) for key, _, _, _ in METRICS: fold_means = [m for m in (_agg_over(proto_map[p], key)[0] for p in protos) if not np.isnan(m)] fm = np.array(fold_means, np.float64) row[f"{key}_mean"] = float(fm.mean()) if fm.size else float("nan") row[f"{key}_sd"] = float(fm.std()) if fm.size else float("nan") else: proto = protos[0] if protos else None items = proto_map.get(proto, []) row["protocol"] = _proto_label(dataset, proto) row["n_seeds"] = len(items) for key, _, _, _ in METRICS: row[f"{key}_mean"], row[f"{key}_sd"] = _agg_over(items, key) rows.append(row) return rows # ----------------------------------------------------------------------------- display _ARCH_ORDER = ["unet", "unetpp", "deeplabv3plus", "attention_unet", "transunet", "swinunet", "nnunet", "umamba"] _ARCH_DISP = {"unet": "UNet", "unetpp": "UNet++", "deeplabv3plus": "DeepLabV3+", "attention_unet": "Attention-UNet", "transunet": "TransUNet", "swinunet": "Swin-UNet", "nnunet": "nnU-Net", "umamba": "U-Mamba"} _DS_ORDER = ["cvc_clinicdb", "kvasir_seg", "fives", "busi", "refuge2", "acdc_png", "idridd_segmentation", "pannuke_semantic", "medsegdb_isic2018", "medsegdb_kits19"] _DS_DISP = {"cvc_clinicdb": "CVC-ClinicDB", "kvasir_seg": "Kvasir-SEG", "fives": "FIVES", "busi": "BUSI", "refuge2": "REFUGE2", "acdc_png": "ACDC", "idridd_segmentation": "IDRiD", "pannuke_semantic": "PanNuke", "medsegdb_isic2018": "ISIC2018", "medsegdb_kits19": "KiTS19"} def _fmt(row, key, pct): m, s = row[f"{key}_mean"], row[f"{key}_sd"] if m != m: return "—" return f"{m*100:.2f}±{s*100:.2f}" if pct else f"{m:.2f}±{s:.2f}" def _grid(rows): cell = {(r["dataset"], r["arch"]): r for r in rows} methods = [a for a in _ARCH_ORDER if any(r["arch"] == a for r in rows)] or \ sorted({r["arch"] for r in rows}) seen = [d for d in _DS_ORDER if any(r["dataset"] == d for r in rows)] extra = [r["dataset"] for r in rows if r["dataset"] not in _DS_ORDER] datasets = list(dict.fromkeys(seen + extra)) return cell, datasets, methods # ----------------------------------------------------------------------------- significance def _per_image_dice_vec(runs_for_da): by_proto = defaultdict(list) for d in runs_for_da: by_proto[d.get("protocol")].append(d) parts = [] for proto in sorted(by_proto): arrs = [np.array([pi.get("dice", np.nan) for pi in d.get("per_image", [])], float) for d in by_proto[proto]] arrs = [a for a in arrs if a.size] if not arrs: continue L = min(a.size for a in arrs) parts.append(np.nanmean(np.stack([a[:L] for a in arrs]), axis=0)) return np.concatenate(parts) if parts else np.array([]) def _sig_tied_sets(runs): """{dataset: set(archs whose per-image Dice is NOT significantly worse than the best, paired Wilcoxon p>=0.05)} — the 'statistically best' set, used to bold the Dice table.""" try: from scipy.stats import wilcoxon except Exception: return {} by_da = defaultdict(list) for d in runs: by_da[(d.get("dataset"), d.get("arch"))].append(d) def pval(a, b): L = min(a.size, b.size) if L < 6: return float("nan") x, y = a[:L], b[:L] m = ~(np.isnan(x) | np.isnan(y)) if m.sum() < 6 or np.allclose(x[m], y[m]): return 1.0 try: return float(wilcoxon(x[m], y[m]).pvalue) except Exception: return 1.0 out = {} for ds in {k[0] for k in by_da}: vecs = {a: _per_image_dice_vec(by_da[(ds, a)]) for a in _ARCH_ORDER if (ds, a) in by_da} vecs = {a: v for a, v in vecs.items() if v.size} if not vecs: continue means = {a: float(np.nanmean(v)) for a, v in vecs.items()} best = max(means, key=means.get) tied = {best} for a, v in vecs.items(): if a != best and not (pval(vecs[best], v) < 0.05): tied.add(a) out[ds] = tied return out # ----------------------------------------------------------------------------- text exports def to_csv(rows): cols = ["dataset", "protocol", "arch", "n_seeds"] for k, _, _, _ in METRICS: cols += [f"{k}_mean", f"{k}_sd"] out = ",".join(cols) + "\n" for r in rows: out += ",".join(str(r[c]) for c in cols) + "\n" return out def _dice_matrix(rows): """(methods, datasets, cell, avg) for the main Dice table.""" cell, datasets, methods = _grid(rows) avg = {a: np.nanmean([cell[(d, a)]["dice_mean"] for d in datasets if (d, a) in cell] or [np.nan]) for a in methods} return cell, datasets, methods, avg def _dice_bold(a, d, cell, best, sig): """Whether (dataset d, arch a)'s Dice cell should be bold: in the significance 'tied-for-best' set when available, else the single best per dataset.""" if (d, a) not in cell: return False if sig is not None: return a in sig.get(d, set()) return cell[(d, a)]["dice_mean"] == best[d] def to_markdown(rows, sig=None): cell, datasets, methods, _ = _dice_matrix(rows) head = ["Method"] + [_DS_DISP.get(d, d) for d in datasets] out = "## Main results — Dice (mean±SD %, ↑)\n\n" out += ("_**Bold** = best or not significantly worse than best per dataset " "(paired Wilcoxon on per-image Dice, p≥0.05). No cross-dataset average column — " "the seven modalities are too heterogeneous for one number to be meaningful._\n\n") out += "| " + " | ".join(head) + " |\n|" + "---|" * len(head) + "\n" best = {d: max((cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell), default=np.nan) for d in datasets} for a in methods: cells = [_ARCH_DISP.get(a, a)] for d in datasets: if (d, a) in cell: t = _fmt(cell[(d, a)], "dice", True) cells.append(f"**{t}**" if _dice_bold(a, d, cell, best, sig) else t) else: cells.append("–") out += "| " + " | ".join(cells) + " |\n" return out def to_latex(rows, sig=None): cell, datasets, methods, _ = _dice_matrix(rows) spec = "l" + "c" * len(datasets) out = ("% Main results: Dice (mean over seeds, %). Bold = best or not significantly\n" "% worse than best per dataset (paired Wilcoxon on per-image Dice, p>=0.05).\n" "% No cross-dataset average column (modalities too heterogeneous).\n") out += "\\begin{tabular}{" + spec + "}\n\\toprule\n" out += "Method & " + " & ".join(_DS_DISP.get(d, d) for d in datasets) + " \\\\\n\\midrule\n" best = {d: max((cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell), default=np.nan) for d in datasets} for a in methods: cells = [_ARCH_DISP.get(a, a)] for d in datasets: if (d, a) in cell: t = f"{cell[(d, a)]['dice_mean'] * 100:.1f}" cells.append(f"\\textbf{{{t}}}" if _dice_bold(a, d, cell, best, sig) else t) else: cells.append("--") out += " & ".join(cells) + " \\\\\n" if a == "attention_unet": out += "\\midrule\n" # separate CNNs from transformers/foundation out += "\\bottomrule\n\\end{tabular}\n" return out # ----------------------------------------------------------------------------- HTML report _DATASETS_INFO = [ ("1", "CVC-ClinicDB", "Colonoscopy (endoscopy)", "Polyp", "2", "RGB", "384×288", "official", "490 / 61 / 61"), ("2", "Kvasir-SEG", "GI endoscopy", "Polyp", "2", "RGB", "~622×529 (var)", "official", "800 / 100 / 100"), ("3", "FIVES", "Retinal fundus", "Vessel", "2", "RGB", "2048×2048", "official", "480 / 120 / 200"), ("4", "BUSI", "Breast ultrasound", "Tumor", "2", "grayscale¹", "variable", "single-split²", "545 / 78 / 157"), ("5", "REFUGE2", "Retinal fundus", "Optic disc & cup", "3", "RGB", "~2124×2056", "official", "400 / 400 / 400"), ("6", "ACDC", "Cardiac MRI (2D slices)", "RV / Myo / LV", "4", "grayscale", "~240×256 (var)", "official", "136 / 210 / 380"), ("7", "IDRiD", "Retinal fundus", "DR lesions (4) + optic disc", "6", "RGB", "4288×2848", "official", "43 / 11 / 27"), ("8", "PanNuke", "Histopathology (H&E)", "Nuclei (5 types)", "6", "RGB", "256×256", "official 3-fold CV", "~2.7k / 2.6k / 2.6k per fold"), ("9", "ISIC2018", "Dermoscopy", "Skin lesion", "2", "RGB", "256×256", "holdout", "2582 / 369 / 737"), ("10", "KiTS19", "Kidney CT (2D slices)", "Kidney (binary)", "2", "grayscale¹", "256×256", "single-split²", "2832 / 479 / 705"), ] _METHODS_INFO = [ ("UNet", "CNN encoder–decoder", "SMP, ResNet-50 encoder (ImageNet)"), ("UNet++", "Nested UNet", "SMP, ResNet-50 (ImageNet)"), ("DeepLabV3+", "Atrous CNN", "SMP, ResNet-50 (ImageNet)"), ("Attention-UNet", "Attention-gated UNet", "Re-implemented, from scratch"), ("TransUNet", "CNN–Transformer hybrid", "R50-ViT-B/16 (ImageNet), input 256"), ("Swin-UNet", "Pure-Transformer UNet", "Swin-Tiny (ImageNet), input 224"), ("nnU-Net (v2)", "Self-configuring CNN", "2D config, 250 epochs"), ("U-Mamba", "State-space (Mamba) UNet", "U-Mamba_Bot, 100 epochs"), ] _METRICS_INFO = [ ("Dice (DSC)", "2TP / (2TP+FP+FN)", "↑", "%", "区域重叠度(主指标),对类别不平衡较鲁棒。"), ("IoU (Jaccard)", "TP / (TP+FP+FN)", "↑", "%", "交并比,更严格的重叠度,常与 Dice 并列。"), ("HD95", "95% Hausdorff distance (boundaries)", "↓", "px", "边界最大误差的95%分位,越小边界越贴合。"), ("ASSD", "average symmetric surface distance", "↓", "px", "平均对称表面距离,整体边界吻合度。"), ("Sensitivity", "TP / (TP+FN)", "↑", "%", "召回/敏感度,反映漏分割程度。"), ("Specificity", "TN / (TN+FP)", "↑", "%", "特异度,背景误报控制。"), ("Precision", "TP / (TP+FP)", "↑", "%", "精确率,反映过分割/误报程度。"), ] _PERCLASS_NAMES = { "acdc_png": {"1": "RV", "2": "Myocardium", "3": "LV"}, "refuge2": {"1": "Optic Disc", "2": "Optic Cup"}, "idridd_segmentation": {"1": "MA", "2": "Haemorrhage", "3": "Hard Exudate", "4": "Soft Exudate", "5": "Optic Disc"}, "pannuke_semantic": {"1": "Neoplastic", "2": "Inflammatory", "3": "Connective", "4": "Dead", "5": "Epithelial"}, } def _collect_perclass(runs): acc = defaultdict(lambda: defaultdict(list)) for d in runs: key = (d.get("dataset"), d.get("arch")) for pi in d.get("per_image", []): for c, m in (pi.get("per_class") or {}).items(): v = (m or {}).get("dice") if v is not None and v == v: acc[key][c].append(v) return {k: {c: float(np.mean(v)) for c, v in cd.items() if v} for k, cd in acc.items()} _CSS = """ body{font-family:'Helvetica Neue',Arial,sans-serif;margin:30px auto;max-width:1180px;color:#1a1a1a;line-height:1.5} h1{font-size:21px;margin:0 0 4px}h2{font-size:15px;color:#0a5a33;margin:30px 0 4px;border-bottom:1px solid #e3e3e3;padding-bottom:3px} h3{font-size:13px;margin:16px 0 4px;color:#333} p,li{font-size:13px}code{background:#f2f2f2;padding:1px 4px;border-radius:3px} .cap{color:#666;font-size:11.5px;margin:3px 0 6px} .tw{overflow-x:auto} table.rt{border-collapse:collapse;margin:6px 0 8px;font-size:11.5px} table.rt th,table.rt td{padding:4px 9px;text-align:center;white-space:nowrap} table.rt thead th{border-top:2px solid #222;border-bottom:1.2px solid #222;font-weight:600} table.rt tbody tr:last-child td{border-bottom:2px solid #222} table.rt td.m,table.rt th.m{text-align:left;font-weight:600} table.rt td.avg,table.rt th.avg{border-left:1px solid #c8c8c8;background:#f7f9f8} table.rt tbody tr.grp td{border-top:1px solid #cfcfcf} table.rt b{color:#08402a} table.info{border-collapse:collapse;margin:6px 0 14px;font-size:12px} table.info th,table.info td{border:1px solid #ddd;padding:4px 8px;text-align:center} table.info th{background:#f3f3f3}table.info td.l{text-align:left} .note{background:#eef7f0;border-left:3px solid #0a6;padding:8px 12px;font-size:12.5px;margin:8px 0} hr{border:none;border-top:1px solid #e3e3e3;margin:24px 0} """ def _metric_table(cell, datasets, methods, key, pct, hib, bold_sets=None): """Transposed table: methods (rows) × datasets (cols). bold_sets[ds] (set of archs) if given (Dice significance), else bold the single best per column. Deliberately NO cross-dataset summary column: the ten datasets span seven modalities with very different difficulty, so a simple average is not meaningful (and would conflict with the per-dataset ranking).""" best = {} for d in datasets: vals = {a: cell[(d, a)][f"{key}_mean"] for a in methods if (d, a) in cell and cell[(d, a)][f"{key}_mean"] == cell[(d, a)][f"{key}_mean"]} best[d] = ((max if hib else min)(vals, key=vals.get) if vals else None) h = ["
" + "".join(f"" for d in datasets) + ""] for a in methods: grp = " class='grp'" if a == "transunet" else "" tds = [f""] for d in datasets: if (d, a) in cell and cell[(d, a)][f"{key}_mean"] == cell[(d, a)][f"{key}_mean"]: t = _fmt(cell[(d, a)], key, pct) b = (a in bold_sets.get(d, set())) if bold_sets is not None else (a == best[d]) tds.append(f"") else: tds.append("") h.append(f"" + "".join(tds) + "") h.append("
Method{_DS_DISP.get(d, d)}
{_ARCH_DISP.get(a, a)}{''+t+'' if b else t}
") return "\n".join(h) def _perclass_section(runs): pc = _collect_perclass(runs) h = [] for ds, names in _PERCLASS_NAMES.items(): methods = [a for a in _ARCH_ORDER if (ds, a) in pc and pc[(ds, a)]] if not methods: continue classes = sorted(names, key=int) colbest = {c: max((pc[(ds, a)].get(c, float('nan')) for a in methods), default=float('nan')) for c in classes} h.append(f"

{_DS_DISP.get(ds, ds)}

") h.append("
" + "".join(f"" for c in classes) + "") for a in methods: grp = " class='grp'" if a == "transunet" else "" cells, present = [], [] for c in classes: v = pc[(ds, a)].get(c) if v is None: cells.append("") else: present.append(v) t = f"{v*100:.1f}" cells.append(f"") macro = (sum(present) / len(present) * 100) if present else float("nan") h.append(f"{''.join(cells)}" f"") h.append("
Method{names[c]}macro
{''+t+'' if v == colbest[c] else t}{_ARCH_DISP.get(a, a)}{macro:.1f}
") return "\n".join(h) def _setup_html(): h = ["

A. Datasets

", "" ""] for r in _DATASETS_INFO: h.append("" "" % r) h.append("
#DatasetModalityTargetClsChNative sizeProtocolTrain/Val/Test
%s%s%s%s%s%s%s%s%s
") h.append("
¹ BUSI/KiTS19 grayscale stored as 3-ch PNG (read as grayscale). " "² no canonical split → one fixed fold (of 5) with 3 seeds; others use the official split. " "Labels 0…C-1 (0=bg); multi-class metrics macro-averaged over foreground classes.
") h.append("

B. Methods

") h.append("") for m in _METHODS_INFO: h.append("" % m) h.append("
MethodFamilyBackbone / setup
%s%s%s
") h.append("

C. Metrics

") h.append("" "") for m in _METRICS_INFO: h.append("" % m) h.append("
MetricDefinitionDirUnit作用 / 含义(中文)
%s%s%s%s%s
") return "\n".join(h) def to_html(rows, runs=None, title="SegGen benchmark", sig=None): cell, datasets, methods = _grid(rows) if sig is None: sig = _sig_tied_sets(runs) if runs else None h = [f"{title}" ""] h.append(f"

{title}: 8 methods × 10 datasets (unified 512, resolution-fair)

") h.append("

Eight 2D medical-image segmentation methods on ten public datasets (seven modalities). " "Values are mean±SD over 3 seeds (over the 3 folds for PanNuke). " "Each (dataset,method) cell aggregates tens–thousands of test images.

") h.append("
Resolution-fair protocol. Convolutional nets train at 512; the fixed-input " "transformers (Swin-UNet 224, TransUNet 256) and nnU-Net/U-Mamba run at their native size; " "every prediction and ground truth is then resized to a common 512×512 before scoring, so " "boundary metrics (HD95/ASSD, in pixels) are directly comparable across methods.
") h.append("

1. Main results — Dice (%) ↑

") h.append("
Bold = best, or not significantly different from the best per dataset " "(paired Wilcoxon on per-image Dice, p≥0.05). " "Horizontal rule separates CNNs (top) from Transformer / foundation models (bottom). " "No cross-dataset average is reported — the seven modalities differ too much in difficulty " "for a single number to be meaningful.
") h.append(_metric_table(cell, datasets, methods, "dice", True, True, bold_sets=sig)) h.append("

2. Boundary accuracy — HD95 (px) ↓

") h.append("
95% Hausdorff distance at the common 512 resolution (lower = better; " "bold = best per dataset). Now comparable across methods.
") h.append(_metric_table(cell, datasets, methods, "hd95", False, False)) h.append("

3. Overlap — IoU (%) ↑

") h.append("
Jaccard index, the stricter overlap measure (bold = best per dataset).
") h.append(_metric_table(cell, datasets, methods, "iou", True, True)) if runs: pcs = _perclass_section(runs) if pcs.strip(): h.append("

4. Per-class Dice (%) — multi-class datasets

") h.append("
Mean per-class Dice over all test images/runs (0=background excluded; " "bold=best per class). The macro column weights each foreground class " "equally (a within-dataset mean, not a cross-dataset one). It can differ by ~1 pt from " "the §1 Dice — which is image-weighted (each image is first averaged over the classes it " "contains) — whenever some images lack a class (e.g. ACDC's RV appears in only 335/380 " "images); both conventions are standard, neither is an error.
") h.append(pcs) h.append("

5. Supplementary metrics — Sensitivity & Precision (%) ↑

") h.append("
Two complementary error views (bold = best per dataset): low " "Sensitivity (recall) signals under-segmentation (missed foreground); low " "Precision signals over-segmentation (false positives). Specificity is omitted " "— background dominates, so it stays >96% with almost no spread across methods (≤0.6 pt on " "average) — and ASSD is omitted as redundant with HD95; both, and every metric, are " "tabulated in full in summary.csv.
") h.append("

Sensitivity / recall ↑

") h.append(_metric_table(cell, datasets, methods, "sensitivity", True, True)) h.append("

Precision ↑

") h.append(_metric_table(cell, datasets, methods, "precision", True, True)) h.append("

Appendix — Experimental setup

") h.append("

Full per-(dataset,method) values for every metric " "(IoU, HD95, ASSD, Sensitivity, Specificity, Precision, …) are in " "summary.csv; the Dice table as LaTeX is in summary.tex.

") h.append(_setup_html()) h.append("") return "\n".join(h) def main(): p = argparse.ArgumentParser() p.add_argument("--exp_name", required=True) p.add_argument("--out_root", default="results") args = p.parse_args() runs = load_runs(args.out_root, args.exp_name) if not runs: print(f"no metrics.json under {args.out_root}/{args.exp_name}") return rows = summarize(runs) sig = _sig_tied_sets(runs) base = os.path.join(args.out_root, args.exp_name) open(os.path.join(base, "summary.csv"), "w").write(to_csv(rows)) open(os.path.join(base, "summary.md"), "w").write(to_markdown(rows, sig)) open(os.path.join(base, "summary.tex"), "w").write(to_latex(rows, sig)) open(os.path.join(base, "summary.html"), "w").write( to_html(rows, runs, title=f"SegGen benchmark ({args.exp_name})", sig=sig)) print(to_markdown(rows, sig)) print(f"{len(runs)} runs -> {len(rows)} (dataset,arch) cells; written {base}/summary.{{csv,md,tex,html}}") if __name__ == "__main__": main()