MaybeRichard's picture
report: clarify per-class macro vs main-table Dice averaging convention
14548aa verified
Raw
History Blame Contribute Delete
25.2 kB
"""Aggregate per-seed metrics.json into paper-style result tables (mean±SD).
Scans results/<exp_name>/**/seed*/metrics.json, groups by (dataset, protocol, arch),
reports mean±SD over seeds (over folds for CV datasets). Emits:
- summary.csv : full per-(dataset,method) detail, every metric (raw data export)
- summary.md : the main Dice table, methods×datasets (quick read)
- summary.tex : the main Dice table as booktabs LaTeX (paper-ready)
- summary.html : full paper-style report (main tables, per-class, significance, setup)
python framework/report/aggregate.py --exp_name baselines [--out_root results]
"""
from __future__ import annotations
import os
import json
import glob
import argparse
import warnings
from collections import defaultdict
import numpy as np
# Per-image Dice vectors can have all-NaN positions (empty masks across seeds);
# np.nanmean emits a harmless "Mean of empty slice" warning on those. Silence it
# for clean console/report runs. NOTE: the filter is installed at import time and
# applies to the whole process, not only to this module.
warnings.filterwarnings("ignore", message="Mean of empty slice")
# Metric registry, one tuple per metric: (key, label, is_percent, higher_is_better).
# `key` selects the "<key>_mean" entry read from each run's "metrics" dict and names
# the "<key>_mean" / "<key>_sd" columns of the summary rows. The code visible in this
# file only unpacks `key` from these tuples; the remaining fields are descriptive.
METRICS = [
("dice", "Dice", True, True),
("iou", "IoU", True, True),
("hd95", "HD95", False, False),
("assd", "ASSD", False, False),
("sensitivity", "Sens", True, True),
("specificity", "Spec", True, True),
("precision", "Prec", True, True),
]
def load_runs(out_root, exp_name):
    """Load every per-seed ``metrics.json`` found under ``<out_root>/<exp_name>``.

    Files are matched with the recursive pattern ``**/seed*/metrics.json`` and
    read in sorted path order, so repeated invocations see the runs in the same
    order. Loading is best-effort: a file that cannot be read or decoded, or
    whose top-level JSON value is not an object, is skipped with a warning
    instead of aborting the whole report.

    Args:
        out_root: Root results directory.
        exp_name: Experiment sub-directory below ``out_root``.

    Returns:
        A list with one decoded dict per usable ``metrics.json`` (possibly empty).
    """
    pattern = os.path.join(out_root, exp_name, "**", "seed*", "metrics.json")
    runs = []
    for path in sorted(glob.glob(pattern, recursive=True)):
        try:
            with open(path, encoding="utf-8") as f:
                payload = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
            warnings.warn(f"skipping unreadable metrics file {path}: {exc}")
            continue
        if not isinstance(payload, dict):
            # Every consumer calls .get() on a run, so anything but an object is unusable.
            warnings.warn(f"skipping {path}: top-level JSON value is not an object")
            continue
        runs.append(payload)
    return runs
# Display label for a (dataset, raw protocol id) pair. Pairs that are not listed
# here are shown with their raw protocol string (see _proto_label).
_PROTO_LABEL = {
("idridd_segmentation", "fold01"): "official",
("busi", "fold01"): "single-split",
("medsegdb_kits19", "fold01"): "single-split",
("pannuke_semantic", "fold01"): "single-split",
}
# Datasets whose protocols are treated as cross-validation folds: when more than one
# protocol is present for such a dataset, summarize() reports mean/SD across the
# per-fold means instead of across the runs of a single protocol.
_CV_DATASETS = {"pannuke_semantic"}
def _proto_label(dataset, protocol):
    """Return the display label for a run's protocol.

    Looks up ``(dataset, protocol)`` in ``_PROTO_LABEL``; when the pair has no
    entry the raw ``protocol`` value is returned unchanged.
    """
    lookup = (dataset, protocol)
    if lookup in _PROTO_LABEL:
        return _PROTO_LABEL[lookup]
    return protocol
def _agg_over(items, key):
    """Mean and SD of one metric across a list of runs.

    Reads ``"<key>_mean"`` from each run's ``"metrics"`` dict. Runs whose
    metrics dict is absent or null, or whose value is absent or NaN, are ignored.

    Args:
        items: Run dicts (as loaded from ``metrics.json``).
        key: Metric key, e.g. ``"dice"``.

    Returns:
        ``(mean, sd)`` as Python floats, where ``sd`` is NumPy's default
        population standard deviation (``ddof=0``); ``(nan, nan)`` when no run
        contributes a value.
    """
    field = f"{key}_mean"
    # `or {}` also covers an explicit JSON null stored under "metrics".
    raw = [(it.get("metrics") or {}).get(field, np.nan) for it in items]
    vals = np.array(raw, np.float64)
    vals = vals[~np.isnan(vals)]
    if not vals.size:
        return float("nan"), float("nan")
    return float(vals.mean()), float(vals.std())
def summarize(runs):
    """Collapse raw runs into one summary row per (dataset, arch).

    Runs are grouped by ``(dataset, arch)`` and, inside each group, by protocol.

    * For a dataset in ``_CV_DATASETS`` with more than one named protocol, each
      protocol is treated as a fold: the per-fold means are computed first and
      the row holds the mean/SD of those fold means. ``n_seeds`` is the fold count.
    * Otherwise only the first named protocol in sorted order is used (or the
      unnamed ``None`` protocol when no named one exists) and the row holds the
      mean/SD over that protocol's runs. ``n_seeds`` is the number of those runs.

    Returns:
        A list of dicts with ``dataset``, ``arch``, ``protocol``, ``n_seeds``
        and ``<key>_mean`` / ``<key>_sd`` for every metric key in ``METRICS``,
        ordered by sorted ``(dataset, arch)``.
    """
    grouped = defaultdict(lambda: defaultdict(list))
    for run in runs:
        group = (run.get("dataset"), run.get("arch"))
        grouped[group][run.get("protocol")].append(run)

    table = []
    for group in sorted(grouped):
        dataset, arch = group
        per_protocol = grouped[group]
        named = sorted(name for name in per_protocol if name is not None)
        row = {"dataset": dataset, "arch": arch}

        is_cv = dataset in _CV_DATASETS and len(named) > 1
        if not is_cv:
            # Single-protocol path: aggregate over the runs of one protocol only.
            chosen = named[0] if named else None
            members = per_protocol.get(chosen, [])
            row["protocol"] = _proto_label(dataset, chosen)
            row["n_seeds"] = len(members)
            for key, _, _, _ in METRICS:
                mean, sd = _agg_over(members, key)
                row[f"{key}_mean"] = mean
                row[f"{key}_sd"] = sd
        else:
            # Cross-validation path: mean/SD of the per-fold means.
            row["protocol"] = f"{len(named)}-fold"
            row["n_seeds"] = len(named)
            for key, _, _, _ in METRICS:
                per_fold = (_agg_over(per_protocol[name], key)[0] for name in named)
                kept = np.array([m for m in per_fold if not np.isnan(m)], np.float64)
                if kept.size:
                    row[f"{key}_mean"] = float(kept.mean())
                    row[f"{key}_sd"] = float(kept.std())
                else:
                    row[f"{key}_mean"] = float("nan")
                    row[f"{key}_sd"] = float("nan")
        table.append(row)
    return table
# ----------------------------------------------------------------------------- display
# Row order of methods in every table. _grid() lists only the archs named here
# whenever at least one of them is present in the data.
_ARCH_ORDER = ["unet", "unetpp", "deeplabv3plus", "attention_unet", "transunet", "swinunet",
"nnunet", "umamba"]
# Arch id -> name shown in the tables (unknown ids are shown as-is).
_ARCH_DISP = {"unet": "UNet", "unetpp": "UNet++", "deeplabv3plus": "DeepLabV3+",
"attention_unet": "Attention-UNet", "transunet": "TransUNet",
"swinunet": "Swin-UNet", "nnunet": "nnU-Net", "umamba": "U-Mamba"}
# Column order of datasets; datasets not named here are appended after these by _grid().
_DS_ORDER = ["cvc_clinicdb", "kvasir_seg", "fives", "busi", "refuge2", "acdc_png",
"idridd_segmentation", "pannuke_semantic", "medsegdb_isic2018", "medsegdb_kits19"]
# Dataset id -> name shown in the tables (unknown ids are shown as-is).
_DS_DISP = {"cvc_clinicdb": "CVC-ClinicDB", "kvasir_seg": "Kvasir-SEG", "fives": "FIVES",
"busi": "BUSI", "refuge2": "REFUGE2", "acdc_png": "ACDC",
"idridd_segmentation": "IDRiD", "pannuke_semantic": "PanNuke",
"medsegdb_isic2018": "ISIC2018", "medsegdb_kits19": "KiTS19"}
def _fmt(row, key, pct):
    """Format one metric of a summary row as ``mean±sd`` with two decimals.

    Args:
        row: Summary row holding ``<key>_mean`` and ``<key>_sd``.
        key: Metric key.
        pct: When true, both numbers are multiplied by 100 before formatting.

    Returns:
        The formatted string, or an em dash when the mean is NaN.
    """
    mean = row[f"{key}_mean"]
    sd = row[f"{key}_sd"]
    if mean != mean:  # NaN is the only value that differs from itself
        return "—"
    if pct:
        return f"{mean*100:.2f}±{sd*100:.2f}"
    return f"{mean:.2f}±{sd:.2f}"
def _grid(rows):
    """Index summary rows for table rendering.

    Returns:
        ``(cell, datasets, methods)`` where ``cell`` maps ``(dataset, arch)`` to
        its row, ``datasets`` lists the datasets of ``_DS_ORDER`` that occur in
        ``rows`` followed by any other dataset in first-seen order, and
        ``methods`` lists the archs of ``_ARCH_ORDER`` that occur in ``rows``
        (or, when none of them occurs, every arch in sorted order).
    """
    cell = {}
    archs_present = set()
    datasets_present = set()
    for row in rows:
        cell[(row["dataset"], row["arch"])] = row
        archs_present.add(row["arch"])
        datasets_present.add(row["dataset"])

    methods = [arch for arch in _ARCH_ORDER if arch in archs_present]
    if not methods:
        methods = sorted(archs_present)

    known = [ds for ds in _DS_ORDER if ds in datasets_present]
    unknown = [row["dataset"] for row in rows if row["dataset"] not in _DS_ORDER]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    datasets = list(dict.fromkeys(known + unknown))
    return cell, datasets, methods
# ----------------------------------------------------------------------------- significance
def _per_image_dice_vec(runs_for_da):
    """Build one per-image Dice vector for a single (dataset, arch) group.

    Runs are grouped by protocol. Within a protocol, each run contributes the
    ``"dice"`` value of every ``"per_image"`` entry (NaN when absent); the runs
    are truncated to the shortest one and averaged position-wise with
    ``np.nanmean``. The per-protocol vectors are then concatenated in sorted
    protocol order, with an unnamed (``None``) protocol placed last.

    NOTE(review): position-wise averaging assumes every run of a protocol lists
    its images in the same order — confirm against the evaluation code.

    Args:
        runs_for_da: Run dicts that share a dataset and arch.

    Returns:
        A 1-D float array; empty when no run carries per-image entries.
    """
    by_proto = defaultdict(list)
    for d in runs_for_da:
        by_proto[d.get("protocol")].append(d)
    parts = []
    # None cannot be ordered against str, so sort it explicitly after named protocols.
    ordered = sorted(by_proto, key=lambda p: (p is None, "" if p is None else p))
    for proto in ordered:
        # `or []` tolerates an explicit JSON null under "per_image".
        arrs = [np.array([pi.get("dice", np.nan) for pi in (d.get("per_image") or [])], float)
                for d in by_proto[proto]]
        arrs = [a for a in arrs if a.size]
        if not arrs:
            continue
        n = min(a.size for a in arrs)
        parts.append(np.nanmean(np.stack([a[:n] for a in arrs]), axis=0))
    return np.concatenate(parts) if parts else np.array([])
def _sig_tied_sets(runs):
    """Per dataset, the set of archs statistically tied with the best one.

    For every dataset, the arch (among ``_ARCH_ORDER``) with the highest mean
    per-image Dice is the reference. Every other arch is compared against it
    with a paired Wilcoxon test on the per-image Dice vectors and joins the set
    unless p < 0.05. Comparisons with fewer than six usable pairs, numerically
    identical vectors, or a failing test count as tied.

    Returns:
        ``{dataset: set(arch)}`` for datasets that have per-image data; an
        empty dict when ``scipy.stats.wilcoxon`` cannot be imported.
    """
    try:
        from scipy.stats import wilcoxon
    except Exception:
        return {}

    def _paired_p(lhs, rhs):
        # Vectors may differ in length; compare the common prefix only.
        n = min(lhs.size, rhs.size)
        if n < 6:
            return float("nan")
        lhs, rhs = lhs[:n], rhs[:n]
        keep = ~np.isnan(lhs) & ~np.isnan(rhs)
        if keep.sum() < 6 or np.allclose(lhs[keep], rhs[keep]):
            return 1.0
        try:
            return float(wilcoxon(lhs[keep], rhs[keep]).pvalue)
        except Exception:
            return 1.0

    grouped = defaultdict(list)
    for run in runs:
        grouped[(run.get("dataset"), run.get("arch"))].append(run)

    tied_by_ds = {}
    for ds in {group[0] for group in grouped}:
        vecs = {}
        for arch in _ARCH_ORDER:
            if (ds, arch) not in grouped:
                continue
            vec = _per_image_dice_vec(grouped[(ds, arch)])
            if vec.size:
                vecs[arch] = vec
        if not vecs:
            continue
        means = {arch: float(np.nanmean(vec)) for arch, vec in vecs.items()}
        best = max(means, key=means.get)
        reference = vecs[best]
        tied = {best}
        # `not (p < 0.05)` deliberately keeps a NaN p-value in the tied set.
        tied.update(arch for arch, vec in vecs.items()
                    if arch != best and not (_paired_p(reference, vec) < 0.05))
        tied_by_ds[ds] = tied
    return tied_by_ds
# ----------------------------------------------------------------------------- text exports
def to_csv(rows):
    """Serialize summary rows as CSV text (header line plus one line per row).

    Columns are ``dataset, protocol, arch, n_seeds`` followed by
    ``<key>_mean, <key>_sd`` for every metric key in ``METRICS``. Values are
    converted with ``str()``; a field containing a comma, quote or newline is
    quoted by the ``csv`` module so the file stays parseable. Lines end with
    ``"\\n"``.

    Args:
        rows: Summary rows as produced by ``summarize``.

    Returns:
        The CSV document as a single string.
    """
    import csv
    import io

    cols = ["dataset", "protocol", "arch", "n_seeds"]
    for key, _, _, _ in METRICS:
        cols += [f"{key}_mean", f"{key}_sd"]
    buf = io.StringIO()
    writer = csv.writer(buf, lineterminator="\n")
    writer.writerow(cols)
    # str() first so None / NaN keep rendering as "None" / "nan".
    writer.writerows([str(r[c]) for c in cols] for r in rows)
    return buf.getvalue()
def _dice_matrix(rows):
    """Return ``(cell, datasets, methods, avg)`` for the main Dice table.

    ``cell``, ``datasets`` and ``methods`` come straight from ``_grid``;
    ``avg`` maps each method to the NaN-ignoring mean of its ``dice_mean`` over
    the datasets it has a cell for (NaN when it has none).
    """
    cell, datasets, methods = _grid(rows)
    avg = {}
    for arch in methods:
        per_dataset = [cell[(ds, arch)]["dice_mean"] for ds in datasets if (ds, arch) in cell]
        # An empty list would raise inside nanmean; substitute a single NaN.
        avg[arch] = np.nanmean(per_dataset or [np.nan])
    return cell, datasets, methods, avg
def _dice_bold(a, d, cell, best, sig):
    """Decide whether the Dice cell of (dataset ``d``, arch ``a``) is bold.

    When ``sig`` holds a tied-for-best set for ``d``, membership in that set
    decides. Otherwise — ``sig`` is ``None``, or it has no entry for ``d`` (for
    example because no significance test could be run) — the cell is bold when
    its ``dice_mean`` equals ``best[d]``.

    Args:
        a: Arch id.
        d: Dataset id.
        cell: Mapping ``(dataset, arch) -> summary row``.
        best: Mapping ``dataset -> best dice_mean``.
        sig: Mapping ``dataset -> set(arch)`` or ``None``.

    Returns:
        A truthy value when the cell should be bold; ``False`` when the cell
        does not exist.
    """
    if (d, a) not in cell:
        return False
    tied = sig.get(d) if sig is not None else None
    if tied is not None:
        return a in tied
    # No significance information for this dataset: fall back to the single best.
    return cell[(d, a)]["dice_mean"] == best[d]
def to_markdown(rows, sig=None):
    """Render the main Dice table (methods × datasets) as Markdown.

    Each cell shows ``mean±SD`` in percent; a missing (dataset, arch) pair is
    shown as an en dash. Bolding is delegated to ``_dice_bold``. The per-dataset
    best value handed to it ignores NaN means, so one run without a Dice value
    cannot suppress bolding for the whole column.

    Args:
        rows: Summary rows as produced by ``summarize``.
        sig: Optional ``{dataset: set(arch)}`` of statistically tied-for-best archs.

    Returns:
        The Markdown document as a string ending in a newline.
    """
    cell, datasets, methods, _ = _dice_matrix(rows)
    # str() keeps join() working for an id that has no display name and is not a string.
    head = ["Method"] + [str(_DS_DISP.get(d, d)) for d in datasets]
    lines = [
        "## Main results — Dice (mean±SD %, ↑)",
        "",
        ("_**Bold** = best or not significantly worse than best per dataset "
         "(paired Wilcoxon on per-image Dice, p≥0.05). No cross-dataset average column — "
         "the seven modalities are too heterogeneous for one number to be meaningful._"),
        "",
        "| " + " | ".join(head) + " |",
        "|" + "---|" * len(head),
    ]
    best = {}
    for d in datasets:
        means = [cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell]
        # max() over a list containing NaN depends on element order; drop NaN first.
        best[d] = max((m for m in means if m == m), default=float("nan"))
    for a in methods:
        cells = [str(_ARCH_DISP.get(a, a))]
        for d in datasets:
            if (d, a) in cell:
                t = _fmt(cell[(d, a)], "dice", True)
                cells.append(f"**{t}**" if _dice_bold(a, d, cell, best, sig) else t)
            else:
                cells.append("–")
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines) + "\n"
def _latex_escape(text):
    """Escape the characters that are special in LaTeX text mode."""
    special = {
        "\\": r"\textbackslash{}", "&": r"\&", "%": r"\%", "$": r"\$", "#": r"\#",
        "_": r"\_", "{": r"\{", "}": r"\}", "~": r"\textasciitilde{}",
        "^": r"\textasciicircum{}",
    }
    # Character-wise replacement, so already-inserted backslashes are never re-escaped.
    return "".join(special.get(ch, ch) for ch in str(text))


def to_latex(rows, sig=None):
    """Render the main Dice table (methods × datasets) as a booktabs tabular.

    Cells show the mean Dice in percent with one decimal; a missing pair or a
    NaN mean is shown as ``--``. Bolding is delegated to ``_dice_bold`` with a
    per-dataset best that ignores NaN means. Method and dataset names are
    LaTeX-escaped, which matters for ids without a display name (e.g. ones that
    contain ``_``). A ``\\midrule`` follows the ``attention_unet`` row when
    further rows come after it.

    Args:
        rows: Summary rows as produced by ``summarize``.
        sig: Optional ``{dataset: set(arch)}`` of statistically tied-for-best archs.

    Returns:
        The LaTeX source as a string ending in a newline.
    """
    cell, datasets, methods, _ = _dice_matrix(rows)
    spec = "l" + "c" * len(datasets)
    lines = [
        "% Main results: Dice (mean over seeds, %). Bold = best or not significantly",
        "% worse than best per dataset (paired Wilcoxon on per-image Dice, p>=0.05).",
        "% No cross-dataset average column (modalities too heterogeneous).",
        "\\begin{tabular}{" + spec + "}",
        "\\toprule",
        "Method & " + " & ".join(_latex_escape(_DS_DISP.get(d, d)) for d in datasets) + " \\\\",
        "\\midrule",
    ]
    best = {}
    for d in datasets:
        means = [cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell]
        # max() over a list containing NaN depends on element order; drop NaN first.
        best[d] = max((m for m in means if m == m), default=float("nan"))
    last = len(methods) - 1
    for i, a in enumerate(methods):
        cells = [_latex_escape(_ARCH_DISP.get(a, a))]
        for d in datasets:
            row = cell.get((d, a))
            mean = row["dice_mean"] if row is not None else float("nan")
            if mean != mean:
                cells.append("--")
                continue
            t = f"{mean * 100:.1f}"
            cells.append(f"\\textbf{{{t}}}" if _dice_bold(a, d, cell, best, sig) else t)
        lines.append(" & ".join(cells) + " \\\\")
        if a == "attention_unet" and i < last:
            lines.append("\\midrule")  # separate CNNs from transformers/foundation
    lines += ["\\bottomrule", "\\end{tabular}"]
    return "\n".join(lines) + "\n"
# ----------------------------------------------------------------------------- HTML report
# Static rows of the "A. Datasets" appendix table. Column order matches the header
# emitted by _setup_html():
# (#, Dataset, Modality, Target, Cls, Ch, Native size, Protocol, Train/Val/Test).
# The ¹ / ² markers refer to the footnotes printed below that table.
_DATASETS_INFO = [
("1", "CVC-ClinicDB", "Colonoscopy (endoscopy)", "Polyp", "2", "RGB", "384×288", "official", "490 / 61 / 61"),
("2", "Kvasir-SEG", "GI endoscopy", "Polyp", "2", "RGB", "~622×529 (var)", "official", "800 / 100 / 100"),
("3", "FIVES", "Retinal fundus", "Vessel", "2", "RGB", "2048×2048", "official", "480 / 120 / 200"),
("4", "BUSI", "Breast ultrasound", "Tumor", "2", "grayscale¹", "variable", "single-split²", "545 / 78 / 157"),
("5", "REFUGE2", "Retinal fundus", "Optic disc & cup", "3", "RGB", "~2124×2056", "official", "400 / 400 / 400"),
("6", "ACDC", "Cardiac MRI (2D slices)", "RV / Myo / LV", "4", "grayscale", "~240×256 (var)", "official", "136 / 210 / 380"),
("7", "IDRiD", "Retinal fundus", "DR lesions (4) + optic disc", "6", "RGB", "4288×2848", "official", "43 / 11 / 27"),
("8", "PanNuke", "Histopathology (H&E)", "Nuclei (5 types)", "6", "RGB", "256×256", "official 3-fold CV", "~2.7k / 2.6k / 2.6k per fold"),
("9", "ISIC2018", "Dermoscopy", "Skin lesion", "2", "RGB", "256×256", "holdout", "2582 / 369 / 737"),
("10", "KiTS19", "Kidney CT (2D slices)", "Kidney (binary)", "2", "grayscale¹", "256×256", "single-split²", "2832 / 479 / 705"),
]
# Static rows of the "B. Methods" appendix table: (Method, Family, Backbone / setup).
_METHODS_INFO = [
("UNet", "CNN encoder–decoder", "SMP, ResNet-50 encoder (ImageNet)"),
("UNet++", "Nested UNet", "SMP, ResNet-50 (ImageNet)"),
("DeepLabV3+", "Atrous CNN", "SMP, ResNet-50 (ImageNet)"),
("Attention-UNet", "Attention-gated UNet", "Re-implemented, from scratch"),
("TransUNet", "CNN–Transformer hybrid", "R50-ViT-B/16 (ImageNet), input 256"),
("Swin-UNet", "Pure-Transformer UNet", "Swin-Tiny (ImageNet), input 224"),
("nnU-Net (v2)", "Self-configuring CNN", "2D config, 250 epochs"),
("U-Mamba", "State-space (Mamba) UNet", "U-Mamba_Bot, 100 epochs"),
]
# Static rows of the "C. Metrics" appendix table:
# (Metric, Definition, Dir, Unit, description). The description column is Chinese
# text that is emitted verbatim into the report.
_METRICS_INFO = [
("Dice (DSC)", "2TP / (2TP+FP+FN)", "↑", "%", "区域重叠度(主指标),对类别不平衡较鲁棒。"),
("IoU (Jaccard)", "TP / (TP+FP+FN)", "↑", "%", "交并比,更严格的重叠度,常与 Dice 并列。"),
("HD95", "95% Hausdorff distance (boundaries)", "↓", "px", "边界最大误差的95%分位,越小边界越贴合。"),
("ASSD", "average symmetric surface distance", "↓", "px", "平均对称表面距离,整体边界吻合度。"),
("Sensitivity", "TP / (TP+FN)", "↑", "%", "召回/敏感度,反映漏分割程度。"),
("Specificity", "TN / (TN+FP)", "↑", "%", "特异度,背景误报控制。"),
("Precision", "TP / (TP+FP)", "↑", "%", "精确率,反映过分割/误报程度。"),
]
# Dataset id -> {class id (string) -> column title} for the per-class Dice tables.
# Only datasets listed here get a per-class table, and only the class ids listed
# here become columns (sorted numerically by _perclass_section).
_PERCLASS_NAMES = {
"acdc_png": {"1": "RV", "2": "Myocardium", "3": "LV"},
"refuge2": {"1": "Optic Disc", "2": "Optic Cup"},
"idridd_segmentation": {"1": "MA", "2": "Haemorrhage", "3": "Hard Exudate", "4": "Soft Exudate", "5": "Optic Disc"},
"pannuke_semantic": {"1": "Neoplastic", "2": "Inflammatory", "3": "Connective", "4": "Dead", "5": "Epithelial"},
}
def _collect_perclass(runs):
    """Pool per-class Dice over all images of all runs, per (dataset, arch).

    Every ``per_image`` entry may carry a ``per_class`` mapping of
    ``class id -> {"dice": value}``. Missing or null containers at any level
    (``per_image``, ``per_class``, the per-class dict) and missing or NaN Dice
    values are skipped.

    Args:
        runs: Run dicts as loaded from ``metrics.json``.

    Returns:
        ``{(dataset, arch): {class id: mean Dice}}`` containing only classes
        with at least one valid value.
    """
    acc = defaultdict(lambda: defaultdict(list))
    for d in runs:
        key = (d.get("dataset"), d.get("arch"))
        # `or []` tolerates an explicit JSON null, like the `or {}` guards below.
        for pi in d.get("per_image") or []:
            for c, m in (pi.get("per_class") or {}).items():
                v = (m or {}).get("dice")
                if v is not None and v == v:  # v == v filters NaN
                    acc[key][c].append(v)
    return {k: {c: float(np.mean(v)) for c, v in cd.items() if v} for k, cd in acc.items()}
# Stylesheet embedded verbatim in the <style> element of summary.html.
# `table.rt` styles the result tables (booktabs-like rules, `.m` = left-aligned
# method column, `.avg` = shaded summary column, `tr.grp` = rule above a method
# group); `table.info` styles the appendix tables; `.cap` is the caption text and
# `.note` the highlighted protocol box.
_CSS = """
body{font-family:'Helvetica Neue',Arial,sans-serif;margin:30px auto;max-width:1180px;color:#1a1a1a;line-height:1.5}
h1{font-size:21px;margin:0 0 4px}h2{font-size:15px;color:#0a5a33;margin:30px 0 4px;border-bottom:1px solid #e3e3e3;padding-bottom:3px}
h3{font-size:13px;margin:16px 0 4px;color:#333}
p,li{font-size:13px}code{background:#f2f2f2;padding:1px 4px;border-radius:3px}
.cap{color:#666;font-size:11.5px;margin:3px 0 6px}
.tw{overflow-x:auto}
table.rt{border-collapse:collapse;margin:6px 0 8px;font-size:11.5px}
table.rt th,table.rt td{padding:4px 9px;text-align:center;white-space:nowrap}
table.rt thead th{border-top:2px solid #222;border-bottom:1.2px solid #222;font-weight:600}
table.rt tbody tr:last-child td{border-bottom:2px solid #222}
table.rt td.m,table.rt th.m{text-align:left;font-weight:600}
table.rt td.avg,table.rt th.avg{border-left:1px solid #c8c8c8;background:#f7f9f8}
table.rt tbody tr.grp td{border-top:1px solid #cfcfcf}
table.rt b{color:#08402a}
table.info{border-collapse:collapse;margin:6px 0 14px;font-size:12px}
table.info th,table.info td{border:1px solid #ddd;padding:4px 8px;text-align:center}
table.info th{background:#f3f3f3}table.info td.l{text-align:left}
.note{background:#eef7f0;border-left:3px solid #0a6;padding:8px 12px;font-size:12.5px;margin:8px 0}
hr{border:none;border-top:1px solid #e3e3e3;margin:24px 0}
"""
def _metric_table(cell, datasets, methods, key, pct, hib, bold_sets=None):
    """Render one metric as an HTML table: methods (rows) × datasets (columns).

    Deliberately NO cross-dataset summary column: the datasets span several
    modalities of very different difficulty, so a simple average is not
    meaningful (and would conflict with the per-dataset ranking).

    Args:
        cell: Mapping ``(dataset, arch) -> summary row``.
        datasets: Column order.
        methods: Row order.
        key: Metric key (reads ``<key>_mean`` / ``<key>_sd``).
        pct: Format values as percentages.
        hib: Higher-is-better; selects max (true) or min (false) as the best.
        bold_sets: Optional ``{dataset: set(arch)}``. For a dataset with an
            entry, exactly those archs are bold. For a dataset without one (or
            when ``bold_sets`` is ``None``), the single best arch is bold.

    Returns:
        The HTML fragment as a string.
    """
    field = f"{key}_mean"

    def value(d, a):
        # Metric mean of a cell, or None when the cell is absent or NaN.
        row = cell.get((d, a))
        if row is None:
            return None
        v = row[field]
        return v if v == v else None

    pick = max if hib else min
    best = {}
    for d in datasets:
        vals = {a: value(d, a) for a in methods}
        vals = {a: v for a, v in vals.items() if v is not None}
        best[d] = pick(vals, key=vals.get) if vals else None

    h = ["<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
         + "".join(f"<th>{_DS_DISP.get(d, d)}</th>" for d in datasets)
         + "</tr></thead><tbody>"]
    for a in methods:
        # A rule above TransUNet visually starts the second method group.
        grp = " class='grp'" if a == "transunet" else ""
        tds = [f"<td class='m'>{_ARCH_DISP.get(a, a)}</td>"]
        for d in datasets:
            if value(d, a) is None:
                tds.append("<td>–</td>")
                continue
            t = _fmt(cell[(d, a)], key, pct)
            tied = bold_sets.get(d) if bold_sets is not None else None
            # Without significance data for this dataset, bold the single best.
            b = (a in tied) if tied is not None else (a == best[d])
            tds.append(f"<td>{'<b>'+t+'</b>' if b else t}</td>")
        h.append(f"<tr{grp}>" + "".join(tds) + "</tr>")
    h.append("</tbody></table></div>")
    return "\n".join(h)
def _perclass_section(runs):
    """Render the per-class Dice tables for the datasets in ``_PERCLASS_NAMES``.

    One table per dataset that has per-class data for at least one arch of
    ``_ARCH_ORDER``: methods as rows, the named classes as columns, plus a
    ``macro`` column holding the unweighted mean over the classes present for
    that method. The best value per class is bold; it is computed only over
    methods that actually have the class, so a method lacking a class cannot
    suppress bolding for the others. A method with none of the named classes
    shows an en dash in the macro column.

    Args:
        runs: Run dicts as loaded from ``metrics.json``.

    Returns:
        The HTML fragment as a string (empty when no dataset qualifies).
    """
    pc = _collect_perclass(runs)
    h = []
    for ds, names in _PERCLASS_NAMES.items():
        methods = [a for a in _ARCH_ORDER if (ds, a) in pc and pc[(ds, a)]]
        if not methods:
            continue
        classes = sorted(names, key=int)
        colbest = {}
        for c in classes:
            scores = [pc[(ds, a)][c] for a in methods if c in pc[(ds, a)]]
            # max() over a sequence containing NaN depends on element order, so
            # missing classes are left out instead of being represented by NaN.
            colbest[c] = max(scores, default=float("nan"))
        h.append(f"<h3>{_DS_DISP.get(ds, ds)}</h3>")
        h.append("<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
                 + "".join(f"<th>{names[c]}</th>" for c in classes) + "<th class='avg'>macro</th></tr></thead><tbody>")
        for a in methods:
            grp = " class='grp'" if a == "transunet" else ""
            cells, present = [], []
            for c in classes:
                v = pc[(ds, a)].get(c)
                if v is None:
                    cells.append("<td>–</td>")
                else:
                    present.append(v)
                    t = f"{v*100:.1f}"
                    cells.append(f"<td>{'<b>'+t+'</b>' if v == colbest[c] else t}</td>")
            macro = f"{sum(present) / len(present) * 100:.1f}" if present else "–"
            h.append(f"<tr{grp}><td class='m'>{_ARCH_DISP.get(a, a)}</td>{''.join(cells)}"
                     f"<td class='avg'>{macro}</td></tr>")
        h.append("</tbody></table></div>")
    return "\n".join(h)
def _setup_html():
    """Render the appendix: the static Datasets, Methods and Metrics tables.

    The rows come from ``_DATASETS_INFO``, ``_METHODS_INFO`` and
    ``_METRICS_INFO``; each tuple is substituted into a fixed ``<tr>`` template.

    Returns:
        The HTML fragment as a newline-joined string.
    """
    dataset_row = ("<tr><td>%s</td><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td><td>%s</td>"
                   "<td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>")
    method_row = "<tr><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td></tr>"
    metric_row = "<tr><td class='l'>%s</td><td class='l'>%s</td><td>%s</td><td>%s</td><td class='l'>%s</td></tr>"

    parts = [
        "<h2>A. Datasets</h2>",
        "<table class='info'><tr><th>#</th><th>Dataset</th><th>Modality</th><th>Target</th><th>Cls</th>"
        "<th>Ch</th><th>Native size</th><th>Protocol</th><th>Train/Val/Test</th></tr>",
    ]
    parts.extend(dataset_row % info for info in _DATASETS_INFO)
    parts.append("</table>")
    parts.append("<div class='cap'>¹ BUSI/KiTS19 grayscale stored as 3-ch PNG (read as grayscale). "
                 "² no canonical split → one fixed fold (of 5) with 3 seeds; others use the official split. "
                 "Labels 0…C-1 (0=bg); multi-class metrics macro-averaged over foreground classes.</div>")

    parts.append("<h2>B. Methods</h2>")
    parts.append("<table class='info'><tr><th>Method</th><th>Family</th><th>Backbone / setup</th></tr>")
    parts.extend(method_row % info for info in _METHODS_INFO)
    parts.append("</table>")

    parts.append("<h2>C. Metrics</h2>")
    parts.append("<table class='info'><tr><th>Metric</th><th>Definition</th><th>Dir</th><th>Unit</th>"
                 "<th>作用 / 含义(中文)</th></tr>")
    parts.extend(metric_row % info for info in _METRICS_INFO)
    parts.append("</table>")
    return "\n".join(parts)
def to_html(rows, runs=None, title="SegGen benchmark", sig=None):
    """Render the full HTML report.

    Sections: main Dice table, HD95, IoU, per-class Dice (only when ``runs`` is
    given and yields per-class data), Sensitivity/Precision, and the static
    setup appendix.

    NOTE: the headline counts ("8 methods × 10 datasets", "3 seeds") are fixed
    text and are not derived from ``rows``.

    Args:
        rows: Summary rows as produced by ``summarize``.
        runs: Optional raw run dicts; needed for the per-class section and, when
            ``sig`` is not supplied, for the significance test.
        title: Report title. It is HTML-escaped before being placed in
            ``<title>`` and ``<h1>``, so characters such as ``&`` or ``<`` in an
            experiment name cannot break the markup.
        sig: Optional ``{dataset: set(arch)}`` of tied-for-best archs for the
            Dice table; computed from ``runs`` when ``None``.

    Returns:
        The complete HTML document as a string.
    """
    from html import escape

    cell, datasets, methods = _grid(rows)
    if sig is None:
        sig = _sig_tied_sets(runs) if runs else None
    safe_title = escape(title)
    h = [f"<!doctype html><html><head><meta charset='utf-8'><title>{safe_title}</title><style>{_CSS}</style>"
         "</head><body>"]
    h.append(f"<h1>{safe_title}: 8 methods × 10 datasets (unified 512, resolution-fair)</h1>")
    h.append("<p>Eight 2D medical-image segmentation methods on ten public datasets (seven modalities). "
             "Values are <b>mean±SD</b> over 3 seeds (over the 3 folds for PanNuke). "
             "Each (dataset,method) cell aggregates tens–thousands of test images.</p>")
    h.append("<div class='note'><b>Resolution-fair protocol.</b> Convolutional nets train at 512; the fixed-input "
             "transformers (Swin-UNet 224, TransUNet 256) and nnU-Net/U-Mamba run at their native size; "
             "<b>every prediction and ground truth is then resized to a common 512×512 before scoring</b>, so "
             "boundary metrics (HD95/ASSD, in pixels) are directly comparable across methods.</div>")
    h.append("<h2>1. Main results — Dice (%) ↑</h2>")
    h.append("<div class='cap'><b>Bold</b> = best, or not significantly different from the best per dataset "
             "(paired Wilcoxon on per-image Dice, p≥0.05). "
             "Horizontal rule separates CNNs (top) from Transformer / foundation models (bottom). "
             "No cross-dataset average is reported — the seven modalities differ too much in difficulty "
             "for a single number to be meaningful.</div>")
    h.append(_metric_table(cell, datasets, methods, "dice", True, True, bold_sets=sig))
    h.append("<h2>2. Boundary accuracy — HD95 (px) ↓</h2>")
    h.append("<div class='cap'>95% Hausdorff distance at the common 512 resolution (lower = better; "
             "<b>bold</b> = best per dataset). Now comparable across methods.</div>")
    h.append(_metric_table(cell, datasets, methods, "hd95", False, False))
    h.append("<h2>3. Overlap — IoU (%) ↑</h2>")
    h.append("<div class='cap'>Jaccard index, the stricter overlap measure (<b>bold</b> = best per dataset).</div>")
    h.append(_metric_table(cell, datasets, methods, "iou", True, True))
    if runs:
        pcs = _perclass_section(runs)
        if pcs.strip():
            h.append("<h2>4. Per-class Dice (%) — multi-class datasets</h2>")
            h.append("<div class='cap'>Mean per-class Dice over all test images/runs (0=background excluded; "
                     "<b>bold</b>=best per class). The <i>macro</i> column weights each foreground class "
                     "equally (a within-dataset mean, not a cross-dataset one). It can differ by ~1 pt from "
                     "the §1 Dice — which is image-weighted (each image is first averaged over the classes it "
                     "contains) — whenever some images lack a class (e.g. ACDC's RV appears in only 335/380 "
                     "images); both conventions are standard, neither is an error.</div>")
            h.append(pcs)
    h.append("<h2>5. Supplementary metrics — Sensitivity &amp; Precision (%) ↑</h2>")
    h.append("<div class='cap'>Two complementary error views (<b>bold</b> = best per dataset): low "
             "<b>Sensitivity</b> (recall) signals under-segmentation (missed foreground); low "
             "<b>Precision</b> signals over-segmentation (false positives). <i>Specificity</i> is omitted "
             "— background dominates, so it stays &gt;96% with almost no spread across methods (≤0.6 pt on "
             "average) — and <i>ASSD</i> is omitted as redundant with HD95; both, and every metric, are "
             "tabulated in full in <code>summary.csv</code>.</div>")
    h.append("<h3>Sensitivity / recall ↑</h3>")
    h.append(_metric_table(cell, datasets, methods, "sensitivity", True, True))
    h.append("<h3>Precision ↑</h3>")
    h.append(_metric_table(cell, datasets, methods, "precision", True, True))
    h.append("<hr><h2>Appendix — Experimental setup</h2>")
    h.append("<p class='cap'>Full per-(dataset,method) values for <b>every</b> metric "
             "(IoU, HD95, ASSD, Sensitivity, Specificity, Precision, …) are in "
             "<code>summary.csv</code>; the Dice table as LaTeX is in <code>summary.tex</code>.</p>")
    h.append(_setup_html())
    h.append("</body></html>")
    return "\n".join(h)
def main():
    """Command-line entry point: aggregate one experiment and write the summaries.

    Reads every run under ``<out_root>/<exp_name>``, then writes
    ``summary.csv``, ``summary.md``, ``summary.tex`` and ``summary.html`` into
    that directory and prints the Markdown table. When no run is found, a
    message is printed and nothing is written.

    All files are written as UTF-8: the reports contain non-ASCII characters
    (``±``, dashes, CJK text) and the HTML declares ``charset='utf-8'``, so
    relying on the platform default encoding could fail or produce a mismatch.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--exp_name", required=True)
    p.add_argument("--out_root", default="results")
    args = p.parse_args()
    runs = load_runs(args.out_root, args.exp_name)
    if not runs:
        print(f"no metrics.json under {args.out_root}/{args.exp_name}")
        return
    rows = summarize(runs)
    sig = _sig_tied_sets(runs)
    base = os.path.join(args.out_root, args.exp_name)
    markdown = to_markdown(rows, sig)
    outputs = {
        "summary.csv": to_csv(rows),
        "summary.md": markdown,
        "summary.tex": to_latex(rows, sig),
        "summary.html": to_html(rows, runs, title=f"SegGen benchmark ({args.exp_name})", sig=sig),
    }
    for name, text in outputs.items():
        # Context manager guarantees the file is flushed and closed.
        with open(os.path.join(base, name), "w", encoding="utf-8") as f:
            f.write(text)
    print(markdown)
    print(f"{len(runs)} runs -> {len(rows)} (dataset,arch) cells; written {base}/summary.{{csv,md,tex,html}}")


if __name__ == "__main__":
    main()