report: clarify per-class macro vs main-table Dice averaging convention

14548aa verified 15 days ago

25.2 kB

	"""Aggregate per-seed metrics.json into paper-style result tables (mean±SD).

	Scans results/<exp_name>/*/seed/metrics.json, groups by (dataset, protocol, arch),
	reports mean±SD over seeds (over folds for CV datasets). Emits:
	- summary.csv : full per-(dataset,method) detail, every metric (raw data export)
	- summary.md : the main Dice table, methods×datasets (quick read)
	- summary.tex : the main Dice table as booktabs LaTeX (paper-ready)
	- summary.html : full paper-style report (main tables, per-class, significance, setup)

	python framework/report/aggregate.py --exp_name baselines [--out_root results]
	"""
	from __future__ import annotations

	import os
	import json
	import glob
	import argparse
	import warnings
	from collections import defaultdict

	import numpy as np

	# per-image Dice vectors can have all-NaN positions (empty masks across seeds);
	# np.nanmean warns harmlessly on those — silence it for clean console/report runs.
	warnings.filterwarnings("ignore", message="Mean of empty slice")

	# Metric registry, one tuple per metric:
	# (key, label, is_percent, higher_is_better)
	# `key` is the prefix of the "<key>_mean" / "<key>_sd" fields that summarize() builds
	# and to_csv() exports; both unpack these tuples positionally, so keep the 4-field shape.
	METRICS = [
	("dice", "Dice", True, True),
	("iou", "IoU", True, True),
	("hd95", "HD95", False, False),
	("assd", "ASSD", False, False),
	("sensitivity", "Sens", True, True),
	("specificity", "Spec", True, True),
	("precision", "Prec", True, True),
	]


	def load_runs(out_root, exp_name):
	    """Load every per-seed ``metrics.json`` found under ``<out_root>/<exp_name>``.

	    The search is recursive: any ``metrics.json`` sitting in a directory whose name
	    starts with ``seed`` is picked up, at any depth below the experiment folder
	    (``**`` only recurses because ``recursive=True`` is passed to glob).

	    Args:
	        out_root: root results directory.
	        exp_name: experiment sub-directory name.

	    Returns:
	        A list with the parsed JSON content of each readable file, in sorted path
	        order. Unreadable or malformed files are skipped with a warning instead of
	        aborting the whole report.
	    """
	    pattern = os.path.join(out_root, exp_name, "**", "seed*", "metrics.json")
	    runs = []
	    # sorted() makes the run order (and therefore the report) deterministic.
	    for path in sorted(glob.glob(pattern, recursive=True)):
	        try:
	            with open(path, encoding="utf-8") as f:
	                runs.append(json.load(f))
	        except (OSError, ValueError) as err:
	            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
	            warnings.warn(f"skipping unreadable metrics file {path}: {err}")
	    return runs


	# Display label for specific (dataset, protocol) pairs; any pair not listed here is
	# shown under its raw protocol name (see _proto_label).
	_PROTO_LABEL = {
	    ("idridd_segmentation", "fold01"): "official",
	    ("busi", "fold01"): "single-split",
	    ("medsegdb_kits19", "fold01"): "single-split",
	    ("pannuke_semantic", "fold01"): "single-split",
	}
	# Datasets that summarize() aggregates over protocols (folds) instead of over seeds.
	_CV_DATASETS = {"pannuke_semantic"}


	def _proto_label(dataset, protocol):
	    """Return the display label for ``(dataset, protocol)``.

	    Falls back to ``protocol`` itself when the pair has no entry in ``_PROTO_LABEL``.
	    """
	    return _PROTO_LABEL.get((dataset, protocol), protocol)


	def _agg_over(items, key):
	    """Return ``(mean, sd)`` of ``metrics["<key>_mean"]`` across ``items``.

	    Args:
	        items: run dicts; each may carry a ``"metrics"`` mapping.
	        key: metric key, e.g. ``"dice"``.

	    Returns:
	        A tuple of floats. Runs without the value (or with NaN) are ignored; the SD
	        is numpy's default population SD (``ddof=0``). ``(nan, nan)`` is returned
	        when no run provides a usable value.
	    """
	    field = f"{key}_mean"
	    # `or {}` also tolerates an explicit `"metrics": null` in the JSON.
	    vals = np.array([(it.get("metrics") or {}).get(field, np.nan) for it in items],
	                    np.float64)
	    vals = vals[~np.isnan(vals)]
	    if not vals.size:
	        return float("nan"), float("nan")
	    return float(vals.mean()), float(vals.std())


	def summarize(runs):
	    """Collapse raw runs into one summary row per (dataset, arch).

	    Args:
	        runs: run dicts as returned by load_runs(); the keys read here are
	            ``"dataset"``, ``"arch"``, ``"protocol"`` and (via _agg_over) ``"metrics"``.

	    Returns:
	        A list of row dicts ordered by (dataset, arch). Every row has ``"dataset"``,
	        ``"arch"``, ``"protocol"`` (display label), ``"n_seeds"`` and, for each metric
	        key in METRICS, ``"<key>_mean"`` and ``"<key>_sd"``.

	        * Datasets in ``_CV_DATASETS`` with more than one protocol: each protocol
	          (fold) is first averaged over its runs, then mean/SD are taken over the
	          fold means; ``n_seeds`` is the number of folds.
	        * Everything else: mean/SD over the runs of the first protocol in sorted
	          order; ``n_seeds`` is the number of those runs.
	    """
	    by_da = defaultdict(lambda: defaultdict(list))
	    for d in runs:
	        by_da[(d.get("dataset"), d.get("arch"))][d.get("protocol")].append(d)

	    def _order(pair):
	        # Same ordering as plain tuple sorting for string keys, but does not raise
	        # when a run lacks "dataset" or "arch" (None).
	        return tuple("" if part is None else str(part) for part in pair)

	    rows = []
	    for dataset, arch in sorted(by_da, key=_order):
	        proto_map = by_da[(dataset, arch)]
	        protos = sorted(p for p in proto_map if p is not None)
	        row = {"dataset": dataset, "arch": arch}
	        if dataset in _CV_DATASETS and len(protos) > 1:
	            row["protocol"] = f"{len(protos)}-fold"
	            row["n_seeds"] = len(protos)
	            for key, _, _, _ in METRICS:
	                per_fold = (_agg_over(proto_map[p], key)[0] for p in protos)
	                fm = np.array([m for m in per_fold if not np.isnan(m)], np.float64)
	                row[f"{key}_mean"] = float(fm.mean()) if fm.size else float("nan")
	                row[f"{key}_sd"] = float(fm.std()) if fm.size else float("nan")
	        else:
	            proto = protos[0] if protos else None
	            items = proto_map.get(proto, [])
	            row["protocol"] = _proto_label(dataset, proto)
	            row["n_seeds"] = len(items)
	            for key, _, _, _ in METRICS:
	                row[f"{key}_mean"], row[f"{key}_sd"] = _agg_over(items, key)
	        rows.append(row)
	    return rows


	# ----------------------------------------------------------------------------- display
	# Row order of the methods in every table (see _grid); also the set of archs that
	# _sig_tied_sets and _perclass_section consider.
	_ARCH_ORDER = ["unet", "unetpp", "deeplabv3plus", "attention_unet", "transunet", "swinunet",
	"nnunet", "umamba"]
	# arch key -> display name; lookups fall back to the raw key when an arch is missing.
	_ARCH_DISP = {"unet": "UNet", "unetpp": "UNet++", "deeplabv3plus": "DeepLabV3+",
	"attention_unet": "Attention-UNet", "transunet": "TransUNet",
	"swinunet": "Swin-UNet", "nnunet": "nnU-Net", "umamba": "U-Mamba"}
	# Column order of the datasets in every table; datasets not listed are appended by _grid.
	_DS_ORDER = ["cvc_clinicdb", "kvasir_seg", "fives", "busi", "refuge2", "acdc_png",
	"idridd_segmentation", "pannuke_semantic", "medsegdb_isic2018", "medsegdb_kits19"]
	# dataset key -> display name; lookups fall back to the raw key when a dataset is missing.
	_DS_DISP = {"cvc_clinicdb": "CVC-ClinicDB", "kvasir_seg": "Kvasir-SEG", "fives": "FIVES",
	"busi": "BUSI", "refuge2": "REFUGE2", "acdc_png": "ACDC",
	"idridd_segmentation": "IDRiD", "pannuke_semantic": "PanNuke",
	"medsegdb_isic2018": "ISIC2018", "medsegdb_kits19": "KiTS19"}


	def _fmt(row, key, pct):
	    """Format ``row``'s ``<key>_mean`` / ``<key>_sd`` as ``"mean±sd"`` with 2 decimals.

	    Args:
	        row: summary row containing ``"<key>_mean"`` and ``"<key>_sd"``.
	        key: metric key, e.g. ``"dice"``.
	        pct: when true the values are fractions and are shown multiplied by 100.

	    Returns:
	        The formatted string, or an em dash when the mean is NaN.
	    """
	    m, s = row[f"{key}_mean"], row[f"{key}_sd"]
	    if m != m:  # NaN is the only value that differs from itself
	        return "—"
	    if pct:
	        return f"{m * 100:.2f}±{s * 100:.2f}"
	    return f"{m:.2f}±{s:.2f}"


	def _grid(rows):
	    """Index summary rows for table rendering.

	    Args:
	        rows: summary rows as produced by summarize().

	    Returns:
	        ``(cell, datasets, methods)`` where ``cell`` maps ``(dataset, arch)`` to its
	        row, ``methods`` lists the archs present in ``_ARCH_ORDER`` order (or, when
	        none of them is present, all archs sorted), and ``datasets`` lists the known
	        datasets in ``_DS_ORDER`` order followed by any others in first-seen order.
	    """
	    cell = {(r["dataset"], r["arch"]): r for r in rows}
	    archs = {r["arch"] for r in rows}
	    methods = [a for a in _ARCH_ORDER if a in archs] or sorted(archs)
	    present = {r["dataset"] for r in rows}
	    seen = [d for d in _DS_ORDER if d in present]
	    extra = [r["dataset"] for r in rows if r["dataset"] not in _DS_ORDER]
	    # dict.fromkeys de-duplicates while keeping first-seen order.
	    datasets = list(dict.fromkeys(seen + extra))
	    return cell, datasets, methods


	# ----------------------------------------------------------------------------- significance
	def _per_image_dice_vec(runs_for_da):
	    """Build one per-image Dice vector for a single (dataset, arch).

	    Runs are grouped by ``"protocol"``. Within a protocol the per-image Dice arrays
	    of the runs are truncated to the shortest one and averaged position-wise
	    (NaN-aware); the per-protocol vectors are then concatenated in sorted protocol
	    order.

	    Args:
	        runs_for_da: run dicts, each optionally holding a ``"per_image"`` list of
	            dicts with a ``"dice"`` entry.

	    Returns:
	        A 1-D float array; empty when no run has per-image values.
	    """
	    by_proto = defaultdict(list)
	    for d in runs_for_da:
	        by_proto[d.get("protocol")].append(d)
	    parts = []
	    # key=str keeps the plain string order and avoids a TypeError when some runs
	    # have no protocol (None) and others do.
	    for proto in sorted(by_proto, key=str):
	        arrs = [np.array([pi.get("dice", np.nan) for pi in (d.get("per_image") or [])], float)
	                for d in by_proto[proto]]
	        arrs = [a for a in arrs if a.size]
	        if not arrs:
	            continue
	        # Runs are paired by position, so only the common prefix can be averaged.
	        L = min(a.size for a in arrs)
	        parts.append(np.nanmean(np.stack([a[:L] for a in arrs]), axis=0))
	    return np.concatenate(parts) if parts else np.array([])


	def _sig_tied_sets(runs):
	    """{dataset: set(archs whose per-image Dice is NOT significantly worse than the best,
	    paired Wilcoxon p>=0.05)} — the 'statistically best' set, used to bold the Dice table.

	    Args:
	        runs: run dicts as returned by load_runs().

	    Returns:
	        A dict keyed by dataset. Only archs listed in ``_ARCH_ORDER`` are compared.
	        Datasets without any usable per-image Dice vector get no entry. An empty
	        dict is returned when SciPy cannot be imported.
	    """
	    try:
	        from scipy.stats import wilcoxon
	    except Exception:
	        # Deliberate best effort: the report must still render without SciPy.
	        return {}
	    by_da = defaultdict(list)
	    for d in runs:
	        by_da[(d.get("dataset"), d.get("arch"))].append(d)

	    def pval(a, b):
	        """Paired Wilcoxon p-value of two vectors, paired by position."""
	        L = min(a.size, b.size)
	        if L < 6:
	            # Too few pairs to test; NaN makes the caller treat the arch as tied.
	            return float("nan")
	        x, y = a[:L], b[:L]
	        m = ~(np.isnan(x) | np.isnan(y))
	        if m.sum() < 6 or np.allclose(x[m], y[m]):
	            return 1.0
	        try:
	            return float(wilcoxon(x[m], y[m]).pvalue)
	        except Exception:
	            # Best effort: a failed test counts as "no significant difference".
	            return 1.0

	    out = {}
	    for ds in {k[0] for k in by_da}:
	        vecs = {a: _per_image_dice_vec(by_da[(ds, a)]) for a in _ARCH_ORDER if (ds, a) in by_da}
	        # Drop empty and all-NaN vectors: they have no mean and must never be
	        # reported as tied for best.
	        vecs = {a: v for a, v in vecs.items() if v.size and not np.isnan(v).all()}
	        if not vecs:
	            continue
	        means = {a: float(np.nanmean(v)) for a, v in vecs.items()}
	        best = max(means, key=means.get)
	        tied = {best}
	        for a, v in vecs.items():
	            # `not (p < 0.05)` rather than `p >= 0.05` so a NaN p-value counts as tied.
	            if a != best and not (pval(vecs[best], v) < 0.05):
	                tied.add(a)
	        out[ds] = tied
	    return out


	# ----------------------------------------------------------------------------- text exports
	def to_csv(rows):
	    """Serialize the summary rows as CSV text.

	    Columns are ``dataset, protocol, arch, n_seeds`` followed by ``<key>_mean`` and
	    ``<key>_sd`` for every metric key in METRICS. Values are written with ``str()``;
	    fields containing a comma, quote or newline are quoted by the csv module.

	    Args:
	        rows: summary rows as produced by summarize().

	    Returns:
	        The CSV document as a string with ``\\n`` line endings, header line first.
	    """
	    import csv
	    import io

	    cols = ["dataset", "protocol", "arch", "n_seeds"]
	    for k, _, _, _ in METRICS:
	        cols += [f"{k}_mean", f"{k}_sd"]
	    buf = io.StringIO()
	    writer = csv.writer(buf, lineterminator="\n")
	    writer.writerow(cols)
	    # str() first so None and NaN keep their textual form ("None", "nan").
	    writer.writerows([str(r[c]) for c in cols] for r in rows)
	    return buf.getvalue()


	def _dice_matrix(rows):
	    """Return ``(cell, datasets, methods, avg)`` for the main Dice table.

	    ``cell``, ``datasets`` and ``methods`` come straight from _grid(); ``avg`` maps
	    each method to the NaN-aware mean of its ``dice_mean`` over the datasets it has
	    a cell for (NaN when it has none).
	    """
	    cell, datasets, methods = _grid(rows)
	    avg = {}
	    for a in methods:
	        scores = [cell[(d, a)]["dice_mean"] for d in datasets if (d, a) in cell]
	        # The [np.nan] fallback keeps nanmean from being called on an empty list.
	        avg[a] = np.nanmean(scores or [np.nan])
	    return cell, datasets, methods, avg


	def _dice_bold(a, d, cell, best, sig):
	    """Whether (dataset d, arch a)'s Dice cell should be bold: in the significance
	    'tied-for-best' set when available, else the single best per dataset.

	    Args:
	        a: arch key.
	        d: dataset key.
	        cell: ``(dataset, arch) -> row`` mapping.
	        best: ``dataset -> best dice_mean`` mapping.
	        sig: ``dataset -> set(arch)`` mapping, or None.

	    Returns:
	        True when the cell exists and should be bold.
	    """
	    if (d, a) not in cell:
	        return False
	    # Use the significance set only when one exists for this dataset; an empty or
	    # partial `sig` (e.g. SciPy missing) must not suppress bolding altogether.
	    if sig is not None and d in sig:
	        return a in sig[d]
	    return cell[(d, a)]["dice_mean"] == best[d]


	def to_markdown(rows, sig=None):
	    """Render the main Dice table (methods × datasets) as a Markdown document.

	    Args:
	        rows: summary rows as produced by summarize().
	        sig: optional ``dataset -> set(arch)`` mapping of statistically tied-for-best
	            archs; decides which cells are bold (see _dice_bold).

	    Returns:
	        Markdown text: heading, explanatory note and a pipe table with one row per
	        method. Missing cells are shown as an en dash.
	    """
	    cell, datasets, methods, _ = _dice_matrix(rows)
	    head = ["Method"] + [_DS_DISP.get(d, d) for d in datasets]
	    out = "## Main results — Dice (mean±SD %, ↑)\n\n"
	    out += ("_Bold = best or not significantly worse than best per dataset "
	            "(paired Wilcoxon on per-image Dice, p≥0.05). No cross-dataset average column — "
	            "the seven modalities are too heterogeneous for one number to be meaningful._\n\n")
	    out += "| " + " | ".join(head) + " |\n|" + "---|" * len(head) + "\n"
	    best = {}
	    for d in datasets:
	        scores = [cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell]
	        # max() over a list containing NaN depends on element order, so drop NaNs.
	        best[d] = max((s for s in scores if s == s), default=np.nan)
	    lines = []
	    for a in methods:
	        cells = [_ARCH_DISP.get(a, a)]
	        for d in datasets:
	            if (d, a) not in cell:
	                cells.append("–")
	                continue
	            t = _fmt(cell[(d, a)], "dice", True)
	            mean = cell[(d, a)]["dice_mean"]
	            # Never bold a NaN placeholder.
	            bold = mean == mean and _dice_bold(a, d, cell, best, sig)
	            cells.append(f"**{t}**" if bold else t)
	        lines.append("| " + " | ".join(cells) + " |\n")
	    return out + "".join(lines)


	def to_latex(rows, sig=None):
	    """Render the main Dice table (methods × datasets) as a booktabs ``tabular``.

	    Args:
	        rows: summary rows as produced by summarize().
	        sig: optional ``dataset -> set(arch)`` mapping of statistically tied-for-best
	            archs; decides which cells are bold (see _dice_bold).

	    Returns:
	        LaTeX source. Cells show the mean Dice in percent with one decimal; missing
	        or NaN cells are shown as ``--``.
	    """
	    cell, datasets, methods, _ = _dice_matrix(rows)
	    spec = "l" + "c" * len(datasets)
	    out = ("% Main results: Dice (mean over seeds, %). Bold = best or not significantly\n"
	           "% worse than best per dataset (paired Wilcoxon on per-image Dice, p>=0.05).\n"
	           "% No cross-dataset average column (modalities too heterogeneous).\n")
	    out += "\\begin{tabular}{" + spec + "}\n\\toprule\n"
	    out += "Method & " + " & ".join(_DS_DISP.get(d, d) for d in datasets) + " \\\\\n\\midrule\n"
	    best = {}
	    for d in datasets:
	        scores = [cell[(d, a)]["dice_mean"] for a in methods if (d, a) in cell]
	        # max() over a list containing NaN depends on element order, so drop NaNs.
	        best[d] = max((s for s in scores if s == s), default=np.nan)
	    body = []
	    for a in methods:
	        cells = [_ARCH_DISP.get(a, a)]
	        for d in datasets:
	            mean = cell[(d, a)]["dice_mean"] if (d, a) in cell else float("nan")
	            if mean != mean:  # missing cell or NaN mean
	                cells.append("--")
	                continue
	            t = f"{mean * 100:.1f}"
	            cells.append(f"\\textbf{{{t}}}" if _dice_bold(a, d, cell, best, sig) else t)
	        body.append(" & ".join(cells) + " \\\\\n")
	        # separate CNNs from transformers/foundation; skipped when nothing follows,
	        # so the rule never sits directly above \bottomrule.
	        if a == "attention_unet" and a != methods[-1]:
	            body.append("\\midrule\n")
	    out += "".join(body)
	    out += "\\bottomrule\n\\end{tabular}\n"
	    return out


	# ----------------------------------------------------------------------------- HTML report
	# Static rows of the "A. Datasets" table rendered by _setup_html(). Each tuple is
	# %-formatted positionally into 9 cells, in the header order:
	# (#, Dataset, Modality, Target, Cls, Ch, Native size, Protocol, Train/Val/Test)
	_DATASETS_INFO = [
	("1", "CVC-ClinicDB", "Colonoscopy (endoscopy)", "Polyp", "2", "RGB", "384×288", "official", "490 / 61 / 61"),
	("2", "Kvasir-SEG", "GI endoscopy", "Polyp", "2", "RGB", "~622×529 (var)", "official", "800 / 100 / 100"),
	("3", "FIVES", "Retinal fundus", "Vessel", "2", "RGB", "2048×2048", "official", "480 / 120 / 200"),
	("4", "BUSI", "Breast ultrasound", "Tumor", "2", "grayscale¹", "variable", "single-split²", "545 / 78 / 157"),
	("5", "REFUGE2", "Retinal fundus", "Optic disc & cup", "3", "RGB", "~2124×2056", "official", "400 / 400 / 400"),
	("6", "ACDC", "Cardiac MRI (2D slices)", "RV / Myo / LV", "4", "grayscale", "~240×256 (var)", "official", "136 / 210 / 380"),
	("7", "IDRiD", "Retinal fundus", "DR lesions (4) + optic disc", "6", "RGB", "4288×2848", "official", "43 / 11 / 27"),
	("8", "PanNuke", "Histopathology (H&E)", "Nuclei (5 types)", "6", "RGB", "256×256", "official 3-fold CV", "~2.7k / 2.6k / 2.6k per fold"),
	("9", "ISIC2018", "Dermoscopy", "Skin lesion", "2", "RGB", "256×256", "holdout", "2582 / 369 / 737"),
	("10", "KiTS19", "Kidney CT (2D slices)", "Kidney (binary)", "2", "grayscale¹", "256×256", "single-split²", "2832 / 479 / 705"),
	]
	# Static rows of the "B. Methods" table: (Method, Family, Backbone / setup).
	_METHODS_INFO = [
	("UNet", "CNN encoder–decoder", "SMP, ResNet-50 encoder (ImageNet)"),
	("UNet++", "Nested UNet", "SMP, ResNet-50 (ImageNet)"),
	("DeepLabV3+", "Atrous CNN", "SMP, ResNet-50 (ImageNet)"),
	("Attention-UNet", "Attention-gated UNet", "Re-implemented, from scratch"),
	("TransUNet", "CNN–Transformer hybrid", "R50-ViT-B/16 (ImageNet), input 256"),
	("Swin-UNet", "Pure-Transformer UNet", "Swin-Tiny (ImageNet), input 224"),
	("nnU-Net (v2)", "Self-configuring CNN", "2D config, 250 epochs"),
	("U-Mamba", "State-space (Mamba) UNet", "U-Mamba_Bot, 100 epochs"),
	]
	# Static rows of the "C. Metrics" table: (Metric, Definition, Dir, Unit, meaning).
	# The last field is a Chinese-language explanation that is emitted verbatim into the
	# HTML report (it is report content, not a code comment).
	_METRICS_INFO = [
	("Dice (DSC)", "2TP / (2TP+FP+FN)", "↑", "%", "区域重叠度(主指标),对类别不平衡较鲁棒。"),
	("IoU (Jaccard)", "TP / (TP+FP+FN)", "↑", "%", "交并比,更严格的重叠度,常与 Dice 并列。"),
	("HD95", "95% Hausdorff distance (boundaries)", "↓", "px", "边界最大误差的95%分位,越小边界越贴合。"),
	("ASSD", "average symmetric surface distance", "↓", "px", "平均对称表面距离,整体边界吻合度。"),
	("Sensitivity", "TP / (TP+FN)", "↑", "%", "召回/敏感度,反映漏分割程度。"),
	("Specificity", "TN / (TN+FP)", "↑", "%", "特异度,背景误报控制。"),
	("Precision", "TP / (TP+FP)", "↑", "%", "精确率,反映过分割/误报程度。"),
	]
	# dataset key -> {class id (string) -> display name} for the per-class Dice tables.
	# _perclass_section() renders only the datasets and classes listed here, with the
	# class columns ordered by int(class id).
	_PERCLASS_NAMES = {
	"acdc_png": {"1": "RV", "2": "Myocardium", "3": "LV"},
	"refuge2": {"1": "Optic Disc", "2": "Optic Cup"},
	"idridd_segmentation": {"1": "MA", "2": "Haemorrhage", "3": "Hard Exudate", "4": "Soft Exudate", "5": "Optic Disc"},
	"pannuke_semantic": {"1": "Neoplastic", "2": "Inflammatory", "3": "Connective", "4": "Dead", "5": "Epithelial"},
	}


	def _collect_perclass(runs):
	    """Average per-class Dice over all images of all runs, per (dataset, arch).

	    Args:
	        runs: run dicts; each may hold a ``"per_image"`` list whose items may hold a
	            ``"per_class"`` mapping of ``class id -> {"dice": value}``.

	    Returns:
	        ``{(dataset, arch): {class id: mean Dice}}``. Missing and NaN Dice values are
	        skipped, so a class appears only when at least one image provides a value.
	    """
	    acc = defaultdict(lambda: defaultdict(list))
	    for d in runs:
	        key = (d.get("dataset"), d.get("arch"))
	        # `or` guards tolerate explicit nulls in the JSON at every level.
	        for pi in d.get("per_image") or []:
	            for c, m in (pi.get("per_class") or {}).items():
	                v = (m or {}).get("dice")
	                if v is not None and v == v:  # v == v filters NaN
	                    acc[key][c].append(v)
	    return {k: {c: float(np.mean(v)) for c, v in cd.items() if v} for k, cd in acc.items()}


	# Stylesheet inlined into the <style> element of summary.html by to_html().
	# Class names used by the renderers in this file: `rt` (result tables), `info`
	# (setup tables), `tw` (horizontal-scroll wrapper), `cap` (captions), `note`
	# (highlight box), `m` (method cell), `avg` (macro column), `grp` (group separator row).
	_CSS = """
	body{font-family:'Helvetica Neue',Arial,sans-serif;margin:30px auto;max-width:1180px;color:#1a1a1a;line-height:1.5}
	h1{font-size:21px;margin:0 0 4px}h2{font-size:15px;color:#0a5a33;margin:30px 0 4px;border-bottom:1px solid #e3e3e3;padding-bottom:3px}
	h3{font-size:13px;margin:16px 0 4px;color:#333}
	p,li{font-size:13px}code{background:#f2f2f2;padding:1px 4px;border-radius:3px}
	.cap{color:#666;font-size:11.5px;margin:3px 0 6px}
	.tw{overflow-x:auto}
	table.rt{border-collapse:collapse;margin:6px 0 8px;font-size:11.5px}
	table.rt th,table.rt td{padding:4px 9px;text-align:center;white-space:nowrap}
	table.rt thead th{border-top:2px solid #222;border-bottom:1.2px solid #222;font-weight:600}
	table.rt tbody tr:last-child td{border-bottom:2px solid #222}
	table.rt td.m,table.rt th.m{text-align:left;font-weight:600}
	table.rt td.avg,table.rt th.avg{border-left:1px solid #c8c8c8;background:#f7f9f8}
	table.rt tbody tr.grp td{border-top:1px solid #cfcfcf}
	table.rt b{color:#08402a}
	table.info{border-collapse:collapse;margin:6px 0 14px;font-size:12px}
	table.info th,table.info td{border:1px solid #ddd;padding:4px 8px;text-align:center}
	table.info th{background:#f3f3f3}table.info td.l{text-align:left}
	.note{background:#eef7f0;border-left:3px solid #0a6;padding:8px 12px;font-size:12.5px;margin:8px 0}
	hr{border:none;border-top:1px solid #e3e3e3;margin:24px 0}
	"""


	def _metric_table(cell, datasets, methods, key, pct, hib, bold_sets=None):
	    """Transposed table: methods (rows) × datasets (cols). bold_sets[ds] (set of archs)
	    if given (Dice significance), else bold the single best per column. Deliberately NO
	    cross-dataset summary column: the ten datasets span seven modalities with very
	    different difficulty, so a simple average is not meaningful (and would conflict with
	    the per-dataset ranking).

	    Args:
	        cell: ``(dataset, arch) -> row`` mapping.
	        datasets: dataset keys, in column order.
	        methods: arch keys, in row order.
	        key: metric key whose ``<key>_mean`` / ``<key>_sd`` are shown.
	        pct: passed to _fmt(); true to display the values as percentages.
	        hib: higher-is-better flag; selects max or min as the per-column best.
	        bold_sets: optional ``dataset -> set(arch)`` mapping. A dataset without an
	            entry falls back to bolding the single best value.

	    Returns:
	        The HTML of the table. Missing or NaN cells are shown as an en dash.
	    """
	    mean_key = f"{key}_mean"

	    def has_value(d, a):
	        # A cell is usable when it exists and its mean is not NaN.
	        return (d, a) in cell and cell[(d, a)][mean_key] == cell[(d, a)][mean_key]

	    pick = max if hib else min
	    best = {}
	    for d in datasets:
	        vals = {a: cell[(d, a)][mean_key] for a in methods if has_value(d, a)}
	        best[d] = pick(vals, key=vals.get) if vals else None

	    h = ["<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
	         + "".join(f"<th>{_DS_DISP.get(d, d)}</th>" for d in datasets)
	         + "</tr></thead><tbody>"]
	    for a in methods:
	        # The `grp` class draws the separator rule above the first transformer row.
	        grp = " class='grp'" if a == "transunet" else ""
	        tds = [f"<td class='m'>{_ARCH_DISP.get(a, a)}</td>"]
	        for d in datasets:
	            if not has_value(d, a):
	                tds.append("<td>–</td>")
	                continue
	            t = _fmt(cell[(d, a)], key, pct)
	            if bold_sets is not None and d in bold_sets:
	                b = a in bold_sets[d]
	            else:
	                # No significance set for this dataset: bold the single best.
	                b = a == best[d]
	            shown = "<b>" + t + "</b>" if b else t
	            tds.append(f"<td>{shown}</td>")
	        h.append(f"<tr{grp}>" + "".join(tds) + "</tr>")
	    h.append("</tbody></table></div>")
	    return "\n".join(h)


	def _perclass_section(runs):
	    """Render the per-class Dice tables (one per multi-class dataset) as HTML.

	    Only datasets listed in ``_PERCLASS_NAMES`` and archs listed in ``_ARCH_ORDER``
	    are rendered; a dataset without any per-class data is skipped.

	    Args:
	        runs: run dicts as returned by load_runs().

	    Returns:
	        HTML with an ``<h3>`` heading and a table per dataset (empty string when
	        there is nothing to show). Values are percentages with one decimal, the best
	        value of each class column is bold, and the ``macro`` column is the
	        unweighted mean of the classes the method has a value for.
	    """
	    pc = _collect_perclass(runs)
	    h = []
	    for ds, names in _PERCLASS_NAMES.items():
	        methods = [a for a in _ARCH_ORDER if (ds, a) in pc and pc[(ds, a)]]
	        if not methods:
	            continue
	        classes = sorted(names, key=int)
	        # Best score per class over the methods that actually have one. Missing
	        # values are left out instead of being replaced by NaN, because max() over
	        # NaN depends on element order.
	        colbest = {}
	        for c in classes:
	            scores = [pc[(ds, a)][c] for a in methods if c in pc[(ds, a)]]
	            colbest[c] = max(scores) if scores else None
	        h.append(f"<h3>{_DS_DISP.get(ds, ds)}</h3>")
	        h.append("<div class='tw'><table class='rt'><thead><tr><th class='m'>Method</th>"
	                 + "".join(f"<th>{names[c]}</th>" for c in classes)
	                 + "<th class='avg'>macro</th></tr></thead><tbody>")
	        for a in methods:
	            grp = " class='grp'" if a == "transunet" else ""
	            cells, present = [], []
	            for c in classes:
	                v = pc[(ds, a)].get(c)
	                if v is None:
	                    cells.append("<td>–</td>")
	                    continue
	                present.append(v)
	                t = f"{v*100:.1f}"
	                shown = "<b>" + t + "</b>" if v == colbest[c] else t
	                cells.append(f"<td>{shown}</td>")
	            # A method may only have classes that are not in `names`; show a dash
	            # rather than the text "nan" in that case.
	            macro = f"{sum(present) / len(present) * 100:.1f}" if present else "–"
	            h.append(f"<tr{grp}><td class='m'>{_ARCH_DISP.get(a, a)}</td>{''.join(cells)}"
	                     f"<td class='avg'>{macro}</td></tr>")
	        h.append("</tbody></table></div>")
	    return "\n".join(h)


	def _setup_html():
	    """Render the static appendix tables (A. Datasets, B. Methods, C. Metrics).

	    The content comes from ``_DATASETS_INFO``, ``_METHODS_INFO`` and
	    ``_METRICS_INFO``; each tuple is %-formatted positionally into one table row, so
	    its length must match the number of cells in the row template.

	    Returns:
	        The HTML fragment as a string.
	    """
	    h = ["<h2>A. Datasets</h2>",
	         "<table class='info'><tr><th>#</th><th>Dataset</th><th>Modality</th><th>Target</th><th>Cls</th>"
	         "<th>Ch</th><th>Native size</th><th>Protocol</th><th>Train/Val/Test</th></tr>"]
	    for r in _DATASETS_INFO:
	        h.append("<tr><td>%s</td><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td><td>%s</td>"
	                 "<td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>" % r)
	    h.append("</table>")
	    h.append("<div class='cap'>¹ BUSI/KiTS19 grayscale stored as 3-ch PNG (read as grayscale). "
	             "² no canonical split → one fixed fold (of 5) with 3 seeds; others use the official split. "
	             "Labels 0…C-1 (0=bg); multi-class metrics macro-averaged over foreground classes.</div>")
	    h.append("<h2>B. Methods</h2>")
	    h.append("<table class='info'><tr><th>Method</th><th>Family</th><th>Backbone / setup</th></tr>")
	    for m in _METHODS_INFO:
	        h.append("<tr><td class='l'>%s</td><td class='l'>%s</td><td class='l'>%s</td></tr>" % m)
	    h.append("</table>")
	    h.append("<h2>C. Metrics</h2>")
	    h.append("<table class='info'><tr><th>Metric</th><th>Definition</th><th>Dir</th><th>Unit</th>"
	             "<th>作用 / 含义(中文)</th></tr>")
	    for m in _METRICS_INFO:
	        h.append("<tr><td class='l'>%s</td><td class='l'>%s</td><td>%s</td><td>%s</td><td class='l'>%s</td></tr>" % m)
	    h.append("</table>")
	    return "\n".join(h)


	def to_html(rows, runs=None, title="SegGen benchmark", sig=None):
	    """Build the full paper-style HTML report.

	    Sections: 1 Dice, 2 HD95, 3 IoU, 4 per-class Dice (only when ``runs`` yields
	    per-class data), 5 Sensitivity / Precision, then the setup appendix.

	    Args:
	        rows: summary rows as produced by summarize().
	        runs: optional raw run dicts; needed for the per-class section and for
	            computing ``sig`` when it is not supplied.
	        title: report title; inserted into the page as-is (not HTML-escaped).
	        sig: optional ``dataset -> set(arch)`` significance sets for bolding the
	            Dice table; computed from ``runs`` when None.

	    Returns:
	        The complete HTML document as a string.
	    """
	    cell, datasets, methods = _grid(rows)
	    if sig is None:
	        sig = _sig_tied_sets(runs) if runs else None
	    h = [f"<!doctype html><html><head><meta charset='utf-8'><title>{title}</title><style>{_CSS}</style>"
	         "</head><body>"]
	    h.append(f"<h1>{title}: 8 methods × 10 datasets (unified 512, resolution-fair)</h1>")
	    h.append("<p>Eight 2D medical-image segmentation methods on ten public datasets (seven modalities). "
	             "Values are <b>mean±SD</b> over 3 seeds (over the 3 folds for PanNuke). "
	             "Each (dataset,method) cell aggregates tens–thousands of test images.</p>")
	    h.append("<div class='note'><b>Resolution-fair protocol.</b> Convolutional nets train at 512; the fixed-input "
	             "transformers (Swin-UNet 224, TransUNet 256) and nnU-Net/U-Mamba run at their native size; "
	             "<b>every prediction and ground truth is then resized to a common 512×512 before scoring</b>, so "
	             "boundary metrics (HD95/ASSD, in pixels) are directly comparable across methods.</div>")

	    h.append("<h2>1. Main results — Dice (%) ↑</h2>")
	    h.append("<div class='cap'><b>Bold</b> = best, or not significantly different from the best per dataset "
	             "(paired Wilcoxon on per-image Dice, p≥0.05). "
	             "Horizontal rule separates CNNs (top) from Transformer / foundation models (bottom). "
	             "No cross-dataset average is reported — the seven modalities differ too much in difficulty "
	             "for a single number to be meaningful.</div>")
	    h.append(_metric_table(cell, datasets, methods, "dice", True, True, bold_sets=sig))

	    h.append("<h2>2. Boundary accuracy — HD95 (px) ↓</h2>")
	    h.append("<div class='cap'>95% Hausdorff distance at the common 512 resolution (lower = better; "
	             "<b>bold</b> = best per dataset). Now comparable across methods.</div>")
	    h.append(_metric_table(cell, datasets, methods, "hd95", False, False))

	    h.append("<h2>3. Overlap — IoU (%) ↑</h2>")
	    h.append("<div class='cap'>Jaccard index, the stricter overlap measure (<b>bold</b> = best per dataset).</div>")
	    h.append(_metric_table(cell, datasets, methods, "iou", True, True))

	    if runs:
	        pcs = _perclass_section(runs)
	        # Section 4 is emitted only when at least one per-class table was rendered.
	        if pcs.strip():
	            h.append("<h2>4. Per-class Dice (%) — multi-class datasets</h2>")
	            h.append("<div class='cap'>Mean per-class Dice over all test images/runs (0=background excluded; "
	                     "<b>bold</b>=best per class). The <i>macro</i> column weights each foreground class "
	                     "equally (a within-dataset mean, not a cross-dataset one). It can differ by ~1 pt from "
	                     "the §1 Dice — which is image-weighted (each image is first averaged over the classes it "
	                     "contains) — whenever some images lack a class (e.g. ACDC's RV appears in only 335/380 "
	                     "images); both conventions are standard, neither is an error.</div>")
	            h.append(pcs)

	    h.append("<h2>5. Supplementary metrics — Sensitivity & Precision (%) ↑</h2>")
	    h.append("<div class='cap'>Two complementary error views (<b>bold</b> = best per dataset): low "
	             "<b>Sensitivity</b> (recall) signals under-segmentation (missed foreground); low "
	             "<b>Precision</b> signals over-segmentation (false positives). <i>Specificity</i> is omitted "
	             "— background dominates, so it stays >96% with almost no spread across methods (≤0.6 pt on "
	             "average) — and <i>ASSD</i> is omitted as redundant with HD95; both, and every metric, are "
	             "tabulated in full in <code>summary.csv</code>.</div>")
	    h.append("<h3>Sensitivity / recall ↑</h3>")
	    h.append(_metric_table(cell, datasets, methods, "sensitivity", True, True))
	    h.append("<h3>Precision ↑</h3>")
	    h.append(_metric_table(cell, datasets, methods, "precision", True, True))

	    h.append("<hr><h2>Appendix — Experimental setup</h2>")
	    h.append("<p class='cap'>Full per-(dataset,method) values for <b>every</b> metric "
	             "(IoU, HD95, ASSD, Sensitivity, Specificity, Precision, …) are in "
	             "<code>summary.csv</code>; the Dice table as LaTeX is in <code>summary.tex</code>.</p>")
	    h.append(_setup_html())
	    h.append("</body></html>")
	    return "\n".join(h)


	def main():
	    """Command-line entry point: aggregate one experiment and write its summaries.

	    Reads ``--exp_name`` (required) and ``--out_root`` (default ``results``), loads
	    the runs, and writes ``summary.csv``, ``summary.md``, ``summary.tex`` and
	    ``summary.html`` into ``<out_root>/<exp_name>``. The Markdown table is also
	    printed. Prints a notice and returns without writing when no run is found.
	    """
	    p = argparse.ArgumentParser()
	    p.add_argument("--exp_name", required=True)
	    p.add_argument("--out_root", default="results")
	    args = p.parse_args()

	    runs = load_runs(args.out_root, args.exp_name)
	    if not runs:
	        print(f"no metrics.json under {args.out_root}/{args.exp_name}")
	        return
	    rows = summarize(runs)
	    sig = _sig_tied_sets(runs)
	    base = os.path.join(args.out_root, args.exp_name)
	    markdown = to_markdown(rows, sig)
	    # Render everything first so a rendering error cannot leave a partial set of files.
	    outputs = {
	        "summary.csv": to_csv(rows),
	        "summary.md": markdown,
	        "summary.tex": to_latex(rows, sig),
	        "summary.html": to_html(rows, runs, title=f"SegGen benchmark ({args.exp_name})", sig=sig),
	    }
	    for name, text in outputs.items():
	        # Explicit UTF-8: the reports contain non-ASCII text (±, ×, CJK) and the HTML
	        # declares charset=utf-8; `with` guarantees the file is flushed and closed.
	        with open(os.path.join(base, name), "w", encoding="utf-8") as f:
	            f.write(text)
	    print(markdown)
	    print(f"{len(runs)} runs -> {len(rows)} (dataset,arch) cells; written {base}/summary.{{csv,md,tex,html}}")


	if __name__ == "__main__":
	    main()