# File size: 4,621 Bytes

# fc329a3

"""Generate paper-style summary tables from saved benchmark results."""
import argparse
import json
from pathlib import Path

# Method keys, in the order their columns appear in the generated tables.
# Each key is looked up in a result file's summary block; methods absent
# from a summary are rendered as "--".
METHOD_ORDER = [
    "global",
    "partition",
    "twostage",
    "fullcp",
    "jackknife_plus",
    "oneshot",
    "trainres",
    "weighted",
    "oracle",
]

# Column header text for each method key in METHOD_ORDER.
METHOD_LABELS = {
    "global": "Global",
    "partition": "Mondrian",
    "twostage": "TwoStage",
    "fullcp": "FullCP",
    "jackknife_plus": "Jackknife+",
    "oneshot": "OneShot",
    "trainres": "TrainRes",
    "weighted": "Weighted",
    "oracle": "Oracle",
}

# (result filename, row label) pairs for the synthetic summary table.
# Filenames are resolved relative to the results directory; rows whose file
# does not exist are left out of the table.
SYNTHETIC_SPECS = [
    ("d1_homogeneous.json", "D1 Homogeneous"),
    ("d2_pure_scale.json", "D2 Pure scale"),
    ("d3_discrete_groups_aligned.json", "D3 Discrete aligned"),
    ("d4_model_bias.json", "D4 Bias"),
    ("d5_heavy_tail.json", "D5 Heavy tail"),
    ("d6_high_k.json", "D6† High-K"),
]

# (result filename, row label) pairs for the real-data summary table.
REAL_SPECS = [
    ("exp2_2_softmax_cifar10_strata_entropy_fixed.json", "CIFAR-10"),
    ("exp2_3_hyperspectral_samson_nmf_all_methods.json", "Samson"),
    ("exp2_5_topics_K10_all_methods.json", "Topics"),
    ("exp2_6_affective_text.json", "AffectiveText"),
    ("exp2_4_age_ldl_K10_image_knn_main.json", "UTKFace"),
    ("real_bulk_deconv.json", "PBMC"),
]

# Row label -> supplementary result files for that row. Each existing file's
# summary is merged over the primary summary in list order, so a later file
# overrides earlier entries for the same method key. Keys must match the row
# labels used in SYNTHETIC_SPECS / REAL_SPECS exactly.
EXTRA_FILES = {
    "D1 Homogeneous": ["d1_homogeneous_exact.json"],
    "D3 Discrete aligned": ["d3_discrete_groups_aux.json"],
    "D5 Heavy tail": ["d5_heavy_tail_aux.json"],
    "D6† High-K": ["d6_high_k_aux.json", "d6_high_k_exact_appendix.json"],
    "PBMC": ["real_bulk_deconv_fullcp.json", "real_bulk_deconv_aux.json", "real_bulk_deconv_trainres.json"],
    "UTKFace": ["exp2_4_age_ldl_K10_image_knn_fullcp_2k.json"],
}


def load_json(path: Path) -> dict:
    """Read and parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII content parses the same on every system.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value (a dict for the result files this script reads).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


def extract_summary(data: dict) -> dict:
    """Return the per-method summary block of a loaded result document.

    The block is taken from the ``"summary"`` key when present, otherwise
    from ``"aggregated"``.

    Raises:
        KeyError: If the document has neither key.
    """
    # Order matters: "summary" takes precedence over "aggregated".
    for block_key in ("summary", "aggregated"):
        if block_key in data:
            return data[block_key]
    raise KeyError("Missing summary/aggregated block")


def metric_cell(summary: dict, method: str) -> str:
    """Format one table cell for ``method`` from a summary block.

    The cell is ``coverage / disparity / worst-stratum`` with a trailing
    ``/ radius`` when the method reports ``mean_radius``. Numbers are shown
    with three decimals.

    Args:
        summary: Mapping from method key to that method's metric dicts, each
            metric holding a ``"mean"`` entry.
        method: Method key to render.

    Returns:
        ``"--"`` when the method is absent from ``summary``; otherwise the
        formatted cell. A worst-stratum value that cannot be determined is
        shown as ``--`` inside the cell instead of raising.

    Raises:
        KeyError: If the method lacks ``marginal_coverage`` or
            ``max_disparity``.
    """
    if method not in summary:
        return "--"
    stats = summary[method]

    cov = stats["marginal_coverage"]["mean"]
    disp = stats["max_disparity"]["mean"]

    worst = None
    if "worst_stratum_coverage" in stats:
        worst = stats["worst_stratum_coverage"]["mean"]
    if worst is None and "stratified_coverage" in stats:
        # Fall back to the lowest per-stratum mean; ``default`` guards
        # against an empty stratum mapping.
        worst = min(
            (entry["mean"] for entry in stats["stratified_coverage"].values()),
            default=None,
        )

    radius = stats["mean_radius"]["mean"] if "mean_radius" in stats else None

    # Formatting ``None`` with ``:.3f`` raises TypeError, so a missing
    # worst-stratum value uses the same "--" placeholder as a missing method.
    fields = [
        f"{cov:.3f}",
        f"{disp:.3f}",
        "--" if worst is None else f"{worst:.3f}",
    ]
    if radius is not None:
        fields.append(f"{radius:.3f}")
    return " / ".join(fields)


def write_markdown_table(out_path: Path, title: str, specs: list[tuple[str, str]], results_dir: Path, extras: dict[str, list[str]] | None = None) -> None:
    """Build a Markdown summary table and write it to ``out_path``.

    One row is produced per entry in ``specs`` whose result file exists;
    one column is produced per method in ``METHOD_ORDER``.

    Args:
        out_path: Destination Markdown file (overwritten if present).
        title: Text of the top-level heading.
        specs: ``(result filename, row label)`` pairs, in row order.
        results_dir: Directory holding the result JSON files.
        extras: Optional mapping from row label to supplementary result
            filenames. Each existing file's summary is merged over the row's
            summary in list order, so later files win on duplicate methods.

    Raises:
        KeyError: If a loaded file has no summary/aggregated block or a
            method entry lacks a required metric.
    """
    extras_by_label = extras or {}
    rows = []
    for filename, label in specs:
        path = results_dir / filename
        if not path.exists():
            # Best effort: tasks without saved results are left out.
            continue
        summary = extract_summary(load_json(path))
        for extra_name in extras_by_label.get(label, ()):
            extra_path = results_dir / extra_name
            if extra_path.exists():
                summary = {**summary, **extract_summary(load_json(extra_path))}
        rows.append([label, *(metric_cell(summary, method) for method in METHOD_ORDER)])

    headers = ["Task"] + [METHOD_LABELS[m] for m in METHOD_ORDER]
    lines = [f"# {title}", "", "Cells report `coverage / disparity / worst-stratum / radius`.", ""]
    lines.append("| " + " | ".join(headers) + " |")
    lines.append("|" + "|".join(["---"] * len(headers)) + "|")
    lines.extend("| " + " | ".join(row) + " |" for row in rows)
    # Row labels contain non-ASCII characters (e.g. "†"), so the output
    # encoding is fixed to UTF-8 instead of depending on the locale.
    out_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    print(f"Saved {out_path}")


def main():
    """Parse command-line options and write both summary tables.

    ``--results-dir`` is where result JSON files are read from and
    ``--output-dir`` is where the Markdown tables are written; both default
    to ``results/tables``. The output directory is created if needed.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--results-dir", default="results/tables")
    cli.add_argument("--output-dir", default="results/tables")
    opts = cli.parse_args()

    source_dir = Path(opts.results_dir)
    target_dir = Path(opts.output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # (output filename, heading, row specs) for each table, in write order.
    tables = (
        ("paper_table_synthetic_summary.md", "Synthetic Summary", SYNTHETIC_SPECS),
        ("paper_table_real_summary.md", "Real-Data Summary", REAL_SPECS),
    )
    for out_name, heading, table_specs in tables:
        write_markdown_table(
            target_dir / out_name,
            heading,
            table_specs,
            source_dir,
            extras=EXTRA_FILES,
        )


# Generate the tables only when run as a script, not when imported.
if __name__ == "__main__":
    main()