File size: 4,980 Bytes

3236af9

#!/usr/bin/env python3
import argparse
import json
from pathlib import Path
from typing import Any


def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Returns:
        A namespace with ``spec`` (the list of raw ``--spec`` strings; at
        least one is required) and ``format`` (``"markdown"`` or ``"json"``,
        defaulting to ``"markdown"``).
    """
    spec_help = (
        "Benchmark spec in the form "
        "'label|iconoclast_checkpoint_or_summary|heretic_checkpoint'. "
        "Can be passed multiple times."
    )
    cli = argparse.ArgumentParser(
        description="Summarize matched Iconoclast vs Heretic benchmark runs."
    )
    # ``append`` collects every occurrence of --spec into one list.
    cli.add_argument("--spec", action="append", required=True, help=spec_help)
    cli.add_argument("--format", choices=("markdown", "json"), default="markdown")
    return cli.parse_args()


def load_iconoclast_best(path: Path) -> dict[str, Any]:
    """Return the best Iconoclast trial recorded at ``path``.

    A file named ``batch_summary.json`` is read as a single JSON document and
    the first entry of its ``pareto_trials`` list is returned. Any other path
    is handed to ``load_best_from_study``.

    Args:
        path: Location of a batch summary or a study log.

    Returns:
        The selected trial as a dict.

    Raises:
        ValueError: If the summary's ``pareto_trials`` is missing or empty.
    """
    if path.name != "batch_summary.json":
        return load_best_from_study(path)

    pareto = json.loads(path.read_text()).get("pareto_trials", [])
    if pareto:
        # The first listed pareto trial is taken as the best one.
        return pareto[0]
    raise ValueError(f"No pareto trials found in {path}")


def load_best_from_study(path: Path) -> dict[str, Any]:
    """Pick the best completed trial from a JSON-lines study log.

    Every non-blank line of ``path`` is parsed as one JSON object. Records
    without a ``trial_id`` are ignored. For the others, records with
    ``op_code`` 8 merge their ``user_attr`` mapping into the trial's
    attributes, and records with ``op_code`` 6 set the trial's ``state``
    (later records overwrite earlier ones).
    NOTE(review): the op codes and state value look like Optuna journal
    storage operations; confirm against whatever writes these files.

    A trial qualifies when its last recorded state is ``1`` and it has a
    ``refusals`` attribute. Among qualifying trials the one with the lowest
    ``(refusals, overrefusals, kl_divergence)`` wins; a metric that is absent
    or null ranks last. Ties keep the trial that appears first in the log.

    Args:
        path: Location of the study log file.

    Returns:
        The user-attribute dict of the winning trial.

    Raises:
        ValueError: If no qualifying trial exists.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    trials: dict[int, dict[str, Any]] = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        # Tolerate blank lines (e.g. trailing newlines or a hand-edited log)
        # instead of crashing in json.loads.
        if not line.strip():
            continue
        obj = json.loads(line)
        trial_id = obj.get("trial_id")
        if trial_id is None:
            continue
        trial = trials.setdefault(trial_id, {"attrs": {}, "state": None})
        op_code = obj.get("op_code")
        if op_code == 8 and "user_attr" in obj:
            trial["attrs"].update(obj["user_attr"])
        elif op_code == 6:
            trial["state"] = obj.get("state")

    completed = [
        trial["attrs"]
        for trial in trials.values()
        if trial["state"] == 1 and "refusals" in trial["attrs"]
    ]
    if not completed:
        raise ValueError(f"No completed trials found in {path}")

    worst = 10**9  # stand-in for a metric that is absent or null

    def rank(attrs: dict[str, Any]) -> tuple[Any, ...]:
        # Null values are mapped to the same stand-in as absent ones so the
        # tuple comparison never hits None.
        return tuple(
            worst if attrs.get(name) is None else attrs[name]
            for name in ("refusals", "overrefusals", "kl_divergence")
        )

    # min() returns the first minimal item, matching a stable sort + [0].
    return min(completed, key=rank)


def load_heretic_best(path: Path) -> dict[str, Any]:
    """Return the best Heretic trial recorded at ``path``.

    When the file is named ``batch_summary.json`` it is parsed as JSON and
    the leading element of its ``pareto_trials`` list is returned. Every
    other path is delegated to ``load_best_from_study``.

    Args:
        path: Location of a batch summary or a study log.

    Returns:
        The selected trial as a dict.

    Raises:
        ValueError: If the summary's ``pareto_trials`` is missing or empty.
    """
    reading_summary = path.name == "batch_summary.json"
    if not reading_summary:
        return load_best_from_study(path)

    payload = json.loads(path.read_text())
    candidates = payload.get("pareto_trials", [])
    if not candidates:
        raise ValueError(f"No pareto trials found in {path}")
    # The summary's first pareto entry is used as the representative best.
    return candidates[0]


def format_float(value: Any, digits: int = 4) -> str:
    """Render ``value`` as a fixed-point string, or ``"n/a"`` when it is None.

    Args:
        value: Anything ``float()`` accepts, or None.
        digits: Number of digits after the decimal point.

    Returns:
        The formatted number, or the literal ``"n/a"`` for a None input.
    """
    return "n/a" if value is None else format(float(value), f".{digits}f")


def _split_spec(spec: str) -> tuple[str, Path, Path]:
    """Split one ``--spec`` value into its label and the two input paths."""
    parts = spec.split("|", 2)
    if len(parts) != 3:
        raise ValueError(
            f"Invalid --spec {spec!r}: expected "
            "'label|iconoclast_checkpoint_or_summary|heretic_checkpoint'"
        )
    label, icon_path_str, her_path_str = parts
    return label, Path(icon_path_str), Path(her_path_str)


def _verdict(icon: dict[str, Any], her: dict[str, Any]) -> str:
    """Name the side with the lower (refusals, overrefusals, KL), or "Tie"."""

    def rank(side: dict[str, Any]) -> tuple[Any, ...]:
        # A missing metric ranks worst instead of raising TypeError when the
        # tuple comparison reaches it.
        return tuple(
            float("inf") if side[name] is None else side[name]
            for name in ("refusals", "overrefusals", "kl_divergence")
        )

    icon_rank, her_rank = rank(icon), rank(her)
    if icon_rank < her_rank:
        return "Iconoclast"
    if her_rank < icon_rank:
        return "Heretic"
    # Identical metrics are not a win for either side.
    return "Tie"


def main() -> None:
    """Load the best trial for each ``--spec`` and print a comparison.

    For every spec the best Iconoclast and Heretic trials are loaded and
    their metrics collected into a row. With ``--format json`` the rows are
    printed as indented JSON; otherwise a markdown table is printed with one
    line per spec and a verdict column ("Iconoclast", "Heretic" or "Tie")
    based on the lexicographic order of (refusals, overrefusals, KL).

    Raises:
        ValueError: If a spec does not have three ``|``-separated parts, or
            if a loader finds no usable trial.
    """
    args = parse_args()
    rows = []

    for spec in args.spec:
        label, icon_path, her_path = _split_spec(spec)
        icon_best = load_iconoclast_best(icon_path)
        her_best = load_heretic_best(her_path)
        rows.append(
            {
                "label": label,
                "iconoclast": {
                    "refusals": icon_best.get("refusals"),
                    "overrefusals": icon_best.get("overrefusals", 0),
                    "kl_divergence": icon_best.get("kl_divergence"),
                    "harmful_marker_hits": icon_best.get("harmful_marker_hits"),
                    "harmful_compliance_score": icon_best.get(
                        "harmful_compliance_score"
                    ),
                    "trial_index": icon_best.get("index"),
                },
                "heretic": {
                    "refusals": her_best.get("refusals"),
                    "overrefusals": her_best.get("overrefusals", 0),
                    "kl_divergence": her_best.get("kl_divergence"),
                    "trial_index": her_best.get("index"),
                },
            }
        )

    if args.format == "json":
        print(json.dumps(rows, indent=2))
        return

    print(
        "| Model | Iconoclast Refusals | Iconoclast Overrefusals | Iconoclast KL | Heretic Refusals | Heretic Overrefusals | Heretic KL | Verdict |"
    )
    print("| --- | --- | --- | --- | --- | --- | --- | --- |")
    for row in rows:
        icon = row["iconoclast"]
        her = row["heretic"]
        cells = [
            row["label"],
            str(icon["refusals"]),
            str(icon["overrefusals"]),
            format_float(icon["kl_divergence"]),
            str(her["refusals"]),
            str(her["overrefusals"]),
            format_float(her["kl_divergence"]),
            _verdict(icon, her),
        ]
        print("| " + " | ".join(cells) + " |")


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()