Iconoclast / scripts /summarize_multimodel_benchmark.py
OpenAI Codex
Publish Iconoclast research release
3236af9
#!/usr/bin/env python3
import argparse
import json
from pathlib import Path
from typing import Any
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for the benchmark summary script.

    Options:
        --spec: required and repeatable; every occurrence is appended to a
            list, so ``args.spec`` is always a list of raw spec strings.
        --format: either "markdown" (the default) or "json".

    Returns:
        The populated ``argparse.Namespace``.
    """
    spec_help = (
        "Benchmark spec in the form "
        "'label|iconoclast_checkpoint_or_summary|heretic_checkpoint'. "
        "Can be passed multiple times."
    )

    cli = argparse.ArgumentParser(
        description="Summarize matched Iconoclast vs Heretic benchmark runs."
    )
    cli.add_argument("--spec", action="append", required=True, help=spec_help)
    cli.add_argument("--format", choices=("markdown", "json"), default="markdown")
    return cli.parse_args()
def load_iconoclast_best(path: Path) -> dict[str, Any]:
    """Load the trial used as the Iconoclast result for one benchmark.

    A file named ``batch_summary.json`` is read as a single JSON document and
    the first entry of its ``pareto_trials`` list is returned. Any other path
    is handed to ``load_best_from_study``.

    Args:
        path: Batch summary file or study log to read.

    Returns:
        The selected trial's dictionary.

    Raises:
        ValueError: If the batch summary has no (or an empty) ``pareto_trials``.
    """
    if path.name != "batch_summary.json":
        return load_best_from_study(path)

    # NOTE(review): the first Pareto entry is taken as-is; presumably the
    # summary writer already orders the list best-first -- confirm.
    pareto = json.loads(path.read_text()).get("pareto_trials", [])
    if pareto:
        return pareto[0]
    raise ValueError(f"No pareto trials found in {path}")
def load_best_from_study(path: Path) -> dict[str, Any]:
    """Pick the best completed trial from a JSON-lines study log.

    Every non-blank line of ``path`` is parsed as one JSON object. Objects
    without a ``trial_id`` are ignored. For the rest:

    * ``op_code == 8`` records that carry ``user_attr`` are merged into the
      trial's attribute dictionary;
    * ``op_code == 6`` records overwrite the trial's ``state``.

    NOTE(review): these values look like Optuna journal-storage operation
    codes (8 = set user attr, 6 = set state/values) with state 1 meaning
    COMPLETE -- confirm against the Optuna version that wrote the log.

    Only trials whose final state is ``1`` and whose attributes contain
    ``"refusals"`` are candidates. The winner is the lexicographic minimum of
    ``(refusals, overrefusals, kl_divergence)``; a missing ``overrefusals`` or
    ``kl_divergence`` ranks as ``10**9``, i.e. worst.

    Args:
        path: Study log file, one JSON object per line.

    Returns:
        The accumulated user-attribute dictionary of the best trial.

    Raises:
        ValueError: If no trial qualifies (``json.JSONDecodeError``, itself a
            ``ValueError``, is raised for a non-blank line that is not JSON).
    """
    trials: dict[int, dict[str, Any]] = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        # Blank lines (e.g. trailing newlines or padding between records)
        # are not records; json.loads("") would raise on them.
        if not line.strip():
            continue
        obj = json.loads(line)
        trial_id = obj.get("trial_id")
        if trial_id is None:
            continue
        trial = trials.setdefault(trial_id, {"attrs": {}, "state": None})
        op_code = obj.get("op_code")
        if op_code == 8 and "user_attr" in obj:
            trial["attrs"].update(obj["user_attr"])
        elif op_code == 6:
            trial["state"] = obj.get("state")

    completed = [
        trial["attrs"]
        for trial in trials.values()
        if trial["state"] == 1 and "refusals" in trial["attrs"]
    ]
    if not completed:
        raise ValueError(f"No completed trials found in {path}")

    # min() returns the first minimal element, matching what a stable sort
    # followed by [0] would give, without reordering the whole list.
    return min(
        completed,
        key=lambda attrs: (
            attrs.get("refusals", 10**9),
            attrs.get("overrefusals", 10**9),
            attrs.get("kl_divergence", 10**9),
        ),
    )
def load_heretic_best(path: Path) -> dict[str, Any]:
    """Load the trial used as the Heretic result for one benchmark.

    When ``path`` is a ``batch_summary.json`` file, the first element of its
    ``pareto_trials`` list is returned; otherwise the path is treated as a
    study log and delegated to ``load_best_from_study``.

    Args:
        path: Batch summary file or study log to read.

    Returns:
        The selected trial's dictionary.

    Raises:
        ValueError: If the batch summary has no (or an empty) ``pareto_trials``.
    """
    reads_batch_summary = path.name == "batch_summary.json"
    if not reads_batch_summary:
        return load_best_from_study(path)

    document = json.loads(path.read_text())
    pareto_front = document.get("pareto_trials", [])
    if not pareto_front:
        raise ValueError(f"No pareto trials found in {path}")
    return pareto_front[0]
def format_float(value: Any, digits: int = 4) -> str:
    """Format ``value`` as a fixed-point number for table output.

    Args:
        value: Anything accepted by ``float()``, or ``None``.
        digits: Number of digits after the decimal point.

    Returns:
        ``"n/a"`` when ``value`` is ``None``; otherwise ``float(value)``
        rendered with exactly ``digits`` decimals.
    """
    return "n/a" if value is None else format(float(value), f".{digits}f")
def _ranking_key(metrics: dict[str, Any]) -> tuple[Any, ...]:
    """Build the lexicographic key used to compare two result rows."""
    # A metric absent from the loaded trial arrives here as None. Ranking it
    # as +inf keeps the tuple comparison from raising TypeError and never
    # lets a missing value beat a measured one.
    return tuple(
        float("inf") if metrics[name] is None else metrics[name]
        for name in ("refusals", "overrefusals", "kl_divergence")
    )


def _verdict(icon: dict[str, Any], her: dict[str, Any]) -> str:
    """Name the side with the smaller (refusals, overrefusals, KL) key."""
    icon_key = _ranking_key(icon)
    her_key = _ranking_key(her)
    if icon_key < her_key:
        return "Iconoclast"
    if her_key < icon_key:
        return "Heretic"
    # Identical keys: neither side won, so do not credit either.
    return "Tie"


def main() -> None:
    """Load every ``--spec`` pair and print the comparison.

    Each spec is ``label|iconoclast_path|heretic_path``. The best trial for
    each side is loaded, reduced to the reported metrics, and printed either
    as a JSON list (``--format json``) or as a Markdown table with a verdict
    column. The verdict goes to the side with the lexicographically smaller
    ``(refusals, overrefusals, kl_divergence)``; missing metrics rank worst
    and identical keys are reported as ``Tie``.

    Raises:
        ValueError: If a spec does not contain three ``|``-separated fields,
            or if a loader finds no usable trial.
    """
    args = parse_args()

    rows = []
    for spec in args.spec:
        parts = spec.split("|", 2)
        if len(parts) != 3:
            raise ValueError(
                f"Invalid --spec {spec!r}: expected "
                "'label|iconoclast_checkpoint_or_summary|heretic_checkpoint'"
            )
        label, icon_path_str, her_path_str = parts
        icon_best = load_iconoclast_best(Path(icon_path_str))
        her_best = load_heretic_best(Path(her_path_str))
        rows.append(
            {
                "label": label,
                "iconoclast": {
                    "refusals": icon_best.get("refusals"),
                    "overrefusals": icon_best.get("overrefusals", 0),
                    "kl_divergence": icon_best.get("kl_divergence"),
                    "harmful_marker_hits": icon_best.get("harmful_marker_hits"),
                    "harmful_compliance_score": icon_best.get(
                        "harmful_compliance_score"
                    ),
                    "trial_index": icon_best.get("index"),
                },
                "heretic": {
                    "refusals": her_best.get("refusals"),
                    "overrefusals": her_best.get("overrefusals", 0),
                    "kl_divergence": her_best.get("kl_divergence"),
                    "trial_index": her_best.get("index"),
                },
            }
        )

    if args.format == "json":
        print(json.dumps(rows, indent=2))
        return

    print(
        "| Model | Iconoclast Refusals | Iconoclast Overrefusals | Iconoclast KL | Heretic Refusals | Heretic Overrefusals | Heretic KL | Verdict |"
    )
    print("| --- | --- | --- | --- | --- | --- | --- | --- |")
    for row in rows:
        icon = row["iconoclast"]
        her = row["heretic"]
        cells = [
            row["label"],
            str(icon["refusals"]),
            str(icon["overrefusals"]),
            format_float(icon["kl_divergence"]),
            str(her["refusals"]),
            str(her["overrefusals"]),
            format_float(her["kl_divergence"]),
            _verdict(icon, her),
        ]
        print("| " + " | ".join(cells) + " |")
# Run the summary only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()