recap-t2i-evaluation-code-2026 / eval_code /scripts /summarize_grounded_cbu_verify.py
Authors
Initial anonymous NeurIPS 2026 E&D code and results release
7f59fb7 verified
#!/usr/bin/env python3
"""Summarize exact-unit grounded-CBU verification responses."""
from __future__ import annotations
import argparse
import json
from collections import Counter, defaultdict
from pathlib import Path
from typing import Any
# Status labels for which add_rates() emits ``<status>_rate_all`` and
# ``<status>_rate_visual`` fields. Order is preserved in the summary output.
STATUSES = [
    # Statuses that main() counts toward a bucket's ``visual_units`` total.
    "grounded",
    "unsupported",
    "uncertain",
    # Statuses counted in ``valid_units`` only, not in ``visual_units``.
    "invalid_text_unit",
    "not_a_visual_claim",
    "image_unavailable",
]
def parse_args() -> argparse.Namespace:
    """Define the command-line interface and parse the process arguments.

    Returns:
        Namespace with ``input`` and ``output`` (both required strings),
        ``include`` (list of extra paths, empty when the flag is not given)
        and ``latest_by_request`` (bool, False unless the flag is passed).
    """
    cli = argparse.ArgumentParser(
        description="Summarize grounded-CBU verification responses",
    )

    # The two mandatory options share the same configuration.
    for required_flag in ("--input", "--output"):
        cli.add_argument(required_flag, required=True)

    include_help = "Additional response JSONL to merge before latest-by-request summarization."
    cli.add_argument("--include", action="append", default=[], help=include_help)

    latest_help = "Use only the last response per request_id. Useful for append/resume retry logs."
    cli.add_argument("--latest-by-request", action="store_true", help=latest_help)

    namespace = cli.parse_args()
    return namespace
def unit_lookup(row: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the claimed units of a request payload by their ``unit_id``.

    Args:
        row: Mapping whose ``claimed_units`` entry, if present, is iterated.

    Returns:
        Mapping from ``unit_id`` to the unit dict it came from. Entries that
        are not dicts or that have no ``unit_id`` key are ignored; when two
        entries share a ``unit_id`` the later one wins. A missing, empty or
        ``None`` ``claimed_units`` yields an empty mapping.
    """
    # ``.get(key, [])`` only covers an absent key; ``or ()`` additionally
    # covers an explicit ``None`` (JSON null), which would otherwise raise
    # TypeError when iterated.
    claimed_units = row.get("claimed_units") or ()
    return {
        unit["unit_id"]: unit
        for unit in claimed_units
        if isinstance(unit, dict) and "unit_id" in unit
    }
def add_rates(stats: dict[str, Any]) -> dict[str, Any]:
    """Augment a counter dict with derived rate fields, in place.

    For every label in ``STATUSES`` two fields are written:
    ``<status>_rate_all`` (count divided by ``valid_units``) and
    ``<status>_rate_visual`` (count divided by ``visual_units``). Three
    summary fields follow: ``grounded_precision``, ``unsupported_rate`` and
    ``uncertain_rate``, each relative to ``visual_units``.

    Args:
        stats: Counts keyed by name; missing counts are treated as 0.

    Returns:
        The same ``stats`` object, mutated. Any rate whose denominator is
        zero or missing is reported as ``0.0``.
    """
    all_units = stats.get("valid_units", 0)
    visual_units = stats.get("visual_units", 0)

    def share(count: Any, denominator: Any) -> Any:
        # A falsy denominator short-circuits to 0.0 instead of dividing.
        if not denominator:
            return 0.0
        return count / denominator

    for status in STATUSES:
        count = stats.get(status, 0)
        stats[f"{status}_rate_all"] = share(count, all_units)
        stats[f"{status}_rate_visual"] = share(count, visual_units)

    headline_fields = (
        ("grounded_precision", "grounded"),
        ("unsupported_rate", "unsupported"),
        ("uncertain_rate", "uncertain"),
    )
    for field_name, status in headline_fields:
        stats[field_name] = share(stats.get(status, 0), visual_units)

    return stats
def _load_rows(input_paths: list[Path], latest_by_request: bool) -> list[dict[str, Any]]:
    """Read response rows from JSONL files, optionally keeping the last row per request_id.

    Args:
        input_paths: Files to read, in order. Blank lines are skipped.
        latest_by_request: When true, only the last row seen for each string
            ``request_id`` is kept (rows without a string ``request_id`` are
            dropped, as before). When false, every row is returned.

    Returns:
        The selected rows. In latest mode they are ordered by the first
        appearance of each ``request_id``.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON. The message
            is prefixed with ``path:line`` to locate the offending record.
    """
    rows: list[dict[str, Any]] = []
    latest: dict[str, dict[str, Any]] = {}
    for input_path in input_paths:
        with input_path.open("r", encoding="utf-8") as handle:
            for line_number, line in enumerate(handle, start=1):
                if not line.strip():
                    continue
                try:
                    row = json.loads(line)
                except json.JSONDecodeError as err:
                    # Same exception type, but say which file/line is broken
                    # (e.g. a truncated final line in an append/resume log).
                    raise json.JSONDecodeError(
                        f"{input_path}:{line_number}: {err.msg}", err.doc, err.pos
                    ) from err
                if not latest_by_request:
                    rows.append(row)
                    continue
                request_id = row.get("request_id")
                if isinstance(request_id, str):
                    latest[request_id] = row
    return list(latest.values()) if latest_by_request else rows


def _tally_rows(
    rows: list[dict[str, Any]],
) -> tuple[int, dict[str, Any], dict[str, Any], dict[str, Any]]:
    """Count response and unit statuses per surface and per category.

    Args:
        rows: Response rows as returned by ``_load_rows``.

    Returns:
        ``(ok, surface_stats, category_stats, status_examples)`` where ``ok``
        is the number of rows with a truthy ``ok`` field, the two stats
        mappings hold a ``Counter`` per surface / category, and
        ``status_examples`` holds up to 20 example records per non-grounded
        status.
    """
    visual_statuses = {"grounded", "unsupported", "uncertain"}
    example_statuses = {"unsupported", "uncertain", "invalid_text_unit", "not_a_visual_claim"}
    max_examples = 20

    surface_stats: dict[str, Counter[str]] = defaultdict(Counter)
    category_stats: dict[str, Counter[str]] = defaultdict(Counter)
    status_examples: dict[str, list[dict[str, Any]]] = defaultdict(list)
    ok = 0

    for row in rows:
        # ``or {}`` also covers a key that is present but null, which the
        # ``.get(key, {})`` default does not.
        request = row.get("request") or {}
        surface = request.get("surface", "__unknown__")
        surface_counts = surface_stats[surface]
        surface_counts["responses"] += 1
        if not row.get("ok"):
            surface_counts["bad"] += 1
            continue
        ok += 1
        surface_counts["ok"] += 1

        lookup = unit_lookup(request)
        parsed = row.get("parsed") or {}
        for result in parsed.get("unit_results") or []:
            unit = lookup.get(result.get("unit_id"), {})
            category = unit.get("category", "__unknown__")
            status = result.get("status", "__bad_status__")
            category_counts = category_stats[category]

            # Every unit result is counted under ``valid_units``, whatever its status.
            surface_counts["valid_units"] += 1
            surface_counts[status] += 1
            category_counts["valid_units"] += 1
            category_counts[status] += 1
            if status in visual_statuses:
                surface_counts["visual_units"] += 1
                category_counts["visual_units"] += 1

            # Membership is tested first so the defaultdict only grows keys
            # for statuses that actually collect examples.
            if status in example_statuses and len(status_examples[status]) < max_examples:
                status_examples[status].append(
                    {
                        "surface": surface,
                        "caption_id": request.get("caption_id"),
                        "category": category,
                        "unit": unit.get("unit"),
                        "target": unit.get("target"),
                        "status": status,
                        "evidence": result.get("evidence"),
                    }
                )
    return ok, surface_stats, category_stats, status_examples


def main() -> int:
    """Summarize verification responses into a JSON report.

    Reads the ``--input`` JSONL file plus any ``--include`` files, tallies
    unit statuses per surface and per category, adds rate fields via
    ``add_rates``, writes the report to ``--output`` (creating parent
    directories as needed) and prints a short JSON recap to stdout.

    Returns:
        ``0`` on success, intended as the process exit status.
    """
    args = parse_args()
    input_paths = [Path(args.input), *(Path(item) for item in args.include)]

    rows = _load_rows(input_paths, args.latest_by_request)
    ok, surface_stats, category_stats, status_examples = _tally_rows(rows)
    total = len(rows)

    surfaces = {surface: add_rates(dict(counter)) for surface, counter in surface_stats.items()}
    categories = {category: add_rates(dict(counter)) for category, counter in category_stats.items()}

    out: dict[str, Any] = {"input": args.input}
    if args.include:
        # Record merged files too; the key is omitted when none were given so
        # the report is unchanged for single-input runs.
        out["include"] = list(args.include)
    out.update(
        {
            "responses": total,
            "ok": ok,
            "bad": total - ok,
            "surfaces": surfaces,
            "categories": categories,
            "examples": dict(status_examples),
        }
    )

    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": args.output, "responses": total, "ok": ok, "bad": total - ok}, indent=2))
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)