| |
| """Summarize exact-unit grounded-CBU verification responses.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| from collections import Counter, defaultdict |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
# Verdict labels for which add_rates() emits "<status>_rate_all" and
# "<status>_rate_visual" fields, in emission order.
STATUSES: list[str] = [
    # Counted by main() as visual units (the denominator of the *_visual rates).
    "grounded", "unsupported", "uncertain",
    # Counted as units but excluded from the visual-unit denominator.
    "invalid_text_unit", "not_a_visual_claim", "image_unavailable",
]
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command line of the summarizer.

    Flags:
        --input: required path of the primary response JSONL file.
        --output: required path of the summary JSON file to write.
        --include: repeatable; each use appends one more JSONL path.
        --latest-by-request: boolean switch, off unless given.

    Returns:
        The populated ``argparse.Namespace`` (``input``, ``output``,
        ``include``, ``latest_by_request``).
    """
    cli = argparse.ArgumentParser(description="Summarize grounded-CBU verification responses")

    # Both file endpoints are mandatory and take no further options.
    for required_flag in ("--input", "--output"):
        cli.add_argument(required_flag, required=True)

    cli.add_argument(
        "--include",
        action="append",
        default=[],
        help="Additional response JSONL to merge before latest-by-request summarization.",
    )
    cli.add_argument(
        "--latest-by-request",
        action="store_true",
        help="Use only the last response per request_id. Useful for append/resume retry logs.",
    )
    return cli.parse_args()
|
|
|
|
def unit_lookup(row: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the ``claimed_units`` of a request by their ``unit_id``.

    Args:
        row: A request mapping; its ``claimed_units`` entry is expected to be
            a list of unit dicts.

    Returns:
        A mapping from ``unit_id`` to the unit dict. Entries that are not
        dicts, lack a ``unit_id``, or carry an unhashable ``unit_id`` are
        skipped. When the same ``unit_id`` appears more than once, the last
        occurrence wins. A missing, ``None`` or non-list ``claimed_units``
        yields an empty mapping instead of raising.
    """
    units = row.get("claimed_units")
    # ``.get(key, [])`` only covers a missing key; JSON ``null`` (or any other
    # non-sequence value) would otherwise blow up when iterated.
    if not isinstance(units, (list, tuple)):
        return {}

    lookup: dict[str, dict[str, Any]] = {}
    for unit in units:
        if not isinstance(unit, dict) or "unit_id" not in unit:
            continue
        unit_id = unit["unit_id"]
        # Lists and dicts are the only JSON values that cannot be dict keys.
        if isinstance(unit_id, (list, dict)):
            continue
        lookup[unit_id] = unit
    return lookup
|
|
|
|
def add_rates(stats: dict[str, Any]) -> dict[str, Any]:
    """Add derived rate fields to a counter-style stats mapping, in place.

    For each label in ``STATUSES`` two fields are written:
    ``<label>_rate_all`` (count / ``valid_units``) and
    ``<label>_rate_visual`` (count / ``visual_units``). Three summary fields
    follow: ``grounded_precision``, ``unsupported_rate`` and
    ``uncertain_rate``, all relative to ``visual_units``. Any rate whose
    denominator is missing or zero is reported as ``0.0``.

    Args:
        stats: Mapping of counts; it is mutated.

    Returns:
        The same ``stats`` object, for convenient chaining.
    """
    total_units = stats.get("valid_units", 0)
    total_visual = stats.get("visual_units", 0)

    def ratio(count: Any, denominator: Any) -> Any:
        # A falsy denominator (absent or zero) short-circuits to 0.0.
        return count / denominator if denominator else 0.0

    for label in STATUSES:
        count = stats.get(label, 0)
        stats[f"{label}_rate_all"] = ratio(count, total_units)
        stats[f"{label}_rate_visual"] = ratio(count, total_visual)

    # Headline metrics, all measured against the visual-unit denominator.
    headline_fields = (
        ("grounded_precision", "grounded"),
        ("unsupported_rate", "unsupported"),
        ("uncertain_rate", "uncertain"),
    )
    for field_name, source_key in headline_fields:
        stats[field_name] = ratio(stats.get(source_key, 0), total_visual)
    return stats
|
|
|
|
def _as_dict(value: Any) -> dict[str, Any]:
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _label(value: Any, default: str) -> str:
    """Return ``value`` when it is a string, otherwise ``default``."""
    return value if isinstance(value, str) else default


def _load_rows(paths: list[Path], latest_by_request: bool) -> list[dict[str, Any]]:
    """Read response rows from JSONL files, optionally deduplicating by request_id.

    Blank lines are skipped. With ``latest_by_request`` set, only the last row
    seen for each string ``request_id`` is kept (in first-seen order) and rows
    without a string ``request_id`` are dropped.

    Raises:
        json.JSONDecodeError: A line is not valid JSON; the message names the
            file and line number.
        ValueError: A line decodes to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    latest: dict[str, dict[str, Any]] = {}
    for path in paths:
        with path.open("r", encoding="utf-8") as handle:
            for line_number, line in enumerate(handle, start=1):
                if not line.strip():
                    continue
                try:
                    row = json.loads(line)
                except json.JSONDecodeError as err:
                    # Same exception type, but say where the bad line lives.
                    raise json.JSONDecodeError(
                        f"{path}:{line_number}: {err.msg}", err.doc, err.pos
                    ) from err
                if not isinstance(row, dict):
                    raise ValueError(
                        f"{path}:{line_number}: expected a JSON object, got {type(row).__name__}"
                    )
                if not latest_by_request:
                    rows.append(row)
                    continue
                request_id = row.get("request_id")
                if isinstance(request_id, str):
                    latest[request_id] = row
    return list(latest.values()) if latest_by_request else rows


def main() -> int:
    """Summarize verification responses and write the summary JSON.

    Reads ``--input`` plus every ``--include`` file, tallies responses per
    surface and unit verdicts per surface and per category, collects up to 20
    example units for each non-grounded verdict, adds rate fields via
    ``add_rates`` and writes the result to ``--output`` (creating parent
    directories). A short count summary is printed to stdout.

    Malformed pieces of an otherwise readable row are tolerated: a
    ``request`` or ``parsed`` value that is not an object is treated as
    empty, a ``unit_results`` value that is not a list as having no results,
    a result entry that is not an object as a ``__bad_status__`` unit, and
    non-string surface/category/status values fall back to their
    ``__unknown__`` / ``__bad_status__`` defaults.

    Returns:
        ``0`` on success.
    """
    args = parse_args()
    input_paths = [Path(args.input), *(Path(item) for item in args.include)]
    rows = _load_rows(input_paths, args.latest_by_request)

    surface_stats: dict[str, Counter[str]] = defaultdict(Counter)
    category_stats: dict[str, Counter[str]] = defaultdict(Counter)
    status_examples: dict[str, list[dict[str, Any]]] = defaultdict(list)

    visual_statuses = {"grounded", "unsupported", "uncertain"}
    example_statuses = {"unsupported", "uncertain", "invalid_text_unit", "not_a_visual_claim"}
    max_examples = 20

    total = 0
    ok = 0
    for row in rows:
        total += 1
        # "request" may be absent, null, or not an object in failed responses.
        request = _as_dict(row.get("request"))
        surface = _label(request.get("surface"), "__unknown__")
        per_surface = surface_stats[surface]
        per_surface["responses"] += 1
        if not row.get("ok"):
            per_surface["bad"] += 1
            continue
        ok += 1
        per_surface["ok"] += 1

        lookup = unit_lookup(request)
        unit_results = _as_dict(row.get("parsed")).get("unit_results")
        if not isinstance(unit_results, list):
            unit_results = []
        for raw_result in unit_results:
            # A non-object entry is still counted, as a "__bad_status__" unit,
            # so malformed model output stays visible in the summary.
            result = _as_dict(raw_result)
            unit_id = result.get("unit_id")
            # Lists and dicts are unhashable and can never match a lookup key.
            unit = {} if isinstance(unit_id, (list, dict)) else lookup.get(unit_id, {})
            category = _label(unit.get("category"), "__unknown__")
            status = _label(result.get("status"), "__bad_status__")
            per_category = category_stats[category]

            per_surface["valid_units"] += 1
            per_surface[status] += 1
            per_category["valid_units"] += 1
            per_category[status] += 1
            if status in visual_statuses:
                per_surface["visual_units"] += 1
                per_category["visual_units"] += 1
            if status in example_statuses and len(status_examples[status]) < max_examples:
                status_examples[status].append(
                    {
                        "surface": surface,
                        "caption_id": request.get("caption_id"),
                        "category": category,
                        "unit": unit.get("unit"),
                        "target": unit.get("target"),
                        "status": status,
                        "evidence": result.get("evidence"),
                    }
                )

    surfaces = {surface: add_rates(dict(counter)) for surface, counter in surface_stats.items()}
    categories = {category: add_rates(dict(counter)) for category, counter in category_stats.items()}
    out = {
        "input": args.input,
        "responses": total,
        "ok": ok,
        "bad": total - ok,
        "surfaces": surfaces,
        "categories": categories,
        "examples": status_examples,
    }
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": args.output, "responses": total, "ok": ok, "bad": total - ok}, indent=2))
    return 0
|
|
|
|
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|