recap-t2i-evaluation-code-2026 / eval_code /scripts /build_cbu_vqa_requests.py
Authors
Initial anonymous NeurIPS 2026 E&D code and results release
7f59fb7 verified
#!/usr/bin/env python3
"""Build VQA-style yes/no question requests from grounded-CBU request JSONL."""
from __future__ import annotations
import argparse
import hashlib
import json
from pathlib import Path
from typing import Any
# System message copied verbatim into the "system_prompt" field of every
# emitted request: it casts the model as a strict VQA judge and demands
# compact JSON answers grounded only in what is visible in the image.
SYSTEM_PROMPT = """You are a strict visual question answering judge.
Return only valid compact JSON. Answer each question using only visible image evidence."""
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the request builder.

    Returns:
        A namespace with ``input`` and ``output`` (both required strings) and
        the optional integers ``max_requests``, ``sample_records``,
        ``sample_seed`` (default 0) and ``max_questions_per_request``; the
        optional integers other than the seed default to ``None``.
    """
    cli = argparse.ArgumentParser(description="Build VQA-style requests from CBU verification requests")
    cli.add_argument("--input", required=True, help="grounded-CBU request JSONL")
    cli.add_argument("--output", required=True)
    # The remaining options are all plain integers; only their defaults differ.
    integer_options = (
        ("--max-requests", None),
        ("--sample-records", None),
        ("--sample-seed", 0),
        ("--max-questions-per-request", None),
    )
    for flag, fallback in integer_options:
        cli.add_argument(flag, type=int, default=fallback)
    return cli.parse_args()
def stable_float(*parts: object) -> float:
    """Map *parts* to a reproducible float in the unit interval.

    The ``str()`` forms of the parts are joined with ``:``, hashed with an
    8-byte BLAKE2b digest, and the digest is read as a big-endian integer and
    divided by ``2**64``. Equal inputs therefore always give the same value.
    """
    key = ":".join(map(str, parts)).encode("utf-8")
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(key)
    numerator = int.from_bytes(hasher.digest(), byteorder="big")
    return numerator / (1 << 64)
def question_for(unit: dict[str, Any]) -> str:
    """Phrase one claimed unit as a yes/no question about the image.

    Args:
        unit: A claimed-unit mapping. The keys ``category``, ``unit`` (the
            claim phrase) and ``target`` are read; all are optional.

    Returns:
        A rendered-text question when ``category`` is ``"text_rendering"``,
        otherwise a visual-claim question that is prefixed with the target
        when one is present.
    """

    def as_text(value: object) -> str:
        # A JSON null must count as "absent"; str(None) would otherwise leak
        # the literal word "None" into the question text.
        return "" if value is None else str(value)

    category = as_text(unit.get("category"))
    phrase = as_text(unit.get("unit")).strip()
    target = as_text(unit.get("target")).strip()
    if category == "text_rendering":
        return f"Is the rendered text claim '{phrase}' visibly supported by the image?"
    if target:
        return f"Is the visual claim '{target}: {phrase}' supported by the image?"
    return f"Is the visual claim '{phrase}' supported by the image?"
def user_prompt(questions: list[dict[str, str]]) -> str:
    """Render the user message for one request.

    The message is a fixed instruction header and rule list followed by a
    ``questions=`` line holding *questions* as compact JSON (no spaces after
    separators, non-ASCII characters kept as-is).
    """
    payload = json.dumps(questions, ensure_ascii=False, separators=(",", ":"))
    sections = [
        "Answer each visual question using only the image.\n",
        "Rules:\n",
        "- Do not use any caption text or outside knowledge.\n",
        "- Use yes when the image visibly supports the question.\n",
        "- Use no when the image contradicts the question or lacks visible support.\n",
        "- Use uncertain when the question is too fine-grained, occluded, unreadable, or visually ambiguous.\n",
        "- Keep evidence short and grounded in visible image content.\n",
        "- Return exactly one answer for each input question_id.\n\n",
        f"questions={payload}",
    ]
    return "".join(sections)
def iter_rows(args: argparse.Namespace) -> list[dict[str, Any]]:
    """Load request rows from ``args.input`` and optionally subsample them.

    Blank lines are skipped. Without ``args.sample_records``, reading stops as
    soon as ``args.max_requests`` rows have been collected. With
    ``args.sample_records`` the whole file is read (``args.max_requests`` is
    ignored), rows are ranked by a seeded stable hash of their ``request_id``,
    the first ``sample_records`` are kept, and the survivors are put back in
    ``source_row`` order.

    Args:
        args: Namespace providing ``input``, ``max_requests``,
            ``sample_records`` and ``sample_seed``.

    Returns:
        The selected rows as decoded JSON objects.

    Raises:
        json.JSONDecodeError: A non-blank line is not valid JSON; the message
            names the file and the 1-based line number.
        ValueError: A line decodes to something other than a JSON object.
    """
    source = Path(args.input)
    # The early cut-off only applies when no sampling is requested: sampling
    # has to see every row before it can rank them.
    limit = None if args.sample_records is not None else args.max_requests
    rows: list[dict[str, Any]] = []
    with source.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            if limit is not None and len(rows) >= limit:
                break
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as err:
                # Same exception type, but tell the user which line is broken.
                raise json.JSONDecodeError(
                    f"{source}:{line_number}: {err.msg}", err.doc, err.pos
                ) from err
            if not isinstance(row, dict):
                raise ValueError(
                    f"{source}:{line_number}: expected a JSON object, got {type(row).__name__}"
                )
            rows.append(row)
    if args.sample_records is not None:
        rows.sort(key=lambda row: stable_float(args.sample_seed, row.get("request_id", "")))
        rows = rows[: args.sample_records]

        def source_order(row: dict[str, Any]) -> Any:
            # An explicit null sorts like a missing value (0) instead of
            # raising TypeError when compared against integers.
            position = row.get("source_row")
            return 0 if position is None else position

        rows.sort(key=source_order)
    return rows
def _build_request(row: dict[str, Any], max_questions: int | None) -> dict[str, Any] | None:
    """Convert one grounded-CBU row into a VQA request, or ``None`` if it yields no questions."""
    units = row.get("claimed_units")
    if not isinstance(units, list):
        # Missing, null or otherwise malformed unit lists produce no questions
        # (the row is then counted as skipped) rather than crashing the run.
        units = []
    if max_questions is not None:
        # The cap is applied before invalid units are filtered out, so an
        # unusable unit still occupies one of the slots.
        units = units[:max_questions]
    questions = [
        {
            "question_id": str(unit["unit_id"]),
            # A null category is emitted as "" rather than the string "None".
            "category": "" if unit.get("category") is None else str(unit.get("category")),
            "question": question_for(unit),
        }
        for unit in units
        if isinstance(unit, dict) and isinstance(unit.get("unit_id"), str)
    ]
    if not questions:
        return None
    # Derived id: stable across runs for the same source request and caption.
    request_id = hashlib.blake2b(
        f"cbu_vqa_v1:{row.get('request_id')}:{row.get('caption_id')}".encode("utf-8"),
        digest_size=16,
    ).hexdigest()
    return {
        "request_id": request_id,
        "task": "cbu_vqa_v1",
        "surface": row.get("surface"),
        "caption_id": row.get("caption_id"),
        "source_row": row.get("source_row"),
        "token_budget": row.get("token_budget"),
        "questions": questions,
        "system_prompt": SYSTEM_PROMPT,
        "user_prompt": user_prompt(questions),
        "image_url": row.get("image_url"),
        "image_path": row.get("image_path"),
        "image_sha256": row.get("image_sha256"),
        "pair_id": row.get("pair_id"),
        "pair_key": row.get("pair_key"),
        "public_lookup_key": row.get("public_lookup_key"),
        "family": row.get("family"),
    }


def main() -> int:
    """Build the VQA request JSONL and its manifest.

    Reads the selected input rows, writes one request per row that has at
    least one usable claimed unit to ``--output`` (creating parent directories
    as needed), writes a manifest next to it with the final suffix replaced by
    ``.manifest.json``, and prints that manifest to stdout.

    Returns:
        ``0`` on success, intended for use as the process exit status.
    """
    args = parse_args()
    rows = iter_rows(args)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    skipped = 0
    with output.open("w", encoding="utf-8") as handle:
        for row in rows:
            request = _build_request(row, args.max_questions_per_request)
            if request is None:
                skipped += 1
                continue
            handle.write(json.dumps(request, ensure_ascii=False) + "\n")
            written += 1
    manifest = {
        "task": "cbu_vqa_v1",
        "input": args.input,
        "output": str(output),
        "requests": written,
        "skipped": skipped,
        # Recorded alongside the other selection knobs so a run can be reproduced.
        "max_requests": args.max_requests,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "max_questions_per_request": args.max_questions_per_request,
    }
    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False)
    output.with_suffix(".manifest.json").write_text(manifest_text, encoding="utf-8")
    print(manifest_text)
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())