Authors

Initial anonymous NeurIPS 2026 E&D code and results release

7f59fb7 verified 19 days ago

5.69 kB

	#!/usr/bin/env python3
	"""Build VQA-style yes/no question requests from grounded-CBU request JSONL."""

	from __future__ import annotations

	import argparse
	import hashlib
	import json
	from pathlib import Path
	from typing import Any


	# System-role instructions copied verbatim into every emitted request under
	# the "system_prompt" key. The text is part of the request payload, so any
	# edit to it changes the generated output.
	SYSTEM_PROMPT = """You are a strict visual question answering judge.
	Return only valid compact JSON. Answer each question using only visible image evidence."""


	def parse_args() -> argparse.Namespace:
	    """Parse the command line for the request builder.

	    ``--input`` and ``--output`` are required. The remaining flags are
	    optional integers: ``--sample-seed`` defaults to 0, the others to None.

	    Returns:
	        The populated ``argparse.Namespace``.
	    """
	    cli = argparse.ArgumentParser(description="Build VQA-style requests from CBU verification requests")
	    cli.add_argument("--input", required=True, help="grounded-CBU request JSONL")
	    cli.add_argument("--output", required=True)
	    # The optional knobs share one shape (int-typed, with a default), so they
	    # are registered from a table; order is kept so --help output is unchanged.
	    optional_ints = (
	        ("--max-requests", None),
	        ("--sample-records", None),
	        ("--sample-seed", 0),
	        ("--max-questions-per-request", None),
	    )
	    for flag, fallback in optional_ints:
	        cli.add_argument(flag, type=int, default=fallback)
	    return cli.parse_args()


	def stable_float(*parts: object) -> float:
	    """Map the given parts to a reproducible float.

	    The parts are stringified, joined with ":", and hashed with an 8-byte
	    BLAKE2b digest; the digest, read as a big-endian integer, is divided
	    by 2**64. The same parts therefore always give the same value.
	    """
	    joined = ":".join(map(str, parts))
	    hasher = hashlib.blake2b(digest_size=8)
	    hasher.update(joined.encode("utf-8"))
	    # The hex digest is the big-endian rendering of the 8 digest bytes.
	    return int(hasher.hexdigest(), 16) / (1 << 64)


	def question_for(unit: dict[str, Any]) -> str:
	    """Turn one claimed unit into a yes/no question about the image.

	    Args:
	        unit: Mapping that may carry "category", "unit" (the claim phrase)
	            and "target" entries.

	    Returns:
	        A text-rendering question when the category is "text_rendering";
	        otherwise a visual-claim question, prefixed with the target when a
	        non-empty target is present.
	    """

	    def _text(key: str) -> str:
	        # dict.get's default only applies to a missing key. An explicit null
	        # in the JSON arrives as None, and str(None) would leak the literal
	        # word "None" into the question, so None is treated as empty.
	        value = unit.get(key)
	        return "" if value is None else str(value).strip()

	    category = str(unit.get("category", ""))
	    phrase = _text("unit")
	    target = _text("target")
	    if category == "text_rendering":
	        return f"Is the rendered text claim '{phrase}' visibly supported by the image?"
	    if target:
	        return f"Is the visual claim '{target}: {phrase}' supported by the image?"
	    return f"Is the visual claim '{phrase}' supported by the image?"


	def user_prompt(questions: list[dict[str, str]]) -> str:
	    """Render the user message for a batch of questions.

	    The message is a fixed block of answering rules, a blank line, and
	    then ``questions=`` followed by the questions serialized as compact
	    JSON (no spaces after separators, non-ASCII characters kept as-is).
	    """
	    payload = json.dumps(questions, ensure_ascii=False, separators=(",", ":"))
	    message_lines = [
	        "Answer each visual question using only the image.",
	        "Rules:",
	        "- Do not use any caption text or outside knowledge.",
	        "- Use yes when the image visibly supports the question.",
	        "- Use no when the image contradicts the question or lacks visible support.",
	        "- Use uncertain when the question is too fine-grained, occluded, unreadable, or visually ambiguous.",
	        "- Keep evidence short and grounded in visible image content.",
	        "- Return exactly one answer for each input question_id.",
	        "",  # blank line separating the rules from the payload
	        f"questions={payload}",
	    ]
	    return "\n".join(message_lines)


	def iter_rows(args: argparse.Namespace) -> list[dict[str, Any]]:
	    """Load the input JSONL rows, optionally truncated or sampled.

	    Blank lines are ignored. Without ``sample_records`` the rows are
	    returned in file order, stopping once ``max_requests`` rows have been
	    read (if set). With ``sample_records`` the whole file is read, rows are
	    ordered by a hash of ``(sample_seed, request_id)``, the first
	    ``sample_records`` of them are kept, and the result is re-ordered by
	    ``source_row``; ``max_requests`` is not applied in that mode.
	    """
	    sampling = args.sample_records is not None
	    # The early cut-off is only valid when not sampling: sampling needs to
	    # see every row before it can rank them.
	    limit = None if sampling else args.max_requests
	    loaded: list[dict[str, Any]] = []
	    with Path(args.input).open("r", encoding="utf-8") as source:
	        for raw in source:
	            if limit is not None and len(loaded) >= limit:
	                break
	            if not raw.strip():
	                continue
	            loaded.append(json.loads(raw))
	    if not sampling:
	        return loaded
	    # Seeded hash order gives a reproducible pseudo-random selection.
	    loaded.sort(key=lambda row: stable_float(args.sample_seed, row.get("request_id", "")))
	    sampled = loaded[: args.sample_records]
	    sampled.sort(key=lambda row: row.get("source_row", 0))
	    return sampled


	def main() -> int:
	    """Build the VQA request JSONL and its manifest.

	    Reads the selected input rows, converts each row's claimed units into
	    yes/no questions, and writes one request per row to ``--output``. Rows
	    that yield no usable question are skipped and counted. A manifest with
	    the run parameters and counts is written next to the output (the
	    output's suffix replaced by ``.manifest.json``) and echoed to stdout.

	    Returns:
	        0, used as the process exit status.
	    """
	    args = parse_args()
	    rows = iter_rows(args)
	    output = Path(args.output)
	    output.parent.mkdir(parents=True, exist_ok=True)
	    written = 0
	    skipped = 0
	    with output.open("w", encoding="utf-8") as handle:
	        for row in rows:
	            # dict.get's default only covers a missing key: an explicit JSON
	            # null (or any non-list value) would raise TypeError further down
	            # and abort the run with a half-written output file. Such rows
	            # are treated as having no units, so they are skipped and counted.
	            units = row.get("claimed_units")
	            if not isinstance(units, list):
	                units = []
	            if args.max_questions_per_request is not None:
	                units = units[: args.max_questions_per_request]
	            questions = [
	                {
	                    "question_id": str(unit["unit_id"]),
	                    # A null category is emitted as "" rather than the text "None".
	                    "category": "" if unit.get("category") is None else str(unit["category"]),
	                    "question": question_for(unit),
	                }
	                for unit in units
	                # Only well-formed units (dicts with a string unit_id) become questions.
	                if isinstance(unit, dict) and isinstance(unit.get("unit_id"), str)
	            ]
	            if not questions:
	                skipped += 1
	                continue
	            # Derive a new request id from the task tag plus the source
	            # request and caption ids, so it is reproducible across runs.
	            request_id = hashlib.blake2b(
	                f"cbu_vqa_v1:{row.get('request_id')}:{row.get('caption_id')}".encode("utf-8"),
	                digest_size=16,
	            ).hexdigest()
	            out = {
	                "request_id": request_id,
	                "task": "cbu_vqa_v1",
	                "surface": row.get("surface"),
	                "caption_id": row.get("caption_id"),
	                "source_row": row.get("source_row"),
	                "token_budget": row.get("token_budget"),
	                "questions": questions,
	                "system_prompt": SYSTEM_PROMPT,
	                "user_prompt": user_prompt(questions),
	                "image_url": row.get("image_url"),
	                "image_path": row.get("image_path"),
	                "image_sha256": row.get("image_sha256"),
	                "pair_id": row.get("pair_id"),
	                "pair_key": row.get("pair_key"),
	                "public_lookup_key": row.get("public_lookup_key"),
	                "family": row.get("family"),
	            }
	            handle.write(json.dumps(out, ensure_ascii=False) + "\n")
	            written += 1
	    manifest = {
	        "task": "cbu_vqa_v1",
	        "input": args.input,
	        "output": str(output),
	        "requests": written,
	        "skipped": skipped,
	        "sample_records": args.sample_records,
	        "sample_seed": args.sample_seed,
	        "max_questions_per_request": args.max_questions_per_request,
	    }
	    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False)
	    output.with_suffix(".manifest.json").write_text(manifest_text, encoding="utf-8")
	    print(manifest_text)
	    return 0


	# Script entry point: run main() and use its return value as the exit status.
	if __name__ == "__main__":
	raise SystemExit(main())