File size: 5,692 Bytes

7f59fb7

#!/usr/bin/env python3
"""Build VQA-style yes/no question requests from grounded-CBU request JSONL."""

from __future__ import annotations

import argparse
import hashlib
import json
from pathlib import Path
from typing import Any


# System message copied verbatim into the "system_prompt" field of every
# request written by main().
SYSTEM_PROMPT = """You are a strict visual question answering judge.
Return only valid compact JSON. Answer each question using only visible image evidence."""


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the VQA request builder.

    Options:
        --input: grounded-CBU request JSONL to read (required).
        --output: path of the file to write (required).
        --max-requests, --sample-records, --max-questions-per-request:
            optional integers, ``None`` when omitted.
        --sample-seed: integer, ``0`` when omitted.

    Returns:
        The namespace argparse builds from the process arguments.
    """
    cli = argparse.ArgumentParser(description="Build VQA-style requests from CBU verification requests")
    cli.add_argument("--input", required=True, help="grounded-CBU request JSONL")
    cli.add_argument("--output", required=True)

    # The remaining options are all plain integers; only their defaults differ.
    integer_options = (
        ("--max-requests", None),
        ("--sample-records", None),
        ("--sample-seed", 0),
        ("--max-questions-per-request", None),
    )
    for flag, fallback in integer_options:
        cli.add_argument(flag, type=int, default=fallback)

    return cli.parse_args()


def stable_float(*parts: object) -> float:
    """Derive a reproducible float from *parts*.

    Each part is converted with ``str`` and the pieces are joined with ``":"``.
    The UTF-8 bytes of that string are hashed with an 8-byte BLAKE2b digest,
    the digest is read as a big-endian unsigned integer, and that integer is
    divided by ``2**64``.
    """
    joined = ":".join(map(str, parts))
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(joined.encode("utf-8"))
    numerator = int.from_bytes(hasher.digest(), byteorder="big")
    return numerator / (1 << 64)


def question_for(unit: dict[str, Any]) -> str:
    """Phrase one claimed unit as a yes/no question about the image.

    Args:
        unit: mapping read for the keys ``"category"``, ``"unit"`` (the claim
            phrase) and ``"target"``; all three are optional.

    Returns:
        * the rendered-text wording when ``category`` is ``"text_rendering"``;
        * ``'<target>: <phrase>'`` wording when a non-empty target is present;
        * the phrase-only wording otherwise.
    """

    def cleaned(key: str) -> str:
        # A JSON null arrives as None; str(None) would leak the literal text
        # "None" into the question, so treat it like a missing key.
        value = unit.get(key)
        return "" if value is None else str(value).strip()

    category = str(unit.get("category", ""))
    phrase = cleaned("unit")
    target = cleaned("target")
    if category == "text_rendering":
        return f"Is the rendered text claim '{phrase}' visibly supported by the image?"
    if target:
        return f"Is the visual claim '{target}: {phrase}' supported by the image?"
    return f"Is the visual claim '{phrase}' supported by the image?"


def user_prompt(questions: list[dict[str, str]]) -> str:
    """Build the user message: fixed answering rules followed by the questions.

    The questions are appended after a blank line as ``questions=<json>``,
    where the JSON is compact (no spaces after separators) and keeps
    non-ASCII characters unescaped.
    """
    encoded_questions = json.dumps(questions, ensure_ascii=False, separators=(",", ":"))
    prompt_lines = [
        "Answer each visual question using only the image.",
        "Rules:",
        "- Do not use any caption text or outside knowledge.",
        "- Use yes when the image visibly supports the question.",
        "- Use no when the image contradicts the question or lacks visible support.",
        "- Use uncertain when the question is too fine-grained, occluded, unreadable, or visually ambiguous.",
        "- Keep evidence short and grounded in visible image content.",
        "- Return exactly one answer for each input question_id.",
        "",  # blank line separating the rules from the payload
        "questions=" + encoded_questions,
    ]
    return "\n".join(prompt_lines)


def iter_rows(args: argparse.Namespace) -> list[dict[str, Any]]:
    """Load request rows from ``args.input`` and optionally subsample them.

    Blank lines are ignored; every other line is parsed as JSON.

    * Without ``args.sample_records``: reading stops as soon as
      ``args.max_requests`` rows have been collected (when that limit is set).
    * With ``args.sample_records``: the whole file is read and
      ``args.max_requests`` is not applied. Rows are ranked by
      ``stable_float(args.sample_seed, request_id)``, the first
      ``sample_records`` of that ranking are kept, and the kept rows are
      ordered by ``source_row`` (``0`` when the key is missing).

    Returns:
        The parsed rows as a list.
    """
    sampling = args.sample_records is not None
    # The row cap only applies when no sampling was requested.
    cap = None if sampling else args.max_requests

    loaded: list[dict[str, Any]] = []
    with Path(args.input).open("r", encoding="utf-8") as stream:
        for raw in stream:
            if cap is not None and len(loaded) >= cap:
                break
            if not raw.strip():
                continue
            loaded.append(json.loads(raw))

    if not sampling:
        return loaded

    def sample_rank(row: dict[str, Any]) -> float:
        return stable_float(args.sample_seed, row.get("request_id", ""))

    def source_position(row: dict[str, Any]) -> Any:
        # NOTE(review): assumes source_row values are mutually comparable
        # (e.g. all ints) - confirm against the producer of the input file.
        return row.get("source_row", 0)

    chosen = sorted(loaded, key=sample_rank)[: args.sample_records]
    chosen.sort(key=source_position)
    return chosen


def main() -> int:
    """Convert grounded-CBU request rows into VQA request JSONL plus a manifest.

    For every input row, the entries of ``claimed_units`` that are dicts with
    a string ``unit_id`` become questions (after optional truncation to
    ``--max-questions-per-request`` units). Rows that yield no question are
    counted as skipped and produce no output line. Each written request
    carries a new ``request_id``: the 16-byte BLAKE2b hex digest of
    ``cbu_vqa_v1:<row request_id>:<row caption_id>``.

    Side effects:
        * creates the output's parent directory if needed and writes one JSON
          object per line to ``--output``;
        * writes a manifest next to it, at the output path with its final
          suffix replaced by ``.manifest.json``;
        * prints the manifest to stdout.

    Returns:
        ``0`` once everything has been written.
    """
    args = parse_args()
    rows = iter_rows(args)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    question_limit = args.max_questions_per_request
    written = 0
    skipped = 0
    with output.open("w", encoding="utf-8") as handle:
        for row in rows:
            units = row.get("claimed_units")
            # A JSON null (or any non-list value) has no usable units. Treat it
            # as empty so the row is counted as skipped instead of raising
            # TypeError when sliced or iterated.
            if not isinstance(units, list):
                units = []
            if question_limit is not None:
                # Truncation happens before invalid units are filtered out.
                units = units[:question_limit]
            questions = [
                {
                    "question_id": str(unit["unit_id"]),
                    "category": str(unit.get("category", "")),
                    "question": question_for(unit),
                }
                for unit in units
                if isinstance(unit, dict) and isinstance(unit.get("unit_id"), str)
            ]
            if not questions:
                skipped += 1
                continue
            # Namespaced digest so the new id is stable across runs for the
            # same source request and caption.
            request_id = hashlib.blake2b(
                f"cbu_vqa_v1:{row.get('request_id')}:{row.get('caption_id')}".encode("utf-8"),
                digest_size=16,
            ).hexdigest()
            out = {
                "request_id": request_id,
                "task": "cbu_vqa_v1",
                "surface": row.get("surface"),
                "caption_id": row.get("caption_id"),
                "source_row": row.get("source_row"),
                "token_budget": row.get("token_budget"),
                "questions": questions,
                "system_prompt": SYSTEM_PROMPT,
                "user_prompt": user_prompt(questions),
                "image_url": row.get("image_url"),
                "image_path": row.get("image_path"),
                "image_sha256": row.get("image_sha256"),
                "pair_id": row.get("pair_id"),
                "pair_key": row.get("pair_key"),
                "public_lookup_key": row.get("public_lookup_key"),
                "family": row.get("family"),
            }
            handle.write(json.dumps(out, ensure_ascii=False) + "\n")
            written += 1
    manifest = {
        "task": "cbu_vqa_v1",
        "input": args.input,
        "output": str(output),
        "requests": written,
        "skipped": skipped,
        # Recorded as given on the command line so the manifest lists every
        # option that can change the output.
        "max_requests": args.max_requests,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "max_questions_per_request": args.max_questions_per_request,
    }
    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False)
    output.with_suffix(".manifest.json").write_text(manifest_text, encoding="utf-8")
    print(manifest_text)
    return 0


# Script entry point: run main() and hand its return value to SystemExit so it
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())