| |
| """Build VQA-style yes/no question requests from grounded-CBU request JSONL.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import hashlib |
| import json |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
# System message attached to every generated request (see the "system_prompt"
# field written by main()). It is two sentences separated by a single newline.
SYSTEM_PROMPT = (
    "You are a strict visual question answering judge.\n"
    "Return only valid compact JSON. Answer each question using only visible image evidence."
)
|
|
|
|
def _non_negative_int(text: str) -> int:
    """Parse *text* as an integer and reject negative values.

    Intended as an ``argparse`` ``type=`` callable for the limit options.
    Those values are used as counts or slice bounds, where a negative number
    would silently drop items from the end instead of limiting the total.

    Raises:
        argparse.ArgumentTypeError: if *text* is not an integer or is negative.
    """
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {text!r}") from None
    if value < 0:
        raise argparse.ArgumentTypeError(f"must be >= 0, got {value}")
    return value


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the request builder.

    Returns:
        A namespace with ``input`` and ``output`` (both required strings),
        ``max_requests``, ``sample_records`` and ``max_questions_per_request``
        (non-negative ints, or ``None`` when not given) and ``sample_seed``
        (any int, default ``0``).

    Invalid values make argparse print a usage error and exit with status 2.
    """
    parser = argparse.ArgumentParser(description="Build VQA-style requests from CBU verification requests")
    parser.add_argument("--input", required=True, help="grounded-CBU request JSONL")
    parser.add_argument("--output", required=True)
    # The three limits are validated as >= 0; zero is allowed and simply
    # selects nothing.
    parser.add_argument("--max-requests", type=_non_negative_int, default=None)
    parser.add_argument("--sample-records", type=_non_negative_int, default=None)
    # The seed only feeds a hash, so any integer (including negatives) is fine.
    parser.add_argument("--sample-seed", type=int, default=0)
    parser.add_argument("--max-questions-per-request", type=_non_negative_int, default=None)
    return parser.parse_args()
|
|
|
|
def stable_float(*parts: object) -> float:
    """Map the given parts to a reproducible float derived from a hash.

    The parts are stringified and joined with ``":"``, hashed with 8-byte
    BLAKE2b, and the digest (read as a big-endian unsigned integer) is divided
    by ``2**64``. Equal inputs always give the same value, which makes the
    result usable as a deterministic sort key.
    """
    joined = ":".join(map(str, parts))
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(joined.encode("utf-8"))
    # A hex digest parsed in base 16 is the big-endian integer value of the bytes.
    numerator = int(hasher.hexdigest(), 16)
    return numerator / (1 << 64)
|
|
|
|
def question_for(unit: dict[str, Any]) -> str:
    """Phrase one claimed unit as a yes/no question about the image.

    Args:
        unit: Mapping that may carry ``category``, ``unit`` (the claim phrase)
            and ``target`` (what the claim is about). Missing keys and
            explicit ``None`` values are both treated as empty text.

    Returns:
        A rendered-text question when ``category`` is ``"text_rendering"``;
        otherwise a visual-claim question, prefixed with ``target`` when one
        is present.
    """

    def text_of(key: str) -> str:
        # A JSON null arrives as None; str(None) would leak the literal word
        # "None" into the question, so map it to empty text instead.
        value = unit.get(key)
        return "" if value is None else str(value).strip()

    category = str(unit.get("category", ""))
    phrase = text_of("unit")
    target = text_of("target")
    if category == "text_rendering":
        return f"Is the rendered text claim '{phrase}' visibly supported by the image?"
    if target:
        return f"Is the visual claim '{target}: {phrase}' supported by the image?"
    return f"Is the visual claim '{phrase}' supported by the image?"
|
|
|
|
def user_prompt(questions: list[dict[str, str]]) -> str:
    """Render the user message: fixed answering rules followed by the questions.

    The questions are appended as compact JSON (no spaces after separators,
    non-ASCII characters kept as-is) after a ``questions=`` marker.
    """
    rule_lines = (
        "Answer each visual question using only the image.\n",
        "Rules:\n",
        "- Do not use any caption text or outside knowledge.\n",
        "- Use yes when the image visibly supports the question.\n",
        "- Use no when the image contradicts the question or lacks visible support.\n",
        "- Use uncertain when the question is too fine-grained, occluded, unreadable, or visually ambiguous.\n",
        "- Keep evidence short and grounded in visible image content.\n",
        "- Return exactly one answer for each input question_id.\n\n",
    )
    question_json = json.dumps(questions, separators=(",", ":"), ensure_ascii=False)
    payload_line = f"questions={question_json}"
    return "".join(rule_lines) + payload_line
|
|
|
|
def iter_rows(args: argparse.Namespace) -> list[dict[str, Any]]:
    """Load request rows from the JSONL file at ``args.input``.

    Blank lines are skipped. Two mutually exclusive selection modes exist:

    * ``args.sample_records`` is ``None``: rows are returned in file order,
      and reading stops once ``args.max_requests`` rows were collected (when
      that limit is set).
    * ``args.sample_records`` is set: the whole file is read
      (``args.max_requests`` is ignored), rows are ranked by a hash of
      ``(args.sample_seed, request_id)``, the first ``sample_records`` are
      kept, and the survivors are re-ordered by ``source_row``.

    Raises:
        json.JSONDecodeError: a non-blank line is not valid JSON; the message
            is prefixed with ``<path>:<line number>``.
        ValueError: a line parses to something other than a JSON object.
    """
    source = Path(args.input)
    sampling = args.sample_records is not None
    # A sample must be drawn from the full file, so the read cap only applies
    # when no sampling was requested.
    read_cap = None if sampling else args.max_requests

    rows: list[dict[str, Any]] = []
    with source.open("r", encoding="utf-8") as handle:
        for line_number, line in enumerate(handle, start=1):
            if read_cap is not None and len(rows) >= read_cap:
                break
            if not line.strip():
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as err:
                # Re-raise as the same exception type, adding the file position;
                # the decoder's own "line 1" refers to the single JSONL line.
                raise json.JSONDecodeError(
                    f"{source}:{line_number}: {err.msg}", err.doc, err.pos
                ) from err
            if not isinstance(row, dict):
                # Everything downstream calls row.get(...); fail here with a
                # clear location instead of an AttributeError later.
                raise ValueError(
                    f"{source}:{line_number}: expected a JSON object, got {type(row).__name__}"
                )
            rows.append(row)

    if sampling:
        rows.sort(key=lambda row: stable_float(args.sample_seed, row.get("request_id", "")))
        rows = rows[: args.sample_records]
        # Restore source order. A missing or null source_row sorts as 0 so a
        # JSON null cannot trigger a None-vs-int comparison error.
        rows.sort(key=lambda row: row.get("source_row") or 0)
    return rows
|
|
|
|
def _build_questions(units: object, limit: "int | None") -> list[dict[str, str]]:
    """Convert claimed units into question dicts, keeping at most *limit*.

    Entries that are not dicts, or whose ``unit_id`` is not a string, are
    ignored, and a *units* value that is not a list (for example a JSON null)
    yields no questions. The limit counts valid questions only, so an invalid
    entry never uses up a slot.
    """
    if not isinstance(units, list):
        return []
    questions: list[dict[str, str]] = []
    for unit in units:
        if limit is not None and len(questions) >= limit:
            break
        if not isinstance(unit, dict) or not isinstance(unit.get("unit_id"), str):
            continue
        category = unit.get("category")
        questions.append(
            {
                "question_id": unit["unit_id"],
                # A null category becomes empty text, not the string "None".
                "category": "" if category is None else str(category),
                "question": question_for(unit),
            }
        )
    return questions


def _build_request(row: dict[str, Any], questions: list[dict[str, str]]) -> dict[str, Any]:
    """Assemble one output record from a source row and its questions."""
    # The id is a hash of the task name plus the source identifiers, so the
    # same source row always produces the same request_id.
    request_id = hashlib.blake2b(
        f"cbu_vqa_v1:{row.get('request_id')}:{row.get('caption_id')}".encode("utf-8"),
        digest_size=16,
    ).hexdigest()
    return {
        "request_id": request_id,
        "task": "cbu_vqa_v1",
        "surface": row.get("surface"),
        "caption_id": row.get("caption_id"),
        "source_row": row.get("source_row"),
        "token_budget": row.get("token_budget"),
        "questions": questions,
        "system_prompt": SYSTEM_PROMPT,
        "user_prompt": user_prompt(questions),
        "image_url": row.get("image_url"),
        "image_path": row.get("image_path"),
        "image_sha256": row.get("image_sha256"),
        "pair_id": row.get("pair_id"),
        "pair_key": row.get("pair_key"),
        "public_lookup_key": row.get("public_lookup_key"),
        "family": row.get("family"),
    }


def main() -> int:
    """Build the VQA request file and its manifest; return the exit status.

    Reads the selected input rows, writes one JSON line per row that yields at
    least one question to ``--output`` (creating parent directories), and
    counts rows without any usable question as skipped. A manifest with the
    run parameters and counts is written next to the output, with the output's
    suffix replaced by ``.manifest.json``, and is also printed to stdout.

    Returns:
        Always ``0``.
    """
    args = parse_args()
    rows = iter_rows(args)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    skipped = 0
    with output.open("w", encoding="utf-8") as handle:
        for row in rows:
            questions = _build_questions(row.get("claimed_units"), args.max_questions_per_request)
            if not questions:
                skipped += 1
                continue
            record = _build_request(row, questions)
            handle.write(json.dumps(record, ensure_ascii=False) + "\n")
            written += 1
    manifest = {
        "task": "cbu_vqa_v1",
        "input": args.input,
        "output": str(output),
        "requests": written,
        "skipped": skipped,
        # Recorded with the other selection parameters so the run can be
        # reproduced from the manifest alone.
        "max_requests": args.max_requests,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "max_questions_per_request": args.max_questions_per_request,
    }
    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False)
    output.with_suffix(".manifest.json").write_text(manifest_text, encoding="utf-8")
    print(manifest_text)
    return 0
|
|
|
|
# Script entry point: run main() only when executed directly (not on import)
# and use its integer return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|