recap-t2i-evaluation-code-2026/eval_code/scripts/build_caption_cbu_requests.py
Authors
Initial anonymous NeurIPS 2026 E&D code and results release
7f59fb7 verified
#!/usr/bin/env python3
"""Build text-only claimed-CBU extraction requests from caption JSONL files."""
from __future__ import annotations
import argparse
import hashlib
import json
from pathlib import Path
from typing import Any
# Closed set of categories a claimed unit may be assigned to. The same list is
# used as the ``enum`` for the ``category`` field of the response schema and is
# spelled out in the user prompt.
UNIT_CATEGORIES = [
    "object", "attribute", "relation", "style",
    "camera", "lighting", "count", "text_rendering",
]

# System message sent with every request (two lines, joined by a newline).
SYSTEM_PROMPT = (
    "You extract atomic controllable visual content units from captions"
    " for text-to-image training-data evaluation.\n"
    "Return only valid compact JSON. Extract only facts explicitly claimed"
    " by the caption. Do not infer image content beyond the caption."
)

# Schema of a single extracted unit. Key order is kept stable because the
# schema is serialized verbatim into the prompt text.
_CLAIMED_UNIT_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "category": {"type": "string", "enum": UNIT_CATEGORIES},
        "unit": {"type": "string", "maxLength": 80},
        "span": {"type": "string", "maxLength": 120},
        "target": {"type": "string", "maxLength": 80},
    },
    "required": ["category", "unit", "span", "target"],
    "additionalProperties": False,
}

# JSON schema of the full model response: the caption id plus the list of
# claimed units extracted from that caption.
CBU_JSON_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "caption_id": {"type": "string"},
        "claimed_units": {"type": "array", "items": _CLAIMED_UNIT_SCHEMA},
    },
    "required": ["caption_id", "claimed_units"],
    "additionalProperties": False,
}
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the request builder.

    Returns:
        The namespace produced by ``argparse`` for ``sys.argv``. ``--input``,
        ``--output`` and ``--surface`` are required; every other option has a
        default.
    """
    cli = argparse.ArgumentParser(description="Build claimed-CBU extraction request JSONL")
    # One entry per flag, in the order they should appear in ``--help``.
    option_table = [
        ("--input", {"required": True, "help": "Caption JSONL"}),
        ("--output", {"required": True}),
        ("--text-field", {"default": "caption"}),
        ("--id-field", {"default": None}),
        ("--surface", {"required": True}),
        ("--max-records", {"type": int, "default": None}),
        ("--sample-records", {"type": int, "default": None}),
        ("--sample-seed", {"type": int, "default": 0}),
        ("--max-caption-chars", {"type": int, "default": 1800}),
        (
            "--token-budget",
            {
                "type": int,
                "default": None,
                "help": "Optional whitespace token prefix budget for length-controlled CBU@B requests",
            },
        ),
        (
            "--max-units",
            {
                "type": int,
                "default": None,
                "help": "Optional maximum atomic units in the JSON schema; use only for stress/debug caps",
            },
        ),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
def stable_float(*parts: object) -> float:
    """Hash the given parts to a reproducible float between 0 and 1.

    The parts are stringified, joined with ``":"`` and hashed with an 8-byte
    BLAKE2b digest; the digest, read as a big-endian unsigned integer, is
    divided by 2**64. The same parts always give the same value, independent
    of process or interpreter hash seeds.
    """
    key = ":".join(map(str, parts)).encode("utf-8")
    hasher = hashlib.blake2b(key, digest_size=8)
    # The hex digest read base-16 is the big-endian integer value of the bytes.
    return int(hasher.hexdigest(), 16) / (1 << 64)
def iter_rows(args: argparse.Namespace) -> list[tuple[int, str | None, str]]:
    """Load the usable caption rows from the input JSONL file.

    Blank lines, lines whose JSON value is not an object, and rows whose text
    field is missing, not a string, or whitespace-only are skipped.

    Args:
        args: Parsed options. Reads ``input``, ``text_field``, ``id_field``,
            ``max_records``, ``sample_records``, ``sample_seed`` and
            ``surface``.

    Returns:
        A list of ``(line_index, row_id, caption)`` tuples in file order.
        ``line_index`` is the zero-based line number in the input file
        (skipped lines still count), and ``row_id`` is the stringified id
        field or ``None`` when no id field is configured or present.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows: list[tuple[int, str | None, str]] = []
    # The early stop only applies when no sampling is requested; sampling
    # needs to see every usable row before choosing.
    stop_early = args.max_records is not None and args.sample_records is None
    with Path(args.input).open("r", encoding="utf-8") as handle:
        for row_index, line in enumerate(handle):
            if stop_early and len(rows) >= args.max_records:
                break
            if not line.strip():
                continue
            row = json.loads(line)
            if not isinstance(row, dict):
                # A bare list/number/string line has no fields to read; treat
                # it like any other row without usable caption text.
                continue
            text = row.get(args.text_field)
            if not isinstance(text, str) or not text.strip():
                continue
            row_id = row.get(args.id_field) if args.id_field else None
            rows.append((row_index, str(row_id) if row_id is not None else None, text))
    if args.sample_records is not None:
        # Deterministic pseudo-random order: rank rows by a stable hash of
        # seed, surface, line index and id, then keep the first N.
        rows.sort(key=lambda item: stable_float(args.sample_seed, args.surface, item[0], item[1] or ""))
        # Clamp so a negative count selects nothing instead of acting as a
        # negative slice bound that drops rows from the end.
        rows = rows[: max(args.sample_records, 0)]
        # Restore file order for the selected sample.
        rows.sort(key=lambda item: item[0])
    return rows
def schema_with_max_units(max_units: int | None) -> dict[str, Any]:
    """Return an independent copy of the response schema, optionally capped.

    Args:
        max_units: When not ``None``, set as ``maxItems`` on the
            ``claimed_units`` array of the copy.

    Returns:
        A fresh schema dict; the module-level ``CBU_JSON_SCHEMA`` is never
        modified.
    """
    # Round-tripping through JSON text yields a deep copy of the schema.
    serialized = json.dumps(CBU_JSON_SCHEMA)
    schema_copy = json.loads(serialized)
    if max_units is None:
        return schema_copy
    schema_copy["properties"]["claimed_units"]["maxItems"] = max_units
    return schema_copy
def build_user_prompt(caption_id: str, caption: str, max_caption_chars: int, max_units: int | None) -> str:
    """Assemble the user message for one caption.

    Args:
        caption_id: Identifier echoed in the prompt as ``caption_id=...``.
        caption: Caption text; it is cut to ``max_caption_chars`` characters
            and then its newlines are replaced by spaces.
        max_caption_chars: Character limit applied to the caption.
        max_units: Optional ``maxItems`` cap forwarded to the embedded schema.

    Returns:
        The prompt text: instructions, the category list, the extraction
        rules, the compact JSON schema, a blank line, and the caption id and
        caption.
    """
    caption_text = caption[:max_caption_chars].replace("\n", " ")
    schema_text = json.dumps(
        schema_with_max_units(max_units),
        ensure_ascii=False,
        separators=(",", ":"),
    )
    rule_lines = [
        "- Each record must contain exactly one visual control fact.",
        "- Use each semantic fact once; choose the single best category.",
        "- unit is a short canonical phrase, not a full clause.",
        "- span is the shortest caption span supporting the unit.",
        '- target is the object or scene element modified by the unit; use "scene" when global.',
        "- relation units must include both the relation and participating objects; do not output lone verbs or prepositions.",
        "- count units must attach a number to a target object; never output articles such as a, an, or the.",
        "- text_rendering units are only visible rendered text explicitly claimed by the caption; absent text claims are not units.",
        "- Do not output negative or absent facts, metadata, captioner phrases, or duplicate paraphrases.",
        "- Keep text_rendering units short; do not copy long copyright, table, or legal text blocks.",
        "- Use [] when the caption contains no controllable visual units.",
    ]
    prompt_lines = [
        "Extract caption-claimed controllable visual units as atomic records.",
        "Unit categories: " + ", ".join(UNIT_CATEGORIES) + ".",
        "Rules:",
        *rule_lines,
        "Return only JSON matching this schema:",
        schema_text,
        "",  # blank line separating the instructions from the caption payload
        f"caption_id={caption_id}",
        f"caption={caption_text}",
    ]
    return "\n".join(prompt_lines)
def apply_token_budget(caption: str, token_budget: int | None) -> str:
    """Truncate a caption to its first ``token_budget`` whitespace tokens.

    Args:
        caption: Caption text.
        token_budget: Maximum number of whitespace-separated tokens to keep,
            or ``None`` to return the caption unchanged.

    Returns:
        The caption unchanged when no budget is given; otherwise the kept
        tokens joined by single spaces (so runs of whitespace, tabs and
        newlines are normalized). A budget of ``0`` yields ``""``.

    Raises:
        ValueError: If ``token_budget`` is negative. A negative value would
            otherwise act as a negative slice bound and drop tokens from the
            end of the caption rather than keep a prefix.
    """
    if token_budget is None:
        return caption
    if token_budget < 0:
        raise ValueError(f"token_budget must be non-negative, got {token_budget}")
    return " ".join(caption.split()[:token_budget])
def main() -> int:
    """Build the request JSONL file and its manifest from the CLI options.

    Writes one JSON object per selected caption to ``--output``, writes a
    manifest next to it (the output path with its suffix replaced by
    ``.manifest.json``), and prints a short JSON summary to stdout.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If ``--max-records`` and ``--sample-records`` are both
            given, or if a count/length option that is used as a slice bound
            is negative.
    """
    args = parse_args()
    if args.max_records is not None and args.sample_records is not None:
        raise SystemExit("--max-records and --sample-records are mutually exclusive")
    # These options end up as slice bounds; a negative value would silently
    # trim from the end instead of limiting the prefix, so reject it before
    # any output file is created.
    for flag, value in (
        ("--sample-records", args.sample_records),
        ("--token-budget", args.token_budget),
        ("--max-caption-chars", args.max_caption_chars),
    ):
        if value is not None and value < 0:
            raise SystemExit(f"{flag} must be non-negative")
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    rows = iter_rows(args)
    # The budget tag is part of the request id so the same caption gets a
    # different id under a different token budget.
    budget_tag = f"b{args.token_budget}" if args.token_budget is not None else "full"
    with output.open("w", encoding="utf-8") as handle:
        for emitted_index, (source_row, row_id, caption) in enumerate(rows):
            # Fall back to "<surface>:<line index>" when the row has no id.
            caption_id = row_id or f"{args.surface}:{source_row}"
            request_caption = apply_token_budget(caption, args.token_budget)
            request_id = hashlib.blake2b(
                f"claimed_cbu_v2:{budget_tag}:{args.surface}:{source_row}:{caption_id}".encode("utf-8"),
                digest_size=16,
            ).hexdigest()
            row = {
                "request_id": request_id,
                "task": "claimed_cbu_v2",
                "token_budget": args.token_budget,
                "surface": args.surface,
                "caption_id": caption_id,
                "source_row": source_row,
                "emitted_index": emitted_index,
                "caption": request_caption,
                "source_caption": caption,
                "system_prompt": SYSTEM_PROMPT,
                "user_prompt": build_user_prompt(caption_id, request_caption, args.max_caption_chars, args.max_units),
            }
            handle.write(json.dumps(row, ensure_ascii=False) + "\n")
    manifest = {
        "task": "claimed_cbu_v2",
        "input": args.input,
        "output": str(output),
        "surface": args.surface,
        "text_field": args.text_field,
        "id_field": args.id_field,
        "max_records": args.max_records,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        # Recorded because it changes the caption text embedded in prompts.
        "max_caption_chars": args.max_caption_chars,
        "token_budget": args.token_budget,
        "max_units": args.max_units,
        "rows": len(rows),
        "schema": schema_with_max_units(args.max_units),
    }
    manifest_path = output.with_suffix(".manifest.json")
    manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": str(output), "manifest": str(manifest_path), "requests": len(rows)}, indent=2))
    return 0
if __name__ == "__main__":
    # Run the builder and use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)