#!/usr/bin/env python3
"""Build text-only claimed-CBU extraction requests from caption JSONL files."""
from __future__ import annotations
import argparse
import hashlib
import json
from pathlib import Path
from typing import Any
# Closed set of unit categories; it is used both as the schema enum and as the
# category list spelled out in the user prompt.
UNIT_CATEGORIES = [
    "object",
    "attribute",
    "relation",
    "style",
    "camera",
    "lighting",
    "count",
    "text_rendering",
]

# System-turn instructions attached to every emitted request (two lines of text).
SYSTEM_PROMPT = (
    "You extract atomic controllable visual content units from captions for text-to-image training-data evaluation.\n"
    "Return only valid compact JSON. Extract only facts explicitly claimed by the caption. Do not infer image content beyond the caption."
)

# JSON schema of the expected model answer: a caption id plus a list of unit
# records. Key order is kept stable because the schema is serialized verbatim
# into the prompt and the manifest.
CBU_JSON_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "caption_id": {"type": "string"},
        "claimed_units": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "category": {"type": "string", "enum": UNIT_CATEGORIES},
                    "unit": {"type": "string", "maxLength": 80},
                    "span": {"type": "string", "maxLength": 120},
                    "target": {"type": "string", "maxLength": 80},
                },
                "required": ["category", "unit", "span", "target"],
                "additionalProperties": False,
            },
        },
    },
    "required": ["caption_id", "claimed_units"],
    "additionalProperties": False,
}
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the request builder.

    ``--input``, ``--output`` and ``--surface`` are required; every other
    option has a default. Options are registered in a fixed order so the
    generated ``--help`` listing stays stable.

    Returns:
        The namespace produced by ``argparse`` for the process arguments.
    """
    cli = argparse.ArgumentParser(description="Build claimed-CBU extraction request JSONL")

    # (flag, add_argument keyword arguments), in registration order.
    option_table = [
        ("--input", {"required": True, "help": "Caption JSONL"}),
        ("--output", {"required": True}),
        ("--text-field", {"default": "caption"}),
        ("--id-field", {"default": None}),
        ("--surface", {"required": True}),
        ("--max-records", {"type": int, "default": None}),
        ("--sample-records", {"type": int, "default": None}),
        ("--sample-seed", {"type": int, "default": 0}),
        ("--max-caption-chars", {"type": int, "default": 1800}),
        (
            "--token-budget",
            {
                "type": int,
                "default": None,
                "help": "Optional whitespace token prefix budget for length-controlled CBU@B requests",
            },
        ),
        (
            "--max-units",
            {
                "type": int,
                "default": None,
                "help": "Optional maximum atomic units in the JSON schema; use only for stress/debug caps",
            },
        ),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    return cli.parse_args()
def stable_float(*parts: object) -> float:
    """Map the given parts to a reproducible pseudo-random float.

    The parts are stringified, joined with ``:``, hashed with 8-byte BLAKE2b,
    and the digest (read as a big-endian unsigned integer) is scaled by 2**64.
    The same parts therefore always give the same value, independent of the
    process or of Python's hash randomization.

    Args:
        *parts: Values whose ``str()`` forms make up the hash key.

    Returns:
        The scaled digest, a float between 0 and 1.
    """
    key = ":".join(map(str, parts))
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(key.encode("utf-8"))
    as_integer = int.from_bytes(hasher.digest(), byteorder="big")
    return as_integer / (1 << 64)
def iter_rows(args: argparse.Namespace) -> list[tuple[int, str | None, str]]:
rows: list[tuple[int, str | None, str]] = []
with Path(args.input).open("r", encoding="utf-8") as handle:
for row_index, line in enumerate(handle):
if args.max_records is not None and args.sample_records is None and len(rows) >= args.max_records:
break
if not line.strip():
continue
row = json.loads(line)
text = row.get(args.text_field)
if not isinstance(text, str) or not text.strip():
continue
row_id = row.get(args.id_field) if args.id_field else None
rows.append((row_index, str(row_id) if row_id is not None else None, text))
if args.sample_records is not None:
rows.sort(key=lambda item: stable_float(args.sample_seed, args.surface, item[0], item[1] or ""))
rows = rows[: args.sample_records]
rows.sort(key=lambda item: item[0])
return rows
def schema_with_max_units(max_units: int | None) -> dict[str, Any]:
    """Return a private copy of the answer schema, optionally capped.

    Args:
        max_units: When not ``None``, stored as ``maxItems`` on the
            ``claimed_units`` array of the copy.

    Returns:
        A fresh schema dict; the module-level ``CBU_JSON_SCHEMA`` is never
        modified.
    """
    # A JSON round trip produces a fully independent deep copy.
    schema_copy = json.loads(json.dumps(CBU_JSON_SCHEMA))
    if max_units is None:
        return schema_copy
    schema_copy["properties"]["claimed_units"]["maxItems"] = max_units
    return schema_copy
def build_user_prompt(caption_id: str, caption: str, max_caption_chars: int, max_units: int | None) -> str:
    """Assemble the user-turn prompt for a single caption.

    The prompt lists the unit categories and extraction rules, embeds the
    compact JSON schema of the expected answer, and ends with the caption id
    and the caption text.

    Args:
        caption_id: Identifier echoed to the model on the ``caption_id=`` line.
        caption: Caption text; only its first ``max_caption_chars`` characters
            are used, with line feeds replaced by spaces.
        max_caption_chars: Character limit applied to the caption.
        max_units: Optional ``maxItems`` cap forwarded to the schema.

    Returns:
        The complete prompt string.
    """
    caption_line = caption[:max_caption_chars].replace("\n", " ")
    compact_schema = json.dumps(
        schema_with_max_units(max_units),
        ensure_ascii=False,
        separators=(",", ":"),
    )
    category_list = ", ".join(UNIT_CATEGORIES)

    # One entry per output line; the empty entry yields the blank line that
    # separates the schema from the caption payload.
    prompt_lines = [
        "Extract caption-claimed controllable visual units as atomic records.",
        f"Unit categories: {category_list}.",
        "Rules:",
        "- Each record must contain exactly one visual control fact.",
        "- Use each semantic fact once; choose the single best category.",
        "- unit is a short canonical phrase, not a full clause.",
        "- span is the shortest caption span supporting the unit.",
        "- target is the object or scene element modified by the unit; use \"scene\" when global.",
        "- relation units must include both the relation and participating objects; do not output lone verbs or prepositions.",
        "- count units must attach a number to a target object; never output articles such as a, an, or the.",
        "- text_rendering units are only visible rendered text explicitly claimed by the caption; absent text claims are not units.",
        "- Do not output negative or absent facts, metadata, captioner phrases, or duplicate paraphrases.",
        "- Keep text_rendering units short; do not copy long copyright, table, or legal text blocks.",
        "- Use [] when the caption contains no controllable visual units.",
        "Return only JSON matching this schema:",
        compact_schema,
        "",
        f"caption_id={caption_id}",
        f"caption={caption_line}",
    ]
    return "\n".join(prompt_lines)
def apply_token_budget(caption: str, token_budget: int | None) -> str:
if token_budget is None:
return caption
return " ".join(caption.split()[:token_budget])
def main() -> int:
    """Build the request JSONL and its manifest from the command line.

    Steps: parse the options, reject the combination of ``--max-records`` and
    ``--sample-records``, load the selected rows, write one request per row to
    ``--output``, write a ``.manifest.json`` file next to it, and print a
    short JSON summary to stdout.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: When both ``--max-records`` and ``--sample-records`` are
            given.
    """
    args = parse_args()
    head_and_sample = args.max_records is not None and args.sample_records is not None
    if head_and_sample:
        raise SystemExit("--max-records and --sample-records are mutually exclusive")

    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    selected = iter_rows(args)

    # Identical for every request, so computed once up front.
    budget_tag = "full" if args.token_budget is None else f"b{args.token_budget}"

    with out_path.open("w", encoding="utf-8") as sink:
        for emitted_index, (source_row, row_id, caption) in enumerate(selected):
            # Rows without an id get a synthetic "<surface>:<line index>" id.
            caption_id = row_id or f"{args.surface}:{source_row}"
            request_caption = apply_token_budget(caption, args.token_budget)

            # The request id is a hash of task, budget, surface, row and id.
            id_seed = f"claimed_cbu_v2:{budget_tag}:{args.surface}:{source_row}:{caption_id}"
            request_id = hashlib.blake2b(id_seed.encode("utf-8"), digest_size=16).hexdigest()

            user_prompt = build_user_prompt(
                caption_id,
                request_caption,
                args.max_caption_chars,
                args.max_units,
            )
            record = {
                "request_id": request_id,
                "task": "claimed_cbu_v2",
                "token_budget": args.token_budget,
                "surface": args.surface,
                "caption_id": caption_id,
                "source_row": source_row,
                "emitted_index": emitted_index,
                "caption": request_caption,
                "source_caption": caption,
                "system_prompt": SYSTEM_PROMPT,
                "user_prompt": user_prompt,
            }
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")

    request_count = len(selected)
    manifest = {
        "task": "claimed_cbu_v2",
        "input": args.input,
        "output": str(out_path),
        "surface": args.surface,
        "text_field": args.text_field,
        "id_field": args.id_field,
        "max_records": args.max_records,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "token_budget": args.token_budget,
        "max_units": args.max_units,
        "rows": request_count,
        "schema": schema_with_max_units(args.max_units),
    }
    # The manifest replaces the output's final suffix with ".manifest.json".
    manifest_path = out_path.with_suffix(".manifest.json")
    manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")

    summary = {"output": str(out_path), "manifest": str(manifest_path), "requests": request_count}
    print(json.dumps(summary, indent=2))
    return 0
# Script entry point: run main() only when executed directly, and hand its
# return value to SystemExit so it becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|