Authors committed on
Commit
7f59fb7
·
verified ·
1 Parent(s): 587e704

Initial anonymous NeurIPS 2026 E&D code and results release

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +13 -0
  2. README_RELEASE.md +5 -0
  3. eval_code/configs/recap/vllm_serve_gemma4_31b_it.yaml +40 -0
  4. eval_code/scripts/build_caption_cbu_requests.py +196 -0
  5. eval_code/scripts/build_cbu_vqa_requests.py +139 -0
  6. eval_code/scripts/build_grounded_cbu_verify_requests.py +197 -0
  7. eval_code/scripts/caption_embedding_vendi.py +1330 -0
  8. eval_code/scripts/compute_longclip_retrieval_margin.py +368 -0
  9. eval_code/scripts/export_cbu_metric_tables.py +386 -0
  10. eval_code/scripts/export_cbu_vqa_tables.py +84 -0
  11. eval_code/scripts/pack_recap_ed_metrics.py +223 -0
  12. eval_code/scripts/plot_caption_survey_curves.py +251 -0
  13. eval_code/scripts/run_cbu_vqa_requests.py +261 -0
  14. eval_code/scripts/run_grounded_cbu_verify_requests.py +289 -0
  15. eval_code/scripts/run_text_json_requests.py +256 -0
  16. eval_code/scripts/summarize_cbu_responses.py +296 -0
  17. eval_code/scripts/summarize_cbu_vqa_responses.py +153 -0
  18. eval_code/scripts/summarize_grounded_cbu_verify.py +135 -0
  19. eval_code/scripts/vllm/serve_gemma4_31b_it.sh +72 -0
  20. eval_results/ALL_EVAL_RESULTS_INDEX.md +28 -0
  21. eval_results/README.md +14 -0
  22. eval_results/all_cbu_b64_summary.csv +15 -0
  23. eval_results/all_vqa_b64_summary.csv +17 -0
  24. eval_results/cc12m_budget_frontier_plot.csv +17 -0
  25. eval_results/cc12m_cbu_budget_frontier.png +0 -0
  26. eval_results/cc12m_cbu_vqa_bootstrap_ci.tsv +5 -0
  27. eval_results/cc12m_cbu_yield_efficiency_scatter.png +0 -0
  28. eval_results/cc12m_gemma4_vqa_bootstrap_ci.tsv +5 -0
  29. eval_results/cc12m_longclip_plot.csv +9 -0
  30. eval_results/cc12m_vqa_supported_risk_pareto.csv +9 -0
  31. eval_results/cc12m_vqa_supported_risk_pareto.png +0 -0
  32. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/README.md +106 -0
  33. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/cpu_text_metrics/cpu_text_comparison.md +4 -0
  34. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/cpu_text_metrics/cpu_text_comparison.tsv +3 -0
  35. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/cpu_text_metrics/cpu_text_summary.json +56 -0
  36. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/cbu_bootstrap_summary.json +238 -0
  37. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/cbu_vqa_gemma4_table.md +3 -0
  38. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/cbu_vqa_gemma4_table.tex +7 -0
  39. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/claimed_cbu_ci.tsv +2 -0
  40. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/grounded_cbu_category_ci.tsv +9 -0
  41. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/grounded_cbu_ci.tsv +2 -0
  42. eval_results/datacomp-naive-qwen35-baseline-2026-05-02/naive_qwen35_caption.summary.json +15 -0
  43. eval_results/embeddinggemma_pair_summary.tsv +8 -0
  44. eval_results/eval_results_summary.md +34 -0
  45. eval_results/gemma-cross-corpus-2026-05-02/README.md +3 -0
  46. eval_results/gemma-cross-corpus-2026-05-02/cbu_bootstrap_summary.json +1375 -0
  47. eval_results/gemma-cross-corpus-2026-05-02/cbu_vqa_gemma4_cross_corpus_table.md +10 -0
  48. eval_results/gemma-cross-corpus-2026-05-02/cbu_vqa_gemma4_cross_corpus_table.tex +14 -0
  49. eval_results/gemma-cross-corpus-2026-05-02/claimed_cbu_ci.tsv +1 -0
  50. eval_results/gemma-cross-corpus-2026-05-02/grounded_cbu_category_ci.tsv +65 -0
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # NeurIPS E&D Recap Evaluation Export Bundle
2
+
3
+ This sanitized bundle groups review-facing material for a recaptioned T2I supervision evaluation submission.
4
+
5
+ ## Layout
6
+
7
+ - `dataset_release/`: Hugging Face oriented caption metadata records, grouped pair records, dataset card draft, and Croissant metadata template. Source images are not included.
8
+ - `eval_code/`: reproducible evaluation scripts and vLLM configuration copies.
9
+ - `eval_results/`: compact result tables, plot-ready CSV files, and generated figure drafts.
10
+ - `paper_drafts/`: sanitized writing drafts and appendix notes.
11
+ - `metadata/`: auxiliary export metadata.
12
+
13
+ Local machine paths, usernames, and repository identifiers have been replaced with placeholders.
README_RELEASE.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Anonymous Recap T2I Evaluation Code and Results
2
+
3
+ This repository stages executable evaluation scripts, compact result tables, and manifests for the NeurIPS 2026 E&D review package. Dataset metadata is staged separately at `https://huggingface.co/datasets/Anonymous1557/recap-t2i-evaluation-metadata-2026`.
4
+
5
+ Large image audit tarballs and unredacted source metadata are excluded from this code package and retained in the private SMB archive unless explicitly approved for release.
eval_code/configs/recap/vllm_serve_gemma4_31b_it.yaml ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# vLLM serve config: google/gemma-4-31B-it, DP=8
#
# Intended use:
# VLLM_CONFIG=configs/recap/vllm_serve_gemma4_31b_it.yaml \
# VLLM_LOG=/tmp/vllm_gemma4_31b_it.log \
# bash scripts/vllm/serve_gemma4_31b_it.sh start
#
# This config is for cross-family VQA/CBU judging. It uses one replica per H200
# to maximize throughput on image-conditioned yes/no/uncertain audit requests.

# <HF_CACHE> is a sanitized placeholder; replace it with the local Hugging Face
# cache directory before use.
model: "<HF_CACHE>/models--google--gemma-4-31B-it/snapshots/439edf5652646a0d1bd8b46bfdc1d3645761a445"
served-model-name: "google/gemma-4-31B-it"
# NOTE(review): 0.0.0.0 binds every network interface. Together with
# allowed-local-media-path "/" below, any client that can reach this port may
# ask the server to read local files. Run only on a trusted, isolated network.
host: "0.0.0.0"
port: 8000

# Parallelism
# Eight data-parallel replicas, each unsharded (tensor-parallel-size 1).
data-parallel-size: 8
tensor-parallel-size: 1

# Memory and concurrency
dtype: "auto"
gpu-memory-utilization: 0.94
max-model-len: 4096
max-num-seqs: 512
max-num-batched-tokens: 65536
max-cudagraph-capture-size: 512

# Keep KV compact for high-concurrency VQA judge workloads.
kv-cache-dtype: "fp8"

# Multimodal / throughput
enable-chunked-prefill: true
enable-prefix-caching: true
# At most one image per prompt, matching the single-image audit requests.
limit-mm-per-prompt: '{"image": 1}'
mm-processor-kwargs: '{"max_pixels": 1003520}'
# NOTE(review): "/" allows requests to reference any file readable by the
# server process. vLLM documents this option as a security risk; consider
# narrowing it to the image directory when serving outside a sandbox.
allowed-local-media-path: "/"

# Logging
disable-uvicorn-access-log: true
uvicorn-log-level: "warning"
eval_code/scripts/build_caption_cbu_requests.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Build text-only claimed-CBU extraction requests from caption JSONL files."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import hashlib
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
# Closed vocabulary of unit categories. It is used as the `category` enum in
# CBU_JSON_SCHEMA and is listed verbatim in the extraction user prompt.
UNIT_CATEGORIES = [
    "object",
    "attribute",
    "relation",
    "style",
    "camera",
    "lighting",
    "count",
    "text_rendering",
]


# System message stored alongside every claimed-CBU extraction request.
SYSTEM_PROMPT = """You extract atomic controllable visual content units from captions for text-to-image training-data evaluation.
Return only valid compact JSON. Extract only facts explicitly claimed by the caption. Do not infer image content beyond the caption."""


# JSON Schema for the expected model response: a caption id plus a list of
# claimed units, each with a category, a short unit phrase, the supporting
# caption span, and the target the unit applies to. Extra keys are rejected at
# both levels via additionalProperties=False.
CBU_JSON_SCHEMA: dict[str, Any] = {
    "type": "object",
    "properties": {
        "caption_id": {"type": "string"},
        "claimed_units": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "category": {"type": "string", "enum": UNIT_CATEGORIES},
                    "unit": {"type": "string", "maxLength": 80},
                    "span": {"type": "string", "maxLength": 120},
                    "target": {"type": "string", "maxLength": 80},
                },
                "required": ["category", "unit", "span", "target"],
                "additionalProperties": False,
            },
        },
    },
    "required": ["caption_id", "claimed_units"],
    "additionalProperties": False,
}
51
+
52
+
53
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the claimed-CBU request builder.

    Returns:
        The populated namespace. ``--input``, ``--output`` and ``--surface``
        are mandatory; every other option has a default.
    """
    cli = argparse.ArgumentParser(description="Build claimed-CBU extraction request JSONL")
    # Declared as data so that the option order (and thus --help output) is
    # easy to audit in one place.
    option_table = [
        ("--input", {"required": True, "help": "Caption JSONL"}),
        ("--output", {"required": True}),
        ("--text-field", {"default": "caption"}),
        ("--id-field", {"default": None}),
        ("--surface", {"required": True}),
        ("--max-records", {"type": int, "default": None}),
        ("--sample-records", {"type": int, "default": None}),
        ("--sample-seed", {"type": int, "default": 0}),
        ("--max-caption-chars", {"type": int, "default": 1800}),
        (
            "--token-budget",
            {
                "type": int,
                "default": None,
                "help": "Optional whitespace token prefix budget for length-controlled CBU@B requests",
            },
        ),
        (
            "--max-units",
            {
                "type": int,
                "default": None,
                "help": "Optional maximum atomic units in the JSON schema; use only for stress/debug caps",
            },
        ),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
77
+
78
+
79
def stable_float(*parts: object) -> float:
    """Hash the given parts into a reproducible float between 0 and 1.

    The parts are stringified, joined with ``:``, and hashed with an 8-byte
    BLAKE2b digest; the digest is read as a big-endian integer and scaled by
    2**64. The result depends only on the inputs, never on process state.
    """
    joined = ":".join(map(str, parts))
    hasher = hashlib.blake2b(digest_size=8)
    hasher.update(joined.encode("utf-8"))
    as_integer = int.from_bytes(hasher.digest(), "big")
    return as_integer / 2**64
83
+
84
+
85
def iter_rows(args: argparse.Namespace) -> list[tuple[int, str | None, str]]:
    """Load usable caption rows from the input JSONL.

    Each returned tuple is ``(line_index, row_id_or_None, caption_text)``.
    Blank lines and rows whose text field is missing, not a string, or only
    whitespace are dropped; ``line_index`` still counts them.

    Without ``--sample-records`` reading stops once ``--max-records`` rows were
    kept. With ``--sample-records`` the whole file is read, rows are ranked by
    a seeded hash, the first N are kept and then restored to file order.
    """
    head_limit = args.max_records if args.sample_records is None else None
    kept: list[tuple[int, str | None, str]] = []
    with Path(args.input).open("r", encoding="utf-8") as stream:
        for line_index, raw_line in enumerate(stream):
            if head_limit is not None and len(kept) >= head_limit:
                break
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            caption_text = record.get(args.text_field)
            if isinstance(caption_text, str) and caption_text.strip():
                identifier = record.get(args.id_field) if args.id_field else None
                kept.append((line_index, None if identifier is None else str(identifier), caption_text))
    if args.sample_records is None:
        return kept

    def sample_rank(entry: tuple[int, str | None, str]) -> float:
        # Rank depends on seed, surface, line index and id only, so the sample
        # is reproducible across runs.
        return stable_float(args.sample_seed, args.surface, entry[0], entry[1] or "")

    kept = sorted(kept, key=sample_rank)[: args.sample_records]
    kept.sort(key=lambda entry: entry[0])
    return kept
104
+
105
+
106
def schema_with_max_units(max_units: int | None) -> dict[str, Any]:
    """Return an independent copy of CBU_JSON_SCHEMA, optionally capped.

    The copy is made via a JSON round trip so the module-level schema is never
    mutated. When ``max_units`` is given it is set as ``maxItems`` on the
    ``claimed_units`` array.
    """
    cloned = json.loads(json.dumps(CBU_JSON_SCHEMA))
    if max_units is None:
        return cloned
    cloned["properties"]["claimed_units"]["maxItems"] = max_units
    return cloned
111
+
112
+
113
def build_user_prompt(caption_id: str, caption: str, max_caption_chars: int, max_units: int | None) -> str:
    """Assemble the user message for one claimed-CBU extraction request.

    The caption is cut to ``max_caption_chars`` characters and its newlines are
    replaced by spaces. The prompt lists the unit categories, the extraction
    rules, the compact JSON schema (optionally capped by ``max_units``), and
    finally the caption id and caption text.
    """
    caption_text = caption[:max_caption_chars].replace("\n", " ")
    schema_text = json.dumps(schema_with_max_units(max_units), ensure_ascii=False, separators=(",", ":"))
    category_list = ", ".join(UNIT_CATEGORIES)
    prompt_lines = [
        "Extract caption-claimed controllable visual units as atomic records.",
        f"Unit categories: {category_list}.",
        "Rules:",
        "- Each record must contain exactly one visual control fact.",
        "- Use each semantic fact once; choose the single best category.",
        "- unit is a short canonical phrase, not a full clause.",
        "- span is the shortest caption span supporting the unit.",
        "- target is the object or scene element modified by the unit; use \"scene\" when global.",
        "- relation units must include both the relation and participating objects; do not output lone verbs or prepositions.",
        "- count units must attach a number to a target object; never output articles such as a, an, or the.",
        "- text_rendering units are only visible rendered text explicitly claimed by the caption; absent text claims are not units.",
        "- Do not output negative or absent facts, metadata, captioner phrases, or duplicate paraphrases.",
        "- Keep text_rendering units short; do not copy long copyright, table, or legal text blocks.",
        "- Use [] when the caption contains no controllable visual units.",
        "Return only JSON matching this schema:",
        schema_text,
        # Empty entry yields the blank line between the schema and the payload.
        "",
        f"caption_id={caption_id}",
        f"caption={caption_text}",
    ]
    return "\n".join(prompt_lines)
136
+
137
+
138
def apply_token_budget(caption: str, token_budget: int | None) -> str:
    """Truncate a caption to its first ``token_budget`` whitespace tokens.

    Args:
        caption: Source caption text.
        token_budget: Maximum number of whitespace-separated tokens to keep,
            or ``None`` to return the caption untouched.

    Returns:
        The caption unchanged when no budget is set; otherwise the kept tokens
        joined by single spaces (so whitespace runs and newlines collapse).
        A zero or negative budget yields an empty string.
    """
    if token_budget is None:
        return caption
    # A negative budget would otherwise slice [:-n] and keep "all but the last
    # n tokens", which is not a prefix budget at all. Clamp it to zero.
    kept_tokens = caption.split()[: max(token_budget, 0)]
    return " ".join(kept_tokens)
142
+
143
+
144
def main() -> int:
    """Write claimed-CBU extraction requests plus a sidecar manifest.

    One JSON line is emitted per usable caption row. Each request id is a
    BLAKE2b hash over the task tag, token budget tag, surface, source row and
    caption id, so it is reproducible across runs. A manifest is written next
    to the output (``<output stem>.manifest.json``) recording the options used.

    Returns:
        0 on success.

    Raises:
        SystemExit: if both ``--max-records`` and ``--sample-records`` are set.
    """
    args = parse_args()
    if args.max_records is not None and args.sample_records is not None:
        raise SystemExit("--max-records and --sample-records are mutually exclusive")
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    rows = iter_rows(args)
    # Loop-invariant: the same budget applies to every request in this run.
    budget_tag = f"b{args.token_budget}" if args.token_budget is not None else "full"
    with output.open("w", encoding="utf-8") as handle:
        for emitted_index, (source_row, row_id, caption) in enumerate(rows):
            # Fall back to a surface-scoped line index when the row has no id.
            caption_id = row_id or f"{args.surface}:{source_row}"
            request_caption = apply_token_budget(caption, args.token_budget)
            request_id = hashlib.blake2b(
                f"claimed_cbu_v2:{budget_tag}:{args.surface}:{source_row}:{caption_id}".encode("utf-8"),
                digest_size=16,
            ).hexdigest()
            row = {
                "request_id": request_id,
                "task": "claimed_cbu_v2",
                "token_budget": args.token_budget,
                "surface": args.surface,
                "caption_id": caption_id,
                "source_row": source_row,
                "emitted_index": emitted_index,
                # Budget-truncated caption; the prompt may clip it further to
                # --max-caption-chars.
                "caption": request_caption,
                "source_caption": caption,
                "system_prompt": SYSTEM_PROMPT,
                "user_prompt": build_user_prompt(caption_id, request_caption, args.max_caption_chars, args.max_units),
            }
            handle.write(json.dumps(row, ensure_ascii=False) + "\n")
    manifest = {
        "task": "claimed_cbu_v2",
        "input": args.input,
        "output": str(output),
        "surface": args.surface,
        "text_field": args.text_field,
        "id_field": args.id_field,
        "max_records": args.max_records,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        # Recorded because it changes the text of every emitted prompt; without
        # it the run cannot be reproduced from the manifest alone.
        "max_caption_chars": args.max_caption_chars,
        "token_budget": args.token_budget,
        "max_units": args.max_units,
        "rows": len(rows),
        "schema": schema_with_max_units(args.max_units),
    }
    manifest_path = output.with_suffix(".manifest.json")
    manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": str(output), "manifest": str(manifest_path), "requests": len(rows)}, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval_code/scripts/build_cbu_vqa_requests.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Build VQA-style yes/no question requests from grounded-CBU request JSONL."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import hashlib
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
# System message stored with every VQA-style judging request built here.
SYSTEM_PROMPT = """You are a strict visual question answering judge.
Return only valid compact JSON. Answer each question using only visible image evidence."""
15
+
16
+
17
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the CBU-VQA request builder.

    Returns:
        The populated namespace. ``--input`` and ``--output`` are mandatory.
    """
    cli = argparse.ArgumentParser(description="Build VQA-style requests from CBU verification requests")
    cli.add_argument("--input", required=True, help="grounded-CBU request JSONL")
    cli.add_argument("--output", required=True)
    # The optional integer knobs share type=int; only the default differs.
    for flag, fallback in (
        ("--max-requests", None),
        ("--sample-records", None),
        ("--sample-seed", 0),
        ("--max-questions-per-request", None),
    ):
        cli.add_argument(flag, type=int, default=fallback)
    return cli.parse_args()
26
+
27
+
28
def stable_float(*parts: object) -> float:
    """Derive a deterministic float between 0 and 1 from the given parts.

    Parts are converted with ``str``, joined by ``:``, and hashed with an
    8-byte BLAKE2b digest that is interpreted as a big-endian integer and
    divided by 2**64.
    """
    key_bytes = ":".join([str(piece) for piece in parts]).encode("utf-8")
    digest_bytes = hashlib.blake2b(key_bytes, digest_size=8).digest()
    numerator = int.from_bytes(digest_bytes, "big")
    return numerator / 2**64
32
+
33
+
34
def question_for(unit: dict[str, Any]) -> str:
    """Turn one claimed unit into a yes/no question about the image.

    ``text_rendering`` units ask about the rendered text claim. Other units
    ask about the visual claim, prefixed by the target when one is present.
    Unit phrase and target are stripped of surrounding whitespace.
    """
    claim = str(unit.get("unit", "")).strip()
    subject = str(unit.get("target", "")).strip()
    kind = str(unit.get("category", ""))
    if kind == "text_rendering":
        return f"Is the rendered text claim '{claim}' visibly supported by the image?"
    if not subject:
        return f"Is the visual claim '{claim}' supported by the image?"
    return f"Is the visual claim '{subject}: {claim}' supported by the image?"
43
+
44
+
45
def user_prompt(questions: list[dict[str, str]]) -> str:
    """Build the user message that asks the judge to answer each question.

    The message consists of a fixed instruction header, the answering rules,
    a blank line, and the questions serialised as compact JSON (non-ASCII
    characters are kept as-is).
    """
    header_lines = [
        "Answer each visual question using only the image.",
        "Rules:",
        "- Do not use any caption text or outside knowledge.",
        "- Use yes when the image visibly supports the question.",
        "- Use no when the image contradicts the question or lacks visible support.",
        "- Use uncertain when the question is too fine-grained, occluded, unreadable, or visually ambiguous.",
        "- Keep evidence short and grounded in visible image content.",
        "- Return exactly one answer for each input question_id.",
    ]
    payload = json.dumps(questions, ensure_ascii=False, separators=(",", ":"))
    return "\n".join(header_lines) + "\n\n" + f"questions={payload}"
58
+
59
+
60
def iter_rows(args: argparse.Namespace) -> list[dict[str, Any]]:
    """Load grounded-CBU request rows from the input JSONL.

    Blank lines are ignored. Without ``--sample-records`` reading stops once
    ``--max-requests`` rows were loaded. With ``--sample-records`` the whole
    file is read, rows are ranked by a seeded hash of their ``request_id``,
    the first N are kept, and the survivors are ordered by ``source_row``.
    """
    head_mode = args.sample_records is None
    loaded: list[dict[str, Any]] = []
    with Path(args.input).open("r", encoding="utf-8") as stream:
        for raw_line in stream:
            if head_mode and args.max_requests is not None and len(loaded) >= args.max_requests:
                break
            if not raw_line.strip():
                continue
            loaded.append(json.loads(raw_line))
    if head_mode:
        return loaded
    ranked = sorted(loaded, key=lambda record: stable_float(args.sample_seed, record.get("request_id", "")))
    chosen = ranked[: args.sample_records]
    chosen.sort(key=lambda record: record.get("source_row", 0))
    return chosen
73
+
74
+
75
def main() -> int:
    """Convert grounded-CBU requests into VQA-style requests plus a manifest.

    For every input row, each claimed unit that is a dict with a string
    ``unit_id`` becomes one question. Rows that yield no question are counted
    as skipped. Image and pairing fields are copied through from the input
    row. A manifest is written next to the output and echoed to stdout.

    Returns:
        0 on success.
    """
    args = parse_args()
    rows = iter_rows(args)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    skipped = 0
    with output.open("w", encoding="utf-8") as handle:
        for row in rows:
            units = row.get("claimed_units", [])
            # A null or otherwise non-list value cannot be sliced or iterated
            # as units; treat it as "no units" so the row is counted as skipped
            # instead of aborting the whole run.
            if not isinstance(units, list):
                units = []
            if args.max_questions_per_request is not None:
                units = units[: args.max_questions_per_request]
            questions = [
                {
                    "question_id": str(unit["unit_id"]),
                    "category": str(unit.get("category", "")),
                    "question": question_for(unit),
                }
                for unit in units
                if isinstance(unit, dict) and isinstance(unit.get("unit_id"), str)
            ]
            if not questions:
                skipped += 1
                continue
            # Derived from the upstream request id so reruns yield the same ids.
            request_id = hashlib.blake2b(
                f"cbu_vqa_v1:{row.get('request_id')}:{row.get('caption_id')}".encode("utf-8"),
                digest_size=16,
            ).hexdigest()
            out = {
                "request_id": request_id,
                "task": "cbu_vqa_v1",
                "surface": row.get("surface"),
                "caption_id": row.get("caption_id"),
                "source_row": row.get("source_row"),
                "token_budget": row.get("token_budget"),
                "questions": questions,
                "system_prompt": SYSTEM_PROMPT,
                "user_prompt": user_prompt(questions),
                "image_url": row.get("image_url"),
                "image_path": row.get("image_path"),
                "image_sha256": row.get("image_sha256"),
                "pair_id": row.get("pair_id"),
                "pair_key": row.get("pair_key"),
                "public_lookup_key": row.get("public_lookup_key"),
                "family": row.get("family"),
            }
            handle.write(json.dumps(out, ensure_ascii=False) + "\n")
            written += 1
    manifest = {
        "task": "cbu_vqa_v1",
        "input": args.input,
        "output": str(output),
        "requests": written,
        "skipped": skipped,
        # Recorded so a head-limited run can be reproduced from the manifest.
        "max_requests": args.max_requests,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "max_questions_per_request": args.max_questions_per_request,
    }
    output.with_suffix(".manifest.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps(manifest, indent=2, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval_code/scripts/build_grounded_cbu_verify_requests.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Build exact-unit image audit requests from claimed-CBU responses."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import hashlib
8
+ import json
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
# System message stored with every grounded-CBU verification request.
SYSTEM_PROMPT = """You are a strict visual grounding judge for text-to-image training captions.
Return only valid compact JSON. Judge only whether each provided caption-derived unit is visibly supported by the image."""
15
+
16
+
17
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the grounded-CBU verification builder.

    Returns:
        The populated namespace. ``--claimed-responses``, ``--source-jsonl``
        and ``--output`` are mandatory.
    """
    cli = argparse.ArgumentParser(description="Build exact-unit grounded-CBU verification requests")
    # Declared as data so that option order and help text sit in one table.
    option_table = [
        ("--claimed-responses", {"required": True}),
        ("--source-jsonl", {"required": True, "help": "Fair-slice JSONL used to build the claimed requests"}),
        ("--output", {"required": True}),
        ("--max-requests", {"type": int, "default": None}),
        ("--max-units-per-request", {"type": int, "default": None, "help": "Debug cap only; omit for main audit"}),
        ("--image-path-field", {"default": None}),
        (
            "--require-local-image",
            {
                "action": "store_true",
                "help": "Skip rows without a local image path. Use for reproducible image-grounded audits.",
            },
        ),
        (
            "--surface-filter",
            {
                "default": None,
                "help": "If set, keep only claimed responses whose request.surface exactly matches this value.",
            },
        ),
    ]
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()
36
+
37
+
38
def iter_ok_claims(path: Path, surface_filter: str | None = None) -> list[dict[str, Any]]:
    """Load successful claimed-CBU responses that contain at least one unit.

    Args:
        path: JSONL file of claimed-CBU responses; each line is expected to be
            an object with ``ok``, ``request`` and ``parsed`` keys.
        surface_filter: When set, keep only rows whose ``request.surface``
            equals this value exactly.

    Returns:
        A list of ``{"request": ..., "parsed": ...}`` dicts, in file order, for
        rows that are flagged ``ok`` and whose ``parsed.claimed_units`` is a
        non-empty list. Blank lines and malformed rows are dropped.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            row = json.loads(line)
            if not isinstance(row, dict):
                continue
            request = row.get("request")
            # `row.get("request", {})` only covers a missing key. An explicit
            # null (or any non-object) would crash on `.get` below or later in
            # the caller, and such a row cannot be joined back to its source
            # image anyway, so drop it here.
            if not isinstance(request, dict):
                continue
            if surface_filter is not None and request.get("surface") != surface_filter:
                continue
            parsed = row.get("parsed")
            units = parsed.get("claimed_units") if isinstance(parsed, dict) else None
            if not row.get("ok") or not isinstance(units, list) or not units:
                continue
            rows.append({"request": request, "parsed": parsed})
    return rows
54
+
55
+
56
def load_source_rows(source_jsonl: Path, needed: set[int]) -> dict[int, dict[str, Any]]:
    """Fetch the source rows at the requested zero-based line indices.

    Blank lines are counted for indexing but never returned. Reading stops as
    soon as as many rows were collected as indices were requested.

    Returns:
        Mapping from line index to the parsed JSON row; indices that are past
        the end of the file or point at blank lines are absent.
    """
    target_count = len(needed)
    selected: dict[int, dict[str, Any]] = {}
    with source_jsonl.open("r", encoding="utf-8") as stream:
        for line_index, text in enumerate(stream):
            wanted = line_index in needed
            if wanted and text.strip():
                selected[line_index] = json.loads(text)
            if len(selected) == target_count:
                break
    return selected
65
+
66
+
67
def image_fields(source_row: dict[str, Any], image_path_field: str | None) -> dict[str, Any]:
    """Collect image location and pairing metadata from one source row.

    The local image path is read from ``image_path_field`` when given;
    otherwise the first truthy value among several known locations is used.
    The image URL and SHA-256 are resolved by similar fallback chains; the URL
    chain ends with the row's ``pair_key``. Pairing keys are copied verbatim.
    """

    def section(name: str) -> dict[str, Any]:
        # Nested sections are optional; anything that is not a dict is treated
        # as an empty section.
        value = source_row.get(name)
        return value if isinstance(value, dict) else {}

    def first_truthy(*candidates: Any) -> Any:
        # Mirrors an `a or b or c` chain: the first truthy candidate wins,
        # otherwise the last candidate is returned as-is.
        picked = None
        for candidate in candidates:
            picked = candidate
            if candidate:
                break
        return picked

    image_section = section("image")
    metadata_section = section("metadata")
    local_section = section("local_record")
    public_section = section("public_record")

    if image_path_field:
        resolved_path = source_row.get(image_path_field)
    else:
        resolved_path = first_truthy(
            image_section.get("local_abs_path"),
            local_section.get("image_abs_path"),
            source_row.get("image_abs_path"),
            source_row.get("image_path"),
        )
    resolved_url = first_truthy(
        image_section.get("url"),
        source_row.get("url"),
        source_row.get("image_url"),
        metadata_section.get("canonical_url"),
        public_section.get("url"),
        source_row.get("pair_key"),
    )
    resolved_sha = first_truthy(image_section.get("sha256"), source_row.get("sha256"))

    result: dict[str, Any] = {
        "image_url": resolved_url,
        "image_path": resolved_path,
        "image_sha256": resolved_sha,
    }
    for passthrough in ("pair_id", "pair_key", "public_lookup_key", "family"):
        result[passthrough] = source_row.get(passthrough)
    return result
98
+
99
+
100
def normalize_unit(raw: dict[str, Any], caption_id: str, index: int) -> dict[str, str]:
    """Convert one raw claimed unit into a string-only record with a unit id.

    Args:
        raw: Unit object taken from a parsed claimed-CBU response.
        caption_id: Caption the unit belongs to; used as the unit id prefix.
        index: Position of the unit in the response, zero-padded to 4 digits.

    Returns:
        Dict with ``unit_id``, ``category``, ``unit``, ``span`` and ``target``.
        Missing or null fields become empty strings.
    """

    def text_of(key: str) -> str:
        # `str(raw.get(key, ""))` would render an explicit JSON null as the
        # literal text "None", which would then be judged as if it were a
        # real claim. Map null to the same empty string as a missing key.
        value = raw.get(key)
        return "" if value is None else str(value)

    return {
        "unit_id": f"{caption_id}:u{index:04d}",
        "category": text_of("category"),
        "unit": text_of("unit"),
        "span": text_of("span"),
        "target": text_of("target"),
    }
108
+
109
+
110
def user_prompt(caption: str, units: list[dict[str, str]]) -> str:
    """Build the user message asking the judge to verify each claimed unit.

    The message consists of a fixed instruction header, the labelling rules,
    the output requirement, a blank line, the caption, and the units
    serialised as compact JSON (non-ASCII characters are kept as-is).
    """
    instruction_lines = [
        "Verify the visual grounding of each provided caption-derived unit.",
        "Rules:",
        "- Do not add, remove, split, merge, rename, or reinterpret unit_id values.",
        "- Use grounded when the image visibly supports the unit.",
        "- Use unsupported when the image contradicts the unit or lacks visible support.",
        "- Use uncertain when the unit is too fine-grained, occluded, unreadable, or visually ambiguous.",
        "- Use invalid_text_unit only when the unit is not a meaningful visual claim from the caption.",
        "- Use not_a_visual_claim only for non-visual metadata or captioner-language units.",
        "- Keep evidence short; cite only visible image evidence.",
        "Return JSON with caption_id and unit_results, exactly one result for each input unit_id.",
    ]
    serialized_units = json.dumps(units, ensure_ascii=False, separators=(",", ":"))
    payload_lines = [f"caption={caption}", f"claimed_units={serialized_units}"]
    return "\n".join(instruction_lines) + "\n\n" + "\n".join(payload_lines)
126
+
127
+
128
def main() -> int:
    """Build grounded-CBU verification requests plus a sidecar manifest.

    Successful claimed-CBU responses are joined back to their source rows via
    ``request.source_row`` to recover image information. One request is
    written per claim that has a source row, (optionally) a local image path,
    and at least one dict-shaped unit; everything else is counted as skipped.
    A manifest is written next to the output and echoed to stdout.

    Returns:
        0 on success.
    """
    args = parse_args()
    claims = iter_ok_claims(Path(args.claimed_responses), args.surface_filter)
    if args.max_requests is not None:
        claims = claims[: args.max_requests]
    needed = {int(item["request"]["source_row"]) for item in claims if item["request"].get("source_row") is not None}
    sources = load_source_rows(Path(args.source_jsonl), needed)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    written = 0
    skipped = 0
    with output.open("w", encoding="utf-8") as handle:
        for item in claims:
            req = item["request"]
            # The `needed` set above already tolerates a missing or null
            # source_row; apply the same guard here so such a claim is counted
            # as skipped instead of aborting with KeyError/TypeError.
            source_index = req.get("source_row")
            source_row = sources.get(int(source_index)) if source_index is not None else None
            if source_row is None:
                skipped += 1
                continue
            image_info = image_fields(source_row, args.image_path_field)
            if args.require_local_image and not image_info.get("image_path"):
                skipped += 1
                continue
            # The id echoed in the parsed response takes precedence over the
            # request's own caption_id.
            caption_id = str(item["parsed"].get("caption_id") or req.get("caption_id"))
            # Unit ids use the position in the raw response, so non-dict
            # entries leave gaps rather than renumbering later units.
            units = [
                normalize_unit(raw, caption_id, index)
                for index, raw in enumerate(item["parsed"].get("claimed_units", []))
                if isinstance(raw, dict)
            ]
            if args.max_units_per_request is not None:
                units = units[: args.max_units_per_request]
            if not units:
                skipped += 1
                continue
            row = {
                "request_id": hashlib.blake2b(
                    f"grounded_cbu_verify_v2:{req.get('request_id')}:{caption_id}".encode("utf-8"),
                    digest_size=16,
                ).hexdigest(),
                "task": "grounded_cbu_verify_v2",
                "surface": req.get("surface"),
                "caption_id": caption_id,
                "source_row": req.get("source_row"),
                "token_budget": req.get("token_budget"),
                "caption": req.get("caption"),
                "source_caption": req.get("source_caption"),
                "claimed_units": units,
                "system_prompt": SYSTEM_PROMPT,
                "user_prompt": user_prompt(str(req.get("caption", "")), units),
                **image_info,
            }
            handle.write(json.dumps(row, ensure_ascii=False) + "\n")
            written += 1
    manifest = {
        "task": "grounded_cbu_verify_v2",
        "claimed_responses": args.claimed_responses,
        "source_jsonl": args.source_jsonl,
        "output": str(output),
        "requests": written,
        "skipped": skipped,
        "max_requests": args.max_requests,
        "max_units_per_request": args.max_units_per_request,
        "surface_filter": args.surface_filter,
    }
    output.with_suffix(".manifest.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps(manifest, indent=2, ensure_ascii=False))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval_code/scripts/caption_embedding_vendi.py ADDED
@@ -0,0 +1,1330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Encode caption text and compute block Vendi scores.
3
+
4
+ The script is intentionally split into three subcommands:
5
+ - `inspect`: report tokenizer/config limits for candidate encoders
6
+ - `encode`: cache normalized text embeddings from JSONL captions
7
+ - `vendi`: compute sampled block Vendi/effective-rank summaries from caches
8
+
9
+ The encoder path is GPU-ready but the same code can be sanity-checked on CPU
10
+ with a tiny sample before H200 allocation.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import json
17
+ import math
18
+ import random
19
+ import sys
20
+ import time
21
+ import types
22
+ from dataclasses import asdict, dataclass
23
+ from pathlib import Path
24
+ from typing import Any, Iterable
25
+
26
+ import numpy as np
27
+ import torch
28
+
29
+
30
@dataclass
class EmbeddingShard:
    """Manifest record describing one embedding shard saved by `flush_shard`."""

    # Path of the saved `.npy` file, stored as a string.
    path: str
    # Number of embedding rows held by the shard.
    rows: int
    # Embedding width; `flush_shard` records 0 when the saved array is not 2-D.
    dim: int
    # Name of the dtype the embeddings were stored with (e.g. "float16").
    dtype: str
    # Offset of the shard's first row within this worker's output.
    start_row: int
    # `start_row` plus the number of rows in the shard (exclusive end).
    end_row: int
38
+
39
+
40
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse `sys.argv`.

    One subcommand is required; its name is stored in `cmd`. The available
    subcommands are `inspect`, `encode`, `encode-bge-m3`,
    `encode-sentence-transformer`, `vendi`, `geometry`, `knn` and `support`.

    Returns:
        The parsed namespace for the selected subcommand.
    """
    # Choice lists and help text that several subcommands share.
    compute_dtypes = ["float16", "bfloat16", "float32"]
    cache_dtypes = ["float16", "float32"]
    compat_help = "Install small compatibility shims for older HF remote-code embedding models."

    parser = argparse.ArgumentParser(description="Caption embedding cache and Vendi utilities")
    commands = parser.add_subparsers(dest="cmd", required=True)

    # --- inspect -----------------------------------------------------------
    inspect_cmd = commands.add_parser("inspect", help="Inspect tokenizer/model text limits")
    inspect_cmd.add_argument("--model", action="append", required=True, help="HF model id/path; may be repeated")
    inspect_cmd.add_argument("--trust-remote-code", action="store_true")
    inspect_cmd.add_argument("--compat-remote-code", action="store_true", help=compat_help)

    # --- encode ------------------------------------------------------------
    encode_cmd = commands.add_parser("encode", help="Extract normalized text embeddings")
    encode_cmd.add_argument("--input", required=True, help="JSONL input")
    encode_cmd.add_argument("--text-field", default="caption")
    encode_cmd.add_argument("--id-field", default=None)
    encode_cmd.add_argument("--model", required=True)
    encode_cmd.add_argument("--output-dir", required=True)
    encode_cmd.add_argument("--max-records", type=int, default=None)
    encode_cmd.add_argument(
        "--sample-records",
        type=int,
        default=None,
        help="Reservoir-sample this many records before modulo splitting. Mutually exclusive with --max-records.",
    )
    encode_cmd.add_argument("--sample-seed", type=int, default=0)
    encode_cmd.add_argument("--split-count", type=int, default=1, help="Modulo split count for multi-GPU extraction")
    encode_cmd.add_argument("--split-index", type=int, default=0, help="Modulo split index for this worker")
    encode_cmd.add_argument("--batch-size", type=int, default=256)
    encode_cmd.add_argument("--max-length", type=int, default=None)
    encode_cmd.add_argument("--device", default="cuda")
    encode_cmd.add_argument("--dtype", default="float16", choices=compute_dtypes)
    encode_cmd.add_argument("--embedding-dtype", default="float16", choices=cache_dtypes)
    encode_cmd.add_argument("--shard-rows", type=int, default=100_000)
    encode_cmd.add_argument("--pooling", default="auto", choices=["auto", "cls", "mean", "pooler", "last"])
    encode_cmd.add_argument("--padding-side", default=None, choices=["left", "right"], help="Override tokenizer padding side")
    encode_cmd.add_argument("--text-prefix", default="", help="Prefix applied to every text before tokenization")
    encode_cmd.add_argument(
        "--text-template",
        default=None,
        help="Python format template applied before tokenization. Must contain '{text}'. Overrides --text-prefix.",
    )
    encode_cmd.add_argument("--trust-remote-code", action="store_true")
    encode_cmd.add_argument("--compat-remote-code", action="store_true", help=compat_help)
    encode_cmd.add_argument("--compile", action="store_true")

    # --- encode-bge-m3 -----------------------------------------------------
    bge_cmd = commands.add_parser("encode-bge-m3", help="Extract official BGE-M3 dense embeddings via FlagEmbedding")
    bge_cmd.add_argument("--input", required=True, help="JSONL input")
    bge_cmd.add_argument("--text-field", default="caption")
    bge_cmd.add_argument("--id-field", default=None)
    bge_cmd.add_argument("--model", default="BAAI/bge-m3")
    bge_cmd.add_argument("--output-dir", required=True)
    bge_cmd.add_argument("--max-records", type=int, default=None)
    bge_cmd.add_argument("--sample-records", type=int, default=None)
    bge_cmd.add_argument("--sample-seed", type=int, default=0)
    bge_cmd.add_argument("--split-count", type=int, default=1)
    bge_cmd.add_argument("--split-index", type=int, default=0)
    bge_cmd.add_argument("--batch-size", type=int, default=256)
    bge_cmd.add_argument("--max-length", type=int, default=512)
    bge_cmd.add_argument("--device", default="cuda")
    bge_cmd.add_argument("--use-fp16", action=argparse.BooleanOptionalAction, default=True)
    bge_cmd.add_argument("--embedding-dtype", default="float16", choices=cache_dtypes)
    bge_cmd.add_argument("--shard-rows", type=int, default=100_000)
    bge_cmd.add_argument("--text-prefix", default="", help="Prefix applied to every text before encoding")
    bge_cmd.add_argument("--text-template", default=None, help="Python format template containing '{text}'")
    bge_cmd.add_argument("--encode-mode", default="corpus", choices=["corpus", "queries", "encode"])
    bge_cmd.add_argument("--query-instruction", default=None, help="Optional BGEM3 query_instruction_for_retrieval")
    bge_cmd.add_argument("--query-instruction-format", default="{}{}", help="BGEM3 query_instruction_format")

    # --- encode-sentence-transformer ---------------------------------------
    st_cmd = commands.add_parser(
        "encode-sentence-transformer",
        help="Extract embeddings with SentenceTransformer's model-specific encode protocol",
    )
    st_cmd.add_argument("--input", required=True, help="JSONL input")
    st_cmd.add_argument("--text-field", default="caption")
    st_cmd.add_argument("--id-field", default=None)
    st_cmd.add_argument("--model", required=True)
    st_cmd.add_argument("--output-dir", required=True)
    st_cmd.add_argument("--max-records", type=int, default=None)
    st_cmd.add_argument("--sample-records", type=int, default=None)
    st_cmd.add_argument("--sample-seed", type=int, default=0)
    st_cmd.add_argument("--split-count", type=int, default=1)
    st_cmd.add_argument("--split-index", type=int, default=0)
    st_cmd.add_argument("--batch-size", type=int, default=256)
    st_cmd.add_argument("--max-length", type=int, default=None)
    st_cmd.add_argument("--device", default="cuda")
    st_cmd.add_argument("--embedding-dtype", default="float16", choices=cache_dtypes)
    st_cmd.add_argument("--shard-rows", type=int, default=100_000)
    st_cmd.add_argument("--text-prefix", default="", help="Prefix applied to every text before encoding")
    st_cmd.add_argument("--text-template", default=None, help="Python format template containing '{text}'")
    st_cmd.add_argument("--prompt-name", default=None, help="SentenceTransformer prompt_name, e.g. document or query")

    # --- vendi -------------------------------------------------------------
    vendi_cmd = commands.add_parser("vendi", help="Compute sampled block Vendi from embedding cache")
    vendi_cmd.add_argument("--manifest", required=True)
    vendi_cmd.add_argument("--output", required=True)
    vendi_cmd.add_argument("--block-size", type=int, default=4096)
    vendi_cmd.add_argument("--blocks", type=int, default=64)
    vendi_cmd.add_argument(
        "--sampling",
        choices=["random", "partition"],
        default="random",
        help="random samples blocks; partition shuffles once and uses every row in disjoint blocks.",
    )
    vendi_cmd.add_argument("--seed", type=int, default=0)
    vendi_cmd.add_argument("--device", default="cuda")
    vendi_cmd.add_argument("--matrix-device", default=None, help="Override device for eigvalsh; defaults to --device")
    vendi_cmd.add_argument("--dtype", default="float32", choices=compute_dtypes)

    # --- geometry ----------------------------------------------------------
    geometry_cmd = commands.add_parser("geometry", help="Compute embedding-distribution geometry summaries")
    geometry_cmd.add_argument("--manifest", required=True)
    geometry_cmd.add_argument("--output", required=True)
    geometry_cmd.add_argument("--max-rows", type=int, default=100_000)
    geometry_cmd.add_argument("--seed", type=int, default=0)
    geometry_cmd.add_argument("--device", default="cuda")
    geometry_cmd.add_argument("--dtype", default="float32", choices=compute_dtypes)

    # --- knn ---------------------------------------------------------------
    knn_cmd = commands.add_parser("knn", help="Compute exact nearest-neighbor support between two embedding caches")
    knn_cmd.add_argument("--query-manifest", required=True)
    knn_cmd.add_argument("--gallery-manifest", required=True)
    knn_cmd.add_argument("--output", required=True)
    knn_cmd.add_argument("--query-max-rows", type=int, default=None)
    knn_cmd.add_argument("--gallery-max-rows", type=int, default=None)
    knn_cmd.add_argument("--seed", type=int, default=0)
    knn_cmd.add_argument("--device", default="cuda")
    knn_cmd.add_argument("--dtype", default="float16", choices=compute_dtypes)
    knn_cmd.add_argument("--query-batch-size", type=int, default=1024)
    knn_cmd.add_argument(
        "--gallery-chunk-size",
        type=int,
        default=0,
        help="0 keeps the full gallery resident on device; positive values stream gallery chunks.",
    )
    knn_cmd.add_argument("--thresholds", default="0.60,0.70,0.75,0.80,0.85,0.90")
    knn_cmd.add_argument("--save-scores", default=None, help="Optional .npy path for per-query nearest-neighbor cosine scores")

    # --- support -----------------------------------------------------------
    support_cmd = commands.add_parser("support", help="Compute PRDC-style query-in-gallery manifold support")
    support_cmd.add_argument("--query-manifest", required=True, help="Prompt/query embedding manifest P")
    support_cmd.add_argument("--gallery-manifest", required=True, help="Caption/support embedding manifest C")
    support_cmd.add_argument("--output", required=True)
    support_cmd.add_argument("--query-max-rows", type=int, default=None)
    support_cmd.add_argument("--gallery-max-rows", type=int, default=None)
    support_cmd.add_argument("--seed", type=int, default=0)
    support_cmd.add_argument("--k", type=int, default=10)
    support_cmd.add_argument("--device", default="cuda")
    support_cmd.add_argument("--dtype", default="float16", choices=compute_dtypes)
    support_cmd.add_argument("--query-batch-size", type=int, default=512)
    support_cmd.add_argument("--gallery-batch-size", type=int, default=512)
    support_cmd.add_argument("--save-scores", default=None, help="Optional .npz path for per-query support scores")

    return parser.parse_args()
195
+
196
+
197
def torch_dtype(name: str) -> torch.dtype:
    """Map a dtype name ("float16", "bfloat16", "float32") to the torch dtype.

    Raises:
        KeyError: if `name` is not one of the three supported names.
    """
    by_name = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }
    return by_name[name]
199
+
200
+
201
def numpy_dtype(name: str) -> np.dtype:
    """Map a storage dtype name ("float16" or "float32") to the NumPy scalar type.

    Raises:
        KeyError: if `name` is neither "float16" nor "float32".
    """
    by_name = {
        "float16": np.float16,
        "float32": np.float32,
    }
    return by_name[name]
203
+
204
+
205
def load_transformers():
    """Import and return `(AutoConfig, AutoModel, AutoTokenizer)` from transformers.

    The import is done lazily inside the function, so the module itself can
    be imported without transformers installed.

    Raises:
        SystemExit: with an installation hint when transformers cannot be imported.
    """
    try:
        from transformers import AutoConfig, AutoModel, AutoTokenizer
    except ImportError as exc:  # pragma: no cover - depends on uv environment
        raise SystemExit("transformers is required. Run through `uv run` after sourcing .env.") from exc
    else:
        return AutoConfig, AutoModel, AutoTokenizer
211
+
212
+
213
def install_remote_code_compat() -> None:
    """Compatibility shims for embedding-model remote code.

    Jina v2 imports `transformers.onnx.OnnxConfig`, which is absent in the
    current Transformers build used by this project. Jina v3 also expects the
    legacy `all_tied_weights_keys` property on PreTrainedModel. The shims are
    intentionally minimal and only installed when requested.

    Each shim is installed only when the corresponding name is missing. The
    function returns silently when transformers is not importable.
    """
    try:
        import transformers
        from transformers import PreTrainedModel
    except ImportError:
        return

    # Shim 1: a stub `transformers.onnx` module exposing an empty OnnxConfig.
    if "transformers.onnx" not in sys.modules:
        onnx_module = types.ModuleType("transformers.onnx")

        class OnnxConfig:  # pragma: no cover - exercised by remote code import
            pass

        onnx_module.OnnxConfig = OnnxConfig
        sys.modules["transformers.onnx"] = onnx_module
        transformers.onnx = onnx_module

    # Shim 2: a read/write `all_tied_weights_keys` property on PreTrainedModel.
    if not hasattr(PreTrainedModel, "all_tied_weights_keys"):

        def all_tied_weights_keys(self: Any) -> dict[str, None]:
            # A value stored through the setter takes precedence.
            override = getattr(self, "_compat_all_tied_weights_keys", None)
            if override is not None:
                return override
            # Otherwise derive the mapping from `_tied_weights_keys`, if any.
            return dict.fromkeys(getattr(self, "_tied_weights_keys", None) or [])

        def set_all_tied_weights_keys(self: Any, value: Any) -> None:
            if isinstance(value, dict):
                normalized = value
            elif value is None:
                normalized = {}
            else:
                # Any other iterable of key names becomes a {key: None} mapping.
                normalized = {key: None for key in value}
            self._compat_all_tied_weights_keys = normalized

        PreTrainedModel.all_tied_weights_keys = property(  # type: ignore[attr-defined]
            all_tied_weights_keys,
            set_all_tied_weights_keys,
        )

    # Shim 3: head-pruning helpers expected under `transformers.pytorch_utils`.
    # Best effort: any failure here is deliberately ignored.
    try:
        import transformers.pytorch_utils as pytorch_utils

        if not hasattr(pytorch_utils, "find_pruneable_heads_and_indices"):

            def find_pruneable_heads_and_indices(
                heads: list[int] | set[int],
                n_heads: int,
                head_size: int,
                already_pruned_heads: set[int],
            ) -> tuple[set[int], torch.Tensor]:
                heads = set(heads) - already_pruned_heads
                mask = torch.ones(n_heads, head_size)
                for head in heads:
                    # Shift the row by the number of lower-numbered heads already pruned.
                    offset = sum(pruned_head < head for pruned_head in already_pruned_heads)
                    mask[head - offset] = 0
                mask = mask.view(-1).contiguous().eq(1)
                index = torch.arange(len(mask))[mask].long()
                return heads, index

            pytorch_utils.find_pruneable_heads_and_indices = find_pruneable_heads_and_indices

        if not hasattr(pytorch_utils, "prune_linear_layer"):
            from transformers.modeling_utils import prune_linear_layer

            pytorch_utils.prune_linear_layer = prune_linear_layer
    except Exception:
        pass
285
+
286
+
287
+ def iter_jsonl(
288
+ path: Path,
289
+ text_field: str,
290
+ id_field: str | None,
291
+ max_records: int | None,
292
+ split_count: int,
293
+ split_index: int,
294
+ ) -> Iterable[tuple[str, str | None, int]]:
295
+ emitted = 0
296
+ seen = 0
297
+ with path.open("r", encoding="utf-8") as handle:
298
+ for line in handle:
299
+ if max_records is not None and emitted >= max_records:
300
+ break
301
+ line = line.strip()
302
+ if not line:
303
+ seen += 1
304
+ continue
305
+ row_index = seen
306
+ seen += 1
307
+ if row_index % split_count != split_index:
308
+ continue
309
+ row = json.loads(line)
310
+ text = row.get(text_field)
311
+ if not isinstance(text, str):
312
+ text = ""
313
+ row_id = str(row.get(id_field)) if id_field and row.get(id_field) is not None else None
314
+ emitted += 1
315
+ yield text, row_id, row_index
316
+
317
+
318
+ def iter_jsonl_sampled(
319
+ path: Path,
320
+ text_field: str,
321
+ id_field: str | None,
322
+ sample_records: int,
323
+ sample_seed: int,
324
+ split_count: int,
325
+ split_index: int,
326
+ ) -> Iterable[tuple[str, str | None, int]]:
327
+ if sample_records < 1:
328
+ raise SystemExit("--sample-records must be >= 1")
329
+ rng = random.Random(sample_seed)
330
+ reservoir: list[tuple[str, str | None, int]] = []
331
+ seen = 0
332
+ with path.open("r", encoding="utf-8") as handle:
333
+ for line in handle:
334
+ line = line.strip()
335
+ if not line:
336
+ continue
337
+ row_index = seen
338
+ seen += 1
339
+ row = json.loads(line)
340
+ text = row.get(text_field)
341
+ if not isinstance(text, str):
342
+ text = ""
343
+ row_id = str(row.get(id_field)) if id_field and row.get(id_field) is not None else None
344
+ item = (text, row_id, row_index)
345
+ if len(reservoir) < sample_records:
346
+ reservoir.append(item)
347
+ else:
348
+ replace_index = rng.randrange(seen)
349
+ if replace_index < sample_records:
350
+ reservoir[replace_index] = item
351
+ reservoir.sort(key=lambda item: item[2])
352
+ for emitted, item in enumerate(reservoir):
353
+ if emitted % split_count == split_index:
354
+ yield item
355
+
356
+
357
+ def batched(items: Iterable[tuple[str, str | None, int]], batch_size: int) -> Iterable[list[tuple[str, str | None, int]]]:
358
+ batch: list[tuple[str, str | None, int]] = []
359
+ for item in items:
360
+ batch.append(item)
361
+ if len(batch) >= batch_size:
362
+ yield batch
363
+ batch = []
364
+ if batch:
365
+ yield batch
366
+
367
+
368
+ def config_text_limit(config: Any) -> int | None:
369
+ candidates = []
370
+ for obj in [config, getattr(config, "text_config", None)]:
371
+ if obj is None:
372
+ continue
373
+ for name in ["max_position_embeddings", "max_sequence_length", "context_length", "seq_length"]:
374
+ value = getattr(obj, name, None)
375
+ if isinstance(value, int) and value > 0:
376
+ candidates.append(value)
377
+ return min(candidates) if candidates else None
378
+
379
+
380
def inspect_models(args: argparse.Namespace) -> int:
    """Print tokenizer/config length limits for each `--model` as a JSON list.

    For every model id, the tokenizer and config are loaded and a summary of
    their length- and width-related attributes is collected. Attributes a
    config does not define are reported as null.

    Returns:
        0, used as the process exit status.
    """
    if args.compat_remote_code:
        install_remote_code_compat()
    AutoConfig, _AutoModel, AutoTokenizer = load_transformers()

    report = []
    for model_id in args.model:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=args.trust_remote_code)
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=args.trust_remote_code)
        # Multi-modal configs may nest the text tower's settings under `text_config`.
        text_config = getattr(config, "text_config", None)
        summary = {
            "model": model_id,
            "model_type": getattr(config, "model_type", None),
            "tokenizer_model_max_length": getattr(tokenizer, "model_max_length", None),
            "config_text_limit": config_text_limit(config),
            "text_config_max_position_embeddings": getattr(text_config, "max_position_embeddings", None),
            "max_position_embeddings": getattr(config, "max_position_embeddings", None),
            "projection_dim": getattr(config, "projection_dim", None) or getattr(config, "projection_size", None),
            "hidden_size": getattr(config, "hidden_size", None) or getattr(text_config, "hidden_size", None),
        }
        report.append(summary)

    print(json.dumps(report, indent=2, ensure_ascii=False))
    return 0
402
+
403
+
404
def load_encoder(
    model_id: str,
    device: str,
    dtype: str,
    trust_remote_code: bool,
    compile_model: bool,
    compat_remote_code: bool,
    padding_side: str | None,
):
    """Load a tokenizer and an eval-mode `AutoModel` placed on `device`.

    Args:
        model_id: HF model id or local path.
        device: Target device passed to `.to(...)`.
        dtype: Weight dtype name, resolved through `torch_dtype`.
        trust_remote_code: Forwarded to the `from_pretrained` calls.
        compile_model: Wrap the model with `torch.compile` when true.
        compat_remote_code: Install the remote-code shims and load the config
            explicitly, filling in a few attributes if the config lacks them.
        padding_side: Overrides the tokenizer's padding side when not None.

    Returns:
        `(tokenizer, model)`.
    """
    if compat_remote_code:
        install_remote_code_compat()
    AutoConfig, AutoModel, AutoTokenizer = load_transformers()

    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=trust_remote_code)
    if padding_side is not None:
        tokenizer.padding_side = padding_side

    config = None
    if compat_remote_code:
        config = AutoConfig.from_pretrained(model_id, trust_remote_code=trust_remote_code)
        # Attributes set only when the loaded config does not already define them.
        compat_defaults = {
            "is_decoder": False,
            "add_cross_attention": False,
            "chunk_size_feed_forward": 0,
            "use_return_dict": True,
            "output_attentions": False,
            "output_hidden_states": False,
        }
        for attr, fallback in compat_defaults.items():
            if not hasattr(config, attr):
                setattr(config, attr, fallback)

    model = AutoModel.from_pretrained(
        model_id,
        config=config,
        dtype=torch_dtype(dtype),
        trust_remote_code=trust_remote_code,
    )
    model.eval().to(device)
    if compile_model:
        model = torch.compile(model)
    return tokenizer, model
442
+
443
+
444
def pool_outputs(model: Any, outputs: Any, encoded: dict[str, torch.Tensor], pooling: str) -> torch.Tensor:
    """Reduce model outputs to one embedding vector per input sequence.

    Selection order:
      1. `outputs.text_embeds`, when present and not None (regardless of `pooling`).
      2. `outputs.pooler_output`, when present and `pooling` is "auto" or "pooler".
      3. Otherwise pool `outputs.last_hidden_state` (or `outputs[0]`):
         - "last": the hidden state of each sequence's last attended token.
         - "cls": the first position.
         - "auto"/"mean": attention-mask-weighted mean over positions.
         - "mean" without an attention mask: plain mean over all positions.
         - anything else (including "auto" without a mask, or "pooler"
           without a pooler output): the first position.

    Args:
        model: Unused; kept for interface compatibility.
        outputs: Model output object or tuple.
        encoded: Tokenizer output; only "attention_mask" is read.
        pooling: One of "auto", "cls", "mean", "pooler", "last".

    Returns:
        The pooled tensor, not normalized.
    """
    if getattr(outputs, "text_embeds", None) is not None:
        return outputs.text_embeds
    if pooling in {"auto", "pooler"} and getattr(outputs, "pooler_output", None) is not None:
        return outputs.pooler_output

    hidden = outputs.last_hidden_state if hasattr(outputs, "last_hidden_state") else outputs[0]
    attention = encoded.get("attention_mask")

    if pooling == "last":
        if attention is None:
            return hidden[:, -1]
        # If every sequence attends to the final position, the batch is
        # left-padded (or unpadded) and the final position is the last token.
        left_padding = bool((attention[:, -1].sum() == attention.shape[0]).item())
        if left_padding:
            return hidden[:, -1]
        # Right padding: the last real token sits at (number of attended tokens - 1).
        sequence_lengths = attention.sum(dim=1) - 1
        batch_size = hidden.shape[0]
        return hidden[torch.arange(batch_size, device=hidden.device), sequence_lengths]

    if pooling == "cls":
        return hidden[:, 0]

    if pooling in {"auto", "mean"} and attention is not None:
        weights = attention.to(hidden.dtype).unsqueeze(-1)
        # clamp_min guards against division by zero for an all-masked row.
        return (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp_min(1.0)

    if pooling == "mean":
        # No mask available: honor the explicit request with an unweighted
        # mean instead of silently returning the first-token vector.
        return hidden.mean(dim=1)

    return hidden[:, 0]
467
+
468
+
469
+ @torch.inference_mode()
470
+ def encode_batch(
471
+ tokenizer: Any,
472
+ model: Any,
473
+ texts: list[str],
474
+ device: str,
475
+ max_length: int | None,
476
+ pooling: str,
477
+ ) -> torch.Tensor:
478
+ encoded = tokenizer(
479
+ texts,
480
+ padding=True,
481
+ truncation=True,
482
+ max_length=max_length,
483
+ return_tensors="pt",
484
+ )
485
+ encoded = {key: value.to(device) for key, value in encoded.items()}
486
+ if hasattr(model, "get_text_features"):
487
+ features = model.get_text_features(**encoded)
488
+ if not isinstance(features, torch.Tensor):
489
+ features = pool_outputs(model, features, encoded, pooling)
490
+ else:
491
+ outputs = model(**encoded)
492
+ features = pool_outputs(model, outputs, encoded, pooling)
493
+ features = torch.nn.functional.normalize(features.float(), dim=-1)
494
+ return features.cpu()
495
+
496
+
497
def flush_shard(
    output_dir: Path,
    shard_index: int,
    start_row: int,
    rows: list[np.ndarray],
    embedding_dtype: str,
) -> EmbeddingShard:
    """Write the buffered embedding rows to `embeddings-<index>.npy`.

    Args:
        output_dir: Directory that receives the shard file.
        shard_index: Zero-padded to five digits in the file name.
        start_row: Offset of the first row, recorded in the returned metadata.
        rows: Embedding vectors to stack and save.
        embedding_dtype: Storage dtype name understood by `numpy_dtype`.

    Returns:
        An `EmbeddingShard` describing the saved file.
    """
    shard = np.asarray(rows, dtype=numpy_dtype(embedding_dtype))
    shard_path = output_dir / f"embeddings-{shard_index:05d}.npy"
    np.save(shard_path, shard)

    row_count = int(shard.shape[0])
    # A non-2-D array has no well-defined width; record 0 in that case.
    width = int(shard.shape[1]) if shard.ndim == 2 else 0
    return EmbeddingShard(
        path=str(shard_path),
        rows=row_count,
        dim=width,
        dtype=embedding_dtype,
        start_row=start_row,
        end_row=start_row + row_count,
    )
515
+
516
+
517
def encode_main(args: argparse.Namespace) -> int:
    """Run the `encode` subcommand: embed JSONL captions and cache them as shards.

    Steps:
      1. Validate the split/sampling/template options.
      2. Load the tokenizer and model via `load_encoder`.
      3. Stream rows (reservoir-sampled when `--sample-records` is given),
         encode them batch by batch and flush `.npy` shards of about
         `--shard-rows` rows.
      4. Write `row_ids.jsonl` (when at least one row was encoded) and
         `manifest.json` into the output directory, then print a short JSON
         summary.

    Returns:
        0, used as the process exit status.

    Raises:
        SystemExit: on invalid option combinations.
    """
    # Validate the cheap CLI constraints first so a bad flag fails before the
    # output directory is created and the model is loaded.
    if args.split_count < 1:
        raise SystemExit("--split-count must be >= 1")
    if not (0 <= args.split_index < args.split_count):
        raise SystemExit("--split-index must satisfy 0 <= split_index < split_count")
    if args.sample_records is not None and args.max_records is not None:
        raise SystemExit("--sample-records and --max-records are mutually exclusive")
    if args.text_template is not None and "{text}" not in args.text_template:
        raise SystemExit("--text-template must contain '{text}'")

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    tokenizer, model = load_encoder(
        args.model,
        args.device,
        args.dtype,
        args.trust_remote_code,
        args.compile,
        args.compat_remote_code,
        args.padding_side,
    )

    # Precedence: explicit --max-length, then the config limit, then the tokenizer's.
    config_limit = config_text_limit(getattr(model, "config", None))
    max_length = args.max_length or config_limit or getattr(tokenizer, "model_max_length", None)
    # Implausibly large values are dropped (presumably a tokenizer "no limit" sentinel).
    if isinstance(max_length, int) and max_length > 1_000_000:
        max_length = None

    rows: list[np.ndarray] = []
    row_ids: list[str | None] = []
    row_indices: list[int] = []
    shards: list[EmbeddingShard] = []
    total = 0
    shard_start = 0
    started = time.time()

    if args.sample_records is not None:
        source = iter_jsonl_sampled(
            Path(args.input),
            args.text_field,
            args.id_field,
            args.sample_records,
            args.sample_seed,
            args.split_count,
            args.split_index,
        )
    else:
        source = iter_jsonl(
            Path(args.input),
            args.text_field,
            args.id_field,
            args.max_records,
            args.split_count,
            args.split_index,
        )

    for batch in batched(source, args.batch_size):
        texts = [text for text, _row_id, _row_index in batch]
        # The template takes precedence over the plain prefix.
        if args.text_template is not None:
            texts = [args.text_template.format(text=text) for text in texts]
        elif args.text_prefix:
            texts = [f"{args.text_prefix}{text}" for text in texts]
        ids = [row_id for _text, row_id, _row_index in batch]
        indices = [row_index for _text, _row_id, row_index in batch]
        features = encode_batch(tokenizer, model, texts, args.device, max_length, args.pooling)
        rows.extend(features.numpy())
        row_ids.extend(ids)
        row_indices.extend(indices)
        total += len(batch)
        # Shards are flushed at batch boundaries, so one may exceed --shard-rows slightly.
        if len(rows) >= args.shard_rows:
            shards.append(flush_shard(output_dir, len(shards), shard_start, rows, args.embedding_dtype))
            shard_start += len(rows)
            rows = []
    if rows:
        shards.append(flush_shard(output_dir, len(shards), shard_start, rows, args.embedding_dtype))

    if row_indices:
        with (output_dir / "row_ids.jsonl").open("w", encoding="utf-8") as handle:
            for index, (row_id, row_index) in enumerate(zip(row_ids, row_indices, strict=True)):
                handle.write(
                    json.dumps(
                        {"split_row": index, "source_row": row_index, "id": row_id},
                        ensure_ascii=False,
                    )
                    + "\n"
                )

    # Read the clock once so `seconds` and `rows_per_second` agree with each other.
    elapsed = time.time() - started
    manifest = {
        "input": args.input,
        "text_field": args.text_field,
        "id_field": args.id_field,
        "model": args.model,
        "max_length": max_length,
        "max_records": args.max_records,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "split_count": args.split_count,
        "split_index": args.split_index,
        "pooling": args.pooling,
        "padding_side": getattr(tokenizer, "padding_side", None),
        "text_prefix": args.text_prefix,
        "text_template": args.text_template,
        "compat_remote_code": args.compat_remote_code,
        "device": args.device,
        "dtype": args.dtype,
        "embedding_dtype": args.embedding_dtype,
        "rows": total,
        "seconds": round(elapsed, 3),
        "rows_per_second": round(total / max(elapsed, 1e-6), 3),
        "shards": [asdict(shard) for shard in shards],
    }
    (output_dir / "manifest.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output_dir": str(output_dir), "rows": total, "shards": len(shards), "max_length": max_length}, indent=2))
    return 0
626
+
627
+
628
def encode_bge_m3_main(args: argparse.Namespace) -> int:
    """Run the `encode-bge-m3` subcommand using FlagEmbedding's `BGEM3FlagModel`.

    Dense vectors are requested from the model's `encode_corpus`,
    `encode_queries` or `encode` method (selected by `--encode-mode`),
    re-normalized to unit length, and cached as `.npy` shards. A
    `row_ids.jsonl` (when at least one row was encoded) and a `manifest.json`
    are written into the output directory.

    Returns:
        0, used as the process exit status.

    Raises:
        SystemExit: when FlagEmbedding is not installed or the options are invalid.
    """
    try:
        from FlagEmbedding import BGEM3FlagModel
    except ImportError as exc:
        raise SystemExit("FlagEmbedding is required for encode-bge-m3. Install with `uv sync --extra eval`.") from exc

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if args.split_count < 1:
        raise SystemExit("--split-count must be >= 1")
    if not (0 <= args.split_index < args.split_count):
        raise SystemExit("--split-index must satisfy 0 <= split_index < split_count")
    if args.sample_records is not None and args.max_records is not None:
        raise SystemExit("--sample-records and --max-records are mutually exclusive")
    if args.text_template is not None and "{text}" not in args.text_template:
        raise SystemExit("--text-template must contain '{text}'")

    model = BGEM3FlagModel(
        args.model,
        normalize_embeddings=True,
        use_fp16=args.use_fp16,
        devices=args.device,
        pooling_method="cls",
        batch_size=args.batch_size,
        query_max_length=args.max_length,
        passage_max_length=args.max_length,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False,
        query_instruction_for_retrieval=args.query_instruction,
        query_instruction_format=args.query_instruction_format,
    )

    input_path = Path(args.input)
    if args.sample_records is not None:
        source = iter_jsonl_sampled(
            input_path,
            args.text_field,
            args.id_field,
            args.sample_records,
            args.sample_seed,
            args.split_count,
            args.split_index,
        )
    else:
        source = iter_jsonl(
            input_path,
            args.text_field,
            args.id_field,
            args.max_records,
            args.split_count,
            args.split_index,
        )

    pending: list[np.ndarray] = []
    row_ids: list[str | None] = []
    row_indices: list[int] = []
    shards: list[EmbeddingShard] = []
    total = 0
    shard_start = 0
    started = time.time()

    for batch in batched(source, args.batch_size):
        texts = [entry[0] for entry in batch]
        # The template takes precedence over the plain prefix.
        if args.text_template is not None:
            texts = [args.text_template.format(text=text) for text in texts]
        elif args.text_prefix:
            texts = [f"{args.text_prefix}{text}" for text in texts]

        encoders = {
            "corpus": model.encode_corpus,
            "queries": model.encode_queries,
            "encode": model.encode,
        }
        encode_fn = encoders[args.encode_mode]
        encoded = encode_fn(
            texts,
            batch_size=args.batch_size,
            max_length=args.max_length,
            return_dense=True,
            return_sparse=False,
            return_colbert_vecs=False,
        )
        features = np.asarray(encoded["dense_vecs"], dtype=np.float32)
        # Re-normalize in float32; the floor avoids dividing by a zero norm.
        features /= np.maximum(np.linalg.norm(features, axis=1, keepdims=True), 1e-12)

        pending.extend(features)
        row_ids.extend(entry[1] for entry in batch)
        row_indices.extend(entry[2] for entry in batch)
        total += len(batch)

        # Shards are flushed at batch boundaries, so one may exceed --shard-rows slightly.
        if len(pending) >= args.shard_rows:
            shards.append(flush_shard(output_dir, len(shards), shard_start, pending, args.embedding_dtype))
            shard_start += len(pending)
            pending = []
    if pending:
        shards.append(flush_shard(output_dir, len(shards), shard_start, pending, args.embedding_dtype))

    if row_indices:
        with (output_dir / "row_ids.jsonl").open("w", encoding="utf-8") as handle:
            for split_row, (row_id, source_row) in enumerate(zip(row_ids, row_indices, strict=True)):
                record = {"split_row": split_row, "source_row": source_row, "id": row_id}
                handle.write(json.dumps(record, ensure_ascii=False) + "\n")

    elapsed = time.time() - started
    manifest = {
        "input": args.input,
        "text_field": args.text_field,
        "id_field": args.id_field,
        "model": args.model,
        "backend": "FlagEmbedding.BGEM3FlagModel",
        "max_length": args.max_length,
        "max_records": args.max_records,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "split_count": args.split_count,
        "split_index": args.split_index,
        "pooling": "cls",
        "encode_mode": args.encode_mode,
        "normalize_embeddings": True,
        "text_prefix": args.text_prefix,
        "text_template": args.text_template,
        "query_instruction": args.query_instruction,
        "query_instruction_format": args.query_instruction_format,
        "device": args.device,
        "use_fp16": args.use_fp16,
        "embedding_dtype": args.embedding_dtype,
        "rows": total,
        "seconds": round(elapsed, 3),
        "rows_per_second": round(total / max(elapsed, 1e-6), 3),
        "shards": [asdict(shard) for shard in shards],
    }
    (output_dir / "manifest.json").write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")

    summary = {"output_dir": str(output_dir), "rows": total, "shards": len(shards), "max_length": args.max_length}
    print(json.dumps(summary, indent=2))
    return 0
763
+
764
+
765
def encode_sentence_transformer_main(args: argparse.Namespace) -> int:
    """Encode JSONL text rows with a SentenceTransformer model into shard files.

    Reads rows from ``args.input`` (optionally sampled or split), encodes them
    batch by batch, L2-normalizes the vectors, writes shards via
    ``flush_shard``, and finally writes ``row_ids.jsonl`` (only when at least
    one row was encoded) and ``manifest.json`` into ``args.output_dir``.

    Raises:
        SystemExit: if sentence-transformers is missing or the split, sample,
            or template arguments are inconsistent.

    Returns:
        0 on success.
    """
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise SystemExit("sentence-transformers is required. Run `uv sync --extra eval`.") from exc

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Argument sanity checks.
    if args.split_count < 1:
        raise SystemExit("--split-count must be >= 1")
    if args.split_index < 0 or args.split_index >= args.split_count:
        raise SystemExit("--split-index must satisfy 0 <= split_index < split_count")
    if args.sample_records is not None and args.max_records is not None:
        raise SystemExit("--sample-records and --max-records are mutually exclusive")
    if args.text_template is not None and "{text}" not in args.text_template:
        raise SystemExit("--text-template must contain '{text}'")

    encoder = SentenceTransformer(args.model, device=args.device)
    if args.max_length is not None:
        encoder.max_seq_length = args.max_length
    # Report the model's effective limit when it exposes one, else the CLI value.
    model_limit = getattr(encoder, "max_seq_length", None)
    max_length = args.max_length if model_limit is None else int(model_limit)

    input_path = Path(args.input)
    if args.sample_records is None:
        source = iter_jsonl(
            input_path,
            args.text_field,
            args.id_field,
            args.max_records,
            args.split_count,
            args.split_index,
        )
    else:
        source = iter_jsonl_sampled(
            input_path,
            args.text_field,
            args.id_field,
            args.sample_records,
            args.sample_seed,
            args.split_count,
            args.split_index,
        )

    # The encode options are the same for every batch.
    encode_kwargs = {
        "batch_size": args.batch_size,
        "normalize_embeddings": True,
        "convert_to_numpy": True,
        "show_progress_bar": False,
    }
    if args.prompt_name is not None:
        encode_kwargs["prompt_name"] = args.prompt_name

    rows: list[np.ndarray] = []
    row_ids: list[str | None] = []
    row_indices: list[int] = []
    shards: list[EmbeddingShard] = []
    total = 0
    shard_start = 0
    started = time.time()
    for batch in batched(source, args.batch_size):
        texts = []
        ids = []
        indices = []
        for text, row_id, row_index in batch:
            texts.append(text)
            ids.append(row_id)
            indices.append(row_index)
        # A template takes precedence over a plain prefix.
        if args.text_template is not None:
            texts = [args.text_template.format(text=item) for item in texts]
        elif args.text_prefix:
            texts = [f"{args.text_prefix}{item}" for item in texts]

        features = np.asarray(model_output(encoder, texts, encode_kwargs), dtype=np.float32) if False else np.asarray(
            encoder.encode(texts, **encode_kwargs), dtype=np.float32
        )
        # Re-normalize defensively; the floor guards all-zero vectors.
        norms = np.linalg.norm(features, axis=1, keepdims=True)
        features /= np.maximum(norms, 1e-12)

        rows.extend(features)
        row_ids.extend(ids)
        row_indices.extend(indices)
        total += len(batch)
        if len(rows) >= args.shard_rows:
            shards.append(flush_shard(output_dir, len(shards), shard_start, rows, args.embedding_dtype))
            shard_start += len(rows)
            rows = []
    if rows:
        shards.append(flush_shard(output_dir, len(shards), shard_start, rows, args.embedding_dtype))

    if row_indices:
        with (output_dir / "row_ids.jsonl").open("w", encoding="utf-8") as handle:
            for split_row, (row_id, source_row) in enumerate(zip(row_ids, row_indices, strict=True)):
                record = {"split_row": split_row, "source_row": source_row, "id": row_id}
                handle.write(json.dumps(record, ensure_ascii=False) + "\n")

    elapsed = time.time() - started
    manifest = {
        "input": args.input,
        "text_field": args.text_field,
        "id_field": args.id_field,
        "model": args.model,
        "backend": "sentence_transformers.SentenceTransformer",
        "max_length": max_length,
        "max_records": args.max_records,
        "sample_records": args.sample_records,
        "sample_seed": args.sample_seed,
        "split_count": args.split_count,
        "split_index": args.split_index,
        "pooling": "model_default",
        "normalize_embeddings": True,
        "text_prefix": args.text_prefix,
        "text_template": args.text_template,
        "prompt_name": args.prompt_name,
        "available_prompts": getattr(encoder, "prompts", None),
        "device": args.device,
        "embedding_dtype": args.embedding_dtype,
        "rows": total,
        "seconds": round(elapsed, 3),
        "rows_per_second": round(total / max(elapsed, 1e-6), 3),
        "shards": [asdict(shard) for shard in shards],
    }
    manifest_text = json.dumps(manifest, indent=2, ensure_ascii=False)
    (output_dir / "manifest.json").write_text(manifest_text, encoding="utf-8")
    summary = {"output_dir": str(output_dir), "rows": total, "shards": len(shards), "max_length": max_length}
    print(json.dumps(summary, indent=2))
    return 0
883
+
884
+
885
def load_embedding_manifest(path: Path) -> tuple[dict[str, Any], np.ndarray]:
    """Load an embedding manifest and concatenate the shard arrays it lists.

    Args:
        path: Location of the manifest JSON file; it must contain a ``shards``
            list whose entries have a ``path`` key pointing at a ``.npy`` file.

    Returns:
        ``(manifest, embeddings)``. ``embeddings`` is the row-wise
        concatenation of all shards, or a ``(0, 0)`` float32 array when the
        manifest lists no shards.

    Raises:
        FileNotFoundError: if a shard cannot be found at its recorded path nor
            next to the manifest.
    """
    manifest = json.loads(path.read_text(encoding="utf-8"))
    manifest_dir = path.parent
    arrays = []
    for shard in manifest["shards"]:
        shard_path = Path(shard["path"])
        if not shard_path.exists():
            # The recorded path is used as-is when it resolves. Otherwise fall
            # back to the manifest directory so a cache that was moved, or
            # written with a relative path from another working directory,
            # still loads.
            for candidate in (manifest_dir / shard_path, manifest_dir / shard_path.name):
                if candidate.exists():
                    shard_path = candidate
                    break
        arrays.append(np.load(shard_path, mmap_mode="r"))
    if not arrays:
        return manifest, np.zeros((0, 0), dtype=np.float32)
    return manifest, np.concatenate(arrays, axis=0)
891
+
892
+
893
def sample_embeddings(embeddings: np.ndarray, max_rows: int | None, seed: int) -> tuple[np.ndarray, list[int]]:
    """Return a float32 copy of up to ``max_rows`` rows plus the chosen indices.

    All rows are kept, in order, when ``max_rows`` is ``None`` or not smaller
    than the row count. Otherwise rows are drawn without replacement from a
    ``random.Random(seed)`` generator and returned in ascending index order.
    """
    total_rows = int(embeddings.shape[0])
    keep_everything = max_rows is None or max_rows >= total_rows
    if keep_everything:
        chosen = list(range(total_rows))
    else:
        chosen = random.Random(seed).sample(range(total_rows), max_rows)
        chosen.sort()
    subset = np.asarray(embeddings[chosen], dtype=np.float32)
    return subset, chosen
901
+
902
+
903
def vendi_from_block(block: torch.Tensor) -> dict[str, float]:
    """Compute the Vendi score of one block of embeddings.

    Rows are L2-normalized, the cosine Gram matrix is formed, and the
    exponential of the Shannon entropy of its trace-normalized eigenvalues is
    returned. The same value is reported under both ``vendi`` and
    ``effective_rank``.
    """
    unit_rows = torch.nn.functional.normalize(block.float(), dim=-1)
    gram = unit_rows @ unit_rows.T
    # Clamp tiny negative eigenvalues produced by round-off.
    spectrum = torch.linalg.eigvalsh(gram).clamp_min(0)
    trace = spectrum.sum().clamp_min(1e-12)
    weights = spectrum / trace
    log_weights = torch.log(weights.clamp_min(1e-12))
    shannon = -(weights * log_weights).sum()
    score = float(torch.exp(shannon).item())
    return {
        "vendi": score,
        "effective_rank": score,
        "trace": float(trace.item()),
        "max_eigen_prob": float(weights.max().item()),
    }
917
+
918
+
919
def mean_ci(values: list[float]) -> dict[str, float]:
    """Return the mean and a normal-approximation 95% interval for ``values``.

    An empty list yields all zeros; a single value yields a degenerate
    interval at that value. Otherwise the half-width is 1.96 standard errors,
    using the sample (n - 1) variance.
    """
    count = len(values)
    if count == 0:
        return {"mean": 0.0, "ci95_low": 0.0, "ci95_high": 0.0}
    mean = sum(values) / count
    low = high = mean
    if count > 1:
        variance = sum((value - mean) ** 2 for value in values) / (count - 1)
        half = 1.96 * math.sqrt(variance / count)
        low = mean - half
        high = mean + half
    return {"mean": mean, "ci95_low": low, "ci95_high": high}
928
+
929
+
930
def parse_thresholds(text: str) -> list[float]:
    """Parse a comma-separated list of cosine thresholds.

    Blank entries (for example from a trailing comma) are ignored.

    Args:
        text: Comma-separated numbers, each expected to lie in ``[-1, 1]``.

    Returns:
        The parsed thresholds, in input order.

    Raises:
        SystemExit: if an entry is not a number, lies outside ``[-1, 1]``
            (NaN included), or no value is present at all.
    """
    values = []
    for part in text.split(","):
        part = part.strip()
        if not part:
            continue
        try:
            value = float(part)
        except ValueError:
            # Report bad CLI input the same way as the other validation
            # failures instead of surfacing a raw traceback.
            raise SystemExit(f"invalid cosine threshold (not a number): {part!r}") from None
        if not -1.0 <= value <= 1.0:
            raise SystemExit(f"invalid cosine threshold outside [-1, 1]: {value}")
        values.append(value)
    if not values:
        raise SystemExit("--thresholds must contain at least one value")
    return values
943
+
944
+
945
def summarize_scores(scores: np.ndarray, thresholds: list[float]) -> dict[str, Any]:
    """Summarize nearest-neighbor cosine scores.

    Reports the mean, the sample standard deviation (0.0 for a single score),
    a fixed set of percentiles keyed ``pNN``, and for each threshold the
    fraction of scores at or above it, keyed ``support_at_<threshold:.2f>``.
    """
    summary: dict[str, Any] = {
        "mean_nn_cosine": float(np.mean(scores)),
        "std_nn_cosine": float(np.std(scores, ddof=1)) if scores.size > 1 else 0.0,
    }
    for percentile in (1, 5, 10, 25, 50, 75, 90, 95, 99):
        summary[f"p{percentile:02d}"] = float(np.percentile(scores, percentile))
    for threshold in thresholds:
        # Thresholds that round to the same two decimals share one key.
        summary[f"support_at_{threshold:.2f}"] = float(np.mean(scores >= threshold))
    return summary
960
+
961
+
962
def summarize_support(covered: np.ndarray, density: np.ndarray, nn_cosine: np.ndarray) -> dict[str, Any]:
    """Aggregate per-query support results into scalar statistics.

    Returns means and selected percentiles of the coverage flags, the density
    values, the nearest-neighbor cosines, and the derived cosine distances
    (``1 - nn_cosine``).
    """

    def pct(values: np.ndarray, q: int) -> float:
        return float(np.percentile(values, q))

    nn_distance = 1.0 - nn_cosine
    summary: dict[str, Any] = {}
    summary["coverage"] = float(np.mean(covered))
    summary["density"] = float(np.mean(density))
    summary["density_p50"] = pct(density, 50)
    summary["density_p95"] = pct(density, 95)
    summary["nn_cosine_mean"] = float(np.mean(nn_cosine))
    summary["nn_cosine_p50"] = pct(nn_cosine, 50)
    summary["nn_cosine_p05"] = pct(nn_cosine, 5)
    summary["nn_distance_p95"] = pct(nn_distance, 95)
    summary["nn_distance_p99"] = pct(nn_distance, 99)
    return summary
975
+
976
+
977
@torch.inference_mode()
def exact_nn_cosine(
    query: np.ndarray,
    gallery: np.ndarray,
    device: str,
    dtype: torch.dtype,
    query_batch_size: int,
    gallery_chunk_size: int,
) -> np.ndarray:
    """Return, for every query row, its highest cosine similarity to any gallery row.

    Both inputs are L2-normalized in float32 and then cast to ``dtype`` for
    the matrix product. With ``gallery_chunk_size == 0`` the whole gallery is
    moved to ``device`` once; otherwise it is streamed in chunks of that many
    rows for each query batch and a running maximum is kept.

    Raises:
        SystemExit: on non-2D input, mismatched dimensions, empty input, or
            invalid batch/chunk sizes.
    """
    if query.ndim != 2 or gallery.ndim != 2:
        raise SystemExit("query and gallery embeddings must be 2D arrays")
    if query.shape[1] != gallery.shape[1]:
        raise SystemExit(f"dimension mismatch: query dim {query.shape[1]} vs gallery dim {gallery.shape[1]}")
    if query.shape[0] == 0 or gallery.shape[0] == 0:
        raise SystemExit("query and gallery embeddings must be non-empty")
    if query_batch_size < 1:
        raise SystemExit("--query-batch-size must be >= 1")
    if gallery_chunk_size < 0:
        raise SystemExit("--gallery-chunk-size must be >= 0")

    def to_unit(array: np.ndarray) -> torch.Tensor:
        # Move to the device, normalize in float32, then return in the compute dtype.
        tensor = torch.from_numpy(array).to(device=device, dtype=dtype)
        return torch.nn.functional.normalize(tensor.float(), dim=-1).to(dtype)

    n_query = query.shape[0]
    n_gallery = gallery.shape[0]
    per_batch: list[np.ndarray] = []

    if gallery_chunk_size == 0:
        # Resident-gallery path: one transposed copy is reused by every batch.
        resident_t = to_unit(gallery).T.contiguous()
        for q_start in range(0, n_query, query_batch_size):
            q_unit = to_unit(query[q_start : q_start + query_batch_size])
            sims = q_unit @ resident_t
            per_batch.append(sims.float().max(dim=1).values.cpu().numpy())
        return np.concatenate(per_batch, axis=0)

    # Streaming path: -2.0 is below any cosine, so the first chunk always wins.
    for q_start in range(0, n_query, query_batch_size):
        q_unit = to_unit(query[q_start : q_start + query_batch_size])
        running = torch.full((q_unit.shape[0],), -2.0, device=device, dtype=torch.float32)
        for g_start in range(0, n_gallery, gallery_chunk_size):
            g_unit = to_unit(gallery[g_start : g_start + gallery_chunk_size])
            sims = q_unit @ g_unit.T
            running = torch.maximum(running, sims.float().max(dim=1).values)
        per_batch.append(running.cpu().numpy())
    return np.concatenate(per_batch, axis=0)
1020
+
1021
+
1022
@torch.inference_mode()
def kth_self_neighbor_cosine(
    gallery: np.ndarray,
    k: int,
    device: str,
    dtype: torch.dtype,
    batch_size: int,
) -> np.ndarray:
    """Return each gallery row's cosine similarity to its k-th nearest other row.

    The gallery is L2-normalized and compared against itself in row batches;
    each row's self-similarity is masked out before the top-k lookup.

    Raises:
        SystemExit: if ``k < 1``, the gallery has no more than ``k`` rows, or
            ``batch_size < 1``.
    """
    if k < 1:
        raise SystemExit("--k must be >= 1")
    if gallery.shape[0] <= k:
        raise SystemExit(f"gallery rows ({gallery.shape[0]}) must be > k ({k})")
    if batch_size < 1:
        raise SystemExit("--gallery-batch-size must be >= 1")

    n_rows = gallery.shape[0]
    unit = torch.from_numpy(gallery).to(device=device, dtype=dtype)
    unit = torch.nn.functional.normalize(unit.float(), dim=-1).to(dtype)
    unit_t = unit.T.contiguous()

    kth_values: list[np.ndarray] = []
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        sims = unit[start:stop] @ unit_t
        # Overwrite the self-match (global column = local row + start) with a
        # value below any cosine so it can never be selected.
        local_rows = torch.arange(stop - start, device=device)
        sims[local_rows, local_rows + start] = -2.0
        top = torch.topk(sims.float(), k=k, dim=1).values
        kth_values.append(top[:, k - 1].cpu().numpy())
    return np.concatenate(kth_values, axis=0)
1048
+
1049
+
1050
@torch.inference_mode()
def prdc_query_in_gallery_support(
    query: np.ndarray,
    gallery: np.ndarray,
    gallery_thresholds: np.ndarray,
    k: int,
    device: str,
    dtype: torch.dtype,
    query_batch_size: int,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Score each query row against per-gallery-row cosine thresholds.

    A query "hits" gallery row ``j`` when their cosine similarity is at least
    ``gallery_thresholds[j]``.

    Args:
        query: 2D query embeddings.
        gallery: 2D gallery embeddings with the same dimension as ``query``.
        gallery_thresholds: One cosine threshold per gallery row.
        k: Neighbor count used to scale hit counts into a density.
        device: Torch device for the computation.
        dtype: Torch dtype for the similarity matrix product.
        query_batch_size: Number of query rows processed per step.

    Returns:
        ``(covered, density, nn_cosine)``, one entry per query row: whether
        any gallery row was hit, the hit count divided by ``k``, and the
        maximum cosine similarity to the gallery.

    Raises:
        SystemExit: on invalid sizes, non-2D or empty inputs, mismatched
            embedding dimensions, or a threshold array that does not have
            exactly one entry per gallery row.
    """
    if query_batch_size < 1:
        raise SystemExit("--query-batch-size must be >= 1")
    if k < 1:
        # k is a divisor below; zero would silently yield inf/nan densities.
        raise SystemExit("--k must be >= 1")
    if query.ndim != 2 or gallery.ndim != 2:
        raise SystemExit("query and gallery embeddings must be 2D arrays")
    if query.shape[1] != gallery.shape[1]:
        raise SystemExit(f"dimension mismatch: query dim {query.shape[1]} vs gallery dim {gallery.shape[1]}")
    if query.shape[0] == 0 or gallery.shape[0] == 0:
        raise SystemExit("query and gallery embeddings must be non-empty")
    if gallery_thresholds.shape != (gallery.shape[0],):
        # A length-1 array would otherwise broadcast and give wrong results
        # without any error.
        raise SystemExit(
            f"gallery thresholds shape {tuple(gallery_thresholds.shape)} does not match gallery rows ({gallery.shape[0]})"
        )

    gallery_tensor = torch.from_numpy(gallery).to(device=device, dtype=dtype)
    gallery_tensor = torch.nn.functional.normalize(gallery_tensor.float(), dim=-1).to(dtype)
    gallery_t = gallery_tensor.T.contiguous()
    thresholds = torch.from_numpy(gallery_thresholds.astype(np.float32)).to(device=device)
    covered_rows: list[np.ndarray] = []
    density_rows: list[np.ndarray] = []
    nn_rows: list[np.ndarray] = []
    for start in range(0, query.shape[0], query_batch_size):
        query_tensor = torch.from_numpy(query[start : start + query_batch_size]).to(device=device, dtype=dtype)
        query_tensor = torch.nn.functional.normalize(query_tensor.float(), dim=-1).to(dtype)
        sims = (query_tensor @ gallery_t).float()
        # Broadcast the per-gallery-row thresholds across the query batch.
        support_hits = sims >= thresholds.unsqueeze(0)
        hit_counts = support_hits.sum(dim=1).float()
        covered_rows.append((hit_counts > 0).cpu().numpy())
        density_rows.append((hit_counts / float(k)).cpu().numpy())
        nn_rows.append(sims.max(dim=1).values.cpu().numpy())
    return (
        np.concatenate(covered_rows, axis=0),
        np.concatenate(density_rows, axis=0),
        np.concatenate(nn_rows, axis=0),
    )
1083
+
1084
+
1085
def vendi_main(args: argparse.Namespace) -> int:
    """Compute block-wise Vendi scores for a cached embedding set and write a JSON report.

    In ``partition`` sampling mode the rows are shuffled and cut into
    consecutive blocks of ``block_size``; otherwise ``args.blocks`` blocks are
    drawn independently without replacement. Per-block statistics and their
    mean/CI summary are written to ``args.output``.

    Raises:
        SystemExit: if the embedding cache is empty or the block size is not
            positive.

    Returns:
        0 on success.
    """
    manifest, embeddings = load_embedding_manifest(Path(args.manifest))
    n = int(embeddings.shape[0])
    if n == 0:
        raise SystemExit("empty embedding cache")
    if args.block_size < 1:
        # A zero step would crash range() below and a negative one would
        # produce no blocks at all.
        raise SystemExit("block size must be >= 1")
    block_size = min(args.block_size, n)
    rng = random.Random(args.seed)
    matrix_device = args.matrix_device or args.device
    dtype = torch_dtype(args.dtype)
    block_rows = []
    if args.sampling == "partition":
        order = list(range(n))
        rng.shuffle(order)
        index_blocks = [order[start : start + block_size] for start in range(0, n, block_size)]
        # Only merge when there is a previous block to merge into; with a
        # single block (e.g. n == 1) index_blocks[-2] does not exist.
        if len(index_blocks) > 1 and len(index_blocks[-1]) < max(2, block_size // 2):
            # Avoid a tiny tail block with a non-comparable Vendi scale.
            index_blocks[-2].extend(index_blocks[-1])
            index_blocks.pop()
    else:
        index_blocks = [
            rng.sample(range(n), block_size) if block_size < n else list(range(n))
            for _ in range(args.blocks)
        ]
    for block_index, indices in enumerate(index_blocks):
        array = np.asarray(embeddings[indices], dtype=np.float32)
        block = torch.from_numpy(array).to(matrix_device, dtype=dtype)
        stats = vendi_from_block(block)
        stats.update({"block_index": block_index, "block_size": len(indices)})
        block_rows.append(stats)
    vendi_values = [row["vendi"] for row in block_rows]
    payload = {
        "embedding_manifest": args.manifest,
        "source_model": manifest.get("model"),
        "source_rows": n,
        "block_size": block_size,
        "blocks": len(block_rows),
        "requested_blocks": args.blocks,
        "sampling": args.sampling,
        "seed": args.seed,
        "device": matrix_device,
        "summary": {
            "vendi": mean_ci(vendi_values),
            "max_eigen_prob": mean_ci([row["max_eigen_prob"] for row in block_rows]),
        },
        "block_rows": block_rows,
        "boundary": "Vendi is an embedding-space semantic diversity metric; it does not measure faithfulness, density, or downstream utility.",
    }
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    # Report the number of blocks actually evaluated; in partition mode it is
    # derived from the data and can differ from the requested count.
    print(json.dumps({"output": str(output), "vendi_mean": payload["summary"]["vendi"]["mean"], "blocks": len(block_rows)}, indent=2))
    return 0
1137
+
1138
+
1139
def geometry_main(args: argparse.Namespace) -> int:
    """Compute embedding-geometry statistics for a cached embedding set.

    Samples up to ``args.max_rows`` rows, L2-normalizes them, and reports
    centroid-cosine statistics, a mean pairwise cosine estimate, and spectral
    statistics of the sample covariance. The report is written to
    ``args.output`` as JSON.

    Raises:
        SystemExit: if the embedding cache is empty.

    Returns:
        0 on success.
    """
    manifest, embeddings = load_embedding_manifest(Path(args.manifest))
    n = int(embeddings.shape[0])
    if n == 0:
        raise SystemExit("empty embedding cache")

    rng = np.random.default_rng(args.seed)
    take = min(args.max_rows, n)
    if take < n:
        indices = rng.choice(n, size=take, replace=False)
    else:
        indices = np.arange(n)

    sample = np.asarray(embeddings[indices], dtype=np.float32)
    x = torch.from_numpy(sample).to(args.device, dtype=torch_dtype(args.dtype))
    x = torch.nn.functional.normalize(x.float(), dim=-1)

    # Concentration around the (re-normalized) mean direction.
    centroid = torch.nn.functional.normalize(x.mean(dim=0, keepdim=True), dim=-1)
    cosine_to_centroid = (x @ centroid.T).squeeze(1)

    # Spectrum of the sample covariance.
    centered = x - x.mean(dim=0, keepdim=True)
    cov = centered.T @ centered / max(take - 1, 1)
    eig = torch.linalg.eigvalsh(cov).clamp_min(0)
    eig_sum = eig.sum().clamp_min(1e-12)
    probs = eig / eig_sum
    spectral_entropy = -(probs * torch.log(probs.clamp_min(1e-12))).sum()
    erank = torch.exp(spectral_entropy)
    participation = eig_sum.square() / eig.square().sum().clamp_min(1e-12)

    if take > 1:
        centroid_std = float(cosine_to_centroid.std(unbiased=True).item())
    else:
        centroid_std = 0.0
    # For unit rows the off-diagonal cosine mean follows from the squared
    # norm of the mean vector.
    mean_pairwise = float((x.mean(dim=0).square().sum().item() * take - 1.0) / max(take - 1, 1))
    top10 = eig.topk(min(10, eig.numel())).values.sum()

    metrics = {
        "mean_cosine_to_centroid": float(cosine_to_centroid.mean().item()),
        "std_cosine_to_centroid": centroid_std,
        "mean_pairwise_cosine_estimate": mean_pairwise,
        "cov_effective_rank": float(erank.item()),
        "cov_participation_ratio": float(participation.item()),
        "cov_top1_mass": float((eig.max() / eig_sum).item()),
        "cov_top10_mass": float((top10 / eig_sum).item()),
        "cov_trace": float(eig_sum.item()),
    }
    payload = {
        "embedding_manifest": args.manifest,
        "source_model": manifest.get("model"),
        "source_rows": n,
        "sample_rows": take,
        "seed": args.seed,
        "device": args.device,
        "metrics": metrics,
        "boundary": "Geometry metrics describe embedding distribution shape: concentration, anisotropy, and effective dimensionality. They do not measure faithfulness or prompt support.",
    }
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": str(output), **payload["metrics"]}, indent=2))
    return 0
1183
+
1184
+
1185
def knn_main(args: argparse.Namespace) -> int:
    """Run the query-to-gallery nearest-neighbor cosine analysis and write a JSON report.

    Loads both embedding caches, optionally subsamples them (the gallery uses
    ``seed + 1``), computes each query's best cosine match in the gallery, and
    summarizes the scores at the requested thresholds. Raw scores are saved as
    ``.npy`` when ``args.save_scores`` is set.

    Raises:
        SystemExit: if ``--thresholds`` is malformed or the embeddings fail
            the checks in ``exact_nn_cosine``.

    Returns:
        0 on success.
    """
    # Validate the thresholds up front so a typo fails immediately rather
    # than after the expensive nearest-neighbor search.
    thresholds = parse_thresholds(args.thresholds)
    query_manifest, query_embeddings_all = load_embedding_manifest(Path(args.query_manifest))
    gallery_manifest, gallery_embeddings_all = load_embedding_manifest(Path(args.gallery_manifest))
    query_embeddings, query_indices = sample_embeddings(query_embeddings_all, args.query_max_rows, args.seed)
    gallery_embeddings, gallery_indices = sample_embeddings(gallery_embeddings_all, args.gallery_max_rows, args.seed + 1)
    started = time.time()
    scores = exact_nn_cosine(
        query_embeddings,
        gallery_embeddings,
        args.device,
        torch_dtype(args.dtype),
        args.query_batch_size,
        args.gallery_chunk_size,
    )
    payload = {
        "query_manifest": args.query_manifest,
        "gallery_manifest": args.gallery_manifest,
        "query_model": query_manifest.get("model"),
        "gallery_model": gallery_manifest.get("model"),
        "query_source_rows": int(query_embeddings_all.shape[0]),
        "gallery_source_rows": int(gallery_embeddings_all.shape[0]),
        "query_rows": int(query_embeddings.shape[0]),
        "gallery_rows": int(gallery_embeddings.shape[0]),
        "query_seed": args.seed,
        "gallery_seed": args.seed + 1,
        "query_indices_preview": query_indices[:10],
        "gallery_indices_preview": gallery_indices[:10],
        "device": args.device,
        "dtype": args.dtype,
        "query_batch_size": args.query_batch_size,
        "gallery_chunk_size": args.gallery_chunk_size,
        "seconds": round(time.time() - started, 3),
        "metrics": summarize_scores(scores, thresholds),
        "boundary": (
            "kNN support measures nearest-neighbor coverage in the chosen embedding space. "
            "It is directional, encoder-dependent, and not a faithfulness or density metric."
        ),
    }
    if args.save_scores is not None:
        score_path = Path(args.save_scores)
        if not score_path.name.endswith(".npy"):
            # np.save appends ".npy" when it is missing; apply the same rule
            # here so the recorded path names the file actually written.
            score_path = score_path.with_name(score_path.name + ".npy")
        score_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(score_path, scores.astype(np.float32))
        payload["score_path"] = str(score_path)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": str(output), "query_rows": payload["query_rows"], "gallery_rows": payload["gallery_rows"], **payload["metrics"]}, indent=2))
    return 0
1234
+
1235
+
1236
def support_main(args: argparse.Namespace) -> int:
    """Run the PRDC-style query-in-gallery support analysis and write a JSON report.

    Loads both embedding caches, optionally subsamples them (the gallery uses
    ``seed + 1``), derives each gallery row's k-th self-neighbor cosine as its
    threshold, and scores every query row against those thresholds. Per-row
    arrays are saved as a compressed ``.npz`` when ``args.save_scores`` is set.

    Raises:
        SystemExit: if the arguments or embeddings fail the checks in the
            helper functions.

    Returns:
        0 on success.
    """
    query_manifest, query_embeddings_all = load_embedding_manifest(Path(args.query_manifest))
    gallery_manifest, gallery_embeddings_all = load_embedding_manifest(Path(args.gallery_manifest))
    query_embeddings, query_indices = sample_embeddings(query_embeddings_all, args.query_max_rows, args.seed)
    gallery_embeddings, gallery_indices = sample_embeddings(gallery_embeddings_all, args.gallery_max_rows, args.seed + 1)
    compute_dtype = torch_dtype(args.dtype)
    started = time.time()
    gallery_thresholds = kth_self_neighbor_cosine(
        gallery_embeddings,
        args.k,
        args.device,
        compute_dtype,
        args.gallery_batch_size,
    )
    covered, density, nn_cosine = prdc_query_in_gallery_support(
        query_embeddings,
        gallery_embeddings,
        gallery_thresholds,
        args.k,
        args.device,
        compute_dtype,
        args.query_batch_size,
    )
    payload = {
        "query_manifest": args.query_manifest,
        "gallery_manifest": args.gallery_manifest,
        "query_model": query_manifest.get("model"),
        "gallery_model": gallery_manifest.get("model"),
        "query_source_rows": int(query_embeddings_all.shape[0]),
        "gallery_source_rows": int(gallery_embeddings_all.shape[0]),
        "query_rows": int(query_embeddings.shape[0]),
        "gallery_rows": int(gallery_embeddings.shape[0]),
        "query_seed": args.seed,
        "gallery_seed": args.seed + 1,
        "query_indices_preview": query_indices[:10],
        "gallery_indices_preview": gallery_indices[:10],
        "k": args.k,
        "device": args.device,
        "dtype": args.dtype,
        "query_batch_size": args.query_batch_size,
        "gallery_batch_size": args.gallery_batch_size,
        "seconds": round(time.time() - started, 3),
        "gallery_thresholds": {
            "mean_kth_neighbor_cosine": float(np.mean(gallery_thresholds)),
            "p05_kth_neighbor_cosine": float(np.percentile(gallery_thresholds, 5)),
            "p50_kth_neighbor_cosine": float(np.percentile(gallery_thresholds, 50)),
            "p95_kth_neighbor_cosine": float(np.percentile(gallery_thresholds, 95)),
        },
        "metrics": summarize_support(covered, density, nn_cosine),
        "boundary": (
            "P-in-C support is a PRDC-style embedding-manifold estimate: query points are covered "
            "when they fall inside at least one gallery kNN ball. It measures support in the chosen "
            "embedding space, not image faithfulness or overall caption quality."
        ),
    }
    if args.save_scores is not None:
        score_path = Path(args.save_scores)
        if not score_path.name.endswith(".npz"):
            # np.savez_compressed appends ".npz" when it is missing; apply the
            # same rule here so the recorded path names the file actually
            # written.
            score_path = score_path.with_name(score_path.name + ".npz")
        score_path.parent.mkdir(parents=True, exist_ok=True)
        np.savez_compressed(
            score_path,
            covered=covered.astype(np.bool_),
            density=density.astype(np.float32),
            nn_cosine=nn_cosine.astype(np.float32),
            gallery_thresholds=gallery_thresholds.astype(np.float32),
        )
        payload["score_path"] = str(score_path)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": str(output), "query_rows": payload["query_rows"], "gallery_rows": payload["gallery_rows"], **payload["metrics"]}, indent=2))
    return 0
1306
+
1307
+
1308
def main() -> int:
    """Parse the command line and run the handler for the selected subcommand.

    Raises:
        AssertionError: if the parsed command has no handler.
    """
    args = parse_args()
    handlers = {
        "inspect": inspect_models,
        "encode": encode_main,
        "encode-bge-m3": encode_bge_m3_main,
        "encode-sentence-transformer": encode_sentence_transformer_main,
        "vendi": vendi_main,
        "geometry": geometry_main,
        "knn": knn_main,
        "support": support_main,
    }
    handler = handlers.get(args.cmd)
    if handler is None:
        raise AssertionError(args.cmd)
    return handler(args)
1327
+
1328
+
1329
+ if __name__ == "__main__":
1330
+ raise SystemExit(main())
eval_code/scripts/compute_longclip_retrieval_margin.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Compute LongCLIP-style image-caption retrieval separability.
3
+
4
+ This metric is a frozen dual-encoder compatibility diagnostic, not a
5
+ faithfulness certificate. It reports whether each caption distinguishes its
6
+ paired image from same-slice negatives, while also reporting text truncation.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import hashlib
13
+ import json
14
+ import random
15
+ import time
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import numpy as np
20
+ import torch
21
+ from PIL import Image, ImageFile
22
+
23
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
24
+
25
+
26
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    ``--surface`` (repeatable, ``LABEL=JSONL``) and ``--output-dir`` are
    required; every other option has a default.
    """
    option_table = (
        ("--surface", {"action": "append", "required": True, "metavar": "LABEL=JSONL"}),
        ("--output-dir", {"required": True}),
        ("--model", {"default": "zer0int/LongCLIP-GmP-ViT-L-14"}),
        ("--max-records", {"type": int, "default": None}),
        ("--sample-records", {"type": int, "default": None}),
        ("--sample-seed", {"type": int, "default": 0}),
        ("--batch-size", {"type": int, "default": 64}),
        ("--retrieval-block-size", {"type": int, "default": 512}),
        ("--max-length", {"type": int, "default": 248}),
        ("--device", {"default": "cuda"}),
        ("--dtype", {"default": "float16", "choices": ["float16", "bfloat16", "float32"]}),
        ("--bootstrap-reps", {"type": int, "default": 1000}),
        ("--trust-remote-code", {"action": "store_true"}),
        ("--save-embeddings", {"action": "store_true"}),
    )
    parser = argparse.ArgumentParser(description=__doc__)
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
43
+
44
+
45
def torch_dtype(name: str) -> torch.dtype:
    """Map a dtype name to the torch dtype; unknown names raise ``KeyError``."""
    supported = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }
    return supported[name]
47
+
48
+
49
def parse_surface(spec: str) -> tuple[str, Path]:
    """Split a ``LABEL=JSONL`` specification into its label and path.

    Only the first ``=`` separates the two parts, so the path itself may
    contain ``=``.

    Raises:
        ValueError: if ``spec`` has no ``=``, or the label or the path is
            empty.
    """
    label, separator, path = spec.partition("=")
    # An empty path would become Path(".") and only fail much later when the
    # surface is opened, so reject incomplete specs here.
    if not separator or not label or not path:
        raise ValueError(f"--surface must be LABEL=JSONL: {spec}")
    return label, Path(path)
54
+
55
+
56
def stable_float(*parts: object) -> float:
    """Hash ``parts`` to a reproducible float in ``[0, 1)``.

    The parts are stringified, joined with ``:``, and hashed with 8-byte
    BLAKE2b; the digest is read as a big-endian integer and scaled by 2**64.
    """
    key = ":".join(map(str, parts)).encode("utf-8")
    digest = hashlib.blake2b(key, digest_size=8).digest()
    as_integer = int.from_bytes(digest, "big")
    return as_integer / 2**64
60
+
61
+
62
def image_path(row: dict[str, Any]) -> str | None:
    """Return the row's local image path, or ``None`` when none is usable.

    Candidates are checked in order: ``row["image"]["local_abs_path"]`` (only
    when ``row["image"]`` is a dict), ``row["image_abs_path"]``, then
    ``row["image_path"]``. The first truthy candidate is returned only if it
    is a string.
    """
    nested = row.get("image")
    if not isinstance(nested, dict):
        nested = {}
    candidates = (nested.get("local_abs_path"), row.get("image_abs_path"), row.get("image_path"))
    chosen = next((candidate for candidate in candidates if candidate), None)
    if isinstance(chosen, str) and chosen:
        return chosen
    return None
68
+
69
+
70
def load_surface(path: Path) -> list[dict[str, Any]]:
    """Read a JSONL file and keep the rows that have a non-blank string ``caption``.

    Blank lines are ignored. Rows whose caption is missing, not a string, or
    only whitespace are dropped, so positions in the result need not match
    line positions in the file.
    """

    def has_caption(record: dict[str, Any]) -> bool:
        caption = record.get("caption")
        return isinstance(caption, str) and bool(caption.strip())

    kept: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if raw_line.strip():
                record = json.loads(raw_line)
                if has_caption(record):
                    kept.append(record)
    return kept
81
+
82
+
83
def align_rows(surface_rows: dict[str, list[dict[str, Any]]], sample_records: int | None, max_records: int | None, seed: int) -> dict[str, list[dict[str, Any]]]:
    """Select the same row positions from every surface.

    Positions are limited to the shortest surface. With ``sample_records``
    set, positions are ranked by ``stable_float(seed, position)``, the first
    ``sample_records`` are kept, and they are returned in ascending order.
    Otherwise ``max_records`` (when set) keeps the leading positions.
    """
    labels = list(surface_rows)
    shared = min(len(surface_rows[label]) for label in labels)
    if sample_records is not None:
        ranked = sorted(range(shared), key=lambda position: stable_float(seed, position))
        keep = sorted(ranked[:sample_records])
    elif max_records is not None:
        keep = list(range(shared))[:max_records]
    else:
        keep = list(range(shared))

    aligned: dict[str, list[dict[str, Any]]] = {}
    for label in labels:
        source = surface_rows[label]
        aligned[label] = [source[position] for position in keep]
    return aligned
94
+
95
+
96
def load_model(model_id: str, device: str, dtype_name: str, trust_remote_code: bool):
    """Load the tokenizer, image processor, and model for ``model_id``.

    The model is loaded in the requested dtype, switched to eval mode, and
    moved to ``device``. Returns ``(tokenizer, image_processor, model)``.
    """
    from transformers import AutoImageProcessor, AutoModel, AutoTokenizer

    dtype = torch_dtype(dtype_name)
    shared_kwargs = {"trust_remote_code": trust_remote_code}
    tokenizer = AutoTokenizer.from_pretrained(model_id, **shared_kwargs)
    image_processor = AutoImageProcessor.from_pretrained(model_id, **shared_kwargs)
    model = AutoModel.from_pretrained(model_id, torch_dtype=dtype, **shared_kwargs)
    model.eval().to(device)
    return tokenizer, image_processor, model
105
+
106
+
107
def normalize(x: torch.Tensor) -> torch.Tensor:
    """Cast ``x`` to float32 and L2-normalize it along the last dimension."""
    as_float = x.float()
    return torch.nn.functional.normalize(as_float, dim=-1)
109
+
110
+
111
def pooled_tensor(output: Any) -> torch.Tensor:
    """Extract an embedding tensor from the various shapes a model may return.

    Lookup order: a bare tensor; then the ``pooler_output``, ``image_embeds``
    and ``text_embeds`` attributes; then position 0 of ``last_hidden_state``;
    then the first element of a tuple/list (position 0 if it is 3-D).

    Raises:
        TypeError: when none of these yields a tensor.
    """
    if isinstance(output, torch.Tensor):
        return output

    # Attributes that already hold one vector per example.
    for attribute in ("pooler_output", "image_embeds", "text_embeds"):
        candidate = getattr(output, attribute, None)
        if isinstance(candidate, torch.Tensor):
            return candidate

    hidden = getattr(output, "last_hidden_state", None)
    if isinstance(hidden, torch.Tensor):
        return hidden[:, 0]

    is_sequence = isinstance(output, (tuple, list))
    if is_sequence and output and isinstance(output[0], torch.Tensor):
        head = output[0]
        if head.ndim == 3:
            return head[:, 0]
        return head

    raise TypeError(f"Cannot extract pooled tensor from {type(output)!r}")
131
+
132
+
133
def encode_texts(tokenizer: Any, model: Any, texts: list[str], device: str, max_length: int, batch_size: int) -> tuple[np.ndarray, np.ndarray]:
    """Encode captions into L2-normalized float32 embeddings.

    Each batch is tokenized twice: once without truncation to record the full
    token length of every caption, and once truncated to ``max_length`` for
    the model.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids`` and, with
            ``return_tensors="pt"``, tensors that can be moved to ``device``.
        model: Model exposing ``get_text_features`` or callable on the
            tokenized batch.
        texts: Captions to encode.
        device: Torch device for the model inputs.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of captions per forward pass.

    Returns:
        ``(embeddings, lengths)``: one embedding row per caption and the
        untruncated token count of each caption as int32. For an empty
        ``texts`` list the embeddings are a ``(0, 0)`` array and the lengths
        are empty.
    """
    if not texts:
        # np.concatenate cannot handle an empty list; return the same empty
        # shape that encode_images uses when nothing was encoded.
        return np.zeros((0, 0), dtype=np.float32), np.zeros((0,), dtype=np.int32)

    embs: list[np.ndarray] = []
    lengths: list[int] = []
    use_text_head = hasattr(model, "get_text_features")
    with torch.inference_mode():
        for start in range(0, len(texts), batch_size):
            batch = texts[start : start + batch_size]
            # Untruncated pass: used only to report how long each caption is.
            raw = tokenizer(batch, padding=False, truncation=False, add_special_tokens=True)
            lengths.extend(len(ids) for ids in raw["input_ids"])
            encoded = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            encoded = {k: v.to(device) for k, v in encoded.items()}
            if use_text_head:
                features = pooled_tensor(model.get_text_features(**encoded))
            else:
                features = pooled_tensor(model(**encoded))
            embs.append(normalize(features).cpu().numpy().astype("float32"))
    return np.concatenate(embs, axis=0), np.asarray(lengths, dtype=np.int32)
149
+
150
+
151
def encode_images(image_processor: Any, model: Any, rows: list[dict[str, Any]], device: str, batch_size: int) -> tuple[np.ndarray, dict[str, Any]]:
    """Embed the images referenced by ``rows``, skipping unreadable ones.

    Rows whose path is missing or whose image cannot be opened are recorded in
    ``failures`` and left out of the embedding matrix. ``kept_indices`` lists
    the row indices that were embedded, in embedding-row order.

    Returns:
        ``(embeddings, info)`` where ``embeddings`` is a float32 array of
        L2-normalized features (shape ``(0, 0)`` if nothing was embedded) and
        ``info`` is ``{"kept_indices": [...], "failures": [...]}``.
    """
    embs: list[np.ndarray] = []
    kept_indices: list[int] = []
    failures: list[dict[str, Any]] = []
    batch_images: list[Any] = []
    batch_indices: list[int] = []

    def flush() -> None:
        """Encode the pending batch and reset the batch buffers."""
        if not batch_images:
            return
        inputs = image_processor(images=batch_images, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.inference_mode():
            if hasattr(model, "get_image_features"):
                features = pooled_tensor(model.get_image_features(**inputs))
            else:
                features = pooled_tensor(model(**inputs))
        embs.append(normalize(features).cpu().numpy().astype("float32"))
        kept_indices.extend(batch_indices)
        batch_images.clear()
        batch_indices.clear()

    for index, row in enumerate(rows):
        path = image_path(row)
        if path is None:
            failures.append({"index": index, "reason": "missing_image_path"})
            continue
        try:
            # Image.open is lazy and keeps the file open; the context manager
            # closes it deterministically, including when convert() raises on
            # a corrupt file. The converted copy is independent of the file.
            with Image.open(path) as handle:
                image = handle.convert("RGB")
        except Exception as exc:  # noqa: BLE001
            failures.append({"index": index, "path": path, "reason": repr(exc)[:500]})
            continue
        batch_images.append(image)
        batch_indices.append(index)
        if len(batch_images) >= batch_size:
            flush()
    flush()  # encode the final partial batch
    if embs:
        arr = np.concatenate(embs, axis=0)
    else:
        arr = np.zeros((0, 0), dtype=np.float32)
    return arr, {"kept_indices": kept_indices, "failures": failures}
193
+
194
+
195
def mean_ci(values: np.ndarray, reps: int, rng: np.random.Generator) -> dict[str, float]:
    """Mean of ``values`` with a percentile-bootstrap 95% interval.

    Empty input yields NaN for all three fields. When ``reps <= 0`` or there
    is a single value, both bounds equal the mean. Otherwise ``reps``
    resamples of the same size are drawn with replacement and the 2.5th and
    97.5th percentiles of their means are reported.
    """
    samples = np.asarray(values, dtype=np.float64)
    count = samples.size
    if count == 0:
        return {"mean": float("nan"), "ci95_low": float("nan"), "ci95_high": float("nan")}

    if reps <= 0 or count == 1:
        point = float(samples.mean())
        return {"mean": point, "ci95_low": point, "ci95_high": point}

    boot_means = np.empty(reps, dtype=np.float64)
    for rep in range(reps):
        draw = rng.integers(0, count, count)
        boot_means[rep] = samples[draw].mean()

    lower = float(np.percentile(boot_means, 2.5))
    upper = float(np.percentile(boot_means, 97.5))
    return {"mean": float(samples.mean()), "ci95_low": lower, "ci95_high": upper}
211
+
212
+
213
def retrieval_metrics(image_emb: np.ndarray, text_emb: np.ndarray, block_size: int) -> dict[str, np.ndarray]:
    """Per-pair retrieval statistics, computed in ``block_size`` tiles.

    Row ``i`` of ``image_emb`` is paired with row ``i`` of ``text_emb``; only
    the first ``min(len(image_emb), len(text_emb))`` rows are used. For each
    pair the function reports the paired dot product (``pos``), the margin
    between it and the best non-paired score in each direction, and
    recall@1 / recall@5 indicators, where rank is one plus the number of
    non-paired candidates that score strictly higher.
    """
    n = min(len(image_emb), len(text_emb))
    pos = np.sum(image_emb[:n] * text_emb[:n], axis=1).astype(np.float32)
    best_neg_i2t = np.full(n, -np.inf, dtype=np.float32)
    best_neg_t2i = np.full(n, -np.inf, dtype=np.float32)
    rank_i2t = np.ones(n, dtype=np.int32)
    rank_t2i = np.ones(n, dtype=np.int32)

    tile_starts = range(0, n, block_size)
    for i0 in tile_starts:
        i1 = min(i0 + block_size, n)
        img_rows = slice(i0, i1)
        img_ids = np.arange(i0, i1)
        img_tile = image_emb[img_rows]
        for t0 in tile_starts:
            t1 = min(t0 + block_size, n)
            txt_rows = slice(t0, t1)
            txt_ids = np.arange(t0, t1)
            sims = img_tile @ text_emb[txt_rows].T
            # True where the tile cell is a matched (image i, text i) pair.
            is_pair = img_ids[:, None] == txt_ids[None, :]

            # Best non-paired similarity seen so far in each direction.
            negatives = sims.copy()
            negatives[is_pair] = -np.inf
            best_neg_i2t[img_rows] = np.maximum(best_neg_i2t[img_rows], negatives.max(axis=1))
            best_neg_t2i[txt_rows] = np.maximum(best_neg_t2i[txt_rows], negatives.max(axis=0))

            # Count non-paired candidates that strictly beat the paired score.
            beats_i2t = (sims > pos[img_rows, None]) & ~is_pair
            rank_i2t[img_rows] += beats_i2t.sum(axis=1).astype(np.int32)

            beats_t2i = (sims > pos[txt_rows][None, :]) & ~is_pair
            rank_t2i[txt_rows] += beats_t2i.sum(axis=0).astype(np.int32)

    return {
        "pos": pos,
        "i2t_margin": (pos - best_neg_i2t).astype(np.float32),
        "t2i_margin": (pos - best_neg_t2i).astype(np.float32),
        "i2t_r1": (rank_i2t <= 1).astype(np.float32),
        "i2t_r5": (rank_i2t <= 5).astype(np.float32),
        "t2i_r1": (rank_t2i <= 1).astype(np.float32),
        "t2i_r5": (rank_t2i <= 5).astype(np.float32),
    }
254
+
255
+
256
def main() -> int:
    """Run the retrieval evaluation and write JSON/TSV summaries.

    Images are encoded once from the first surface's rows; each surface's
    captions (restricted to the rows whose image loaded) are then scored
    against those image embeddings. Writes ``longclip_retrieval_summary.json``
    and ``longclip_retrieval_summary.tsv`` to ``args.output_dir`` and, when
    ``args.save_embeddings`` is set, float16 embedding dumps as well.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: if no surfaces were provided or no image could be loaded.
    """
    args = parse_args()
    started = time.time()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    surface_specs = dict(parse_surface(spec) for spec in args.surface)
    raw_rows = {label: load_surface(path) for label, path in surface_specs.items()}
    rows = align_rows(raw_rows, args.sample_records, args.max_records, args.sample_seed)
    labels = list(rows)
    if not labels:
        raise SystemExit("No surfaces provided")

    tokenizer, image_processor, model = load_model(args.model, args.device, args.dtype, args.trust_remote_code)
    image_emb, image_info = encode_images(image_processor, model, rows[labels[0]], args.device, args.batch_size)
    kept_indices = image_info["kept_indices"]
    if not kept_indices:
        # Without this guard the run dies later with an opaque ValueError
        # when the text encoder concatenates zero batches.
        raise SystemExit(
            f"No images could be loaded ({len(image_info['failures'])} failures); nothing to score"
        )
    rng = np.random.default_rng(args.sample_seed)

    summaries: dict[str, Any] = {}
    text_cache: dict[str, np.ndarray] = {}
    token_cache: dict[str, np.ndarray] = {}
    for label in labels:
        kept_rows = [rows[label][index] for index in kept_indices]
        texts = [str(row["caption"]) for row in kept_rows]
        text_emb, token_lengths = encode_texts(tokenizer, model, texts, args.device, args.max_length, args.batch_size)
        text_cache[label] = text_emb
        token_cache[label] = token_lengths
        metrics = retrieval_metrics(image_emb, text_emb, args.retrieval_block_size)
        has_tokens = bool(len(token_lengths))
        summaries[label] = {
            "rows": int(len(texts)),
            "token_mean": float(token_lengths.mean()) if has_tokens else 0.0,
            "token_p50": float(np.percentile(token_lengths, 50)) if has_tokens else 0.0,
            "token_p95": float(np.percentile(token_lengths, 95)) if has_tokens else 0.0,
            "truncated_rate_gt_limit": float((token_lengths > args.max_length).mean()) if has_tokens else 0.0,
            "pos_score": mean_ci(metrics["pos"], args.bootstrap_reps, rng),
            "i2t_margin": mean_ci(metrics["i2t_margin"], args.bootstrap_reps, rng),
            "t2i_margin": mean_ci(metrics["t2i_margin"], args.bootstrap_reps, rng),
            "i2t_r_at_1": mean_ci(metrics["i2t_r1"], args.bootstrap_reps, rng),
            "i2t_r_at_5": mean_ci(metrics["i2t_r5"], args.bootstrap_reps, rng),
            "t2i_r_at_1": mean_ci(metrics["t2i_r1"], args.bootstrap_reps, rng),
            "t2i_r_at_5": mean_ci(metrics["t2i_r5"], args.bootstrap_reps, rng),
        }

    payload = {
        "model": args.model,
        "max_length": args.max_length,
        "surface_inputs": {label: str(path) for label, path in surface_specs.items()},
        "labels": labels,
        "image_rows": len(rows[labels[0]]),
        "image_kept": len(kept_indices),
        "image_failures": image_info["failures"][:100],  # cap the failure log kept in the summary
        "retrieval_block_size": args.retrieval_block_size,
        "bootstrap_reps": args.bootstrap_reps,
        "seconds": round(time.time() - started, 2),
        "summaries": summaries,
    }
    summary_path = output_dir / "longclip_retrieval_summary.json"
    summary_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")

    rows_tsv = [
        [
            "surface",
            "rows",
            # The truncation rate is measured against args.max_length, so the
            # column name follows it instead of a hard-coded 248.
            # NOTE(review): identical to the old header only when max_length is 248.
            f"trunc_gt_{args.max_length}",
            "tok_mean",
            "tok_p95",
            "pos_mean",
            "pos_ci95",
            "i2t_margin_mean",
            "i2t_margin_ci95",
            "i2t_r1",
            "i2t_r5",
            "t2i_margin_mean",
            "t2i_margin_ci95",
            "t2i_r1",
            "t2i_r5",
        ]
    ]
    for label in labels:
        s = summaries[label]
        rows_tsv.append(
            [
                label,
                str(s["rows"]),
                f"{s['truncated_rate_gt_limit']:.4f}",
                f"{s['token_mean']:.2f}",
                f"{s['token_p95']:.1f}",
                f"{s['pos_score']['mean']:.6f}",
                f"[{s['pos_score']['ci95_low']:.6f},{s['pos_score']['ci95_high']:.6f}]",
                f"{s['i2t_margin']['mean']:.6f}",
                f"[{s['i2t_margin']['ci95_low']:.6f},{s['i2t_margin']['ci95_high']:.6f}]",
                f"{s['i2t_r_at_1']['mean']:.4f}",
                f"{s['i2t_r_at_5']['mean']:.4f}",
                f"{s['t2i_margin']['mean']:.6f}",
                f"[{s['t2i_margin']['ci95_low']:.6f},{s['t2i_margin']['ci95_high']:.6f}]",
                f"{s['t2i_r_at_1']['mean']:.4f}",
                f"{s['t2i_r_at_5']['mean']:.4f}",
            ]
        )
    (output_dir / "longclip_retrieval_summary.tsv").write_text(
        "\n".join("\t".join(row) for row in rows_tsv) + "\n",
        encoding="utf-8",
    )
    if args.save_embeddings:
        np.save(output_dir / "image_embeddings.npy", image_emb.astype(np.float16))
        for label, emb in text_cache.items():
            np.save(output_dir / f"text_embeddings_{label}.npy", emb.astype(np.float16))
            np.save(output_dir / f"token_lengths_{label}.npy", token_cache[label])
    print(json.dumps({"summary": str(summary_path), "rows": len(kept_indices), "labels": labels}, indent=2))
    return 0
365
+
366
+
367
+ if __name__ == "__main__":
368
+ raise SystemExit(main())
eval_code/scripts/export_cbu_metric_tables.py ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export paper-facing CBU tables with caption-level bootstrap CIs.
3
+
4
+ The script consumes existing CBU response JSONL artifacts. It does not call a
5
+ model and does not modify source captions.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import csv
12
+ import json
13
+ import re
14
+ from collections import Counter, defaultdict
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ import numpy as np
19
+
20
+
21
+ UNIT_CATEGORIES = [
22
+ "object",
23
+ "attribute",
24
+ "relation",
25
+ "style",
26
+ "camera",
27
+ "lighting",
28
+ "count",
29
+ "text_rendering",
30
+ ]
31
+
32
+ VISUAL_STATUSES = {"grounded", "unsupported", "uncertain"}
33
+ TOKEN_RE = re.compile(r"[^\W_]+(?:'[^\W_]+)*", re.UNICODE)
34
+ ARTICLE_UNITS = {"a", "an", "the"}
35
+
36
+
37
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the CBU table exporter."""
    cli = argparse.ArgumentParser(description=__doc__)
    # Both inputs are repeatable LABEL=PATH pairs.
    for flag in ("--claimed", "--grounded"):
        cli.add_argument(flag, action="append", default=[], metavar="LABEL=PATH")
    cli.add_argument("--output-dir", required=True)
    cli.add_argument("--bootstrap-reps", type=int, default=2000)
    cli.add_argument("--seed", type=int, default=0)
    return cli.parse_args()
45
+
46
+
47
def parse_label_path(value: str) -> tuple[str, Path]:
    """Split a ``LABEL=PATH`` argument at its first ``=``.

    Later ``=`` characters stay part of the path.

    Raises:
        ValueError: if there is no ``=``, or the label or path is empty.
            An empty path would otherwise silently become ``Path('.')`` and
            an empty label would become the key ``""`` in the output tables.
    """
    label, separator, path = value.partition("=")
    if not separator or not label or not path:
        raise ValueError(f"Expected LABEL=PATH, got {value!r}")
    return label, Path(path)
52
+
53
+
54
def normalize_unit(text: str) -> str:
    """Canonical form of a unit string used for de-duplication.

    Lower-cases ``text``, keeps only the ``TOKEN_RE`` matches, drops any
    leading tokens found in ``ARTICLE_UNITS`` and joins the rest with single
    spaces.
    """
    words = TOKEN_RE.findall(text.lower())
    first_kept = 0
    while first_kept < len(words) and words[first_kept] in ARTICLE_UNITS:
        first_kept += 1
    return " ".join(words[first_kept:])
59
+
60
+
61
def normalize_key_part(text: str) -> str:
    """Normalize ``text`` via ``normalize_unit``, mapping a falsy result to ``""``."""
    normalized = normalize_unit(text)
    return normalized if normalized else ""
63
+
64
+
65
def unit_records(group: Any) -> list[dict[str, str]]:
    """Extract well-formed unit entries from a parsed model response.

    Anything that is not a list yields ``[]``. Within a list, an entry is
    kept only if it is a dict whose ``category`` is in ``UNIT_CATEGORIES`` and
    whose ``unit`` is a non-blank string. Kept entries are reduced to
    stripped ``category`` / ``unit`` / ``target`` fields; a non-string target
    becomes ``""``.
    """
    if not isinstance(group, list):
        return []

    cleaned: list[dict[str, str]] = []
    for entry in group:
        if not isinstance(entry, dict):
            continue
        category = entry.get("category")
        unit = entry.get("unit")
        if category not in UNIT_CATEGORIES:
            continue
        if not isinstance(unit, str) or not unit.strip():
            continue
        raw_target = entry.get("target", "")
        target = raw_target.strip() if isinstance(raw_target, str) else ""
        cleaned.append({"category": category, "unit": unit.strip(), "target": target})
    return cleaned
85
+
86
+
87
def dedup_counts(group: Any) -> tuple[int, dict[str, int], int]:
    """Count distinct units per category, plus how many were repeats.

    Two records are the same unit when their category, normalized unit text
    and normalized target all match. Records whose unit normalizes to an
    empty string are ignored entirely.

    Returns:
        ``(total_distinct, per_category_counts, duplicate_count)``.
    """
    per_category = dict.fromkeys(UNIT_CATEGORIES, 0)
    seen_keys: set[str] = set()
    repeats = 0
    for rec in unit_records(group):
        unit_norm = normalize_unit(rec["unit"])
        if not unit_norm:
            continue
        target_norm = normalize_key_part(rec.get("target", ""))
        key = "|".join((rec["category"], unit_norm, target_norm))
        if key in seen_keys:
            repeats += 1
        else:
            seen_keys.add(key)
            per_category[rec["category"]] += 1
    return sum(per_category.values()), per_category, repeats
102
+
103
+
104
def caption_tokens(request: dict[str, Any]) -> int:
    """Number of ``TOKEN_RE`` matches in the request's caption (0 if not a string)."""
    caption = request.get("caption", "")
    if not isinstance(caption, str):
        return 0
    return len(TOKEN_RE.findall(caption))
107
+
108
+
109
def read_claimed(path: Path, label: str) -> list[dict[str, Any]]:
    """Load per-caption claimed-unit counts from a response JSONL file.

    Blank lines are skipped, as are records that are not ``ok`` or whose
    ``parsed`` field is not a dict. Each remaining record becomes one row
    holding the surface label, caption id, caption token count, distinct and
    duplicate unit totals, and one ``<category>_units`` count per category.
    """
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            record = json.loads(line)
            parsed = record.get("parsed")
            if not (record.get("ok") and isinstance(parsed, dict)):
                continue
            total, counts, duplicate = dedup_counts(parsed.get("claimed_units"))
            request = record.get("request", {})
            row: dict[str, Any] = {
                "label": label,
                "caption_id": request.get("caption_id"),
                "tokens": caption_tokens(request),
                "dedup_units": total,
                "duplicate_units": duplicate,
            }
            for category in UNIT_CATEGORIES:
                row[f"{category}_units"] = counts[category]
            rows.append(row)
    return rows
131
+
132
+
133
def request_unit_lookup(request: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the request's ``claimed_units`` by their string ``unit_id``.

    Entries that are not dicts, or whose ``unit_id`` is not a string, are
    left out. If an id repeats, the last entry wins.
    """
    lookup: dict[str, dict[str, Any]] = {}
    for unit in request.get("claimed_units", []):
        if not isinstance(unit, dict):
            continue
        unit_id = unit.get("unit_id")
        if isinstance(unit_id, str):
            lookup[unit_id] = unit
    return lookup
139
+
140
+
141
def read_grounded(path: Path, label: str) -> list[dict[str, Any]]:
    """Load per-caption grounding verdict counts from a response JSONL file.

    Blank lines are skipped, as are records that are not ``ok`` or whose
    ``parsed`` field is not a dict. For every dict in ``unit_results`` the
    verdict ``status`` is tallied; statuses in ``VISUAL_STATUSES`` also count
    towards ``visual`` and, when the unit's category (looked up by ``unit_id``
    in the request) is known, towards ``<category>_visual`` and
    ``<category>_<status>``.

    Malformed model output is tolerated rather than aborting the export:
    a ``status`` that is not a string, or that equals one of the aggregate
    counter names (``valid`` / ``visual``), is tallied as ``__bad_status__``;
    a non-string ``unit_id`` resolves to an unknown category; a non-list
    ``unit_results`` is treated as empty.
    """
    reserved_counters = {"valid", "visual"}
    rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            raw = json.loads(line)
            if not raw.get("ok") or not isinstance(raw.get("parsed"), dict):
                continue
            request = raw.get("request")
            if not isinstance(request, dict):
                request = {}
            lookup = request_unit_lookup(request)
            results = raw["parsed"].get("unit_results")
            if not isinstance(results, list):
                results = []
            counter: Counter[str] = Counter()
            for result in results:
                if not isinstance(result, dict):
                    continue
                unit_id = result.get("unit_id")
                # Lookup keys are always strings; guarding here also avoids a
                # TypeError on unhashable ids such as lists.
                unit = lookup.get(unit_id, {}) if isinstance(unit_id, str) else {}
                category = unit.get("category", "__unknown__")
                status = result.get("status", "__bad_status__")
                if not isinstance(status, str) or status in reserved_counters:
                    # Non-string statuses would break the `"_" in key` filter
                    # below; reserved names would double-count the aggregates.
                    status = "__bad_status__"
                counter["valid"] += 1
                counter[status] += 1
                if status in VISUAL_STATUSES:
                    counter["visual"] += 1
                    if category in UNIT_CATEGORIES:
                        counter[f"{category}_visual"] += 1
                        counter[f"{category}_{status}"] += 1
            rows.append(
                {
                    "label": label,
                    "caption_id": request.get("caption_id"),
                    "valid": counter["valid"],
                    "visual": counter["visual"],
                    "grounded": counter["grounded"],
                    "unsupported": counter["unsupported"],
                    "uncertain": counter["uncertain"],
                    # Carries the per-category counters (and any status name
                    # that itself contains an underscore).
                    **{key: counter[key] for key in counter if "_" in key},
                }
            )
    return rows
178
+
179
+
180
def ci(values: np.ndarray) -> tuple[float, float]:
    """Return the 2.5% and 97.5% quantiles of ``values`` as ``(low, high)``.

    An empty input returns ``(nan, nan)`` instead of reaching
    ``np.quantile``, so a run with no bootstrap replicates degrades to a NaN
    interval rather than an exception.
    """
    values = np.asarray(values, dtype=np.float64)
    if values.size == 0:
        return float("nan"), float("nan")
    return float(np.quantile(values, 0.025)), float(np.quantile(values, 0.975))
182
+
183
+
184
def bootstrap_indices(n: int, reps: int, rng: np.random.Generator) -> np.ndarray:
    """Draw a ``(reps, n)`` array of indices in ``[0, n)``, one resample per row."""
    shape = (reps, n)
    return rng.integers(low=0, high=n, size=shape, endpoint=False)
186
+
187
+
188
def summarize_claimed(rows: list[dict[str, Any]], reps: int, rng: np.random.Generator) -> dict[str, Any]:
    """Aggregate claimed-unit rows into means with caption-level bootstrap CIs.

    Captions are the resampling unit: one ``(reps, n)`` index matrix is drawn
    and reused for every metric. ``dedup_units_per_100_tokens`` is a ratio of
    sums (total units over total tokens), with each caption's token count
    floored at 1.

    When there are no rows, or ``reps <= 0``, no resampling is done and each
    interval collapses to its point estimate (``0.0`` for empty input).
    """
    n = len(rows)
    units = np.asarray([row["dedup_units"] for row in rows], dtype=np.float64)
    tokens = np.asarray([max(row["tokens"], 1) for row in rows], dtype=np.float64)
    dups = np.asarray([row["duplicate_units"] for row in rows], dtype=np.float64)
    # reps <= 0 would give an empty (or invalid) index matrix and an
    # undefined quantile, so resampling is skipped in that case.
    resample = n > 0 and reps > 0
    idx = bootstrap_indices(n, reps, rng) if resample else None

    def interval(point: float, boot: np.ndarray | None) -> dict[str, float]:
        """Package a point estimate with its bootstrap interval."""
        if boot is None:
            return {"mean": point, "ci95_low": point, "ci95_high": point}
        low, high = ci(boot)
        return {"mean": point, "ci95_low": low, "ci95_high": high}

    def mean_metric(arr: np.ndarray) -> dict[str, float]:
        """Per-caption mean of ``arr`` with its interval."""
        point = float(arr.mean()) if len(arr) else 0.0
        return interval(point, arr[idx].mean(axis=1) if resample else None)

    ratio = float(100.0 * units.sum() / tokens.sum()) if tokens.sum() else 0.0
    ratio_boot = 100.0 * units[idx].sum(axis=1) / tokens[idx].sum(axis=1) if resample else None
    out: dict[str, Any] = {
        "captions": n,
        "dedup_units_per_caption": mean_metric(units),
        "dedup_units_per_100_tokens": interval(ratio, ratio_boot),
        "duplicate_units_per_caption": mean_metric(dups),
    }
    for category in UNIT_CATEGORIES:
        arr = np.asarray([row[f"{category}_units"] for row in rows], dtype=np.float64)
        out[f"{category}_per_caption"] = mean_metric(arr)
    return out
214
+
215
+
216
def summarize_grounded(rows: list[dict[str, Any]], reps: int, rng: np.random.Generator) -> dict[str, Any]:
    """Aggregate grounding verdict rows into rates with caption-level bootstrap CIs.

    Rates are ratios of sums over captions (e.g. grounded units over visual
    units), overall and per category. Captions are the resampling unit and a
    single ``(reps, n)`` index matrix is shared by all metrics. A bootstrap
    replicate whose denominator is zero contributes a rate of ``0.0``.

    When there are no rows, or ``reps <= 0``, no resampling is done and each
    interval collapses to its point estimate.
    """
    n = len(rows)
    grounded = np.asarray([row["grounded"] for row in rows], dtype=np.float64)
    unsupported = np.asarray([row["unsupported"] for row in rows], dtype=np.float64)
    uncertain = np.asarray([row["uncertain"] for row in rows], dtype=np.float64)
    visual = np.asarray([max(row["visual"], 0) for row in rows], dtype=np.float64)
    # reps <= 0 would give an empty (or invalid) index matrix and an
    # undefined quantile, so resampling is skipped in that case.
    resample = n > 0 and reps > 0
    idx = bootstrap_indices(n, reps, rng) if resample else None

    def interval(point: float, boot: np.ndarray | None) -> dict[str, float]:
        """Package a point estimate with its bootstrap interval."""
        if boot is None:
            return {"mean": point, "ci95_low": point, "ci95_high": point}
        low, high = ci(boot)
        return {"mean": point, "ci95_low": low, "ci95_high": high}

    def ratio_metric(num: np.ndarray, den: np.ndarray) -> dict[str, float]:
        """Ratio of sums ``num / den`` with its interval."""
        point = float(num.sum() / den.sum()) if den.sum() else 0.0
        if not resample:
            return interval(point, None)
        boot_den = den[idx].sum(axis=1)
        boot = np.divide(num[idx].sum(axis=1), boot_den, out=np.zeros_like(boot_den), where=boot_den != 0)
        return interval(point, boot)

    def mean_metric(arr: np.ndarray) -> dict[str, float]:
        """Per-caption mean of ``arr`` with its interval."""
        point = float(arr.mean()) if len(arr) else 0.0
        return interval(point, arr[idx].mean(axis=1) if resample else None)

    out: dict[str, Any] = {
        "captions": n,
        "visual_units": int(visual.sum()),
        "grounded_units_per_caption": mean_metric(grounded),
        "grounded_precision": ratio_metric(grounded, visual),
        "unsupported_rate": ratio_metric(unsupported, visual),
        "uncertain_rate": ratio_metric(uncertain, visual),
    }
    categories: dict[str, Any] = {}
    for category in UNIT_CATEGORIES:
        # Per-category counters are only present on rows that saw that
        # category, hence the .get(..., 0) defaults.
        den = np.asarray([row.get(f"{category}_visual", 0) for row in rows], dtype=np.float64)
        cat_grounded = np.asarray([row.get(f"{category}_grounded", 0) for row in rows], dtype=np.float64)
        cat_unsupported = np.asarray([row.get(f"{category}_unsupported", 0) for row in rows], dtype=np.float64)
        cat_uncertain = np.asarray([row.get(f"{category}_uncertain", 0) for row in rows], dtype=np.float64)
        categories[category] = {
            "visual_units": int(den.sum()),
            "grounded_precision": ratio_metric(cat_grounded, den),
            "unsupported_rate": ratio_metric(cat_unsupported, den),
            "uncertain_rate": ratio_metric(cat_uncertain, den),
        }
    out["categories"] = categories
    return out
261
+
262
+
263
def write_tsv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None:
    """Write ``rows`` as a tab-separated file with a header, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter="\t")
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
269
+
270
+
271
def fmt_metric(metric: dict[str, float]) -> str:
    """Render a metric dict as ``mean [low, high]`` with four decimals each."""
    mean = metric["mean"]
    low = metric["ci95_low"]
    high = metric["ci95_high"]
    return f"{mean:.4f} [{low:.4f}, {high:.4f}]"
273
+
274
+
275
def main() -> int:
    """Build the claimed / grounded CBU tables and write them to disk.

    For every ``--claimed`` and ``--grounded`` ``LABEL=PATH`` pair the input
    is read and summarized with bootstrap CIs. Output is one JSON file with
    the full summaries and three TSV tables (claimed, grounded, and grounded
    per category) under ``--output-dir``. A single random generator seeded
    from ``--seed`` is shared across all inputs, in command-line order.
    """
    args = parse_args()
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    rng = np.random.default_rng(args.seed)

    payload: dict[str, Any] = {
        "bootstrap_reps": args.bootstrap_reps,
        "seed": args.seed,
        "claimed": {},
        "grounded": {},
    }

    # Categories that get their own column in the claimed table.
    claimed_categories = ("object", "attribute", "relation", "camera", "lighting", "text_rendering")
    rate_names = ("grounded_precision", "unsupported_rate", "uncertain_rate")

    claimed_tsv: list[dict[str, Any]] = []
    for spec in args.claimed:
        label, path = parse_label_path(spec)
        claimed_rows = read_claimed(path, label)
        summary = summarize_claimed(claimed_rows, args.bootstrap_reps, rng)
        payload["claimed"][label] = {"input": str(path), **summary}
        record: dict[str, Any] = {
            "surface": label,
            "captions": summary["captions"],
            "cbu_per_caption_ci95": fmt_metric(summary["dedup_units_per_caption"]),
            "cbu_per_100_tokens_ci95": fmt_metric(summary["dedup_units_per_100_tokens"]),
        }
        for category in claimed_categories:
            record[f"{category}_per_caption_ci95"] = fmt_metric(summary[f"{category}_per_caption"])
        claimed_tsv.append(record)

    grounded_tsv: list[dict[str, Any]] = []
    category_tsv: list[dict[str, Any]] = []
    for spec in args.grounded:
        label, path = parse_label_path(spec)
        grounded_rows = read_grounded(path, label)
        summary = summarize_grounded(grounded_rows, args.bootstrap_reps, rng)
        payload["grounded"][label] = {"input": str(path), **summary}
        record = {
            "surface": label,
            "captions": summary["captions"],
            "visual_units": summary["visual_units"],
            "grounded_units_per_caption_ci95": fmt_metric(summary["grounded_units_per_caption"]),
        }
        for rate in rate_names:
            record[f"{rate}_ci95"] = fmt_metric(summary[rate])
        grounded_tsv.append(record)
        for category, cat in summary["categories"].items():
            cat_record: dict[str, Any] = {
                "surface": label,
                "category": category,
                "visual_units": cat["visual_units"],
            }
            for rate in rate_names:
                cat_record[f"{rate}_ci95"] = fmt_metric(cat[rate])
            category_tsv.append(cat_record)

    (out_dir / "cbu_bootstrap_summary.json").write_text(json.dumps(payload, indent=2), encoding="utf-8")

    rate_columns = [f"{rate}_ci95" for rate in rate_names]
    claimed_columns = ["surface", "captions", "cbu_per_caption_ci95", "cbu_per_100_tokens_ci95"]
    claimed_columns += [f"{category}_per_caption_ci95" for category in claimed_categories]
    write_tsv(out_dir / "claimed_cbu_ci.tsv", claimed_tsv, claimed_columns)
    write_tsv(
        out_dir / "grounded_cbu_ci.tsv",
        grounded_tsv,
        ["surface", "captions", "visual_units", "grounded_units_per_caption_ci95", *rate_columns],
    )
    write_tsv(
        out_dir / "grounded_cbu_category_ci.tsv",
        category_tsv,
        ["surface", "category", "visual_units", *rate_columns],
    )
    print(json.dumps({"output_dir": str(out_dir), "claimed": len(claimed_tsv), "grounded": len(grounded_tsv)}, indent=2))
    return 0
383
+
384
+
385
+ if __name__ == "__main__":
386
+ raise SystemExit(main())
eval_code/scripts/export_cbu_vqa_tables.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export compact tables from CBU-VQA summary JSON files."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+
12
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the CBU-VQA table exporter."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--summary",
        action="append",
        required=True,
        help="CBU-VQA summary JSON",
    )
    cli.add_argument("--output-md", required=True)
    cli.add_argument("--output-tex", default=None)
    return cli.parse_args()
18
+
19
+
20
def load_rows(paths: list[str]) -> list[dict[str, Any]]:
    """Flatten summary JSON files into one table row per surface.

    Files are processed in the given order; within a file, surfaces are
    emitted sorted by name. Missing statistics default to ``0`` for counts
    and ``0.0`` for rates.
    """
    # (output column, key in the summary JSON, default when absent)
    columns = (
        ("responses", "responses", 0),
        ("ok", "ok", 0),
        ("questions", "questions", 0),
        ("support", "support_rate", 0.0),
        ("risk", "risk_rate", 0.0),
        ("uncertain", "uncertainty_rate", 0.0),
    )
    rows: list[dict[str, Any]] = []
    for path in paths:
        data = json.loads(Path(path).read_text(encoding="utf-8"))
        surfaces = data.get("surfaces", {})
        for surface in sorted(surfaces):
            stats = surfaces[surface]
            row: dict[str, Any] = {"source": path, "surface": surface}
            for column, stat_key, default in columns:
                row[column] = stats.get(stat_key, default)
            rows.append(row)
    return rows
38
+
39
+
40
def write_markdown(rows: list[dict[str, Any]], path: Path) -> None:
    """Write ``rows`` as a Markdown table, creating parent directories.

    Counts are rendered with thousands separators and rates with four
    decimals.
    """
    table = [
        "| Surface | Resp | OK | Q | Support ↑ | Risk ↓ | Uncertain ↓ |",
        "|---|---:|---:|---:|---:|---:|---:|",
    ]
    table.extend(
        f"| {row['surface']} | {row['responses']:,} | {row['ok']:,} | {row['questions']:,} "
        f"| {row['support']:.4f} | {row['risk']:.4f} | {row['uncertain']:.4f} |"
        for row in rows
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(table) + "\n", encoding="utf-8")
53
+
54
+
55
def write_latex(rows: list[dict[str, Any]], path: Path) -> None:
    """Write ``rows`` as a booktabs-style LaTeX ``tabular``, creating parent dirs.

    Counts are rendered with thousands separators and rates with four
    decimals. LaTeX special characters in the surface name are escaped so
    that names containing ``&``, ``%``, ``#``, ``$`` etc. cannot break the
    table; names that only contain underscores render as ``\\_`` as before.
    """
    # Single-pass character map, so inserted backslashes/braces are never
    # re-escaped by a later replacement.
    escapes = str.maketrans(
        {
            "\\": r"\textbackslash{}",
            "&": r"\&",
            "%": r"\%",
            "$": r"\$",
            "#": r"\#",
            "_": r"\_",
            "{": r"\{",
            "}": r"\}",
            "~": r"\textasciitilde{}",
            "^": r"\textasciicircum{}",
        }
    )
    lines = [
        r"\begin{tabular}{lrrrrrr}",
        r"\toprule",
        r"Surface & Resp. & OK & Q & Support $\uparrow$ & Risk $\downarrow$ & Uncertain $\downarrow$ \\",
        r"\midrule",
    ]
    for row in rows:
        surface = str(row["surface"]).translate(escapes)
        lines.append(
            "{surface} & {responses:,} & {ok:,} & {questions:,} & {support:.4f} & {risk:.4f} & {uncertain:.4f} \\\\".format(
                **{**row, "surface": surface}
            )
        )
    lines.extend([r"\bottomrule", r"\end{tabular}"])
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
71
+
72
+
73
def main() -> int:
    """Load the summaries, write the Markdown (and optional LaTeX) table, report."""
    args = parse_args()
    rows = load_rows(args.summary)
    write_markdown(rows, Path(args.output_md))
    tex_target = args.output_tex
    if tex_target:
        write_latex(rows, Path(tex_target))
    report = {"rows": len(rows), "output_md": args.output_md, "output_tex": tex_target}
    print(json.dumps(report, indent=2))
    return 0
81
+
82
+
83
+ if __name__ == "__main__":
84
+ raise SystemExit(main())
eval_code/scripts/pack_recap_ed_metrics.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Pack small recap E&D metric artifacts into a release-friendly directory."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import csv
8
+ import json
9
+ import shutil
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+
14
+ ROOT = Path("<PROJECT_ROOT>")
15
+ NVME = Path("<LOCAL_CACHE>")
16
+
17
+
18
+ EMBEDDING_RUNS = [
19
+ ("Qwen3-Embedding-4B", "ours", "qwen3-embedding-4b/datacomp_ours_50k"),
20
+ ("Qwen3-Embedding-4B", "ref", "qwen3-embedding-4b/datacomp_ref_llava15_50k"),
21
+ ("Qwen3-Embedding-8B", "ours", "qwen3-embedding-8b/datacomp_ours_50k"),
22
+ ("Qwen3-Embedding-8B", "ref", "qwen3-embedding-8b/datacomp_ref_llava15_50k"),
23
+ ("E5-Mistral-7B", "ours", "e5-mistral-7b-instruct/datacomp_ours_50k"),
24
+ ("E5-Mistral-7B", "ref", "e5-mistral-7b-instruct/datacomp_ref_llava15_50k"),
25
+ ("BGE-M3-official", "ours", "bge-m3-official/datacomp_ours_50k"),
26
+ ("BGE-M3-official", "ref", "bge-m3-official/datacomp_ref_llava15_50k"),
27
+ ]
28
+
29
+
30
+ SUPPORT_RUNS = [
31
+ ("Qwen3-Embedding-4B raw/raw", "ours", "qwen3-embedding-4b/2026-04-25/diffusiondb_raw_to_ours_50k.support.json"),
32
+ ("Qwen3-Embedding-4B raw/raw", "ref", "qwen3-embedding-4b/2026-04-25/diffusiondb_raw_to_ref_50k.support.json"),
33
+ ("Qwen3-Embedding-4B query/doc", "ours", "qwen3-embedding-4b/2026-04-25/diffusiondb_query_to_ours_50k.support.json"),
34
+ ("Qwen3-Embedding-4B query/doc", "ref", "qwen3-embedding-4b/2026-04-25/diffusiondb_query_to_ref_50k.support.json"),
35
+ ("E5-Mistral raw/raw", "ours", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_raw_to_ours_50k.support.json"),
36
+ ("E5-Mistral raw/raw", "ref", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_raw_to_ref_50k.support.json"),
37
+ ("E5-Mistral query/doc", "ours", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_query_to_ours_50k.support.json"),
38
+ ("E5-Mistral query/doc", "ref", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_query_to_ref_50k.support.json"),
39
+ ("BGE-M3 raw/corpus", "ours", "bge-m3-official/2026-04-25/diffusiondb_raw_to_ours_50k.support.json"),
40
+ ("BGE-M3 raw/corpus", "ref", "bge-m3-official/2026-04-25/diffusiondb_raw_to_ref_50k.support.json"),
41
+ ("BGE-M3 query/corpus", "ours", "bge-m3-official/2026-04-25/diffusiondb_query_to_ours_50k.support.json"),
42
+ ("BGE-M3 query/corpus", "ref", "bge-m3-official/2026-04-25/diffusiondb_query_to_ref_50k.support.json"),
43
+ ]
44
+
45
+
46
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the metric packer."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--output-dir",
        default="artifacts/recap-ed/metrics-2026-04-25",
    )
    return cli.parse_args()
50
+
51
+
52
def load_json(path: Path) -> dict[str, Any]:
    """Read ``path`` as UTF-8 and return the decoded JSON document."""
    with path.open("r", encoding="utf-8") as handle:
        text = handle.read()
    return json.loads(text)
55
+
56
+
57
def write_tsv(path: Path, rows: list[dict[str, Any]], fields: list[str]) -> None:
    """Write ``rows`` as a tab-separated file with a header, creating parent dirs."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fields, delimiter="\t")
        writer.writeheader()
        for row in rows:
            writer.writerow(row)
63
+
64
+
65
def rel_or_abs(path: Path) -> str:
    """Return ``path`` relative to ``ROOT`` when it lies under it, else unchanged."""
    try:
        relative = path.relative_to(ROOT)
    except ValueError:
        # Not under ROOT: report the path as given.
        return str(path)
    return str(relative)
70
+
71
+
72
def pack_embedding(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Collect Vendi and geometry metrics for each embedding run into one TSV.

    For every entry of ``EMBEDDING_RUNS`` the Vendi summary is read from the
    run directory under ``NVME / "caption-embeddings"``. Geometry metrics are
    read from ``NVME / "caption-geometry"`` when that file exists, otherwise
    from ``geometry_seed0.json`` inside the run directory. Both source paths
    are appended to ``manifest["sources"]``. The table is written to
    ``out_dir / "embedding" / "caption_embedding_profile.tsv"``.
    """
    vendi_bounds = ("mean", "ci95_low", "ci95_high")
    geometry_columns = ("cov_effective_rank", "cov_participation_ratio", "cov_top1_mass")

    table: list[dict[str, Any]] = []
    for encoder, surface, rel in EMBEDDING_RUNS:
        run_dir = NVME / "caption-embeddings" / rel
        vendi_path = run_dir / "vendi_partition_b4096_seed0.json"
        rel_path = Path(rel)
        geometry_path = NVME / "caption-geometry" / rel_path.parent / f"{rel_path.name}.geometry.json"
        if not geometry_path.exists():
            geometry_path = run_dir / "geometry_seed0.json"

        vendi = load_json(vendi_path)
        geometry = load_json(geometry_path)
        # Metrics may be nested under "metrics" or sit at the top level.
        geometry_metrics = geometry.get("metrics", geometry)
        vendi_stats = vendi["summary"]["vendi"]

        record: dict[str, Any] = {
            "encoder": encoder,
            "surface": surface,
            "rows": vendi.get("source_rows"),
        }
        for bound in vendi_bounds:
            record[f"vendi_{bound}"] = f"{vendi_stats[bound]:.6f}"
        for column in geometry_columns:
            record[column] = f"{geometry_metrics.get(column, 0):.6f}"
        table.append(record)

        manifest["sources"].append(rel_or_abs(vendi_path))
        manifest["sources"].append(rel_or_abs(geometry_path))

    header = ["encoder", "surface", "rows"]
    header += [f"vendi_{bound}" for bound in vendi_bounds]
    header += list(geometry_columns)
    write_tsv(out_dir / "embedding" / "caption_embedding_profile.tsv", table, header)
115
+
116
+
117
def pack_support(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Write ``embedding/prompt_caption_support.tsv`` from the support JSON files.

    One row is produced per ``(protocol, surface, rel)`` entry in SUPPORT_RUNS;
    each source path is appended to ``manifest["sources"]``.
    """
    metric_columns = ("coverage", "density", "nn_cosine_mean", "nn_distance_p95")
    table: list[dict[str, Any]] = []
    for protocol, surface, rel in SUPPORT_RUNS:
        source = NVME / "prompt-caption-support" / rel
        payload = load_json(source)
        metrics = payload["metrics"]
        record: dict[str, Any] = {
            "protocol": protocol,
            "surface": surface,
            "prompt_rows": payload.get("query_rows"),
            "caption_rows": payload.get("gallery_rows"),
            "k": payload.get("k"),
        }
        for column in metric_columns:
            # Unlike the row counts above, the four metrics are mandatory (KeyError if absent).
            record[column] = f"{metrics[column]:.6f}"
        table.append(record)
        manifest["sources"].append(rel_or_abs(source))
    write_tsv(
        out_dir / "embedding" / "prompt_caption_support.tsv",
        table,
        ["protocol", "surface", "prompt_rows", "caption_rows", "k", *metric_columns],
    )
152
+
153
+
154
def pack_cpu(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Copy three small survey files into ``out_dir / "cpu"`` and record them.

    Each source is appended to ``manifest["sources"]`` and each copy to
    ``manifest["packed_files"]``. Copies keep the source file name.
    """
    cpu_dir = out_dir / "cpu"
    cpu_dir.mkdir(parents=True, exist_ok=True)
    survey_dir = NVME / "caption-survey"
    to_copy = (
        ROOT / "artifacts/caption-survey/cpu_remaining_2026-04-24/paired_delta_ci.tsv",
        survey_dir / "local_long_1m.json",
        survey_dir / "hf_manifest_1m.json",
    )
    for source in to_copy:
        copied = cpu_dir / source.name
        shutil.copy2(source, copied)
        manifest["sources"].append(rel_or_abs(source))
        manifest["packed_files"].append(rel_or_abs(copied))
167
+
168
+
169
def write_readme(out_dir: Path) -> None:
    """Write the fixed README.md describing the bundle into ``out_dir``."""
    readme_path = out_dir / "README.md"
    body = """# Recap E&D Metric Artifacts

Date: 2026-04-25

This directory contains small, paper-facing metric artifacts for the recap E&D draft.
Large intermediate embedding arrays, VLM response JSONL files, and source image data are
not copied here. The manifest records local source paths for reproducibility.

Contents:

- `cpu/paired_delta_ci.tsv`: paired CPU lexical/surface metric deltas with CIs.
- `cpu/local_long_1m.json`: local long-caption corpus survey summaries.
- `cpu/hf_manifest_1m.json`: public-reference corpus survey summaries.
- `cbu/claimed_cbu_ci.tsv`: caption-level bootstrap CIs for claimed CBU density.
- `cbu/grounded_cbu_ci.tsv`: caption-level bootstrap CIs for exact-unit grounded CBU audits.
- `cbu/grounded_cbu_category_ci.tsv`: category-level grounded CBU audit CIs.
- `embedding/caption_embedding_profile.tsv`: Vendi and covariance-geometry profiles.
- `embedding/prompt_caption_support.tsv`: PRDC-style prompt-in-caption support metrics.

Boundary:

- Text-only metrics describe caption/supervision structure.
- `GroundedCBU` is a sampled VLM proxy audit, not a human-certified faithfulness score.
- Embedding metrics are encoder-sensitive and should be reported as profiles, not a single scalar quality score.
"""
    readme_path.write_text(body, encoding="utf-8")
196
+
197
+
198
def main() -> int:
    """Build the metric bundle in ``--output-dir`` and write its manifest.

    Runs the three pack steps and the README writer, then records every file
    found under the output directory (except ``manifest.json`` itself) in
    ``manifest["packed_files"]``. Prints a short JSON summary and returns 0.
    """
    args = parse_args()
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    manifest: dict[str, Any] = {
        "date": "2026-04-25",
        "purpose": "paper-facing recap E&D metric artifact bundle",
        "sources": [],
        "packed_files": [],
    }
    pack_cpu(out_dir, manifest)
    pack_embedding(out_dir, manifest)
    pack_support(out_dir, manifest)
    write_readme(out_dir)
    on_disk = [
        rel_or_abs(path)
        for path in sorted(out_dir.rglob("*"))
        if path.is_file() and path.name != "manifest.json"
    ]
    # pack_cpu has already recorded the files it copied, and the directory scan
    # finds them again. dict.fromkeys drops the repeats while keeping first-seen
    # order, so the manifest and the printed count list each file once.
    manifest["packed_files"] = list(dict.fromkeys([*manifest["packed_files"], *on_disk]))
    (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    print(json.dumps({"output_dir": str(out_dir), "files": len(manifest["packed_files"])}, indent=2))
    return 0
220
+
221
+
222
+ if __name__ == "__main__":
223
+ raise SystemExit(main())
eval_code/scripts/plot_caption_survey_curves.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Plot budget-curve metrics from caption survey JSON outputs."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import matplotlib
12
+
13
+ matplotlib.use("Agg")
14
+ import matplotlib.pyplot as plt
15
+
16
+
17
+ METRICS = [
18
+ ("coverage_rate", "Budget Eligibility@B", "up"),
19
+ ("distinct_n.2", "Distinct-2@B", "up"),
20
+ ("distinct_n.3", "Distinct-3@B", "up"),
21
+ ("ngram_top_k_mass.2", "Top-100 Bigram Mass@B", "down"),
22
+ ("ngram_top_k_mass.3", "Top-100 Trigram Mass@B", "down"),
23
+ ("violation_rate", "Violation Rate@B", "down"),
24
+ ("repeated_4gram_rate", "Repeated 4-gram Rate@B", "down"),
25
+ ]
26
+
27
+
28
def parse_args() -> argparse.Namespace:
    """Parse the plotting CLI: repeatable inputs, output directory, regime threshold."""
    cli = argparse.ArgumentParser(description="Plot caption survey budget curves")
    option_specs = (
        ("--input", {"action": "append", "required": True, "help": "Survey JSON path (repeatable)"}),
        ("--output-dir", {"required": True, "help": "Directory for output PNG plots"}),
        (
            "--long-coverage-threshold",
            {
                "type": float,
                "default": 0.5,
                "help": "budget-eligibility@64 threshold used to split long vs short regimes",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
39
+
40
+
41
def nested_get(mapping: dict[str, Any], path: str) -> float | None:
    """Look up a dotted ``path`` in nested dicts and return the value as a float.

    Returns None when any path segment is missing, when an intermediate value
    is not a dict, or when the final value is not a real number. Booleans are
    rejected explicitly: ``bool`` is a subclass of ``int``, so a JSON
    ``true``/``false`` would otherwise be returned as 1.0/0.0.
    """
    current: Any = mapping
    for part in path.split("."):
        if not isinstance(current, dict) or part not in current:
            return None
        current = current[part]
    if isinstance(current, bool) or not isinstance(current, (int, float)):
        return None
    return float(current)
48
+
49
+
50
def _survey_row(
    length_controlled: dict[str, Any],
    full: dict[str, Any],
    identity: dict[str, Any],
) -> dict[str, Any] | None:
    """Assemble one plot row from a length-controlled summary, or None when it lists no budgets."""
    budgets = sorted(int(key) for key in length_controlled)
    if not budgets:
        return None
    cov64 = nested_get(length_controlled.get("64", {}), "coverage_rate") or 0.0
    avg_tokens = full.get("avg_tokens", full.get("avg_lexical_tokens", 0.0))
    return {
        **identity,
        # A JSON null average would make float() raise; treat it like a missing key.
        "avg_tokens": float(avg_tokens or 0.0),
        "coverage64": float(cov64),
        "budgets": budgets,
        "length_controlled": length_controlled,
    }


def load_rows(paths: list[str]) -> list[dict[str, Any]]:
    """Load survey JSON files and flatten them into plot rows.

    Two payload shapes are recognised:

    * a ``"results"`` list whose items carry ``summary`` (or ``survey_summary``)
      and an optional ``entry`` with naming metadata;
    * a single summary with a top-level ``"length_controlled"`` mapping, named
      after the file stem with ``_1m`` / ``_50k`` suffixes removed.

    Summaries without any budget are skipped, and so are payloads matching
    neither shape. Each row has the keys ``name``, ``family``, ``group``,
    ``description``, ``captioner``, ``avg_tokens``, ``coverage64``, ``budgets``
    and ``length_controlled``.
    """
    rows: list[dict[str, Any]] = []
    for raw_path in paths:
        payload = json.loads(Path(raw_path).read_text(encoding="utf-8"))
        if "results" in payload:
            for item in payload.get("results", []):
                summary = item.get("summary") or item.get("survey_summary")
                if not isinstance(summary, dict):
                    continue
                entry = item.get("entry") or {}
                row = _survey_row(
                    summary.get("length_controlled") or {},
                    summary.get("full_length_reference") or {},
                    {
                        "name": entry.get("name", Path(raw_path).stem),
                        "family": entry.get("source_family", "unknown"),
                        "group": entry.get("group", "unknown"),
                        "description": entry.get("description", ""),
                        "captioner": entry.get("captioner", ""),
                    },
                )
                if row is not None:
                    rows.append(row)
            continue

        if "length_controlled" in payload:
            name = Path(raw_path).stem.removesuffix("_1m").removesuffix("_50k")
            # Direct summaries carry no entry metadata, so the family is read off the file name.
            family = "unknown"
            if "datacomp" in name:
                family = "datacomp"
            elif "pd12m" in name:
                family = "pd12m"
            row = _survey_row(
                payload.get("length_controlled") or {},
                payload.get("full_length_reference") or {},
                {
                    "name": name,
                    "family": family,
                    "group": "direct_summary",
                    "description": "",
                    "captioner": "",
                },
            )
            if row is not None:
                rows.append(row)
    return rows
111
+
112
+
113
def label_for_row(row: dict[str, Any]) -> str:
    """Return the legend label for a row.

    ``ours_x`` becomes ``ours:x`` and ``ref_x`` becomes ``ref:x``; any other
    name is used unchanged. The row named ``ref_cc12m_qwen3vl8b`` gets a
    trailing dagger.
    """
    name = row["name"]
    label = name
    for prefix in ("ours_", "ref_"):
        if name.startswith(prefix):
            label = f"{prefix[:-1]}:{name[len(prefix):]}"
            break
    dagger = "†" if name == "ref_cc12m_qwen3vl8b" else ""
    return label + dagger
124
+
125
+
126
def decorate_metric_label(metric_label: str, direction: str) -> str:
    """Append an up arrow for direction ``"up"`` and a down arrow for anything else."""
    if direction == "up":
        return f"{metric_label} ↑"
    return f"{metric_label} ↓"
129
+
130
+
131
def style_for_row(row: dict[str, Any]) -> dict[str, Any]:
    """Return plot keyword arguments: thick solid lines for ``ours_*`` rows, thin dashed otherwise."""
    is_ours = row["name"].startswith("ours_")
    width, opacity, dash = (2.8, 0.95, "-") if is_ours else (1.6, 0.85, "--")
    return {"linewidth": width, "alpha": opacity, "linestyle": dash}
135
+
136
+
137
def series_for_metric(row: dict[str, Any], metric_key: str) -> tuple[list[int], list[float]]:
    """Return parallel lists of budgets and metric values for one row.

    Budgets whose summary has no numeric value at ``metric_key`` are left out.
    """
    by_budget = row["length_controlled"]
    looked_up = [
        (budget, nested_get(by_budget.get(str(budget), {}), metric_key))
        for budget in row["budgets"]
    ]
    present = [(budget, value) for budget, value in looked_up if value is not None]
    return [budget for budget, _ in present], [value for _, value in present]
148
+
149
+
150
def save_metric_plot(
    rows: list[dict[str, Any]],
    metric_key: str,
    metric_label: str,
    direction: str,
    regime_name: str,
    output_path: Path,
) -> None:
    """Plot one metric against token budget for every row and save it as a PNG.

    Rows are drawn in (family, name) order; rows without data for the metric
    are skipped. The parent directory of ``output_path`` is created on demand.
    """
    fig, ax = plt.subplots(figsize=(10.5, 6.2))
    try:
        for row in sorted(rows, key=lambda item: (item["family"], item["name"])):
            xs, ys = series_for_metric(row, metric_key)
            if not xs:
                continue
            ax.plot(xs, ys, marker="o", label=label_for_row(row), **style_for_row(row))

        decorated_label = decorate_metric_label(metric_label, direction)
        ax.set_title(f"{decorated_label} by Budget ({regime_name})")
        ax.set_xlabel("Token Budget")
        ax.set_ylabel(decorated_label)
        ax.set_xticks(sorted({budget for row in rows for budget in row["budgets"]}))
        ax.grid(True, alpha=0.25)
        # Only build a legend when something was drawn, matching save_family_plot;
        # an empty legend request makes matplotlib complain about missing labels.
        if ax.get_legend_handles_labels()[0]:
            ax.legend(fontsize=8, ncol=2)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.tight_layout()
        fig.savefig(output_path, dpi=180)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
176
+
177
+
178
def save_family_plot(rows: list[dict[str, Any]], family: str, output_path: Path) -> None:
    """Save a 2x3 grid of budget curves for the rows of one family.

    Only the first six entries of METRICS are drawn, one per panel. Nothing is
    written when no row belongs to ``family``. The shared legend is taken from
    the first panel.
    """
    members = sorted(
        (row for row in rows if row["family"] == family),
        key=lambda item: item["name"],
    )
    if not members:
        return
    fig, grid = plt.subplots(2, 3, figsize=(14, 8.5))
    panels = grid.flatten()
    for panel, (metric_key, metric_label, direction) in zip(panels, METRICS[:6], strict=False):
        for member in members:
            budgets, values = series_for_metric(member, metric_key)
            if budgets:
                panel.plot(budgets, values, marker="o", label=label_for_row(member), **style_for_row(member))
        panel.set_title(decorate_metric_label(metric_label, direction))
        panel.set_xlabel("Budget")
        panel.grid(True, alpha=0.25)
    handles, labels = panels[0].get_legend_handles_labels()
    if handles:
        fig.legend(handles, labels, loc="lower center", ncol=2, fontsize=8)
    fig.suptitle(f"{family} Budget Curves", y=0.98)
    # Leave room at the bottom for the legend and at the top for the title.
    fig.tight_layout(rect=(0, 0.05, 1, 0.96))
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path, dpi=180)
    plt.close(fig)
201
+
202
+
203
def main() -> int:
    """Load the survey rows, write overview and per-family plots, and emit a manifest.

    Rows are split into a long and a short regime by comparing ``coverage64``
    with ``--long-coverage-threshold``. Exits with a message when no rows load.
    """
    args = parse_args()
    rows = load_rows(args.input)
    if not rows:
        raise SystemExit("No survey rows loaded")

    output_dir = Path(args.output_dir)
    threshold = args.long_coverage_threshold
    long_rows = [row for row in rows if row["coverage64"] >= threshold]
    short_rows = [row for row in rows if row["coverage64"] < threshold]
    regimes = (("long", long_rows), ("short", short_rows))

    for metric_key, metric_label, direction in METRICS:
        filename = f"{metric_key.replace('.', '_')}.png"
        for regime, regime_rows in regimes:
            if not regime_rows:
                continue
            save_metric_plot(
                regime_rows,
                metric_key,
                metric_label,
                direction,
                f"{regime}-regime",
                output_dir / "overview" / regime / filename,
            )

    families = sorted({row["family"] for row in rows})
    for family in families:
        save_family_plot(rows, family, output_dir / "families" / f"{family}.png")

    manifest = {
        "inputs": args.input,
        "output_dir": str(output_dir),
        "long_coverage_threshold": threshold,
        "rows_loaded": len(rows),
        "long_rows": [row["name"] for row in long_rows],
        "short_rows": [row["name"] for row in short_rows],
        "metrics": [metric_key for metric_key, _, _ in METRICS],
    }
    manifest_path = output_dir / "plot_manifest.json"
    manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
    print(json.dumps(manifest, indent=2, ensure_ascii=False))
    return 0
248
+
249
+
250
+ if __name__ == "__main__":
251
+ raise SystemExit(main())
eval_code/scripts/run_cbu_vqa_requests.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run VQA-style CBU question requests against an OpenAI-compatible VLM server."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import asyncio
8
+ import base64
9
+ import json
10
+ import time
11
+ from io import BytesIO
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import aiohttp
16
+ from PIL import Image, ImageFile
17
+
18
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
19
+
20
+ ANSWERS = ["yes", "no", "uncertain"]
21
+
22
+
23
def parse_args() -> argparse.Namespace:
    """Parse the CLI for the CBU VQA request runner.

    ``--resume-ok-only`` is rejected unless ``--resume`` is also given: on its
    own it has no effect on which requests are skipped, and the output file
    would be opened for writing and truncated.
    """
    parser = argparse.ArgumentParser(description="Run CBU VQA requests")
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--urls", default="http://localhost:8000")
    parser.add_argument("--model", default="Qwen/Qwen3.5-397B-A17B-FP8")
    parser.add_argument("--max-requests", type=int, default=None)
    parser.add_argument("--concurrency", type=int, default=512)
    parser.add_argument("--max-tokens", type=int, default=2048)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--timeout-sec", type=int, default=2400)
    parser.add_argument("--image-mode", choices=["auto", "file", "data", "url"], default="file")
    parser.add_argument("--structured-json", action="store_true")
    parser.add_argument(
        "--no-evidence",
        action="store_true",
        help="Use compact answer-only schema: question_id, answer, confidence.",
    )
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--resume-ok-only", action="store_true")
    parser.add_argument("--skip-ok-from", default=None)
    args = parser.parse_args()
    if args.resume_ok_only and not args.resume:
        parser.error("--resume-ok-only requires --resume")
    return args
45
+
46
+
47
def iter_requests(path: Path, max_requests: int | None) -> list[dict[str, Any]]:
    """Read a JSONL file and return its non-blank records as a list.

    Reading stops once ``max_requests`` records have been collected; ``None``
    means no limit.
    """
    collected: list[dict[str, Any]] = []
    capped = max_requests is not None
    with path.open("r", encoding="utf-8") as stream:
        for raw in stream:
            if capped and len(collected) >= max_requests:
                break
            if not raw.strip():
                continue
            collected.append(json.loads(raw))
    return collected
56
+
57
+
58
def image_url_for(row: dict[str, Any], mode: str) -> str:
    """Return the image reference to embed in the chat request for ``row``.

    * ``auto`` / ``data`` with an ``image_path``: the image is re-encoded as an
      RGB JPEG and returned as a base64 ``data:`` URL.
    * ``file``: the resolved ``image_path`` as a ``file://`` URI.
    * otherwise (``url``, or ``auto`` / ``data`` without an ``image_path``):
      the row's ``image_url``.

    Raises:
        ValueError: if the field required by the selected mode is missing or
            empty. A missing ``image_url`` used to surface as a bare KeyError,
            and an empty one was passed through to the server.
    """
    image_path = row.get("image_path")
    if image_path and mode in {"auto", "data"}:
        with Image.open(Path(image_path)) as image:
            rgb = image if image.mode == "RGB" else image.convert("RGB")
            buffer = BytesIO()
            rgb.save(buffer, format="JPEG", quality=88)
        encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
        return f"data:image/jpeg;base64,{encoded}"
    if mode == "file":
        if not image_path:
            raise ValueError(f"request {row.get('request_id')} has no image_path")
        return Path(image_path).resolve().as_uri()
    image_url = row.get("image_url")
    if not image_url:
        raise ValueError(f"request {row.get('request_id')} has no image_url (image mode {mode!r})")
    return image_url
72
+
73
+
74
def response_schema(question_ids: list[str], include_evidence: bool) -> dict[str, Any]:
    """Build the JSON schema for a VQA response covering ``question_ids``.

    The response must be an object with ``caption_id`` and a
    ``question_results`` array holding exactly one entry per question id. Each
    entry needs ``question_id``, ``answer`` and ``confidence``, plus
    ``evidence`` when ``include_evidence`` is true.
    """
    fields: dict[str, Any] = {
        "question_id": {"type": "string", "enum": question_ids},
        "answer": {"type": "string", "enum": ANSWERS},
        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
    }
    if include_evidence:
        fields["evidence"] = {"type": "string", "maxLength": 160}
    expected_count = len(question_ids)
    result_item = {
        "type": "object",
        "properties": fields,
        # Every declared field is required, in declaration order.
        "required": list(fields),
        "additionalProperties": False,
    }
    return {
        "type": "object",
        "properties": {
            "caption_id": {"type": "string"},
            "question_results": {
                "type": "array",
                "minItems": expected_count,
                "maxItems": expected_count,
                "items": result_item,
            },
        },
        "required": ["caption_id", "question_results"],
        "additionalProperties": False,
    }
103
+
104
+
105
def validate(parsed: Any, row: dict[str, Any], include_evidence: bool) -> str | None:
    """Check a parsed VQA response against the request; return an error string or None.

    Mirrors the structured-output schema, so responses produced without
    ``--structured-json`` are held to the same rules:

    * ``answer`` must be one of ANSWERS. Non-string values are rejected before
      the membership test, because an unhashable value would raise TypeError.
    * ``confidence`` must be a real number in [0, 1]; booleans are not numbers.
    * duplicate question ids are reported before the set comparison, which
      would otherwise report them as a mismatch.
    """
    if not isinstance(parsed, dict):
        return "top-level response is not an object"
    if not isinstance(parsed.get("caption_id"), str):
        return "caption_id is not a string"
    results = parsed.get("question_results")
    if not isinstance(results, list):
        return "question_results is not an array"
    expected = [question["question_id"] for question in row.get("questions", [])]
    seen: list[str] = []
    for index, result in enumerate(results):
        where = f"question_results[{index}]"
        if not isinstance(result, dict):
            return f"{where} is not an object"
        question_id = result.get("question_id")
        if not isinstance(question_id, str):
            return f"{where}.question_id is not a string"
        seen.append(question_id)
        answer = result.get("answer")
        if not isinstance(answer, str) or answer not in ANSWERS:
            return f"{where}.answer has invalid value"
        confidence = result.get("confidence")
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            return f"{where}.confidence is not numeric"
        if not 0.0 <= confidence <= 1.0:
            return f"{where}.confidence is outside [0, 1]"
        if include_evidence and not isinstance(result.get("evidence"), str):
            return f"{where}.evidence is not a string"
    if len(seen) != len(set(seen)):
        return "duplicate question_id in response"
    if sorted(seen) != sorted(expected):
        return f"question_id set mismatch: expected={len(expected)} seen={len(seen)}"
    return None
133
+
134
+
135
def payload_for(row: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]:
    """Build the chat-completions request body for one VQA request row.

    With ``--no-evidence`` the evidence instruction in the user prompt is
    swapped for an answer-only instruction. With ``--structured-json`` the
    response schema is attached under ``structured_outputs``.
    """
    question_ids = [question["question_id"] for question in row.get("questions", [])]
    prompt_text = row["user_prompt"]
    if args.no_evidence:
        # NOTE(review): if the prompt lacks this exact sentence the replacement
        # is a no-op -- confirm the request builder always emits it.
        prompt_text = prompt_text.replace(
            "- Keep evidence short and grounded in visible image content.\n",
            "- Return only question_id, answer, and confidence for each question; do not include evidence text.\n",
        )
    system_message = {"role": "system", "content": row["system_prompt"]}
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt_text},
            {"type": "image_url", "image_url": {"url": image_url_for(row, args.image_mode)}},
        ],
    }
    request_body: dict[str, Any] = {
        "model": args.model,
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "messages": [system_message, user_message],
        "chat_template_kwargs": {"enable_thinking": False},
    }
    if args.structured_json:
        schema = response_schema(question_ids, include_evidence=not args.no_evidence)
        request_body["structured_outputs"] = {"json": schema}
    return request_body
162
+
163
+
164
async def post_one(session: aiohttp.ClientSession, url: str, row: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]:
    """Send one request to ``url`` and return a result record; request errors are captured, not raised.

    The record always carries ``request_id``, ``ok``, ``status``,
    ``elapsed_sec`` and the original ``request``. HTTP errors and exceptions
    add ``error``; completed calls add the raw and parsed model output plus any
    parse or schema error.
    """
    endpoint = url.rstrip("/") + "/v1/chat/completions"
    started = time.perf_counter()
    try:
        auth_headers = {"Authorization": "Bearer sk-fake"}
        async with session.post(endpoint, json=payload_for(row, args), headers=auth_headers) as response:
            text = await response.text()
            elapsed = time.perf_counter() - started
            if response.status >= 400:
                return {
                    "request_id": row["request_id"],
                    "ok": False,
                    "status": response.status,
                    "elapsed_sec": round(elapsed, 4),
                    "error": text[:4000],
                    "request": row,
                }
            body = json.loads(text)
            content = body["choices"][0]["message"]["content"]
            parsed = None
            parse_error = None
            schema_error = None
            try:
                parsed = json.loads(content)
                schema_error = validate(parsed, row, include_evidence=not args.no_evidence)
            except Exception as exc:  # noqa: BLE001
                parse_error = repr(exc)
            succeeded = parse_error is None and schema_error is None
            return {
                "request_id": row["request_id"],
                "ok": succeeded,
                "status": response.status,
                "elapsed_sec": round(elapsed, 4),
                "model": args.model,
                "usage": body.get("usage", {}),
                "response_text": content,
                "parsed": parsed,
                "parse_error": parse_error,
                "schema_error": schema_error,
                "request": row,
            }
    except Exception as exc:  # noqa: BLE001
        # Failures from payload building, the HTTP call or body decoding land here.
        return {
            "request_id": row["request_id"],
            "ok": False,
            "status": None,
            "elapsed_sec": round(time.perf_counter() - started, 4),
            "error": repr(exc),
            "request": row,
        }
198
+
199
+
200
def load_seen(args: argparse.Namespace, output: Path) -> set[str]:
    """Collect the request ids that should be skipped on this run.

    Ids come from ``--skip-ok-from`` (successful records only) and, with
    ``--resume``, from the existing output file (all records, or successful
    ones only with ``--resume-ok-only``). Lines that are blank, not valid JSON,
    or not JSON objects are ignored.
    """
    seen: set[str] = set()
    paths: list[Path] = []
    if args.skip_ok_from:
        paths.append(Path(args.skip_ok_from))
    if args.resume and output.exists():
        paths.append(output)
    for path in paths:
        # Foreign logs always count successes only; the output file does so on request.
        ok_only = path != output or args.resume_ok_only
        with path.open("r", encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                try:
                    row = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if not isinstance(row, dict):
                    # Valid JSON that is not an object has no fields to inspect.
                    continue
                if ok_only and not row.get("ok"):
                    continue
                request_id = row.get("request_id")
                if isinstance(request_id, str):
                    seen.add(request_id)
    return seen
222
+
223
+
224
async def run(args: argparse.Namespace) -> int:
    """Send all pending requests and write one JSON result per line to the output file.

    Requests are spread round-robin over the comma-separated ``--urls`` and at
    most ``--concurrency`` of them are in flight at once. Results are written
    in completion order and flushed after every record. Returns 0.

    Raises:
        SystemExit: if ``--urls`` contains no usable URL. Without this check
            the round-robin index would divide by zero inside the tasks, after
            the output file had already been opened.
    """
    urls = [item.strip() for item in args.urls.split(",") if item.strip()]
    if not urls:
        raise SystemExit("--urls must name at least one server URL")
    rows = iter_requests(Path(args.input), args.max_requests)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    seen_request_ids = load_seen(args, output)
    rows = [row for row in rows if row.get("request_id") not in seen_request_ids]
    timeout = aiohttp.ClientTimeout(total=args.timeout_sec)
    connector = aiohttp.TCPConnector(limit=args.concurrency)
    sem = asyncio.Semaphore(args.concurrency)
    ok = 0
    total = 0
    # Append when resuming so earlier results are kept; otherwise start fresh.
    mode = "a" if args.resume else "w"
    with output.open(mode, encoding="utf-8") as handle:
        async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:

            async def guarded(index: int, row: dict[str, Any]) -> dict[str, Any]:
                async with sem:
                    return await post_one(session, urls[index % len(urls)], row, args)

            tasks = [asyncio.create_task(guarded(index, row)) for index, row in enumerate(rows)]
            for task in asyncio.as_completed(tasks):
                result = await task
                handle.write(json.dumps(result, ensure_ascii=False) + "\n")
                handle.flush()
                total += 1
                ok += int(bool(result.get("ok")))
                if total % 10 == 0 or total == len(rows):
                    print(json.dumps({"completed": total, "ok": ok, "total": len(rows), "skipped_existing": len(seen_request_ids)}, ensure_ascii=False))
    print(json.dumps({"output": str(output), "completed": total, "ok": ok, "skipped_existing": len(seen_request_ids)}, indent=2))
    return 0
254
+
255
+
256
def main() -> int:
    """Parse the command line and drive the async runner to completion."""
    options = parse_args()
    return asyncio.run(run(options))
258
+
259
+
260
+ if __name__ == "__main__":
261
+ raise SystemExit(main())
eval_code/scripts/run_grounded_cbu_verify_requests.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run exact-unit grounded-CBU verification requests against vLLM."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import asyncio
8
+ import base64
9
+ import json
10
+ import time
11
+ from io import BytesIO
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ import aiohttp
16
+ from PIL import Image, ImageFile
17
+
18
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
19
+
20
+ STATUSES = [
21
+ "grounded",
22
+ "unsupported",
23
+ "uncertain",
24
+ "invalid_text_unit",
25
+ "not_a_visual_claim",
26
+ "image_unavailable",
27
+ ]
28
+
29
+
30
def parse_args() -> argparse.Namespace:
    """Parse the CLI for the grounded-CBU verification runner.

    ``--resume-ok-only`` is rejected unless ``--resume`` is also given: on its
    own it has no effect on which requests are skipped, so earlier results
    would not be kept.
    """
    parser = argparse.ArgumentParser(description="Run exact-unit grounded-CBU verification requests")
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--urls", default="http://localhost:8000")
    parser.add_argument("--model", default="Qwen/Qwen3.5-122B-A10B-FP8")
    parser.add_argument("--max-requests", type=int, default=None)
    parser.add_argument("--concurrency", type=int, default=32)
    parser.add_argument("--max-tokens", type=int, default=2048)
    parser.add_argument("--temperature", type=float, default=0.0)
    parser.add_argument("--timeout-sec", type=int, default=600)
    parser.add_argument("--image-mode", choices=["auto", "file", "data", "url"], default="auto")
    parser.add_argument("--structured-json", action="store_true")
    parser.add_argument("--resume", action="store_true", help="Append to output and skip request_ids already present.")
    parser.add_argument(
        "--resume-ok-only",
        action="store_true",
        help="With --resume, skip only previously successful request_ids so timeout/schema failures are retried.",
    )
    parser.add_argument(
        "--skip-ok-from",
        default=None,
        help="JSONL response log whose successful request_ids should be skipped while writing a separate output.",
    )
    args = parser.parse_args()
    if args.resume_ok_only and not args.resume:
        parser.error("--resume-ok-only requires --resume")
    return args
55
+
56
+
57
def iter_requests(path: Path, max_requests: int | None) -> list[dict[str, Any]]:
    """Read a JSONL file and return its non-blank records as a list.

    Reading stops once ``max_requests`` records have been collected; ``None``
    means no limit.
    """
    collected: list[dict[str, Any]] = []
    capped = max_requests is not None
    with path.open("r", encoding="utf-8") as stream:
        for raw in stream:
            if capped and len(collected) >= max_requests:
                break
            if not raw.strip():
                continue
            collected.append(json.loads(raw))
    return collected
66
+
67
+
68
def image_url_for(row: dict[str, Any], mode: str) -> str:
    """Return the image reference to embed in the chat request for ``row``.

    * ``auto`` / ``data`` with an ``image_path``: the image is re-encoded as an
      RGB JPEG and returned as a base64 ``data:`` URL.
    * ``file``: the resolved ``image_path`` as a ``file://`` URI.
    * otherwise (``url``, or ``auto`` / ``data`` without an ``image_path``):
      the row's ``image_url``.

    Raises:
        ValueError: if the field required by the selected mode is missing or
            empty. A missing ``image_url`` used to surface as a bare KeyError,
            and an empty one was passed through to the server.
    """
    image_path = row.get("image_path")
    if image_path and mode in {"auto", "data"}:
        with Image.open(Path(image_path)) as image:
            rgb = image if image.mode == "RGB" else image.convert("RGB")
            buffer = BytesIO()
            rgb.save(buffer, format="JPEG", quality=88)
        encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
        return f"data:image/jpeg;base64,{encoded}"
    if mode == "file":
        if not image_path:
            raise ValueError(f"request {row.get('request_id')} has no image_path")
        return Path(image_path).resolve().as_uri()
    image_url = row.get("image_url")
    if not image_url:
        raise ValueError(f"request {row.get('request_id')} has no image_url (image mode {mode!r})")
    return image_url
82
+
83
+
84
def response_schema(unit_ids: list[str]) -> dict[str, Any]:
    """Build the JSON schema for a verification response covering ``unit_ids``.

    The response must be an object with ``caption_id`` and a ``unit_results``
    array holding exactly one entry per unit id; each entry needs ``unit_id``,
    ``status``, ``confidence`` and ``evidence``.
    """
    fields: dict[str, Any] = {
        "unit_id": {"type": "string", "enum": unit_ids},
        "status": {"type": "string", "enum": STATUSES},
        "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
        "evidence": {"type": "string", "maxLength": 180},
    }
    expected_count = len(unit_ids)
    result_item = {
        "type": "object",
        "properties": fields,
        # Every declared field is required, in declaration order.
        "required": list(fields),
        "additionalProperties": False,
    }
    return {
        "type": "object",
        "properties": {
            "caption_id": {"type": "string"},
            "unit_results": {
                "type": "array",
                "minItems": expected_count,
                "maxItems": expected_count,
                "items": result_item,
            },
        },
        "required": ["caption_id", "unit_results"],
        "additionalProperties": False,
    }
109
+
110
+
111
def validate(parsed: Any, row: dict[str, Any]) -> str | None:
    """Check a parsed verification response against the request; return an error string or None.

    Mirrors the structured-output schema, so responses produced without
    ``--structured-json`` are held to the same rules:

    * ``status`` must be one of STATUSES. Non-string values are rejected before
      the membership test, because an unhashable value would raise TypeError.
    * ``confidence`` must be a real number in [0, 1]; booleans are not numbers.
    * duplicate unit ids are reported before the set comparison, which would
      otherwise report them as a mismatch.
    """
    if not isinstance(parsed, dict):
        return "top-level response is not an object"
    if not isinstance(parsed.get("caption_id"), str):
        return "caption_id is not a string"
    results = parsed.get("unit_results")
    if not isinstance(results, list):
        return "unit_results is not an array"
    expected = [unit["unit_id"] for unit in row.get("claimed_units", [])]
    seen: list[str] = []
    for index, result in enumerate(results):
        where = f"unit_results[{index}]"
        if not isinstance(result, dict):
            return f"{where} is not an object"
        unit_id = result.get("unit_id")
        if not isinstance(unit_id, str):
            return f"{where}.unit_id is not a string"
        seen.append(unit_id)
        status = result.get("status")
        if not isinstance(status, str) or status not in STATUSES:
            return f"{where}.status has invalid value"
        confidence = result.get("confidence")
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            return f"{where}.confidence is not numeric"
        if not 0.0 <= confidence <= 1.0:
            return f"{where}.confidence is outside [0, 1]"
        if not isinstance(result.get("evidence"), str):
            return f"{where}.evidence is not a string"
    if len(seen) != len(set(seen)):
        return "duplicate unit_id in response"
    if sorted(seen) != sorted(expected):
        return f"unit_id set mismatch: expected={len(expected)} seen={len(seen)}"
    return None
139
+
140
+
141
def payload_for(row: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]:
    """Build the chat-completions request body for one verification request row.

    With ``--structured-json`` the response schema for the row's claimed units
    is attached under ``structured_outputs``.
    """
    unit_ids = [unit["unit_id"] for unit in row.get("claimed_units", [])]
    system_message = {"role": "system", "content": row["system_prompt"]}
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": row["user_prompt"]},
            {"type": "image_url", "image_url": {"url": image_url_for(row, args.image_mode)}},
        ],
    }
    request_body: dict[str, Any] = {
        "model": args.model,
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "messages": [system_message, user_message],
        "chat_template_kwargs": {"enable_thinking": False},
    }
    if args.structured_json:
        request_body["structured_outputs"] = {"json": response_schema(unit_ids)}
    return request_body
162
+
163
+
164
async def post_one(session: aiohttp.ClientSession, url: str, row: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]:
    """POST one request and return a JSON-serialisable result record.

    Every failure mode (HTTP status >= 400, undecodable body, transport
    error) is converted into a record with ``ok`` set to ``False`` rather
    than raised.
    """
    endpoint = f"{url.rstrip('/')}/v1/chat/completions"
    started = time.perf_counter()
    auth_headers = {"Authorization": "Bearer sk-fake"}
    try:
        async with session.post(endpoint, json=payload_for(row, args), headers=auth_headers) as response:
            raw_body = await response.text()
            elapsed = time.perf_counter() - started
            if response.status >= 400:
                return {
                    "request_id": row["request_id"],
                    "ok": False,
                    "status": response.status,
                    "elapsed_sec": round(elapsed, 4),
                    "error": raw_body[:4000],  # keep the log line bounded
                    "request": row,
                }
            body = json.loads(raw_body)
            content = body["choices"][0]["message"]["content"]
            parsed = parse_error = schema_error = None
            try:
                parsed = json.loads(content)
                schema_error = validate(parsed, row)
            except Exception as exc:  # noqa: BLE001
                parse_error = repr(exc)
            succeeded = parse_error is None and schema_error is None
            return {
                "request_id": row["request_id"],
                "ok": succeeded,
                "status": response.status,
                "elapsed_sec": round(elapsed, 4),
                "model": args.model,
                "usage": body.get("usage", {}),
                "response_text": content,
                "parsed": parsed,
                "parse_error": parse_error,
                "schema_error": schema_error,
                "request": row,
            }
    except Exception as exc:  # noqa: BLE001
        # Transport errors, timeouts and malformed envelopes all land here.
        return {
            "request_id": row["request_id"],
            "ok": False,
            "status": None,
            "elapsed_sec": round(time.perf_counter() - started, 4),
            "error": repr(exc),
            "request": row,
        }
212
+
213
+
214
def _logged_request_ids(path: Path, ok_only: bool) -> set[str]:
    """Collect string request_ids from a JSONL response log.

    Blank lines, lines that are not valid JSON, and lines that do not decode
    to an object are ignored. With ``ok_only`` set, only entries whose ``ok``
    field is truthy are counted.
    """
    found: set[str] = set()
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(entry, dict):
                continue
            if ok_only and not entry.get("ok"):
                continue
            request_id = entry.get("request_id")
            if isinstance(request_id, str):
                found.add(request_id)
    return found


async def run(args: argparse.Namespace) -> int:
    """Send every pending request and stream results to the output JSONL.

    Requests are distributed round-robin over the comma-separated
    ``args.urls``. Request ids found in ``--skip-ok-from`` (successful
    entries only) and, with ``--resume``, in the existing output file are
    skipped. Results are written in completion order and flushed after each
    line.

    Returns:
        Always ``0``.

    Raises:
        SystemExit: if ``args.urls`` contains no usable endpoint.
    """
    urls = [item.strip() for item in args.urls.split(",") if item.strip()]
    if not urls:
        # Otherwise every task would fail on `index % len(urls)`.
        raise SystemExit("--urls must contain at least one endpoint URL")
    all_rows = iter_requests(Path(args.input), args.max_requests)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    seen_request_ids: set[str] = set()
    if args.skip_ok_from:
        seen_request_ids |= _logged_request_ids(Path(args.skip_ok_from), ok_only=True)
    if args.resume and output.exists():
        seen_request_ids |= _logged_request_ids(output, ok_only=args.resume_ok_only)
    rows = [row for row in all_rows if row.get("request_id") not in seen_request_ids]
    # Number of input rows actually skipped, not the size of the id set
    # (the logs may mention ids that are absent from this input).
    skipped = len(all_rows) - len(rows)
    timeout = aiohttp.ClientTimeout(total=args.timeout_sec)
    connector = aiohttp.TCPConnector(limit=args.concurrency)
    sem = asyncio.Semaphore(args.concurrency)
    ok = 0
    total = 0
    mode = "a" if args.resume else "w"
    with output.open(mode, encoding="utf-8") as handle:
        async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
            async def guarded(index: int, row: dict[str, Any]) -> dict[str, Any]:
                async with sem:
                    return await post_one(session, urls[index % len(urls)], row, args)

            tasks = [asyncio.create_task(guarded(index, row)) for index, row in enumerate(rows)]
            for task in asyncio.as_completed(tasks):
                result = await task
                handle.write(json.dumps(result, ensure_ascii=False) + "\n")
                handle.flush()
                total += 1
                ok += int(bool(result.get("ok")))
                if total % 10 == 0 or total == len(rows):
                    print(
                        json.dumps(
                            {
                                "completed": total,
                                "ok": ok,
                                "total": len(rows),
                                "skipped_existing": skipped,
                            },
                            ensure_ascii=False,
                        )
                    )
    print(json.dumps({"output": str(output), "completed": total, "ok": ok, "skipped_existing": skipped}, indent=2))
    return 0
282
+
283
+
284
def main() -> int:
    """Parse the command line and drive the async runner to completion."""
    cli_args = parse_args()
    exit_code = asyncio.run(run(cli_args))
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())
eval_code/scripts/run_text_json_requests.py ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run text-only structured JSON requests against OpenAI-compatible endpoints."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import asyncio
8
+ import json
9
+ import time
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ import aiohttp
14
+
15
+ from build_caption_cbu_requests import CBU_JSON_SCHEMA, UNIT_CATEGORIES
16
+
17
+
18
def request_schema(row: dict[str, Any]) -> dict[str, Any]:
    """Pick the JSON schema to enforce for one request row.

    Resolution order:
      1. ``row["schema"]`` when it is a dict;
      2. the first JSON object that follows the marker line
         ``Return only JSON matching this schema:`` in ``row["user_prompt"]``;
      3. the module-level ``CBU_JSON_SCHEMA`` fallback.
    """
    manifest_schema = row.get("schema")
    if isinstance(manifest_schema, dict):
        return manifest_schema
    prompt = row.get("user_prompt", "")
    if isinstance(prompt, str):
        marker = "Return only JSON matching this schema:\n"
        _, found, rest = prompt.partition(marker)
        if found:
            try:
                # raw_decode reads exactly one JSON value and ignores whatever
                # follows, so the schema may contain blank lines and need not
                # be followed by a paragraph break.
                parsed, _end = json.JSONDecoder().raw_decode(rest.lstrip())
            except (ValueError, RecursionError):
                # Not a JSON schema after the marker: use the default.
                parsed = None
            if isinstance(parsed, dict):
                return parsed
    return CBU_JSON_SCHEMA
35
+
36
+
37
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options of the text-only runner."""
    cli = argparse.ArgumentParser(description="Run text-only JSON-schema requests")
    for flag in ("--input", "--output"):
        cli.add_argument(flag, required=True)
    cli.add_argument("--urls", default="http://localhost:8000")
    cli.add_argument("--model", default="Qwen/Qwen3.5-35B-A3B")
    for flag, default in (("--max-requests", None), ("--concurrency", 8), ("--max-tokens", 1024)):
        cli.add_argument(flag, type=int, default=default)
    cli.add_argument("--temperature", type=float, default=0.0)
    cli.add_argument("--timeout-sec", type=int, default=240)
    for flag in ("--thinking", "--structured-json", "--response-format-schema", "--response-format-json"):
        cli.add_argument(flag, action="store_true")
    cli.add_argument(
        "--resume",
        action="store_true",
        help="Append to output and skip previously seen request_ids.",
    )
    cli.add_argument(
        "--resume-ok-only",
        action="store_true",
        help="With --resume, skip only previously successful request_ids so failures are retried.",
    )
    cli.add_argument(
        "--skip-ok-from",
        default=None,
        help="JSONL response log whose successful request_ids should be skipped while writing a separate output.",
    )
    return cli.parse_args()
64
+
65
+
66
def iter_requests(path: Path, max_requests: int | None) -> list[dict[str, Any]]:
    """Load request rows from a JSONL file, skipping blank lines.

    Reading stops as soon as ``max_requests`` rows have been collected;
    ``None`` means no limit.
    """
    loaded = []
    capped = max_requests is not None
    with path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            if capped and len(loaded) >= max_requests:
                break
            if not raw_line.strip():
                continue
            loaded.append(json.loads(raw_line))
    return loaded
75
+
76
+
77
def validate_cbu_response(parsed: Any) -> str | None:
    """Return the first structural problem in a claimed-CBU response, or None.

    Each ``claimed_units`` entry must be an object with exactly the fields
    ``category``, ``unit``, ``span`` and ``target``; ``category`` must be one
    of ``UNIT_CATEGORIES`` and the other three must be strings.
    """
    if not isinstance(parsed, dict):
        return "top-level response is not an object"
    if not isinstance(parsed.get("caption_id"), str):
        return "caption_id is not a string"
    claimed = parsed.get("claimed_units")
    if not isinstance(claimed, list):
        return "claimed_units is not an array"
    required_fields = ["category", "unit", "span", "target"]
    text_fields = ["unit", "span", "target"]
    for position, entry in enumerate(claimed):
        label = f"claimed_units[{position}]"
        if not isinstance(entry, dict):
            return f"{label} is not an object"
        unexpected = sorted(set(entry) - set(required_fields))
        if unexpected:
            return f"{label} has unexpected fields: {unexpected}"
        absent = [name for name in required_fields if name not in entry]
        if absent:
            return f"{label} is missing fields: {absent}"
        if entry["category"] not in UNIT_CATEGORIES:
            return f"{label}.category has invalid value"
        for name in text_fields:
            if not isinstance(entry[name], str):
                return f"{label}.{name} is not a string"
    return None
100
+
101
+
102
def payload_for(row: dict[str, Any], args: argparse.Namespace) -> dict[str, Any]:
    """Assemble the chat-completions request body for one text-only row.

    Schema enforcement is opt-in through three flags. When both
    ``--response-format-schema`` and ``--response-format-json`` are given,
    the plain ``json_object`` format wins because it is assigned last.
    """
    conversation = [
        {"role": "system", "content": row["system_prompt"]},
        {"role": "user", "content": row["user_prompt"]},
    ]
    payload: dict[str, Any] = {
        "model": args.model,
        "max_tokens": args.max_tokens,
        "temperature": args.temperature,
        "messages": conversation,
        "chat_template_kwargs": {"enable_thinking": args.thinking},
    }
    if args.structured_json:
        payload["structured_outputs"] = {"json": request_schema(row)}
    if args.response_format_schema:
        named_schema = {"name": "claimed_cbu", "schema": request_schema(row)}
        payload["response_format"] = {"type": "json_schema", "json_schema": named_schema}
    if args.response_format_json:
        payload["response_format"] = {"type": "json_object"}
    return payload
123
+
124
+
125
async def post_one(
    session: aiohttp.ClientSession,
    url: str,
    row: dict[str, Any],
    args: argparse.Namespace,
) -> dict[str, Any]:
    """POST one request and return a JSON-serialisable result record.

    Every failure, including a row from which no payload can be built, an
    HTTP status >= 400, an undecodable body, or a transport error, is turned
    into a record with ``ok`` set to ``False`` so one bad row cannot abort
    the whole batch.
    """
    endpoint = f"{url.rstrip('/')}/v1/chat/completions"
    # .get keeps the error record writable even when the row has no id.
    request_id = row.get("request_id")
    start = time.perf_counter()
    try:
        # Built inside the try so a row missing e.g. "system_prompt" is
        # logged as a failed request instead of raising out of the task.
        payload = payload_for(row, args)
        async with session.post(endpoint, json=payload, headers={"Authorization": "Bearer sk-fake"}) as response:
            text = await response.text()
            elapsed = time.perf_counter() - start
            if response.status >= 400:
                return {
                    "request_id": request_id,
                    "ok": False,
                    "status": response.status,
                    "elapsed_sec": round(elapsed, 4),
                    "error": text[:4000],  # keep the log line bounded
                    "request": row,
                }
            body = json.loads(text)
            content = body["choices"][0]["message"]["content"]
            parsed = None
            parse_error = None
            schema_error = None
            try:
                parsed = json.loads(content)
                schema_error = validate_cbu_response(parsed)
            except Exception as exc:  # noqa: BLE001
                parse_error = repr(exc)
            return {
                "request_id": request_id,
                "ok": parse_error is None and schema_error is None,
                "status": response.status,
                "elapsed_sec": round(elapsed, 4),
                "model": args.model,
                "usage": body.get("usage", {}),
                "response_text": content,
                "parsed": parsed,
                "parse_error": parse_error,
                "schema_error": schema_error,
                "request": row,
            }
    except Exception as exc:  # noqa: BLE001
        return {
            "request_id": request_id,
            "ok": False,
            "status": None,
            "elapsed_sec": round(time.perf_counter() - start, 4),
            "error": repr(exc),
            "request": row,
        }
179
+
180
+
181
def _logged_request_ids(path: Path, ok_only: bool) -> set[str]:
    """Collect string request_ids from a JSONL response log.

    Blank lines, lines that are not valid JSON, and lines that do not decode
    to an object are ignored. With ``ok_only`` set, only entries whose ``ok``
    field is truthy are counted.
    """
    found: set[str] = set()
    with path.open("r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(entry, dict):
                continue
            if ok_only and not entry.get("ok"):
                continue
            request_id = entry.get("request_id")
            if isinstance(request_id, str):
                found.add(request_id)
    return found


async def run(args: argparse.Namespace) -> int:
    """Send every pending request and stream results to the output JSONL.

    Requests are distributed round-robin over the comma-separated
    ``args.urls``. Request ids found in ``--skip-ok-from`` (successful
    entries only) and, with ``--resume``, in the existing output file are
    skipped. Results are written in completion order and flushed after each
    line.

    Returns:
        Always ``0``.

    Raises:
        SystemExit: if ``args.urls`` contains no usable endpoint.
    """
    urls = [item.strip() for item in args.urls.split(",") if item.strip()]
    if not urls:
        # Otherwise every task would fail on `index % len(urls)`.
        raise SystemExit("--urls must contain at least one endpoint URL")
    all_rows = iter_requests(Path(args.input), args.max_requests)
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    seen_request_ids: set[str] = set()
    if args.skip_ok_from:
        seen_request_ids |= _logged_request_ids(Path(args.skip_ok_from), ok_only=True)
    if args.resume and output.exists():
        seen_request_ids |= _logged_request_ids(output, ok_only=args.resume_ok_only)
    rows = [row for row in all_rows if row.get("request_id") not in seen_request_ids]
    # Number of input rows actually skipped, not the size of the id set
    # (the logs may mention ids that are absent from this input).
    skipped = len(all_rows) - len(rows)
    timeout = aiohttp.ClientTimeout(total=args.timeout_sec)
    connector = aiohttp.TCPConnector(limit=args.concurrency)
    sem = asyncio.Semaphore(args.concurrency)
    ok = 0
    total = 0
    mode = "a" if args.resume else "w"
    with output.open(mode, encoding="utf-8") as handle:
        async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
            async def guarded(index: int, row: dict[str, Any]) -> dict[str, Any]:
                async with sem:
                    return await post_one(session, urls[index % len(urls)], row, args)

            tasks = [asyncio.create_task(guarded(index, row)) for index, row in enumerate(rows)]
            for task in asyncio.as_completed(tasks):
                result = await task
                handle.write(json.dumps(result, ensure_ascii=False) + "\n")
                handle.flush()
                total += 1
                ok += int(bool(result.get("ok")))
                if total % 100 == 0 or total == len(rows):
                    print(
                        json.dumps(
                            {
                                "completed": total,
                                "ok": ok,
                                "total": len(rows),
                                "skipped_existing": skipped,
                            },
                            ensure_ascii=False,
                        )
                    )
    print(json.dumps({"output": str(output), "completed": total, "ok": ok, "skipped_existing": skipped}, indent=2))
    return 0
249
+
250
+
251
def main() -> int:
    """Parse the command line and drive the async runner to completion."""
    cli_args = parse_args()
    exit_code = asyncio.run(run(cli_args))
    return exit_code


if __name__ == "__main__":
    raise SystemExit(main())
eval_code/scripts/summarize_cbu_responses.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Summarize claimed or grounded CBU response JSONL into table-ready metrics."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import re
9
+ import statistics
10
+ from collections import Counter, defaultdict
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+
15
+ UNIT_CATEGORIES = [
16
+ "object",
17
+ "attribute",
18
+ "relation",
19
+ "style",
20
+ "camera",
21
+ "lighting",
22
+ "count",
23
+ "text_rendering",
24
+ ]
25
+
26
+ TOKEN_RE = re.compile(r"[^\W_]+(?:'[^\W_]+)*", re.UNICODE)
27
+ ARTICLE_UNITS = {"a", "an", "the"}
28
+
29
+
30
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options of the CBU summarizer."""
    cli = argparse.ArgumentParser(description="Summarize CBU extraction/audit responses")
    for flag in ("--input", "--output"):
        cli.add_argument(flag, required=True)
    cli.add_argument("--mode", choices=["claimed", "grounded"], required=True)
    cli.add_argument("--latest-by-request", action="store_true")
    # May be repeated; each occurrence names one more response log to merge.
    cli.add_argument("--include", action="append", default=[])
    return cli.parse_args()
38
+
39
+
40
def normalize_unit(text: str) -> str:
    """Lower-case and tokenise ``text``, drop leading articles, re-join with spaces."""
    words = TOKEN_RE.findall(text.lower())
    first_kept = 0
    # Skip every leading token that is an article ("a", "an", "the").
    while first_kept < len(words) and words[first_kept] in ARTICLE_UNITS:
        first_kept += 1
    return " ".join(words[first_kept:])
45
+
46
+
47
def normalize_key_part(text: str) -> str:
    """Normalise ``text`` for use inside a dedup key; falsy results become ``""``."""
    cleaned = normalize_unit(text)
    if cleaned:
        return cleaned
    return ""
50
+
51
+
52
def caption_token_count(request: dict[str, Any]) -> int:
    """Count ``TOKEN_RE`` matches in ``request["caption"]``; 0 if it is not a string."""
    caption = request.get("caption", "")
    if not isinstance(caption, str):
        return 0
    return len(TOKEN_RE.findall(caption))
55
+
56
+
57
def percentile(values: list[float], q: float) -> float | None:
    """Return the element at rank ``round((n - 1) * q)`` of the sorted values.

    No interpolation is performed, and ties at .5 follow Python's built-in
    ``round`` (round-half-to-even). Returns ``None`` for an empty list.
    """
    if not values:
        return None
    ranked = sorted(values)
    position = round((len(values) - 1) * q)
    return ranked[position]
62
+
63
+
64
def trimmed_mean(values: list[float], trim: float = 0.1) -> float | None:
    """Mean of ``values`` after dropping ``int(n * trim)`` items from each end.

    If trimming would leave nothing, the untrimmed mean is returned.
    Returns ``None`` for an empty list.
    """
    if not values:
        return None
    ordered = sorted(values)
    size = len(ordered)
    cut = int(size * trim)
    kept = ordered if size - 2 * cut <= 0 else ordered[cut : size - cut]
    return statistics.fmean(kept)
71
+
72
+
73
def empty_category_counts() -> dict[str, int]:
    """Return a fresh dict mapping every unit category to zero."""
    return dict.fromkeys(UNIT_CATEGORIES, 0)
75
+
76
+
77
def unit_records(group: Any) -> list[dict[str, str]]:
    """Flatten a unit group into ``category``/``unit``/``span``/``target`` records.

    Two input shapes are accepted: a dict mapping category to a list of
    strings (legacy), and a list of per-unit dicts (v2). Entries with an
    unknown category or a blank/non-string unit are dropped. Any other input
    yields an empty list.
    """
    if isinstance(group, dict):
        legacy: list[dict[str, str]] = []
        for category in UNIT_CATEGORIES:
            items = group.get(category, [])
            if isinstance(items, list):
                for item in items:
                    if isinstance(item, str) and item.strip():
                        text = item.strip()
                        # Legacy entries carry no span/target of their own.
                        legacy.append({"category": category, "unit": text, "span": text, "target": ""})
        return legacy

    atomic: list[dict[str, str]] = []
    if not isinstance(group, list):
        return atomic
    for item in group:
        if not isinstance(item, dict):
            continue
        category = item.get("category")
        unit = item.get("unit")
        usable = category in UNIT_CATEGORIES and isinstance(unit, str) and bool(unit.strip())
        if not usable:
            continue
        span = item.get("span", "")
        target = item.get("target", "")
        atomic.append(
            {
                "category": category,
                "unit": unit.strip(),
                "span": span.strip() if isinstance(span, str) else "",
                "target": target.strip() if isinstance(target, str) else "",
            }
        )
    return atomic
108
+
109
+
110
def count_unit_group(group: Any) -> tuple[int, dict[str, int]]:
    """Count records per category (no deduplication); returns (total, per-category)."""
    tally = dict.fromkeys(UNIT_CATEGORIES, 0)
    for record in unit_records(group):
        tally[record["category"]] += 1
    return sum(tally.values()), tally
115
+
116
+
117
def count_deduped_unit_group(group: Any) -> tuple[int, dict[str, int], int, int]:
    """Count records per category after deduplication and filtering.

    Records whose normalised unit is empty are ignored. A record repeating an
    earlier (category, unit, target) key counts as a duplicate. Article-only
    "count" units and "text_rendering" units that assert the absence of text
    count as suspicious. Neither duplicates nor suspicious records contribute
    to the per-category totals.

    Returns:
        ``(total, per_category, duplicate_count, suspicious_count)``.
    """
    counts = empty_category_counts()
    seen: set[str] = set()
    duplicates = 0
    flagged = 0
    negation_markers = ["no text", "no visible", "not visible", "without text"]
    for record in unit_records(group):
        category = record["category"]
        norm = normalize_unit(record["unit"])
        if not norm:
            continue
        target_key = normalize_key_part(record.get("target", ""))
        key = f"{category}|{norm}|{target_key}"
        if key in seen:
            duplicates += 1
            continue
        seen.add(key)
        article_only_count = category == "count" and norm in ARTICLE_UNITS
        negated_text = category == "text_rendering" and any(marker in norm for marker in negation_markers)
        if article_only_count or negated_text:
            flagged += 1
            continue
        counts[category] += 1
    return sum(counts.values()), counts, duplicates, flagged
140
+
141
+
142
def add_counts(dst: Counter[str], counts: dict[str, int], prefix: str) -> None:
    """Add each per-category count to ``dst`` under the key ``<prefix>_<category>``."""
    for category in counts:
        dst[f"{prefix}_{category}"] += counts[category]
145
+
146
+
147
def summarize_claimed_row(parsed: dict[str, Any], request: dict[str, Any]) -> list[tuple[str, Counter[str]]]:
    """Build the per-caption counters for one claimed-units response.

    Returns a one-element list ``[(surface, counter)]`` so the caller can
    treat claimed and grounded rows uniformly.
    """
    surface = request.get("surface", "unknown")
    claimed_units = parsed.get("claimed_units")
    total, counts = count_unit_group(claimed_units)
    dedup_total, dedup_counts, duplicate, suspicious = count_deduped_unit_group(claimed_units)
    tokens = caption_token_count(request)
    counter: Counter[str] = Counter(
        {
            "captions": 1,
            "claimed_total": total,
            "claimed_dedup_total": dedup_total,
            "duplicate_units": duplicate,
            "suspicious_units": suspicious,
            "caption_tokens": tokens,
            # Row-level flags: 1 when the caption has at least one such unit.
            "rows_with_duplicate": int(duplicate > 0),
            "rows_with_suspicious": int(suspicious > 0),
        }
    )
    add_counts(counter, counts, "claimed")
    add_counts(counter, dedup_counts, "claimed_dedup")
    return [(surface, counter)]
164
+
165
+
166
def summarize_grounded_row(parsed: dict[str, Any], request: dict[str, Any]) -> list[tuple[str, Counter[str]]]:
    """Build one ``(surface, counter)`` pair per entry of ``parsed["results"]``.

    The surface is looked up from the request caption with the matching
    ``caption_id``; when none is found the caption id itself (or
    ``"unknown"``) is used. A non-dict ``parsed`` yields an empty list.
    """
    results = parsed.get("results", []) if isinstance(parsed, dict) else []
    rows = []
    for result in results:
        caption_id = result.get("caption_id")
        matched_surface = next(
            (
                caption.get("surface")
                for caption in request.get("captions", [])
                if caption.get("caption_id") == caption_id
            ),
            None,
        )
        surface = matched_surface or str(caption_id or "unknown")
        grounded_total, grounded_counts = count_unit_group(result.get("grounded_units"))
        unsupported_total, unsupported_counts = count_unit_group(result.get("unsupported_units"))
        uncertain_total, uncertain_counts = count_unit_group(result.get("uncertain_units"))
        counter: Counter[str] = Counter(
            {
                "captions": 1,
                # Claimed = everything the auditor placed into any bucket.
                "claimed_total": grounded_total + unsupported_total + uncertain_total,
                "grounded_total": grounded_total,
                "unsupported_total": unsupported_total,
                "uncertain_total": uncertain_total,
            }
        )
        counter[f"overall_{result.get('overall', 'missing')}"] += 1
        add_counts(counter, grounded_counts, "grounded")
        add_counts(counter, unsupported_counts, "unsupported")
        add_counts(counter, uncertain_counts, "uncertain")
        rows.append((surface, counter))
    return rows
192
+
193
+
194
def merge(dst: Counter[str], src: Counter[str]) -> None:
    """Add every count in ``src`` onto ``dst`` in place."""
    # Counter.update adds counts rather than replacing them.
    dst.update(src)
197
+
198
+
199
def finalize(counter: Counter[str]) -> dict[str, Any]:
    """Turn accumulated counts into a metrics dict with derived rates.

    The raw counts are copied through unchanged. Per-caption values divide by
    the caption count (floored at 1); rates relative to claimed units are
    ``None`` when nothing was claimed. Per-category precision and unsupported
    rates are only emitted for categories with at least one audited unit.
    """
    captions = max(counter["captions"], 1)
    claimed = counter["claimed_total"]

    def per_caption(key: str) -> float:
        return counter[key] / captions

    def share_of_claimed(key: str) -> float | None:
        return counter[key] / claimed if claimed else None

    output: dict[str, Any] = dict(counter)
    output["claimed_per_caption"] = claimed / captions
    output["claimed_dedup_per_caption"] = per_caption("claimed_dedup_total")
    token_total = counter["caption_tokens"]
    output["claimed_dedup_per_100_tokens"] = (
        100 * counter["claimed_dedup_total"] / token_total if token_total else None
    )
    output["duplicate_units_per_caption"] = per_caption("duplicate_units")
    output["suspicious_units_per_caption"] = per_caption("suspicious_units")
    output["duplicate_row_rate"] = per_caption("rows_with_duplicate")
    output["suspicious_row_rate"] = per_caption("rows_with_suspicious")
    output["grounded_precision"] = share_of_claimed("grounded_total")
    output["unsupported_rate"] = share_of_claimed("unsupported_total")
    output["uncertain_rate"] = share_of_claimed("uncertain_total")
    for category in UNIT_CATEGORIES:
        output[f"claimed_{category}_per_caption"] = per_caption(f"claimed_{category}")
        output[f"claimed_dedup_{category}_per_caption"] = per_caption(f"claimed_dedup_{category}")
        grounded = counter[f"grounded_{category}"]
        unsupported = counter[f"unsupported_{category}"]
        uncertain = counter[f"uncertain_{category}"]
        denom = grounded + unsupported + uncertain
        if denom:
            output[f"grounded_{category}_precision"] = grounded / denom
            output[f"unsupported_{category}_rate"] = unsupported / denom
    return output
223
+
224
+
225
def main() -> int:
    """Read response logs, aggregate per-surface metrics, and write a JSON summary.

    With ``--latest-by-request`` only the last row per string ``request_id``
    is kept, and rows without one are dropped. Rows whose ``ok`` flag is
    falsy are counted as ``bad`` and otherwise ignored. Every surface is also
    folded into the synthetic ``__all__`` surface.
    """
    args = parse_args()
    by_surface: dict[str, Counter[str]] = defaultdict(Counter)
    per_surface_values: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
    status = Counter()
    input_paths = [Path(args.input)] + [Path(item) for item in args.include]

    def stream_rows():
        # Yield every non-blank JSONL row, file by file, in order.
        for input_path in input_paths:
            with input_path.open("r", encoding="utf-8") as handle:
                for line in handle:
                    if line.strip():
                        yield json.loads(line)

    if args.latest_by_request:
        latest: dict[str, dict[str, Any]] = {}
        for row in stream_rows():
            request_id = row.get("request_id")
            if isinstance(request_id, str):
                latest[request_id] = row
        rows = list(latest.values())
    else:
        rows = list(stream_rows())

    claimed_mode = args.mode == "claimed"
    summarize = summarize_claimed_row if claimed_mode else summarize_grounded_row
    for row in rows:
        status["responses"] += 1
        if not row.get("ok"):
            status["bad"] += 1
            continue
        for surface, counter in summarize(row.get("parsed"), row.get("request", {})):
            merge(by_surface[surface], counter)
            merge(by_surface["__all__"], counter)
            status["captions"] += counter["captions"]
            if not claimed_mode:
                continue
            # Distribution statistics are only collected in claimed mode.
            tokens = max(counter["caption_tokens"], 1)
            samples = {
                "claimed": float(counter["claimed_total"]),
                "claimed_dedup": float(counter["claimed_dedup_total"]),
                "claimed_dedup_per_100_tokens": 100.0 * counter["claimed_dedup_total"] / tokens,
                "caption_tokens": float(counter["caption_tokens"]),
            }
            for key_surface in (surface, "__all__"):
                for name, value in samples.items():
                    per_surface_values[key_surface][name].append(value)

    surfaces = {surface: finalize(counter) for surface, counter in sorted(by_surface.items())}
    for surface, metrics in per_surface_values.items():
        if surface not in surfaces:
            continue
        summary = surfaces[surface]
        for name, values in metrics.items():
            summary[f"{name}_median"] = statistics.median(values) if values else None
            summary[f"{name}_p25"] = percentile(values, 0.25)
            summary[f"{name}_p75"] = percentile(values, 0.75)
            summary[f"{name}_trimmed_mean"] = trimmed_mean(values)

    payload = {
        "input": args.input,
        "mode": args.mode,
        "status": dict(status),
        "surfaces": surfaces,
    }
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(payload, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": str(output), **payload["status"]}, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval_code/scripts/summarize_cbu_vqa_responses.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Summarize CBU VQA response JSONL files."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ from collections import Counter, defaultdict
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
+ ANSWERS = ["yes", "no", "uncertain"]
14
+
15
+
16
def parse_args() -> argparse.Namespace:
    """Define and parse the command-line options of the VQA summarizer."""
    cli = argparse.ArgumentParser(description="Summarize CBU VQA responses")
    for flag in ("--input", "--output"):
        cli.add_argument(flag, required=True)
    include_help = "Additional response JSONL to merge before latest-by-request summarization."
    cli.add_argument("--include", action="append", default=[], help=include_help)
    latest_help = "Use only the last response per request_id."
    cli.add_argument("--latest-by-request", action="store_true", help=latest_help)
    return cli.parse_args()
32
+
33
+
34
def load_rows(paths: list[Path], latest_by_request: bool) -> list[dict[str, Any]]:
    """Load response rows from several JSONL files, silently skipping missing files.

    With ``latest_by_request`` set, only the last row seen for each string
    ``request_id`` is kept (in first-seen order) and rows without a string
    ``request_id`` are dropped. Otherwise every row is returned.
    """

    def stream():
        # Yield each non-blank row of each existing file, in order.
        for path in paths:
            if not path.exists():
                continue
            with path.open("r", encoding="utf-8") as handle:
                for line in handle:
                    if line.strip():
                        yield json.loads(line)

    if not latest_by_request:
        return list(stream())

    newest: dict[str, dict[str, Any]] = {}
    for row in stream():
        request_id = row.get("request_id")
        if isinstance(request_id, str):
            newest[request_id] = row
    return list(newest.values())
57
+
58
+
59
def question_lookup(row: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the request's questions by their string ``question_id``.

    Entries that are not dicts or have no string ``question_id`` are skipped.
    """
    table: dict[str, dict[str, Any]] = {}
    for question in row.get("request", {}).get("questions", []):
        if not isinstance(question, dict):
            continue
        question_id = question.get("question_id")
        if isinstance(question_id, str):
            table[question_id] = question
    return table
66
+
67
+
68
def add_rates(stats: dict[str, Any]) -> dict[str, Any]:
    """Add per-answer rates to ``stats`` in place and return it.

    Each rate is the answer count divided by ``stats["questions"]``, or 0.0
    when there are no questions. ``support_rate``, ``risk_rate`` and
    ``uncertainty_rate`` are aliases of the yes/no/uncertain rates.
    """
    total = stats.get("questions", 0)

    def share(key: str) -> float:
        return stats.get(key, 0) / total if total else 0.0

    for answer in ANSWERS:
        stats[f"{answer}_rate"] = share(answer)
    for alias, answer in (("support_rate", "yes"), ("risk_rate", "no"), ("uncertainty_rate", "uncertain")):
        stats[alias] = share(answer)
    return stats
76
+
77
+
78
def main() -> int:
    """Aggregate VQA answers per surface and per category and write a JSON summary.

    Failed responses are counted per surface and sampled into
    ``examples["bad_response"]``. For successful responses every question
    result with a recognised answer is tallied, and up to 20 "no" and 20
    "uncertain" answers are kept as examples.
    """
    args = parse_args()
    paths = [Path(args.input)] + [Path(item) for item in args.include]
    rows = load_rows(paths, args.latest_by_request)

    surface_stats: dict[str, Counter[str]] = defaultdict(Counter)
    category_stats: dict[str, Counter[str]] = defaultdict(Counter)
    examples: dict[str, list[dict[str, Any]]] = defaultdict(list)
    example_cap = 20

    responses = 0
    ok = 0
    for row in rows:
        responses += 1
        request = row.get("request", {})
        surface = request.get("surface", "__unknown__")
        per_surface = surface_stats[surface]
        per_surface["responses"] += 1
        if not row.get("ok"):
            per_surface["bad"] += 1
            if len(examples["bad_response"]) < example_cap:
                examples["bad_response"].append(
                    {
                        "surface": surface,
                        "caption_id": request.get("caption_id"),
                        "error": row.get("parse_error") or row.get("schema_error") or row.get("error"),
                    }
                )
            continue
        ok += 1
        per_surface["ok"] += 1
        lookup = question_lookup(row)
        for result in row.get("parsed", {}).get("question_results", []):
            if not isinstance(result, dict):
                continue
            question_id = result.get("question_id")
            answer = result.get("answer")
            if answer not in ANSWERS:
                continue
            question = lookup.get(question_id, {})
            category = question.get("category", "__unknown__")
            per_category = category_stats[category]
            per_surface["questions"] += 1
            per_surface[answer] += 1
            per_category["questions"] += 1
            per_category[answer] += 1
            # Only non-supporting answers are sampled as examples.
            if answer in {"no", "uncertain"} and len(examples[answer]) < example_cap:
                examples[answer].append(
                    {
                        "surface": surface,
                        "caption_id": request.get("caption_id"),
                        "category": category,
                        "question": question.get("question"),
                        "answer": answer,
                        "confidence": result.get("confidence"),
                        "evidence": result.get("evidence"),
                    }
                )

    bad = responses - ok
    out = {
        "input": args.input,
        "include": args.include,
        "latest_by_request": args.latest_by_request,
        "responses": responses,
        "ok": ok,
        "bad": bad,
        "surfaces": {surface: add_rates(dict(counter)) for surface, counter in surface_stats.items()},
        "categories": {category: add_rates(dict(counter)) for category, counter in category_stats.items()},
        "examples": examples,
    }
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": str(output), "responses": responses, "ok": ok, "bad": bad}, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
eval_code/scripts/summarize_grounded_cbu_verify.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Summarize exact-unit grounded-CBU verification responses."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ from collections import Counter, defaultdict
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+
13
+ STATUSES = [
14
+ "grounded",
15
+ "unsupported",
16
+ "uncertain",
17
+ "invalid_text_unit",
18
+ "not_a_visual_claim",
19
+ "image_unavailable",
20
+ ]
21
+
22
+
23
def parse_args() -> argparse.Namespace:
    """Define the command-line interface and parse ``sys.argv``.

    Returns:
        A namespace carrying ``input`` and ``output`` (both required),
        ``include`` (repeatable; an empty list when never given) and
        ``latest_by_request`` (boolean flag, ``False`` unless passed).
    """
    cli = argparse.ArgumentParser(description="Summarize grounded-CBU verification responses")
    # The primary response log and the summary destination are both mandatory.
    for required_flag in ("--input", "--output"):
        cli.add_argument(required_flag, required=True)
    # Each occurrence of --include appends one more path to the list.
    cli.add_argument(
        "--include",
        action="append",
        default=[],
        help="Additional response JSONL to merge before latest-by-request summarization.",
    )
    cli.add_argument(
        "--latest-by-request",
        action="store_true",
        help="Use only the last response per request_id. Useful for append/resume retry logs.",
    )
    namespace = cli.parse_args()
    return namespace
39
+
40
+
41
def unit_lookup(row: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index a request's claimed units by their ``unit_id``.

    Args:
        row: Request payload whose ``claimed_units`` entry is iterated for unit dicts.

    Returns:
        Mapping from ``unit_id`` to the unit dict. Entries that are not dicts,
        or that carry no ``unit_id`` key, are skipped. A missing or ``null``
        ``claimed_units`` entry yields an empty mapping.
    """
    # `.get(key, [])` only covers a missing key; `or []` also covers an explicit JSON null.
    units = row.get("claimed_units") or []
    return {unit["unit_id"]: unit for unit in units if isinstance(unit, dict) and "unit_id" in unit}
43
+
44
+
45
def add_rates(stats: dict[str, Any]) -> dict[str, Any]:
    """Add per-status rate fields to ``stats`` in place and return it.

    For every name in ``STATUSES`` two ratios are written:
    ``<status>_rate_all`` (count divided by ``valid_units``) and
    ``<status>_rate_visual`` (count divided by ``visual_units``).
    Three headline keys, ``grounded_precision``, ``unsupported_rate`` and
    ``uncertain_rate``, are then written using the ``visual_units``
    denominator. A zero or missing denominator produces ``0.0``.
    """

    def ratio(count: Any, denominator: Any) -> float:
        # Guard against division by zero when no units were counted.
        return count / denominator if denominator else 0.0

    all_units = stats.get("valid_units", 0)
    visual_units = stats.get("visual_units", 0)
    for name in STATUSES:
        count = stats.get(name, 0)
        stats[f"{name}_rate_all"] = ratio(count, all_units)
        stats[f"{name}_rate_visual"] = ratio(count, visual_units)
    headline_fields = (
        ("grounded_precision", "grounded"),
        ("unsupported_rate", "unsupported"),
        ("uncertain_rate", "uncertain"),
    )
    for field, name in headline_fields:
        stats[field] = ratio(stats.get(name, 0), visual_units)
    return stats
55
+
56
+
57
def main() -> int:
    """Aggregate grounded-CBU verification responses into one summary JSON file.

    Reads the JSONL file given by ``--input`` plus every ``--include`` file,
    optionally keeps only the last row per ``request_id``, tallies unit
    statuses per surface and per category, attaches rates via ``add_rates``,
    writes the summary to ``--output`` and prints a short JSON receipt.

    Returns:
        ``0`` on completion (used as the process exit status).
    """
    args = parse_args()
    surface_stats: dict[str, Counter[str]] = defaultdict(Counter)
    category_stats: dict[str, Counter[str]] = defaultdict(Counter)
    status_examples: dict[str, list[dict[str, Any]]] = defaultdict(list)
    # Statuses for which up to 20 illustrative examples are retained.
    example_statuses = {"unsupported", "uncertain", "invalid_text_unit", "not_a_visual_claim"}
    # Statuses that count towards the "visual_units" denominator.
    visual_statuses = {"grounded", "unsupported", "uncertain"}

    input_paths = [Path(args.input), *[Path(item) for item in args.include]]
    rows: list[dict[str, Any]] = []
    latest: dict[str, dict[str, Any]] = {}
    for input_path in input_paths:
        with input_path.open("r", encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                row = json.loads(line)
                if not args.latest_by_request:
                    rows.append(row)
                    continue
                # Later lines (and later files) overwrite earlier ones, so a retry
                # supersedes the attempt it replaces. Rows without a string
                # request_id cannot be deduplicated and are dropped in this mode.
                request_id = row.get("request_id")
                if isinstance(request_id, str):
                    latest[request_id] = row
    if args.latest_by_request:
        rows = list(latest.values())

    total = 0
    ok = 0
    for row in rows:
        total += 1
        # A null/malformed "request" must not crash the whole summary.
        request = row.get("request")
        if not isinstance(request, dict):
            request = {}
        surface = request.get("surface", "__unknown__")
        surface_stats[surface]["responses"] += 1
        if not row.get("ok"):
            surface_stats[surface]["bad"] += 1
            continue
        ok += 1
        surface_stats[surface]["ok"] += 1
        lookup = unit_lookup(request)
        parsed = row.get("parsed")
        unit_results = (parsed.get("unit_results") if isinstance(parsed, dict) else None) or []
        for result in unit_results:
            # Same guard as the VQA summarizer: skip entries the judge emitted
            # as something other than an object.
            if not isinstance(result, dict):
                continue
            unit_id = result.get("unit_id")
            unit = lookup.get(unit_id, {})
            category = unit.get("category", "__unknown__")
            status = result.get("status", "__bad_status__")
            if not isinstance(status, str):
                # Non-string statuses (null, list, ...) would break Counter keys / JSON output.
                status = "__bad_status__"
            surface_stats[surface]["valid_units"] += 1
            surface_stats[surface][status] += 1
            category_stats[category]["valid_units"] += 1
            category_stats[category][status] += 1
            if status in visual_statuses:
                surface_stats[surface]["visual_units"] += 1
                category_stats[category]["visual_units"] += 1
            if status in example_statuses and len(status_examples[status]) < 20:
                status_examples[status].append(
                    {
                        "surface": surface,
                        "caption_id": request.get("caption_id"),
                        "category": category,
                        "unit": unit.get("unit"),
                        "target": unit.get("target"),
                        "status": status,
                        "evidence": result.get("evidence"),
                    }
                )

    surfaces = {surface: add_rates(dict(counter)) for surface, counter in surface_stats.items()}
    categories = {category: add_rates(dict(counter)) for category, counter in category_stats.items()}
    out = {
        "input": args.input,
        # Record merge provenance, matching the keys the VQA summarizer writes.
        "include": args.include,
        "latest_by_request": args.latest_by_request,
        "responses": total,
        "ok": ok,
        "bad": total - ok,
        "surfaces": surfaces,
        "categories": categories,
        "examples": dict(status_examples),
    }
    output = Path(args.output)
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({"output": args.output, "responses": total, "ok": ok, "bad": total - ok}, indent=2))
    return 0
132
+
133
+
134
+ if __name__ == "__main__":
135
+ raise SystemExit(main())
eval_code/scripts/vllm/serve_gemma4_31b_it.sh ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Launch google/gemma-4-31B-it as DP=8 vLLM server for cross-family audits.
#
# Usage: serve_gemma4_31b_it.sh {start|stop|restart|status}   (default: start)
#
# Every setting below is taken from the environment when set, otherwise from
# the default inside its ${VAR:-default} expansion.
# NOTE(review): defaults written as <WORKSPACE_ROOT>, <LOCAL_CACHE> and
# <HF_CACHE> look like scrubbed placeholders; override them before use.
set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
VENV_DIR="${VLLM_VENV:-<WORKSPACE_ROOT>/vllm-env}"
VLLM_BIN="${VENV_DIR}/bin/vllm"
CONFIG="${VLLM_CONFIG:-${PROJECT_ROOT}/configs/recap/vllm_serve_gemma4_31b_it.yaml}"
PORT="${VLLM_PORT:-8000}"
LOG="${VLLM_LOG:-/tmp/vllm_gemma4_31b_it.log}"
PID_FILE="${VLLM_PID_FILE:-/tmp/vllm_gemma4_31b_it.pid}"

export TMPDIR="${TMPDIR:-/tmp}"
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda}"
export VLLM_WORKER_MULTIPROC_METHOD="${VLLM_WORKER_MULTIPROC_METHOD:-spawn}"
export TRITON_CACHE_DIR="${TRITON_CACHE_DIR:-/tmp/triton-cache}"
export TORCH_HOME="${TORCH_HOME:-/tmp/torch-home}"
export TORCH_EXTENSIONS_DIR="${TORCH_EXTENSIONS_DIR:-/tmp/torch-extensions}"
export TORCHINDUCTOR_CACHE_DIR="${TORCHINDUCTOR_CACHE_DIR:-/tmp/torchinductor-cache}"
export HF_HOME="${HF_HOME:-<LOCAL_CACHE>/hf}"
export HF_HUB_CACHE="${HF_HUB_CACHE:-<HF_CACHE>}"
export TRANSFORMERS_CACHE="${TRANSFORMERS_CACHE:-<LOCAL_CACHE>/transformers}"

# Only the commands that launch the server need the vllm binary. `status` and
# `stop` must keep working when the venv is missing or VLLM_VENV is not set in
# the calling shell, and an unknown command should reach the usage message.
case "${1:-start}" in
  start|restart)
    if [[ ! -x "${VLLM_BIN}" ]]; then
      echo "ERROR: vllm binary not found at ${VLLM_BIN}" >&2
      exit 1
    fi
    ;;
esac
29
+
30
status() {
  # Probe the server's /v1/models endpoint; print the listing when it answers,
  # otherwise report "not ready" and return non-zero.
  local endpoint="http://localhost:${PORT}/v1/models"
  if ! curl -fsS "${endpoint}" >/dev/null 2>&1; then
    echo "vLLM gemma-4-31B-it :${PORT} not ready"
    return 1
  fi
  echo "vLLM gemma-4-31B-it :${PORT} ready"
  curl -fsS "${endpoint}"
}
39
+
40
stop() {
  # Stop the server recorded in PID_FILE, then sweep up any remaining
  # `vllm serve --config ${CONFIG}` processes and vLLM shared-memory files.
  local pid=""
  local waited=0
  local grace_seconds=10
  if [[ -f "${PID_FILE}" ]]; then
    pid="$(cat "${PID_FILE}")"
    # Only signal the PID if it still belongs to a vllm serve process.
    if [[ -n "${pid}" ]] && ps -p "${pid}" -o command= 2>/dev/null | grep -q "vllm serve"; then
      kill "${pid}" 2>/dev/null || true
      # Give the server a real chance to shut down cleanly instead of sending
      # SIGKILL unconditionally after a fixed delay.
      while kill -0 "${pid}" 2>/dev/null && (( waited < grace_seconds )); do
        sleep 1
        waited=$(( waited + 1 ))
      done
      if kill -0 "${pid}" 2>/dev/null; then
        kill -9 "${pid}" 2>/dev/null || true
      fi
    fi
    rm -f "${PID_FILE}"
  fi
  pgrep -f "vllm serve --config ${CONFIG}" 2>/dev/null | xargs -r kill 2>/dev/null || true
  # NOTE(review): this glob removes every /dev/shm/vllm* entry, including those
  # of any other vLLM server on the same host.
  rm -f /dev/shm/vllm* 2>/dev/null || true
  echo "stopped vLLM gemma-4-31B-it on :${PORT}"
}
54
+
55
start() {
  # Launch `vllm serve` detached from the terminal, logging to LOG and
  # recording the background PID in PID_FILE.
  local existing=""
  local pid=""
  # Refuse to launch a duplicate: a second server would overwrite PID_FILE and
  # leave the one that is actually running untracked by `stop`.
  if [[ -f "${PID_FILE}" ]]; then
    existing="$(cat "${PID_FILE}")"
    if [[ -n "${existing}" ]] && ps -p "${existing}" -o command= 2>/dev/null | grep -q "vllm serve"; then
      echo "vLLM gemma-4-31B-it already running (pid ${existing}); use restart to relaunch" >&2
      return 0
    fi
  fi
  mkdir -p "$(dirname "${LOG}")" "$(dirname "${PID_FILE}")" "${TRITON_CACHE_DIR}" "${TORCH_HOME}" "${TORCH_EXTENSIONS_DIR}" "${TORCHINDUCTOR_CACHE_DIR}"
  echo "starting vLLM gemma-4-31B-it"
  echo " config: ${CONFIG}"
  echo " log: ${LOG}"
  CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" \
    setsid "${VLLM_BIN}" serve --config "${CONFIG}" > "${LOG}" 2>&1 < /dev/null &
  # Capture $! once so the value written and the value printed cannot diverge.
  pid=$!
  echo "${pid}" > "${PID_FILE}"
  echo " pid: ${pid}"
}
65
+
66
# Dispatch on the first CLI argument; with no argument the server is started.
action="${1:-start}"
case "${action}" in
  start)
    start
    ;;
  stop)
    stop
    ;;
  restart)
    stop
    sleep 2
    start
    ;;
  status)
    status
    ;;
  *)
    echo "usage: $0 {start|stop|restart|status}" >&2
    exit 2
    ;;
esac
eval_results/ALL_EVAL_RESULTS_INDEX.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Complete Evaluation Results Index
2
+
3
+ This directory contains sanitized summary artifacts for all completed recap-evaluation experiments, not only the CC12M frontier table. Raw VLM response JSONL files and source images are excluded; summary JSON/TSV/CSV/Markdown artifacts are included.
4
+
5
+ ## Result Families
6
+
7
+ | Family | Included results | Main files |
8
+ |---|---|---|
9
+ | CPU text metrics | 1M paired lexical/surface metrics, violation-code breakdown, tokenizer truncation, fair-slice manifests | raw_summaries/cpu_text_metrics/ |
10
+ | Prompt support | n-gram prompt-pool JSD/support metrics and bootstrap direction tables | raw_summaries/prompt_support/ |
11
+ | Embedding/Vendi/support | caption Vendi, covariance effective rank, PRDC-style prompt-caption support, dtype sanity | raw_summaries/embedding_vendi_support/ |
12
+ | CBU claimed density | B=64 claimed CBU summaries across CC12M/DataComp/PD12M/LAION-pop/Danbooru and CI tables | raw_summaries/cbu_claimed/ |
13
+ | Grounded CBU legacy | earlier exact-unit grounded audit summaries retained for traceability | raw_summaries/cbu_grounded_legacy/ |
14
+ | Image-conditioned VQA | Qwen-family VQA across DataComp/PD12M/LAION-pop/Danbooru/CC12M plus Gemma cross-family CC12M | raw_summaries/vqa_image_conditioned/ |
15
+ | LongCLIP retrieval | corrected CC12M full-caption and input-64 retrieval separability diagnostics | raw_summaries/longclip_retrieval/ |
16
+ | Plot-ready rollups | curated CSV/PNG files for the paper figures and tables | eval_results/*.csv, eval_results/*.png |
17
+
18
+ ## Dataset Coverage
19
+
20
+ - CC12M: four-caption corrected same-image slice, CBU B-grid, Qwen VQA, Gemma VQA, LongCLIP.
21
+ - DataComp: paired ours/reference CBU@64, Qwen VQA@64, CPU text metrics, prompt-support metrics, embedding/Vendi diagnostics.
22
+ - PD12M: paired ours/reference CBU@64 and Qwen VQA@64; metadata records included in `dataset_release/`.
23
+ - LAION-pop: paired ours/reference CBU@64 and Qwen VQA@64; metadata records included in `dataset_release/`.
24
+ - Danbooru2023: paired ours/reference CBU@64 and Qwen VQA@64; metadata records included in `dataset_release/`.
25
+
26
+ ## Boundary
27
+
28
+ The export is metadata/results only. Source images and raw VLM response streams are intentionally excluded. The included result summaries are enough to reproduce reported tables and inspect completed experiment outcomes.
eval_results/README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recap Evaluation Result Index
2
+ Date: 2026-04-27
3
+ This file indexes plot-ready recap-evaluation outputs. All CSV files are derived from existing JSON/TSV artifacts; no model inference is performed here.
4
+ ## Plot-ready files
5
+ - `cc12m_budget_frontier_plot.csv`: CBU budget grid for B in {16,32,48,64}; plot x=B or CBU/100tok, y=CBU/cap, color=surface.
6
+ - `cc12m_vqa_supported_risk_pareto.csv`: VQA@64 Pareto data; plot x=unsupported_cap or risk, y=supported_cap, facet=judge.
7
+ - `cc12m_longclip_plot.csv`: LongCLIP full/input64 retrieval diagnostics; plot x=tok_mean or mode, y=I2T/T2I R@1.
8
+ - `all_vqa_b64_summary.csv`: All available VQA@64 summaries across CC12M/DataComp/PD12M/LAION-pop/Danbooru plus Gemma CC12M.
9
+
10
+ ## Current CC12M VQA Pareto interpretation
11
+ - Under both Qwen-family and Gemma-family judges, the Pareto frontier for supported yield vs unsupported cost contains `Ours` and the short `Qwen3-VL-8B` baseline.
12
+ - `Ours` is the high-yield endpoint; `Qwen3-VL-8B` is the low-risk endpoint.
13
+ - `LLaVA-NeXT` and `PixelProse` are dominated in the corrected CC12M VQA@64 frontier because each has lower supported yield and higher unsupported cost than `Ours` under the same judge.
14
+ - The CBU budget frontier similarly separates token efficiency from absolute yield: Qwen3-VL-8B is most efficient per token, while Ours has the highest CBU/cap from B=32 onward.
eval_results/all_cbu_b64_summary.csv ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ surface,label,captions,cbu_cap,cbu_100tok,dup_row,object_cap,attribute_cap,relation_cap,style_cap,camera_cap,lighting_cap,count_cap,text_rendering_cap
2
+ cc12m_llavanext_paired__ours_cc12m,CC12M/Ours vs LLaVA-NeXT,4874,15.041649569142388,22.877426199837732,0.0016413623307345096,4.298522773902339,6.580426754205991,2.5888387361510055,0.25851456709068527,0.22404595814526057,0.3459171112022979,0.4708658186294625,0.2745178498153467
3
+ cc12m_llavanext_paired__ref_cc12m_llavanext,CC12M/LLaVA-NeXT,4832,10.586299668874172,20.342157693179512,0.0035182119205298015,3.7547599337748343,3.5384933774834435,1.7119205298013245,0.5428394039735099,0.13927980132450332,0.1490066225165563,0.35513245033112584,0.39486754966887416
4
+ cc12m_pixelprose_paired__ours_cc12m,CC12M/Ours vs PixelProse,4851,14.903318903318903,22.672633589342333,0.00041228612657184083,4.211296639868069,6.548958977530406,2.549577406720264,0.27231498660070086,0.23232323232323232,0.3438466295609153,0.4813440527726242,0.2636569779426922
5
+ cc12m_pixelprose_paired__ref_pixelprose_cc12m,CC12M/PixelProse,4874,11.973943372999589,19.893580033132675,0.005334427574887156,3.8738202708247846,3.8153467377923675,2.540623717685679,0.4185473943373,0.13089864587607714,0.0734509643003693,0.794009027492819,0.32724661469019284
6
+ cc12m_qwen3vl8b_paired__ours_cc12m,CC12M/Ours vs Qwen3-VL-8B,4971,15.103399718366527,22.98503258909574,0.001408167370750352,4.518004425668879,6.555823777911889,2.627439147052907,0.27016696841681753,0.2055924361295514,0.39026352846509754,0.45463689398511364,0.08147254073627037
7
+ cc12m_qwen3vl8b_paired__ref_cc12m_qwen3vl8b,CC12M/Qwen3-VL-8B,4999,6.493098619723945,55.98406319529485,0.000600120024004801,2.995799159831966,1.5659131826365273,1.5679135827165434,0.11502300460092019,0.015403080616123225,0.08621724344868974,0.13522704540908181,0.011602320464092819
8
+ danbooru2023_florence2_paired__ours_danbooru2023,Danbooru/Ours,4995,14.325325325325325,21.97641884649523,0.0,3.8724724724724724,6.966366366366366,2.536136136136136,0.3747747747747748,0.22702702702702704,0.11571571571571572,0.2032032032032032,0.02962962962962963
9
+ danbooru2023_florence2_paired__ref_danbooru_florence2,Danbooru/Florence2,4979,8.184374372363928,19.25657795251777,0.001004217714400482,3.0321349668608155,2.4661578630247036,1.5631652942357903,0.5987146013255674,0.006025306286402892,0.03293834103233581,0.1899979915645712,0.29524000803374173
10
+ datacomp_recap_llava15_paired_url__ours_datacomp_forward,DataComp/Ours,4848,14.451320132013201,21.994788559947256,0.0008250825082508251,3.8803630363036303,6.445957095709571,2.3116749174917492,0.24711221122112212,0.26485148514851486,0.34385313531353134,0.511963696369637,0.44554455445544555
11
+ datacomp_recap_llava15_paired_url__ref_datacomp_recap_llava15_llama3_8b,DataComp/LLaVA1.5-Llama3,5000,10.4414,22.070735254329005,0.0072,3.6,3.5036,1.8032,0.315,0.1404,0.0958,0.3536,0.6298
12
+ laion_pop_llama32_paired__ours_laion_pop,LAION-pop/Ours,4964,14.81809024979855,22.521976356471658,0.0012087026591458502,4.014302981466559,6.941780821917808,2.421232876712329,0.3058017727639001,0.2639000805801773,0.40370668815471394,0.36583400483481066,0.10153102336825141
13
+ laion_pop_llama32_paired__ref_laion_pop_llama32_11b,LAION-pop/Llama3.2-11B,4947,11.909642207398424,18.396730136327587,0.0028299979785728724,3.5611481706084493,4.942793612290277,1.9140893470790379,0.5712553062462098,0.34242975540731757,0.11663634525975339,0.3393976147159895,0.12189205579138872
14
+ pd12m_full_paired__ours_pd12m_img2dataset,PD12M/Ours,4958,15.017144009681322,22.863644180219133,0.0014118596208148447,4.166397741024607,6.718636546994756,2.580072609923356,0.3100040338846309,0.22166196046793063,0.2989108511496571,0.5350947962888262,0.1863654699475595
15
+ pd12m_full_paired__ref_pd12m_full,PD12M/ref full,4992,9.779246794871796,24.892029839026307,0.001201923076923077,4.379607371794871,2.020232371794872,2.371794871794872,0.4238782051282051,0.10616987179487179,0.030448717948717948,0.24739583333333334,0.1997195512820513
eval_results/all_vqa_b64_summary.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ source,surface,label,responses,questions,supported_cap,unsupported_cap,support,risk,uncertain
2
+ cc12m_qwen,ours_cc12m,Ours,4467,68019,14.59592567718827,0.46384598164316093,0.9585556976727091,0.03046207677266646,0.010982225554624444
3
+ cc12m_qwen,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,4490,28911,6.3122494432071266,0.08775055679287305,0.9803189097575318,0.01362803085330843,0.006053059389159835
4
+ cc12m_qwen,ref_cc12m_llavanext,LLaVA-NeXT,4453,48092,9.842128901863912,0.7442173815405345,0.911315811361557,0.06890958995259086,0.019774598685852116
5
+ cc12m_qwen,ref_pixelprose_cc12m,PixelProse,4449,56043,10.730051697010564,1.623286131714992,0.8518102171546847,0.12886533554592008,0.019324447299395107
6
+ datacomp_qwen,datacomp_recap_llava15_paired_url__ours_datacomp_forward,Ours DataComp,4648,67724,13.839285714285714,0.5094664371772806,0.9498109975784065,0.03496544799480243,0.015223554426791094
7
+ datacomp_qwen,datacomp_recap_llava15_paired_url__ref_datacomp_recap_llava15_llama3_8b,Ref DataComp LLaVA1.5/Llama3,4995,52194,8.495895895895895,1.8472472472472472,0.8130628041537341,0.17678277196612638,0.01015442388013948
8
+ noncc12m_qwen,danbooru2023_florence2_paired__ours_danbooru2023,Ours Danbooru,4993,71491,12.731824554376127,0.8343681153615061,0.8892028367207061,0.05827306933739911,0.052524093941894785
9
+ noncc12m_qwen,danbooru2023_florence2_paired__ref_danbooru_florence2,Ref Danbooru Florence2,4969,40755,6.379955725498088,1.7832561883678808,0.777867746288799,0.21742117531591215,0.004711078395288922
10
+ noncc12m_qwen,laion_pop_llama32_paired__ours_laion_pop,Ours LAION-pop,4964,73489,14.21676067687349,0.45447219983883963,0.9603069847188015,0.030698471880145326,0.008994543401053219
11
+ noncc12m_qwen,laion_pop_llama32_paired__ref_laion_pop_llama32_11b,Ref LAION-pop Llama3.2,4947,58903,10.79725085910653,0.9189407721851627,0.9068128957778042,0.07717773288287523,0.016009371339320577
12
+ noncc12m_qwen,pd12m_full_paired__ours_pd12m_img2dataset,Ours PD12M,4957,74392,14.289086140810975,0.5045390357070809,0.9521319496720078,0.033619206366275946,0.014248843961716313
13
+ noncc12m_qwen,pd12m_full_paired__ref_pd12m_full,Ref PD12M full,4989,48825,8.605532170775707,1.008819402685909,0.8793241167434716,0.10308243727598566,0.017593445980542754
14
+ cc12m_gemma,ours_cc12m,Ours,4467,68019,13.823371390194762,1.011193194537721,0.9078198738587747,0.06640791543539305,0.025772210705832195
15
+ cc12m_gemma,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,4490,28911,6.194432071269488,0.1821826280623608,0.9620213759468714,0.02829372903047283,0.009684895022655737
16
+ cc12m_gemma,ref_cc12m_llavanext,LLaVA-NeXT,4453,48092,9.439029867505052,1.0442398383112508,0.8739915162605008,0.09668967811694253,0.029318805622556766
17
+ cc12m_gemma,ref_pixelprose_cc12m,PixelProse,4449,56043,10.195999100921556,2.02202742189256,0.8094141998108595,0.16051960102064486,0.03006619916849562
eval_results/cc12m_budget_frontier_plot.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ budget,surface,label,valid,bad_json,cbu_per_cap,cbu_per_100tok,dup_row_rate,object_per_cap,attribute_per_cap,relation_per_cap,style_per_cap,camera_per_cap,lighting_per_cap,count_per_cap,text_rendering_per_cap,pareto_efficiency_yield
2
+ 16,ours_cc12m,Ours,1000,1,5.758,34.88851187590887,0.0,1.694,2.624,0.806,0.164,0.14,0.111,0.211,0.008,0
3
+ 16,ref_cc12m_llavanext,LLaVA-NeXT,999,1,4.842842842842843,29.99194098320005,0.0,1.9289289289289289,1.4964964964964964,0.6506506506506506,0.42342342342342343,0.06706706706706707,0.057057057057057055,0.18618618618618618,0.03303303303303303,0
4
+ 16,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,1000,1,6.422,56.39269406392694,0.001,2.984,1.548,1.531,0.106,0.017,0.095,0.133,0.008,1
5
+ 16,ref_pixelprose_cc12m,PixelProse,1000,1,4.205,26.021039603960396,0.001,1.814,1.176,0.668,0.189,0.038,0.028,0.28,0.012,0
6
+ 32,ours_cc12m,Ours,999,9,9.70870870870871,29.478451157984317,0.0,2.71971971971972,4.43043043043043,1.6076076076076076,0.2122122122122122,0.2132132132132132,0.20520520520520522,0.3053053053053053,0.015015015015015015,1
7
+ 32,ref_cc12m_llavanext,LLaVA-NeXT,996,9,7.975903614457831,26.3491326412153,0.001004016064257028,3.0261044176706826,2.568273092369478,1.3052208835341366,0.5040160642570282,0.09337349397590361,0.11546184738955824,0.2791164658634538,0.08433734939759036,0
8
+ 32,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,999,9,6.44044044044044,56.423748136455316,0.0,2.984984984984985,1.5425425425425425,1.5375375375375375,0.11011011011011011,0.01701701701701702,0.1001001001001001,0.13913913913913914,0.009009009009009009,1
9
+ 32,ref_pixelprose_cc12m,PixelProse,997,9,7.742226680040121,24.0497258225324,0.0010030090270812437,3.00802407221665,2.358074222668004,1.510531594784353,0.2086258776328987,0.05917753259779338,0.05315947843530592,0.4974924774322969,0.04714142427281846,0
10
+ 48,ours_cc12m,Ours,996,20,12.66566265060241,25.655365967745215,0.0,3.604417670682731,5.731927710843373,2.1546184738955825,0.22088353413654618,0.2319277108433735,0.29417670682730923,0.3815261044176707,0.04618473895582329,1
11
+ 48,ref_cc12m_llavanext,LLaVA-NeXT,992,20,9.808467741935484,23.807776064988133,0.0020161290322580645,3.6985887096774195,3.2056451612903225,1.6330645161290323,0.5453629032258065,0.13205645161290322,0.17237903225806453,0.3165322580645161,0.10483870967741936,0
12
+ 48,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,999,20,6.425425425425425,56.29220380601596,0.0,2.991991991991992,1.5345345345345345,1.5315315315315314,0.10810810810810811,0.018018018018018018,0.0970970970970971,0.13613613613613615,0.008008008008008008,1
13
+ 48,ref_pixelprose_cc12m,PixelProse,993,20,10.506545820745217,21.99709038773746,0.002014098690835851,3.8298086606243706,3.2255790533736155,2.2678751258811682,0.27794561933534745,0.08257804632426989,0.08761329305135952,0.6576032225579054,0.07754279959718026,0
14
+ 64,ours_cc12m,Ours,995,25,15.484422110552764,23.56747330743109,0.0,4.5768844221105525,6.7658291457286435,2.7326633165829146,0.25125628140703515,0.24824120603015076,0.37386934673366834,0.46030150753768845,0.07537688442211055,1
15
+ 64,ref_cc12m_llavanext,LLaVA-NeXT,992,25,10.904233870967742,21.646555001901103,0.004032258064516129,4.047379032258065,3.595766129032258,1.8074596774193548,0.6280241935483871,0.13608870967741934,0.23387096774193547,0.3286290322580645,0.12701612903225806,0
16
+ 64,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,1000,25,6.453,56.5061295971979,0.001,2.99,1.528,1.572,0.107,0.015,0.095,0.137,0.009,1
17
+ 64,ref_pixelprose_cc12m,PixelProse,988,25,12.625506072874494,20.52049746660525,0.004048582995951417,4.394736842105263,3.8856275303643724,2.729757085020243,0.458502024291498,0.11842105263157894,0.12246963562753037,0.791497975708502,0.12449392712550607,0
eval_results/cc12m_cbu_budget_frontier.png ADDED
eval_results/cc12m_cbu_vqa_bootstrap_ci.tsv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ surface cbu_cap cbu_cap_ci support support_ci risk risk_ci unsupported_cap unsupported_cap_ci supported_cap supported_cap_ci
2
+ Ours 15.2148 [15.1034,15.3244] 0.9591 [0.9572,0.9611] 0.0305 [0.0289,0.0322] 0.4638 [0.4376,0.4907] 14.5959 [14.4822,14.7058]
3
+ Qwen3-VL-8B 6.4371 [6.3892,6.4850] 0.9796 [0.9775,0.9817] 0.0135 [0.0118,0.0152] 0.0878 [0.0773,0.0987] 6.3122 [6.2615,6.3613]
4
+ LLaVA-NeXT 10.7809 [10.6874,10.8762] 0.9107 [0.9073,0.9142] 0.0681 [0.0651,0.0711] 0.7442 [0.7117,0.7777] 9.8421 [9.7471,9.9367]
5
+ PixelProse 12.5706 [12.4826,12.6605] 0.8502 [0.8460,0.8544] 0.1293 [0.1254,0.1333] 1.6233 [1.5727,1.6748] 10.7301 [10.6350,10.8236]
eval_results/cc12m_cbu_yield_efficiency_scatter.png ADDED
eval_results/cc12m_gemma4_vqa_bootstrap_ci.tsv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ surface claim_cbu claim_cbu_ci support support_ci risk risk_ci unsupported_cap unsupported_cap_ci supported_cap supported_cap_ci
2
+ Ours 15.2270 [15.1208,15.3432] 0.9078 [0.9050,0.9109] 0.0664 [0.0638,0.0689] 1.0112 [0.9716,1.0495] 13.8234 [13.7174,13.9335]
3
+ Qwen3-VL-8B 6.4390 [6.3893,6.4873] 0.9620 [0.9591,0.9648] 0.0283 [0.0259,0.0308] 0.1822 [0.1666,0.1984] 6.1944 [6.1427,6.2468]
4
+ LLaVA-NeXT 10.7999 [10.7070,10.8905] 0.8740 [0.8701,0.8780] 0.0967 [0.0933,0.1002] 1.0442 [1.0069,1.0840] 9.4390 [9.3468,9.5261]
5
+ PixelProse 12.5969 [12.5088,12.6876] 0.8094 [0.8049,0.8137] 0.1605 [0.1566,0.1648] 2.0220 [1.9694,2.0788] 10.1964 [10.1090,10.2890]
eval_results/cc12m_longclip_plot.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ mode,surface,rows,trunc_gt_248,tok_mean,tok_p95,pos_mean,pos_ci95,i2t_margin_mean,i2t_margin_ci95,i2t_r1,i2t_r5,t2i_margin_mean,t2i_margin_ci95,t2i_r1,t2i_r5
2
+ full,ours,4494,0.3238,231.67,320.0,0.322264,"[0.321269,0.323159]",0.052692,"[0.051671,0.053724]",0.9079,0.9878,0.051062,"[0.049759,0.052290]",0.9023,0.9855
3
+ full,qwen3vl8b,4494,0.0000,15.41,20.0,0.329667,"[0.328596,0.330746]",0.047944,"[0.046669,0.049288]",0.8611,0.9791,0.048947,"[0.047556,0.050413]",0.8420,0.9706
4
+ full,llavanext,4494,0.0040,81.88,153.0,0.335460,"[0.334478,0.336478]",0.057289,"[0.055980,0.058576]",0.9065,0.9893,0.057517,"[0.056162,0.058858]",0.8994,0.9862
5
+ full,pixelprose,4494,0.0167,108.35,193.0,0.325949,"[0.324966,0.327034]",0.052218,"[0.050927,0.053507]",0.8903,0.9771,0.052438,"[0.051127,0.053663]",0.8741,0.9755
6
+ input64,ours,4494,0.0000,81.76,90.0,0.316327,"[0.315241,0.317304]",0.045948,"[0.044788,0.047165]",0.8587,0.9713,0.045064,"[0.043627,0.046384]",0.8502,0.9651
7
+ input64,qwen3vl8b,4494,0.0000,15.41,20.0,0.329667,"[0.328596,0.330746]",0.047944,"[0.046669,0.049288]",0.8611,0.9791,0.048947,"[0.047556,0.050413]",0.8420,0.9706
8
+ input64,llavanext,4494,0.0000,60.62,83.0,0.334477,"[0.333420,0.335557]",0.055338,"[0.053996,0.056670]",0.8968,0.9849,0.055819,"[0.054487,0.057197]",0.8876,0.9829
9
+ input64,pixelprose,4494,0.0007,73.39,82.3,0.324152,"[0.323114,0.325235]",0.049075,"[0.047727,0.050442]",0.8687,0.9677,0.049699,"[0.048287,0.051025]",0.8563,0.9631
eval_results/cc12m_vqa_supported_risk_pareto.csv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ dataset,judge,surface,label,claim_cbu_cap,claim_cbu_cap_ci_half,supported_cap,supported_cap_ci_half,unsupported_cap,unsupported_cap_ci_half,support,support_ci_half,risk,risk_ci_half,pareto_supported_cost
2
+ cc12m,Qwen3.5-397B-A17B-FP8,ours_cc12m,Ours,15.21476510067114,0.1114093959731548,14.59592567718827,0.11373404969778456,0.46384598164316093,0.026863666890530602,0.9591174271295853,0.0019560148290445056,0.03052671847444831,0.0016744650009232857,1
3
+ cc12m,Qwen3.5-397B-A17B-FP8,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,6.437096415052327,0.04787909151636516,6.3122494432071266,0.05078507795100151,0.08775055679287305,0.010913140311804001,0.979630050103324,0.002106863054663677,0.013491260039144228,0.0017132916421613506,1
4
+ cc12m,Qwen3.5-397B-A17B-FP8,ref_cc12m_llavanext,LLaVA-NeXT,10.780892576810944,0.09532967032967044,9.842128901863912,0.09499775432292878,0.7442173815405345,0.03346058836739274,0.9107345292570356,0.0034450716214012855,0.06810172039716729,0.003039160276583866,0
5
+ cc12m,Qwen3.5-397B-A17B-FP8,ref_pixelprose_cc12m,PixelProse,12.570563159075611,0.08997644155261497,10.730051697010564,0.09507754551584569,1.623286131714992,0.051472240953023274,0.8502215852149048,0.004186420482200637,0.12925547153139433,0.004071021279282744,0
6
+ cc12m,Gemma-4-31B-it,ours_cc12m,Ours,15.226997985224983,0.11619812810340768,13.823371390194762,0.11008591877938656,1.011193194537721,0.03964272454927065,0.9078198738587747,0.003033190165912525,0.06640791543539305,0.002597111181184858,1
7
+ cc12m,Gemma-4-31B-it,ref_cc12m_qwen3vl8b,Qwen3-VL-8B,6.438975501113585,0.04967149220489908,6.194432071269488,0.05235385643843937,0.1821826280623608,0.016260562688326985,0.9620213759468714,0.0028781310644838687,0.02829372903047283,0.0025388759897655086,1
8
+ cc12m,Gemma-4-31B-it,ref_cc12m_llavanext,LLaVA-NeXT,10.799910172917135,0.09292042436137748,9.439029867505052,0.09223563302542459,1.0442398383112508,0.03980323956484666,0.8739915162605008,0.003965850263161097,0.09668967811694253,0.003529553703450483,0
9
+ cc12m,Gemma-4-31B-it,ref_pixelprose_cc12m,PixelProse,12.596850393700787,0.09077978696827138,10.196400449943757,0.09259006956107108,2.0220472440944883,0.05678535826217557,0.8094404657725073,0.004546951426257162,0.16052006500812602,0.004311023541536035,0
eval_results/cc12m_vqa_supported_risk_pareto.png ADDED
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/README.md ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DataComp naive-Qwen35 policy ablation
2
+
3
+ Date: 2026-05-02
4
+
5
+ This package is the DataComp-side same-captioner policy ablation:
6
+
7
+ - `ours_datacomp_forward`: Qwen3.5 captioner with the grounded recap policy.
8
+ - `naive_qwen35_datacomp`: Qwen3.5 captioner with a single naive captioning prompt.
9
+
10
+ This is not a three-way attribution against the DataComp LLaVA-1.5 + LLaMA-3 reference captioner. The clean comparison here is `ours_datacomp_forward` vs `naive_qwen35_datacomp` on the same DataComp image surface.
11
+
12
+ ## Caption generation
13
+
14
+ - Model: `Qwen/Qwen3.5-35B-A3B-FP8`
15
+ - Prompt: `Please generate a detailed caption of this image. Please be as descriptive as possible.`
16
+ - System prompt: none
17
+ - Message policy: single user message with image, no system prompt
18
+ - Image mode: local file first pass, resized data-URI retry for over-context images
19
+ - Materialized requests: 4,775 / 4,997
20
+ - Caption responses: 4,775 unique requests, bad 0
21
+ - Caption token mean / median: 296.46 / 296
22
+
23
+ Primary surface:
24
+
25
+ - `naive_qwen35_datacomp.jsonl`
26
+ - `naive_qwen35_caption.summary.json`
27
+
28
+ ## Judge settings
29
+
30
+ All judge runs used deterministic default sampling from the local runners:
31
+
32
+ | Stage | Runner | Model | Temperature | Notes |
33
+ |---|---|---|---:|---|
34
+ | Claim extraction | `run_text_json_requests.py` | `google/gemma-4-31B-it` | 0.0 | default `--temperature`; not passed explicitly |
35
+ | Grounded CBU verify | `run_grounded_cbu_verify_requests.py` | `google/gemma-4-31B-it` | 0.0 | default `--temperature`; not passed explicitly |
36
+ | CBU VQA | `run_cbu_vqa_requests.py` | `google/gemma-4-31B-it` | 0.0 | default `--temperature`; not passed explicitly |
37
+
38
+ The response JSONL rows do not currently persist sampling parameters. Traceability for this run is through the runner defaults plus the command log and this README.
39
+
40
+ ## Gemma judge results
41
+
42
+ CBU extraction and grounded audit:
43
+
44
+ | Metric | Value |
45
+ |---|---:|
46
+ | Captions | 4,775 |
47
+ | Claimed CBU / caption | 12.2119 [12.1177, 12.3129] |
48
+ | Visual units | 58,260 |
49
+ | Grounded units / caption | 11.7801 [11.6785, 11.8796] |
50
+ | Grounded precision | 0.9655 [0.9638, 0.9674] |
51
+ | Unsupported rate | 0.0104 [0.0093, 0.0116] |
52
+ | Uncertain rate | 0.0241 [0.0227, 0.0255] |
53
+
54
+ CBU VQA:
55
+
56
+ | Surface | Resp | OK | Q | Support | Risk | Uncertain |
57
+ |---|---:|---:|---:|---:|---:|---:|
58
+ | `naive_qwen35_datacomp` | 4,775 | 4,775 | 58,335 | 0.9307 | 0.0403 | 0.0290 |
59
+
60
+ Compared to the existing Gemma DataComp forward table:
61
+
62
+ | Surface | Grounded precision | Unsupported | CBU-VQA support | CBU-VQA risk |
63
+ |---|---:|---:|---:|---:|
64
+ | `ours_datacomp_forward` | 0.9457 | 0.0246 | 0.8886 | 0.0840 |
65
+ | `naive_qwen35_datacomp` | 0.9655 | 0.0104 | 0.9307 | 0.0403 |
66
+
67
+ The naive surface has fewer claimed visual units than ours, so higher precision/support should be read together with coverage: 58,260 visual units for naive vs 70,894 visual units for ours in the existing DataComp Gemma table.
68
+
69
+ ## CPU text diagnostics
70
+
71
+ CPU lexical diagnostics were run on the exact 4,775-image intersection between `ours_datacomp_forward` and `naive_qwen35_datacomp`.
72
+
73
+ | Surface | Captions | Mean Lex | P95 Lex | Cov64 | Cov128 | Cov248 | Cov320 | D2 | D3 | M3 Top100 | Prefix Top100 | Rep4 | Viol/64 | Newline | Bullet | Top Opening |
74
+ |---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
75
+ | `ours_datacomp_forward` | 4,775 | 175.08 | 249.0 | 1.0000 | 0.9125 | 0.0515 | 0.0098 | 0.286004 | 0.606353 | 0.0411 | 0.0821 | 0.1969 | 0.020223 | 0.3818 | 0.0010 | close up view |
76
+ | `naive_qwen35_datacomp` | 4,775 | 298.02 | 392.0 | 0.9979 | 0.9927 | 0.7786 | 0.3929 | 0.262878 | 0.579717 | 0.0443 | 0.6921 | 0.2972 | 0.114788 | 0.9799 | 0.3749 | is a close |
77
+
78
+ Interpretation: naive-Qwen35 is more verbose and scores better with the judge (higher support, lower risk), but it is much more template-concentrated and format-heavy. The high `Prefix Top100`, newline rate, bullet rate, and repetition rate should be reported as the caveat against treating the naive improvement as purely semantic quality.
79
+
80
+ Reference LLaVA/Recap-DataComp was not included in this exact CPU table because its available request slices use a different URL-paired surface; matching by `source_row` is invalid and URL overlap with this naive materialized slice is only a small residual subset.
81
+
82
+ ## Tables
83
+
84
+ Paper-facing tables are under `gemma4_metric_tables/`:
85
+
86
+ - `claimed_cbu_ci.tsv`
87
+ - `grounded_cbu_ci.tsv`
88
+ - `grounded_cbu_category_ci.tsv`
89
+ - `cbu_bootstrap_summary.json`
90
+ - `cbu_vqa_gemma4_table.md`
91
+ - `cbu_vqa_gemma4_table.tex`
92
+ - `../cpu_text_metrics/cpu_text_comparison.md`
93
+ - `../cpu_text_metrics/cpu_text_comparison.tsv`
94
+ - `../cpu_text_metrics/cpu_text_summary.json`
95
+
96
+ ## Portable image package
97
+
98
+ Reusable E&D image package:
99
+
100
+ - Directory: `image_packages/datacomp_naive_qwen35_policy_ablation/`
101
+ - Tarball: `image_packages/datacomp_naive_qwen35_policy_ablation.tar.gz`
102
+ - Images: 4,775
103
+ - Missing rows: 0
104
+ - Packaged requests: grounded CBU and CBU VQA, rewritten to package-relative `image_path`
105
+
106
+ Tarball integrity verified with `gzip -t`.
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/cpu_text_metrics/cpu_text_comparison.md ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ | Surface | Captions | Mean Lex | P95 Lex | Cov64 | Cov128 | Cov248 | Cov320 | D2 | D3 | M3 Top100 | Prefix Top100 | Rep4 | Viol/64 | Newline | Bullet | Top Opening |
2
+ |---|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---:|---|
3
+ | `ours_datacomp_forward` | 4,775 | 175.08 | 249.0 | 1.0000 | 0.9125 | 0.0515 | 0.0098 | 0.286004 | 0.606353 | 0.0411 | 0.0821 | 0.1969 | 0.020223 | 0.3818 | 0.0010 | close up view |
4
+ | `naive_qwen35_datacomp` | 4,775 | 298.02 | 392.0 | 0.9979 | 0.9927 | 0.7786 | 0.3929 | 0.262878 | 0.579717 | 0.0443 | 0.6921 | 0.2972 | 0.114788 | 0.9799 | 0.3749 | is a close |
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/cpu_text_metrics/cpu_text_comparison.tsv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ surface records avg_tokens p95_tokens cov64 cov128 cov248 cov320 distinct2_full distinct3_full m3_top100_full prefix_raw_top100_full rep4_full within_d3_full within_d4_full violation_rate_full viol_ind_per64_full control_hits_per64_full format_newline_rate format_bullet_rate format_numbered_list_rate top_opening_1 top_opening_1_count
2
+ ours_datacomp_forward 4775 175.081047 249.0 1.0 0.9125 0.0515 0.0098 0.286004 0.606353 0.041105 0.0821 0.1969 0.991174 0.995628 0.0595 0.020223 8.2583 0.3818 0.001 0.001 close up view 77
3
+ naive_qwen35_datacomp 4775 298.016126 392.0 0.9979 0.9927 0.7786 0.3929 0.262878 0.579717 0.04432 0.6921 0.2972 0.989423 0.997198 0.5504 0.114788 6.2184 0.9799 0.3749 0.067 is a close 726
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/cpu_text_metrics/cpu_text_summary.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "tokenizer": "regex lexical units: [^\\W_]+(?:'[^\\W_]+)*",
3
+ "matched_rows": 4775,
4
+ "surfaces": {
5
+ "ours_datacomp_forward": {
6
+ "surface": "ours_datacomp_forward",
7
+ "records": 4775,
8
+ "avg_tokens": 175.08104712041884,
9
+ "p95_tokens": 249.0,
10
+ "cov64": 1.0,
11
+ "cov128": 0.9125,
12
+ "cov248": 0.0515,
13
+ "cov320": 0.0098,
14
+ "distinct2_full": 0.286004,
15
+ "distinct3_full": 0.606353,
16
+ "m3_top100_full": 0.041105,
17
+ "prefix_raw_top100_full": 0.0821,
18
+ "rep4_full": 0.1969,
19
+ "within_d3_full": 0.991174,
20
+ "within_d4_full": 0.995628,
21
+ "violation_rate_full": 0.0595,
22
+ "viol_ind_per64_full": 0.020223,
23
+ "control_hits_per64_full": 8.2583,
24
+ "format_newline_rate": 0.3818,
25
+ "format_bullet_rate": 0.001,
26
+ "format_numbered_list_rate": 0.001,
27
+ "top_opening_1": "close up view",
28
+ "top_opening_1_count": 77
29
+ },
30
+ "naive_qwen35_datacomp": {
31
+ "surface": "naive_qwen35_datacomp",
32
+ "records": 4775,
33
+ "avg_tokens": 298.01612565445026,
34
+ "p95_tokens": 392.0,
35
+ "cov64": 0.9979,
36
+ "cov128": 0.9927,
37
+ "cov248": 0.7786,
38
+ "cov320": 0.3929,
39
+ "distinct2_full": 0.262878,
40
+ "distinct3_full": 0.579717,
41
+ "m3_top100_full": 0.04432,
42
+ "prefix_raw_top100_full": 0.6921,
43
+ "rep4_full": 0.2972,
44
+ "within_d3_full": 0.989423,
45
+ "within_d4_full": 0.997198,
46
+ "violation_rate_full": 0.5504,
47
+ "viol_ind_per64_full": 0.114788,
48
+ "control_hits_per64_full": 6.2184,
49
+ "format_newline_rate": 0.9799,
50
+ "format_bullet_rate": 0.3749,
51
+ "format_numbered_list_rate": 0.067,
52
+ "top_opening_1": "is a close",
53
+ "top_opening_1_count": 726
54
+ }
55
+ }
56
+ }
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/cbu_bootstrap_summary.json ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bootstrap_reps": 2000,
3
+ "seed": 0,
4
+ "claimed": {
5
+ "naive_qwen35_datacomp": {
6
+ "input": "artifacts/cbu/datacomp-naive-qwen35-baseline-2026-05-02/claimed_cbu_v2_naive_qwen35_datacomp_b64.responses.gemma4_31b_it_c512_mt1024.jsonl",
7
+ "captions": 4775,
8
+ "dedup_units_per_caption": {
9
+ "mean": 12.211937172774869,
10
+ "ci95_low": 12.117685863874346,
11
+ "ci95_high": 12.312884816753927
12
+ },
13
+ "dedup_units_per_100_tokens": {
14
+ "mean": 18.625390477772314,
15
+ "ci95_low": 18.485901003129428,
16
+ "ci95_high": 18.776241307080532
17
+ },
18
+ "duplicate_units_per_caption": {
19
+ "mean": 0.0035602094240837698,
20
+ "ci95_low": 0.0018848167539267015,
21
+ "ci95_high": 0.005235602094240838
22
+ },
23
+ "object_per_caption": {
24
+ "mean": 2.9212565445026177,
25
+ "ci95_low": 2.8699267015706806,
26
+ "ci95_high": 2.974874345549738
27
+ },
28
+ "attribute_per_caption": {
29
+ "mean": 5.352670157068063,
30
+ "ci95_low": 5.280413612565445,
31
+ "ci95_high": 5.423680628272251
32
+ },
33
+ "relation_per_caption": {
34
+ "mean": 1.381151832460733,
35
+ "ci95_low": 1.3505759162303665,
36
+ "ci95_high": 1.4121465968586386
37
+ },
38
+ "style_per_caption": {
39
+ "mean": 1.0284816753926702,
40
+ "ci95_low": 1.0054450261780106,
41
+ "ci95_high": 1.0517329842931937
42
+ },
43
+ "camera_per_caption": {
44
+ "mean": 0.660523560209424,
45
+ "ci95_low": 0.6376963350785341,
46
+ "ci95_high": 0.683565445026178
47
+ },
48
+ "lighting_per_caption": {
49
+ "mean": 0.2393717277486911,
50
+ "ci95_low": 0.22450261780104713,
51
+ "ci95_high": 0.2550837696335078
52
+ },
53
+ "count_per_caption": {
54
+ "mean": 0.3392670157068063,
55
+ "ci95_low": 0.3231413612565445,
56
+ "ci95_high": 0.35539267015706805
57
+ },
58
+ "text_rendering_per_caption": {
59
+ "mean": 0.28921465968586385,
60
+ "ci95_low": 0.27036649214659686,
61
+ "ci95_high": 0.3082827225130889
62
+ }
63
+ }
64
+ },
65
+ "grounded": {
66
+ "naive_qwen35_datacomp": {
67
+ "input": "artifacts/grounded-cbu/datacomp-naive-qwen35-baseline-2026-05-02/grounded_verify_v2_naive_qwen35_datacomp_b64.responses.gemma4_31b_c512_local_file_mt2048.jsonl",
68
+ "captions": 4775,
69
+ "visual_units": 58260,
70
+ "grounded_units_per_caption": {
71
+ "mean": 11.780104712041885,
72
+ "ci95_low": 11.678528795811518,
73
+ "ci95_high": 11.879596858638743
74
+ },
75
+ "grounded_precision": {
76
+ "mean": 0.9654994850669413,
77
+ "ci95_low": 0.9637811360068513,
78
+ "ci95_high": 0.9674050979253277
79
+ },
80
+ "unsupported_rate": {
81
+ "mean": 0.010384483350497768,
82
+ "ci95_low": 0.009253022043574697,
83
+ "ci95_high": 0.011574997848566833
84
+ },
85
+ "uncertain_rate": {
86
+ "mean": 0.024116031582560933,
87
+ "ci95_low": 0.0227306797964753,
88
+ "ci95_high": 0.025492158685210573
89
+ },
90
+ "categories": {
91
+ "object": {
92
+ "visual_units": 13943,
93
+ "grounded_precision": {
94
+ "mean": 0.9749695187549308,
95
+ "ci95_low": 0.9722321157964497,
96
+ "ci95_high": 0.9777625675102546
97
+ },
98
+ "unsupported_rate": {
99
+ "mean": 0.00817614573621172,
100
+ "ci95_low": 0.006573729389796407,
101
+ "ci95_high": 0.009942820581087936
102
+ },
103
+ "uncertain_rate": {
104
+ "mean": 0.016854335508857492,
105
+ "ci95_low": 0.014621329422784338,
106
+ "ci95_high": 0.019047263158799014
107
+ }
108
+ },
109
+ "attribute": {
110
+ "visual_units": 25508,
111
+ "grounded_precision": {
112
+ "mean": 0.9510741728085307,
113
+ "ci95_low": 0.9483290055013888,
114
+ "ci95_high": 0.9538571731863305
115
+ },
116
+ "unsupported_rate": {
117
+ "mean": 0.01003606711619884,
118
+ "ci95_low": 0.008637039503512835,
119
+ "ci95_high": 0.011506914261990438
120
+ },
121
+ "uncertain_rate": {
122
+ "mean": 0.0388897600752705,
123
+ "ci95_low": 0.03648831232635523,
124
+ "ci95_high": 0.041265412517918716
125
+ }
126
+ },
127
+ "relation": {
128
+ "visual_units": 6595,
129
+ "grounded_precision": {
130
+ "mean": 0.979226686884003,
131
+ "ci95_low": 0.9756088423314924,
132
+ "ci95_high": 0.9829912549589104
133
+ },
134
+ "unsupported_rate": {
135
+ "mean": 0.012736921910538287,
136
+ "ci95_low": 0.009840657602093744,
137
+ "ci95_high": 0.015594546249207836
138
+ },
139
+ "uncertain_rate": {
140
+ "mean": 0.008036391205458682,
141
+ "ci95_low": 0.005879463614752104,
142
+ "ci95_high": 0.010196506005969915
143
+ }
144
+ },
145
+ "style": {
146
+ "visual_units": 4903,
147
+ "grounded_precision": {
148
+ "mean": 0.9946971242096676,
149
+ "ci95_low": 0.992361424443466,
150
+ "ci95_high": 0.9967565868866474
151
+ },
152
+ "unsupported_rate": {
153
+ "mean": 0.0012237405669997961,
154
+ "ci95_low": 0.0004036204900748543,
155
+ "ci95_high": 0.0022689886006859402
156
+ },
157
+ "uncertain_rate": {
158
+ "mean": 0.004079135223332654,
159
+ "ci95_low": 0.0022297001685589526,
160
+ "ci95_high": 0.006222442286005553
161
+ }
162
+ },
163
+ "camera": {
164
+ "visual_units": 3153,
165
+ "grounded_precision": {
166
+ "mean": 0.987313669521091,
167
+ "ci95_low": 0.9833699876449492,
168
+ "ci95_high": 0.9911234711442108
169
+ },
170
+ "unsupported_rate": {
171
+ "mean": 0.009514747859181731,
172
+ "ci95_low": 0.006189461424091486,
173
+ "ci95_high": 0.012894916409768217
174
+ },
175
+ "uncertain_rate": {
176
+ "mean": 0.003171582619727244,
177
+ "ci95_low": 0.0012767112046430675,
178
+ "ci95_high": 0.0051597925278928725
179
+ }
180
+ },
181
+ "lighting": {
182
+ "visual_units": 1141,
183
+ "grounded_precision": {
184
+ "mean": 0.9877300613496932,
185
+ "ci95_low": 0.9809899256700093,
186
+ "ci95_high": 0.9941029042176894
187
+ },
188
+ "unsupported_rate": {
189
+ "mean": 0.0043821209465381246,
190
+ "ci95_low": 0.0008849166640607899,
191
+ "ci95_high": 0.008410605919016347
192
+ },
193
+ "uncertain_rate": {
194
+ "mean": 0.007887817703768623,
195
+ "ci95_low": 0.002643113637773779,
196
+ "ci95_high": 0.013688086732727058
197
+ }
198
+ },
199
+ "count": {
200
+ "visual_units": 1622,
201
+ "grounded_precision": {
202
+ "mean": 0.9340320591861899,
203
+ "ci95_low": 0.921660662579409,
204
+ "ci95_high": 0.946229913473424
205
+ },
206
+ "unsupported_rate": {
207
+ "mean": 0.04315659679408138,
208
+ "ci95_low": 0.03341480895617374,
209
+ "ci95_high": 0.05351880954899317
210
+ },
211
+ "uncertain_rate": {
212
+ "mean": 0.02281134401972873,
213
+ "ci95_low": 0.015662198229714857,
214
+ "ci95_high": 0.030853274629741658
215
+ }
216
+ },
217
+ "text_rendering": {
218
+ "visual_units": 1395,
219
+ "grounded_precision": {
220
+ "mean": 0.9362007168458781,
221
+ "ci95_low": 0.9220679765830679,
222
+ "ci95_high": 0.9496207804424454
223
+ },
224
+ "unsupported_rate": {
225
+ "mean": 0.02867383512544803,
226
+ "ci95_low": 0.019870493081955463,
227
+ "ci95_high": 0.038360938578329874
228
+ },
229
+ "uncertain_rate": {
230
+ "mean": 0.03512544802867384,
231
+ "ci95_low": 0.025244999513490084,
232
+ "ci95_high": 0.04539113137815038
233
+ }
234
+ }
235
+ }
236
+ }
237
+ }
238
+ }
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/cbu_vqa_gemma4_table.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ | Surface | Resp | OK | Q | Support ↑ | Risk ↓ | Uncertain ↓ |
2
+ |---|---:|---:|---:|---:|---:|---:|
3
+ | naive_qwen35_datacomp | 4,775 | 4,775 | 58,335 | 0.9307 | 0.0403 | 0.0290 |
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/cbu_vqa_gemma4_table.tex ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ \begin{tabular}{lrrrrrr}
2
+ \toprule
3
+ Surface & Resp. & OK & Q & Support $\uparrow$ & Risk $\downarrow$ & Uncertain $\downarrow$ \\
4
+ \midrule
5
+ naive\_qwen35\_datacomp & 4,775 & 4,775 & 58,335 & 0.9307 & 0.0403 & 0.0290 \\
6
+ \bottomrule
7
+ \end{tabular}
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/claimed_cbu_ci.tsv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ surface captions cbu_per_caption_ci95 cbu_per_100_tokens_ci95 object_per_caption_ci95 attribute_per_caption_ci95 relation_per_caption_ci95 camera_per_caption_ci95 lighting_per_caption_ci95 text_rendering_per_caption_ci95
2
+ naive_qwen35_datacomp 4775 12.2119 [12.1177, 12.3129] 18.6254 [18.4859, 18.7762] 2.9213 [2.8699, 2.9749] 5.3527 [5.2804, 5.4237] 1.3812 [1.3506, 1.4121] 0.6605 [0.6377, 0.6836] 0.2394 [0.2245, 0.2551] 0.2892 [0.2704, 0.3083]
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/grounded_cbu_category_ci.tsv ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ surface category visual_units grounded_precision_ci95 unsupported_rate_ci95 uncertain_rate_ci95
2
+ naive_qwen35_datacomp object 13943 0.9750 [0.9722, 0.9778] 0.0082 [0.0066, 0.0099] 0.0169 [0.0146, 0.0190]
3
+ naive_qwen35_datacomp attribute 25508 0.9511 [0.9483, 0.9539] 0.0100 [0.0086, 0.0115] 0.0389 [0.0365, 0.0413]
4
+ naive_qwen35_datacomp relation 6595 0.9792 [0.9756, 0.9830] 0.0127 [0.0098, 0.0156] 0.0080 [0.0059, 0.0102]
5
+ naive_qwen35_datacomp style 4903 0.9947 [0.9924, 0.9968] 0.0012 [0.0004, 0.0023] 0.0041 [0.0022, 0.0062]
6
+ naive_qwen35_datacomp camera 3153 0.9873 [0.9834, 0.9911] 0.0095 [0.0062, 0.0129] 0.0032 [0.0013, 0.0052]
7
+ naive_qwen35_datacomp lighting 1141 0.9877 [0.9810, 0.9941] 0.0044 [0.0009, 0.0084] 0.0079 [0.0026, 0.0137]
8
+ naive_qwen35_datacomp count 1622 0.9340 [0.9217, 0.9462] 0.0432 [0.0334, 0.0535] 0.0228 [0.0157, 0.0309]
9
+ naive_qwen35_datacomp text_rendering 1395 0.9362 [0.9221, 0.9496] 0.0287 [0.0199, 0.0384] 0.0351 [0.0252, 0.0454]
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/gemma4_metric_tables/grounded_cbu_ci.tsv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ surface captions visual_units grounded_units_per_caption_ci95 grounded_precision_ci95 unsupported_rate_ci95 uncertain_rate_ci95
2
+ naive_qwen35_datacomp 4775 58260 11.7801 [11.6785, 11.8796] 0.9655 [0.9638, 0.9674] 0.0104 [0.0093, 0.0116] 0.0241 [0.0227, 0.0255]
eval_results/datacomp-naive-qwen35-baseline-2026-05-02/naive_qwen35_caption.summary.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "responses": 4980,
3
+ "unique_requests": 4775,
4
+ "captions": 4775,
5
+ "bad": 0,
6
+ "surface": "naive_qwen35_datacomp",
7
+ "output_jsonl": "artifacts/recap-ed/datacomp-naive-qwen35-baseline-2026-05-02/naive_qwen35_datacomp.jsonl",
8
+ "prompt": "Please generate a detailed caption of this image. Please be as descriptive as possible.",
9
+ "system_prompt": null,
10
+ "messages_policy": "single_user_message_with_image_no_system_prompt",
11
+ "token_mean": 296.45968586387437,
12
+ "token_median": 296,
13
+ "token_min": 12,
14
+ "token_max": 432
15
+ }
eval_results/embeddinggemma_pair_summary.tsv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ pair ours_surface ref_surface vendi_ours vendi_ref delta_vendi_o_minus_r erank_ours erank_ref delta_erank_o_minus_r top1mass_ours top1mass_ref delta_top1mass_o_minus_r nn_o_to_r nn_r_to_o delta_nn_o_minus_r support_o_in_r support_r_in_o delta_support_o_minus_r density_o_in_r density_r_in_o delta_density_o_minus_r
2
+ cc12m_llavanext_paired ours_cc12m ref_cc12m_llavanext 66.730734 90.160599 -23.429866 269.042328 286.025848 -16.983521 0.038855 0.034805 0.004051 0.769103 0.764140 0.004963 0.957660 0.843140 0.114520 0.497990 0.236742 0.261248
3
+ cc12m_qwen3vl8b_paired ours_cc12m ref_cc12m_qwen3vl8b 51.634607 57.110133 -5.475526 222.048950 224.160461 -2.111511 0.053513 0.049385 0.004128 0.704297 0.707163 -0.002866 0.375800 0.354180 0.021620 0.049584 0.044978 0.004606
4
+ cc12m_pixelprose_paired ours_cc12m ref_pixelprose_cc12m 66.854479 73.743682 -6.889202 269.228516 288.698914 -19.470398 0.038886 0.034999 0.003888 0.680594 0.676938 0.003656 0.602020 0.500780 0.101240 0.172554 0.115546 0.057008
5
+ laion_pop_llama32_paired ours_laion_pop ref_laion_pop_llama32_11b 47.474582 63.807505 -16.332923 218.463516 241.209625 -22.746109 0.055081 0.048184 0.006897 0.794170 0.787970 0.006200 0.961740 0.845120 0.116620 0.506744 0.264092 0.242652
6
+ pd12m_full_paired ours_pd12m_img2dataset ref_pd12m_full 51.291509 37.793293 13.498217 211.494385 169.747513 41.746872 0.069870 0.062775 0.007095 0.692069 0.704875 -0.012806 0.122240 0.257560 -0.135320 0.017254 0.038008 -0.020754
7
+ danbooru2023_florence2_paired ours_danbooru2023 ref_danbooru_florence2 42.259565 20.049109 22.210456 254.459122 104.725281 149.733841 0.043924 0.112505 -0.068580 0.638019 0.668428 -0.030409 0.028080 0.413040 -0.384960 0.003374 0.102292 -0.098918
8
+ datacomp_recap_llava15_paired_url ours_datacomp_forward ref_datacomp_recap_llava15_llama3_8b 72.805853 85.648504 -12.842651 296.891449 285.083344 11.808105 0.038364 0.035212 0.003152 0.735544 0.738841 -0.003297 0.813480 0.815360 -0.001880 0.243068 0.208822 0.034246
eval_results/eval_results_summary.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recap Evaluation Results Summary
2
+
3
+ Date: 2026-04-27
4
+
5
+ ## Evaluation Families
6
+
7
+ | Family | Main artifact | Paper role | Status |
8
+ |---|---|---|---|
9
+ | Mechanical text metrics | artifacts/caption-survey/cpu_remaining_2026-04-24 | surface concentration, violations, repetition, lexical diversity proxies | done |
10
+ | Prompt-pool support | artifacts/caption-survey/prompt_support_bootstrap_b64_n2_250k_2026-04-24.tsv | caption-prompt distribution support over declared prompt pools | done |
11
+ | Embedding diversity/support | artifacts/recap-ed/metrics-2026-04-25/embedding | Vendi/effective-rank/embedding support diagnostics | done; model-sensitive |
12
+ | Claimed CBU | artifacts/cbu/pair5k-local/claimed_cbu_v2_all7_b64_5k.responses.qwen397_c1024_mt4096.summary.json | text-side controllable-unit density at B=64 | done |
13
+ | CBU budget frontier | artifacts/cbu/cc12m-four-caption-llava-url-bridge-bgrid-1k | CC12M budget sensitivity B={16,32,48,64} | done |
14
+ | Image-conditioned VQA | artifacts/vqa-cbu | supported yield / support rate / risk | done for Qwen; Gemma cross-family done on CC12M |
15
+ | LongCLIP retrieval | artifacts/longclip | dual-encoder retrieval separability diagnostic | done for corrected CC12M |
16
+
17
+ ## Plot-Ready Outputs
18
+
19
+ - `cc12m_budget_frontier_plot.csv`: B-grid CBU yield/efficiency; plot `budget` vs `cbu_per_cap`, or `cbu_per_100tok` vs `cbu_per_cap`.
20
+ - `cc12m_vqa_supported_risk_pareto.csv`: CC12M VQA Pareto; plot `unsupported_cap` vs `supported_cap`, facet by `judge`, use `pareto_supported_cost`.
21
+ - `cc12m_longclip_plot.csv`: LongCLIP full/input64 retrieval diagnostic.
22
+ - `all_cbu_b64_summary.csv`: All available paired surfaces CBU@64.
23
+ - `all_vqa_b64_summary.csv`: All available VQA@64 summaries.
24
+ - `prompt_support_direction_summary.csv`: Direction counts of prompt-pool support across the prompt pools.
25
+
26
+ ## CC12M Pareto State
27
+
28
+ - `VQA@64`: Ours and Qwen3-VL-8B form the Pareto frontier under both Qwen and Gemma judges. Ours is the high supported-yield endpoint; Qwen3-VL-8B is the low-risk short-caption endpoint. LLaVA-NeXT and PixelProse are dominated on supported yield vs unsupported cost.
29
+ - `CBU@B`: Qwen3-VL-8B is the token-efficiency endpoint; Ours becomes the absolute-yield endpoint from B=32 onward. This is the cleanest plot for showing why length and density cannot be collapsed into a single score.
30
+ - `LongCLIP`: LLaVA-NeXT is strongest on input-64 retrieval margin/R@1; Ours remains locally separable, but LongCLIP should stay an appendix diagnostic rather than a headline faithfulness metric.
31
+
32
+ ## Main Readout
33
+
34
+ The current evidence supports a Pareto framing rather than a scalar ranking. Ours increases supported controllable-unit yield; short, clean captions minimize risk and maximize per-token efficiency; public long baselines can be dense without matching the supported-yield/risk frontier.
eval_results/gemma-cross-corpus-2026-05-02/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # 2026-05-02 Gemma Cross-Corpus Add-on
2
+
3
+ This public-safe summary retains aggregate Gemma CBU/VQA tables only. The original run README contains local execution paths and remains in the private archive. Raw requests, raw responses, and image packages are not included in this anonymous code/results package.
eval_results/gemma-cross-corpus-2026-05-02/cbu_bootstrap_summary.json ADDED
@@ -0,0 +1,1375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bootstrap_reps": 2000,
3
+ "seed": 0,
4
+ "claimed": {},
5
+ "grounded": {
6
+ "datacomp_ours_gemma4": {
7
+ "input": "artifacts/grounded-cbu/grounded_verify_v2_ours_datacomp_forward_b64_5k.responses.gemma4_31b_c12_file_mt2048.jsonl",
8
+ "captions": 4975,
9
+ "visual_units": 70894,
10
+ "grounded_units_per_caption": {
11
+ "mean": 13.475979899497487,
12
+ "ci95_low": 13.371256281407035,
13
+ "ci95_high": 13.587547738693468
14
+ },
15
+ "grounded_precision": {
16
+ "mean": 0.9456794651169351,
17
+ "ci95_low": 0.9434585074215828,
18
+ "ci95_high": 0.9480059620159873
19
+ },
20
+ "unsupported_rate": {
21
+ "mean": 0.024586001636245663,
22
+ "ci95_low": 0.022810711845593374,
23
+ "ci95_high": 0.02637851194763035
24
+ },
25
+ "uncertain_rate": {
26
+ "mean": 0.029734533246819194,
27
+ "ci95_low": 0.028394080187231784,
28
+ "ci95_high": 0.03114928846803639
29
+ },
30
+ "categories": {
31
+ "object": {
32
+ "visual_units": 17907,
33
+ "grounded_precision": {
34
+ "mean": 0.9642597866756017,
35
+ "ci95_low": 0.9610363237643356,
36
+ "ci95_high": 0.9674664999321825
37
+ },
38
+ "unsupported_rate": {
39
+ "mean": 0.019154520578544703,
40
+ "ci95_low": 0.016682863959804863,
41
+ "ci95_high": 0.02152086512348487
42
+ },
43
+ "uncertain_rate": {
44
+ "mean": 0.016585692745853576,
45
+ "ci95_low": 0.014508736743501513,
46
+ "ci95_high": 0.018943918295760965
47
+ }
48
+ },
49
+ "attribute": {
50
+ "visual_units": 36712,
51
+ "grounded_precision": {
52
+ "mean": 0.9358247984310307,
53
+ "ci95_low": 0.9329203137056817,
54
+ "ci95_high": 0.9387029984241348
55
+ },
56
+ "unsupported_rate": {
57
+ "mean": 0.023343865765962084,
58
+ "ci95_low": 0.02125146772343254,
59
+ "ci95_high": 0.02541166652055019
60
+ },
61
+ "uncertain_rate": {
62
+ "mean": 0.04083133580300719,
63
+ "ci95_low": 0.038792389054040935,
64
+ "ci95_high": 0.04300736625807117
65
+ }
66
+ },
67
+ "relation": {
68
+ "visual_units": 8429,
69
+ "grounded_precision": {
70
+ "mean": 0.9569343931664491,
71
+ "ci95_low": 0.9522466319118269,
72
+ "ci95_high": 0.9616699703646194
73
+ },
74
+ "unsupported_rate": {
75
+ "mean": 0.027642662237513348,
76
+ "ci95_low": 0.023718871209624296,
77
+ "ci95_high": 0.03163265585009613
78
+ },
79
+ "uncertain_rate": {
80
+ "mean": 0.01542294459603749,
81
+ "ci95_low": 0.012585268641188942,
82
+ "ci95_high": 0.018206130406180576
83
+ }
84
+ },
85
+ "style": {
86
+ "visual_units": 1109,
87
+ "grounded_precision": {
88
+ "mean": 0.9828674481514879,
89
+ "ci95_low": 0.9745385100240553,
90
+ "ci95_high": 0.9901079136690647
91
+ },
92
+ "unsupported_rate": {
93
+ "mean": 0.007213706041478809,
94
+ "ci95_low": 0.0026977811281971727,
95
+ "ci95_high": 0.012545192457534672
96
+ },
97
+ "uncertain_rate": {
98
+ "mean": 0.009918845807033363,
99
+ "ci95_low": 0.004408880233418492,
100
+ "ci95_high": 0.017197192268298016
101
+ }
102
+ },
103
+ "camera": {
104
+ "visual_units": 678,
105
+ "grounded_precision": {
106
+ "mean": 0.9808259587020649,
107
+ "ci95_low": 0.9688426248938727,
108
+ "ci95_high": 0.9908814589665653
109
+ },
110
+ "unsupported_rate": {
111
+ "mean": 0.017699115044247787,
112
+ "ci95_low": 0.008570211038961039,
113
+ "ci95_high": 0.02886261350438178
114
+ },
115
+ "uncertain_rate": {
116
+ "mean": 0.0014749262536873156,
117
+ "ci95_low": 0.0,
118
+ "ci95_high": 0.004665811543436868
119
+ }
120
+ },
121
+ "lighting": {
122
+ "visual_units": 1616,
123
+ "grounded_precision": {
124
+ "mean": 0.9628712871287128,
125
+ "ci95_low": 0.9523794970310863,
126
+ "ci95_high": 0.9724717111656482
127
+ },
128
+ "unsupported_rate": {
129
+ "mean": 0.01608910891089109,
130
+ "ci95_low": 0.009432195604550915,
131
+ "ci95_high": 0.023197884730650722
132
+ },
133
+ "uncertain_rate": {
134
+ "mean": 0.02103960396039604,
135
+ "ci95_low": 0.014229015493625012,
136
+ "ci95_high": 0.028957791095690914
137
+ }
138
+ },
139
+ "count": {
140
+ "visual_units": 2519,
141
+ "grounded_precision": {
142
+ "mean": 0.9174275506153236,
143
+ "ci95_low": 0.9051838517773609,
144
+ "ci95_high": 0.9293143510760096
145
+ },
146
+ "unsupported_rate": {
147
+ "mean": 0.059944422389837236,
148
+ "ci95_low": 0.04997500859645053,
149
+ "ci95_high": 0.07063250288500046
150
+ },
151
+ "uncertain_rate": {
152
+ "mean": 0.02262802699483922,
153
+ "ci95_low": 0.016627515214735912,
154
+ "ci95_high": 0.029262822135353646
155
+ }
156
+ },
157
+ "text_rendering": {
158
+ "visual_units": 1924,
159
+ "grounded_precision": {
160
+ "mean": 0.9002079002079002,
161
+ "ci95_low": 0.883913116269738,
162
+ "ci95_high": 0.9158168932758676
163
+ },
164
+ "unsupported_rate": {
165
+ "mean": 0.058731808731808735,
166
+ "ci95_low": 0.04716448643471811,
167
+ "ci95_high": 0.07090820750728064
168
+ },
169
+ "uncertain_rate": {
170
+ "mean": 0.04106029106029106,
171
+ "ci95_low": 0.03121766477392487,
172
+ "ci95_high": 0.05085673527047604
173
+ }
174
+ }
175
+ }
176
+ },
177
+ "datacomp_ref_llava15_llama3_gemma4": {
178
+ "input": "artifacts/grounded-cbu/grounded_verify_v2_ref_datacomp_recap_llava15_b64_5k.responses.gemma4_31b_c12_file_mt2048.jsonl",
179
+ "captions": 4993,
180
+ "visual_units": 49844,
181
+ "grounded_units_per_caption": {
182
+ "mean": 8.284398157420389,
183
+ "ci95_low": 8.18925996394953,
184
+ "ci95_high": 8.376146605247348
185
+ },
186
+ "grounded_precision": {
187
+ "mean": 0.8298691918786614,
188
+ "ci95_low": 0.8248098325178892,
189
+ "ci95_high": 0.8346372137552281
190
+ },
191
+ "unsupported_rate": {
192
+ "mean": 0.1364256480218281,
193
+ "ci95_low": 0.13176538720325284,
194
+ "ci95_high": 0.14123288049770946
195
+ },
196
+ "uncertain_rate": {
197
+ "mean": 0.033705160099510474,
198
+ "ci95_low": 0.03192668676571995,
199
+ "ci95_high": 0.035440948366925316
200
+ },
201
+ "categories": {
202
+ "object": {
203
+ "visual_units": 17553,
204
+ "grounded_precision": {
205
+ "mean": 0.8627015325015667,
206
+ "ci95_low": 0.8561038616458624,
207
+ "ci95_high": 0.8689074008006997
208
+ },
209
+ "unsupported_rate": {
210
+ "mean": 0.11000968495413889,
211
+ "ci95_low": 0.10447249094349349,
212
+ "ci95_high": 0.11623041776294255
213
+ },
214
+ "uncertain_rate": {
215
+ "mean": 0.027288782544294423,
216
+ "ci95_low": 0.024654990286546868,
217
+ "ci95_high": 0.030052245129642233
218
+ }
219
+ },
220
+ "attribute": {
221
+ "visual_units": 18950,
222
+ "grounded_precision": {
223
+ "mean": 0.8221108179419525,
224
+ "ci95_low": 0.8151779991843955,
225
+ "ci95_high": 0.8291072843433401
226
+ },
227
+ "unsupported_rate": {
228
+ "mean": 0.1404221635883905,
229
+ "ci95_low": 0.13385510183117233,
230
+ "ci95_high": 0.14709778344098867
231
+ },
232
+ "uncertain_rate": {
233
+ "mean": 0.03746701846965699,
234
+ "ci95_low": 0.034650721292589025,
235
+ "ci95_high": 0.040226820003506435
236
+ }
237
+ },
238
+ "relation": {
239
+ "visual_units": 6437,
240
+ "grounded_precision": {
241
+ "mean": 0.8017710113406866,
242
+ "ci95_low": 0.7898056755645372,
243
+ "ci95_high": 0.8134298562953918
244
+ },
245
+ "unsupported_rate": {
246
+ "mean": 0.1673139661332919,
247
+ "ci95_low": 0.1566262814618043,
248
+ "ci95_high": 0.17846895550463168
249
+ },
250
+ "uncertain_rate": {
251
+ "mean": 0.03091502252602144,
252
+ "ci95_low": 0.02641733861850204,
253
+ "ci95_high": 0.03542643825320441
254
+ }
255
+ },
256
+ "style": {
257
+ "visual_units": 1363,
258
+ "grounded_precision": {
259
+ "mean": 0.909024211298606,
260
+ "ci95_low": 0.8905430120969045,
261
+ "ci95_high": 0.9264275304716482
262
+ },
263
+ "unsupported_rate": {
264
+ "mean": 0.08290535583272193,
265
+ "ci95_low": 0.06550163949711728,
266
+ "ci95_high": 0.10098194051552681
267
+ },
268
+ "uncertain_rate": {
269
+ "mean": 0.008070432868672046,
270
+ "ci95_low": 0.0036574442409453276,
271
+ "ci95_high": 0.012908330997616794
272
+ }
273
+ },
274
+ "camera": {
275
+ "visual_units": 586,
276
+ "grounded_precision": {
277
+ "mean": 0.9539249146757679,
278
+ "ci95_low": 0.9340990245313002,
279
+ "ci95_high": 0.971190084888673
280
+ },
281
+ "unsupported_rate": {
282
+ "mean": 0.03924914675767918,
283
+ "ci95_low": 0.023449907193625785,
284
+ "ci95_high": 0.0568568009573195
285
+ },
286
+ "uncertain_rate": {
287
+ "mean": 0.006825938566552901,
288
+ "ci95_low": 0.0,
289
+ "ci95_high": 0.016158570241643567
290
+ }
291
+ },
292
+ "lighting": {
293
+ "visual_units": 557,
294
+ "grounded_precision": {
295
+ "mean": 0.9425493716337523,
296
+ "ci95_low": 0.9209545841641741,
297
+ "ci95_high": 0.9637415139693121
298
+ },
299
+ "unsupported_rate": {
300
+ "mean": 0.03052064631956912,
301
+ "ci95_low": 0.016860036861274973,
302
+ "ci95_high": 0.04584162481699221
303
+ },
304
+ "uncertain_rate": {
305
+ "mean": 0.026929982046678635,
306
+ "ci95_low": 0.014284443314692424,
307
+ "ci95_high": 0.04159139842234417
308
+ }
309
+ },
310
+ "count": {
311
+ "visual_units": 1621,
312
+ "grounded_precision": {
313
+ "mean": 0.7717458359037631,
314
+ "ci95_low": 0.7495150656472152,
315
+ "ci95_high": 0.7925675387235496
316
+ },
317
+ "unsupported_rate": {
318
+ "mean": 0.2066625539790253,
319
+ "ci95_low": 0.18633889356134942,
320
+ "ci95_high": 0.2282719491411228
321
+ },
322
+ "uncertain_rate": {
323
+ "mean": 0.0215916101172116,
324
+ "ci95_low": 0.014293566934707975,
325
+ "ci95_high": 0.02975907779730107
326
+ }
327
+ },
328
+ "text_rendering": {
329
+ "visual_units": 2777,
330
+ "grounded_precision": {
331
+ "mean": 0.6867122794382428,
332
+ "ci95_low": 0.6659418299092212,
333
+ "ci95_high": 0.7066677204074464
334
+ },
335
+ "unsupported_rate": {
336
+ "mean": 0.23154483255311487,
337
+ "ci95_low": 0.2122531154567533,
338
+ "ci95_high": 0.2510471273839642
339
+ },
340
+ "uncertain_rate": {
341
+ "mean": 0.08174288800864242,
342
+ "ci95_low": 0.07049427394217786,
343
+ "ci95_high": 0.09295352323838081
344
+ }
345
+ }
346
+ }
347
+ },
348
+ "laion_pop_ours_gemma4": {
349
+ "input": "artifacts/grounded-cbu/gemma-cross-corpus-2026-05-02/responses/grounded_verify_v2_laion_pop_llama32_paired__ours_laion_pop_b64_5k.responses.gemma4_31b_file_mt2048.merged.jsonl",
350
+ "captions": 5235,
351
+ "visual_units": 76840,
352
+ "grounded_units_per_caption": {
353
+ "mean": 14.014708691499523,
354
+ "ci95_low": 13.919369627507164,
355
+ "ci95_high": 14.106432664756447
356
+ },
357
+ "grounded_precision": {
358
+ "mean": 0.9548021863612701,
359
+ "ci95_low": 0.9529896799722325,
360
+ "ci95_high": 0.9566572837454304
361
+ },
362
+ "unsupported_rate": {
363
+ "mean": 0.01773815720978657,
364
+ "ci95_low": 0.016448365431438447,
365
+ "ci95_high": 0.019014632987482634
366
+ },
367
+ "uncertain_rate": {
368
+ "mean": 0.02745965642894326,
369
+ "ci95_low": 0.02619943061801823,
370
+ "ci95_high": 0.028701005707437294
371
+ },
372
+ "categories": {
373
+ "object": {
374
+ "visual_units": 20835,
375
+ "grounded_precision": {
376
+ "mean": 0.9669786417086633,
377
+ "ci95_low": 0.9641924658755847,
378
+ "ci95_high": 0.9696334048225462
379
+ },
380
+ "unsupported_rate": {
381
+ "mean": 0.013486921046316295,
382
+ "ci95_low": 0.011761759215509404,
383
+ "ci95_high": 0.01523684166812232
384
+ },
385
+ "uncertain_rate": {
386
+ "mean": 0.019534437245020398,
387
+ "ci95_low": 0.01751867601543315,
388
+ "ci95_high": 0.02170879413425821
389
+ }
390
+ },
391
+ "attribute": {
392
+ "visual_units": 35923,
393
+ "grounded_precision": {
394
+ "mean": 0.9427664727333464,
395
+ "ci95_low": 0.9399496630663411,
396
+ "ci95_high": 0.9454086042005748
397
+ },
398
+ "unsupported_rate": {
399
+ "mean": 0.017565348105670463,
400
+ "ci95_low": 0.016032057535235764,
401
+ "ci95_high": 0.019248168499202422
402
+ },
403
+ "uncertain_rate": {
404
+ "mean": 0.03966817916098321,
405
+ "ci95_low": 0.037560816636684535,
406
+ "ci95_high": 0.0418013189080644
407
+ }
408
+ },
409
+ "relation": {
410
+ "visual_units": 12589,
411
+ "grounded_precision": {
412
+ "mean": 0.9656048931606959,
413
+ "ci95_low": 0.9620350212749554,
414
+ "ci95_high": 0.9689216951647477
415
+ },
416
+ "unsupported_rate": {
417
+ "mean": 0.021606164111525935,
418
+ "ci95_low": 0.018942978341140136,
419
+ "ci95_high": 0.024367183441305597
420
+ },
421
+ "uncertain_rate": {
422
+ "mean": 0.012788942727778219,
423
+ "ci95_low": 0.010787349834146821,
424
+ "ci95_high": 0.014976207495865219
425
+ }
426
+ },
427
+ "style": {
428
+ "visual_units": 1565,
429
+ "grounded_precision": {
430
+ "mean": 0.9763578274760384,
431
+ "ci95_low": 0.9657621612521076,
432
+ "ci95_high": 0.9846898671346335
433
+ },
434
+ "unsupported_rate": {
435
+ "mean": 0.021725239616613417,
436
+ "ci95_low": 0.013766778785982479,
437
+ "ci95_high": 0.031727679468470917
438
+ },
439
+ "uncertain_rate": {
440
+ "mean": 0.0019169329073482429,
441
+ "ci95_low": 0.0,
442
+ "ci95_high": 0.004405355696357039
443
+ }
444
+ },
445
+ "camera": {
446
+ "visual_units": 1402,
447
+ "grounded_precision": {
448
+ "mean": 0.9800285306704708,
449
+ "ci95_low": 0.9726416958689414,
450
+ "ci95_high": 0.9869196271807313
451
+ },
452
+ "unsupported_rate": {
453
+ "mean": 0.01783166904422254,
454
+ "ci95_low": 0.011387697868312216,
455
+ "ci95_high": 0.024673887724202186
456
+ },
457
+ "uncertain_rate": {
458
+ "mean": 0.0021398002853067048,
459
+ "ci95_low": 0.0,
460
+ "ci95_high": 0.004847729414903929
461
+ }
462
+ },
463
+ "lighting": {
464
+ "visual_units": 2120,
465
+ "grounded_precision": {
466
+ "mean": 0.9636792452830188,
467
+ "ci95_low": 0.9544776513590889,
468
+ "ci95_high": 0.9722486967620492
469
+ },
470
+ "unsupported_rate": {
471
+ "mean": 0.01650943396226415,
472
+ "ci95_low": 0.010505957588748659,
473
+ "ci95_high": 0.02313187443083103
474
+ },
475
+ "uncertain_rate": {
476
+ "mean": 0.01981132075471698,
477
+ "ci95_low": 0.013888390722142515,
478
+ "ci95_high": 0.02617801047120419
479
+ }
480
+ },
481
+ "count": {
482
+ "visual_units": 1882,
483
+ "grounded_precision": {
484
+ "mean": 0.9399574920297555,
485
+ "ci95_low": 0.9286488557993069,
486
+ "ci95_high": 0.9508577205269301
487
+ },
488
+ "unsupported_rate": {
489
+ "mean": 0.036663124335812966,
490
+ "ci95_low": 0.028046849152647695,
491
+ "ci95_high": 0.045934815491309684
492
+ },
493
+ "uncertain_rate": {
494
+ "mean": 0.023379383634431455,
495
+ "ci95_low": 0.01678006329983653,
496
+ "ci95_high": 0.030721138855536626
497
+ }
498
+ },
499
+ "text_rendering": {
500
+ "visual_units": 524,
501
+ "grounded_precision": {
502
+ "mean": 0.9217557251908397,
503
+ "ci95_low": 0.89171974522293,
504
+ "ci95_high": 0.9460091281751263
505
+ },
506
+ "unsupported_rate": {
507
+ "mean": 0.030534351145038167,
508
+ "ci95_low": 0.015685507079544147,
509
+ "ci95_high": 0.048290478163096215
510
+ },
511
+ "uncertain_rate": {
512
+ "mean": 0.04770992366412214,
513
+ "ci95_low": 0.027535899481451933,
514
+ "ci95_high": 0.07203408000697735
515
+ }
516
+ }
517
+ }
518
+ },
519
+ "laion_pop_ref_llama32_11b_gemma4": {
520
+ "input": "artifacts/grounded-cbu/gemma-cross-corpus-2026-05-02/responses/grounded_verify_v2_laion_pop_llama32_paired__ref_laion_pop_llama32_11b_b64_5k.responses.gemma4_31b_file_mt2048.merged.jsonl",
521
+ "captions": 4934,
522
+ "visual_units": 58530,
523
+ "grounded_units_per_caption": {
524
+ "mean": 10.883461694365627,
525
+ "ci95_low": 10.802391568706932,
526
+ "ci95_high": 10.96169943250912
527
+ },
528
+ "grounded_precision": {
529
+ "mean": 0.9174611310439091,
530
+ "ci95_low": 0.9144809990321734,
531
+ "ci95_high": 0.9203529591516613
532
+ },
533
+ "unsupported_rate": {
534
+ "mean": 0.04741158380317786,
535
+ "ci95_low": 0.045078869871934914,
536
+ "ci95_high": 0.049798940810027836
537
+ },
538
+ "uncertain_rate": {
539
+ "mean": 0.03512728515291304,
540
+ "ci95_low": 0.03354652742291209,
541
+ "ci95_high": 0.03688470763792352
542
+ },
543
+ "categories": {
544
+ "object": {
545
+ "visual_units": 17499,
546
+ "grounded_precision": {
547
+ "mean": 0.9395394022515572,
548
+ "ci95_low": 0.9354296734324157,
549
+ "ci95_high": 0.9435548006534396
550
+ },
551
+ "unsupported_rate": {
552
+ "mean": 0.030116006628950226,
553
+ "ci95_low": 0.02722223088526751,
554
+ "ci95_high": 0.03316094958286966
555
+ },
556
+ "uncertain_rate": {
557
+ "mean": 0.03034459111949254,
558
+ "ci95_low": 0.02755354972686743,
559
+ "ci95_high": 0.03310257103714487
560
+ }
561
+ },
562
+ "attribute": {
563
+ "visual_units": 24300,
564
+ "grounded_precision": {
565
+ "mean": 0.9122222222222223,
566
+ "ci95_low": 0.9080765707555861,
567
+ "ci95_high": 0.9160479385733995
568
+ },
569
+ "unsupported_rate": {
570
+ "mean": 0.04477366255144033,
571
+ "ci95_low": 0.041743665031443666,
572
+ "ci95_high": 0.04783829164134451
573
+ },
574
+ "uncertain_rate": {
575
+ "mean": 0.04300411522633745,
576
+ "ci95_low": 0.04052670604232754,
577
+ "ci95_high": 0.04575259635168874
578
+ }
579
+ },
580
+ "relation": {
581
+ "visual_units": 9388,
582
+ "grounded_precision": {
583
+ "mean": 0.8931614827439284,
584
+ "ci95_low": 0.8862970693301438,
585
+ "ci95_high": 0.8998709940861319
586
+ },
587
+ "unsupported_rate": {
588
+ "mean": 0.07754580315296122,
589
+ "ci95_low": 0.07152723210962283,
590
+ "ci95_high": 0.0835563589120201
591
+ },
592
+ "uncertain_rate": {
593
+ "mean": 0.029292714103110355,
594
+ "ci95_low": 0.025967829133300344,
595
+ "ci95_high": 0.03268904460200468
596
+ }
597
+ },
598
+ "style": {
599
+ "visual_units": 2803,
600
+ "grounded_precision": {
601
+ "mean": 0.981805208704959,
602
+ "ci95_low": 0.9766441929800787,
603
+ "ci95_high": 0.9867575739654666
604
+ },
605
+ "unsupported_rate": {
606
+ "mean": 0.008205494113449875,
607
+ "ci95_low": 0.004992822832673466,
608
+ "ci95_high": 0.01167728237791932
609
+ },
610
+ "uncertain_rate": {
611
+ "mean": 0.009989297181591153,
612
+ "ci95_low": 0.006585731512074666,
613
+ "ci95_high": 0.013724197034056185
614
+ }
615
+ },
616
+ "camera": {
617
+ "visual_units": 1689,
618
+ "grounded_precision": {
619
+ "mean": 0.9579632918886916,
620
+ "ci95_low": 0.9480353474320242,
621
+ "ci95_high": 0.9678189056986748
622
+ },
623
+ "unsupported_rate": {
624
+ "mean": 0.037892243931320305,
625
+ "ci95_low": 0.02838515125969358,
626
+ "ci95_high": 0.047281925510680764
627
+ },
628
+ "uncertain_rate": {
629
+ "mean": 0.0041444641799881585,
630
+ "ci95_low": 0.0012570118775073402,
631
+ "ci95_high": 0.007229680974351121
632
+ }
633
+ },
634
+ "lighting": {
635
+ "visual_units": 577,
636
+ "grounded_precision": {
637
+ "mean": 0.9428076256499134,
638
+ "ci95_low": 0.9221392337020697,
639
+ "ci95_high": 0.9619077757685353
640
+ },
641
+ "unsupported_rate": {
642
+ "mean": 0.025996533795493933,
643
+ "ci95_low": 0.01296236393509961,
644
+ "ci95_high": 0.040544726142071254
645
+ },
646
+ "uncertain_rate": {
647
+ "mean": 0.03119584055459272,
648
+ "ci95_low": 0.018032786885245903,
649
+ "ci95_high": 0.04659290720251998
650
+ }
651
+ },
652
+ "count": {
653
+ "visual_units": 1672,
654
+ "grounded_precision": {
655
+ "mean": 0.8038277511961722,
656
+ "ci95_low": 0.7818174008122463,
657
+ "ci95_high": 0.8253329734260568
658
+ },
659
+ "unsupported_rate": {
660
+ "mean": 0.13337320574162678,
661
+ "ci95_low": 0.11472343041732015,
662
+ "ci95_high": 0.15234714667072424
663
+ },
664
+ "uncertain_rate": {
665
+ "mean": 0.06279904306220095,
666
+ "ci95_low": 0.05067097561692991,
667
+ "ci95_high": 0.0765492834474793
668
+ }
669
+ },
670
+ "text_rendering": {
671
+ "visual_units": 602,
672
+ "grounded_precision": {
673
+ "mean": 0.7441860465116279,
674
+ "ci95_low": 0.7047773322456619,
675
+ "ci95_high": 0.7833348375451263
676
+ },
677
+ "unsupported_rate": {
678
+ "mean": 0.1777408637873754,
679
+ "ci95_low": 0.1421428961077926,
680
+ "ci95_high": 0.21182370376555062
681
+ },
682
+ "uncertain_rate": {
683
+ "mean": 0.07807308970099668,
684
+ "ci95_low": 0.053881497816718076,
685
+ "ci95_high": 0.10283172675481708
686
+ }
687
+ }
688
+ }
689
+ },
690
+ "pd12m_ours_gemma4": {
691
+ "input": "artifacts/grounded-cbu/gemma-cross-corpus-2026-05-02/responses/grounded_verify_v2_pd12m_full_paired__ours_pd12m_img2dataset_b64_5k.responses.gemma4_31b_file_mt2048.merged.jsonl",
692
+ "captions": 4878,
693
+ "visual_units": 72226,
694
+ "grounded_units_per_caption": {
695
+ "mean": 13.98339483394834,
696
+ "ci95_low": 13.892783927839279,
697
+ "ci95_high": 14.080975809758097
698
+ },
699
+ "grounded_precision": {
700
+ "mean": 0.9444106000609199,
701
+ "ci95_low": 0.9423015111074523,
702
+ "ci95_high": 0.9466077303227699
703
+ },
704
+ "unsupported_rate": {
705
+ "mean": 0.017168332733364718,
706
+ "ci95_low": 0.015683672986280997,
707
+ "ci95_high": 0.01852629414933296
708
+ },
709
+ "uncertain_rate": {
710
+ "mean": 0.03842106720571539,
711
+ "ci95_low": 0.03688762531728115,
712
+ "ci95_high": 0.039992564217287176
713
+ },
714
+ "categories": {
715
+ "object": {
716
+ "visual_units": 19969,
717
+ "grounded_precision": {
718
+ "mean": 0.9584355751414693,
719
+ "ci95_low": 0.9551936511076642,
720
+ "ci95_high": 0.9616730404750264
721
+ },
722
+ "unsupported_rate": {
723
+ "mean": 0.014622665130952978,
724
+ "ci95_low": 0.012649269110116218,
725
+ "ci95_high": 0.016574620430186544
726
+ },
727
+ "uncertain_rate": {
728
+ "mean": 0.026941759727577747,
729
+ "ci95_low": 0.024444098859569916,
730
+ "ci95_high": 0.029422380226120055
731
+ }
732
+ },
733
+ "attribute": {
734
+ "visual_units": 32324,
735
+ "grounded_precision": {
736
+ "mean": 0.9330528399950501,
737
+ "ci95_low": 0.9301680446952336,
738
+ "ci95_high": 0.935928329231497
739
+ },
740
+ "unsupported_rate": {
741
+ "mean": 0.01385967083281772,
742
+ "ci95_low": 0.012391638907511236,
743
+ "ci95_high": 0.015346772960610868
744
+ },
745
+ "uncertain_rate": {
746
+ "mean": 0.053087489172132164,
747
+ "ci95_low": 0.050520440024859774,
748
+ "ci95_high": 0.05560571293156231
749
+ }
750
+ },
751
+ "relation": {
752
+ "visual_units": 12392,
753
+ "grounded_precision": {
754
+ "mean": 0.954728857327308,
755
+ "ci95_low": 0.9505503287086146,
756
+ "ci95_high": 0.9587225334593927
757
+ },
758
+ "unsupported_rate": {
759
+ "mean": 0.02485474499677211,
760
+ "ci95_low": 0.02186390817849199,
761
+ "ci95_high": 0.028033882851901566
762
+ },
763
+ "uncertain_rate": {
764
+ "mean": 0.020416397675919948,
765
+ "ci95_low": 0.017864292388336762,
766
+ "ci95_high": 0.023058749646890597
767
+ }
768
+ },
769
+ "style": {
770
+ "visual_units": 1524,
771
+ "grounded_precision": {
772
+ "mean": 0.9862204724409449,
773
+ "ci95_low": 0.9797202731861921,
774
+ "ci95_high": 0.9920013245033112
775
+ },
776
+ "unsupported_rate": {
777
+ "mean": 0.007217847769028871,
778
+ "ci95_low": 0.003164208003318099,
779
+ "ci95_high": 0.012418503699714024
780
+ },
781
+ "uncertain_rate": {
782
+ "mean": 0.006561679790026247,
783
+ "ci95_low": 0.002668268089127504,
784
+ "ci95_high": 0.010596553830468846
785
+ }
786
+ },
787
+ "camera": {
788
+ "visual_units": 1082,
789
+ "grounded_precision": {
790
+ "mean": 0.9852125693160814,
791
+ "ci95_low": 0.9784240150093808,
792
+ "ci95_high": 0.9918867525372248
793
+ },
794
+ "unsupported_rate": {
795
+ "mean": 0.009242144177449169,
796
+ "ci95_low": 0.003787878787878788,
797
+ "ci95_high": 0.014955437709322918
798
+ },
799
+ "uncertain_rate": {
800
+ "mean": 0.005545286506469501,
801
+ "ci95_low": 0.0018095891857640724,
802
+ "ci95_high": 0.010428031210044313
803
+ }
804
+ },
805
+ "lighting": {
806
+ "visual_units": 1452,
807
+ "grounded_precision": {
808
+ "mean": 0.9545454545454546,
809
+ "ci95_low": 0.9434580065647575,
810
+ "ci95_high": 0.9655643556525163
811
+ },
812
+ "unsupported_rate": {
813
+ "mean": 0.014462809917355372,
814
+ "ci95_low": 0.009001684290707909,
815
+ "ci95_high": 0.02069425901201602
816
+ },
817
+ "uncertain_rate": {
818
+ "mean": 0.030991735537190084,
819
+ "ci95_low": 0.0222833477322651,
820
+ "ci95_high": 0.040167411791790744
821
+ }
822
+ },
823
+ "count": {
824
+ "visual_units": 2571,
825
+ "grounded_precision": {
826
+ "mean": 0.9237650719564372,
827
+ "ci95_low": 0.9131429308480155,
828
+ "ci95_high": 0.9345090816892005
829
+ },
830
+ "unsupported_rate": {
831
+ "mean": 0.03695060287825749,
832
+ "ci95_low": 0.029422150757030592,
833
+ "ci95_high": 0.04478193146417445
834
+ },
835
+ "uncertain_rate": {
836
+ "mean": 0.03928432516530533,
837
+ "ci95_low": 0.03158238405367036,
838
+ "ci95_high": 0.047523771907299846
839
+ }
840
+ },
841
+ "text_rendering": {
842
+ "visual_units": 912,
843
+ "grounded_precision": {
844
+ "mean": 0.8234649122807017,
845
+ "ci95_low": 0.7946618486235656,
846
+ "ci95_high": 0.8521741564827888
847
+ },
848
+ "unsupported_rate": {
849
+ "mean": 0.06030701754385965,
850
+ "ci95_low": 0.04179600692755259,
851
+ "ci95_high": 0.07931262648307606
852
+ },
853
+ "uncertain_rate": {
854
+ "mean": 0.1162280701754386,
855
+ "ci95_low": 0.09346544774388123,
856
+ "ci95_high": 0.13969858929173884
857
+ }
858
+ }
859
+ }
860
+ },
861
+ "pd12m_ref_gemma4": {
862
+ "input": "artifacts/grounded-cbu/gemma-cross-corpus-2026-05-02/responses/grounded_verify_v2_pd12m_full_paired__ref_pd12m_full_b64_5k.responses.gemma4_31b_file_mt2048.merged.jsonl",
863
+ "captions": 4989,
864
+ "visual_units": 48670,
865
+ "grounded_units_per_caption": {
866
+ "mean": 8.67849268390459,
867
+ "ci95_low": 8.588093806374022,
868
+ "ci95_high": 8.76830026057326
869
+ },
870
+ "grounded_precision": {
871
+ "mean": 0.8896034518183686,
872
+ "ci95_low": 0.8854235063494741,
873
+ "ci95_high": 0.8938611245657099
874
+ },
875
+ "unsupported_rate": {
876
+ "mean": 0.07507704951715635,
877
+ "ci95_low": 0.07145497249553306,
878
+ "ci95_high": 0.07861058520678543
879
+ },
880
+ "uncertain_rate": {
881
+ "mean": 0.035319498664475035,
882
+ "ci95_low": 0.03349925960458296,
883
+ "ci95_high": 0.03722167373614036
884
+ },
885
+ "categories": {
886
+ "object": {
887
+ "visual_units": 21867,
888
+ "grounded_precision": {
889
+ "mean": 0.924909681254859,
890
+ "ci95_low": 0.9211441883611063,
891
+ "ci95_high": 0.9290266397219379
892
+ },
893
+ "unsupported_rate": {
894
+ "mean": 0.04856633283029222,
895
+ "ci95_low": 0.04521409002469015,
896
+ "ci95_high": 0.05165961129865847
897
+ },
898
+ "uncertain_rate": {
899
+ "mean": 0.02652398591484886,
900
+ "ci95_low": 0.024442020449743983,
901
+ "ci95_high": 0.028826580589740913
902
+ }
903
+ },
904
+ "attribute": {
905
+ "visual_units": 10053,
906
+ "grounded_precision": {
907
+ "mean": 0.817268477071521,
908
+ "ci95_low": 0.8081304603115136,
909
+ "ci95_high": 0.8263230891360528
910
+ },
911
+ "unsupported_rate": {
912
+ "mean": 0.11150900228787426,
913
+ "ci95_low": 0.10416651002506266,
914
+ "ci95_high": 0.119007775466157
915
+ },
916
+ "uncertain_rate": {
917
+ "mean": 0.07122252064060479,
918
+ "ci95_low": 0.0659271432900421,
919
+ "ci95_high": 0.076343588817552
920
+ }
921
+ },
922
+ "relation": {
923
+ "visual_units": 11840,
924
+ "grounded_precision": {
925
+ "mean": 0.8788851351351351,
926
+ "ci95_low": 0.8722028400918369,
927
+ "ci95_high": 0.8860438985440879
928
+ },
929
+ "unsupported_rate": {
930
+ "mean": 0.09628378378378379,
931
+ "ci95_low": 0.08994703483955481,
932
+ "ci95_high": 0.10233539423457812
933
+ },
934
+ "uncertain_rate": {
935
+ "mean": 0.02483108108108108,
936
+ "ci95_low": 0.021855507466231116,
937
+ "ci95_high": 0.02752990768992023
938
+ }
939
+ },
940
+ "style": {
941
+ "visual_units": 2000,
942
+ "grounded_precision": {
943
+ "mean": 0.9705,
944
+ "ci95_low": 0.9620309397129904,
945
+ "ci95_high": 0.9784141959009558
946
+ },
947
+ "unsupported_rate": {
948
+ "mean": 0.008,
949
+ "ci95_low": 0.0039485699710470005,
950
+ "ci95_high": 0.012525678867775709
951
+ },
952
+ "uncertain_rate": {
953
+ "mean": 0.0215,
954
+ "ci95_low": 0.014712355895481156,
955
+ "ci95_high": 0.028557222991185252
956
+ }
957
+ },
958
+ "camera": {
959
+ "visual_units": 529,
960
+ "grounded_precision": {
961
+ "mean": 0.9905482041587902,
962
+ "ci95_low": 0.9815481071477382,
963
+ "ci95_high": 0.9980694980694981
964
+ },
965
+ "unsupported_rate": {
966
+ "mean": 0.00945179584120983,
967
+ "ci95_low": 0.0019305019305019305,
968
+ "ci95_high": 0.018451892852261845
969
+ },
970
+ "uncertain_rate": {
971
+ "mean": 0.0,
972
+ "ci95_low": 0.0,
973
+ "ci95_high": 0.0
974
+ }
975
+ },
976
+ "lighting": {
977
+ "visual_units": 151,
978
+ "grounded_precision": {
979
+ "mean": 0.7880794701986755,
980
+ "ci95_low": 0.7133333333333334,
981
+ "ci95_high": 0.8633610954263128
982
+ },
983
+ "unsupported_rate": {
984
+ "mean": 0.1390728476821192,
985
+ "ci95_low": 0.07842094284522319,
986
+ "ci95_high": 0.2025359219979473
987
+ },
988
+ "uncertain_rate": {
989
+ "mean": 0.0728476821192053,
990
+ "ci95_low": 0.031007751937984496,
991
+ "ci95_high": 0.1171875
992
+ }
993
+ },
994
+ "count": {
995
+ "visual_units": 1235,
996
+ "grounded_precision": {
997
+ "mean": 0.891497975708502,
998
+ "ci95_low": 0.8735133051522816,
999
+ "ci95_high": 0.9087186268570522
1000
+ },
1001
+ "unsupported_rate": {
1002
+ "mean": 0.08016194331983806,
1003
+ "ci95_low": 0.06572418315561385,
1004
+ "ci95_high": 0.0962532336425011
1005
+ },
1006
+ "uncertain_rate": {
1007
+ "mean": 0.02834008097165992,
1008
+ "ci95_low": 0.0197850851610843,
1009
+ "ci95_high": 0.03760887772194304
1010
+ }
1011
+ },
1012
+ "text_rendering": {
1013
+ "visual_units": 995,
1014
+ "grounded_precision": {
1015
+ "mean": 0.7688442211055276,
1016
+ "ci95_low": 0.7408910471345435,
1017
+ "ci95_high": 0.7962583053180506
1018
+ },
1019
+ "unsupported_rate": {
1020
+ "mean": 0.19095477386934673,
1021
+ "ci95_low": 0.16630810749564806,
1022
+ "ci95_high": 0.21827528625954196
1023
+ },
1024
+ "uncertain_rate": {
1025
+ "mean": 0.04020100502512563,
1026
+ "ci95_low": 0.02758440751161421,
1027
+ "ci95_high": 0.05378738930289204
1028
+ }
1029
+ }
1030
+ }
1031
+ },
1032
+ "danbooru_ours_gemma4": {
1033
+ "input": "artifacts/grounded-cbu/gemma-cross-corpus-2026-05-02/responses/grounded_verify_v2_danbooru2023_florence2_paired__ours_danbooru2023_b64_5k.responses.gemma4_31b_file_mt2048.merged.jsonl",
1034
+ "captions": 4879,
1035
+ "visual_units": 69427,
1036
+ "grounded_units_per_caption": {
1037
+ "mean": 13.343103094896495,
1038
+ "ci95_low": 13.263358270137322,
1039
+ "ci95_high": 13.420009223201475
1040
+ },
1041
+ "grounded_precision": {
1042
+ "mean": 0.937689947714866,
1043
+ "ci95_low": 0.9353210266173098,
1044
+ "ci95_high": 0.9400678140345307
1045
+ },
1046
+ "unsupported_rate": {
1047
+ "mean": 0.03569216587206706,
1048
+ "ci95_low": 0.033793547632205025,
1049
+ "ci95_high": 0.037533687334456604
1050
+ },
1051
+ "uncertain_rate": {
1052
+ "mean": 0.026617886413066963,
1053
+ "ci95_low": 0.025264995170242218,
1054
+ "ci95_high": 0.0279962721024009
1055
+ },
1056
+ "categories": {
1057
+ "object": {
1058
+ "visual_units": 18718,
1059
+ "grounded_precision": {
1060
+ "mean": 0.9326851159311892,
1061
+ "ci95_low": 0.928437766000394,
1062
+ "ci95_high": 0.9372108650745524
1063
+ },
1064
+ "unsupported_rate": {
1065
+ "mean": 0.03168073512127364,
1066
+ "ci95_low": 0.0288041447965154,
1067
+ "ci95_high": 0.03464041901717594
1068
+ },
1069
+ "uncertain_rate": {
1070
+ "mean": 0.03563414894753713,
1071
+ "ci95_low": 0.03228536573086078,
1072
+ "ci95_high": 0.03895369338886005
1073
+ }
1074
+ },
1075
+ "attribute": {
1076
+ "visual_units": 33800,
1077
+ "grounded_precision": {
1078
+ "mean": 0.9344970414201184,
1079
+ "ci95_low": 0.9313771418556193,
1080
+ "ci95_high": 0.9374098248629728
1081
+ },
1082
+ "unsupported_rate": {
1083
+ "mean": 0.03644970414201183,
1084
+ "ci95_low": 0.03409606310795786,
1085
+ "ci95_high": 0.03888285322216871
1086
+ },
1087
+ "uncertain_rate": {
1088
+ "mean": 0.02905325443786982,
1089
+ "ci95_low": 0.027214098892000937,
1090
+ "ci95_high": 0.03089446256587214
1091
+ }
1092
+ },
1093
+ "relation": {
1094
+ "visual_units": 12258,
1095
+ "grounded_precision": {
1096
+ "mean": 0.9447707619513787,
1097
+ "ci95_low": 0.940250251476046,
1098
+ "ci95_high": 0.9488500322338025
1099
+ },
1100
+ "unsupported_rate": {
1101
+ "mean": 0.04470549844999184,
1102
+ "ci95_low": 0.040868106622205656,
1103
+ "ci95_high": 0.04874403893744779
1104
+ },
1105
+ "uncertain_rate": {
1106
+ "mean": 0.010523739598629466,
1107
+ "ci95_low": 0.008758328912112647,
1108
+ "ci95_high": 0.012379041421155493
1109
+ }
1110
+ },
1111
+ "style": {
1112
+ "visual_units": 1846,
1113
+ "grounded_precision": {
1114
+ "mean": 0.9739978331527628,
1115
+ "ci95_low": 0.9665969610035756,
1116
+ "ci95_high": 0.9811124814761271
1117
+ },
1118
+ "unsupported_rate": {
1119
+ "mean": 0.008125677139761646,
1120
+ "ci95_low": 0.004282540736754114,
1121
+ "ci95_high": 0.012575177692728267
1122
+ },
1123
+ "uncertain_rate": {
1124
+ "mean": 0.017876489707475622,
1125
+ "ci95_low": 0.01222721386094503,
1126
+ "ci95_high": 0.024109205741154226
1127
+ }
1128
+ },
1129
+ "camera": {
1130
+ "visual_units": 1112,
1131
+ "grounded_precision": {
1132
+ "mean": 0.9820143884892086,
1133
+ "ci95_low": 0.9739271259004545,
1134
+ "ci95_high": 0.9893428063943162
1135
+ },
1136
+ "unsupported_rate": {
1137
+ "mean": 0.015287769784172662,
1138
+ "ci95_low": 0.008795074758135445,
1139
+ "ci95_high": 0.02246401628939388
1140
+ },
1141
+ "uncertain_rate": {
1142
+ "mean": 0.002697841726618705,
1143
+ "ci95_low": 0.0,
1144
+ "ci95_high": 0.006227758007117438
1145
+ }
1146
+ },
1147
+ "lighting": {
1148
+ "visual_units": 572,
1149
+ "grounded_precision": {
1150
+ "mean": 0.9493006993006993,
1151
+ "ci95_low": 0.9295981100021236,
1152
+ "ci95_high": 0.966789667896679
1153
+ },
1154
+ "unsupported_rate": {
1155
+ "mean": 0.02972027972027972,
1156
+ "ci95_low": 0.01688358270137323,
1157
+ "ci95_high": 0.04407311276502592
1158
+ },
1159
+ "uncertain_rate": {
1160
+ "mean": 0.02097902097902098,
1161
+ "ci95_low": 0.009208103130755065,
1162
+ "ci95_high": 0.034488458250213704
1163
+ }
1164
+ },
1165
+ "count": {
1166
+ "visual_units": 977,
1167
+ "grounded_precision": {
1168
+ "mean": 0.9426816786079836,
1169
+ "ci95_low": 0.9268797805121384,
1170
+ "ci95_high": 0.9575213489201178
1171
+ },
1172
+ "unsupported_rate": {
1173
+ "mean": 0.04503582395087001,
1174
+ "ci95_low": 0.03157728881026095,
1175
+ "ci95_high": 0.05983183483183482
1176
+ },
1177
+ "uncertain_rate": {
1178
+ "mean": 0.012282497441146366,
1179
+ "ci95_low": 0.005916946478678972,
1180
+ "ci95_high": 0.02020355530993828
1181
+ }
1182
+ },
1183
+ "text_rendering": {
1184
+ "visual_units": 144,
1185
+ "grounded_precision": {
1186
+ "mean": 0.8472222222222222,
1187
+ "ci95_low": 0.7883211678832117,
1188
+ "ci95_high": 0.9056698746359376
1189
+ },
1190
+ "unsupported_rate": {
1191
+ "mean": 0.08333333333333333,
1192
+ "ci95_low": 0.041666666666666664,
1193
+ "ci95_high": 0.1283812094217122
1194
+ },
1195
+ "uncertain_rate": {
1196
+ "mean": 0.06944444444444445,
1197
+ "ci95_low": 0.02938069594034797,
1198
+ "ci95_high": 0.11489179396788078
1199
+ }
1200
+ }
1201
+ }
1202
+ },
1203
+ "danbooru_ref_florence2_gemma4": {
1204
+ "input": "artifacts/grounded-cbu/gemma-cross-corpus-2026-05-02/responses/grounded_verify_v2_danbooru2023_florence2_paired__ref_danbooru_florence2_b64_5k.responses.gemma4_31b_file_mt2048.merged.jsonl",
1205
+ "captions": 4968,
1206
+ "visual_units": 40646,
1207
+ "grounded_units_per_caption": {
1208
+ "mean": 6.439009661835748,
1209
+ "ci95_low": 6.360904790660225,
1210
+ "ci95_high": 6.514900362318841
1211
+ },
1212
+ "grounded_precision": {
1213
+ "mean": 0.7870147123948236,
1214
+ "ci95_low": 0.7811164877031025,
1215
+ "ci95_high": 0.7933524180366222
1216
+ },
1217
+ "unsupported_rate": {
1218
+ "mean": 0.17389164985484426,
1219
+ "ci95_low": 0.16829958528079125,
1220
+ "ci95_high": 0.17941198580066595
1221
+ },
1222
+ "uncertain_rate": {
1223
+ "mean": 0.03909363775033214,
1224
+ "ci95_low": 0.037210508029968115,
1225
+ "ci95_high": 0.04105116788616811
1226
+ },
1227
+ "categories": {
1228
+ "object": {
1229
+ "visual_units": 15099,
1230
+ "grounded_precision": {
1231
+ "mean": 0.8533015431485529,
1232
+ "ci95_low": 0.8464048689769926,
1233
+ "ci95_high": 0.8600571230864825
1234
+ },
1235
+ "unsupported_rate": {
1236
+ "mean": 0.12696205046691833,
1237
+ "ci95_low": 0.12043951118207459,
1238
+ "ci95_high": 0.13358764830785877
1239
+ },
1240
+ "uncertain_rate": {
1241
+ "mean": 0.01973640638452878,
1242
+ "ci95_low": 0.017507491853620417,
1243
+ "ci95_high": 0.02205217334408175
1244
+ }
1245
+ },
1246
+ "attribute": {
1247
+ "visual_units": 12265,
1248
+ "grounded_precision": {
1249
+ "mean": 0.696942519364044,
1250
+ "ci95_low": 0.6871524158965265,
1251
+ "ci95_high": 0.7068862971725645
1252
+ },
1253
+ "unsupported_rate": {
1254
+ "mean": 0.22380758255197716,
1255
+ "ci95_low": 0.21517133587705695,
1256
+ "ci95_high": 0.23231732751904313
1257
+ },
1258
+ "uncertain_rate": {
1259
+ "mean": 0.0792498980839788,
1260
+ "ci95_low": 0.07454460333231255,
1261
+ "ci95_high": 0.08410270043229699
1262
+ }
1263
+ },
1264
+ "relation": {
1265
+ "visual_units": 7781,
1266
+ "grounded_precision": {
1267
+ "mean": 0.7528595296234417,
1268
+ "ci95_low": 0.7421206268636046,
1269
+ "ci95_high": 0.7633506200025479
1270
+ },
1271
+ "unsupported_rate": {
1272
+ "mean": 0.2223364606091762,
1273
+ "ci95_low": 0.2125031040559888,
1274
+ "ci95_high": 0.2323494647471591
1275
+ },
1276
+ "uncertain_rate": {
1277
+ "mean": 0.024804009767382083,
1278
+ "ci95_low": 0.021322743895831952,
1279
+ "ci95_high": 0.02829033983002319
1280
+ }
1281
+ },
1282
+ "style": {
1283
+ "visual_units": 2893,
1284
+ "grounded_precision": {
1285
+ "mean": 0.9633598340822676,
1286
+ "ci95_low": 0.9559915586069002,
1287
+ "ci95_high": 0.9707531672740175
1288
+ },
1289
+ "unsupported_rate": {
1290
+ "mean": 0.02212236432768752,
1291
+ "ci95_low": 0.016421652562371717,
1292
+ "ci95_high": 0.02826700942420635
1293
+ },
1294
+ "uncertain_rate": {
1295
+ "mean": 0.014517801590044937,
1296
+ "ci95_low": 0.00987937478763167,
1297
+ "ci95_high": 0.019464889394231646
1298
+ }
1299
+ },
1300
+ "camera": {
1301
+ "visual_units": 29,
1302
+ "grounded_precision": {
1303
+ "mean": 1.0,
1304
+ "ci95_low": 1.0,
1305
+ "ci95_high": 1.0
1306
+ },
1307
+ "unsupported_rate": {
1308
+ "mean": 0.0,
1309
+ "ci95_low": 0.0,
1310
+ "ci95_high": 0.0
1311
+ },
1312
+ "uncertain_rate": {
1313
+ "mean": 0.0,
1314
+ "ci95_low": 0.0,
1315
+ "ci95_high": 0.0
1316
+ }
1317
+ },
1318
+ "lighting": {
1319
+ "visual_units": 164,
1320
+ "grounded_precision": {
1321
+ "mean": 0.8048780487804879,
1322
+ "ci95_low": 0.7405001643925694,
1323
+ "ci95_high": 0.8647140021652833
1324
+ },
1325
+ "unsupported_rate": {
1326
+ "mean": 0.1402439024390244,
1327
+ "ci95_low": 0.08749273255813952,
1328
+ "ci95_high": 0.1985839086938143
1329
+ },
1330
+ "uncertain_rate": {
1331
+ "mean": 0.054878048780487805,
1332
+ "ci95_low": 0.023668639053254437,
1333
+ "ci95_high": 0.09090909090909091
1334
+ }
1335
+ },
1336
+ "count": {
1337
+ "visual_units": 946,
1338
+ "grounded_precision": {
1339
+ "mean": 0.9386892177589852,
1340
+ "ci95_low": 0.9228329809725159,
1341
+ "ci95_high": 0.953977018739048
1342
+ },
1343
+ "unsupported_rate": {
1344
+ "mean": 0.05708245243128964,
1345
+ "ci95_low": 0.04231166150670795,
1346
+ "ci95_high": 0.0721654956552192
1347
+ },
1348
+ "uncertain_rate": {
1349
+ "mean": 0.004228329809725159,
1350
+ "ci95_low": 0.0010192387616229663,
1351
+ "ci95_high": 0.008832707471540264
1352
+ }
1353
+ },
1354
+ "text_rendering": {
1355
+ "visual_units": 1469,
1356
+ "grounded_precision": {
1357
+ "mean": 0.5874744724302247,
1358
+ "ci95_low": 0.5621583756000362,
1359
+ "ci95_high": 0.6121796184489994
1360
+ },
1361
+ "unsupported_rate": {
1362
+ "mean": 0.3641933287950987,
1363
+ "ci95_low": 0.3403043178190916,
1364
+ "ci95_high": 0.3885517920790823
1365
+ },
1366
+ "uncertain_rate": {
1367
+ "mean": 0.04833219877467665,
1368
+ "ci95_low": 0.03775967475698478,
1369
+ "ci95_high": 0.05977279311715422
1370
+ }
1371
+ }
1372
+ }
1373
+ }
1374
+ }
1375
+ }
eval_results/gemma-cross-corpus-2026-05-02/cbu_vqa_gemma4_cross_corpus_table.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ | Surface | Resp | OK | Q | Support ↑ | Risk ↓ | Uncertain ↓ |
2
+ |---|---:|---:|---:|---:|---:|---:|
3
+ | danbooru2023_florence2_paired__ours_danbooru2023 | 4,993 | 4,993 | 71,555 | 0.8276 | 0.0938 | 0.0786 |
4
+ | danbooru2023_florence2_paired__ref_danbooru_florence2 | 4,969 | 4,969 | 40,755 | 0.7494 | 0.2345 | 0.0161 |
5
+ | laion_pop_llama32_paired__ours_laion_pop | 4,964 | 4,964 | 73,564 | 0.9192 | 0.0601 | 0.0207 |
6
+ | laion_pop_llama32_paired__ref_laion_pop_llama32_11b | 4,947 | 4,947 | 58,935 | 0.8583 | 0.1131 | 0.0286 |
7
+ | ours_datacomp_forward | 4,775 | 4,775 | 68,500 | 0.8886 | 0.0840 | 0.0274 |
8
+ | pd12m_full_paired__ours_pd12m_img2dataset | 4,957 | 4,957 | 74,463 | 0.9013 | 0.0659 | 0.0328 |
9
+ | pd12m_full_paired__ref_pd12m_full | 4,989 | 4,989 | 48,825 | 0.8405 | 0.1308 | 0.0287 |
10
+ | ref_datacomp_recap_llava15_llama3_8b | 4,779 | 4,779 | 47,878 | 0.7662 | 0.2170 | 0.0168 |
eval_results/gemma-cross-corpus-2026-05-02/cbu_vqa_gemma4_cross_corpus_table.tex ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ \begin{tabular}{lrrrrrr}
2
+ \toprule
3
+ Surface & Resp. & OK & Q & Support $\uparrow$ & Risk $\downarrow$ & Uncertain $\downarrow$ \\
4
+ \midrule
5
+ danbooru2023\_florence2\_paired\_\_ours\_danbooru2023 & 4,993 & 4,993 & 71,555 & 0.8276 & 0.0938 & 0.0786 \\
6
+ danbooru2023\_florence2\_paired\_\_ref\_danbooru\_florence2 & 4,969 & 4,969 & 40,755 & 0.7494 & 0.2345 & 0.0161 \\
7
+ laion\_pop\_llama32\_paired\_\_ours\_laion\_pop & 4,964 & 4,964 & 73,564 & 0.9192 & 0.0601 & 0.0207 \\
8
+ laion\_pop\_llama32\_paired\_\_ref\_laion\_pop\_llama32\_11b & 4,947 & 4,947 & 58,935 & 0.8583 & 0.1131 & 0.0286 \\
9
+ ours\_datacomp\_forward & 4,775 & 4,775 & 68,500 & 0.8886 & 0.0840 & 0.0274 \\
10
+ pd12m\_full\_paired\_\_ours\_pd12m\_img2dataset & 4,957 & 4,957 & 74,463 & 0.9013 & 0.0659 & 0.0328 \\
11
+ pd12m\_full\_paired\_\_ref\_pd12m\_full & 4,989 & 4,989 & 48,825 & 0.8405 & 0.1308 & 0.0287 \\
12
+ ref\_datacomp\_recap\_llava15\_llama3\_8b & 4,779 & 4,779 & 47,878 & 0.7662 & 0.2170 & 0.0168 \\
13
+ \bottomrule
14
+ \end{tabular}
eval_results/gemma-cross-corpus-2026-05-02/claimed_cbu_ci.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ surface captions cbu_per_caption_ci95 cbu_per_100_tokens_ci95 object_per_caption_ci95 attribute_per_caption_ci95 relation_per_caption_ci95 camera_per_caption_ci95 lighting_per_caption_ci95 text_rendering_per_caption_ci95
eval_results/gemma-cross-corpus-2026-05-02/grounded_cbu_category_ci.tsv ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ surface category visual_units grounded_precision_ci95 unsupported_rate_ci95 uncertain_rate_ci95
2
+ datacomp_ours_gemma4 object 17907 0.9643 [0.9610, 0.9675] 0.0192 [0.0167, 0.0215] 0.0166 [0.0145, 0.0189]
3
+ datacomp_ours_gemma4 attribute 36712 0.9358 [0.9329, 0.9387] 0.0233 [0.0213, 0.0254] 0.0408 [0.0388, 0.0430]
4
+ datacomp_ours_gemma4 relation 8429 0.9569 [0.9522, 0.9617] 0.0276 [0.0237, 0.0316] 0.0154 [0.0126, 0.0182]
5
+ datacomp_ours_gemma4 style 1109 0.9829 [0.9745, 0.9901] 0.0072 [0.0027, 0.0125] 0.0099 [0.0044, 0.0172]
6
+ datacomp_ours_gemma4 camera 678 0.9808 [0.9688, 0.9909] 0.0177 [0.0086, 0.0289] 0.0015 [0.0000, 0.0047]
7
+ datacomp_ours_gemma4 lighting 1616 0.9629 [0.9524, 0.9725] 0.0161 [0.0094, 0.0232] 0.0210 [0.0142, 0.0290]
8
+ datacomp_ours_gemma4 count 2519 0.9174 [0.9052, 0.9293] 0.0599 [0.0500, 0.0706] 0.0226 [0.0166, 0.0293]
9
+ datacomp_ours_gemma4 text_rendering 1924 0.9002 [0.8839, 0.9158] 0.0587 [0.0472, 0.0709] 0.0411 [0.0312, 0.0509]
10
+ datacomp_ref_llava15_llama3_gemma4 object 17553 0.8627 [0.8561, 0.8689] 0.1100 [0.1045, 0.1162] 0.0273 [0.0247, 0.0301]
11
+ datacomp_ref_llava15_llama3_gemma4 attribute 18950 0.8221 [0.8152, 0.8291] 0.1404 [0.1339, 0.1471] 0.0375 [0.0347, 0.0402]
12
+ datacomp_ref_llava15_llama3_gemma4 relation 6437 0.8018 [0.7898, 0.8134] 0.1673 [0.1566, 0.1785] 0.0309 [0.0264, 0.0354]
13
+ datacomp_ref_llava15_llama3_gemma4 style 1363 0.9090 [0.8905, 0.9264] 0.0829 [0.0655, 0.1010] 0.0081 [0.0037, 0.0129]
14
+ datacomp_ref_llava15_llama3_gemma4 camera 586 0.9539 [0.9341, 0.9712] 0.0392 [0.0234, 0.0569] 0.0068 [0.0000, 0.0162]
15
+ datacomp_ref_llava15_llama3_gemma4 lighting 557 0.9425 [0.9210, 0.9637] 0.0305 [0.0169, 0.0458] 0.0269 [0.0143, 0.0416]
16
+ datacomp_ref_llava15_llama3_gemma4 count 1621 0.7717 [0.7495, 0.7926] 0.2067 [0.1863, 0.2283] 0.0216 [0.0143, 0.0298]
17
+ datacomp_ref_llava15_llama3_gemma4 text_rendering 2777 0.6867 [0.6659, 0.7067] 0.2315 [0.2123, 0.2510] 0.0817 [0.0705, 0.0930]
18
+ laion_pop_ours_gemma4 object 20835 0.9670 [0.9642, 0.9696] 0.0135 [0.0118, 0.0152] 0.0195 [0.0175, 0.0217]
19
+ laion_pop_ours_gemma4 attribute 35923 0.9428 [0.9399, 0.9454] 0.0176 [0.0160, 0.0192] 0.0397 [0.0376, 0.0418]
20
+ laion_pop_ours_gemma4 relation 12589 0.9656 [0.9620, 0.9689] 0.0216 [0.0189, 0.0244] 0.0128 [0.0108, 0.0150]
21
+ laion_pop_ours_gemma4 style 1565 0.9764 [0.9658, 0.9847] 0.0217 [0.0138, 0.0317] 0.0019 [0.0000, 0.0044]
22
+ laion_pop_ours_gemma4 camera 1402 0.9800 [0.9726, 0.9869] 0.0178 [0.0114, 0.0247] 0.0021 [0.0000, 0.0048]
23
+ laion_pop_ours_gemma4 lighting 2120 0.9637 [0.9545, 0.9722] 0.0165 [0.0105, 0.0231] 0.0198 [0.0139, 0.0262]
24
+ laion_pop_ours_gemma4 count 1882 0.9400 [0.9286, 0.9509] 0.0367 [0.0280, 0.0459] 0.0234 [0.0168, 0.0307]
25
+ laion_pop_ours_gemma4 text_rendering 524 0.9218 [0.8917, 0.9460] 0.0305 [0.0157, 0.0483] 0.0477 [0.0275, 0.0720]
26
+ laion_pop_ref_llama32_11b_gemma4 object 17499 0.9395 [0.9354, 0.9436] 0.0301 [0.0272, 0.0332] 0.0303 [0.0276, 0.0331]
27
+ laion_pop_ref_llama32_11b_gemma4 attribute 24300 0.9122 [0.9081, 0.9160] 0.0448 [0.0417, 0.0478] 0.0430 [0.0405, 0.0458]
28
+ laion_pop_ref_llama32_11b_gemma4 relation 9388 0.8932 [0.8863, 0.8999] 0.0775 [0.0715, 0.0836] 0.0293 [0.0260, 0.0327]
29
+ laion_pop_ref_llama32_11b_gemma4 style 2803 0.9818 [0.9766, 0.9868] 0.0082 [0.0050, 0.0117] 0.0100 [0.0066, 0.0137]
30
+ laion_pop_ref_llama32_11b_gemma4 camera 1689 0.9580 [0.9480, 0.9678] 0.0379 [0.0284, 0.0473] 0.0041 [0.0013, 0.0072]
31
+ laion_pop_ref_llama32_11b_gemma4 lighting 577 0.9428 [0.9221, 0.9619] 0.0260 [0.0130, 0.0405] 0.0312 [0.0180, 0.0466]
32
+ laion_pop_ref_llama32_11b_gemma4 count 1672 0.8038 [0.7818, 0.8253] 0.1334 [0.1147, 0.1523] 0.0628 [0.0507, 0.0765]
33
+ laion_pop_ref_llama32_11b_gemma4 text_rendering 602 0.7442 [0.7048, 0.7833] 0.1777 [0.1421, 0.2118] 0.0781 [0.0539, 0.1028]
34
+ pd12m_ours_gemma4 object 19969 0.9584 [0.9552, 0.9617] 0.0146 [0.0126, 0.0166] 0.0269 [0.0244, 0.0294]
35
+ pd12m_ours_gemma4 attribute 32324 0.9331 [0.9302, 0.9359] 0.0139 [0.0124, 0.0153] 0.0531 [0.0505, 0.0556]
36
+ pd12m_ours_gemma4 relation 12392 0.9547 [0.9506, 0.9587] 0.0249 [0.0219, 0.0280] 0.0204 [0.0179, 0.0231]
37
+ pd12m_ours_gemma4 style 1524 0.9862 [0.9797, 0.9920] 0.0072 [0.0032, 0.0124] 0.0066 [0.0027, 0.0106]
38
+ pd12m_ours_gemma4 camera 1082 0.9852 [0.9784, 0.9919] 0.0092 [0.0038, 0.0150] 0.0055 [0.0018, 0.0104]
39
+ pd12m_ours_gemma4 lighting 1452 0.9545 [0.9435, 0.9656] 0.0145 [0.0090, 0.0207] 0.0310 [0.0223, 0.0402]
40
+ pd12m_ours_gemma4 count 2571 0.9238 [0.9131, 0.9345] 0.0370 [0.0294, 0.0448] 0.0393 [0.0316, 0.0475]
41
+ pd12m_ours_gemma4 text_rendering 912 0.8235 [0.7947, 0.8522] 0.0603 [0.0418, 0.0793] 0.1162 [0.0935, 0.1397]
42
+ pd12m_ref_gemma4 object 21867 0.9249 [0.9211, 0.9290] 0.0486 [0.0452, 0.0517] 0.0265 [0.0244, 0.0288]
43
+ pd12m_ref_gemma4 attribute 10053 0.8173 [0.8081, 0.8263] 0.1115 [0.1042, 0.1190] 0.0712 [0.0659, 0.0763]
44
+ pd12m_ref_gemma4 relation 11840 0.8789 [0.8722, 0.8860] 0.0963 [0.0899, 0.1023] 0.0248 [0.0219, 0.0275]
45
+ pd12m_ref_gemma4 style 2000 0.9705 [0.9620, 0.9784] 0.0080 [0.0039, 0.0125] 0.0215 [0.0147, 0.0286]
46
+ pd12m_ref_gemma4 camera 529 0.9905 [0.9815, 0.9981] 0.0095 [0.0019, 0.0185] 0.0000 [0.0000, 0.0000]
47
+ pd12m_ref_gemma4 lighting 151 0.7881 [0.7133, 0.8634] 0.1391 [0.0784, 0.2025] 0.0728 [0.0310, 0.1172]
48
+ pd12m_ref_gemma4 count 1235 0.8915 [0.8735, 0.9087] 0.0802 [0.0657, 0.0963] 0.0283 [0.0198, 0.0376]
49
+ pd12m_ref_gemma4 text_rendering 995 0.7688 [0.7409, 0.7963] 0.1910 [0.1663, 0.2183] 0.0402 [0.0276, 0.0538]
50
+ danbooru_ours_gemma4 object 18718 0.9327 [0.9284, 0.9372] 0.0317 [0.0288, 0.0346] 0.0356 [0.0323, 0.0390]
51
+ danbooru_ours_gemma4 attribute 33800 0.9345 [0.9314, 0.9374] 0.0364 [0.0341, 0.0389] 0.0291 [0.0272, 0.0309]
52
+ danbooru_ours_gemma4 relation 12258 0.9448 [0.9403, 0.9489] 0.0447 [0.0409, 0.0487] 0.0105 [0.0088, 0.0124]
53
+ danbooru_ours_gemma4 style 1846 0.9740 [0.9666, 0.9811] 0.0081 [0.0043, 0.0126] 0.0179 [0.0122, 0.0241]
54
+ danbooru_ours_gemma4 camera 1112 0.9820 [0.9739, 0.9893] 0.0153 [0.0088, 0.0225] 0.0027 [0.0000, 0.0062]
55
+ danbooru_ours_gemma4 lighting 572 0.9493 [0.9296, 0.9668] 0.0297 [0.0169, 0.0441] 0.0210 [0.0092, 0.0345]
56
+ danbooru_ours_gemma4 count 977 0.9427 [0.9269, 0.9575] 0.0450 [0.0316, 0.0598] 0.0123 [0.0059, 0.0202]
57
+ danbooru_ours_gemma4 text_rendering 144 0.8472 [0.7883, 0.9057] 0.0833 [0.0417, 0.1284] 0.0694 [0.0294, 0.1149]
58
+ danbooru_ref_florence2_gemma4 object 15099 0.8533 [0.8464, 0.8601] 0.1270 [0.1204, 0.1336] 0.0197 [0.0175, 0.0221]
59
+ danbooru_ref_florence2_gemma4 attribute 12265 0.6969 [0.6872, 0.7069] 0.2238 [0.2152, 0.2323] 0.0792 [0.0745, 0.0841]
60
+ danbooru_ref_florence2_gemma4 relation 7781 0.7529 [0.7421, 0.7634] 0.2223 [0.2125, 0.2323] 0.0248 [0.0213, 0.0283]
61
+ danbooru_ref_florence2_gemma4 style 2893 0.9634 [0.9560, 0.9708] 0.0221 [0.0164, 0.0283] 0.0145 [0.0099, 0.0195]
62
+ danbooru_ref_florence2_gemma4 camera 29 1.0000 [1.0000, 1.0000] 0.0000 [0.0000, 0.0000] 0.0000 [0.0000, 0.0000]
63
+ danbooru_ref_florence2_gemma4 lighting 164 0.8049 [0.7405, 0.8647] 0.1402 [0.0875, 0.1986] 0.0549 [0.0237, 0.0909]
64
+ danbooru_ref_florence2_gemma4 count 946 0.9387 [0.9228, 0.9540] 0.0571 [0.0423, 0.0722] 0.0042 [0.0010, 0.0088]
65
+ danbooru_ref_florence2_gemma4 text_rendering 1469 0.5875 [0.5622, 0.6122] 0.3642 [0.3403, 0.3886] 0.0483 [0.0378, 0.0598]