#!/usr/bin/env python3
"""Pack small recap E&D metric artifacts into a release-friendly directory."""

from __future__ import annotations

import argparse
import csv
import json
import shutil
from pathlib import Path
from typing import Any

# Base directories that every source path below is joined onto. Both are the
# empty path here, so source paths resolve relative to the current working
# directory.
ROOT = Path("")
NVME = Path("")

# (encoder label, surface, run directory relative to NVME/"caption-embeddings").
EMBEDDING_RUNS = [
    ("Qwen3-Embedding-4B", "ours", "qwen3-embedding-4b/datacomp_ours_50k"),
    ("Qwen3-Embedding-4B", "ref", "qwen3-embedding-4b/datacomp_ref_llava15_50k"),
    ("Qwen3-Embedding-8B", "ours", "qwen3-embedding-8b/datacomp_ours_50k"),
    ("Qwen3-Embedding-8B", "ref", "qwen3-embedding-8b/datacomp_ref_llava15_50k"),
    ("E5-Mistral-7B", "ours", "e5-mistral-7b-instruct/datacomp_ours_50k"),
    ("E5-Mistral-7B", "ref", "e5-mistral-7b-instruct/datacomp_ref_llava15_50k"),
    ("BGE-M3-official", "ours", "bge-m3-official/datacomp_ours_50k"),
    ("BGE-M3-official", "ref", "bge-m3-official/datacomp_ref_llava15_50k"),
]

# (protocol label, surface, support JSON relative to NVME/"prompt-caption-support").
SUPPORT_RUNS = [
    ("Qwen3-Embedding-4B raw/raw", "ours", "qwen3-embedding-4b/2026-04-25/diffusiondb_raw_to_ours_50k.support.json"),
    ("Qwen3-Embedding-4B raw/raw", "ref", "qwen3-embedding-4b/2026-04-25/diffusiondb_raw_to_ref_50k.support.json"),
    ("Qwen3-Embedding-4B query/doc", "ours", "qwen3-embedding-4b/2026-04-25/diffusiondb_query_to_ours_50k.support.json"),
    ("Qwen3-Embedding-4B query/doc", "ref", "qwen3-embedding-4b/2026-04-25/diffusiondb_query_to_ref_50k.support.json"),
    ("E5-Mistral raw/raw", "ours", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_raw_to_ours_50k.support.json"),
    ("E5-Mistral raw/raw", "ref", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_raw_to_ref_50k.support.json"),
    ("E5-Mistral query/doc", "ours", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_query_to_ours_50k.support.json"),
    ("E5-Mistral query/doc", "ref", "e5-mistral-7b-instruct/2026-04-25/diffusiondb_query_to_ref_50k.support.json"),
    ("BGE-M3 raw/corpus", "ours", "bge-m3-official/2026-04-25/diffusiondb_raw_to_ours_50k.support.json"),
    ("BGE-M3 raw/corpus", "ref", "bge-m3-official/2026-04-25/diffusiondb_raw_to_ref_50k.support.json"),
    ("BGE-M3 query/corpus", "ours", "bge-m3-official/2026-04-25/diffusiondb_query_to_ours_50k.support.json"),
    ("BGE-M3 query/corpus", "ref", "bge-m3-official/2026-04-25/diffusiondb_query_to_ref_50k.support.json"),
]


def parse_args() -> argparse.Namespace:
    """Parse command-line arguments.

    Returns:
        Namespace with ``output_dir``: the directory the bundle is written to.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--output-dir", default="artifacts/recap-ed/metrics-2026-04-25")
    return parser.parse_args()


def load_json(path: Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return the decoded value.

    Raises:
        OSError: if the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def write_tsv(path: Path, rows: list[dict[str, Any]], fields: list[str]) -> None:
    """Write *rows* to *path* as a tab-separated file with a header row.

    Parent directories are created as needed. Columns are emitted in the
    order given by *fields*.

    Raises:
        ValueError: if a row contains a key that is not listed in *fields*
            (``csv.DictWriter`` default behaviour).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" lets the csv module control line endings itself.
    with path.open("w", encoding="utf-8", newline="") as handle:
        writer = csv.DictWriter(handle, fields, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


def rel_or_abs(path: Path) -> str:
    """Return *path* relative to ``ROOT`` when possible, else unchanged, as a string."""
    try:
        return str(path.relative_to(ROOT))
    except ValueError:
        # Not located under ROOT: record the path exactly as given.
        return str(path)


def pack_embedding(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Write ``embedding/caption_embedding_profile.tsv`` under *out_dir*.

    For every entry in ``EMBEDDING_RUNS`` this reads the run's Vendi summary
    and its geometry metrics, appends one formatted row, and records both
    source files in ``manifest["sources"]``.

    Raises:
        OSError: if a Vendi or geometry JSON file is missing.
        KeyError: if the Vendi JSON lacks ``summary.vendi`` or its
            ``mean`` / ``ci95_low`` / ``ci95_high`` entries.
    """
    rows: list[dict[str, Any]] = []
    for encoder, surface, rel in EMBEDDING_RUNS:
        base = NVME / "caption-embeddings" / rel
        vendi_path = base / "vendi_partition_b4096_seed0.json"

        # Prefer the geometry file under caption-geometry; fall back to the
        # one stored next to the embeddings when it does not exist.
        rel_path = Path(rel)
        geometry_path = NVME / "caption-geometry" / rel_path.parent / f"{rel_path.name}.geometry.json"
        if not geometry_path.exists():
            geometry_path = base / "geometry_seed0.json"

        vendi = load_json(vendi_path)
        geometry = load_json(geometry_path)
        # Metrics may be nested under "metrics" or sit at the top level.
        geometry_metrics = geometry.get("metrics", geometry)
        summary = vendi["summary"]["vendi"]

        rows.append(
            {
                "encoder": encoder,
                "surface": surface,
                "rows": vendi.get("source_rows"),
                "vendi_mean": f"{summary['mean']:.6f}",
                "vendi_ci95_low": f"{summary['ci95_low']:.6f}",
                "vendi_ci95_high": f"{summary['ci95_high']:.6f}",
                # NOTE: a missing geometry metric is written as 0.000000.
                "cov_effective_rank": f"{geometry_metrics.get('cov_effective_rank', 0):.6f}",
                "cov_participation_ratio": f"{geometry_metrics.get('cov_participation_ratio', 0):.6f}",
                "cov_top1_mass": f"{geometry_metrics.get('cov_top1_mass', 0):.6f}",
            }
        )
        manifest["sources"].append(rel_or_abs(vendi_path))
        manifest["sources"].append(rel_or_abs(geometry_path))

    write_tsv(
        out_dir / "embedding" / "caption_embedding_profile.tsv",
        rows,
        [
            "encoder",
            "surface",
            "rows",
            "vendi_mean",
            "vendi_ci95_low",
            "vendi_ci95_high",
            "cov_effective_rank",
            "cov_participation_ratio",
            "cov_top1_mass",
        ],
    )


def pack_support(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Write ``embedding/prompt_caption_support.tsv`` under *out_dir*.

    For every entry in ``SUPPORT_RUNS`` this reads the support JSON, appends
    one formatted row, and records the source file in ``manifest["sources"]``.

    Raises:
        OSError: if a support JSON file is missing.
        KeyError: if a file lacks ``metrics`` or one of the four metric keys.
    """
    rows: list[dict[str, Any]] = []
    for protocol, surface, rel in SUPPORT_RUNS:
        path = NVME / "prompt-caption-support" / rel
        data = load_json(path)
        metrics = data["metrics"]
        rows.append(
            {
                "protocol": protocol,
                "surface": surface,
                "prompt_rows": data.get("query_rows"),
                "caption_rows": data.get("gallery_rows"),
                "k": data.get("k"),
                "coverage": f"{metrics['coverage']:.6f}",
                "density": f"{metrics['density']:.6f}",
                "nn_cosine_mean": f"{metrics['nn_cosine_mean']:.6f}",
                "nn_distance_p95": f"{metrics['nn_distance_p95']:.6f}",
            }
        )
        manifest["sources"].append(rel_or_abs(path))

    write_tsv(
        out_dir / "embedding" / "prompt_caption_support.tsv",
        rows,
        [
            "protocol",
            "surface",
            "prompt_rows",
            "caption_rows",
            "k",
            "coverage",
            "density",
            "nn_cosine_mean",
            "nn_distance_p95",
        ],
    )


def pack_cpu(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Copy the small CPU survey files into ``<out_dir>/cpu``.

    Each source is appended to ``manifest["sources"]`` and each copy to
    ``manifest["packed_files"]``.

    Raises:
        OSError: if a source file is missing or cannot be copied.
    """
    cpu_dir = out_dir / "cpu"
    cpu_dir.mkdir(parents=True, exist_ok=True)
    small_files = [
        ROOT / "artifacts/caption-survey/cpu_remaining_2026-04-24/paired_delta_ci.tsv",
        NVME / "caption-survey/local_long_1m.json",
        NVME / "caption-survey/hf_manifest_1m.json",
    ]
    for src in small_files:
        dst = cpu_dir / src.name
        shutil.copy2(src, dst)
        manifest["sources"].append(rel_or_abs(src))
        manifest["packed_files"].append(rel_or_abs(dst))


def write_readme(out_dir: Path) -> None:
    """Write the static ``README.md`` describing the bundle into *out_dir*."""
    # NOTE(review): the README lists `cbu/*` files, but nothing in this script
    # writes a `cbu/` directory — presumably a separate step adds them; confirm.
    readme = """# Recap E&D Metric Artifacts

Date: 2026-04-25

This directory contains small, paper-facing metric artifacts for the recap E&D draft.
Large intermediate embedding arrays, VLM response JSONL files, and source image data are not copied here.
The manifest records local source paths for reproducibility.

Contents:

- `cpu/paired_delta_ci.tsv`: paired CPU lexical/surface metric deltas with CIs.
- `cpu/local_long_1m.json`: local long-caption corpus survey summaries.
- `cpu/hf_manifest_1m.json`: public-reference corpus survey summaries.
- `cbu/claimed_cbu_ci.tsv`: caption-level bootstrap CIs for claimed CBU density.
- `cbu/grounded_cbu_ci.tsv`: caption-level bootstrap CIs for exact-unit grounded CBU audits.
- `cbu/grounded_cbu_category_ci.tsv`: category-level grounded CBU audit CIs.
- `embedding/caption_embedding_profile.tsv`: Vendi and covariance-geometry profiles.
- `embedding/prompt_caption_support.tsv`: PRDC-style prompt-in-caption support metrics.

Boundary:

- Text-only metrics describe caption/supervision structure.
- `GroundedCBU` is a sampled VLM proxy audit, not a human-certified faithfulness score.
- Embedding metrics are encoder-sensitive and should be reported as profiles, not a single scalar quality score.
"""
    (out_dir / "README.md").write_text(readme, encoding="utf-8")


def main() -> int:
    """Build the artifact bundle and write its manifest.

    Runs the CPU, embedding and support packers, writes the README, then
    records every file under the output directory (except the manifest
    itself) in ``manifest["packed_files"]`` and prints a short JSON summary.

    Returns:
        0 on success.
    """
    args = parse_args()
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    manifest: dict[str, Any] = {
        "date": "2026-04-25",
        "purpose": "paper-facing recap E&D metric artifact bundle",
        "sources": [],
        "packed_files": [],
    }

    pack_cpu(out_dir, manifest)
    pack_embedding(out_dir, manifest)
    pack_support(out_dir, manifest)
    write_readme(out_dir)

    manifest_path = out_dir / "manifest.json"
    packed: list[str] = manifest["packed_files"]
    # pack_cpu has already recorded the files it copied; skip those here so
    # each packed file is listed once and the reported count is accurate.
    seen = set(packed)
    for path in sorted(out_dir.rglob("*")):
        # Exclude only the bundle's own manifest (possibly left by an earlier run).
        if not path.is_file() or path == manifest_path:
            continue
        entry = rel_or_abs(path)
        if entry not in seen:
            seen.add(entry)
            packed.append(entry)

    manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    print(json.dumps({"output_dir": str(out_dir), "files": len(packed)}, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())