| |
| """Pack small recap E&D metric artifacts into a release-friendly directory.""" |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import csv |
| import json |
| import shutil |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
# Base directories from which every input path in this script is built.
# Paths located under ROOT are written to the manifest relative to it.
ROOT: Path = Path("<PROJECT_ROOT>")
# Second base directory; the embedding, support and survey JSON inputs are
# all looked up beneath it.
NVME: Path = Path("<LOCAL_CACHE>")
|
|
|
|
# (encoder label, encoder directory name) pairs; the label ends up in the
# "encoder" column of the embedding profile table.
_EMBEDDING_ENCODERS = (
    ("Qwen3-Embedding-4B", "qwen3-embedding-4b"),
    ("Qwen3-Embedding-8B", "qwen3-embedding-8b"),
    ("E5-Mistral-7B", "e5-mistral-7b-instruct"),
    ("BGE-M3-official", "bge-m3-official"),
)

# (surface tag, run directory name) pairs evaluated for every encoder.
_EMBEDDING_SURFACES = (
    ("ours", "datacomp_ours_50k"),
    ("ref", "datacomp_ref_llava15_50k"),
)

# One (encoder label, surface tag, relative run path) triple per run, ordered
# encoder-major with "ours" before "ref".
EMBEDDING_RUNS = [
    (label, surface, f"{encoder_dir}/{run_dir}")
    for label, encoder_dir in _EMBEDDING_ENCODERS
    for surface, run_dir in _EMBEDDING_SURFACES
]
|
|
|
|
# (protocol label, encoder directory name, token used in the support file
# name) triples; the label ends up in the "protocol" column of the support
# table.
_SUPPORT_PROTOCOLS = (
    ("Qwen3-Embedding-4B raw/raw", "qwen3-embedding-4b", "raw"),
    ("Qwen3-Embedding-4B query/doc", "qwen3-embedding-4b", "query"),
    ("E5-Mistral raw/raw", "e5-mistral-7b-instruct", "raw"),
    ("E5-Mistral query/doc", "e5-mistral-7b-instruct", "query"),
    ("BGE-M3 raw/corpus", "bge-m3-official", "raw"),
    ("BGE-M3 query/corpus", "bge-m3-official", "query"),
)

# One (protocol label, surface tag, relative support-file path) triple per
# run, ordered protocol-major with "ours" before "ref".
SUPPORT_RUNS = [
    (
        protocol,
        surface,
        f"{encoder_dir}/2026-04-25/diffusiondb_{file_tag}_to_{surface}_50k.support.json",
    )
    for protocol, encoder_dir, file_tag in _SUPPORT_PROTOCOLS
    for surface in ("ours", "ref")
]
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the script's command-line options.

    Returns:
        A namespace whose only attribute, ``output_dir``, is the directory the
        bundle is written to; it defaults to
        ``artifacts/recap-ed/metrics-2026-04-25``.
    """
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--output-dir",
        default="artifacts/recap-ed/metrics-2026-04-25",
    )
    options = cli.parse_args()
    return options
|
|
|
|
def load_json(path: Path) -> dict[str, Any]:
    """Decode the UTF-8 JSON document stored at ``path``.

    Args:
        path: File to read.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If the content is not valid JSON.
    """
    document = path.read_text(encoding="utf-8")
    return json.loads(document)
|
|
|
|
def write_tsv(path: Path, rows: list[dict[str, Any]], fields: list[str]) -> None:
    """Write ``rows`` to ``path`` as a tab-separated table with a header line.

    Args:
        path: Destination file; missing parent directories are created.
        rows: One mapping per output line, keyed by column name.
        fields: Column names, in output order.

    Raises:
        ValueError: If a row contains a key that is not listed in ``fields``
            (the ``csv.DictWriter`` default).
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" hands line-ending control to the csv module.
    with path.open("w", encoding="utf-8", newline="") as sink:
        table = csv.DictWriter(sink, fieldnames=fields, delimiter="\t")
        table.writeheader()
        for record in rows:
            table.writerow(record)
|
|
|
|
def rel_or_abs(path: Path) -> str:
    """Render ``path`` as the string recorded in the manifest.

    Args:
        path: Path to render.

    Returns:
        ``path`` expressed relative to ``ROOT`` when it lies under ``ROOT``;
        otherwise ``path`` itself, unchanged, as a string.
    """
    try:
        inside_root = path.relative_to(ROOT)
    except ValueError:
        # relative_to() raises ValueError when path is not under ROOT.
        return str(path)
    return str(inside_root)
|
|
|
|
def pack_embedding(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Write ``embedding/caption_embedding_profile.tsv`` under ``out_dir``.

    One row is emitted per entry of ``EMBEDDING_RUNS``, combining the Vendi
    summary of the run with its covariance-geometry metrics. Every JSON file
    that was read is appended to ``manifest["sources"]``.

    Args:
        out_dir: Root directory of the bundle.
        manifest: Manifest being assembled; its ``"sources"`` list is extended
            in place.

    Raises:
        KeyError: If a Vendi file lacks ``summary.vendi`` or one of its
            ``mean`` / ``ci95_low`` / ``ci95_high`` entries.
        Exception: Whatever ``load_json`` raises for an unreadable input.
    """
    vendi_stats = ("mean", "ci95_low", "ci95_high")
    geometry_stats = (
        "cov_effective_rank",
        "cov_participation_ratio",
        "cov_top1_mass",
    )

    records: list[dict[str, Any]] = []
    for encoder, surface, run in EMBEDDING_RUNS:
        run_dir = NVME / "caption-embeddings" / run
        vendi_file = run_dir / "vendi_partition_b4096_seed0.json"

        # Prefer the geometry file under caption-geometry; when it is absent,
        # fall back to the one stored next to the embeddings.
        run_rel = Path(run)
        geometry_file = (
            NVME / "caption-geometry" / run_rel.parent / f"{run_rel.name}.geometry.json"
        )
        if not geometry_file.exists():
            geometry_file = run_dir / "geometry_seed0.json"

        vendi = load_json(vendi_file)
        geometry = load_json(geometry_file)
        # Metrics may sit under a "metrics" key or directly at the top level.
        metrics = geometry.get("metrics", geometry)
        vendi_summary = vendi["summary"]["vendi"]

        record: dict[str, Any] = {
            "encoder": encoder,
            "surface": surface,
            "rows": vendi.get("source_rows"),
        }
        for stat in vendi_stats:
            record[f"vendi_{stat}"] = f"{vendi_summary[stat]:.6f}"
        for stat in geometry_stats:
            # NOTE: a geometry metric missing from the file is reported as
            # 0.000000 rather than raising.
            record[stat] = f"{metrics.get(stat, 0):.6f}"
        records.append(record)

        manifest["sources"].extend(
            (rel_or_abs(vendi_file), rel_or_abs(geometry_file))
        )

    columns = ["encoder", "surface", "rows"]
    columns.extend(f"vendi_{stat}" for stat in vendi_stats)
    columns.extend(geometry_stats)
    write_tsv(out_dir / "embedding" / "caption_embedding_profile.tsv", records, columns)
|
|
|
|
def pack_support(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Write ``embedding/prompt_caption_support.tsv`` under ``out_dir``.

    One row is emitted per entry of ``SUPPORT_RUNS``, taken from the run's
    support JSON file. Every file that was read is appended to
    ``manifest["sources"]``.

    Args:
        out_dir: Root directory of the bundle.
        manifest: Manifest being assembled; its ``"sources"`` list is extended
            in place.

    Raises:
        KeyError: If a support file lacks ``metrics`` or one of the four
            reported metric entries.
        Exception: Whatever ``load_json`` raises for an unreadable input.
    """
    metric_names = ("coverage", "density", "nn_cosine_mean", "nn_distance_p95")

    records: list[dict[str, Any]] = []
    for protocol, surface, run in SUPPORT_RUNS:
        support_file = NVME / "prompt-caption-support" / run
        payload = load_json(support_file)
        metrics = payload["metrics"]

        record: dict[str, Any] = {
            "protocol": protocol,
            "surface": surface,
            # The file's query/gallery counts are published as prompt/caption rows.
            "prompt_rows": payload.get("query_rows"),
            "caption_rows": payload.get("gallery_rows"),
            "k": payload.get("k"),
        }
        record.update((name, f"{metrics[name]:.6f}") for name in metric_names)
        records.append(record)

        manifest["sources"].append(rel_or_abs(support_file))

    columns = ["protocol", "surface", "prompt_rows", "caption_rows", "k", *metric_names]
    write_tsv(out_dir / "embedding" / "prompt_caption_support.tsv", records, columns)
|
|
|
|
def pack_cpu(out_dir: Path, manifest: dict[str, Any]) -> None:
    """Copy the small survey files into ``out_dir / "cpu"``.

    Each file keeps its original name. For every copy, the source path is
    appended to ``manifest["sources"]`` and the destination path to
    ``manifest["packed_files"]``.

    Args:
        out_dir: Root directory of the bundle.
        manifest: Manifest being assembled; both lists are extended in place.

    Raises:
        OSError: If a source file is missing or cannot be copied.
    """
    target_dir = out_dir / "cpu"
    target_dir.mkdir(parents=True, exist_ok=True)

    survey_cache = NVME / "caption-survey"
    inputs = (
        ROOT / "artifacts/caption-survey/cpu_remaining_2026-04-24/paired_delta_ci.tsv",
        survey_cache / "local_long_1m.json",
        survey_cache / "hf_manifest_1m.json",
    )
    for source in inputs:
        copied = target_dir / source.name
        # copy2 also carries over file metadata such as timestamps.
        shutil.copy2(source, copied)
        manifest["sources"].append(rel_or_abs(source))
        manifest["packed_files"].append(rel_or_abs(copied))
|
|
|
|
def write_readme(out_dir: Path) -> None:
    """Write the bundle's fixed ``README.md`` into ``out_dir``.

    Args:
        out_dir: Existing directory that receives the file; an existing
            ``README.md`` is overwritten.
    """
    body = """# Recap E&D Metric Artifacts

Date: 2026-04-25

This directory contains small, paper-facing metric artifacts for the recap E&D draft.
Large intermediate embedding arrays, VLM response JSONL files, and source image data are
not copied here. The manifest records local source paths for reproducibility.

Contents:

- `cpu/paired_delta_ci.tsv`: paired CPU lexical/surface metric deltas with CIs.
- `cpu/local_long_1m.json`: local long-caption corpus survey summaries.
- `cpu/hf_manifest_1m.json`: public-reference corpus survey summaries.
- `cbu/claimed_cbu_ci.tsv`: caption-level bootstrap CIs for claimed CBU density.
- `cbu/grounded_cbu_ci.tsv`: caption-level bootstrap CIs for exact-unit grounded CBU audits.
- `cbu/grounded_cbu_category_ci.tsv`: category-level grounded CBU audit CIs.
- `embedding/caption_embedding_profile.tsv`: Vendi and covariance-geometry profiles.
- `embedding/prompt_caption_support.tsv`: PRDC-style prompt-in-caption support metrics.

Boundary:

- Text-only metrics describe caption/supervision structure.
- `GroundedCBU` is a sampled VLM proxy audit, not a human-certified faithfulness score.
- Embedding metrics are encoder-sensitive and should be reported as profiles, not a single scalar quality score.
"""
    destination = out_dir.joinpath("README.md")
    with destination.open("w", encoding="utf-8") as sink:
        sink.write(body)
|
|
|
|
def main() -> int:
    """Assemble the metric bundle and write its ``manifest.json``.

    Runs the cpu, embedding and support packing steps, writes the README, and
    then records every file found under the output directory (except
    ``manifest.json``) in the manifest. A short JSON summary with the output
    directory and the number of packed files is printed to stdout.

    Returns:
        ``0`` on success, suitable as the process exit status.

    Raises:
        Exception: Any error raised by a packing step propagates unchanged;
            in that case no manifest is written.
    """
    args = parse_args()
    out_dir = Path(args.output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    manifest: dict[str, Any] = {
        "date": "2026-04-25",
        "purpose": "paper-facing recap E&D metric artifact bundle",
        "sources": [],
        "packed_files": [],
    }
    pack_cpu(out_dir, manifest)
    pack_embedding(out_dir, manifest)
    pack_support(out_dir, manifest)
    write_readme(out_dir)

    # pack_cpu() has already recorded the files it copied, and the directory
    # scan below finds those same files again. Skip entries that are already
    # listed so each packed file appears once and the reported count is right.
    # The scan covers everything under out_dir, including files that were
    # present before this run.
    listed = set(manifest["packed_files"])
    for path in sorted(out_dir.rglob("*")):
        if not path.is_file() or path.name == "manifest.json":
            continue
        entry = rel_or_abs(path)
        if entry in listed:
            continue
        listed.add(entry)
        manifest["packed_files"].append(entry)

    (out_dir / "manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    print(json.dumps({"output_dir": str(out_dir), "files": len(manifest["packed_files"])}, indent=2))
    return 0
|
|
|
|
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
|