ropedia-xperience-10m-task-baselines / scripts /validate_source_alignment.py
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
9371cfb verified
#!/usr/bin/env python3
"""Validate Xperience-10M source-description alignment.
This is an offline gate over committed source-alignment facts. It checks that
the repo distinguishes the gated full dataset, the public sample card, and this
project's one-episode scope across the main repo, website, and HF cards.
"""
from __future__ import annotations
import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
# Directory two levels above this file; every repo-relative path below hangs off it.
ROOT = Path(__file__).resolve().parents[1]

# Default location of the HF publish tree: a sibling directory of ROOT.
DEFAULT_HF_ROOT = ROOT.parent.joinpath("hf_publish")

# Default report destinations (overridable from the command line in main()).
OUTPUT_JSON = ROOT.joinpath("docs", "data", "source_alignment_audit.json")
OUTPUT_MD = ROOT.joinpath("SOURCE_ALIGNMENT_AUDIT.md")

# Committed alignment facts that build_report() validates.
ALIGNMENT_JSON = ROOT.joinpath("docs", "data", "xperience10m_dataset_card_alignment.json")
# Recorded snapshot of the gated full-dataset card. build_report() compares the
# committed alignment JSON against these values field by field.
EXPECTED_FULL_DATASET = {
    # Scalar card fields, compared with ==.
    "repo_id": "ropedia-ai/xperience-10m",
    "pretty_name": "Xperience-10M",
    "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
    "last_modified": "2026-04-21T05:03:45.000Z",
    "gated": "manual",
    "license": "other",
    # Collection fields are stored as sets; build_report() wraps the observed
    # lists in set() before comparing, so ordering in the JSON does not matter.
    "task_categories": {
        "video-classification",
        "image-to-text",
        "depth-estimation",
        "robotics",
    },
    "modalities": {"3d", "audio", "video"},
    "card_tags": {
        "egocentric",
        "first-person",
        "multimodal",
        "3d",
        "4d",
        "embodied-ai",
        "robotics",
        "human-motion",
        "mocap",
        "imu",
        "audio",
        "depth",
        "captions",
        "video",
    },
    # Checked against the "live_hf_page_observed" section of the alignment JSON.
    "total_file_size_display": "31.9 TB",
    "used_storage_bytes_observed": 31_871_115_497_224,
}
# Counts recorded from the HF API file listing; each key must match the
# "api_file_listing_observed" section of the alignment JSON exactly.
EXPECTED_API_LISTING = {
    "sibling_count": 85_258,
    "session_folder_count": 803,
    "episode_folder_count": 12_103,
    "annotation_hdf5_count": 12_103,
    "mp4_count": 72_612,
    "visualization_rrd_count": 541,
}
# Expected facts for the public sample card, kept apart from the gated
# full-dataset snapshot. "tooling" is a set and is compared order-insensitively.
EXPECTED_SAMPLE = {
    "repo_id": "ropedia-ai/xperience-10m-sample",
    "pretty_name": "Xperience-10M-Sample",
    "license": "cc-by-nc-4.0",
    "tooling": {
        "HOMIE Toolkit",
        "Rerun 0.29.0 for visualization.rrd",
    },
}
# Substrings that must all occur in the joined "official_modalities" text of
# the alignment JSON (case-sensitive substring match).
MODALITY_MARKERS = [
    # Sensor streams
    "six RGB video streams",
    "audio",
    "stereo depth",
    # Pose and motion
    "camera pose",
    "SLAM",
    "two-hand motion capture",
    "full-body motion capture",
    "inertial",
    # Annotations and bookkeeping
    "language",
    "metadata",
    "calibration",
]
# Entries that must each appear verbatim in the alignment JSON's
# "not_yet_claimed" list (checked as a subset in build_report()).
CURRENT_PROJECT_LIMIT_MARKERS = [
    "large-scale audio-visual pretraining",
    "caption generation",
    "depth-pixel estimation",
    "SLAM estimation",
    "neural rendering",
    "policy learning",
    "cross-episode generalization",
    "real held-out multi-episode Qwen3-Omni model quality",
]
# Repo-relative file -> substrings that file must contain. Paths are resolved
# against ROOT; matching is a plain case-sensitive substring test.
PRESENTATION_MARKERS = {
    # Main project README.
    "README.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    # Dataset-card alignment document.
    "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "31.9 TB",
        "31,871,115,497,224",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    # Data notice.
    "DATA_NOTICE.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "does not redistribute",
    ],
    # Website landing page.
    "docs/index.html": [
        "ropedia-ai/xperience-10m",
        "xperience-10m-sample",
        "data/source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "not a local data inventory",
        "limited diversity",
    ],
}
# HF-publish-relative file -> substrings that file must contain. Paths are
# resolved against the --hf-root directory rather than ROOT.
HF_PRESENTATION_MARKERS = {
    "space/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
    "artifacts/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    "artifacts/PROJECT_README.md": [
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "limited in diversity",
    ],
    # This card lists "HOMIE" and "Toolkit" as two separate markers, unlike the
    # single "HOMIE Toolkit" marker used for the other files.
    "model/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE",
        "Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
}
def load_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON document stored at *path* and return it."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def check(name: str, passed: bool, detail: str, evidence: list[str]) -> dict:
    """Package one check outcome as a report record.

    The record carries ``name``, ``detail`` and ``evidence`` unchanged, plus a
    ``status`` of ``"pass"`` when *passed* is truthy and ``"fail"`` otherwise.
    """
    status = "fail"
    if passed:
        status = "pass"
    return dict(name=name, status=status, detail=detail, evidence=evidence)
def marker_record(base: Path, relative_path: str, markers: list[str]) -> dict:
    """Report which required marker strings are absent from one file.

    Args:
        base: Directory that *relative_path* is resolved against.
        relative_path: File location relative to *base*; echoed back in the
            record under ``"path"``.
        markers: Substrings that must all occur in the file's text
            (case-sensitive).

    Returns:
        A dict with ``path``, ``exists``, ``required_marker_count``,
        ``missing_markers`` and ``status``. ``status`` is ``"pass"`` only when
        the file is present and no marker is missing.
    """
    path = base / relative_path
    # Only a regular file can be read. A missing path or a directory is
    # reported as a failed record (every marker missing) instead of letting
    # read_text() raise and abort the whole gate. The check is made once so
    # the record cannot disagree with itself.
    is_file = path.is_file()
    # Undecodable bytes are dropped rather than raising, so a stray byte in a
    # card cannot abort the run.
    text = path.read_text(encoding="utf-8", errors="ignore") if is_file else ""
    missing = [marker for marker in markers if marker not in text]
    return {
        "path": relative_path,
        "exists": is_file,
        "required_marker_count": len(markers),
        "missing_markers": missing,
        "status": "pass" if is_file and not missing else "fail",
    }
def render_markdown(payload: dict) -> str:
    """Render the audit payload as a Markdown note.

    Args:
        payload: Report dict carrying ``status``, ``alignment_summary`` and
            ``checks`` (each check needs ``name``, ``status`` and
            ``evidence``).

    Returns:
        The Markdown document as a single string ending in a newline.

    Raises:
        KeyError: If a required key is missing from *payload* or its
            ``alignment_summary``.
    """
    alignment = payload["alignment_summary"]

    # The episode count is None when the alignment JSON lacks the field. A
    # thousands separator cannot be applied to None (TypeError) or to a string
    # (ValueError), so fall back to the plain text form; the failing report is
    # then still written instead of the script crashing here.
    episode_folders = alignment["api_episode_folders"]
    try:
        episode_folders_text = f"{episode_folders:,}"
    except (TypeError, ValueError):
        episode_folders_text = str(episode_folders)

    lines = [
        "# Source Alignment Note",
        "",
        "This file records how the repo, website, and HF cards present the same",
        "Xperience-10M source facts and current-project language.",
        "",
        f"Current status: **{payload['status']}**",
        "",
        "## Source Facts",
        "",
        "| Layer | Current value |",
        "| --- | --- |",
        f"| Full dataset repo | `{alignment['full_dataset_repo']}` |",
        f"| Full dataset access | {alignment['full_dataset_access']} |",
        f"| Live HF file-size display | {alignment['live_hf_file_size_display']} |",
        f"| Full-scale storage statement | {alignment['full_scale_storage_statement']} |",
        f"| API episode listing | {episode_folders_text} episode folders with `annotation.hdf5` as upstream metadata only |",
        f"| Public sample repo | `{alignment['sample_repo']}` |",
        f"| Public sample license | `{alignment['sample_license']}` |",
        f"| Current verified project data | {alignment['current_project_scope']} |",
        "",
        "## Checks",
        "",
        "| Check | Status | Evidence |",
        "| --- | --- | --- |",
    ]

    # One table row per check; evidence paths are shown as inline code.
    for item in payload["checks"]:
        evidence = ", ".join(f"`{path}`" for path in item["evidence"])
        lines.append(f"| {item['name']} | {item['status']} | {evidence} |")

    lines.extend([
        "",
        "## Current Project Scope",
        "",
        "- HF API file counts are source-listing metadata, not local data possession.",
        "- The live HF 31.9 TB file-size display is recorded separately from the card's about-1PB full-scale storage statement.",
        "- The public sample license is preserved separately from the gated full dataset license field.",
        "- The official limited-diversity / showcase-quality disclaimer is preserved in the responsible-use notes.",
        "- Raw MP4, HDF5, RRD, private gated data, and full Qwen weights are not redistributed.",
        "- Current model evidence remains one public sample episode, not cross-episode generalization.",
        # Trailing empty entry so the joined document ends with a newline.
        "",
    ])
    return "\n".join(lines)
def build_report(hf_root: Path) -> dict:
    """Run every source-alignment check and assemble the audit payload.

    Args:
        hf_root: Directory against which the ``HF_PRESENTATION_MARKERS`` paths
            are resolved.

    Returns:
        A dict holding the overall ``status`` (``"pass"`` only when no check
        failed), a UTC generation timestamp, an ``alignment_summary``, the
        ordered ``checks``, the per-file marker records for the repo and the
        HF tree, and the list of failed checks.
    """
    alignment_ref = "docs/data/xperience10m_dataset_card_alignment.json"

    source = load_json(ALIGNMENT_JSON)
    card = source.get("hf_repo_metadata_observed", {})
    listing = card.get("api_file_listing_observed", {})
    live_page = card.get("live_hf_page_observed", {})
    sample_card = source.get("public_sample_card_observed", {})
    project = source.get("current_repo_alignment", {})
    responsible_text = "\n".join(source.get("responsible_use_boundary", []))

    results: list[dict] = []

    def record(name, passed, detail, evidence=None):
        # Checks over the alignment JSON all cite that file as their evidence.
        cited = [alignment_ref] if evidence is None else evidence
        results.append(check(name, passed, detail, cited))

    # --- Gated full-dataset snapshot -------------------------------------
    # The generators keep the comparisons lazy, so evaluation stops at the
    # first mismatch, in the listed field order.
    scalar_fields = ("repo_id", "pretty_name", "repo_sha", "last_modified", "gated", "license")
    set_fields = ("task_categories", "modalities", "card_tags")
    live_page_fields = ("total_file_size_display", "used_storage_bytes_observed")
    full_dataset_ok = (
        all(card.get(field) == EXPECTED_FULL_DATASET[field] for field in scalar_fields)
        and all(set(card.get(field, [])) == EXPECTED_FULL_DATASET[field] for field in set_fields)
        and all(live_page.get(field) == EXPECTED_FULL_DATASET[field] for field in live_page_fields)
    )
    record(
        "full_dataset_metadata_matches_observed_snapshot",
        full_dataset_ok,
        "gated full-dataset metadata, card tags, and live HF file-size display match the recorded snapshot",
    )

    # --- API listing counts ----------------------------------------------
    listing_ok = not any(
        listing.get(field) != expected for field, expected in EXPECTED_API_LISTING.items()
    )
    record(
        "api_listing_snapshot_is_consistent",
        listing_ok,
        "HF API file-listing counts remain internally consistent in the committed alignment JSON",
    )

    # --- Public sample card ----------------------------------------------
    sample_ok = (
        all(sample_card.get(field) == EXPECTED_SAMPLE[field] for field in ("repo_id", "pretty_name", "license"))
        and set(sample_card.get("tooling", [])) == EXPECTED_SAMPLE["tooling"]
    )
    record(
        "sample_card_metadata_is_preserved",
        sample_ok,
        "public sample card license and tooling are recorded separately from the gated full dataset",
    )

    # --- Official modality description -----------------------------------
    modality_text = "\n".join(source.get("official_modalities", []))
    absent_modalities = [phrase for phrase in MODALITY_MARKERS if phrase not in modality_text]
    record(
        "official_modality_description_is_complete",
        not absent_modalities,
        f"missing modality markers={absent_modalities}",
    )

    # --- Current project scope -------------------------------------------
    declared_unclaimed = set(project.get("not_yet_claimed", []))
    expected_scope_numbers = {
        "validated_episode_count": 1,
        "validated_frames": 5821,
        "validated_windows": 1161,
        "current_feature_dim": 8546,
    }
    scope_ok = (
        all(project.get(field) == value for field, value in expected_scope_numbers.items())
        and project.get("raw_data_redistributed") is False
        and "extracted into the current baseline feature vector" in project.get("audio_feature_status", "")
        and set(CURRENT_PROJECT_LIMIT_MARKERS) <= declared_unclaimed
    )
    record(
        "current_project_scope_is_explicit",
        scope_ok,
        "one-episode scope, audio status, raw-data exclusion, and current project coverage are present",
    )

    # --- Responsible-use disclaimer --------------------------------------
    required_phrases = (
        "limited in diversity",
        "showcase/production quality",
        "identity recognition",
        "surveillance",
        "sensitive attribute inference",
    )
    record(
        "responsible_use_disclaimer_is_preserved",
        all(phrase in responsible_text for phrase in required_phrases),
        "official limited-diversity and prohibited-use notes are preserved",
    )

    # --- Public surfaces: repo files, then HF publish tree ---------------
    repo_records = [
        marker_record(ROOT, rel_path, required)
        for rel_path, required in PRESENTATION_MARKERS.items()
    ]
    hf_records = [
        marker_record(hf_root, rel_path, required)
        for rel_path, required in HF_PRESENTATION_MARKERS.items()
    ]
    record(
        "repo_public_surfaces_preserve_source_markers",
        not any(entry["status"] != "pass" for entry in repo_records),
        "README, data notice, alignment doc, and website expose official dataset facts, sample details, and project coverage",
        [entry["path"] for entry in repo_records],
    )
    record(
        "hf_public_cards_preserve_source_markers",
        not any(entry["status"] != "pass" for entry in hf_records),
        "HF Space, artifact dataset, model card, and mirrored project README expose project coverage",
        [entry["path"] for entry in hf_records],
    )

    # --- Assemble the payload --------------------------------------------
    failed = [entry for entry in results if entry["status"] != "pass"]
    summary = {
        "full_dataset_repo": card.get("repo_id"),
        "full_dataset_access": card.get("gated"),
        "live_hf_file_size_display": live_page.get("total_file_size_display"),
        "full_scale_storage_statement": source.get("official_dataset_summary", {}).get("storage_described_by_card"),
        "api_episode_folders": listing.get("episode_folder_count"),
        "sample_repo": sample_card.get("repo_id"),
        "sample_license": sample_card.get("license"),
        "current_project_scope": "1 public sample episode, 5,821 frames, 1,161 windows, 8,546 current features",
    }
    return {
        "title": "Ropedia Xperience-10M Source Alignment Note",
        "status": "fail" if failed else "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "alignment_json": alignment_ref,
        "alignment_summary": summary,
        "checks": results,
        "repo_marker_records": repo_records,
        "hf_marker_records": hf_records,
        "failures": failed,
    }
def main() -> int:
    """Build the alignment report, write both output files, and return an exit code.

    Command-line options:
        --hf-root: root of the HF publish tree (default ``DEFAULT_HF_ROOT``).
        --output-json: destination of the JSON report (default ``OUTPUT_JSON``).
        --output-md: destination of the Markdown note (default ``OUTPUT_MD``).

    Returns:
        0 when every check passed, 1 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT)
    parser.add_argument("--output-json", type=Path, default=OUTPUT_JSON)
    parser.add_argument("--output-md", type=Path, default=OUTPUT_MD)
    args = parser.parse_args()

    payload = build_report(args.hf_root.resolve())

    # Create the parent directory of BOTH outputs before writing anything, so
    # a custom --output-md in a new directory cannot fail after the JSON
    # report has already been written.
    for destination in (args.output_json, args.output_md):
        destination.parent.mkdir(parents=True, exist_ok=True)

    args.output_json.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    args.output_md.write_text(render_markdown(payload), encoding="utf-8")

    print(f"{payload['status'].upper()}: wrote {args.output_json}")
    print(f"{payload['status'].upper()}: wrote {args.output_md}")
    return 0 if payload["status"] == "pass" else 1


if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())