#!/usr/bin/env python3 """Validate Xperience-10M source-description alignment. This is an offline gate over committed source-alignment facts. It checks that the repo distinguishes the gated full dataset, the public sample card, and this project's one-episode scope across the main repo, website, and HF cards. """ from __future__ import annotations import argparse import json from datetime import datetime, timezone from pathlib import Path ROOT = Path(__file__).resolve().parents[1] DEFAULT_HF_ROOT = ROOT.parent / "hf_publish" OUTPUT_JSON = ROOT / "docs/data/source_alignment_audit.json" OUTPUT_MD = ROOT / "SOURCE_ALIGNMENT_AUDIT.md" ALIGNMENT_JSON = ROOT / "docs/data/xperience10m_dataset_card_alignment.json" EXPECTED_FULL_DATASET = { "repo_id": "ropedia-ai/xperience-10m", "pretty_name": "Xperience-10M", "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90", "last_modified": "2026-04-21T05:03:45.000Z", "gated": "manual", "license": "other", "task_categories": { "video-classification", "image-to-text", "depth-estimation", "robotics", }, "modalities": {"3d", "audio", "video"}, "card_tags": { "egocentric", "first-person", "multimodal", "3d", "4d", "embodied-ai", "robotics", "human-motion", "mocap", "imu", "audio", "depth", "captions", "video", }, "total_file_size_display": "31.9 TB", "used_storage_bytes_observed": 31871115497224, } EXPECTED_API_LISTING = { "sibling_count": 85258, "session_folder_count": 803, "episode_folder_count": 12103, "annotation_hdf5_count": 12103, "mp4_count": 72612, "visualization_rrd_count": 541, } EXPECTED_SAMPLE = { "repo_id": "ropedia-ai/xperience-10m-sample", "pretty_name": "Xperience-10M-Sample", "license": "cc-by-nc-4.0", "tooling": {"HOMIE Toolkit", "Rerun 0.29.0 for visualization.rrd"}, } MODALITY_MARKERS = [ "six RGB video streams", "audio", "stereo depth", "camera pose", "SLAM", "two-hand motion capture", "full-body motion capture", "inertial", "language", "metadata", "calibration", ] CURRENT_PROJECT_LIMIT_MARKERS = [ "large-scale audio-visual pretraining", "caption generation", "depth-pixel estimation", "SLAM estimation", "neural rendering", "policy learning", "cross-episode generalization", "real held-out multi-episode Qwen3-Omni model quality", ] PRESENTATION_MARKERS = { "README.md": [ "ropedia-ai/xperience-10m", "ropedia-ai/xperience-10m-sample", "SOURCE_ALIGNMENT_AUDIT.md", "source_alignment_audit.json", "31.9 TB", "about-1PB", "cc-by-nc-4.0", "HOMIE Toolkit", "Rerun 0.29.0", "12,103 episode folders", "metadata only", "limited in diversity", ], "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md": [ "ropedia-ai/xperience-10m", "ropedia-ai/xperience-10m-sample", "31.9 TB", "31,871,115,497,224", "cc-by-nc-4.0", "HOMIE Toolkit", "Rerun 0.29.0", "12,103 episode folders", "metadata only", "limited in diversity", ], "DATA_NOTICE.md": [ "ropedia-ai/xperience-10m", "ropedia-ai/xperience-10m-sample", "cc-by-nc-4.0", "HOMIE Toolkit", "Rerun 0.29.0", "does not redistribute", ], "docs/index.html": [ "ropedia-ai/xperience-10m", "xperience-10m-sample", "data/source_alignment_audit.json", "31.9 TB", "about-1PB", "cc-by-nc-4.0", "HOMIE Toolkit", "Rerun 0.29.0", "12,103 episode folders", "not a local data inventory", "limited diversity", ], } HF_PRESENTATION_MARKERS = { "space/README.md": [ "xperience10m_dataset_card_alignment.json", "source_alignment_audit.json", "31.9 TB", "about-1PB", "cc-by-nc-4.0", "HOMIE Toolkit", "Rerun 0.29.0", "12,103 episode folders", "upstream listing metadata only", "limited in diversity", ], "artifacts/README.md": [ "xperience10m_dataset_card_alignment.json", "source_alignment_audit.json", "31.9 TB", "about-1PB", "cc-by-nc-4.0", "HOMIE Toolkit", "Rerun 0.29.0", "12,103 episode folders", "metadata only", "limited in diversity", ], "artifacts/PROJECT_README.md": [ "ropedia-ai/xperience-10m-sample", "SOURCE_ALIGNMENT_AUDIT.md", "source_alignment_audit.json", "31.9 TB", "about-1PB", "cc-by-nc-4.0", "HOMIE Toolkit", "Rerun 0.29.0", "12,103 episode folders", "limited in diversity", ], "model/README.md": [ "xperience10m_dataset_card_alignment.json", "source_alignment_audit.json", "31.9 TB", "about-1PB", "cc-by-nc-4.0", "HOMIE", "Toolkit", "Rerun 0.29.0", "12,103 episode folders", "upstream listing metadata only", "limited in diversity", ], } def load_json(path: Path) -> dict: return json.loads(path.read_text(encoding="utf-8")) def check(name: str, passed: bool, detail: str, evidence: list[str]) -> dict: return { "name": name, "status": "pass" if passed else "fail", "detail": detail, "evidence": evidence, } def marker_record(base: Path, relative_path: str, markers: list[str]) -> dict: path = base / relative_path text = path.read_text(encoding="utf-8", errors="ignore") if path.exists() else "" missing = [marker for marker in markers if marker not in text] return { "path": relative_path, "exists": path.exists(), "required_marker_count": len(markers), "missing_markers": missing, "status": "pass" if path.exists() and not missing else "fail", } def render_markdown(payload: dict) -> str: alignment = payload["alignment_summary"] lines = [ "# Source Alignment Note", "", "This file records how the repo, website, and HF cards present the same", "Xperience-10M source facts and current-project language.", "", f"Current status: **{payload['status']}**", "", "## Source Facts", "", "| Layer | Current value |", "| --- | --- |", f"| Full dataset repo | `{alignment['full_dataset_repo']}` |", f"| Full dataset access | {alignment['full_dataset_access']} |", f"| Live HF file-size display | {alignment['live_hf_file_size_display']} |", f"| Full-scale storage statement | {alignment['full_scale_storage_statement']} |", f"| API episode listing | {alignment['api_episode_folders']:,} episode folders with `annotation.hdf5` as upstream metadata only |", f"| Public sample repo | `{alignment['sample_repo']}` |", f"| Public sample license | `{alignment['sample_license']}` |", f"| Current verified project data | {alignment['current_project_scope']} |", "", "## Checks", "", "| Check | Status | Evidence |", "| --- | --- | --- |", ] for item in payload["checks"]: evidence = ", ".join(f"`{path}`" for path in item["evidence"]) lines.append(f"| {item['name']} | {item['status']} | {evidence} |") lines.extend([ "", "## Current Project Scope", "", "- HF API file counts are source-listing metadata, not local data possession.", "- The live HF 31.9 TB file-size display is recorded separately from the card's about-1PB full-scale storage statement.", "- The public sample license is preserved separately from the gated full dataset license field.", "- The official limited-diversity / showcase-quality disclaimer is preserved in the responsible-use notes.", "- Raw MP4, HDF5, RRD, private gated data, and full Qwen weights are not redistributed.", "- Current model evidence remains one public sample episode, not cross-episode generalization.", "", ]) return "\n".join(lines) def build_report(hf_root: Path) -> dict: alignment = load_json(ALIGNMENT_JSON) checks: list[dict] = [] metadata = alignment.get("hf_repo_metadata_observed", {}) api_listing = metadata.get("api_file_listing_observed", {}) live_hf_page = metadata.get("live_hf_page_observed", {}) sample = alignment.get("public_sample_card_observed", {}) current = alignment.get("current_repo_alignment", {}) responsible_use = "\n".join(alignment.get("responsible_use_boundary", [])) checks.append( check( "full_dataset_metadata_matches_observed_snapshot", metadata.get("repo_id") == EXPECTED_FULL_DATASET["repo_id"] and metadata.get("pretty_name") == EXPECTED_FULL_DATASET["pretty_name"] and metadata.get("repo_sha") == EXPECTED_FULL_DATASET["repo_sha"] and metadata.get("last_modified") == EXPECTED_FULL_DATASET["last_modified"] and metadata.get("gated") == EXPECTED_FULL_DATASET["gated"] and metadata.get("license") == EXPECTED_FULL_DATASET["license"] and set(metadata.get("task_categories", [])) == EXPECTED_FULL_DATASET["task_categories"] and set(metadata.get("modalities", [])) == EXPECTED_FULL_DATASET["modalities"] and set(metadata.get("card_tags", [])) == EXPECTED_FULL_DATASET["card_tags"] and live_hf_page.get("total_file_size_display") == EXPECTED_FULL_DATASET["total_file_size_display"] and live_hf_page.get("used_storage_bytes_observed") == EXPECTED_FULL_DATASET["used_storage_bytes_observed"], "gated full-dataset metadata, card tags, and live HF file-size display match the recorded snapshot", ["docs/data/xperience10m_dataset_card_alignment.json"], ) ) checks.append( check( "api_listing_snapshot_is_consistent", all(api_listing.get(key) == value for key, value in EXPECTED_API_LISTING.items()), "HF API file-listing counts remain internally consistent in the committed alignment JSON", ["docs/data/xperience10m_dataset_card_alignment.json"], ) ) checks.append( check( "sample_card_metadata_is_preserved", sample.get("repo_id") == EXPECTED_SAMPLE["repo_id"] and sample.get("pretty_name") == EXPECTED_SAMPLE["pretty_name"] and sample.get("license") == EXPECTED_SAMPLE["license"] and set(sample.get("tooling", [])) == EXPECTED_SAMPLE["tooling"], "public sample card license and tooling are recorded separately from the gated full dataset", ["docs/data/xperience10m_dataset_card_alignment.json"], ) ) modality_text = "\n".join(alignment.get("official_modalities", [])) missing_modalities = [marker for marker in MODALITY_MARKERS if marker not in modality_text] checks.append( check( "official_modality_description_is_complete", not missing_modalities, f"missing modality markers={missing_modalities}", ["docs/data/xperience10m_dataset_card_alignment.json"], ) ) not_claimed = set(current.get("not_yet_claimed", [])) checks.append( check( "current_project_scope_is_explicit", current.get("validated_episode_count") == 1 and current.get("validated_frames") == 5821 and current.get("validated_windows") == 1161 and current.get("current_feature_dim") == 8546 and current.get("raw_data_redistributed") is False and "extracted into the current baseline feature vector" in current.get("audio_feature_status", "") and set(CURRENT_PROJECT_LIMIT_MARKERS).issubset(not_claimed), "one-episode scope, audio status, raw-data exclusion, and current project coverage are present", ["docs/data/xperience10m_dataset_card_alignment.json"], ) ) checks.append( check( "responsible_use_disclaimer_is_preserved", "limited in diversity" in responsible_use and "showcase/production quality" in responsible_use and "identity recognition" in responsible_use and "surveillance" in responsible_use and "sensitive attribute inference" in responsible_use, "official limited-diversity and prohibited-use notes are preserved", ["docs/data/xperience10m_dataset_card_alignment.json"], ) ) repo_marker_records = [marker_record(ROOT, path, markers) for path, markers in PRESENTATION_MARKERS.items()] hf_marker_records = [marker_record(hf_root, path, markers) for path, markers in HF_PRESENTATION_MARKERS.items()] checks.append( check( "repo_public_surfaces_preserve_source_markers", all(item["status"] == "pass" for item in repo_marker_records), "README, data notice, alignment doc, and website expose official dataset facts, sample details, and project coverage", [item["path"] for item in repo_marker_records], ) ) checks.append( check( "hf_public_cards_preserve_source_markers", all(item["status"] == "pass" for item in hf_marker_records), "HF Space, artifact dataset, model card, and mirrored project README expose project coverage", [item["path"] for item in hf_marker_records], ) ) failures = [item for item in checks if item["status"] != "pass"] payload = { "title": "Ropedia Xperience-10M Source Alignment Note", "status": "pass" if not failures else "fail", "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), "alignment_json": "docs/data/xperience10m_dataset_card_alignment.json", "alignment_summary": { "full_dataset_repo": metadata.get("repo_id"), "full_dataset_access": metadata.get("gated"), "live_hf_file_size_display": live_hf_page.get("total_file_size_display"), "full_scale_storage_statement": alignment.get("official_dataset_summary", {}).get("storage_described_by_card"), "api_episode_folders": api_listing.get("episode_folder_count"), "sample_repo": sample.get("repo_id"), "sample_license": sample.get("license"), "current_project_scope": "1 public sample episode, 5,821 frames, 1,161 windows, 8,546 current features", }, "checks": checks, "repo_marker_records": repo_marker_records, "hf_marker_records": hf_marker_records, "failures": failures, } return payload def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT) parser.add_argument("--output-json", type=Path, default=OUTPUT_JSON) parser.add_argument("--output-md", type=Path, default=OUTPUT_MD) args = parser.parse_args() payload = build_report(args.hf_root.resolve()) args.output_json.parent.mkdir(parents=True, exist_ok=True) args.output_json.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8") args.output_md.write_text(render_markdown(payload), encoding="utf-8") print(f"{payload['status'].upper()}: wrote {args.output_json}") print(f"{payload['status'].upper()}: wrote {args.output_md}") return 0 if payload["status"] == "pass" else 1 if __name__ == "__main__": raise SystemExit(main())