"""Validate Xperience-10M source-description alignment.

This is an offline gate over committed source-alignment facts. It checks that
the repo distinguishes the gated full dataset, the public sample card, and this
project's one-episode scope across the main repo, website, and HF cards.
"""
|
|
from __future__ import annotations

import argparse
import json
from collections.abc import Sequence
from datetime import datetime, timezone
from pathlib import Path
|
|
|
|
# Repository root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]

# Default location of the Hugging Face publish tree, a sibling of the repo root.
DEFAULT_HF_ROOT = ROOT.parent / "hf_publish"

# Report outputs written by main().
OUTPUT_JSON = ROOT / "docs" / "data" / "source_alignment_audit.json"
OUTPUT_MD = ROOT / "SOURCE_ALIGNMENT_AUDIT.md"

# Committed alignment facts that build_report() validates.
ALIGNMENT_JSON = ROOT / "docs" / "data" / "xperience10m_dataset_card_alignment.json"
|
|
# Recorded snapshot of the gated full-dataset card. build_report() compares the
# committed alignment JSON against these values; list-valued fields are compared
# as sets, so element order is irrelevant.
EXPECTED_FULL_DATASET: dict = {
    "repo_id": "ropedia-ai/xperience-10m",
    "pretty_name": "Xperience-10M",
    "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
    "last_modified": "2026-04-21T05:03:45.000Z",
    "gated": "manual",
    "license": "other",
    "task_categories": {
        "depth-estimation",
        "image-to-text",
        "robotics",
        "video-classification",
    },
    "modalities": {"3d", "audio", "video"},
    "card_tags": {
        "3d",
        "4d",
        "audio",
        "captions",
        "depth",
        "egocentric",
        "embodied-ai",
        "first-person",
        "human-motion",
        "imu",
        "mocap",
        "multimodal",
        "robotics",
        "video",
    },
    # The two entries below are checked against the "live_hf_page_observed" section.
    "total_file_size_display": "31.9 TB",
    "used_storage_bytes_observed": 31_871_115_497_224,
}
|
|
# Recorded file-listing counts; build_report() requires the committed
# "api_file_listing_observed" section to match every entry exactly.
EXPECTED_API_LISTING: dict[str, int] = {
    "sibling_count": 85_258,
    "session_folder_count": 803,
    "episode_folder_count": 12_103,
    "annotation_hdf5_count": 12_103,
    "mp4_count": 72_612,
    "visualization_rrd_count": 541,
}
|
|
# Recorded snapshot of the public sample card; "tooling" is compared as a set.
EXPECTED_SAMPLE: dict = {
    "repo_id": "ropedia-ai/xperience-10m-sample",
    "pretty_name": "Xperience-10M-Sample",
    "license": "cc-by-nc-4.0",
    "tooling": {
        "HOMIE Toolkit",
        "Rerun 0.29.0 for visualization.rrd",
    },
}
|
|
# Substrings that must each occur somewhere in the joined "official_modalities"
# entries of the alignment JSON (case-sensitive containment).
MODALITY_MARKERS: list[str] = [
    "six RGB video streams",
    "audio",
    "stereo depth",
    "camera pose",
    "SLAM",
    "two-hand motion capture",
    "full-body motion capture",
    "inertial",
    "language",
    "metadata",
    "calibration",
]
|
|
# Entries that must all be present (exact match) in the alignment JSON's
# "not_yet_claimed" list for the project-scope check to pass.
CURRENT_PROJECT_LIMIT_MARKERS: list[str] = [
    "large-scale audio-visual pretraining",
    "caption generation",
    "depth-pixel estimation",
    "SLAM estimation",
    "neural rendering",
    "policy learning",
    "cross-episode generalization",
    "real held-out multi-episode Qwen3-Omni model quality",
]
|
|
# Required substrings per repo-relative file. Keys are resolved against ROOT by
# build_report(); marker order is kept because it determines the order in which
# missing markers are reported.
PRESENTATION_MARKERS: dict[str, list[str]] = {
    # Main project README.
    "README.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    # Dataset-card alignment document.
    "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "31.9 TB",
        "31,871,115,497,224",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    # Data notice.
    "DATA_NOTICE.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "does not redistribute",
    ],
    # Website landing page.
    "docs/index.html": [
        "ropedia-ai/xperience-10m",
        "xperience-10m-sample",
        "data/source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "not a local data inventory",
        "limited diversity",
    ],
}
|
|
# Required substrings per file inside the Hugging Face publish tree. Keys are
# resolved against the --hf-root directory by build_report(); marker order is
# kept because it determines the order in which missing markers are reported.
HF_PRESENTATION_MARKERS: dict[str, list[str]] = {
    # Space card.
    "space/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
    # Artifact dataset card.
    "artifacts/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    # Mirrored project README.
    "artifacts/PROJECT_README.md": [
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "limited in diversity",
    ],
    # Model card; "HOMIE" and "Toolkit" are required as two separate substrings here.
    "model/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE",
        "Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
}
|
|
|
|
def load_json(path: Path) -> dict:
    """Decode the UTF-8 JSON document stored at ``path`` and return it.

    The decoded value is returned unchanged; callers in this module treat it
    as a mapping.
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
def check(name: str, passed: bool, detail: str, evidence: list[str]) -> dict:
    """Package the outcome of one gate check as a report record.

    Args:
        name: Identifier of the check.
        passed: Truthy when the check succeeded.
        detail: Human-readable description stored with the record.
        evidence: Paths backing the check; stored as given (not copied).

    Returns:
        A dict with keys ``name``, ``status`` ("pass" or "fail"), ``detail``
        and ``evidence``, in that order.
    """
    status = "pass" if passed else "fail"
    record = {"name": name, "status": status}
    record["detail"] = detail
    record["evidence"] = evidence
    return record
|
|
|
|
def marker_record(base: Path, relative_path: str, markers: list[str]) -> dict:
    """Report which required markers are absent from one file.

    Args:
        base: Directory that ``relative_path`` is resolved against.
        relative_path: File location relative to ``base``; echoed in the record.
        markers: Substrings that must each occur in the file text.

    Returns:
        A dict with ``path``, ``exists``, ``required_marker_count``,
        ``missing_markers`` and ``status``. The status is "pass" only when the
        file exists and no marker is missing; a missing file is treated as
        empty text, so every marker is reported missing.
    """
    target = base / relative_path
    found = target.exists()
    # Undecodable bytes are dropped rather than raising, so containment checks
    # run on whatever text can be decoded as UTF-8.
    content = target.read_text(encoding="utf-8", errors="ignore") if found else ""
    absent = [needle for needle in markers if needle not in content]
    outcome = "pass" if found and not absent else "fail"
    return {
        "path": relative_path,
        "exists": found,
        "required_marker_count": len(markers),
        "missing_markers": absent,
        "status": outcome,
    }
|
|
|
|
def _format_count(value: object) -> str:
    """Render ``value`` with thousands separators, falling back to ``str(value)``."""
    try:
        return f"{value:,}"
    except (TypeError, ValueError):
        # None (absent count) or a non-numeric value cannot take the "," format.
        return str(value)


def render_markdown(payload: dict) -> str:
    """Render the audit payload as the Markdown alignment note.

    Args:
        payload: Report dict providing ``status``, ``alignment_summary`` (with
            the keys read below) and ``checks`` (records carrying ``name``,
            ``status`` and ``evidence``).

    Returns:
        The Markdown document as a single string ending with a newline.

    Raises:
        KeyError: If a required key is missing from ``payload``.
    """
    alignment = payload["alignment_summary"]
    # The episode count is None when the alignment JSON lacks it; format it
    # defensively so a failing audit still produces its Markdown report
    # instead of crashing on the "," format spec.
    episode_folders = _format_count(alignment["api_episode_folders"])
    lines = [
        "# Source Alignment Note",
        "",
        "This file records how the repo, website, and HF cards present the same",
        "Xperience-10M source facts and current-project language.",
        "",
        f"Current status: **{payload['status']}**",
        "",
        "## Source Facts",
        "",
        "| Layer | Current value |",
        "| --- | --- |",
        f"| Full dataset repo | `{alignment['full_dataset_repo']}` |",
        f"| Full dataset access | {alignment['full_dataset_access']} |",
        f"| Live HF file-size display | {alignment['live_hf_file_size_display']} |",
        f"| Full-scale storage statement | {alignment['full_scale_storage_statement']} |",
        f"| API episode listing | {episode_folders} episode folders with `annotation.hdf5` as upstream metadata only |",
        f"| Public sample repo | `{alignment['sample_repo']}` |",
        f"| Public sample license | `{alignment['sample_license']}` |",
        f"| Current verified project data | {alignment['current_project_scope']} |",
        "",
        "## Checks",
        "",
        "| Check | Status | Evidence |",
        "| --- | --- | --- |",
    ]
    # One table row per check, with each evidence path rendered as inline code.
    for item in payload["checks"]:
        evidence = ", ".join(f"`{path}`" for path in item["evidence"])
        lines.append(f"| {item['name']} | {item['status']} | {evidence} |")
    lines.extend([
        "",
        "## Current Project Scope",
        "",
        "- HF API file counts are source-listing metadata, not local data possession.",
        "- The live HF 31.9 TB file-size display is recorded separately from the card's about-1PB full-scale storage statement.",
        "- The public sample license is preserved separately from the gated full dataset license field.",
        "- The official limited-diversity / showcase-quality disclaimer is preserved in the responsible-use notes.",
        "- Raw MP4, HDF5, RRD, private gated data, and full Qwen weights are not redistributed.",
        "- Current model evidence remains one public sample episode, not cross-episode generalization.",
        "",
    ])
    return "\n".join(lines)
|
|
|
|
def build_report(hf_root: Path) -> dict:
    """Run every alignment check and assemble the audit payload.

    The committed alignment JSON is compared against the recorded snapshots
    (full dataset, API listing, public sample), the modality and project-scope
    markers, and the responsible-use phrases. Presentation files under the
    repo root and under ``hf_root`` are then scanned for their required markers.

    Args:
        hf_root: Directory holding the Hugging Face publish tree.

    Returns:
        A dict with the overall ``status`` ("pass" only when no check failed),
        a UTC generation timestamp, the alignment summary, all check records,
        the per-file marker records and the list of failing checks.
    """
    alignment_ref = "docs/data/xperience10m_dataset_card_alignment.json"
    alignment = load_json(ALIGNMENT_JSON)

    metadata = alignment.get("hf_repo_metadata_observed", {})
    api_listing = metadata.get("api_file_listing_observed", {})
    live_hf_page = metadata.get("live_hf_page_observed", {})
    sample = alignment.get("public_sample_card_observed", {})
    current = alignment.get("current_repo_alignment", {})
    responsible_use = "\n".join(alignment.get("responsible_use_boundary", []))

    # Full-dataset snapshot: scalar fields first, then list fields compared as
    # sets, then the live-page figures. The generators keep evaluation lazy, so
    # the first mismatch stops the comparison.
    scalar_fields = ("repo_id", "pretty_name", "repo_sha", "last_modified", "gated", "license")
    set_fields = ("task_categories", "modalities", "card_tags")
    page_fields = ("total_file_size_display", "used_storage_bytes_observed")
    full_dataset_ok = (
        all(metadata.get(field) == EXPECTED_FULL_DATASET[field] for field in scalar_fields)
        and all(set(metadata.get(field, [])) == EXPECTED_FULL_DATASET[field] for field in set_fields)
        and all(live_hf_page.get(field) == EXPECTED_FULL_DATASET[field] for field in page_fields)
    )

    api_listing_ok = all(
        api_listing.get(field) == EXPECTED_API_LISTING[field] for field in EXPECTED_API_LISTING
    )

    sample_ok = (
        all(sample.get(field) == EXPECTED_SAMPLE[field] for field in ("repo_id", "pretty_name", "license"))
        and set(sample.get("tooling", [])) == EXPECTED_SAMPLE["tooling"]
    )

    modality_text = "\n".join(alignment.get("official_modalities", []))
    absent_modalities = [marker for marker in MODALITY_MARKERS if marker not in modality_text]

    # Project scope: exact validated figures, no raw-data redistribution, the
    # audio status phrase, and every limit marker listed as not yet claimed.
    not_claimed = set(current.get("not_yet_claimed", []))
    expected_scope = {
        "validated_episode_count": 1,
        "validated_frames": 5821,
        "validated_windows": 1161,
        "current_feature_dim": 8546,
    }
    scope_ok = (
        all(current.get(field) == value for field, value in expected_scope.items())
        and current.get("raw_data_redistributed") is False
        and "extracted into the current baseline feature vector" in current.get("audio_feature_status", "")
        and set(CURRENT_PROJECT_LIMIT_MARKERS).issubset(not_claimed)
    )

    required_phrases = (
        "limited in diversity",
        "showcase/production quality",
        "identity recognition",
        "surveillance",
        "sensitive attribute inference",
    )
    responsible_use_ok = all(phrase in responsible_use for phrase in required_phrases)

    repo_marker_records = [
        marker_record(ROOT, relative, needles) for relative, needles in PRESENTATION_MARKERS.items()
    ]
    hf_marker_records = [
        marker_record(hf_root, relative, needles) for relative, needles in HF_PRESENTATION_MARKERS.items()
    ]

    checks: list[dict] = [
        check(
            "full_dataset_metadata_matches_observed_snapshot",
            full_dataset_ok,
            "gated full-dataset metadata, card tags, and live HF file-size display match the recorded snapshot",
            [alignment_ref],
        ),
        check(
            "api_listing_snapshot_is_consistent",
            api_listing_ok,
            "HF API file-listing counts remain internally consistent in the committed alignment JSON",
            [alignment_ref],
        ),
        check(
            "sample_card_metadata_is_preserved",
            sample_ok,
            "public sample card license and tooling are recorded separately from the gated full dataset",
            [alignment_ref],
        ),
        check(
            "official_modality_description_is_complete",
            not absent_modalities,
            f"missing modality markers={absent_modalities}",
            [alignment_ref],
        ),
        check(
            "current_project_scope_is_explicit",
            scope_ok,
            "one-episode scope, audio status, raw-data exclusion, and current project coverage are present",
            [alignment_ref],
        ),
        check(
            "responsible_use_disclaimer_is_preserved",
            responsible_use_ok,
            "official limited-diversity and prohibited-use notes are preserved",
            [alignment_ref],
        ),
        check(
            "repo_public_surfaces_preserve_source_markers",
            all(record["status"] == "pass" for record in repo_marker_records),
            "README, data notice, alignment doc, and website expose official dataset facts, sample details, and project coverage",
            [record["path"] for record in repo_marker_records],
        ),
        check(
            "hf_public_cards_preserve_source_markers",
            all(record["status"] == "pass" for record in hf_marker_records),
            "HF Space, artifact dataset, model card, and mirrored project README expose project coverage",
            [record["path"] for record in hf_marker_records],
        ),
    ]

    failures = [entry for entry in checks if entry["status"] != "pass"]
    summary = {
        "full_dataset_repo": metadata.get("repo_id"),
        "full_dataset_access": metadata.get("gated"),
        "live_hf_file_size_display": live_hf_page.get("total_file_size_display"),
        "full_scale_storage_statement": alignment.get("official_dataset_summary", {}).get("storage_described_by_card"),
        "api_episode_folders": api_listing.get("episode_folder_count"),
        "sample_repo": sample.get("repo_id"),
        "sample_license": sample.get("license"),
        "current_project_scope": "1 public sample episode, 5,821 frames, 1,161 windows, 8,546 current features",
    }
    return {
        "title": "Ropedia Xperience-10M Source Alignment Note",
        "status": "fail" if failures else "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "alignment_json": alignment_ref,
        "alignment_summary": summary,
        "checks": checks,
        "repo_marker_records": repo_marker_records,
        "hf_marker_records": hf_marker_records,
        "failures": failures,
    }
|
|
|
|
def main(argv: Sequence[str] | None = None) -> int:
    """Run the source-alignment gate and write the JSON and Markdown reports.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which matches the previous
            behaviour.

    Returns:
        0 when the report status is "pass", otherwise 1.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT)
    parser.add_argument("--output-json", type=Path, default=OUTPUT_JSON)
    parser.add_argument("--output-md", type=Path, default=OUTPUT_MD)
    args = parser.parse_args(argv)

    payload = build_report(args.hf_root.resolve())

    # Create the parent directory of both outputs. Only the JSON parent used to
    # be created, so a custom --output-md in a new directory failed on write.
    for target in (args.output_json, args.output_md):
        target.parent.mkdir(parents=True, exist_ok=True)

    args.output_json.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    args.output_md.write_text(render_markdown(payload), encoding="utf-8")
    print(f"{payload['status'].upper()}: wrote {args.output_json}")
    print(f"{payload['status'].upper()}: wrote {args.output_md}")
    return 0 if payload["status"] == "pass" else 1
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
|