File size: 15,831 Bytes

#!/usr/bin/env python3
"""Validate Xperience-10M source-description alignment.

This is an offline gate over committed source-alignment facts. It checks that
the repo distinguishes the gated full dataset, the public sample card, and this
project's one-episode scope across the main repo, website, and HF cards.
"""

from __future__ import annotations

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path


# Directory one level above the folder that contains this script
# (presumably the repository root -- confirm against the repo layout).
ROOT = Path(__file__).resolve().parents[1]
# Default location of the Hugging Face publish tree: a sibling of ROOT.
# Can be overridden on the command line with --hf-root.
DEFAULT_HF_ROOT = ROOT.parent / "hf_publish"
# Default destinations for the generated audit report (JSON and Markdown).
OUTPUT_JSON = ROOT / "docs/data/source_alignment_audit.json"
OUTPUT_MD = ROOT / "SOURCE_ALIGNMENT_AUDIT.md"
# Committed alignment facts that build_report() validates.
ALIGNMENT_JSON = ROOT / "docs/data/xperience10m_dataset_card_alignment.json"

# Expected snapshot of the full-dataset metadata. build_report() compares
# these values with the "hf_repo_metadata_observed" section of the alignment
# JSON (and its nested "live_hf_page_observed" section for the two storage
# fields). Set-valued entries are compared with set equality, so the order of
# the lists in the JSON does not matter but extra or missing items do.
EXPECTED_FULL_DATASET = {
    "repo_id": "ropedia-ai/xperience-10m",
    "pretty_name": "Xperience-10M",
    "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
    "last_modified": "2026-04-21T05:03:45.000Z",
    "gated": "manual",
    "license": "other",
    "task_categories": {
        "video-classification",
        "image-to-text",
        "depth-estimation",
        "robotics",
    },
    "modalities": {"3d", "audio", "video"},
    "card_tags": {
        "egocentric",
        "first-person",
        "multimodal",
        "3d",
        "4d",
        "embodied-ai",
        "robotics",
        "human-motion",
        "mocap",
        "imu",
        "audio",
        "depth",
        "captions",
        "video",
    },
    "total_file_size_display": "31.9 TB",
    "used_storage_bytes_observed": 31871115497224,
}

# Expected file-listing counts. build_report() requires every key here to
# equal the same key in the "api_file_listing_observed" section of the
# alignment JSON.
EXPECTED_API_LISTING = {
    "sibling_count": 85258,
    "session_folder_count": 803,
    "episode_folder_count": 12103,
    "annotation_hdf5_count": 12103,
    "mp4_count": 72612,
    "visualization_rrd_count": 541,
}

# Expected facts for the public sample card. build_report() compares them
# with the "public_sample_card_observed" section of the alignment JSON;
# "tooling" is compared as a set.
EXPECTED_SAMPLE = {
    "repo_id": "ropedia-ai/xperience-10m-sample",
    "pretty_name": "Xperience-10M-Sample",
    "license": "cc-by-nc-4.0",
    "tooling": {"HOMIE Toolkit", "Rerun 0.29.0 for visualization.rrd"},
}

# Substrings that must each occur (case-sensitive) somewhere in the
# newline-joined "official_modalities" entries of the alignment JSON.
MODALITY_MARKERS = [
    "six RGB video streams",
    "audio",
    "stereo depth",
    "camera pose",
    "SLAM",
    "two-hand motion capture",
    "full-body motion capture",
    "inertial",
    "language",
    "metadata",
    "calibration",
]

# Entries that must all be present, as exact items rather than substrings, in
# the "not_yet_claimed" list of the "current_repo_alignment" section.
CURRENT_PROJECT_LIMIT_MARKERS = [
    "large-scale audio-visual pretraining",
    "caption generation",
    "depth-pixel estimation",
    "SLAM estimation",
    "neural rendering",
    "policy learning",
    "cross-episode generalization",
    "real held-out multi-episode Qwen3-Omni model quality",
]

# Maps a file path relative to ROOT to the substrings that file must contain.
# Matching is a plain case-sensitive substring test performed by
# marker_record(); a file that is missing fails the check.
PRESENTATION_MARKERS = {
    "README.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "31.9 TB",
        "31,871,115,497,224",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    "DATA_NOTICE.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "does not redistribute",
    ],
    "docs/index.html": [
        "ropedia-ai/xperience-10m",
        "xperience-10m-sample",
        "data/source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "not a local data inventory",
        "limited diversity",
    ],
}

# Maps a file path relative to the HF publish root (--hf-root) to the
# substrings that file must contain. Checked the same way as
# PRESENTATION_MARKERS, via marker_record().
HF_PRESENTATION_MARKERS = {
    "space/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
    "artifacts/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    "artifacts/PROJECT_README.md": [
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "limited in diversity",
    ],
    "model/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        # NOTE(review): "HOMIE" and "Toolkit" are separate markers here, unlike
        # the single "HOMIE Toolkit" marker used for the other cards --
        # presumably the phrase is split in the model card; confirm.
        "HOMIE",
        "Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
}


def load_json(path: Path) -> dict:
    """Decode the UTF-8 JSON document stored at *path* and return it."""
    with path.open(encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def check(name: str, passed: bool, detail: str, evidence: list[str]) -> dict:
    """Build one check record for the audit report.

    The record carries *name*, a ``"pass"``/``"fail"`` status derived from the
    truthiness of *passed*, the human-readable *detail*, and the *evidence*
    list (stored as given, not copied).
    """
    if passed:
        outcome = "pass"
    else:
        outcome = "fail"
    return dict(name=name, status=outcome, detail=detail, evidence=evidence)


def marker_record(base: Path, relative_path: str, markers: list[str]) -> dict:
    """Report which of *markers* are absent from the file ``base / relative_path``.

    Args:
        base: Directory that *relative_path* is resolved against.
        relative_path: Path of the file to scan, relative to *base*; it is
            echoed back unchanged in the record.
        markers: Substrings (case-sensitive) that the file must contain.

    Returns:
        A dict with ``path``, ``exists``, ``required_marker_count``,
        ``missing_markers`` and ``status``. ``status`` is ``"pass"`` only when
        the file is present and no marker is missing. A missing file reports
        every marker as missing.
    """
    path = base / relative_path
    # Probe the filesystem once so "exists" and "status" always agree, and
    # require a regular file: a directory at this path cannot hold markers and
    # would otherwise make read_text() raise instead of yielding a "fail".
    exists = path.is_file()
    # Undecodable bytes are dropped rather than aborting the whole audit.
    text = path.read_text(encoding="utf-8", errors="ignore") if exists else ""
    missing = [marker for marker in markers if marker not in text]
    return {
        "path": relative_path,
        "exists": exists,
        "required_marker_count": len(markers),
        "missing_markers": missing,
        "status": "pass" if exists and not missing else "fail",
    }


def _format_count(value: object) -> str:
    """Render *value* with thousands separators, tolerating missing data."""
    if value is None:
        return "unknown"
    try:
        return f"{value:,}"
    except (TypeError, ValueError):
        # Non-numeric values cannot take the "," format option; show them as-is.
        return str(value)


def render_markdown(payload: dict) -> str:
    """Render the audit *payload* as the Markdown source-alignment note.

    Args:
        payload: Report dict with a ``status`` string, an ``alignment_summary``
            mapping, and a ``checks`` list whose items have ``name``,
            ``status`` and ``evidence`` entries.

    Returns:
        The Markdown document as a single string ending with a newline.

    The episode-folder count may be ``None`` when the alignment JSON lacks the
    API listing. That is exactly the situation in which the gate reports a
    failure, so it is rendered as ``unknown`` instead of raising while the
    failing report is being written.
    """
    alignment = payload["alignment_summary"]
    episode_folders = _format_count(alignment["api_episode_folders"])
    lines = [
        "# Source Alignment Note",
        "",
        "This file records how the repo, website, and HF cards present the same",
        "Xperience-10M source facts and current-project language.",
        "",
        f"Current status: **{payload['status']}**",
        "",
        "## Source Facts",
        "",
        "| Layer | Current value |",
        "| --- | --- |",
        f"| Full dataset repo | `{alignment['full_dataset_repo']}` |",
        f"| Full dataset access | {alignment['full_dataset_access']} |",
        f"| Live HF file-size display | {alignment['live_hf_file_size_display']} |",
        f"| Full-scale storage statement | {alignment['full_scale_storage_statement']} |",
        f"| API episode listing | {episode_folders} episode folders with `annotation.hdf5` as upstream metadata only |",
        f"| Public sample repo | `{alignment['sample_repo']}` |",
        f"| Public sample license | `{alignment['sample_license']}` |",
        f"| Current verified project data | {alignment['current_project_scope']} |",
        "",
        "## Checks",
        "",
        "| Check | Status | Evidence |",
        "| --- | --- | --- |",
    ]
    # One table row per check; evidence paths are shown as inline code.
    for item in payload["checks"]:
        evidence = ", ".join(f"`{path}`" for path in item["evidence"])
        lines.append(f"| {item['name']} | {item['status']} | {evidence} |")
    lines.extend([
        "",
        "## Current Project Scope",
        "",
        "- HF API file counts are source-listing metadata, not local data possession.",
        "- The live HF 31.9 TB file-size display is recorded separately from the card's about-1PB full-scale storage statement.",
        "- The public sample license is preserved separately from the gated full dataset license field.",
        "- The official limited-diversity / showcase-quality disclaimer is preserved in the responsible-use notes.",
        "- Raw MP4, HDF5, RRD, private gated data, and full Qwen weights are not redistributed.",
        "- Current model evidence remains one public sample episode, not cross-episode generalization.",
        # The trailing empty item makes the joined text end with a newline.
        "",
    ])
    return "\n".join(lines)


def build_report(hf_root: Path) -> dict:
    """Run every alignment check and assemble the audit payload.

    The committed alignment JSON (``ALIGNMENT_JSON``) is compared with the
    module-level expectation tables, and the repo files under ``ROOT`` plus the
    HF card files under *hf_root* are scanned for their required markers.

    Args:
        hf_root: Directory holding the Hugging Face publish tree.

    Returns:
        A dict with the overall ``status``, a UTC generation timestamp, the
        alignment summary, every check record, the per-file marker records,
        and the list of failing checks.
    """
    evidence_path = "docs/data/xperience10m_dataset_card_alignment.json"
    alignment = load_json(ALIGNMENT_JSON)

    # Sections of the alignment document; an absent section falls back to an
    # empty container so that the lookups below yield None / empty values.
    full_meta = alignment.get("hf_repo_metadata_observed", {})
    listing = full_meta.get("api_file_listing_observed", {})
    hf_page = full_meta.get("live_hf_page_observed", {})
    sample_card = alignment.get("public_sample_card_observed", {})
    project_scope = alignment.get("current_repo_alignment", {})
    responsible_text = "\n".join(alignment.get("responsible_use_boundary", []))

    # Full-dataset snapshot: plain fields, then set-valued fields, then the
    # two values observed on the live HF page.
    plain_fields = ("repo_id", "pretty_name", "repo_sha", "last_modified", "gated", "license")
    set_fields = ("task_categories", "modalities", "card_tags")
    page_fields = ("total_file_size_display", "used_storage_bytes_observed")
    full_dataset_ok = (
        all(full_meta.get(field) == EXPECTED_FULL_DATASET[field] for field in plain_fields)
        and all(set(full_meta.get(field, [])) == EXPECTED_FULL_DATASET[field] for field in set_fields)
        and all(hf_page.get(field) == EXPECTED_FULL_DATASET[field] for field in page_fields)
    )

    listing_ok = all(listing.get(field) == expected for field, expected in EXPECTED_API_LISTING.items())

    sample_ok = (
        all(sample_card.get(field) == EXPECTED_SAMPLE[field] for field in ("repo_id", "pretty_name", "license"))
        and set(sample_card.get("tooling", [])) == EXPECTED_SAMPLE["tooling"]
    )

    # Modality markers are matched as substrings of the joined description.
    modality_text = "\n".join(alignment.get("official_modalities", []))
    absent_modalities = []
    for marker in MODALITY_MARKERS:
        if marker not in modality_text:
            absent_modalities.append(marker)

    declared_limits = set(project_scope.get("not_yet_claimed", []))
    expected_scope_counts = {
        "validated_episode_count": 1,
        "validated_frames": 5821,
        "validated_windows": 1161,
        "current_feature_dim": 8546,
    }
    scope_ok = (
        all(project_scope.get(field) == count for field, count in expected_scope_counts.items())
        and project_scope.get("raw_data_redistributed") is False
        and "extracted into the current baseline feature vector" in project_scope.get("audio_feature_status", "")
        and set(CURRENT_PROJECT_LIMIT_MARKERS) <= declared_limits
    )

    required_phrases = (
        "limited in diversity",
        "showcase/production quality",
        "identity recognition",
        "surveillance",
        "sensitive attribute inference",
    )
    disclaimer_ok = all(phrase in responsible_text for phrase in required_phrases)

    # Marker scans over the repo surfaces and the HF cards.
    repo_marker_records = [
        marker_record(ROOT, relative, required) for relative, required in PRESENTATION_MARKERS.items()
    ]
    hf_marker_records = [
        marker_record(hf_root, relative, required) for relative, required in HF_PRESENTATION_MARKERS.items()
    ]

    checks = [
        check(
            "full_dataset_metadata_matches_observed_snapshot",
            full_dataset_ok,
            "gated full-dataset metadata, card tags, and live HF file-size display match the recorded snapshot",
            [evidence_path],
        ),
        check(
            "api_listing_snapshot_is_consistent",
            listing_ok,
            "HF API file-listing counts remain internally consistent in the committed alignment JSON",
            [evidence_path],
        ),
        check(
            "sample_card_metadata_is_preserved",
            sample_ok,
            "public sample card license and tooling are recorded separately from the gated full dataset",
            [evidence_path],
        ),
        check(
            "official_modality_description_is_complete",
            not absent_modalities,
            f"missing modality markers={absent_modalities}",
            [evidence_path],
        ),
        check(
            "current_project_scope_is_explicit",
            scope_ok,
            "one-episode scope, audio status, raw-data exclusion, and current project coverage are present",
            [evidence_path],
        ),
        check(
            "responsible_use_disclaimer_is_preserved",
            disclaimer_ok,
            "official limited-diversity and prohibited-use notes are preserved",
            [evidence_path],
        ),
        check(
            "repo_public_surfaces_preserve_source_markers",
            all(record["status"] == "pass" for record in repo_marker_records),
            "README, data notice, alignment doc, and website expose official dataset facts, sample details, and project coverage",
            [record["path"] for record in repo_marker_records],
        ),
        check(
            "hf_public_cards_preserve_source_markers",
            all(record["status"] == "pass" for record in hf_marker_records),
            "HF Space, artifact dataset, model card, and mirrored project README expose project coverage",
            [record["path"] for record in hf_marker_records],
        ),
    ]

    failures = [record for record in checks if record["status"] != "pass"]
    summary = {
        "full_dataset_repo": full_meta.get("repo_id"),
        "full_dataset_access": full_meta.get("gated"),
        "live_hf_file_size_display": hf_page.get("total_file_size_display"),
        "full_scale_storage_statement": alignment.get("official_dataset_summary", {}).get("storage_described_by_card"),
        "api_episode_folders": listing.get("episode_folder_count"),
        "sample_repo": sample_card.get("repo_id"),
        "sample_license": sample_card.get("license"),
        "current_project_scope": "1 public sample episode, 5,821 frames, 1,161 windows, 8,546 current features",
    }
    return {
        "title": "Ropedia Xperience-10M Source Alignment Note",
        "status": "fail" if failures else "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "alignment_json": "docs/data/xperience10m_dataset_card_alignment.json",
        "alignment_summary": summary,
        "checks": checks,
        "repo_marker_records": repo_marker_records,
        "hf_marker_records": hf_marker_records,
        "failures": failures,
    }


def main() -> int:
    """Run the audit from the command line and write both report files.

    Options:
        --hf-root: Directory holding the HF publish tree to scan.
        --output-json: Destination of the JSON report.
        --output-md: Destination of the Markdown report.

    Returns:
        ``0`` when the overall status is ``"pass"``, otherwise ``1``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT)
    parser.add_argument("--output-json", type=Path, default=OUTPUT_JSON)
    parser.add_argument("--output-md", type=Path, default=OUTPUT_MD)
    args = parser.parse_args()

    payload = build_report(args.hf_root.resolve())
    # Create the parent directory for both outputs so that a custom --output-md
    # location is handled the same way as a custom --output-json location.
    args.output_json.parent.mkdir(parents=True, exist_ok=True)
    args.output_md.parent.mkdir(parents=True, exist_ok=True)
    args.output_json.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    args.output_md.write_text(render_markdown(payload), encoding="utf-8")
    print(f"{payload['status'].upper()}: wrote {args.output_json}")
    print(f"{payload['status'].upper()}: wrote {args.output_md}")
    return 0 if payload["status"] == "pass" else 1


# Script entry point: exit the process with the code returned by main()
# (0 when the audit passes, 1 when it fails).
if __name__ == "__main__":
    raise SystemExit(main())