File size: 15,831 Bytes
29331c9
 
 
 
 
cf07180
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d58132
29331c9
 
 
 
 
 
 
 
 
 
 
9d58132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf07180
a8124a8
29331c9
 
 
 
 
 
9371cfb
29331c9
 
 
 
 
 
 
 
9d58132
 
29331c9
 
 
 
 
9d58132
29331c9
 
 
 
9d58132
 
29331c9
 
 
 
 
9d58132
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
9d58132
 
29331c9
 
 
 
cf07180
9d58132
29331c9
 
 
 
 
 
 
9d58132
 
29331c9
 
 
 
9371cfb
9d58132
29331c9
 
 
 
9d58132
 
29331c9
 
 
 
 
9d58132
29331c9
 
 
 
 
9d58132
 
29331c9
 
 
 
9d58132
29331c9
 
 
 
9d58132
 
29331c9
 
 
 
 
9371cfb
9d58132
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
476e8e8
29331c9
476e8e8
 
29331c9
 
 
 
 
 
 
 
 
9d58132
 
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf07180
29331c9
 
9d58132
29331c9
cf07180
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
9d58132
29331c9
 
9d58132
29331c9
 
 
 
 
9d58132
29331c9
 
 
 
 
9d58132
 
 
 
 
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf07180
29331c9
 
 
a8124a8
29331c9
a8124a8
cf07180
 
29331c9
 
 
9d58132
 
 
 
 
 
 
 
cf07180
9d58132
 
 
29331c9
 
 
 
 
 
 
6a1869c
29331c9
 
 
 
 
 
 
6a1869c
29331c9
 
 
 
 
 
476e8e8
29331c9
 
 
 
 
 
9d58132
 
29331c9
 
 
a8124a8
29331c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
#!/usr/bin/env python3
"""Validate Xperience-10M source-description alignment.

This is an offline gate over committed source-alignment facts. It checks that
the repo distinguishes the gated full dataset, the public sample card, and this
project's one-episode scope across the main repo, website, and HF cards.
"""

from __future__ import annotations

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path


# Directory one level above the folder containing this script; base for the
# repo-relative paths below and for the PRESENTATION_MARKERS file paths.
ROOT = Path(__file__).resolve().parents[1]
# Default base directory for the files listed in HF_PRESENTATION_MARKERS
# (a sibling of ROOT); overridable with --hf-root.
DEFAULT_HF_ROOT = ROOT.parent / "hf_publish"
# Default report destinations; overridable with --output-json / --output-md.
OUTPUT_JSON = ROOT / "docs/data/source_alignment_audit.json"
OUTPUT_MD = ROOT / "SOURCE_ALIGNMENT_AUDIT.md"
# Committed alignment facts that build_report() loads and validates.
ALIGNMENT_JSON = ROOT / "docs/data/xperience10m_dataset_card_alignment.json"

# Expected snapshot of the full dataset. build_report() compares these values
# with the "hf_repo_metadata_observed" section of the alignment JSON; the last
# two keys are compared with its "live_hf_page_observed" subsection.
# List-valued fields are stored as sets because they are compared after
# converting the observed lists to sets (order-insensitive, exact membership).
EXPECTED_FULL_DATASET = {
    "repo_id": "ropedia-ai/xperience-10m",
    "pretty_name": "Xperience-10M",
    "repo_sha": "ce943cf271a758b60240084892d05cf6dc12dd90",
    "last_modified": "2026-04-21T05:03:45.000Z",
    "gated": "manual",
    "license": "other",
    "task_categories": {
        "video-classification",
        "image-to-text",
        "depth-estimation",
        "robotics",
    },
    "modalities": {"3d", "audio", "video"},
    "card_tags": {
        "egocentric",
        "first-person",
        "multimodal",
        "3d",
        "4d",
        "embodied-ai",
        "robotics",
        "human-motion",
        "mocap",
        "imu",
        "audio",
        "depth",
        "captions",
        "video",
    },
    "total_file_size_display": "31.9 TB",
    "used_storage_bytes_observed": 31871115497224,
}

# Expected file-listing counts. build_report() requires every key here to
# equal the same key in the "api_file_listing_observed" section of the
# alignment JSON.
EXPECTED_API_LISTING = {
    "sibling_count": 85258,
    "session_folder_count": 803,
    "episode_folder_count": 12103,
    "annotation_hdf5_count": 12103,
    "mp4_count": 72612,
    "visualization_rrd_count": 541,
}

# Expected values for the "public_sample_card_observed" section of the
# alignment JSON. "tooling" is a set because the observed list is compared
# after conversion to a set.
EXPECTED_SAMPLE = {
    "repo_id": "ropedia-ai/xperience-10m-sample",
    "pretty_name": "Xperience-10M-Sample",
    "license": "cc-by-nc-4.0",
    "tooling": {"HOMIE Toolkit", "Rerun 0.29.0 for visualization.rrd"},
}

# Substrings that must each occur (case-sensitive) somewhere in the
# newline-joined "official_modalities" entries of the alignment JSON.
MODALITY_MARKERS = [
    "six RGB video streams",
    "audio",
    "stereo depth",
    "camera pose",
    "SLAM",
    "two-hand motion capture",
    "full-body motion capture",
    "inertial",
    "language",
    "metadata",
    "calibration",
]

# Entries that must all be present (exact string match) in the
# "not_yet_claimed" list under "current_repo_alignment" in the alignment JSON.
CURRENT_PROJECT_LIMIT_MARKERS = [
    "large-scale audio-visual pretraining",
    "caption generation",
    "depth-pixel estimation",
    "SLAM estimation",
    "neural rendering",
    "policy learning",
    "cross-episode generalization",
    "real held-out multi-episode Qwen3-Omni model quality",
]

# Maps a file path (relative to ROOT) to the substrings that must appear
# verbatim in that file; consumed by marker_record() via build_report().
# Dict order determines the order of the evidence paths in the report.
# Note that docs/index.html is checked for slightly different wording than the
# Markdown files ("limited diversity", "xperience-10m-sample" without the org
# prefix, "not a local data inventory").
PRESENTATION_MARKERS = {
    "README.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "31.9 TB",
        "31,871,115,497,224",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    "DATA_NOTICE.md": [
        "ropedia-ai/xperience-10m",
        "ropedia-ai/xperience-10m-sample",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "does not redistribute",
    ],
    "docs/index.html": [
        "ropedia-ai/xperience-10m",
        "xperience-10m-sample",
        "data/source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "not a local data inventory",
        "limited diversity",
    ],
}

# Same structure as PRESENTATION_MARKERS, but the paths are resolved against
# the --hf-root directory instead of ROOT.
# NOTE(review): model/README.md is checked for "HOMIE" and "Toolkit" as two
# separate markers, unlike the other cards ("HOMIE Toolkit") -- presumably the
# phrase is split in that file; confirm this is intentional.
HF_PRESENTATION_MARKERS = {
    "space/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
    "artifacts/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "metadata only",
        "limited in diversity",
    ],
    "artifacts/PROJECT_README.md": [
        "ropedia-ai/xperience-10m-sample",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "limited in diversity",
    ],
    "model/README.md": [
        "xperience10m_dataset_card_alignment.json",
        "source_alignment_audit.json",
        "31.9 TB",
        "about-1PB",
        "cc-by-nc-4.0",
        "HOMIE",
        "Toolkit",
        "Rerun 0.29.0",
        "12,103 episode folders",
        "upstream listing metadata only",
        "limited in diversity",
    ],
}


def load_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON document stored at *path* and return it."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)


def check(name: str, passed: bool, detail: str, evidence: list[str]) -> dict:
    """Build one check record for the audit report.

    Args:
        name: Identifier of the check.
        passed: Truthy when the check succeeded.
        detail: Human-readable description stored with the record.
        evidence: Paths stored with the record as supporting evidence.

    Returns:
        A dict with the keys ``name``, ``status`` ("pass" or "fail"),
        ``detail`` and ``evidence``, in that order.
    """
    status = "pass" if passed else "fail"
    return dict(name=name, status=status, detail=detail, evidence=evidence)


def marker_record(base: Path, relative_path: str, markers: list[str]) -> dict:
    """Report which required markers are missing from one file.

    Args:
        base: Directory that *relative_path* is resolved against.
        relative_path: File path relative to *base*; echoed back in the record.
        markers: Substrings that must each occur verbatim in the file text.

    Returns:
        A dict with ``path``, ``exists``, ``required_marker_count``,
        ``missing_markers`` and ``status``. ``status`` is "pass" only when the
        file exists and no marker is missing; a missing file reports every
        marker as missing.
    """
    path = base / relative_path
    # Probe the filesystem once so "exists" and "status" always agree, and use
    # is_file() so a directory at this path yields a "fail" record instead of
    # crashing read_text() with IsADirectoryError.
    exists = path.is_file()
    # Undecodable bytes are dropped rather than raised on; markers are matched
    # against whatever text remains.
    text = path.read_text(encoding="utf-8", errors="ignore") if exists else ""
    missing = [marker for marker in markers if marker not in text]
    return {
        "path": relative_path,
        "exists": exists,
        "required_marker_count": len(markers),
        "missing_markers": missing,
        "status": "pass" if exists and not missing else "fail",
    }


def render_markdown(payload: dict) -> str:
    """Render the audit payload as a Markdown note.

    Args:
        payload: Report dict providing ``status``, ``alignment_summary`` (with
            the keys read below) and ``checks`` (records with ``name``,
            ``status`` and ``evidence``).

    Returns:
        The Markdown document as a single string ending with a newline.
    """
    alignment = payload["alignment_summary"]

    # The episode count is None (or otherwise non-numeric) when the alignment
    # JSON lacks the API listing. The "," format spec would then raise and the
    # failing report could not be written, so fall back to the plain string
    # form, matching how the other summary values are rendered.
    episode_folders = alignment["api_episode_folders"]
    try:
        episode_folders_text = f"{episode_folders:,}"
    except (TypeError, ValueError):
        episode_folders_text = str(episode_folders)

    lines = [
        "# Source Alignment Note",
        "",
        "This file records how the repo, website, and HF cards present the same",
        "Xperience-10M source facts and current-project language.",
        "",
        f"Current status: **{payload['status']}**",
        "",
        "## Source Facts",
        "",
        "| Layer | Current value |",
        "| --- | --- |",
        f"| Full dataset repo | `{alignment['full_dataset_repo']}` |",
        f"| Full dataset access | {alignment['full_dataset_access']} |",
        f"| Live HF file-size display | {alignment['live_hf_file_size_display']} |",
        f"| Full-scale storage statement | {alignment['full_scale_storage_statement']} |",
        f"| API episode listing | {episode_folders_text} episode folders with `annotation.hdf5` as upstream metadata only |",
        f"| Public sample repo | `{alignment['sample_repo']}` |",
        f"| Public sample license | `{alignment['sample_license']}` |",
        f"| Current verified project data | {alignment['current_project_scope']} |",
        "",
        "## Checks",
        "",
        "| Check | Status | Evidence |",
        "| --- | --- | --- |",
    ]
    # One table row per check; evidence paths are shown as inline code.
    for item in payload["checks"]:
        evidence = ", ".join(f"`{path}`" for path in item["evidence"])
        lines.append(f"| {item['name']} | {item['status']} | {evidence} |")
    lines.extend([
        "",
        "## Current Project Scope",
        "",
        "- HF API file counts are source-listing metadata, not local data possession.",
        "- The live HF 31.9 TB file-size display is recorded separately from the card's about-1PB full-scale storage statement.",
        "- The public sample license is preserved separately from the gated full dataset license field.",
        "- The official limited-diversity / showcase-quality disclaimer is preserved in the responsible-use notes.",
        "- Raw MP4, HDF5, RRD, private gated data, and full Qwen weights are not redistributed.",
        "- Current model evidence remains one public sample episode, not cross-episode generalization.",
        # Trailing empty entry makes the joined document end with a newline.
        "",
    ])
    return "\n".join(lines)


def build_report(hf_root: Path) -> dict:
    """Run every source-alignment check and assemble the audit payload.

    Args:
        hf_root: Directory against which the ``HF_PRESENTATION_MARKERS`` paths
            are resolved.

    Returns:
        A dict with the overall ``status`` ("pass" only when every check
        passes), a UTC generation timestamp, the ``alignment_summary`` values,
        the individual ``checks``, the per-file marker records for the repo
        and HF surfaces, and the list of failing checks.
    """
    alignment_evidence = "docs/data/xperience10m_dataset_card_alignment.json"
    alignment = load_json(ALIGNMENT_JSON)

    # Sections of the alignment JSON consulted by the checks below.
    metadata = alignment.get("hf_repo_metadata_observed", {})
    api_listing = metadata.get("api_file_listing_observed", {})
    live_hf_page = metadata.get("live_hf_page_observed", {})
    sample = alignment.get("public_sample_card_observed", {})
    current = alignment.get("current_repo_alignment", {})
    responsible_use = "\n".join(alignment.get("responsible_use_boundary", []))

    # Full-dataset snapshot: scalar fields, then set-valued fields, then the
    # live page figures. Generators keep the comparisons lazy and in order.
    scalar_fields = ("repo_id", "pretty_name", "repo_sha", "last_modified", "gated", "license")
    set_fields = ("task_categories", "modalities", "card_tags")
    live_page_fields = ("total_file_size_display", "used_storage_bytes_observed")
    full_dataset_ok = (
        all(metadata.get(field) == EXPECTED_FULL_DATASET[field] for field in scalar_fields)
        and all(set(metadata.get(field, [])) == EXPECTED_FULL_DATASET[field] for field in set_fields)
        and all(live_hf_page.get(field) == EXPECTED_FULL_DATASET[field] for field in live_page_fields)
    )

    # API listing counts must all equal the recorded snapshot.
    api_listing_ok = all(
        api_listing.get(key) == expected for key, expected in EXPECTED_API_LISTING.items()
    )

    # Public sample card: scalar fields plus the tooling set.
    sample_ok = (
        all(sample.get(field) == EXPECTED_SAMPLE[field] for field in ("repo_id", "pretty_name", "license"))
        and set(sample.get("tooling", [])) == EXPECTED_SAMPLE["tooling"]
    )

    # Official modality description must mention every marker.
    modality_text = "\n".join(alignment.get("official_modalities", []))
    missing_modalities = []
    for marker in MODALITY_MARKERS:
        if marker not in modality_text:
            missing_modalities.append(marker)

    # Current project scope: fixed one-episode figures plus explicit limits.
    not_claimed = set(current.get("not_yet_claimed", []))
    expected_scope = {
        "validated_episode_count": 1,
        "validated_frames": 5821,
        "validated_windows": 1161,
        "current_feature_dim": 8546,
    }
    scope_ok = (
        all(current.get(key) == expected for key, expected in expected_scope.items())
        and current.get("raw_data_redistributed") is False
        and "extracted into the current baseline feature vector" in current.get("audio_feature_status", "")
        and not_claimed >= set(CURRENT_PROJECT_LIMIT_MARKERS)
    )

    # Responsible-use notes must keep every required phrase.
    required_phrases = (
        "limited in diversity",
        "showcase/production quality",
        "identity recognition",
        "surveillance",
        "sensitive attribute inference",
    )
    responsible_use_ok = all(phrase in responsible_use for phrase in required_phrases)

    # Marker scans over the repo files and the HF card files.
    repo_marker_records = [
        marker_record(ROOT, relative, required) for relative, required in PRESENTATION_MARKERS.items()
    ]
    hf_marker_records = [
        marker_record(hf_root, relative, required) for relative, required in HF_PRESENTATION_MARKERS.items()
    ]
    repo_markers_ok = all(record["status"] == "pass" for record in repo_marker_records)
    hf_markers_ok = all(record["status"] == "pass" for record in hf_marker_records)

    checks: list[dict] = [
        check(
            "full_dataset_metadata_matches_observed_snapshot",
            full_dataset_ok,
            "gated full-dataset metadata, card tags, and live HF file-size display match the recorded snapshot",
            [alignment_evidence],
        ),
        check(
            "api_listing_snapshot_is_consistent",
            api_listing_ok,
            "HF API file-listing counts remain internally consistent in the committed alignment JSON",
            [alignment_evidence],
        ),
        check(
            "sample_card_metadata_is_preserved",
            sample_ok,
            "public sample card license and tooling are recorded separately from the gated full dataset",
            [alignment_evidence],
        ),
        check(
            "official_modality_description_is_complete",
            not missing_modalities,
            f"missing modality markers={missing_modalities}",
            [alignment_evidence],
        ),
        check(
            "current_project_scope_is_explicit",
            scope_ok,
            "one-episode scope, audio status, raw-data exclusion, and current project coverage are present",
            [alignment_evidence],
        ),
        check(
            "responsible_use_disclaimer_is_preserved",
            responsible_use_ok,
            "official limited-diversity and prohibited-use notes are preserved",
            [alignment_evidence],
        ),
        check(
            "repo_public_surfaces_preserve_source_markers",
            repo_markers_ok,
            "README, data notice, alignment doc, and website expose official dataset facts, sample details, and project coverage",
            [record["path"] for record in repo_marker_records],
        ),
        check(
            "hf_public_cards_preserve_source_markers",
            hf_markers_ok,
            "HF Space, artifact dataset, model card, and mirrored project README expose project coverage",
            [record["path"] for record in hf_marker_records],
        ),
    ]

    failures = [entry for entry in checks if entry["status"] != "pass"]
    return {
        "title": "Ropedia Xperience-10M Source Alignment Note",
        "status": "fail" if failures else "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "alignment_json": "docs/data/xperience10m_dataset_card_alignment.json",
        "alignment_summary": {
            "full_dataset_repo": metadata.get("repo_id"),
            "full_dataset_access": metadata.get("gated"),
            "live_hf_file_size_display": live_hf_page.get("total_file_size_display"),
            "full_scale_storage_statement": alignment.get("official_dataset_summary", {}).get("storage_described_by_card"),
            "api_episode_folders": api_listing.get("episode_folder_count"),
            "sample_repo": sample.get("repo_id"),
            "sample_license": sample.get("license"),
            "current_project_scope": "1 public sample episode, 5,821 frames, 1,161 windows, 8,546 current features",
        },
        "checks": checks,
        "repo_marker_records": repo_marker_records,
        "hf_marker_records": hf_marker_records,
        "failures": failures,
    }


def main() -> int:
    """Build the audit report from the command line and write both outputs.

    Parses ``--hf-root``, ``--output-json`` and ``--output-md``, writes the
    JSON payload and its Markdown rendering, and prints one status line per
    written file.

    Returns:
        0 when the report status is "pass", otherwise 1 (used as the process
        exit code).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT)
    parser.add_argument("--output-json", type=Path, default=OUTPUT_JSON)
    parser.add_argument("--output-md", type=Path, default=OUTPUT_MD)
    args = parser.parse_args()

    payload = build_report(args.hf_root.resolve())
    # Create the parent directory for both outputs. Previously only the JSON
    # parent was created, so a custom --output-md in a new directory failed
    # after the JSON file had already been written.
    for target in (args.output_json, args.output_md):
        target.parent.mkdir(parents=True, exist_ok=True)
    args.output_json.write_text(json.dumps(payload, indent=2) + "\n", encoding="utf-8")
    args.output_md.write_text(render_markdown(payload), encoding="utf-8")
    print(f"{payload['status'].upper()}: wrote {args.output_json}")
    print(f"{payload['status'].upper()}: wrote {args.output_md}")
    return 0 if payload["status"] == "pass" else 1


if __name__ == "__main__":
    raise SystemExit(main())