| |
| """Validate public package contents for the repo and HF bundles. |
| |
| This check scans the GitHub repo plus the prepared Hugging Face |
| Space/artifact/model folders for generated Python caches, raw Xperience-10M |
| data, heavyweight checkpoint formats that do not belong in this public package, |
| and accidental Hugging Face token strings. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import re |
| import subprocess |
| import sys |
| from datetime import datetime, timezone |
| from pathlib import Path |
|
|
|
|
# Repository root: two levels up from this file's resolved location.
ROOT = Path(__file__).resolve().parents[1]
# Default location of the prepared Hugging Face bundles (sibling of the repo).
DEFAULT_HF_ROOT = ROOT.parent.joinpath("hf_publish")

# Names and suffixes that mark generated caches / OS clutter.
BANNED_DIR_NAMES = {"__pycache__"}
BANNED_FILE_NAMES = {".DS_Store"}
BANNED_SUFFIXES = {".pyc", ".pyo"}

# Suffixes reported as raw dataset payloads or heavyweight model/archive files.
RAW_DATA_SUFFIXES = {".mp4", ".hdf5", ".h5", ".rrd"}
HEAVY_MODEL_SUFFIXES = {".safetensors", ".bin", ".tar"}

# Only files with one of these suffixes are opened and searched as text.
# The empty suffix covers extensionless files such as LICENSE.
TEXT_SUFFIXES = {
    "",
    ".cff",
    ".csv",
    ".html",
    ".json",
    ".md",
    ".py",
    ".sh",
    ".svg",
    ".txt",
    ".webmanifest",
    ".xml",
    ".yaml",
    ".yml",
}

# Shape of a string that looks like a Hugging Face access token.
TOKEN_PATTERN = re.compile(r"hf_[A-Za-z0-9]{20,}")

# NOTE: the needles below are assembled from fragments on purpose. ".py" is in
# TEXT_SUFFIXES, so spelling a needle out in one piece here would make this
# file report itself.
_STALE_NEEDLE_PARTS = (
    (("xperience10m-", "modalities-v9-large-atlas"), "old task-suite infographic cache key"),
    (("xperience10m-", "taskfirst-v10"), "older task-suite infographic cache key"),
    (("Start with the large native ", "modality atlas"), "old suite-section hierarchy copy"),
    (("ChatGPT", "-image"), "internal image-generation tool wording in public copy"),
    (("H", "20"), "private compute infrastructure wording in public copy"),
    (("A", "100"), "private compute infrastructure wording in public copy"),
    (("Cur", "sor"), "editor/work-session wording in public copy"),
    (
        ("public ", "dashboard and generated figures ", "deliberately ", "follow"),
        "meta design-process wording in public copy",
    ),
)
# Maps each outdated/internal phrase to the reason it must not ship.
STALE_PRESENTATION_STRINGS = {
    "".join(parts): reason for parts, reason in _STALE_NEEDLE_PARTS
}

_LOCAL_PATH_PARTS = (
    (("/", "Users/"), "local macOS user path in public text"),
    (("/", "private/"), "local scratch path in public text"),
)
# Maps each machine-local path prefix to the reason it must not ship.
LOCAL_PATH_PATTERNS = {
    "".join(parts): reason for parts, reason in _LOCAL_PATH_PARTS
}
# Marker groups shared between the public cards. Each card's README must
# contain every marker listed for it; the groups are kept as tuples and copied
# into a fresh list per card so the per-card order stays exactly as reported.
_CARD_QA_REPORT_MARKERS = (
    "task_surface_integrity.json",
    "rendered_site_check.json",
    "public_surface_qa.json",
)

# Full marker list for the Markdown-oriented cards (repo README and the
# artifact bundle's PROJECT_README).
_MARKDOWN_CARD_MARKERS = (
    "xperience10m-taskfirst-v13-modality-xl",
    "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
    "SOURCE_ALIGNMENT_AUDIT.md",
    "EVALUATION_PROTOCOL.md",
    "FIGURE_INDEX.md",
    "brand_assets.json",
    "PROJECT_STATUS.md",
    "RESEARCH_TAKEAWAYS.md",
    "xperience10m-logo-social-card.png",
    "build_brand_assets.py",
    "build_research_takeaways.py",
    "research_takeaways.json",
    "cc-by-nc-4.0",
    "12,103 episode folders",
    "all 12 task families before the",
    "Public-sample modality thumbnails remain enlarged below",
    "interactive scrub/play walkthrough storyboard",
    *_CARD_QA_REPORT_MARKERS,
)

# Leading markers common to the three Hugging Face bundle READMEs.
_BUNDLE_CARD_HEAD = (
    "xperience10m-taskfirst-v13-modality-xl",
    "xperience10m_dataset_card_alignment.json",
    "source_alignment_audit.json",
    "evaluation_protocol.json",
    "figure_index.json",
    "brand_assets.json",
    "project_status.json",
    "research_takeaways.json",
    "xperience10m-logo-social-card.png",
    "build_brand_assets.py",
    "build_research_takeaways.py",
    "cc-by-nc-4.0",
    "12,103 episode folders",
)

# Trailing markers common to the three Hugging Face bundle READMEs.
_BUNDLE_CARD_TAIL = (
    "interactive scrub/play walkthrough storyboard",
    "website HTML",
    *_CARD_QA_REPORT_MARKERS,
)

# Bundle-specific wording that sits between the shared head and tail.
_INFOGRAPHIC_CARD_COPY = (
    "Ropedia Xperience-10M 12-task infographic",
    "responsive native modality atlas",
)
_ARTIFACT_CARD_COPY = ("task-first 12-task map",)


def _card_expectation(surface, relative_path, *marker_groups):
    """Build one expectation record with its own flattened marker list."""
    return {
        "surface": surface,
        "relative_path": relative_path,
        "required": [marker for group in marker_groups for marker in group],
    }


# One record per public card: which surface it belongs to, where the card
# lives inside that surface, and the substrings it must contain.
CARD_FRESHNESS_EXPECTATIONS = [
    _card_expectation("github_repo", "README.md", _MARKDOWN_CARD_MARKERS),
    _card_expectation(
        "hf_space_bundle",
        "README.md",
        _BUNDLE_CARD_HEAD,
        _INFOGRAPHIC_CARD_COPY,
        _BUNDLE_CARD_TAIL,
    ),
    _card_expectation(
        "hf_artifact_bundle",
        "README.md",
        _BUNDLE_CARD_HEAD,
        _ARTIFACT_CARD_COPY,
        _BUNDLE_CARD_TAIL,
    ),
    _card_expectation("hf_artifact_bundle", "PROJECT_README.md", _MARKDOWN_CARD_MARKERS),
    _card_expectation(
        "hf_model_bundle",
        "README.md",
        _BUNDLE_CARD_HEAD,
        _INFOGRAPHIC_CARD_COPY,
        _BUNDLE_CARD_TAIL,
    ),
]
|
|
|
|
def rel(path: Path, base: Path) -> str:
    """Return *path* as a POSIX-style string, relative to *base* when possible.

    If *path* does not lie under *base*, the path is returned unchanged (still
    rendered with forward slashes).
    """
    try:
        shown = path.relative_to(base)
    except ValueError:
        # Not located under ``base``: fall back to the path as given.
        shown = path
    return shown.as_posix()
|
|
|
|
def git_public_paths(root: Path) -> list[Path] | None:
    """List the files git considers part of the working tree at *root*.

    Runs ``git ls-files --cached --others --exclude-standard`` so the result
    covers tracked files plus untracked files that are not ignored.

    Args:
        root: Directory passed to ``git -C``.

    Returns:
        Paths joined onto *root*, in git's output order with duplicates
        removed, or ``None`` when git cannot be run or exits with an error
        (for example when *root* is not inside a repository).
    """
    command = [
        "git",
        "-C",
        str(root),
        "ls-files",
        # NUL-terminated output: without -z git C-quotes names that contain
        # non-ASCII or special characters, and the quoted text is not a usable
        # path, so such files would silently drop out of the audit.
        "-z",
        "--cached",
        "--others",
        "--exclude-standard",
    ]
    try:
        result = subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        return None

    # Decode file names the same way Python decodes names coming from the OS,
    # so that undecodable bytes still round-trip to the right file.
    encoding = sys.getfilesystemencoding()
    errors = sys.getfilesystemencodeerrors()
    names = [raw.decode(encoding, errors) for raw in result.stdout.split(b"\0") if raw]

    # An unmerged path is listed once per index stage; keep the first
    # occurrence only so no file is scanned (and reported) more than once.
    return [root / name for name in dict.fromkeys(names)]
|
|
|
|
def iter_public_files(root: Path, paths: list[Path] | None = None):
    """Yield the filesystem entries that belong to a public surface.

    Args:
        root: Directory of the surface being audited.
        paths: Optional explicit list of candidate paths. When given, only the
            entries of this list that currently exist are yielded and *root*
            is not walked.

    Yields:
        ``Path`` objects. With an explicit *paths* list they come in list
        order; otherwise every entry below *root* (files and directories) is
        yielded in sorted order, skipping anything inside a ``.git``,
        ``.venv`` or ``venv`` directory. Nothing is yielded when *root* does
        not exist.
    """
    if paths is not None:
        for candidate in paths:
            if candidate.exists():
                yield candidate
        return
    if not root.exists():
        return

    skipped_dirs = {".git", ".venv", "venv"}
    # Sorted so that the resulting report does not depend on directory order.
    for entry in sorted(root.rglob("*")):
        # Only components *below* root are considered: a root that itself sits
        # under a directory called e.g. "venv" must still be scanned, otherwise
        # the audit would pass vacuously with zero files.
        if skipped_dirs.intersection(entry.relative_to(root).parts):
            continue
        yield entry
|
|
|
|
def _text_violations(text: str, path_rel: str) -> list[dict]:
    """Return the content-based violations found in one file's text."""
    found: list[dict] = []
    if TOKEN_PATTERN.search(text):
        found.append({"kind": "possible_hf_token", "path": path_rel})
    for needle, reason in LOCAL_PATH_PATTERNS.items():
        if needle in text:
            found.append({
                "kind": "local_filesystem_path",
                "path": path_rel,
                "detail": reason,
            })
    for needle, reason in STALE_PRESENTATION_STRINGS.items():
        if needle in text:
            found.append({
                "kind": "stale_presentation_copy",
                "path": path_rel,
                "detail": reason,
            })
    return found


def scan(root: Path, *, paths: list[Path] | None = None, display_root: str | None = None) -> dict:
    """Audit one public surface and collect everything that must not ship.

    Args:
        root: Directory of the surface; reported paths are relative to it.
        paths: Optional explicit candidate list forwarded to
            ``iter_public_files`` instead of walking *root*.
        display_root: Label stored under ``"root"`` in the result. Defaults to
            *root* rendered relative to the parent of the repository root.

    Returns:
        A dict with the keys ``root``, ``exists``, ``file_count``,
        ``text_file_count``, ``largest_file`` (``path``/``bytes``) and
        ``violations``. Each violation is a dict with ``kind`` and ``path``,
        plus ``detail`` for the local-path and stale-copy kinds.
    """
    violations: list[dict] = []
    text_files = 0
    total_files = 0
    largest_file = {"path": None, "bytes": 0}

    for path in iter_public_files(root, paths):
        path_rel = rel(path, root)
        if path.is_dir():
            if path.name in BANNED_DIR_NAMES:
                violations.append({"kind": "generated_cache_dir", "path": path_rel})
            continue

        total_files += 1
        try:
            size = path.stat().st_size
        except OSError:
            # Dangling symlink or a file removed mid-scan: keep auditing the
            # entry by name instead of aborting the whole report.
            size = 0
        if size > largest_file["bytes"]:
            largest_file = {"path": path_rel, "bytes": size}

        # Name-based checks; a single file may trigger several of them.
        suffix = path.suffix.lower()
        if path.name in BANNED_FILE_NAMES or suffix in BANNED_SUFFIXES:
            violations.append({"kind": "generated_cache_file", "path": path_rel})
        if suffix in RAW_DATA_SUFFIXES:
            violations.append({"kind": "raw_xperience10m_data", "path": path_rel})
        if suffix in HEAVY_MODEL_SUFFIXES:
            violations.append({"kind": "heavy_model_or_archive", "path": path_rel})

        if suffix not in TEXT_SUFFIXES:
            continue

        # Content-based checks. The file counts as a text file even when it
        # turns out to be unreadable; reading is best-effort by design.
        text_files += 1
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        violations.extend(_text_violations(text, path_rel))

    return {
        "root": display_root or rel(root, ROOT.parent),
        "exists": root.exists(),
        "file_count": total_files,
        "text_file_count": text_files,
        "largest_file": largest_file,
        "violations": violations,
    }
|
|
|
|
def required_assets(root: Path) -> dict[str, bool]:
    """Report which of the expected publication files exist under *root*.

    Returns a dict mapping each expected relative path (in the order listed
    below) to ``True`` when that path exists under *root* and ``False``
    otherwise.
    """
    expected = (
        # Top-level documents.
        "README.md",
        "CITATION.cff",
        "LICENSE",
        "codemeta.json",
        "ARTIFACT_GUIDE.md",
        "PROJECT_STATUS.md",
        "RESEARCH_ROADMAP.md",
        "RESEARCH_TAKEAWAYS.md",
        "QUALITY_GATES.md",
        "PUBLIC_SURFACE_QA.md",
        "RENDERED_SITE_CHECK.md",
        "EVALUATION_PROTOCOL.md",
        "FIGURE_INDEX.md",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
        "REPRODUCIBILITY.md",
        "EVIDENCE_CONTRACT.md",
        "DATA_NOTICE.md",
        # Website pages and site metadata.
        "docs/404.html",
        "docs/apple-touch-icon.png",
        "docs/favicon.svg",
        "docs/favicon.png",
        "docs/index.html",
        "docs/research_roadmap.html",
        "docs/robots.txt",
        "docs/site.webmanifest",
        "docs/sitemap.xml",
        # Website data files.
        "docs/data/brand_assets.json",
        "docs/data/evidence_contract.json",
        "docs/data/evaluation_protocol.json",
        "docs/data/figure_index.json",
        "docs/data/source_alignment_audit.json",
        "docs/data/artifact_index.json",
        "docs/data/live_publication_status.json",
        "docs/data/quality_gates.json",
        "docs/data/project_manifest.json",
        "docs/data/project_packet.json",
        "docs/data/project_status.json",
        "docs/data/research_roadmap.json",
        "docs/data/research_roadmap_interactive.json",
        "docs/data/research_takeaways.json",
        "docs/data/xperience10m_dataset_card_alignment.json",
        "docs/data/reproducibility_matrix.json",
        "docs/data/modality_atlas.json",
        "docs/data/mirror_parity.json",
        "docs/data/public_surface_qa.json",
        "docs/data/rendered_site_check.json",
        "docs/data/scope_claims_audit.json",
        "docs/data/task_surface_integrity.json",
        "docs/data/website_integrity.json",
        "docs/data/summary_metrics.json",
        # Modality thumbnails.
        "docs/assets/modalities/video.jpg",
        "docs/assets/modalities/audio.png",
        "docs/assets/modalities/depth.jpg",
        "docs/assets/modalities/pose_slam.png",
        "docs/assets/modalities/motion_capture.png",
        "docs/assets/modalities/inertial.png",
        "docs/assets/modalities/language.png",
        # Brand assets and figures.
        "docs/assets/brand/xperience10m-logo-apple-touch.png",
        "docs/assets/brand/xperience10m-logo-favicon-32.png",
        "docs/assets/brand/xperience10m-logo-favicon-64.png",
        "docs/assets/brand/xperience10m-logo-mark.png",
        "docs/assets/brand/xperience10m-logo-mark-192.png",
        "docs/assets/brand/xperience10m-logo-mark-512.png",
        "docs/assets/brand/xperience10m-logo-social-card.png",
        "docs/assets/task_suite_infographic.png",
        "docs/assets/pipeline_diagram.png",
        "docs/assets/task_architectures.png",
        # Result files.
        "results/episode_task_suite/summary_report.json",
        "results/episode_task_suite/feature_manifest.json",
        "results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
        "results/omni_finetune/DATA_ACCESS_STATUS.md",
        "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
        # Scripts.
        "scripts/episode_task_suite.py",
        "scripts/neural_task_models.py",
        "scripts/build_artifact_index.py",
        "scripts/build_brand_assets.py",
        "scripts/build_evaluation_protocol.py",
        "scripts/build_figure_index.py",
        "scripts/build_quality_gates.py",
        "scripts/build_public_surface_qa.py",
        "scripts/build_rendered_site_check.py",
        "scripts/build_interactive_research_roadmap.py",
        "scripts/verify_live_publication.py",
        "scripts/validate_mirror_parity.py",
        "scripts/validate_scope_claims.py",
        "scripts/validate_source_alignment.py",
        "scripts/validate_task_surface.py",
        "scripts/validate_website_integrity.py",
        "scripts/publish_hf_bundles.py",
        "scripts/omni/train_qwen3_omni_lora.py",
    )
    presence: dict[str, bool] = {}
    for relative in expected:
        presence[relative] = root.joinpath(relative).exists()
    return presence
|
|
|
|
def public_card_freshness(roots: dict[str, Path]) -> list[dict]:
    """Check every public card for the marker strings it is expected to carry.

    Args:
        roots: Maps a surface name to the directory that holds that surface.
            Must contain every surface named in CARD_FRESHNESS_EXPECTATIONS.

    Returns:
        One record per expectation with the surface, the card's relative path,
        whether the card exists, how many markers were required, which ones
        are missing, and a ``"pass"``/``"fail"`` status. A card passes only if
        it exists and contains every marker.
    """
    records = []
    for expectation in CARD_FRESHNESS_EXPECTATIONS:
        surface = expectation["surface"]
        markers = expectation["required"]
        card = roots[surface] / expectation["relative_path"]

        present = card.exists()
        if present:
            body = card.read_text(encoding="utf-8", errors="ignore")
        else:
            # A missing card is treated as empty, so every marker is missing.
            body = ""

        absent = [marker for marker in markers if marker not in body]
        passed = present and not absent
        records.append({
            "surface": surface,
            "path": expectation["relative_path"],
            "exists": present,
            "required_marker_count": len(markers),
            "missing_markers": absent,
            "status": "pass" if passed else "fail",
        })
    return records
|
|
|
|
def _violation_check(name: str, violations: list[dict], matches) -> dict:
    """Build one check record counting the violations whose kind matches."""
    hits = [violation for violation in violations if matches(violation["kind"])]
    return {
        "name": name,
        "status": "fail" if hits else "pass",
        "count": len(hits),
    }


def build_report(hf_root: Path) -> dict:
    """Scan the repo and the three Hugging Face bundles and assemble the report.

    Args:
        hf_root: Directory holding the ``space``, ``artifacts`` and ``model``
            bundle folders.

    Returns:
        A dict with the overall ``status``, a UTC timestamp, the individual
        ``checks``, the required-asset presence map, the card freshness
        records, the per-surface scan results and the flattened list of
        violations (each tagged with the surface it came from).
    """
    # (surface name, directory, label used in the report)
    surfaces = (
        ("github_repo", ROOT, "repo"),
        ("hf_space_bundle", hf_root / "space", "hf_publish/space"),
        ("hf_artifact_bundle", hf_root / "artifacts", "hf_publish/artifacts"),
        ("hf_model_bundle", hf_root / "model", "hf_publish/model"),
    )
    roots = {name: location for name, location, _label in surfaces}

    scans = {}
    for name, location, label in surfaces:
        # Only the repo is restricted to what git lists; bundles are walked.
        listed = git_public_paths(location) if name == "github_repo" else None
        scans[name] = scan(location, paths=listed, display_root=label)

    assets = required_assets(ROOT)
    card_freshness = public_card_freshness(roots)
    missing_assets = [asset for asset, present in assets.items() if not present]

    violations = []
    for name, result in scans.items():
        for violation in result["violations"]:
            violations.append({"root": name, **violation})

    def exactly(expected: str):
        return lambda kind: kind == expected

    kind_checks = (
        ("no_generated_python_caches", lambda kind: kind.startswith("generated_cache")),
        ("no_raw_xperience10m_data", exactly("raw_xperience10m_data")),
        ("no_heavy_model_archives", exactly("heavy_model_or_archive")),
        ("no_hf_tokens_in_public_text", exactly("possible_hf_token")),
        ("no_local_filesystem_paths_in_public_text", exactly("local_filesystem_path")),
        ("no_stale_task_suite_presentation_copy", exactly("stale_presentation_copy")),
    )
    stale_cards = [record for record in card_freshness if record["status"] != "pass"]

    checks = [
        {
            "name": "required_publication_assets_present",
            "status": "fail" if missing_assets else "pass",
            "missing": missing_assets,
        },
    ]
    checks.extend(
        _violation_check(name, violations, matches) for name, matches in kind_checks
    )
    checks.append({
        "name": "public_cards_reference_taskfirst_figure",
        "status": "fail" if stale_cards else "pass",
        "failures": stale_cards,
    })

    failed = [check for check in checks if check["status"] != "pass"]
    return {
        "status": "fail" if failed else "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "checks": checks,
        "required_assets": assets,
        "public_card_freshness": card_freshness,
        "scans": scans,
        "violations": violations,
    }
|
|
|
|
def main() -> int:
    """Run the audit, write the JSON report, and summarise the outcome.

    Command-line options:
        --hf-root: directory holding the prepared Hugging Face bundles.
        --output: where the JSON report is written (parent folders are
            created as needed).

    Returns:
        ``0`` when every check passes, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Validate public package contents for the repo and HF bundles.",
    )
    parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT)
    parser.add_argument("--output", type=Path, default=ROOT / "docs/data/publication_audit.json")
    args = parser.parse_args()

    report = build_report(args.hf_root.resolve())
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
    print(f"{report['status'].upper()}: wrote {args.output}")
    if report["status"] == "pass":
        return 0

    # Name every failing check first: missing assets and stale cards are not
    # scan violations, so without this a failure could print no reason at all.
    for check in report["checks"]:
        if check["status"] == "pass":
            continue
        print(f"- failed check: {check['name']}")
        for asset in check.get("missing", []):
            print(f"  - missing asset: {asset}")
        for failure in check.get("failures", []):
            if failure["exists"]:
                detail = f"{len(failure['missing_markers'])} required marker(s) missing"
            else:
                detail = "card not found"
            print(f"  - {failure['surface']}: {failure['path']} ({detail})")

    # Cap the per-file listing so a badly polluted tree stays readable.
    shown = 40
    for violation in report["violations"][:shown]:
        print(f"- {violation['root']}: {violation['kind']} {violation['path']}")
    if len(report["violations"]) > shown:
        print(f"- ... {len(report['violations']) - shown} more violations")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
|
|