#!/usr/bin/env python3 """Validate public package contents for the repo and HF bundles. This check scans the GitHub repo plus the prepared Hugging Face Space/artifact/model folders for generated Python caches, raw Xperience-10M data, heavyweight checkpoint formats that do not belong in this public package, and accidental Hugging Face token strings. """ from __future__ import annotations import argparse import json import re import subprocess import sys from datetime import datetime, timezone from pathlib import Path ROOT = Path(__file__).resolve().parents[1] DEFAULT_HF_ROOT = ROOT.parent / "hf_publish" BANNED_DIR_NAMES = {"__pycache__"} BANNED_FILE_NAMES = {".DS_Store"} BANNED_SUFFIXES = {".pyc", ".pyo"} RAW_DATA_SUFFIXES = {".mp4", ".hdf5", ".h5", ".rrd"} HEAVY_MODEL_SUFFIXES = {".safetensors", ".bin", ".tar"} TEXT_SUFFIXES = { "", ".cff", ".csv", ".html", ".json", ".md", ".py", ".sh", ".svg", ".txt", ".webmanifest", ".xml", ".yaml", ".yml", } TOKEN_PATTERN = re.compile(r"hf_[A-Za-z0-9]{20,}") STALE_PRESENTATION_STRINGS = { "xperience10m-" + "modalities-v9-large-atlas": "old task-suite infographic cache key", "xperience10m-" + "taskfirst-v10": "older task-suite infographic cache key", "Start with the large native " + "modality atlas": "old suite-section hierarchy copy", "ChatGPT" + "-image": "internal image-generation tool wording in public copy", "H" + "20": "private compute infrastructure wording in public copy", "A" + "100": "private compute infrastructure wording in public copy", "Cur" + "sor": "editor/work-session wording in public copy", "public " + "dashboard and generated figures " + "deliberately " + "follow": "meta design-process wording in public copy", } LOCAL_PATH_PATTERNS = { "/" + "Users/": "local macOS user path in public text", "/" + "private/": "local scratch path in public text", } CARD_FRESHNESS_EXPECTATIONS = [ { "surface": "github_repo", "relative_path": "README.md", "required": [ "xperience10m-taskfirst-v13-modality-xl", "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md", "SOURCE_ALIGNMENT_AUDIT.md", "EVALUATION_PROTOCOL.md", "FIGURE_INDEX.md", "brand_assets.json", "PROJECT_STATUS.md", "RESEARCH_TAKEAWAYS.md", "xperience10m-logo-social-card.png", "build_brand_assets.py", "build_research_takeaways.py", "research_takeaways.json", "cc-by-nc-4.0", "12,103 episode folders", "all 12 task families before the", "Public-sample modality thumbnails remain enlarged below", "interactive scrub/play walkthrough storyboard", "task_surface_integrity.json", "rendered_site_check.json", "public_surface_qa.json", ], }, { "surface": "hf_space_bundle", "relative_path": "README.md", "required": [ "xperience10m-taskfirst-v13-modality-xl", "xperience10m_dataset_card_alignment.json", "source_alignment_audit.json", "evaluation_protocol.json", "figure_index.json", "brand_assets.json", "project_status.json", "research_takeaways.json", "xperience10m-logo-social-card.png", "build_brand_assets.py", "build_research_takeaways.py", "cc-by-nc-4.0", "12,103 episode folders", "Ropedia Xperience-10M 12-task infographic", "responsive native modality atlas", "interactive scrub/play walkthrough storyboard", "website HTML", "task_surface_integrity.json", "rendered_site_check.json", "public_surface_qa.json", ], }, { "surface": "hf_artifact_bundle", "relative_path": "README.md", "required": [ "xperience10m-taskfirst-v13-modality-xl", "xperience10m_dataset_card_alignment.json", "source_alignment_audit.json", "evaluation_protocol.json", "figure_index.json", "brand_assets.json", "project_status.json", "research_takeaways.json", "xperience10m-logo-social-card.png", "build_brand_assets.py", "build_research_takeaways.py", "cc-by-nc-4.0", "12,103 episode folders", "task-first 12-task map", "interactive scrub/play walkthrough storyboard", "website HTML", "task_surface_integrity.json", "rendered_site_check.json", "public_surface_qa.json", ], }, { "surface": "hf_artifact_bundle", "relative_path": "PROJECT_README.md", "required": [ "xperience10m-taskfirst-v13-modality-xl", "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md", "SOURCE_ALIGNMENT_AUDIT.md", "EVALUATION_PROTOCOL.md", "FIGURE_INDEX.md", "brand_assets.json", "PROJECT_STATUS.md", "RESEARCH_TAKEAWAYS.md", "xperience10m-logo-social-card.png", "build_brand_assets.py", "build_research_takeaways.py", "research_takeaways.json", "cc-by-nc-4.0", "12,103 episode folders", "all 12 task families before the", "Public-sample modality thumbnails remain enlarged below", "interactive scrub/play walkthrough storyboard", "task_surface_integrity.json", "rendered_site_check.json", "public_surface_qa.json", ], }, { "surface": "hf_model_bundle", "relative_path": "README.md", "required": [ "xperience10m-taskfirst-v13-modality-xl", "xperience10m_dataset_card_alignment.json", "source_alignment_audit.json", "evaluation_protocol.json", "figure_index.json", "brand_assets.json", "project_status.json", "research_takeaways.json", "xperience10m-logo-social-card.png", "build_brand_assets.py", "build_research_takeaways.py", "cc-by-nc-4.0", "12,103 episode folders", "Ropedia Xperience-10M 12-task infographic", "responsive native modality atlas", "interactive scrub/play walkthrough storyboard", "website HTML", "task_surface_integrity.json", "rendered_site_check.json", "public_surface_qa.json", ], }, ] def rel(path: Path, base: Path) -> str: try: return path.relative_to(base).as_posix() except ValueError: return path.as_posix() def git_public_paths(root: Path) -> list[Path] | None: try: result = subprocess.run( ["git", "-C", str(root), "ls-files", "--cached", "--others", "--exclude-standard"], check=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True, ) except (OSError, subprocess.CalledProcessError): return None return [root / line for line in result.stdout.splitlines() if line.strip()] def iter_public_files(root: Path, paths: list[Path] | None = None): if paths is not None: for path in paths: if path.exists(): yield path return if not root.exists(): return for path in root.rglob("*"): parts = set(path.parts) if ".git" in parts or ".venv" in parts or "venv" in parts: continue yield path def scan(root: Path, *, paths: list[Path] | None = None, display_root: str | None = None) -> dict: violations: list[dict] = [] text_files = 0 total_files = 0 largest_file = {"path": None, "bytes": 0} for path in iter_public_files(root, paths): path_rel = rel(path, root) if path.is_dir(): if path.name in BANNED_DIR_NAMES: violations.append({"kind": "generated_cache_dir", "path": path_rel}) continue total_files += 1 size = path.stat().st_size if size > largest_file["bytes"]: largest_file = {"path": path_rel, "bytes": size} suffix = path.suffix.lower() if path.name in BANNED_FILE_NAMES or suffix in BANNED_SUFFIXES: violations.append({"kind": "generated_cache_file", "path": path_rel}) if suffix in RAW_DATA_SUFFIXES: violations.append({"kind": "raw_xperience10m_data", "path": path_rel}) if suffix in HEAVY_MODEL_SUFFIXES: violations.append({"kind": "heavy_model_or_archive", "path": path_rel}) if suffix in TEXT_SUFFIXES: text_files += 1 try: text = path.read_text(encoding="utf-8", errors="ignore") except OSError: continue if TOKEN_PATTERN.search(text): violations.append({"kind": "possible_hf_token", "path": path_rel}) for needle, reason in LOCAL_PATH_PATTERNS.items(): if needle in text: violations.append({ "kind": "local_filesystem_path", "path": path_rel, "detail": reason, }) for needle, reason in STALE_PRESENTATION_STRINGS.items(): if needle in text: violations.append({ "kind": "stale_presentation_copy", "path": path_rel, "detail": reason, }) return { "root": display_root or rel(root, ROOT.parent), "exists": root.exists(), "file_count": total_files, "text_file_count": text_files, "largest_file": largest_file, "violations": violations, } def required_assets(root: Path) -> dict[str, bool]: required = [ "README.md", "CITATION.cff", "LICENSE", "codemeta.json", "ARTIFACT_GUIDE.md", "PROJECT_STATUS.md", "RESEARCH_ROADMAP.md", "RESEARCH_TAKEAWAYS.md", "QUALITY_GATES.md", "PUBLIC_SURFACE_QA.md", "RENDERED_SITE_CHECK.md", "EVALUATION_PROTOCOL.md", "FIGURE_INDEX.md", "SOURCE_ALIGNMENT_AUDIT.md", "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md", "REPRODUCIBILITY.md", "EVIDENCE_CONTRACT.md", "DATA_NOTICE.md", "docs/404.html", "docs/apple-touch-icon.png", "docs/favicon.svg", "docs/favicon.png", "docs/index.html", "docs/research_roadmap.html", "docs/robots.txt", "docs/site.webmanifest", "docs/sitemap.xml", "docs/data/brand_assets.json", "docs/data/evidence_contract.json", "docs/data/evaluation_protocol.json", "docs/data/figure_index.json", "docs/data/source_alignment_audit.json", "docs/data/artifact_index.json", "docs/data/live_publication_status.json", "docs/data/quality_gates.json", "docs/data/project_manifest.json", "docs/data/project_packet.json", "docs/data/project_status.json", "docs/data/research_roadmap.json", "docs/data/research_roadmap_interactive.json", "docs/data/research_takeaways.json", "docs/data/xperience10m_dataset_card_alignment.json", "docs/data/reproducibility_matrix.json", "docs/data/modality_atlas.json", "docs/data/mirror_parity.json", "docs/data/public_surface_qa.json", "docs/data/rendered_site_check.json", "docs/data/scope_claims_audit.json", "docs/data/task_surface_integrity.json", "docs/data/website_integrity.json", "docs/data/summary_metrics.json", "docs/assets/modalities/video.jpg", "docs/assets/modalities/audio.png", "docs/assets/modalities/depth.jpg", "docs/assets/modalities/pose_slam.png", "docs/assets/modalities/motion_capture.png", "docs/assets/modalities/inertial.png", "docs/assets/modalities/language.png", "docs/assets/brand/xperience10m-logo-apple-touch.png", "docs/assets/brand/xperience10m-logo-favicon-32.png", "docs/assets/brand/xperience10m-logo-favicon-64.png", "docs/assets/brand/xperience10m-logo-mark.png", "docs/assets/brand/xperience10m-logo-mark-192.png", "docs/assets/brand/xperience10m-logo-mark-512.png", "docs/assets/brand/xperience10m-logo-social-card.png", "docs/assets/task_suite_infographic.png", "docs/assets/pipeline_diagram.png", "docs/assets/task_architectures.png", "results/episode_task_suite/summary_report.json", "results/episode_task_suite/feature_manifest.json", "results/episode_task_suite/neural_mlp/timeline_action/metrics.json", "results/omni_finetune/DATA_ACCESS_STATUS.md", "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md", "scripts/episode_task_suite.py", "scripts/neural_task_models.py", "scripts/build_artifact_index.py", "scripts/build_brand_assets.py", "scripts/build_evaluation_protocol.py", "scripts/build_figure_index.py", "scripts/build_quality_gates.py", "scripts/build_public_surface_qa.py", "scripts/build_rendered_site_check.py", "scripts/build_interactive_research_roadmap.py", "scripts/verify_live_publication.py", "scripts/validate_mirror_parity.py", "scripts/validate_scope_claims.py", "scripts/validate_source_alignment.py", "scripts/validate_task_surface.py", "scripts/validate_website_integrity.py", "scripts/publish_hf_bundles.py", "scripts/omni/train_qwen3_omni_lora.py", ] return {item: (root / item).exists() for item in required} def public_card_freshness(roots: dict[str, Path]) -> list[dict]: records = [] for item in CARD_FRESHNESS_EXPECTATIONS: surface = item["surface"] path = roots[surface] / item["relative_path"] text = path.read_text(encoding="utf-8", errors="ignore") if path.exists() else "" missing = [marker for marker in item["required"] if marker not in text] records.append({ "surface": surface, "path": item["relative_path"], "exists": path.exists(), "required_marker_count": len(item["required"]), "missing_markers": missing, "status": "pass" if path.exists() and not missing else "fail", }) return records def build_report(hf_root: Path) -> dict: roots = { "github_repo": ROOT, "hf_space_bundle": hf_root / "space", "hf_artifact_bundle": hf_root / "artifacts", "hf_model_bundle": hf_root / "model", } root_labels = { "github_repo": "repo", "hf_space_bundle": "hf_publish/space", "hf_artifact_bundle": "hf_publish/artifacts", "hf_model_bundle": "hf_publish/model", } scans = {} for name, path in roots.items(): public_paths = git_public_paths(path) if name == "github_repo" else None scans[name] = scan(path, paths=public_paths, display_root=root_labels[name]) assets = required_assets(ROOT) card_freshness = public_card_freshness(roots) missing_assets = [path for path, present in assets.items() if not present] violations = [ {"root": name, **violation} for name, result in scans.items() for violation in result["violations"] ] checks = [ { "name": "required_publication_assets_present", "status": "pass" if not missing_assets else "fail", "missing": missing_assets, }, { "name": "no_generated_python_caches", "status": "pass" if not any(v["kind"].startswith("generated_cache") for v in violations) else "fail", "count": sum(1 for v in violations if v["kind"].startswith("generated_cache")), }, { "name": "no_raw_xperience10m_data", "status": "pass" if not any(v["kind"] == "raw_xperience10m_data" for v in violations) else "fail", "count": sum(1 for v in violations if v["kind"] == "raw_xperience10m_data"), }, { "name": "no_heavy_model_archives", "status": "pass" if not any(v["kind"] == "heavy_model_or_archive" for v in violations) else "fail", "count": sum(1 for v in violations if v["kind"] == "heavy_model_or_archive"), }, { "name": "no_hf_tokens_in_public_text", "status": "pass" if not any(v["kind"] == "possible_hf_token" for v in violations) else "fail", "count": sum(1 for v in violations if v["kind"] == "possible_hf_token"), }, { "name": "no_local_filesystem_paths_in_public_text", "status": "pass" if not any(v["kind"] == "local_filesystem_path" for v in violations) else "fail", "count": sum(1 for v in violations if v["kind"] == "local_filesystem_path"), }, { "name": "no_stale_task_suite_presentation_copy", "status": "pass" if not any(v["kind"] == "stale_presentation_copy" for v in violations) else "fail", "count": sum(1 for v in violations if v["kind"] == "stale_presentation_copy"), }, { "name": "public_cards_reference_taskfirst_figure", "status": "pass" if all(item["status"] == "pass" for item in card_freshness) else "fail", "failures": [item for item in card_freshness if item["status"] != "pass"], }, ] status = "pass" if all(check["status"] == "pass" for check in checks) else "fail" return { "status": status, "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"), "checks": checks, "required_assets": assets, "public_card_freshness": card_freshness, "scans": scans, "violations": violations, } def main() -> int: parser = argparse.ArgumentParser() parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT) parser.add_argument("--output", type=Path, default=ROOT / "docs/data/publication_audit.json") args = parser.parse_args() report = build_report(args.hf_root.resolve()) args.output.parent.mkdir(parents=True, exist_ok=True) args.output.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8") print(f"{report['status'].upper()}: wrote {args.output}") if report["status"] != "pass": for violation in report["violations"][:40]: print(f"- {violation['root']}: {violation['kind']} {violation['path']}") if len(report["violations"]) > 40: print(f"- ... {len(report['violations']) - 40} more violations") return 1 return 0 if __name__ == "__main__": raise SystemExit(main())