ropedia-xperience-10m-task-baselines / scripts /validate_publication_package.py
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
45c1706 verified
#!/usr/bin/env python3
"""Validate public package contents for the repo and HF bundles.
This check scans the GitHub repo plus the prepared Hugging Face
Space/artifact/model folders for generated Python caches, raw Xperience-10M
data, heavyweight checkpoint formats that do not belong in this public package,
and accidental Hugging Face token strings.
"""
from __future__ import annotations
import argparse
import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
# Repository root: two directory levels above this script file.
ROOT = Path(__file__).resolve().parents[1]
# Default location of the prepared Hugging Face bundles: an ``hf_publish``
# directory that sits next to the repository root.
DEFAULT_HF_ROOT = ROOT.parent / "hf_publish"
# Directory names reported as ``generated_cache_dir`` violations.
BANNED_DIR_NAMES = {"__pycache__"}
# File names reported as ``generated_cache_file`` violations.
BANNED_FILE_NAMES = {".DS_Store"}
# File suffixes reported as ``generated_cache_file`` violations.
BANNED_SUFFIXES = {".pyc", ".pyo"}
# File suffixes reported as ``raw_xperience10m_data`` violations.
RAW_DATA_SUFFIXES = {".mp4", ".hdf5", ".h5", ".rrd"}
# File suffixes reported as ``heavy_model_or_archive`` violations.
HEAVY_MODEL_SUFFIXES = {".safetensors", ".bin", ".tar"}
# Suffixes of files whose contents are read and searched as text. The scan
# compares these against the lower-cased final suffix of each file; the empty
# string makes extensionless files count as text.
TEXT_SUFFIXES = {
"",
".cff",
".csv",
".html",
".json",
".md",
".py",
".sh",
".svg",
".txt",
".webmanifest",
".xml",
".yaml",
".yml",
}
# Matches ``hf_`` followed by at least 20 ASCII letters or digits; any hit is
# reported as a possible Hugging Face token.
TOKEN_PATTERN = re.compile(r"hf_[A-Za-z0-9]{20,}")
# Substring -> reason. Text files containing a key are reported as
# ``stale_presentation_copy`` with the reason as detail.
# NOTE(review): the keys are assembled by concatenation, presumably so that
# this script does not contain the literal strings and flag itself - confirm.
STALE_PRESENTATION_STRINGS = {
"xperience10m-" + "modalities-v9-large-atlas": "old task-suite infographic cache key",
"xperience10m-" + "taskfirst-v10": "older task-suite infographic cache key",
"Start with the large native " + "modality atlas": "old suite-section hierarchy copy",
"ChatGPT" + "-image": "internal image-generation tool wording in public copy",
"H" + "20": "private compute infrastructure wording in public copy",
"A" + "100": "private compute infrastructure wording in public copy",
"Cur" + "sor": "editor/work-session wording in public copy",
"public " + "dashboard and generated figures " + "deliberately " + "follow": "meta design-process wording in public copy",
}
# Substring -> reason. Text files containing a key are reported as
# ``local_filesystem_path`` with the reason as detail. Keys are split the same
# way as the stale-copy keys above.
LOCAL_PATH_PATTERNS = {
"/" + "Users/": "local macOS user path in public text",
"/" + "private/": "local scratch path in public text",
}
# Markers required in the project READMEs (the GitHub README and the artifact
# bundle's PROJECT_README). These cards name the Markdown reports.
_PROJECT_README_MARKERS = (
    "xperience10m-taskfirst-v13-modality-xl",
    "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
    "SOURCE_ALIGNMENT_AUDIT.md",
    "EVALUATION_PROTOCOL.md",
    "FIGURE_INDEX.md",
    "brand_assets.json",
    "PROJECT_STATUS.md",
    "RESEARCH_TAKEAWAYS.md",
    "xperience10m-logo-social-card.png",
    "build_brand_assets.py",
    "build_research_takeaways.py",
    "research_takeaways.json",
    "cc-by-nc-4.0",
    "12,103 episode folders",
    "all 12 task families before the",
    "Public-sample modality thumbnails remain enlarged below",
    "interactive scrub/play walkthrough storyboard",
    "task_surface_integrity.json",
    "rendered_site_check.json",
    "public_surface_qa.json",
)
# Leading markers shared by every Hugging Face bundle README. These cards name
# the JSON data files rather than the Markdown reports.
_HF_CARD_LEAD_MARKERS = (
    "xperience10m-taskfirst-v13-modality-xl",
    "xperience10m_dataset_card_alignment.json",
    "source_alignment_audit.json",
    "evaluation_protocol.json",
    "figure_index.json",
    "brand_assets.json",
    "project_status.json",
    "research_takeaways.json",
    "xperience10m-logo-social-card.png",
    "build_brand_assets.py",
    "build_research_takeaways.py",
    "cc-by-nc-4.0",
    "12,103 episode folders",
)
# Figure wording required by the Space and model READMEs.
_HF_INFOGRAPHIC_MARKERS = (
    "Ropedia Xperience-10M 12-task infographic",
    "responsive native modality atlas",
)
# Figure wording required by the artifact bundle README.
_HF_ARTIFACT_MAP_MARKERS = ("task-first 12-task map",)
# Trailing markers shared by every Hugging Face bundle README.
_HF_CARD_TAIL_MARKERS = (
    "interactive scrub/play walkthrough storyboard",
    "website HTML",
    "task_surface_integrity.json",
    "rendered_site_check.json",
    "public_surface_qa.json",
)


def _card_expectation(surface, relative_path, *marker_groups):
    """Build one freshness expectation from ordered groups of markers.

    Each entry gets its own fresh ``required`` list, so mutating one entry can
    never leak into another that was built from the same marker groups.
    """
    return {
        "surface": surface,
        "relative_path": relative_path,
        "required": [marker for group in marker_groups for marker in group],
    }


# One entry per public card: which surface root it lives under, its path
# relative to that root, and the substrings its text must contain.
CARD_FRESHNESS_EXPECTATIONS = [
    _card_expectation("github_repo", "README.md", _PROJECT_README_MARKERS),
    _card_expectation(
        "hf_space_bundle",
        "README.md",
        _HF_CARD_LEAD_MARKERS,
        _HF_INFOGRAPHIC_MARKERS,
        _HF_CARD_TAIL_MARKERS,
    ),
    _card_expectation(
        "hf_artifact_bundle",
        "README.md",
        _HF_CARD_LEAD_MARKERS,
        _HF_ARTIFACT_MAP_MARKERS,
        _HF_CARD_TAIL_MARKERS,
    ),
    _card_expectation("hf_artifact_bundle", "PROJECT_README.md", _PROJECT_README_MARKERS),
    _card_expectation(
        "hf_model_bundle",
        "README.md",
        _HF_CARD_LEAD_MARKERS,
        _HF_INFOGRAPHIC_MARKERS,
        _HF_CARD_TAIL_MARKERS,
    ),
]
def rel(path: Path, base: Path) -> str:
    """Return ``path`` as a POSIX-style string, relative to ``base`` if possible.

    When ``path`` is not located under ``base`` the path is returned unchanged
    (still with forward slashes) instead of raising.
    """
    try:
        relative = path.relative_to(base)
    except ValueError:
        # ``relative_to`` raises when ``base`` is not a prefix of ``path``.
        return path.as_posix()
    return relative.as_posix()
def git_public_paths(root: Path) -> list[Path] | None:
    """List the files git considers part of ``root``'s public tree.

    Runs ``git ls-files`` for tracked files (``--cached``) plus untracked
    files that are not ignored (``--others --exclude-standard``).

    Returns:
        Paths joined onto ``root``, or ``None`` when git cannot be run or the
        command fails (for example when ``root`` is not inside a repository).
    """
    try:
        result = subprocess.run(
            [
                "git",
                "-C",
                str(root),
                "ls-files",
                # NUL-terminated output: without -z git wraps names containing
                # tabs, quotes, backslashes or (by default) non-ASCII bytes in
                # C-style quotes, producing strings that are not real paths.
                "-z",
                "--cached",
                "--others",
                "--exclude-standard",
            ],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return None
    # The output ends with a trailing NUL, hence the empty-entry filter.
    return [root / name for name in result.stdout.split("\0") if name]
def iter_public_files(root: Path, paths: list[Path] | None = None):
    """Yield the filesystem entries that should be scanned under ``root``.

    Args:
        root: Directory being audited.
        paths: Optional explicit list of candidate paths. When given, only
            the entries that currently exist are yielded and ``root`` is not
            walked at all.

    Yields:
        ``Path`` objects. In the walking fallback this includes directories
        as well as files, and skips anything inside a ``.git``, ``.venv`` or
        ``venv`` directory below ``root``.
    """
    if paths is not None:
        yield from (path for path in paths if path.exists())
        return
    if not root.exists():
        return
    excluded = {".git", ".venv", "venv"}
    for path in root.rglob("*"):
        # Test only the components below ``root``: matching against the
        # absolute path would skip every file when the audited tree itself
        # happens to live under a directory with one of these names.
        if excluded.intersection(path.relative_to(root).parts):
            continue
        yield path
def scan(root: Path, *, paths: list[Path] | None = None, display_root: str | None = None) -> dict:
    """Scan one publication root and collect packaging violations.

    Args:
        root: Directory to audit.
        paths: Optional explicit list of paths to inspect instead of walking
            ``root``.
        display_root: Label stored in the result's ``root`` field. When
            omitted (or empty), ``root`` relative to the repository's parent
            directory is used.

    Returns:
        A dict with the root label, whether ``root`` exists, the file and
        text-file counts, the largest file seen, and the list of violations.
        Each violation has a ``kind`` and a ``path`` relative to ``root``;
        substring-based kinds also carry a ``detail``.
    """
    violations: list[dict] = []
    text_files = 0
    total_files = 0
    largest_file = {"path": None, "bytes": 0}
    # Both substring tables produce the same record shape, differing only in
    # the violation kind; keep the original reporting order.
    substring_checks = (
        ("local_filesystem_path", LOCAL_PATH_PATTERNS),
        ("stale_presentation_copy", STALE_PRESENTATION_STRINGS),
    )
    for path in iter_public_files(root, paths):
        path_rel = rel(path, root)
        if path.is_dir():
            if path.name in BANNED_DIR_NAMES:
                violations.append({"kind": "generated_cache_dir", "path": path_rel})
            continue
        total_files += 1
        try:
            size = path.stat().st_size
        except OSError:
            # A dangling symlink, or an entry removed mid-scan, cannot be
            # stat'ed. Treat it as empty so one bad entry does not abort the
            # whole audit; its name and suffix are still checked below.
            size = 0
        if size > largest_file["bytes"]:
            largest_file = {"path": path_rel, "bytes": size}
        # Only the final suffix is considered, compared case-insensitively.
        suffix = path.suffix.lower()
        if path.name in BANNED_FILE_NAMES or suffix in BANNED_SUFFIXES:
            violations.append({"kind": "generated_cache_file", "path": path_rel})
        if suffix in RAW_DATA_SUFFIXES:
            violations.append({"kind": "raw_xperience10m_data", "path": path_rel})
        if suffix in HEAVY_MODEL_SUFFIXES:
            violations.append({"kind": "heavy_model_or_archive", "path": path_rel})
        if suffix not in TEXT_SUFFIXES:
            continue
        # Counted before the read, so unreadable text files are still tallied.
        text_files += 1
        try:
            # Undecodable bytes are dropped rather than failing the scan.
            text = path.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        if TOKEN_PATTERN.search(text):
            violations.append({"kind": "possible_hf_token", "path": path_rel})
        for kind, needles in substring_checks:
            for needle, reason in needles.items():
                if needle in text:
                    violations.append({"kind": kind, "path": path_rel, "detail": reason})
    return {
        "root": display_root or rel(root, ROOT.parent),
        "exists": root.exists(),
        "file_count": total_files,
        "text_file_count": text_files,
        "largest_file": largest_file,
        "violations": violations,
    }
def required_assets(root: Path) -> dict[str, bool]:
    """Report which required publication assets exist under ``root``.

    Returns:
        A mapping from each required repo-relative path (in the order listed
        below) to ``True`` when that path exists under ``root``.
    """
    expected_paths = (
        # Top-level project documents.
        "README.md",
        "CITATION.cff",
        "LICENSE",
        "codemeta.json",
        "ARTIFACT_GUIDE.md",
        "PROJECT_STATUS.md",
        "RESEARCH_ROADMAP.md",
        "RESEARCH_TAKEAWAYS.md",
        "QUALITY_GATES.md",
        "PUBLIC_SURFACE_QA.md",
        "RENDERED_SITE_CHECK.md",
        "EVALUATION_PROTOCOL.md",
        "FIGURE_INDEX.md",
        "SOURCE_ALIGNMENT_AUDIT.md",
        "XPERIENCE10M_DATASET_CARD_ALIGNMENT.md",
        "REPRODUCIBILITY.md",
        "EVIDENCE_CONTRACT.md",
        "DATA_NOTICE.md",
        # Website pages and site metadata.
        "docs/404.html",
        "docs/apple-touch-icon.png",
        "docs/favicon.svg",
        "docs/favicon.png",
        "docs/index.html",
        "docs/research_roadmap.html",
        "docs/robots.txt",
        "docs/site.webmanifest",
        "docs/sitemap.xml",
        # Website data files.
        "docs/data/brand_assets.json",
        "docs/data/evidence_contract.json",
        "docs/data/evaluation_protocol.json",
        "docs/data/figure_index.json",
        "docs/data/source_alignment_audit.json",
        "docs/data/artifact_index.json",
        "docs/data/live_publication_status.json",
        "docs/data/quality_gates.json",
        "docs/data/project_manifest.json",
        "docs/data/project_packet.json",
        "docs/data/project_status.json",
        "docs/data/research_roadmap.json",
        "docs/data/research_roadmap_interactive.json",
        "docs/data/research_takeaways.json",
        "docs/data/xperience10m_dataset_card_alignment.json",
        "docs/data/reproducibility_matrix.json",
        "docs/data/modality_atlas.json",
        "docs/data/mirror_parity.json",
        "docs/data/public_surface_qa.json",
        "docs/data/rendered_site_check.json",
        "docs/data/scope_claims_audit.json",
        "docs/data/task_surface_integrity.json",
        "docs/data/website_integrity.json",
        "docs/data/summary_metrics.json",
        # Modality thumbnails.
        "docs/assets/modalities/video.jpg",
        "docs/assets/modalities/audio.png",
        "docs/assets/modalities/depth.jpg",
        "docs/assets/modalities/pose_slam.png",
        "docs/assets/modalities/motion_capture.png",
        "docs/assets/modalities/inertial.png",
        "docs/assets/modalities/language.png",
        # Brand images and figures.
        "docs/assets/brand/xperience10m-logo-apple-touch.png",
        "docs/assets/brand/xperience10m-logo-favicon-32.png",
        "docs/assets/brand/xperience10m-logo-favicon-64.png",
        "docs/assets/brand/xperience10m-logo-mark.png",
        "docs/assets/brand/xperience10m-logo-mark-192.png",
        "docs/assets/brand/xperience10m-logo-mark-512.png",
        "docs/assets/brand/xperience10m-logo-social-card.png",
        "docs/assets/task_suite_infographic.png",
        "docs/assets/pipeline_diagram.png",
        "docs/assets/task_architectures.png",
        # Result files.
        "results/episode_task_suite/summary_report.json",
        "results/episode_task_suite/feature_manifest.json",
        "results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
        "results/omni_finetune/DATA_ACCESS_STATUS.md",
        "results/omni_finetune/MULTI_EPISODE_ACCESS_STATUS.md",
        # Scripts.
        "scripts/episode_task_suite.py",
        "scripts/neural_task_models.py",
        "scripts/build_artifact_index.py",
        "scripts/build_brand_assets.py",
        "scripts/build_evaluation_protocol.py",
        "scripts/build_figure_index.py",
        "scripts/build_quality_gates.py",
        "scripts/build_public_surface_qa.py",
        "scripts/build_rendered_site_check.py",
        "scripts/build_interactive_research_roadmap.py",
        "scripts/verify_live_publication.py",
        "scripts/validate_mirror_parity.py",
        "scripts/validate_scope_claims.py",
        "scripts/validate_source_alignment.py",
        "scripts/validate_task_surface.py",
        "scripts/validate_website_integrity.py",
        "scripts/publish_hf_bundles.py",
        "scripts/omni/train_qwen3_omni_lora.py",
    )
    presence: dict[str, bool] = {}
    for relative_path in expected_paths:
        presence[relative_path] = root.joinpath(relative_path).exists()
    return presence
def public_card_freshness(roots: dict[str, Path]) -> list[dict]:
    """Check each public card for the marker strings it is expected to contain.

    Args:
        roots: Mapping from surface name to that surface's root directory;
            it must contain every surface named in
            ``CARD_FRESHNESS_EXPECTATIONS``.

    Returns:
        One record per expectation with the surface, the card path, whether
        the card exists, how many markers were required, which markers are
        missing, and a ``pass``/``fail`` status. A missing card is read as
        empty text, so all of its markers are reported missing.
    """

    def check_card(expectation: dict) -> dict:
        surface = expectation["surface"]
        markers = expectation["required"]
        card_path = roots[surface] / expectation["relative_path"]
        if card_path.exists():
            # Undecodable bytes are dropped rather than failing the check.
            content = card_path.read_text(encoding="utf-8", errors="ignore")
        else:
            content = ""
        absent = [marker for marker in markers if marker not in content]
        is_fresh = card_path.exists() and not absent
        return {
            "surface": surface,
            "path": expectation["relative_path"],
            "exists": card_path.exists(),
            "required_marker_count": len(markers),
            "missing_markers": absent,
            "status": "pass" if is_fresh else "fail",
        }

    return [check_card(expectation) for expectation in CARD_FRESHNESS_EXPECTATIONS]
def build_report(hf_root: Path) -> dict:
    """Assemble the full publication audit for the repo and the HF bundles.

    Args:
        hf_root: Directory containing the ``space``, ``artifacts`` and
            ``model`` bundle folders.

    Returns:
        A JSON-serialisable dict with the overall ``status``, a UTC
        timestamp, the individual checks, the required-asset map, the card
        freshness records, the per-root scans, and a flat violation list in
        which every violation is tagged with the root it came from.
    """
    roots = {
        "github_repo": ROOT,
        "hf_space_bundle": hf_root / "space",
        "hf_artifact_bundle": hf_root / "artifacts",
        "hf_model_bundle": hf_root / "model",
    }
    root_labels = {
        "github_repo": "repo",
        "hf_space_bundle": "hf_publish/space",
        "hf_artifact_bundle": "hf_publish/artifacts",
        "hf_model_bundle": "hf_publish/model",
    }
    # Only the GitHub repo is narrowed to git's view of the tree; the bundle
    # folders are walked in full.
    scans = {
        name: scan(
            path,
            paths=git_public_paths(path) if name == "github_repo" else None,
            display_root=root_labels[name],
        )
        for name, path in roots.items()
    }
    assets = required_assets(ROOT)
    card_freshness = public_card_freshness(roots)
    missing_assets = [asset for asset, present in assets.items() if not present]
    violations = []
    for name, result in scans.items():
        for violation in result["violations"]:
            violations.append({"root": name, **violation})

    def violation_check(check_name, matches_kind):
        """Build a count-based check over the violations whose kind matches."""
        count = sum(1 for violation in violations if matches_kind(violation["kind"]))
        return {
            "name": check_name,
            "status": "fail" if count else "pass",
            "count": count,
        }

    def kind_is(expected):
        return lambda kind: kind == expected

    stale_cards = [record for record in card_freshness if record["status"] != "pass"]
    checks = [
        {
            "name": "required_publication_assets_present",
            "status": "fail" if missing_assets else "pass",
            "missing": missing_assets,
        },
        # Covers both the cache-directory and the cache-file kinds.
        violation_check(
            "no_generated_python_caches",
            lambda kind: kind.startswith("generated_cache"),
        ),
        violation_check("no_raw_xperience10m_data", kind_is("raw_xperience10m_data")),
        violation_check("no_heavy_model_archives", kind_is("heavy_model_or_archive")),
        violation_check("no_hf_tokens_in_public_text", kind_is("possible_hf_token")),
        violation_check(
            "no_local_filesystem_paths_in_public_text",
            kind_is("local_filesystem_path"),
        ),
        violation_check(
            "no_stale_task_suite_presentation_copy",
            kind_is("stale_presentation_copy"),
        ),
        {
            "name": "public_cards_reference_taskfirst_figure",
            "status": "fail" if stale_cards else "pass",
            "failures": stale_cards,
        },
    ]
    every_check_passed = all(check["status"] == "pass" for check in checks)
    return {
        "status": "pass" if every_check_passed else "fail",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "checks": checks,
        "required_assets": assets,
        "public_card_freshness": card_freshness,
        "scans": scans,
        "violations": violations,
    }
def main() -> int:
    """Run the audit, write the JSON report, and print a short summary.

    Command-line options:
        --hf-root: directory holding the prepared Hugging Face bundles.
        --output: path of the JSON report; parent directories are created.

    Returns:
        ``0`` when every check passes, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT)
    parser.add_argument("--output", type=Path, default=ROOT / "docs/data/publication_audit.json")
    args = parser.parse_args()
    report = build_report(args.hf_root.resolve())
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
    print(f"{report['status'].upper()}: wrote {args.output}")
    if report["status"] == "pass":
        return 0
    # Name the failing checks first: missing assets and stale cards produce
    # no violation records, so without this a failure caused only by them
    # would print nothing beyond the bare status line.
    for check in report["checks"]:
        if check["status"] != "pass":
            print(f"- failed check: {check['name']}")
    violations = report["violations"]
    shown_limit = 40  # keep console output short; the JSON has the full list
    for violation in violations[:shown_limit]:
        print(f"- {violation['root']}: {violation['kind']} {violation['path']}")
    if len(violations) > shown_limit:
        print(f"- ... {len(violations) - shown_limit} more violations")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())