Spaces:

nvidia
/

simready-validator

Sleeping

App Files Files Community

loginowskid committed on 9 days ago

Commit

5c23a4e

1 Parent(s): a444ac9

Preliminary check AA.002 @41a04eb

Browse files

Files changed (2) hide show

tools/hf_space/runner.py +22 -21
tools/validation/plugins/simready-report/skills/simready-report/validate.py +94 -132

tools/hf_space/runner.py CHANGED Viewed

@@ -457,10 +457,10 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
     merged_results: list[dict] = []
     merged_layout: list[dict] = []
     # Set when ANY processed unit's results.json carries
-    # layout_aborted=true (the validator's strict pre-check fired).
     # Propagated into the final dict so the dashboard sees the flag
     # and renders the layout-failed banner instead of generic counts.
-    any_layout_aborted = False
     workers = os.environ.get("SR_WORKERS", "4").strip() or "4"
     cache_hits = 0
     val_ver = _validator_version()
@@ -556,7 +556,7 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
         nonlocal cache_hits, zips_processed, profile_autodetect_done
         nonlocal profile, consecutive_unrecoverable, was_cancelled
         nonlocal use_plugin_default, issue_filed_for_registration_bug
-        nonlocal issue_filing_disabled, any_layout_aborted
         # Honor early abort (cancel or unrecoverable failure) — tasks
         # queued before the stop signal still get scheduled and have
         # to no-op themselves.
@@ -594,8 +594,8 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
             if cached:
                 merged_results.extend(cached.get("results", []))
                 merged_layout.extend(cached.get("layout_findings") or [])
-                if cached.get("layout_aborted"):
-                    any_layout_aborted = True
                 cache_hits += 1
                 out(f"  [{i+1}/{len(zip_entries)}] cache hit: {zip_rel} "
                     f"({len(cached.get('results', []))} asset(s))")
@@ -788,8 +788,8 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
                 zip_layout = rj.get("layout_findings") or []
                 merged_results.extend(zip_results)
                 merged_layout.extend(zip_layout)
-                if rj.get("layout_aborted"):
-                    any_layout_aborted = True
                 out(f"    {len(zip_results)} asset(s); rc={rc}")
                 # Emit a progress write so the dashboard sees the
                 # updated zip-count + per-asset rows immediately
@@ -810,7 +810,7 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
                         "zip_sha": zip_sha,
                         "results": zip_results,
                         "layout_findings": zip_layout,
-                        "layout_aborted": bool(rj.get("layout_aborted")),
                         "validator_version": val_ver,
                         "foundation_sha": found_sha,
                         "profile": profile,
@@ -900,7 +900,7 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
         "schema_version": 1,
         "results": merged_results,
         "layout_findings": merged_layout,
-        "layout_aborted": any_layout_aborted,
         "profile_coverage": {},
         "streaming_zips": len(zip_entries),
         "streaming_cache_hits": cache_hits,
@@ -911,18 +911,19 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
 def _summarize(results_json: dict) -> tuple[str, str]:
     """Return (status, one-line summary)."""
-    # Layout-aborted runs short-circuit the normal "M/N assets passed"
-    # framing — the dataset never got to USD validation because it
-    # doesn't match the SimReady packaging spec. The summary should
-    # name the failure mode so the operator knows what to do
-    # (forward the report to the partner; don't dig for asset-level
-    # issues that don't exist).
-    if results_json.get("layout_aborted"):
-        violations = len(results_json.get("layout_findings") or [])
-        dirs_affected = len(results_json.get("results") or [])
-        return "fail", (f"LAYOUT FAILED — dataset doesn't follow SimReady "
-                        f"packaging spec ({violations} violation(s) across "
-                        f"{dirs_affected} location(s))")
     counts = {"error": 0, "failure": 0, "warning": 0}
     total = len(results_json.get("results", []))
     failed = 0

     merged_results: list[dict] = []
     merged_layout: list[dict] = []
     # Set when ANY processed unit's results.json carries
+    # preliminary_check_failed=true (the validator's strict pre-check fired).
     # Propagated into the final dict so the dashboard sees the flag
     # and renders the layout-failed banner instead of generic counts.
+    any_preliminary_check_failed = False
     workers = os.environ.get("SR_WORKERS", "4").strip() or "4"
     cache_hits = 0
     val_ver = _validator_version()
         nonlocal cache_hits, zips_processed, profile_autodetect_done
         nonlocal profile, consecutive_unrecoverable, was_cancelled
         nonlocal use_plugin_default, issue_filed_for_registration_bug
+        nonlocal issue_filing_disabled, any_preliminary_check_failed
         # Honor early abort (cancel or unrecoverable failure) — tasks
         # queued before the stop signal still get scheduled and have
         # to no-op themselves.
             if cached:
                 merged_results.extend(cached.get("results", []))
                 merged_layout.extend(cached.get("layout_findings") or [])
+                if cached.get("preliminary_check_failed"):
+                    any_preliminary_check_failed = True
                 cache_hits += 1
                 out(f"  [{i+1}/{len(zip_entries)}] cache hit: {zip_rel} "
                     f"({len(cached.get('results', []))} asset(s))")
                 zip_layout = rj.get("layout_findings") or []
                 merged_results.extend(zip_results)
                 merged_layout.extend(zip_layout)
+                if rj.get("preliminary_check_failed"):
+                    any_preliminary_check_failed = True
                 out(f"    {len(zip_results)} asset(s); rc={rc}")
                 # Emit a progress write so the dashboard sees the
                 # updated zip-count + per-asset rows immediately
                         "zip_sha": zip_sha,
                         "results": zip_results,
                         "layout_findings": zip_layout,
+                        "preliminary_check_failed": bool(rj.get("preliminary_check_failed")),
                         "validator_version": val_ver,
                         "foundation_sha": found_sha,
                         "profile": profile,
         "schema_version": 1,
         "results": merged_results,
         "layout_findings": merged_layout,
+        "preliminary_check_failed": any_preliminary_check_failed,
         "profile_coverage": {},
         "streaming_zips": len(zip_entries),
         "streaming_cache_hits": cache_hits,
 def _summarize(results_json: dict) -> tuple[str, str]:
     """Return (status, one-line summary)."""
+    # Preliminary-check failures short-circuit the normal
+    # "M/N assets passed" framing — the dataset didn't get to USD
+    # validation because filesystem-only foundation checks already
+    # flagged issues. The summary names the phase so the operator
+    # knows what to do (forward the report to the partner; address
+    # these before re-validating to surface deeper USD findings).
+    if results_json.get("preliminary_check_failed"):
+        violations = len(results_json.get("preliminary_findings")
+                         or results_json.get("layout_findings") or [])
+        files_affected = len(results_json.get("results") or [])
+        return "fail", (f"PRELIMINARY CHECK FAILED — {violations} foundation-spec "
+                        f"issue(s) across {files_affected} file(s); address these "
+                        f"before re-validating")
     counts = {"error": 0, "failure": 0, "warning": 0}
     total = len(results_json.get("results", []))
     failed = 0

tools/validation/plugins/simready-report/skills/simready-report/validate.py CHANGED Viewed

@@ -15,6 +15,7 @@ import json
 import logging
 import os
 import shutil
 import sys
 import tomllib
 from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -690,129 +691,90 @@ _ATOMIC_ASSET_PATHS = (f"{_FOUNDATION_SPECS_BASE}/core/atomic_asset/"
                       "requirements/anchored-asset-paths.md")
-def check_simready_layout(root: Path) -> list[dict]:
-    """Strict pre-validation check against the SimReady packaging spec.
-    Returns a list of layout failures. Empty list = layout is valid;
-    the validator proceeds to USD traversal. Non-empty = the dataset
-    does not follow the spec; the validator emits these failures and
-    skips USD work entirely.
-    Spec rules enforced (docs/sdk/packaging-spec.md):
-      1. No zip files anywhere — datasets must be delivered unpacked.
-      2. No USD files at the dataset root — aggregator scenes are
-         forbidden. Each asset must live in its own directory.
-      3. Each top-level directory must contain
-         `<dirname>/<dirname>.usd` (the interface file).
-      4. Each top-level directory must contain
-         `.<dirname>.wrapp` (the required package manifest).
-    Hidden dirs (`.thumbs`, `.simready`) and known output dirs
-    (_SKIP_DIR_NAMES) are exempt from the per-dir bundle check.
     """
-    # All four LAYOUT.* codes point at the foundation Atomic Asset
-    # capability spec — it's the authoritative spec for file
-    # packaging, asset references, and supported file types.
-    # Sub-requirement spec pages exist (supported-file-types.md,
-    # anchored-asset-paths.md) but the capability page is the entry
-    # point operators should read first.
-    folder_url = _ATOMIC_ASSET_CAP
-    manifest_url = _ATOMIC_ASSET_CAP
-    fails: list[dict] = []
     try:
-        entries = sorted(root.iterdir())
-    except OSError as e:
-        return [{
-            "code": "LAYOUT.READ_FAILED",
-            "severity": "failure",
-            "path": str(root),
-            "spec_url": _SPEC_URL,
-            "msg": f"Could not read dataset root: {e}",
-        }]
-    has_any_bundle_dir = False
-    for entry in entries:
-        rel = entry.name
-        if entry.is_file():
-            suffix = entry.suffix.lower()
-            if suffix == ".zip":
-                fails.append({
-                    "code": "LAYOUT.ZIP_AT_ROOT",
-                    "severity": "failure",
-                    "path": rel,
-                    "spec_url": folder_url,
-                    "msg": (f"'{rel}' is a zip archive. SimReady datasets "
-                            f"must be delivered as unpacked directories — "
-                            f"extract all archives and re-publish."),
-                })
-            elif suffix in {".usd", ".usda", ".usdc", ".usdz"}:
-                fails.append({
-                    "code": "LAYOUT.USD_AT_ROOT",
-                    "severity": "failure",
-                    "path": rel,
-                    "spec_url": folder_url,
-                    "msg": (f"'{rel}' is a USD file at the dataset root. "
-                            f"Each asset must live in its own directory: "
-                            f"<asset_name>/<asset_name>.usd. Aggregator "
-                            f"scenes at the root are not allowed."),
-                })
-            continue
-        if not entry.is_dir():
-            continue
-        if entry.name.startswith(".") or entry.name in _SKIP_DIR_NAMES:
-            continue
-        has_any_bundle_dir = True
-        # Required: <dirname>/<dirname>.usd interface file.
-        interface_candidates = [
-            entry / f"{entry.name}.usd",
-            entry / f"{entry.name}.usda",
-            entry / f"{entry.name}.usdc",
-        ]
-        interface_present = any(p.is_file() for p in interface_candidates)
-        if not interface_present:
             try:
-                found = [str(p.relative_to(entry))
-                         for p in sorted(entry.rglob("*"))
-                         if p.is_file() and p.suffix.lower() in USD_EXTS][:5]
-            except OSError:
-                found = []
-            hint = (f" Found USDs in this dir: {', '.join(found)}"
-                    if found else " No USD files found in this directory.")
-            fails.append({
-                "code": "LAYOUT.MISSING_INTERFACE",
-                "severity": "failure",
-                "path": rel + "/",
-                "spec_url": folder_url,
-                "msg": (f"Directory '{rel}/' must contain an interface file "
-                        f"named '{rel}.usd' (or .usda/.usdc) per the "
-                        f"SimReady packaging spec.{hint}"),
-            })
-        # Required: .<dirname>.wrapp manifest.
-        manifest = entry / f".{entry.name}.wrapp"
-        if not manifest.is_file():
-            fails.append({
-                "code": "LAYOUT.MISSING_MANIFEST",
                 "severity": "failure",
-                "path": rel + "/",
-                "spec_url": manifest_url,
-                "msg": (f"Directory '{rel}/' is missing the required "
-                        f"package manifest '.{rel}.wrapp'."),
             })
-    if not has_any_bundle_dir and not fails:
-        fails.append({
-            "code": "LAYOUT.EMPTY",
             "severity": "failure",
-            "path": ".",
-            "spec_url": folder_url,
-            "msg": (f"Dataset root contains no asset directories. Per the "
-                    f"SimReady packaging spec, each asset lives in its "
-                    f"own top-level directory."),
         })
-    return fails
 def discover_assets(
@@ -1678,26 +1640,26 @@ def _run_validation_body(args, asset_profile_map: dict[str, str]) -> int:
     print(f"Output: {out_dir}", flush=True)
     print(f"Profile: {args.profile} v{args.version}", flush=True)
-    # Strict pre-validation layout check. Fail fast (no USD work) when
-    # the dataset doesn't follow the SimReady packaging spec — partners
-    # get a clear, citing-the-spec failure list instead of opaque
-    # validator errors from running on a non-conformant layout.
-    layout_fails = check_simready_layout(target)
-    if layout_fails:
         out_dir.mkdir(parents=True, exist_ok=True)
-        print(f"LAYOUT FAILED: {len(layout_fails)} violation(s); "
-              f"skipping USD validation", flush=True)
-        for f in layout_fails:
             print(f"  - {f['code']} {f['path']}: {f['msg'][:200]}", flush=True)
-        # Group violations by offending path so the dashboard renders
-        # one row per directory with its own issue list. The per-code
-        # aggregation table then correctly reports
-        # "MISSING_INTERFACE × N files" — that's the partner-actionable
-        # summary. Without this grouping, one fat row collapses all
-        # 46 issues under "." and the table lumps everything onto a
-        # single asset.
         by_path: dict[str, list[dict]] = {}
-        for f in layout_fails:
             by_path.setdefault(f["path"], []).append(f)
         results = []
         for rel, issues_here in by_path.items():
@@ -1715,8 +1677,8 @@ def _run_validation_body(args, asset_profile_map: dict[str, str]) -> int:
             "profile": args.profile,
             "profile_version": args.version,
             "results": results,
-            "layout_findings": layout_fails,
-            "layout_aborted": True,
         }
         (out_dir / "results.json").write_text(
             json.dumps(results_json, indent=2), encoding="utf-8"

 import logging
 import os
 import shutil
+import subprocess
 import sys
 import tomllib
 from concurrent.futures import ProcessPoolExecutor, as_completed
                       "requirements/anchored-asset-paths.md")
def run_preliminary_checks(root: Path) -> list[dict]:
    """Run the preliminary check phase on a dataset root.

    The phase covers filesystem-only foundation requirements and is
    evaluated before USD traversal. Checks are intentionally cheap and
    deterministic — no LLM calls in the validation hot path. Each rule
    is a small function tied to its foundation spec section.

    Drift detection (does the hardcoded rule still match the spec?)
    runs out of band via tools/spec_sync/ (a weekly job that uses the
    agent to compare the foundation spec text to these hardcoded rules
    and opens a PR on drift).

    Currently implemented:
      - AA.002 supported-file-types

    Args:
        root: Dataset root directory handed to every check.

    Returns:
        The issue dicts produced by the registered checks, in
        registration order. An empty list means no check reported
        anything.
    """
    # Registry of preliminary checks; each takes the dataset root and
    # returns a list of issue dicts.
    checks = (_check_aa_002_supported_file_types,)
    findings: list[dict] = []
    for check in checks:
        findings.extend(check(root))
    return findings
+# AA.002 supported-file-types — hardcoded from the foundation spec at
+# nv_core/sr_specs/docs/capabilities/core/atomic_asset/requirements/
+# supported-file-types.md ("How to comply" section, allowlist).
+# Drift-sync: tools/spec_sync/check_aa_002.py compares this list to
+# the foundation spec on a schedule and opens a PR if they diverge.
+_AA_002_ALLOWED = {
+    ".usd", ".usda", ".usdc", ".usdz",          # USD
+    ".png", ".jpg", ".jpeg", ".exr",            # Images
+    ".m4a", ".mp3", ".wav",                     # Audio
+}
+# Packaging/metadata files the AA.002 spec doesn't govern. Treated
+# as out-of-scope so we don't false-positive on .wrapp manifests,
+# README files, validation receipts, etc.
+_AA_002_METADATA_EXTS = {
+    ".wrapp", ".json", ".yaml", ".yml", ".md", ".txt", ".toml",
+}
def _check_aa_002_supported_file_types(root: Path) -> list[dict]:
    """AA.002 — Asset must use only supported file types.

    Spec: nv_core/sr_specs/docs/capabilities/core/atomic_asset/
          requirements/supported-file-types.md

    Walks the dataset tree under ``root`` and flags every file whose
    lower-cased extension is in neither ``_AA_002_ALLOWED`` nor
    ``_AA_002_METADATA_EXTS``. Dot-files are ignored, as are files
    inside a directory *below* ``root`` whose name starts with "." or
    is listed in ``_SKIP_DIR_NAMES``.

    Only path components between ``root`` and the file are inspected
    for the skip decision. Ancestors of ``root`` itself are never
    consulted, so a dataset checked out under e.g. ``~/.cache/...`` is
    still scanned instead of being silently skipped in full.

    Args:
        root: Dataset root directory to scan recursively.

    Returns:
        One issue dict per offending file, with keys ``code``,
        ``severity``, ``path``, ``spec_url`` and ``msg``, sorted by
        ``path`` so the report does not depend on walk order. ``path``
        is relative to ``root`` and uses forward slashes. If the walk
        raises ``OSError``, one extra AA.002 issue describing the
        failure is appended after the issues collected so far.
    """
    issues: list[dict] = []
    walk_error: OSError | None = None
    try:
        for path in root.rglob("*"):
            if not path.is_file():
                continue
            if path.name.startswith("."):
                continue
            try:
                rel_path = path.relative_to(root)
            except ValueError:
                # Not expected for rglob() results; fall back to the
                # absolute path and treat the file as sitting at root.
                rel_path = None
            # Directory components strictly between root and the file.
            # Using path.parents here would also visit root's own
            # ancestors and wrongly skip everything when root lives
            # under a hidden or skip-listed directory.
            inner_dirs = rel_path.parts[:-1] if rel_path is not None else ()
            if any(name.startswith(".") or name in _SKIP_DIR_NAMES
                   for name in inner_dirs):
                continue
            suffix = path.suffix.lower()
            if suffix in _AA_002_ALLOWED or suffix in _AA_002_METADATA_EXTS:
                continue
            if rel_path is not None:
                rel = str(rel_path).replace("\\", "/")
            else:
                rel = str(path)
            issues.append({
                "code": "AA.002",
                "severity": "failure",
                "path": rel,
                "spec_url": _ATOMIC_ASSET_FILE_TYPES,
                "msg": (f"File '{rel}' uses an unsupported file type "
                        f"('{suffix or 'no extension'}'). AA.002 "
                        f"allowlist: USD (.usd, .usda, .usdc, .usdz), "
                        f"image (.png, .jpg, .jpeg, .exr), "
                        f"audio (.m4a, .mp3, .wav)."),
            })
    except OSError as e:
        walk_error = e
    # Filesystem walk order is platform-dependent; sort so identical
    # datasets always yield identically ordered reports.
    issues.sort(key=lambda issue: issue["path"])
    if walk_error is not None:
        issues.append({
            "code": "AA.002",
            "severity": "failure",
            "path": str(root),
            "spec_url": _ATOMIC_ASSET_FILE_TYPES,
            "msg": f"Could not enumerate dataset files: {walk_error}",
        })
    return issues
 def discover_assets(
     print(f"Output: {out_dir}", flush=True)
     print(f"Profile: {args.profile} v{args.version}", flush=True)
+    # Preliminary check phase: cheap deterministic foundation-spec
+    # checks (filesystem-only, no USD parsing). Fails fast if any
+    # violation is found — no point burning USD-validation cycles
+    # on a dataset that has obvious spec issues. Single roundtrip:
+    # partners see ALL preliminary violations at once, fix them,
+    # then re-validate. Spec drift between our hardcoded rules and
+    # the foundation source-of-truth is caught by the out-of-band
+    # tools/spec_sync/ job.
+    preliminary_issues = run_preliminary_checks(target)
+    if preliminary_issues:
         out_dir.mkdir(parents=True, exist_ok=True)
+        print(f"PRELIMINARY CHECK: {len(preliminary_issues)} issue(s) — "
+              f"skipping USD validation until these are addressed", flush=True)
+        for f in preliminary_issues:
             print(f"  - {f['code']} {f['path']}: {f['msg'][:200]}", flush=True)
+        # Group by path so the dashboard renders one row per offending
+        # file (consistent with how USD-validation results are shaped
+        # — partners see the same per-asset layout in both phases).
         by_path: dict[str, list[dict]] = {}
+        for f in preliminary_issues:
             by_path.setdefault(f["path"], []).append(f)
         results = []
         for rel, issues_here in by_path.items():
             "profile": args.profile,
             "profile_version": args.version,
             "results": results,
+            "preliminary_findings": preliminary_issues,
+            "preliminary_check_failed": True,
         }
         (out_dir / "results.json").write_text(
             json.dumps(results_json, indent=2), encoding="utf-8"