loginowskid committed on
Commit
5c23a4e
·
1 Parent(s): a444ac9

Preliminary check AA.002 @41a04eb

Browse files
tools/hf_space/runner.py CHANGED
@@ -457,10 +457,10 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
457
  merged_results: list[dict] = []
458
  merged_layout: list[dict] = []
459
  # Set when ANY processed unit's results.json carries
460
- # layout_aborted=true (the validator's strict pre-check fired).
461
  # Propagated into the final dict so the dashboard sees the flag
462
  # and renders the layout-failed banner instead of generic counts.
463
- any_layout_aborted = False
464
  workers = os.environ.get("SR_WORKERS", "4").strip() or "4"
465
  cache_hits = 0
466
  val_ver = _validator_version()
@@ -556,7 +556,7 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
556
  nonlocal cache_hits, zips_processed, profile_autodetect_done
557
  nonlocal profile, consecutive_unrecoverable, was_cancelled
558
  nonlocal use_plugin_default, issue_filed_for_registration_bug
559
- nonlocal issue_filing_disabled, any_layout_aborted
560
  # Honor early abort (cancel or unrecoverable failure) — tasks
561
  # queued before the stop signal still get scheduled and have
562
  # to no-op themselves.
@@ -594,8 +594,8 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
594
  if cached:
595
  merged_results.extend(cached.get("results", []))
596
  merged_layout.extend(cached.get("layout_findings") or [])
597
- if cached.get("layout_aborted"):
598
- any_layout_aborted = True
599
  cache_hits += 1
600
  out(f" [{i+1}/{len(zip_entries)}] cache hit: {zip_rel} "
601
  f"({len(cached.get('results', []))} asset(s))")
@@ -788,8 +788,8 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
788
  zip_layout = rj.get("layout_findings") or []
789
  merged_results.extend(zip_results)
790
  merged_layout.extend(zip_layout)
791
- if rj.get("layout_aborted"):
792
- any_layout_aborted = True
793
  out(f" {len(zip_results)} asset(s); rc={rc}")
794
  # Emit a progress write so the dashboard sees the
795
  # updated zip-count + per-asset rows immediately
@@ -810,7 +810,7 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
810
  "zip_sha": zip_sha,
811
  "results": zip_results,
812
  "layout_findings": zip_layout,
813
- "layout_aborted": bool(rj.get("layout_aborted")),
814
  "validator_version": val_ver,
815
  "foundation_sha": found_sha,
816
  "profile": profile,
@@ -900,7 +900,7 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
900
  "schema_version": 1,
901
  "results": merged_results,
902
  "layout_findings": merged_layout,
903
- "layout_aborted": any_layout_aborted,
904
  "profile_coverage": {},
905
  "streaming_zips": len(zip_entries),
906
  "streaming_cache_hits": cache_hits,
@@ -911,18 +911,19 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
911
 
912
  def _summarize(results_json: dict) -> tuple[str, str]:
913
  """Return (status, one-line summary)."""
914
- # Layout-aborted runs short-circuit the normal "M/N assets passed"
915
- # framing — the dataset never got to USD validation because it
916
- # doesn't match the SimReady packaging spec. The summary should
917
- # name the failure mode so the operator knows what to do
918
- # (forward the report to the partner; don't dig for asset-level
919
- # issues that don't exist).
920
- if results_json.get("layout_aborted"):
921
- violations = len(results_json.get("layout_findings") or [])
922
- dirs_affected = len(results_json.get("results") or [])
923
- return "fail", (f"LAYOUT FAILED — dataset doesn't follow SimReady "
924
- f"packaging spec ({violations} violation(s) across "
925
- f"{dirs_affected} location(s))")
 
926
  counts = {"error": 0, "failure": 0, "warning": 0}
927
  total = len(results_json.get("results", []))
928
  failed = 0
 
457
  merged_results: list[dict] = []
458
  merged_layout: list[dict] = []
459
  # Set when ANY processed unit's results.json carries
460
+ # preliminary_check_failed=true (the validator's strict pre-check fired).
461
  # Propagated into the final dict so the dashboard sees the flag
462
  # and renders the layout-failed banner instead of generic counts.
463
+ any_preliminary_check_failed = False
464
  workers = os.environ.get("SR_WORKERS", "4").strip() or "4"
465
  cache_hits = 0
466
  val_ver = _validator_version()
 
556
  nonlocal cache_hits, zips_processed, profile_autodetect_done
557
  nonlocal profile, consecutive_unrecoverable, was_cancelled
558
  nonlocal use_plugin_default, issue_filed_for_registration_bug
559
+ nonlocal issue_filing_disabled, any_preliminary_check_failed
560
  # Honor early abort (cancel or unrecoverable failure) — tasks
561
  # queued before the stop signal still get scheduled and have
562
  # to no-op themselves.
 
594
  if cached:
595
  merged_results.extend(cached.get("results", []))
596
  merged_layout.extend(cached.get("layout_findings") or [])
597
+ if cached.get("preliminary_check_failed"):
598
+ any_preliminary_check_failed = True
599
  cache_hits += 1
600
  out(f" [{i+1}/{len(zip_entries)}] cache hit: {zip_rel} "
601
  f"({len(cached.get('results', []))} asset(s))")
 
788
  zip_layout = rj.get("layout_findings") or []
789
  merged_results.extend(zip_results)
790
  merged_layout.extend(zip_layout)
791
+ if rj.get("preliminary_check_failed"):
792
+ any_preliminary_check_failed = True
793
  out(f" {len(zip_results)} asset(s); rc={rc}")
794
  # Emit a progress write so the dashboard sees the
795
  # updated zip-count + per-asset rows immediately
 
810
  "zip_sha": zip_sha,
811
  "results": zip_results,
812
  "layout_findings": zip_layout,
813
+ "preliminary_check_failed": bool(rj.get("preliminary_check_failed")),
814
  "validator_version": val_ver,
815
  "foundation_sha": found_sha,
816
  "profile": profile,
 
900
  "schema_version": 1,
901
  "results": merged_results,
902
  "layout_findings": merged_layout,
903
+ "preliminary_check_failed": any_preliminary_check_failed,
904
  "profile_coverage": {},
905
  "streaming_zips": len(zip_entries),
906
  "streaming_cache_hits": cache_hits,
 
911
 
912
  def _summarize(results_json: dict) -> tuple[str, str]:
913
  """Return (status, one-line summary)."""
914
+ # Preliminary-check failures short-circuit the normal
915
+ # "M/N assets passed" framing — the dataset didn't get to USD
916
+ # validation because filesystem-only foundation checks already
917
+ # flagged issues. The summary names the phase so the operator
918
+ # knows what to do (forward the report to the partner; address
919
+ # these before re-validating to surface deeper USD findings).
920
+ if results_json.get("preliminary_check_failed"):
921
+ violations = len(results_json.get("preliminary_findings")
922
+ or results_json.get("layout_findings") or [])
923
+ files_affected = len(results_json.get("results") or [])
924
+ return "fail", (f"PRELIMINARY CHECK FAILED — {violations} foundation-spec "
925
+ f"issue(s) across {files_affected} file(s); address these "
926
+ f"before re-validating")
927
  counts = {"error": 0, "failure": 0, "warning": 0}
928
  total = len(results_json.get("results", []))
929
  failed = 0
tools/validation/plugins/simready-report/skills/simready-report/validate.py CHANGED
@@ -15,6 +15,7 @@ import json
15
  import logging
16
  import os
17
  import shutil
 
18
  import sys
19
  import tomllib
20
  from concurrent.futures import ProcessPoolExecutor, as_completed
@@ -690,129 +691,90 @@ _ATOMIC_ASSET_PATHS = (f"{_FOUNDATION_SPECS_BASE}/core/atomic_asset/"
690
  "requirements/anchored-asset-paths.md")
691
 
692
 
693
- def check_simready_layout(root: Path) -> list[dict]:
694
- """Strict pre-validation check against the SimReady packaging spec.
 
695
 
696
- Returns a list of layout failures. Empty list = layout is valid;
697
- the validator proceeds to USD traversal. Non-empty = the dataset
698
- does not follow the spec; the validator emits these failures and
699
- skips USD work entirely.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
 
701
- Spec rules enforced (docs/sdk/packaging-spec.md):
702
- 1. No zip files anywhere — datasets must be delivered unpacked.
703
- 2. No USD files at the dataset root — aggregator scenes are
704
- forbidden. Each asset must live in its own directory.
705
- 3. Each top-level directory must contain
706
- `<dirname>/<dirname>.usd` (the interface file).
707
- 4. Each top-level directory must contain
708
- `.<dirname>.wrapp` (the required package manifest).
709
 
710
- Hidden dirs (`.thumbs`, `.simready`) and known output dirs
711
- (_SKIP_DIR_NAMES) are exempt from the per-dir bundle check.
 
 
 
 
 
 
 
 
712
  """
713
- # All four LAYOUT.* codes point at the foundation Atomic Asset
714
- # capability spec — it's the authoritative spec for file
715
- # packaging, asset references, and supported file types.
716
- # Sub-requirement spec pages exist (supported-file-types.md,
717
- # anchored-asset-paths.md) but the capability page is the entry
718
- # point operators should read first.
719
- folder_url = _ATOMIC_ASSET_CAP
720
- manifest_url = _ATOMIC_ASSET_CAP
721
- fails: list[dict] = []
722
  try:
723
- entries = sorted(root.iterdir())
724
- except OSError as e:
725
- return [{
726
- "code": "LAYOUT.READ_FAILED",
727
- "severity": "failure",
728
- "path": str(root),
729
- "spec_url": _SPEC_URL,
730
- "msg": f"Could not read dataset root: {e}",
731
- }]
732
-
733
- has_any_bundle_dir = False
734
- for entry in entries:
735
- rel = entry.name
736
- if entry.is_file():
737
- suffix = entry.suffix.lower()
738
- if suffix == ".zip":
739
- fails.append({
740
- "code": "LAYOUT.ZIP_AT_ROOT",
741
- "severity": "failure",
742
- "path": rel,
743
- "spec_url": folder_url,
744
- "msg": (f"'{rel}' is a zip archive. SimReady datasets "
745
- f"must be delivered as unpacked directories — "
746
- f"extract all archives and re-publish."),
747
- })
748
- elif suffix in {".usd", ".usda", ".usdc", ".usdz"}:
749
- fails.append({
750
- "code": "LAYOUT.USD_AT_ROOT",
751
- "severity": "failure",
752
- "path": rel,
753
- "spec_url": folder_url,
754
- "msg": (f"'{rel}' is a USD file at the dataset root. "
755
- f"Each asset must live in its own directory: "
756
- f"<asset_name>/<asset_name>.usd. Aggregator "
757
- f"scenes at the root are not allowed."),
758
- })
759
- continue
760
- if not entry.is_dir():
761
- continue
762
- if entry.name.startswith(".") or entry.name in _SKIP_DIR_NAMES:
763
- continue
764
- has_any_bundle_dir = True
765
-
766
- # Required: <dirname>/<dirname>.usd interface file.
767
- interface_candidates = [
768
- entry / f"{entry.name}.usd",
769
- entry / f"{entry.name}.usda",
770
- entry / f"{entry.name}.usdc",
771
- ]
772
- interface_present = any(p.is_file() for p in interface_candidates)
773
- if not interface_present:
774
  try:
775
- found = [str(p.relative_to(entry))
776
- for p in sorted(entry.rglob("*"))
777
- if p.is_file() and p.suffix.lower() in USD_EXTS][:5]
778
- except OSError:
779
- found = []
780
- hint = (f" Found USDs in this dir: {', '.join(found)}"
781
- if found else " No USD files found in this directory.")
782
- fails.append({
783
- "code": "LAYOUT.MISSING_INTERFACE",
784
- "severity": "failure",
785
- "path": rel + "/",
786
- "spec_url": folder_url,
787
- "msg": (f"Directory '{rel}/' must contain an interface file "
788
- f"named '{rel}.usd' (or .usda/.usdc) per the "
789
- f"SimReady packaging spec.{hint}"),
790
- })
791
-
792
- # Required: .<dirname>.wrapp manifest.
793
- manifest = entry / f".{entry.name}.wrapp"
794
- if not manifest.is_file():
795
- fails.append({
796
- "code": "LAYOUT.MISSING_MANIFEST",
797
  "severity": "failure",
798
- "path": rel + "/",
799
- "spec_url": manifest_url,
800
- "msg": (f"Directory '{rel}/' is missing the required "
801
- f"package manifest '.{rel}.wrapp'."),
 
 
 
802
  })
803
-
804
- if not has_any_bundle_dir and not fails:
805
- fails.append({
806
- "code": "LAYOUT.EMPTY",
807
  "severity": "failure",
808
- "path": ".",
809
- "spec_url": folder_url,
810
- "msg": (f"Dataset root contains no asset directories. Per the "
811
- f"SimReady packaging spec, each asset lives in its "
812
- f"own top-level directory."),
813
  })
814
-
815
- return fails
816
 
817
 
818
  def discover_assets(
@@ -1678,26 +1640,26 @@ def _run_validation_body(args, asset_profile_map: dict[str, str]) -> int:
1678
  print(f"Output: {out_dir}", flush=True)
1679
  print(f"Profile: {args.profile} v{args.version}", flush=True)
1680
 
1681
- # Strict pre-validation layout check. Fail fast (no USD work) when
1682
- # the dataset doesn't follow the SimReady packaging spec — partners
1683
- # get a clear, citing-the-spec failure list instead of opaque
1684
- # validator errors from running on a non-conformant layout.
1685
- layout_fails = check_simready_layout(target)
1686
- if layout_fails:
 
 
 
 
1687
  out_dir.mkdir(parents=True, exist_ok=True)
1688
- print(f"LAYOUT FAILED: {len(layout_fails)} violation(s); "
1689
- f"skipping USD validation", flush=True)
1690
- for f in layout_fails:
1691
  print(f" - {f['code']} {f['path']}: {f['msg'][:200]}", flush=True)
1692
- # Group violations by offending path so the dashboard renders
1693
- # one row per directory with its own issue list. The per-code
1694
- # aggregation table then correctly reports
1695
- # "MISSING_INTERFACE × N files" — that's the partner-actionable
1696
- # summary. Without this grouping, one fat row collapses all
1697
- # 46 issues under "." and the table lumps everything onto a
1698
- # single asset.
1699
  by_path: dict[str, list[dict]] = {}
1700
- for f in layout_fails:
1701
  by_path.setdefault(f["path"], []).append(f)
1702
  results = []
1703
  for rel, issues_here in by_path.items():
@@ -1715,8 +1677,8 @@ def _run_validation_body(args, asset_profile_map: dict[str, str]) -> int:
1715
  "profile": args.profile,
1716
  "profile_version": args.version,
1717
  "results": results,
1718
- "layout_findings": layout_fails,
1719
- "layout_aborted": True,
1720
  }
1721
  (out_dir / "results.json").write_text(
1722
  json.dumps(results_json, indent=2), encoding="utf-8"
 
15
  import logging
16
  import os
17
  import shutil
18
+ import subprocess
19
  import sys
20
  import tomllib
21
  from concurrent.futures import ProcessPoolExecutor, as_completed
 
691
  "requirements/anchored-asset-paths.md")
692
 
693
 
694
def run_preliminary_checks(root: Path) -> list[dict]:
    """Run the preliminary (filesystem-only) foundation checks on *root*.

    This phase is evaluated before USD traversal. The checks are
    intentionally cheap and deterministic — no LLM calls in the
    validation hot path. Each rule is a small function tied to its
    foundation spec section. Drift detection (does the hardcoded rule
    still match the spec?) is handled out of band by tools/spec_sync/,
    described as a weekly job that compares the foundation spec text to
    these rules and opens a PR on drift.

    Currently implemented:
      - AA.002 supported-file-types

    Args:
        root: Dataset root directory to inspect.

    Returns:
        The combined list of issue dicts from every preliminary check;
        empty when no check reported a problem.
    """
    # One entry per implemented rule; results are concatenated in order.
    checks = (_check_aa_002_supported_file_types,)
    findings: list[dict] = []
    for check in checks:
        findings.extend(check(root))
    return findings
710
+
711
+
712
+ # AA.002 supported-file-types — hardcoded from the foundation spec at
713
+ # nv_core/sr_specs/docs/capabilities/core/atomic_asset/requirements/
714
+ # supported-file-types.md ("How to comply" section, allowlist).
715
+ # Drift-sync: tools/spec_sync/check_aa_002.py compares this list to
716
+ # the foundation spec on a schedule and opens a PR if they diverge.
717
+ _AA_002_ALLOWED = {
718
+ ".usd", ".usda", ".usdc", ".usdz", # USD
719
+ ".png", ".jpg", ".jpeg", ".exr", # Images
720
+ ".m4a", ".mp3", ".wav", # Audio
721
+ }
722
+ # Packaging/metadata files the AA.002 spec doesn't govern. Treated
723
+ # as out-of-scope so we don't false-positive on .wrapp manifests,
724
+ # README files, validation receipts, etc.
725
+ _AA_002_METADATA_EXTS = {
726
+ ".wrapp", ".json", ".yaml", ".yml", ".md", ".txt", ".toml",
727
+ }
728
 
 
 
 
 
 
 
 
 
729
 
730
def _check_aa_002_supported_file_types(root: Path) -> list[dict]:
    """AA.002 — Asset must use only supported file types.

    Spec: nv_core/sr_specs/docs/capabilities/core/atomic_asset/
          requirements/supported-file-types.md

    Walks the tree under *root* and reports every regular file whose
    lower-cased extension is in neither ``_AA_002_ALLOWED`` nor
    ``_AA_002_METADATA_EXTS``. Hidden files, and files located inside a
    hidden directory or a directory named in ``_SKIP_DIR_NAMES``, are
    ignored. Only path components *below* ``root`` are considered when
    deciding whether a file is skipped, so the location of the dataset
    itself (e.g. under ``~/.cache``) never suppresses the check.

    Args:
        root: Dataset root directory to scan.

    Returns:
        One issue dict per offending file (keys ``code``, ``severity``,
        ``path``, ``spec_url``, ``msg``), sorted by ``path`` so repeated
        runs over the same tree produce the same output. If walking the
        tree raises ``OSError``, a single issue describing the failure
        is appended instead of propagating the exception.
    """
    issues: list[dict] = []
    try:
        for path in root.rglob("*"):
            if not path.is_file():
                continue
            try:
                rel_path = path.relative_to(root)
            except ValueError:
                # Not expected for rglob results; fall back to the raw
                # path rather than dropping the file from the report.
                rel_path = path
            *dir_names, file_name = rel_path.parts
            if file_name.startswith("."):
                continue
            # Inspect only directories between root and the file.
            # Ancestors of root must not influence the decision.
            if any(name in _SKIP_DIR_NAMES or name.startswith(".")
                   for name in dir_names):
                continue
            suffix = path.suffix.lower()
            if suffix in _AA_002_ALLOWED or suffix in _AA_002_METADATA_EXTS:
                continue
            # as_posix() gives forward slashes on Windows without
            # rewriting literal backslashes in POSIX file names.
            rel = rel_path.as_posix()
            issues.append({
                "code": "AA.002",
                "severity": "failure",
                "path": rel,
                "spec_url": _ATOMIC_ASSET_FILE_TYPES,
                "msg": (f"File '{rel}' uses an unsupported file type "
                        f"('{suffix or 'no extension'}'). AA.002 "
                        f"allowlist: USD (.usd, .usda, .usdc, .usdz), "
                        f"image (.png, .jpg, .jpeg, .exr), "
                        f"audio (.m4a, .mp3, .wav)."),
            })
        # rglob order is filesystem-dependent; sort for stable reports.
        issues.sort(key=lambda issue: issue["path"])
    except OSError as e:
        issues.append({
            "code": "AA.002",
            "severity": "failure",
            "path": str(root),
            "spec_url": _ATOMIC_ASSET_FILE_TYPES,
            "msg": f"Could not enumerate dataset files: {e}",
        })
    return issues
 
778
 
779
 
780
  def discover_assets(
 
1640
  print(f"Output: {out_dir}", flush=True)
1641
  print(f"Profile: {args.profile} v{args.version}", flush=True)
1642
 
1643
+ # Preliminary check phase: cheap deterministic foundation-spec
1644
+ # checks (filesystem-only, no USD parsing). Fails fast if any
1645
+ # violation is found — no point burning USD-validation cycles
1646
+ # on a dataset that has obvious spec issues. Single roundtrip:
1647
+ # partners see ALL preliminary violations at once, fix them,
1648
+ # then re-validate. Spec drift between our hardcoded rules and
1649
+ # the foundation source-of-truth is caught by the out-of-band
1650
+ # tools/spec_sync/ job.
1651
+ preliminary_issues = run_preliminary_checks(target)
1652
+ if preliminary_issues:
1653
  out_dir.mkdir(parents=True, exist_ok=True)
1654
+ print(f"PRELIMINARY CHECK: {len(preliminary_issues)} issue(s) "
1655
+ f"skipping USD validation until these are addressed", flush=True)
1656
+ for f in preliminary_issues:
1657
  print(f" - {f['code']} {f['path']}: {f['msg'][:200]}", flush=True)
1658
+ # Group by path so the dashboard renders one row per offending
1659
+ # file (consistent with how USD-validation results are shaped —
1660
+ # partners see the same per-asset layout in both phases).
 
 
 
 
1661
  by_path: dict[str, list[dict]] = {}
1662
+ for f in preliminary_issues:
1663
  by_path.setdefault(f["path"], []).append(f)
1664
  results = []
1665
  for rel, issues_here in by_path.items():
 
1677
  "profile": args.profile,
1678
  "profile_version": args.version,
1679
  "results": results,
1680
+ "preliminary_findings": preliminary_issues,
1681
+ "preliminary_check_failed": True,
1682
  }
1683
  (out_dir / "results.json").write_text(
1684
  json.dumps(results_json, indent=2), encoding="utf-8"