Spaces:

nvidia
/

simready-validator

Sleeping

App Files Community

loginowskid committed 10 days ago

Commit

2409b81

1 Parent(s): a83d5d2

HF API dedup @7912416

Browse files

Files changed (1) hide show

tools/hf_space/runner.py +52 -35

tools/hf_space/runner.py CHANGED Viewed

@@ -400,7 +400,10 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
                              progress_file: Path | None, out,
                              force: bool = False,
                              submission_id: str = "",
-                             flat_target: Path | None = None) -> dict | None:
     """Validate a zip-bundled dataset by streaming one archive at a time.
     Flow per zip: hf_hub_download → extract → validate.py → capture
@@ -419,22 +422,29 @@ def _validate_zip_streaming(*, api: HfApi, dataset: str, token: str | None,
     from huggingface_hub import hf_hub_download
     import zipfile
-    zip_entries = _list_dataset_zips(api, dataset, token)
     # Unified path: if the dataset has no zip files, synthesize a SINGLE
     # "unit" representing the whole dataset. snapshot_download has
     # already (or will) materialize the contents into flat_target;
     # downstream daemon-pool validation treats it the same as one zip.
-    # This removes the previously-separate flat-download branch in
-    # run() and gives flat datasets the same: daemon-pool speed,
-    # cancel signaling, live progress, per-unit caching.
     is_flat = not zip_entries
     if is_flat:
         if flat_target is None or not flat_target.is_dir():
             return None  # caller must provide the materialized dir
-        try:
-            head = api.repo_info(dataset, repo_type="dataset").sha
-        except Exception:
-            head = ""
         zip_entries = [(dataset, head)]
         out(f"  flat dataset: snapshot at {flat_target}; validator will discover assets")
     if force:
@@ -1206,7 +1216,8 @@ def progress_path_for(submission_id: str) -> Path:
 def _finalize_run(*, dataset: str, profile: str, version: str,
                   results_json: dict, status: str, summary: str,
                   out_dir: Path, api: HfApi, token: str | None,
-                  open_pr: bool, results_path: Path, out) -> RunResult:
     """Shared tail-end of run(): file issues, optionally open PR on
     dataset, persist report, write cache, return RunResult."""
     try:
@@ -1254,7 +1265,10 @@ def _finalize_run(*, dataset: str, profile: str, version: str,
             f"({'cancelled' if is_cancelled else 'partial: ' + str(results_json.get('streaming_processed')) + '/' + str(results_json.get('streaming_zips'))})")
     else:
         try:
-            head = api.repo_info(dataset, repo_type="dataset").sha
             key = _cache_key(head, profile, _validator_version(), _foundation_sha())
             _write_cache(dataset, key, {
                 "schema_version": 1,
@@ -1306,30 +1320,30 @@ def run(
     token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
     api = HfApi(token=token)
-    # Dataset-level cache: resolve the current HEAD before downloading
-    # so we can short-circuit on a hit. The four-tuple (head, profile,
-    # validator version, foundation sha) covers every input that
-    # determines the verdict. force=True skips this branch entirely.
-    if not force:
-        try:
-            dataset_head = api.repo_info(dataset, repo_type="dataset").sha
-        except Exception as e:
-            dataset_head = None
-            out(f"  ! could not resolve dataset HEAD ({type(e).__name__}: {e}); cache skipped")
-        if dataset_head:
-            key = _cache_key(dataset_head, profile, _validator_version(), _foundation_sha())
-            cached = _read_cache(dataset, key)
-            if cached:
-                out(f"  cache hit (key={key}, head={dataset_head[:8]}, "
-                    f"cached_at={cached.get('cached_at')}); returning without re-running")
-                return RunResult(
-                    dataset=dataset, profile=profile, version=version,
-                    status=cached["status"], summary=cached["summary"],
-                    results_json=cached["results_json"],
-                    report_path=Path(cached.get("report_path") or "/tmp"),
-                    pr_url=None,
-                )
-            out(f"  cache miss (key={key}, head={dataset_head[:8]}); running validator")
     with tempfile.TemporaryDirectory(prefix=f"hfsp-{dataset.replace('/', '_')}-") as td:
         work = Path(td)
@@ -1387,6 +1401,8 @@ def run(
             progress_file=prog_path, out=out, force=force,
             submission_id=submission_id,
             flat_target=flat_target,
         )
         out_dir = work / "out"
@@ -1427,4 +1443,5 @@ def run(
             results_json=results_json, status=status, summary=summary,
             out_dir=out_dir, api=api, token=token, open_pr=open_pr,
             results_path=results_path, out=out,
         )

                              progress_file: Path | None, out,
                              force: bool = False,
                              submission_id: str = "",
+                             flat_target: Path | None = None,
+                             prefetched_zip_entries: list | None = None,
+                             prefetched_dataset_head: str | None = None,
+                             ) -> dict | None:
     """Validate a zip-bundled dataset by streaming one archive at a time.
     Flow per zip: hf_hub_download → extract → validate.py → capture
     from huggingface_hub import hf_hub_download
     import zipfile
+    # Use the caller's pre-fetched zip listing + dataset HEAD when
+    # available. run() already calls _list_dataset_zips() and
+    # repo_info() to decide flat vs zip + populate the dataset-level
+    # cache; calling them again here doubled the HF API request count
+    # per validation for no value.
+    if prefetched_zip_entries is not None:
+        zip_entries = prefetched_zip_entries
+    else:
+        zip_entries = _list_dataset_zips(api, dataset, token)
     # Unified path: if the dataset has no zip files, synthesize a SINGLE
     # "unit" representing the whole dataset. snapshot_download has
     # already (or will) materialize the contents into flat_target;
     # downstream daemon-pool validation treats it the same as one zip.
     is_flat = not zip_entries
     if is_flat:
         if flat_target is None or not flat_target.is_dir():
             return None  # caller must provide the materialized dir
+        head = prefetched_dataset_head
+        if head is None:
+            try:
+                head = api.repo_info(dataset, repo_type="dataset").sha
+            except Exception:
+                head = ""
         zip_entries = [(dataset, head)]
         out(f"  flat dataset: snapshot at {flat_target}; validator will discover assets")
     if force:
 def _finalize_run(*, dataset: str, profile: str, version: str,
                   results_json: dict, status: str, summary: str,
                   out_dir: Path, api: HfApi, token: str | None,
+                  open_pr: bool, results_path: Path, out,
+                  dataset_head: str | None = None) -> RunResult:
     """Shared tail-end of run(): file issues, optionally open PR on
     dataset, persist report, write cache, return RunResult."""
     try:
             f"({'cancelled' if is_cancelled else 'partial: ' + str(results_json.get('streaming_processed')) + '/' + str(results_json.get('streaming_zips'))})")
     else:
         try:
+            # Reuse the pre-resolved HEAD when run() already fetched it.
+            # Falls back to a fresh API call only if the caller didn't
+            # pass one (e.g. legacy call sites).
+            head = dataset_head if dataset_head is not None else api.repo_info(dataset, repo_type="dataset").sha
             key = _cache_key(head, profile, _validator_version(), _foundation_sha())
             _write_cache(dataset, key, {
                 "schema_version": 1,
     token = hf_token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
     api = HfApi(token=token)
+    # Resolve the dataset HEAD ONCE up front. Used for: (a) the
+    # dataset-level cache key, (b) the per-unit cache key in the flat
+    # path, (c) the streaming function's "synthetic zip sha" for the
+    # flat unit. Without this, the same metadata was re-fetched from
+    # HF up to 4 times per validation.
+    dataset_head: str | None = None
+    try:
+        dataset_head = api.repo_info(dataset, repo_type="dataset").sha
+    except Exception as e:
+        out(f"  ! could not resolve dataset HEAD ({type(e).__name__}: {e}); cache + drift checks skipped")
+    if not force and dataset_head:
+        key = _cache_key(dataset_head, profile, _validator_version(), _foundation_sha())
+        cached = _read_cache(dataset, key)
+        if cached:
+            out(f"  cache hit (key={key}, head={dataset_head[:8]}, "
+                f"cached_at={cached.get('cached_at')}); returning without re-running")
+            return RunResult(
+                dataset=dataset, profile=profile, version=version,
+                status=cached["status"], summary=cached["summary"],
+                results_json=cached["results_json"],
+                report_path=Path(cached.get("report_path") or "/tmp"),
+                pr_url=None,
+            )
+        out(f"  cache miss (key={key}, head={dataset_head[:8]}); running validator")
     with tempfile.TemporaryDirectory(prefix=f"hfsp-{dataset.replace('/', '_')}-") as td:
         work = Path(td)
             progress_file=prog_path, out=out, force=force,
             submission_id=submission_id,
             flat_target=flat_target,
+            prefetched_zip_entries=probe_zip_entries,
+            prefetched_dataset_head=dataset_head,
         )
         out_dir = work / "out"
             results_json=results_json, status=status, summary=summary,
             out_dir=out_dir, api=api, token=token, open_pr=open_pr,
             results_path=results_path, out=out,
+            dataset_head=dataset_head,
         )