Spaces:

nvidia
/

simready-validator

Sleeping

App Files Files Community

loginowskid committed 9 days ago

Commit

d126183

verified ·

1 Parent(s): c551136

Sync from simready-oem-library-pm@9cf6fb4c

Browse files

Files changed (1) hide show

tools/hf_space/github_issues.py +64 -56

tools/hf_space/github_issues.py CHANGED Viewed

@@ -84,48 +84,64 @@ def _add_comment(issue_num: int, body: str) -> None:
     _gh_request("POST", f"/issues/{issue_num}/comments", {"body": body})
-def _build_internal_issue_body(rule: str, code: str, g: dict[str, Any],
-                                dataset: str, profile: str) -> str:
     return (
-        f"**Validator-internal bug** — surfaced during automatic SimReady validation.\n\n"
-        f"This is NOT a customer-asset finding. The validator's own rule "
-        f"registration / spec loading is misbehaving and emitting errors that "
-        f"don't map to any real spec violation. Track + fix here.\n\n"
         f"| Field | Value |\n|---|---|\n"
-        f"| Rule | `{rule}` |\n"
-        f"| Code (as reported) | `{code}` |\n"
-        f"| Severity | {g.get('severity') or '?'} |\n"
-        f"| Occurrence count (first run) | {g['count']} |\n"
         f"| Dataset | `{dataset}` |\n"
-        f"| Profile | `{profile}` |\n\n"
-        f"**Sample message** (truncated to 200 chars):\n\n"
-        f"```\n{g['sample_msg']}\n```\n\n"
         f"---\n"
         f"_Filed automatically by the HF Space (`tools/hf_space/github_issues.py`). "
-        f"Subsequent runs that re-hit the same (rule, code) pair will comment "
-        f"here rather than open new issues._"
     )
-def _build_recurrence_comment(g: dict[str, Any], dataset: str, profile: str) -> str:
     return (
         f"Re-hit during validation of `{dataset}` (profile `{profile}`).\n"
-        f"Occurrences this run: {g['count']}, severity: {g.get('severity') or '?'}."
     )
 def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
                             log_fn=None) -> dict:
-    """Scan results.json for validator-internal bugs and ensure a tracking
-    GitHub issue exists for each distinct (rule, code) pair. Best-effort —
-    swallowed exceptions return {"error": ...} so the validator's verdict
-    is never blocked on GitHub being flaky."""
     out = log_fn or (lambda s: print(s, flush=True))
     if not _gh_token():
         out("  (skipping internal-issue tracking: no GH token)")
         return {"skipped": True, "reason": "no_token"}
-    groups: dict[tuple[str, str], dict[str, Any]] = {}
     for asset in results_json.get("results", []):
         for iss in (asset.get("issues") or []):
             if not is_validator_internal_issue(iss):
@@ -133,45 +149,37 @@ def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
             rule = iss.get("rule") or "?"
             code = iss.get("code") or "UNKNOWN"
             key = (rule, code)
-            g = groups.setdefault(key, {
                 "count": 0,
                 "sample_msg": (iss.get("msg") or "")[:200],
                 "severity": (iss.get("severity") or "").lower(),
             })
             g["count"] += 1
-    if not groups:
         return {"created": 0, "updated": 0}
-    created = updated = 0
-    aborted = False
-    # First 404 → token doesn't have access to issues on this repo.
-    # Bail out instead of trying all N groups, which would 404 each
-    # and spam the log with 70+ identical errors.
-    for (rule, code), g in groups.items():
-        if aborted:
-            break
-        title = f"[validator-internal] {rule} → {code}"
-        try:
-            existing = _find_issue(title)
-            if existing:
-                _add_comment(existing["number"],
-                             _build_recurrence_comment(g, dataset, profile))
-                updated += 1
-                out(f"  internal-issue #{existing['number']}: comment added for {rule}/{code}")
-            else:
-                num = _create_issue(title,
-                                    _build_internal_issue_body(rule, code, g, dataset, profile),
-                                    ["validator-internal", "process"])
-                created += 1
-                out(f"  internal-issue #{num}: opened for {rule}/{code}")
-        except Exception as e:
-            msg = f"{type(e).__name__}: {e}"
-            if "404" in msg:
-                out(f"  ! internal-issue tracking aborted (404 — token lacks issues:write "
-                    f"on {GH_REPO}); skipping {len(groups) - created - updated} remaining group(s)")
-                aborted = True
-            else:
-                out(f"  ! internal-issue {rule}/{code} tracking failed: {msg}")
-    return {"created": created, "updated": updated, "groups": len(groups),
-            "aborted_404": aborted}

     _gh_request("POST", f"/issues/{issue_num}/comments", {"body": body})
+def _build_dataset_issue_body(by_pair: dict, dataset: str, profile: str,
+                               total: int) -> str:
+    rows = "\n".join(
+        f"| `{rule}` | `{code}` | {g['severity'] or '?'} | {g['count']} | `{g['sample_msg']}` |"
+        for (rule, code), g in sorted(by_pair.items(), key=lambda kv: -kv[1]["count"])
+    )
     return (
+        f"**Validator-internal bugs on a single dataset** — surfaced during "
+        f"automatic SimReady validation. NOT a customer-asset finding; the "
+        f"validator's own rule registration / spec loading is misbehaving "
+        f"on this dataset and emitting errors that don't map to any real "
+        f"spec violation.\n\n"
         f"| Field | Value |\n|---|---|\n"
         f"| Dataset | `{dataset}` |\n"
+        f"| Profile (first run) | `{profile}` |\n"
+        f"| Total internal occurrences (first run) | {total} |\n"
+        f"| Distinct (rule, code) pairs (first run) | {len(by_pair)} |\n\n"
+        f"**Breakdown** (sorted by occurrence count, descending):\n\n"
+        f"| Rule | Code | Severity | Count | Sample message |\n"
+        f"|---|---|---|---|---|\n{rows}\n\n"
         f"---\n"
         f"_Filed automatically by the HF Space (`tools/hf_space/github_issues.py`). "
+        f"One issue per dataset — re-validating the same dataset comments "
+        f"here with the new counts instead of opening a duplicate._"
     )
+def _build_dataset_recurrence_comment(by_pair: dict, dataset: str, profile: str,
+                                       total: int) -> str:
+    rows = "\n".join(
+        f"| `{rule}` | `{code}` | {g['count']} |"
+        for (rule, code), g in sorted(by_pair.items(), key=lambda kv: -kv[1]["count"])
+    )
     return (
         f"Re-hit during validation of `{dataset}` (profile `{profile}`).\n"
+        f"This run: **{total}** internal occurrences across **{len(by_pair)}** "
+        f"distinct (rule, code) pairs.\n\n"
+        f"| Rule | Code | Count this run |\n|---|---|---|\n{rows}"
     )
 def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
                             log_fn=None) -> dict:
+    """Scan results.json for validator-internal bugs and ensure exactly ONE
+    tracking GitHub issue exists per dataset. The issue lists every
+    distinct (rule, code) pair found across the whole dataset; re-runs
+    add a comment with the new counts instead of opening duplicates.
+    Best-effort — swallowed exceptions return {"error": ...} so the
+    validator's verdict is never blocked on GitHub being flaky."""
     out = log_fn or (lambda s: print(s, flush=True))
     if not _gh_token():
         out("  (skipping internal-issue tracking: no GH token)")
         return {"skipped": True, "reason": "no_token"}
+    # Group across the whole dataset: (rule, code) → {count, sample, severity}
+    by_pair: dict[tuple[str, str], dict[str, Any]] = {}
+    total = 0
     for asset in results_json.get("results", []):
         for iss in (asset.get("issues") or []):
             if not is_validator_internal_issue(iss):
             rule = iss.get("rule") or "?"
             code = iss.get("code") or "UNKNOWN"
             key = (rule, code)
+            g = by_pair.setdefault(key, {
                 "count": 0,
                 "sample_msg": (iss.get("msg") or "")[:200],
                 "severity": (iss.get("severity") or "").lower(),
             })
             g["count"] += 1
+            total += 1
+    if not by_pair:
         return {"created": 0, "updated": 0}
+    title = f"[validator-internal] {dataset}"
+    try:
+        existing = _find_issue(title)
+        if existing:
+            _add_comment(existing["number"],
+                         _build_dataset_recurrence_comment(by_pair, dataset, profile, total))
+            out(f"  internal-issue #{existing['number']}: comment added for dataset "
+                f"{dataset} ({total} occurrences, {len(by_pair)} pairs)")
+            return {"created": 0, "updated": 1, "pairs": len(by_pair), "total": total}
+        num = _create_issue(title,
+                            _build_dataset_issue_body(by_pair, dataset, profile, total),
+                            ["validator-internal", "process"])
+        out(f"  internal-issue #{num}: opened for dataset {dataset} "
+            f"({total} occurrences, {len(by_pair)} pairs)")
+        return {"created": 1, "updated": 0, "pairs": len(by_pair), "total": total}
+    except Exception as e:
+        msg = f"{type(e).__name__}: {e}"
+        if "404" in msg:
+            out(f"  ! internal-issue tracking aborted (404 — token lacks issues:write "
+                f"on {GH_REPO})")
+            return {"created": 0, "updated": 0, "aborted_404": True}
+        out(f"  ! internal-issue tracking for dataset {dataset} failed: {msg}")
+        return {"created": 0, "updated": 0, "error": msg}