Spaces:

nvidia
/

simready-validator

Sleeping

App Files Files Community

loginowskid committed 7 days ago

Commit

f36b7a3

verified ·

1 Parent(s): d126183

Sync from simready-oem-library-pm@99650b92

Browse files

Files changed (3) hide show

tools/hf_space/agentic_issues.py +215 -0
tools/hf_space/github_issues.py +151 -25
tools/hf_space/requirements.txt +5 -0

tools/hf_space/agentic_issues.py ADDED Viewed

	@@ -0,0 +1,215 @@

+"""Agentic review of validator-internal bug groups before they're filed
+as GitHub issues.
+Without agentic review, every distinct (rule, code) pair we see becomes
+its own issue (or the dataset gets one big dump issue). That's noisy:
+- The same underlying bug may surface under different (rule, code) pairs
+  → semantically the same issue, but the title-match dedup misses it.
+- Transient one-off errors get filed alongside real bugs.
+- Auto-generated bodies are raw counts + sample strings; they don't
+  explain what the validator is actually misbehaving on.
+This module asks Claude to look at the current run's error groups AND
+the repository's existing validator-internal issues, then produces a
+structured list of decisions (`comment` / `skip` / `create`) with
+human-readable titles + bodies.
+Best-effort: if the call fails (missing API key, rate limit, malformed
+response), the caller falls back to the legacy title-match policy in
+`github_issues.py::_ensure_internal_issues_simple`.
+"""
+from __future__ import annotations
+import json
+import os
+import sys
+from typing import Any
+# Model identifier passed as `model=` to `client.messages.create` in
+# `review_and_decide`.
+MODEL = "claude-sonnet-4-6"
+# Passed as `max_tokens=` to that same call, i.e. the cap on the size of
+# the reply that carries the `submit_decisions` tool input.
+MAX_TOKENS = 4096
+def _client():
+    """Return an ``Anthropic`` client, or ``None`` when one can't be built.
+
+    The SDK import is deferred to call time so this module still loads
+    when the ``anthropic`` package isn't installed (the caller probes
+    this through ``is_available``).
+
+    Returns:
+        A client built with the SDK's default constructor, or ``None``
+        when ``ANTHROPIC_API_KEY`` is unset/empty, the SDK cannot be
+        imported, or the constructor itself raises.
+    """
+    if not os.environ.get("ANTHROPIC_API_KEY"):
+        return None
+    try:
+        from anthropic import Anthropic
+    except ImportError:
+        return None
+    try:
+        return Anthropic()
+    except Exception as e:
+        # `is_available` and `review_and_decide` call this outside any
+        # try block, so a constructor failure would otherwise escape the
+        # best-effort path. Report it and behave as "unavailable" so the
+        # caller falls back instead of crashing.
+        print(f"  ! anthropic client construction failed "
+              f"({type(e).__name__}: {str(e)[:200]})",
+              file=sys.stderr, flush=True)
+        return None
+def is_available() -> bool:
+    """Report whether `_client()` currently yields a client object."""
+    probe = _client()
+    if probe is None:
+        return False
+    return True
+# Tool definition sent to the Messages API as `tools=[REVIEW_TOOL]`.
+# `review_and_decide` forces this tool through `tool_choice`, so the
+# model's answer arrives as the tool call's `input`, shaped by
+# `input_schema` below. Every string in here is sent to the model.
+REVIEW_TOOL = {
+    "name": "submit_decisions",
+    "description": (
+        "Submit one decision per (rule, code) error group. "
+        "Each decision is either 'comment' on an existing issue, "
+        "'skip' if it's noise, or 'create' a single new issue. "
+        "Multiple groups may be consolidated by pointing several "
+        "decisions at the same target issue or by emitting one "
+        "'create' decision with body text that explains multiple "
+        "related groups."
+    ),
+    "input_schema": {
+        "type": "object",
+        "properties": {
+            "summary": {
+                "type": "string",
+                "description": "One short paragraph summarizing what the validator's actually misbehaving on, in plain language.",
+            },
+            "decisions": {
+                "type": "array",
+                "description": "One entry per (rule, code) group from the input.",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "rule": {"type": "string"},
+                        "code": {"type": "string"},
+                        "action": {
+                            "type": "string",
+                            "enum": ["comment", "skip", "create"],
+                        },
+                        # NOTE(review): SYSTEM_PROMPT also asks the model to put
+                        # negative placeholders (-1, -2, ...) here to point at
+                        # issues created in the same batch; this description
+                        # does not mention that convention.
+                        "target_issue": {
+                            "type": ["integer", "null"],
+                            "description": "GitHub issue number to comment on. Required when action='comment'; null otherwise.",
+                        },
+                        "title": {
+                            "type": ["string", "null"],
+                            "description": "Issue title to use when action='create'. Should start with '[validator-internal]' and read as a one-line plain-language summary.",
+                        },
+                        "body": {
+                            "type": ["string", "null"],
+                            "description": "Issue body (action='create') or comment body (action='comment'). Markdown, plain language, explains what the bug is and why these errors are evidence of it.",
+                        },
+                        "reasoning": {
+                            "type": "string",
+                            "description": "Short note (1-2 sentences) explaining why this action was chosen.",
+                        },
+                    },
+                    # target_issue / title / body are not listed as required:
+                    # the schema itself does not enforce the per-action rules
+                    # stated in their descriptions.
+                    "required": ["rule", "code", "action", "reasoning"],
+                },
+            },
+        },
+        "required": ["summary", "decisions"],
+    },
+}
+def _format_existing_issues(existing: list[dict]) -> str:
+    """Render already-filed issues as a bullet list for the prompt.
+
+    Each bullet shows the issue number, its state (``?`` when the key is
+    absent), the repr of its title, and the first 400 characters of its
+    body with newlines flattened to spaces. An empty list yields a fixed
+    "(none ...)" sentence instead.
+    """
+    if not existing:
+        return "(none — this is the first time validator-internal issues are being filed in this repo)"
+
+    def _bullet(issue: dict) -> str:
+        # Truncate first, then flatten newlines, so the excerpt is at
+        # most 400 characters long.
+        excerpt = (issue.get("body") or "")[:400].replace("\n", " ")
+        return (
+            f"- #{issue['number']} [{issue.get('state', '?')}] {issue['title']!r}\n"
+            f"    body excerpt: {excerpt}"
+        )
+
+    return "\n".join(map(_bullet, existing))
+def _format_groups(by_pair: dict, total: int) -> str:
+    """Render the run's error groups as a bullet list, most frequent first.
+
+    Args:
+        by_pair: Mapping of ``(rule, code)`` to a group dict. ``count`` is
+            required; ``severity`` and ``sample_msg`` may be missing or
+            ``None``.
+        total: Total occurrence count, echoed next to each group's count.
+
+    Returns:
+        One three-line bullet per group joined by newlines, or ``""``
+        when there are no groups. Groups with equal counts keep the
+        mapping's iteration order (``sorted`` is stable).
+    """
+    ranked = sorted(by_pair.items(), key=lambda kv: -kv[1]["count"])
+    lines = []
+    for (rule, code), g in ranked:
+        # A group without a usable message must not abort the whole
+        # prompt build; show an empty sample instead, the same way a
+        # missing severity is shown as '?'.
+        sample = str(g.get("sample_msg") or "").replace("\n", " ")[:300]
+        lines.append(
+            f"- rule={rule!r} code={code!r} severity={g.get('severity') or '?'}\n"
+            f"  count={g['count']} of {total} total internal occurrences\n"
+            f"  sample message: {sample!r}"
+        )
+    return "\n".join(lines)
+# System prompt for the review call (passed as `system=` in
+# `review_and_decide`). A trailing backslash joins the next source line
+# onto the current one, so wrapped sentences reach the model as a single
+# line. This text is runtime input to the model.
+SYSTEM_PROMPT = """You are reviewing validator-internal bug reports from \
+the SimReady asset validator before they're filed as GitHub issues on \
+NVIDIA-dev/simready-oem-library-pm.
+Context:
+- The validator runs the foundation spec rules against customer datasets.
+- Some errors it emits don't map to real spec violations — they're \
+bugs in the validator's own rule registration, spec loading, plugin \
+discovery, or asset traversal. Those are "validator-internal" bugs and \
+this repo tracks them as GitHub issues.
+- Examples of validator-internal codes: UNKNOWN, SDK.*, and any \
+message containing "Uncaught error" or "is not registered to \
+requirement".
+Your job, for each (rule, code) error group from the current validation \
+run:
+1. Check the existing validator-internal issues. If the group matches \
+one of them semantically (not just by title), produce a `comment` \
+decision pointing at that issue. Multiple groups may map to the same \
+existing issue.
+2. If the group looks like a transient one-off (very low count, no \
+recognizable failure mode, message looks like a stray traceback that \
+won't recur), produce a `skip` decision.
+3. Otherwise, produce a `create` decision with a plain-language title \
+(starts with '[validator-internal]') and a body that explains what the \
+validator is actually misbehaving on (don't just paraphrase the rule \
+name — interpret what failed). If two or three groups are clearly the \
+same underlying bug surfacing under different rule names, consolidate \
+into a single `create` decision and route the other groups as \
+`comment` decisions targeting that issue once it's filed (use a \
+placeholder negative number like -1, -2 etc. for cross-references — \
+the caller will resolve them after creation order).
+Be conservative: prefer `comment` over `create`, prefer `skip` only \
+when there's clear noise signal. Better to have one well-written \
+issue than five sparse ones."""
+def _build_user_prompt(by_pair: dict, dataset: str, profile: str,
+                       total: int, existing: list[dict]) -> str:
+    """Assemble the user message for the review call.
+
+    The message is a run header (dataset, profile, totals), the rendered
+    error groups, the rendered existing issues, and a closing instruction
+    to call the `submit_decisions` tool, one piece per line.
+    """
+    pieces = [
+        f"Dataset: `{dataset}`",
+        f"Profile: `{profile}`",
+        f"Total internal-error occurrences in this run: {total}",
+        f"Distinct (rule, code) groups: {len(by_pair)}",
+        "# Current run's error groups",
+        _format_groups(by_pair, total),
+        "# Existing validator-internal issues in the repo",
+        _format_existing_issues(existing),
+        "Call the submit_decisions tool with one decision per group above.",
+    ]
+    return "\n".join(pieces)
+def review_and_decide(by_pair: dict, dataset: str, profile: str,
+                      total: int, existing_issues: list[dict],
+                      log_fn=None) -> dict | None:
+    """Ask the model to triage this run's error groups.
+
+    Sends one Messages API request that forces the `submit_decisions`
+    tool and returns that tool call's input payload.
+
+    Args:
+        by_pair: Error groups keyed by ``(rule, code)``.
+        dataset: Dataset name, echoed into the prompt.
+        profile: Validation profile name, echoed into the prompt.
+        total: Total internal-error occurrences in the run.
+        existing_issues: Issue dicts the model should dedupe against.
+        log_fn: Callable taking one string; defaults to a flushing
+            ``print``.
+
+    Returns:
+        The decisions dict (shaped by ``REVIEW_TOOL["input_schema"]``),
+        or ``None`` when no client is available, the request fails, the
+        reply was cut off at the token limit, or no usable tool call
+        came back. ``None`` tells the caller to use its fallback policy.
+    """
+    out = log_fn or (lambda s: print(s, flush=True))
+    try:
+        # Built inside the try so a failing client constructor also ends
+        # in the fallback path rather than propagating to the caller.
+        client = _client()
+        if client is None:
+            out("  (agentic review unavailable: ANTHROPIC_API_KEY unset or anthropic SDK missing)")
+            return None
+        msg = client.messages.create(
+            model=MODEL,
+            max_tokens=MAX_TOKENS,
+            system=SYSTEM_PROMPT,
+            tools=[REVIEW_TOOL],
+            tool_choice={"type": "tool", "name": "submit_decisions"},
+            messages=[{
+                "role": "user",
+                "content": _build_user_prompt(by_pair, dataset, profile, total, existing_issues),
+            }],
+        )
+        # A reply that stopped at the token limit may carry a truncated
+        # tool call; acting on it could execute only part of the
+        # decisions. Treat it like any other failure.
+        if getattr(msg, "stop_reason", None) == "max_tokens":
+            out(f"  ! agentic review hit max_tokens ({MAX_TOKENS}); "
+                f"decisions may be truncated; falling back")
+            return None
+        for block in msg.content:
+            if getattr(block, "type", None) == "tool_use" and block.name == "submit_decisions":
+                if isinstance(block.input, dict):
+                    return block.input
+                out("  ! agentic review returned a non-object tool input; falling back")
+                return None
+        out("  ! agentic review returned no tool_use block; falling back")
+        return None
+    except Exception as e:
+        out(f"  ! agentic review failed ({type(e).__name__}: {str(e)[:200]}); falling back")
+        return None

tools/hf_space/github_issues.py CHANGED Viewed

@@ -125,12 +125,137 @@ def _build_dataset_recurrence_comment(by_pair: dict, dataset: str, profile: str,
     )
 def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
                             log_fn=None) -> dict:
-    """Scan results.json for validator-internal bugs and ensure exactly ONE
-    tracking GitHub issue exists per dataset. The issue lists every
-    distinct (rule, code) pair found across the whole dataset; re-runs
-    add a comment with the new counts instead of opening duplicates.
     Best-effort — swallowed exceptions return {"error": ...} so the
     validator's verdict is never blocked on GitHub being flaky."""
@@ -160,26 +285,27 @@ def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
     if not by_pair:
         return {"created": 0, "updated": 0}
-    title = f"[validator-internal] {dataset}"
     try:
-        existing = _find_issue(title)
-        if existing:
-            _add_comment(existing["number"],
-                         _build_dataset_recurrence_comment(by_pair, dataset, profile, total))
-            out(f"  internal-issue #{existing['number']}: comment added for dataset "
-                f"{dataset} ({total} occurrences, {len(by_pair)} pairs)")
-            return {"created": 0, "updated": 1, "pairs": len(by_pair), "total": total}
-        num = _create_issue(title,
-                            _build_dataset_issue_body(by_pair, dataset, profile, total),
-                            ["validator-internal", "process"])
-        out(f"  internal-issue #{num}: opened for dataset {dataset} "
-            f"({total} occurrences, {len(by_pair)} pairs)")
-        return {"created": 1, "updated": 0, "pairs": len(by_pair), "total": total}
     except Exception as e:
-        msg = f"{type(e).__name__}: {e}"
-        if "404" in msg:
-            out(f"  ! internal-issue tracking aborted (404 — token lacks issues:write "
-                f"on {GH_REPO})")
-            return {"created": 0, "updated": 0, "aborted_404": True}
-        out(f"  ! internal-issue tracking for dataset {dataset} failed: {msg}")
-        return {"created": 0, "updated": 0, "error": msg}

     )
+def _list_existing_internal_issues() -> list[dict]:
+    """Fetch validator-internal issues for the agent to dedupe against.
+
+    Runs one search for issues in `GH_REPO` carrying the
+    `validator-internal` label and returns the `items` of the first page
+    (`per_page=100`; later pages are not fetched). The query has no state
+    qualifier, so closed issues are presumably returned alongside open
+    ones, which lets a regressed bug match its old issue.
+
+    Returns an empty list when the request raises or yields no items.
+    """
+    search_terms = f'repo:{GH_REPO} label:"validator-internal" is:issue'
+    encoded = urllib.parse.quote(search_terms)
+    try:
+        # NOTE(review): the leading "/../../" presumably climbs out of a
+        # repo-scoped base path that `_gh_request` prepends; confirm.
+        payload = _gh_request("GET", f"/../../search/issues?q={encoded}&per_page=100")
+    except Exception:
+        # Best-effort: a failed lookup is reported as "no existing issues".
+        return []
+    if not payload:
+        return []
+    return payload.get("items") or []
+def _execute_decisions(decisions: list[dict], by_pair: dict, dataset: str,
+                       profile: str, total: int, log_fn) -> dict:
+    """Carry out the agent's decisions and return counters.
+
+    Decisions run in three passes: `create` first (so new issue numbers
+    are known), then `comment` (so placeholder cross-references
+    resolve), then `skip`, which is only logged and counted.
+
+    Args:
+        decisions: Decision dicts from the agent. Entries that are not
+            dicts, or whose `action` is not create/comment/skip, are
+            logged and ignored.
+        by_pair: Unused here; kept for signature compatibility.
+        dataset: Dataset name, used in fallback titles and bodies.
+        profile: Profile name, used in fallback bodies.
+        total: Unused here; kept for signature compatibility.
+        log_fn: Callable taking one string.
+
+    Returns:
+        ``{"created", "updated", "skipped", "aborted_404"}``.
+        ``aborted_404`` is set when a GitHub call fails with an error
+        whose text contains "404"; no further create/comment decisions
+        are attempted after that.
+    """
+    out = log_fn
+    created = updated = skipped = 0
+    aborted = False
+
+    def _why(d, limit=None):
+        # `reasoning` can be absent or null in the model's payload;
+        # slicing None would raise, so normalise to a string first.
+        text = str(d.get("reasoning") or "")
+        return text if limit is None else text[:limit]
+
+    buckets = {"create": [], "comment": [], "skip": []}
+    for d in decisions or []:
+        action = d.get("action") if isinstance(d, dict) else None
+        if isinstance(action, str) and action in buckets:
+            buckets[action].append(d)
+        else:
+            out(f"  ! ignoring malformed decision (action={action!r})")
+
+    # Map negative placeholder numbers → real issue numbers as `create`
+    # decisions resolve: -N refers to the N-th create decision. The
+    # position comes from enumerate, not list.index, so two create
+    # decisions that compare equal still get distinct placeholders.
+    placeholder_to_real: dict[int, int] = {}
+
+    for position, d in enumerate(buckets["create"], start=1):
+        if aborted:
+            break
+        title = d.get("title") or f"[validator-internal] {dataset} :: {d.get('rule')} / {d.get('code')}"
+        body = d.get("body") or "(agent did not provide a body)"
+        # Only the GitHub calls sit inside the try: a failure while
+        # logging must not be reported as a failed create after the
+        # issue already exists.
+        try:
+            existing = _find_issue(title)
+            if existing:
+                num = existing["number"]
+                _add_comment(num,
+                             f"_Re-hit during validation of `{dataset}` "
+                             f"(profile `{profile}`)._\n\n{body}")
+            else:
+                num = _create_issue(title, body, ["validator-internal", "process", "agent-reviewed"])
+        except Exception as e:
+            msg = f"{type(e).__name__}: {e}"
+            if "404" in msg:
+                out(f"  ! internal-issue tracking aborted (404 — token lacks issues:write on {GH_REPO})")
+                aborted = True
+            else:
+                out(f"  ! create failed for {title!r}: {msg}")
+            continue
+        placeholder_to_real[-position] = num
+        if existing:
+            updated += 1
+            out(f"  internal-issue #{num}: comment added "
+                f"(agent: create→existing match by title)")
+        else:
+            created += 1
+            out(f"  internal-issue #{num}: opened ({title!r}) — agent reasoning: "
+                f"{_why(d, 160)}")
+
+    for d in buckets["comment"]:
+        if aborted:
+            break
+        target = d.get("target_issue")
+        if target is None:
+            out(f"  ! comment decision has no target_issue; skipping ({_why(d, 100)})")
+            continue
+        # Tool input is model-generated: reject anything that is not a
+        # non-zero integer before comparing or using it as an issue
+        # number (bool is an int subclass, so exclude it explicitly).
+        if isinstance(target, bool) or not isinstance(target, int) or target == 0:
+            out(f"  ! comment decision has invalid target_issue {target!r}; skipping")
+            continue
+        if target < 0:
+            target = placeholder_to_real.get(target)
+            if target is None:
+                out("  ! comment decision cross-references an unresolved placeholder; skipping")
+                continue
+        body = d.get("body") or (
+            f"Re-hit during validation of `{dataset}` (profile `{profile}`). "
+            f"Same underlying bug as this issue — see agent reasoning: "
+            f"{_why(d)}"
+        )
+        try:
+            _add_comment(target, body)
+        except Exception as e:
+            msg = f"{type(e).__name__}: {e}"
+            # NOTE(review): a comment aimed at an issue number that does
+            # not exist would presumably also surface as a 404 and abort
+            # the remaining decisions; confirm whether that is intended.
+            if "404" in msg:
+                out("  ! comment tracking aborted (404)")
+                aborted = True
+            else:
+                out(f"  ! comment failed for #{target}: {msg}")
+            continue
+        updated += 1
+        out(f"  internal-issue #{target}: comment added (agent: comment) — {_why(d, 120)}")
+
+    # Skips make no GitHub calls, so they are logged even after an abort.
+    for d in buckets["skip"]:
+        skipped += 1
+        out(f"  internal-issue {d.get('rule')}/{d.get('code')}: skipped — {_why(d, 160)}")
+
+    return {"created": created, "updated": updated, "skipped": skipped,
+            "aborted_404": aborted}
+def _ensure_internal_issues_simple(by_pair: dict, dataset: str, profile: str,
+                                    total: int, log_fn) -> dict:
+    """Fallback policy used when no agentic review is available.
+
+    Keeps one tracking issue per dataset, found by exact title
+    (`[validator-internal] <dataset>`): an existing issue gets a
+    recurrence comment, otherwise a new issue is opened. Every result
+    dict carries `"fallback": True`; failures are reported through
+    `"aborted_404"` or `"error"` instead of being raised.
+    """
+    emit = log_fn
+    issue_title = f"[validator-internal] {dataset}"
+    try:
+        match = _find_issue(issue_title)
+        if not match:
+            new_number = _create_issue(
+                issue_title,
+                _build_dataset_issue_body(by_pair, dataset, profile, total),
+                ["validator-internal", "process"],
+            )
+            emit(f"  internal-issue #{new_number}: opened for dataset {dataset} "
+                 f"({total} occurrences, {len(by_pair)} pairs) — fallback policy")
+            tally = {"created": 1, "updated": 0}
+        else:
+            _add_comment(
+                match["number"],
+                _build_dataset_recurrence_comment(by_pair, dataset, profile, total),
+            )
+            emit(f"  internal-issue #{match['number']}: comment added for dataset "
+                 f"{dataset} ({total} occurrences, {len(by_pair)} pairs) — fallback policy")
+            tally = {"created": 0, "updated": 1}
+        tally.update(pairs=len(by_pair), total=total, fallback=True)
+        return tally
+    except Exception as exc:
+        detail = f"{type(exc).__name__}: {exc}"
+        outcome = {"created": 0, "updated": 0}
+        # Any error text containing "404" is reported as a permissions
+        # abort rather than a generic failure.
+        if "404" in detail:
+            emit(f"  ! internal-issue tracking aborted (404 — token lacks issues:write on {GH_REPO})")
+            outcome["aborted_404"] = True
+        else:
+            emit(f"  ! internal-issue tracking for dataset {dataset} failed: {detail}")
+            outcome["error"] = detail
+        outcome["fallback"] = True
+        return outcome
 def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
                             log_fn=None) -> dict:
+    """Scan results.json for validator-internal bugs and route them to
+    GitHub issues via an agentic review pass (Claude) that classifies,
+    dedupes against existing issues, and writes plain-language
+    explanations. Falls back to a simple one-issue-per-dataset policy
+    if the agent is unavailable.
     Best-effort — swallowed exceptions return {"error": ...} so the
     validator's verdict is never blocked on GitHub being flaky."""
     if not by_pair:
         return {"created": 0, "updated": 0}
+    # Agentic path: Claude classifies + writes the issue body.
     try:
+        from agentic_issues import is_available as _agent_available, review_and_decide
     except Exception as e:
+        out(f"  (agentic_issues import failed: {type(e).__name__}: {str(e)[:120]}); "
+            f"using fallback policy")
+        return _ensure_internal_issues_simple(by_pair, dataset, profile, total, out)
+    if not _agent_available():
+        out("  (agentic review unavailable; using fallback policy)")
+        return _ensure_internal_issues_simple(by_pair, dataset, profile, total, out)
+    existing = _list_existing_internal_issues()
+    out(f"  agentic review: {len(by_pair)} group(s) vs {len(existing)} existing "
+        f"validator-internal issue(s)")
+    review = review_and_decide(by_pair, dataset, profile, total, existing, log_fn=out)
+    if review is None or not review.get("decisions"):
+        return _ensure_internal_issues_simple(by_pair, dataset, profile, total, out)
+    if review.get("summary"):
+        out(f"  agent summary: {review['summary'][:300]}")
+    result = _execute_decisions(review["decisions"], by_pair, dataset, profile, total, out)
+    result.update({"pairs": len(by_pair), "total": total, "agentic": True})
+    return result

tools/hf_space/requirements.txt CHANGED Viewed

@@ -11,6 +11,11 @@
 gradio>=4.0
 huggingface_hub>=0.34
 # Validator runtime — versions match
 # /home/horde/.simready/venv/lib/python3.12/site-packages/ on DGXC.
 usd-core==26.5

 gradio>=4.0
 huggingface_hub>=0.34
+# Agentic review of validator-internal bugs (github_issues.py).
+# Optional at runtime — when ANTHROPIC_API_KEY is unset the issue
+# filer falls back to the title-match policy.
+anthropic>=0.40
 # Validator runtime — versions match
 # /home/horde/.simready/venv/lib/python3.12/site-packages/ on DGXC.
 usd-core==26.5