Spaces:
Sleeping
Sleeping
| """Agentic review of validator-internal bug groups before they're filed | |
| as GitHub issues. | |
| Without agentic review, every distinct (rule, code) pair we see becomes | |
| its own issue (or the dataset gets one big dump issue). That's noisy: | |
| - The same underlying bug may surface under different (rule, code) pairs | |
| → semantically the same issue, but the title-match dedup misses it. | |
| - Transient one-off errors get filed alongside real bugs. | |
| - Auto-generated bodies are raw counts + sample strings; they don't | |
| explain what the validator is actually misbehaving on. | |
| This module asks Claude to look at the current run's error groups AND | |
| the repository's existing validator-internal issues, then produces a | |
| structured list of decisions (`comment` / `skip` / `create`) with | |
| human-readable titles + bodies. | |
| Best-effort: if the call fails (missing API key, rate limit, malformed | |
| response), the caller falls back to the legacy title-match policy in | |
| `github_issues.py::_ensure_internal_issues_simple`. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import os | |
| import sys | |
| from typing import Any | |
# Model identifier sent as `model=` on every review request.
MODEL: str = "claude-sonnet-4-6"
# Upper bound on generated tokens for the whole tool call (sent as `max_tokens=`).
MAX_TOKENS: int = 4096
def _api_key() -> str | None:
    """Look up the API token in the environment.

    ``ANTHROPIC_API_KEY`` (the SDK's default variable) takes precedence.
    When it is unset or empty, the value of ``NVIDIA_INFERENCE_TOKEN`` is
    returned as-is; that is the name under which the token is supplied on
    NVIDIA-internal Spaces, where the SDK is pointed at NVIDIA's Anthropic
    proxy through ``ANTHROPIC_BASE_URL`` (read by the SDK itself, so no
    wiring is needed here).
    """
    primary = os.environ.get("ANTHROPIC_API_KEY")
    if primary:
        return primary
    return os.environ.get("NVIDIA_INFERENCE_TOKEN")
def _client():
    """Return a ready ``Anthropic`` client, or ``None`` if one cannot be built.

    ``None`` is returned when no API token is configured or when the
    ``anthropic`` package cannot be imported. The import happens at call
    time so this module still loads without the SDK installed.
    """
    token = _api_key()
    if not token:
        return None
    try:
        from anthropic import Anthropic
    except ImportError:
        client = None
    else:
        client = Anthropic(api_key=token)
    return client
def is_available() -> bool:
    """Tell whether agentic review can run, i.e. a client can be constructed."""
    client = _client()
    return client is not None
def _review_tool_spec() -> dict:
    """Assemble the ``submit_decisions`` tool definition, innermost schema first."""
    # Fields of one decision. The nullable ones only matter for some actions,
    # which is why they are typed as ["<type>", "null"].
    decision_fields = {
        "rule": {"type": "string"},
        "code": {"type": "string"},
        "action": {
            "type": "string",
            "enum": ["comment", "skip", "create"],
        },
        "target_issue": {
            "type": ["integer", "null"],
            "description": "GitHub issue number to comment on. Required when action='comment'; null otherwise.",
        },
        "title": {
            "type": ["string", "null"],
            "description": "Issue title to use when action='create'. Should start with '[validator-internal]' and read as a one-line plain-language summary.",
        },
        "body": {
            "type": ["string", "null"],
            "description": "Issue body (action='create') or comment body (action='comment'). Markdown, plain language, explains what the bug is and why these errors are evidence of it.",
        },
        "reasoning": {
            "type": "string",
            "description": "Short note (1-2 sentences) explaining why this action was chosen.",
        },
    }
    decision_schema = {
        "type": "object",
        "properties": decision_fields,
        "required": ["rule", "code", "action", "reasoning"],
    }

    # Top-level tool input: a free-text summary plus the list of decisions.
    payload_schema = {
        "type": "object",
        "properties": {
            "summary": {
                "type": "string",
                "description": "One short paragraph summarizing what the validator's actually misbehaving on, in plain language.",
            },
            "decisions": {
                "type": "array",
                "description": "One entry per (rule, code) group from the input.",
                "items": decision_schema,
            },
        },
        "required": ["summary", "decisions"],
    }

    tool_description = " ".join((
        "Submit one decision per (rule, code) error group.",
        "Each decision is either 'comment' on an existing issue, "
        "'skip' if it's noise, or 'create' a single new issue.",
        "Multiple groups may be consolidated by pointing several decisions "
        "at the same target issue or by emitting one 'create' decision "
        "with body text that explains multiple related groups.",
    ))

    return {
        "name": "submit_decisions",
        "description": tool_description,
        "input_schema": payload_schema,
    }


# Tool definition handed to the Messages API; the model is forced to call it.
REVIEW_TOOL = _review_tool_spec()
def _format_existing_issues(existing: list[dict]) -> str:
    """Render the repo's existing issues as a bullet list for the prompt.

    Args:
        existing: Issue dicts. Each must carry ``"number"`` and ``"title"``;
            ``"state"`` and ``"body"`` are optional (a missing state is
            shown as ``?``, a missing or ``None`` body as an empty excerpt).

    Returns:
        One two-line entry per issue (number/state/title, then a body
        excerpt of at most 400 characters), joined by newlines; or a fixed
        placeholder sentence when *existing* is empty.
    """
    if not existing:
        return "(none — this is the first time validator-internal issues are being filed in this repo)"
    entries = []
    for issue in existing:
        # Collapse *all* whitespace (CR from CRLF bodies, tabs, blank lines)
        # so the excerpt is genuinely one line, and do it before truncating
        # so the 400-character budget is spent on content, not on padding.
        excerpt = " ".join((issue.get("body") or "").split())[:400]
        entries.append(
            f"- #{issue['number']} [{issue.get('state', '?')}] {issue['title']!r}\n"
            f" body excerpt: {excerpt}"
        )
    return "\n".join(entries)
def _format_groups(by_pair: dict, total: int) -> str:
    """Render the run's error groups as a bullet list, largest count first.

    *by_pair* maps ``(rule, code)`` tuples to group dicts that provide
    ``"count"`` and ``"sample_msg"`` and optionally ``"severity"``. Each
    group becomes a three-line entry; the sample message is flattened to a
    single line and cut to 300 characters. *total* is only echoed into the
    text.
    """
    def descending_count(entry):
        # Negated so the default ascending sort puts the biggest group first.
        return -entry[1]["count"]

    rendered = []
    for (rule, code), group in sorted(by_pair.items(), key=descending_count):
        flattened = group["sample_msg"].replace("\n", " ")
        excerpt = flattened[:300]
        severity = group.get("severity") or "?"
        rendered.append(
            f"- rule={rule!r} code={code!r} severity={severity}\n"
            f" count={group['count']} of {total} total internal occurrences\n"
            f" sample message: {excerpt!r}"
        )
    return "\n".join(rendered)
# System prompt for the review call. Written as adjacent string literals:
# a literal ending in "\n" closes a line of the prompt, any other literal
# simply continues the current line.
SYSTEM_PROMPT = (
    "You are reviewing validator-internal bug reports from "
    "the SimReady asset validator before they're filed as GitHub issues on "
    "NVIDIA-dev/simready-oem-library-pm.\n"
    "Context:\n"
    "- The validator runs the foundation spec rules against customer datasets.\n"
    "- Some errors it emits don't map to real spec violations — they're "
    "bugs in the validator's own rule registration, spec loading, plugin "
    "discovery, or asset traversal. Those are \"validator-internal\" bugs and "
    "this repo tracks them as GitHub issues.\n"
    "- Examples of validator-internal codes: UNKNOWN, SDK.*, and any "
    "message containing \"Uncaught error\" or \"is not registered to "
    "requirement\".\n"
    "Your job, for each (rule, code) error group from the current validation "
    "run:\n"
    "1. Check the existing validator-internal issues. If the group matches "
    "one of them semantically (not just by title), produce a `comment` "
    "decision pointing at that issue. Multiple groups may map to the same "
    "existing issue.\n"
    "2. If the group looks like a transient one-off (very low count, no "
    "recognizable failure mode, message looks like a stray traceback that "
    "won't recur), produce a `skip` decision.\n"
    "3. Otherwise, produce a `create` decision with a plain-language title "
    "(starts with '[validator-internal]') and a body that explains what the "
    "validator is actually misbehaving on (don't just paraphrase the rule "
    "name — interpret what failed). If two or three groups are clearly the "
    "same underlying bug surfacing under different rule names, consolidate "
    "into a single `create` decision and route the other groups as "
    "`comment` decisions targeting that issue once it's filed (use a "
    "placeholder negative number like -1, -2 etc. for cross-references — "
    "the caller will resolve them after creation order).\n"
    "Be conservative: prefer `comment` over `create`, prefer `skip` only "
    "when there's clear noise signal. Better to have one well-written "
    "issue than five sparse ones."
)
def _build_user_prompt(by_pair: dict, dataset: str, profile: str,
                       total: int, existing: list[dict]) -> str:
    """Compose the user-turn text for the review request.

    The message is a run header (dataset, profile, totals), the formatted
    error groups, the formatted existing issues, and a closing instruction
    to call the ``submit_decisions`` tool, one item per line.
    """
    prompt_lines = [
        f"Dataset: `{dataset}`",
        f"Profile: `{profile}`",
        f"Total internal-error occurrences in this run: {total}",
        f"Distinct (rule, code) groups: {len(by_pair)}",
        "# Current run's error groups",
        _format_groups(by_pair, total),
        "# Existing validator-internal issues in the repo",
        _format_existing_issues(existing),
        "Call the submit_decisions tool with one decision per group above.",
    ]
    return "\n".join(prompt_lines)
def _decisions_from_response(msg, out) -> dict | None:
    """Return the usable ``submit_decisions`` payload in *msg*, else ``None``.

    Every rejection is reported through *out* before ``None`` is returned.
    """
    # A reply that stopped at the token limit ends mid-tool-call, so its
    # input is incomplete; treat it as a failure rather than hand the
    # caller a partial decision list.
    if getattr(msg, "stop_reason", None) == "max_tokens":
        out(" ! agentic review response hit max_tokens (truncated); falling back")
        return None
    for block in msg.content:
        if getattr(block, "type", None) != "tool_use" or block.name != "submit_decisions":
            continue
        payload = block.input
        # Minimal shape check: callers iterate payload["decisions"].
        if isinstance(payload, dict) and isinstance(payload.get("decisions"), list):
            return payload
        out(" ! agentic review returned malformed tool input; falling back")
        return None
    out(" ! agentic review returned no tool_use block; falling back")
    return None


def review_and_decide(by_pair: dict, dataset: str, profile: str,
                      total: int, existing_issues: list[dict],
                      log_fn=None) -> dict | None:
    """Run the agent over the current run's error groups.

    Args:
        by_pair: Error groups keyed by ``(rule, code)``.
        dataset: Dataset name, echoed into the prompt.
        profile: Validation profile name, echoed into the prompt.
        total: Total internal-error occurrences in this run.
        existing_issues: Existing validator-internal issues to dedupe against.
        log_fn: Optional callable taking one string; defaults to a flushing
            ``print``.

    Returns:
        The decisions dict (matching the ``submit_decisions`` tool's
        ``input_schema``, with a list under ``"decisions"``), or ``None`` if
        the review is unavailable, the call failed, or the response was
        truncated or malformed. ``None`` tells the caller to fall back.
        This function never raises for API or response problems.
    """
    out = log_fn or (lambda s: print(s, flush=True))
    client = _client()
    if client is None:
        out(" (agentic review unavailable: ANTHROPIC_API_KEY / NVIDIA_INFERENCE_TOKEN unset or anthropic SDK missing)")
        return None
    # Deliberately broad: this is a best-effort boundary, so prompt-building
    # errors, transport/API errors and odd response shapes all mean
    # "fall back", never "crash the run".
    try:
        msg = client.messages.create(
            model=MODEL,
            max_tokens=MAX_TOKENS,
            system=SYSTEM_PROMPT,
            tools=[REVIEW_TOOL],
            # Force the structured tool call instead of a free-text answer.
            tool_choice={"type": "tool", "name": "submit_decisions"},
            messages=[{
                "role": "user",
                "content": _build_user_prompt(by_pair, dataset, profile, total, existing_issues),
            }],
        )
        return _decisions_from_response(msg, out)
    except Exception as e:
        out(f" ! agentic review failed ({type(e).__name__}: {str(e)[:200]}); falling back")
        return None