# simready-validator / tools /hf_space /agentic_issues.py
# loginowskid's picture
# Sync from simready-oem-library-pm@02a6f31d
# 9e0af39 verified
"""Agentic review of validator-internal bug groups before they're filed
as GitHub issues.
Without agentic review, every distinct (rule, code) pair we see becomes
its own issue (or the dataset gets one big dump issue). That's noisy:
- The same underlying bug may surface under different (rule, code) pairs
→ semantically the same issue, but the title-match dedup misses it.
- Transient one-off errors get filed alongside real bugs.
- Auto-generated bodies are raw counts + sample strings; they don't
explain what the validator is actually misbehaving on.
This module asks Claude to look at the current run's error groups AND
the repository's existing validator-internal issues, then produces a
structured list of decisions (`comment` / `skip` / `create`) with
human-readable titles + bodies.
Best-effort: if the call fails (missing API key, rate limit, malformed
response), the caller falls back to the legacy title-match policy in
`github_issues.py::_ensure_internal_issues_simple`.
"""
from __future__ import annotations
import json
import os
import sys
from typing import Any
# Model identifier passed as `model=` to `client.messages.create` in
# `review_and_decide`.
MODEL = "claude-sonnet-4-6"
# Response-token cap passed as `max_tokens=` in that same call.
MAX_TOKENS = 4096
def _api_key() -> str | None:
    """Look up the API token from the environment.

    ANTHROPIC_API_KEY (the SDK's default variable) takes precedence when
    it is set to a non-empty value. Otherwise the value of
    NVIDIA_INFERENCE_TOKEN is returned as-is, which is None when that
    variable is unset. On NVIDIA-internal Spaces the token arrives under
    the latter name and the SDK is pointed at NVIDIA's Anthropic proxy via
    ANTHROPIC_BASE_URL, so no extra wiring is needed here.
    """
    preferred = os.environ.get("ANTHROPIC_API_KEY")
    if preferred:
        return preferred
    return os.environ.get("NVIDIA_INFERENCE_TOKEN")
def _client():
    """Build an Anthropic client, or return None when that isn't possible.

    The SDK import is deferred to call time so this module still loads
    when the anthropic package isn't installed (the caller probes this).
    None is returned when no API token is configured or when the import
    fails.
    """
    token = _api_key()
    if not token:
        return None
    try:
        from anthropic import Anthropic
    except ImportError:
        return None
    else:
        return Anthropic(api_key=token)
def is_available() -> bool:
    """Return True when `_client()` can produce a client, else False."""
    client = _client()
    return client is not None
# Tool definition handed to the model via `tools=[REVIEW_TOOL]` in
# `review_and_decide`, where `tool_choice` forces a call to it by name.
# `input_schema` is a JSON-Schema object describing the structured result:
# a plain-language `summary` plus a `decisions` array.
REVIEW_TOOL = {
    "name": "submit_decisions",
    "description": (
        "Submit one decision per (rule, code) error group. "
        "Each decision is either 'comment' on an existing issue, "
        "'skip' if it's noise, or 'create' a single new issue. "
        "Multiple groups may be consolidated by pointing several "
        "decisions at the same target issue or by emitting one "
        "'create' decision with body text that explains multiple "
        "related groups."
    ),
    "input_schema": {
        "type": "object",
        "properties": {
            "summary": {
                "type": "string",
                "description": "One short paragraph summarizing what the validator's actually misbehaving on, in plain language.",
            },
            "decisions": {
                "type": "array",
                "description": "One entry per (rule, code) group from the input.",
                "items": {
                    "type": "object",
                    "properties": {
                        # `rule` and `code` identify which input group the
                        # decision refers to.
                        "rule": {"type": "string"},
                        "code": {"type": "string"},
                        "action": {
                            "type": "string",
                            "enum": ["comment", "skip", "create"],
                        },
                        # The three fields below are nullable and are not
                        # listed in `required`; which ones matter depends
                        # on `action` (see each description).
                        "target_issue": {
                            "type": ["integer", "null"],
                            "description": "GitHub issue number to comment on. Required when action='comment'; null otherwise.",
                        },
                        "title": {
                            "type": ["string", "null"],
                            "description": "Issue title to use when action='create'. Should start with '[validator-internal]' and read as a one-line plain-language summary.",
                        },
                        "body": {
                            "type": ["string", "null"],
                            "description": "Issue body (action='create') or comment body (action='comment'). Markdown, plain language, explains what the bug is and why these errors are evidence of it.",
                        },
                        "reasoning": {
                            "type": "string",
                            "description": "Short note (1-2 sentences) explaining why this action was chosen.",
                        },
                    },
                    "required": ["rule", "code", "action", "reasoning"],
                },
            },
        },
        "required": ["summary", "decisions"],
    },
}
def _format_existing_issues(existing: list[dict]) -> str:
    """Render existing issues as a bullet list for the user prompt.

    Each issue becomes two lines: a header with its number, state ('?'
    when the key is absent) and repr'd title, followed by an excerpt of
    the body (first 400 characters, newlines flattened to spaces; a
    missing or None body yields an empty excerpt). An empty input returns
    a fixed placeholder sentence instead of a list.
    """
    if not existing:
        return "(none — this is the first time validator-internal issues are being filed in this repo)"

    def _render(issue: dict) -> str:
        excerpt = (issue.get("body") or "")[:400].replace("\n", " ")
        header = f"- #{issue['number']} [{issue.get('state', '?')}] {issue['title']!r}"
        return f"{header}\n body excerpt: {excerpt}"

    return "\n".join(_render(issue) for issue in existing)
def _format_groups(by_pair: dict, total: int) -> str:
    """Render the run's (rule, code) error groups as a bullet list.

    Groups are listed by descending ``count``. Each entry shows the rule,
    code, severity ('?' when missing or falsy), its count against
    ``total``, and a repr'd sample message with newlines flattened to
    spaces and truncated to 300 characters.

    A group with a missing or None ``sample_msg`` or ``count`` is rendered
    with an empty sample / a count of 0 rather than raising, so one
    incomplete group does not abort the whole review. This matches the
    tolerant reads already used for ``severity`` here and for issue bodies
    in ``_format_existing_issues``.

    Args:
        by_pair: Mapping of ``(rule, code)`` tuples to group dicts.
        total: Total number of internal-error occurrences in the run.

    Returns:
        The formatted list, or an empty string when ``by_pair`` is empty.
    """
    ranked = sorted(
        by_pair.items(),
        key=lambda kv: -(kv[1].get("count") or 0),
    )
    lines = []
    for (rule, code), group in ranked:
        # str() keeps a non-string sample usable instead of failing on
        # .replace().
        sample = str(group.get("sample_msg") or "").replace("\n", " ")[:300]
        lines.append(
            f"- rule={rule!r} code={code!r} severity={group.get('severity') or '?'}\n"
            f" count={group.get('count') or 0} of {total} total internal occurrences\n"
            f" sample message: {sample!r}"
        )
    return "\n".join(lines)
# System prompt passed as `system=` to `client.messages.create` in
# `review_and_decide`. It tells the model how to map each (rule, code)
# group to a `comment` / `skip` / `create` decision. Trailing backslashes
# inside the literal are line continuations, so each wrapped sentence is
# a single line in the runtime string.
SYSTEM_PROMPT = """You are reviewing validator-internal bug reports from \
the SimReady asset validator before they're filed as GitHub issues on \
NVIDIA-dev/simready-oem-library-pm.
Context:
- The validator runs the foundation spec rules against customer datasets.
- Some errors it emits don't map to real spec violations — they're \
bugs in the validator's own rule registration, spec loading, plugin \
discovery, or asset traversal. Those are "validator-internal" bugs and \
this repo tracks them as GitHub issues.
- Examples of validator-internal codes: UNKNOWN, SDK.*, and any \
message containing "Uncaught error" or "is not registered to \
requirement".
Your job, for each (rule, code) error group from the current validation \
run:
1. Check the existing validator-internal issues. If the group matches \
one of them semantically (not just by title), produce a `comment` \
decision pointing at that issue. Multiple groups may map to the same \
existing issue.
2. If the group looks like a transient one-off (very low count, no \
recognizable failure mode, message looks like a stray traceback that \
won't recur), produce a `skip` decision.
3. Otherwise, produce a `create` decision with a plain-language title \
(starts with '[validator-internal]') and a body that explains what the \
validator is actually misbehaving on (don't just paraphrase the rule \
name — interpret what failed). If two or three groups are clearly the \
same underlying bug surfacing under different rule names, consolidate \
into a single `create` decision and route the other groups as \
`comment` decisions targeting that issue once it's filed (use a \
placeholder negative number like -1, -2 etc. for cross-references — \
the caller will resolve them after creation order).
Be conservative: prefer `comment` over `create`, prefer `skip` only \
when there's clear noise signal. Better to have one well-written \
issue than five sparse ones."""
def _build_user_prompt(by_pair: dict, dataset: str, profile: str,
                       total: int, existing: list[dict]) -> str:
    """Assemble the user-turn message for the review request.

    The message carries a short run header (dataset, profile, total
    occurrences, number of distinct groups), the formatted error groups,
    the formatted existing issues, and a closing instruction to call the
    submit_decisions tool.
    """
    group_count = len(by_pair)
    groups_section = _format_groups(by_pair, total)
    issues_section = _format_existing_issues(existing)
    parts = [
        f"Dataset: `{dataset}`",
        f"Profile: `{profile}`",
        f"Total internal-error occurrences in this run: {total}",
        f"Distinct (rule, code) groups: {group_count}",
        "# Current run's error groups",
        groups_section,
        "# Existing validator-internal issues in the repo",
        issues_section,
        "Call the submit_decisions tool with one decision per group above.",
    ]
    return "\n".join(parts)
def review_and_decide(by_pair: dict, dataset: str, profile: str,
                      total: int, existing_issues: list[dict],
                      log_fn=None) -> dict | None:
    """Run the agentic review and return the model's decisions.

    Sends the run's error groups and the repo's existing issues to the
    model with a forced ``submit_decisions`` tool call, then returns that
    tool call's input.

    Args:
        by_pair: Mapping of ``(rule, code)`` to group info for this run.
        dataset: Dataset name, shown in the prompt.
        profile: Validation profile name, shown in the prompt.
        total: Total internal-error occurrences in this run.
        existing_issues: Existing validator-internal issues to match
            against.
        log_fn: Optional callable taking one string; defaults to a
            flushing ``print``.

    Returns:
        The decisions dict (matching the submit_decisions tool's
        input_schema), or None when the review is unavailable, the call
        fails, or the response is truncated or malformed. This function
        never raises; None is the caller's signal to fall back.
    """
    out = log_fn or (lambda s: print(s, flush=True))
    client = _client()
    if client is None:
        out(" (agentic review unavailable: ANTHROPIC_API_KEY unset or anthropic SDK missing)")
        return None
    try:
        msg = client.messages.create(
            model=MODEL,
            max_tokens=MAX_TOKENS,
            system=SYSTEM_PROMPT,
            tools=[REVIEW_TOOL],
            tool_choice={"type": "tool", "name": "submit_decisions"},
            messages=[{
                "role": "user",
                "content": _build_user_prompt(by_pair, dataset, profile, total, existing_issues),
            }],
        )
        # A response cut off at the token cap can leave the tool call's
        # JSON input incomplete, so don't act on partial decisions.
        if getattr(msg, "stop_reason", None) == "max_tokens":
            out(" ! agentic review hit max_tokens (tool input may be truncated); falling back")
            return None
        for block in msg.content:
            if getattr(block, "type", None) == "tool_use" and block.name == "submit_decisions":
                payload = block.input
                # Only hand back something the caller can iterate: a dict
                # carrying a `decisions` list, as the schema requires.
                if isinstance(payload, dict) and isinstance(payload.get("decisions"), list):
                    return payload
                out(" ! agentic review returned malformed tool input; falling back")
                return None
        out(" ! agentic review returned no tool_use block; falling back")
        return None
    except Exception as e:
        # Deliberately broad: this is a best-effort boundary, and any
        # failure here should log and fall back rather than propagate.
        out(f" ! agentic review failed ({type(e).__name__}: {str(e)[:200]}); falling back")
        return None