loginowskid committed on
Commit
f36b7a3
·
verified ·
1 Parent(s): d126183

Sync from simready-oem-library-pm@99650b92

Browse files
tools/hf_space/agentic_issues.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Agentic review of validator-internal bug groups before they're filed
2
+ as GitHub issues.
3
+
4
+ Without agentic review, every distinct (rule, code) pair we see becomes
5
+ its own issue (or the dataset gets one big dump issue). That's noisy:
6
+ - The same underlying bug may surface under different (rule, code) pairs
7
+ → semantically the same issue, but the title-match dedup misses it.
8
+ - Transient one-off errors get filed alongside real bugs.
9
+ - Auto-generated bodies are raw counts + sample strings; they don't
10
+ explain what the validator is actually misbehaving on.
11
+
12
+ This module asks Claude to look at the current run's error groups AND
13
+ the repository's existing validator-internal issues, then produces a
14
+ structured list of decisions (`comment` / `skip` / `create`) with
15
+ human-readable titles + bodies.
16
+
17
+ Best-effort: if the call fails (missing API key, rate limit, malformed
18
+ response), the caller falls back to the legacy title-match policy in
19
+ `github_issues.py::_ensure_internal_issues_simple`.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import os
25
+ import sys
26
+ from typing import Any
27
+
28
+ MODEL = "claude-sonnet-4-6"
29
+ MAX_TOKENS = 4096
30
+
31
+
32
def _client():
    """Build an Anthropic client, or return None when one can't be had.

    The SDK import is deferred to call time so this module still loads
    when the anthropic SDK isn't installed (the caller probes this).

    Returns:
        An ``anthropic.Anthropic`` instance, or None when
        ANTHROPIC_API_KEY is unset or blank, the SDK cannot be imported,
        or constructing the client raises.
    """
    # A whitespace-only key is as useless as a missing one.
    if not (os.environ.get("ANTHROPIC_API_KEY") or "").strip():
        return None
    try:
        from anthropic import Anthropic
    except ImportError:
        return None
    try:
        return Anthropic()
    except Exception:
        # This is a best-effort availability probe: a client that can't
        # be constructed means "unavailable", never a crash.
        return None
42
+
43
+
44
def is_available() -> bool:
    """Report whether `_client()` currently yields a client (not None)."""
    client = _client()
    return client is not None
46
+
47
+
48
# Tool definition handed to the model; its `input_schema` is the shape of
# the dict that `review_and_decide` returns. The `target_issue`
# description spells out the negative-placeholder convention so the
# model and the code that executes the decisions agree on what -1, -2,
# ... refer to.
REVIEW_TOOL = {
    "name": "submit_decisions",
    "description": (
        "Submit one decision per (rule, code) error group. "
        "Each decision is either 'comment' on an existing issue, "
        "'skip' if it's noise, or 'create' a single new issue. "
        "Multiple groups may be consolidated by pointing several "
        "decisions at the same target issue or by emitting one "
        "'create' decision with body text that explains multiple "
        "related groups."
    ),
    "input_schema": {
        "type": "object",
        "properties": {
            "summary": {
                "type": "string",
                "description": "One short paragraph summarizing what the validator's actually misbehaving on, in plain language.",
            },
            "decisions": {
                "type": "array",
                "description": "One entry per (rule, code) group from the input.",
                "items": {
                    "type": "object",
                    "properties": {
                        "rule": {"type": "string"},
                        "code": {"type": "string"},
                        "action": {
                            "type": "string",
                            "enum": ["comment", "skip", "create"],
                        },
                        "target_issue": {
                            "type": ["integer", "null"],
                            "description": (
                                "GitHub issue number to comment on. Required when action='comment'; null otherwise. "
                                "To comment on an issue that one of this response's own 'create' decisions will open, "
                                "use a negative placeholder: -1 for the first 'create' decision in this list, "
                                "-2 for the second, and so on."
                            ),
                        },
                        "title": {
                            "type": ["string", "null"],
                            "description": "Issue title to use when action='create'. Should start with '[validator-internal]' and read as a one-line plain-language summary.",
                        },
                        "body": {
                            "type": ["string", "null"],
                            "description": "Issue body (action='create') or comment body (action='comment'). Markdown, plain language, explains what the bug is and why these errors are evidence of it.",
                        },
                        "reasoning": {
                            "type": "string",
                            "description": "Short note (1-2 sentences) explaining why this action was chosen.",
                        },
                    },
                    "required": ["rule", "code", "action", "reasoning"],
                },
            },
        },
        "required": ["summary", "decisions"],
    },
}
102
+
103
+
104
def _format_existing_issues(existing: list[dict]) -> str:
    """Render existing issues as a bullet list for the user prompt.

    Args:
        existing: Issue dicts; each must carry "number" and "title",
            and may carry "state" and "body".

    Returns:
        One two-line entry per issue (number, state, quoted title, then
        a body excerpt of at most 400 characters), joined by newlines,
        or a fixed "(none ...)" sentence when `existing` is empty.
    """
    if not existing:
        return "(none — this is the first time validator-internal issues are being filed in this repo)"
    lines = []
    for issue in existing:
        # Collapse every whitespace run (handles \r\n and tabs, not just
        # \n) BEFORE truncating, so the 400-char budget is spent on
        # content and the excerpt stays on a single line.
        excerpt = " ".join((issue.get("body") or "").split())[:400]
        lines.append(
            f"- #{issue['number']} [{issue.get('state', '?')}] {issue['title']!r}\n"
            f" body excerpt: {excerpt}"
        )
    return "\n".join(lines)
115
+
116
+
117
def _format_groups(by_pair: dict, total: int) -> str:
    """Render the run's error groups as a bullet list for the user prompt.

    Args:
        by_pair: Maps ``(rule, code)`` tuples to group dicts. The keys
            "count", "sample_msg" and "severity" are read; all three are
            optional here.
        total: Total internal-error occurrences, echoed on every entry.

    Returns:
        One three-line entry per group, highest count first, joined by
        newlines (empty string for an empty mapping).
    """
    lines = []
    # Missing/None counts sort as 0 rather than raising.
    ranked = sorted(by_pair.items(), key=lambda kv: -(kv[1].get("count") or 0))
    for (rule, code), g in ranked:
        # A group without a usable sample message must not raise: this
        # runs while the prompt is being built, so an exception here
        # would cost the whole agentic review.
        sample = str(g.get("sample_msg") or "").replace("\n", " ")[:300]
        lines.append(
            f"- rule={rule!r} code={code!r} severity={g.get('severity') or '?'}\n"
            f" count={g.get('count') or 0} of {total} total internal occurrences\n"
            f" sample message: {sample!r}"
        )
    return "\n".join(lines)
127
+
128
+
129
# System prompt for the review call. Trailing backslashes join physical
# lines, so each paragraph reaches the model as one logical line. The
# placeholder rule at the end of step 3 is stated precisely (-1 = first
# `create` decision, -2 = second, ...) because cross-references are
# resolved by the position of each `create` decision.
SYSTEM_PROMPT = """You are reviewing validator-internal bug reports from \
the SimReady asset validator before they're filed as GitHub issues on \
NVIDIA-dev/simready-oem-library-pm.

Context:
- The validator runs the foundation spec rules against customer datasets.
- Some errors it emits don't map to real spec violations — they're \
bugs in the validator's own rule registration, spec loading, plugin \
discovery, or asset traversal. Those are "validator-internal" bugs and \
this repo tracks them as GitHub issues.
- Examples of validator-internal codes: UNKNOWN, SDK.*, and any \
message containing "Uncaught error" or "is not registered to \
requirement".

Your job, for each (rule, code) error group from the current validation \
run:
1. Check the existing validator-internal issues. If the group matches \
one of them semantically (not just by title), produce a `comment` \
decision pointing at that issue. Multiple groups may map to the same \
existing issue.
2. If the group looks like a transient one-off (very low count, no \
recognizable failure mode, message looks like a stray traceback that \
won't recur), produce a `skip` decision.
3. Otherwise, produce a `create` decision with a plain-language title \
(starts with '[validator-internal]') and a body that explains what the \
validator is actually misbehaving on (don't just paraphrase the rule \
name — interpret what failed). If two or three groups are clearly the \
same underlying bug surfacing under different rule names, consolidate \
into a single `create` decision and route the other groups as \
`comment` decisions targeting that issue once it's filed. To \
reference an issue that doesn't exist yet, set `target_issue` to a \
negative placeholder: -1 means the first `create` decision in your \
`decisions` list, -2 the second, and so on — the caller swaps in the \
real issue number after filing.

Be conservative: prefer `comment` over `create`, prefer `skip` only \
when there's clear noise signal. Better to have one well-written \
issue than five sparse ones."""
165
+
166
+
167
def _build_user_prompt(by_pair: dict, dataset: str, profile: str,
                       total: int, existing: list[dict]) -> str:
    """Assemble the user message for the review call.

    The message is a short header (dataset, profile, totals), the
    formatted error groups, the formatted existing issues, and a closing
    instruction to call the submit_decisions tool.
    """
    groups_text = _format_groups(by_pair, total)
    issues_text = _format_existing_issues(existing)
    sections = [
        f"Dataset: `{dataset}`",
        f"Profile: `{profile}`",
        f"Total internal-error occurrences in this run: {total}",
        f"Distinct (rule, code) groups: {len(by_pair)}",
        "",
        "# Current run's error groups",
        "",
        groups_text,
        "",
        "# Existing validator-internal issues in the repo",
        "",
        issues_text,
        "",
        "Call the submit_decisions tool with one decision per group above.",
    ]
    return "\n".join(sections)
183
+
184
+
185
def review_and_decide(by_pair: dict, dataset: str, profile: str,
                      total: int, existing_issues: list[dict],
                      log_fn=None) -> dict | None:
    """Run the agent and return its decisions, or None on any failure.

    Args:
        by_pair: ``(rule, code)`` -> group dict for the current run.
        dataset: Dataset name, echoed into the prompt.
        profile: Validation profile name, echoed into the prompt.
        total: Total internal-error occurrences in the run.
        existing_issues: Existing issue dicts to dedupe against.
        log_fn: Callable taking one string; defaults to a flushing print.

    Returns:
        The decisions dict (matching the submit_decisions tool's
        input_schema, with "decisions" guaranteed to be a list), or None
        if the client is unavailable, the call raised, the output was
        cut off by the token cap, or the payload was malformed. Never
        raises for failures of the API call itself.
    """
    out = log_fn or (lambda s: print(s, flush=True))
    client = _client()
    if client is None:
        out(" (agentic review unavailable: ANTHROPIC_API_KEY unset or anthropic SDK missing)")
        return None
    try:
        msg = client.messages.create(
            model=MODEL,
            max_tokens=MAX_TOKENS,
            system=SYSTEM_PROMPT,
            tools=[REVIEW_TOOL],
            # Force the tool call so the reply is structured, not prose.
            tool_choice={"type": "tool", "name": "submit_decisions"},
            messages=[{
                "role": "user",
                "content": _build_user_prompt(by_pair, dataset, profile, total, existing_issues),
            }],
        )
        # A reply that stopped on the output cap may carry an incomplete
        # tool input; acting on a partial decision list is worse than
        # falling back.
        if getattr(msg, "stop_reason", None) == "max_tokens":
            out(f" ! agentic review output hit the {MAX_TOKENS}-token cap (possibly truncated); falling back")
            return None
        for part in msg.content:
            if getattr(part, "type", None) == "tool_use" and part.name == "submit_decisions":
                payload = part.input
                # Model output is untrusted: only hand back a dict whose
                # "decisions" is a list, since callers iterate it.
                if isinstance(payload, dict) and isinstance(payload.get("decisions"), list):
                    return payload
                out(" ! agentic review returned a malformed submit_decisions payload; falling back")
                return None
        out(" ! agentic review returned no tool_use block; falling back")
        return None
    except Exception as e:
        out(f" ! agentic review failed ({type(e).__name__}: {str(e)[:200]}); falling back")
        return None
tools/hf_space/github_issues.py CHANGED
@@ -125,12 +125,137 @@ def _build_dataset_recurrence_comment(by_pair: dict, dataset: str, profile: str,
125
  )
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
129
  log_fn=None) -> dict:
130
- """Scan results.json for validator-internal bugs and ensure exactly ONE
131
- tracking GitHub issue exists per dataset. The issue lists every
132
- distinct (rule, code) pair found across the whole dataset; re-runs
133
- add a comment with the new counts instead of opening duplicates.
 
134
 
135
  Best-effort — swallowed exceptions return {"error": ...} so the
136
  validator's verdict is never blocked on GitHub being flaky."""
@@ -160,26 +285,27 @@ def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
160
  if not by_pair:
161
  return {"created": 0, "updated": 0}
162
 
163
- title = f"[validator-internal] {dataset}"
164
  try:
165
- existing = _find_issue(title)
166
- if existing:
167
- _add_comment(existing["number"],
168
- _build_dataset_recurrence_comment(by_pair, dataset, profile, total))
169
- out(f" internal-issue #{existing['number']}: comment added for dataset "
170
- f"{dataset} ({total} occurrences, {len(by_pair)} pairs)")
171
- return {"created": 0, "updated": 1, "pairs": len(by_pair), "total": total}
172
- num = _create_issue(title,
173
- _build_dataset_issue_body(by_pair, dataset, profile, total),
174
- ["validator-internal", "process"])
175
- out(f" internal-issue #{num}: opened for dataset {dataset} "
176
- f"({total} occurrences, {len(by_pair)} pairs)")
177
- return {"created": 1, "updated": 0, "pairs": len(by_pair), "total": total}
178
  except Exception as e:
179
- msg = f"{type(e).__name__}: {e}"
180
- if "404" in msg:
181
- out(f" ! internal-issue tracking aborted (404 token lacks issues:write "
182
- f"on {GH_REPO})")
183
- return {"created": 0, "updated": 0, "aborted_404": True}
184
- out(f" ! internal-issue tracking for dataset {dataset} failed: {msg}")
185
- return {"created": 0, "updated": 0, "error": msg}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  )
126
 
127
 
128
def _list_existing_internal_issues() -> list[dict]:
    """Fetch validator-internal issues (open and closed) for semantic dedupe.

    Closed issues are included because the same bug can come back after
    a fix is reverted or after a regression. Only one page of at most
    100 results is requested, ordered by most recently updated so that
    the cap drops the stalest issues first.

    Returns:
        The search result's "items" list, or [] when the request raises
        or the response doesn't have the expected shape.
    """
    q = urllib.parse.quote(
        f'repo:{GH_REPO} label:"validator-internal" is:issue')
    try:
        # NOTE(review): the leading "/../../" presumably climbs from a
        # repo-scoped base path inside _gh_request back to the API root
        # — confirm against _gh_request.
        result = _gh_request(
            "GET",
            f"/../../search/issues?q={q}&sort=updated&order=desc&per_page=100")
    except Exception:
        return []
    # Guard the shape: this runs on a best-effort path and must not
    # raise on an unexpected (non-dict) response.
    if not isinstance(result, dict):
        return []
    items = result.get("items")
    return items if isinstance(items, list) else []
140
+
141
+
142
def _execute_decisions(decisions: list[dict], by_pair: dict, dataset: str,
                       profile: str, total: int, log_fn) -> dict:
    """Carry out the agent's decisions. Returns counters.

    Runs in three phases: `create` decisions first (so their issue
    numbers are known), then `comment` decisions (so negative
    placeholder targets can be resolved), then `skip` decisions (which
    are only logged).

    Args:
        decisions: Decision dicts from the agent. Entries that are not
            dicts, or whose action is unrecognized, are logged and
            ignored.
        by_pair: Not read here; part of the established signature.
        dataset: Dataset name, used in fallback titles and comment text.
        profile: Validation profile name, used in comment text.
        total: Not read here; part of the established signature.
        log_fn: Callable taking one string.

    Returns:
        ``{"created", "updated", "skipped", "aborted_404"}``.
    """
    out = log_fn
    created = updated = skipped = 0
    aborted = False

    def _reason(d, limit=None):
        # "reasoning" comes from the model and may be absent or null.
        text = d.get("reasoning")
        text = text if isinstance(text, str) else ""
        return text if limit is None else text[:limit]

    # Model output is untrusted: drop anything that isn't a dict.
    usable = [d for d in decisions if isinstance(d, dict)]
    if len(usable) != len(decisions):
        out(f" ! ignoring {len(decisions) - len(usable)} malformed decision entr(y/ies)")

    # Map negative placeholder numbers → real issue numbers as `create`
    # decisions resolve. Placeholder -k refers to the k-th `create`
    # decision, in list order.
    placeholder_to_real: dict[int, int] = {}
    creates = [d for d in usable if d.get("action") == "create"]
    comments = [d for d in usable if d.get("action") == "comment"]
    skips = [d for d in usable if d.get("action") == "skip"]
    for d in usable:
        if d.get("action") not in ("create", "comment", "skip"):
            out(f" ! unknown action {d.get('action')!r} for "
                f"{d.get('rule')}/{d.get('code')}; ignoring")

    # enumerate gives each create its own position even when two
    # decisions compare equal (list.index would return the first match).
    for position, d in enumerate(creates, start=1):
        if aborted:
            break
        placeholder = -position
        title = d.get("title") or f"[validator-internal] {dataset} :: {d.get('rule')} / {d.get('code')}"
        body = d.get("body") or "(agent did not provide a body)"
        try:
            existing = _find_issue(title)
            if existing:
                _add_comment(existing["number"],
                             f"_Re-hit during validation of `{dataset}` "
                             f"(profile `{profile}`)._\n\n{body}")
                updated += 1
                placeholder_to_real[placeholder] = existing["number"]
                out(f" internal-issue #{existing['number']}: comment added "
                    f"(agent: create→existing match by title)")
            else:
                num = _create_issue(title, body, ["validator-internal", "process", "agent-reviewed"])
                created += 1
                # Record the number before logging so a failure while
                # logging can't orphan comments that reference it.
                placeholder_to_real[placeholder] = num
                out(f" internal-issue #{num}: opened ({title!r}) — agent reasoning: "
                    f"{_reason(d, 160)}")
        except Exception as e:
            msg = f"{type(e).__name__}: {e}"
            if "404" in msg:
                out(f" ! internal-issue tracking aborted (404 — token lacks issues:write on {GH_REPO})")
                aborted = True
            else:
                out(f" ! create failed for {title!r}: {msg}")

    for d in comments:
        if aborted:
            break
        raw_target = d.get("target_issue")
        if raw_target is None:
            out(f" ! comment decision has no target_issue; skipping ({_reason(d, 100)})")
            continue
        try:
            target = int(raw_target)
        except (TypeError, ValueError):
            out(f" ! comment decision has a non-numeric target_issue {raw_target!r}; skipping")
            continue
        if target < 0:
            target = placeholder_to_real.get(target)
            if target is None:
                out(" ! comment decision cross-references an unresolved placeholder; skipping")
                continue
        body = d.get("body") or (
            f"Re-hit during validation of `{dataset}` (profile `{profile}`). "
            f"Same underlying bug as this issue — see agent reasoning: "
            f"{_reason(d)}"
        )
        try:
            _add_comment(target, body)
            updated += 1
            out(f" internal-issue #{target}: comment added (agent: comment) — {_reason(d, 120)}")
        except Exception as e:
            msg = f"{type(e).__name__}: {e}"
            # NOTE(review): a 404 here could also mean the agent named an
            # issue number that doesn't exist, not only a permissions
            # problem — consider validating targets before aborting.
            if "404" in msg:
                out(" ! comment tracking aborted (404)")
                aborted = True
            else:
                out(f" ! comment failed for #{target}: {msg}")

    for d in skips:
        skipped += 1
        out(f" internal-issue {d.get('rule')}/{d.get('code')}: skipped — {_reason(d, 160)}")
    return {"created": created, "updated": updated, "skipped": skipped,
            "aborted_404": aborted}
217
+
218
+
219
def _ensure_internal_issues_simple(by_pair: dict, dataset: str, profile: str,
                                   total: int, log_fn) -> dict:
    """Fallback policy used when no agentic review happens.

    Keeps a single tracking issue per dataset, found by exact title
    match: comment on it if it exists, otherwise open it. Every returned
    dict carries ``"fallback": True``; failures are reported in the
    dict ("aborted_404" or "error") instead of being raised.
    """
    issue_title = f"[validator-internal] {dataset}"
    try:
        match = _find_issue(issue_title)
        if not match:
            number = _create_issue(
                issue_title,
                _build_dataset_issue_body(by_pair, dataset, profile, total),
                ["validator-internal", "process"],
            )
            log_fn(f" internal-issue #{number}: opened for dataset {dataset} "
                   f"({total} occurrences, {len(by_pair)} pairs) — fallback policy")
            return {"created": 1, "updated": 0, "pairs": len(by_pair), "total": total,
                    "fallback": True}
        _add_comment(
            match["number"],
            _build_dataset_recurrence_comment(by_pair, dataset, profile, total),
        )
        log_fn(f" internal-issue #{match['number']}: comment added for dataset "
               f"{dataset} ({total} occurrences, {len(by_pair)} pairs) — fallback policy")
        return {"created": 0, "updated": 1, "pairs": len(by_pair), "total": total,
                "fallback": True}
    except Exception as err:
        detail = f"{type(err).__name__}: {err}"
        if "404" not in detail:
            log_fn(f" ! internal-issue tracking for dataset {dataset} failed: {detail}")
            return {"created": 0, "updated": 0, "error": detail, "fallback": True}
        # A 404 is read as "token can't write issues on this repo".
        log_fn(f" ! internal-issue tracking aborted (404 — token lacks issues:write on {GH_REPO})")
        return {"created": 0, "updated": 0, "aborted_404": True, "fallback": True}
250
+
251
+
252
  def ensure_internal_issues(results_json: dict, dataset: str, profile: str,
253
  log_fn=None) -> dict:
254
+ """Scan results.json for validator-internal bugs and route them to
255
+ GitHub issues via an agentic review pass (Claude) that classifies,
256
+ dedupes against existing issues, and writes plain-language
257
+ explanations. Falls back to a simple one-issue-per-dataset policy
258
+ if the agent is unavailable.
259
 
260
  Best-effort — swallowed exceptions return {"error": ...} so the
261
  validator's verdict is never blocked on GitHub being flaky."""
 
285
  if not by_pair:
286
  return {"created": 0, "updated": 0}
287
 
288
+ # Agentic path: Claude classifies + writes the issue body.
289
  try:
290
+ from agentic_issues import is_available as _agent_available, review_and_decide
 
 
 
 
 
 
 
 
 
 
 
 
291
  except Exception as e:
292
+ out(f" (agentic_issues import failed: {type(e).__name__}: {str(e)[:120]}); "
293
+ f"using fallback policy")
294
+ return _ensure_internal_issues_simple(by_pair, dataset, profile, total, out)
295
+
296
+ if not _agent_available():
297
+ out(" (agentic review unavailable; using fallback policy)")
298
+ return _ensure_internal_issues_simple(by_pair, dataset, profile, total, out)
299
+
300
+ existing = _list_existing_internal_issues()
301
+ out(f" agentic review: {len(by_pair)} group(s) vs {len(existing)} existing "
302
+ f"validator-internal issue(s)")
303
+ review = review_and_decide(by_pair, dataset, profile, total, existing, log_fn=out)
304
+ if review is None or not review.get("decisions"):
305
+ return _ensure_internal_issues_simple(by_pair, dataset, profile, total, out)
306
+
307
+ if review.get("summary"):
308
+ out(f" agent summary: {review['summary'][:300]}")
309
+ result = _execute_decisions(review["decisions"], by_pair, dataset, profile, total, out)
310
+ result.update({"pairs": len(by_pair), "total": total, "agentic": True})
311
+ return result
tools/hf_space/requirements.txt CHANGED
@@ -11,6 +11,11 @@
11
  gradio>=4.0
12
  huggingface_hub>=0.34
13
 
 
 
 
 
 
14
  # Validator runtime — versions match
15
  # /home/horde/.simready/venv/lib/python3.12/site-packages/ on DGXC.
16
  usd-core==26.5
 
11
  gradio>=4.0
12
  huggingface_hub>=0.34
13
 
14
+ # Agentic review of validator-internal bugs (github_issues.py).
15
+ # Optional at runtime — when ANTHROPIC_API_KEY is unset the issue
16
+ # filer falls back to the title-match policy.
17
+ anthropic>=0.40
18
+
19
  # Validator runtime — versions match
20
  # /home/horde/.simready/venv/lib/python3.12/site-packages/ on DGXC.
21
  usd-core==26.5