File size: 3,030 Bytes
9ec4919
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
{
  "name": "Evaluation regression",
  "objective": "Detect regressions in agent behavior, connect them to recent prompt, context, or harness changes, and produce a verified repair proposal.",
  "trigger": {
    "type": "scheduled",
    "cadence_or_event": "Nightly after eval completion, and when benchmark scores drop or trace graders fail."
  },
  "intake": {
    "sources": ["eval run results", "failing task IDs", "trace samples", "baseline scores", "recent prompt and harness commits"],
    "selection_rule": "Investigate regression clusters with reproducible evidence; skip runs inside accepted variance or known-flaky sets."
  },
  "workspace": {
    "isolation": "Branch or sandbox with read access to traces and eval artifacts.",
    "allowed_actions": ["targeted eval reruns", "scorer inspection", "small prompt, context, or fixture patches", "report generation"],
    "disallowed_actions": ["scorer changes that hide failures", "benchmark cherry-picking", "broad prompt rewrites", "leaderboard claims"]
  },
  "context": {
    "required_files": ["evaluation rubric", "known-flaky eval list"],
    "runtime_sources": ["baseline traces", "current traces", "model and runtime configuration"]
  },
  "agents": [
    {
      "role": "Investigator",
      "responsibility": "Compare failing traces against passing baseline traces."
    },
    {
      "role": "Hypothesis writer",
      "responsibility": "Classify the likely cause: model behavior, context, tool, scorer, fixture, or harness."
    },
    {
      "role": "Implementer",
      "responsibility": "Patch the smallest plausible cause supported by trace evidence."
    },
    {
      "role": "Verifier",
      "responsibility": "Rerun targeted evals, then a smoke suite, and check for new regressions."
    },
    {
      "role": "Judge",
      "responsibility": "Decide whether evidence supports merging, deferring, or escalating."
    }
  ],
  "verification": {
    "gates": ["targeted failing tasks return to baseline", "no sentinel tasks regress", "trace evidence supports the claimed cause", "score deltas include run IDs and variance caveats"],
    "receipts": ["eval run IDs", "trace excerpts", "hypotheses considered", "rerun scores"]
  },
  "state": {
    "artifacts": ["regression investigation notes", "patch-attempt log"],
    "update_rule": "Record run IDs, failing tasks, hypotheses, patch attempts, rerun scores, and the final decision per regression cluster."
  },
  "budget": {
    "max_retries": 3,
    "max_runtime_minutes": 120
  },
  "escalation": {
    "conditions": ["scorer bug", "benchmark methodology change", "missing private traces", "model-provider incident", "fix risks overfitting"],
    "destination": "Issue or PR tagged for the eval owner, with reproducible evidence attached."
  },
  "exit": {
    "success": "The regression is repaired and confirmed by verified reruns, or it is classified as flaky or as scorer drift, with supporting evidence.",
    "stop_without_success": "Required artifacts are missing, retries are exhausted, or the remaining fix involves a tradeoff that requires product judgment."
  }
}