{ "name": "Evaluation regression", "objective": "Detect regressions in agent behavior, connect them to recent prompt, context, or harness changes, and produce a verified repair proposal.", "trigger": { "type": "scheduled", "cadence_or_event": "Nightly after eval completion, and when benchmark scores drop or trace graders fail." }, "intake": { "sources": ["eval run results", "failing task IDs", "trace samples", "baseline scores", "recent prompt and harness commits"], "selection_rule": "Investigate regression clusters with reproducible evidence; skip runs inside accepted variance or known-flaky sets." }, "workspace": { "isolation": "Branch or sandbox with read access to traces and eval artifacts.", "allowed_actions": ["targeted eval reruns", "scorer inspection", "small prompt, context, or fixture patches", "report generation"], "disallowed_actions": ["scorer changes that hide failures", "benchmark cherry-picking", "broad prompt rewrites", "leaderboard claims"] }, "context": { "required_files": ["evaluation rubric", "known-flaky eval list"], "runtime_sources": ["baseline traces", "current traces", "model and runtime configuration"] }, "agents": [ { "role": "Investigator", "responsibility": "Compare failing traces against passing baseline traces." }, { "role": "Hypothesis writer", "responsibility": "Classify the likely cause: model behavior, prompt, context, tool, scorer, fixture, or harness." }, { "role": "Implementer", "responsibility": "Patch the smallest plausible cause supported by trace evidence." }, { "role": "Verifier", "responsibility": "Rerun targeted evals, then a smoke suite, and check for new regressions." }, { "role": "Judge", "responsibility": "Decide whether evidence supports merging, deferring, or escalating."
} ], "verification": { "gates": ["targeted failing tasks return to baseline", "no sentinel tasks regress", "trace evidence supports the claimed cause", "score deltas include run IDs and variance caveats"], "receipts": ["eval run IDs", "trace excerpts", "hypotheses considered", "rerun scores"] }, "state": { "artifacts": ["regression investigation notes", "patch-attempt log"], "update_rule": "Record run IDs, failing tasks, hypotheses, patch attempts, rerun scores, and the final decision per regression cluster." }, "budget": { "max_retries": 3, "max_runtime_minutes": 120 }, "escalation": { "conditions": ["scorer bug", "benchmark methodology change", "missing private traces", "model-provider incident", "fix risks overfitting"], "destination": "Issue or PR tagged for the eval owner with reproducible evidence" }, "exit": { "success": "The regression is repaired with verified reruns, or classified as flaky or scorer drift with evidence.", "stop_without_success": "Artifacts are missing, the retry or runtime budget is exhausted, or the tradeoff requires product judgment." } }