awesome-loop-engineering / examples /evaluation-regression-loop.json

Sync awesome-loop-engineering

9ec4919 verified 1 day ago

3.03 kB

	{
	"name": "Evaluation regression",
	"objective": "Detect regressions in agent behavior, connect them to recent prompt, context, or harness changes, and produce a verified repair proposal.",
	"trigger": {
	"type": "scheduled",
	"cadence_or_event": "Nightly after eval completion, and when benchmark scores drop or trace graders fail."
	},
	"intake": {
	"sources": ["eval run results", "failing task IDs", "trace samples", "baseline scores", "recent prompt and harness commits"],
	"selection_rule": "Investigate regression clusters with reproducible evidence; skip runs inside accepted variance or known-flaky sets."
	},
	"workspace": {
	"isolation": "Branch or sandbox with read access to traces and eval artifacts.",
	"allowed_actions": ["targeted eval reruns", "scorer inspection", "small prompt, context, or fixture patches", "report generation"],
	"disallowed_actions": ["scorer changes that hide failures", "benchmark cherry-picking", "broad prompt rewrites", "leaderboard claims"]
	},
	"context": {
	"required_files": ["evaluation rubric", "known-flaky eval list"],
	"runtime_sources": ["baseline traces", "current traces", "model and runtime configuration"]
	},
	"agents": [
	{
	"role": "Investigator",
	"responsibility": "Compare failing traces against passing baseline traces."
	},
	{
	"role": "Hypothesis writer",
	"responsibility": "Classify the likely cause: model behavior, prompt, context, tool, scorer, fixture, or harness."
	},
	{
	"role": "Implementer",
	"responsibility": "Patch the smallest plausible cause supported by trace evidence."
	},
	{
	"role": "Verifier",
	"responsibility": "Rerun targeted evals, then a smoke suite, and check for new regressions."
	},
	{
	"role": "Judge",
	"responsibility": "Decide whether evidence supports merging, deferring, or escalating."
	}
	],
	"verification": {
	"gates": ["targeted failing tasks return to baseline", "no sentinel tasks regress", "trace evidence supports the claimed cause", "score deltas include run IDs and variance caveats"],
	"receipts": ["eval run IDs", "trace excerpts", "hypotheses considered", "rerun scores"]
	},
	"state": {
	"artifacts": ["regression investigation notes", "patch-attempt log"],
	"update_rule": "Record run IDs, failing tasks, hypotheses, patch attempts, rerun scores, and the final decision per regression cluster."
	},
	"budget": {
	"max_retries": 3,
	"max_runtime_minutes": 120
	},
	"escalation": {
	"conditions": ["scorer bug", "benchmark methodology change", "missing private traces", "model-provider incident", "fix risks overfitting"],
	"destination": "Issue or PR tagged for the eval owner with reproducible evidence"
	},
	"exit": {
	"success": "The regression is repaired with verified reruns, or classified as flaky or scorer drift with evidence.",
	"stop_without_success": "Artifacts are missing, the retry or runtime budget is exhausted, or the tradeoff requires product judgment."
	}
	}