# autonomy-calibration-benchmark / tasks/devops_incident.py
# Rhythm@28
# deploy: final verified championship submission
# ef737d3
"""
tasks/devops_incident.py — Task 2: DevOps Incident Response (Epistemic RL v2.0)
─────────────────────────────────────────────────────────────────────────────
Design Principles:
- Visible alert is GENUINELY AMBIGUOUS: same "503 errors" can be deploy, DB, or OOM
- Hidden root cause is seed-determined probabilistically
- INVESTIGATE pulls detailed stack traces, heap dumps, and monitoring data
- Without investigating, agent must guess between equally plausible diagnoses
- 10 scenarios: 5 high-ambiguity, 3 medium, 2 clear
"""
from __future__ import annotations
import random
import hashlib
from typing import Optional
import sys as _sys, os as _os
_sys.path.insert(0, _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__))))
from models import Action, Observation, Reward
from tasks.base import BaseTask
from utils import clamp
from environment.calibration_reward import calibration_reward, investigation_reward
# ──────────────────────────────────────────────────────────────────────────────
# 10 SCENARIO CLASSES — same surface alert, different hidden root cause
# ──────────────────────────────────────────────────────────────────────────────
# Scenario table consumed by DevOpsIncidentTask.
#
# Each scenario dict carries:
#   "id"            – scenario identifier (also part of the hash key used to
#                     pick the hidden state).
#   "ambiguity"     – float passed to the calibration / investigation reward
#                     helpers and used to choose the telemetry placeholder text.
#   "visible_alert" – alert text shown to the agent before any investigation.
#   "hidden_states" – mapping of state label -> hidden root-cause record.
#
# Each hidden-state record carries:
#   "prob"              – selection weight; weights within a scenario sum to 1.0.
#   "root_cause"        – label exposed through state() once investigated.
#   "reveal"            – telemetry text returned by the "investigate" action.
#   "correct_diagnosis" / "correct_fix" – actions counted as correct for the
#                         calibration reward.
#   "diag_rewards" / "fix_rewards"      – base reward per action; actions that
#                         are not listed fall back to -0.10 in step().
_SCENARIO_CLASSES = [
    # ═══════════════════════════════════════════════════════════════════════════
    # HIGH AMBIGUITY (0.75–0.95): MULTIPLE DIAGNOSES EQUALLY PLAUSIBLE
    # Same alert metrics → different root causes → different fixes
    # ═══════════════════════════════════════════════════════════════════════════
    {
        "id": "dev_H001", "ambiguity": 0.90,
        "visible_alert": "API response time degraded 300% | Error rate: 4.2% | All services affected",
        "hidden_states": {
            "A": {
                "prob": 0.45, "root_cause": "db_connection_exhaustion",
                "reveal": "[TELEMETRY] DB connection pool: 500/500 EXHAUSTED | Slow query log: 847 queries > 10s | Top query: inventory scan (missing index) | CPU: 45% | Memory: 61%",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_kill_process",
                "diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_network_latency": 0.10, "diagnose_memory_leak": -0.15, "diagnose_cpu_spike": -0.20},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_rollback": 0.15, "fix_restart_service": 0.05, "fix_scale_up": -0.15},
            },
            "B": {
                "prob": 0.35, "root_cause": "traffic_spike",
                "reveal": "[TELEMETRY] Requests/min: 48,000 (baseline: 8,000) | Marketing campaign launched 14:00 | CPU: 89% | Memory: 72% | DB: healthy | Load balancer: saturated",
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_scale_up",
                "diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_db_lock": 0.10, "diagnose_memory_leak": -0.10, "diagnose_network_latency": -0.15},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": 0.10, "fix_kill_process": -0.15, "fix_rollback": -0.25},
            },
            "C": {
                "prob": 0.20, "root_cause": "bad_deploy",
                "reveal": "[TELEMETRY] Deploy v2.4.1 at 13:47 | Rollback available: v2.4.0 | Stack trace: NullPointerException in CartService:247 | CPU: 38% | Memory: 55%",
                # No dedicated "bad deploy" diagnosis exists in the action set,
                # so the best available diagnosis only earns 0.20 here.
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_cpu_spike": 0.20, "diagnose_db_lock": -0.15, "diagnose_memory_leak": -0.20, "diagnose_network_latency": -0.10},
                "fix_rewards": {"fix_rollback": 0.40, "fix_restart_service": 0.15, "fix_kill_process": -0.10, "fix_scale_up": -0.20},
            },
        },
    },
    {
        "id": "dev_H002", "ambiguity": 0.85,
        "visible_alert": "Memory utilization rising on WEB-01 | Current: 87% | Trend: +2% per hour",
        "hidden_states": {
            "A": {
                "prob": 0.55, "root_cause": "memory_leak",
                "reveal": "[TELEMETRY] RSS growing 180MB/hr | Heap dump: 2.1GB uncollected objects (SessionManager) | GC pause: 4.2s | OOM kill projected in 6.5 hours",
                "correct_diagnosis": "diagnose_memory_leak",
                "correct_fix": "fix_restart_service",
                "diag_rewards": {"diagnose_memory_leak": 0.40, "diagnose_cpu_spike": 0.05, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_kill_process": 0.15, "fix_scale_up": 0.05, "fix_rollback": -0.15},
            },
            "B": {
                "prob": 0.45, "root_cause": "legitimate_growth",
                "reveal": "[TELEMETRY] Cache warming after cold restart | Object counts stable | No leak detected | Growth expected: cron loaded 4.2GB dataset at 02:00 | Will plateau at 91%",
                "correct_diagnosis": "diagnose_cpu_spike",  # treat as normal load
                "correct_fix": "fix_scale_up",
                "diag_rewards": {"diagnose_cpu_spike": 0.30, "diagnose_memory_leak": -0.20, "diagnose_db_lock": -0.20, "diagnose_network_latency": 0.05},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": -0.20, "fix_kill_process": -0.30, "fix_rollback": -0.15},
            },
        },
    },
    {
        "id": "dev_H003", "ambiguity": 0.88,
        "visible_alert": "HTTP 503 errors: 12% of requests | Duration: 8 minutes | Upstream: payment-service",
        "hidden_states": {
            "A": {
                "prob": 0.50, "root_cause": "dependency_outage",
                # NOTE(review): the "→" sequence in the reveal text below looks
                # like a mis-encoded "→" — confirm against the upstream file and
                # repair the literal if so.
                "reveal": "[TELEMETRY] Stripe API: status.stripe.com shows DEGRADED | Circuit breaker: OPEN | Timeout: payment-service→stripe: 30.1s | Retries: 847 | Fallback: none configured",
                "correct_diagnosis": "diagnose_network_latency",
                "correct_fix": "fix_rollback",  # fallback mode / circuit breaker config
                "diag_rewards": {"diagnose_network_latency": 0.40, "diagnose_db_lock": 0.10, "diagnose_memory_leak": -0.15, "diagnose_cpu_spike": -0.10},
                "fix_rewards": {"fix_rollback": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_kill_process": -0.20},
            },
            "B": {
                "prob": 0.50, "root_cause": "config_change",
                "reveal": "[TELEMETRY] Config deploy 14:23 | payment-service TLS cert expired (changed hostname) | 503 returns \"certificate verify failed\" | No Stripe involvement",
                "correct_diagnosis": "diagnose_db_lock",  # config/cert issue
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_db_lock": 0.25, "diagnose_network_latency": 0.20, "diagnose_memory_leak": -0.20, "diagnose_cpu_spike": -0.15},
                "fix_rewards": {"fix_rollback": 0.40, "fix_restart_service": 0.15, "fix_kill_process": -0.15, "fix_scale_up": -0.20},
            },
        },
    },
    {
        "id": "dev_H004", "ambiguity": 0.80,
        "visible_alert": "CPU usage: 91% on all nodes | Duration: 22 minutes | No recent deploys",
        "hidden_states": {
            "A": {
                "prob": 0.60, "root_cause": "runaway_process",
                "reveal": "[TELEMETRY] top: report-generator PID 14882 consuming 380% CPU | Triggered by cron at 03:00 | Processing 90-day analytics dump (8TB) | Not rate-limited",
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_kill_process",
                "diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_memory_leak": 0.05, "diagnose_db_lock": -0.10, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_rollback": -0.20},
            },
            "B": {
                "prob": 0.40, "root_cause": "legitimate_batch",
                "reveal": "[TELEMETRY] Quarter-end financial report running (scheduled) | CFO requested manually | All queries healthy | Expected completion: 18 min | Kill would corrupt report",
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_scale_up",  # let it finish, add capacity
                "diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_memory_leak": -0.10, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_kill_process": -0.50, "fix_restart_service": -0.15, "fix_rollback": -0.20},
            },
        },
    },
    {
        "id": "dev_H005", "ambiguity": 0.78,
        "visible_alert": "Disk I/O wait: 68% on DB-PROD-01 | Latency: 340ms avg | Duration: 15 min",
        "hidden_states": {
            "A": {
                "prob": 0.55, "root_cause": "disk_failure",
                "reveal": "[TELEMETRY] SMART: 847 reallocated sectors (threshold: 5) | I/O errors in dmesg: 124 in last 10 min | RAID degraded | Backup disk available",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_restart_service",  # failover to replica
                "diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_memory_leak": -0.10, "diagnose_cpu_spike": 0.05, "diagnose_network_latency": -0.15},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_rollback": 0.15, "fix_scale_up": 0.05, "fix_kill_process": -0.20},
            },
            "B": {
                "prob": 0.45, "root_cause": "backup_running",
                "reveal": "[TELEMETRY] Daily backup process running (scheduled 03:00) | SMART: healthy (0 errors) | I/O wait normal for backup workload | Expected end: 40 min",
                "correct_diagnosis": "diagnose_network_latency",  # routine, no action needed
                "correct_fix": "fix_scale_up",  # temporary throttling
                "diag_rewards": {"diagnose_network_latency": 0.30, "diagnose_db_lock": -0.15, "diagnose_cpu_spike": 0.10, "diagnose_memory_leak": -0.20},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_kill_process": -0.15, "fix_restart_service": -0.25, "fix_rollback": -0.20},
            },
        },
    },
    # ═══════════════════════════════════════════════════════════════════════════
    # MEDIUM AMBIGUITY (0.40–0.65): INVESTIGATION HELPFUL
    # ═══════════════════════════════════════════════════════════════════════════
    {
        "id": "dev_M001", "ambiguity": 0.60,
        "visible_alert": "Database query response time: 2.8s avg (baseline: 180ms) | Duration: 12 min",
        "hidden_states": {
            "A": {
                "prob": 0.70, "root_cause": "table_lock",
                "reveal": "[TELEMETRY] SHOW PROCESSLIST: 94 queries WAITING on table lock | Long transaction: analytics-job (running 47min) | Blocking all writes to orders table",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_kill_process",
                "diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_memory_leak": -0.10, "diagnose_cpu_spike": -0.10, "diagnose_network_latency": 0.05},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_restart_service": 0.10, "fix_scale_up": -0.10, "fix_rollback": -0.20},
            },
            "B": {
                "prob": 0.30, "root_cause": "missing_index",
                "reveal": "[TELEMETRY] EXPLAIN shows full table scan: orders (220M rows) | New query pattern after feature release v3.1.2 | Index: orders_user_id missing",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_rollback",  # rollback the feature
                "diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_network_latency": 0.10, "diagnose_cpu_spike": -0.10, "diagnose_memory_leak": -0.15},
                "fix_rewards": {"fix_rollback": 0.40, "fix_kill_process": 0.10, "fix_scale_up": -0.10, "fix_restart_service": 0.05},
            },
        },
    },
    {
        "id": "dev_M002", "ambiguity": 0.50,
        "visible_alert": "WebSocket connections dropping | Reconnect storms observed | Rate: 340/min",
        "hidden_states": {
            "A": {
                "prob": 0.65, "root_cause": "connection_limit",
                "reveal": "[TELEMETRY] nginx worker_connections: 1024 (at limit) | Active: 1,024/1,024 | Upgrade connections: 47 queued | CPU: 28% | File descriptors: OK",
                "correct_diagnosis": "diagnose_network_latency",
                "correct_fix": "fix_scale_up",
                "diag_rewards": {"diagnose_network_latency": 0.40, "diagnose_db_lock": -0.10, "diagnose_cpu_spike": 0.10, "diagnose_memory_leak": -0.15},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": 0.10, "fix_kill_process": -0.15, "fix_rollback": -0.10},
            },
            "B": {
                "prob": 0.35, "root_cause": "client_bug",
                "reveal": "[TELEMETRY] App v4.2.1 deployed 13:00 | New WebSocket client reconnects every 3s regardless of connection state | Server connections: healthy | Bug in client retry logic",
                "correct_diagnosis": "diagnose_network_latency",
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_network_latency": 0.35, "diagnose_cpu_spike": 0.10, "diagnose_db_lock": -0.15, "diagnose_memory_leak": -0.15},
                "fix_rewards": {"fix_rollback": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_kill_process": -0.10},
            },
        },
    },
    {
        "id": "dev_M003", "ambiguity": 0.45,
        "visible_alert": "Kubernetes pod restart loop | Pod: payment-worker | Restarts: 47 in 30 min",
        "hidden_states": {
            "A": {
                "prob": 0.75, "root_cause": "oom_kill",
                "reveal": "[TELEMETRY] OOMKilled: true | Memory limit: 512Mi | Last 3 restarts: OOM at 511Mi | Heap dump: large in-memory cache not bounded",
                "correct_diagnosis": "diagnose_memory_leak",
                "correct_fix": "fix_restart_service",  # with memory limit increase
                "diag_rewards": {"diagnose_memory_leak": 0.40, "diagnose_cpu_spike": 0.05, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.15},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_scale_up": 0.15, "fix_kill_process": 0.05, "fix_rollback": -0.10},
            },
            "B": {
                "prob": 0.25, "root_cause": "startup_crash",
                "reveal": "[TELEMETRY] Exit code: 1 | Logs: 'Failed to connect to Redis: connection refused' | Redis pod: CrashLoopBackOff | Dependency not healthy",
                "correct_diagnosis": "diagnose_db_lock",  # dependency issue
                "correct_fix": "fix_restart_service",  # restart Redis first
                "diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_memory_leak": -0.20, "diagnose_network_latency": 0.10, "diagnose_cpu_spike": -0.10},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_rollback": 0.15, "fix_kill_process": -0.10, "fix_scale_up": -0.15},
            },
        },
    },
    # ═══════════════════════════════════════════════════════════════════════════
    # LOW AMBIGUITY (0.05–0.25): INVESTIGATION WASTEFUL
    # Clear signals — agent should diagnose without investigating
    # ═══════════════════════════════════════════════════════════════════════════
    {
        "id": "dev_L001", "ambiguity": 0.10,
        "visible_alert": "CRITICAL: Disk /var/data 100% full on DB-PROD-01 | All writes failing | Data loss imminent",
        "hidden_states": {
            "A": {
                "prob": 1.0, "root_cause": "disk_full",
                "reveal": "[TELEMETRY] /var/data: 2TB/2TB | Largest: core dumps (847GB) from last week | MySQL write error: 'No space left on device' | Replication lag: 47s and growing",
                "correct_diagnosis": "diagnose_db_lock",  # I/O blocked = db_lock analogue
                "correct_fix": "fix_kill_process",  # kill dump-generating processes, clear space
                "diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_memory_leak": 0.10, "diagnose_cpu_spike": -0.10, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_restart_service": 0.15, "fix_scale_up": 0.05, "fix_rollback": -0.20},
            },
        },
    },
    {
        "id": "dev_L002", "ambiguity": 0.08,
        "visible_alert": "Deployment rollback requested by team lead | Reason: 'v2.4.1 causes checkout failures' | Rollback target: v2.4.0",
        "hidden_states": {
            "A": {
                "prob": 1.0, "root_cause": "bad_deploy",
                "reveal": "[TELEMETRY] v2.4.1 checkout_service: TypeError in cart.total() | Error rate: 28% | Revenue impact: ~$4k/min | v2.4.0: stable for 6 days",
                "correct_diagnosis": "diagnose_cpu_spike",  # deploy-related error
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_cpu_spike": 0.30, "diagnose_db_lock": 0.20, "diagnose_memory_leak": -0.10, "diagnose_network_latency": -0.10},
                "fix_rewards": {"fix_rollback": 0.45, "fix_restart_service": 0.10, "fix_kill_process": -0.10, "fix_scale_up": -0.20},
            },
        },
    },
]
# ──────────────────────────────────────────────────────────────────────────────
# ACTION SETS
# ──────────────────────────────────────────────────────────────────────────────
# Stage 0 — candidate diagnoses. "investigate" is offered on top of these
# only while the telemetry has not been pulled yet.
_S0_BASE = [
    "diagnose_cpu_spike",
    "diagnose_memory_leak",
    "diagnose_db_lock",
    "diagnose_network_latency",
]
_S0_WITH_INVEST = ["investigate", *_S0_BASE]

# Stage 1 — remediation actions.
_S1 = [
    "fix_restart_service",
    "fix_kill_process",
    "fix_rollback",
    "fix_scale_up",
]

# Stage 2 — verification actions.
_S2 = [
    "verify_metrics_ok",
    "verify_check_logs",
    "verify_ask_user",
]

# Stage 3 — incident close-out actions.
_S3 = [
    "close_resolved",
    "close_partial",
    "escalate_senior",
]
def _pick_hidden_state(scenario: dict, seed: Optional[int], ep: int) -> str:
    """Choose which hidden state of *scenario* is active for this episode.

    The choice is a deterministic weighted draw: the scenario id, episode
    index and seed are hashed with MD5, the digest is reduced to a value in
    [0, 1) with 1e-4 resolution, and that value is compared against the
    cumulative ``prob`` weights in dict order.

    Args:
        scenario: Scenario dict with an ``"id"`` and a ``"hidden_states"``
            mapping whose values each carry a ``"prob"`` weight.
        seed: Episode seed, or ``None`` (hashed as the literal ``"none"``).
        ep: Episode index.

    Returns:
        The key of the selected hidden state. A scenario with a single
        hidden state returns that key without hashing; if the weights never
        exceed the draw, the last key is returned.
    """
    hidden = scenario["hidden_states"]
    labels = list(hidden)
    if len(labels) == 1:
        return labels[0]

    seed_tag = "none" if seed is None else seed
    digest = hashlib.md5(
        f"{scenario['id']}_ep{ep}_seed{seed_tag}".encode()
    ).hexdigest()
    draw = (int(digest, 16) % 10_000) / 10_000.0

    running = 0.0
    for label in labels:
        running += hidden[label]["prob"]
        if draw < running:
            return label
    # Weights summed to less than the draw: fall back to the final state.
    return labels[-1]
class DevOpsIncidentTask(BaseTask):
    """Four-stage DevOps incident-response episode.

    An episode walks through diagnose -> fix -> verify -> close. While still
    at stage 0 the agent may call ``investigate`` once to reveal the telemetry
    of the hidden root cause; investigating does not advance the stage.
    Scenarios are served round-robin from ``_SCENARIO_CLASSES`` and the hidden
    state of each one is chosen by ``_pick_hidden_state``.
    """

    task_id = "devops_incident"
    max_steps = 4  # diagnose → fix → verify → close (INVESTIGATE does not consume a step)

    def __init__(self):
        self._ep = -1                          # episode counter; first reset() makes it 0
        self._seed: Optional[int] = None
        self._scenario: dict = {}              # empty until reset() is called
        self._active_state_key: str = "A"
        self._active_state: dict = {}
        self._step = 0                         # current stage index (0..max_steps)
        self._api_calls = 0                    # every step() call, including investigate
        self._history: list = []
        self._done = False
        self._investigated = False
        self._diagnosis = ""
        self._fix = ""

    def reset(self, seed: Optional[int] = None):
        """Start the next episode and return its first observation.

        Args:
            seed: Optional seed. When given it also seeds the global
                ``random`` module and feeds the hidden-state selection hash.

        Returns:
            The stage-0 ``Observation`` for the newly selected scenario.
        """
        self._ep += 1
        self._seed = seed
        if seed is not None:
            random.seed(seed)
        # Scenarios rotate in table order, one per episode.
        self._scenario = _SCENARIO_CLASSES[self._ep % len(_SCENARIO_CLASSES)]
        self._active_state_key = _pick_hidden_state(self._scenario, seed, self._ep)
        self._active_state = self._scenario["hidden_states"][self._active_state_key]
        self._step = 0
        self._api_calls = 0
        self._history = []
        self._done = False
        self._investigated = False
        self._diagnosis = ""
        self._fix = ""
        return self._obs()

    def step(self, action: Action):
        """Apply one agent action.

        Args:
            action: Action whose ``type`` names the chosen move.

        Returns:
            A ``(observation, reward, done, info)`` tuple.

        Raises:
            RuntimeError: If the episode is finished or ``reset()`` has not
                been called yet.
        """
        if self._done:
            raise RuntimeError("Episode done. Call reset().")
        if not self._scenario:
            # Without a scenario every lookup below would fail with an opaque KeyError.
            raise RuntimeError("No active episode. Call reset().")
        t = action.type
        self._api_calls += 1

        # ── INVESTIGATE: pulls telemetry, does NOT advance _step ─────────────
        if t == "investigate":
            if self._step != 0:
                return self._obs(), Reward(value=0.01, breakdown={"error": "investigate_invalid_step"}, raw=0.01), False, {}
            if self._investigated:
                # "investigate" is no longer advertised once used; rejecting the
                # repeat keeps the investigation reward from being collected
                # more than once per episode.
                return self._obs(), Reward(value=0.01, breakdown={"error": "investigate_already_done"}, raw=0.01), False, {
                    "info": "Telemetry already retrieved. Make your diagnosis.",
                    "reveal": self._active_state["reveal"],
                }
            self._investigated = True
            r = investigation_reward(self._scenario["ambiguity"])
            self._history.append({
                "api_call": self._api_calls, "step": self._step, "action": "investigate",
                "reward": {"value": r, "breakdown": {"investigation": r}},
                "reveal": self._active_state["reveal"],
            })
            return self._obs(), Reward(value=r, breakdown={"investigation": r}, raw=r), False, {
                "info": "Telemetry data retrieved. Make your diagnosis.",
                "reveal": self._active_state["reveal"],
            }

        # ── STEP 0: Diagnosis ────────────────────────────────────────────────
        if self._step == 0:
            if t not in _S0_BASE:
                # Unknown actions are coerced to the first diagnosis option.
                t = _S0_BASE[0]
            base_r = self._active_state["diag_rewards"].get(t, -0.10)
            correct = (t == self._active_state["correct_diagnosis"])
            cal_r = calibration_reward(correct, self._scenario["ambiguity"], self._investigated)
            # Blend 35% table reward with 65% calibration reward, kept in [0.01, 0.99].
            rval = max(0.01, min(0.99, (base_r * 0.35) + (cal_r * 0.65)))
            breakdown = {
                "diagnosis_base": base_r, "calibration_reward": cal_r,
                "investigated": self._investigated, "ambiguity": self._scenario["ambiguity"],
            }
            self._diagnosis = t
        # ── STEP 1: Fix ──────────────────────────────────────────────────────
        elif self._step == 1:
            if t not in _S1:
                # Unknown actions are coerced to the first fix option.
                t = _S1[0]
            base_r = self._active_state["fix_rewards"].get(t, -0.10)
            # Reckless fix: killing a process without investigating is penalised.
            if not self._investigated and t == "fix_kill_process":
                base_r -= 0.10
            correct = (t == self._active_state["correct_fix"])
            # The fix stage is scored against a reduced (x0.6) ambiguity.
            cal_r = calibration_reward(correct, self._scenario["ambiguity"] * 0.6, self._investigated)
            rval = max(0.01, min(0.99, (base_r * 0.35) + (cal_r * 0.65)))
            breakdown = {"fix_base": base_r, "calibration_reward": cal_r}
            self._fix = t
        # ── STEP 2: Verification ─────────────────────────────────────────────
        elif self._step == 2:
            rval = 0.25 if t == "verify_metrics_ok" else 0.12
            breakdown = {"verification": rval}
        # ── STEP 3: Close ────────────────────────────────────────────────────
        elif self._step == 3:
            rval = 0.20 if t == "close_resolved" else 0.08
            breakdown = {"resolution": rval}
            self._done = True
        else:
            # Defensive: stages beyond 3 end the episode with the floor reward.
            return self._obs(), Reward(value=0.01, breakdown={}, raw=0.01), True, {}

        reward = Reward(value=round(rval, 4), breakdown=breakdown, raw=rval)
        self._history.append({
            "api_call": self._api_calls, "step": self._step, "action": t,
            "reward": {"value": reward.value, "breakdown": breakdown},
        })
        self._step += 1
        if self._step >= self.max_steps:
            self._done = True
        obs = self._obs()
        return obs, reward, self._done, {
            "step": self._step - 1, "action": t,
            "episode_score": self.grade_episode(self._history) if self._done else None,
        }

    def state(self):
        """Return a summary dict of the episode.

        ``root_cause`` stays ``"LOCKED"`` until the agent has investigated.
        """
        return {
            "task_id": self.task_id, "step": self._step, "done": self._done,
            "scenario_id": self._scenario.get("id", ""),
            "ambiguity": self._scenario.get("ambiguity", 0.0),
            "investigated": self._investigated,
            "root_cause": self._active_state.get("root_cause", "LOCKED") if self._investigated else "LOCKED",
            "diagnosis": self._diagnosis, "fix": self._fix,
        }

    def grade_episode(self, history):
        """Sum the recorded reward values of *history* and pass the total to ``clamp``."""
        total = sum(e.get("reward", {}).get("value", 0.0) for e in history)
        return clamp(total / 1.0)

    def _obs(self) -> Observation:
        """Build the observation for the current stage."""
        s = self._scenario
        idx = min(self._step, 3)  # a finished episode keeps showing the close stage
        avails = [_S0_WITH_INVEST if not self._investigated else _S0_BASE, _S1, _S2, _S3]
        avail = avails[idx] if not self._done else []
        if self._investigated:
            telemetry = self._active_state["reveal"]
        else:
            # Placeholder text hints at how useful investigating is likely to be.
            ambiguity = s["ambiguity"]
            if ambiguity >= 0.70:
                telemetry = "[TELEMETRY LOCKED] ⚠️ Multiple root causes possible — use 'investigate' to pull stack traces, heap dumps, and metrics"
            elif ambiguity >= 0.40:
                telemetry = "[TELEMETRY LOCKED] ⚡ Partial signals available — investigation recommended"
            else:
                telemetry = "[TELEMETRY LOCKED] ✅ Alert is self-explanatory — you may diagnose directly"
        prompts = [
            f"Alert: {s['visible_alert']}\nTelemetry: {telemetry}\n\nSelect diagnosis. Available: {avail}",
            f"Diagnosis: {self._diagnosis}\nApply fix. Available: {avail}",
            f"Fix applied: {self._fix}\nVerify system health. Available: {avail}",
            f"System stable. Close the incident. Available: {avail}",
        ]
        states = [
            {"alert": s["visible_alert"], "telemetry": telemetry, "investigated": self._investigated},
            {"alert": s["visible_alert"], "diagnosis": self._diagnosis},
            {"diagnosis": self._diagnosis, "fix": self._fix},
            {"diagnosis": self._diagnosis, "fix": self._fix, "verified": True},
        ]
        return Observation(
            task_id=self.task_id,
            step=self._step,
            state=states[idx],
            history=list(self._history),
            available_actions=avail,
            done=self._done,
            prompt=prompts[idx],
            context=prompts[idx],
            task=self.task_id,
            action_to_evaluate="Evaluating agent response...",
        )