Spaces:

JOY0021
/

autonomy-calibration-benchmark

Paused

autonomy-calibration-benchmark / tasks /devops_incident.py

Rhythm@28

deploy: final verified championship submission

ef737d3 22 days ago

27.9 kB

	"""
	tasks/devops_incident.py — Task 2: DevOps Incident Response (Epistemic RL v2.0)
	─────────────────────────────────────────────────────────────────────────────
	Design Principles:
	- Visible alert is GENUINELY AMBIGUOUS: same "503 errors" can be deploy, DB, or OOM
	- Hidden root cause is seed-determined probabilistically
	- INVESTIGATE pulls detailed stack traces, heap dumps, and monitoring data
	- Without investigating, agent must guess between equally plausible diagnoses
	- 10 scenarios: 5 high-ambiguity, 3 medium, 2 clear
	"""
	from __future__ import annotations
	import random
	import hashlib
	from typing import Optional
	import sys as _sys, os as _os
	_sys.path.insert(0, _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__))))
	from models import Action, Observation, Reward
	from tasks.base import BaseTask
	from utils import clamp
	from environment.calibration_reward import calibration_reward, investigation_reward

	# ──────────────────────────────────────────────────────────────────────────────
	# 10 SCENARIO CLASSES — same surface alert, different hidden root cause
	# ──────────────────────────────────────────────────────────────────────────────

	# Scenario table consumed by DevOpsIncidentTask.reset(), which walks it
	# round-robin by episode number. Each entry is a dict with:
	#   id            - scenario identifier (also part of the hidden-state hash key)
	#   ambiguity     - float handed to investigation_reward / calibration_reward
	#   visible_alert - alert text shown to the agent before any investigation
	#   hidden_states - mapping of state key -> dict with:
	#       prob              - selection weight (weights in each scenario sum to 1.0)
	#       root_cause        - label reported by state() once the agent investigated
	#       reveal            - telemetry text returned by the `investigate` action
	#       correct_diagnosis - diagnosis action treated as correct at step 0
	#       correct_fix       - fix action treated as correct at step 1
	#       diag_rewards      - base reward per diagnosis action
	#       fix_rewards       - base reward per fix action
	# NOTE(review): the `\|` sequences inside the alert/reveal strings are not
	# recognised Python escapes, so the backslash is kept in the runtime text and
	# recent interpreters warn about it. Presumably a plain `|` was intended
	# (possibly an export artefact) - confirm before changing the strings.
	_SCENARIO_CLASSES = [

	# ═══════════════════════════════════════════════════════════════════════════
	# HIGH AMBIGUITY (0.75–0.95): MULTIPLE DIAGNOSES EQUALLY PLAUSIBLE
	# Same alert metrics → different root causes → different fixes
	# ═══════════════════════════════════════════════════════════════════════════

	{
	"id": "dev_H001", "ambiguity": 0.90,
	"visible_alert": "API response time degraded 300% \| Error rate: 4.2% \| All services affected",
	"hidden_states": {
	"A": {
	"prob": 0.45, "root_cause": "db_connection_exhaustion",
	"reveal": "[TELEMETRY] DB connection pool: 500/500 EXHAUSTED \| Slow query log: 847 queries > 10s \| Top query: inventory scan (missing index) \| CPU: 45% \| Memory: 61%",
	"correct_diagnosis": "diagnose_db_lock",
	"correct_fix": "fix_kill_process",
	"diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_network_latency": 0.10, "diagnose_memory_leak": -0.15, "diagnose_cpu_spike": -0.20},
	"fix_rewards": {"fix_kill_process": 0.40, "fix_rollback": 0.15, "fix_restart_service": 0.05, "fix_scale_up": -0.15},
	},
	"B": {
	"prob": 0.35, "root_cause": "traffic_spike",
	"reveal": "[TELEMETRY] Requests/min: 48,000 (baseline: 8,000) \| Marketing campaign launched 14:00 \| CPU: 89% \| Memory: 72% \| DB: healthy \| Load balancer: saturated",
	"correct_diagnosis": "diagnose_cpu_spike",
	"correct_fix": "fix_scale_up",
	"diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_db_lock": 0.10, "diagnose_memory_leak": -0.10, "diagnose_network_latency": -0.15},
	"fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": 0.10, "fix_kill_process": -0.15, "fix_rollback": -0.25},
	},
	"C": {
	"prob": 0.20, "root_cause": "bad_deploy",
	"reveal": "[TELEMETRY] Deploy v2.4.1 at 13:47 \| Rollback available: v2.4.0 \| Stack trace: NullPointerException in CartService:247 \| CPU: 38% \| Memory: 55%",
	"correct_diagnosis": "diagnose_cpu_spike",
	"correct_fix": "fix_rollback",
	"diag_rewards": {"diagnose_cpu_spike": 0.20, "diagnose_db_lock": -0.15, "diagnose_memory_leak": -0.20, "diagnose_network_latency": -0.10},
	"fix_rewards": {"fix_rollback": 0.40, "fix_restart_service": 0.15, "fix_kill_process": -0.10, "fix_scale_up": -0.20},
	},
	},
	},

	{
	"id": "dev_H002", "ambiguity": 0.85,
	"visible_alert": "Memory utilization rising on WEB-01 \| Current: 87% \| Trend: +2% per hour",
	"hidden_states": {
	"A": {
	"prob": 0.55, "root_cause": "memory_leak",
	"reveal": "[TELEMETRY] RSS growing 180MB/hr \| Heap dump: 2.1GB uncollected objects (SessionManager) \| GC pause: 4.2s \| OOM kill projected in 6.5 hours",
	"correct_diagnosis": "diagnose_memory_leak",
	"correct_fix": "fix_restart_service",
	"diag_rewards": {"diagnose_memory_leak": 0.40, "diagnose_cpu_spike": 0.05, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.20},
	"fix_rewards": {"fix_restart_service": 0.40, "fix_kill_process": 0.15, "fix_scale_up": 0.05, "fix_rollback": -0.15},
	},
	"B": {
	"prob": 0.45, "root_cause": "legitimate_growth",
	"reveal": "[TELEMETRY] Cache warming after cold restart \| Object counts stable \| No leak detected \| Growth expected: cron loaded 4.2GB dataset at 02:00 \| Will plateau at 91%",
	"correct_diagnosis": "diagnose_cpu_spike", # treat as normal load
	"correct_fix": "fix_scale_up",
	"diag_rewards": {"diagnose_cpu_spike": 0.30, "diagnose_memory_leak": -0.20, "diagnose_db_lock": -0.20, "diagnose_network_latency": 0.05},
	"fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": -0.20, "fix_kill_process": -0.30, "fix_rollback": -0.15},
	},
	},
	},

	{
	"id": "dev_H003", "ambiguity": 0.88,
	"visible_alert": "HTTP 503 errors: 12% of requests \| Duration: 8 minutes \| Upstream: payment-service",
	"hidden_states": {
	"A": {
	"prob": 0.50, "root_cause": "dependency_outage",
	"reveal": "[TELEMETRY] Stripe API: status.stripe.com shows DEGRADED \| Circuit breaker: OPEN \| Timeout: payment-service→stripe: 30.1s \| Retries: 847 \| Fallback: none configured",
	"correct_diagnosis": "diagnose_network_latency",
	"correct_fix": "fix_rollback", # fallback mode / circuit breaker config
	"diag_rewards": {"diagnose_network_latency": 0.40, "diagnose_db_lock": 0.10, "diagnose_memory_leak": -0.15, "diagnose_cpu_spike": -0.10},
	"fix_rewards": {"fix_rollback": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_kill_process": -0.20},
	},
	"B": {
	"prob": 0.50, "root_cause": "config_change",
	"reveal": "[TELEMETRY] Config deploy 14:23 \| payment-service TLS cert expired (changed hostname) \| 503 returns \"certificate verify failed\" \| No Stripe involvement",
	"correct_diagnosis": "diagnose_db_lock", # config/cert issue
	"correct_fix": "fix_rollback",
	"diag_rewards": {"diagnose_db_lock": 0.25, "diagnose_network_latency": 0.20, "diagnose_memory_leak": -0.20, "diagnose_cpu_spike": -0.15},
	"fix_rewards": {"fix_rollback": 0.40, "fix_restart_service": 0.15, "fix_kill_process": -0.15, "fix_scale_up": -0.20},
	},
	},
	},

	{
	"id": "dev_H004", "ambiguity": 0.80,
	"visible_alert": "CPU usage: 91% on all nodes \| Duration: 22 minutes \| No recent deploys",
	"hidden_states": {
	"A": {
	"prob": 0.60, "root_cause": "runaway_process",
	"reveal": "[TELEMETRY] top: report-generator PID 14882 consuming 380% CPU \| Triggered by cron at 03:00 \| Processing 90-day analytics dump (8TB) \| Not rate-limited",
	"correct_diagnosis": "diagnose_cpu_spike",
	"correct_fix": "fix_kill_process",
	"diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_memory_leak": 0.05, "diagnose_db_lock": -0.10, "diagnose_network_latency": -0.20},
	"fix_rewards": {"fix_kill_process": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_rollback": -0.20},
	},
	"B": {
	"prob": 0.40, "root_cause": "legitimate_batch",
	"reveal": "[TELEMETRY] Quarter-end financial report running (scheduled) \| CFO requested manually \| All queries healthy \| Expected completion: 18 min \| Kill would corrupt report",
	"correct_diagnosis": "diagnose_cpu_spike",
	"correct_fix": "fix_scale_up", # let it finish, add capacity
	"diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_memory_leak": -0.10, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.20},
	"fix_rewards": {"fix_scale_up": 0.40, "fix_kill_process": -0.50, "fix_restart_service": -0.15, "fix_rollback": -0.20},
	},
	},
	},

	{
	"id": "dev_H005", "ambiguity": 0.78,
	"visible_alert": "Disk I/O wait: 68% on DB-PROD-01 \| Latency: 340ms avg \| Duration: 15 min",
	"hidden_states": {
	"A": {
	"prob": 0.55, "root_cause": "disk_failure",
	"reveal": "[TELEMETRY] SMART: 847 reallocated sectors (threshold: 5) \| I/O errors in dmesg: 124 in last 10 min \| RAID degraded \| Backup disk available",
	"correct_diagnosis": "diagnose_db_lock",
	"correct_fix": "fix_restart_service", # failover to replica
	"diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_memory_leak": -0.10, "diagnose_cpu_spike": 0.05, "diagnose_network_latency": -0.15},
	"fix_rewards": {"fix_restart_service": 0.40, "fix_rollback": 0.15, "fix_scale_up": 0.05, "fix_kill_process": -0.20},
	},
	"B": {
	"prob": 0.45, "root_cause": "backup_running",
	"reveal": "[TELEMETRY] Daily backup process running (scheduled 03:00) \| SMART: healthy (0 errors) \| I/O wait normal for backup workload \| Expected end: 40 min",
	"correct_diagnosis": "diagnose_network_latency", # routine, no action needed
	"correct_fix": "fix_scale_up", # temporary throttling
	"diag_rewards": {"diagnose_network_latency": 0.30, "diagnose_db_lock": -0.15, "diagnose_cpu_spike": 0.10, "diagnose_memory_leak": -0.20},
	"fix_rewards": {"fix_scale_up": 0.40, "fix_kill_process": -0.15, "fix_restart_service": -0.25, "fix_rollback": -0.20},
	},
	},
	},

	# ═══════════════════════════════════════════════════════════════════════════
	# MEDIUM AMBIGUITY (0.40–0.65): INVESTIGATION HELPFUL
	# ═══════════════════════════════════════════════════════════════════════════

	{
	"id": "dev_M001", "ambiguity": 0.60,
	"visible_alert": "Database query response time: 2.8s avg (baseline: 180ms) \| Duration: 12 min",
	"hidden_states": {
	"A": {
	"prob": 0.70, "root_cause": "table_lock",
	"reveal": "[TELEMETRY] SHOW PROCESSLIST: 94 queries WAITING on table lock \| Long transaction: analytics-job (running 47min) \| Blocking all writes to orders table",
	"correct_diagnosis": "diagnose_db_lock",
	"correct_fix": "fix_kill_process",
	"diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_memory_leak": -0.10, "diagnose_cpu_spike": -0.10, "diagnose_network_latency": 0.05},
	"fix_rewards": {"fix_kill_process": 0.40, "fix_restart_service": 0.10, "fix_scale_up": -0.10, "fix_rollback": -0.20},
	},
	"B": {
	"prob": 0.30, "root_cause": "missing_index",
	"reveal": "[TELEMETRY] EXPLAIN shows full table scan: orders (220M rows) \| New query pattern after feature release v3.1.2 \| Index: orders_user_id missing",
	"correct_diagnosis": "diagnose_db_lock",
	"correct_fix": "fix_rollback", # rollback the feature
	"diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_network_latency": 0.10, "diagnose_cpu_spike": -0.10, "diagnose_memory_leak": -0.15},
	"fix_rewards": {"fix_rollback": 0.40, "fix_kill_process": 0.10, "fix_scale_up": -0.10, "fix_restart_service": 0.05},
	},
	},
	},

	{
	"id": "dev_M002", "ambiguity": 0.50,
	"visible_alert": "WebSocket connections dropping \| Reconnect storms observed \| Rate: 340/min",
	"hidden_states": {
	"A": {
	"prob": 0.65, "root_cause": "connection_limit",
	"reveal": "[TELEMETRY] nginx worker_connections: 1024 (at limit) \| Active: 1,024/1,024 \| Upgrade connections: 47 queued \| CPU: 28% \| File descriptors: OK",
	"correct_diagnosis": "diagnose_network_latency",
	"correct_fix": "fix_scale_up",
	"diag_rewards": {"diagnose_network_latency": 0.40, "diagnose_db_lock": -0.10, "diagnose_cpu_spike": 0.10, "diagnose_memory_leak": -0.15},
	"fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": 0.10, "fix_kill_process": -0.15, "fix_rollback": -0.10},
	},
	"B": {
	"prob": 0.35, "root_cause": "client_bug",
	"reveal": "[TELEMETRY] App v4.2.1 deployed 13:00 \| New WebSocket client reconnects every 3s regardless of connection state \| Server connections: healthy \| Bug in client retry logic",
	"correct_diagnosis": "diagnose_network_latency",
	"correct_fix": "fix_rollback",
	"diag_rewards": {"diagnose_network_latency": 0.35, "diagnose_cpu_spike": 0.10, "diagnose_db_lock": -0.15, "diagnose_memory_leak": -0.15},
	"fix_rewards": {"fix_rollback": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_kill_process": -0.10},
	},
	},
	},

	{
	"id": "dev_M003", "ambiguity": 0.45,
	"visible_alert": "Kubernetes pod restart loop \| Pod: payment-worker \| Restarts: 47 in 30 min",
	"hidden_states": {
	"A": {
	"prob": 0.75, "root_cause": "oom_kill",
	"reveal": "[TELEMETRY] OOMKilled: true \| Memory limit: 512Mi \| Last 3 restarts: OOM at 511Mi \| Heap dump: large in-memory cache not bounded",
	"correct_diagnosis": "diagnose_memory_leak",
	"correct_fix": "fix_restart_service", # with memory limit increase
	"diag_rewards": {"diagnose_memory_leak": 0.40, "diagnose_cpu_spike": 0.05, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.15},
	"fix_rewards": {"fix_restart_service": 0.40, "fix_scale_up": 0.15, "fix_kill_process": 0.05, "fix_rollback": -0.10},
	},
	"B": {
	"prob": 0.25, "root_cause": "startup_crash",
	"reveal": "[TELEMETRY] Exit code: 1 \| Logs: 'Failed to connect to Redis: connection refused' \| Redis pod: CrashLoopBackOff \| Dependency not healthy",
	"correct_diagnosis": "diagnose_db_lock", # dependency issue
	"correct_fix": "fix_restart_service", # restart Redis first
	"diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_memory_leak": -0.20, "diagnose_network_latency": 0.10, "diagnose_cpu_spike": -0.10},
	"fix_rewards": {"fix_restart_service": 0.40, "fix_rollback": 0.15, "fix_kill_process": -0.10, "fix_scale_up": -0.15},
	},
	},
	},

	# ═══════════════════════════════════════════════════════════════════════════
	# LOW AMBIGUITY (0.05–0.25): INVESTIGATION WASTEFUL
	# Clear signals — agent should diagnose without investigating
	# ═══════════════════════════════════════════════════════════════════════════

	{
	"id": "dev_L001", "ambiguity": 0.10,
	"visible_alert": "CRITICAL: Disk /var/data 100% full on DB-PROD-01 \| All writes failing \| Data loss imminent",
	"hidden_states": {
	"A": {
	"prob": 1.0, "root_cause": "disk_full",
	"reveal": "[TELEMETRY] /var/data: 2TB/2TB \| Largest: core dumps (847GB) from last week \| MySQL write error: 'No space left on device' \| Replication lag: 47s and growing",
	"correct_diagnosis": "diagnose_db_lock", # I/O blocked = db_lock analogue
	"correct_fix": "fix_kill_process", # kill dump-generating processes, clear space
	"diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_memory_leak": 0.10, "diagnose_cpu_spike": -0.10, "diagnose_network_latency": -0.20},
	"fix_rewards": {"fix_kill_process": 0.40, "fix_restart_service": 0.15, "fix_scale_up": 0.05, "fix_rollback": -0.20},
	},
	},
	},

	{
	"id": "dev_L002", "ambiguity": 0.08,
	"visible_alert": "Deployment rollback requested by team lead \| Reason: 'v2.4.1 causes checkout failures' \| Rollback target: v2.4.0",
	"hidden_states": {
	"A": {
	"prob": 1.0, "root_cause": "bad_deploy",
	"reveal": "[TELEMETRY] v2.4.1 checkout_service: TypeError in cart.total() \| Error rate: 28% \| Revenue impact: ~$4k/min \| v2.4.0: stable for 6 days",
	"correct_diagnosis": "diagnose_cpu_spike", # deploy-related error
	"correct_fix": "fix_rollback",
	"diag_rewards": {"diagnose_cpu_spike": 0.30, "diagnose_db_lock": 0.20, "diagnose_memory_leak": -0.10, "diagnose_network_latency": -0.10},
	"fix_rewards": {"fix_rollback": 0.45, "fix_restart_service": 0.10, "fix_kill_process": -0.10, "fix_scale_up": -0.20},
	},
	},
	},
	]

	# ──────────────────────────────────────────────────────────────────────────────
	# ACTION SETS
	# ──────────────────────────────────────────────────────────────────────────────
	# Step 0: the four diagnoses an agent can commit to.
	_S0_BASE = [
	    "diagnose_cpu_spike",
	    "diagnose_memory_leak",
	    "diagnose_db_lock",
	    "diagnose_network_latency",
	]
	# Step 0 before investigating: the same diagnoses, preceded by `investigate`.
	_S0_WITH_INVEST = ["investigate", *_S0_BASE]
	# Step 1: remediation actions.
	_S1 = [
	    "fix_restart_service",
	    "fix_kill_process",
	    "fix_rollback",
	    "fix_scale_up",
	]
	# Step 2: verification actions.
	_S2 = [
	    "verify_metrics_ok",
	    "verify_check_logs",
	    "verify_ask_user",
	]
	# Step 3: ways to close the incident.
	_S3 = [
	    "close_resolved",
	    "close_partial",
	    "escalate_senior",
	]


	def _pick_hidden_state(scenario: dict, seed: Optional[int], ep: int) -> str:
	    """Choose which hidden state is active for one episode of a scenario.

	    A scenario with exactly one hidden state returns that state's key.
	    Otherwise the scenario id, episode number and seed (``'none'`` when the
	    seed is ``None``) are hashed with MD5 into a draw in ``[0, 1)``, and the
	    states are walked in dict order until the running sum of their ``prob``
	    values exceeds the draw. The same inputs always give the same key.

	    Args:
	        scenario: dict with ``"id"`` and ``"hidden_states"``; every hidden
	            state dict carries a numeric ``"prob"``.
	        seed: episode seed, or ``None``.
	        ep: episode counter.

	    Returns:
	        The key of the selected hidden state. If the draw is not below the
	        total of all ``prob`` values, the last key is returned.
	    """
	    hidden = scenario["hidden_states"]
	    names = list(hidden)
	    if len(names) == 1:
	        return names[0]

	    seed_tag = "none" if seed is None else seed
	    digest = hashlib.md5(f"{scenario['id']}_ep{ep}_seed{seed_tag}".encode()).hexdigest()
	    # Keep four decimal digits of the hash as a uniform-looking draw in [0, 1).
	    draw = (int(digest, 16) % 10_000) / 10_000.0

	    running = 0.0
	    for name in names:
	        running += hidden[name]["prob"]
	        if draw < running:
	            return name
	    # Reached only when the weights do not cover the draw.
	    return names[-1]


	class DevOpsIncidentTask(BaseTask):
	    """Task 2: four-stage DevOps incident episode (diagnose → fix → verify → close).

	    ``reset()`` walks ``_SCENARIO_CLASSES`` round-robin and selects one hidden
	    state per episode through ``_pick_hidden_state``. While the episode is at
	    step 0 the agent may issue ``investigate`` once; that reveals the hidden
	    state's telemetry and does not advance the step counter.
	    """

	    task_id = "devops_incident"
	    max_steps = 4  # diagnose → fix → verify → close (INVESTIGATE does not consume a step)

	    def __init__(self):
	        """Set up empty episode state; ``reset()`` fills in the scenario."""
	        self._ep = -1  # reset() increments first, so the first episode is 0
	        self._seed: Optional[int] = None
	        self._scenario: dict = {}
	        self._active_state_key: str = "A"
	        self._active_state: dict = {}
	        self._step = 0
	        self._api_calls = 0  # counts every step() call, including investigate
	        self._history: list = []
	        self._done = False
	        self._investigated = False
	        self._diagnosis = ""
	        self._fix = ""

	    def reset(self, seed: Optional[int] = None):
	        """Start the next episode and return its first observation.

	        Args:
	            seed: optional seed. When given it also seeds the global
	                ``random`` module and is part of the hidden-state hash key.
	        """
	        self._ep += 1
	        self._seed = seed
	        if seed is not None:
	            random.seed(seed)
	        self._scenario = _SCENARIO_CLASSES[self._ep % len(_SCENARIO_CLASSES)]
	        self._active_state_key = _pick_hidden_state(self._scenario, seed, self._ep)
	        self._active_state = self._scenario["hidden_states"][self._active_state_key]
	        self._step = 0
	        self._api_calls = 0
	        self._history = []
	        self._done = False
	        self._investigated = False
	        self._diagnosis = ""
	        self._fix = ""
	        return self._obs()

	    @staticmethod
	    def _blend(base_r, cal_r):
	        """Mix the table reward and calibration reward 35/65, clamped to [0.01, 0.99]."""
	        return max(0.01, min(0.99, (base_r * 0.35) + (cal_r * 0.65)))

	    def _investigate(self):
	        """Handle an ``investigate`` action; never advances ``_step``.

	        The action is accepted once, and only at step 0. An out-of-step or
	        repeated call gets a 0.01 reward and is NOT added to the history, so
	        it cannot add to the summed episode score. This matches ``_obs()``,
	        which stops listing ``investigate`` once it has been used.
	        """
	        error = None
	        if self._step != 0:
	            error = "investigate_invalid_step"
	        elif self._investigated:
	            error = "investigate_repeated"
	        if error is not None:
	            return self._obs(), Reward(value=0.01, breakdown={"error": error}, raw=0.01), False, {}

	        self._investigated = True
	        reveal = self._active_state["reveal"]
	        r = investigation_reward(self._scenario["ambiguity"])
	        self._history.append({
	            "api_call": self._api_calls, "step": self._step, "action": "investigate",
	            "reward": {"value": r, "breakdown": {"investigation": r}},
	            "reveal": reveal,
	        })
	        return self._obs(), Reward(value=r, breakdown={"investigation": r}, raw=r), False, {
	            "info": "Telemetry data retrieved. Make your diagnosis.",
	            "reveal": reveal,
	        }

	    def step(self, action: Action):
	        """Apply one agent action.

	        Args:
	            action: object whose ``type`` attribute names the chosen action.

	        Returns:
	            ``(observation, reward, done, info)``.

	        Raises:
	            RuntimeError: if the episode has already finished.
	        """
	        if self._done:
	            raise RuntimeError("Episode done. Call reset().")
	        t = action.type
	        self._api_calls += 1

	        # ── INVESTIGATE: pulls telemetry, does NOT advance _step ─────────────
	        if t == "investigate":
	            return self._investigate()

	        # ── STEP 0: Diagnosis ────────────────────────────────────────────────
	        if self._step == 0:
	            if t not in _S0_BASE:
	                # Unrecognised actions are coerced to the first diagnosis.
	                t = _S0_BASE[0]
	            base_r = self._active_state["diag_rewards"].get(t, -0.10)
	            correct = (t == self._active_state["correct_diagnosis"])
	            cal_r = calibration_reward(correct, self._scenario["ambiguity"], self._investigated)
	            rval = self._blend(base_r, cal_r)
	            breakdown = {
	                "diagnosis_base": base_r, "calibration_reward": cal_r,
	                "investigated": self._investigated, "ambiguity": self._scenario["ambiguity"],
	            }
	            self._diagnosis = t

	        # ── STEP 1: Fix ──────────────────────────────────────────────────────
	        elif self._step == 1:
	            if t not in _S1:
	                # Unrecognised actions are coerced to the first fix.
	                t = _S1[0]
	            base_r = self._active_state["fix_rewards"].get(t, -0.10)
	            # Reckless fix: killing a process without having investigated costs
	            # an extra 0.10. Only fix_kill_process is penalised here.
	            if not self._investigated and t == "fix_kill_process":
	                base_r -= 0.10
	            correct = (t == self._active_state["correct_fix"])
	            # Calibration for the fix uses a reduced ambiguity (60% of the scenario's).
	            cal_r = calibration_reward(correct, self._scenario["ambiguity"] * 0.6, self._investigated)
	            rval = self._blend(base_r, cal_r)
	            breakdown = {"fix_base": base_r, "calibration_reward": cal_r}
	            self._fix = t

	        # ── STEP 2: Verification ─────────────────────────────────────────────
	        elif self._step == 2:
	            rval = 0.25 if t == "verify_metrics_ok" else 0.12
	            breakdown = {"verification": rval}

	        # ── STEP 3: Close ────────────────────────────────────────────────────
	        elif self._step == 3:
	            rval = 0.20 if t == "close_resolved" else 0.08
	            breakdown = {"resolution": rval}
	            self._done = True

	        else:
	            # Defensive: a step past the last stage simply ends the episode.
	            return self._obs(), Reward(value=0.01, breakdown={}, raw=0.01), True, {}

	        reward = Reward(value=round(rval, 4), breakdown=breakdown, raw=rval)
	        self._history.append({
	            "api_call": self._api_calls, "step": self._step, "action": t,
	            "reward": {"value": reward.value, "breakdown": breakdown},
	        })
	        self._step += 1
	        if self._step >= self.max_steps:
	            self._done = True
	        obs = self._obs()
	        return obs, reward, self._done, {
	            "step": self._step - 1, "action": t,
	            "episode_score": self.grade_episode(self._history) if self._done else None,
	        }

	    def state(self):
	        """Return a summary dict; ``root_cause`` stays ``"LOCKED"`` until investigated."""
	        return {
	            "task_id": self.task_id, "step": self._step, "done": self._done,
	            "scenario_id": self._scenario.get("id", ""),
	            "ambiguity": self._scenario.get("ambiguity", 0.0),
	            "investigated": self._investigated,
	            "root_cause": self._active_state.get("root_cause", "LOCKED") if self._investigated else "LOCKED",
	            "diagnosis": self._diagnosis, "fix": self._fix,
	        }

	    def grade_episode(self, history):
	        """Sum the ``reward.value`` of every history entry and pass it through ``clamp``."""
	        total = sum(e.get("reward", {}).get("value", 0.0) for e in history)
	        return clamp(total / 1.0)

	    def _obs(self) -> Observation:
	        """Build the observation for the current step.

	        Steps past the last stage reuse the final stage's prompt and state;
	        ``available_actions`` is empty once the episode is done.
	        """
	        s = self._scenario
	        idx = min(self._step, 3)
	        avails = [_S0_WITH_INVEST if not self._investigated else _S0_BASE, _S1, _S2, _S3]
	        avail = avails[idx] if not self._done else []

	        if self._investigated:
	            telemetry = self._active_state["reveal"]
	        else:
	            # Before investigating, only a hint tied to the ambiguity band is shown.
	            ambiguity = s["ambiguity"]
	            if ambiguity >= 0.70:
	                telemetry = "[TELEMETRY LOCKED] ⚠️ Multiple root causes possible — use 'investigate' to pull stack traces, heap dumps, and metrics"
	            elif ambiguity >= 0.40:
	                telemetry = "[TELEMETRY LOCKED] ⚡ Partial signals available — investigation recommended"
	            else:
	                telemetry = "[TELEMETRY LOCKED] ✅ Alert is self-explanatory — you may diagnose directly"

	        prompts = [
	            f"Alert: {s['visible_alert']}\nTelemetry: {telemetry}\n\nSelect diagnosis. Available: {avail}",
	            f"Diagnosis: {self._diagnosis}\nApply fix. Available: {avail}",
	            f"Fix applied: {self._fix}\nVerify system health. Available: {avail}",
	            f"System stable. Close the incident. Available: {avail}",
	        ]

	        states = [
	            {"alert": s["visible_alert"], "telemetry": telemetry, "investigated": self._investigated},
	            {"alert": s["visible_alert"], "diagnosis": self._diagnosis},
	            {"diagnosis": self._diagnosis, "fix": self._fix},
	            {"diagnosis": self._diagnosis, "fix": self._fix, "verified": True},
	        ]

	        return Observation(
	            task_id=self.task_id,
	            step=self._step,
	            state=states[idx],
	            history=list(self._history),  # copy so callers cannot mutate the log
	            available_actions=avail,
	            done=self._done,
	            prompt=prompts[idx],
	            context=prompts[idx],
	            task=self.task_id,
	            action_to_evaluate="Evaluating agent response...",
	        )