Spaces:

JOY0021
/

autonomy-calibration-benchmark

Paused

File size: 27,850 Bytes

ef737d3

"""
tasks/devops_incident.py — Task 2: DevOps Incident Response (Epistemic RL v2.0)
─────────────────────────────────────────────────────────────────────────────
Design Principles:
  - Visible alert is GENUINELY AMBIGUOUS: same "503 errors" can be deploy, DB, or OOM
  - Hidden root cause is seed-determined probabilistically
  - INVESTIGATE pulls detailed stack traces, heap dumps, and monitoring data
  - Without investigating, agent must guess between equally plausible diagnoses
  - 10 scenarios: 5 high-ambiguity, 3 medium, 2 clear
"""
from __future__ import annotations
import random
import hashlib
from typing import Optional
import sys as _sys, os as _os
_sys.path.insert(0, _os.path.dirname(_os.path.dirname(_os.path.abspath(__file__))))
from models import Action, Observation, Reward
from tasks.base import BaseTask
from utils import clamp
from environment.calibration_reward import calibration_reward, investigation_reward

# ──────────────────────────────────────────────────────────────────────────────
# 10 SCENARIO CLASSES — same surface alert, different hidden root cause
# ──────────────────────────────────────────────────────────────────────────────

# Schema of each scenario entry (as consumed by DevOpsIncidentTask below):
#   id             — scenario identifier; also part of the hash key used by
#                    _pick_hidden_state to choose a hidden state
#   ambiguity      — float passed to the reward helpers and used by _obs() to
#                    pick the "telemetry locked" hint (>= 0.70, >= 0.40, else)
#   visible_alert  — alert text shown to the agent before any investigation
#   hidden_states  — mapping of state key ("A", "B", ...) to one candidate
#                    root cause, each with:
#       prob               — selection probability (sums to 1.0 per scenario)
#       root_cause         — label exposed by state() once investigated
#       reveal             — telemetry text returned by the "investigate" action
#       correct_diagnosis  — diagnosis action treated as correct at stage 0
#       correct_fix        — fix action treated as correct at stage 1
#       diag_rewards       — base reward per diagnosis action
#       fix_rewards        — base reward per fix action
# The diagnosis/fix vocabularies are fixed (see the action sets below), so
# some root causes are mapped onto the closest available action; the inline
# comments on those lines explain the mapping.
_SCENARIO_CLASSES = [

    # ═══════════════════════════════════════════════════════════════════════════
    # HIGH AMBIGUITY (0.75–0.95): MULTIPLE DIAGNOSES EQUALLY PLAUSIBLE
    # Same alert metrics → different root causes → different fixes
    # ═══════════════════════════════════════════════════════════════════════════

    {
        "id": "dev_H001", "ambiguity": 0.90,
        "visible_alert": "API response time degraded 300% | Error rate: 4.2% | All services affected",
        "hidden_states": {
            "A": {
                "prob": 0.45, "root_cause": "db_connection_exhaustion",
                "reveal": "[TELEMETRY] DB connection pool: 500/500 EXHAUSTED | Slow query log: 847 queries > 10s | Top query: inventory scan (missing index) | CPU: 45% | Memory: 61%",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_kill_process",
                "diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_network_latency": 0.10, "diagnose_memory_leak": -0.15, "diagnose_cpu_spike": -0.20},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_rollback": 0.15, "fix_restart_service": 0.05, "fix_scale_up": -0.15},
            },
            "B": {
                "prob": 0.35, "root_cause": "traffic_spike",
                "reveal": "[TELEMETRY] Requests/min: 48,000 (baseline: 8,000) | Marketing campaign launched 14:00 | CPU: 89% | Memory: 72% | DB: healthy | Load balancer: saturated",
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_scale_up",
                "diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_db_lock": 0.10, "diagnose_memory_leak": -0.10, "diagnose_network_latency": -0.15},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": 0.10, "fix_kill_process": -0.15, "fix_rollback": -0.25},
            },
            "C": {
                "prob": 0.20, "root_cause": "bad_deploy",
                "reveal": "[TELEMETRY] Deploy v2.4.1 at 13:47 | Rollback available: v2.4.0 | Stack trace: NullPointerException in CartService:247 | CPU: 38% | Memory: 55%",
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_cpu_spike": 0.20, "diagnose_db_lock": -0.15, "diagnose_memory_leak": -0.20, "diagnose_network_latency": -0.10},
                "fix_rewards": {"fix_rollback": 0.40, "fix_restart_service": 0.15, "fix_kill_process": -0.10, "fix_scale_up": -0.20},
            },
        },
    },

    {
        "id": "dev_H002", "ambiguity": 0.85,
        "visible_alert": "Memory utilization rising on WEB-01 | Current: 87% | Trend: +2% per hour",
        "hidden_states": {
            "A": {
                "prob": 0.55, "root_cause": "memory_leak",
                "reveal": "[TELEMETRY] RSS growing 180MB/hr | Heap dump: 2.1GB uncollected objects (SessionManager) | GC pause: 4.2s | OOM kill projected in 6.5 hours",
                "correct_diagnosis": "diagnose_memory_leak",
                "correct_fix": "fix_restart_service",
                "diag_rewards": {"diagnose_memory_leak": 0.40, "diagnose_cpu_spike": 0.05, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_kill_process": 0.15, "fix_scale_up": 0.05, "fix_rollback": -0.15},
            },
            "B": {
                "prob": 0.45, "root_cause": "legitimate_growth",
                "reveal": "[TELEMETRY] Cache warming after cold restart | Object counts stable | No leak detected | Growth expected: cron loaded 4.2GB dataset at 02:00 | Will plateau at 91%",
                "correct_diagnosis": "diagnose_cpu_spike",   # treat as normal load
                "correct_fix": "fix_scale_up",
                "diag_rewards": {"diagnose_cpu_spike": 0.30, "diagnose_memory_leak": -0.20, "diagnose_db_lock": -0.20, "diagnose_network_latency": 0.05},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": -0.20, "fix_kill_process": -0.30, "fix_rollback": -0.15},
            },
        },
    },

    {
        "id": "dev_H003", "ambiguity": 0.88,
        "visible_alert": "HTTP 503 errors: 12% of requests | Duration: 8 minutes | Upstream: payment-service",
        "hidden_states": {
            "A": {
                "prob": 0.50, "root_cause": "dependency_outage",
                "reveal": "[TELEMETRY] Stripe API: status.stripe.com shows DEGRADED | Circuit breaker: OPEN | Timeout: payment-service→stripe: 30.1s | Retries: 847 | Fallback: none configured",
                "correct_diagnosis": "diagnose_network_latency",
                "correct_fix": "fix_rollback",   # fallback mode / circuit breaker config
                "diag_rewards": {"diagnose_network_latency": 0.40, "diagnose_db_lock": 0.10, "diagnose_memory_leak": -0.15, "diagnose_cpu_spike": -0.10},
                "fix_rewards": {"fix_rollback": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_kill_process": -0.20},
            },
            "B": {
                "prob": 0.50, "root_cause": "config_change",
                "reveal": "[TELEMETRY] Config deploy 14:23 | payment-service TLS cert expired (changed hostname) | 503 returns \"certificate verify failed\" | No Stripe involvement",
                "correct_diagnosis": "diagnose_db_lock",    # config/cert issue
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_db_lock": 0.25, "diagnose_network_latency": 0.20, "diagnose_memory_leak": -0.20, "diagnose_cpu_spike": -0.15},
                "fix_rewards": {"fix_rollback": 0.40, "fix_restart_service": 0.15, "fix_kill_process": -0.15, "fix_scale_up": -0.20},
            },
        },
    },

    {
        "id": "dev_H004", "ambiguity": 0.80,
        "visible_alert": "CPU usage: 91% on all nodes | Duration: 22 minutes | No recent deploys",
        "hidden_states": {
            "A": {
                "prob": 0.60, "root_cause": "runaway_process",
                "reveal": "[TELEMETRY] top: report-generator PID 14882 consuming 380% CPU | Triggered by cron at 03:00 | Processing 90-day analytics dump (8TB) | Not rate-limited",
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_kill_process",
                "diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_memory_leak": 0.05, "diagnose_db_lock": -0.10, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_rollback": -0.20},
            },
            "B": {
                "prob": 0.40, "root_cause": "legitimate_batch",
                "reveal": "[TELEMETRY] Quarter-end financial report running (scheduled) | CFO requested manually | All queries healthy | Expected completion: 18 min | Kill would corrupt report",
                "correct_diagnosis": "diagnose_cpu_spike",
                "correct_fix": "fix_scale_up",   # let it finish, add capacity
                "diag_rewards": {"diagnose_cpu_spike": 0.40, "diagnose_memory_leak": -0.10, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_kill_process": -0.50, "fix_restart_service": -0.15, "fix_rollback": -0.20},
            },
        },
    },

    {
        "id": "dev_H005", "ambiguity": 0.78,
        "visible_alert": "Disk I/O wait: 68% on DB-PROD-01 | Latency: 340ms avg | Duration: 15 min",
        "hidden_states": {
            "A": {
                "prob": 0.55, "root_cause": "disk_failure",
                "reveal": "[TELEMETRY] SMART: 847 reallocated sectors (threshold: 5) | I/O errors in dmesg: 124 in last 10 min | RAID degraded | Backup disk available",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_restart_service",   # failover to replica
                "diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_memory_leak": -0.10, "diagnose_cpu_spike": 0.05, "diagnose_network_latency": -0.15},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_rollback": 0.15, "fix_scale_up": 0.05, "fix_kill_process": -0.20},
            },
            "B": {
                "prob": 0.45, "root_cause": "backup_running",
                "reveal": "[TELEMETRY] Daily backup process running (scheduled 03:00) | SMART: healthy (0 errors) | I/O wait normal for backup workload | Expected end: 40 min",
                "correct_diagnosis": "diagnose_network_latency",   # routine, no action needed
                "correct_fix": "fix_scale_up",   # temporary throttling
                "diag_rewards": {"diagnose_network_latency": 0.30, "diagnose_db_lock": -0.15, "diagnose_cpu_spike": 0.10, "diagnose_memory_leak": -0.20},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_kill_process": -0.15, "fix_restart_service": -0.25, "fix_rollback": -0.20},
            },
        },
    },

    # ═══════════════════════════════════════════════════════════════════════════
    # MEDIUM AMBIGUITY (0.40–0.65): INVESTIGATION HELPFUL
    # ═══════════════════════════════════════════════════════════════════════════

    {
        "id": "dev_M001", "ambiguity": 0.60,
        "visible_alert": "Database query response time: 2.8s avg (baseline: 180ms) | Duration: 12 min",
        "hidden_states": {
            "A": {
                "prob": 0.70, "root_cause": "table_lock",
                "reveal": "[TELEMETRY] SHOW PROCESSLIST: 94 queries WAITING on table lock | Long transaction: analytics-job (running 47min) | Blocking all writes to orders table",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_kill_process",
                "diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_memory_leak": -0.10, "diagnose_cpu_spike": -0.10, "diagnose_network_latency": 0.05},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_restart_service": 0.10, "fix_scale_up": -0.10, "fix_rollback": -0.20},
            },
            "B": {
                "prob": 0.30, "root_cause": "missing_index",
                "reveal": "[TELEMETRY] EXPLAIN shows full table scan: orders (220M rows) | New query pattern after feature release v3.1.2 | Index: orders_user_id missing",
                "correct_diagnosis": "diagnose_db_lock",
                "correct_fix": "fix_rollback",  # rollback the feature
                "diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_network_latency": 0.10, "diagnose_cpu_spike": -0.10, "diagnose_memory_leak": -0.15},
                "fix_rewards": {"fix_rollback": 0.40, "fix_kill_process": 0.10, "fix_scale_up": -0.10, "fix_restart_service": 0.05},
            },
        },
    },

    {
        "id": "dev_M002", "ambiguity": 0.50,
        "visible_alert": "WebSocket connections dropping | Reconnect storms observed | Rate: 340/min",
        "hidden_states": {
            "A": {
                "prob": 0.65, "root_cause": "connection_limit",
                "reveal": "[TELEMETRY] nginx worker_connections: 1024 (at limit) | Active: 1,024/1,024 | Upgrade connections: 47 queued | CPU: 28% | File descriptors: OK",
                "correct_diagnosis": "diagnose_network_latency",
                "correct_fix": "fix_scale_up",
                "diag_rewards": {"diagnose_network_latency": 0.40, "diagnose_db_lock": -0.10, "diagnose_cpu_spike": 0.10, "diagnose_memory_leak": -0.15},
                "fix_rewards": {"fix_scale_up": 0.40, "fix_restart_service": 0.10, "fix_kill_process": -0.15, "fix_rollback": -0.10},
            },
            "B": {
                "prob": 0.35, "root_cause": "client_bug",
                "reveal": "[TELEMETRY] App v4.2.1 deployed 13:00 | New WebSocket client reconnects every 3s regardless of connection state | Server connections: healthy | Bug in client retry logic",
                "correct_diagnosis": "diagnose_network_latency",
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_network_latency": 0.35, "diagnose_cpu_spike": 0.10, "diagnose_db_lock": -0.15, "diagnose_memory_leak": -0.15},
                "fix_rewards": {"fix_rollback": 0.40, "fix_scale_up": 0.10, "fix_restart_service": 0.05, "fix_kill_process": -0.10},
            },
        },
    },

    {
        "id": "dev_M003", "ambiguity": 0.45,
        "visible_alert": "Kubernetes pod restart loop | Pod: payment-worker | Restarts: 47 in 30 min",
        "hidden_states": {
            "A": {
                "prob": 0.75, "root_cause": "oom_kill",
                "reveal": "[TELEMETRY] OOMKilled: true | Memory limit: 512Mi | Last 3 restarts: OOM at 511Mi | Heap dump: large in-memory cache not bounded",
                "correct_diagnosis": "diagnose_memory_leak",
                "correct_fix": "fix_restart_service",  # with memory limit increase
                "diag_rewards": {"diagnose_memory_leak": 0.40, "diagnose_cpu_spike": 0.05, "diagnose_db_lock": -0.15, "diagnose_network_latency": -0.15},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_scale_up": 0.15, "fix_kill_process": 0.05, "fix_rollback": -0.10},
            },
            "B": {
                "prob": 0.25, "root_cause": "startup_crash",
                "reveal": "[TELEMETRY] Exit code: 1 | Logs: 'Failed to connect to Redis: connection refused' | Redis pod: CrashLoopBackOff | Dependency not healthy",
                "correct_diagnosis": "diagnose_db_lock",  # dependency issue
                "correct_fix": "fix_restart_service",  # restart Redis first
                "diag_rewards": {"diagnose_db_lock": 0.35, "diagnose_memory_leak": -0.20, "diagnose_network_latency": 0.10, "diagnose_cpu_spike": -0.10},
                "fix_rewards": {"fix_restart_service": 0.40, "fix_rollback": 0.15, "fix_kill_process": -0.10, "fix_scale_up": -0.15},
            },
        },
    },

    # ═══════════════════════════════════════════════════════════════════════════
    # LOW AMBIGUITY (0.05–0.25): INVESTIGATION WASTEFUL
    # Clear signals — agent should diagnose without investigating
    # ═══════════════════════════════════════════════════════════════════════════

    {
        "id": "dev_L001", "ambiguity": 0.10,
        "visible_alert": "CRITICAL: Disk /var/data 100% full on DB-PROD-01 | All writes failing | Data loss imminent",
        "hidden_states": {
            "A": {
                "prob": 1.0, "root_cause": "disk_full",
                "reveal": "[TELEMETRY] /var/data: 2TB/2TB | Largest: core dumps (847GB) from last week | MySQL write error: 'No space left on device' | Replication lag: 47s and growing",
                "correct_diagnosis": "diagnose_db_lock",   # I/O blocked = db_lock analogue
                "correct_fix": "fix_kill_process",         # kill dump-generating processes, clear space
                "diag_rewards": {"diagnose_db_lock": 0.40, "diagnose_memory_leak": 0.10, "diagnose_cpu_spike": -0.10, "diagnose_network_latency": -0.20},
                "fix_rewards": {"fix_kill_process": 0.40, "fix_restart_service": 0.15, "fix_scale_up": 0.05, "fix_rollback": -0.20},
            },
        },
    },

    {
        "id": "dev_L002", "ambiguity": 0.08,
        "visible_alert": "Deployment rollback requested by team lead | Reason: 'v2.4.1 causes checkout failures' | Rollback target: v2.4.0",
        "hidden_states": {
            "A": {
                "prob": 1.0, "root_cause": "bad_deploy",
                "reveal": "[TELEMETRY] v2.4.1 checkout_service: TypeError in cart.total() | Error rate: 28% | Revenue impact: ~$4k/min | v2.4.0: stable for 6 days",
                "correct_diagnosis": "diagnose_cpu_spike",  # deploy-related error
                "correct_fix": "fix_rollback",
                "diag_rewards": {"diagnose_cpu_spike": 0.30, "diagnose_db_lock": 0.20, "diagnose_memory_leak": -0.10, "diagnose_network_latency": -0.10},
                "fix_rewards": {"fix_rollback": 0.45, "fix_restart_service": 0.10, "fix_kill_process": -0.10, "fix_scale_up": -0.20},
            },
        },
    },
]

# ──────────────────────────────────────────────────────────────────────────────
# ACTION SETS
# ──────────────────────────────────────────────────────────────────────────────
# Stage 0 — diagnosis choices (order matters: index 0 is the fallback used
# by the task when an unrecognised action arrives at this stage).
_S0_BASE = [
    "diagnose_cpu_spike",
    "diagnose_memory_leak",
    "diagnose_db_lock",
    "diagnose_network_latency",
]
# Stage 0 before telemetry has been pulled: "investigate" is offered first.
_S0_WITH_INVEST = ["investigate", *_S0_BASE]
# Stage 1 — remediation choices (index 0 is likewise the fallback).
_S1 = [
    "fix_restart_service",
    "fix_kill_process",
    "fix_rollback",
    "fix_scale_up",
]
# Stage 2 — verification choices.
_S2 = [
    "verify_metrics_ok",
    "verify_check_logs",
    "verify_ask_user",
]
# Stage 3 — incident close-out choices.
_S3 = [
    "close_resolved",
    "close_partial",
    "escalate_senior",
]


def _pick_hidden_state(scenario: dict, seed: Optional[int], ep: int) -> str:
    """Deterministically choose which hidden state of *scenario* is active.

    The scenario id, episode index and seed are hashed with MD5 and the
    digest is reduced to a draw in [0, 1) with 1e-4 resolution. The draw is
    then matched against the cumulative ``prob`` values of the hidden states
    in their dict order, so the same (scenario, ep, seed) triple always gives
    the same state key.

    Args:
        scenario: Scenario dict with ``"id"`` and ``"hidden_states"`` entries.
        seed: Episode seed; ``None`` is hashed as the literal text ``none``.
        ep: Episode index.

    Returns:
        The key of the selected hidden state. A scenario with a single hidden
        state returns that key without hashing. If the draw is not below the
        accumulated total, the last key is returned.
    """
    hidden = scenario["hidden_states"]
    labels = list(hidden)
    if len(labels) == 1:
        return labels[0]

    seed_tag = "none" if seed is None else seed
    digest = hashlib.md5(
        f"{scenario['id']}_ep{ep}_seed{seed_tag}".encode()
    ).hexdigest()
    draw = (int(digest, 16) % 10_000) / 10_000.0

    running = 0.0
    for label in labels:
        running += hidden[label]["prob"]
        if draw < running:
            return label
    # Fallback for float rounding or probabilities that sum below the draw.
    return labels[-1]


class DevOpsIncidentTask(BaseTask):
    """Four-stage incident-response episode: diagnose → fix → verify → close.

    Every ``reset`` moves on to the next entry of ``_SCENARIO_CLASSES``
    (round robin) and selects one of its hidden states through
    ``_pick_hidden_state``. While still at stage 0 the agent may issue
    ``investigate`` once; this reveals the hidden state's telemetry and does
    not advance the stage counter.
    """

    task_id = "devops_incident"
    max_steps = 4   # diagnose → fix → verify → close (INVESTIGATE does not consume a step)

    def __init__(self):
        self._ep = -1                          # episode index; first reset() makes it 0
        self._seed: Optional[int] = None       # seed passed to the latest reset()
        self._scenario: dict = {}              # active entry of _SCENARIO_CLASSES
        self._active_state_key: str = "A"      # key of the selected hidden state
        self._active_state: dict = {}          # the selected hidden-state dict
        self._step = 0                         # current stage, 0..max_steps
        self._api_calls = 0                    # every step() call, including investigate
        self._history: list = []               # rewarded actions of the current episode
        self._done = False
        self._investigated = False             # True once telemetry has been revealed
        self._diagnosis = ""                   # diagnosis action recorded at stage 0
        self._fix = ""                         # fix action recorded at stage 1

    def reset(self, seed: Optional[int] = None):
        """Start a new episode and return its first observation.

        Args:
            seed: Optional seed. It is stored, fed to ``_pick_hidden_state``,
                and (when not ``None``) also used to seed the global
                ``random`` module.

        Returns:
            The stage-0 ``Observation`` for the newly selected scenario.
        """
        self._ep += 1
        self._seed = seed
        if seed is not None:
            # Global side effect kept for callers that rely on it; the hidden
            # state itself is chosen by hashing, not by this RNG.
            random.seed(seed)
        self._scenario = _SCENARIO_CLASSES[self._ep % len(_SCENARIO_CLASSES)]
        self._active_state_key = _pick_hidden_state(self._scenario, seed, self._ep)
        self._active_state = self._scenario["hidden_states"][self._active_state_key]
        self._step = 0
        self._api_calls = 0
        self._history = []
        self._done = False
        self._investigated = False
        self._diagnosis = ""
        self._fix = ""
        return self._obs()

    @staticmethod
    def _blend(base_r, cal_r):
        """Mix a scenario base reward with a calibration reward, bounded to [0.01, 0.99]."""
        return max(0.01, min(0.99, (base_r * 0.35) + (cal_r * 0.65)))

    def step(self, action: Action):
        """Apply one agent action and return ``(observation, reward, done, info)``.

        ``investigate`` is only honoured at stage 0 and only once per episode;
        otherwise it yields a 0.01 error reward that is not written to the
        history. Any other action is scored according to the current stage and
        then advances the stage counter.

        NOTE(review): an unrecognised action at stage 0 or 1 is silently
        replaced by the first action of that stage (``_S0_BASE[0]`` /
        ``_S1[0]``), which is the correct answer in some scenarios — consider
        penalising invalid actions instead.

        Args:
            action: Object whose ``type`` attribute names the chosen action.

        Raises:
            RuntimeError: If the episode has already finished.
        """
        if self._done:
            raise RuntimeError("Episode done. Call reset().")
        t = action.type
        self._api_calls += 1

        # ── INVESTIGATE: pulls telemetry, does NOT advance _step ─────────────
        if t == "investigate":
            if self._step != 0:
                return self._obs(), Reward(value=0.01, breakdown={"error": "investigate_invalid_step"}, raw=0.01), False, {}
            if self._investigated:
                # Telemetry is already revealed and _obs() no longer offers
                # "investigate". Rejecting repeats keeps the investigation
                # reward from being collected more than once per episode.
                return self._obs(), Reward(value=0.01, breakdown={"error": "investigate_already_done"}, raw=0.01), False, {}
            self._investigated = True
            r = investigation_reward(self._scenario["ambiguity"])
            self._history.append({
                "api_call": self._api_calls, "step": self._step, "action": "investigate",
                "reward": {"value": r, "breakdown": {"investigation": r}},
                "reveal": self._active_state["reveal"],
            })
            return self._obs(), Reward(value=r, breakdown={"investigation": r}, raw=r), False, {
                "info": "Telemetry data retrieved. Make your diagnosis.",
                "reveal": self._active_state["reveal"],
            }

        # ── STEP 0: Diagnosis ────────────────────────────────────────────────
        if self._step == 0:
            if t not in _S0_BASE:
                t = _S0_BASE[0]
            base_r = self._active_state["diag_rewards"].get(t, -0.10)
            correct = (t == self._active_state["correct_diagnosis"])
            cal_r = calibration_reward(correct, self._scenario["ambiguity"], self._investigated)
            rval = self._blend(base_r, cal_r)
            breakdown = {
                "diagnosis_base": base_r, "calibration_reward": cal_r,
                "investigated": self._investigated, "ambiguity": self._scenario["ambiguity"],
            }
            self._diagnosis = t

        # ── STEP 1: Fix ──────────────────────────────────────────────────────
        elif self._step == 1:
            if t not in _S1:
                t = _S1[0]
            base_r = self._active_state["fix_rewards"].get(t, -0.10)
            # Reckless fix: killing a process without having investigated
            # costs an extra 0.10 (only fix_kill_process is penalised).
            if not self._investigated and t == "fix_kill_process":
                base_r -= 0.10
            correct = (t == self._active_state["correct_fix"])
            # The fix stage passes a reduced ambiguity (60%) to the calibration helper.
            cal_r = calibration_reward(correct, self._scenario["ambiguity"] * 0.6, self._investigated)
            rval = self._blend(base_r, cal_r)
            breakdown = {"fix_base": base_r, "calibration_reward": cal_r}
            self._fix = t

        # ── STEP 2: Verification ─────────────────────────────────────────────
        elif self._step == 2:
            rval = 0.25 if t == "verify_metrics_ok" else 0.12
            breakdown = {"verification": rval}

        # ── STEP 3: Close ────────────────────────────────────────────────────
        elif self._step == 3:
            rval = 0.20 if t == "close_resolved" else 0.08
            breakdown = {"resolution": rval}
            self._done = True

        else:
            # Defensive: only reachable if max_steps is raised above 4.
            return self._obs(), Reward(value=0.01, breakdown={}, raw=0.01), True, {}

        reward = Reward(value=round(rval, 4), breakdown=breakdown, raw=rval)
        self._history.append({
            "api_call": self._api_calls, "step": self._step, "action": t,
            "reward": {"value": reward.value, "breakdown": breakdown},
        })
        self._step += 1
        if self._step >= self.max_steps:
            self._done = True
        obs = self._obs()
        return obs, reward, self._done, {
            "step": self._step - 1, "action": t,
            "episode_score": self.grade_episode(self._history) if self._done else None,
        }

    def state(self):
        """Return a plain-dict snapshot of the episode.

        ``root_cause`` reads ``"LOCKED"`` until ``investigate`` has been used.
        """
        return {
            "task_id": self.task_id, "step": self._step, "done": self._done,
            "scenario_id": self._scenario.get("id", ""),
            "ambiguity": self._scenario.get("ambiguity", 0.0),
            "investigated": self._investigated,
            "root_cause": self._active_state.get("root_cause", "LOCKED") if self._investigated else "LOCKED",
            "diagnosis": self._diagnosis, "fix": self._fix,
        }

    def grade_episode(self, history):
        """Sum the ``reward.value`` of every history entry and pass it through ``clamp``.

        Entries without a ``reward``/``value`` contribute 0.0.
        """
        # Float start value keeps the total a float even for an empty history.
        total = sum((e.get("reward", {}).get("value", 0.0) for e in history), 0.0)
        return clamp(total)

    def _obs(self) -> Observation:
        """Build the observation for the current stage.

        Requires ``reset()`` to have been called (reads the active scenario).
        Once the episode is done, the stage-3 view is reused and the list of
        available actions is empty.
        """
        s = self._scenario
        idx = min(self._step, 3)   # stage views stop at 3 even when _step == max_steps
        avails = [_S0_WITH_INVEST if not self._investigated else _S0_BASE, _S1, _S2, _S3]
        avail = avails[idx] if not self._done else []

        if self._investigated:
            telemetry = self._active_state["reveal"]
        else:
            # Hint text depends on how ambiguous the scenario is declared to be.
            ambiguity = s["ambiguity"]
            if ambiguity >= 0.70:
                telemetry = "[TELEMETRY LOCKED] ⚠️ Multiple root causes possible — use 'investigate' to pull stack traces, heap dumps, and metrics"
            elif ambiguity >= 0.40:
                telemetry = "[TELEMETRY LOCKED] ⚡ Partial signals available — investigation recommended"
            else:
                telemetry = "[TELEMETRY LOCKED] ✅ Alert is self-explanatory — you may diagnose directly"

        prompts = [
            f"Alert: {s['visible_alert']}\nTelemetry: {telemetry}\n\nSelect diagnosis. Available: {avail}",
            f"Diagnosis: {self._diagnosis}\nApply fix. Available: {avail}",
            f"Fix applied: {self._fix}\nVerify system health. Available: {avail}",
            f"System stable. Close the incident. Available: {avail}",
        ]

        states = [
            {"alert": s["visible_alert"], "telemetry": telemetry, "investigated": self._investigated},
            {"alert": s["visible_alert"], "diagnosis": self._diagnosis},
            {"diagnosis": self._diagnosis, "fix": self._fix},
            {"diagnosis": self._diagnosis, "fix": self._fix, "verified": True},
        ]

        return Observation(
            task_id=self.task_id,
            step=self._step,
            state=states[idx],
            history=list(self._history),
            available_actions=avail,
            done=self._done,
            prompt=prompts[idx],
            context=prompts[idx],
            task=self.task_id,
            action_to_evaluate="Evaluating agent response...",
        )