from __future__ import annotations
import uuid
from typing import Dict, Any, List
from models import Action, ActionType
from tasks.base import BaseTask, InternalState, StepOutput, semantic_match
# Timestamp (UTC, ISO-8601) used as the incident start time for the scenario.
INCIDENT_TIME = "2026-03-30T14:22:00Z"

# Static call graph: one record per service listing what it calls and what
# calls it.
DEPENDENCIES = [
    {
        "service": "api-gateway",
        "calls": ["ml-inference-service", "product-service"],
        "called_by": [],
    },
    {
        "service": "ml-inference-service",
        "calls": [],
        "called_by": ["api-gateway"],
    },
    {
        "service": "log-aggregator",
        "calls": [],
        "called_by": [],
    },
    {
        "service": "product-service",
        "calls": [],
        "called_by": ["api-gateway"],
    },
]

# Canned log lines for log-aggregator: disk usage climbing until writes fail.
AGGREGATOR_LOGS = [
    "[14:20:01] INFO Log ingestion running: 48MB/s",
    "[14:21:05] WARN Disk usage at 91% (/var/log/aggregated)",
    "[14:21:45] WARN Disk usage at 95% - log rotation overdue",
    "[14:22:01] ERROR Disk usage at 99% - write failure imminent",
    "[14:22:02] ERROR Failed to write log chunk: No space left on device (ENOSPC)",
    "[14:22:04] WARN Dropping incoming logs: buffer overflow (48000 messages dropped)",
    "[14:22:05] ERROR Log rotation job FAILED: No space left on device",
    "[14:22:10] CRIT Disk 100% full - all log writes failing",
]

# Canned log lines for ml-inference-service: a model reload loop. The literal
# "{version}" in the last line is a placeholder substituted via str.replace.
ML_LOGS = [
    "[14:21:00] INFO ml-inference-service starting",
    "[14:21:01] INFO Loading model: recommendation-v2.1 (2.3GB)",
    "[14:21:12] INFO Model loaded in 11.2s",
    "[14:21:12] WARN Model checksum mismatch - reloading",
    "[14:21:23] INFO Model loaded in 11.1s",
    "[14:21:23] WARN Model checksum mismatch - reloading",
    "[14:21:34] WARN Model reload loop detected: 6 reloads in 60s",
    "[14:22:01] ERROR CPU throttled: 100% sustained for 120s",
    "[14:22:02] WARN Deployment {version} introduced new model checksum validation - may have bug",
]

# Canned log lines for api-gateway: recommendations endpoint slowing, then
# timing out.
API_LOGS = [
    "[14:22:00] INFO GET /api/v1/recommendations 200 145ms",
    "[14:22:05] WARN GET /api/v1/recommendations 200 4823ms (ml-inference slow)",
    "[14:22:15] ERROR GET /api/v1/recommendations 504 Gateway Timeout",
]
class BonusTask(BaseTask):
    """Scenario with two independent faults that must both be fixed.

    The initial state contains a full disk on ``log-aggregator`` and a model
    reload loop on ``ml-inference-service`` (which in turn slows
    ``api-gateway``).  The incident is marked resolved only once both the
    ``fix_disk`` and ``fix_ml`` rewards have been granted.
    """

    def initialize(self) -> InternalState:
        """Build the starting state for a new episode.

        Service metrics and the ml-inference version string are drawn from
        ``self.rng``; the order of the draws below is significant for
        reproducibility with a seeded generator.

        Returns:
            A fresh ``InternalState`` with services, alerts, logs and ground
            truth populated, and the ml-inference version stored on
            ``state._ml_version``.
        """
        ml_ver = f"v2.{self.rng.randint(0, 3)}.{self.rng.randint(0, 5)}"
        logs = {
            "log-aggregator": AGGREGATOR_LOGS[:],
            # Substitute the randomly chosen version into the log template.
            "ml-inference-service": [l.replace("{version}", ml_ver) for l in ML_LOGS],
            "api-gateway": API_LOGS[:],
            "product-service": ["[14:22:00] INFO Service healthy - 0 errors"],
        }
        services = {
            "api-gateway": {
                "name": "api-gateway", "status": "degraded",
                "cpu_percent": round(self.rng.uniform(40, 58), 1),
                "memory_percent": round(self.rng.uniform(44, 56), 1),
                "error_rate": round(self.rng.uniform(3.0, 6.0), 2),
                "latency_p99_ms": round(self.rng.uniform(8000, 12000), 0),
                "replicas_running": 2, "replicas_desired": 2,
                "current_version": "v3.1.0", "last_deployed": "2026-03-20T08:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
            "ml-inference-service": {
                "name": "ml-inference-service", "status": "degraded",
                "cpu_percent": round(self.rng.uniform(94, 100), 1),
                "memory_percent": round(self.rng.uniform(55, 72), 1),
                "error_rate": round(self.rng.uniform(1.5, 4.0), 2),
                "latency_p99_ms": round(self.rng.uniform(9000, 14000), 0),
                "replicas_running": 2, "replicas_desired": 2,
                "current_version": ml_ver, "last_deployed": "2026-03-30T14:20:55Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
            "log-aggregator": {
                "name": "log-aggregator", "status": "degraded",
                "cpu_percent": round(self.rng.uniform(18, 30), 1),
                "memory_percent": round(self.rng.uniform(40, 52), 1),
                "error_rate": round(self.rng.uniform(5.0, 9.0), 2),
                "latency_p99_ms": round(self.rng.uniform(200, 500), 0),
                "replicas_running": 1, "replicas_desired": 1,
                "current_version": "v1.3.0", "last_deployed": "2026-03-01T10:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
            "product-service": {
                "name": "product-service", "status": "healthy",
                "cpu_percent": round(self.rng.uniform(25, 38), 1),
                "memory_percent": round(self.rng.uniform(35, 48), 1),
                "error_rate": 0.0,
                "latency_p99_ms": round(self.rng.uniform(15, 35), 0),
                "replicas_running": 3, "replicas_desired": 3,
                "current_version": "v2.0.1", "last_deployed": "2026-03-15T12:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
        }
        alerts = [
            {
                "id": "B001", "severity": "critical", "service": "log-aggregator",
                "message": "Disk 100% full on log-aggregator - dropping 48000 log messages/min",
                "timestamp": "2026-03-30T14:22:10Z", "acknowledged": False,
            },
            {
                "id": "B002", "severity": "critical", "service": "ml-inference-service",
                "message": f"CPU sustained 99%+ for 120s - model reload loop detected ({ml_ver})",
                "timestamp": "2026-03-30T14:22:01Z", "acknowledged": False,
            },
            {
                "id": "B003", "severity": "warning", "service": "api-gateway",
                "message": "P99 latency 10200ms on /recommendations - upstream ml-inference slow",
                "timestamp": "2026-03-30T14:22:15Z", "acknowledged": False,
            },
        ]
        state = InternalState(
            episode_id=str(uuid.uuid4()), task_id="bonus", step=0, max_steps=25,
            services=services, alerts=alerts, logs=logs,
            action_history=[], total_reward=0.0, incident_resolved=False,
            ground_truth_root_cause="disk_full_log_aggregator AND model_reload_loop_ml_inference",
            ground_truth_fix="alert_oncall for disk cleanup AND rollback ml-inference-service",
            incident_start_time=INCIDENT_TIME,
            healthy_services=["product-service"],
            service_dependencies=DEPENDENCIES,
        )
        state._ml_version = ml_ver
        return state

    def step(self, state: InternalState, action: Action) -> StepOutput:
        """Apply one agent action, score it, and advance the episode.

        Each positive reward is tracked by a tag in ``state.rewards_given`` so
        it is paid at most once per episode.  Penalties (acting on a healthy
        service, late no-ops, inapplicable actions) apply every time.

        Args:
            state: Mutable episode state; updated in place.
            action: The action chosen by the agent for this step.

        Returns:
            A ``StepOutput`` carrying the (same, mutated) state, this step's
            reward rounded to 4 decimals, the ``done`` flag and an ``info``
            dict (``resolution`` on success, ``reason`` on step exhaustion).
        """
        state.step += 1
        state._apply_sla_degradation()

        at = action.action_type
        svc = action.service or ""
        reward = 0.0
        done = False
        info: Dict[str, Any] = {}
        result_text, error_text = self._apply_action_to_logs(state, action)

        # Information gathering: (action, service) -> (reward tag, amount).
        # read_logs and search_logs share a tag, so only one of them pays out
        # per service.
        gather_map = {
            ("read_logs", "log-aggregator"): ("rl_agg", 0.05),
            ("search_logs", "log-aggregator"): ("rl_agg", 0.05),
            ("read_logs", "ml-inference-service"): ("rl_ml", 0.05),
            ("search_logs", "ml-inference-service"): ("rl_ml", 0.05),
            ("read_metrics", "log-aggregator"): ("rm_agg", 0.05),
            ("read_metrics", "ml-inference-service"): ("rm_ml", 0.05),
        }
        gathered = gather_map.get((at.value, svc))
        if gathered is not None:
            tag, amount = gathered
            if tag not in state.rewards_given:
                reward += amount
                state.rewards_given.add(tag)

        if at == ActionType.READ_RUNBOOK:
            if "runbook" not in state.rewards_given:
                reward += 0.04
                state.rewards_given.add("runbook")

        if at == ActionType.DIAGNOSE:
            rc = action.root_cause or ""
            has_disk = semantic_match(rc, ["disk", "storage", "full", "space", "log", "aggregat"])
            has_ml = semantic_match(rc, ["ml", "inference", "model", "reload", "cpu", "loop"])
            result_text = f"Diagnosis recorded: {rc}"
            if has_disk and has_ml:
                if "diagnose_both" not in state.rewards_given:
                    reward += 0.20
                    state.rewards_given.add("diagnose_both")
            elif has_disk or has_ml:
                if "diagnose_one" not in state.rewards_given:
                    reward += 0.08
                    state.rewards_given.add("diagnose_one")

        # Fix 1: disk issue via oncall
        if at == ActionType.ALERT_ONCALL:
            reason = (action.reason or "").lower()
            if semantic_match(reason, ["disk", "log", "storage", "space", "aggregat"]):
                if "fix_disk" not in state.rewards_given:
                    reward += 0.20
                    state.rewards_given.add("fix_disk")
                    result_text = "SRE paged for disk cleanup. Volume extension underway (~5 min)."
                    if "fix_ml" in state.rewards_given:
                        state.incident_resolved = True
                        done = True
                        info["resolution"] = "incident_resolved"
            elif "fix_disk" not in state.rewards_given:
                # Partial credit for paging without naming the disk problem.
                # Paid once only: without the tag an agent could repeat a
                # vague page every step and farm +0.08 each time.
                if "oncall_vague" not in state.rewards_given:
                    reward += 0.08
                    state.rewards_given.add("oncall_vague")
                result_text = "On-call paged. Clarify disk/log issue for faster resolution."

        # Fix 2: ML reload loop via rollback or restart
        if at in (ActionType.ROLLBACK, ActionType.RESTART_SERVICE) and svc == "ml-inference-service":
            if "fix_ml" not in state.rewards_given:
                # Rollback is rewarded more than a restart.
                r_base = 0.20 if at == ActionType.ROLLBACK else 0.12
                reward += r_base
                state.rewards_given.add("fix_ml")
                ml_service = state.services["ml-inference-service"]
                ml_service["cpu_percent"] = round(self.rng.uniform(22, 38), 1)
                ml_service["latency_p99_ms"] = round(self.rng.uniform(80, 140), 0)
                ml_service["error_rate"] = 0.0
                action_word = "rolled back" if at == ActionType.ROLLBACK else "restarted"
                result_text = f"ml-inference-service {action_word}. Reload loop stopped. CPU recovering."
                if "fix_disk" in state.rewards_given:
                    state.incident_resolved = True
                    done = True
                    info["resolution"] = "incident_resolved"

        # Penalties: disturbing a healthy service, idling late in the
        # episode, or using an action that has no bearing on this incident.
        if at in (ActionType.RESTART_SERVICE, ActionType.ROLLBACK) and svc in state.healthy_services:
            reward -= 0.08
        if at == ActionType.NOOP and state.step > 5:
            reward -= 0.03
        if at in (ActionType.BLOCK_IP_RANGE, ActionType.CREATE_INDEX, ActionType.FAILOVER):
            reward -= 0.10
            error_text = f"Action {at.value} is not applicable to this incident."

        state.total_reward = self._clamp(state.total_reward + reward)
        if state.step >= state.max_steps and not done:
            done = True
            info["reason"] = "max_steps_reached"

        # NOTE(review): the built observation is not passed to StepOutput; the
        # call is retained in case it updates state as a side effect - confirm.
        state._build_observation(last_action_result=result_text, last_action_error=error_text)
        state.action_history.append({"step": state.step, "action": action.model_dump(), "reward": round(reward, 4)})
        return StepOutput(next_state=state, reward=round(reward, 4), done=done, info=info)
|