Arijit-07 committed on
Commit
0a14522
·
1 Parent(s): 8be69b1

Add Task 6: Database Performance Degradation (missing index investigation)

Browse files
api.py CHANGED
@@ -25,7 +25,7 @@ app.add_middleware(
25
  allow_headers=["*"],
26
  )
27
 
28
- VALID_TASKS = ("easy", "medium", "hard", "bonus", "security")
29
  _env: Optional[DevOpsIncidentEnv] = None
30
 
31
 
@@ -96,6 +96,7 @@ def dashboard():
96
  .hard {{ background: #3a1a1a; color: #f44336; }}
97
  .bonus {{ background: #1a1a3a; color: #9c27b0; }}
98
  .security {{ background: #3a1a1a; color: #ff5252; }}
 
99
  .endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
100
  .endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
101
  .endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
@@ -139,6 +140,11 @@ def dashboard():
139
  <h3>Security Incident (DDoS)</h3>
140
  <p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
141
  </div>
 
 
 
 
 
142
  </div>
143
 
144
  <div class="endpoints">
@@ -270,6 +276,16 @@ def list_tasks():
270
  "The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
271
  ),
272
  },
 
 
 
 
 
 
 
 
 
 
273
  ]
274
  }
275
 
 
25
  allow_headers=["*"],
26
  )
27
 
28
+ VALID_TASKS = ("easy", "medium", "hard", "bonus", "security", "database")
29
  _env: Optional[DevOpsIncidentEnv] = None
30
 
31
 
 
96
  .hard {{ background: #3a1a1a; color: #f44336; }}
97
  .bonus {{ background: #1a1a3a; color: #9c27b0; }}
98
  .security {{ background: #3a1a1a; color: #ff5252; }}
99
+ .database {{ background: #1a2c3a; color: #4fc3f7; }}
100
  .endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
101
  .endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
102
  .endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
 
140
  <h3>Security Incident (DDoS)</h3>
141
  <p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
142
  </div>
143
+ <div class="task">
144
+ <span class="badge database">DATABASE</span>
145
+ <h3>Database Degradation</h3>
146
+ <p>Missing schema index causing slow queries and full table scans. Fix via index creation or rollback. Max 20 steps.</p>
147
+ </div>
148
  </div>
149
 
150
  <div class="endpoints">
 
276
  "The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
277
  ),
278
  },
279
+ {
280
+ "id": "database",
281
+ "name": "Database Performance Degradation",
282
+ "difficulty": "hard",
283
+ "max_steps": 20,
284
+ "description": (
285
+ "A recent migration added a user_segment column to the orders table without an index. "
286
+ "Sequential table scans are spiking DB CPU. Discovered via read_metrics and the slow query log."
287
+ ),
288
+ },
289
  ]
290
  }
291
 
data/runbooks/database_performance.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Database Performance Degradation
2
+
3
+ This runbook outlines the recommended procedure for handling database performance issues, specifically focusing on slow queries, high CPU caused by sequential table scans, and missing query indexes.
4
+
5
+ ## 1. Diagnose Database Load
6
+ If the database (`postgres-primary`) is exhibiting high CPU or degraded performance without actual service crashes, use the `read_metrics` action on the database.
7
+ - Look at the `Sequential scans/min`.
8
+ - If this value is massively elevated (e.g. 500+ instead of single digits), it means queries are scanning entire tables instead of looking up rows in an index.
9
+
10
+ ## 2. Check Slow Query Logs
11
+ Use `read_logs` on the database to verify the slow queries.
12
+ - Slow query logs will identify specific query strings taking >1000ms.
13
+ - Affected entries are typically suffixed with a `[seq_scan]` tag, indicating that the query scanned the table sequentially instead of using an index.
14
+ - The logs may also include automated schema anomaly warnings, such as "MISSING INDEX DETECTED".
15
+
16
+ ## 3. Resolving Missing Indexes
17
+ If a missing index is detected, a recent schema migration most likely added a column without creating an index for it.
18
+ - **Action Option 1:** Use the `create_index` action, specifying the target `table` and `column` (e.g. `table="orders"`, `column="user_segment"`). This is the best approach if the data is already deployed, as it fixes the issue instantly without breaking backend code.
19
+ - **Action Option 2:** Use the `rollback` action on the database service. This reverts the schema migration. It restores performance, but any downstream code that depends on the new schema will error until it is patched.
20
+
21
+ ## 4. What NOT to do
22
+ - Do **NOT** `restart_service`. Connection pool exhaustion is a symptom, not the cause. A restart only drops connections temporarily; the service is immediately overwhelmed again by the same slow queries.
23
+ - Do **NOT** `scale_up`. Adding more replicas/workers will only hammer the slow database harder, increasing lock contention and further starving the CPU.
env.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
2
  import random
3
  from typing import Optional
4
  from models import Action, Observation, StepResult, State
5
- from tasks import EasyTask, MediumTask, HardTask, BonusTask, SecurityTask
6
  from tasks.base import InternalState
7
 
8
  TASK_MAP = {
@@ -11,6 +11,7 @@ TASK_MAP = {
11
  "hard": HardTask,
12
  "bonus": BonusTask,
13
  "security": SecurityTask,
 
14
  }
15
 
16
 
 
2
  import random
3
  from typing import Optional
4
  from models import Action, Observation, StepResult, State
5
+ from tasks import EasyTask, MediumTask, HardTask, BonusTask, SecurityTask, DatabaseTask
6
  from tasks.base import InternalState
7
 
8
  TASK_MAP = {
 
11
  "hard": HardTask,
12
  "bonus": BonusTask,
13
  "security": SecurityTask,
14
+ "database": DatabaseTask,
15
  }
16
 
17
 
models.py CHANGED
@@ -17,6 +17,7 @@ class ActionType(str, Enum):
17
  NOOP = "noop"
18
  SEARCH_LOGS = "search_logs"
19
  BLOCK_IP_RANGE = "block_ip_range"
 
20
 
21
 
22
  class Action(BaseModel):
@@ -28,6 +29,8 @@ class Action(BaseModel):
28
  reason: Optional[str] = None
29
  query: Optional[str] = None # used with search_logs
30
  ip_range: Optional[str] = None
 
 
31
 
32
 
33
  class Alert(BaseModel):
 
17
  NOOP = "noop"
18
  SEARCH_LOGS = "search_logs"
19
  BLOCK_IP_RANGE = "block_ip_range"
20
+ CREATE_INDEX = "create_index"
21
 
22
 
23
  class Action(BaseModel):
 
29
  reason: Optional[str] = None
30
  query: Optional[str] = None # used with search_logs
31
  ip_range: Optional[str] = None
32
+ table: Optional[str] = None
33
+ column: Optional[str] = None
34
 
35
 
36
  class Alert(BaseModel):
openenv.yaml CHANGED
@@ -87,6 +87,18 @@ tasks:
87
  expected_score_random_agent: 0.01
88
  expected_score_strong_llm: 0.35
89
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  action_space:
91
  type: structured
92
  description: >
@@ -118,6 +130,8 @@ action_space:
118
  description: Take no action this step
119
  - name: block_ip_range
120
  description: Block traffic from an IP range (CIDR format)
 
 
121
 
122
  observation_space:
123
  type: structured
 
87
  expected_score_random_agent: 0.01
88
  expected_score_strong_llm: 0.35
89
 
90
+ - id: database
91
+ name: Database Performance Degradation (Missing Index)
92
+ description: >
93
+ A database migration that ran 15 minutes ago added a new column without creating an index for it.
94
+ Now queries are doing full table scans sequentially, leading to major DB degradation.
95
+ The agent must read the Postgres slow query logs, evaluate sequential scan rates via metrics, and then either create the missing index or roll back the migration.
96
+ difficulty: hard
97
+ max_steps: 20
98
+ reward_range: [0.0, 1.0]
99
+ expected_score_random_agent: 0.01
100
+ expected_score_strong_llm: 0.35
101
+
102
  action_space:
103
  type: structured
104
  description: >
 
130
  description: Take no action this step
131
  - name: block_ip_range
132
  description: Block traffic from an IP range (CIDR format)
133
+ - name: create_index
134
+ description: Create a database index on a specific table and column
135
 
136
  observation_space:
137
  type: structured
server/app.py CHANGED
@@ -14,7 +14,7 @@ try:
14
  except ImportError:
15
  HAS_WEB_INTERFACE = False
16
 
17
- VALID_TASKS = ("easy", "medium", "hard", "bonus", "security")
18
  _env = DevOpsEnvironment()
19
  app = FastAPI(
20
  title="DevOps Incident Response — OpenEnv",
@@ -96,6 +96,7 @@ def dashboard():
96
  .hard {{ background: #3a1a1a; color: #f44336; }}
97
  .bonus {{ background: #1a1a3a; color: #9c27b0; }}
98
  .security {{ background: #3a1a1a; color: #ff5252; }}
 
99
  .endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
100
  .endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
101
  .endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
@@ -139,6 +140,11 @@ def dashboard():
139
  <h3>Security Incident (DDoS)</h3>
140
  <p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
141
  </div>
 
 
 
 
 
142
  </div>
143
 
144
  <div class="endpoints">
@@ -268,6 +274,16 @@ def list_tasks():
268
  "The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
269
  ),
270
  },
 
 
 
 
 
 
 
 
 
 
271
  ]
272
  }
273
 
 
14
  except ImportError:
15
  HAS_WEB_INTERFACE = False
16
 
17
+ VALID_TASKS = ("easy", "medium", "hard", "bonus", "security", "database")
18
  _env = DevOpsEnvironment()
19
  app = FastAPI(
20
  title="DevOps Incident Response — OpenEnv",
 
96
  .hard {{ background: #3a1a1a; color: #f44336; }}
97
  .bonus {{ background: #1a1a3a; color: #9c27b0; }}
98
  .security {{ background: #3a1a1a; color: #ff5252; }}
99
+ .database {{ background: #1a2c3a; color: #4fc3f7; }}
100
  .endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
101
  .endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
102
  .endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
 
140
  <h3>Security Incident (DDoS)</h3>
141
  <p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
142
  </div>
143
+ <div class="task">
144
+ <span class="badge database">DATABASE</span>
145
+ <h3>Database Degradation</h3>
146
+ <p>Missing schema index causing slow queries and full table scans. Fix via index creation or rollback. Max 20 steps.</p>
147
+ </div>
148
  </div>
149
 
150
  <div class="endpoints">
 
274
  "The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
275
  ),
276
  },
277
+ {
278
+ "id": "database",
279
+ "name": "Database Performance Degradation",
280
+ "difficulty": "hard",
281
+ "max_steps": 20,
282
+ "description": (
283
+ "A recent migration added a user_segment column to the orders table without an index. "
284
+ "Sequential table scans are spiking DB CPU. Discovered via read_metrics and the slow query log."
285
+ ),
286
+ },
287
  ]
288
  }
289
 
tasks/__init__.py CHANGED
@@ -3,5 +3,6 @@ from tasks.task_medium import MediumTask
3
  from tasks.task_hard import HardTask
4
  from tasks.task_bonus import BonusTask
5
  from tasks.task_security import SecurityTask
 
6
 
7
- __all__ = ["EasyTask", "MediumTask", "HardTask", "BonusTask", "SecurityTask"]
 
3
  from tasks.task_hard import HardTask
4
  from tasks.task_bonus import BonusTask
5
  from tasks.task_security import SecurityTask
6
+ from tasks.task_database import DatabaseTask
7
 
8
+ __all__ = ["EasyTask", "MediumTask", "HardTask", "BonusTask", "SecurityTask", "DatabaseTask"]
tasks/task_database.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import uuid
3
+ from typing import Dict, Any, List
4
+ from models import Action, ActionType
5
+ from tasks.base import BaseTask, InternalState, StepOutput, semantic_match
6
+
7
INCIDENT_TIME = "2026-04-12T14:22:00Z"

# Identifier of the schema migration blamed for the incident. It is defined once
# so the Postgres log line and the metrics report cannot drift apart. The date
# prefix matches INCIDENT_TIME; the previous literal (20260425) post-dated the
# incident it was supposed to have caused.
_MIGRATION_ID = "20260412_add_user_segment"

# Name of the database service the whole scenario revolves around.
_DB_SERVICE = "postgres-primary"

# Static call graph handed to InternalState as `service_dependencies`.
DEPENDENCIES: List[Dict[str, Any]] = [
    {"service": "api-gateway", "calls": ["order-service", "user-service"], "called_by": []},
    {"service": "order-service", "calls": ["postgres-primary"], "called_by": ["api-gateway"]},
    {"service": "analytics-service", "calls": ["postgres-primary"], "called_by": []},
    {"service": "postgres-primary", "calls": [], "called_by": ["order-service", "analytics-service"]},
    {"service": "user-service", "calls": [], "called_by": ["api-gateway"]},
]

# Canned log lines per service; `initialize` copies them into each episode.
POSTGRES_LOGS: List[str] = [
    "[14:22:01] SLOW_QUERY 4281ms: SELECT * FROM orders WHERE user_segment='premium' LIMIT 100 [seq_scan: 18M rows]",
    "[14:22:03] SLOW_QUERY 4190ms: SELECT COUNT(*) FROM orders WHERE user_segment='standard' [seq_scan: 18M rows]",
    "[14:22:05] SLOW_QUERY 4350ms: SELECT order_id, total FROM orders WHERE user_segment='enterprise' [seq_scan: 18M rows]",
    f"[14:22:07] INFO MISSING INDEX DETECTED: orders.user_segment has no index (added in migration {_MIGRATION_ID})",
    "[14:22:08] WARN Table scan count: 847/min (normal: 2/min) — index missing on hot column",
    "[14:22:09] SLOW_QUERY 4401ms: SELECT * FROM orders WHERE user_segment='premium' AND created_at > '2026-04-01' [seq_scan]",
]

ORDER_LOGS: List[str] = [
    "[14:22:01] WARN DB query timeout: getOrdersBySegment() exceeded 5000ms",
    "[14:22:02] ERROR Failed to fetch orders for dashboard: upstream DB timeout",
    "[14:22:05] WARN Retry 1/3: getOrdersBySegment() - 4300ms",
    "[14:22:09] ERROR Circuit breaker OPEN for postgres-primary read replica",
]

ANALYTICS_LOGS: List[str] = [
    "[14:22:00] INFO Starting hourly aggregation job: orders by user_segment",
    "[14:22:04] WARN Aggregation query running slow: 4100ms elapsed (expected: 80ms)",
    "[14:22:08] ERROR Aggregation job timed out after 300s — will retry in 60min",
    "[14:22:09] INFO Root cause likely: orders table scan (no index on user_segment)",
]


class DatabaseTask(BaseTask):
    """Missing-index scenario: `orders.user_segment` was added without an index.

    The episode ends successfully when the agent either creates an index on
    `orders.user_segment` (`create_index`) or rolls back `postgres-primary`
    (`rollback`). Investigation and diagnosis actions earn one-time rewards;
    restarting, scaling up, creating an unrelated index and late no-ops are
    penalised.
    """

    # (action type value, service) -> (one-time reward tag, reward amount).
    # read_logs and search_logs on the same service share a tag, so only the
    # first of the two is rewarded.
    _GATHER_REWARDS = {
        ("read_logs", "postgres-primary"): ("rl_pg", 0.10),
        ("search_logs", "postgres-primary"): ("rl_pg", 0.10),
        ("read_metrics", "postgres-primary"): ("rm_pg", 0.10),
        ("read_logs", "analytics-service"): ("rl_ana", 0.05),
        ("search_logs", "analytics-service"): ("rl_ana", 0.05),
    }

    # Keywords passed to `semantic_match` when scoring a DIAGNOSE action.
    _DIAGNOSIS_KEYWORDS = ["index", "migration", "user_segment", "seq_scan", "table scan"]

    def initialize(self) -> InternalState:
        """Build the starting state for a fresh episode.

        Returns:
            An InternalState with a new random episode id, five services (four
            degraded, `user-service` healthy), four unacknowledged alerts and
            per-service copies of the canned logs.
        """
        # Copy the module-level lists so one episode cannot mutate the
        # templates used by the next.
        logs = {
            "postgres-primary": POSTGRES_LOGS[:],
            "order-service": ORDER_LOGS[:],
            "analytics-service": ANALYTICS_LOGS[:],
            "api-gateway": ["[14:22:05] WARN Upstream order-service latency 4600ms"],
            "user-service": ["[14:22:00] INFO Service normal"],
        }

        services = {
            "postgres-primary": {
                "name": "postgres-primary", "status": "degraded",
                "cpu_percent": 94.0, "memory_percent": 65.0,
                "error_rate": 0.0, "latency_p99_ms": 4401.0,
                "replicas_running": 1, "replicas_desired": 1,
                "current_version": "v14.1", "last_deployed": "2025-01-01T00:00:00Z",
                "minutes_degraded": 15, "sla_breach": False,
            },
            "order-service": {
                "name": "order-service", "status": "degraded",
                "cpu_percent": 35.0, "memory_percent": 45.0,
                "error_rate": 2.5, "latency_p99_ms": 4800.0,
                "replicas_running": 3, "replicas_desired": 3,
                "current_version": "v2.1.0", "last_deployed": "2026-03-20T08:00:00Z",
                "minutes_degraded": 15, "sla_breach": False,
            },
            "analytics-service": {
                "name": "analytics-service", "status": "degraded",
                "cpu_percent": 25.0, "memory_percent": 30.0,
                "error_rate": 5.0, "latency_p99_ms": 300000.0,
                "replicas_running": 1, "replicas_desired": 1,
                "current_version": "v1.5.0", "last_deployed": "2026-04-10T11:00:00Z",
                "minutes_degraded": 15, "sla_breach": False,
            },
            "api-gateway": {
                "name": "api-gateway", "status": "degraded",
                "cpu_percent": 45.0, "memory_percent": 45.0,
                "error_rate": 1.5, "latency_p99_ms": 4600.0,
                "replicas_running": 5, "replicas_desired": 5,
                "current_version": "v3.1.0", "last_deployed": "2026-03-20T08:00:00Z",
                "minutes_degraded": 15, "sla_breach": False,
            },
            "user-service": {
                "name": "user-service", "status": "healthy",
                "cpu_percent": 15.0, "memory_percent": 30.0,
                "error_rate": 0.0, "latency_p99_ms": 25.0,
                "replicas_running": 2, "replicas_desired": 2,
                "current_version": "v1.1.2", "last_deployed": "2026-03-01T00:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
        }

        alerts = [
            {
                "id": "D001", "severity": "critical", "service": "order-service",
                "message": "P99 latency 4800ms (threshold: 500ms)",
                "timestamp": "2026-04-12T14:22:05Z", "acknowledged": False,
            },
            {
                "id": "D002", "severity": "critical", "service": "analytics-service",
                "message": "Hourly aggregation job timed out",
                "timestamp": "2026-04-12T14:22:08Z", "acknowledged": False,
            },
            {
                "id": "D003", "severity": "warning", "service": "postgres-primary",
                "message": "CPU 94% sustained 15min, high sequential scan rate",
                "timestamp": "2026-04-12T14:22:07Z", "acknowledged": False,
            },
            {
                "id": "D004", "severity": "warning", "service": "api-gateway",
                "message": "Upstream order-service latency 4600ms",
                "timestamp": "2026-04-12T14:22:09Z", "acknowledged": False,
            },
        ]

        return InternalState(
            episode_id=str(uuid.uuid4()), task_id="database", step=0, max_steps=20,
            services=services, alerts=alerts, logs=logs,
            action_history=[], total_reward=0.0, incident_resolved=False,
            ground_truth_root_cause="missing_index_orders_user_segment_column_migration",
            ground_truth_fix="create index on orders.user_segment OR rollback migration",
            incident_start_time=INCIDENT_TIME,
            healthy_services=["user-service"],
            service_dependencies=DEPENDENCIES,
        )

    @staticmethod
    def _grant_once(state: InternalState, tag: str, amount: float) -> float:
        """Return `amount` the first time `tag` is seen in this episode, else 0.0."""
        if tag in state.rewards_given:
            return 0.0
        state.rewards_given.add(tag)
        return amount

    @staticmethod
    def _mark_resolved(state: InternalState, info: Dict[str, Any]) -> None:
        """Apply the side effects shared by both successful fixes."""
        db = state.services[_DB_SERVICE]
        db["cpu_percent"] = 18.0
        db["latency_p99_ms"] = 12.0
        state.incident_resolved = True
        info["resolution"] = "incident_resolved"

    @staticmethod
    def _postgres_metrics_report(state: InternalState) -> str:
        """Render the scenario-specific metrics text for postgres-primary."""
        s = state.services[_DB_SERVICE]
        return (
            "=== Metrics: postgres-primary ===\n"
            f"Status: {s['status'].upper()}\n"
            f"CPU: {s['cpu_percent']:.1f}% (normal: 15%)\n"
            f"Memory: {s['memory_percent']:.1f}%\n"
            "Sequential scans/min: 847 (normal: 2)\n"
            "Index scans/min: 12 (normal: 890)\n"
            "Active queries: 48 (normal: 8)\n"
            f"Longest running query: {s['latency_p99_ms']:.0f}ms\n"
            f"Last migration: {_MIGRATION_ID} (14:07:00, 15 min ago)\n"
        )

    def step(self, state: InternalState, action: Action) -> StepOutput:
        """Apply one agent action to `state` and score it.

        Args:
            state: Episode state; mutated in place and returned in the output.
            action: The action chosen by the agent for this step.

        Returns:
            A StepOutput carrying the mutated state, this step's reward
            (rounded to 4 decimals), the `done` flag and an `info` dict that
            holds `resolution` on success or `reason` when the step budget
            runs out.
        """
        state.step += 1
        state._apply_sla_degradation()
        at = action.action_type
        svc = action.service or ""
        reward = 0.0
        done = False
        info: Dict[str, Any] = {}

        result_text, error_text = self._apply_action_to_logs(state, action)

        # Override the generic metrics output for the database so the agent
        # sees the sequential-scan counters and the last migration.
        if at == ActionType.READ_METRICS and svc == _DB_SERVICE:
            result_text = self._postgres_metrics_report(state)
            state.evidence_log.append({
                "step": state.step,
                "source": f"metrics:{svc}",
                "summary": "postgres-primary: cpu=94%, seq_scans=847/min, normal=2/min",
                "raw": result_text,
            })

        # One-time rewards for looking at the right evidence.
        gathered = self._GATHER_REWARDS.get((at.value, svc))
        if gathered is not None:
            tag, amount = gathered
            reward += self._grant_once(state, tag, amount)

        if at == ActionType.READ_RUNBOOK:
            reward += self._grant_once(state, "runbook_any", 0.05)

        if at == ActionType.DIAGNOSE:
            rc = action.root_cause or ""
            if semantic_match(rc, self._DIAGNOSIS_KEYWORDS):
                reward += self._grant_once(state, "diagnose_correct", 0.20)
            # NOTE(review): indentation was lost in the reviewed source; this
            # assumes the diagnosis is acknowledged whether or not it matched.
            result_text = f"Diagnosis recorded: {rc}"

        if at == ActionType.CREATE_INDEX:
            table = (action.table or "").strip().lower()
            column = (action.column or "").strip().lower()
            if not table or not column:
                # Previously this fell through to "Created index on ." even
                # though nothing could have been created. Same penalty as a
                # wrong index, but an honest message.
                reward -= 0.10
                result_text = "create_index requires both 'table' and 'column'; no index was created."
            elif table == "orders" and "user_segment" in column:
                reward += self._grant_once(state, "fix_index", 0.30)
                result_text = f"Successfully created index on {table}.{column}. Sequential scans dropped. Query latency normalizing."
                self._mark_resolved(state, info)
                done = True
            else:
                reward -= 0.10
                result_text = f"Created index on {table}.{column}, but it had no effect on the ongoing sequential scans."

        if at == ActionType.ROLLBACK and svc == _DB_SERVICE:
            # Shares the "fix_index" tag with create_index but pays less (0.20 vs 0.30).
            reward += self._grant_once(state, "fix_index", 0.20)
            result_text = "Migration rolled back. user_segment column removed. Service queries failing back to old schema, but database CPU returning to normal."
            self._mark_resolved(state, info)
            done = True

        if at == ActionType.RESTART_SERVICE:
            reward -= 0.10
            result_text = f"Restarted {svc}. Connection pool dropped but immediately overwhelmed again by slow queries missing index."

        if at == ActionType.SCALE_UP:
            reward -= 0.08
            result_text = f"Scaled up {svc}. More workers are now hitting the database, worsening the CPU starvation."

        # No-ops only cost reward after step 5.
        if at == ActionType.NOOP and state.step > 5:
            reward -= 0.03

        state.total_reward = self._clamp(state.total_reward + reward)
        if state.step >= state.max_steps and not done:
            done = True
            info["reason"] = "max_steps_reached"

        # NOTE(review): the return value is not used here; the call is kept in
        # case _build_observation records the result/error on the state.
        state._build_observation(last_action_result=result_text, last_action_error=error_text)
        state.action_history.append({"step": state.step, "action": action.model_dump(), "reward": round(reward, 4)})
        return StepOutput(next_state=state, reward=round(reward, 4), done=done, info=info)