Add Task 6: Database Performance Degradation (missing index investigation)
Browse files- api.py +17 -1
- data/runbooks/database_performance.md +23 -0
- env.py +2 -1
- models.py +3 -0
- openenv.yaml +14 -0
- server/app.py +17 -1
- tasks/__init__.py +2 -1
- tasks/task_database.py +224 -0
api.py
CHANGED
|
@@ -25,7 +25,7 @@ app.add_middleware(
|
|
| 25 |
allow_headers=["*"],
|
| 26 |
)
|
| 27 |
|
| 28 |
-
VALID_TASKS = ("easy", "medium", "hard", "bonus", "security")
|
| 29 |
_env: Optional[DevOpsIncidentEnv] = None
|
| 30 |
|
| 31 |
|
|
@@ -96,6 +96,7 @@ def dashboard():
|
|
| 96 |
.hard {{ background: #3a1a1a; color: #f44336; }}
|
| 97 |
.bonus {{ background: #1a1a3a; color: #9c27b0; }}
|
| 98 |
.security {{ background: #3a1a1a; color: #ff5252; }}
|
|
|
|
| 99 |
.endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
|
| 100 |
.endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
|
| 101 |
.endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
|
|
@@ -139,6 +140,11 @@ def dashboard():
|
|
| 139 |
<h3>Security Incident (DDoS)</h3>
|
| 140 |
<p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
|
| 141 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
</div>
|
| 143 |
|
| 144 |
<div class="endpoints">
|
|
@@ -270,6 +276,16 @@ def list_tasks():
|
|
| 270 |
"The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
|
| 271 |
),
|
| 272 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
]
|
| 274 |
}
|
| 275 |
|
|
|
|
| 25 |
allow_headers=["*"],
|
| 26 |
)
|
| 27 |
|
| 28 |
+
VALID_TASKS = ("easy", "medium", "hard", "bonus", "security", "database")
|
| 29 |
_env: Optional[DevOpsIncidentEnv] = None
|
| 30 |
|
| 31 |
|
|
|
|
| 96 |
.hard {{ background: #3a1a1a; color: #f44336; }}
|
| 97 |
.bonus {{ background: #1a1a3a; color: #9c27b0; }}
|
| 98 |
.security {{ background: #3a1a1a; color: #ff5252; }}
|
| 99 |
+
.database {{ background: #1a2c3a; color: #4fc3f7; }}
|
| 100 |
.endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
|
| 101 |
.endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
|
| 102 |
.endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
|
|
|
|
| 140 |
<h3>Security Incident (DDoS)</h3>
|
| 141 |
<p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
|
| 142 |
</div>
|
| 143 |
+
<div class="task">
|
| 144 |
+
<span class="badge database">DATABASE</span>
|
| 145 |
+
<h3>Database Degradation</h3>
|
| 146 |
+
<p>Missing schema index causing slow queries and full table scans. Fix via index creation or rollback. Max 20 steps.</p>
|
| 147 |
+
</div>
|
| 148 |
</div>
|
| 149 |
|
| 150 |
<div class="endpoints">
|
|
|
|
| 276 |
"The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
|
| 277 |
),
|
| 278 |
},
|
| 279 |
+
{
|
| 280 |
+
"id": "database",
|
| 281 |
+
"name": "Database Performance Degradation",
|
| 282 |
+
"difficulty": "hard",
|
| 283 |
+
"max_steps": 20,
|
| 284 |
+
"description": (
|
| 285 |
+
"A recent migration added a user_segment column to the orders table without an index. "
|
| 286 |
+
"Sequential table scans are spiking DB CPU. Discovered via read_metrics and the slow query log."
|
| 287 |
+
),
|
| 288 |
+
},
|
| 289 |
]
|
| 290 |
}
|
| 291 |
|
data/runbooks/database_performance.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Database Performance Degradation
|
| 2 |
+
|
| 3 |
+
This runbook outlines the recommended procedure for handling database performance issues, specifically focusing on slow queries, high CPU caused by sequential table scans, and missing query indexes.
|
| 4 |
+
|
| 5 |
+
## 1. Diagnose Database Load
|
| 6 |
+
If the database (`postgres-primary`) is exhibiting high CPU or degraded performance without actual service crashes, use the `read_metrics` action on the database.
|
| 7 |
+
- Look at the `Sequential scans/min`.
|
| 8 |
+
- If this value is massively elevated (e.g. 500+ instead of single digits), it means queries are scanning entire tables instead of looking up rows in an index.
|
| 9 |
+
|
| 10 |
+
## 2. Check Slow Query Logs
|
| 11 |
+
Use `read_logs` on the database to verify the slow queries.
|
| 12 |
+
- Slow query logs will identify specific query strings taking >1000ms.
|
| 13 |
+
- Slow-query entries will typically end with a `[seq_scan]` tag, indicating that the query scanned the table sequentially instead of using an index.
|
| 14 |
+
- The logs may also include automated schema anomaly warnings, such as "MISSING INDEX DETECTED".
|
| 15 |
+
|
| 16 |
+
## 3. Resolving Missing Indexes
|
| 17 |
+
If a missing index is detected, it is highly likely that a recent schema migration added a column without creating an index for it.
|
| 18 |
+
- **Action Option 1:** Use the `create_index` action, specifying the target `table` and `column` (e.g. `table="orders"`, `column="user_segment"`). This is the best approach if the data is already deployed, as it fixes the issue instantly without breaking backend code.
|
| 19 |
+
- **Action Option 2:** Use the `rollback` action on the database service. This will revert the schema migration. It restores performance, but downstream code that depends on the new schema will fail until it is patched.
|
| 20 |
+
|
| 21 |
+
## 4. What NOT to do
|
| 22 |
+
- Do **NOT** `restart_service`. Connection pool exhaustion is a symptom, not the cause. Restarting only temporarily drops connections before being immediately overwhelmed again.
|
| 23 |
+
- Do **NOT** `scale_up`. Adding more replicas/workers will only hammer the slow database harder, increasing lock contention and further starving the CPU.
|
env.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
| 2 |
import random
|
| 3 |
from typing import Optional
|
| 4 |
from models import Action, Observation, StepResult, State
|
| 5 |
-
from tasks import EasyTask, MediumTask, HardTask, BonusTask, SecurityTask
|
| 6 |
from tasks.base import InternalState
|
| 7 |
|
| 8 |
TASK_MAP = {
|
|
@@ -11,6 +11,7 @@ TASK_MAP = {
|
|
| 11 |
"hard": HardTask,
|
| 12 |
"bonus": BonusTask,
|
| 13 |
"security": SecurityTask,
|
|
|
|
| 14 |
}
|
| 15 |
|
| 16 |
|
|
|
|
| 2 |
import random
|
| 3 |
from typing import Optional
|
| 4 |
from models import Action, Observation, StepResult, State
|
| 5 |
+
from tasks import EasyTask, MediumTask, HardTask, BonusTask, SecurityTask, DatabaseTask
|
| 6 |
from tasks.base import InternalState
|
| 7 |
|
| 8 |
TASK_MAP = {
|
|
|
|
| 11 |
"hard": HardTask,
|
| 12 |
"bonus": BonusTask,
|
| 13 |
"security": SecurityTask,
|
| 14 |
+
"database": DatabaseTask,
|
| 15 |
}
|
| 16 |
|
| 17 |
|
models.py
CHANGED
|
@@ -17,6 +17,7 @@ class ActionType(str, Enum):
|
|
| 17 |
NOOP = "noop"
|
| 18 |
SEARCH_LOGS = "search_logs"
|
| 19 |
BLOCK_IP_RANGE = "block_ip_range"
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
class Action(BaseModel):
|
|
@@ -28,6 +29,8 @@ class Action(BaseModel):
|
|
| 28 |
reason: Optional[str] = None
|
| 29 |
query: Optional[str] = None # used with search_logs
|
| 30 |
ip_range: Optional[str] = None
|
|
|
|
|
|
|
| 31 |
|
| 32 |
|
| 33 |
class Alert(BaseModel):
|
|
|
|
| 17 |
NOOP = "noop"
|
| 18 |
SEARCH_LOGS = "search_logs"
|
| 19 |
BLOCK_IP_RANGE = "block_ip_range"
|
| 20 |
+
CREATE_INDEX = "create_index"
|
| 21 |
|
| 22 |
|
| 23 |
class Action(BaseModel):
|
|
|
|
| 29 |
reason: Optional[str] = None
|
| 30 |
query: Optional[str] = None # used with search_logs
|
| 31 |
ip_range: Optional[str] = None
|
| 32 |
+
table: Optional[str] = None
|
| 33 |
+
column: Optional[str] = None
|
| 34 |
|
| 35 |
|
| 36 |
class Alert(BaseModel):
|
openenv.yaml
CHANGED
|
@@ -87,6 +87,18 @@ tasks:
|
|
| 87 |
expected_score_random_agent: 0.01
|
| 88 |
expected_score_strong_llm: 0.35
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
action_space:
|
| 91 |
type: structured
|
| 92 |
description: >
|
|
@@ -118,6 +130,8 @@ action_space:
|
|
| 118 |
description: Take no action this step
|
| 119 |
- name: block_ip_range
|
| 120 |
description: Block traffic from an IP range (CIDR format)
|
|
|
|
|
|
|
| 121 |
|
| 122 |
observation_space:
|
| 123 |
type: structured
|
|
|
|
| 87 |
expected_score_random_agent: 0.01
|
| 88 |
expected_score_strong_llm: 0.35
|
| 89 |
|
| 90 |
+
- id: database
|
| 91 |
+
name: Database Performance Degradation (Missing Index)
|
| 92 |
+
description: >
|
| 93 |
+
A database migration ran 15 minutes ago that added a new column but forgot to add an index.
|
| 94 |
+
Now queries are doing full table scans sequentially, leading to major DB degradation.
|
| 95 |
+
The agent must read the Postgres slow query logs, evaluate sequential scan rates via metrics, and either create the missing index or roll back the migration.
|
| 96 |
+
difficulty: hard
|
| 97 |
+
max_steps: 20
|
| 98 |
+
reward_range: [0.0, 1.0]
|
| 99 |
+
expected_score_random_agent: 0.01
|
| 100 |
+
expected_score_strong_llm: 0.35
|
| 101 |
+
|
| 102 |
action_space:
|
| 103 |
type: structured
|
| 104 |
description: >
|
|
|
|
| 130 |
description: Take no action this step
|
| 131 |
- name: block_ip_range
|
| 132 |
description: Block traffic from an IP range (CIDR format)
|
| 133 |
+
- name: create_index
|
| 134 |
+
description: Create a database index on a specific table and column
|
| 135 |
|
| 136 |
observation_space:
|
| 137 |
type: structured
|
server/app.py
CHANGED
|
@@ -14,7 +14,7 @@ try:
|
|
| 14 |
except ImportError:
|
| 15 |
HAS_WEB_INTERFACE = False
|
| 16 |
|
| 17 |
-
VALID_TASKS = ("easy", "medium", "hard", "bonus", "security")
|
| 18 |
_env = DevOpsEnvironment()
|
| 19 |
app = FastAPI(
|
| 20 |
title="DevOps Incident Response — OpenEnv",
|
|
@@ -96,6 +96,7 @@ def dashboard():
|
|
| 96 |
.hard {{ background: #3a1a1a; color: #f44336; }}
|
| 97 |
.bonus {{ background: #1a1a3a; color: #9c27b0; }}
|
| 98 |
.security {{ background: #3a1a1a; color: #ff5252; }}
|
|
|
|
| 99 |
.endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
|
| 100 |
.endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
|
| 101 |
.endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
|
|
@@ -139,6 +140,11 @@ def dashboard():
|
|
| 139 |
<h3>Security Incident (DDoS)</h3>
|
| 140 |
<p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
|
| 141 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
</div>
|
| 143 |
|
| 144 |
<div class="endpoints">
|
|
@@ -268,6 +274,16 @@ def list_tasks():
|
|
| 268 |
"The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
|
| 269 |
),
|
| 270 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
]
|
| 272 |
}
|
| 273 |
|
|
|
|
| 14 |
except ImportError:
|
| 15 |
HAS_WEB_INTERFACE = False
|
| 16 |
|
| 17 |
+
VALID_TASKS = ("easy", "medium", "hard", "bonus", "security", "database")
|
| 18 |
_env = DevOpsEnvironment()
|
| 19 |
app = FastAPI(
|
| 20 |
title="DevOps Incident Response — OpenEnv",
|
|
|
|
| 96 |
.hard {{ background: #3a1a1a; color: #f44336; }}
|
| 97 |
.bonus {{ background: #1a1a3a; color: #9c27b0; }}
|
| 98 |
.security {{ background: #3a1a1a; color: #ff5252; }}
|
| 99 |
+
.database {{ background: #1a2c3a; color: #4fc3f7; }}
|
| 100 |
.endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
|
| 101 |
.endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
|
| 102 |
.endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
|
|
|
|
| 140 |
<h3>Security Incident (DDoS)</h3>
|
| 141 |
<p>Botnet DDoS and credential stuffing attack. Requires traffic blocking and security escalation. Max 20 steps.</p>
|
| 142 |
</div>
|
| 143 |
+
<div class="task">
|
| 144 |
+
<span class="badge database">DATABASE</span>
|
| 145 |
+
<h3>Database Degradation</h3>
|
| 146 |
+
<p>Missing schema index causing slow queries and full table scans. Fix via index creation or rollback. Max 20 steps.</p>
|
| 147 |
+
</div>
|
| 148 |
</div>
|
| 149 |
|
| 150 |
<div class="endpoints">
|
|
|
|
| 274 |
"The agent must read access logs, diagnose the attack IP range, block the CIDR, and alert the security team."
|
| 275 |
),
|
| 276 |
},
|
| 277 |
+
{
|
| 278 |
+
"id": "database",
|
| 279 |
+
"name": "Database Performance Degradation",
|
| 280 |
+
"difficulty": "hard",
|
| 281 |
+
"max_steps": 20,
|
| 282 |
+
"description": (
|
| 283 |
+
"A recent migration added a user_segment column to the orders table without an index. "
|
| 284 |
+
"Sequential table scans are spiking DB CPU. Discovered via read_metrics and the slow query log."
|
| 285 |
+
),
|
| 286 |
+
},
|
| 287 |
]
|
| 288 |
}
|
| 289 |
|
tasks/__init__.py
CHANGED
|
@@ -3,5 +3,6 @@ from tasks.task_medium import MediumTask
|
|
| 3 |
from tasks.task_hard import HardTask
|
| 4 |
from tasks.task_bonus import BonusTask
|
| 5 |
from tasks.task_security import SecurityTask
|
|
|
|
| 6 |
|
| 7 |
-
__all__ = ["EasyTask", "MediumTask", "HardTask", "BonusTask", "SecurityTask"]
|
|
|
|
| 3 |
from tasks.task_hard import HardTask
|
| 4 |
from tasks.task_bonus import BonusTask
|
| 5 |
from tasks.task_security import SecurityTask
|
| 6 |
+
from tasks.task_database import DatabaseTask
|
| 7 |
|
| 8 |
+
__all__ = ["EasyTask", "MediumTask", "HardTask", "BonusTask", "SecurityTask", "DatabaseTask"]
|
tasks/task_database.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Dict, Any, List
|
| 4 |
+
from models import Action, ActionType
|
| 5 |
+
from tasks.base import BaseTask, InternalState, StepOutput, semantic_match
|
| 6 |
+
|
| 7 |
+
INCIDENT_TIME = "2026-04-12T14:22:00Z"

# Identifier of the schema migration that introduced the unindexed column.
# The date prefix is the incident day (see INCIDENT_TIME): the metrics report
# says the migration ran 15 minutes before the incident, so it cannot carry a
# later date. Kept in one place so the Postgres log line and the metrics
# report always show the same identifier.
MIGRATION_ID = "20260412_add_user_segment"

# Static call graph handed to InternalState as ``service_dependencies``.
DEPENDENCIES = [
    {"service": "api-gateway", "calls": ["order-service", "user-service"], "called_by": []},
    {"service": "order-service", "calls": ["postgres-primary"], "called_by": ["api-gateway"]},
    {"service": "analytics-service", "calls": ["postgres-primary"], "called_by": []},
    {"service": "postgres-primary", "calls": [], "called_by": ["order-service", "analytics-service"]},
    {"service": "user-service", "calls": [], "called_by": ["api-gateway"]},
]

# Slow-query log of the database; contains the explicit "MISSING INDEX" hint.
POSTGRES_LOGS = [
    "[14:22:01] SLOW_QUERY 4281ms: SELECT * FROM orders WHERE user_segment='premium' LIMIT 100 [seq_scan: 18M rows]",
    "[14:22:03] SLOW_QUERY 4190ms: SELECT COUNT(*) FROM orders WHERE user_segment='standard' [seq_scan: 18M rows]",
    "[14:22:05] SLOW_QUERY 4350ms: SELECT order_id, total FROM orders WHERE user_segment='enterprise' [seq_scan: 18M rows]",
    f"[14:22:07] INFO MISSING INDEX DETECTED: orders.user_segment has no index (added in migration {MIGRATION_ID})",
    "[14:22:08] WARN Table scan count: 847/min (normal: 2/min) — index missing on hot column",
    "[14:22:09] SLOW_QUERY 4401ms: SELECT * FROM orders WHERE user_segment='premium' AND created_at > '2026-04-01' [seq_scan]",
]

ORDER_LOGS = [
    "[14:22:01] WARN DB query timeout: getOrdersBySegment() exceeded 5000ms",
    "[14:22:02] ERROR Failed to fetch orders for dashboard: upstream DB timeout",
    "[14:22:05] WARN Retry 1/3: getOrdersBySegment() - 4300ms",
    "[14:22:09] ERROR Circuit breaker OPEN for postgres-primary read replica",
]

ANALYTICS_LOGS = [
    "[14:22:00] INFO Starting hourly aggregation job: orders by user_segment",
    "[14:22:04] WARN Aggregation query running slow: 4100ms elapsed (expected: 80ms)",
    "[14:22:08] ERROR Aggregation job timed out after 300s — will retry in 60min",
    "[14:22:09] INFO Root cause likely: orders table scan (no index on user_segment)",
]

# (action type value, service) -> (one-time reward tag, reward amount).
# read_logs and search_logs on the same service share a tag, so investigating
# a service is rewarded at most once regardless of which of the two is used.
_GATHER_REWARDS = {
    ("read_logs", "postgres-primary"): ("rl_pg", 0.10),
    ("search_logs", "postgres-primary"): ("rl_pg", 0.10),
    ("read_metrics", "postgres-primary"): ("rm_pg", 0.10),
    ("read_logs", "analytics-service"): ("rl_ana", 0.05),
    ("search_logs", "analytics-service"): ("rl_ana", 0.05),
}

# Keywords passed to semantic_match() when scoring a DIAGNOSE action.
_ROOT_CAUSE_KEYWORDS = ["index", "migration", "user_segment", "seq_scan", "table scan"]


def _service(
    name: str,
    *,
    status: str,
    cpu: float,
    memory: float,
    error_rate: float,
    p99_ms: float,
    replicas: int,
    version: str,
    deployed: str,
    minutes_degraded: int,
) -> Dict[str, Any]:
    """Return a fresh service record; running and desired replicas are equal."""
    return {
        "name": name, "status": status,
        "cpu_percent": cpu, "memory_percent": memory,
        "error_rate": error_rate, "latency_p99_ms": p99_ms,
        "replicas_running": replicas, "replicas_desired": replicas,
        "current_version": version, "last_deployed": deployed,
        "minutes_degraded": minutes_degraded, "sla_breach": False,
    }


def _initial_services() -> Dict[str, Dict[str, Any]]:
    """Build the per-episode service table (new dicts on every call)."""
    return {
        "postgres-primary": _service(
            "postgres-primary", status="degraded", cpu=94.0, memory=65.0,
            error_rate=0.0, p99_ms=4401.0, replicas=1,
            version="v14.1", deployed="2025-01-01T00:00:00Z", minutes_degraded=15,
        ),
        "order-service": _service(
            "order-service", status="degraded", cpu=35.0, memory=45.0,
            error_rate=2.5, p99_ms=4800.0, replicas=3,
            version="v2.1.0", deployed="2026-03-20T08:00:00Z", minutes_degraded=15,
        ),
        "analytics-service": _service(
            "analytics-service", status="degraded", cpu=25.0, memory=30.0,
            error_rate=5.0, p99_ms=300000.0, replicas=1,
            version="v1.5.0", deployed="2026-04-10T11:00:00Z", minutes_degraded=15,
        ),
        "api-gateway": _service(
            "api-gateway", status="degraded", cpu=45.0, memory=45.0,
            error_rate=1.5, p99_ms=4600.0, replicas=5,
            version="v3.1.0", deployed="2026-03-20T08:00:00Z", minutes_degraded=15,
        ),
        "user-service": _service(
            "user-service", status="healthy", cpu=15.0, memory=30.0,
            error_rate=0.0, p99_ms=25.0, replicas=2,
            version="v1.1.2", deployed="2026-03-01T00:00:00Z", minutes_degraded=0,
        ),
    }


def _alert(alert_id: str, severity: str, service: str, message: str, timestamp: str) -> Dict[str, Any]:
    """Return a fresh, unacknowledged alert record."""
    return {
        "id": alert_id, "severity": severity, "service": service,
        "message": message,
        "timestamp": timestamp, "acknowledged": False,
    }


def _initial_alerts() -> List[Dict[str, Any]]:
    """Build the alerts that are firing when the episode starts."""
    return [
        _alert("D001", "critical", "order-service",
               "P99 latency 4800ms (threshold: 500ms)", "2026-04-12T14:22:05Z"),
        _alert("D002", "critical", "analytics-service",
               "Hourly aggregation job timed out", "2026-04-12T14:22:08Z"),
        _alert("D003", "warning", "postgres-primary",
               "CPU 94% sustained 15min, high sequential scan rate", "2026-04-12T14:22:07Z"),
        _alert("D004", "warning", "api-gateway",
               "Upstream order-service latency 4600ms", "2026-04-12T14:22:09Z"),
    ]


def _initial_logs() -> Dict[str, List[str]]:
    """Build the per-service log buffers, copying the module-level templates."""
    return {
        "postgres-primary": list(POSTGRES_LOGS),
        "order-service": list(ORDER_LOGS),
        "analytics-service": list(ANALYTICS_LOGS),
        "api-gateway": ["[14:22:05] WARN Upstream order-service latency 4600ms"],
        "user-service": ["[14:22:00] INFO Service normal"],
    }


def _postgres_metrics_report(service: Dict[str, Any]) -> str:
    """Render the database-specific metrics text shown for read_metrics."""
    return (
        "=== Metrics: postgres-primary ===\n"
        f"Status: {service['status'].upper()}\n"
        f"CPU: {service['cpu_percent']:.1f}% (normal: 15%)\n"
        f"Memory: {service['memory_percent']:.1f}%\n"
        "Sequential scans/min: 847 (normal: 2)\n"
        "Index scans/min: 12 (normal: 890)\n"
        "Active queries: 48 (normal: 8)\n"
        f"Longest running query: {service['latency_p99_ms']:.0f}ms\n"
        f"Last migration: {MIGRATION_ID} (14:07:00, 15 min ago)\n"
    )


def _mark_database_recovered(state: InternalState) -> None:
    """Reset the database's CPU/latency figures and flag the incident resolved."""
    database = state.services["postgres-primary"]
    database["cpu_percent"] = 18.0
    database["latency_p99_ms"] = 12.0
    state.incident_resolved = True


class DatabaseTask(BaseTask):
    """Scenario: a migration added ``orders.user_segment`` without an index.

    The database is CPU-bound on sequential scans and the services that call
    it are degraded. The episode ends successfully when the agent either
    creates the index (``create_index`` on ``orders`` / ``user_segment``) or
    rolls back ``postgres-primary``.
    """

    def initialize(self) -> InternalState:
        """Create the starting state for a new episode.

        Returns:
            An ``InternalState`` with a new random episode id, step 0 of 20,
            and freshly built services, alerts and logs.
        """
        return InternalState(
            episode_id=str(uuid.uuid4()),
            task_id="database",
            step=0,
            max_steps=20,
            services=_initial_services(),
            alerts=_initial_alerts(),
            logs=_initial_logs(),
            action_history=[],
            total_reward=0.0,
            incident_resolved=False,
            ground_truth_root_cause="missing_index_orders_user_segment_column_migration",
            ground_truth_fix="create index on orders.user_segment OR rollback migration",
            incident_start_time=INCIDENT_TIME,
            healthy_services=["user-service"],
            service_dependencies=DEPENDENCIES,
        )

    def step(self, state: InternalState, action: Action) -> StepOutput:
        """Apply one agent action and score it.

        Rewards (each tagged reward is granted once per episode):
          * +0.10 reading/searching logs of ``postgres-primary``
          * +0.10 reading metrics of ``postgres-primary``
          * +0.05 reading/searching logs of ``analytics-service``
          * +0.05 first ``read_runbook``
          * +0.20 first diagnosis accepted by ``semantic_match``
          * +0.30 ``create_index`` on ``orders``/``user_segment`` (ends episode)
          * +0.20 ``rollback`` of ``postgres-primary`` (ends episode)
          * -0.10 any other ``create_index``; -0.10 ``restart_service``;
            -0.08 ``scale_up``; -0.03 ``noop`` after step 5

        Args:
            state: Mutable episode state; updated in place.
            action: The action chosen by the agent.

        Returns:
            ``StepOutput`` carrying the same ``state`` object, the step
            reward rounded to 4 decimals, the ``done`` flag and ``info``.
        """
        state.step += 1
        state._apply_sla_degradation()
        at = action.action_type
        svc = action.service or ""
        reward = 0.0
        done = False
        info: Dict[str, Any] = {}

        # Shared handling from the base task; presumably covers the generic
        # read/search actions — its behaviour is not visible in this file.
        result_text, error_text = self._apply_action_to_logs(state, action)

        # The database gets a custom metrics report exposing the scan counters.
        if at == ActionType.READ_METRICS and svc == "postgres-primary":
            result_text = _postgres_metrics_report(state.services[svc])
            state.evidence_log.append({
                "step": state.step,
                "source": f"metrics:{svc}",
                "summary": "postgres-primary: cpu=94%, seq_scans=847/min, normal=2/min",
                "raw": result_text,
            })

        # One-time rewards for looking at the relevant evidence.
        gather = _GATHER_REWARDS.get((at.value, svc))
        if gather is not None:
            tag, amount = gather
            if tag not in state.rewards_given:
                reward += amount
                state.rewards_given.add(tag)

        if at == ActionType.READ_RUNBOOK and "runbook_any" not in state.rewards_given:
            reward += 0.05
            state.rewards_given.add("runbook_any")

        if at == ActionType.DIAGNOSE:
            rc = action.root_cause or ""
            if semantic_match(rc, _ROOT_CAUSE_KEYWORDS) and "diagnose_correct" not in state.rewards_given:
                reward += 0.20
                state.rewards_given.add("diagnose_correct")
            result_text = f"Diagnosis recorded: {rc}"

        if at == ActionType.CREATE_INDEX:
            # Normalise so stray whitespace or casing does not fail the match.
            table = (action.table or "").strip().lower()
            column = (action.column or "").strip().lower()
            if table == "orders" and "user_segment" in column:
                if "fix_index" not in state.rewards_given:
                    reward += 0.30
                    state.rewards_given.add("fix_index")
                result_text = f"Successfully created index on {table}.{column}. Sequential scans dropped. Query latency normalizing."
                _mark_database_recovered(state)
                done = True
                info["resolution"] = "incident_resolved"
            elif not table or not column:
                # Without both arguments there is nothing to index; say so
                # instead of claiming an index on "." was created.
                reward -= 0.10
                result_text = "create_index requires both 'table' and 'column'; no index was created."
            else:
                reward -= 0.10
                result_text = f"Created index on {table}.{column}, but it had no effect on the ongoing sequential scans."

        if at == ActionType.ROLLBACK and svc == "postgres-primary":
            # Shares the "fix_index" tag with create_index: one fix reward per episode.
            if "fix_index" not in state.rewards_given:
                reward += 0.20
                state.rewards_given.add("fix_index")
            result_text = "Migration rolled back. user_segment column removed. Service queries failing back to old schema, but database CPU returning to normal."
            _mark_database_recovered(state)
            done = True
            info["resolution"] = "incident_resolved"

        if at == ActionType.RESTART_SERVICE:
            reward -= 0.10
            result_text = f"Restarted {svc}. Connection pool dropped but immediately overwhelmed again by slow queries missing index."

        if at == ActionType.SCALE_UP:
            reward -= 0.08
            result_text = f"Scaled up {svc}. More workers are now hitting the database, worsening the CPU starvation."

        if at == ActionType.NOOP and state.step > 5:
            reward -= 0.03

        state.total_reward = self._clamp(state.total_reward + reward)
        if state.step >= state.max_steps and not done:
            done = True
            info["reason"] = "max_steps_reached"

        # The observation is not passed to StepOutput; the call is kept because
        # any effect it has on ``state`` is not visible from this file.
        state._build_observation(last_action_result=result_text, last_action_error=error_text)
        state.action_history.append({"step": state.step, "action": action.model_dump(), "reward": round(reward, 4)})
        return StepOutput(next_state=state, reward=round(reward, 4), done=done, info=info)
|