Spaces:
Sleeping
Sleeping
feat: initial push with env and 3 tasks
Browse files- .dockerignore +12 -0
- .gitignore +22 -0
- Dockerfile +16 -0
- README.md +287 -1
- env/__init__.py +0 -0
- env/environment.py +363 -0
- env/scenario.py +67 -0
- env/services.py +155 -0
- graders/__init__.py +0 -0
- graders/grader.py +170 -0
- inference.py +268 -0
- models.py +134 -0
- openenv.yaml +27 -0
- requirements.txt +5 -0
- server/__init__.py +0 -0
- server/app.py +146 -0
- tasks/__init__.py +40 -0
- tasks/easy_oom.py +299 -0
- tasks/hard_concurrent.py +353 -0
- tasks/medium_deadlock.py +298 -0
.dockerignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
*.egg-info/
|
| 4 |
+
.git/
|
| 5 |
+
.gitignore
|
| 6 |
+
.env
|
| 7 |
+
.env.*
|
| 8 |
+
.venv/
|
| 9 |
+
venv/
|
| 10 |
+
*.md
|
| 11 |
+
!README.md
|
| 12 |
+
.dockerignore
|
.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.py[cod]
|
| 3 |
+
*.egg-info/
|
| 4 |
+
dist/
|
| 5 |
+
build/
|
| 6 |
+
.eggs/
|
| 7 |
+
*.egg
|
| 8 |
+
|
| 9 |
+
.env
|
| 10 |
+
.env.*
|
| 11 |
+
!.env.example
|
| 12 |
+
|
| 13 |
+
.venv/
|
| 14 |
+
venv/
|
| 15 |
+
|
| 16 |
+
.idea/
|
| 17 |
+
.vscode/
|
| 18 |
+
*.swp
|
| 19 |
+
*.swo
|
| 20 |
+
*~
|
| 21 |
+
|
| 22 |
+
*.log
|
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
+
PYTHONUNBUFFERED=1 \
|
| 7 |
+
PORT=8000
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt /app/requirements.txt
|
| 10 |
+
RUN pip install --no-cache-dir -r /app/requirements.txt
|
| 11 |
+
|
| 12 |
+
COPY . /app
|
| 13 |
+
|
| 14 |
+
EXPOSE 8000
|
| 15 |
+
|
| 16 |
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -1 +1,287 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SRE Incident Response Environment
|
| 2 |
+
|
| 3 |
+
An OpenEnv-compatible reinforcement learning environment that simulates production incident response. AI agents must investigate microservice architectures, diagnose root causes, and apply fixes — just like a real on-call SRE engineer.
|
| 4 |
+
|
| 5 |
+
## Motivation
|
| 6 |
+
|
| 7 |
+
Every tech company has on-call rotations, yet there's no standardized benchmark for evaluating AI agents on incident response. This environment fills that gap by simulating realistic production incidents with:
|
| 8 |
+
|
| 9 |
+
- **Multi-service architectures** with dependency chains and cascading failures
|
| 10 |
+
- **Progressive information revelation** — agents must actively investigate (read logs, check metrics, trace requests)
|
| 11 |
+
- **Red herrings and misleading symptoms** — alerts point to symptoms, not root causes
|
| 12 |
+
- **Concurrent faults** in the hardest tier — testing whether agents can find multiple independent root causes
|
| 13 |
+
- **Realistic operational data** — 50+ log lines per service with noise, time-series metrics, distributed traces, deploy history, runbooks, and config diffs
|
| 14 |
+
|
| 15 |
+
## Service Architecture
|
| 16 |
+
|
| 17 |
+
All tasks share the same 7-service microservice architecture:
|
| 18 |
+
|
| 19 |
+
```
|
| 20 |
+
+--------------+
|
| 21 |
+
+-------->| auth-service |<------+
|
| 22 |
+
| +------+-------+ |
|
| 23 |
+
| | depends | depends
|
| 24 |
+
+---------+------+ +------v------+ +-----+--------+
|
| 25 |
+
| api-gateway | | cache-redis | | notification |
|
| 26 |
+
| (entry point) | +-------------+ | -service |
|
| 27 |
+
+-+----------+---+ +--------------+
|
| 28 |
+
| |
|
| 29 |
+
| depends | depends
|
| 30 |
+
v v
|
| 31 |
+
+------------+ +-----------------+
|
| 32 |
+
|user-service| |payment-service |
|
| 33 |
+
+-----+------+ +--------+--------+
|
| 34 |
+
| depends | depends
|
| 35 |
+
v v
|
| 36 |
+
+----------------------------+
|
| 37 |
+
| db-postgres |
|
| 38 |
+
+----------------------------+
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
Each service has: name, status (`HEALTHY`/`DEGRADED`/`DOWN`), version, replica count, dependencies, logs, metrics, traces, deploy history, config, and runbook data.
|
| 42 |
+
|
| 43 |
+
## Tasks
|
| 44 |
+
|
| 45 |
+
Tasks are auto-discovered from the `tasks/` directory. Each task is a self-contained Python file defining a `SCENARIO` object.
|
| 46 |
+
|
| 47 |
+
| Task ID | Name | Difficulty | Max Steps | Root Cause | Fix Required |
|
| 48 |
+
|---------|------|-----------|-----------|------------|--------------|
|
| 49 |
+
| `easy` | Single Service OOM Crash | Easy | 15 | `auth-service` (OOM) | `restart_service(auth-service)` |
|
| 50 |
+
| `medium` | Cascading Database Deadlock | Medium | 25 | `db-postgres` (deadlock) | `restart_service(db-postgres)` |
|
| 51 |
+
| `hard` | Concurrent Faults + Misleading Evidence | Hard | 35 | `payment-service` (bad deploy) AND `cache-redis` (memory leak) | `rollback_deploy(payment-service, v3.8.1)` AND `restart_service(cache-redis)` |
|
| 52 |
+
|
| 53 |
+
### Task Details
|
| 54 |
+
|
| 55 |
+
**Easy** — Alert directly names `auth-service` as down. Logs clearly show OOM crash cycle (heap growth, OOM kills, restart exhaustion). Single root cause, single fix.
|
| 56 |
+
|
| 57 |
+
**Medium** — Alerts blame `payment-service` and `user-service` (both are victims). The real cause is a long-running analytics query deadlocking `db-postgres`. Agent must notice "writes fail but reads work", follow dependency chain to the database, and read `db-postgres` logs to find the deadlock. Red herring: `cache-redis` miss ratio alert (benign TTL expiry).
|
| 58 |
+
|
| 59 |
+
**Hard** — Two independent faults at the same time: (1) `payment-service` has a bad deploy (v3.8.2, NullPointerException in new validator module), (2) `cache-redis` has a memory leak causing eviction storms that degrade `auth-service`. Red herrings: `user-service` config warnings (benign), `notification-service` queue backup (victim of auth-service). Agent must find and fix BOTH faults. After fixing only one, post-remediation check shows remaining services are still unhealthy.
|
| 60 |
+
|
| 61 |
+
### Adding New Tasks
|
| 62 |
+
|
| 63 |
+
To add a new task:
|
| 64 |
+
|
| 65 |
+
1. Create a new file in `tasks/` (e.g., `tasks/my_new_task.py`)
|
| 66 |
+
2. Define a `SCENARIO = IncidentScenario(task_id="my_new_task", ...)` — see existing task files for the template
|
| 67 |
+
3. Done. The task loader in `tasks/__init__.py` auto-discovers any `.py` file that exports a `SCENARIO` object.
|
| 68 |
+
|
| 69 |
+
No changes needed to the environment engine, grader, server, or inference script. The grader is generic — it reads ground truth (root cause services, required fixes, keywords, weights) from the scenario definition.
|
| 70 |
+
|
| 71 |
+
## Project Structure
|
| 72 |
+
|
| 73 |
+
```
|
| 74 |
+
IncidentResponse_RL/
|
| 75 |
+
├── models.py # Pydantic models: Action, Observation, State, enums
|
| 76 |
+
├── openenv.yaml # OpenEnv manifest (tasks, models, runtime config)
|
| 77 |
+
├── requirements.txt # Python dependencies
|
| 78 |
+
├── Dockerfile # Container for HF Spaces deployment
|
| 79 |
+
├── inference.py # Baseline agent using OpenAI client
|
| 80 |
+
├── README.md
|
| 81 |
+
│
|
| 82 |
+
├── env/ # Core environment engine
|
| 83 |
+
│   ├── __init__.py
|
| 84 |
+
│   ├── scenario.py # IncidentScenario, ServiceConfig, RequiredFix dataclasses
|
| 85 |
+
│   ├── environment.py # step() / reset() / state() implementation
|
| 86 |
+
│   └── services.py # Alert generation, dependency cascade, data formatting
|
| 87 |
+
│
|
| 88 |
+
├── tasks/ # Task definitions (auto-discovered)
|
| 89 |
+
│   ├── __init__.py # Auto-discovery loader → SCENARIOS dict
|
| 90 |
+
│   ├── easy_oom.py # Easy: Single Service OOM Crash
|
| 91 |
+
│   ├── medium_deadlock.py # Medium: Cascading Database Deadlock
|
| 92 |
+
│   └── hard_concurrent.py # Hard: Concurrent Faults + Misleading Evidence
|
| 93 |
+
│
|
| 94 |
+
├── graders/ # Scoring engine
|
| 95 |
+
│   ├── __init__.py
|
| 96 |
+
│   └── grader.py # Generic rubric-based grader (0.0-1.0)
|
| 97 |
+
│
|
| 98 |
+
└── server/ # FastAPI web server
|
| 99 |
+
    ├── __init__.py
|
| 100 |
+
    └── app.py # /reset, /step, /state, /tasks endpoints
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
## Action Space
|
| 104 |
+
|
| 105 |
+
All actions are sent as a single JSON object with an `action_type` field. Optional fields depend on the action type.
|
| 106 |
+
|
| 107 |
+
### Investigation Actions (read-only, gather information)
|
| 108 |
+
|
| 109 |
+
| Action | Required Fields | Returns |
|
| 110 |
+
|--------|----------------|---------|
|
| 111 |
+
| `read_logs` | `service` | 50+ timestamped log lines with noise and signal |
|
| 112 |
+
| `check_metrics` | `service` | Time-series table (CPU, memory, latency, error rate, etc.) |
|
| 113 |
+
| `ping_service` | `service` | Reachability check with latency |
|
| 114 |
+
| `check_dependencies` | `service` | Upstream dependency list with current health status |
|
| 115 |
+
| `inspect_deploy` | `service` | Deploy history (version, timestamp, status) |
|
| 116 |
+
| `query_traces` | `service` | Distributed trace spans showing latency breakdown |
|
| 117 |
+
| `check_runbook` | `service` | Operational runbook with troubleshooting steps |
|
| 118 |
+
| `diff_config` | `service` | Current vs previous config comparison |
|
| 119 |
+
|
| 120 |
+
### Remediation Actions (modify environment state)
|
| 121 |
+
|
| 122 |
+
| Action | Required Fields | Effect |
|
| 123 |
+
|--------|----------------|--------|
|
| 124 |
+
| `restart_service` | `service` | Restarts pods. Fixes OOM/leak issues. No effect if root cause is elsewhere. |
|
| 125 |
+
| `rollback_deploy` | `service`, `target_version` | Rolls back to specified version. Must match exact version string. |
|
| 126 |
+
| `scale_up` | `service`, `replicas` | Increases replica count. Can alleviate memory pressure. |
|
| 127 |
+
| `drain_traffic` | `service` | Stops routing traffic to the service. |
|
| 128 |
+
|
| 129 |
+
### Terminal Action
|
| 130 |
+
|
| 131 |
+
| Action | Required Fields | Effect |
|
| 132 |
+
|--------|----------------|--------|
|
| 133 |
+
| `submit_diagnosis` | `root_cause_service`, `root_cause_category`, `fix_description` | Ends episode, triggers grading. |
|
| 134 |
+
|
| 135 |
+
### Root Cause Categories
|
| 136 |
+
|
| 137 |
+
`oom_crash`, `db_deadlock`, `bad_deploy`, `memory_leak`, `network_partition`, `disk_full`, `config_error`, `cert_expiry`, `dns_failure`, `rate_limit`
|
| 138 |
+
|
| 139 |
+
### Example Actions
|
| 140 |
+
|
| 141 |
+
```json
|
| 142 |
+
{"action_type": "read_logs", "service": "auth-service"}
|
| 143 |
+
{"action_type": "check_metrics", "service": "db-postgres"}
|
| 144 |
+
{"action_type": "rollback_deploy", "service": "payment-service", "target_version": "v3.8.1"}
|
| 145 |
+
{"action_type": "submit_diagnosis", "root_cause_service": "db-postgres", "root_cause_category": "db_deadlock", "fix_description": "Restarted db-postgres to clear deadlock caused by analytics-cron query"}
|
| 146 |
+
```
|
| 147 |
+
|
| 148 |
+
## Observation Space
|
| 149 |
+
|
| 150 |
+
On `reset()`, the agent receives:
|
| 151 |
+
- **Service health dashboard** — all 7 services with status (`HEALTHY`/`DEGRADED`/`DOWN`), version, replica count
|
| 152 |
+
- **Active alerts** — severity-tagged alerts (SEV-1/SEV-2/SEV-3)
|
| 153 |
+
- **Incident summary** — text description of the situation
|
| 154 |
+
|
| 155 |
+
On each `step()`, the agent receives:
|
| 156 |
+
- **Updated service statuses** — health may change after remediation
|
| 157 |
+
- **Updated alerts** — alerts clear when services recover
|
| 158 |
+
- **Action result** — the data returned by the action (logs, metrics, traces, etc.)
|
| 159 |
+
- **Reward** — per-step reward signal
|
| 160 |
+
- **Done flag** — whether the episode has ended
|
| 161 |
+
- **Score** — final score (only on terminal step)
|
| 162 |
+
|
| 163 |
+
### Progressive Revelation
|
| 164 |
+
|
| 165 |
+
The agent does NOT see all data upfront. It must actively choose which services to investigate and which data to request. Each investigation action consumes a step, creating a planning pressure: the agent must balance information gathering with remediation within the step budget.
|
| 166 |
+
|
| 167 |
+
### Post-Remediation Feedback
|
| 168 |
+
|
| 169 |
+
After any remediation action, the observation includes a `[POST-REMEDIATION CHECK]` that lists which services are still unhealthy. This is critical for the hard task — after fixing only one of two faults, the check reveals remaining issues.
|
| 170 |
+
|
| 171 |
+
## Reward Function
|
| 172 |
+
|
| 173 |
+
### Per-Step Shaping
|
| 174 |
+
|
| 175 |
+
| Action | Reward |
|
| 176 |
+
|--------|--------|
|
| 177 |
+
| Investigating a root-cause service | +0.01 |
|
| 178 |
+
| Investigating a non-root-cause service | 0.00 |
|
| 179 |
+
| Correct remediation (matches required fix) | +0.05 |
|
| 180 |
+
| Wrong remediation (wrong service or wrong fix type) | -0.05 |
|
| 181 |
+
|
| 182 |
+
### Terminal Grading (0.0 - 1.0)
|
| 183 |
+
|
| 184 |
+
The grader is generic and rubric-based. Each task defines its own weights:
|
| 185 |
+
|
| 186 |
+
| Component | Easy | Medium | Hard |
|
| 187 |
+
|-----------|------|--------|------|
|
| 188 |
+
| Correct root cause service identified | 0.30 | 0.25 | 0.15 |
|
| 189 |
+
| Correct root cause category | 0.20 | 0.20 | 0.10 |
|
| 190 |
+
| Primary fix applied | 0.30 | 0.25 | 0.15 |
|
| 191 |
+
| Secondary fix(es) applied | -- | -- | 0.20 |
|
| 192 |
+
| Diagnosis text quality (keyword match) | 0.10 | 0.10 | 0.15 |
|
| 193 |
+
| Investigation thoroughness | 0.10 | 0.10 | 0.10 |
|
| 194 |
+
| Wrong remediation penalty | -0.03/ea | -0.05/ea | -0.05/ea |
|
| 195 |
+
|
| 196 |
+
**Diagnosis text scoring** uses deterministic keyword matching — the grader checks if the fix description mentions key terms (service names, fault types, fix actions). No LLM-based judging.
|
| 197 |
+
|
| 198 |
+
**Investigation thoroughness** checks whether the agent examined at least one root-cause service before submitting.
|
| 199 |
+
|
| 200 |
+
## Setup
|
| 201 |
+
|
| 202 |
+
### Local Development
|
| 203 |
+
|
| 204 |
+
```bash
|
| 205 |
+
pip install -r requirements.txt
|
| 206 |
+
python -m uvicorn server.app:app --host 0.0.0.0 --port 8000
|
| 207 |
+
```
|
| 208 |
+
|
| 209 |
+
### Docker
|
| 210 |
+
|
| 211 |
+
```bash
|
| 212 |
+
docker build -t sre-incident-response .
|
| 213 |
+
docker run -p 8000:8000 sre-incident-response
|
| 214 |
+
```
|
| 215 |
+
|
| 216 |
+
### API Usage
|
| 217 |
+
|
| 218 |
+
```bash
|
| 219 |
+
# List available tasks
|
| 220 |
+
curl http://localhost:8000/tasks
|
| 221 |
+
|
| 222 |
+
# Reset (start a new episode)
|
| 223 |
+
curl -X POST http://localhost:8000/reset \
|
| 224 |
+
-H "Content-Type: application/json" \
|
| 225 |
+
-d '{"task_id": "easy"}'
|
| 226 |
+
|
| 227 |
+
# Step (take an action)
|
| 228 |
+
curl -X POST http://localhost:8000/step \
|
| 229 |
+
-H "Content-Type: application/json" \
|
| 230 |
+
-d '{"session_id": "<SESSION_ID>", "action": {"action_type": "read_logs", "service": "auth-service"}}'
|
| 231 |
+
|
| 232 |
+
# Get current episode state
|
| 233 |
+
curl http://localhost:8000/state/<SESSION_ID>
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
OpenEnv-prefixed endpoints are also available: `/openenv/reset`, `/openenv/step`, `/openenv/state/{session_id}`, `/openenv/tasks`.
|
| 237 |
+
|
| 238 |
+
### Running Inference
|
| 239 |
+
|
| 240 |
+
```bash
|
| 241 |
+
export HF_TOKEN=your_token
|
| 242 |
+
export API_BASE_URL=https://router.huggingface.co/v1
|
| 243 |
+
export MODEL_NAME=Qwen/Qwen2.5-72B-Instruct
|
| 244 |
+
|
| 245 |
+
python inference.py
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
The inference script runs a baseline LLM agent against all tasks, emitting structured stdout logs:
|
| 249 |
+
|
| 250 |
+
```
|
| 251 |
+
[START] task=easy env=sre_incident_response model=Qwen/Qwen2.5-72B-Instruct
|
| 252 |
+
[STEP] step=1 action=read_logs(auth-service) reward=0.01 done=false error=null
|
| 253 |
+
[STEP] step=2 action=check_metrics(auth-service) reward=0.01 done=false error=null
|
| 254 |
+
[STEP] step=3 action=restart_service(auth-service) reward=0.05 done=false error=null
|
| 255 |
+
[STEP] step=4 action=submit_diagnosis reward=1.00 done=true error=null
|
| 256 |
+
[END] success=true steps=4 score=1.00 rewards=0.01,0.01,0.05,1.00
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
## Baseline Scores
|
| 260 |
+
|
| 261 |
+
| Task | Expected Score Range | What a Perfect Agent Scores |
|
| 262 |
+
|------|---------------------|---------------------------|
|
| 263 |
+
| easy | 0.70 - 0.95 | 1.00 |
|
| 264 |
+
| medium | 0.40 - 0.75 | 0.90 |
|
| 265 |
+
| hard | 0.20 - 0.55 | 0.85 |
|
| 266 |
+
|
| 267 |
+
## Environment Variables
|
| 268 |
+
|
| 269 |
+
| Variable | Description | Default |
|
| 270 |
+
|----------|-------------|---------|
|
| 271 |
+
| `API_BASE_URL` | LLM API endpoint | `https://router.huggingface.co/v1` |
|
| 272 |
+
| `MODEL_NAME` | Model identifier | `Qwen/Qwen2.5-72B-Instruct` |
|
| 273 |
+
| `HF_TOKEN` | HuggingFace API key | Required |
|
| 274 |
+
| `PORT` | Server port | `8000` |
|
| 275 |
+
| `SRE_TASKS` | Comma-separated task IDs to run in inference | `easy,medium,hard` |
|
| 276 |
+
|
| 277 |
+
## OpenEnv Spec Compliance
|
| 278 |
+
|
| 279 |
+
- `openenv.yaml` with metadata, task definitions, typed models, and runtime config
|
| 280 |
+
- `step(action)` returns observation, reward, done, info
|
| 281 |
+
- `reset()` returns initial observation
|
| 282 |
+
- `state()` returns current episode metadata
|
| 283 |
+
- Typed Pydantic models for Action, Observation, and State
|
| 284 |
+
- 3 tasks with programmatic graders (easy, medium, hard)
|
| 285 |
+
- Scores in 0.0-1.0 range with partial progress signals
|
| 286 |
+
- Working Dockerfile for containerized execution
|
| 287 |
+
- Baseline inference script (`inference.py`) with reproducible scores
|
env/__init__.py
ADDED
|
File without changes
|
env/environment.py
ADDED
|
@@ -0,0 +1,363 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Core environment engine — implements reset/step/state for the SRE Incident Response env.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import uuid
|
| 6 |
+
from typing import Any, Dict, Optional, Set, Tuple
|
| 7 |
+
|
| 8 |
+
from models import (
|
| 9 |
+
Action,
|
| 10 |
+
ActionType,
|
| 11 |
+
GraderResult,
|
| 12 |
+
INVESTIGATION_ACTIONS,
|
| 13 |
+
Observation,
|
| 14 |
+
REMEDIATION_ACTIONS,
|
| 15 |
+
RootCauseCategory,
|
| 16 |
+
ServiceState,
|
| 17 |
+
ServiceStatus,
|
| 18 |
+
State,
|
| 19 |
+
)
|
| 20 |
+
from env.scenario import IncidentScenario, RequiredFix
|
| 21 |
+
from tasks import SCENARIOS
|
| 22 |
+
from env.services import (
|
| 23 |
+
format_config_diff,
|
| 24 |
+
format_deploy_history,
|
| 25 |
+
format_dependencies,
|
| 26 |
+
format_logs,
|
| 27 |
+
format_metrics,
|
| 28 |
+
format_runbook,
|
| 29 |
+
format_traces,
|
| 30 |
+
generate_alerts,
|
| 31 |
+
ping_service,
|
| 32 |
+
recompute_health,
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Session:
    """Mutable bookkeeping for a single episode of one incident scenario."""

    def __init__(self, scenario: IncidentScenario, session_id: str):
        self.session_id = session_id
        self.scenario = scenario
        self.step_count = 0
        self.done = False
        self.cumulative_reward = 0.0

        # Mutable service state, seeded from the scenario:
        # {name: {status, version, replicas}}
        self.services: Dict[str, Dict[str, Any]] = {
            svc_name: {
                "status": svc_cfg.status,
                "version": svc_cfg.version,
                "replicas": svc_cfg.replicas,
            }
            for svc_name, svc_cfg in scenario.services.items()
        }

        # Root-cause services that have been fixed so far.
        self.fixed_services: Set[str] = set()

        # service_name -> fault_type, restricted to services flagged as a
        # root cause that also carry a non-empty fault type.
        self.root_cause_map: Dict[str, str] = {
            svc_name: svc_cfg.fault_type
            for svc_name, svc_cfg in scenario.services.items()
            if svc_cfg.is_root_cause and svc_cfg.fault_type
        }

        # History kept for grading.
        self.actions: list[Action] = []
        self.services_investigated: Set[str] = set()
        self.remediations_applied: list[Dict[str, Any]] = []
        self.diagnosis: Optional[Action] = None
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class IncidentResponseEnv:
|
| 72 |
+
"""The SRE Incident Response OpenEnv environment."""
|
| 73 |
+
|
| 74 |
+
def __init__(self):
|
| 75 |
+
self.sessions: Dict[str, Session] = {}
|
| 76 |
+
|
| 77 |
+
def get_task_ids(self) -> list[str]:
    """Return the IDs of all registered scenarios, in ``SCENARIOS`` order."""
    return [task_id for task_id in SCENARIOS.keys()]
|
| 79 |
+
|
| 80 |
+
def reset(self, task_id: str, seed: int = 0) -> Tuple[Observation, str]:
    """Start a new episode for the given task.

    Args:
        task_id: Key of the scenario in ``SCENARIOS`` to run.
        seed: Accepted for interface compatibility; not used by this method.

    Returns:
        ``(observation, session_id)``: the initial observation (built with
        no action result) and the ID under which the session is stored.

    Raises:
        ValueError: If ``task_id`` is not a known scenario.
    """
    if task_id not in SCENARIOS:
        raise ValueError(f"Unknown task_id: {task_id}. Available: {list(SCENARIOS.keys())}")

    scenario = SCENARIOS[task_id]

    # IDs are only 8 hex characters (32 bits) and sessions are kept in
    # self.sessions, so regenerate on collision rather than silently
    # replacing a session that is still stored under the same ID.
    session_id = uuid.uuid4().hex[:8]
    while session_id in self.sessions:
        session_id = uuid.uuid4().hex[:8]

    session = Session(scenario, session_id)
    self.sessions[session_id] = session

    # Build initial observation
    obs = self._build_observation(session, action_result=None)
    return obs, session_id
|
| 93 |
+
|
| 94 |
+
def step(self, session_id: str, action: Action) -> Tuple[Observation, float, bool, Dict]:
    """Execute an action and return ``(observation, reward, done, info)``.

    Every call on a live episode consumes one step, including calls whose
    action is rejected for a missing or unknown service. Rejected actions
    yield reward 0.0 and an ``"error"`` entry in ``info``.

    The episode ends when a diagnosis is submitted or when the step budget
    (``scenario.max_steps``) is used up. In both cases the returned reward
    is the grader score and ``info["grader_result"]`` holds the dumped
    grader result.

    Args:
        session_id: ID returned by ``reset()``.
        action: The action to apply.

    Returns:
        The new observation, the reward for this step, the done flag, and
        an info dict (possibly empty).

    Raises:
        ValueError: If ``session_id`` is unknown (raised by ``_get_session``).
    """
    session = self._get_session(session_id)
    if session.done:
        finished_msg = "Episode already finished."
        obs = self._build_observation(session, action_result=finished_msg)
        return obs, 0.0, True, {"error": finished_msg}

    session.step_count += 1
    session.actions.append(action)

    reward = 0.0
    action_result = ""
    info: Dict[str, Any] = {}

    service_name = action.service
    scenario = session.scenario
    action_type = action.action_type

    # Every action except the terminal diagnosis must target a known service.
    error: Optional[str] = None
    if action_type != ActionType.SUBMIT_DIAGNOSIS:
        if not service_name:
            error = "Action requires a 'service' parameter."
        elif service_name not in scenario.services:
            error = f"Unknown service: '{service_name}'. Available: {list(scenario.services.keys())}"

    if error is not None:
        # Do not return early: a rejected action has already consumed a
        # step, so it must still reach the step-budget check below.
        # Returning here would let invalid actions extend an episode forever.
        action_result = error
        info["error"] = error

    # -- Investigation actions --
    elif action_type in INVESTIGATION_ACTIONS:
        session.services_investigated.add(service_name)
        action_result = self._handle_investigation(session, action)
        # Small reward for investigating root cause services
        reward = 0.01 if service_name in scenario.root_cause_services else 0.0

    # -- Remediation actions --
    elif action_type in REMEDIATION_ACTIONS:
        action_result, reward = self._handle_remediation(session, action)
        session.remediations_applied.append({
            "action": action_type.value,
            "service": service_name,
            "target_version": action.target_version,
            "replicas": action.replicas,
        })

    # -- Submit diagnosis --
    elif action_type == ActionType.SUBMIT_DIAGNOSIS:
        session.diagnosis = action
        session.done = True
        grader_result = self._grade(session)
        reward = grader_result.score
        action_result = f"Diagnosis submitted. Score: {grader_result.score:.2f}"
        info["grader_result"] = grader_result.model_dump()

    # Check max steps
    if session.step_count >= scenario.max_steps and not session.done:
        session.done = True
        if session.diagnosis is None:
            # Auto-grade with whatever we have
            grader_result = self._grade(session)
            reward = grader_result.score
            info["grader_result"] = grader_result.model_dump()
            action_result += f"\n[MAX STEPS REACHED] Episode ended. Score: {grader_result.score:.2f}"

    # Accumulate only after the budget check so cumulative_reward always
    # equals the sum of the rewards actually returned to the caller.
    session.cumulative_reward += reward

    obs = self._build_observation(session, action_result=action_result, reward=reward)
    obs.done = session.done
    if "grader_result" in info:
        obs.score = info["grader_result"]["score"]

    return obs, reward, session.done, info
|
| 170 |
+
|
| 171 |
+
def state(self, session_id: str) -> State:
    """Return a ``State`` snapshot of the episode stored under ``session_id``.

    Raises:
        ValueError: If ``session_id`` is unknown (raised by ``_get_session``).
    """
    session = self._get_session(session_id)
    scenario = session.scenario

    action_names = [taken.action_type.value for taken in session.actions]
    remediation_labels = [
        f"{entry['action']}({entry['service']})"
        for entry in session.remediations_applied
    ]

    return State(
        session_id=session.session_id,
        task_id=scenario.task_id,
        step_count=session.step_count,
        max_steps=scenario.max_steps,
        done=session.done,
        actions_taken=action_names,
        services_investigated=list(session.services_investigated),
        remediations_applied=remediation_labels,
        cumulative_reward=round(session.cumulative_reward, 4),
    )
|
| 185 |
+
|
| 186 |
+
# ── Internal helpers ───────────────────────────────────────────────
|
| 187 |
+
|
| 188 |
+
def _get_session(self, session_id: str) -> Session:
|
| 189 |
+
if session_id not in self.sessions:
|
| 190 |
+
raise ValueError(f"Unknown session: {session_id}")
|
| 191 |
+
return self.sessions[session_id]
|
| 192 |
+
|
| 193 |
+
def _build_observation(
    self, session: Session, action_result: Optional[str], reward: float = 0.0,
) -> Observation:
    """Assemble the ``Observation`` for the session's current state.

    Args:
        session: Episode whose services, alerts and counters are reported.
        action_result: Text produced by the last action, or ``None``.
        reward: Reward to report; rounded to 4 decimal places.
    """
    scenario = session.scenario

    service_view = {
        svc_name: ServiceState(
            status=fields["status"],
            version=fields["version"],
            replicas=fields["replicas"],
        )
        for svc_name, fields in session.services.items()
    }

    current_alerts = generate_alerts(
        session.services, scenario.initial_alerts, session.fixed_services,
    )

    # The incident summary is only sent with the initial (step 0) observation.
    if session.step_count == 0:
        summary = scenario.incident_summary
    else:
        summary = ""

    return Observation(
        step_number=session.step_count,
        timestamp=f"2026-04-06T04:{session.step_count:02d}:00Z",
        services=service_view,
        active_alerts=current_alerts,
        incident_summary=summary,
        action_result=action_result,
        reward=round(reward, 4),
        done=session.done,
    )
|
| 219 |
+
|
| 220 |
+
def _handle_investigation(self, session: Session, action: Action) -> str:
    """Produce the text shown to the agent for an investigation action.

    Most action types look up pre-written scenario data for ``action.service``
    and hand it to the matching formatter. ``PING_SERVICE`` and
    ``CHECK_DEPENDENCIES`` also read live service status from the session.

    Returns:
        The formatted result, or a "No data available" message when the
        action type is not one of the handled investigation types.
    """
    scenario = session.scenario
    svc = action.service
    kind = action.action_type

    if kind == ActionType.PING_SERVICE:
        return ping_service(session.services[svc]["status"], svc)

    if kind == ActionType.CHECK_DEPENDENCIES:
        deps = scenario.dependencies.get(svc, [])
        dep_info = format_dependencies(deps)
        # Append the current health of every dependency known to the session.
        health_lines = [
            f" {d}: {session.services[d]['status'].value}"
            for d in deps
            if d in session.services
        ]
        if health_lines:
            dep_info += "\n\nDependency health:\n" + "\n".join(health_lines)
        return dep_info

    # (action type, scenario attribute, default when missing, formatter)
    static_lookups = (
        (ActionType.READ_LOGS, "logs", [], format_logs),
        (ActionType.CHECK_METRICS, "metrics", [], format_metrics),
        (ActionType.INSPECT_DEPLOY, "deploy_history", [], format_deploy_history),
        (ActionType.QUERY_TRACES, "traces", [], format_traces),
        (ActionType.CHECK_RUNBOOK, "runbooks", "", format_runbook),
        (ActionType.DIFF_CONFIG, "configs", {}, format_config_diff),
    )
    for wanted, attr, default, formatter in static_lookups:
        if kind == wanted:
            return formatter(getattr(scenario, attr).get(svc, default))

    return f"No data available for {action.action_type.value} on {svc}."
|
| 265 |
+
|
| 266 |
+
def _handle_remediation(self, session: Session, action: Action) -> Tuple[str, float]:
    """Apply a remediation action to the simulated services.

    The action is first compared against the scenario's required fixes; a
    match marks the service as fixed and earns +0.05. The per-action branch
    then updates service state and builds the result text, overriding the
    reward with -0.05 for ineffective actions. Finally health is re-propagated
    through the dependency graph and a status summary is appended.

    Returns:
        ``(result_text, reward)``.
    """
    scenario = session.scenario
    svc = action.service
    kind = action.action_type
    reward = 0.0
    result = ""

    # Does this action satisfy any of the scenario's required fixes?
    fix_matched = any(
        self._fix_matches(action, fix) for fix in scenario.required_fixes
    )
    if fix_matched:
        session.fixed_services.add(svc)
        reward = 0.05

    if kind == ActionType.RESTART_SERVICE:
        if fix_matched:
            session.services[svc]["status"] = ServiceStatus.HEALTHY
            result = f"Service '{svc}' restarted successfully. Status: HEALTHY"
        else:
            # Restart that is not a required fix: state is left untouched and
            # the message depends on the service's current status.
            current = session.services[svc]["status"]
            if current == ServiceStatus.DOWN and svc in session.root_cause_map:
                result = f"Service '{svc}' restarted but crashed again β underlying issue persists."
            elif current == ServiceStatus.HEALTHY:
                result = f"Service '{svc}' restarted. It was already healthy β no change."
            else:
                result = f"Service '{svc}' restarted. Status unchanged β issue is caused by an upstream dependency."
            reward = -0.05

    elif kind == ActionType.ROLLBACK_DEPLOY:
        if fix_matched:
            session.services[svc]["version"] = action.target_version or ""
            session.services[svc]["status"] = ServiceStatus.HEALTHY
            result = (
                f"Service '{svc}' rolled back to {action.target_version}. "
                f"Pods restarting with previous version... Status: HEALTHY"
            )
        else:
            # NOTE: the stored version is not modified on this path.
            current_version = session.services[svc]["version"]
            result = (
                f"Rolled back '{svc}' to {action.target_version}, but this didn't resolve the issue. "
                f"Previous version was {current_version}."
            )
            reward = -0.05

    elif kind == ActionType.SCALE_UP:
        replicas = action.replicas or 3
        # The replica count is updated whether or not the scale-up helps.
        session.services[svc]["replicas"] = replicas
        if fix_matched or (svc in scenario.root_cause_services):
            # Scaling any root-cause service counts as a fix.
            session.fixed_services.add(svc)
            session.services[svc]["status"] = ServiceStatus.HEALTHY
            result = f"Service '{svc}' scaled to {replicas} replicas. Memory pressure alleviated. Status: HEALTHY"
            reward = 0.05
        else:
            result = f"Service '{svc}' scaled to {replicas} replicas. No effect on the underlying issue."
            reward = -0.05

    elif kind == ActionType.DRAIN_TRAFFIC:
        result = f"Traffic drained from '{svc}'. Service is no longer receiving requests."
        if svc not in scenario.root_cause_services:
            reward = -0.05

    # Re-propagate health through the dependency graph after the change.
    session.services = recompute_health(
        session.services,
        scenario.dependencies,
        session.fixed_services,
        session.root_cause_map,
    )

    # Append a summary of which services are still not HEALTHY.
    unhealthy = [
        name
        for name, info in session.services.items()
        if info["status"] != ServiceStatus.HEALTHY
    ]
    if unhealthy:
        result += f"\n\n[POST-REMEDIATION CHECK] Services still unhealthy: {', '.join(unhealthy)}"
    else:
        result += "\n\n[POST-REMEDIATION CHECK] All services are now HEALTHY."

    return result, reward
|
| 349 |
+
|
| 350 |
+
def _fix_matches(self, action: Action, req_fix: RequiredFix) -> bool:
|
| 351 |
+
"""Check if an action matches a required fix."""
|
| 352 |
+
if action.action_type.value != req_fix.action:
|
| 353 |
+
return False
|
| 354 |
+
if action.service != req_fix.service:
|
| 355 |
+
return False
|
| 356 |
+
if req_fix.target_version and action.target_version != req_fix.target_version:
|
| 357 |
+
return False
|
| 358 |
+
return True
|
| 359 |
+
|
| 360 |
+
def _grade(self, session: Session) -> GraderResult:
    """Score the episode by delegating to ``graders.grader.grade_episode``."""
    # Imported at call time rather than at module load
    # (presumably to avoid an import cycle with graders.grader - confirm).
    import graders.grader as grader_module

    return grader_module.grade_episode(session)
|
env/scenario.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Scenario schema — shared dataclasses used by all task definitions.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import Any, Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from models import RootCauseCategory, ServiceStatus
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class ServiceConfig:
    """Configuration for a single service in the simulated architecture."""
    # Health status of the service (a ``models.ServiceStatus`` value).
    status: ServiceStatus
    # Names of the services this one depends on; empty list by default.
    deps: List[str] = field(default_factory=list)
    # Deployed version label; empty string when unspecified.
    version: str = ""
    # Number of replicas; defaults to a single replica.
    replicas: int = 1
    # True when this service is a root cause of the incident.
    is_root_cause: bool = False
    # Label for the injected fault, or None.
    # NOTE(review): the set of valid labels is not visible here - confirm.
    fault_type: Optional[str] = None
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class RequiredFix:
    """A fix that the agent must apply to resolve the incident."""
    # Remediation action name, e.g. "restart_service", "rollback_deploy", "scale_up".
    action: str
    # Name of the service the action must target.
    service: str
    # Version a rollback must target; None means no specific version is required.
    target_version: Optional[str] = None
    # Replica count associated with a scale-up fix, or None.
    replicas: Optional[int] = None
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class IncidentScenario:
    """
    A self-contained incident scenario definition.

    To create a new task, create a new Python file in tasks/ that instantiates
    this dataclass and assigns it to a module-level variable named SCENARIO.
    """
    # Identity and episode limits.
    task_id: str
    name: str
    difficulty: str  # "easy", "medium", "hard"
    max_steps: int
    incident_summary: str

    # Service architecture: service name -> its configuration.
    services: Dict[str, ServiceConfig] = field(default_factory=dict)

    # Pre-written data per service, keyed by service name.
    logs: Dict[str, List[str]] = field(default_factory=dict)
    metrics: Dict[str, List[Dict[str, Any]]] = field(default_factory=dict)
    traces: Dict[str, List[str]] = field(default_factory=dict)
    deploy_history: Dict[str, List[str]] = field(default_factory=dict)
    runbooks: Dict[str, str] = field(default_factory=dict)
    configs: Dict[str, Dict[str, str]] = field(default_factory=dict)
    # Service name -> names of the services it depends on.
    dependencies: Dict[str, List[str]] = field(default_factory=dict)

    # Initial alerts
    initial_alerts: List[str] = field(default_factory=list)

    # Ground truth for grading
    root_cause_services: List[str] = field(default_factory=list)
    root_cause_categories: List[RootCauseCategory] = field(default_factory=list)
    required_fixes: List[RequiredFix] = field(default_factory=list)
    diagnosis_keywords: List[str] = field(default_factory=list)

    # Grading weights: rubric component name -> weight.
    weights: Dict[str, float] = field(default_factory=dict)
|
env/services.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Service simulation helpers — generates alerts, formats data, cascades dependency health.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, List, Set, Tuple
|
| 6 |
+
|
| 7 |
+
from models import ServiceStatus
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def generate_alerts(
    services: Dict[str, Any],
    scenario_alerts: List[str],
    fixed_services: Set[str],
) -> List[str]:
    """Build the alert list from the current service statuses.

    A SEV-1 alert is raised for each DOWN service and a SEV-2 alert for each
    DEGRADED one, unless the service is in ``fixed_services``. When nothing
    is alerting, a single informational line is returned instead.
    ``scenario_alerts`` is accepted but not consulted.
    """
    active: List[str] = []
    for name, info in services.items():
        state = info["status"]
        if name in fixed_services:
            continue  # remediated services never raise alerts
        if state == ServiceStatus.DOWN:
            active.append(f"[ALERT SEV-1] {name}: service is DOWN, 0 healthy pods")
        elif state == ServiceStatus.DEGRADED:
            active.append(f"[ALERT SEV-2] {name}: service is DEGRADED")
    return active or ["[INFO] All services HEALTHY β no active alerts."]
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def recompute_health(
    services: Dict[str, Any],
    dependencies: Dict[str, List[str]],
    fixed_services: Set[str],
    root_cause_map: Dict[str, str],
    max_rounds: int = 5,
) -> Dict[str, Any]:
    """Walk the dependency graph and return updated service health.

    The input mapping is not mutated; a per-service shallow copy is returned.

    Rules:
    - Every service in ``fixed_services`` becomes HEALTHY.
    - A root-cause service that is not fixed keeps its current status.
    - Any other service with at least one known dependency becomes DEGRADED
      if any dependency is DOWN or DEGRADED, and HEALTHY otherwise
      (being downstream of a DOWN service degrades it; it does not go DOWN).
    - Services with no dependencies, or whose dependencies are all unknown,
      are left as they are.

    Args:
        services: service name -> state dict containing a ``"status"`` key.
        dependencies: service name -> names of the services it depends on.
        fixed_services: names of services that have been remediated.
        root_cause_map: mapping keyed by root-cause service name.
        max_rounds: upper bound on propagation passes. Raise it for
            dependency chains that need more than 5 passes to converge.

    Returns:
        A new ``{name: state}`` mapping with recomputed ``"status"`` values.
    """
    updated = {name: dict(info) for name, info in services.items()}

    # Remediated services are healthy regardless of their dependencies.
    for name in fixed_services:
        if name in updated:
            updated[name]["status"] = ServiceStatus.HEALTHY

    # Propagate until a full pass changes nothing (or the round cap is hit).
    for _ in range(max_rounds):
        changed = False
        for name, deps in dependencies.items():
            # Fixed services stay healthy; unfixed root causes stay broken.
            if name in fixed_services or name in root_cause_map:
                continue
            # Ignore graph entries with no matching service, or no deps.
            if name not in updated or not deps:
                continue

            dep_statuses = [updated[d]["status"] for d in deps if d in updated]
            if not dep_statuses:
                continue

            impaired = any(
                s == ServiceStatus.DOWN or s == ServiceStatus.DEGRADED
                for s in dep_statuses
            )
            new_status = ServiceStatus.DEGRADED if impaired else ServiceStatus.HEALTHY

            if updated[name]["status"] != new_status:
                updated[name]["status"] = new_status
                changed = True

        if not changed:
            break

    return updated
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def format_metrics(metrics_list: List[Dict[str, Any]]) -> str:
    """Format time-series metrics into a readable fixed-width table.

    Columns are the ordered union of keys across all rows, so a key that
    first appears in a later row is still shown. Rows lacking a key get an
    empty cell. Each cell is left-aligned and padded to 18 characters.

    Returns:
        The table text, or a placeholder message when there are no rows.
    """
    if not metrics_list:
        return "No metrics available for this service."

    # dict.fromkeys de-duplicates while keeping first-seen order.
    columns = list(dict.fromkeys(key for row in metrics_list for key in row))

    header = " ".join(f"{col:<18}" for col in columns)
    lines = [header, "-" * len(header)]
    lines.extend(
        " ".join(f"{str(row.get(col, '')):<18}" for col in columns)
        for row in metrics_list
    )
    return "\n".join(lines)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def format_logs(log_lines: List[str]) -> str:
    """Return the log lines joined by newlines, or a placeholder when empty."""
    return "\n".join(log_lines) if log_lines else "No logs available for this service."
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def format_traces(trace_lines: List[str]) -> str:
    """Return the trace lines joined by newlines, or a placeholder when empty."""
    return "\n".join(trace_lines) if trace_lines else "No traces available for this service."
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def format_deploy_history(deploy_lines: List[str]) -> str:
    """Return the deploy entries joined by newlines, or a placeholder when empty."""
    if deploy_lines:
        return "\n".join(deploy_lines)
    return "No deploy history available for this service."
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def format_dependencies(deps: List[str]) -> str:
    """Return a one-line, comma-separated dependency summary.

    An empty list yields a sentence stating there are no upstream dependencies.
    """
    if deps:
        return "Dependencies: " + ", ".join(deps)
    return "This service has no upstream dependencies."
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def format_runbook(runbook: str) -> str:
    """Return the runbook text unchanged, or a placeholder when it is empty."""
    return runbook or "No runbook available for this service."
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def format_config_diff(config_data: Dict[str, str]) -> str:
    """Format a service's config diff and current config for display.

    Only the ``"diff"`` and ``"current"`` keys are rendered. When the mapping
    is empty, or holds neither key, the no-data placeholder is returned
    instead of an empty string.
    """
    no_data = "No config data available for this service."
    if not config_data:
        return no_data

    sections = []
    if "diff" in config_data:
        sections.append(f"Config diff: {config_data['diff']}")
    if "current" in config_data:
        sections.append(f"\nCurrent config:\n{config_data['current']}")

    # Nothing renderable: say so rather than returning "".
    if not sections:
        return no_data
    return "\n".join(sections)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def ping_service(status: ServiceStatus, service_name: str) -> str:
    """Return the simulated health-check response for a service.

    HEALTHY and DEGRADED both report a 200 (DEGRADED with slow latency); any
    other status reports the service as unreachable.
    """
    if status == ServiceStatus.HEALTHY:
        return f"PING {service_name}: responding on :8080/healthz β 200 OK (latency: 5ms)"
    if status == ServiceStatus.DEGRADED:
        return f"PING {service_name}: responding on :8080/healthz β 200 OK (latency: 1200ms, SLOW)"
    return f"PING {service_name}: connection refused on :8080/healthz β service unreachable"
|
graders/__init__.py
ADDED
|
File without changes
|
graders/grader.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Deterministic grading engine for the SRE Incident Response environment.
|
| 3 |
+
Scores episodes on a 0.0-1.0 scale based on weighted rubric components.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from typing import TYPE_CHECKING, Dict, List
|
| 9 |
+
|
| 10 |
+
from models import (
|
| 11 |
+
ActionType,
|
| 12 |
+
GraderResult,
|
| 13 |
+
INVESTIGATION_ACTIONS,
|
| 14 |
+
RootCauseCategory,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
if TYPE_CHECKING:
|
| 18 |
+
from env.environment import Session
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def grade_episode(session: Session) -> GraderResult:
    """Grade a completed episode and return a GraderResult.

    The score is the sum of six weighted rubric components (root-cause
    service, root-cause category, primary fix, secondary fixes, diagnosis
    text, investigation) minus a penalty per wrong remediation. It is then
    clamped to [0.0, 1.0] and rounded to 4 decimals. An episode counts as
    solved when the final score is at least 0.7.
    """
    scenario = session.scenario
    weights = scenario.weights
    diagnosis = session.diagnosis
    notes: List[str] = []
    breakdown: Dict[str, float] = {}

    score = 0.0

    # 1. Root cause service identification
    service_score, note = _score_root_cause_service(diagnosis, scenario, weights)
    notes.append(note)
    breakdown["correct_service"] = service_score
    score += service_score

    # 2. Root cause category
    category_score, note = _score_root_cause_category(diagnosis, scenario, weights)
    notes.append(note)
    breakdown["correct_category"] = category_score
    score += category_score

    # 3. Primary fix applied
    fix_score, note = _score_primary_fix(session, scenario, weights)
    if note is not None:
        notes.append(note)
    breakdown["correct_fix"] = fix_score
    score += fix_score

    # 4. Secondary fixes (hard tier)
    secondary_score, note = _score_secondary_fixes(session, scenario, weights)
    if note is not None:
        notes.append(note)
    breakdown["secondary_fix"] = secondary_score
    score += secondary_score

    # 5. Diagnosis text quality (keyword matching)
    text_score, note = _score_diagnosis_text(diagnosis, scenario, weights)
    notes.append(note)
    breakdown["diagnosis_text"] = round(text_score, 4)
    score += text_score

    # 6. Investigation thoroughness
    invest_score, note = _score_investigation(session, scenario, weights)
    notes.append(note)
    breakdown["investigation"] = invest_score
    score += invest_score

    # 7. Wrong remediation penalties
    penalty, wrong_count = _wrong_remediation_penalty(session, scenario, weights)
    if wrong_count > 0:
        notes.append(f"Penalty: {wrong_count} wrong remediation(s) (-{penalty:.2f})")
    breakdown["wrong_penalty"] = -round(penalty, 4)
    score -= penalty

    # Final clamp
    score = round(max(0.0, min(1.0, score)), 4)
    solved = score >= 0.7

    return GraderResult(
        score=score,
        solved=solved,
        breakdown=breakdown,
        notes=notes,
    )


# Remediation actions that are never penalised when aimed at a root-cause
# service, even if they are not the exact required fix.
_ROOT_CAUSE_LENIENT_ACTIONS = ("restart_service", "scale_up", "rollback_deploy")


def _score_root_cause_service(diagnosis, scenario, weights: Dict[str, float]) -> tuple[float, str]:
    """Score whether the submitted root-cause service is a true root cause."""
    if not (diagnosis and diagnosis.root_cause_service):
        return 0.0, "No root cause service submitted."
    if diagnosis.root_cause_service in scenario.root_cause_services:
        return (
            weights.get("correct_service", 0),
            f"Correct root cause service: {diagnosis.root_cause_service}",
        )
    return 0.0, (
        f"Wrong root cause service: {diagnosis.root_cause_service} "
        f"(expected one of: {scenario.root_cause_services})"
    )


def _score_root_cause_category(diagnosis, scenario, weights: Dict[str, float]) -> tuple[float, str]:
    """Score whether the submitted root-cause category is an accepted one."""
    if not (diagnosis and diagnosis.root_cause_category):
        return 0.0, "No root cause category submitted."
    if diagnosis.root_cause_category in scenario.root_cause_categories:
        return (
            weights.get("correct_category", 0),
            f"Correct root cause category: {diagnosis.root_cause_category.value}",
        )
    return 0.0, (
        f"Wrong root cause category: {diagnosis.root_cause_category.value} "
        f"(expected one of: {[c.value for c in scenario.root_cause_categories]})"
    )


def _score_primary_fix(session: Session, scenario, weights: Dict[str, float]) -> tuple[float, str | None]:
    """Score the first required fix; the note is None when no fix is required."""
    primary_fix = scenario.required_fixes[0] if scenario.required_fixes else None
    if not primary_fix:
        return 0.0, None
    if primary_fix.service in session.fixed_services:
        return (
            weights.get("correct_fix", 0),
            f"Primary fix applied: {primary_fix.action}({primary_fix.service})",
        )
    return 0.0, (
        f"Primary fix NOT applied. Expected: {primary_fix.action}({primary_fix.service})"
    )


def _score_secondary_fixes(session: Session, scenario, weights: Dict[str, float]) -> tuple[float, str | None]:
    """Score required fixes after the first, pro-rated by how many were applied.

    Returns a zero score and no note when the scenario has no secondary
    weight or fewer than two required fixes.
    """
    secondary_weight = weights.get("secondary_fix", 0)
    if not (secondary_weight > 0 and len(scenario.required_fixes) > 1):
        return 0.0, None

    secondary_fixes = scenario.required_fixes[1:]
    total = len(secondary_fixes)
    fixed_count = sum(
        1 for fix in secondary_fixes if fix.service in session.fixed_services
    )
    fraction = fixed_count / total

    if fixed_count == total:
        note = f"All {total} secondary fix(es) applied."
    elif fixed_count > 0:
        note = f"Partial secondary fixes: {fixed_count}/{total} applied."
    else:
        note = f"No secondary fixes applied (needed {total})."
    return secondary_weight * fraction, note


def _score_diagnosis_text(diagnosis, scenario, weights: Dict[str, float]) -> tuple[float, str]:
    """Score the free-text fix description by case-insensitive keyword hits."""
    if not (diagnosis and diagnosis.fix_description):
        return 0.0, "No diagnosis description submitted."

    text_weight = weights.get("diagnosis_text", 0)
    desc_lower = diagnosis.fix_description.lower()
    keywords = scenario.diagnosis_keywords
    matched = sum(1 for kw in keywords if kw.lower() in desc_lower)
    # Matching half of the keywords (at least one) already earns full marks.
    fraction = min(matched / max(len(keywords) // 2, 1), 1.0)
    return text_weight * fraction, (
        f"Diagnosis text: {matched}/{len(keywords)} keywords matched "
        f"({fraction:.0%} of required)"
    )


def _score_investigation(session: Session, scenario, weights: Dict[str, float]) -> tuple[float, str]:
    """Award the investigation weight if any root-cause service was examined."""
    invest_weight = weights.get("investigation", 0)
    investigated_root = any(
        svc in session.services_investigated
        for svc in scenario.root_cause_services
    )
    if investigated_root:
        return invest_weight, (
            f"Investigation: examined root cause service(s) "
            f"({session.services_investigated & set(scenario.root_cause_services)})"
        )
    return 0.0, (
        f"Investigation: did NOT examine any root cause service. "
        f"Investigated: {session.services_investigated or 'none'}"
    )


def _is_acceptable_remediation(rem, scenario) -> bool:
    """Return True when an applied remediation should not be penalised.

    A remediation is acceptable when it matches a required fix (action,
    service and, if specified, target version), or when it is a restart,
    scale-up or rollback aimed at a root-cause service.
    """
    if (
        rem["service"] in scenario.root_cause_services
        and rem["action"] in _ROOT_CAUSE_LENIENT_ACTIONS
    ):
        return True
    return any(
        rem["action"] == req_fix.action
        and rem["service"] == req_fix.service
        and not (
            req_fix.target_version
            and rem.get("target_version") != req_fix.target_version
        )
        for req_fix in scenario.required_fixes
    )


def _wrong_remediation_penalty(session: Session, scenario, weights: Dict[str, float]) -> tuple[float, int]:
    """Return ``(total_penalty, wrong_count)`` over the applied remediations."""
    penalty_per = weights.get("wrong_penalty", 0.05)
    penalty = 0.0
    wrong_count = 0
    for rem in session.remediations_applied:
        if not _is_acceptable_remediation(rem, scenario):
            wrong_count += 1
            # Accumulate by addition so float results match step-by-step summing.
            penalty += penalty_per
    return penalty, wrong_count
|
inference.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference Script — SRE Incident Response Environment
|
| 3 |
+
=====================================================
|
| 4 |
+
MANDATORY:
|
| 5 |
+
- Before submitting, ensure the following variables are defined in your environment:
|
| 6 |
+
API_BASE_URL The API endpoint for the LLM.
|
| 7 |
+
MODEL_NAME The model identifier to use for inference.
|
| 8 |
+
HF_TOKEN Your Hugging Face / API key.
|
| 9 |
+
LOCAL_IMAGE_NAME The name of the local Docker image (if using from_docker_image)
|
| 10 |
+
|
| 11 |
+
- The inference script must be named `inference.py` and placed in the root directory
|
| 12 |
+
- Participants must use OpenAI Client for all LLM calls using above variables
|
| 13 |
+
|
| 14 |
+
STDOUT FORMAT:
|
| 15 |
+
[START] task=<task_name> env=<benchmark> model=<model_name>
|
| 16 |
+
[STEP] step=<n> action=<action_str> reward=<0.00> done=<true|false> error=<msg|null>
|
| 17 |
+
[END] success=<true|false> steps=<n> score=<score> rewards=<r1,r2,...,rn>
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
import asyncio
|
| 21 |
+
import json
|
| 22 |
+
import os
|
| 23 |
+
import sys
|
| 24 |
+
import textwrap
|
| 25 |
+
from typing import List
|
| 26 |
+
|
| 27 |
+
from openai import OpenAI
|
| 28 |
+
|
| 29 |
+
# Add project root to path
|
| 30 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 31 |
+
|
| 32 |
+
from models import Action, ActionType, RootCauseCategory
|
| 33 |
+
from env.environment import IncidentResponseEnv
|
| 34 |
+
|
| 35 |
+
# Runtime configuration, read from environment variables at import time.
IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")  # None when the variable is unset
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")  # HF_TOKEN wins; API_KEY is the fallback
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
BENCHMARK = "sre_incident_response"
MAX_STEPS = 20
TEMPERATURE = 0.7  # sampling temperature passed to the chat-completions call
# Same value as the 0.7 "solved" cutoff hard-coded in graders/grader.py.
SUCCESS_SCORE_THRESHOLD = 0.7
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ── Logging helpers (strict format) ───────────────────────────────────
|
| 46 |
+
|
| 47 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line announcing the task, benchmark and model."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def log_step(step: int, action: str, reward: float, done: bool, error) -> None:
    """Print one ``[STEP]`` line in the strict stdout format.

    ``reward`` is shown with two decimals, ``done`` as ``true``/``false`` and
    ``error`` as ``null`` when it is None (otherwise its ``str()``).
    """
    error_field = "null" if error is None else str(error)
    done_field = str(bool(done)).lower()
    line = (
        f"[STEP] step={step} action={action} reward={reward:.2f} "
        f"done={done_field} error={error_field}"
    )
    print(line, flush=True)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the closing ``[END]`` line in the strict stdout format.

    ``score`` and each reward are shown with two decimals; rewards are
    comma-separated with no spaces.
    """
    outcome = str(bool(success)).lower()
    reward_csv = ",".join(map("{:.2f}".format, rewards))
    line = f"[END] success={outcome} steps={steps} score={score:.2f} rewards={reward_csv}"
    print(line, flush=True)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ── System prompt ─────────────────────────────────────────────────────
|
| 70 |
+
|
| 71 |
+
SYSTEM_PROMPT = textwrap.dedent("""\
|
| 72 |
+
You are an expert SRE (Site Reliability Engineer) responding to a production incident.
|
| 73 |
+
You are given the current state of a microservice architecture and must:
|
| 74 |
+
1. Investigate by reading logs, checking metrics, tracing requests, and examining dependencies
|
| 75 |
+
2. Identify the root cause(s)
|
| 76 |
+
3. Apply the correct fix(es)
|
| 77 |
+
4. Submit a diagnosis
|
| 78 |
+
|
| 79 |
+
Available actions (respond with a single JSON object):
|
| 80 |
+
|
| 81 |
+
Investigation actions (require "service" field):
|
| 82 |
+
- read_logs: Read recent logs from a service
|
| 83 |
+
- check_metrics: Get time-series metrics (CPU, memory, latency, error rate)
|
| 84 |
+
- ping_service: Check if service is reachable
|
| 85 |
+
- check_dependencies: See upstream/downstream dependencies and their health
|
| 86 |
+
- inspect_deploy: See deploy history (versions, timestamps)
|
| 87 |
+
- query_traces: See distributed trace spans
|
| 88 |
+
- check_runbook: Get operational runbook for the service
|
| 89 |
+
- diff_config: Compare current vs previous config
|
| 90 |
+
|
| 91 |
+
Remediation actions (require "service" field):
|
| 92 |
+
- restart_service: Restart all pods for a service
|
| 93 |
+
- rollback_deploy: Rollback to a specific version (requires "target_version")
|
| 94 |
+
- scale_up: Increase replica count (requires "replicas")
|
| 95 |
+
- drain_traffic: Stop routing traffic to a service
|
| 96 |
+
|
| 97 |
+
Terminal action:
|
| 98 |
+
- submit_diagnosis: Submit your diagnosis (requires "root_cause_service", "root_cause_category", "fix_description")
|
| 99 |
+
|
| 100 |
+
Root cause categories: oom_crash, db_deadlock, bad_deploy, memory_leak, network_partition, disk_full, config_error, cert_expiry, dns_failure, rate_limit
|
| 101 |
+
|
| 102 |
+
IMPORTANT: Respond with ONLY a JSON object like:
|
| 103 |
+
{"action_type": "read_logs", "service": "auth-service"}
|
| 104 |
+
{"action_type": "rollback_deploy", "service": "payment-service", "target_version": "v3.8.1"}
|
| 105 |
+
{"action_type": "submit_diagnosis", "root_cause_service": "db-postgres", "root_cause_category": "db_deadlock", "fix_description": "Restarted db-postgres to clear deadlock"}
|
| 106 |
+
""")
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# ── Helpers ────────────────────────────────────────────────────────────
|
| 110 |
+
|
| 111 |
+
def format_observation(obs_dict: dict) -> str:
    """Format an observation dict into a readable prompt for the LLM.

    Args:
        obs_dict: Observation data as a plain dict. The keys read here are
            ``incident_summary``, ``step_number``, ``services``,
            ``active_alerts`` and ``action_result``; all are optional.

    Returns:
        The prompt sections joined with newlines. Sections whose data is
        missing or empty are left out, except the step line, which is always
        emitted (defaulting to step 0).
    """
    parts = []

    if obs_dict.get("incident_summary"):
        parts.append(f"INCIDENT SUMMARY: {obs_dict['incident_summary']}")

    parts.append(f"\nSTEP: {obs_dict.get('step_number', 0)}")

    services = obs_dict.get("services", {})
    if services:
        parts.append("\nSERVICE STATUS DASHBOARD:")
        for name, state in services.items():
            status = state.get("status", "UNKNOWN")
            # The status can arrive as an Enum member instead of a plain
            # string. On Python 3.11+ an f-string renders a (str, Enum)
            # member as "ClassName.MEMBER", so unwrap to the raw value to
            # keep the dashboard text stable across Python versions.
            status = getattr(status, "value", status)
            version = state.get("version", "")
            parts.append(f" {name}: {status} (version: {version})")

    alerts = obs_dict.get("active_alerts", [])
    if alerts:
        parts.append("\nACTIVE ALERTS:")
        for alert in alerts:
            parts.append(f" {alert}")

    action_result = obs_dict.get("action_result")
    if action_result:
        parts.append(f"\nRESULT OF LAST ACTION:\n{action_result}")

    return "\n".join(parts)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def get_model_message(client: OpenAI, obs_text: str, history: List[str]) -> str:
    """Call the LLM and return the raw response text.

    Args:
        client: OpenAI-compatible client used for the chat completion call.
        obs_text: Formatted observation for the current step.
        history: Summaries of earlier steps; only the last six are sent.

    Returns:
        The text of the first completion choice, or an empty string when the
        model returned no text content. If the request raises, a fixed
        fallback ``read_logs`` action (as a JSON string) is returned instead
        so the episode can continue.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
    ]
    # Include recent history for context
    for h in history[-6:]:
        messages.append({"role": "user", "content": h})
    messages.append({"role": "user", "content": obs_text})

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=TEMPERATURE,
            max_tokens=512,
        )
        # ``content`` can be None; normalise to "" so the declared ``str``
        # return type always holds for callers.
        return response.choices[0].message.content or ""
    except Exception as exc:
        print(f"[DEBUG] Model request failed: {exc}", flush=True)
        return '{"action_type": "read_logs", "service": "auth-service"}'
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def parse_action(response_text: str) -> Action:
    """Parse an LLM response into an Action object.

    A fenced code block (```json or plain ```) is unwrapped first. The first
    JSON object that decodes successfully is then used, so replies holding
    several JSON objects, or trailing prose that contains braces, still parse.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The Action built from the decoded JSON object.

    Raises:
        json.JSONDecodeError: If no JSON value can be decoded from the text.
        Exception: Whatever ``Action(**data)`` raises for invalid fields.
    """
    text = response_text.strip()

    if "```json" in text:
        text = text.split("```json")[1].split("```")[0].strip()
    elif "```" in text:
        text = text.split("```")[1].split("```")[0].strip()

    # Try every "{" as a possible start of the action object and accept the
    # first one that decodes to a dict. raw_decode stops at the end of the
    # value, so anything after the object is ignored.
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return Action(**candidate)
        start = text.find("{", start + 1)

    # No decodable object found: parse the whole text so the caller gets the
    # usual JSON decoding error.
    data = json.loads(text)
    return Action(**data)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# ── Main ───────────────────────────────────────────────────────────────
|
| 183 |
+
|
| 184 |
+
async def run_task(task_id: str) -> float:
    """Run one inference episode on a single task; return its score in [0, 1].

    The loop runs for at most ``MAX_STEPS`` steps: format the observation,
    ask the model for an action, parse it, and apply it to the environment.
    A reply that cannot be parsed is logged as a zero-reward ``parse_error``
    step and the model is asked again with the same observation. The final
    score is read from ``info["grader_result"]["score"]`` when the
    environment reports ``done``; otherwise it stays 0.0. ``log_end`` is
    always called, even if the episode raises.
    """
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    env = IncidentResponseEnv()

    history: List[str] = []
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

    try:
        obs, session_id = env.reset(task_id=task_id)
        obs_dict = obs.model_dump()

        for step in range(1, MAX_STEPS + 1):
            if obs_dict.get("done", False):
                break

            prompt = format_observation(obs_dict)
            message = get_model_message(client, prompt, history)

            try:
                action = parse_action(message)
            except Exception as exc:
                # Unparseable reply: record the failed step and retry with
                # the unchanged observation on the next iteration.
                log_step(step=step, action="parse_error", reward=0.0, done=False, error=str(exc))
                rewards.append(0.0)
                steps_taken = step
                history.append(f"Step {step}: parse_error -> reward 0.00")
                continue

            obs, reward, done, info = env.step(session_id, action)
            obs_dict = obs.model_dump()

            # Treat a missing/None reward as zero.
            reward = reward or 0.0
            rewards.append(reward)
            steps_taken = step

            action_str = action.action_type.value
            if action.service:
                action_str = f"{action_str}({action.service})"

            log_step(step=step, action=action_str, reward=reward, done=done, error=None)
            history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")

            if done:
                if "grader_result" in info:
                    score = info["grader_result"]["score"]
                break

        # Clamp score to [0, 1]
        score = min(max(score, 0.0), 1.0)
        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    return score
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
async def main() -> None:
    """Run every configured task in order and print a score summary.

    Task ids come from the comma-separated ``SRE_TASKS`` environment
    variable (default ``easy,medium,hard``). Blank entries, such as those
    produced by an empty value or a trailing comma, are skipped so that no
    empty task id is passed to ``run_task``.
    """
    raw_ids = os.getenv("SRE_TASKS", "easy,medium,hard").split(",")
    task_ids = [task_id.strip() for task_id in raw_ids if task_id.strip()]
    scores = {}

    for task_id in task_ids:
        scores[task_id] = await run_task(task_id)

    print(f"\n{'='*60}", flush=True)
    print("FINAL SCORES:", flush=True)
    for task_id, score in scores.items():
        print(f" {task_id}: {score:.2f}", flush=True)
    # Guard the division so an empty task list reports an average of 0.
    avg = sum(scores.values()) / len(scores) if scores else 0
    print(f" AVERAGE: {avg:.2f}", flush=True)
    print(f"{'='*60}", flush=True)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
if __name__ == "__main__":
|
| 268 |
+
asyncio.run(main())
|
models.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Pydantic models for the SRE Incident Response OpenEnv environment.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from enum import Enum
|
| 6 |
+
from typing import Dict, List, Optional
|
| 7 |
+
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ── Enums ──────────────────────────────────────────────────────────────
|
| 12 |
+
|
| 13 |
+
# Health states a service can report. Members are also plain strings
# (str mixin), so they compare equal to their values.
class ServiceStatus(str, Enum):
    HEALTHY = "HEALTHY"
    DEGRADED = "DEGRADED"
    DOWN = "DOWN"
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Every action an agent can submit, grouped by effect on the environment.
class ActionType(str, Enum):
    # --- Investigation (read-only) ---
    READ_LOGS = "read_logs"
    CHECK_METRICS = "check_metrics"
    PING_SERVICE = "ping_service"
    CHECK_DEPENDENCIES = "check_dependencies"
    INSPECT_DEPLOY = "inspect_deploy"
    QUERY_TRACES = "query_traces"
    CHECK_RUNBOOK = "check_runbook"
    DIFF_CONFIG = "diff_config"

    # --- Remediation (modifies state) ---
    RESTART_SERVICE = "restart_service"
    ROLLBACK_DEPLOY = "rollback_deploy"
    SCALE_UP = "scale_up"
    DRAIN_TRAFFIC = "drain_traffic"

    # --- Terminal ---
    SUBMIT_DIAGNOSIS = "submit_diagnosis"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Closed set of root-cause labels accepted in a submitted diagnosis.
class RootCauseCategory(str, Enum):
    OOM_CRASH = "oom_crash"
    DB_DEADLOCK = "db_deadlock"
    BAD_DEPLOY = "bad_deploy"
    MEMORY_LEAK = "memory_leak"
    NETWORK_PARTITION = "network_partition"
    DISK_FULL = "disk_full"
    CONFIG_ERROR = "config_error"
    CERT_EXPIRY = "cert_expiry"
    DNS_FAILURE = "dns_failure"
    RATE_LIMIT = "rate_limit"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# Action types listed under "Investigation (read-only)" in ActionType.
INVESTIGATION_ACTIONS = {
    ActionType.READ_LOGS, ActionType.CHECK_METRICS,
    ActionType.PING_SERVICE, ActionType.CHECK_DEPENDENCIES,
    ActionType.INSPECT_DEPLOY, ActionType.QUERY_TRACES,
    ActionType.CHECK_RUNBOOK, ActionType.DIFF_CONFIG,
}
|
| 63 |
+
|
| 64 |
+
# Action types listed under "Remediation (modifies state)" in ActionType.
REMEDIATION_ACTIONS = {
    ActionType.RESTART_SERVICE, ActionType.ROLLBACK_DEPLOY,
    ActionType.SCALE_UP, ActionType.DRAIN_TRAFFIC,
}
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ── Action ─────────────────────────────────────────────────────────────
|
| 73 |
+
|
| 74 |
+
# One agent action. Only ``action_type`` is mandatory; the other fields are
# optional and validated individually, not against the action type.
class Action(BaseModel):
    action_type: ActionType
    # Service the action targets.
    service: Optional[str] = None
    # Version to roll back to.
    target_version: Optional[str] = None
    # Desired replica count; validated to lie in 1..10.
    replicas: Optional[int] = Field(default=None, ge=1, le=10)
    # Diagnosis payload fields.
    root_cause_service: Optional[str] = None
    root_cause_category: Optional[RootCauseCategory] = None
    fix_description: Optional[str] = None
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ── Observation ────────────────────────────────────────────────────────
|
| 85 |
+
|
| 86 |
+
# Snapshot of a single service: status, version and replica count.
class ServiceState(BaseModel):
    status: ServiceStatus
    version: str = ""  # empty string when no version is given
    replicas: int = 1
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# What the agent sees after a reset or a step. Every field has a default,
# so an empty Observation() is valid.
class Observation(BaseModel):
    step_number: int = 0
    timestamp: str = ""
    # Per-service state keyed by service name.
    services: Dict[str, ServiceState] = Field(default_factory=dict)
    active_alerts: List[str] = Field(default_factory=list)
    incident_summary: str = ""
    # Text result of the most recent action, if there was one.
    action_result: Optional[str] = None
    reward: float = 0.0
    done: bool = False
    score: Optional[float] = None
    info: Dict = Field(default_factory=dict)
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
# ── State ──────────────────────────────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
# Bookkeeping for one episode/session; all fields default to empty/zero.
class State(BaseModel):
    session_id: str = ""
    task_id: str = ""
    step_count: int = 0
    max_steps: int = 0
    done: bool = False
    # Running records of what has been done so far in the session.
    actions_taken: List[str] = Field(default_factory=list)
    services_investigated: List[str] = Field(default_factory=list)
    remediations_applied: List[str] = Field(default_factory=list)
    cumulative_reward: float = 0.0
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ── Reward ─────────────────────────────────────────────────────────────
|
| 120 |
+
|
| 121 |
+
# Reward record. ``value`` is validated to lie in [0, 1]; ``step_reward``
# carries no range constraint.
class Reward(BaseModel):
    value: float = Field(default=0.0, ge=0.0, le=1.0)
    step_reward: float = 0.0
    # Named components of the reward.
    breakdown: Dict[str, float] = Field(default_factory=dict)
    is_terminal: bool = False
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
# ── Grader Result ──────────────────────────────────────────────────────
|
| 129 |
+
|
| 130 |
+
# Final grading outcome. ``score`` is required and validated to [0, 1].
class GraderResult(BaseModel):
    score: float = Field(..., ge=0.0, le=1.0)
    solved: bool = False
    # Named components of the score.
    breakdown: Dict[str, float] = Field(default_factory=dict)
    notes: List[str] = Field(default_factory=list)
|
openenv.yaml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: sre-incident-response
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: "SRE Incident Response environment — train AI agents to diagnose and fix production incidents"
|
| 4 |
+
|
| 5 |
+
tasks:
|
| 6 |
+
- id: easy
|
| 7 |
+
name: Single Service OOM Crash
|
| 8 |
+
difficulty: easy
|
| 9 |
+
max_steps: 15
|
| 10 |
+
- id: medium
|
| 11 |
+
name: Cascading Database Deadlock
|
| 12 |
+
difficulty: medium
|
| 13 |
+
max_steps: 25
|
| 14 |
+
- id: hard
|
| 15 |
+
name: Concurrent Faults with Misleading Evidence
|
| 16 |
+
difficulty: hard
|
| 17 |
+
max_steps: 35
|
| 18 |
+
|
| 19 |
+
models:
|
| 20 |
+
action: models.Action
|
| 21 |
+
observation: models.Observation
|
| 22 |
+
reward: models.Reward
|
| 23 |
+
state: models.State
|
| 24 |
+
|
| 25 |
+
runtime:
|
| 26 |
+
port: 8000
|
| 27 |
+
entrypoint: server.app:app
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.104.0
|
| 2 |
+
uvicorn>=0.24.0
|
| 3 |
+
pydantic>=2.0.0
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
pyyaml>=6.0
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI server for the SRE Incident Response OpenEnv environment.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
# Add project root to path
|
| 9 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 10 |
+
|
| 11 |
+
from fastapi import FastAPI, HTTPException
|
| 12 |
+
from pydantic import BaseModel
|
| 13 |
+
from typing import Dict, List, Optional
|
| 14 |
+
|
| 15 |
+
from models import Action, Observation, State
|
| 16 |
+
from env.environment import IncidentResponseEnv
|
| 17 |
+
from tasks import SCENARIOS
|
| 18 |
+
|
| 19 |
+
app = FastAPI(
|
| 20 |
+
title="SRE Incident Response Environment",
|
| 21 |
+
description="An OpenEnv environment for training AI agents on production incident response.",
|
| 22 |
+
version="1.0.0",
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
env = IncidentResponseEnv()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# ── Request/Response models ────────────────────────────────────────────
|
| 29 |
+
|
| 30 |
+
# Body of POST /reset: which task to start and the seed to pass along.
class ResetRequest(BaseModel):
    task_id: str = "easy"
    seed: int = 0
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# Reply to POST /reset: the first observation plus the new session id.
class ResetResponse(BaseModel):
    observation: Observation
    session_id: str
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Body of POST /step: the session to advance and the action to apply.
class StepRequest(BaseModel):
    session_id: str
    action: Action
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Reply to POST /step: the (observation, reward, done, info) tuple as JSON.
class StepResponse(BaseModel):
    observation: Observation
    reward: float
    done: bool
    info: Dict
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# One entry of the GET /tasks listing.
class TaskInfo(BaseModel):
    task_id: str
    name: str
    difficulty: str
    max_steps: int
    description: str
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
# ── Endpoints ──────────────────────────────────────────────────────────
|
| 61 |
+
|
| 62 |
+
@app.get("/")
def root():
    # Static service descriptor: name, version and the main endpoint paths.
    descriptor = {
        "name": "SRE Incident Response Environment",
        "version": "1.0.0",
        "endpoints": ["/reset", "/step", "/state/{session_id}", "/tasks"],
    }
    return descriptor
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
@app.post("/reset", response_model=ResetResponse)
def reset(request: ResetRequest):
    # Start a session via env.reset; any ValueError raised while resetting
    # or while building the response is reported as HTTP 400.
    try:
        first_obs, new_session_id = env.reset(task_id=request.task_id, seed=request.seed)
        return ResetResponse(observation=first_obs, session_id=new_session_id)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=str(err))
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@app.post("/step", response_model=StepResponse)
def step(request: StepRequest):
    # Apply one action via env.step; any ValueError raised while stepping or
    # while building the response is reported as HTTP 400.
    try:
        obs, reward, done, info = env.step(request.session_id, request.action)
        # Shallow-copy the info mapping into a fresh dict for the response.
        clean_info = {key: value for key, value in info.items()}
        return StepResponse(observation=obs, reward=reward, done=done, info=clean_info)
    except ValueError as err:
        raise HTTPException(status_code=400, detail=str(err))
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
@app.get("/state/{session_id}", response_model=State)
def state(session_id: str):
    # Return env.state for the session; a ValueError becomes HTTP 404.
    try:
        session_state = env.state(session_id)
    except ValueError as err:
        raise HTTPException(status_code=404, detail=str(err))
    return session_state
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
@app.get("/tasks", response_model=List[TaskInfo])
def tasks():
    # List every registered scenario, in SCENARIOS iteration order. The
    # scenario's incident summary doubles as the task description.
    return [
        TaskInfo(
            task_id=tid,
            name=scenario.name,
            difficulty=scenario.difficulty,
            max_steps=scenario.max_steps,
            description=scenario.incident_summary,
        )
        for tid, scenario in SCENARIOS.items()
    ]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# ── OpenEnv-prefixed aliases ───────────────────────────────────────────
|
| 116 |
+
|
| 117 |
+
@app.post("/openenv/reset", response_model=ResetResponse)
def openenv_reset(request: ResetRequest):
    # Alias of POST /reset under the /openenv prefix.
    response = reset(request)
    return response
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
@app.post("/openenv/step", response_model=StepResponse)
def openenv_step(request: StepRequest):
    # Alias of POST /step under the /openenv prefix.
    response = step(request)
    return response
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
@app.get("/openenv/state/{session_id}", response_model=State)
def openenv_state(session_id: str):
    # Alias of GET /state/{session_id} under the /openenv prefix.
    session_state = state(session_id)
    return session_state
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@app.get("/openenv/tasks", response_model=List[TaskInfo])
def openenv_tasks():
    # Alias of GET /tasks under the /openenv prefix.
    listing = tasks()
    return listing
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
# ── Main ───────────────────────────────────────────────────────────────
|
| 138 |
+
|
| 139 |
+
def main():
    """Serve the app with uvicorn on 0.0.0.0, port from $PORT (default 8000)."""
    # Imported here so uvicorn is only loaded when main() actually runs.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", "8000")))
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
if __name__ == "__main__":
|
| 146 |
+
main()
|
tasks/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task auto-discovery and registry.
|
| 3 |
+
|
| 4 |
+
Any .py file in this directory that defines a module-level SCENARIO variable
|
| 5 |
+
(an IncidentScenario instance) will be automatically loaded and registered.
|
| 6 |
+
|
| 7 |
+
To add a new task:
|
| 8 |
+
1. Create a new .py file in this directory (e.g., tasks/my_new_task.py)
|
| 9 |
+
2. Define SCENARIO = IncidentScenario(task_id="my_new_task", ...)
|
| 10 |
+
3. That's it — the task will be available via the API automatically.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import importlib
|
| 14 |
+
import pkgutil
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Dict
|
| 17 |
+
|
| 18 |
+
from env.scenario import IncidentScenario
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _discover_scenarios() -> Dict[str, IncidentScenario]:
    """Import each public module in tasks/ and collect its SCENARIO.

    Modules whose name starts with an underscore are skipped. A module is
    registered only when its module-level ``SCENARIO`` attribute is an
    ``IncidentScenario``; it is keyed by ``task_id``, so a later module with
    the same id replaces an earlier one. A module that fails to load is
    reported with a printed warning and does not stop discovery.
    """
    found: Dict[str, IncidentScenario] = {}
    tasks_dir = str(Path(__file__).parent)

    for _finder, module_name, _is_pkg in pkgutil.iter_modules([tasks_dir]):
        if module_name.startswith("_"):
            continue
        try:
            loaded = importlib.import_module(f"tasks.{module_name}")
            candidate = getattr(loaded, "SCENARIO", None)
            if isinstance(candidate, IncidentScenario):
                found[candidate.task_id] = candidate
        except Exception as e:
            # Best effort: one broken task file must not hide the others.
            print(f"Warning: failed to load task module 'tasks.{module_name}': {e}")

    return found
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
SCENARIOS: Dict[str, IncidentScenario] = _discover_scenarios()
|
tasks/easy_oom.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task: Single Service OOM Crash
|
| 3 |
+
To add a new task, copy this file, modify the SCENARIO definition, and place it in tasks/.
|
| 4 |
+
The task loader will auto-discover it.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from env.scenario import IncidentScenario, RequiredFix, ServiceConfig
|
| 8 |
+
from models import RootCauseCategory, ServiceStatus
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
SCENARIO = IncidentScenario(
|
| 12 |
+
task_id="easy",
|
| 13 |
+
name="Single Service OOM Crash",
|
| 14 |
+
difficulty="easy",
|
| 15 |
+
max_steps=15,
|
| 16 |
+
incident_summary=(
|
| 17 |
+
"PagerDuty alert fired at 02:15 UTC. auth-service is down with elevated error rates "
|
| 18 |
+
"and pod restarts. api-gateway reporting 503s on login endpoints. Other services appear "
|
| 19 |
+
"unaffected. On-call engineer needed to investigate and restore service."
|
| 20 |
+
),
|
| 21 |
+
|
| 22 |
+
services={
|
| 23 |
+
"api-gateway": ServiceConfig(
|
| 24 |
+
status=ServiceStatus.DEGRADED, deps=["auth-service", "user-service", "payment-service"],
|
| 25 |
+
version="v1.12.0", replicas=3,
|
| 26 |
+
),
|
| 27 |
+
"auth-service": ServiceConfig(
|
| 28 |
+
status=ServiceStatus.DOWN, deps=["cache-redis"],
|
| 29 |
+
version="v2.14.0", replicas=2, is_root_cause=True, fault_type="oom_crash",
|
| 30 |
+
),
|
| 31 |
+
"user-service": ServiceConfig(
|
| 32 |
+
status=ServiceStatus.HEALTHY, deps=["db-postgres"],
|
| 33 |
+
version="v4.2.1", replicas=2,
|
| 34 |
+
),
|
| 35 |
+
"payment-service": ServiceConfig(
|
| 36 |
+
status=ServiceStatus.HEALTHY, deps=["db-postgres"],
|
| 37 |
+
version="v3.8.1", replicas=2,
|
| 38 |
+
),
|
| 39 |
+
"db-postgres": ServiceConfig(
|
| 40 |
+
status=ServiceStatus.HEALTHY, deps=[],
|
| 41 |
+
version="v15.4", replicas=1,
|
| 42 |
+
),
|
| 43 |
+
"cache-redis": ServiceConfig(
|
| 44 |
+
status=ServiceStatus.HEALTHY, deps=[],
|
| 45 |
+
version="v7.2.4", replicas=1,
|
| 46 |
+
),
|
| 47 |
+
"notification-service": ServiceConfig(
|
| 48 |
+
status=ServiceStatus.HEALTHY, deps=["auth-service"],
|
| 49 |
+
version="v1.5.0", replicas=1,
|
| 50 |
+
),
|
| 51 |
+
},
|
| 52 |
+
|
| 53 |
+
initial_alerts=[
|
| 54 |
+
"[ALERT SEV-2] auth-service: error rate >50%, pod restarts detected (3 restarts in 5m)",
|
| 55 |
+
"[ALERT SEV-3] api-gateway: elevated 503 responses on /api/v2/login and /api/v2/verify",
|
| 56 |
+
],
|
| 57 |
+
|
| 58 |
+
logs={
|
| 59 |
+
"auth-service": [
|
| 60 |
+
"2026-04-06T02:10:01Z INFO [auth-service] Request processed: POST /auth/token uid=user_8832 latency=45ms",
|
| 61 |
+
"2026-04-06T02:10:02Z INFO [auth-service] Request processed: POST /auth/token uid=user_1204 latency=52ms",
|
| 62 |
+
"2026-04-06T02:10:03Z INFO [auth-service] Cache hit for session sid=a8f32c, returning cached token",
|
| 63 |
+
"2026-04-06T02:10:04Z INFO [auth-service] Request processed: POST /auth/verify uid=user_6650 latency=41ms",
|
| 64 |
+
"2026-04-06T02:10:05Z INFO [auth-service] Request processed: POST /auth/verify uid=user_3310 latency=38ms",
|
| 65 |
+
"2026-04-06T02:10:06Z INFO [auth-service] Cache hit for session sid=b2e19f, returning cached token",
|
| 66 |
+
"2026-04-06T02:10:07Z INFO [auth-service] Request processed: POST /auth/token uid=user_7712 latency=48ms",
|
| 67 |
+
"2026-04-06T02:10:08Z DEBUG [auth-service] GC pause: 120ms (heap=1.8GB/2.0GB)",
|
| 68 |
+
"2026-04-06T02:10:09Z INFO [auth-service] Request processed: POST /auth/token uid=user_2290 latency=155ms",
|
| 69 |
+
"2026-04-06T02:10:10Z INFO [auth-service] Request processed: POST /auth/token uid=user_5571 latency=310ms",
|
| 70 |
+
"2026-04-06T02:10:11Z WARN [auth-service] Heap usage at 91% (1.82GB/2.0GB), approaching limit",
|
| 71 |
+
"2026-04-06T02:10:12Z INFO [auth-service] Request processed: POST /auth/token uid=user_9912 latency=580ms",
|
| 72 |
+
"2026-04-06T02:10:13Z INFO [auth-service] Request processed: POST /auth/verify uid=user_4105 latency=620ms",
|
| 73 |
+
"2026-04-06T02:10:14Z WARN [auth-service] GC overhead limit exceeded, full GC triggered (heap=1.95GB/2.0GB)",
|
| 74 |
+
"2026-04-06T02:10:15Z INFO [auth-service] Full GC completed in 2100ms, freed 50MB",
|
| 75 |
+
"2026-04-06T02:10:16Z INFO [auth-service] Request processed: POST /auth/token uid=user_4423 latency=2400ms",
|
| 76 |
+
"2026-04-06T02:10:17Z INFO [auth-service] Request processed: POST /auth/verify uid=user_8001 latency=1900ms",
|
| 77 |
+
"2026-04-06T02:10:18Z ERROR [auth-service] OutOfMemoryError: unable to allocate 64MB for token cache expansion",
|
| 78 |
+
"2026-04-06T02:10:18Z ERROR [auth-service] Worker pid=1842 killed by OOM killer (resident=2.01GB, limit=2.0GB)",
|
| 79 |
+
"2026-04-06T02:10:19Z WARN [auth-service] Process supervisor restarting worker (attempt 1/3)",
|
| 80 |
+
"2026-04-06T02:10:22Z INFO [auth-service] Worker pid=1901 started, initializing token cache...",
|
| 81 |
+
"2026-04-06T02:10:25Z INFO [auth-service] Request processed: POST /auth/token uid=user_7781 latency=65ms",
|
| 82 |
+
"2026-04-06T02:10:28Z INFO [auth-service] Request processed: POST /auth/token uid=user_2209 latency=72ms",
|
| 83 |
+
"2026-04-06T02:10:30Z INFO [auth-service] Cache hit for session sid=f8a21c, returning cached token",
|
| 84 |
+
"2026-04-06T02:10:33Z INFO [auth-service] Request processed: POST /auth/verify uid=user_3390 latency=55ms",
|
| 85 |
+
"2026-04-06T02:10:35Z INFO [auth-service] Request processed: POST /auth/token uid=user_1150 latency=68ms",
|
| 86 |
+
"2026-04-06T02:10:40Z INFO [auth-service] Request processed: POST /auth/token uid=user_4482 latency=75ms",
|
| 87 |
+
"2026-04-06T02:10:45Z DEBUG [auth-service] GC pause: 85ms (heap=1.5GB/2.0GB)",
|
| 88 |
+
"2026-04-06T02:11:00Z INFO [auth-service] Request processed: POST /auth/token uid=user_6633 latency=90ms",
|
| 89 |
+
"2026-04-06T02:11:30Z INFO [auth-service] Request processed: POST /auth/verify uid=user_9901 latency=110ms",
|
| 90 |
+
"2026-04-06T02:12:00Z WARN [auth-service] Heap usage at 82% (1.64GB/2.0GB) — growing again after restart",
|
| 91 |
+
"2026-04-06T02:12:30Z INFO [auth-service] Request processed: POST /auth/token uid=user_5510 latency=180ms",
|
| 92 |
+
"2026-04-06T02:12:45Z WARN [auth-service] Heap usage at 88% (1.76GB/2.0GB) — growing linearly",
|
| 93 |
+
"2026-04-06T02:13:00Z DEBUG [auth-service] GC pause: 350ms (heap=1.85GB/2.0GB)",
|
| 94 |
+
"2026-04-06T02:13:05Z INFO [auth-service] Request processed: POST /auth/token uid=user_8820 latency=890ms",
|
| 95 |
+
"2026-04-06T02:13:10Z ERROR [auth-service] OutOfMemoryError: unable to allocate 32MB for request buffer",
|
| 96 |
+
"2026-04-06T02:13:10Z ERROR [auth-service] Worker pid=1901 killed by OOM killer (resident=1.98GB, limit=2.0GB)",
|
| 97 |
+
"2026-04-06T02:13:11Z WARN [auth-service] Process supervisor restarting worker (attempt 2/3)",
|
| 98 |
+
"2026-04-06T02:13:14Z INFO [auth-service] Worker pid=1955 started, initializing token cache...",
|
| 99 |
+
"2026-04-06T02:13:20Z INFO [auth-service] Request processed: POST /auth/token uid=user_1122 latency=58ms",
|
| 100 |
+
"2026-04-06T02:13:45Z WARN [auth-service] Heap usage at 80% (1.60GB/2.0GB)",
|
| 101 |
+
"2026-04-06T02:14:15Z WARN [auth-service] Heap usage at 87% (1.74GB/2.0GB)",
|
| 102 |
+
"2026-04-06T02:14:45Z WARN [auth-service] GC overhead limit exceeded, full GC triggered",
|
| 103 |
+
"2026-04-06T02:15:00Z INFO [auth-service] Full GC completed in 2800ms, freed 30MB — diminishing returns",
|
| 104 |
+
"2026-04-06T02:15:20Z ERROR [auth-service] OutOfMemoryError: Java heap space",
|
| 105 |
+
"2026-04-06T02:15:33Z ERROR [auth-service] Worker pid=1955 killed by OOM killer (resident=2.03GB, limit=2.0GB)",
|
| 106 |
+
"2026-04-06T02:15:34Z ERROR [auth-service] Process supervisor: all 3 restart attempts exhausted",
|
| 107 |
+
"2026-04-06T02:15:34Z FATAL [auth-service] Service entering crash loop backoff — no healthy workers remaining",
|
| 108 |
+
"2026-04-06T02:15:35Z ERROR [auth-service] Health check failed: connection refused on :8080/healthz",
|
| 109 |
+
],
|
| 110 |
+
"api-gateway": [
|
| 111 |
+
"2026-04-06T02:10:01Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 48ms)",
|
| 112 |
+
"2026-04-06T02:10:02Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 32ms)",
|
| 113 |
+
"2026-04-06T02:10:03Z INFO [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 95ms)",
|
| 114 |
+
"2026-04-06T02:10:05Z INFO [api-gateway] Route: GET /api/v2/user/settings -> user-service (200, 28ms)",
|
| 115 |
+
"2026-04-06T02:10:08Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 155ms)",
|
| 116 |
+
"2026-04-06T02:10:10Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 320ms)",
|
| 117 |
+
"2026-04-06T02:10:15Z WARN [api-gateway] Route: POST /api/v2/login -> auth-service (200, 2500ms) — slow",
|
| 118 |
+
"2026-04-06T02:10:18Z ERROR [api-gateway] Route: POST /api/v2/login -> auth-service (503, timeout after 5000ms)",
|
| 119 |
+
"2026-04-06T02:10:20Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 30ms)",
|
| 120 |
+
"2026-04-06T02:10:22Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 68ms)",
|
| 121 |
+
"2026-04-06T02:10:25Z INFO [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 88ms)",
|
| 122 |
+
"2026-04-06T02:13:10Z ERROR [api-gateway] Route: POST /api/v2/login -> auth-service (503, timeout after 5000ms)",
|
| 123 |
+
"2026-04-06T02:13:12Z WARN [api-gateway] Retrying auth-service request (attempt 2/3)",
|
| 124 |
+
"2026-04-06T02:13:17Z ERROR [api-gateway] Route: POST /api/v2/login -> auth-service (503, timeout after 5000ms)",
|
| 125 |
+
"2026-04-06T02:15:35Z ERROR [api-gateway] Route: POST /api/v2/login -> auth-service (503, connection refused)",
|
| 126 |
+
"2026-04-06T02:15:36Z WARN [api-gateway] Circuit breaker OPEN for auth-service (failures=10, threshold=5)",
|
| 127 |
+
"2026-04-06T02:15:37Z ERROR [api-gateway] Route: POST /api/v2/login -> auth-service (503, circuit breaker open)",
|
| 128 |
+
"2026-04-06T02:15:37Z ERROR [api-gateway] Route: POST /api/v2/verify -> auth-service (503, circuit breaker open)",
|
| 129 |
+
"2026-04-06T02:15:38Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 28ms)",
|
| 130 |
+
"2026-04-06T02:15:40Z INFO [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 95ms)",
|
| 131 |
+
"2026-04-06T02:15:42Z INFO [api-gateway] Route: GET /api/v2/user/settings -> user-service (200, 25ms)",
|
| 132 |
+
],
|
| 133 |
+
"user-service": [
|
| 134 |
+
"2026-04-06T02:10:01Z INFO [user-service] GET /users/profile uid=user_4421 -> 200 (32ms)",
|
| 135 |
+
"2026-04-06T02:10:05Z INFO [user-service] GET /users/settings uid=user_8832 -> 200 (28ms)",
|
| 136 |
+
"2026-04-06T02:10:10Z INFO [user-service] PUT /users/profile uid=user_3310 -> 200 (85ms)",
|
| 137 |
+
"2026-04-06T02:10:15Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (30ms)",
|
| 138 |
+
"2026-04-06T02:10:20Z INFO [user-service] GET /users/profile uid=user_5571 -> 200 (27ms)",
|
| 139 |
+
"2026-04-06T02:15:00Z INFO [user-service] GET /users/profile uid=user_7712 -> 200 (31ms)",
|
| 140 |
+
"2026-04-06T02:15:30Z INFO [user-service] PUT /users/settings uid=user_2209 -> 200 (78ms)",
|
| 141 |
+
"2026-04-06T02:15:35Z INFO [user-service] GET /users/profile uid=user_9901 -> 200 (26ms)",
|
| 142 |
+
],
|
| 143 |
+
"payment-service": [
|
| 144 |
+
"2026-04-06T02:10:01Z INFO [payment-service] Processing payment txn=pay_8832 amount=$45.00 -> db-postgres",
|
| 145 |
+
"2026-04-06T02:10:02Z INFO [payment-service] Payment completed txn=pay_8832 latency=85ms",
|
| 146 |
+
"2026-04-06T02:10:10Z INFO [payment-service] Processing payment txn=pay_1120 amount=$12.99 -> db-postgres",
|
| 147 |
+
"2026-04-06T02:10:10Z INFO [payment-service] Payment completed txn=pay_1120 latency=92ms",
|
| 148 |
+
"2026-04-06T02:15:00Z INFO [payment-service] Processing payment txn=pay_4455 amount=$78.50 -> db-postgres",
|
| 149 |
+
"2026-04-06T02:15:01Z INFO [payment-service] Payment completed txn=pay_4455 latency=88ms",
|
| 150 |
+
"2026-04-06T02:15:30Z INFO [payment-service] Health check /healthz -> 200 OK",
|
| 151 |
+
],
|
| 152 |
+
"db-postgres": [
|
| 153 |
+
"2026-04-06T02:00:00Z INFO [db-postgres] Checkpoint starting: time-based",
|
| 154 |
+
"2026-04-06T02:00:02Z INFO [db-postgres] Checkpoint complete: wrote 842 buffers (5.7%)",
|
| 155 |
+
"2026-04-06T02:10:00Z INFO [db-postgres] Active connections: 35/100",
|
| 156 |
+
"2026-04-06T02:10:01Z INFO [db-postgres] Autovacuum: processing table users (dead tuples: 120)",
|
| 157 |
+
"2026-04-06T02:15:00Z INFO [db-postgres] Active connections: 34/100",
|
| 158 |
+
"2026-04-06T02:15:01Z INFO [db-postgres] Checkpoint starting: time-based",
|
| 159 |
+
"2026-04-06T02:15:03Z INFO [db-postgres] Checkpoint complete: wrote 910 buffers (6.2%)",
|
| 160 |
+
],
|
| 161 |
+
"cache-redis": [
|
| 162 |
+
"2026-04-06T02:10:00Z INFO [cache-redis] Memory usage: 1.2GB/4.0GB (30%)",
|
| 163 |
+
"2026-04-06T02:10:01Z INFO [cache-redis] Cache hit ratio: 92% (within normal range 85-95%)",
|
| 164 |
+
"2026-04-06T02:10:05Z INFO [cache-redis] Connected clients: 45",
|
| 165 |
+
"2026-04-06T02:15:00Z INFO [cache-redis] Memory usage: 1.2GB/4.0GB (30%)",
|
| 166 |
+
"2026-04-06T02:15:01Z INFO [cache-redis] Cache hit ratio: 91%",
|
| 167 |
+
"2026-04-06T02:15:05Z INFO [cache-redis] Key evictions: 0 in last 5m",
|
| 168 |
+
],
|
| 169 |
+
"notification-service": [
|
| 170 |
+
"2026-04-06T02:10:00Z INFO [notification-service] Email batch #4420 sent successfully (12 emails)",
|
| 171 |
+
"2026-04-06T02:10:05Z INFO [notification-service] Auth token validated for batch #4421 (45ms)",
|
| 172 |
+
"2026-04-06T02:15:00Z INFO [notification-service] Email batch #4425 sent successfully (8 emails)",
|
| 173 |
+
"2026-04-06T02:15:30Z INFO [notification-service] Health check /healthz -> 200 OK",
|
| 174 |
+
],
|
| 175 |
+
},
|
| 176 |
+
|
| 177 |
+
metrics={
|
| 178 |
+
"auth-service": [
|
| 179 |
+
{"timestamp": "2026-04-06T02:00:00Z", "cpu_pct": 25, "mem_pct": 60, "heap_gb": 1.2, "latency_p50": 45, "latency_p99": 120, "error_rate": 0.001, "restarts": 0, "connections": 150},
|
| 180 |
+
{"timestamp": "2026-04-06T02:05:00Z", "cpu_pct": 30, "mem_pct": 72, "heap_gb": 1.44, "latency_p50": 52, "latency_p99": 180, "error_rate": 0.002, "restarts": 0, "connections": 155},
|
| 181 |
+
{"timestamp": "2026-04-06T02:10:00Z", "cpu_pct": 45, "mem_pct": 91, "heap_gb": 1.82, "latency_p50": 310, "latency_p99": 2400, "error_rate": 0.15, "restarts": 1, "connections": 148},
|
| 182 |
+
{"timestamp": "2026-04-06T02:11:00Z", "cpu_pct": 35, "mem_pct": 65, "heap_gb": 1.30, "latency_p50": 65, "latency_p99": 200, "error_rate": 0.02, "restarts": 1, "connections": 140},
|
| 183 |
+
{"timestamp": "2026-04-06T02:13:00Z", "cpu_pct": 48, "mem_pct": 94, "heap_gb": 1.88, "latency_p50": 450, "latency_p99": 3100, "error_rate": 0.20, "restarts": 2, "connections": 130},
|
| 184 |
+
{"timestamp": "2026-04-06T02:15:00Z", "cpu_pct": 0, "mem_pct": 0, "heap_gb": 0, "latency_p50": 0, "latency_p99": 0, "error_rate": 1.0, "restarts": 3, "connections": 0},
|
| 185 |
+
],
|
| 186 |
+
"api-gateway": [
|
| 187 |
+
{"timestamp": "2026-04-06T02:00:00Z", "cpu_pct": 20, "mem_pct": 45, "latency_p50": 32, "latency_p99": 85, "error_rate": 0.001, "5xx_rate": 0.001},
|
| 188 |
+
{"timestamp": "2026-04-06T02:10:00Z", "cpu_pct": 22, "mem_pct": 46, "latency_p50": 35, "latency_p99": 95, "error_rate": 0.005, "5xx_rate": 0.003},
|
| 189 |
+
{"timestamp": "2026-04-06T02:15:00Z", "cpu_pct": 25, "mem_pct": 48, "latency_p50": 40, "latency_p99": 5200, "error_rate": 0.42, "5xx_rate": 0.40},
|
| 190 |
+
],
|
| 191 |
+
"user-service": [
|
| 192 |
+
{"timestamp": "2026-04-06T02:00:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": 75, "error_rate": 0.001},
|
| 193 |
+
{"timestamp": "2026-04-06T02:15:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 29, "latency_p99": 78, "error_rate": 0.001},
|
| 194 |
+
],
|
| 195 |
+
"payment-service": [
|
| 196 |
+
{"timestamp": "2026-04-06T02:00:00Z", "cpu_pct": 18, "mem_pct": 40, "latency_p50": 85, "latency_p99": 150, "error_rate": 0.001},
|
| 197 |
+
{"timestamp": "2026-04-06T02:15:00Z", "cpu_pct": 18, "mem_pct": 40, "latency_p50": 88, "latency_p99": 155, "error_rate": 0.001},
|
| 198 |
+
],
|
| 199 |
+
"db-postgres": [
|
| 200 |
+
{"timestamp": "2026-04-06T02:00:00Z", "cpu_pct": 30, "mem_pct": 55, "connections": 35, "active_locks": 2, "deadlocks": 0, "write_iops": 1200, "read_iops": 3500},
|
| 201 |
+
{"timestamp": "2026-04-06T02:15:00Z", "cpu_pct": 32, "mem_pct": 55, "connections": 34, "active_locks": 2, "deadlocks": 0, "write_iops": 1150, "read_iops": 3400},
|
| 202 |
+
],
|
| 203 |
+
"cache-redis": [
|
| 204 |
+
{"timestamp": "2026-04-06T02:00:00Z", "mem_gb": 1.2, "mem_pct": 30, "hit_ratio": 0.92, "evictions_per_s": 0, "connections": 45},
|
| 205 |
+
{"timestamp": "2026-04-06T02:15:00Z", "mem_gb": 1.2, "mem_pct": 30, "hit_ratio": 0.91, "evictions_per_s": 0, "connections": 45},
|
| 206 |
+
],
|
| 207 |
+
},
|
| 208 |
+
|
| 209 |
+
traces={
|
| 210 |
+
"auth-service": [
|
| 211 |
+
"No recent traces available — service is down. Last successful trace:",
|
| 212 |
+
"Trace: POST /auth/token (uid=user_4423, total=2400ms) — BEFORE CRASH",
|
| 213 |
+
" ├─ auth-service.checkSessionCache() 5ms (cache-redis HIT)",
|
| 214 |
+
" ├─ auth-service.generateToken() 45ms",
|
| 215 |
+
" ├─ auth-service.GC_FULL_PAUSE 2100ms — GC dominated total time",
|
| 216 |
+
" └─ auth-service.writeResponse() 250ms",
|
| 217 |
+
],
|
| 218 |
+
"api-gateway": [
|
| 219 |
+
"Trace: POST /api/v2/login (total=5005ms) — TIMEOUT",
|
| 220 |
+
" ├─ api-gateway.parseRequest() 2ms",
|
| 221 |
+
" ├─ api-gateway.routeToAuthService() 5000ms (TIMEOUT — auth-service unreachable)",
|
| 222 |
+
" └─ api-gateway.returnError() 3ms (503 Service Unavailable)",
|
| 223 |
+
],
|
| 224 |
+
},
|
| 225 |
+
|
| 226 |
+
deploy_history={
|
| 227 |
+
"auth-service": [
|
| 228 |
+
"v2.14.0 deployed 2026-04-01T10:00:00Z status=stable (running 5 days, no issues)",
|
| 229 |
+
"v2.13.2 deployed 2026-03-25T14:00:00Z status=superseded",
|
| 230 |
+
],
|
| 231 |
+
"api-gateway": [
|
| 232 |
+
"v1.12.0 deployed 2026-03-28T09:00:00Z status=stable (running 9 days)",
|
| 233 |
+
],
|
| 234 |
+
"user-service": [
|
| 235 |
+
"v4.2.1 deployed 2026-04-05T16:00:00Z status=stable (running 10 hours)",
|
| 236 |
+
"v4.2.0 deployed 2026-04-01T11:00:00Z status=superseded",
|
| 237 |
+
],
|
| 238 |
+
"payment-service": [
|
| 239 |
+
"v3.8.1 deployed 2026-04-03T14:00:00Z status=stable (running 3 days)",
|
| 240 |
+
],
|
| 241 |
+
},
|
| 242 |
+
|
| 243 |
+
runbooks={
|
| 244 |
+
"auth-service": (
|
| 245 |
+
"## auth-service Runbook\n"
|
| 246 |
+
"- OOM crashes: Check heap usage trends in metrics. If memory grows linearly after\n"
|
| 247 |
+
" restart, likely a memory leak in the token cache. Short-term fix: restart to clear\n"
|
| 248 |
+
" cached state. Long-term: file ticket for cache eviction policy fix.\n"
|
| 249 |
+
"- High latency: Check cache-redis connectivity. Auth-service falls back to DB lookups\n"
|
| 250 |
+
" if cache is down, which increases latency 10x.\n"
|
| 251 |
+
"- Connection refused: Service may be in crash loop. Check restart count and supervisor logs.\n"
|
| 252 |
+
"- Token validation failures: Check if JWT signing key was recently rotated."
|
| 253 |
+
),
|
| 254 |
+
"api-gateway": (
|
| 255 |
+
"## api-gateway Runbook\n"
|
| 256 |
+
"- 503 errors: Check downstream service health. Gateway proxies to auth-service,\n"
|
| 257 |
+
" user-service, and payment-service. Identify which downstream is failing.\n"
|
| 258 |
+
"- Circuit breaker open: Downstream service has exceeded failure threshold.\n"
|
| 259 |
+
" Fix the downstream service; circuit breaker will auto-close after 30s of healthy responses.\n"
|
| 260 |
+
"- High latency: Usually caused by slow downstream. Check traces to identify bottleneck."
|
| 261 |
+
),
|
| 262 |
+
},
|
| 263 |
+
|
| 264 |
+
configs={
|
| 265 |
+
"auth-service": {
|
| 266 |
+
"current": "JVM_HEAP_MAX=2g\nTOKEN_CACHE_SIZE=500000\nSESSION_TTL=3600\nREDIS_POOL_SIZE=20",
|
| 267 |
+
"previous": "JVM_HEAP_MAX=2g\nTOKEN_CACHE_SIZE=500000\nSESSION_TTL=3600\nREDIS_POOL_SIZE=20",
|
| 268 |
+
"diff": "No changes — config has not been modified recently.",
|
| 269 |
+
},
|
| 270 |
+
},
|
| 271 |
+
|
| 272 |
+
dependencies={
|
| 273 |
+
"api-gateway": ["auth-service", "user-service", "payment-service"],
|
| 274 |
+
"auth-service": ["cache-redis"],
|
| 275 |
+
"user-service": ["db-postgres"],
|
| 276 |
+
"payment-service": ["db-postgres"],
|
| 277 |
+
"db-postgres": [],
|
| 278 |
+
"cache-redis": [],
|
| 279 |
+
"notification-service": ["auth-service"],
|
| 280 |
+
},
|
| 281 |
+
|
| 282 |
+
root_cause_services=["auth-service"],
|
| 283 |
+
root_cause_categories=[RootCauseCategory.OOM_CRASH],
|
| 284 |
+
required_fixes=[
|
| 285 |
+
RequiredFix(action="restart_service", service="auth-service"),
|
| 286 |
+
],
|
| 287 |
+
diagnosis_keywords=["auth-service", "oom", "out of memory", "memory", "crash", "restart"],
|
| 288 |
+
|
| 289 |
+
weights={
|
| 290 |
+
"correct_service": 0.30,
|
| 291 |
+
"correct_category": 0.20,
|
| 292 |
+
"correct_fix": 0.30,
|
| 293 |
+
"secondary_fix": 0.00,
|
| 294 |
+
"diagnosis_text": 0.10,
|
| 295 |
+
"investigation": 0.10,
|
| 296 |
+
"wrong_penalty": 0.03,
|
| 297 |
+
},
|
| 298 |
+
)
|
| 299 |
+
|
tasks/hard_concurrent.py
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task: Concurrent Faults with Misleading Evidence
|
| 3 |
+
To add a new task, copy this file, modify the SCENARIO definition, and place it in tasks/.
|
| 4 |
+
The task loader will auto-discover it.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from env.scenario import IncidentScenario, RequiredFix, ServiceConfig
|
| 8 |
+
from models import RootCauseCategory, ServiceStatus
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
SCENARIO = IncidentScenario(
|
| 12 |
+
task_id="hard",
|
| 13 |
+
name="Concurrent Faults with Misleading Evidence",
|
| 14 |
+
difficulty="hard",
|
| 15 |
+
max_steps=35,
|
| 16 |
+
incident_summary=(
|
| 17 |
+
"SEV-1 incident declared at 04:00 UTC. Multiple services affected simultaneously. "
|
| 18 |
+
"payment-service is completely down after a recent deploy. auth-service showing intermittent "
|
| 19 |
+
"timeouts and session validation failures. notification-service queue backing up. "
|
| 20 |
+
"user-service has config warnings. api-gateway showing >30% error rate across multiple "
|
| 21 |
+
"endpoints. Need to identify ALL root causes and restore full system health."
|
| 22 |
+
),
|
| 23 |
+
|
| 24 |
+
services={
|
| 25 |
+
"api-gateway": ServiceConfig(
|
| 26 |
+
status=ServiceStatus.DEGRADED, deps=["auth-service", "user-service", "payment-service"],
|
| 27 |
+
version="v1.12.0", replicas=3,
|
| 28 |
+
),
|
| 29 |
+
"auth-service": ServiceConfig(
|
| 30 |
+
status=ServiceStatus.DEGRADED, deps=["cache-redis"],
|
| 31 |
+
version="v2.14.0", replicas=2,
|
| 32 |
+
),
|
| 33 |
+
"user-service": ServiceConfig(
|
| 34 |
+
status=ServiceStatus.HEALTHY, deps=["db-postgres"],
|
| 35 |
+
version="v4.2.1", replicas=2,
|
| 36 |
+
),
|
| 37 |
+
"payment-service": ServiceConfig(
|
| 38 |
+
status=ServiceStatus.DOWN, deps=["db-postgres"],
|
| 39 |
+
version="v3.8.2", replicas=2, is_root_cause=True, fault_type="bad_deploy",
|
| 40 |
+
),
|
| 41 |
+
"db-postgres": ServiceConfig(
|
| 42 |
+
status=ServiceStatus.HEALTHY, deps=[],
|
| 43 |
+
version="v15.4", replicas=1,
|
| 44 |
+
),
|
| 45 |
+
"cache-redis": ServiceConfig(
|
| 46 |
+
status=ServiceStatus.DEGRADED, deps=[],
|
| 47 |
+
version="v7.2.4", replicas=1, is_root_cause=True, fault_type="memory_leak",
|
| 48 |
+
),
|
| 49 |
+
"notification-service": ServiceConfig(
|
| 50 |
+
status=ServiceStatus.DEGRADED, deps=["auth-service"],
|
| 51 |
+
version="v1.5.0", replicas=1,
|
| 52 |
+
),
|
| 53 |
+
},
|
| 54 |
+
|
| 55 |
+
initial_alerts=[
|
| 56 |
+
"[ALERT SEV-1] api-gateway: error rate >30%, multiple downstream failures detected",
|
| 57 |
+
"[ALERT SEV-1] payment-service: health check failing, 0/2 pods ready, CrashLoopBackOff",
|
| 58 |
+
"[ALERT SEV-2] auth-service: intermittent 500 errors, session validation latency >3s",
|
| 59 |
+
"[ALERT SEV-2] notification-service: email delivery queue depth >2000, processing stalled",
|
| 60 |
+
"[ALERT SEV-3] user-service: config validation warning (non-critical)",
|
| 61 |
+
],
|
| 62 |
+
|
| 63 |
+
logs={
|
| 64 |
+
"payment-service": [
|
| 65 |
+
"2026-04-06T04:00:00Z INFO [payment-service] Deploying v3.8.2 (previous: v3.8.1)",
|
| 66 |
+
"2026-04-06T04:00:01Z INFO [payment-service] Container image pulled: registry.internal/payment-service:v3.8.2",
|
| 67 |
+
"2026-04-06T04:00:02Z INFO [payment-service] Pod payment-service-7d4f8b-xk9m2 starting...",
|
| 68 |
+
"2026-04-06T04:00:03Z INFO [payment-service] Starting health check sequence...",
|
| 69 |
+
"2026-04-06T04:00:04Z INFO [payment-service] Loading configuration from ConfigMap...",
|
| 70 |
+
"2026-04-06T04:00:05Z INFO [payment-service] Initializing payment validation module v2 (new in v3.8.2)",
|
| 71 |
+
"2026-04-06T04:00:05Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
|
| 72 |
+
"2026-04-06T04:00:05Z ERROR [payment-service] Stack trace:",
|
| 73 |
+
" at com.acme.payment.validator.PaymentValidatorV2.initialize(PaymentValidatorV2.java:42)",
|
| 74 |
+
" at com.acme.payment.bootstrap.ServiceBootstrap.initModules(ServiceBootstrap.java:118)",
|
| 75 |
+
" at com.acme.payment.bootstrap.ServiceBootstrap.start(ServiceBootstrap.java:55)",
|
| 76 |
+
" at com.acme.payment.Main.main(Main.java:12)",
|
| 77 |
+
"2026-04-06T04:00:06Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
|
| 78 |
+
"2026-04-06T04:00:06Z INFO [payment-service] Shutdown hook triggered, cleaning up...",
|
| 79 |
+
"2026-04-06T04:00:07Z INFO [payment-service] Health check endpoint /healthz returning 503",
|
| 80 |
+
"2026-04-06T04:00:10Z WARN [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 failed readiness probe (1/3)",
|
| 81 |
+
"2026-04-06T04:00:20Z WARN [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 failed readiness probe (2/3)",
|
| 82 |
+
"2026-04-06T04:00:30Z ERROR [payment-service] Kubernetes: pod payment-service-7d4f8b-xk9m2 marked NotReady, removed from service",
|
| 83 |
+
"2026-04-06T04:00:31Z INFO [payment-service] Kubernetes: restarting pod (CrashLoopBackOff)",
|
| 84 |
+
"2026-04-06T04:00:35Z INFO [payment-service] Starting health check sequence...",
|
| 85 |
+
"2026-04-06T04:00:37Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
|
| 86 |
+
"2026-04-06T04:00:37Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
|
| 87 |
+
"2026-04-06T04:00:38Z INFO [payment-service] Kubernetes: restarting pod (CrashLoopBackOff)",
|
| 88 |
+
"2026-04-06T04:00:45Z INFO [payment-service] Starting health check sequence...",
|
| 89 |
+
"2026-04-06T04:00:47Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
|
| 90 |
+
"2026-04-06T04:00:47Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
|
| 91 |
+
"2026-04-06T04:01:00Z ERROR [payment-service] CrashLoopBackOff: backing off 60s before next restart",
|
| 92 |
+
"2026-04-06T04:02:05Z INFO [payment-service] Starting health check sequence...",
|
| 93 |
+
"2026-04-06T04:02:07Z ERROR [payment-service] NullPointerException in PaymentValidatorV2.initialize(): config.getValidationRules() returned null",
|
| 94 |
+
"2026-04-06T04:02:07Z FATAL [payment-service] Bootstrap failed: required module 'payment-validator-v2' could not initialize",
|
| 95 |
+
"2026-04-06T04:02:10Z ERROR [payment-service] CrashLoopBackOff: backing off 120s before next restart",
|
| 96 |
+
],
|
| 97 |
+
"cache-redis": [
|
| 98 |
+
"2026-04-06T03:00:00Z INFO [cache-redis] Memory usage: 2.8GB/4.0GB (70%) — within operational range",
|
| 99 |
+
"2026-04-06T03:05:00Z INFO [cache-redis] Memory usage: 2.9GB/4.0GB (72%)",
|
| 100 |
+
"2026-04-06T03:10:00Z INFO [cache-redis] Memory usage: 3.0GB/4.0GB (75%)",
|
| 101 |
+
"2026-04-06T03:15:00Z INFO [cache-redis] Memory usage: 3.1GB/4.0GB (77%)",
|
| 102 |
+
"2026-04-06T03:20:00Z INFO [cache-redis] Memory usage: 3.2GB/4.0GB (80%)",
|
| 103 |
+
"2026-04-06T03:25:00Z INFO [cache-redis] Memory usage: 3.3GB/4.0GB (82%)",
|
| 104 |
+
"2026-04-06T03:30:00Z WARN [cache-redis] Memory usage: 3.4GB/4.0GB (85%) — approaching maxmemory threshold",
|
| 105 |
+
"2026-04-06T03:30:01Z INFO [cache-redis] Eviction policy: allkeys-lru activated",
|
| 106 |
+
"2026-04-06T03:30:05Z WARN [cache-redis] Evicting 1200 keys/sec to maintain memory budget",
|
| 107 |
+
"2026-04-06T03:35:00Z WARN [cache-redis] Memory usage: 3.5GB/4.0GB (87%) despite active eviction",
|
| 108 |
+
"2026-04-06T03:40:00Z WARN [cache-redis] Memory usage: 3.6GB/4.0GB (90%)",
|
| 109 |
+
"2026-04-06T03:45:00Z WARN [cache-redis] Memory usage: 3.7GB/4.0GB (92%) despite active eviction",
|
| 110 |
+
"2026-04-06T03:45:01Z WARN [cache-redis] Eviction rate insufficient: incoming writes (2.1GB/hr) exceed eviction rate (1.5GB/hr)",
|
| 111 |
+
"2026-04-06T03:45:02Z WARN [cache-redis] Key namespace auth:session:* most affected — 60% of evictions from this prefix",
|
| 112 |
+
"2026-04-06T03:50:00Z WARN [cache-redis] Memory usage: 3.8GB/4.0GB (95%)",
|
| 113 |
+
"2026-04-06T03:55:00Z ERROR [cache-redis] Memory usage: 3.82GB/4.0GB (95.5%)",
|
| 114 |
+
"2026-04-06T04:00:00Z ERROR [cache-redis] Memory usage: 3.85GB/4.0GB (96%) — critical threshold",
|
| 115 |
+
"2026-04-06T04:00:01Z ERROR [cache-redis] Rejecting 12% of SET commands due to memory pressure",
|
| 116 |
+
"2026-04-06T04:00:02Z WARN [cache-redis] Client auth-service reporting increased cache misses (hit ratio: 35%, normal: 90%)",
|
| 117 |
+
"2026-04-06T04:00:05Z ERROR [cache-redis] Memory fragmentation ratio: 1.8 (healthy: <1.5) — possible memory leak in module",
|
| 118 |
+
"2026-04-06T04:00:10Z WARN [cache-redis] Resident memory growing despite aggressive eviction — suspect leaked allocations in Lua script engine",
|
| 119 |
+
"2026-04-06T04:00:15Z ERROR [cache-redis] Rejecting 18% of SET commands due to memory pressure",
|
| 120 |
+
],
|
| 121 |
+
"auth-service": [
|
| 122 |
+
"2026-04-06T03:00:00Z INFO [auth-service] Request: POST /auth/token uid=user_4421 -> cache HIT (12ms)",
|
| 123 |
+
"2026-04-06T03:00:05Z INFO [auth-service] Request: POST /auth/verify uid=user_8832 -> cache HIT (10ms)",
|
| 124 |
+
"2026-04-06T03:15:00Z INFO [auth-service] Request: POST /auth/token uid=user_3310 -> cache HIT (11ms)",
|
| 125 |
+
"2026-04-06T03:30:00Z INFO [auth-service] Request: POST /auth/token uid=user_5571 -> cache HIT (13ms)",
|
| 126 |
+
"2026-04-06T03:45:00Z WARN [auth-service] Cache miss for session sid=c9f21a — falling back to db-postgres lookup (280ms)",
|
| 127 |
+
"2026-04-06T03:45:02Z INFO [auth-service] Request: POST /auth/token uid=user_7712 -> cache HIT (14ms)",
|
| 128 |
+
"2026-04-06T03:45:05Z WARN [auth-service] Cache miss rate elevated: 45% (normal: <10%)",
|
| 129 |
+
"2026-04-06T03:45:10Z WARN [auth-service] Cache miss for session sid=d4e82b — falling back to db-postgres lookup (320ms)",
|
| 130 |
+
"2026-04-06T03:50:00Z WARN [auth-service] DB connection pool: 28/30 active (falling back to DB for most session lookups)",
|
| 131 |
+
"2026-04-06T03:55:00Z WARN [auth-service] Cache miss rate: 55% — DB fallback path overloaded",
|
| 132 |
+
"2026-04-06T04:00:00Z ERROR [auth-service] Cache write rejected by redis: OOM command not allowed when used memory > maxmemory",
|
| 133 |
+
"2026-04-06T04:00:01Z WARN [auth-service] 65% of requests hitting DB fallback path — latency p99 = 3200ms",
|
| 134 |
+
"2026-04-06T04:00:03Z ERROR [auth-service] Request timeout: POST /auth/verify uid=user_8832 (DB fallback overloaded)",
|
| 135 |
+
"2026-04-06T04:00:05Z ERROR [auth-service] Request timeout: POST /auth/token uid=user_2209 (DB fallback overloaded)",
|
| 136 |
+
"2026-04-06T04:00:08Z WARN [auth-service] DB connection pool: 30/30 active (SATURATED)",
|
| 137 |
+
"2026-04-06T04:00:10Z WARN [auth-service] Degraded mode: session validation averaging 1800ms (SLA: 200ms)",
|
| 138 |
+
"2026-04-06T04:00:15Z ERROR [auth-service] 5 request timeouts in last 60 seconds",
|
| 139 |
+
],
|
| 140 |
+
"user-service": [
|
| 141 |
+
"2026-04-06T03:30:00Z INFO [user-service] Config reload triggered by configmap update",
|
| 142 |
+
"2026-04-06T03:30:01Z WARN [user-service] Config validation: feature flag 'enable_profile_v2' references unknown experiment 'profile_redesign_q2'",
|
| 143 |
+
"2026-04-06T03:30:01Z WARN [user-service] Config validation: deprecated field 'legacy_avatar_url' present — will be removed in v4.0",
|
| 144 |
+
"2026-04-06T03:30:02Z INFO [user-service] Config applied successfully (2 warnings, 0 errors)",
|
| 145 |
+
"2026-04-06T03:30:03Z INFO [user-service] All endpoints healthy, no service disruption during config reload",
|
| 146 |
+
"2026-04-06T03:30:10Z INFO [user-service] GET /users/profile uid=user_4421 -> 200 (28ms)",
|
| 147 |
+
"2026-04-06T03:45:00Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (30ms)",
|
| 148 |
+
"2026-04-06T03:45:05Z INFO [user-service] PUT /users/profile uid=user_3310 -> 200 (82ms)",
|
| 149 |
+
"2026-04-06T04:00:00Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (28ms)",
|
| 150 |
+
"2026-04-06T04:00:01Z INFO [user-service] PUT /users/profile uid=user_3310 -> 200 (95ms)",
|
| 151 |
+
"2026-04-06T04:00:05Z INFO [user-service] GET /users/settings uid=user_5571 -> 200 (26ms)",
|
| 152 |
+
"2026-04-06T04:00:10Z INFO [user-service] Health check /healthz -> 200 OK",
|
| 153 |
+
],
|
| 154 |
+
"notification-service": [
|
| 155 |
+
"2026-04-06T03:45:00Z INFO [notification-service] Auth token validated for batch #4445 (48ms)",
|
| 156 |
+
"2026-04-06T03:45:01Z INFO [notification-service] Email batch #4445 sent successfully (15 emails)",
|
| 157 |
+
"2026-04-06T04:00:00Z WARN [notification-service] Auth token validation taking 2800ms (SLA: 500ms)",
|
| 158 |
+
"2026-04-06T04:00:02Z WARN [notification-service] Email delivery queue depth: 2400 (normal: <100)",
|
| 159 |
+
"2026-04-06T04:00:05Z ERROR [notification-service] Failed to validate sender auth for notification batch #8832 — auth-service timeout",
|
| 160 |
+
"2026-04-06T04:00:06Z WARN [notification-service] Pausing email delivery until auth validation recovers",
|
| 161 |
+
"2026-04-06T04:00:10Z WARN [notification-service] Queue depth growing: 2800 pending emails",
|
| 162 |
+
"2026-04-06T04:00:15Z ERROR [notification-service] Auth validation timeout for batch #8833",
|
| 163 |
+
"2026-04-06T04:00:20Z WARN [notification-service] Queue depth: 3200 — SLA breach imminent for time-sensitive notifications",
|
| 164 |
+
],
|
| 165 |
+
"api-gateway": [
|
| 166 |
+
"2026-04-06T03:59:55Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 45ms)",
|
| 167 |
+
"2026-04-06T03:59:58Z INFO [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 92ms)",
|
| 168 |
+
"2026-04-06T04:00:01Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, connection refused)",
|
| 169 |
+
"2026-04-06T04:00:02Z WARN [api-gateway] Route: POST /api/v2/login -> auth-service (200, 1800ms) — slow",
|
| 170 |
+
"2026-04-06T04:00:03Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 28ms)",
|
| 171 |
+
"2026-04-06T04:00:05Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, connection refused)",
|
| 172 |
+
"2026-04-06T04:00:06Z WARN [api-gateway] Circuit breaker OPEN for payment-service (failures=5, threshold=5)",
|
| 173 |
+
"2026-04-06T04:00:08Z ERROR [api-gateway] Route: POST /api/v2/verify -> auth-service (504, timeout after 5000ms)",
|
| 174 |
+
"2026-04-06T04:00:10Z INFO [api-gateway] Route: GET /api/v2/user/settings -> user-service (200, 25ms)",
|
| 175 |
+
"2026-04-06T04:00:12Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (503, circuit breaker open)",
|
| 176 |
+
"2026-04-06T04:00:15Z WARN [api-gateway] Route: POST /api/v2/login -> auth-service (200, 3200ms) — very slow",
|
| 177 |
+
"2026-04-06T04:00:18Z ERROR [api-gateway] Route: POST /api/v2/verify -> auth-service (504, timeout after 5000ms)",
|
| 178 |
+
"2026-04-06T04:00:20Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 30ms)",
|
| 179 |
+
],
|
| 180 |
+
"db-postgres": [
|
| 181 |
+
"2026-04-06T03:55:00Z INFO [db-postgres] Active connections: 42/100",
|
| 182 |
+
"2026-04-06T04:00:00Z INFO [db-postgres] Active connections: 58/100",
|
| 183 |
+
"2026-04-06T04:00:01Z INFO [db-postgres] Checkpoint starting: time-based",
|
| 184 |
+
"2026-04-06T04:00:03Z INFO [db-postgres] Checkpoint complete: wrote 1450 buffers (9.8%)",
|
| 185 |
+
"2026-04-06T04:00:05Z INFO [db-postgres] Higher than normal read load — auth-service fallback queries detected",
|
| 186 |
+
"2026-04-06T04:00:10Z INFO [db-postgres] Active connections: 62/100 — elevated but within limits",
|
| 187 |
+
"2026-04-06T04:00:15Z INFO [db-postgres] No deadlocks detected. Lock wait queue empty.",
|
| 188 |
+
"2026-04-06T04:00:20Z INFO [db-postgres] Autovacuum: processing table sessions (dead tuples: 850)",
|
| 189 |
+
],
|
| 190 |
+
},
|
| 191 |
+
|
| 192 |
+
metrics={
|
| 193 |
+
"payment-service": [
|
| 194 |
+
{"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 18, "mem_pct": 40, "latency_p50": 88, "latency_p99": 155, "error_rate": 0.001, "pods_ready": 2, "pods_total": 2},
|
| 195 |
+
{"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 0, "mem_pct": 0, "latency_p50": 0, "latency_p99": 0, "error_rate": 1.0, "pods_ready": 0, "pods_total": 2},
|
| 196 |
+
],
|
| 197 |
+
"cache-redis": [
|
| 198 |
+
{"timestamp": "2026-04-06T02:00:00Z", "mem_gb": 2.4, "mem_pct": 60, "hit_ratio": 0.92, "evictions_per_s": 0, "connections": 45, "fragmentation_ratio": 1.1},
|
| 199 |
+
{"timestamp": "2026-04-06T02:30:00Z", "mem_gb": 2.6, "mem_pct": 65, "hit_ratio": 0.91, "evictions_per_s": 0, "connections": 46, "fragmentation_ratio": 1.2},
|
| 200 |
+
{"timestamp": "2026-04-06T03:00:00Z", "mem_gb": 2.8, "mem_pct": 70, "hit_ratio": 0.90, "evictions_per_s": 5, "connections": 47, "fragmentation_ratio": 1.3},
|
| 201 |
+
{"timestamp": "2026-04-06T03:30:00Z", "mem_gb": 3.4, "mem_pct": 85, "hit_ratio": 0.72, "evictions_per_s": 1200, "connections": 48, "fragmentation_ratio": 1.5},
|
| 202 |
+
{"timestamp": "2026-04-06T03:45:00Z", "mem_gb": 3.7, "mem_pct": 92, "hit_ratio": 0.55, "evictions_per_s": 1800, "connections": 48, "fragmentation_ratio": 1.7},
|
| 203 |
+
{"timestamp": "2026-04-06T04:00:00Z", "mem_gb": 3.85, "mem_pct": 96, "hit_ratio": 0.35, "evictions_per_s": 2200, "connections": 47, "fragmentation_ratio": 1.8},
|
| 204 |
+
],
|
| 205 |
+
"auth-service": [
|
| 206 |
+
{"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 22, "mem_pct": 58, "latency_p50": 12, "latency_p99": 45, "error_rate": 0.001, "cache_hit_ratio": 0.90, "db_fallback_pct": 0.10},
|
| 207 |
+
{"timestamp": "2026-04-06T03:30:00Z", "cpu_pct": 28, "mem_pct": 60, "latency_p50": 25, "latency_p99": 180, "error_rate": 0.005, "cache_hit_ratio": 0.72, "db_fallback_pct": 0.28},
|
| 208 |
+
{"timestamp": "2026-04-06T03:45:00Z", "cpu_pct": 35, "mem_pct": 62, "latency_p50": 120, "latency_p99": 1200, "error_rate": 0.05, "cache_hit_ratio": 0.55, "db_fallback_pct": 0.45},
|
| 209 |
+
{"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 42, "mem_pct": 65, "latency_p50": 800, "latency_p99": 3200, "error_rate": 0.15, "cache_hit_ratio": 0.35, "db_fallback_pct": 0.65},
|
| 210 |
+
],
|
| 211 |
+
"user-service": [
|
| 212 |
+
{"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": 75, "error_rate": 0.001},
|
| 213 |
+
{"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 30, "latency_p99": 82, "error_rate": 0.001},
|
| 214 |
+
],
|
| 215 |
+
"notification-service": [
|
| 216 |
+
{"timestamp": "2026-04-06T03:45:00Z", "cpu_pct": 12, "mem_pct": 30, "queue_depth": 15, "auth_validation_ms": 48, "emails_sent_per_min": 120},
|
| 217 |
+
{"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 14, "mem_pct": 32, "queue_depth": 2400, "auth_validation_ms": 2800, "emails_sent_per_min": 5},
|
| 218 |
+
],
|
| 219 |
+
"api-gateway": [
|
| 220 |
+
{"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 20, "mem_pct": 45, "latency_p50": 35, "latency_p99": 95, "error_rate": 0.002, "5xx_rate": 0.001},
|
| 221 |
+
{"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 28, "mem_pct": 48, "latency_p50": 120, "latency_p99": 5200, "error_rate": 0.35, "5xx_rate": 0.32},
|
| 222 |
+
],
|
| 223 |
+
"db-postgres": [
|
| 224 |
+
{"timestamp": "2026-04-06T03:55:00Z", "cpu_pct": 35, "mem_pct": 55, "connections": 42, "active_locks": 2, "deadlocks": 0, "write_iops": 1200, "read_iops": 3500},
|
| 225 |
+
{"timestamp": "2026-04-06T04:00:00Z", "cpu_pct": 45, "mem_pct": 58, "connections": 62, "active_locks": 3, "deadlocks": 0, "write_iops": 1100, "read_iops": 4800},
|
| 226 |
+
],
|
| 227 |
+
},
|
| 228 |
+
|
| 229 |
+
traces={
|
| 230 |
+
"payment-service": [
|
| 231 |
+
"No recent traces — service is down (CrashLoopBackOff). Last successful trace (before deploy):",
|
| 232 |
+
"Trace: POST /api/v2/pay (txn=pay_9901, total=92ms) — v3.8.1",
|
| 233 |
+
" ├─ payment-service.validateRequest() 8ms",
|
| 234 |
+
" ├─ payment-service.checkBalance() 25ms (SELECT -> db-postgres)",
|
| 235 |
+
" ├─ payment-service.insertTransaction() 40ms (INSERT -> db-postgres)",
|
| 236 |
+
" └─ payment-service.sendConfirmation() 19ms",
|
| 237 |
+
],
|
| 238 |
+
"auth-service": [
|
| 239 |
+
"Trace: POST /auth/verify (uid=user_8832, total=3200ms)",
|
| 240 |
+
" ├─ auth-service.checkSessionCache() 8ms (cache-redis MISS)",
|
| 241 |
+
" ├─ auth-service.fallbackDBLookup() 2900ms (db-postgres — under load from fallback traffic)",
|
| 242 |
+
" ├─ auth-service.validateToken() 45ms",
|
| 243 |
+
" └─ auth-service.writeBackToCache() FAILED (redis OOM rejected write)",
|
| 244 |
+
],
|
| 245 |
+
"notification-service": [
|
| 246 |
+
"Trace: POST /notifications/send (batch=#8832, total=5200ms) — TIMEOUT",
|
| 247 |
+
" ├─ notification-service.prepareBatch() 12ms",
|
| 248 |
+
" ├─ notification-service.validateAuth() 5000ms (-> auth-service TIMEOUT)",
|
| 249 |
+
" └─ notification-service.sendEmails() never reached",
|
| 250 |
+
],
|
| 251 |
+
},
|
| 252 |
+
|
| 253 |
+
deploy_history={
|
| 254 |
+
"payment-service": [
|
| 255 |
+
"v3.8.2 deployed 2026-04-06T04:00:00Z status=CrashLoopBackOff (deployed 15 min ago)",
|
| 256 |
+
"v3.8.1 deployed 2026-04-03T14:00:00Z status=superseded (was stable for 3 days)",
|
| 257 |
+
"v3.8.0 deployed 2026-03-28T10:00:00Z status=superseded",
|
| 258 |
+
],
|
| 259 |
+
"auth-service": [
|
| 260 |
+
"v2.14.0 deployed 2026-04-01T10:00:00Z status=stable (running 5 days, no issues)",
|
| 261 |
+
],
|
| 262 |
+
"cache-redis": [
|
| 263 |
+
"v7.2.4 deployed 2026-03-20T09:00:00Z status=stable (running 17 days)",
|
| 264 |
+
],
|
| 265 |
+
"user-service": [
|
| 266 |
+
"v4.2.1 deployed 2026-04-05T16:00:00Z status=stable (running 12 hours)",
|
| 267 |
+
],
|
| 268 |
+
},
|
| 269 |
+
|
| 270 |
+
runbooks={
|
| 271 |
+
"payment-service": (
|
| 272 |
+
"## payment-service Runbook\n"
|
| 273 |
+
"- Crash on startup / CrashLoopBackOff: Check recent deploys. If the latest deploy\n"
|
| 274 |
+
" introduced the crash, rollback to previous known-good version:\n"
|
| 275 |
+
" rollback_deploy(service='payment-service', target_version='<previous_version>')\n"
|
| 276 |
+
" Check deploy history for the last stable version.\n"
|
| 277 |
+
"- Transaction timeouts: Check db-postgres connection pool and lock status.\n"
|
| 278 |
+
"- High latency: Check downstream service health (db-postgres)."
|
| 279 |
+
),
|
| 280 |
+
"cache-redis": (
|
| 281 |
+
"## cache-redis Runbook\n"
|
| 282 |
+
"- Memory pressure / approaching maxmemory: Check memory trend in metrics.\n"
|
| 283 |
+
" If memory grows despite eviction, likely a memory leak.\n"
|
| 284 |
+
" Short-term fix: restart_service to clear leaked memory.\n"
|
| 285 |
+
" Alternative: scale_up to add more replicas and distribute load.\n"
|
| 286 |
+
"- Elevated miss ratio: If caused by memory pressure/eviction storm, fix memory issue first.\n"
|
| 287 |
+
" If caused by TTL expiry batch, wait for cache to warm back up."
|
| 288 |
+
),
|
| 289 |
+
"auth-service": (
|
| 290 |
+
"## auth-service Runbook\n"
|
| 291 |
+
"- High latency / DB fallback: Check cache-redis health. If redis is degraded,\n"
|
| 292 |
+
" auth-service falls back to DB lookups which are 10-50x slower.\n"
|
| 293 |
+
" Fix redis first — auth-service will recover automatically.\n"
|
| 294 |
+
"- Cache write failures: Redis may be rejecting writes due to OOM. Check redis memory."
|
| 295 |
+
),
|
| 296 |
+
"notification-service": (
|
| 297 |
+
"## notification-service Runbook\n"
|
| 298 |
+
"- Queue backing up: Usually caused by auth-service degradation. Notification-service\n"
|
| 299 |
+
" validates sender auth before sending. If auth is slow, queue grows.\n"
|
| 300 |
+
" Fix auth-service first — queue will drain automatically."
|
| 301 |
+
),
|
| 302 |
+
},
|
| 303 |
+
|
| 304 |
+
configs={
|
| 305 |
+
"payment-service": {
|
| 306 |
+
"current": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nVALIDATOR_VERSION=v2\nFEATURE_NEW_VALIDATION=true",
|
| 307 |
+
"previous": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nVALIDATOR_VERSION=v1\nFEATURE_NEW_VALIDATION=false",
|
| 308 |
+
"diff": "Changed VALIDATOR_VERSION from v1 to v2, enabled FEATURE_NEW_VALIDATION (part of v3.8.2 deploy)",
|
| 309 |
+
},
|
| 310 |
+
"user-service": {
|
| 311 |
+
"current": "FEATURE_PROFILE_V2=true\nLEGACY_AVATAR_URL=https://cdn.example.com/avatars\nDB_POOL_SIZE=30",
|
| 312 |
+
"previous": "FEATURE_PROFILE_V2=false\nDB_POOL_SIZE=30",
|
| 313 |
+
"diff": "Added FEATURE_PROFILE_V2=true and LEGACY_AVATAR_URL (config change 30 min ago). 2 validation warnings but applied successfully.",
|
| 314 |
+
},
|
| 315 |
+
"cache-redis": {
|
| 316 |
+
"current": "maxmemory=4gb\nmaxmemory-policy=allkeys-lru\ntimeout=300\ntcp-keepalive=60",
|
| 317 |
+
"previous": "maxmemory=4gb\nmaxmemory-policy=allkeys-lru\ntimeout=300\ntcp-keepalive=60",
|
| 318 |
+
"diff": "No changes — config has not been modified recently.",
|
| 319 |
+
},
|
| 320 |
+
},
|
| 321 |
+
|
| 322 |
+
dependencies={
|
| 323 |
+
"api-gateway": ["auth-service", "user-service", "payment-service"],
|
| 324 |
+
"auth-service": ["cache-redis"],
|
| 325 |
+
"user-service": ["db-postgres"],
|
| 326 |
+
"payment-service": ["db-postgres"],
|
| 327 |
+
"db-postgres": [],
|
| 328 |
+
"cache-redis": [],
|
| 329 |
+
"notification-service": ["auth-service"],
|
| 330 |
+
},
|
| 331 |
+
|
| 332 |
+
root_cause_services=["payment-service", "cache-redis"],
|
| 333 |
+
root_cause_categories=[RootCauseCategory.BAD_DEPLOY, RootCauseCategory.MEMORY_LEAK],
|
| 334 |
+
required_fixes=[
|
| 335 |
+
RequiredFix(action="rollback_deploy", service="payment-service", target_version="v3.8.1"),
|
| 336 |
+
RequiredFix(action="restart_service", service="cache-redis"),
|
| 337 |
+
],
|
| 338 |
+
diagnosis_keywords=[
|
| 339 |
+
"payment-service", "deploy", "rollback", "v3.8.2", "v3.8.1", "NullPointerException", "crash",
|
| 340 |
+
"cache-redis", "memory", "leak", "eviction", "auth-service", "fallback",
|
| 341 |
+
],
|
| 342 |
+
|
| 343 |
+
weights={
|
| 344 |
+
"correct_service": 0.15,
|
| 345 |
+
"correct_category": 0.10,
|
| 346 |
+
"correct_fix": 0.15,
|
| 347 |
+
"secondary_fix": 0.20,
|
| 348 |
+
"diagnosis_text": 0.15,
|
| 349 |
+
"investigation": 0.10,
|
| 350 |
+
"wrong_penalty": 0.05,
|
| 351 |
+
},
|
| 352 |
+
)
|
| 353 |
+
|
tasks/medium_deadlock.py
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Task: Cascading Database Deadlock
|
| 3 |
+
To add a new task, copy this file, modify the SCENARIO definition, and place it in tasks/.
|
| 4 |
+
The task loader will auto-discover it.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from env.scenario import IncidentScenario, RequiredFix, ServiceConfig
|
| 8 |
+
from models import RootCauseCategory, ServiceStatus
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Scenario definition for the "medium" task. Every field below is static
# fixture data handed to IncidentScenario; nothing here is computed at import
# time apart from constructing the config objects.
SCENARIO = IncidentScenario(
    task_id="medium",
    name="Cascading Database Deadlock",
    difficulty="medium",
    max_steps=25,
    incident_summary=(
        "Multiple alerts fired at 03:05 UTC. payment-service and user-service both showing elevated "
        "error rates and latency. Transaction timeouts increasing. cache-redis also flagged with "
        "elevated miss ratio. Need to identify root cause and restore write path."
    ),

    # Service topology. db-postgres is the only entry flagged is_root_cause.
    services={
        "api-gateway": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=["auth-service", "user-service", "payment-service"],
            version="v1.12.0", replicas=3,
        ),
        "auth-service": ServiceConfig(
            status=ServiceStatus.HEALTHY, deps=["cache-redis"],
            version="v2.14.0", replicas=2,
        ),
        "user-service": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=["db-postgres"],
            version="v4.2.1", replicas=2,
        ),
        "payment-service": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=["db-postgres"],
            version="v3.8.1", replicas=2,
        ),
        "db-postgres": ServiceConfig(
            status=ServiceStatus.DEGRADED, deps=[],
            version="v15.4", replicas=1, is_root_cause=True, fault_type="db_deadlock",
        ),
        "cache-redis": ServiceConfig(
            status=ServiceStatus.HEALTHY, deps=[],
            version="v7.2.4", replicas=1,
        ),
        "notification-service": ServiceConfig(
            status=ServiceStatus.HEALTHY, deps=["auth-service"],
            version="v1.5.0", replicas=1,
        ),
    },

    # The cache-redis alert is marked informational and its own logs show the
    # hit ratio recovering; it appears to be a red herring (confirm with the
    # task author before relying on that).
    initial_alerts=[
        "[ALERT SEV-2] payment-service: transaction timeouts >15%, p99 latency >2s",
        "[ALERT SEV-2] user-service: elevated error rate on profile updates",
        "[ALERT SEV-3] cache-redis: cache miss ratio elevated (informational)",
    ],

    # Per-service log lines, in chronological order within each service.
    logs={
        "payment-service": [
            "2026-04-06T03:00:01Z INFO [payment-service] Processing payment txn=pay_8832 amount=$45.00 -> db-postgres",
            "2026-04-06T03:00:02Z INFO [payment-service] Payment completed txn=pay_8832 latency=85ms",
            "2026-04-06T03:00:10Z INFO [payment-service] Processing payment txn=pay_1120 amount=$12.99 -> db-postgres",
            "2026-04-06T03:00:11Z INFO [payment-service] Payment completed txn=pay_1120 latency=92ms",
            "2026-04-06T03:00:20Z INFO [payment-service] Processing payment txn=pay_3341 amount=$199.00 -> db-postgres",
            "2026-04-06T03:00:21Z INFO [payment-service] Payment completed txn=pay_3341 latency=78ms",
            "2026-04-06T03:01:00Z INFO [payment-service] Health check /healthz -> 200 OK",
            "2026-04-06T03:02:00Z INFO [payment-service] Processing payment txn=pay_5590 amount=$25.00 -> db-postgres",
            "2026-04-06T03:02:01Z INFO [payment-service] Payment completed txn=pay_5590 latency=95ms",
            "2026-04-06T03:03:00Z INFO [payment-service] Processing payment txn=pay_6612 amount=$150.00 -> db-postgres",
            "2026-04-06T03:03:01Z INFO [payment-service] Payment completed txn=pay_6612 latency=88ms",
            "2026-04-06T03:04:00Z INFO [payment-service] Health check /healthz -> 200 OK",
            "2026-04-06T03:05:00Z INFO [payment-service] Processing payment txn=pay_7789 amount=$55.00 -> db-postgres",
            "2026-04-06T03:05:12Z WARN [payment-service] Slow query: INSERT INTO transactions (...) took 3200ms (threshold: 500ms)",
            "2026-04-06T03:05:15Z INFO [payment-service] Payment completed txn=pay_7789 latency=3250ms",
            # NOTE(review): txn=pay_1120 was already logged at 03:00:10 with a
            # different amount, and pay_4455 below has no "Processing" line.
            # Possibly a copy/paste slip in the fixture; left unchanged, confirm.
            "2026-04-06T03:05:16Z INFO [payment-service] Processing payment txn=pay_1120 amount=$67.00 -> db-postgres",
            "2026-04-06T03:05:18Z WARN [payment-service] DB connection pool: 48/50 active (96% utilized)",
            "2026-04-06T03:05:20Z ERROR [payment-service] Transaction timeout: txn=pay_4455 exceeded 5000ms deadline",
            "2026-04-06T03:05:20Z ERROR [payment-service] Retrying txn=pay_4455 (attempt 2/3)",
            "2026-04-06T03:05:25Z ERROR [payment-service] Transaction timeout: txn=pay_4455 exceeded 5000ms deadline (retry 2)",
            "2026-04-06T03:05:25Z ERROR [payment-service] Transaction failed permanently: txn=pay_4455 after 3 retries",
            "2026-04-06T03:05:26Z WARN [payment-service] DB connection pool: 50/50 active (SATURATED) — new requests queuing",
            "2026-04-06T03:05:28Z ERROR [payment-service] Connection acquisition timeout: waited 10s for available connection",
            "2026-04-06T03:05:30Z INFO [payment-service] Read query SELECT balance WHERE user_id=... completed in 45ms",
            "2026-04-06T03:05:32Z ERROR [payment-service] Transaction timeout: txn=pay_6691 exceeded 5000ms deadline",
            "2026-04-06T03:05:33Z WARN [payment-service] Circuit breaker WARNING for db-postgres writes (failures=8/10 threshold)",
            "2026-04-06T03:05:35Z ERROR [payment-service] Transaction timeout: txn=pay_7801 exceeded 5000ms deadline",
            "2026-04-06T03:05:40Z ERROR [payment-service] Transaction timeout: txn=pay_8912 exceeded 5000ms deadline",
            "2026-04-06T03:06:00Z ERROR [payment-service] Connection acquisition timeout: waited 15s for available connection",
            "2026-04-06T03:07:00Z ERROR [payment-service] 12 transactions failed in last 5 minutes. Write path severely degraded.",
            "2026-04-06T03:08:00Z ERROR [payment-service] 15 transactions failed in last 5 minutes. Write path severely degraded.",
        ],
        "user-service": [
            "2026-04-06T03:00:01Z INFO [user-service] GET /users/profile uid=user_4421 -> 200 (32ms)",
            "2026-04-06T03:00:05Z INFO [user-service] GET /users/settings uid=user_8832 -> 200 (28ms)",
            "2026-04-06T03:00:10Z INFO [user-service] PUT /users/profile uid=user_3310 -> 200 (85ms)",
            "2026-04-06T03:01:00Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (30ms)",
            "2026-04-06T03:02:00Z INFO [user-service] PUT /users/settings uid=user_5571 -> 200 (78ms)",
            "2026-04-06T03:03:00Z INFO [user-service] GET /users/profile uid=user_7712 -> 200 (27ms)",
            "2026-04-06T03:04:00Z INFO [user-service] GET /users/profile uid=user_2209 -> 200 (31ms)",
            "2026-04-06T03:05:10Z INFO [user-service] GET /users/profile uid=user_9901 -> 200 (29ms)",
            "2026-04-06T03:05:15Z INFO [user-service] GET /users/profile uid=user_6633 -> 200 (26ms)",
            "2026-04-06T03:05:18Z WARN [user-service] Slow mutation: UPDATE users SET email=... took 4100ms",
            "2026-04-06T03:05:20Z ERROR [user-service] Profile update failed: uid=user_8832 — database lock acquisition timeout",
            "2026-04-06T03:05:22Z INFO [user-service] GET /users/profile uid=user_1101 -> 200 (28ms)",
            "2026-04-06T03:05:25Z ERROR [user-service] Profile update failed: uid=user_3310 — database lock acquisition timeout",
            "2026-04-06T03:05:26Z WARN [user-service] Write operations failing at 60% rate, reads unaffected",
            "2026-04-06T03:05:30Z INFO [user-service] GET /users/profile uid=user_4482 -> 200 (30ms)",
            "2026-04-06T03:06:00Z ERROR [user-service] Profile update failed: uid=user_5510 — database lock acquisition timeout",
            "2026-04-06T03:06:05Z INFO [user-service] GET /users/settings uid=user_7781 -> 200 (25ms)",
            "2026-04-06T03:07:00Z WARN [user-service] Write operations failing at 75% rate, reads unaffected",
        ],
        # The db-postgres log starts at 02:55, earlier than the other
        # services, with the analytics-cron transaction that later blocks writers.
        "db-postgres": [
            "2026-04-06T02:55:00Z INFO [db-postgres] Connection from analytics-cron@10.0.3.42: BEGIN; SELECT ... FROM transactions JOIN users ... (full table scan)",
            "2026-04-06T02:55:01Z INFO [db-postgres] Query plan: Seq Scan on transactions (rows=2.4M, cost=45000..89000)",
            "2026-04-06T02:55:01Z WARN [db-postgres] Long-running transaction txid=8830012 holding RowExclusiveLock on transactions table",
            "2026-04-06T02:56:00Z INFO [db-postgres] Active connections: 55/100",
            "2026-04-06T02:58:00Z INFO [db-postgres] Active connections: 68/100",
            "2026-04-06T03:00:00Z INFO [db-postgres] Checkpoint starting: time-based",
            "2026-04-06T03:00:02Z INFO [db-postgres] Checkpoint complete: wrote 1204 buffers (8.2%)",
            "2026-04-06T03:00:05Z INFO [db-postgres] Active connections: 70/100",
            "2026-04-06T03:02:00Z INFO [db-postgres] Active connections: 78/100",
            "2026-04-06T03:04:00Z INFO [db-postgres] Active connections: 88/100",
            "2026-04-06T03:05:10Z WARN [db-postgres] Deadlock detected: process 4821 (payment-service) waiting for RowExclusiveLock on transactions, blocked by process 4455 (analytics-cron)",
            "2026-04-06T03:05:10Z WARN [db-postgres] Deadlock detected: process 4830 (user-service) waiting for RowExclusiveLock on users, blocked by process 4455 (analytics-cron)",
            "2026-04-06T03:05:11Z INFO [db-postgres] Active connections: 95/100 (analytics-cron holding 1, payment-service pool 50, user-service pool 30, other 14)",
            "2026-04-06T03:05:15Z WARN [db-postgres] Long-running transaction txid=8830012 has been active for 10m15s — consider terminating",
            "2026-04-06T03:05:20Z WARN [db-postgres] Lock wait queue depth: 12 processes waiting",
            "2026-04-06T03:06:00Z INFO [db-postgres] SELECT queries completing normally (read path unaffected)",
            "2026-04-06T03:06:30Z WARN [db-postgres] Connection pool nearing limit: 98/100 active",
            "2026-04-06T03:07:00Z WARN [db-postgres] Lock wait queue depth: 18 processes waiting — growing",
            "2026-04-06T03:08:00Z ERROR [db-postgres] Connection limit reached: 100/100 — rejecting new connections",
        ],
        "auth-service": [
            "2026-04-06T03:00:00Z INFO [auth-service] Request processed: POST /auth/token uid=user_8832 latency=42ms",
            "2026-04-06T03:00:05Z INFO [auth-service] Cache hit for session sid=a8f32c, returning cached token",
            "2026-04-06T03:05:00Z INFO [auth-service] Request processed: POST /auth/verify uid=user_3310 latency=38ms",
            "2026-04-06T03:05:10Z INFO [auth-service] Request processed: POST /auth/token uid=user_5571 latency=45ms",
            "2026-04-06T03:05:30Z INFO [auth-service] Health check /healthz -> 200 OK",
            "2026-04-06T03:08:00Z INFO [auth-service] Request processed: POST /auth/verify uid=user_1101 latency=40ms",
        ],
        "cache-redis": [
            "2026-04-06T03:00:00Z INFO [cache-redis] Memory usage: 1.2GB/4.0GB (30%)",
            "2026-04-06T03:00:01Z INFO [cache-redis] Cache hit ratio: 82% (normal: 85-95%)",
            "2026-04-06T03:02:00Z INFO [cache-redis] Cache hit ratio: 80%",
            "2026-04-06T03:05:00Z INFO [cache-redis] Cache hit ratio: 78% — slight decrease",
            "2026-04-06T03:05:01Z INFO [cache-redis] Key evictions: 45 in last 5m (within normal range)",
            "2026-04-06T03:05:02Z WARN [cache-redis] Cache miss ratio elevated for prefix auth:session:* — possible cache warming after TTL expiry batch",
            "2026-04-06T03:05:10Z INFO [cache-redis] Memory usage: 1.3GB/4.0GB (32%) — stable",
            "2026-04-06T03:06:00Z INFO [cache-redis] Cache hit ratio recovering: 84%",
            "2026-04-06T03:08:00Z INFO [cache-redis] Cache hit ratio: 88% — back to normal",
        ],
        "api-gateway": [
            "2026-04-06T03:00:01Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 45ms)",
            "2026-04-06T03:00:02Z INFO [api-gateway] Route: POST /api/v2/pay -> payment-service (200, 88ms)",
            "2026-04-06T03:05:20Z WARN [api-gateway] Route: POST /api/v2/pay -> payment-service (504, 5200ms)",
            "2026-04-06T03:05:22Z WARN [api-gateway] Route: PUT /api/v2/user/profile -> user-service (504, 4800ms)",
            "2026-04-06T03:05:25Z INFO [api-gateway] Route: GET /api/v2/user/profile -> user-service (200, 30ms)",
            "2026-04-06T03:05:30Z INFO [api-gateway] Route: POST /api/v2/login -> auth-service (200, 42ms)",
            "2026-04-06T03:06:00Z ERROR [api-gateway] Route: POST /api/v2/pay -> payment-service (504, timeout)",
        ],
        "notification-service": [
            "2026-04-06T03:00:00Z INFO [notification-service] Email batch #4430 sent successfully (10 emails)",
            "2026-04-06T03:05:00Z INFO [notification-service] Email batch #4435 sent successfully (7 emails)",
            "2026-04-06T03:08:00Z INFO [notification-service] Health check /healthz -> 200 OK",
        ],
    },

    # Metric snapshots per service. Note that latency_p99 is the string
    # "timeout" (not a number) in the final 03:08 samples of payment-service,
    # user-service and api-gateway. notification-service has no metrics entry.
    metrics={
        "payment-service": [
            {"timestamp": "2026-04-06T02:50:00Z", "cpu_pct": 20, "mem_pct": 40, "latency_p50": 80, "latency_p99": 150, "error_rate": 0.001, "db_pool_active": 15, "db_pool_max": 50},
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 20, "mem_pct": 40, "latency_p50": 85, "latency_p99": 160, "error_rate": 0.002, "db_pool_active": 18, "db_pool_max": 50},
            {"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 22, "mem_pct": 41, "latency_p50": 3200, "latency_p99": 8500, "error_rate": 0.35, "db_pool_active": 50, "db_pool_max": 50},
            {"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 18, "mem_pct": 40, "latency_p50": 4500, "latency_p99": "timeout", "error_rate": 0.52, "db_pool_active": 50, "db_pool_max": 50},
        ],
        "user-service": [
            {"timestamp": "2026-04-06T02:50:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": 75, "error_rate": 0.001, "write_error_rate": 0.001},
            {"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 16, "mem_pct": 35, "latency_p50": 30, "latency_p99": 4100, "error_rate": 0.18, "write_error_rate": 0.60},
            {"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 15, "mem_pct": 35, "latency_p50": 28, "latency_p99": "timeout", "error_rate": 0.25, "write_error_rate": 0.75},
        ],
        "db-postgres": [
            {"timestamp": "2026-04-06T02:50:00Z", "cpu_pct": 35, "mem_pct": 60, "connections": 45, "active_locks": 3, "lock_wait_ms_p99": 5, "write_iops": 1200, "read_iops": 3500, "deadlocks": 0},
            {"timestamp": "2026-04-06T02:55:00Z", "cpu_pct": 55, "mem_pct": 62, "connections": 55, "active_locks": 8, "lock_wait_ms_p99": 15, "write_iops": 1200, "read_iops": 4200, "deadlocks": 0},
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 65, "mem_pct": 64, "connections": 70, "active_locks": 15, "lock_wait_ms_p99": 250, "write_iops": 800, "read_iops": 4000, "deadlocks": 0},
            {"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 78, "mem_pct": 65, "connections": 95, "active_locks": 28, "lock_wait_ms_p99": 8500, "write_iops": 200, "read_iops": 3800, "deadlocks": 4},
            {"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 80, "mem_pct": 66, "connections": 100, "active_locks": 32, "lock_wait_ms_p99": 12000, "write_iops": 50, "read_iops": 3600, "deadlocks": 12},
        ],
        "auth-service": [
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 22, "mem_pct": 58, "latency_p50": 42, "latency_p99": 110, "error_rate": 0.001},
            {"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 23, "mem_pct": 58, "latency_p50": 44, "latency_p99": 115, "error_rate": 0.001},
        ],
        "cache-redis": [
            {"timestamp": "2026-04-06T03:00:00Z", "mem_gb": 1.2, "mem_pct": 30, "hit_ratio": 0.82, "evictions_per_s": 8, "connections": 46},
            {"timestamp": "2026-04-06T03:05:00Z", "mem_gb": 1.3, "mem_pct": 32, "hit_ratio": 0.78, "evictions_per_s": 12, "connections": 46},
            {"timestamp": "2026-04-06T03:08:00Z", "mem_gb": 1.2, "mem_pct": 30, "hit_ratio": 0.88, "evictions_per_s": 2, "connections": 45},
        ],
        "api-gateway": [
            {"timestamp": "2026-04-06T03:00:00Z", "cpu_pct": 20, "mem_pct": 45, "latency_p50": 35, "latency_p99": 90, "error_rate": 0.002, "5xx_rate": 0.001},
            {"timestamp": "2026-04-06T03:05:00Z", "cpu_pct": 22, "mem_pct": 46, "latency_p50": 45, "latency_p99": 5500, "error_rate": 0.18, "5xx_rate": 0.15},
            {"timestamp": "2026-04-06T03:08:00Z", "cpu_pct": 23, "mem_pct": 46, "latency_p50": 50, "latency_p99": "timeout", "error_rate": 0.25, "5xx_rate": 0.22},
        ],
    },

    # Request traces rendered as a text tree: one header line followed by one
    # line per span, the last span marked with the closing connector.
    traces={
        "payment-service": [
            "Trace: POST /api/v2/pay (txn=pay_6691, total=8500ms) — TIMEOUT",
            " ├─ payment-service.validateRequest() 12ms",
            " ├─ payment-service.checkBalance() 45ms (SELECT -> db-postgres, fast)",
            " ├─ payment-service.insertTransaction() 8400ms (INSERT -> db-postgres, BLOCKED ON LOCK)",
            " └─ payment-service.sendConfirmation() never reached (timeout)",
        ],
        "user-service": [
            "Trace: PUT /api/v2/user/profile (uid=user_8832, total=4800ms) — TIMEOUT",
            " ├─ user-service.validateInput() 5ms",
            " ├─ user-service.updateProfile() 4780ms (UPDATE -> db-postgres, BLOCKED ON LOCK)",
            " └─ user-service.invalidateCache() never reached (timeout)",
        ],
    },

    # Each listed service has exactly one deploy entry, all marked stable.
    deploy_history={
        "payment-service": [
            "v3.8.1 deployed 2026-04-03T14:00:00Z status=stable (running 3 days)",
        ],
        "user-service": [
            "v4.2.1 deployed 2026-04-05T16:00:00Z status=stable (running 11 hours)",
        ],
        "db-postgres": [
            "v15.4 deployed 2026-03-15T08:00:00Z status=stable (running 22 days)",
        ],
    },

    # Runbook text per service, built from adjacent string literals joined by
    # explicit "\n" characters.
    runbooks={
        "payment-service": (
            "## payment-service Runbook\n"
            "- Transaction timeouts: Check db-postgres connection pool and lock status.\n"
            " If db connection pool is saturated but CPU/memory are normal, likely a DB-side issue.\n"
            "- High latency: Check downstream service health (db-postgres).\n"
            "- Crash on startup: Check recent deploys and rollback if needed."
        ),
        "db-postgres": (
            "## db-postgres Runbook\n"
            "- Deadlocks: Identify the blocking transaction using pg_stat_activity.\n"
            " Kill long-running queries or restart postgres to clear all locks.\n"
            "- Connection exhaustion: Check for connection leaks. Consider increasing max_connections\n"
            " or terminating idle connections.\n"
            "- High CPU: Check for expensive queries in pg_stat_statements. Consider adding indexes.\n"
            "- Replication lag: Check network connectivity to replicas and WAL sender status."
        ),
        "cache-redis": (
            "## cache-redis Runbook\n"
            "- Elevated miss ratio: Often caused by TTL expiry batches. Wait 5-10 minutes for cache\n"
            " to warm back up. If miss ratio doesn't recover, check maxmemory and eviction policy.\n"
            "- Memory pressure: Check for memory leaks. Scale up replicas or increase maxmemory.\n"
            "- Connection issues: Check network connectivity and client pool configuration."
        ),
    },

    # "current" and "previous" are identical for both services, matching the
    # "No changes" diff text.
    configs={
        "db-postgres": {
            "current": "max_connections=100\nshared_buffers=4GB\nwork_mem=256MB\nlock_timeout=30s\ndeadlock_timeout=1s",
            "previous": "max_connections=100\nshared_buffers=4GB\nwork_mem=256MB\nlock_timeout=30s\ndeadlock_timeout=1s",
            "diff": "No changes — config has not been modified recently.",
        },
        "payment-service": {
            "current": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nCIRCUIT_BREAKER_THRESHOLD=10",
            "previous": "DB_POOL_SIZE=50\nDB_TIMEOUT=5000\nRETRY_COUNT=3\nCIRCUIT_BREAKER_THRESHOLD=10",
            "diff": "No changes — config has not been modified recently.",
        },
    },

    # Repeats the deps lists declared on each ServiceConfig above; keep the
    # two in sync when editing the topology.
    dependencies={
        "api-gateway": ["auth-service", "user-service", "payment-service"],
        "auth-service": ["cache-redis"],
        "user-service": ["db-postgres"],
        "payment-service": ["db-postgres"],
        "db-postgres": [],
        "cache-redis": [],
        "notification-service": ["auth-service"],
    },

    # Expected answer for this task.
    root_cause_services=["db-postgres"],
    root_cause_categories=[RootCauseCategory.DB_DEADLOCK],
    required_fixes=[
        RequiredFix(action="restart_service", service="db-postgres"),
    ],
    diagnosis_keywords=["db-postgres", "deadlock", "lock", "analytics-cron", "long-running", "transaction", "blocking"],

    # NOTE(review): the entries other than wrong_penalty sum to 0.90, not 1.0;
    # confirm against the grader whether that is intended.
    weights={
        "correct_service": 0.25,
        "correct_category": 0.20,
        "correct_fix": 0.25,
        "secondary_fix": 0.00,
        "diagnosis_text": 0.10,
        "investigation": 0.10,
        "wrong_penalty": 0.05,
    },
)
|
| 298 |
+
|