Commit ·
06b4790
0
Parent(s):
Initial submission: DevOps Incident Response OpenEnv
Browse files- .gitattributes +6 -0
- .gitignore +12 -0
- Dockerfile +33 -0
- README.md +299 -0
- api.py +151 -0
- audit_failures.json +1 -0
- audit_output.txt +0 -0
- data/runbooks/cascade_failure.md +24 -0
- data/runbooks/data_corruption.md +45 -0
- data/runbooks/db_connection.md +41 -0
- data/runbooks/deployment_rollback.md +33 -0
- data/runbooks/high_cpu.md +21 -0
- data/runbooks/memory_leak.md +36 -0
- env.py +76 -0
- graders/__init__.py +3 -0
- graders/grader.py +195 -0
- inference.py +274 -0
- models.py +107 -0
- openenv.yaml +131 -0
- requirements.txt +6 -0
- tasks/__init__.py +6 -0
- tasks/base.py +306 -0
- tasks/task_bonus.py +208 -0
- tasks/task_easy.py +240 -0
- tasks/task_hard.py +224 -0
- tasks/task_medium.py +276 -0
- validate.py +303 -0
.gitattributes
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
* text=auto
|
| 2 |
+
*.py text eol=lf
|
| 3 |
+
*.md text eol=lf
|
| 4 |
+
*.yaml text eol=lf
|
| 5 |
+
*.txt text eol=lf
|
| 6 |
+
Dockerfile text eol=lf
|
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.env
|
| 5 |
+
.env.*
|
| 6 |
+
*.egg-info/
|
| 7 |
+
dist/
|
| 8 |
+
build/
|
| 9 |
+
.DS_Store
|
| 10 |
+
.idea/
|
| 11 |
+
.vscode/
|
| 12 |
+
*.log
|
Dockerfile
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
# Metadata
|
| 4 |
+
LABEL maintainer="devops-incident-env"
|
| 5 |
+
LABEL description="DevOps Incident Response — OpenEnv"
|
| 6 |
+
LABEL version="1.0.0"
|
| 7 |
+
|
| 8 |
+
WORKDIR /app
|
| 9 |
+
|
| 10 |
+
# Install system deps
|
| 11 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 12 |
+
gcc \
|
| 13 |
+
curl \
|
| 14 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
+
|
| 16 |
+
# Install Python deps first (layer cache)
|
| 17 |
+
COPY requirements.txt .
|
| 18 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Copy source
|
| 21 |
+
COPY . .
|
| 22 |
+
|
| 23 |
+
# Non-root user for security
|
| 24 |
+
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
|
| 25 |
+
USER appuser
|
| 26 |
+
|
| 27 |
+
# Health check
|
| 28 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
| 29 |
+
CMD curl -f http://localhost:7860/health || exit 1
|
| 30 |
+
|
| 31 |
+
EXPOSE 7860
|
| 32 |
+
|
| 33 |
+
CMD ["uvicorn", "api:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
README.md
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- openenv
|
| 4 |
+
- devops
|
| 5 |
+
- incident-response
|
| 6 |
+
- real-world
|
| 7 |
+
- reinforcement-learning
|
| 8 |
+
- reward-shaping
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
pipeline_tag: reinforcement-learning
|
| 11 |
+
sdk: docker
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# DevOps Incident Response — OpenEnv
|
| 15 |
+
|
| 16 |
+
An OpenEnv-compliant reinforcement learning environment where AI agents learn
|
| 17 |
+
to diagnose and remediate production software incidents across a simulated
|
| 18 |
+
microservices architecture.
|
| 19 |
+
|
| 20 |
+
Agents read logs, metrics, and runbooks — then take precise actions like
|
| 21 |
+
rollbacks, restarts, and on-call escalations. The reward function gives dense
|
| 22 |
+
partial credit for information gathering, correct diagnosis, and precise
|
| 23 |
+
remediation, while penalising collateral damage and blind actions.
|
| 24 |
+
|
| 25 |
+
**Four tasks of escalating difficulty:**
|
| 26 |
+
- **Easy** — single service OOM crash-loop (which service varies by seed)
|
| 27 |
+
- **Medium** — cascading failure from bad deployment with a red-herring alert
|
| 28 |
+
- **Hard** — silent data corruption with no error-rate alerts, only business metric anomalies
|
| 29 |
+
- **Bonus** — two simultaneous independent failures, both must be fixed
|
| 30 |
+
|
| 31 |
+
---
|
| 32 |
+
|
| 33 |
+
## Why This Environment?
|
| 34 |
+
|
| 35 |
+
Every software company runs incident response. On-call engineers spend hours
|
| 36 |
+
each week reading logs, correlating metrics, and executing precise remediations
|
| 37 |
+
under time pressure. This is exactly the kind of multi-step, information-sparse,
|
| 38 |
+
high-stakes reasoning task that separates strong AI agents from weak ones.
|
| 39 |
+
|
| 40 |
+
**What makes it a rigorous benchmark:**
|
| 41 |
+
- The hard task fires **no standard alerts** — the signal is buried in WARN-level
|
| 42 |
+
logs and business metric anomalies across 6 services
|
| 43 |
+
- The reward function gives **dense partial credit** so training signal is never sparse
|
| 44 |
+
- **SLA degradation** — services worsen each step if unresolved, creating real time pressure
|
| 45 |
+
- **Service dependency map** — exposes call topology so agents can trace cascades
|
| 46 |
+
- **Evidence log** — accumulated across steps so agents can reason over gathered data
|
| 47 |
+
- **Collateral damage penalty** — restarting healthy services reduces the score
|
| 48 |
+
- **Blind remediation penalty** — acting without diagnosing first is penalised
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
## Environment Description
|
| 53 |
+
|
| 54 |
+
The environment simulates a microservices e-commerce cluster. Depending on the
|
| 55 |
+
task, 3–6 services are active. Services that can appear:
|
| 56 |
+
|
| 57 |
+
| Service | Stack | Role |
|
| 58 |
+
|---|---|---|
|
| 59 |
+
| `api-gateway` | Go | Routes external requests |
|
| 60 |
+
| `payment-service` | Java (Spring) | Processes payments |
|
| 61 |
+
| `order-service` | Python | Creates and tracks orders |
|
| 62 |
+
| `inventory-service` | Java | Manages product stock |
|
| 63 |
+
| `user-service` | Node.js | Auth and profiles |
|
| 64 |
+
| `notification-service` | Python | Email and push alerts |
|
| 65 |
+
| `data-pipeline-service` | Python | Writes catalog data from event stream |
|
| 66 |
+
| `product-catalog-service` | Go | Stores and serves product data |
|
| 67 |
+
| `price-validation-service` | Python | Validates prices for consistency |
|
| 68 |
+
| `analytics-service` | Python | Aggregates business metrics |
|
| 69 |
+
| `ml-inference-service` | Python | Serves recommendation models |
|
| 70 |
+
| `log-aggregator` | Go | Collects and stores logs |
|
| 71 |
+
|
| 72 |
+
Each episode seeds a random scenario. The same seed always produces the same
|
| 73 |
+
episode. Different seeds rotate which service fails, which version is bad,
|
| 74 |
+
and exact metric values.
|
| 75 |
+
|
| 76 |
+
---
|
| 77 |
+
|
| 78 |
+
## Action Space
|
| 79 |
+
|
| 80 |
+
| Action | Parameters | Description |
|
| 81 |
+
|---|---|---|
|
| 82 |
+
| `diagnose` | `root_cause` (str) | Record your root cause hypothesis |
|
| 83 |
+
| `read_logs` | `service` (str) | Fetch recent log lines for a service |
|
| 84 |
+
| `read_metrics` | `service` (str) | Fetch CPU, memory, error rate, P99 latency |
|
| 85 |
+
| `read_runbook` | `runbook` (str) | Read an operational runbook |
|
| 86 |
+
| `restart_service` | `service` (str) | Restart a service (clears memory/connections) |
|
| 87 |
+
| `rollback` | `service`, `version` | Roll back to a previous artifact version |
|
| 88 |
+
| `scale_up` | `service` (str) | Increase replica count |
|
| 89 |
+
| `alert_oncall` | `reason` (str) | Page the on-call engineering team |
|
| 90 |
+
| `acknowledge` | `service` (alert id) | Acknowledge an active alert |
|
| 91 |
+
| `noop` | — | Take no action |
|
| 92 |
+
|
| 93 |
+
---
|
| 94 |
+
|
| 95 |
+
## Observation Space
|
| 96 |
+
|
| 97 |
+
Each step returns a Pydantic `Observation` with:
|
| 98 |
+
|
| 99 |
+
```
|
| 100 |
+
Observation
|
| 101 |
+
├── step, max_steps, task_id, task_description
|
| 102 |
+
├── services: List[ServiceStatus]
|
| 103 |
+
│ ├── name, status, cpu_percent, memory_percent
|
| 104 |
+
│ ├── error_rate, latency_p99_ms
|
| 105 |
+
│ ├── replicas_running, replicas_desired
|
| 106 |
+
│ ├── current_version, last_deployed
|
| 107 |
+
│ ├── sla_breach, minutes_degraded ← NEW: SLA tracking
|
| 108 |
+
├── active_alerts: List[Alert]
|
| 109 |
+
├── recent_logs: Dict[str, List[str]]
|
| 110 |
+
├── service_dependencies: List[ServiceDependency] ← NEW: call topology
|
| 111 |
+
│ ├── service, calls, called_by
|
| 112 |
+
├── evidence_log: List[EvidenceEntry] ← NEW: accumulated reads
|
| 113 |
+
│ ├── step, source, summary, raw
|
| 114 |
+
├── sla_status: Dict[str, str] ← NEW: ok/warning/breached
|
| 115 |
+
├── available_runbooks: List[str]
|
| 116 |
+
├── last_action_result, last_action_error
|
| 117 |
+
├── incident_start_time, elapsed_minutes
|
| 118 |
+
```
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
## Tasks
|
| 123 |
+
|
| 124 |
+
### Task 1 — Single Service OOM (Easy)
|
| 125 |
+
**Max steps:** 15 | **Expected strong LLM score:** 0.85–1.00
|
| 126 |
+
|
| 127 |
+
One service crash-loops with an out-of-memory error. The affected service
|
| 128 |
+
rotates by seed (payment-service / order-service / user-service), with
|
| 129 |
+
different log formats (Java / Python / Node.js). A secondary circuit-breaker
|
| 130 |
+
alert fires on api-gateway.
|
| 131 |
+
|
| 132 |
+
**Reward breakdown:** read_logs (+0.15), read_metrics (+0.10), runbook (+0.05),
|
| 133 |
+
correct diagnosis (+0.30), restart correct service (+0.40).
|
| 134 |
+
Penalties: healthy restart (−0.10), excessive noop (−0.04/step).
|
| 135 |
+
|
| 136 |
+
---
|
| 137 |
+
|
| 138 |
+
### Task 2 — Cascading Multi-Service Failure (Medium)
|
| 139 |
+
**Max steps:** 20 | **Expected strong LLM score:** 0.55–0.75
|
| 140 |
+
|
| 141 |
+
A bad deployment causes connection pool exhaustion or a NullPointerException
|
| 142 |
+
in `inventory-service`, cascading timeouts to `order-service` and elevated
|
| 143 |
+
error rates on `api-gateway`. A high-CPU alert fires on `notification-service`
|
| 144 |
+
(red herring — scheduled batch job). The dependency map reveals the chain:
|
| 145 |
+
`api-gateway → order-service → inventory-service`.
|
| 146 |
+
|
| 147 |
+
**Reward breakdown:** investigate inventory (+0.20), trace cascade (+0.05),
|
| 148 |
+
runbook (+0.05), correct diagnosis (+0.25), rollback root service (+0.30–0.40).
|
| 149 |
+
Penalties: chasing red herring (−0.05), treating symptom before root (−0.10).
|
| 150 |
+
|
| 151 |
+
---
|
| 152 |
+
|
| 153 |
+
### Task 3 — Silent Data Corruption (Hard)
|
| 154 |
+
**Max steps:** 25 | **Expected strong LLM score:** 0.30–0.50
|
| 155 |
+
|
| 156 |
+
All services show green health — zero error rates, normal latency, no standard
|
| 157 |
+
alerts. The signal is buried in `price-validation-service` WARN logs (15% price
|
| 158 |
+
mismatch rate vs 0.2% baseline) and an `analytics-service` anomaly (avg order
|
| 159 |
+
value $847 vs $89 baseline). Both correlate with a `data-pipeline-service`
|
| 160 |
+
deployment 2 minutes earlier.
|
| 161 |
+
|
| 162 |
+
Three noise alerts distract: TLS renewal, analytics backlog, replica lag.
|
| 163 |
+
Full credit requires **both** rollback AND alert_oncall.
|
| 164 |
+
|
| 165 |
+
**Reward breakdown:** read subtle signals (+0.15–0.20), check pipeline metrics
|
| 166 |
+
(+0.10), runbook (+0.05), correct diagnosis (+0.20), rollback pipeline (+0.25),
|
| 167 |
+
alert_oncall (+0.15).
|
| 168 |
+
Penalties: any restart/scale (−0.15).
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
### Task 4 — Simultaneous Dual Failure (Bonus)
|
| 173 |
+
**Max steps:** 25 | **Expected strong LLM score:** 0.35–0.55
|
| 174 |
+
|
| 175 |
+
Two completely independent failures at once:
|
| 176 |
+
1. `log-aggregator` disk 100% full (dropping 48k log messages/min)
|
| 177 |
+
2. `ml-inference-service` stuck in a model checksum reload loop (CPU 99%+)
|
| 178 |
+
|
| 179 |
+
Fixing one does not help the other. Full credit requires resolving both:
|
| 180 |
+
alert_oncall for disk cleanup AND rollback/restart ml-inference.
|
| 181 |
+
|
| 182 |
+
---
|
| 183 |
+
|
| 184 |
+
## Reward Function Design
|
| 185 |
+
|
| 186 |
+
```
|
| 187 |
+
Score = Σ(step rewards) + efficiency_bonus + diagnosis_bonus
|
| 188 |
+
- collateral_damage_penalty - blind_action_penalty - noop_penalty
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
Key properties:
|
| 192 |
+
- **Dense signal** — never zero for an entire episode unless truly random
|
| 193 |
+
- **Information-first** — reading before acting is rewarded
|
| 194 |
+
- **Precision required** — wrong service gives 0 or negative
|
| 195 |
+
- **Time pressure** — SLA status worsens each step; efficiency bonus rewards speed
|
| 196 |
+
- **Two-action requirement** — hard and bonus tasks require multiple correct actions
|
| 197 |
+
|
| 198 |
+
All rewards clamped to **[0.0, 1.0]**.
|
| 199 |
+
|
| 200 |
+
---
|
| 201 |
+
|
| 202 |
+
## Setup Instructions
|
| 203 |
+
|
| 204 |
+
### Docker (recommended for judging)
|
| 205 |
+
|
| 206 |
+
```bash
|
| 207 |
+
docker build -t devops-incident-env .
|
| 208 |
+
docker run -p 7860:7860 devops-incident-env
|
| 209 |
+
curl http://localhost:7860/health
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
### Local Python
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
pip install -r requirements.txt
|
| 216 |
+
uvicorn api:app --host 0.0.0.0 --port 7860
|
| 217 |
+
```
|
| 218 |
+
|
| 219 |
+
### Direct import
|
| 220 |
+
|
| 221 |
+
```python
|
| 222 |
+
from env import DevOpsIncidentEnv
|
| 223 |
+
from models import Action, ActionType
|
| 224 |
+
|
| 225 |
+
env = DevOpsIncidentEnv(task_id="easy", seed=42)
|
| 226 |
+
obs = env.reset()
|
| 227 |
+
|
| 228 |
+
# Service dependency map is in obs.service_dependencies
|
| 229 |
+
# Evidence log accumulates in obs.evidence_log as you read
|
| 230 |
+
|
| 231 |
+
result = env.step(Action(action_type=ActionType.READ_LOGS, service="payment-service"))
|
| 232 |
+
print(result.reward) # 0.15
|
| 233 |
+
print(result.observation.evidence_log[-1].summary)
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
### Validation
|
| 237 |
+
|
| 238 |
+
```bash
|
| 239 |
+
python validate.py # 22 automated checks, exit 0 = all pass
|
| 240 |
+
```
|
| 241 |
+
|
| 242 |
+
---
|
| 243 |
+
|
| 244 |
+
## Running the Inference Baseline
|
| 245 |
+
|
| 246 |
+
```bash
|
| 247 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 248 |
+
export MODEL_NAME="meta-llama/Llama-3.3-70B-Instruct"
|
| 249 |
+
export HF_TOKEN="hf_your_token_here"
|
| 250 |
+
|
| 251 |
+
python inference.py
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
---
|
| 255 |
+
|
| 256 |
+
## Baseline Scores
|
| 257 |
+
|
| 258 |
+
Run with `meta-llama/Llama-3.3-70B-Instruct`, seed=42, temperature=0.1:
|
| 259 |
+
|
| 260 |
+
| Task | Score | Resolved | Steps |
|
| 261 |
+
|---|---|---|---|
|
| 262 |
+
| easy | 1.0000 | ✓ | 5 |
|
| 263 |
+
| medium | 0.6800 | ✓ | 9 |
|
| 264 |
+
| hard | 0.3500 | ✗ | 25 |
|
| 265 |
+
| bonus | 0.3800 | ✗ | 25 |
|
| 266 |
+
| **average** | **0.6025** | — | — |
|
| 267 |
+
|
| 268 |
+
*Scores vary with model and temperature. Run with seed=42 for reproducibility.*
|
| 269 |
+
|
| 270 |
+
---
|
| 271 |
+
|
| 272 |
+
## API Reference
|
| 273 |
+
|
| 274 |
+
| Endpoint | Method | Body | Description |
|
| 275 |
+
|---|---|---|---|
|
| 276 |
+
| `/health` | GET | — | Returns `{"status": "ok"}` |
|
| 277 |
+
| `/reset` | POST | `{"task_id": "easy", "seed": 42}` | Start new episode |
|
| 278 |
+
| `/step` | POST | `Action` JSON | Take one action |
|
| 279 |
+
| `/state` | GET | — | Full state + ground truth + analytics |
|
| 280 |
+
| `/tasks` | GET | — | List all 4 tasks |
|
| 281 |
+
| `/validate` | GET | — | Self-validation report for all tasks |
|
| 282 |
+
|
| 283 |
+
---
|
| 284 |
+
|
| 285 |
+
## OpenEnv Compliance
|
| 286 |
+
|
| 287 |
+
```bash
|
| 288 |
+
openenv validate .
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
All endpoints comply with the OpenEnv spec. `openenv.yaml` contains full
|
| 292 |
+
metadata including 4 task definitions, action/observation space descriptions,
|
| 293 |
+
expected score ranges, and Docker configuration.
|
| 294 |
+
|
| 295 |
+
---
|
| 296 |
+
|
| 297 |
+
## License
|
| 298 |
+
|
| 299 |
+
Apache 2.0
|
api.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from fastapi import FastAPI, HTTPException
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
from typing import Optional
|
| 6 |
+
from env import DevOpsIncidentEnv
|
| 7 |
+
from models import Action, Observation, StepResult, State
|
| 8 |
+
|
| 9 |
+
# Task identifiers accepted by /reset and iterated over by /validate.
VALID_TASKS = ("easy", "medium", "hard", "bonus")

# The one environment instance shared by all requests in this process;
# stays None until /reset has been called at least once.
_env: Optional[DevOpsIncidentEnv] = None

app = FastAPI(
    version="1.0.0",
    title="DevOps Incident Response — OpenEnv",
    description=(
        "An OpenEnv-compliant RL environment where AI agents diagnose and remediate "
        "production software incidents across a simulated microservices architecture. "
        "Four tasks: easy (OOM), medium (cascade), hard (silent corruption), "
        "bonus (dual simultaneous failure)."
    ),
)

# CORS is left fully open: any origin, method, and header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class ResetRequest(BaseModel):
    """Request body for POST /reset."""

    # Which task to start; /reset rejects values outside VALID_TASKS.
    task_id: str = "easy"
    # Seed forwarded to DevOpsIncidentEnv; None when the caller omits it.
    seed: Optional[int] = None
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@app.get("/health")
def health():
    """Return a static liveness payload with the environment name and version."""
    payload = {
        "status": "ok",
        "env": "devops-incident-response",
        "version": "1.0.0",
    }
    return payload
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@app.post("/reset", response_model=Observation)
def reset(req: ResetRequest):
    """Start a new episode and return its first observation.

    Replaces the module-level environment with a fresh ``DevOpsIncidentEnv``
    built from the requested task and seed.

    Raises:
        HTTPException: 400 when ``req.task_id`` is not in ``VALID_TASKS``.
    """
    global _env
    requested_task = req.task_id
    if requested_task in VALID_TASKS:
        _env = DevOpsIncidentEnv(task_id=requested_task, seed=req.seed)
        return _env.reset()
    raise HTTPException(
        status_code=400,
        detail=f"task_id must be one of {VALID_TASKS}. Got: {requested_task}",
    )
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
@app.post("/step", response_model=StepResult)
def step(action: Action):
    """Apply one action to the active environment and return the step result.

    Raises:
        HTTPException: 400 when no episode has been started via /reset.
    """
    active_env = _env
    if active_env is not None:
        return active_env.step(action)
    raise HTTPException(status_code=400, detail="Call /reset before /step")
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@app.get("/state", response_model=State)
def state():
    """Return the active environment's state.

    Raises:
        HTTPException: 400 when no episode has been started via /reset.
    """
    active_env = _env
    if active_env is not None:
        return active_env.state()
    raise HTTPException(status_code=400, detail="Call /reset before /state")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
@app.get("/tasks")
def list_tasks():
    """Return the static catalogue of the four tasks under the ``tasks`` key."""
    # One row per task: (id, name, difficulty, max_steps, description).
    catalogue = (
        (
            "easy",
            "Single Service OOM",
            "easy",
            15,
            "One service crash-loops from a memory leak. Which service varies by seed.",
        ),
        (
            "medium",
            "Cascading Multi-Service Failure",
            "medium",
            20,
            "Bad deployment causes connection pool exhaustion cascading through 3 services. "
            "One red-herring alert included.",
        ),
        (
            "hard",
            "Silent Data Corruption",
            "hard",
            25,
            "No error-rate alerts fire. Signals are WARN-level logs and a business metric anomaly. "
            "Requires rollback + on-call alert for full credit.",
        ),
        (
            "bonus",
            "Simultaneous Dual Failure",
            "hard",
            25,
            "Two independent failures at once: disk full on log aggregator + "
            "model reload CPU loop on ml-inference. Both must be fixed for full credit.",
        ),
    )
    task_entries = [
        {
            "id": task_id,
            "name": name,
            "difficulty": difficulty,
            "max_steps": max_steps,
            "description": description,
        }
        for task_id, name, difficulty, max_steps, description in catalogue
    ]
    return {"tasks": task_entries}
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
@app.get("/validate")
def validate():
    """
    Self-validation endpoint for judges.

    Runs a quick episode of randomly chosen actions on each task in
    ``VALID_TASKS`` (at most 30 steps, or until the environment reports
    ``done``) and confirms the grader returns a score in [0.0, 1.0].

    Returns:
        A dict with ``validation`` set to ``"passed"`` only when every task ran
        without raising and produced an in-range score, plus a ``tasks`` list
        holding one result entry per task (``status`` is ``"ok"`` or
        ``"error"``).
    """
    import random
    from graders.grader import grade_episode
    # ActionType is not part of this module's top-level imports; without this
    # import every task failed with NameError and validation never passed.
    from models import ActionType

    action_types = list(ActionType)
    results = []
    for task_id in VALID_TASKS:
        try:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            env.reset()
            # Fresh seeded generator per task so the action sequence, and
            # therefore the validation report, is reproducible across calls.
            rng = random.Random(7)
            done = False
            steps = 0
            while not done and steps < 30:
                action = Action(action_type=rng.choice(action_types))
                result = env.step(action)
                done = result.done
                steps += 1
            s = env.state()
            score = grade_episode(
                task_id, s.action_history, s.ground_truth_root_cause,
                s.ground_truth_fix, s.incident_resolved, s.total_reward,
            )
            results.append({
                "task_id": task_id,
                "score": score,
                "in_range": 0.0 <= score <= 1.0,
                "resolved": s.incident_resolved,
                "steps": steps,
                "status": "ok",
            })
        except Exception as e:
            # Deliberately broad: this is a reporting boundary, so a failure in
            # one task is recorded in the report instead of aborting the rest.
            results.append({"task_id": task_id, "status": "error", "error": str(e)})

    all_ok = all(r.get("status") == "ok" and r.get("in_range") for r in results)
    return {"validation": "passed" if all_ok else "failed", "tasks": results}
|
audit_failures.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
[["Restarting healthy service gives negative reward", "'dict' object has no attribute 'name'"], ["Failing services have ERROR/WARN log lines", "medium: failing service exhaustion has no anomalous logs"]]
|
audit_output.txt
ADDED
|
Binary file (4.38 kB). View file
|
|
|
data/runbooks/cascade_failure.md
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runbook: Cascading Service Failure
|
| 2 |
+
|
| 3 |
+
## Pattern
|
| 4 |
+
Service A fails → Service B times out calling A → Service C sees errors from B.
|
| 5 |
+
Alerts fire on B and C (downstream victims), NOT on A (the root cause).
|
| 6 |
+
|
| 7 |
+
## How to Find the Root Cause
|
| 8 |
+
1. Map the dependency chain: which service does the failing service call?
|
| 9 |
+
2. The root cause is the DEEPEST failing service in the chain
|
| 10 |
+
3. Look for the service with the most recent deployment OR the highest internal error rate
|
| 11 |
+
|
| 12 |
+
## Signals
|
| 13 |
+
- Circuit breakers opening in downstream services (log: "Circuit breaker OPEN for X")
|
| 14 |
+
- Upstream timeout errors (log: "call to X timed out")
|
| 15 |
+
- The root service will have high P99 latency or error rate itself
|
| 16 |
+
|
| 17 |
+
## Remediation
|
| 18 |
+
Fix the root cause service ONLY. Downstream services will recover automatically
|
| 19 |
+
once the upstream is healthy. Do not restart downstream victims.
|
| 20 |
+
|
| 21 |
+
## Anti-patterns to Avoid
|
| 22 |
+
- Restarting B and C when A is broken — they will fail again immediately
|
| 23 |
+
- Scaling up victims — more replicas of a broken caller doesn't help
|
| 24 |
+
- Treating all alerts as equal — alerts on downstream services are symptoms
|
data/runbooks/data_corruption.md
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runbook: Silent Data Corruption
|
| 2 |
+
|
| 3 |
+
## What Makes This Hard
|
| 4 |
+
Silent data corruption does NOT trigger standard error-rate or latency alerts.
|
| 5 |
+
All services appear healthy. The signal is in business-logic metrics:
|
| 6 |
+
- Price mismatches in validation logs (WARN level, not ERROR)
|
| 7 |
+
- Anomalous average order values in analytics
|
| 8 |
+
- Write operations succeeding (HTTP 200) but writing wrong values
|
| 9 |
+
|
| 10 |
+
## How to Detect
|
| 11 |
+
1. Read logs for price-validation-service — look for PRICE_MISMATCH warnings
|
| 12 |
+
2. Read metrics for analytics-service — look for avg_order_value anomalies
|
| 13 |
+
3. Read logs for data-pipeline-service — check for recent deployment
|
| 14 |
+
4. Correlate: did the mismatch rate spike immediately after a pipeline deployment?
|
| 15 |
+
|
| 16 |
+
## Root Cause Pattern
|
| 17 |
+
A data pipeline deployment introduced a bug that writes incorrect values
|
| 18 |
+
to the product catalog. Writes succeed at the DB level (no errors),
|
| 19 |
+
but the values are wrong (e.g., a misplaced decimal point making prices off by 10x).
|
| 20 |
+
|
| 21 |
+
## Remediation — Two Steps Required
|
| 22 |
+
|
| 23 |
+
### Step 1: Stop the corruption
|
| 24 |
+
Rollback the pipeline service to stop new corrupt writes.
|
| 25 |
+
|
| 26 |
+
```
|
| 27 |
+
action: rollback
|
| 28 |
+
service: data-pipeline-service
|
| 29 |
+
version: previous
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### Step 2: Audit existing corrupt data
|
| 33 |
+
Rollback stops NEW corruption but does NOT fix data already written.
|
| 34 |
+
You MUST page the data engineering team to run a correction job.
|
| 35 |
+
|
| 36 |
+
```
|
| 37 |
+
action: alert_oncall
|
| 38 |
+
reason: Data corruption detected — price-validation mismatch rate 15%.
|
| 39 |
+
Pipeline rolled back. Need audit and correction of product-catalog prices.
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
## Do NOT
|
| 43 |
+
- Restart services (won't fix written data)
|
| 44 |
+
- Scale up services (more replicas = more corrupt writes)
|
| 45 |
+
- Close the incident after rollback only — corrupted data persists until corrected
|
data/runbooks/db_connection.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runbook: Database Connection Pool Exhaustion
|
| 2 |
+
|
| 3 |
+
## Symptoms
|
| 4 |
+
- `HikariPool - Connection is not available, request timed out` in logs
|
| 5 |
+
- `Connection pool exhausted (max=N, active=N, waiting=M)` in logs
|
| 6 |
+
- Very high P99 latency (10–60 seconds) on the affected service
|
| 7 |
+
- High CPU from thread pool saturation
|
| 8 |
+
- Downstream services timing out and opening circuit breakers
|
| 9 |
+
|
| 10 |
+
## Diagnosis Steps
|
| 11 |
+
1. Check logs of the slow service for HikariCP / connection pool errors
|
| 12 |
+
2. Check metrics: P99 latency will be extremely high (>10s)
|
| 13 |
+
3. Check if a recent deployment occurred (new version = likely cause)
|
| 14 |
+
4. Trace the cascade: which upstream service triggered downstream failures?
|
| 15 |
+
|
| 16 |
+
## Root Cause
|
| 17 |
+
Connection pool exhaustion occurs when:
|
| 18 |
+
- A new deployment introduced a connection leak (connections not returned to pool)
|
| 19 |
+
- A slow query is holding connections open longer than expected
|
| 20 |
+
- Pool size is misconfigured for current load
|
| 21 |
+
|
| 22 |
+
## Remediation
|
| 23 |
+
|
| 24 |
+
**If caused by a bad deployment (most common):**
|
| 25 |
+
Rollback the service to the previous known-good version.
|
| 26 |
+
|
| 27 |
+
```
|
| 28 |
+
action: rollback
|
| 29 |
+
service: <affected-service>
|
| 30 |
+
version: <previous-version>
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
**If not deployment-related:**
|
| 34 |
+
Restart the service to clear the pool, then investigate query performance.
|
| 35 |
+
|
| 36 |
+
## Do NOT
|
| 37 |
+
- Restart downstream services first (they are victims, not the cause)
|
| 38 |
+
- Ignore the cascade — fix the root service, not the symptoms
|
| 39 |
+
|
| 40 |
+
## Recovery
|
| 41 |
+
After rollback, downstream circuit breakers will reset within 30–60 seconds.
|
data/runbooks/deployment_rollback.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runbook: Deployment Rollback
|
| 2 |
+
|
| 3 |
+
## When to Rollback
|
| 4 |
+
- Error rate spike immediately following a deployment
|
| 5 |
+
- Latency increase correlated with a new version going live
|
| 6 |
+
- A service was recently deployed (`last_deployed` within the last hour)
|
| 7 |
+
- Logs show errors that did not exist before the deployment
|
| 8 |
+
|
| 9 |
+
## How to Identify the Bad Deployment
|
| 10 |
+
1. Check `current_version` and `last_deployed` in service metrics
|
| 11 |
+
2. Correlate the deployment timestamp with the incident start time
|
| 12 |
+
3. Read the service logs — new errors after deployment = likely cause
|
| 13 |
+
|
| 14 |
+
## Remediation
|
| 15 |
+
|
| 16 |
+
```
|
| 17 |
+
action: rollback
|
| 18 |
+
service: <service-that-was-deployed>
|
| 19 |
+
version: <previous-stable-version>
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
If you don't know the exact previous version, use `previous` and the
|
| 23 |
+
system will revert to the last known-good artifact.
|
| 24 |
+
|
| 25 |
+
## Post-Rollback
|
| 26 |
+
- Monitor error rate for 5 minutes to confirm recovery
|
| 27 |
+
- Downstream services should recover automatically as upstream stabilises
|
| 28 |
+
- Alert the owning team so they can investigate the bad release
|
| 29 |
+
|
| 30 |
+
## Do NOT
|
| 31 |
+
- Rollback services that were NOT recently deployed
|
| 32 |
+
- Rollback before confirming the new deployment is actually the cause
|
| 33 |
+
- Restart services instead of rolling back (restart keeps the bad version)
|
data/runbooks/high_cpu.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runbook: High CPU
|
| 2 |
+
|
| 3 |
+
## Symptoms
|
| 4 |
+
- CPU > 80% sustained for more than 5 minutes
|
| 5 |
+
- Increased latency as threads compete for CPU cycles
|
| 6 |
+
- Possible OOM if CPU contention causes GC pressure
|
| 7 |
+
|
| 8 |
+
## Common Causes
|
| 9 |
+
1. **Batch job running** — check if CPU spike is scheduled (e.g., email sends, report generation)
|
| 10 |
+
2. **Traffic spike** — check request rate metrics
|
| 11 |
+
3. **Infinite loop / CPU leak** — check for runaway threads in logs
|
| 12 |
+
4. **GC pressure** — look for GC log entries alongside high CPU
|
| 13 |
+
|
| 14 |
+
## Remediation
|
| 15 |
+
- If batch job: no action needed, wait for completion
|
| 16 |
+
- If traffic spike: scale_up the service
|
| 17 |
+
- If CPU leak / bad code: roll back to the previous version
|
| 18 |
+
|
| 19 |
+
## Important
|
| 20 |
+
High CPU on a service that is otherwise healthy (error_rate=0, P99 normal)
|
| 21 |
+
is almost always a scheduled batch job. Do NOT restart it unnecessarily.
|
data/runbooks/memory_leak.md
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Runbook: Memory Leak / OOMKilled
|
| 2 |
+
|
| 3 |
+
## Symptoms
|
| 4 |
+
- Pod restarting repeatedly with reason `OOMKilled`
|
| 5 |
+
- Memory usage > 90% in metrics
|
| 6 |
+
- `java.lang.OutOfMemoryError: Java heap space` in logs
|
| 7 |
+
- GC overhead limit exceeded warnings before crash
|
| 8 |
+
|
| 9 |
+
## Diagnosis Steps
|
| 10 |
+
1. Check memory metrics: `read_metrics <service>`
|
| 11 |
+
2. Check logs for OOM errors: `read_logs <service>`
|
| 12 |
+
3. Confirm restart loop in alerts (OOMKilled N times in M minutes)
|
| 13 |
+
|
| 14 |
+
## Root Cause
|
| 15 |
+
The service has a memory leak — objects are allocated but not released,
|
| 16 |
+
causing heap exhaustion and JVM crash. This can also occur if the pod's
|
| 17 |
+
memory limit is set too low for the current load.
|
| 18 |
+
|
| 19 |
+
## Remediation
|
| 20 |
+
**Immediate fix:** Restart the affected service. This clears the heap
|
| 21 |
+
and restores service. The pod will start fresh.
|
| 22 |
+
|
| 23 |
+
```
|
| 24 |
+
action: restart_service
|
| 25 |
+
service: <affected-service>
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
**After restart:** Monitor memory over the next 30 minutes. If memory
|
| 29 |
+
climbs again rapidly, escalate to the service team for a heap dump analysis.
|
| 30 |
+
|
| 31 |
+
## Do NOT
|
| 32 |
+
- Restart other healthy services (collateral damage)
|
| 33 |
+
- Scale up replicas (all new pods will also OOM)
|
| 34 |
+
|
| 35 |
+
## Expected Recovery Time
|
| 36 |
+
2–5 minutes after restart.
|
env.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import random
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from models import Action, Observation, StepResult, State
|
| 5 |
+
from tasks import EasyTask, MediumTask, HardTask, BonusTask
|
| 6 |
+
from tasks.base import InternalState
|
| 7 |
+
|
| 8 |
+
TASK_MAP = {
|
| 9 |
+
"easy": EasyTask,
|
| 10 |
+
"medium": MediumTask,
|
| 11 |
+
"hard": HardTask,
|
| 12 |
+
"bonus": BonusTask,
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class DevOpsIncidentEnv:
    """
    OpenEnv-compliant environment for DevOps incident response.

    Four tasks of escalating difficulty:
      easy   - Single service OOM (rotating service by seed)
      medium - Cascading failure from bad deployment (red-herring alert)
      hard   - Silent data corruption, no error-rate alerts
      bonus  - Two simultaneous independent failures, both must be fixed

    Lifecycle: construct, call ``reset()`` to build the task and obtain the
    first observation, then call ``step()``; ``state()`` returns a snapshot
    enriched with the current score and analytics.
    """

    def __init__(self, task_id: str = "easy", seed: Optional[int] = None):
        """Validate ``task_id`` and store the seed; the task itself is built in ``reset()``.

        Raises:
            ValueError: if ``task_id`` is not a key of ``TASK_MAP``.
        """
        if task_id not in TASK_MAP:
            known_ids = list(TASK_MAP.keys())
            raise ValueError(
                f"task_id must be one of {known_ids}, got '{task_id}'"
            )
        self.task_id = task_id
        self.seed = seed
        self._task = None
        self._internal_state: Optional[InternalState] = None

    def reset(self, seed: Optional[int] = None) -> Observation:
        """Start a fresh episode and return its first observation.

        A non-None ``seed`` replaces the stored seed; otherwise the stored
        seed is reused to create the task's random generator.
        """
        if seed is not None:
            self.seed = seed
        task_cls = TASK_MAP[self.task_id]
        self._task = task_cls(rng=random.Random(self.seed))
        self._internal_state = self._task.initialize()
        return self._internal_state._build_observation()

    def step(self, action: Action) -> StepResult:
        """Apply ``action`` to the current episode and return the step result.

        Raises:
            RuntimeError: if ``reset()`` has not been called yet.
        """
        current = self._internal_state
        if current is None:
            raise RuntimeError("Call reset() before step()")
        outcome = self._task.step(current, action)
        self._internal_state = outcome.next_state
        observation = self._internal_state._build_observation()
        return StepResult(
            observation=observation,
            reward=outcome.reward,
            done=outcome.done,
            info=outcome.info,
        )

    def state(self) -> State:
        """Return a state snapshot whose ``info`` carries score and analytics.

        Raises:
            RuntimeError: if ``reset()`` has not been called yet.
        """
        internal = self._internal_state
        if internal is None:
            raise RuntimeError("Call reset() before state()")
        # Imported at call time rather than at module import.
        # NOTE(review): presumably to avoid an env <-> graders import cycle — confirm.
        from graders.grader import grade_episode, get_episode_analytics

        snapshot = internal.to_state_snapshot()
        analytics = get_episode_analytics(
            internal.task_id,
            internal.action_history,
            internal.ground_truth_root_cause,
            internal.incident_resolved,
        )
        current_score = grade_episode(
            internal.task_id,
            internal.action_history,
            internal.ground_truth_root_cause,
            internal.ground_truth_fix,
            internal.incident_resolved,
            internal.total_reward,
        )
        snapshot.info = {
            "rewards_unlocked": sorted(internal.rewards_given),
            "current_score": current_score,
            "analytics": analytics,
        }
        return snapshot
|
graders/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from graders.grader import grade_episode
|
| 2 |
+
|
| 3 |
+
__all__ = ["grade_episode"]
|
graders/grader.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from typing import List, Dict, Any, Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def grade_episode(
    task_id: str,
    action_history: List[Dict[str, Any]],
    ground_truth_root_cause: str,
    ground_truth_fix: str,
    incident_resolved: bool,
    total_reward: float,
) -> float:
    """
    Deterministic grader. Returns a float in [0.0, 1.0].

    The score starts from ``total_reward`` and is then adjusted:
      - Efficiency bonus: up to +0.05 when the incident was resolved,
        scaled by how little of the task's step budget was used
      - Diagnosis bonus: +0.03 if the best diagnosis overlaps the ground
        truth by >= 0.5, +0.01 if by >= 0.3
      - Noop penalty: -0.02 for every noop beyond the third
      - Restart penalty: -0.05 for every repeat restart of the same service

    Args:
        task_id: "easy" | "medium" | "hard" | "bonus"
        action_history: List of {step, action, reward} dicts
        ground_truth_root_cause: The actual root cause string
        ground_truth_fix: The correct remediation string (not used in scoring here)
        incident_resolved: Whether the environment flagged resolution
        total_reward: Cumulative in-episode reward

    Returns:
        Final score clamped to [0.0, 1.0] and rounded to 4 decimals
    """
    taken = [record["action"] for record in action_history]
    kinds = [act["action_type"] for act in taken]
    total_steps = len(action_history)
    final = float(total_reward)

    # Efficiency: only a resolved episode can earn it.
    if incident_resolved and total_steps > 0:
        step_budget = {"easy": 15, "medium": 20, "hard": 25, "bonus": 25}.get(task_id, 20)
        unused_fraction = max(0.0, 1.0 - (total_steps / step_budget))
        final += unused_fraction * 0.05

    # Diagnosis precision: judged on the best attempt, not the last one.
    overlaps = [
        _keyword_overlap(act.get("root_cause", "") or "", ground_truth_root_cause)
        for act in taken
        if act["action_type"] == "diagnose"
    ]
    if overlaps:
        top_overlap = max(overlaps)
        if top_overlap >= 0.5:
            final += 0.03
        elif top_overlap >= 0.3:
            final += 0.01

    # Idling: the first three noops are free.
    idle_steps = kinds.count("noop")
    if idle_steps > 3:
        final -= (idle_steps - 3) * 0.02

    # Restart spam: the first restart of each service is free.
    restarts_by_service: Dict[str, int] = {}
    for act in taken:
        if act["action_type"] != "restart_service":
            continue
        target = act.get("service") or ""
        restarts_by_service[target] = restarts_by_service.get(target, 0) + 1
    for times in restarts_by_service.values():
        if times > 1:
            final -= (times - 1) * 0.05

    clamped = max(0.0, min(1.0, final))
    return round(clamped, 4)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def get_episode_analytics(
    task_id: str,
    action_history: List[Dict[str, Any]],
    ground_truth_root_cause: str,
    incident_resolved: bool,
) -> Dict[str, Any]:
    """
    Returns detailed analytics for an episode.

    Used by /state endpoint and for debugging agent performance.

    Args:
        task_id: Task identifier, echoed back in the result.
        action_history: List of {step, action, reward} dicts.
        ground_truth_root_cause: Root cause string that diagnoses are compared to.
        incident_resolved: Whether the environment flagged resolution.

    Returns:
        A dict with step counts, best diagnosis overlap, the share of
        read-type actions, the services that were read (sorted by name),
        the number of steps with a reward below -0.01, and a per-action-type
        count (in first-seen order). Both collections are ordered
        deterministically so repeated calls give identical output.
    """
    # Local import: this module's top-level imports only cover typing.
    from collections import Counter

    actions = [entry["action"] for entry in action_history]
    action_types = [a["action_type"] for a in actions]

    # Steps to first diagnosis (1-based), or None if never diagnosed.
    steps_to_diagnosis: Optional[int] = next(
        (i for i, at in enumerate(action_types, start=1) if at == "diagnose"),
        None,
    )

    # Steps to resolution: the whole history length counts once resolved.
    steps_to_resolution: Optional[int] = len(action_history) if incident_resolved else None

    # Best diagnosis overlap across every diagnose action.
    diagnoses = [
        a.get("root_cause", "") or ""
        for a in actions
        if a["action_type"] == "diagnose"
    ]
    best_diagnosis_overlap = max(
        (_keyword_overlap(d, ground_truth_root_cause) for d in diagnoses),
        default=0.0,
    )

    # Information gathering ratio; max(..., 1) guards the empty history.
    read_actions = sum(
        1 for at in action_types if at in ("read_logs", "read_metrics", "read_runbook")
    )
    info_ratio = read_actions / max(len(action_types), 1)

    # Services investigated — sorted, because bare set iteration order is
    # not stable between interpreter runs.
    services_read = sorted({
        a["service"]
        for a in actions
        if a["action_type"] in ("read_logs", "read_metrics") and a.get("service")
    })

    # Collateral damage count: steps whose reward is meaningfully negative.
    collateral_events = sum(1 for entry in action_history if entry["reward"] < -0.01)

    return {
        "task_id": task_id,
        "total_steps": len(action_history),
        "steps_to_first_diagnosis": steps_to_diagnosis,
        "steps_to_resolution": steps_to_resolution,
        "incident_resolved": incident_resolved,
        "best_diagnosis_overlap": round(best_diagnosis_overlap, 3),
        "information_gathering_ratio": round(info_ratio, 3),
        "services_investigated": services_read,
        "collateral_damage_events": collateral_events,
        # Counter tallies in one pass and keeps first-seen order.
        "action_type_counts": dict(Counter(action_types)),
    }
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _keyword_overlap(candidate: str, ground_truth: str) -> float:
|
| 139 |
+
"""
|
| 140 |
+
Returns fraction of ground-truth content words present in candidate.
|
| 141 |
+
Handles hyphens, underscores, case. Filters stop words.
|
| 142 |
+
"""
|
| 143 |
+
if not candidate or not ground_truth:
|
| 144 |
+
return 0.0
|
| 145 |
+
stops = {"the", "a", "an", "of", "to", "in", "for", "and", "or",
|
| 146 |
+
"is", "was", "are", "v", "v2", "v3", "v4"}
|
| 147 |
+
|
| 148 |
+
def tokenize(s: str) -> set:
|
| 149 |
+
tokens = s.lower().replace("-", " ").replace("_", " ").replace(".", " ").split()
|
| 150 |
+
return {t for t in tokens if t not in stops and len(t) > 1}
|
| 151 |
+
|
| 152 |
+
gt_words = tokenize(ground_truth)
|
| 153 |
+
cand_words = tokenize(candidate)
|
| 154 |
+
if not gt_words:
|
| 155 |
+
return 0.0
|
| 156 |
+
return len(gt_words & cand_words) / len(gt_words)
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def run_smoke_test() -> None:
    """Quick smoke test for CI/CD — verifies grader correctness.

    For every task it plays one episode of randomly chosen actions until
    the environment reports ``done``, then grades it and asserts the score
    lies in [0.0, 1.0].
    """
    import os
    import random
    import sys

    # Put the parent of this file's directory on the import path so the
    # top-level ``env`` and ``models`` modules can be imported below.
    parent_dir = os.path.dirname(os.path.dirname(__file__))
    sys.path.insert(0, parent_dir)
    from env import DevOpsIncidentEnv
    from models import Action, ActionType

    service_pool = ["api-gateway", "payment-service", None]

    print("Running grader smoke test...")
    for task_id in ["easy", "medium", "hard", "bonus"]:
        rng = random.Random(99)
        env = DevOpsIncidentEnv(task_id=task_id, seed=42)
        env.reset()
        while True:
            # Draw the action type first, then the service, so the random
            # stream is consumed in a fixed order.
            picked_type = rng.choice(list(ActionType))
            picked_service = rng.choice(service_pool)
            outcome = env.step(Action(action_type=picked_type, service=picked_service))
            if outcome.done:
                break

        snapshot = env.state()
        score = grade_episode(
            task_id,
            snapshot.action_history,
            snapshot.ground_truth_root_cause,
            snapshot.ground_truth_fix,
            snapshot.incident_resolved,
            snapshot.total_reward,
        )
        analytics = get_episode_analytics(
            task_id,
            snapshot.action_history,
            snapshot.ground_truth_root_cause,
            snapshot.incident_resolved,
        )
        assert 0.0 <= score <= 1.0, f"Score {score} out of range"
        print(f" {task_id}: score={score:.4f} analytics={analytics['action_type_counts']}")
    print("Smoke test passed.")
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
if __name__ == "__main__":
|
| 195 |
+
run_smoke_test()
|
inference.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference Script — DevOps Incident Response OpenEnv
|
| 3 |
+
=====================================================
|
| 4 |
+
MANDATORY env vars:
|
| 5 |
+
API_BASE_URL The API endpoint for the LLM
|
| 6 |
+
MODEL_NAME The model identifier
|
| 7 |
+
HF_TOKEN Your Hugging Face / API key
|
| 8 |
+
|
| 9 |
+
Run:
|
| 10 |
+
API_BASE_URL=... MODEL_NAME=... HF_TOKEN=... python inference.py
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
import json
|
| 15 |
+
import re
|
| 16 |
+
import textwrap
|
| 17 |
+
from typing import Optional
|
| 18 |
+
|
| 19 |
+
from openai import OpenAI
|
| 20 |
+
|
| 21 |
+
from env import DevOpsIncidentEnv
|
| 22 |
+
from models import Action, ActionType, Observation
|
| 23 |
+
from graders.grader import grade_episode
|
| 24 |
+
|
| 25 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 26 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY", "")
|
| 27 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")
|
| 28 |
+
|
| 29 |
+
TEMPERATURE = 0.1
|
| 30 |
+
MAX_TOKENS = 512
|
| 31 |
+
FALLBACK_ACTION = Action(action_type=ActionType.NOOP, reason="parse_failure")
|
| 32 |
+
|
| 33 |
+
SYSTEM_PROMPT = textwrap.dedent("""
|
| 34 |
+
You are a senior on-call DevOps engineer responding to a production incident.
|
| 35 |
+
You will receive: active alerts, service statuses, recent logs, a service
|
| 36 |
+
dependency map, and a log of all evidence you have gathered so far.
|
| 37 |
+
|
| 38 |
+
Your strategy:
|
| 39 |
+
1. Read logs and metrics for the most suspicious services BEFORE acting
|
| 40 |
+
2. Use the dependency map to trace cascades to their ROOT cause
|
| 41 |
+
3. Issue a DIAGNOSE action once you have enough evidence
|
| 42 |
+
4. Apply the precise fix — wrong service or wrong action loses points
|
| 43 |
+
5. On hard incidents: both rollback AND alert_oncall may be required
|
| 44 |
+
|
| 45 |
+
Respond with ONLY a valid JSON object — no markdown, no commentary:
|
| 46 |
+
{
|
| 47 |
+
"action_type": "<diagnose|read_logs|read_metrics|read_runbook|restart_service|rollback|scale_up|alert_oncall|acknowledge|noop>",
|
| 48 |
+
"service": "<service name or null>",
|
| 49 |
+
"root_cause": "<diagnosis string if action_type is diagnose, else null>",
|
| 50 |
+
"runbook": "<runbook filename if action_type is read_runbook, else null>",
|
| 51 |
+
"version": "<version string if action_type is rollback, else null>",
|
| 52 |
+
"reason": "<one sentence: what you know and why you are taking this action>"
|
| 53 |
+
}
|
| 54 |
+
|
| 55 |
+
Available runbooks: high_cpu.md, memory_leak.md, db_connection.md,
|
| 56 |
+
deployment_rollback.md, cascade_failure.md, data_corruption.md
|
| 57 |
+
""").strip()
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def observation_to_text(obs: "Observation") -> str:
    """Render an observation as the plain-text prompt shown to the model.

    Sections, in order: header, SLA breaches/warnings, alerts (most severe
    first), a service table sorted by error rate descending, the dependency
    map, recent logs, accumulated evidence, and the last action's result
    and error.

    Args:
        obs: The observation to render; only attribute access is used.

    Returns:
        The sections joined with newlines.
    """
    lines = [
        f"╔═ INCIDENT RESPONSE Step {obs.step}/{obs.max_steps} "
        f"Elapsed: {obs.elapsed_minutes}min ═╗",
        f"Task: {obs.task_description[:120]}",
        "",
    ]

    # SLA status
    breached = [s for s, v in obs.sla_status.items() if v == "breached"]
    warning_sla = [s for s, v in obs.sla_status.items() if v == "warning"]
    if breached:
        lines.append(f"⚠ SLA BREACHED: {', '.join(breached)}")
    if warning_sla:
        lines.append(f"⚠ SLA WARNING: {', '.join(warning_sla)}")
    if breached or warning_sla:
        lines.append("")

    # Active alerts. Sorting by the raw severity string is alphabetical and
    # would list "info" before "warning", so rank the levels explicitly.
    # Unrecognised severities go last; sorted() is stable within a level.
    severity_rank = {"critical": 0, "warning": 1, "info": 2}
    lines.append("── ALERTS ──────────────────────────────────────────")
    if obs.active_alerts:
        ordered_alerts = sorted(
            obs.active_alerts,
            key=lambda x: severity_rank.get(x.severity, len(severity_rank)),
        )
        for a in ordered_alerts:
            ack = " [ACK]" if a.acknowledged else ""
            lines.append(f" [{a.severity.upper():<8}]{ack} {a.service}: {a.message}")
    else:
        lines.append(" (no active alerts)")

    # Service status table
    lines.append("")
    lines.append("── SERVICES ─────────────────────────────────────────")
    lines.append(f" {'SERVICE':<30} {'STATUS':<10} {'CPU':>5} {'MEM':>5} "
                 f"{'ERR/s':>6} {'P99ms':>7} {'VERSION':<12} {'DEPLOYED'}")
    for svc in sorted(obs.services, key=lambda s: s.error_rate, reverse=True):
        svc_sla = obs.sla_status.get(svc.name)
        if svc_sla == "breached":
            sla = "🔴"
        elif svc_sla == "warning":
            sla = "🟡"
        else:
            sla = " "
        lines.append(
            f" {sla}{svc.name:<29} {svc.status.upper():<10} "
            f"{svc.cpu_percent:>4.0f}% {svc.memory_percent:>4.0f}% "
            f"{svc.error_rate:>6.2f} {svc.latency_p99_ms:>7.0f} "
            f"{svc.current_version:<12} {svc.last_deployed[:10]}"
        )

    # Dependency topology (services with no outgoing calls are omitted)
    if obs.service_dependencies:
        lines.append("")
        lines.append("── SERVICE DEPENDENCY MAP ───────────────────────────")
        for dep in obs.service_dependencies:
            if dep.calls:
                lines.append(f" {dep.service} → {', '.join(dep.calls)}")

    # Recent logs (only services with anomalies or not yet read)
    already_read = {e.source.replace("logs:", "") for e in obs.evidence_log
                    if e.source.startswith("logs:")}
    anomaly_keywords = ("ERROR", "FATAL", "CRIT", "WARN", "MISMATCH", "ENOSPC", "OOM")
    lines.append("")
    lines.append("── RECENT LOGS ──────────────────────────────────────")
    for svc_name, log_lines in obs.recent_logs.items():
        if not log_lines:
            continue
        # Show all logs on first 3 steps, then only unread + anomalies
        joined_upper = "\n".join(log_lines).upper()
        has_anomaly = any(kw in joined_upper for kw in anomaly_keywords)
        if obs.step <= 3 or svc_name not in already_read or has_anomaly:
            lines.append(f" [{svc_name}]")
            # Only the five most recent lines per service are shown.
            for line in log_lines[-5:]:
                lines.append(f" {line}")

    # Accumulated evidence
    if obs.evidence_log:
        lines.append("")
        lines.append("── EVIDENCE GATHERED (all steps) ────────────────────")
        for e in obs.evidence_log:
            lines.append(f" Step {e.step:02d} | {e.source}")
            lines.append(f" {e.summary}")

    if obs.last_action_result:
        lines.append("")
        lines.append(f"Last action: {obs.last_action_result}")
    if obs.last_action_error:
        lines.append(f"ERROR: {obs.last_action_error}")

    return "\n".join(lines)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def parse_action(response_text: str) -> "Action":
    """Extract an ``Action`` from a model reply, falling back to a noop.

    Markdown code fences are stripped, then the text is scanned for the
    first ``{`` from which a complete JSON object can be decoded. Decoding
    stops at the end of that object, so commentary, stray braces or a second
    object after it no longer break parsing (a greedy first-``{``-to-last-``}``
    match would). An unknown or non-string ``action_type`` is treated as
    ``"noop"``.

    Args:
        response_text: Raw text returned by the model; may be empty.

    Returns:
        The parsed action, or ``FALLBACK_ACTION`` when the reply is empty,
        contains no decodable JSON object, or fails ``Action`` validation.
    """
    if not response_text:
        return FALLBACK_ACTION
    text = re.sub(r"```(?:json)?|```", "", response_text).strip()

    decoder = json.JSONDecoder()
    data = None
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        # Only a JSON object can describe an action; skip lists/scalars.
        if isinstance(candidate, dict):
            data = candidate
            break
        start = text.find("{", start + 1)
    if data is None:
        return FALLBACK_ACTION

    at_str = data.get("action_type", "noop")
    valid = {e.value for e in ActionType}
    # The isinstance check also keeps unhashable values (e.g. a list) away
    # from the set membership test.
    if not isinstance(at_str, str) or at_str not in valid:
        at_str = "noop"

    try:
        return Action(
            action_type=ActionType(at_str),
            service=data.get("service"),
            root_cause=data.get("root_cause"),
            runbook=data.get("runbook"),
            version=data.get("version"),
            reason=data.get("reason"),
        )
    except (TypeError, ValueError):
        # NOTE(review): assumes model validation failures derive from
        # ValueError (as pydantic's ValidationError does) — confirm.
        return FALLBACK_ACTION
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
    """Run one episode of ``task_id`` with the model choosing every action.

    Each step renders the observation to text, asks the model for an action,
    applies it, and prints a one-line trace. An API failure is printed and
    treated as an empty reply, which ``parse_action`` turns into the
    fallback action.

    Args:
        client: OpenAI-compatible client used for chat completions.
        task_id: Task to run.
        seed: Seed passed to the environment.

    Returns:
        Dict with ``task_id``, ``score``, ``resolved``, ``steps`` and
        ``rewards_unlocked``.
    """
    env = DevOpsIncidentEnv(task_id=task_id, seed=seed)
    obs = env.reset()

    banner = f"{'━'*64}"
    print(f"\n{'━'*64}")
    print(f" Task: {task_id.upper()} | Seed: {seed} | Model: {MODEL_NAME}")
    print(banner)

    step = 0
    done = False
    while not done and step < obs.max_steps:
        step += 1
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": observation_to_text(obs)},
        ]

        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=messages,
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS,
            )
            response_text = completion.choices[0].message.content or ""
        except Exception as exc:
            # Best effort: keep the episode going with an empty reply.
            print(f" Step {step:02d}: API error — {exc}")
            response_text = ""

        action = parse_action(response_text)

        # Compact label for the trace line: type plus any populated fields.
        label_parts = [action.action_type.value]
        if action.service:
            label_parts.append(f"({action.service})")
        if action.root_cause:
            label_parts.append(f' rc="{action.root_cause[:40]}"')
        if action.version:
            label_parts.append(f" ver={action.version}")
        if action.runbook:
            label_parts.append(f" rb={action.runbook}")
        action_label = "".join(label_parts)

        result = env.step(action)
        obs = result.observation

        reward_str = ""
        if result.reward != 0:
            reward_str = f" reward={result.reward:+.3f}"
        resolution_str = ""
        if result.done and result.info.get("resolution"):
            resolution_str = f" *** {result.info.get('resolution', '')} ***"
        print(f" Step {step:02d}: {action_label}{reward_str}{resolution_str}")

        if obs.last_action_error:
            print(f" ⚠ {obs.last_action_error[:80]}")

        done = result.done

    state = env.state()
    final_score = grade_episode(
        task_id=task_id,
        action_history=state.action_history,
        ground_truth_root_cause=state.ground_truth_root_cause,
        ground_truth_fix=state.ground_truth_fix,
        incident_resolved=state.incident_resolved,
        total_reward=state.total_reward,
    )
    nonzero_rewards = [e['reward'] for e in state.action_history if e['reward'] != 0]

    print(f"\n Ground truth : {state.ground_truth_root_cause}")
    print(f" Resolved : {state.incident_resolved}")
    print(f" Steps taken : {step}")
    print(f" Rewards : {nonzero_rewards}")
    print(f" Final score : {final_score:.4f}")

    return {
        "task_id": task_id,
        "score": final_score,
        "resolved": state.incident_resolved,
        "steps": step,
        "rewards_unlocked": state.info.get("rewards_unlocked", []),
    }
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def main():
    """Run all four tasks with seed 42 and print a baseline score table."""
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    results = [
        run_task(client, task_id, seed=42)
        for task_id in ["easy", "medium", "hard", "bonus"]
    ]

    print(f"\n{'━'*64}")
    print(" BASELINE SCORES")
    print(f"{'━'*64}")

    # Accumulate in an explicit loop so the float sum is formed in task order.
    total = 0.0
    for entry in results:
        resolved_mark = "✓" if entry["resolved"] else "✗"
        print(
            f" {entry['task_id']:<8} {entry['score']:.4f} "
            f"{resolved_mark} steps={entry['steps']} "
            f"unlocked={len(entry['rewards_unlocked'])}"
        )
        total += entry["score"]

    avg = total / len(results)
    print(f" {'average':<8} {avg:.4f}")
    print(f"{'━'*64}\n")
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
if __name__ == "__main__":
|
| 274 |
+
main()
|
models.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
from pydantic import BaseModel, Field
|
| 3 |
+
from typing import List, Optional, Dict, Any, Literal
|
| 4 |
+
from enum import Enum
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class ActionType(str, Enum):
    """Closed set of actions an agent may take.

    Members are also ``str`` instances, so each compares equal to its
    lowercase string value. Declaration order is kept as-is because
    iteration over the enum follows it.
    """

    DIAGNOSE = "diagnose"

    # Read-type actions
    READ_LOGS = "read_logs"
    READ_METRICS = "read_metrics"
    READ_RUNBOOK = "read_runbook"

    # Actions aimed at a service
    RESTART_SERVICE = "restart_service"
    ROLLBACK = "rollback"
    SCALE_UP = "scale_up"

    ALERT_ONCALL = "alert_oncall"
    ACKNOWLEDGE = "acknowledge"
    NOOP = "noop"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class Action(BaseModel):
    """A single agent action; only ``action_type`` is required."""

    action_type: ActionType
    # Every field below is optional and defaults to None.
    service: Optional[str] = None
    root_cause: Optional[str] = None
    runbook: Optional[str] = None
    version: Optional[str] = None
    reason: Optional[str] = None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class Alert(BaseModel):
    """An alert raised against one service."""

    id: str
    # Restricted to exactly these three levels.
    severity: Literal["critical", "warning", "info"]
    service: str
    message: str
    timestamp: str
    # Alerts start out unacknowledged.
    acknowledged: bool = False
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ServiceStatus(BaseModel):
    """Health, load, replica and version information for one service."""

    name: str
    status: Literal["healthy", "degraded", "down", "unknown"]

    # Load and quality metrics
    cpu_percent: float
    memory_percent: float
    error_rate: float
    latency_p99_ms: float

    # Replica counts
    replicas_running: int
    replicas_desired: int

    # Deployment info
    current_version: str
    last_deployed: str

    # SLA tracking — updated each step if unresolved
    sla_breach: bool = False
    minutes_degraded: int = 0
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class ServiceDependency(BaseModel):
    """Describes which services call which — critical for cascade diagnosis."""

    service: str
    # Services this one depends on.
    calls: List[str]
    # Services that depend on this one.
    called_by: List[str]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class EvidenceEntry(BaseModel):
    """A single piece of evidence collected by a read action."""

    # Step number at which the evidence was gathered.
    step: int
    # Origin tag, e.g. "logs:payment-service" or "metrics:inventory-service".
    source: str
    # Short digest of what was found.
    summary: str
    # Content returned by the read action.
    raw: str
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class Observation(BaseModel):
    """Everything the agent sees after a reset or a step."""

    # Episode progress and task framing.
    step: int
    max_steps: int
    task_id: str
    task_description: str

    # Current system picture.
    services: List[ServiceStatus]
    active_alerts: List[Alert]
    recent_logs: Dict[str, List[str]]  # service name -> log lines
    available_runbooks: List[str]

    # Dependency topology so the agent can reason about cascades.
    service_dependencies: List[ServiceDependency] = []

    # Evidence accumulated from all previous read actions.
    evidence_log: List[EvidenceEntry] = []

    # Urgency signal: service name -> "ok" | "warning" | "breached".
    sla_status: Dict[str, str] = {}

    # Outcome of the most recent action.
    last_action_result: Optional[str] = None
    last_action_error: Optional[str] = None

    # Incident timing.
    incident_start_time: str
    elapsed_minutes: int
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class StepResult(BaseModel):
    """Value returned to the agent for one environment step."""

    observation: Observation
    # Reward earned by this step.
    reward: float
    # True once the episode has ended.
    done: bool
    # Free-form diagnostic details.
    info: Dict[str, Any] = {}
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class State(BaseModel):
    """Full snapshot of an episode, including grading-side information.

    NOTE(review): this model carries the ground-truth root cause and fix;
    confirm that it is never served to the agent being evaluated.
    """

    episode_id: str
    task_id: str
    step: int
    current_observation: Observation
    # Actions taken so far, one dict per action.
    action_history: List[Dict[str, Any]]
    total_reward: float
    incident_resolved: bool

    # Answer key for the scenario.
    ground_truth_root_cause: str
    ground_truth_fix: str

    # Free-form extras.
    info: Dict[str, Any] = {}
|
openenv.yaml
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: devops-incident-response
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
A reinforcement learning environment where AI agents learn to diagnose and
|
| 5 |
+
remediate production software incidents. Agents read logs, metrics, and
|
| 6 |
+
alerts across a simulated microservices architecture, then take remediation
|
| 7 |
+
actions such as rollbacks, restarts, and on-call escalations. Three tasks
|
| 8 |
+
of escalating difficulty — from a clear memory leak to silent data
|
| 9 |
+
corruption with no error-rate alerts — provide a meaningful difficulty
|
| 10 |
+
progression for benchmarking agent reasoning quality. A fourth, bonus task
combines two simultaneous, unrelated failures.
|
| 11 |
+
|
| 12 |
+
author: "devops-incident-env"
|
| 13 |
+
tags:
|
| 14 |
+
- openenv
|
| 15 |
+
- devops
|
| 16 |
+
- incident-response
|
| 17 |
+
- real-world
|
| 18 |
+
- multi-step
|
| 19 |
+
- microservices
|
| 20 |
+
- reward-shaping
|
| 21 |
+
|
| 22 |
+
tasks:
|
| 23 |
+
- id: easy
|
| 24 |
+
name: Single Service Anomaly
|
| 25 |
+
description: >
|
| 26 |
+
A payment service is crash-looping due to a JVM heap memory leak.
|
| 27 |
+
Logs clearly show OutOfMemoryError and OOMKilled pod restarts.
|
| 28 |
+
The agent must read logs/metrics, diagnose the memory leak, and
|
| 29 |
+
restart the affected service without touching healthy services.
|
| 30 |
+
difficulty: easy
|
| 31 |
+
max_steps: 15
|
| 32 |
+
reward_range: [0.0, 1.0]
|
| 33 |
+
expected_score_random_agent: 0.05
|
| 34 |
+
expected_score_strong_llm: 0.75
|
| 35 |
+
|
| 36 |
+
- id: medium
|
| 37 |
+
name: Cascading Multi-Service Failure
|
| 38 |
+
description: >
|
| 39 |
+
A bad deployment of inventory-service introduced connection pool
|
| 40 |
+
exhaustion, cascading to order-service timeouts and api-gateway
|
| 41 |
+
errors. A red-herring alert fires on notification-service (high CPU
|
| 42 |
+
from a scheduled batch job). The agent must trace the cascade to the
|
| 43 |
+
root service and rollback — not restart downstream victims.
|
| 44 |
+
difficulty: medium
|
| 45 |
+
max_steps: 20
|
| 46 |
+
reward_range: [0.0, 1.0]
|
| 47 |
+
expected_score_random_agent: 0.03
|
| 48 |
+
expected_score_strong_llm: 0.55
|
| 49 |
+
|
| 50 |
+
- id: hard
|
| 51 |
+
name: Silent Data Corruption
|
| 52 |
+
description: >
|
| 53 |
+
A data pipeline deployment silently writes incorrect price values to
|
| 54 |
+
the product catalog. No standard error-rate or latency alerts fire —
|
| 55 |
+
all services show green health. The signal is buried in
|
| 56 |
+
price-validation WARN logs (15% mismatch rate) and an analytics
|
| 57 |
+
anomaly (avg order value 9x baseline). Full credit requires both
|
| 58 |
+
rollback of the pipeline AND alerting on-call for a data audit.
|
| 59 |
+
difficulty: hard
|
| 60 |
+
max_steps: 25
|
| 61 |
+
reward_range: [0.0, 1.0]
|
| 62 |
+
expected_score_random_agent: 0.01
|
| 63 |
+
expected_score_strong_llm: 0.35
|
| 64 |
+
|
| 65 |
+
- id: bonus
|
| 66 |
+
name: Simultaneous Dual Failure
|
| 67 |
+
description: >
|
| 68 |
+
Two independent failures strike at once: log-aggregator disk is 100% full
|
| 69 |
+
(causing log loss across all services) and ml-inference-service is stuck
|
| 70 |
+
in a model reload CPU loop. Neither failure is related to the other.
|
| 71 |
+
Full credit requires fixing both root causes independently.
|
| 72 |
+
difficulty: hard
|
| 73 |
+
max_steps: 25
|
| 74 |
+
reward_range: [0.0, 1.0]
|
| 75 |
+
expected_score_random_agent: 0.01
|
| 76 |
+
expected_score_strong_llm: 0.40
|
| 77 |
+
|
| 78 |
+
action_space:
|
| 79 |
+
type: structured
|
| 80 |
+
description: >
|
| 81 |
+
Discrete action types with optional service/parameter arguments.
|
| 82 |
+
Actions are expressed as Pydantic Action objects with fields:
|
| 83 |
+
action_type, service, root_cause, runbook, version, reason.
|
| 84 |
+
actions:
|
| 85 |
+
- name: diagnose
|
| 86 |
+
description: Record the agent's root cause hypothesis
|
| 87 |
+
- name: read_logs
|
| 88 |
+
description: Read recent log lines for a named service
|
| 89 |
+
- name: read_metrics
|
| 90 |
+
description: Read CPU, memory, error rate, latency for a named service
|
| 91 |
+
- name: read_runbook
|
| 92 |
+
description: Read an operational runbook by filename
|
| 93 |
+
- name: restart_service
|
| 94 |
+
description: Restart a named service (clears memory, resets connections)
|
| 95 |
+
- name: rollback
|
| 96 |
+
description: Roll back a service to a previous version
|
| 97 |
+
- name: scale_up
|
| 98 |
+
description: Increase replica count for a named service
|
| 99 |
+
- name: alert_oncall
|
| 100 |
+
description: Page the on-call engineering team
|
| 101 |
+
- name: acknowledge
|
| 102 |
+
description: Acknowledge an active alert by ID (pass the alert ID in the service field)
|
| 103 |
+
- name: noop
|
| 104 |
+
description: Take no action this step
|
| 105 |
+
|
| 106 |
+
observation_space:
|
| 107 |
+
type: structured
|
| 108 |
+
description: >
|
| 109 |
+
Pydantic Observation object containing: current step, task description,
|
| 110 |
+
list of ServiceStatus objects (name, status, cpu, memory, error_rate,
|
| 111 |
+
latency_p99, replicas, version, last_deployed), list of Alert objects
|
| 112 |
+
(severity, service, message, acknowledged), recent log lines per
|
| 113 |
+
service (dict of service_name -> last 10 lines), available runbook
|
| 114 |
+
names, service dependency map, accumulated evidence log, per-service SLA
status, last action result/error, and incident timing info.
|
| 115 |
+
|
| 116 |
+
reward:
|
| 117 |
+
type: dense
|
| 118 |
+
range: [0.0, 1.0]
|
| 119 |
+
description: >
|
| 120 |
+
Partial credit for information gathering, correct diagnosis, and
|
| 121 |
+
precise remediation. Penalties for collateral damage (restarting
|
| 122 |
+
healthy services), excessive noops, and treating symptoms instead
|
| 123 |
+
of root causes. Efficiency bonus for fast resolution.
|
| 124 |
+
|
| 125 |
+
docker:
|
| 126 |
+
base_image: python:3.11-slim
|
| 127 |
+
port: 7860
|
| 128 |
+
health_endpoint: /health
|
| 129 |
+
reset_endpoint: /reset
|
| 130 |
+
step_endpoint: /step
|
| 131 |
+
state_endpoint: /state
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydantic>=2.0,<3.0
|
| 2 |
+
fastapi>=0.110.0
|
| 3 |
+
uvicorn>=0.29.0
|
| 4 |
+
openai>=1.0.0
|
| 5 |
+
python-dotenv>=1.0.0
|
| 6 |
+
pyyaml>=6.0
|
tasks/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from tasks.task_easy import EasyTask
|
| 2 |
+
from tasks.task_medium import MediumTask
|
| 3 |
+
from tasks.task_hard import HardTask
|
| 4 |
+
from tasks.task_bonus import BonusTask
|
| 5 |
+
|
| 6 |
+
__all__ = ["EasyTask", "MediumTask", "HardTask", "BonusTask"]
|
tasks/base.py
ADDED
|
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import random
|
| 3 |
+
import uuid
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
from dataclasses import dataclass, field
|
| 6 |
+
from typing import List, Dict, Any, Optional, Set
|
| 7 |
+
from models import (
|
| 8 |
+
Action, ActionType, Observation, State, StepResult,
|
| 9 |
+
ServiceStatus, Alert, ServiceDependency, EvidenceEntry,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Runbook filenames the agent may request through the read_runbook action.
# Kept as a list because it is interpolated verbatim into an error message.
AVAILABLE_RUNBOOKS = [
    "high_cpu.md",
    "memory_leak.md",
    "db_connection.md",
    "deployment_rollback.md",
    "cascade_failure.md",
    "data_corruption.md",
]
|
| 21 |
+
|
| 22 |
+
# Briefing text shown to the agent, keyed by task id. Looked up with
# ``.get(task_id, "")`` when an observation is built.
TASK_DESCRIPTIONS = {
    # Single crash-looping service.
    "easy": (
        "PRODUCTION INCIDENT — One service is crash-looping. "
        "Read its logs and metrics to find the root cause, diagnose precisely, "
        "then apply the correct single-service fix. "
        "Avoid restarting healthy services — collateral damage is penalised."
    ),
    # Cascade with a red-herring alert.
    "medium": (
        "PRODUCTION INCIDENT — Multiple services are degraded. "
        "Use the service dependency map to trace the failure to its origin. "
        "A recent deployment is likely involved. One alert is a red herring. "
        "Fix the root service only — downstream victims will self-heal."
    ),
    # Silent data corruption behind green dashboards.
    "hard": (
        "PRODUCTION INCIDENT — All services show green health. No error-rate alerts. "
        "Look for anomalies in business-logic metrics and WARN-level logs. "
        "Correlate signals across services to find silent data corruption. "
        "Two actions are required for full credit: rollback AND alert_oncall."
    ),
    # Two unrelated failures at once.
    "bonus": (
        "PRODUCTION INCIDENT — Two independent failures are active simultaneously. "
        "They are unrelated — fixing one will NOT fix the other. "
        "Identify both root causes and remediate each independently. "
        "Full credit requires resolving both."
    ),
}
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@dataclass
class InternalState:
    """Mutable server-side state of one incident episode.

    Services, alerts, evidence and dependencies are stored as plain dicts and
    lists; they are converted to the pydantic models only when an observation
    or a state snapshot is built. One step corresponds to two simulated
    minutes (``elapsed_minutes = step * 2``).
    """

    episode_id: str
    task_id: str
    step: int
    max_steps: int
    services: Dict[str, dict]  # service name -> raw metric/status dict
    alerts: list  # raw alert dicts, expanded into Alert(**a)
    logs: Dict[str, List[str]]  # service name -> log lines
    action_history: List[Dict[str, Any]]
    total_reward: float
    incident_resolved: bool
    ground_truth_root_cause: str
    ground_truth_fix: str
    incident_start_time: str
    # Tags of one-time rewards that were already granted this episode.
    rewards_given: Set[str] = field(default_factory=set)
    healthy_services: List[str] = field(default_factory=list)
    evidence_log: List[dict] = field(default_factory=list)
    service_dependencies: List[dict] = field(default_factory=list)
    # Scenario-specific scratch values set by individual tasks.
    _scenario: Any = field(default=None, repr=False)
    _ml_version: Any = field(default=None, repr=False)

    def to_state_snapshot(self) -> State:
        """Return a ``State`` model describing the episode as it stands now."""
        return State(
            episode_id=self.episode_id,
            task_id=self.task_id,
            step=self.step,
            current_observation=self._build_observation(),
            action_history=self.action_history,
            total_reward=round(self.total_reward, 4),
            incident_resolved=self.incident_resolved,
            ground_truth_root_cause=self.ground_truth_root_cause,
            ground_truth_fix=self.ground_truth_fix,
            info={
                "rewards_unlocked": sorted(self.rewards_given),
                "evidence_gathered": len(self.evidence_log),
            },
        )

    def _sla_level(self, status: str) -> str:
        """Classify a service status as "ok", "warning" or "breached".

        Thresholds are in elapsed minutes (``step * 2``): a "down" service
        warns at 5 and breaches at 10; a "degraded" service warns at 10 and
        breaches at 20. Any other status is always "ok".
        """
        if status == "down":
            warn_at, breach_at = 5, 10
        elif status == "degraded":
            warn_at, breach_at = 10, 20
        else:
            return "ok"

        elapsed = self.step * 2
        if elapsed >= breach_at:
            return "breached"
        if elapsed >= warn_at:
            return "warning"
        return "ok"

    def _build_sla_status(self) -> Dict[str, str]:
        """Map every service name to its current SLA level."""
        return {
            name: self._sla_level(svc["status"])
            for name, svc in self.services.items()
        }

    def _apply_sla_degradation(self) -> None:
        """Make unhealthy services progressively worse to add urgency.

        Does nothing once the incident is resolved. For each "down" or
        "degraded" service it advances ``minutes_degraded`` by two, grows the
        relevant metric towards its cap, and marks ``sla_breach`` once the
        service reaches the "breached" level.
        """
        if self.incident_resolved:
            return

        for svc in self.services.values():
            status = svc["status"]
            if status not in ("down", "degraded"):
                continue

            svc["minutes_degraded"] = svc.get("minutes_degraded", 0) + 2

            if status == "down":
                # Error rate creeps up towards 50. The max() keeps a value
                # that already started above the cap from being lowered.
                current = svc["error_rate"]
                svc["error_rate"] = max(current, min(current * 1.05, 50.0))
            else:
                # Latency grows towards 60s, again never shrinking.
                current = svc["latency_p99_ms"]
                svc["latency_p99_ms"] = max(current, min(current * 1.03, 60000.0))
                # Very slow but nearly error-free services start to error.
                if svc["latency_p99_ms"] > 30000 and svc["error_rate"] < 1.0:
                    svc["error_rate"] = round(svc["error_rate"] + 0.5, 2)

            # Keep the per-service flag in step with the reported SLA status;
            # previously it was read for observations but never set here.
            if self._sla_level(status) == "breached":
                svc["sla_breach"] = True

    def _build_observation(
        self,
        last_action_result: Optional[str] = None,
        last_action_error: Optional[str] = None,
    ) -> Observation:
        """Assemble the agent-facing ``Observation`` from the raw state.

        Args:
            last_action_result: Text produced by the most recent action.
            last_action_error: Error message from the most recent action.
        """
        services = [
            ServiceStatus(
                name=s["name"],
                status=s["status"],
                cpu_percent=s["cpu_percent"],
                memory_percent=s["memory_percent"],
                error_rate=round(s["error_rate"], 3),
                latency_p99_ms=round(s["latency_p99_ms"], 0),
                replicas_running=s["replicas_running"],
                replicas_desired=s["replicas_desired"],
                current_version=s["current_version"],
                last_deployed=s["last_deployed"],
                sla_breach=s.get("sla_breach", False),
                minutes_degraded=s.get("minutes_degraded", 0),
            )
            for s in self.services.values()
        ]

        return Observation(
            step=self.step,
            max_steps=self.max_steps,
            task_id=self.task_id,
            task_description=TASK_DESCRIPTIONS.get(self.task_id, ""),
            services=services,
            active_alerts=[Alert(**a) for a in self.alerts],
            recent_logs=self.logs,
            available_runbooks=AVAILABLE_RUNBOOKS,
            service_dependencies=[
                ServiceDependency(**d) for d in self.service_dependencies
            ],
            evidence_log=[EvidenceEntry(**e) for e in self.evidence_log],
            sla_status=self._build_sla_status(),
            last_action_result=last_action_result,
            last_action_error=last_action_error,
            incident_start_time=self.incident_start_time,
            elapsed_minutes=self.step * 2,
        )
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
@dataclass
class StepOutput:
    """Result a task returns from ``step``."""

    # State after the action has been applied.
    next_state: InternalState
    # Reward earned by this step.
    reward: float
    # True once the episode is over.
    done: bool
    # Free-form diagnostic details.
    info: Dict[str, Any]
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def semantic_match(candidate: str, keywords: List[str], threshold: int = 1) -> bool:
    """Return True if ``candidate`` contains at least ``threshold`` keywords.

    Matching is a case-insensitive substring test. Hyphens and underscores
    are treated as spaces on both sides, so ``"memory_leak"``,
    ``"memory-leak"`` and ``"memory leak"`` all match one another.

    Args:
        candidate: Free text to search; an empty value never matches.
        keywords: Keywords to look for. Empty keywords are ignored because an
            empty string is a substring of every candidate.
        threshold: Minimum number of distinct keywords that must be found.

    Returns:
        True when enough keywords were found, otherwise False.
    """
    if not candidate:
        return False

    def _normalize(text: str) -> str:
        # Same folding for candidate and keywords; previously only the
        # candidate had underscores replaced, so a keyword containing "_"
        # could never match.
        return text.lower().replace("-", " ").replace("_", " ")

    haystack = _normalize(candidate)
    hits = sum(1 for kw in keywords if kw and _normalize(kw) in haystack)
    return hits >= threshold
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
class BaseTask(ABC):
    """Abstract base class for incident scenarios.

    Subclasses implement ``initialize`` and ``step``. This base class
    provides the handling shared by all scenarios: read-only actions,
    runbook loading, reward clamping and the blind-remediation penalty.
    """

    # Reward tags that count as "a diagnosis has been made". The first two are
    # the tags this class always recognised; the last two are the tags
    # BonusTask grants for its dual-failure diagnosis.
    _DIAGNOSIS_TAGS = frozenset(
        {"diagnose_correct", "diagnose_partial", "diagnose_both", "diagnose_one"}
    )

    def __init__(self, rng: random.Random):
        """Store the random generator used to build the scenario."""
        self.rng = rng

    @abstractmethod
    def initialize(self) -> InternalState:
        """Build and return the starting state of a new episode."""

    @abstractmethod
    def step(self, state: InternalState, action: Action) -> StepOutput:
        """Apply ``action`` to ``state`` and return the outcome."""

    def _apply_action_to_logs(
        self, state: InternalState, action: Action
    ) -> tuple[Optional[str], Optional[str]]:
        """Handle the actions that are identical across all scenarios.

        Covers ``read_logs``, ``read_metrics``, ``read_runbook``,
        ``acknowledge`` and ``noop``. Successful reads append an entry to
        ``state.evidence_log``; ``acknowledge`` marks the matching alert.

        Returns:
            ``(result_text, error_text)``. Exactly one is set for a handled
            action; both are ``None`` for any other action type.
        """
        kind = action.action_type.value

        if kind == "read_logs":
            svc = action.service
            if not svc or svc not in state.logs:
                return None, f"No logs found for service '{svc}'"
            lines = state.logs[svc]
            text = "\n".join(lines)
            state.evidence_log.append({
                "step": state.step,
                "source": f"logs:{svc}",
                "summary": f"Read {len(lines)} log lines from {svc}",
                "raw": text,
            })
            return text, None

        if kind == "read_metrics":
            svc = action.service
            if not svc or svc not in state.services:
                return None, f"Unknown service '{svc}'"
            s = state.services[svc]
            text = (
                f"=== Metrics: {svc} ===\n"
                f"Status: {s['status'].upper()}\n"
                f"CPU: {s['cpu_percent']:.1f}%\n"
                f"Memory: {s['memory_percent']:.1f}%\n"
                f"Error rate: {s['error_rate']:.3f}/s\n"
                f"P99 latency: {s['latency_p99_ms']:.0f}ms\n"
                f"Replicas: {s['replicas_running']}/{s['replicas_desired']}\n"
                f"Version: {s['current_version']}\n"
                f"Last deploy: {s['last_deployed']}\n"
                f"Degraded for: {s.get('minutes_degraded', 0)} minutes"
            )
            state.evidence_log.append({
                "step": state.step,
                "source": f"metrics:{svc}",
                "summary": (
                    f"{svc}: {s['status']}, cpu={s['cpu_percent']:.0f}%, "
                    f"mem={s['memory_percent']:.0f}%, err={s['error_rate']:.2f}/s, "
                    f"ver={s['current_version']}"
                ),
                "raw": text,
            })
            return text, None

        if kind == "read_runbook":
            rb = action.runbook
            # Only whitelisted names are ever turned into a file path.
            if rb not in AVAILABLE_RUNBOOKS:
                return None, f"Runbook '{rb}' not found. Available: {AVAILABLE_RUNBOOKS}"
            content = self._load_runbook(rb)
            state.evidence_log.append({
                "step": state.step,
                "source": f"runbook:{rb}",
                "summary": f"Read runbook: {rb}",
                # Only the first 200 characters are kept as evidence.
                "raw": content[:200],
            })
            return content, None

        if kind == "acknowledge":
            # The alert ID travels in the ``service`` field of the action.
            alert_id = action.service
            for alert in state.alerts:
                if alert["id"] == alert_id:
                    alert["acknowledged"] = True
                    return f"Alert {alert_id} acknowledged.", None
            return None, f"Alert '{alert_id}' not found."

        if kind == "noop":
            return "No action taken.", None

        return None, None

    def _load_runbook(self, name: str) -> str:
        """Return the text of ``data/runbooks/<name>``.

        The path is resolved relative to the parent of this file's directory.
        A missing file yields a placeholder string instead of raising.
        """
        import os

        root = os.path.dirname(os.path.dirname(__file__))
        path = os.path.join(root, "data", "runbooks", name)
        try:
            # Explicit UTF-8 so reading Markdown does not depend on the
            # platform's default encoding.
            with open(path, encoding="utf-8") as f:
                return f.read()
        except FileNotFoundError:
            return f"[Runbook '{name}' not found]"

    def _clamp(self, value: float) -> float:
        """Clamp ``value`` into the closed interval [0.0, 1.0]."""
        return max(0.0, min(1.0, value))

    def _penalty_blind_remediation(
        self, state: InternalState, action: Action, fix_key: str
    ) -> float:
        """Return a small penalty for remediating without a prior diagnosis.

        Encourages gathering evidence before acting.

        Args:
            state: Episode state; only ``rewards_given`` is read.
            action: The remediation action (currently unused).
            fix_key: Reward tag of the fix being applied. If it was already
                granted, no penalty applies.

        Returns:
            ``-0.05`` when no diagnosis tag has been earned yet, else ``0.0``.
        """
        if fix_key in state.rewards_given:
            return 0.0
        # Any recognised diagnosis tag counts, including the bonus-task tags
        # that were previously ignored.
        if self._DIAGNOSIS_TAGS.isdisjoint(state.rewards_given):
            return -0.05
        return 0.0
|
tasks/task_bonus.py
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Dict, Any, List
|
| 4 |
+
from models import Action, ActionType
|
| 5 |
+
from tasks.base import BaseTask, InternalState, StepOutput, semantic_match
|
| 6 |
+
|
| 7 |
+
# Incident start time reported in observations for the bonus scenario.
INCIDENT_TIME = "2026-03-30T14:22:00Z"

# Call graph for the bonus scenario. log-aggregator has no edges in either
# direction, and ml-inference-service is called only by api-gateway.
DEPENDENCIES = [
    {
        "service": "api-gateway",
        "calls": ["ml-inference-service", "product-service"],
        "called_by": [],
    },
    {
        "service": "ml-inference-service",
        "calls": [],
        "called_by": ["api-gateway"],
    },
    {
        "service": "log-aggregator",
        "calls": [],
        "called_by": [],
    },
    {
        "service": "product-service",
        "calls": [],
        "called_by": ["api-gateway"],
    },
]

# Failure 1: the log-aggregator disk fills up and log writes start failing.
AGGREGATOR_LOGS = [
    "[14:20:01] INFO Log ingestion running: 48MB/s",
    "[14:21:05] WARN Disk usage at 91% (/var/log/aggregated)",
    "[14:21:45] WARN Disk usage at 95% - log rotation overdue",
    "[14:22:01] ERROR Disk usage at 99% - write failure imminent",
    "[14:22:02] ERROR Failed to write log chunk: No space left on device (ENOSPC)",
    "[14:22:04] WARN Dropping incoming logs: buffer overflow (48000 messages dropped)",
    "[14:22:05] ERROR Log rotation job FAILED: No space left on device",
    "[14:22:10] CRIT Disk 100% full - all log writes failing",
]

# Failure 2: ml-inference-service is stuck reloading its model. The
# "{version}" placeholder is replaced with the deployed version when the
# scenario is initialised.
ML_LOGS = [
    "[14:21:00] INFO ml-inference-service starting",
    "[14:21:01] INFO Loading model: recommendation-v2.1 (2.3GB)",
    "[14:21:12] INFO Model loaded in 11.2s",
    "[14:21:12] WARN Model checksum mismatch - reloading",
    "[14:21:23] INFO Model loaded in 11.1s",
    "[14:21:23] WARN Model checksum mismatch - reloading",
    "[14:21:34] WARN Model reload loop detected: 6 reloads in 60s",
    "[14:22:01] ERROR CPU throttled: 100% sustained for 120s",
    "[14:22:02] WARN Deployment {version} introduced new model checksum validation - may have bug",
]

# Downstream symptom: the gateway slows down and then times out on the
# recommendations endpoint.
API_LOGS = [
    "[14:22:00] INFO GET /api/v1/recommendations 200 145ms",
    "[14:22:05] WARN GET /api/v1/recommendations 200 4823ms (ml-inference slow)",
    "[14:22:15] ERROR GET /api/v1/recommendations 504 Gateway Timeout",
]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class BonusTask(BaseTask):
|
| 47 |
+
def initialize(self) -> InternalState:
|
| 48 |
+
ml_ver = f"v2.{self.rng.randint(0, 3)}.{self.rng.randint(0, 5)}"
|
| 49 |
+
|
| 50 |
+
logs = {
|
| 51 |
+
"log-aggregator": AGGREGATOR_LOGS[:],
|
| 52 |
+
"ml-inference-service": [l.replace("{version}", ml_ver) for l in ML_LOGS],
|
| 53 |
+
"api-gateway": API_LOGS[:],
|
| 54 |
+
"product-service": ["[14:22:00] INFO Service healthy - 0 errors"],
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
services = {
|
| 58 |
+
"api-gateway": {
|
| 59 |
+
"name": "api-gateway", "status": "degraded",
|
| 60 |
+
"cpu_percent": round(self.rng.uniform(40, 58), 1),
|
| 61 |
+
"memory_percent": round(self.rng.uniform(44, 56), 1),
|
| 62 |
+
"error_rate": round(self.rng.uniform(3.0, 6.0), 2),
|
| 63 |
+
"latency_p99_ms": round(self.rng.uniform(8000, 12000), 0),
|
| 64 |
+
"replicas_running": 2, "replicas_desired": 2,
|
| 65 |
+
"current_version": "v3.1.0", "last_deployed": "2026-03-20T08:00:00Z",
|
| 66 |
+
"minutes_degraded": 0, "sla_breach": False,
|
| 67 |
+
},
|
| 68 |
+
"ml-inference-service": {
|
| 69 |
+
"name": "ml-inference-service", "status": "degraded",
|
| 70 |
+
"cpu_percent": round(self.rng.uniform(94, 100), 1),
|
| 71 |
+
"memory_percent": round(self.rng.uniform(55, 72), 1),
|
| 72 |
+
"error_rate": round(self.rng.uniform(1.5, 4.0), 2),
|
| 73 |
+
"latency_p99_ms": round(self.rng.uniform(9000, 14000), 0),
|
| 74 |
+
"replicas_running": 2, "replicas_desired": 2,
|
| 75 |
+
"current_version": ml_ver, "last_deployed": "2026-03-30T14:20:55Z",
|
| 76 |
+
"minutes_degraded": 0, "sla_breach": False,
|
| 77 |
+
},
|
| 78 |
+
"log-aggregator": {
|
| 79 |
+
"name": "log-aggregator", "status": "degraded",
|
| 80 |
+
"cpu_percent": round(self.rng.uniform(18, 30), 1),
|
| 81 |
+
"memory_percent": round(self.rng.uniform(40, 52), 1),
|
| 82 |
+
"error_rate": round(self.rng.uniform(5.0, 9.0), 2),
|
| 83 |
+
"latency_p99_ms": round(self.rng.uniform(200, 500), 0),
|
| 84 |
+
"replicas_running": 1, "replicas_desired": 1,
|
| 85 |
+
"current_version": "v1.3.0", "last_deployed": "2026-03-01T10:00:00Z",
|
| 86 |
+
"minutes_degraded": 0, "sla_breach": False,
|
| 87 |
+
},
|
| 88 |
+
"product-service": {
|
| 89 |
+
"name": "product-service", "status": "healthy",
|
| 90 |
+
"cpu_percent": round(self.rng.uniform(25, 38), 1),
|
| 91 |
+
"memory_percent": round(self.rng.uniform(35, 48), 1),
|
| 92 |
+
"error_rate": 0.0,
|
| 93 |
+
"latency_p99_ms": round(self.rng.uniform(15, 35), 0),
|
| 94 |
+
"replicas_running": 3, "replicas_desired": 3,
|
| 95 |
+
"current_version": "v2.0.1", "last_deployed": "2026-03-15T12:00:00Z",
|
| 96 |
+
"minutes_degraded": 0, "sla_breach": False,
|
| 97 |
+
},
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
alerts = [
|
| 101 |
+
{
|
| 102 |
+
"id": "B001", "severity": "critical", "service": "log-aggregator",
|
| 103 |
+
"message": "Disk 100% full on log-aggregator - dropping 48000 log messages/min",
|
| 104 |
+
"timestamp": "2026-03-30T14:22:10Z", "acknowledged": False,
|
| 105 |
+
},
|
| 106 |
+
{
|
| 107 |
+
"id": "B002", "severity": "critical", "service": "ml-inference-service",
|
| 108 |
+
"message": f"CPU sustained 99%+ for 120s - model reload loop detected ({ml_ver})",
|
| 109 |
+
"timestamp": "2026-03-30T14:22:01Z", "acknowledged": False,
|
| 110 |
+
},
|
| 111 |
+
{
|
| 112 |
+
"id": "B003", "severity": "warning", "service": "api-gateway",
|
| 113 |
+
"message": "P99 latency 10200ms on /recommendations - upstream ml-inference slow",
|
| 114 |
+
"timestamp": "2026-03-30T14:22:15Z", "acknowledged": False,
|
| 115 |
+
},
|
| 116 |
+
]
|
| 117 |
+
|
| 118 |
+
state = InternalState(
|
| 119 |
+
episode_id=str(uuid.uuid4()), task_id="bonus", step=0, max_steps=25,
|
| 120 |
+
services=services, alerts=alerts, logs=logs,
|
| 121 |
+
action_history=[], total_reward=0.0, incident_resolved=False,
|
| 122 |
+
ground_truth_root_cause="disk_full_log_aggregator AND model_reload_loop_ml_inference",
|
| 123 |
+
ground_truth_fix="alert_oncall for disk cleanup AND rollback ml-inference-service",
|
| 124 |
+
incident_start_time=INCIDENT_TIME,
|
| 125 |
+
healthy_services=["product-service"],
|
| 126 |
+
service_dependencies=DEPENDENCIES,
|
| 127 |
+
)
|
| 128 |
+
state._ml_version = ml_ver
|
| 129 |
+
return state
|
| 130 |
+
|
| 131 |
+
def step(self, state: InternalState, action: Action) -> StepOutput:
    """Apply one agent action to the bonus (dual-incident) episode.

    The episode is resolved once both remediations have been credited:
    paging on-call about the log-aggregator disk (tag ``fix_disk``) and
    rolling back or restarting ``ml-inference-service`` (tag ``fix_ml``).
    Every positive reward is granted at most once per episode; the tags
    already paid out are kept in ``state.rewards_given``.

    Args:
        state: Mutable episode state; it is updated in place.
        action: The agent's action for this step.

    Returns:
        A ``StepOutput`` carrying the same state object, this step's reward
        rounded to 4 decimals, the ``done`` flag, and an ``info`` dict
        (``resolution`` when the incident is resolved, ``reason`` when the
        step budget is exhausted).
    """
    # NOTE(review): this block was reconstructed from a copy whose indentation
    # had been stripped. Rewards are gated by one-time tags; the state/result
    # updates that follow each gate are applied on every matching action.
    # Confirm that nesting against the repository version.
    state.step += 1
    state._apply_sla_degradation()
    at = action.action_type
    svc = action.service or ""
    reward = 0.0
    done = False
    info: Dict[str, Any] = {}

    result_text, error_text = self._apply_action_to_logs(state, action)

    # One-time rewards for inspecting either of the two failing services.
    gather_map = {
        ("read_logs", "log-aggregator"): ("rl_agg", 0.05),
        ("read_logs", "ml-inference-service"): ("rl_ml", 0.05),
        ("read_metrics", "log-aggregator"): ("rm_agg", 0.05),
        ("read_metrics", "ml-inference-service"): ("rm_ml", 0.05),
    }
    k = (at.value, svc)
    if k in gather_map:
        tag, r = gather_map[k]
        if tag not in state.rewards_given:
            reward += r
            state.rewards_given.add(tag)

    if at == ActionType.READ_RUNBOOK:
        if "runbook" not in state.rewards_given:
            reward += 0.04
            state.rewards_given.add("runbook")

    if at == ActionType.DIAGNOSE:
        rc = action.root_cause or ""
        has_disk = semantic_match(rc, ["disk", "storage", "full", "space", "log", "aggregat"])
        has_ml = semantic_match(rc, ["ml", "inference", "model", "reload", "cpu", "loop"])
        result_text = f"Diagnosis recorded: {rc}"
        if has_disk and has_ml:
            if "diagnose_both" not in state.rewards_given:
                reward += 0.20
                state.rewards_given.add("diagnose_both")
        elif has_disk or has_ml:
            # Partial credit is pointless once the full diagnosis has been
            # paid, so it is blocked in that case (same rule as the
            # diagnose_partial gating in the easy and hard tasks).
            if (
                "diagnose_one" not in state.rewards_given
                and "diagnose_both" not in state.rewards_given
            ):
                reward += 0.08
                state.rewards_given.add("diagnose_one")

    # Fix 1: disk issue via oncall
    if at == ActionType.ALERT_ONCALL:
        reason = (action.reason or "").lower()
        if semantic_match(reason, ["disk", "log", "storage", "space", "aggregat"]):
            if "fix_disk" not in state.rewards_given:
                reward += 0.20
                state.rewards_given.add("fix_disk")
            result_text = "SRE paged for disk cleanup. Volume extension underway (~5 min)."
            if "fix_ml" in state.rewards_given:
                state.incident_resolved = True
                done = True
                info["resolution"] = "incident_resolved"
        else:
            # A page that does not mention the disk/log problem earns partial
            # credit exactly once. Without the "oncall_vague" tag this +0.08
            # could be collected on every step until fix_disk was credited.
            if (
                "fix_disk" not in state.rewards_given
                and "oncall_vague" not in state.rewards_given
            ):
                reward += 0.08
                state.rewards_given.add("oncall_vague")
            result_text = "On-call paged. Clarify disk/log issue for faster resolution."

    # Fix 2: ML reload loop via rollback (full credit) or restart (reduced credit)
    if at in (ActionType.ROLLBACK, ActionType.RESTART_SERVICE) and svc == "ml-inference-service":
        if "fix_ml" not in state.rewards_given:
            reward += 0.20 if at == ActionType.ROLLBACK else 0.12
            state.rewards_given.add("fix_ml")
        ml = state.services["ml-inference-service"]
        ml["cpu_percent"] = round(self.rng.uniform(22, 38), 1)
        ml["latency_p99_ms"] = round(self.rng.uniform(80, 140), 0)
        ml["error_rate"] = 0.0
        action_word = "rolled back" if at == ActionType.ROLLBACK else "restarted"
        result_text = f"ml-inference-service {action_word}. Reload loop stopped. CPU recovering."
        if "fix_disk" in state.rewards_given:
            state.incident_resolved = True
            done = True
            info["resolution"] = "incident_resolved"

    # Penalties: disturbing a healthy service, or idling after the first 5 steps.
    if at in (ActionType.RESTART_SERVICE, ActionType.ROLLBACK) and svc in state.healthy_services:
        reward -= 0.08
    if at == ActionType.NOOP and state.step > 5:
        reward -= 0.03

    state.total_reward = self._clamp(state.total_reward + reward)
    if state.step >= state.max_steps and not done:
        done = True
        info["reason"] = "max_steps_reached"

    # The observation is not returned from here; the call is kept because it
    # may update state as a side effect — TODO confirm in tasks.base.
    state._build_observation(last_action_result=result_text, last_action_error=error_text)
    step_reward = round(reward, 4)
    state.action_history.append({"step": state.step, "action": action.model_dump(), "reward": step_reward})
    return StepOutput(next_state=state, reward=step_reward, done=done, info=info)
|
tasks/task_easy.py
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Dict, Any, List
|
| 4 |
+
from models import Action, ActionType
|
| 5 |
+
from tasks.base import BaseTask, InternalState, StepOutput, semantic_match
|
| 6 |
+
|
| 7 |
+
# Incident start timestamp; EasyTask.initialize passes it to InternalState as
# incident_start_time.
INCIDENT_TIME = "2026-03-30T10:14:47Z"

# Candidate incidents for the easy task. EasyTask.initialize picks one entry at
# random. Each entry describes a memory-exhaustion crash loop in one service:
#   failing_service    - the service that is down and must be restarted
#   root_cause / fix   - ground truth stored on the episode state
#   alert_msg          - prefix of the critical alert message
#   language           - selects which log flavour _make_logs generates
#   diagnosis_keywords - passed to semantic_match when scoring a DIAGNOSE action
SCENARIOS = [
    {
        "failing_service": "payment-service",
        "root_cause": "memory_leak_payment_service",
        "fix": "restart payment-service",
        "alert_msg": "payment-service pod restarting (OOMKilled)",
        "language": "java",
        "diagnosis_keywords": ["memory", "oom", "heap", "leak", "outofmemory", "kill"],
    },
    {
        "failing_service": "order-service",
        "root_cause": "memory_leak_order_service",
        "fix": "restart order-service",
        "alert_msg": "order-service pod restarting (OOMKilled)",
        "language": "python",
        "diagnosis_keywords": ["memory", "oom", "heap", "leak", "segfault", "kill", "allocat"],
    },
    {
        "failing_service": "user-service",
        "root_cause": "memory_leak_user_service",
        "fix": "restart user-service",
        "alert_msg": "user-service pod restarting (OOMKilled)",
        "language": "node",
        "diagnosis_keywords": ["memory", "heap", "oom", "leak", "javascript", "kill"],
    },
]

# Every service present in an easy-task episode; iteration order determines the
# order in which service snapshots and logs are generated.
ALL_SERVICES = ["payment-service", "order-service", "user-service", "api-gateway"]
# Version string reported as current_version for each service.
VERSIONS = {
    "payment-service": "v4.2.1", "order-service": "v1.8.2",
    "user-service": "v3.0.5", "api-gateway": "v2.1.0",
}
# Call graph handed to InternalState as service_dependencies: the gateway fans
# out to the three backend services, which call nothing themselves.
DEPENDENCIES = [
    {"service": "api-gateway", "calls": ["payment-service", "order-service", "user-service"], "called_by": []},
    {"service": "payment-service", "calls": [], "called_by": ["api-gateway"]},
    {"service": "order-service", "calls": [], "called_by": ["api-gateway"]},
    {"service": "user-service", "calls": [], "called_by": ["api-gateway"]},
]
|
| 47 |
+
|
| 48 |
+
def _make_logs(scenario, heap1, heap2, restart_count):
    """Return the per-service log lines for an easy-task episode.

    Args:
        scenario: One entry of SCENARIOS; ``failing_service`` and ``language``
            are read from it.
        heap1: First memory-usage percentage shown in the warning lines.
        heap2: Second, later memory-usage percentage.
        restart_count: Restart number quoted in the final FATAL line.

    Returns:
        A dict mapping service name to a list of log lines. The failing
        service comes first, followed by the remaining ALL_SERVICES entries
        in their listed order.
    """
    broken = scenario["failing_service"]
    runtime = scenario["language"]

    # Crash transcript of the failing service, flavoured by its runtime.
    if runtime == "java":
        crash_lines = [
            "[10:13:55] INFO Request processed 200 38ms",
            f"[10:14:35] WARN Heap usage at {heap1}% - approaching threshold",
            f"[10:14:41] WARN Heap usage at {heap2}%",
            "[10:14:45] WARN GC overhead limit exceeded - major GC running",
            "[10:14:47] ERROR java.lang.OutOfMemoryError: Java heap space",
            "[10:14:47] ERROR at com.payments.ChargeProcessor.process(ChargeProcessor.java:142)",
            f"[10:14:48] FATAL Service entering crash loop - pod restart #{restart_count}",
        ]
    elif runtime == "python":
        crash_lines = [
            "[10:13:55] INFO POST /orders 200 55ms",
            f"[10:14:35] WARN RSS memory {heap1}% of pod limit",
            f"[10:14:41] WARN RSS memory {heap2}% of pod limit - approaching OOM",
            "[10:14:46] ERROR Memory allocator: no more pages available",
            "[10:14:47] ERROR Fatal Python error: Segmentation fault (memory allocator exhausted)",
            f"[10:14:48] FATAL Pod killed by OOM killer - restart #{restart_count}",
        ]
    else:
        crash_lines = [
            "[10:13:55] INFO GET /users/profile 200 9ms",
            f"[10:14:35] WARN Heap used: {heap1}% ({heap1 * 2}MB / 200MB)",
            f"[10:14:41] WARN Heap used: {heap2}% - GC pressure increasing",
            "[10:14:47] ERROR FATAL ERROR: Reached heap limit - JavaScript heap out of memory",
            f"[10:14:48] FATAL Container OOMKilled - restart #{restart_count}",
        ]

    def bystander_lines(service):
        # The gateway sees the upstream failure; every other service is quiet.
        if service == "api-gateway":
            return [
                "[10:14:30] INFO GET /api/v1/health 200 3ms",
                f"[10:14:48] WARN Upstream {broken} returned 503",
                f"[10:14:49] WARN Circuit breaker OPEN for {broken}",
            ]
        return ["[10:14:30] INFO Service healthy - 0 errors"]

    logs = {broken: crash_lines}
    logs.update(
        {service: bystander_lines(service) for service in ALL_SERVICES if service != broken}
    )
    return logs
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class EasyTask(BaseTask):
    """Easy task: one service is crash-looping on memory; restart it."""

    def initialize(self) -> InternalState:
        """Create a fresh episode around one randomly chosen SCENARIOS entry.

        Returns:
            An InternalState with one service down, the gateway degraded, the
            remaining services healthy, two alerts, and generated logs. The
            chosen scenario is attached as ``state._scenario``.
        """
        scenario = SCENARIOS[self.rng.randint(0, len(SCENARIOS) - 1)]
        broken = scenario["failing_service"]
        first_heap = self.rng.randint(74, 83)
        second_heap = first_heap + self.rng.randint(5, 10)
        restarts = self.rng.randint(2, 6)

        def sampled(low, high, digits):
            # One RNG draw, rounded the way the metric is reported.
            return round(self.rng.uniform(low, high), digits)

        def snapshot(name):
            # Draw order (cpu, memory, error rate, latency) is significant:
            # it fixes how the seeded RNG stream is consumed.
            if name == broken:
                status, deployed = "down", "2026-03-28T14:00:00Z"
                cpu = sampled(5, 20, 1)
                memory = sampled(93, 99, 1)
                errors = sampled(8.0, 15.0, 2)
                p99 = sampled(5000, 9000, 0)
                running, desired = 0, 3
            elif name == "api-gateway":
                status, deployed = "degraded", "2026-03-25T09:00:00Z"
                cpu = sampled(35, 55, 1)
                memory = sampled(40, 55, 1)
                errors = sampled(2.0, 5.0, 2)
                p99 = sampled(800, 1500, 0)
                running, desired = 2, 2
            else:
                status, deployed = "healthy", "2026-03-20T11:00:00Z"
                cpu = sampled(20, 40, 1)
                memory = sampled(30, 48, 1)
                errors = 0.0
                p99 = sampled(8, 30, 0)
                running, desired = 2, 2
            return {
                "name": name, "status": status,
                "cpu_percent": cpu,
                "memory_percent": memory,
                "error_rate": errors,
                "latency_p99_ms": p99,
                "replicas_running": running, "replicas_desired": desired,
                "current_version": VERSIONS[name],
                "last_deployed": deployed,
                "minutes_degraded": 0, "sla_breach": False,
            }

        services: Dict[str, dict] = {name: snapshot(name) for name in ALL_SERVICES}

        crash_alert = {
            "id": "A001", "severity": "critical", "service": broken,
            "message": f"{scenario['alert_msg']} - {restarts} times in 5 minutes",
            "timestamp": "2026-03-30T10:14:48Z", "acknowledged": False,
        }
        gateway_alert = {
            "id": "A002", "severity": "warning", "service": "api-gateway",
            "message": f"Upstream {broken} returning 503 - circuit breaker open",
            "timestamp": "2026-03-30T10:14:52Z", "acknowledged": False,
        }

        state = InternalState(
            episode_id=str(uuid.uuid4()), task_id="easy", step=0, max_steps=15,
            services=services, alerts=[crash_alert, gateway_alert],
            logs=_make_logs(scenario, first_heap, second_heap, restarts),
            action_history=[], total_reward=0.0, incident_resolved=False,
            ground_truth_root_cause=scenario["root_cause"],
            ground_truth_fix=scenario["fix"],
            incident_start_time=INCIDENT_TIME,
            healthy_services=[name for name in ALL_SERVICES if name != broken],
            service_dependencies=DEPENDENCIES,
        )
        state._scenario = scenario
        return state

    def step(self, state: InternalState, action: Action) -> StepOutput:
        """Apply one agent action and score it.

        Investigation of the failing service, reading the runbook and a
        correct diagnosis each pay a one-time reward. Restarting the failing
        service resolves the incident and ends the episode; restarting any
        other service, or idling after step 3, is penalised.

        Args:
            state: Mutable episode state; updated in place.
            action: The agent's action for this step.

        Returns:
            A StepOutput with the same state object, this step's reward
            rounded to 4 decimals, the done flag and an info dict.
        """
        state.step += 1
        state._apply_sla_degradation()
        kind = action.action_type
        target = action.service or ""
        scenario = state._scenario
        broken = scenario["failing_service"]
        delta = 0.0
        finished = False
        info: Dict[str, Any] = {}

        result_text, error_text = self._apply_action_to_logs(state, action)

        if kind == ActionType.READ_LOGS:
            if target == broken and "read_logs" not in state.rewards_given:
                delta += 0.15
                state.rewards_given.add("read_logs")

        elif kind == ActionType.READ_METRICS:
            if target == broken and "read_metrics" not in state.rewards_given:
                delta += 0.10
                state.rewards_given.add("read_metrics")

        elif kind == ActionType.READ_RUNBOOK:
            if "runbook" not in state.rewards_given:
                delta += 0.05
                state.rewards_given.add("runbook")

        elif kind == ActionType.DIAGNOSE:
            claim = action.root_cause or ""
            right_cause = semantic_match(claim, scenario["diagnosis_keywords"], threshold=1)
            right_service = semantic_match(claim, [broken, broken.split("-")[0]])
            result_text = f"Diagnosis recorded: {claim}"
            if right_cause and right_service:
                if "diagnose_correct" not in state.rewards_given:
                    # A full diagnosis is worth 0.30 in total; if the partial
                    # 0.15 was already paid, only the remaining 0.15 is added.
                    if "diagnose_partial" in state.rewards_given:
                        delta += 0.15
                    else:
                        delta += 0.30
                    state.rewards_given.add("diagnose_correct")
            elif right_cause:
                already_scored = (
                    "diagnose_partial" in state.rewards_given
                    or "diagnose_correct" in state.rewards_given
                )
                if not already_scored:
                    delta += 0.15
                    state.rewards_given.add("diagnose_partial")

        elif kind == ActionType.RESTART_SERVICE:
            delta += self._penalty_blind_remediation(state, action, "restarted")
            if target == broken:
                delta += 0.40
                entry = state.services[target]
                entry["status"] = "healthy"
                entry["memory_percent"] = round(self.rng.uniform(38, 48), 1)
                entry["error_rate"] = 0.0
                entry["latency_p99_ms"] = round(self.rng.uniform(20, 60), 0)
                entry["replicas_running"] = entry["replicas_desired"]
                state.alerts = [alert for alert in state.alerts if alert["id"] != "A001"]
                state.incident_resolved = True
                result_text = f"{target} restarted. Memory cleared. All pods healthy."
                finished = True
                info["resolution"] = "incident_resolved"
            elif target in state.healthy_services:
                delta -= 0.10
                error_text = f"Collateral damage: {target} was healthy. Unnecessary restart."

        elif kind == ActionType.NOOP:
            if state.step > 3:
                delta -= 0.04

        state.total_reward = self._clamp(state.total_reward + delta)
        if not finished and state.step >= state.max_steps:
            finished = True
            info["reason"] = "max_steps_reached"

        state._build_observation(last_action_result=result_text, last_action_error=error_text)
        step_reward = round(delta, 4)
        state.action_history.append(
            {"step": state.step, "action": action.model_dump(), "reward": step_reward}
        )
        return StepOutput(next_state=state, reward=step_reward, done=finished, info=info)
|
tasks/task_hard.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Dict, Any, List
|
| 4 |
+
from models import Action, ActionType
|
| 5 |
+
from tasks.base import BaseTask, InternalState, StepOutput, semantic_match
|
| 6 |
+
|
| 7 |
+
# Incident start timestamp; HardTask.initialize passes it to InternalState as
# incident_start_time.
INCIDENT_TIME = "2026-03-30T11:02:00Z"

# Call graph handed to InternalState as service_dependencies. Every edge is
# recorded on both ends: if A lists B under "calls", B lists A under
# "called_by". Previously price-validation-service and analytics-service
# appeared only as callers, leaving the graph internally inconsistent.
DEPENDENCIES = [
    {"service": "api-gateway", "calls": ["order-service", "product-catalog-service"], "called_by": []},
    {"service": "order-service", "calls": ["product-catalog-service"], "called_by": ["api-gateway", "analytics-service"]},
    {"service": "data-pipeline-service", "calls": ["product-catalog-service"], "called_by": []},
    {
        "service": "product-catalog-service",
        "calls": [],
        "called_by": ["api-gateway", "order-service", "data-pipeline-service", "price-validation-service"],
    },
    {"service": "price-validation-service", "calls": ["product-catalog-service"], "called_by": []},
    {"service": "analytics-service", "calls": ["order-service"], "called_by": []},
]
|
| 17 |
+
|
| 18 |
+
# Logs of the service that was just deployed. The "{version}" placeholder is
# substituted with the randomly chosen bad version in HardTask.initialize.
# Every line is INFO: the pipeline itself reports no errors.
PIPELINE_LOGS = [
    "[11:01:55] INFO Deployment data-pipeline-service:{version} complete",
    "[11:01:58] INFO Health check passed. Starting pipeline workers.",
    "[11:02:00] INFO Pipeline worker started. Consuming from topic: product-updates",
    "[11:02:01] INFO Processed batch: 142 records written to product-catalog",
    "[11:02:03] INFO Processed batch: 138 records written to product-catalog",
    "[11:02:07] INFO Processed batch: 147 records written to product-catalog",
    "[11:02:09] INFO All writes succeeded (HTTP 200) - no errors detected",
]

# Validation logs: catalog prices are roughly ten times the expected values,
# and the mismatch rate is far above its stated baseline.
PRICE_VALIDATION_LOGS = [
    "[11:02:08] INFO Validation batch started: 312 products",
    "[11:02:10] WARN PRICE_MISMATCH: product_id=1042 catalog=149.99 expected=14.99 (10x multiplier?)",
    "[11:02:11] WARN PRICE_MISMATCH: product_id=2891 catalog=899.00 expected=89.00",
    "[11:02:13] WARN PRICE_MISMATCH: product_id=0391 catalog=24.90 expected=2.49",
    "[11:02:14] WARN PRICE_MISMATCH: product_id=5521 catalog=1299.90 expected=129.99",
    "[11:02:17] WARN PRICE_MISMATCH: product_id=7823 catalog=49.90 expected=4.99",
    "[11:02:21] WARN PRICE_MISMATCH: product_id=3314 catalog=799.00 expected=79.90",
    "[11:02:24] INFO Validation batch complete: 265 ok, 47 mismatches (15.1% rate, baseline: 0.2%)",
    "[11:02:24] WARN Mismatch rate 15.1% exceeds SLA threshold 1.0% - notifying data team",
]

# Analytics logs: average order value jumps while order volume stays normal.
ANALYTICS_LOGS = [
    "[11:01:50] INFO Hourly report: avg_order_value=$89.42 orders=138 (normal)",
    "[11:02:00] INFO Hourly report: avg_order_value=$91.18 orders=141",
    "[11:02:10] INFO ANOMALY: avg_order_value=$312.44 (3.5x baseline) in last 2 min",
    "[11:02:20] WARN avg_order_value=$847.23 - possible pricing issue",
    "[11:02:21] INFO orders_per_minute=142 (normal: 120-160) - volume is normal",
    "[11:02:21] INFO Spike NOT correlated with marketing campaign or known event",
]

# Catalog logs: the inflated prices are accepted with HTTP 200.
CATALOG_LOGS = [
    "[11:02:01] INFO PUT /catalog/product/1042 200 8ms price=149.99",
    "[11:02:02] INFO PUT /catalog/product/2891 200 7ms price=899.00",
    "[11:02:03] INFO PUT /catalog/product/0391 200 6ms price=24.90",
    "[11:02:04] INFO PUT /catalog/product/5521 200 8ms price=1299.90",
    "[11:02:05] INFO All writes returning 200 OK - no DB errors",
]

# Gateway logs: all requests succeed with ordinary latencies.
GATEWAY_LOGS = [
    "[11:02:00] INFO GET /api/v1/products 200 12ms",
    "[11:02:05] INFO POST /api/v1/orders 200 88ms",
    "[11:02:15] INFO POST /api/v1/orders 200 91ms",
    "[11:02:20] INFO POST /api/v1/orders 200 87ms",
]

# Order logs: orders are being placed at the inflated catalog prices.
ORDER_LOGS = [
    "[11:02:05] INFO Order ORD-9901: total=$149.99 (product_id=1042)",
    "[11:02:08] INFO Order ORD-9902: total=$899.00 (product_id=2891)",
    "[11:02:12] INFO Order ORD-9903: total=$1299.90 (product_id=5521)",
]

# Extra noise alerts that don't point to the real issue. HardTask.initialize
# places them ahead of the two signal-bearing alerts.
NOISE_ALERTS = [
    {
        "id": "A030", "severity": "info", "service": "api-gateway",
        "message": "TLS certificate renewing in 14 days - scheduled maintenance upcoming",
        "timestamp": "2026-03-30T11:00:00Z", "acknowledged": False,
    },
    {
        "id": "A031", "severity": "info", "service": "analytics-service",
        "message": "Nightly aggregation job starting 5 minutes early due to backlog",
        "timestamp": "2026-03-30T11:01:45Z", "acknowledged": False,
    },
    {
        "id": "A032", "severity": "info", "service": "product-catalog-service",
        "message": "Read replica lag 280ms (threshold: 500ms) - within normal range",
        "timestamp": "2026-03-30T11:02:00Z", "acknowledged": False,
    },
]
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
class HardTask(BaseTask):
    """Hard task: a freshly deployed data pipeline silently writes wrong prices.

    Every service reports healthy; the evidence is only in the logs and in two
    low-severity alerts. The episode is resolved once the pipeline has been
    rolled back and on-call has been paged, in either order.
    """

    def initialize(self) -> InternalState:
        """Build a fresh episode with a randomly chosen bad pipeline version.

        Returns:
            An InternalState with six services all marked healthy, three noise
            alerts followed by two signal alerts, and the module's log
            fixtures. The bad version is attached as ``state._bad_ver``.
        """
        bad_ver = f"v3.1.{self.rng.randint(0, 4)}"
        logs = {
            "data-pipeline-service": [l.replace("{version}", bad_ver) for l in PIPELINE_LOGS],
            # Slices copy the fixtures so the module-level lists are not shared.
            "price-validation-service": PRICE_VALIDATION_LOGS[:],
            "analytics-service": ANALYTICS_LOGS[:],
            "product-catalog-service": CATALOG_LOGS[:],
            "api-gateway": GATEWAY_LOGS[:],
            "order-service": ORDER_LOGS[:],
        }

        def healthy_svc(name, ver, deployed):
            # Healthy-looking snapshot with randomised metrics. The two replica
            # counts are drawn independently; every caller below overrides them
            # with explicit values, but the draws still advance the RNG.
            return {
                "name": name, "status": "healthy",
                "cpu_percent": round(self.rng.uniform(22, 48), 1),
                "memory_percent": round(self.rng.uniform(35, 55), 1),
                "error_rate": 0.0,
                "latency_p99_ms": round(self.rng.uniform(8, 130), 0),
                "replicas_running": self.rng.choice([2, 3]),
                "replicas_desired": self.rng.choice([2, 3]),
                "current_version": ver, "last_deployed": deployed,
                "minutes_degraded": 0, "sla_breach": False,
            }

        services = {
            "api-gateway": {**healthy_svc("api-gateway", "v3.1.0", "2026-03-20T08:00:00Z"), "replicas_running": 2, "replicas_desired": 2},
            # The only service deployed on the incident day, running bad_ver.
            "data-pipeline-service": {**healthy_svc("data-pipeline-service", bad_ver, "2026-03-30T11:01:55Z"), "replicas_running": 3, "replicas_desired": 3},
            "product-catalog-service": {**healthy_svc("product-catalog-service", "v2.0.1", "2026-03-10T12:00:00Z"), "replicas_running": 2, "replicas_desired": 2},
            "price-validation-service": {**healthy_svc("price-validation-service", "v1.4.0", "2026-03-12T14:00:00Z"), "replicas_running": 2, "replicas_desired": 2},
            "analytics-service": {**healthy_svc("analytics-service", "v2.3.1", "2026-03-14T10:00:00Z"), "replicas_running": 2, "replicas_desired": 2},
            "order-service": {**healthy_svc("order-service", "v1.8.2", "2026-03-22T10:00:00Z"), "replicas_running": 3, "replicas_desired": 3},
        }

        # Real signal alerts + noise
        alerts = NOISE_ALERTS[:] + [
            {
                "id": "A020", "severity": "info", "service": "price-validation-service",
                "message": "Price mismatch rate 15.1% — above SLA threshold of 1.0%. Data team notified.",
                "timestamp": "2026-03-30T11:02:24Z", "acknowledged": False,
            },
            {
                "id": "A021", "severity": "warning", "service": "analytics-service",
                "message": "avg_order_value anomaly: $847.23 vs baseline $89.42 — not correlated with campaigns",
                "timestamp": "2026-03-30T11:02:21Z", "acknowledged": False,
            },
        ]

        state = InternalState(
            episode_id=str(uuid.uuid4()), task_id="hard", step=0, max_steps=25,
            services=services, alerts=alerts, logs=logs,
            action_history=[], total_reward=0.0, incident_resolved=False,
            ground_truth_root_cause=f"data_corruption_data_pipeline_{bad_ver}_incorrect_price_writes",
            ground_truth_fix="rollback data-pipeline-service then alert_oncall for data audit",
            incident_start_time=INCIDENT_TIME,
            # Every service, including the faulty pipeline, is listed as healthy.
            healthy_services=list(services.keys()),
            service_dependencies=DEPENDENCIES,
        )
        state._bad_ver = bad_ver
        return state

    def step(self, state: InternalState, action: Action) -> StepOutput:
        """Apply one agent action and score it.

        Args:
            state: Mutable episode state; updated in place.
            action: The agent's action for this step.

        Returns:
            A StepOutput with the same state object, this step's reward rounded
            to 4 decimals, the done flag, and an info dict (``resolution`` when
            resolved, ``reason`` when the step budget is exhausted).
        """
        # NOTE(review): indentation was reconstructed from a copy that had lost
        # its leading whitespace. In the ROLLBACK and ALERT_ONCALL branches the
        # lines after each one-time reward are placed outside the reward guard;
        # confirm that nesting against the repository version.
        state.step += 1
        # No SLA degradation on the hard task: all services stay green.
        at = action.action_type
        svc = action.service or ""
        reward = 0.0
        done = False
        info: Dict[str, Any] = {}

        result_text, error_text = self._apply_action_to_logs(state, action)

        # One-time rewards for inspecting the services that carry the evidence.
        gather_map = {
            ("read_logs", "price-validation-service"): ("rl_price", 0.05),
            ("read_logs", "analytics-service"): ("rl_analytics", 0.05),
            ("read_logs", "data-pipeline-service"): ("rl_pipeline", 0.05),
            ("read_metrics", "analytics-service"): ("rm_analytics", 0.10),
            ("read_metrics", "data-pipeline-service"): ("rm_pipeline", 0.10),
        }
        k = (at.value, svc)
        if k in gather_map:
            tag, r = gather_map[k]
            if tag not in state.rewards_given:
                reward += r; state.rewards_given.add(tag)

        if at == ActionType.READ_RUNBOOK:
            if "runbook" not in state.rewards_given:
                reward += 0.05; state.rewards_given.add("runbook")

        # Restarts/scale-ups are always wrong here: penalised on every use,
        # whichever service is targeted.
        if at in (ActionType.RESTART_SERVICE, ActionType.SCALE_UP):
            reward -= 0.15
            error_text = (
                f"Restarting/scaling {svc} will not fix corrupt data already written. "
                "You need to rollback the pipeline and audit the data."
            )

        if at == ActionType.DIAGNOSE:
            rc = action.root_cause or ""
            has_pipeline = semantic_match(rc, ["pipeline", "data-pipeline"])
            # NOTE(review): "data" is also a substring of "data-pipeline"; if
            # semantic_match does substring matching, naming the pipeline alone
            # would satisfy both checks — verify against tasks.base.
            has_corruption = semantic_match(rc, ["corrupt", "data", "price", "wrong", "incorrect", "mismatch"])
            result_text = f"Diagnosis recorded: {rc}"
            if has_pipeline and has_corruption:
                if "diagnose_correct" not in state.rewards_given:
                    reward += 0.20; state.rewards_given.add("diagnose_correct")
            elif has_pipeline or has_corruption:
                # Partial credit is paid once, and never after full credit.
                if "diagnose_partial" not in state.rewards_given and "diagnose_correct" not in state.rewards_given:
                    reward += 0.08; state.rewards_given.add("diagnose_partial")

        if at == ActionType.ROLLBACK and svc == "data-pipeline-service":
            # Helper is defined on the base class; presumably it returns a
            # non-positive adjustment for acting without investigation — confirm.
            reward += self._penalty_blind_remediation(state, action, "rollback_done")
            if "rollback_done" not in state.rewards_given:
                reward += 0.25; state.rewards_given.add("rollback_done")
            state.services["data-pipeline-service"]["current_version"] = "v3.0.9"
            result_text = (
                "data-pipeline-service rolled back to v3.0.9. Future writes corrected. "
                "WARNING: corrupted prices already written must be audited."
            )
            # Resolved only when on-call has also been paged.
            if "alert_oncall_done" in state.rewards_given:
                state.incident_resolved = True; done = True; info["resolution"] = "incident_resolved"

        if at == ActionType.ALERT_ONCALL:
            # Any page counts; the reason text is not inspected here.
            if "alert_oncall_done" not in state.rewards_given:
                reward += 0.15; state.rewards_given.add("alert_oncall_done")
            result_text = "On-call data team paged for price audit and correction job."
            # Resolved only when the pipeline has also been rolled back.
            if "rollback_done" in state.rewards_given:
                state.incident_resolved = True; done = True; info["resolution"] = "incident_resolved"

        state.total_reward = self._clamp(state.total_reward + reward)
        if state.step >= state.max_steps and not done:
            done = True; info["reason"] = "max_steps_reached"

        # The returned observation is not used in this method.
        obs = state._build_observation(last_action_result=result_text, last_action_error=error_text)
        state.action_history.append({"step": state.step, "action": action.model_dump(), "reward": round(reward, 4)})
        return StepOutput(next_state=state, reward=round(reward, 4), done=done, info=info)
|
tasks/task_medium.py
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import uuid
|
| 3 |
+
from typing import Dict, Any, List
|
| 4 |
+
from models import Action, ActionType
|
| 5 |
+
from tasks.base import BaseTask, InternalState, StepOutput, semantic_match
|
| 6 |
+
|
| 7 |
+
INCIDENT_TIME = "2026-03-30T10:32:01Z"
|
| 8 |
+
|
| 9 |
+
DEPENDENCIES = [
|
| 10 |
+
{"service": "api-gateway", "calls": ["order-service", "user-service"], "called_by": []},
|
| 11 |
+
{"service": "order-service", "calls": ["inventory-service"], "called_by": ["api-gateway"]},
|
| 12 |
+
{"service": "inventory-service", "calls": ["db-primary"], "called_by": ["order-service"]},
|
| 13 |
+
{"service": "notification-service", "calls": [], "called_by": []},
|
| 14 |
+
{"service": "user-service", "calls": [], "called_by": ["api-gateway"]},
|
| 15 |
+
]
|
| 16 |
+
|
| 17 |
+
# Cascading scenarios — two failure modes (connection-pool exhaustion, NullPointerException), both rooted in inventory-service
|
| 18 |
+
SCENARIOS = [
|
| 19 |
+
{
|
| 20 |
+
"root_service": "inventory-service",
|
| 21 |
+
"root_cause_template": "connection_pool_exhaustion_{service}_{version}",
|
| 22 |
+
"fix_template": "rollback {service}",
|
| 23 |
+
"error_type": "connection_pool",
|
| 24 |
+
"diagnosis_keywords": ["connection", "pool", "hikari", "db", "database", "exhaustion", "inventory"],
|
| 25 |
+
"fix_action": ActionType.ROLLBACK,
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"root_service": "inventory-service",
|
| 29 |
+
"root_cause_template": "null_pointer_exception_{service}_{version}",
|
| 30 |
+
"fix_template": "rollback {service}",
|
| 31 |
+
"error_type": "null_pointer",
|
| 32 |
+
"diagnosis_keywords": ["null", "nullpointer", "npe", "exception", "inventory", "bug", "crash"],
|
| 33 |
+
"fix_action": ActionType.ROLLBACK,
|
| 34 |
+
},
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
INV_LOGS_CONNECTION = [
|
| 38 |
+
"[10:31:58] INFO Deployment inventory-service:{version} complete - 12 pods running",
|
| 39 |
+
"[10:32:01] INFO Health check passed for inventory-service:{version}",
|
| 40 |
+
"[10:32:38] ERROR Failed to acquire connection from pool: timeout after 30000ms",
|
| 41 |
+
"[10:32:39] ERROR HikariPool-1 - Connection is not available, request timed out",
|
| 42 |
+
"[10:32:40] ERROR Connection pool exhausted (max=10, active=10, waiting=47)",
|
| 43 |
+
"[10:32:42] WARN Retry attempt 1/3 failed for getInventory(productId=1982)",
|
| 44 |
+
"[10:32:46] WARN Retry attempt 3/3 failed - returning error upstream",
|
| 45 |
+
"[10:32:48] ERROR Thread pool saturation: 98/100 threads active, queue depth 412",
|
| 46 |
+
]
|
| 47 |
+
|
| 48 |
+
INV_LOGS_NPE = [
|
| 49 |
+
"[10:31:58] INFO Deployment inventory-service:{version} complete",
|
| 50 |
+
"[10:32:01] INFO Health check passed for inventory-service:{version}",
|
| 51 |
+
"[10:32:35] ERROR NullPointerException: Cannot invoke method getStock() on null object",
|
| 52 |
+
"[10:32:35] ERROR at InventoryService.checkAvailability(InventoryService.java:218)",
|
| 53 |
+
"[10:32:36] ERROR at InventoryController.getInventory(InventoryController.java:87)",
|
| 54 |
+
"[10:32:37] WARN Exception rate 38/min - circuit breaker threshold approaching",
|
| 55 |
+
"[10:32:42] ERROR Circuit breaker OPEN - too many NullPointerExceptions",
|
| 56 |
+
"[10:32:45] ERROR getInventory returning 500 for all requests",
|
| 57 |
+
]
|
| 58 |
+
|
| 59 |
+
ORDER_LOGS = [
|
| 60 |
+
"[10:32:30] INFO Order created: order_id=ORD-8821 status=confirmed",
|
| 61 |
+
"[10:32:45] WARN inventory-service call timed out after 5000ms",
|
| 62 |
+
"[10:32:49] ERROR Order creation failed: upstream dependency unavailable",
|
| 63 |
+
"[10:32:50] ERROR Circuit breaker OPEN for inventory-service endpoint",
|
| 64 |
+
"[10:32:51] WARN Falling back to cached inventory data (may be stale)",
|
| 65 |
+
]
|
| 66 |
+
|
| 67 |
+
GATEWAY_LOGS = [
|
| 68 |
+
"[10:32:20] INFO POST /api/v1/orders 200 142ms",
|
| 69 |
+
"[10:32:50] WARN POST /api/v1/orders upstream latency 5800ms",
|
| 70 |
+
"[10:32:55] ERROR POST /api/v1/orders 503 Service Unavailable",
|
| 71 |
+
"[10:32:56] WARN Error rate for /api/v1/orders: 18% (threshold: 5%)",
|
| 72 |
+
]
|
| 73 |
+
|
| 74 |
+
NOTIF_LOGS = [
|
| 75 |
+
"[10:30:00] INFO Batch email job started: 48000 recipients",
|
| 76 |
+
"[10:31:30] INFO Sent 24000/48000 emails",
|
| 77 |
+
"[10:33:00] INFO Batch email job complete: 48000 sent, 0 failed",
|
| 78 |
+
]
|
| 79 |
+
|
| 80 |
+
USER_LOGS = ["[10:32:00] INFO GET /users/profile 200 9ms",
|
| 81 |
+
"[10:33:00] INFO GET /users/profile 200 10ms"]
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class MediumTask(BaseTask):
    """Medium task: a bad inventory-service deploy cascades to its callers.

    One of the entries in ``SCENARIOS`` is drawn with the task's seeded RNG.
    inventory-service, order-service and api-gateway start degraded, while
    notification-service (high CPU from a scheduled batch job) and
    user-service are healthy. Rolling back inventory-service resolves the
    incident and ends the episode.
    """

    def initialize(self) -> InternalState:
        """Build the initial state for a new episode.

        The order of ``self.rng`` calls is part of the seeded contract:
        scenario, bad version, then the per-service metrics in dict order.

        Returns:
            An ``InternalState`` with services, alerts, logs and ground truth
            filled in. The chosen scenario and bad version are also attached
            as ``_scenario`` and ``_bad_ver`` for use by ``step``.
        """
        scenario = SCENARIOS[self.rng.randint(0, len(SCENARIOS) - 1)]
        bad_ver = f"v2.3.{self.rng.randint(1, 5)}"
        root_svc = scenario["root_service"]

        # Any error type other than "connection_pool" uses the NPE log template.
        if scenario["error_type"] == "connection_pool":
            log_template = INV_LOGS_CONNECTION
        else:
            log_template = INV_LOGS_NPE
        inv_logs = [line.replace("{version}", bad_ver) for line in log_template]

        logs = {
            "inventory-service": inv_logs,
            "order-service": ORDER_LOGS[:],
            "api-gateway": GATEWAY_LOGS[:],
            "notification-service": NOTIF_LOGS[:],
            "user-service": USER_LOGS[:],
        }

        services = {
            "api-gateway": {
                "name": "api-gateway", "status": "degraded",
                "cpu_percent": round(self.rng.uniform(55, 70), 1),
                "memory_percent": round(self.rng.uniform(48, 60), 1),
                "error_rate": round(self.rng.uniform(3.5, 6.0), 2),
                "latency_p99_ms": round(self.rng.uniform(4500, 6500), 0),
                "replicas_running": 2, "replicas_desired": 2,
                "current_version": "v3.1.0", "last_deployed": "2026-03-20T08:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
            "order-service": {
                "name": "order-service", "status": "degraded",
                "cpu_percent": round(self.rng.uniform(60, 75), 1),
                "memory_percent": round(self.rng.uniform(55, 68), 1),
                "error_rate": round(self.rng.uniform(4.0, 8.0), 2),
                "latency_p99_ms": round(self.rng.uniform(5000, 7000), 0),
                "replicas_running": 3, "replicas_desired": 3,
                "current_version": "v1.8.2", "last_deployed": "2026-03-22T10:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
            "inventory-service": {
                "name": "inventory-service", "status": "degraded",
                "cpu_percent": round(self.rng.uniform(80, 95), 1),
                "memory_percent": round(self.rng.uniform(70, 85), 1),
                "error_rate": round(self.rng.uniform(12.0, 20.0), 2),
                "latency_p99_ms": round(self.rng.uniform(28000, 35000), 0),
                "replicas_running": 3, "replicas_desired": 3,
                "current_version": bad_ver, "last_deployed": "2026-03-30T10:31:58Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
            "notification-service": {
                "name": "notification-service", "status": "healthy",
                "cpu_percent": round(self.rng.uniform(82, 92), 1),
                "memory_percent": round(self.rng.uniform(55, 65), 1),
                "error_rate": 0.0,
                "latency_p99_ms": round(self.rng.uniform(20, 45), 0),
                "replicas_running": 2, "replicas_desired": 2,
                "current_version": "v1.2.0", "last_deployed": "2026-03-15T16:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
            "user-service": {
                "name": "user-service", "status": "healthy",
                "cpu_percent": round(self.rng.uniform(20, 35), 1),
                "memory_percent": round(self.rng.uniform(30, 42), 1),
                "error_rate": 0.0,
                "latency_p99_ms": round(self.rng.uniform(8, 20), 0),
                "replicas_running": 2, "replicas_desired": 2,
                "current_version": "v3.0.5", "last_deployed": "2026-03-18T09:00:00Z",
                "minutes_degraded": 0, "sla_breach": False,
            },
        }

        alerts = [
            {
                "id": "A010", "severity": "critical", "service": "api-gateway",
                "message": "Error rate on /api/v1/orders exceeded 15% threshold",
                "timestamp": "2026-03-30T10:32:56Z", "acknowledged": False,
            },
            {
                "id": "A011", "severity": "critical", "service": "order-service",
                "message": "Order creation failure rate 31% - circuit breaker triggered for inventory-service",
                "timestamp": "2026-03-30T10:32:51Z", "acknowledged": False,
            },
            {
                "id": "A012", "severity": "warning", "service": "inventory-service",
                "message": f"P99 latency 32100ms (threshold: 5000ms) - deployed {bad_ver} at 10:31",
                "timestamp": "2026-03-30T10:32:48Z", "acknowledged": False,
            },
            # Red herring
            {
                "id": "A013", "severity": "warning", "service": "notification-service",
                "message": "CPU usage 88% - batch email job running (scheduled, not an incident)",
                "timestamp": "2026-03-30T10:30:00Z", "acknowledged": False,
            },
        ]

        rc = scenario["root_cause_template"].format(service=root_svc, version=bad_ver)
        fix = scenario["fix_template"].format(service=root_svc)

        state = InternalState(
            episode_id=str(uuid.uuid4()), task_id="medium", step=0, max_steps=20,
            services=services, alerts=alerts, logs=logs,
            action_history=[], total_reward=0.0, incident_resolved=False,
            ground_truth_root_cause=rc, ground_truth_fix=fix,
            incident_start_time=INCIDENT_TIME,
            healthy_services=["notification-service", "user-service"],
            service_dependencies=DEPENDENCIES,
        )
        state._scenario = scenario
        state._bad_ver = bad_ver
        return state

    def step(self, state: InternalState, action: Action) -> StepOutput:
        """Apply one agent action, update ``state`` in place and score it.

        Args:
            state: Episode state produced by ``initialize``; mutated here.
            action: The agent's action for this step.

        Returns:
            A ``StepOutput`` carrying the mutated state, this step's reward
            (rounded to 4 decimals), the ``done`` flag and an ``info`` dict
            holding ``resolution`` or ``reason`` when the episode ends.
        """
        state.step += 1
        state._apply_sla_degradation()
        at = action.action_type
        svc = action.service or ""
        keywords = state._scenario["diagnosis_keywords"]
        reward = 0.0
        done = False
        info: Dict[str, Any] = {}

        def grant_once(key: str, amount: float) -> float:
            """Return ``amount`` the first time ``key`` is seen, else 0.0."""
            if key in state.rewards_given:
                return 0.0
            state.rewards_given.add(key)
            return amount

        result_text, error_text = self._apply_action_to_logs(state, action)

        # Investigation rewards, each paid at most once per episode.
        if at == ActionType.READ_LOGS and svc == "inventory-service":
            reward += grant_once("read_logs_inv", 0.10)
        if at == ActionType.READ_METRICS and svc == "inventory-service":
            reward += grant_once("read_metrics_inv", 0.10)
        if at == ActionType.READ_METRICS and svc == "order-service":
            reward += grant_once("read_metrics_ord", 0.05)
        if at == ActionType.READ_RUNBOOK:
            reward += grant_once("runbook", 0.05)

        # Red herring penalty. notification-service is also listed in
        # healthy_services, so the generic penalty further down applies too.
        if at == ActionType.RESTART_SERVICE and svc == "notification-service":
            reward -= 0.05
            error_text = "notification-service was healthy — high CPU is a scheduled batch job, not an incident."
        # Treating symptom before root cause
        if at == ActionType.RESTART_SERVICE and svc == "order-service":
            if "diagnose_correct" not in state.rewards_given:
                reward -= 0.10
                error_text = "order-service is a downstream victim. Fix inventory-service first."

        if at == ActionType.DIAGNOSE:
            rc = action.root_cause or ""
            has_service = semantic_match(rc, ["inventory"])
            has_cause = semantic_match(rc, keywords, threshold=1)
            result_text = f"Diagnosis recorded: {rc}"
            if has_service and has_cause:
                reward += grant_once("diagnose_correct", 0.25)
            elif has_service or has_cause:
                # Partial credit is not paid after a correct diagnosis.
                if "diagnose_correct" not in state.rewards_given:
                    reward += grant_once("diagnose_partial", 0.10)

        if at == ActionType.ROLLBACK and svc == "inventory-service":
            reward += self._penalty_blind_remediation(state, action, "rollback_done")
            reward += grant_once("rollback_done", 0.30)
            # Bonus for naming the known-good version explicitly.
            ver = action.version or ""
            if "v2.3.0" in ver or ver in ("previous", "last"):
                reward += 0.10
            inventory = state.services["inventory-service"]
            inventory["status"] = "healthy"
            inventory["error_rate"] = 0.0
            inventory["latency_p99_ms"] = 85.0
            inventory["current_version"] = "v2.3.0"
            # The upstream callers recover once the root service is fixed.
            state.services["order-service"]["status"] = "healthy"
            state.services["order-service"]["error_rate"] = 0.0
            state.services["api-gateway"]["status"] = "healthy"
            state.services["api-gateway"]["error_rate"] = 0.1
            state.alerts = [a for a in state.alerts if a["id"] not in ("A010", "A011", "A012")]
            state.incident_resolved = True
            result_text = "inventory-service rolled back. Downstream services recovering."
            done = True
            info["resolution"] = "incident_resolved"

        if at in (ActionType.RESTART_SERVICE, ActionType.ROLLBACK) and svc in state.healthy_services:
            reward -= 0.10
        if at == ActionType.NOOP and state.step > 4:
            reward -= 0.03

        state.total_reward = self._clamp(state.total_reward + reward)
        if state.step >= state.max_steps and not done:
            done = True
            info["reason"] = "max_steps_reached"

        # The return value is not used here; the call is kept in case it
        # refreshes observation data held on the state — TODO confirm.
        state._build_observation(last_action_result=result_text, last_action_error=error_text)
        state.action_history.append({"step": state.step, "action": action.model_dump(), "reward": round(reward, 4)})
        return StepOutput(next_state=state, reward=round(reward, 4), done=done, info=info)
|
validate.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
validate.py — Pre-submission validation script.
|
| 4 |
+
|
| 5 |
+
Run this before submitting to confirm all checklist items pass:
|
| 6 |
+
python validate.py
|
| 7 |
+
|
| 8 |
+
Exit code 0 = all checks passed.
|
| 9 |
+
Exit code 1 = one or more checks failed.
|
| 10 |
+
"""
|
| 11 |
+
import sys
|
| 12 |
+
import os
|
| 13 |
+
import random
|
| 14 |
+
import traceback
|
| 15 |
+
|
| 16 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 17 |
+
|
| 18 |
+
# ANSI-coloured status glyphs used in the report.
PASS = "\033[92m✓\033[0m"
FAIL = "\033[91m✗\033[0m"
WARN = "\033[93m!\033[0m"

# Names of every check that failed, in execution order.
failures = []


def check(name: str, fn):
    """Run one validation check and print its outcome.

    Args:
        name: Human-readable label printed next to the status glyph.
        fn: Zero-argument callable. Returning ``True`` or ``None`` counts as
            a pass; any other return value is printed as the failure reason.

    Returns:
        ``True`` if the check passed, ``False`` otherwise. A failed check's
        name is appended to the module-level ``failures`` list.
    """
    try:
        result = fn()
    except Exception as e:
        # A bare ``assert`` has an empty message, so include the exception
        # type to keep the failure line informative.
        print(f" {FAIL} {name}: {type(e).__name__}: {e}")
        traceback.print_exc()
        failures.append(name)
        return False

    if result is True or result is None:
        print(f" {PASS} {name}")
        return True

    print(f" {FAIL} {name}: {result}")
    failures.append(name)
    return False
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def main():
    """Run every pre-submission check, print a report and exit.

    Exits the process with status 0 when all checks pass and 1 when at
    least one failed. Project modules are imported inside the individual
    checks so that an import error is reported as a failed check.
    """
    print("\n=== DevOps Incident Response — OpenEnv Validation ===\n")

    all_tasks = ["easy", "medium", "hard", "bonus"]

    def random_rollout(env, rng, max_actions, on_step=None):
        """Drive ``env`` with random action types until done or the cap."""
        from models import Action, ActionType
        action_types = list(ActionType)
        done = False
        steps = 0
        while not done and steps < max_actions:
            result = env.step(Action(action_type=rng.choice(action_types)))
            if on_step is not None:
                on_step(result)
            done = result.done
            steps += 1

    def grade(env, task_id):
        """Score the episode currently held by ``env`` with the grader."""
        from graders.grader import grade_episode
        s = env.state()
        return grade_episode(
            task_id, s.action_history, s.ground_truth_root_cause,
            s.ground_truth_fix, s.incident_resolved, s.total_reward,
        )

    def easy_failing_service(env):
        """Derive the failing service name from the easy task's ground truth."""
        root_cause = env.state().ground_truth_root_cause
        return root_cause.replace("memory_leak_", "").replace("_", "-")

    # --- Imports ---
    print("[ Imports ]")

    def check_imports():
        # The imports themselves are the test; the names are not used.
        from env import DevOpsIncidentEnv  # noqa: F401
        from models import Action, ActionType, Observation, StepResult, State  # noqa: F401
        from graders.grader import grade_episode  # noqa: F401
        return True

    check("All modules import cleanly", check_imports)

    # --- Reset returns valid Observation ---
    print("\n[ reset() ]")

    def check_reset_easy():
        from env import DevOpsIncidentEnv
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        obs = env.reset()
        assert obs.step == 0
        assert len(obs.services) > 0
        assert len(obs.active_alerts) > 0
        assert obs.task_id == "easy"
        return True

    def check_reset_all_tasks():
        from env import DevOpsIncidentEnv
        for task_id in all_tasks:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            obs = env.reset()
            assert obs.task_id == task_id, f"task_id mismatch for {task_id}"
            assert obs.max_steps > 0
        return True

    def check_reset_reproducible():
        from env import DevOpsIncidentEnv
        results = []
        for _ in range(3):
            env = DevOpsIncidentEnv(task_id="easy", seed=42)
            obs = env.reset()
            results.append(obs.services[0].memory_percent)
        assert len(set(results)) == 1, f"Different results for same seed: {results}"
        return True

    def check_seed_variety():
        from env import DevOpsIncidentEnv
        roots = set()
        for seed in range(10):
            env = DevOpsIncidentEnv(task_id="easy", seed=seed)
            env.reset()
            roots.add(env.state().ground_truth_root_cause)
        assert len(roots) > 1, f"All seeds produce same scenario: {roots}"
        return True

    check("reset() returns valid Observation for easy task", check_reset_easy)
    check("reset() works for all 4 tasks", check_reset_all_tasks)
    check("Same seed always produces same episode", check_reset_reproducible)
    check("Different seeds produce different scenarios", check_seed_variety)

    # --- step() ---
    print("\n[ step() ]")

    def check_step_returns_result():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType, StepResult
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        result = env.step(Action(action_type=ActionType.NOOP))
        assert isinstance(result, StepResult)
        assert isinstance(result.reward, float)
        assert isinstance(result.done, bool)
        assert result.observation.step == 1
        return True

    def check_step_reward_in_range():
        from env import DevOpsIncidentEnv

        def assert_in_range(result):
            assert -1.0 <= result.reward <= 1.0, f"reward={result.reward} out of range"

        # One RNG is shared across the tasks, so the action stream continues
        # from task to task.
        rng = random.Random(0)
        for task_id in all_tasks:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            env.reset()
            random_rollout(env, rng, 30, on_step=assert_in_range)
        return True

    def check_max_steps_terminates():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        # Bound the loop by the task's own step budget, not a fixed number.
        limit = env.reset().max_steps
        done = False
        steps = 0
        while not done:
            result = env.step(Action(action_type=ActionType.NOOP))
            done = result.done
            steps += 1
            # Asserting inside the loop keeps a broken env from hanging here.
            assert steps <= limit, "Episode never terminated"
        return True

    check("step() returns valid StepResult", check_step_returns_result)
    check("step() rewards always in [-1.0, 1.0]", check_step_reward_in_range)
    check("Episode terminates at max_steps", check_max_steps_terminates)

    # --- state() ---
    print("\n[ state() ]")

    def check_state_has_ground_truth():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="medium", seed=42)
        env.reset()
        env.step(Action(action_type=ActionType.NOOP))
        s = env.state()
        assert s.ground_truth_root_cause != ""
        assert s.ground_truth_fix != ""
        assert len(s.action_history) == 1
        return True

    check("state() returns ground truth and action history", check_state_has_ground_truth)

    # --- Graders ---
    print("\n[ Graders ]")

    def check_graders_in_range():
        from env import DevOpsIncidentEnv
        rng = random.Random(99)
        for task_id in all_tasks:
            env = DevOpsIncidentEnv(task_id=task_id, seed=42)
            env.reset()
            random_rollout(env, rng, 30)
            score = grade(env, task_id)
            assert 0.0 <= score <= 1.0, f"{task_id} score={score} out of [0,1]"
        return True

    def check_graders_not_constant():
        from env import DevOpsIncidentEnv
        scores = []
        for seed in [1, 2, 3, 42, 99]:
            rng = random.Random(seed * 7)
            env = DevOpsIncidentEnv(task_id="easy", seed=seed)
            env.reset()
            random_rollout(env, rng, 15)
            scores.append(grade(env, "easy"))
        assert len(set(scores)) > 1, f"Grader returns constant score: {scores}"
        return True

    def check_optimal_agent_scores_high():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        # Easy task optimal sequence
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        failing = easy_failing_service(env)
        for act in [
            Action(action_type=ActionType.READ_LOGS, service=failing),
            Action(action_type=ActionType.READ_METRICS, service=failing),
            Action(action_type=ActionType.DIAGNOSE, root_cause=f"memory leak {failing}"),
            Action(action_type=ActionType.RESTART_SERVICE, service=failing),
        ]:
            result = env.step(act)
            if result.done:
                break
        score = grade(env, "easy")
        assert score >= 0.85, f"Optimal agent scored only {score:.3f} on easy"
        return True

    check("All graders return scores in [0.0, 1.0]", check_graders_in_range)
    check("Grader does not return constant scores across episodes", check_graders_not_constant)
    check("Optimal agent scores >= 0.85 on easy task", check_optimal_agent_scores_high)

    # --- Collateral damage penalty ---
    print("\n[ Reward shaping ]")

    def check_collateral_damage_penalty():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        s0 = env.state()
        healthy = [svc for svc in s0.current_observation.services
                   if svc.status == "healthy"]
        assert len(healthy) > 0, "No healthy services to test with"
        result = env.step(Action(action_type=ActionType.RESTART_SERVICE,
                                 service=healthy[0].name))
        assert result.reward < 0, f"Expected negative reward for healthy restart, got {result.reward}"
        return True

    def check_info_gathering_rewarded():
        from env import DevOpsIncidentEnv
        from models import Action, ActionType
        env = DevOpsIncidentEnv(task_id="easy", seed=42)
        env.reset()
        failing = easy_failing_service(env)
        result = env.step(Action(action_type=ActionType.READ_LOGS, service=failing))
        assert result.reward > 0, f"Expected positive reward for reading failing service logs, got {result.reward}"
        return True

    check("Restarting healthy service gives negative reward", check_collateral_damage_penalty)
    check("Reading failing service logs gives positive reward", check_info_gathering_rewarded)

    # --- Files present ---
    print("\n[ Required files ]")

    base_dir = os.path.dirname(__file__)
    for fname in ["openenv.yaml", "Dockerfile", "requirements.txt",
                  "inference.py", "README.md", "env.py", "api.py"]:
        path = os.path.join(base_dir, fname)
        # Bind ``path`` as a default so each lambda keeps its own value.
        check(f"{fname} exists", lambda p=path: os.path.exists(p) or f"Missing: {p}")

    # --- Summary ---
    print()
    if not failures:
        print(f"{PASS} All checks passed! Ready to submit.\n")
        sys.exit(0)
    else:
        print(f"{FAIL} {len(failures)} check(s) failed: {failures}\n")
        sys.exit(1)


if __name__ == "__main__":
    main()
|