Improvements: partial log obs, search_logs action, CoT inference, dashboard UI, README motivation
Browse files
- README.md +59 -16
- api.py +147 -0
- inference.py +58 -10
- models.py +2 -0
- openenv.yaml +2 -0
- tasks/base.py +31 -3
- tasks/task_bonus.py +2 -0
- tasks/task_easy.py +3 -3
- tasks/task_hard.py +3 -0
- tasks/task_medium.py +3 -3
README.md
CHANGED
|
@@ -30,22 +30,64 @@ remediation, while penalising collateral damage and blind actions.
|
|
| 30 |
|
| 31 |
---
|
| 32 |
|
| 33 |
-
##
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
- **
|
| 46 |
-
- **
|
| 47 |
-
- **
|
| 48 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
---
|
| 51 |
|
|
@@ -83,6 +125,7 @@ and exact metric values.
|
|
| 83 |
| `read_logs` | `service` (str) | Fetch recent log lines for a service |
|
| 84 |
| `read_metrics` | `service` (str) | Fetch CPU, memory, error rate, P99 latency |
|
| 85 |
| `read_runbook` | `runbook` (str) | Read an operational runbook |
|
|
|
|
| 86 |
| `restart_service` | `service` (str) | Restart a service (clears memory/connections) |
|
| 87 |
| `rollback` | `service`, `version` | Roll back to a previous artifact version |
|
| 88 |
| `scale_up` | `service` (str) | Increase replica count |
|
|
|
|
| 30 |
|
| 31 |
---
|
| 32 |
|
| 33 |
+
## Motivation
|
| 34 |
+
|
| 35 |
+
Existing agent benchmarks focus on software engineering (SWE-bench),
|
| 36 |
+
web navigation (WebArena), or general tool use (AgentBench). None of these
|
| 37 |
+
model **operational intelligence** — the ability to reason under
|
| 38 |
+
uncertainty about live production systems.
|
| 39 |
+
|
| 40 |
+
Yet incident response is one of the highest-stakes, highest-frequency
|
| 41 |
+
tasks in software organizations. Every company running microservices
|
| 42 |
+
faces this daily. The skills required are exactly what distinguishes
|
| 43 |
+
capable AI agents from weak ones:
|
| 44 |
+
|
| 45 |
+
- **Multi-step information gathering** under time pressure
|
| 46 |
+
- **Causal reasoning** over dependent systems
|
| 47 |
+
- **Precise action selection** where wrong actions cause additional damage
|
| 48 |
+
- **Signal vs noise discrimination** (red-herring alerts, silent failures)
|
| 49 |
+
|
| 50 |
+
This environment fills that gap. It is the first OpenEnv-compliant RL
|
| 51 |
+
environment specifically designed to benchmark agent performance on
|
| 52 |
+
production incident response.
|
| 53 |
+
|
| 54 |
+
### Comparison to Existing Benchmarks
|
| 55 |
+
|
| 56 |
+
| Benchmark | Domain | Multi-step | Real-world | Partial obs | Dense reward |
|
| 57 |
+
|---|---|---|---|---|---|
|
| 58 |
+
| SWE-bench | Code repair | ✓ | ✓ | ✗ | ✗ |
|
| 59 |
+
| WebArena | Web navigation | ✓ | ✓ | ✓ | ✗ |
|
| 60 |
+
| AgentBench | General tools | ✓ | Partial | ✗ | ✗ |
|
| 61 |
+
| **DevOps-IR (ours)** | **Incident response** | **✓** | **✓** | **✓** | **✓** |
|
| 62 |
+
|
| 63 |
+
### Episode Architecture
|
| 64 |
+
```mermaid
|
| 65 |
+
graph TD
|
| 66 |
+
A[Agent] -->|Action| B[DevOpsIncidentEnv]
|
| 67 |
+
B -->|Observation| A
|
| 68 |
+
B --> C[ServiceStatus x N]
|
| 69 |
+
B --> D[AlertList]
|
| 70 |
+
B --> E[EvidenceLog]
|
| 71 |
+
B --> F[DependencyMap]
|
| 72 |
+
B --> G[SLAStatus]
|
| 73 |
+
H[Grader] -->|score 0-1| I[Episode Analytics]
|
| 74 |
+
B -->|done=True| H
|
| 75 |
+
I --> J[steps_to_diagnosis]
|
| 76 |
+
I --> K[info_gathering_ratio]
|
| 77 |
+
I --> L[collateral_damage_events]
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
### What Makes This Hard
|
| 81 |
+
|
| 82 |
+
The four tasks are designed to require qualitatively different
|
| 83 |
+
reasoning strategies:
|
| 84 |
+
|
| 85 |
+
- **Easy**: Direct signal reading — logs clearly show OOM, fix is obvious
|
| 86 |
+
- **Medium**: Dependency tracing — must follow the call chain to find the root cause
|
| 87 |
+
- **Hard**: Anomaly correlation — zero error alerts, signal buried in WARN
|
| 88 |
+
logs and business metrics across 6 services
|
| 89 |
+
- **Bonus**: Parallel diagnosis — two unrelated failures, agent must
|
| 90 |
+
decompose and fix independently
|
| 91 |
|
| 92 |
---
|
| 93 |
|
|
|
|
| 125 |
| `read_logs` | `service` (str) | Fetch recent log lines for a service |
|
| 126 |
| `read_metrics` | `service` (str) | Fetch CPU, memory, error rate, P99 latency |
|
| 127 |
| `read_runbook` | `runbook` (str) | Read an operational runbook |
|
| 128 |
+
| `search_logs` | `service`, `query` | Search log lines matching a keyword |
|
| 129 |
| `restart_service` | `service` (str) | Restart a service (clears memory/connections) |
|
| 130 |
| `rollback` | `service`, `version` | Roll back to a previous artifact version |
|
| 131 |
| `scale_up` | `service` (str) | Increase replica count |
|
api.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from __future__ import annotations
|
| 2 |
from fastapi import FastAPI, HTTPException
|
|
|
|
| 3 |
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
from pydantic import BaseModel
|
| 5 |
from typing import Optional
|
|
@@ -33,6 +34,152 @@ class ResetRequest(BaseModel):
|
|
| 33 |
seed: Optional[int] = None
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
@app.get("/health")
|
| 37 |
def health():
|
| 38 |
return {"status": "ok", "env": "devops-incident-response", "version": "1.0.0"}
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
from fastapi import FastAPI, HTTPException
|
| 3 |
+
from fastapi.responses import HTMLResponse
|
| 4 |
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
from pydantic import BaseModel
|
| 6 |
from typing import Optional
|
|
|
|
| 34 |
seed: Optional[int] = None
|
| 35 |
|
| 36 |
|
| 37 |
+
@app.get("/", response_class=HTMLResponse)
|
| 38 |
+
def dashboard():
|
| 39 |
+
env_state = None
|
| 40 |
+
if _env is not None:
|
| 41 |
+
try:
|
| 42 |
+
s = _env.state()
|
| 43 |
+
env_state = s
|
| 44 |
+
except Exception:
|
| 45 |
+
pass
|
| 46 |
+
|
| 47 |
+
task_info = ""
|
| 48 |
+
if env_state:
|
| 49 |
+
task_info = f"""
|
| 50 |
+
<div class="stat">
|
| 51 |
+
<span class="label">Current Task</span>
|
| 52 |
+
<span class="value">{env_state.task_id.upper()}</span>
|
| 53 |
+
</div>
|
| 54 |
+
<div class="stat">
|
| 55 |
+
<span class="label">Step</span>
|
| 56 |
+
<span class="value">{env_state.step} / {env_state.current_observation.max_steps}</span>
|
| 57 |
+
</div>
|
| 58 |
+
<div class="stat">
|
| 59 |
+
<span class="label">Score So Far</span>
|
| 60 |
+
<span class="value">{env_state.info.get('current_score', 0):.3f}</span>
|
| 61 |
+
</div>
|
| 62 |
+
<div class="stat">
|
| 63 |
+
<span class="label">Resolved</span>
|
| 64 |
+
<span class="value">{'YES' if env_state.incident_resolved else 'NO'}</span>
|
| 65 |
+
</div>
|
| 66 |
+
<div class="stat">
|
| 67 |
+
<span class="label">Evidence Gathered</span>
|
| 68 |
+
<span class="value">{len(env_state.current_observation.evidence_log)} items</span>
|
| 69 |
+
</div>
|
| 70 |
+
"""
|
| 71 |
+
else:
|
| 72 |
+
task_info = '<div class="stat"><span class="label">Status</span><span class="value">No active episode — call /reset to start</span></div>'
|
| 73 |
+
|
| 74 |
+
html = f"""<!DOCTYPE html>
|
| 75 |
+
<html>
|
| 76 |
+
<head>
|
| 77 |
+
<title>DevOps Incident Response — OpenEnv</title>
|
| 78 |
+
<meta charset="utf-8">
|
| 79 |
+
<meta http-equiv="refresh" content="10">
|
| 80 |
+
<style>
|
| 81 |
+
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
| 82 |
+
background: #0f1117; color: #e0e0e0; margin: 0; padding: 2rem; }}
|
| 83 |
+
h1 {{ color: #ff6b35; font-size: 1.8rem; margin-bottom: 0.25rem; }}
|
| 84 |
+
h2 {{ color: #888; font-size: 1rem; font-weight: 400; margin-bottom: 2rem; }}
|
| 85 |
+
.grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem; }}
|
| 86 |
+
.stat {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; }}
|
| 87 |
+
.label {{ display: block; color: #888; font-size: 0.75rem; text-transform: uppercase; letter-spacing: 0.05em; margin-bottom: 0.5rem; }}
|
| 88 |
+
.value {{ display: block; font-size: 1.4rem; font-weight: 600; color: #fff; }}
|
| 89 |
+
.tasks {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 1rem; margin-bottom: 2rem; }}
|
| 90 |
+
.task {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; }}
|
| 91 |
+
.task h3 {{ margin: 0 0 0.5rem; color: #ff6b35; font-size: 1rem; }}
|
| 92 |
+
.task p {{ margin: 0; color: #aaa; font-size: 0.85rem; line-height: 1.5; }}
|
| 93 |
+
.badge {{ display: inline-block; padding: 0.2rem 0.6rem; border-radius: 4px; font-size: 0.7rem; font-weight: 600; margin-bottom: 0.5rem; }}
|
| 94 |
+
.easy {{ background: #1a3a1a; color: #4caf50; }}
|
| 95 |
+
.medium {{ background: #3a2a1a; color: #ff9800; }}
|
| 96 |
+
.hard {{ background: #3a1a1a; color: #f44336; }}
|
| 97 |
+
.bonus {{ background: #1a1a3a; color: #9c27b0; }}
|
| 98 |
+
.endpoints {{ background: #1a1d27; border: 1px solid #2d3148; border-radius: 8px; padding: 1.25rem; margin-bottom: 2rem; }}
|
| 99 |
+
.endpoints h3 {{ margin: 0 0 1rem; color: #fff; }}
|
| 100 |
+
.endpoint {{ display: flex; align-items: center; gap: 0.75rem; margin-bottom: 0.5rem; }}
|
| 101 |
+
.method {{ background: #1e3a5f; color: #64b5f6; padding: 0.15rem 0.5rem; border-radius: 4px; font-size: 0.75rem; font-weight: 600; font-family: monospace; }}
|
| 102 |
+
.path {{ color: #81c784; font-family: monospace; font-size: 0.85rem; }}
|
| 103 |
+
.desc {{ color: #888; font-size: 0.8rem; }}
|
| 104 |
+
.footer {{ color: #555; font-size: 0.8rem; text-align: center; margin-top: 2rem; }}
|
| 105 |
+
</style>
|
| 106 |
+
</head>
|
| 107 |
+
<body>
|
| 108 |
+
<h1>DevOps Incident Response</h1>
|
| 109 |
+
<h2>OpenEnv — Meta x PyTorch x Hugging Face Hackathon Submission</h2>
|
| 110 |
+
|
| 111 |
+
<div class="grid">
|
| 112 |
+
{task_info}
|
| 113 |
+
</div>
|
| 114 |
+
|
| 115 |
+
<div class="tasks">
|
| 116 |
+
<div class="task">
|
| 117 |
+
<span class="badge easy">EASY</span>
|
| 118 |
+
<h3>Single Service OOM</h3>
|
| 119 |
+
<p>One service crash-loops from a memory leak. Which service varies by seed. Max 15 steps.</p>
|
| 120 |
+
</div>
|
| 121 |
+
<div class="task">
|
| 122 |
+
<span class="badge medium">MEDIUM</span>
|
| 123 |
+
<h3>Cascading Failure</h3>
|
| 124 |
+
<p>Bad deployment cascades through 3 services. One red-herring alert included. Max 20 steps.</p>
|
| 125 |
+
</div>
|
| 126 |
+
<div class="task">
|
| 127 |
+
<span class="badge hard">HARD</span>
|
| 128 |
+
<h3>Silent Data Corruption</h3>
|
| 129 |
+
<p>All services green. No error alerts. Requires correlating subtle business metric signals. Max 25 steps.</p>
|
| 130 |
+
</div>
|
| 131 |
+
<div class="task">
|
| 132 |
+
<span class="badge bonus">BONUS</span>
|
| 133 |
+
<h3>Dual Simultaneous Failure</h3>
|
| 134 |
+
<p>Two independent failures at once. Both must be fixed for full credit. Max 25 steps.</p>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
|
| 138 |
+
<div class="endpoints">
|
| 139 |
+
<h3>API Endpoints</h3>
|
| 140 |
+
<div class="endpoint">
|
| 141 |
+
<span class="method">GET</span>
|
| 142 |
+
<span class="path">/health</span>
|
| 143 |
+
<span class="desc">Health check</span>
|
| 144 |
+
</div>
|
| 145 |
+
<div class="endpoint">
|
| 146 |
+
<span class="method">POST</span>
|
| 147 |
+
<span class="path">/reset</span>
|
| 148 |
+
<span class="desc">Start new episode — body: {{"task_id": "easy", "seed": 42}}</span>
|
| 149 |
+
</div>
|
| 150 |
+
<div class="endpoint">
|
| 151 |
+
<span class="method">POST</span>
|
| 152 |
+
<span class="path">/step</span>
|
| 153 |
+
<span class="desc">Take one action — body: Action JSON</span>
|
| 154 |
+
</div>
|
| 155 |
+
<div class="endpoint">
|
| 156 |
+
<span class="method">GET</span>
|
| 157 |
+
<span class="path">/state</span>
|
| 158 |
+
<span class="desc">Full state with ground truth and analytics</span>
|
| 159 |
+
</div>
|
| 160 |
+
<div class="endpoint">
|
| 161 |
+
<span class="method">GET</span>
|
| 162 |
+
<span class="path">/validate</span>
|
| 163 |
+
<span class="desc">Self-validation report for all 4 tasks</span>
|
| 164 |
+
</div>
|
| 165 |
+
<div class="endpoint">
|
| 166 |
+
<span class="method">GET</span>
|
| 167 |
+
<span class="path">/docs</span>
|
| 168 |
+
<span class="desc">Interactive API documentation (Swagger UI)</span>
|
| 169 |
+
</div>
|
| 170 |
+
</div>
|
| 171 |
+
|
| 172 |
+
<div class="footer">
|
| 173 |
+
Auto-refreshes every 10 seconds |
|
| 174 |
+
<a href="/docs" style="color:#ff6b35;">API Docs</a> |
|
| 175 |
+
<a href="/validate" style="color:#ff6b35;">Run Validation</a> |
|
| 176 |
+
<a href="/health" style="color:#ff6b35;">Health Check</a>
|
| 177 |
+
</div>
|
| 178 |
+
</body>
|
| 179 |
+
</html>"""
|
| 180 |
+
return html
|
| 181 |
+
|
| 182 |
+
|
| 183 |
@app.get("/health")
|
| 184 |
def health():
|
| 185 |
return {"status": "ok", "env": "devops-incident-response", "version": "1.0.0"}
|
inference.py
CHANGED
|
@@ -37,15 +37,17 @@ dependency map, and a log of all evidence you have gathered so far.
|
|
| 37 |
|
| 38 |
Your strategy:
|
| 39 |
1. Read logs and metrics for the most suspicious services BEFORE acting
|
| 40 |
-
2. Use
|
| 41 |
-
3.
|
| 42 |
-
4.
|
| 43 |
-
5.
|
|
|
|
| 44 |
|
| 45 |
Respond with ONLY a valid JSON object — no markdown, no commentary:
|
| 46 |
{
|
| 47 |
-
"action_type": "<diagnose|read_logs|read_metrics|read_runbook|restart_service|rollback|scale_up|alert_oncall|acknowledge|noop>",
|
| 48 |
"service": "<service name or null>",
|
|
|
|
| 49 |
"root_cause": "<diagnosis string if action_type is diagnose, else null>",
|
| 50 |
"runbook": "<runbook filename if action_type is read_runbook, else null>",
|
| 51 |
"version": "<version string if action_type is rollback, else null>",
|
|
@@ -56,6 +58,19 @@ Available runbooks: high_cpu.md, memory_leak.md, db_connection.md,
|
|
| 56 |
deployment_rollback.md, cascade_failure.md, data_corruption.md
|
| 57 |
""").strip()
|
| 58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
def observation_to_text(obs: Observation) -> str:
|
| 61 |
lines = [
|
|
@@ -158,6 +173,7 @@ def parse_action(response_text: str) -> Action:
|
|
| 158 |
return Action(
|
| 159 |
action_type=ActionType(at_str),
|
| 160 |
service=data.get("service"),
|
|
|
|
| 161 |
root_cause=data.get("root_cause"),
|
| 162 |
runbook=data.get("runbook"),
|
| 163 |
version=data.get("version"),
|
|
@@ -183,18 +199,49 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
|
| 183 |
prompt = observation_to_text(obs)
|
| 184 |
|
| 185 |
try:
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
model=MODEL_NAME,
|
| 188 |
messages=[
|
| 189 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 190 |
{"role": "user", "content": prompt},
|
|
|
|
|
|
|
| 191 |
],
|
| 192 |
-
temperature=
|
| 193 |
-
max_tokens=
|
| 194 |
)
|
| 195 |
-
response_text =
|
| 196 |
except Exception as exc:
|
| 197 |
print(f" Step {step:02d}: API error — {exc}")
|
|
|
|
| 198 |
response_text = ""
|
| 199 |
|
| 200 |
action = parse_action(response_text)
|
|
@@ -213,7 +260,8 @@ def run_task(client: OpenAI, task_id: str, seed: int = 42) -> dict:
|
|
| 213 |
|
| 214 |
reward_str = f" reward={result.reward:+.3f}" if result.reward != 0 else ""
|
| 215 |
resolution_str = f" *** {result.info.get('resolution', '')} ***" if result.done and result.info.get("resolution") else ""
|
| 216 |
-
print(f" Step {step:02d}: {
|
|
|
|
| 217 |
|
| 218 |
if obs.last_action_error:
|
| 219 |
print(f" ⚠ {obs.last_action_error[:80]}")
|
|
|
|
| 37 |
|
| 38 |
Your strategy:
|
| 39 |
1. Read logs and metrics for the most suspicious services BEFORE acting
|
| 40 |
+
2. Use search_logs to find specific error patterns efficiently instead of reading all logs when you know what to look for.
|
| 41 |
+
3. Use the dependency map to trace cascades to their ROOT cause
|
| 42 |
+
4. Issue a DIAGNOSE action once you have enough evidence
|
| 43 |
+
5. Apply the precise fix — wrong service or wrong action loses points
|
| 44 |
+
6. On hard incidents: both rollback AND alert_oncall may be required
|
| 45 |
|
| 46 |
Respond with ONLY a valid JSON object — no markdown, no commentary:
|
| 47 |
{
|
| 48 |
+
"action_type": "<diagnose|read_logs|search_logs|read_metrics|read_runbook|restart_service|rollback|scale_up|alert_oncall|acknowledge|noop>",
|
| 49 |
"service": "<service name or null>",
|
| 50 |
+
"query": "<search keyword if action_type is search_logs, else null>",
|
| 51 |
"root_cause": "<diagnosis string if action_type is diagnose, else null>",
|
| 52 |
"runbook": "<runbook filename if action_type is read_runbook, else null>",
|
| 53 |
"version": "<version string if action_type is rollback, else null>",
|
|
|
|
| 58 |
deployment_rollback.md, cascade_failure.md, data_corruption.md
|
| 59 |
""").strip()
|
| 60 |
|
| 61 |
+
REASONING_PROMPT = """
|
| 62 |
+
You are a senior DevOps engineer responding to a production incident.
|
| 63 |
+
|
| 64 |
+
Before deciding your next action, think through what you know:
|
| 65 |
+
1. What services are affected and what is their status?
|
| 66 |
+
2. What evidence have you gathered so far?
|
| 67 |
+
3. What is the most likely root cause based on your evidence?
|
| 68 |
+
4. What is the single most valuable piece of information still missing?
|
| 69 |
+
5. What action would best close that information gap?
|
| 70 |
+
|
| 71 |
+
Respond in plain text with your reasoning. Be concise (3-5 sentences).
|
| 72 |
+
Do NOT output a JSON action yet — just your analysis.
|
| 73 |
+
""".strip()
|
| 74 |
|
| 75 |
def observation_to_text(obs: Observation) -> str:
|
| 76 |
lines = [
|
|
|
|
| 173 |
return Action(
|
| 174 |
action_type=ActionType(at_str),
|
| 175 |
service=data.get("service"),
|
| 176 |
+
query=data.get("query"),
|
| 177 |
root_cause=data.get("root_cause"),
|
| 178 |
runbook=data.get("runbook"),
|
| 179 |
version=data.get("version"),
|
|
|
|
| 199 |
prompt = observation_to_text(obs)
|
| 200 |
|
| 201 |
try:
|
| 202 |
+
reasoning_completion = client.chat.completions.create(
|
| 203 |
+
model=MODEL_NAME,
|
| 204 |
+
messages=[
|
| 205 |
+
{"role": "system", "content": REASONING_PROMPT},
|
| 206 |
+
{"role": "user", "content": prompt},
|
| 207 |
+
],
|
| 208 |
+
temperature=0.3,
|
| 209 |
+
max_tokens=256,
|
| 210 |
+
)
|
| 211 |
+
reasoning = reasoning_completion.choices[0].message.content or ""
|
| 212 |
+
|
| 213 |
+
action_prompt = f"""
|
| 214 |
+
Based on your analysis:
|
| 215 |
+
{reasoning}
|
| 216 |
+
|
| 217 |
+
Now output your action as a JSON object:
|
| 218 |
+
{{
|
| 219 |
+
"action_type": "...",
|
| 220 |
+
"service": "...",
|
| 221 |
+
"query": "...",
|
| 222 |
+
"root_cause": "...",
|
| 223 |
+
"runbook": "...",
|
| 224 |
+
"version": "...",
|
| 225 |
+
"reason": "one sentence summary"
|
| 226 |
+
}}
|
| 227 |
+
Output ONLY the JSON object.
|
| 228 |
+
""".strip()
|
| 229 |
+
|
| 230 |
+
action_completion = client.chat.completions.create(
|
| 231 |
model=MODEL_NAME,
|
| 232 |
messages=[
|
| 233 |
{"role": "system", "content": SYSTEM_PROMPT},
|
| 234 |
{"role": "user", "content": prompt},
|
| 235 |
+
{"role": "assistant", "content": reasoning},
|
| 236 |
+
{"role": "user", "content": action_prompt},
|
| 237 |
],
|
| 238 |
+
temperature=0.1,
|
| 239 |
+
max_tokens=200,
|
| 240 |
)
|
| 241 |
+
response_text = action_completion.choices[0].message.content or ""
|
| 242 |
except Exception as exc:
|
| 243 |
print(f" Step {step:02d}: API error — {exc}")
|
| 244 |
+
reasoning = "(error)"
|
| 245 |
response_text = ""
|
| 246 |
|
| 247 |
action = parse_action(response_text)
|
|
|
|
| 260 |
|
| 261 |
reward_str = f" reward={result.reward:+.3f}" if result.reward != 0 else ""
|
| 262 |
resolution_str = f" *** {result.info.get('resolution', '')} ***" if result.done and result.info.get("resolution") else ""
|
| 263 |
+
print(f" Step {step:02d} reasoning: {reasoning[:100]}...")
|
| 264 |
+
print(f" Step {step:02d} action: {action_label}{reward_str}{resolution_str}")
|
| 265 |
|
| 266 |
if obs.last_action_error:
|
| 267 |
print(f" ⚠ {obs.last_action_error[:80]}")
|
models.py
CHANGED
|
@@ -15,6 +15,7 @@ class ActionType(str, Enum):
|
|
| 15 |
ALERT_ONCALL = "alert_oncall"
|
| 16 |
ACKNOWLEDGE = "acknowledge"
|
| 17 |
NOOP = "noop"
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
class Action(BaseModel):
|
|
@@ -24,6 +25,7 @@ class Action(BaseModel):
|
|
| 24 |
runbook: Optional[str] = None
|
| 25 |
version: Optional[str] = None
|
| 26 |
reason: Optional[str] = None
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
class Alert(BaseModel):
|
|
|
|
| 15 |
ALERT_ONCALL = "alert_oncall"
|
| 16 |
ACKNOWLEDGE = "acknowledge"
|
| 17 |
NOOP = "noop"
|
| 18 |
+
SEARCH_LOGS = "search_logs"
|
| 19 |
|
| 20 |
|
| 21 |
class Action(BaseModel):
|
|
|
|
| 25 |
runbook: Optional[str] = None
|
| 26 |
version: Optional[str] = None
|
| 27 |
reason: Optional[str] = None
|
| 28 |
+
query: Optional[str] = None # used with search_logs
|
| 29 |
|
| 30 |
|
| 31 |
class Alert(BaseModel):
|
openenv.yaml
CHANGED
|
@@ -86,6 +86,8 @@ action_space:
|
|
| 86 |
description: Record the agent's root cause hypothesis
|
| 87 |
- name: read_logs
|
| 88 |
description: Read recent log lines for a named service
|
|
|
|
|
|
|
| 89 |
- name: read_metrics
|
| 90 |
description: Read CPU, memory, error rate, latency for a named service
|
| 91 |
- name: read_runbook
|
|
|
|
| 86 |
description: Record the agent's root cause hypothesis
|
| 87 |
- name: read_logs
|
| 88 |
description: Read recent log lines for a named service
|
| 89 |
+
- name: search_logs
|
| 90 |
+
description: Search log lines for a service matching a query string
|
| 91 |
- name: read_metrics
|
| 92 |
description: Read CPU, memory, error rate, latency for a named service
|
| 93 |
- name: read_runbook
|
tasks/base.py
CHANGED
|
@@ -62,6 +62,8 @@ class InternalState:
|
|
| 62 |
ground_truth_root_cause: str
|
| 63 |
ground_truth_fix: str
|
| 64 |
incident_start_time: str
|
|
|
|
|
|
|
| 65 |
rewards_given: Set[str] = field(default_factory=set)
|
| 66 |
healthy_services: List[str] = field(default_factory=list)
|
| 67 |
evidence_log: List[dict] = field(default_factory=list)
|
|
@@ -131,6 +133,8 @@ class InternalState:
|
|
| 131 |
last_action_result: Optional[str] = None,
|
| 132 |
last_action_error: Optional[str] = None,
|
| 133 |
) -> Observation:
|
|
|
|
|
|
|
| 134 |
services = []
|
| 135 |
for name, s in self.services.items():
|
| 136 |
services.append(ServiceStatus(
|
|
@@ -160,13 +164,16 @@ class InternalState:
|
|
| 160 |
task_description=TASK_DESCRIPTIONS.get(self.task_id, ""),
|
| 161 |
services=services,
|
| 162 |
active_alerts=alerts,
|
| 163 |
-
recent_logs=
|
|
|
|
|
|
|
|
|
|
| 164 |
available_runbooks=AVAILABLE_RUNBOOKS,
|
| 165 |
service_dependencies=deps,
|
| 166 |
evidence_log=evidence,
|
| 167 |
sla_status=sla,
|
| 168 |
-
last_action_result=last_action_result,
|
| 169 |
-
last_action_error=last_action_error,
|
| 170 |
incident_start_time=self.incident_start_time,
|
| 171 |
elapsed_minutes=self.step * 2,
|
| 172 |
)
|
|
@@ -224,6 +231,27 @@ class BaseTask(ABC):
|
|
| 224 |
return result, None
|
| 225 |
return None, f"No logs found for service '{svc}'"
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
if at == "read_metrics":
|
| 228 |
svc = action.service
|
| 229 |
if svc and svc in state.services:
|
|
|
|
| 62 |
ground_truth_root_cause: str
|
| 63 |
ground_truth_fix: str
|
| 64 |
incident_start_time: str
|
| 65 |
+
last_action_result: Optional[str] = field(default=None)
|
| 66 |
+
last_action_error: Optional[str] = field(default=None)
|
| 67 |
rewards_given: Set[str] = field(default_factory=set)
|
| 68 |
healthy_services: List[str] = field(default_factory=list)
|
| 69 |
evidence_log: List[dict] = field(default_factory=list)
|
|
|
|
| 133 |
last_action_result: Optional[str] = None,
|
| 134 |
last_action_error: Optional[str] = None,
|
| 135 |
) -> Observation:
|
| 136 |
+
if last_action_result is not None: self.last_action_result = last_action_result
|
| 137 |
+
if last_action_error is not None: self.last_action_error = last_action_error
|
| 138 |
services = []
|
| 139 |
for name, s in self.services.items():
|
| 140 |
services.append(ServiceStatus(
|
|
|
|
| 164 |
task_description=TASK_DESCRIPTIONS.get(self.task_id, ""),
|
| 165 |
services=services,
|
| 166 |
active_alerts=alerts,
|
| 167 |
+
recent_logs={
|
| 168 |
+
svc: lines[-2:] + ([f"[... {len(lines)-2} more lines — use read_logs to see full history]"] if len(lines) > 2 else [])
|
| 169 |
+
for svc, lines in self.logs.items()
|
| 170 |
+
},
|
| 171 |
available_runbooks=AVAILABLE_RUNBOOKS,
|
| 172 |
service_dependencies=deps,
|
| 173 |
evidence_log=evidence,
|
| 174 |
sla_status=sla,
|
| 175 |
+
last_action_result=self.last_action_result,
|
| 176 |
+
last_action_error=self.last_action_error,
|
| 177 |
incident_start_time=self.incident_start_time,
|
| 178 |
elapsed_minutes=self.step * 2,
|
| 179 |
)
|
|
|
|
| 231 |
return result, None
|
| 232 |
return None, f"No logs found for service '{svc}'"
|
| 233 |
|
| 234 |
+
if at == "search_logs":
|
| 235 |
+
svc = action.service
|
| 236 |
+
query = (action.query or "").lower()
|
| 237 |
+
if not svc or svc not in state.logs:
|
| 238 |
+
return None, f"Unknown service '{svc}'"
|
| 239 |
+
if not query:
|
| 240 |
+
return None, "search_logs requires a query parameter"
|
| 241 |
+
lines = state.logs[svc]
|
| 242 |
+
matches = [l for l in lines if query in l.lower()]
|
| 243 |
+
if not matches:
|
| 244 |
+
result = f"No lines matching '{query}' in {svc} logs."
|
| 245 |
+
else:
|
| 246 |
+
result = f"Found {len(matches)} lines matching '{query}':\n" + "\n".join(matches)
|
| 247 |
+
state.evidence_log.append({
|
| 248 |
+
"step": state.step,
|
| 249 |
+
"source": f"search:{svc}",
|
| 250 |
+
"summary": f"Searched {svc} for '{query}': {len(matches)} matches",
|
| 251 |
+
"raw": result,
|
| 252 |
+
})
|
| 253 |
+
return result, None
|
| 254 |
+
|
| 255 |
if at == "read_metrics":
|
| 256 |
svc = action.service
|
| 257 |
if svc and svc in state.services:
|
tasks/task_bonus.py
CHANGED
|
@@ -141,7 +141,9 @@ class BonusTask(BaseTask):
|
|
| 141 |
|
| 142 |
gather_map = {
|
| 143 |
("read_logs", "log-aggregator"): ("rl_agg", 0.05),
|
|
|
|
| 144 |
("read_logs", "ml-inference-service"): ("rl_ml", 0.05),
|
|
|
|
| 145 |
("read_metrics", "log-aggregator"): ("rm_agg", 0.05),
|
| 146 |
("read_metrics", "ml-inference-service"): ("rm_ml", 0.05),
|
| 147 |
}
|
|
|
|
| 141 |
|
| 142 |
gather_map = {
|
| 143 |
("read_logs", "log-aggregator"): ("rl_agg", 0.05),
|
| 144 |
+
("search_logs", "log-aggregator"): ("rl_agg", 0.05),
|
| 145 |
("read_logs", "ml-inference-service"): ("rl_ml", 0.05),
|
| 146 |
+
("search_logs", "ml-inference-service"):("rl_ml", 0.05),
|
| 147 |
("read_metrics", "log-aggregator"): ("rm_agg", 0.05),
|
| 148 |
("read_metrics", "ml-inference-service"): ("rm_ml", 0.05),
|
| 149 |
}
|
tasks/task_easy.py
CHANGED
|
@@ -177,10 +177,10 @@ class EasyTask(BaseTask):
|
|
| 177 |
|
| 178 |
result_text, error_text = self._apply_action_to_logs(state, action)
|
| 179 |
|
| 180 |
-
if at
|
| 181 |
-
if "
|
| 182 |
reward += 0.15
|
| 183 |
-
state.rewards_given.add("
|
| 184 |
|
| 185 |
if at == ActionType.READ_METRICS and svc == failing:
|
| 186 |
if "read_metrics" not in state.rewards_given:
|
|
|
|
| 177 |
|
| 178 |
result_text, error_text = self._apply_action_to_logs(state, action)
|
| 179 |
|
| 180 |
+
if at in (ActionType.READ_LOGS, ActionType.SEARCH_LOGS) and svc == failing:
|
| 181 |
+
if "logs_investigated" not in state.rewards_given:
|
| 182 |
reward += 0.15
|
| 183 |
+
state.rewards_given.add("logs_investigated")
|
| 184 |
|
| 185 |
if at == ActionType.READ_METRICS and svc == failing:
|
| 186 |
if "read_metrics" not in state.rewards_given:
|
tasks/task_hard.py
CHANGED
|
@@ -161,8 +161,11 @@ class HardTask(BaseTask):
|
|
| 161 |
|
| 162 |
gather_map = {
|
| 163 |
("read_logs", "price-validation-service"): ("rl_price", 0.05),
|
|
|
|
| 164 |
("read_logs", "analytics-service"): ("rl_analytics", 0.05),
|
|
|
|
| 165 |
("read_logs", "data-pipeline-service"): ("rl_pipeline", 0.05),
|
|
|
|
| 166 |
("read_metrics", "analytics-service"): ("rm_analytics", 0.10),
|
| 167 |
("read_metrics", "data-pipeline-service"): ("rm_pipeline", 0.10),
|
| 168 |
}
|
|
|
|
| 161 |
|
| 162 |
gather_map = {
|
| 163 |
("read_logs", "price-validation-service"): ("rl_price", 0.05),
|
| 164 |
+
("search_logs", "price-validation-service"): ("rl_price", 0.05),
|
| 165 |
("read_logs", "analytics-service"): ("rl_analytics", 0.05),
|
| 166 |
+
("search_logs", "analytics-service"): ("rl_analytics", 0.05),
|
| 167 |
("read_logs", "data-pipeline-service"): ("rl_pipeline", 0.05),
|
| 168 |
+
("search_logs", "data-pipeline-service"): ("rl_pipeline", 0.05),
|
| 169 |
("read_metrics", "analytics-service"): ("rm_analytics", 0.10),
|
| 170 |
("read_metrics", "data-pipeline-service"): ("rm_pipeline", 0.10),
|
| 171 |
}
|
tasks/task_medium.py
CHANGED
|
@@ -207,9 +207,9 @@ class MediumTask(BaseTask):
|
|
| 207 |
|
| 208 |
result_text, error_text = self._apply_action_to_logs(state, action)
|
| 209 |
|
| 210 |
-
if at
|
| 211 |
-
if "
|
| 212 |
-
reward += 0.10; state.rewards_given.add("
|
| 213 |
if at == ActionType.READ_METRICS and svc == "inventory-service":
|
| 214 |
if "read_metrics_inv" not in state.rewards_given:
|
| 215 |
reward += 0.10; state.rewards_given.add("read_metrics_inv")
|
|
|
|
| 207 |
|
| 208 |
result_text, error_text = self._apply_action_to_logs(state, action)
|
| 209 |
|
| 210 |
+
if at in (ActionType.READ_LOGS, ActionType.SEARCH_LOGS) and svc == "inventory-service":
|
| 211 |
+
if "logs_investigated" not in state.rewards_given:
|
| 212 |
+
reward += 0.10; state.rewards_given.add("logs_investigated")
|
| 213 |
if at == ActionType.READ_METRICS and svc == "inventory-service":
|
| 214 |
if "read_metrics_inv" not in state.rewards_given:
|
| 215 |
reward += 0.10; state.rewards_given.add("read_metrics_inv")
|