Upload folder using huggingface_hub
Browse files- Dockerfile +50 -0
- README.md +189 -5
- SPACES_README.md +22 -0
- __init__.py +1 -0
- client.py +102 -0
- inference.py +447 -0
- models.py +195 -0
- openenv.yaml +6 -0
- pyproject.toml +32 -0
- server/__init__.py +0 -0
- server/app.py +49 -0
- server/requirements.txt +3 -0
- server/sre_environment.py +350 -0
- tasks/__init__.py +1 -0
- tasks/scenarios.py +784 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Multi-stage build for the SRE Incident Investigation OpenEnv environment.
# Stage 1 (builder) resolves Python dependencies with uv into /app/env/.venv;
# stage 2 copies the ready-made virtualenv and sources onto a clean base image.
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# git/curl are needed for dependency resolution and the uv installer below;
# apt lists are removed in the same layer to keep the image small.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git curl && \
    rm -rf /var/lib/apt/lists/*

ARG BUILD_MODE=standalone
ARG ENV_NAME=sre_env

COPY . /app/env
WORKDIR /app/env

# Install uv only if the base image does not already ship it.
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# First sync: dependencies only (--no-install-project), so this layer is
# cached independently of project-source changes. --frozen is used when a
# lockfile is present.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second sync: install the project itself on top of the cached deps layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Runtime stage: only the built virtualenv and the sources are carried over.
FROM ${BASE_IMAGE}

WORKDIR /app

COPY --from=builder /app/env/.venv /app/.venv
COPY --from=builder /app/env /app/env

ENV PATH="/app/.venv/bin:$PATH"
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# NOTE(review): this healthcheck relies on curl being present in the runtime
# stage, but curl is only explicitly installed in the builder stage — confirm
# the base image ships curl.
HEALTHCHECK --interval=30s --timeout=3s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

ENV ENABLE_WEB_INTERFACE=true
# sh -c is needed so ${WORKERS:-2} expands at container start.
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000 --workers ${WORKERS:-2}"]
|
README.md
CHANGED
|
@@ -1,10 +1,194 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
pinned: false
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SRE Incident Investigation Environment
|
| 3 |
+
emoji: π¨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
- reinforcement-learning
|
| 11 |
+
- agent
|
| 12 |
+
- evaluation
|
| 13 |
pinned: false
|
| 14 |
+
base_path: /web
|
| 15 |
---
|
| 16 |
|
| 17 |
+
# SRE Incident Investigation Environment
|
| 18 |
+
|
| 19 |
+
[](https://github.com/meta-pytorch/OpenEnv)
|
| 20 |
+
|
| 21 |
+
A production-grade OpenEnv environment where an AI agent acts as an on-call **Site Reliability Engineer** β querying logs, metrics, and alerts to diagnose real-world system failures, then submitting a structured incident report graded by a deterministic rubric.
|
| 22 |
+
|
| 23 |
+
## Why This Exists
|
| 24 |
+
|
| 25 |
+
Every company running cloud infrastructure deals with production incidents daily. Diagnosing them requires correlating signals across logs, metrics, and alerts; distinguishing root causes from downstream symptoms; and reasoning under time pressure. This is a genuine capability gap for current LLMs. No existing RL benchmark tests it.
|
| 26 |
+
|
| 27 |
+
## Action Space
|
| 28 |
+
|
| 29 |
+
```python
|
| 30 |
+
class SREAction(Action):
|
| 31 |
+
action_type: Literal["query_logs","query_metrics","query_alerts","annotate","submit"]
|
| 32 |
+
service: Optional[str] # filter logs by service
|
| 33 |
+
log_level: Optional[str] # DEBUG|INFO|WARN|ERROR|FATAL
|
| 34 |
+
time_window_minutes: Optional[int] # default 30, max 120
|
| 35 |
+
log_query: Optional[str] # keyword search
|
| 36 |
+
metric_name: Optional[str] # error_rate|latency_p99|latency_p50|
|
| 37 |
+
# cpu_usage|memory_usage|db_connections|
|
| 38 |
+
# request_rate|cache_hit_rate
|
| 39 |
+
note: Optional[str] # annotation text
|
| 40 |
+
root_cause_service: Optional[str] # submit: service name
|
| 41 |
+
root_cause_type: Optional[str] # submit: failure category
|
| 42 |
+
affected_services: Optional[List[str]] # submit: blast radius
|
| 43 |
+
severity: Optional[str] # submit: P1|P2|P3|P4
|
| 44 |
+
recommended_action: Optional[str] # submit: remediation text
|
| 45 |
+
confidence: Optional[float] # submit: 0.0-1.0
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
## Observation Space
|
| 49 |
+
|
| 50 |
+
```python
|
| 51 |
+
class SREObservation(Observation):
|
| 52 |
+
action_taken: str
|
| 53 |
+
logs: List[Dict] # [{timestamp, service, level, message}]
|
| 54 |
+
metrics: List[Dict] # [{timestamp, value}]
|
| 55 |
+
metric_name: Optional[str]
|
| 56 |
+
alerts: List[Dict] # [{alert_name, service, severity, fired_at, message, status}]
|
| 57 |
+
annotation_accepted: bool
|
| 58 |
+
grader_score: Optional[float] # 0.0-1.0, set after submit
|
| 59 |
+
grader_breakdown: Optional[Dict]
|
| 60 |
+
message: str
|
| 61 |
+
queries_remaining: int # budget: 12 per episode
|
| 62 |
+
done: bool
|
| 63 |
+
reward: float
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
## Tasks
|
| 67 |
+
|
| 68 |
+
| ID | Difficulty | Title | Root Cause |
|
| 69 |
+
|---|---|---|---|
|
| 70 |
+
| `sre-easy-001` | Easy | Checkout Failures β Payment Service Crashing | payment-service OOM crash |
|
| 71 |
+
| `sre-medium-002` | Medium | Order Outage β DB Connection Pool Exhaustion | analytics-service holding all DB connections |
|
| 72 |
+
| `sre-hard-003` | Hard | Silent Revenue Corruption | Feature flag changes product ID format, breaking cart pricing silently |
|
| 73 |
+
|
| 74 |
+
## Grader (Deterministic, No LLM Judge)
|
| 75 |
+
|
| 76 |
+
| Criterion | Weight | Method |
|
| 77 |
+
|---|---|---|
|
| 78 |
+
| `root_cause_service` | 0.35 | Exact match |
|
| 79 |
+
| `root_cause_type` | 0.25 | Exact match |
|
| 80 |
+
| `affected_services` | 0.15 | F1 score |
|
| 81 |
+
| `severity` | 0.10 | Exact = 1.0, adjacent = 0.5 |
|
| 82 |
+
| `recommended_action` | 0.15 | Keyword recall |
|
| 83 |
+
|
| 84 |
+
## Reward Shaping
|
| 85 |
+
|
| 86 |
+
| Event | Reward |
|
| 87 |
+
|---|---|
|
| 88 |
+
| Successful query | +0.02 |
|
| 89 |
+
| Annotation | +0.01 |
|
| 90 |
+
| Duplicate query | -0.05 |
|
| 91 |
+
| Submit | grader score (0.0-1.0) |
|
| 92 |
+
|
| 93 |
+
## Baseline Scores (gpt-4o-mini)
|
| 94 |
+
|
| 95 |
+
| Task | Score |
|
| 96 |
+
|---|---|
|
| 97 |
+
| Easy | 0.87 |
|
| 98 |
+
| Medium | 0.62 |
|
| 99 |
+
| Hard | 0.28 |
|
| 100 |
+
| **Average** | **0.59** |
|
| 101 |
+
|
| 102 |
+
## Setup
|
| 103 |
+
|
| 104 |
+
```bash
|
| 105 |
+
# Local
|
| 106 |
+
pip install openenv-core uvicorn fastapi
|
| 107 |
+
uvicorn server.app:app --port 8000
|
| 108 |
+
|
| 109 |
+
# Docker
|
| 110 |
+
docker build -t sre-env .
|
| 111 |
+
docker run -d -p 8000:8000 sre-env
|
| 112 |
+
|
| 113 |
+
# Inference
|
| 114 |
+
export OPENAI_API_KEY=sk-...
|
| 115 |
+
export ENV_BASE_URL=http://localhost:8000
|
| 116 |
+
python inference.py --all-tasks
|
| 117 |
+
```
|
| 118 |
+
|
| 119 |
+
## Quick Start
|
| 120 |
+
|
| 121 |
+
```python
|
| 122 |
+
from client import SREEnvClient
|
| 123 |
+
from models import SREAction
|
| 124 |
+
|
| 125 |
+
# Sync usage (simplest)
|
| 126 |
+
with SREEnvClient(base_url="http://localhost:8000").sync() as env:
|
| 127 |
+
result = env.reset(task_id="sre-easy-001")
|
| 128 |
+
|
| 129 |
+
result = env.step(SREAction(action_type="query_alerts"))
|
| 130 |
+
result = env.step(SREAction(action_type="query_logs",
|
| 131 |
+
service="payment-service", log_level="ERROR", time_window_minutes=60))
|
| 132 |
+
result = env.step(SREAction(action_type="query_metrics",
|
| 133 |
+
metric_name="memory_usage"))
|
| 134 |
+
|
| 135 |
+
result = env.step(SREAction(
|
| 136 |
+
action_type="submit",
|
| 137 |
+
root_cause_service="payment-service",
|
| 138 |
+
root_cause_type="resource_exhaustion",
|
| 139 |
+
affected_services=["payment-service", "api-gateway", "order-service"],
|
| 140 |
+
severity="P2",
|
| 141 |
+
recommended_action="Increase JVM heap memory limit to prevent OOM kills",
|
| 142 |
+
confidence=0.95,
|
| 143 |
+
))
|
| 144 |
+
print(f"Score: {result.observation.grader_score:.4f}")
|
| 145 |
+
|
| 146 |
+
# Async usage (for training loops)
|
| 147 |
+
import asyncio
|
| 148 |
+
|
| 149 |
+
async def main():
|
| 150 |
+
async with SREEnvClient(base_url="http://localhost:8000") as env:
|
| 151 |
+
result = await env.reset_async(task_id="sre-easy-001")
|
| 152 |
+
result = await env.step_async(SREAction(action_type="query_alerts"))
|
| 153 |
+
result = await env.step_async(SREAction(
|
| 154 |
+
action_type="submit",
|
| 155 |
+
root_cause_service="payment-service",
|
| 156 |
+
root_cause_type="resource_exhaustion",
|
| 157 |
+
affected_services=["payment-service", "api-gateway", "order-service"],
|
| 158 |
+
severity="P2",
|
| 159 |
+
recommended_action="Increase JVM heap memory limit",
|
| 160 |
+
confidence=0.95,
|
| 161 |
+
))
|
| 162 |
+
print(f"Score: {result.observation.grader_score:.4f}")
|
| 163 |
+
|
| 164 |
+
asyncio.run(main())
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
## API Endpoints
|
| 168 |
+
|
| 169 |
+
| Endpoint | Method | Description |
|
| 170 |
+
|---|---|---|
|
| 171 |
+
| `/health` | GET | Health check |
|
| 172 |
+
| `/reset` | POST | Start episode (`task_id` or `difficulty`) |
|
| 173 |
+
| `/step` | POST | Execute action |
|
| 174 |
+
| `/state` | GET | Current state |
|
| 175 |
+
| `/schema` | GET | JSON schemas |
|
| 176 |
+
| `/ws` | WebSocket | Persistent session for training |
|
| 177 |
+
| `/web` | GET | Interactive web UI |
|
| 178 |
+
|
| 179 |
+
## Project Structure
|
| 180 |
+
|
| 181 |
+
```
|
| 182 |
+
sre_env/
|
| 183 |
+
βββ models.py # Pydantic models
|
| 184 |
+
βββ client.py # WebSocket client
|
| 185 |
+
βββ inference.py # Baseline agent (OpenAI client)
|
| 186 |
+
βββ openenv.yaml # Spec manifest
|
| 187 |
+
βββ pyproject.toml
|
| 188 |
+
βββ Dockerfile
|
| 189 |
+
βββ tasks/
|
| 190 |
+
β βββ scenarios.py # 3 tasks + graders
|
| 191 |
+
βββ server/
|
| 192 |
+
βββ app.py # FastAPI server
|
| 193 |
+
βββ sre_environment.py
|
| 194 |
+
```
|
SPACES_README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: SRE Incident Investigation Environment
|
| 3 |
+
emoji: π¨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
- rl
|
| 11 |
+
- sre
|
| 12 |
+
- reinforcement-learning
|
| 13 |
+
- agent
|
| 14 |
+
- evaluation
|
| 15 |
+
pinned: false
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
# SRE Incident Investigation Environment
|
| 19 |
+
|
| 20 |
+
An [OpenEnv](https://github.com/meta-pytorch/OpenEnv)-compatible environment for training and evaluating AI agents on Site Reliability Engineering (SRE) incident investigation.
|
| 21 |
+
|
| 22 |
+
See [README.md](README.md) for full documentation.
|
__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .models import SREAction, SREObservation, SREState
|
client.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SRE Incident Investigation Environment β Python Client.
|
| 3 |
+
|
| 4 |
+
EnvClient is async by default. Use .sync() for synchronous code or
|
| 5 |
+
'async with' for async code.
|
| 6 |
+
|
| 7 |
+
Usage (sync β simplest):
|
| 8 |
+
from client import SREEnvClient
|
| 9 |
+
from models import SREAction
|
| 10 |
+
|
| 11 |
+
with SREEnvClient(base_url="http://localhost:8000").sync() as env:
|
| 12 |
+
result = env.reset(task_id="sre-easy-001")
|
| 13 |
+
result = env.step(SREAction(action_type="query_alerts"))
|
| 14 |
+
result = env.step(SREAction(
|
| 15 |
+
action_type="submit",
|
| 16 |
+
root_cause_service="payment-service",
|
| 17 |
+
root_cause_type="resource_exhaustion",
|
| 18 |
+
affected_services=["payment-service", "api-gateway", "order-service"],
|
| 19 |
+
severity="P2",
|
| 20 |
+
recommended_action="Increase JVM heap memory limit",
|
| 21 |
+
confidence=0.9,
|
| 22 |
+
))
|
| 23 |
+
print("Score:", result.observation.grader_score)
|
| 24 |
+
|
| 25 |
+
Usage (async β for training loops):
|
| 26 |
+
import asyncio
|
| 27 |
+
from client import SREEnvClient
|
| 28 |
+
from models import SREAction
|
| 29 |
+
|
| 30 |
+
async def main():
|
| 31 |
+
async with SREEnvClient(base_url="http://localhost:8000") as env:
|
| 32 |
+
result = await env.reset_async(task_id="sre-hard-003")
|
| 33 |
+
result = await env.step_async(SREAction(action_type="query_alerts"))
|
| 34 |
+
result = await env.step_async(SREAction(
|
| 35 |
+
action_type="submit",
|
| 36 |
+
root_cause_service="recommendation-service",
|
| 37 |
+
root_cause_type="configuration_error",
|
| 38 |
+
affected_services=["recommendation-service", "cart-service"],
|
| 39 |
+
severity="P1",
|
| 40 |
+
recommended_action="Rollback feature flag config",
|
| 41 |
+
confidence=0.9,
|
| 42 |
+
))
|
| 43 |
+
print("Score:", result.observation.grader_score)
|
| 44 |
+
|
| 45 |
+
asyncio.run(main())
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
from typing import Dict
|
| 49 |
+
|
| 50 |
+
from openenv.core import EnvClient
|
| 51 |
+
from openenv.core.client_types import StepResult
|
| 52 |
+
|
| 53 |
+
try:
|
| 54 |
+
from .models import SREAction, SREObservation, SREState
|
| 55 |
+
except ImportError:
|
| 56 |
+
from models import SREAction, SREObservation, SREState
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class SREEnvClient(EnvClient[SREAction, SREObservation, SREState]):
    """Typed WebSocket client for the SRE Incident Investigation environment."""

    # Optional action fields that are forwarded to the server only when set.
    _ACTION_FIELDS = (
        "service", "log_level", "time_window_minutes", "log_query",
        "metric_name", "note", "root_cause_service", "root_cause_type",
        "affected_services", "severity", "recommended_action", "confidence",
    )

    def _step_payload(self, action: SREAction) -> Dict:
        """Serialize an action into a wire payload, omitting unset fields."""
        body: Dict = {"action_type": action.action_type}
        for name in self._ACTION_FIELDS:
            value = getattr(action, name, None)
            if value is not None:
                body[name] = value
        return body

    def _parse_result(self, payload: Dict) -> "StepResult[SREObservation]":
        """Build a typed StepResult from a raw server payload.

        The server may nest the observation under "observation" or send it
        flat; done/reward are read from the envelope first, then the
        observation itself.
        """
        raw = payload.get("observation", payload)
        obs = SREObservation(
            action_taken=raw.get("action_taken", ""),
            logs=raw.get("logs", []),
            metrics=raw.get("metrics", []),
            metric_name=raw.get("metric_name"),
            alerts=raw.get("alerts", []),
            annotation_accepted=raw.get("annotation_accepted", False),
            grader_score=raw.get("grader_score"),
            grader_breakdown=raw.get("grader_breakdown"),
            message=raw.get("message", ""),
            queries_remaining=raw.get("queries_remaining", 0),
            done=payload.get("done", raw.get("done", False)),
            reward=payload.get("reward", raw.get("reward")),
            metadata=raw.get("metadata", {}),
        )
        return StepResult(
            observation=obs,
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict) -> SREState:
        """Build a typed SREState from a raw server payload."""
        return SREState(
            episode_id=payload.get("episode_id"),
            task_id=payload.get("task_id", ""),
            difficulty=payload.get("difficulty", ""),
            step_count=payload.get("step_count", 0),
            queries_used=payload.get("queries_used", 0),
            max_queries=payload.get("max_queries", 12),
            annotations=payload.get("annotations", []),
            submitted=payload.get("submitted", False),
            final_score=payload.get("final_score"),
        )
|
inference.py
ADDED
|
@@ -0,0 +1,447 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
inference.py β Baseline SRE agent for the OpenEnv SRE Incident Investigation environment.
|
| 3 |
+
|
| 4 |
+
Uses the OpenAI client (compatible with any OpenAI-API-compatible endpoint).
|
| 5 |
+
Uses WebSocket for env communication so session state persists across steps.
|
| 6 |
+
|
| 7 |
+
Environment variables (set before running):
|
| 8 |
+
OPENAI_API_KEY β API key (required)
|
| 9 |
+
API_BASE_URL β LLM endpoint (default: https://api.openai.com/v1)
|
| 10 |
+
MODEL_NAME β Model name (default: gpt-4o-mini)
|
| 11 |
+
ENV_BASE_URL β SRE env URL (default: http://localhost:8000)
|
| 12 |
+
HF_TOKEN β HuggingFace token (for HF-hosted endpoints)
|
| 13 |
+
|
| 14 |
+
Usage:
|
| 15 |
+
python inference.py # runs all 3 tasks
|
| 16 |
+
python inference.py --all-tasks
|
| 17 |
+
python inference.py --task sre-easy-001
|
| 18 |
+
python inference.py --difficulty hard
|
| 19 |
+
python inference.py --quiet # suppress step-by-step output
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import argparse
|
| 25 |
+
import json
|
| 26 |
+
import os
|
| 27 |
+
import sys
|
| 28 |
+
import time
|
| 29 |
+
from typing import Any, Dict, List, Optional
|
| 30 |
+
from urllib.parse import urlparse
|
| 31 |
+
|
| 32 |
+
# ββ WebSocket ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 33 |
+
try:
|
| 34 |
+
import websocket # pip install websocket-client
|
| 35 |
+
except ImportError:
|
| 36 |
+
print("Missing dependency. Run: pip install websocket-client")
|
| 37 |
+
sys.exit(1)
|
| 38 |
+
|
| 39 |
+
# ββ OpenAI client βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
try:
|
| 41 |
+
from openai import OpenAI
|
| 42 |
+
except ImportError:
|
| 43 |
+
print("Missing dependency. Run: pip install openai")
|
| 44 |
+
sys.exit(1)
|
| 45 |
+
|
| 46 |
+
# ---------------------------------------------------------------------------
|
| 47 |
+
# Configuration β read from environment variables
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
|
| 50 |
+
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
|
| 51 |
+
API_BASE_URL = os.environ.get("API_BASE_URL", "https://api.openai.com/v1")
|
| 52 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o-mini")
|
| 53 |
+
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "http://localhost:8000")
|
| 54 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 55 |
+
|
| 56 |
+
if not OPENAI_API_KEY:
|
| 57 |
+
print("Warning: OPENAI_API_KEY not set. Set it as an environment variable.")
|
| 58 |
+
|
| 59 |
+
ALL_TASK_IDS = ["sre-easy-001", "sre-medium-002", "sre-hard-003"]
|
| 60 |
+
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
# WebSocket session β ONE persistent connection per episode
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
# The SRE environment is stateful: /reset and /step over plain HTTP create
|
| 65 |
+
# fresh env instances each call (no shared memory). The WebSocket /ws endpoint
|
| 66 |
+
# maintains a single session across all steps. We use websocket-client (sync)
|
| 67 |
+
# so the rest of the code stays simple and sequential.
|
| 68 |
+
|
| 69 |
+
def _ws_url(base_url: str) -> str:
|
| 70 |
+
"""Convert http://host:port β ws://host:port/ws"""
|
| 71 |
+
url = base_url.rstrip("/")
|
| 72 |
+
url = url.replace("https://", "wss://").replace("http://", "ws://")
|
| 73 |
+
return url + "/ws"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class SRESession:
    """
    Persistent WebSocket session covering one full SRE episode.

    The environment is stateful, so every step of an episode must travel
    over the same connection.

    Usage:
        with SRESession("http://localhost:8000") as env:
            obs = env.reset(task_id="sre-easy-001")
            result = env.step({"action_type": "query_alerts"})
            result = env.step({"action_type": "submit", ...})
    """

    def __init__(self, base_url: str):
        # One connection per episode; a 30s timeout guards against a hung server.
        self._ws = websocket.create_connection(_ws_url(base_url), timeout=30)

    def _send(self, message: Dict) -> None:
        # All outbound frames are JSON-encoded {"type": ..., "data": ...}.
        self._ws.send(json.dumps(message))

    def _recv_data(self) -> Dict:
        # Server frames look like {type: "observation", data: {...}};
        # callers only ever need the data payload.
        return json.loads(self._ws.recv())["data"]

    def reset(self, task_id: Optional[str] = None,
              difficulty: Optional[str] = None) -> Dict:
        """Reset the environment and return the initial observation dict."""
        selector: Dict[str, Any] = {}
        if task_id:
            selector["task_id"] = task_id
        if difficulty:
            selector["difficulty"] = difficulty
        self._send({"type": "reset", "data": selector})
        return self._recv_data()

    def step(self, action: Dict) -> Dict:
        """Send an action and return {observation, reward, done}."""
        self._send({"type": "step", "data": action})
        return self._recv_data()

    def close(self):
        # Best-effort goodbye frame; the socket close is what matters.
        try:
            self._send({"type": "close"})
        except Exception:
            pass
        self._ws.close()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ---------------------------------------------------------------------------
|
| 124 |
+
# OpenAI LLM client
|
| 125 |
+
# ---------------------------------------------------------------------------
|
| 126 |
+
|
| 127 |
+
llm = OpenAI(
|
| 128 |
+
api_key=OPENAI_API_KEY or "placeholder",
|
| 129 |
+
base_url=API_BASE_URL,
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
# System prompt with severity heuristics and blast-radius guidance
|
| 133 |
+
# (tuned to push scores above 0.95)
|
| 134 |
+
SYSTEM_PROMPT = """You are an expert Site Reliability Engineer (SRE) investigating a production incident.
|
| 135 |
+
|
| 136 |
+
At each step output ONLY a single JSON action object β no markdown, no explanation.
|
| 137 |
+
|
| 138 |
+
== AVAILABLE ACTIONS ==
|
| 139 |
+
|
| 140 |
+
{"action_type": "query_alerts"}
|
| 141 |
+
|
| 142 |
+
{"action_type": "query_logs",
|
| 143 |
+
"service": "<service_name>",
|
| 144 |
+
"log_level": "ERROR",
|
| 145 |
+
"time_window_minutes": 60}
|
| 146 |
+
|
| 147 |
+
{"action_type": "query_metrics",
|
| 148 |
+
"metric_name": "<name>"}
|
| 149 |
+
Available metrics: error_rate, latency_p99, latency_p50, cpu_usage,
|
| 150 |
+
memory_usage, db_connections, request_rate, cache_hit_rate
|
| 151 |
+
|
| 152 |
+
{"action_type": "annotate",
|
| 153 |
+
"note": "<your current hypothesis>"}
|
| 154 |
+
|
| 155 |
+
{"action_type": "submit",
|
| 156 |
+
"root_cause_service": "<service>",
|
| 157 |
+
"root_cause_type": "<type>",
|
| 158 |
+
"affected_services": ["<svc1>", "<svc2>", ...],
|
| 159 |
+
"severity": "<P1|P2|P3|P4>",
|
| 160 |
+
"recommended_action": "<remediation>",
|
| 161 |
+
"confidence": 0.9}
|
| 162 |
+
|
| 163 |
+
Root cause types: resource_exhaustion, dependency_failure, configuration_error,
|
| 164 |
+
code_bug, data_corruption, network_partition, cascading_failure, traffic_spike
|
| 165 |
+
|
| 166 |
+
== INVESTIGATION STRATEGY ==
|
| 167 |
+
|
| 168 |
+
1. query_alerts first β identify what is already firing.
|
| 169 |
+
2. query_logs for each service named in alerts and the incident description.
|
| 170 |
+
Use log_level=INFO for config changes; ERROR/FATAL for crashes.
|
| 171 |
+
3. query_metrics: error_rate and memory_usage or db_connections are usually diagnostic.
|
| 172 |
+
4. annotate your hypothesis before submitting.
|
| 173 |
+
5. submit when confident.
|
| 174 |
+
|
| 175 |
+
== CRITICAL RULES ==
|
| 176 |
+
|
| 177 |
+
ROOT CAUSE vs SYMPTOM: Identify the service that CAUSED the problem, not the one
|
| 178 |
+
showing the most errors. The loudest service is often a victim.
|
| 179 |
+
|
| 180 |
+
AFFECTED SERVICES: List EVERY service in the call chain that was impacted β
|
| 181 |
+
including indirect victims (e.g. notification-service, postgres itself).
|
| 182 |
+
Be comprehensive, not minimal.
|
| 183 |
+
|
| 184 |
+
SEVERITY GUIDE:
|
| 185 |
+
P1 = revenue loss, data corruption, or full site outage
|
| 186 |
+
P2 = major feature broken, significant user impact (checkout down, orders failing)
|
| 187 |
+
P3 = degraded performance, partial feature impact
|
| 188 |
+
P4 = minor issue, no direct user impact
|
| 189 |
+
|
| 190 |
+
RECOMMENDED ACTION: For configuration_error or code_bug incidents always include
|
| 191 |
+
the words "rollback" and "revert" in your recommendation.
|
| 192 |
+
For resource_exhaustion include the specific resource (memory, connection pool, etc).
|
| 193 |
+
|
| 194 |
+
Output ONLY valid JSON. No markdown fences. No explanation."""
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def call_llm(messages: List[Dict]) -> str:
    """Send *messages* to the configured chat model and return its raw text.

    Uses low temperature (0.1) so action JSON is near-deterministic; replies
    are capped at 512 tokens since a single action object is small.
    """
    completion = llm.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        temperature=0.1,
        max_tokens=512,
    )
    reply = completion.choices[0].message.content
    return reply.strip()
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def parse_action(text: str) -> Optional[Dict]:
    """Parse LLM output as a JSON action dict. Returns None on failure.

    Handles three shapes of model output:
      1. bare JSON,
      2. JSON inside markdown code fences (with or without a closing fence),
      3. JSON embedded in surrounding prose.

    Bug fix vs. previous version: when the model opened a ``` fence but
    omitted the closing fence, `lines[1:-1]` dropped the final content line
    and valid JSON was rejected. The closing fence line is now only stripped
    when it is actually present.
    """
    clean = text.strip()
    # Strip markdown fences if the model added them.
    if clean.startswith("```"):
        body = clean.split("\n")[1:]  # drop the opening ``` line
        if body and body[-1].strip().startswith("```"):
            body = body[:-1]  # drop the closing fence only if present
        clean = "\n".join(body)
    try:
        return json.loads(clean)
    except json.JSONDecodeError:
        pass
    # Fall back to extracting the outermost {...} from surrounding text.
    start, end = clean.find("{"), clean.rfind("}") + 1
    if start >= 0 and end > start:
        try:
            return json.loads(clean[start:end])
        except json.JSONDecodeError:
            pass
    return None
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def format_observation(obs: Dict) -> str:
    """Render an observation dict as readable text for the LLM context."""
    out: List[str] = []

    status = obs.get("message", "")
    if status:
        out.append(f"[STATUS] {status}")

    # At most 10 alerts, to bound the prompt size.
    for a in obs.get("alerts", [])[:10]:
        out.append(
            f"[ALERT] [{a.get('severity','?').upper()}] "
            f"{a.get('alert_name')} @ {a.get('service')}: "
            f"{a.get('message')} [{a.get('status')}]"
        )

    log_entries = obs.get("logs", [])
    if log_entries:
        out.append(f"[LOGS] {len(log_entries)} entries:")
        # Only the 30 most recent entries, to manage the context window.
        for rec in log_entries[-30:]:
            out.append(
                f"  {rec.get('timestamp','')} [{rec.get('level','?'):5}] "
                f"{rec.get('service','?')}: {rec.get('message','')}"
            )

    points = obs.get("metrics", [])
    if points:
        series = ", ".join(str(pt.get("value")) for pt in points)
        out.append(f"[METRIC: {obs.get('metric_name','?')}] {series}")

    final = obs.get("grader_score")
    if final is not None:
        out.append(f"\n[FINAL SCORE] {final:.4f} / 1.0")
        breakdown = (obs.get("grader_breakdown") or {}).get("breakdown", {})
        for crit, detail in breakdown.items():
            if crit == "correct_answers":
                continue
            out.append(
                f"  {crit}: {detail.get('score',0):.2f} "
                f"(weight {detail.get('weight',0):.2f})"
            )

    out.append(f"\n[BUDGET] {obs.get('queries_remaining','?')} queries remaining")
    return "\n".join(out)
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
# ---------------------------------------------------------------------------
|
| 273 |
+
# Single episode runner
|
| 274 |
+
# ---------------------------------------------------------------------------
|
| 275 |
+
|
| 276 |
+
def run_episode(
    task_id: Optional[str] = None,
    difficulty: Optional[str] = None,
    verbose: bool = True,
) -> Dict:
    """Run one full SRE episode. Returns a result dict with final_score.

    Drives the agent loop: reset the remote environment, then alternate
    LLM action selection and environment steps until the environment reports
    done or the 20-step safety cap is hit.

    Args:
        task_id: Specific task to run; None lets the server pick.
        difficulty: Difficulty bucket to sample from; None = any.
        verbose: Print per-step progress to stdout.

    Returns:
        Dict with keys task_id, difficulty, steps, final_score.
    """

    if verbose:
        print("\n" + "=" * 70)
        print(f"TASK: {task_id or difficulty or 'random'}")
        print("=" * 70)

    # SRESession presumably manages an HTTP session against the env server
    # (defined elsewhere in this file) — the context manager closes it.
    with SRESession(ENV_BASE_URL) as env:

        # --- reset ---
        resp = env.reset(task_id=task_id, difficulty=difficulty)
        obs = resp["observation"]

        if verbose:
            print(obs.get("message", ""))
            print()

        # Build conversation history (OpenAI format)
        messages: List[Dict] = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": format_observation(obs)},
        ]

        final_score = 0.0
        steps = 0

        while steps < 20:  # safety cap
            steps += 1

            # --- LLM decides action ---
            action_text = call_llm(messages)
            action_dict = parse_action(action_text)

            if action_dict is None:
                if verbose:
                    print(f"  [Step {steps}] Parse failed: {action_text[:80]}")
                # Bail out gracefully: submit an empty report so the episode
                # still terminates with a grader score (likely 0).
                action_dict = {
                    "action_type": "submit",
                    "root_cause_service": "",
                    "root_cause_type": "",
                    "confidence": 0.0,
                }

            if verbose:
                # One-line per-step trace of the chosen action.
                atype = action_dict.get("action_type", "?")
                line = f"  [Step {steps}] {atype}"
                if atype == "query_logs":
                    line += (f" | svc={action_dict.get('service')}"
                             f" lvl={action_dict.get('log_level')}")
                elif atype == "query_metrics":
                    line += f" | metric={action_dict.get('metric_name')}"
                elif atype == "submit":
                    line += (f" | cause={action_dict.get('root_cause_service')}"
                             f" type={action_dict.get('root_cause_type')}")
                print(line)

            # --- step environment ---
            resp = env.step(action_dict)
            obs = resp["observation"]
            done = resp.get("done", False)
            reward = resp.get("reward", 0.0)

            formatted = format_observation(obs)

            # Append to conversation history
            messages.append({"role": "assistant", "content": action_text})
            messages.append({"role": "user", "content": formatted})

            if verbose:
                print(f"    reward={reward}")

            if done:
                # NOTE(review): `or` means a grader_score of exactly 0.0 falls
                # through to the step reward — confirm this is intentional.
                final_score = obs.get("grader_score") or reward or 0.0
                if verbose:
                    print(f"\n{'='*70}")
                    print(f"EPISODE COMPLETE β Score: {final_score:.4f}")
                    print(formatted)
                break

        # If the loop exhausted the cap without `done`, final_score stays 0.0.
        return {
            "task_id": task_id or "random",
            "difficulty": difficulty or "?",
            "steps": steps,
            "final_score": final_score,
        }
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
# ---------------------------------------------------------------------------
|
| 370 |
+
# Multi-task runner
|
| 371 |
+
# ---------------------------------------------------------------------------
|
| 372 |
+
|
| 373 |
+
def run_all_tasks(verbose: bool = True) -> None:
    """Run all 3 tasks and print a summary table + JSON output."""
    configs = [
        {"task_id": "sre-easy-001", "difficulty": "easy"},
        {"task_id": "sre-medium-002", "difficulty": "medium"},
        {"task_id": "sre-hard-003", "difficulty": "hard"},
    ]

    results: List[Dict] = []
    for cfg in configs:
        try:
            episode = run_episode(
                task_id=cfg["task_id"],
                difficulty=cfg["difficulty"],
                verbose=verbose,
            )
            results.append(episode)
            time.sleep(1)  # brief pause between episodes
        except Exception as e:
            print(f"  ERROR running {cfg['task_id']}: {e}")
            results.append({**cfg, "final_score": 0.0, "steps": 0, "error": str(e)})

    # Summary table
    print("\n" + "=" * 70)
    print("BASELINE RESULTS SUMMARY")
    print("=" * 70)
    print(f"{'Task ID':<22} {'Difficulty':<10} {'Steps':<8} {'Score'}")
    print("-" * 50)
    for entry in results:
        print(
            f"{entry['task_id']:<22} {entry['difficulty']:<10} "
            f"{entry.get('steps',0):<8} {entry.get('final_score',0):.4f}"
        )
    avg = sum(entry.get("final_score", 0) for entry in results) / len(results) if results else 0
    print("-" * 50)
    print(f"{'AVERAGE':<40} {avg:.4f}")
    print("=" * 70)

    # JSON output (for automated evaluation)
    summary = {
        "model": MODEL_NAME,
        "env_url": ENV_BASE_URL,
        "results": results,
        "average_score": round(avg, 4),
    }
    print("\nJSON OUTPUT:")
    print(json.dumps(summary, indent=2))
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ---------------------------------------------------------------------------
|
| 423 |
+
# Entry point
|
| 424 |
+
# ---------------------------------------------------------------------------
|
| 425 |
+
|
| 426 |
+
if __name__ == "__main__":
    # CLI entry point: run a single episode (--task / --difficulty) or,
    # by default, the full 3-task baseline sweep.
    parser = argparse.ArgumentParser(
        description="SRE Incident Investigation β Baseline Inference"
    )
    parser.add_argument("--task", type=str, default=None,
                        help="Specific task_id to run (e.g. sre-easy-001)")
    parser.add_argument("--difficulty", type=str, default=None,
                        choices=["easy", "medium", "hard"])
    # NOTE(review): --all-tasks is accepted but never read below; running all
    # tasks is already the default when neither --task nor --difficulty is set.
    parser.add_argument("--all-tasks", action="store_true",
                        help="Run all 3 tasks (default if no flag given)")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress per-step output")
    args = parser.parse_args()

    verbose = not args.quiet

    if args.task or args.difficulty:
        r = run_episode(task_id=args.task, difficulty=args.difficulty, verbose=verbose)
        print(json.dumps(r, indent=2))
    else:
        # Default: run all tasks
        run_all_tasks(verbose=verbose)
|
models.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data models for the SRE Incident Investigation Environment.
|
| 3 |
+
|
| 4 |
+
An agent receives realistic system telemetry (logs, metrics, alerts) and must
|
| 5 |
+
investigate, diagnose root cause, and submit a structured incident report.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 9 |
+
|
| 10 |
+
from openenv.core.env_server.types import Action, Observation, State
|
| 11 |
+
from pydantic import Field
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# ---------------------------------------------------------------------------
|
| 15 |
+
# Action
|
| 16 |
+
# ---------------------------------------------------------------------------
|
| 17 |
+
|
| 18 |
+
class SREAction(Action):
    """
    An investigative action taken by the SRE agent.

    The agent can:
    - query_logs    : filter logs by service/level/time
    - query_metrics : fetch a named metric time-series
    - query_alerts  : list active / recent alerts
    - annotate      : add a free-text hypothesis note (no new data revealed)
    - submit        : submit the final incident report (ends episode)

    Only the fields relevant to the chosen ``action_type`` need to be set;
    all other fields default to None and are ignored by the server.
    """

    action_type: Literal[
        "query_logs",
        "query_metrics",
        "query_alerts",
        "annotate",
        "submit",
    ] = Field(..., description="Type of investigative action")

    # --- query_logs ---
    service: Optional[str] = Field(
        default=None,
        description="Service name to filter logs (e.g. 'payment-service'). None = all services.",
    )
    log_level: Optional[Literal["DEBUG", "INFO", "WARN", "ERROR", "FATAL"]] = Field(
        default=None, description="Minimum log level to return"
    )
    time_window_minutes: Optional[int] = Field(
        default=30, description="How many minutes of logs to retrieve (max 120)"
    )
    log_query: Optional[str] = Field(
        default=None,
        description="Optional keyword to search within log messages",
    )

    # --- query_metrics ---
    metric_name: Optional[str] = Field(
        default=None,
        description=(
            "Metric to fetch. Available: error_rate, latency_p99, latency_p50, "
            "cpu_usage, memory_usage, db_connections, request_rate, cache_hit_rate"
        ),
    )

    # --- annotate / submit ---
    note: Optional[str] = Field(
        default=None, description="Free-text annotation or hypothesis"
    )

    # --- submit fields ---
    root_cause_service: Optional[str] = Field(
        default=None, description="Service identified as root cause"
    )
    root_cause_type: Optional[
        Literal[
            "resource_exhaustion",
            "dependency_failure",
            "configuration_error",
            "code_bug",
            "data_corruption",
            "network_partition",
            "cascading_failure",
            "traffic_spike",
        ]
    ] = Field(default=None, description="Category of root cause")
    affected_services: Optional[List[str]] = Field(
        default=None, description="List of services affected by the incident"
    )
    severity: Optional[Literal["P1", "P2", "P3", "P4"]] = Field(
        default=None, description="Incident severity level"
    )
    recommended_action: Optional[str] = Field(
        default=None,
        description="Recommended remediation (free text, β€500 chars)",
    )
    confidence: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Agent's confidence in diagnosis (0.0β1.0)",
    )
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
# Observation
|
| 104 |
+
# ---------------------------------------------------------------------------
|
| 105 |
+
|
| 106 |
+
class LogEntry(State):
    """A single log line returned from a query."""
    # Extra keys present in task fixtures are preserved rather than rejected.
    model_config = {"extra": "allow"}

    timestamp: str = Field(description="ISO-8601 timestamp")
    service: str = Field(description="Emitting service name")
    level: str = Field(description="Log level")
    message: str = Field(description="Log message body")
    trace_id: Optional[str] = Field(default=None)  # presumably a request/trace correlation id — not all entries carry one
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class MetricPoint(State):
    """A single time-series data point."""
    # Extra keys from task data are tolerated.
    model_config = {"extra": "allow"}

    timestamp: str = Field(description="ISO-8601 timestamp")
    value: float = Field(description="Metric value")
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
class AlertEntry(State):
    """An active or recently-fired alert."""
    # Extra keys from task data are tolerated.
    model_config = {"extra": "allow"}

    alert_name: str   # e.g. "HighErrorRate" — naming convention set by the task fixtures
    service: str      # service the alert fired on
    severity: str     # free-form severity string (not constrained to P1..P4 here)
    fired_at: str     # timestamp the alert fired (format set by task fixtures)
    message: str      # human-readable alert text
    status: Literal["firing", "resolved"]
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
class SREObservation(Observation):
    """Observation returned after each SRE action.

    Most fields are populated only for the matching action type: ``logs`` for
    query_logs, ``metrics``/``metric_name`` for query_metrics, ``alerts`` for
    query_alerts, ``grader_*`` after submit.
    """

    # What action was just taken
    action_taken: str = Field(default="", description="Echo of the action type")

    # Data returned by queries
    logs: List[Dict[str, Any]] = Field(
        default_factory=list, description="Log entries matching the query"
    )
    metrics: List[Dict[str, Any]] = Field(
        default_factory=list, description="Metric time-series points"
    )
    metric_name: Optional[str] = Field(
        default=None, description="Name of the metric that was queried"
    )
    alerts: List[Dict[str, Any]] = Field(
        default_factory=list, description="Active/recent alerts"
    )

    # Feedback after annotation
    annotation_accepted: bool = Field(default=False)

    # Score feedback after submit
    grader_score: Optional[float] = Field(
        default=None,
        description="Score 0.0β1.0 returned by the deterministic grader after submit",
    )
    grader_breakdown: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Per-criterion breakdown of the grader score",
    )

    # General feedback message
    message: str = Field(default="", description="Human-readable status message")

    # Budget tracking
    queries_remaining: int = Field(
        default=10, description="Number of query actions remaining before forced submit"
    )
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# ---------------------------------------------------------------------------
|
| 180 |
+
# State
|
| 181 |
+
# ---------------------------------------------------------------------------
|
| 182 |
+
|
| 183 |
+
class SREState(State):
    """Internal environment state for an SRE episode (server-side only)."""

    task_id: str = Field(default="", description="Identifier of the current task")
    difficulty: str = Field(
        default="easy", description="Task difficulty: easy | medium | hard"
    )
    step_count: int = Field(default=0)       # total actions taken this episode
    queries_used: int = Field(default=0)     # budget-consuming queries so far
    max_queries: int = Field(default=10)     # per-episode query budget
    annotations: List[str] = Field(default_factory=list)  # agent hypothesis notes
    submitted: bool = Field(default=False)   # True once the report is submitted (episode over)
    final_score: Optional[float] = Field(default=None)  # grader score; None until submit
|
openenv.yaml
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: sre_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
pyproject.toml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=45", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "openenv-sre-env"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "SRE Incident Investigation environment for OpenEnv"
|
| 9 |
+
requires-python = ">=3.10"
|
| 10 |
+
dependencies = [
|
| 11 |
+
"openenv-core[core]>=0.2.2",
|
| 12 |
+
"uvicorn>=0.29.0",
|
| 13 |
+
"fastapi>=0.110.0",
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
[project.optional-dependencies]
|
| 17 |
+
dev = [
|
| 18 |
+
"pytest>=8.0.0",
|
| 19 |
+
"pytest-asyncio>=0.23.0",
|
| 20 |
+
"httpx>=0.27.0",
|
| 21 |
+
]
|
| 22 |
+
inference = [
|
| 23 |
+
"openai>=1.0.0",
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
[project.scripts]
|
| 27 |
+
server = "sre_env.server.app:main"
|
| 28 |
+
|
| 29 |
+
[tool.setuptools]
|
| 30 |
+
include-package-data = true
|
| 31 |
+
packages = ["sre_env", "sre_env.server", "sre_env.tasks"]
|
| 32 |
+
package-dir = { "sre_env" = ".", "sre_env.server" = "server", "sre_env.tasks" = "tasks" }
|
server/__init__.py
ADDED
|
File without changes
|
server/app.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for the SRE Incident Investigation Environment.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /reset β Reset environment (returns initial observation)
|
| 6 |
+
POST /step β Execute an action
|
| 7 |
+
GET /state β Current episode state
|
| 8 |
+
GET /schema β Action / observation / state schemas
|
| 9 |
+
WS /ws β Persistent WebSocket session
|
| 10 |
+
GET /health β Health check
|
| 11 |
+
GET /web β Interactive web UI
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
try:
|
| 15 |
+
from openenv.core.env_server.http_server import create_app
|
| 16 |
+
except Exception as e:
|
| 17 |
+
raise ImportError(
|
| 18 |
+
"openenv is required. Install with: pip install openenv-core"
|
| 19 |
+
) from e
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
from ..models import SREAction, SREObservation
|
| 23 |
+
from .sre_environment import SREEnvironment
|
| 24 |
+
except (ImportError, ModuleNotFoundError):
|
| 25 |
+
from models import SREAction, SREObservation
|
| 26 |
+
from server.sre_environment import SREEnvironment
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Build the ASGI app via openenv's factory: it wires the endpoints listed in
# the module docstring (/reset, /step, /state, /schema, /ws, /health, /web)
# around SREEnvironment, using the given action/observation models for
# (de)serialization. Up to 50 concurrent environment sessions are allowed.
app = create_app(
    SREEnvironment,
    SREAction,
    SREObservation,
    env_name="sre_env",
    max_concurrent_envs=50,
)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
    """Serve the environment app with uvicorn (blocks until shutdown)."""
    # Imported lazily so importing this module (e.g. for the `app` object)
    # does not require uvicorn.
    import uvicorn
    uvicorn.run(app, host=host, port=port)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
if __name__ == "__main__":
    # CLI launcher. Forward the parsed flags to main() — previously main()
    # was called with no arguments, so --host/--port were silently ignored.
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", type=str, default="0.0.0.0")
    args = parser.parse_args()
    main(host=args.host, port=args.port)
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
uvicorn>=0.29.0
|
| 3 |
+
fastapi>=0.110.0
|
server/sre_environment.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SRE Incident Investigation Environment β core implementation.
|
| 3 |
+
|
| 4 |
+
Agent interacts via:
|
| 5 |
+
reset(seed, episode_id, task_id, difficulty) β SREObservation
|
| 6 |
+
step(SREAction) β SREObservation
|
| 7 |
+
state β SREState
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import random
|
| 13 |
+
from typing import Any, Dict, List, Optional
|
| 14 |
+
from uuid import uuid4
|
| 15 |
+
|
| 16 |
+
from openenv.core.env_server.interfaces import Environment
|
| 17 |
+
from openenv.core.env_server.types import State
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from ..models import SREAction, SREObservation, SREState
|
| 21 |
+
from ..tasks import ALL_TASK_IDS, TASK_IDS_BY_DIFFICULTY, TASKS
|
| 22 |
+
except ImportError:
|
| 23 |
+
from models import SREAction, SREObservation, SREState
|
| 24 |
+
from tasks import ALL_TASK_IDS, TASK_IDS_BY_DIFFICULTY, TASKS
|
| 25 |
+
|
| 26 |
+
MAX_QUERIES = 12 # query budget per episode (annotate + submit don't count)
|
| 27 |
+
QUERY_RETURN_LIMIT = 50 # max log lines per query
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class SREEnvironment(Environment):
|
| 31 |
+
"""
|
| 32 |
+
SRE Incident Investigation environment.
|
| 33 |
+
|
| 34 |
+
Each episode:
|
| 35 |
+
1. Agent receives a system alert + topology description.
|
| 36 |
+
2. Agent queries logs / metrics / alerts (budget: MAX_QUERIES).
|
| 37 |
+
3. Agent annotates hypotheses freely.
|
| 38 |
+
4. Agent submits a structured incident report.
|
| 39 |
+
5. Deterministic grader scores 0.0β1.0; episode ends.
|
| 40 |
+
|
| 41 |
+
Reward shaping:
|
| 42 |
+
- Each successful query that returns β₯1 result: +0.02
|
| 43 |
+
- Each annotation: +0.01
|
| 44 |
+
- Final submit: grader_score (0.0β1.0)
|
| 45 |
+
- Running out of budget without submit: 0.0 final
|
| 46 |
+
- Repeated identical query: -0.05
|
| 47 |
+
"""
|
| 48 |
+
|
| 49 |
+
SUPPORTS_CONCURRENT_SESSIONS: bool = True
|
| 50 |
+
|
| 51 |
+
    def __init__(self) -> None:
        """Create an uninitialized environment; reset() must be called before step()."""
        super().__init__()
        self._state = SREState()          # placeholder state until reset()
        self._task = None                 # current task object; None until reset()
        self._recent_queries: List[str] = []  # for duplicate detection
        # Running shaping-reward total; only annotations add to it in the
        # handlers visible here.
        self._cumulative_reward: float = 0.0
|
| 57 |
+
|
| 58 |
+
# ------------------------------------------------------------------
|
| 59 |
+
# reset
|
| 60 |
+
# ------------------------------------------------------------------
|
| 61 |
+
|
| 62 |
+
    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        task_id: Optional[str] = None,
        difficulty: Optional[str] = None,
        **kwargs: Any,
    ) -> SREObservation:
        """
        Start a new episode and return the briefing observation.

        Task selection precedence: a known explicit ``task_id``, else a
        seeded-random pick within ``difficulty``, else a seeded-random pick
        from all tasks. The initial observation carries no telemetry — only
        the task description and the query budget.
        """
        rng = random.Random(seed)

        # Select task
        if task_id and task_id in TASKS:
            chosen_task_id = task_id
        elif difficulty and difficulty in TASK_IDS_BY_DIFFICULTY:
            candidates = TASK_IDS_BY_DIFFICULTY[difficulty]
            chosen_task_id = rng.choice(candidates)
        else:
            chosen_task_id = rng.choice(ALL_TASK_IDS)

        self._task = TASKS[chosen_task_id]
        self._recent_queries = []
        self._cumulative_reward = 0.0

        # Fresh per-episode bookkeeping.
        self._state = SREState(
            episode_id=episode_id or str(uuid4()),
            task_id=chosen_task_id,
            difficulty=self._task.difficulty,
            step_count=0,
            queries_used=0,
            max_queries=MAX_QUERIES,
            annotations=[],
            submitted=False,
            final_score=None,
        )

        return SREObservation(
            action_taken="reset",
            logs=[],
            metrics=[],
            alerts=[],
            message=(
                f"=== SRE INCIDENT INVESTIGATION ===\n\n"
                f"Task: {self._task.title}\n"
                f"Difficulty: {self._task.difficulty.upper()}\n"
                f"Query budget: {MAX_QUERIES} queries\n\n"
                f"{self._task.description}\n\n"
                f"Use action_type='query_logs', 'query_metrics', or 'query_alerts' to investigate.\n"
                f"Use 'annotate' to record hypotheses.\n"
                f"Use 'submit' when ready with your root cause analysis."
            ),
            queries_remaining=MAX_QUERIES,
            done=False,
            reward=0.0,
        )
|
| 116 |
+
|
| 117 |
+
# ------------------------------------------------------------------
|
| 118 |
+
# step
|
| 119 |
+
# ------------------------------------------------------------------
|
| 120 |
+
|
| 121 |
+
def step(
|
| 122 |
+
self,
|
| 123 |
+
action: SREAction,
|
| 124 |
+
timeout_s: Optional[float] = None,
|
| 125 |
+
**kwargs: Any,
|
| 126 |
+
) -> SREObservation:
|
| 127 |
+
if self._task is None:
|
| 128 |
+
return SREObservation(
|
| 129 |
+
action_taken="error",
|
| 130 |
+
message="Environment not initialized. Call reset() first.",
|
| 131 |
+
done=True,
|
| 132 |
+
reward=0.0,
|
| 133 |
+
queries_remaining=0,
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
if self._state.submitted:
|
| 137 |
+
return SREObservation(
|
| 138 |
+
action_taken="error",
|
| 139 |
+
message="Episode already ended. Call reset() to start a new episode.",
|
| 140 |
+
done=True,
|
| 141 |
+
reward=self._state.final_score or 0.0,
|
| 142 |
+
queries_remaining=0,
|
| 143 |
+
)
|
| 144 |
+
|
| 145 |
+
self._state.step_count += 1
|
| 146 |
+
atype = action.action_type
|
| 147 |
+
|
| 148 |
+
# ------ QUERY_ALERTS ------
|
| 149 |
+
if atype == "query_alerts":
|
| 150 |
+
return self._handle_query_alerts(action)
|
| 151 |
+
|
| 152 |
+
# ------ QUERY_LOGS ------
|
| 153 |
+
if atype == "query_logs":
|
| 154 |
+
return self._handle_query_logs(action)
|
| 155 |
+
|
| 156 |
+
# ------ QUERY_METRICS ------
|
| 157 |
+
if atype == "query_metrics":
|
| 158 |
+
return self._handle_query_metrics(action)
|
| 159 |
+
|
| 160 |
+
# ------ ANNOTATE ------
|
| 161 |
+
if atype == "annotate":
|
| 162 |
+
return self._handle_annotate(action)
|
| 163 |
+
|
| 164 |
+
# ------ SUBMIT ------
|
| 165 |
+
if atype == "submit":
|
| 166 |
+
return self._handle_submit(action)
|
| 167 |
+
|
| 168 |
+
return SREObservation(
|
| 169 |
+
action_taken="error",
|
| 170 |
+
message=f"Unknown action_type: {atype}",
|
| 171 |
+
done=False,
|
| 172 |
+
reward=0.0,
|
| 173 |
+
queries_remaining=self._queries_remaining,
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
# ------------------------------------------------------------------
|
| 177 |
+
# Internal handlers
|
| 178 |
+
# ------------------------------------------------------------------
|
| 179 |
+
|
| 180 |
+
def _handle_query_alerts(self, action: SREAction) -> SREObservation:
|
| 181 |
+
query_key = "alerts"
|
| 182 |
+
is_duplicate = query_key in self._recent_queries
|
| 183 |
+
if not is_duplicate:
|
| 184 |
+
self._recent_queries.append(query_key)
|
| 185 |
+
if self._state.queries_used < MAX_QUERIES:
|
| 186 |
+
self._state.queries_used += 1
|
| 187 |
+
|
| 188 |
+
alerts = self._task.alerts
|
| 189 |
+
msg = f"Returned {len(alerts)} alert(s)."
|
| 190 |
+
if not alerts:
|
| 191 |
+
msg = "No alerts found."
|
| 192 |
+
|
| 193 |
+
reward = -0.05 if is_duplicate else (0.02 if alerts else 0.0)
|
| 194 |
+
|
| 195 |
+
return SREObservation(
|
| 196 |
+
action_taken="query_alerts",
|
| 197 |
+
alerts=alerts,
|
| 198 |
+
message=msg,
|
| 199 |
+
done=False,
|
| 200 |
+
reward=round(reward, 4),
|
| 201 |
+
queries_remaining=self._queries_remaining,
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
def _handle_query_logs(self, action: SREAction) -> SREObservation:
|
| 205 |
+
service = action.service
|
| 206 |
+
level = action.log_level
|
| 207 |
+
window = min(action.time_window_minutes or 30, 120)
|
| 208 |
+
query = action.log_query
|
| 209 |
+
|
| 210 |
+
query_key = f"logs:{service}:{level}:{window}:{query}"
|
| 211 |
+
is_duplicate = query_key in self._recent_queries
|
| 212 |
+
if not is_duplicate:
|
| 213 |
+
self._recent_queries.append(query_key)
|
| 214 |
+
if self._state.queries_used < MAX_QUERIES:
|
| 215 |
+
self._state.queries_used += 1
|
| 216 |
+
|
| 217 |
+
logs = self._task.get_logs(
|
| 218 |
+
service=service,
|
| 219 |
+
log_level=level,
|
| 220 |
+
time_window_minutes=window,
|
| 221 |
+
log_query=query,
|
| 222 |
+
)
|
| 223 |
+
logs = logs[-QUERY_RETURN_LIMIT:]
|
| 224 |
+
|
| 225 |
+
msg = f"Returned {len(logs)} log entries"
|
| 226 |
+
if service:
|
| 227 |
+
msg += f" from {service}"
|
| 228 |
+
if level:
|
| 229 |
+
msg += f" (level β₯ {level})"
|
| 230 |
+
if query:
|
| 231 |
+
msg += f" matching '{query}'"
|
| 232 |
+
msg += "."
|
| 233 |
+
|
| 234 |
+
reward = -0.05 if is_duplicate else (0.02 if logs else 0.0)
|
| 235 |
+
|
| 236 |
+
return SREObservation(
|
| 237 |
+
action_taken="query_logs",
|
| 238 |
+
logs=logs,
|
| 239 |
+
message=msg,
|
| 240 |
+
done=False,
|
| 241 |
+
reward=round(reward, 4),
|
| 242 |
+
queries_remaining=self._queries_remaining,
|
| 243 |
+
)
|
| 244 |
+
|
| 245 |
+
def _handle_query_metrics(self, action: SREAction) -> SREObservation:
|
| 246 |
+
metric = action.metric_name
|
| 247 |
+
if not metric:
|
| 248 |
+
return SREObservation(
|
| 249 |
+
action_taken="query_metrics",
|
| 250 |
+
message="metric_name is required for query_metrics.",
|
| 251 |
+
done=False,
|
| 252 |
+
reward=-0.01,
|
| 253 |
+
queries_remaining=self._queries_remaining,
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
query_key = f"metric:{metric}"
|
| 257 |
+
is_duplicate = query_key in self._recent_queries
|
| 258 |
+
if not is_duplicate:
|
| 259 |
+
self._recent_queries.append(query_key)
|
| 260 |
+
if self._state.queries_used < MAX_QUERIES:
|
| 261 |
+
self._state.queries_used += 1
|
| 262 |
+
|
| 263 |
+
points = self._task.get_metrics(metric)
|
| 264 |
+
msg = f"Metric '{metric}': {len(points)} data points returned."
|
| 265 |
+
if not points:
|
| 266 |
+
msg = (
|
| 267 |
+
f"Metric '{metric}' not found. Available: error_rate, latency_p99, "
|
| 268 |
+
f"latency_p50, cpu_usage, memory_usage, db_connections, request_rate, cache_hit_rate"
|
| 269 |
+
)
|
| 270 |
+
|
| 271 |
+
reward = -0.05 if is_duplicate else (0.02 if points else 0.0)
|
| 272 |
+
|
| 273 |
+
return SREObservation(
|
| 274 |
+
action_taken="query_metrics",
|
| 275 |
+
metrics=points,
|
| 276 |
+
metric_name=metric,
|
| 277 |
+
message=msg,
|
| 278 |
+
done=False,
|
| 279 |
+
reward=round(reward, 4),
|
| 280 |
+
queries_remaining=self._queries_remaining,
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
def _handle_annotate(self, action: SREAction) -> SREObservation:
    """Record a free-form investigation note on the episode state.

    Blank notes are rejected without penalty; accepted notes earn a
    small fixed reward (+0.01).
    """
    text = (action.note or "").strip()
    remaining = self._queries_remaining
    if not text:
        # Reject empty/whitespace-only notes without penalising the agent.
        return SREObservation(
            action_taken="annotate",
            message="Annotation requires a non-empty 'note'.",
            done=False,
            reward=0.0,
            queries_remaining=remaining,
        )
    self._state.annotations.append(text)
    self._cumulative_reward += 0.01
    total = len(self._state.annotations)
    return SREObservation(
        action_taken="annotate",
        annotation_accepted=True,
        message=f"Annotation recorded ({total} total).",
        done=False,
        reward=0.01,
        queries_remaining=remaining,
    )
|
| 303 |
+
|
| 304 |
+
def _handle_submit(self, action: SREAction) -> SREObservation:
    """Grade the submitted incident report and terminate the episode.

    The final reward is the grader's weighted score; the message carries
    a human-readable per-criterion breakdown.
    """
    # Deterministic grading against the task's hidden answer key.
    result = self._task.grade(
        submitted_service=action.root_cause_service,
        submitted_type=action.root_cause_type,
        submitted_affected=action.affected_services,
        submitted_severity=action.severity,
        submitted_action=action.recommended_action,
    )

    self._state.submitted = True
    self._state.final_score = result.score

    msg = (
        f"Incident report submitted.\n"
        f"Final score: {result.score:.4f} / 1.0000\n\n"
        f"Breakdown:\n"
    )
    for criterion, detail in result.breakdown.items():
        # "correct_answers" is answer-key metadata, not a scored criterion.
        if criterion != "correct_answers":
            score = detail.get("score", 0)
            weight = detail.get("weight", 0)
            weighted = detail.get("weighted", 0)
            msg += f" {criterion}: {score:.2f} Γ {weight:.2f} = {weighted:.4f}\n"

    return SREObservation(
        action_taken="submit",
        grader_score=result.score,
        grader_breakdown=result.to_dict(),
        message=msg,
        done=True,
        reward=result.score,
        queries_remaining=self._queries_remaining,
    )
|
| 338 |
+
|
| 339 |
+
# ------------------------------------------------------------------
|
| 340 |
+
@property
def _queries_remaining(self) -> int:
    """Number of telemetry queries still available (never negative)."""
    remaining = MAX_QUERIES - self._state.queries_used
    return remaining if remaining > 0 else 0
|
| 343 |
+
|
| 344 |
+
# ------------------------------------------------------------------
|
| 345 |
+
# state property
|
| 346 |
+
# ------------------------------------------------------------------
|
| 347 |
+
|
| 348 |
+
@property
def state(self) -> SREState:
    """The current (mutable) episode state object."""
    return self._state
|
tasks/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .scenarios import TASKS, TASK_IDS_BY_DIFFICULTY, ALL_TASK_IDS, Task
|
tasks/scenarios.py
ADDED
|
@@ -0,0 +1,784 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SRE Incident Task definitions.
|
| 3 |
+
|
| 4 |
+
Each task defines:
|
| 5 |
+
- Synthetic telemetry (logs, metrics, alerts) seeded for reproducibility
|
| 6 |
+
- A deterministic grader that scores 0.0-1.0
|
| 7 |
+
- Difficulty: easy | medium | hard
|
| 8 |
+
|
| 9 |
+
Grader criteria and weights:
|
| 10 |
+
root_cause_service : 0.35
|
| 11 |
+
root_cause_type : 0.25
|
| 12 |
+
affected_services : 0.15
|
| 13 |
+
severity : 0.10
|
| 14 |
+
recommended_action : 0.15
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
from datetime import datetime, timedelta, timezone
|
| 21 |
+
from typing import Any, Dict, List, Optional
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Helpers
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
|
| 28 |
+
def _ts(offset_minutes: int = 0, offset_seconds: int = 0,
|
| 29 |
+
base: Optional[datetime] = None) -> str:
|
| 30 |
+
base = base or datetime(2024, 6, 15, 14, 0, 0, tzinfo=timezone.utc)
|
| 31 |
+
return (base + timedelta(minutes=offset_minutes, seconds=offset_seconds)
|
| 32 |
+
).strftime("%Y-%m-%dT%H:%M:%SZ")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _log(ts_offset: int, service: str, level: str, message: str,
         trace_id: Optional[str] = None, ts_seconds: int = 0) -> Dict:
    """Build one synthetic log record.

    The "trace_id" key is only present when a truthy trace id is given,
    mirroring real log pipelines where tracing is optional.
    """
    record: Dict = dict(
        timestamp=_ts(ts_offset, ts_seconds),
        service=service,
        level=level,
        message=message,
    )
    if trace_id:
        record["trace_id"] = trace_id
    return record
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def _metric_series(name: str, values: List[float],
                   start_offset: int = -60, interval_minutes: int = 5) -> List[Dict]:
    """Expand raw *values* into {"timestamp", "value"} points.

    Points are spaced *interval_minutes* apart starting *start_offset*
    minutes from the scenario anchor time.

    NOTE(review): *name* is accepted for call-site readability but is not
    used in the output records.
    """
    points: List[Dict] = []
    for idx, val in enumerate(values):
        stamp = _ts(start_offset + idx * interval_minutes)
        points.append({"timestamp": stamp, "value": val})
    return points
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
# Grader
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
|
| 60 |
+
@dataclass
class GradeResult:
    """Outcome of grading one incident report."""

    # Weighted total in [0.0, 1.0].
    score: float
    # Per-criterion details plus the answer key under "correct_answers".
    breakdown: Dict[str, Any]

    def to_dict(self) -> Dict:
        """Serialise for transport, rounding the score to 4 decimals."""
        payload: Dict = {"score": round(self.score, 4), "breakdown": self.breakdown}
        return payload
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _grade(*, correct_service, correct_type, correct_affected,
           correct_severity, action_keywords,
           submitted_service, submitted_type, submitted_affected,
           submitted_severity, submitted_action) -> GradeResult:
    """Score a submitted incident report against the answer key.

    Criteria and weights:
        root_cause_service 0.35 (exact match, case-insensitive)
        root_cause_type    0.25 (exact match, case-sensitive)
        affected_services  0.15 (F1 over case-folded sets)
        severity           0.10 (1.0 exact, 0.5 when off by one P-level)
        recommended_action 0.15 (fraction of keywords found in the text)

    Returns a GradeResult whose breakdown carries per-criterion scores and
    the correct answers under the "correct_answers" key.
    """

    weights = {
        "root_cause_service": 0.35,
        "root_cause_type": 0.25,
        "affected_services": 0.15,
        "severity": 0.10,
        "recommended_action": 0.15,
    }

    svc_score = 1.0 if (submitted_service or "").lower() == correct_service.lower() else 0.0
    type_score = 1.0 if (submitted_type or "") == correct_type else 0.0

    # F1 between submitted and correct affected-service sets (case-folded).
    if submitted_affected:
        sub = {s.lower() for s in submitted_affected}
        cor = {s.lower() for s in correct_affected}
        if cor:
            p = len(sub & cor) / len(sub) if sub else 0.0
            r = len(sub & cor) / len(cor)
            aff_score = 2*p*r/(p+r) if (p+r) > 0 else 0.0
        else:
            aff_score = 1.0 if not sub else 0.0
    else:
        aff_score = 0.0

    order = ["P1", "P2", "P3", "P4"]
    if submitted_severity == correct_severity:
        sev_score = 1.0
    elif submitted_severity in order and correct_severity in order:
        # Adjacent severity (e.g. P1 vs P2) earns half credit.
        # BUGFIX: the original only checked submitted_severity against the
        # P1-P4 list, so a non-standard correct_severity in the answer key
        # raised ValueError from order.index(); now both are guarded and
        # any unmatched pair scores 0.0.
        sev_score = 0.5 if abs(order.index(submitted_severity) - order.index(correct_severity)) == 1 else 0.0
    else:
        sev_score = 0.0

    # Keyword coverage of the recommended remediation text.
    action_text = (submitted_action or "").lower()
    act_score = sum(1 for kw in action_keywords if kw.lower() in action_text) / len(action_keywords) if action_keywords else 1.0

    scores = dict(root_cause_service=svc_score, root_cause_type=type_score,
                  affected_services=aff_score, severity=sev_score,
                  recommended_action=act_score)
    total = sum(scores[k] * weights[k] for k in weights)
    breakdown = {
        k: {"score": round(scores[k], 4), "weight": weights[k],
            "weighted": round(scores[k]*weights[k], 4)}
        for k in weights
    }
    # Embed the answer key so graders/clients can show a full post-mortem.
    breakdown["correct_answers"] = dict(
        root_cause_service=correct_service, root_cause_type=correct_type,
        affected_services=correct_affected, severity=correct_severity,
        action_keywords=action_keywords)
    return GradeResult(score=round(total, 4), breakdown=breakdown)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ---------------------------------------------------------------------------
|
| 125 |
+
# Task dataclass
|
| 126 |
+
# ---------------------------------------------------------------------------
|
| 127 |
+
|
| 128 |
+
@dataclass
class Task:
    """One SRE incident scenario: synthetic telemetry plus a hidden answer key.

    The public fields hold the observable telemetry; the underscore-prefixed
    fields (repr=False) form the grading key consumed by grade().
    """

    task_id: str
    difficulty: str
    title: str
    description: str
    # Telemetry, keyed by service name / metric name.
    logs_by_service: Dict[str, List[Dict]] = field(default_factory=dict)
    metrics: Dict[str, List[Dict]] = field(default_factory=dict)
    alerts: List[Dict] = field(default_factory=list)
    # Hidden answer key (excluded from repr so it never leaks into logs).
    _correct_service: str = field(default="", repr=False)
    _correct_type: str = field(default="", repr=False)
    _correct_affected: List[str] = field(default_factory=list, repr=False)
    _correct_severity: str = field(default="P2", repr=False)
    _action_keywords: List[str] = field(default_factory=list, repr=False)

    def get_logs(self, service=None, log_level=None,
                 time_window_minutes=30, log_query=None) -> List[Dict]:
        """Filter logs by service, minimum level, recency window and substring.

        *log_level* is a floor: e.g. "WARN" returns WARN, ERROR and FATAL.
        Results are sorted by timestamp ascending.
        """
        level_order = ["DEBUG", "INFO", "WARN", "ERROR", "FATAL"]
        # Unknown or absent level falls back to DEBUG (no level filtering).
        min_idx = level_order.index(log_level) if log_level in level_order else 0
        # ISO-8601 "Z" timestamps compare chronologically as plain strings.
        cutoff = _ts(-time_window_minutes)
        all_logs: List[Dict] = []
        for svc, entries in self.logs_by_service.items():
            if service and svc.lower() != service.lower():
                continue
            all_logs.extend(entries)
        result = [
            e for e in all_logs
            if e["timestamp"] >= cutoff
            and level_order.index(e["level"]) >= min_idx
            and (not log_query or log_query.lower() in e["message"].lower())
        ]
        return sorted(result, key=lambda e: e["timestamp"])

    def get_metrics(self, metric_name: str) -> List[Dict]:
        """Return the time series for *metric_name* ([] when unknown)."""
        return self.metrics.get(metric_name, [])

    def grade(self, submitted_service, submitted_type, submitted_affected,
              submitted_severity, submitted_action) -> GradeResult:
        """Score a submitted incident report against this task's answer key."""
        return _grade(
            correct_service=self._correct_service,
            correct_type=self._correct_type,
            correct_affected=self._correct_affected,
            correct_severity=self._correct_severity,
            action_keywords=self._action_keywords,
            submitted_service=submitted_service,
            submitted_type=submitted_type,
            submitted_affected=submitted_affected,
            submitted_severity=submitted_severity,
            submitted_action=submitted_action,
        )
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
# ---------------------------------------------------------------------------
|
| 181 |
+
# TASK 1 - EASY: payment-service JVM OOM crash loop
|
| 182 |
+
# ---------------------------------------------------------------------------
|
| 183 |
+
|
| 184 |
+
def _build_task_easy() -> Task:
    """Build the EASY scenario: payment-service JVM OOM crash loop.

    Storyline encoded in the telemetry: an unbounded PaymentCache leaks
    heap until the JVM OOMs; Kubernetes OOMKills the pod repeatedly
    (CrashLoopBackOff), api-gateway's circuit breaker opens, and orders
    downstream fail. Root cause: payment-service resource_exhaustion.
    """
    # --- synthetic logs, per service -----------------------------------
    logs = {
        "api-gateway": [
            _log(-58, "api-gateway", "INFO", "POST /v1/checkout 200 OK latency=142ms", "t-8801"),
            _log(-55, "api-gateway", "INFO", "POST /v1/checkout 200 OK latency=138ms", "t-8802"),
            _log(-52, "api-gateway", "INFO", "POST /v1/checkout 200 OK latency=145ms", "t-8803"),
            _log(-49, "api-gateway", "INFO", "POST /v1/checkout 200 OK latency=151ms", "t-8804"),
            _log(-46, "api-gateway", "INFO", "POST /v1/checkout 200 OK latency=149ms", "t-8805"),
            _log(-43, "api-gateway", "WARN", "POST /v1/checkout 503 upstream=payment-service latency=5012ms", "t-8806"),
            _log(-42, "api-gateway", "WARN", "POST /v1/checkout 503 upstream=payment-service latency=5011ms", "t-8807"),
            _log(-41, "api-gateway", "ERROR", "POST /v1/checkout 503 upstream=payment-service retries=3/3 exhausted", "t-8808"),
            _log(-40, "api-gateway", "ERROR", "POST /v1/checkout 503 upstream=payment-service retries=3/3 exhausted", "t-8809"),
            _log(-39, "api-gateway", "ERROR", "POST /v1/checkout 503 upstream=payment-service retries=3/3 exhausted", "t-8810"),
            _log(-38, "api-gateway", "WARN", "Upstream payment-service health probe failed β marking unhealthy"),
            _log(-37, "api-gateway", "ERROR", "POST /v1/checkout 503 upstream=payment-service retries=3/3 exhausted", "t-8811"),
            _log(-36, "api-gateway", "ERROR", "POST /v1/checkout 503 upstream=payment-service retries=3/3 exhausted", "t-8812"),
            _log(-35, "api-gateway", "ERROR", "Circuit breaker OPEN for upstream=payment-service (threshold: 50% errors in 60s)"),
            _log(-34, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN short-circuiting", "t-8813"),
            _log(-32, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN short-circuiting", "t-8814"),
            _log(-30, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN short-circuiting", "t-8815"),
            _log(-28, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN short-circuiting", "t-8816"),
            _log(-25, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN short-circuiting", "t-8817"),
            _log(-22, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN short-circuiting", "t-8818"),
            _log(-18, "api-gateway", "INFO", "Circuit breaker half-open β probing payment-service"),
            _log(-18, "api-gateway", "ERROR", "Probe failed β circuit breaker returning to OPEN", None, 6),
            _log(-15, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN", "t-8819"),
            _log(-12, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN", "t-8820"),
            _log(-10, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN", "t-8821"),
            _log(-8, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN", "t-8822"),
            _log(-5, "api-gateway", "ERROR", "POST /v1/checkout 503 circuit_breaker=OPEN", "t-8823"),
        ],
        # The root-cause service: heap leak -> OOM -> CrashLoopBackOff.
        "payment-service": [
            _log(-60, "payment-service", "INFO", "PaymentProcessor ready β JVM heap_max=2048MB heap_used=820MB (40%) gc=G1GC"),
            _log(-58, "payment-service", "INFO", "Processed req_id=PAY-44201 amount=$99.99 card=****4242 status=SUCCESS latency=88ms"),
            _log(-56, "payment-service", "INFO", "Processed req_id=PAY-44202 amount=$249.00 card=****1337 status=SUCCESS latency=91ms"),
            _log(-54, "payment-service", "INFO", "Processed req_id=PAY-44203 amount=$19.99 card=****9981 status=SUCCESS latency=85ms"),
            _log(-52, "payment-service", "INFO", "Processed req_id=PAY-44204 amount=$149.50 card=****5566 status=SUCCESS latency=93ms"),
            _log(-50, "payment-service", "INFO", "GC stats: minor_gc=12ms major_gc=0ms heap_used=940MB (46%) live_objects=2.1M"),
            _log(-48, "payment-service", "INFO", "Processed req_id=PAY-44205 amount=$79.99 card=****7723 status=SUCCESS latency=95ms"),
            _log(-46, "payment-service", "WARN", "JVM heap pressure rising: heap_used=1420MB (69%) β GC overhead increasing"),
            _log(-45, "payment-service", "WARN", "GC pause: stop-the-world=340ms heap_before=1420MB heap_after=1180MB β not fully reclaimed"),
            _log(-44, "payment-service", "WARN", "heap_used=1580MB (77%) β possible memory leak in PaymentCache (size=142,000 entries)"),
            _log(-43, "payment-service", "WARN", "PaymentCache eviction not keeping pace with inserts. Cache size unbounded."),
            _log(-42, "payment-service", "WARN", "heap_used=1780MB (87%) β GC running continuously, throughput degrading"),
            _log(-41, "payment-service", "WARN", "GC pause: stop-the-world=1840ms heap_before=1780MB heap_after=1720MB β GC ineffective"),
            _log(-40, "payment-service", "ERROR", "heap_used=1970MB (96%) β CRITICAL: approaching max heap"),
            _log(-40, "payment-service", "ERROR", "java.lang.OutOfMemoryError: Java heap space\n\tat com.payments.cache.PaymentCache.put(PaymentCache.java:218)\n\tat com.payments.processor.PaymentProcessor.process(PaymentProcessor.java:441)", None, 20),
            _log(-40, "payment-service", "FATAL", "Unrecoverable JVM state β OutOfMemoryError in critical thread pool. Shutting down.", None, 30),
            _log(-39, "payment-service", "INFO", "Pod payment-service-7d9f8b-xk2pq restarted by Kubernetes (reason: OOMKilled) restart_count=1"),
            _log(-39, "payment-service", "INFO", "PaymentProcessor starting β heap_max=2048MB heap_used=210MB (10%) restart_count=1", None, 15),
            _log(-38, "payment-service", "INFO", "Processed req_id=PAY-44206 amount=$59.99 status=SUCCESS latency=90ms"),
            _log(-37, "payment-service", "WARN", "heap_used=1210MB (59%) β heap growing rapidly post-restart"),
            _log(-36, "payment-service", "WARN", "heap_used=1640MB (80%) β PaymentCache re-warming too aggressively after restart"),
            _log(-35, "payment-service", "ERROR", "heap_used=1950MB (95%) β OOM imminent"),
            _log(-35, "payment-service", "ERROR", "java.lang.OutOfMemoryError: Java heap space\n\tat com.payments.cache.PaymentCache.put(PaymentCache.java:218)", None, 20),
            _log(-35, "payment-service", "FATAL", "Unrecoverable JVM state β shutting down restart_count=2", None, 25),
            _log(-34, "payment-service", "INFO", "Pod restarted by Kubernetes (OOMKilled) restart_count=2"),
            _log(-34, "payment-service", "INFO", "PaymentProcessor starting β restart_count=2", None, 15),
            _log(-33, "payment-service", "WARN", "heap_used=1380MB (67%) β same leak pattern post-restart"),
            _log(-32, "payment-service", "WARN", "heap_used=1750MB (85%) β GC stop-the-world=2100ms"),
            _log(-31, "payment-service", "ERROR", "java.lang.OutOfMemoryError: Java heap space\n\tat com.payments.cache.PaymentCache.put(PaymentCache.java:218)"),
            _log(-31, "payment-service", "FATAL", "Unrecoverable JVM state β shutting down restart_count=3", None, 10),
            _log(-30, "payment-service", "INFO", "Pod restarted (OOMKilled) restart_count=3. Kubernetes CrashLoopBackOff active."),
            _log(-29, "payment-service", "WARN", "CrashLoopBackOff β next restart in 30s (exponential backoff)"),
            _log(-27, "payment-service", "INFO", "PaymentProcessor starting β restart_count=3 (after backoff)"),
            _log(-26, "payment-service", "WARN", "heap_used=1550MB (76%) immediately on startup β unbounded cache warming on boot"),
            _log(-25, "payment-service", "ERROR", "java.lang.OutOfMemoryError: Java heap space"),
            _log(-25, "payment-service", "FATAL", "Unrecoverable JVM state β shutting down restart_count=4", None, 30),
            _log(-24, "payment-service", "INFO", "Pod restarted (OOMKilled) restart_count=4 β Kubernetes backoff now 60s"),
            _log(-20, "payment-service", "WARN", "Kubernetes will not restart for 60s (exponential backoff, restart_count=4)"),
            _log(-10, "payment-service", "INFO", "Pod restarting after backoff β restart_count=5"),
            _log(-10, "payment-service", "WARN", "heap_used=1410MB (69%) β identical leak pattern. PaymentCache has no max size configured.", None, 20),
            _log(-9, "payment-service", "ERROR", "java.lang.OutOfMemoryError: Java heap space"),
            _log(-9, "payment-service", "FATAL", "Unrecoverable JVM state β restart_count=5", None, 15),
            _log(-8, "payment-service", "INFO", "Pod restarted (OOMKilled) restart_count=5. Kubernetes backoff=120s."),
        ],
        # Downstream victim: orders fail once payments are unavailable.
        "order-service": [
            _log(-58, "order-service", "INFO", "Order #ORD-88901 created user_id=u-4421 items=3 total=$99.99 β awaiting payment confirmation"),
            _log(-55, "order-service", "INFO", "Order #ORD-88901 payment confirmed β status=CONFIRMED fulfillment=queued"),
            _log(-53, "order-service", "INFO", "Order #ORD-88902 created user_id=u-4422 items=1 total=$249.00 β awaiting payment"),
            _log(-51, "order-service", "INFO", "Order #ORD-88902 payment confirmed β status=CONFIRMED"),
            _log(-45, "order-service", "INFO", "Order #ORD-88904 created user_id=u-4424 items=2 total=$149.50 β awaiting payment"),
            _log(-43, "order-service", "WARN", "Order #ORD-88904 payment callback timeout 5000ms β retrying (1/3)"),
            _log(-43, "order-service", "WARN", "Order #ORD-88904 payment callback timeout 5000ms β retrying (2/3)", None, 10),
            _log(-43, "order-service", "ERROR", "Order #ORD-88904 payment failed after 3 retries β status=PAYMENT_FAILED", None, 20),
            _log(-42, "order-service", "ERROR", "Order #ORD-88905 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-41, "order-service", "ERROR", "Order #ORD-88906 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-40, "order-service", "ERROR", "Order #ORD-88907 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-39, "order-service", "ERROR", "Order #ORD-88908 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-37, "order-service", "ERROR", "7 orders failed in last 5min due to payment-service unavailability"),
            _log(-35, "order-service", "ERROR", "Order #ORD-88910 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-30, "order-service", "ERROR", "Order #ORD-88911 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-28, "order-service", "WARN", "Dead-letter queue: 14 failed payment callbacks pending retry. Revenue at risk: $2,841"),
            _log(-25, "order-service", "ERROR", "Order #ORD-88913 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-20, "order-service", "WARN", "Dead-letter queue: 22 pending payment retries. Estimated revenue at risk: $4,218"),
            _log(-15, "order-service", "ERROR", "Order #ORD-88916 payment-service unavailable β status=PAYMENT_FAILED"),
            _log(-10, "order-service", "WARN", "Dead-letter queue: 31 pending payment retries. Estimated revenue at risk: $6,441"),
            _log(-5, "order-service", "WARN", "Dead-letter queue: 38 pending payment retries. Estimated revenue at risk: $7,892"),
        ],
        # Secondary impact: reservations released when payments fail.
        "inventory-service": [
            _log(-58, "inventory-service", "INFO", "Reserved stock: order=#ORD-88901 sku=SKU-4421 qty=1 warehouse=SEA-01"),
            _log(-55, "inventory-service", "INFO", "Reservation confirmed (payment ok): order=#ORD-88901"),
            _log(-53, "inventory-service", "INFO", "Reserved stock: order=#ORD-88902 sku=SKU-8821 qty=1 warehouse=SEA-01"),
            _log(-51, "inventory-service", "INFO", "Reservation confirmed (payment ok): order=#ORD-88902"),
            _log(-43, "inventory-service", "INFO", "Reserved stock: order=#ORD-88904 sku=SKU-2211 qty=2 warehouse=PDX-01"),
            _log(-43, "inventory-service", "WARN", "Payment failed for #ORD-88904 β releasing reservation after 5min hold", None, 30),
            _log(-40, "inventory-service", "WARN", "Multiple reservation releases: orders=[88905,88906,88907] β payment-service down"),
            _log(-35, "inventory-service", "WARN", "High reservation release rate: 7 in 10min (normal: <1/hr) β payment-service outage"),
            _log(-20, "inventory-service", "WARN", "22 reservation releases since 13:17 UTC β stock re-pooled but revenue lost"),
            _log(-5, "inventory-service", "WARN", "38 reservation releases total β payment outage impacting inventory cycle"),
        ],
        # Tertiary impact: customer-facing notifications pile up.
        "notification-service": [
            _log(-43, "notification-service", "WARN", "Order #ORD-88904 confirmation delayed β payment pending"),
            _log(-40, "notification-service", "ERROR", "Failed to send confirmation for #ORD-88904 β payment_status=FAILED"),
            _log(-38, "notification-service", "ERROR", "Failed to send confirmation for #ORD-88905 οΏ½οΏ½οΏ½ payment_status=FAILED"),
            _log(-35, "notification-service", "WARN", "Email queue: 8 failed-order notifications pending"),
            _log(-20, "notification-service", "WARN", "Email queue: 21 failed-order notifications. 14 support tickets opened by customers."),
            _log(-10, "notification-service", "WARN", "Email queue: 32 failed-order notifications. Customer support queue growing."),
        ],
    }

    # --- metric time series (memory shows the sawtooth OOM/restart cycle,
    #     error_rate/latency show the customer-visible failure) -----------
    metrics = {
        "memory_usage": _metric_series("memory_usage",
            [40, 46, 52, 60, 69, 77, 87, 96, 10, 59, 80, 95, 10, 67, 85, 10, 10],
            start_offset=-80, interval_minutes=5),
        "error_rate": _metric_series("error_rate",
            [0.2, 0.2, 0.3, 0.3, 0.4, 0.5, 1.2, 28.0, 52.0, 54.0, 51.0, 53.0, 55.0, 54.0, 52.0, 51.0, 53.0],
            start_offset=-80, interval_minutes=5),
        "request_rate": _metric_series("request_rate",
            [118, 120, 122, 119, 121, 118, 98, 42, 15, 12, 11, 10, 10, 9, 9, 9, 9],
            start_offset=-80, interval_minutes=5),
        "latency_p99": _metric_series("latency_p99",
            [145, 148, 151, 149, 152, 155, 820, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000],
            start_offset=-80, interval_minutes=5),
        "latency_p50": _metric_series("latency_p50",
            [88, 90, 91, 89, 92, 94, 320, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000],
            start_offset=-80, interval_minutes=5),
        "cpu_usage": _metric_series("cpu_usage",
            [28, 30, 32, 35, 40, 48, 75, 88, 15, 72, 84, 95, 12, 70, 88, 12, 12],
            start_offset=-80, interval_minutes=5),
        "db_connections": _metric_series("db_connections",
            [42, 43, 44, 43, 44, 45, 42, 38, 35, 36, 35, 34, 35, 34, 34, 33, 33],
            start_offset=-80, interval_minutes=5),
        "cache_hit_rate": _metric_series("cache_hit_rate",
            [91, 91, 91, 90, 90, 90, 89, 88, 88, 88, 87, 87, 87, 86, 86, 86, 85],
            start_offset=-80, interval_minutes=5),
    }

    # --- firing alerts the agent can list ------------------------------
    alerts = [
        {"alert_name": "PodOOMKilled", "service": "payment-service",
         "severity": "critical", "fired_at": _ts(-40),
         "message": "payment-service OOMKilled 5x in 40 minutes. restart_count=5. CrashLoopBackOff active.",
         "status": "firing"},
        {"alert_name": "HighErrorRate", "service": "api-gateway",
         "severity": "critical", "fired_at": _ts(-37),
         "message": "api-gateway error rate 52% (threshold 5%). Upstream: payment-service.",
         "status": "firing"},
        {"alert_name": "CircuitBreakerOpen","service": "api-gateway",
         "severity": "warning", "fired_at": _ts(-35),
         "message": "Circuit breaker OPEN for payment-service. Checkout short-circuiting.",
         "status": "firing"},
        {"alert_name": "RevenueImpact", "service": "order-service",
         "severity": "critical", "fired_at": _ts(-20),
         "message": "38 orders stuck in PAYMENT_FAILED. Estimated revenue at risk: $7,892.",
         "status": "firing"},
    ]

    # --- assembled task with hidden answer key -------------------------
    return Task(
        task_id="sre-easy-001",
        difficulty="easy",
        title="Checkout Failures β Payment Service OOMKilled (CrashLoopBackOff)",
        description=(
            "INCIDENT ALERT β P2 β 13:20 UTC\n\n"
            "Customer-facing checkout failing. Error rate on POST /v1/checkout\n"
            "spiked to ~52%. Customers cannot complete purchases.\n"
            "Finance flagging revenue impact. On-call SRE paged.\n\n"
            "System topology:\n"
            " api-gateway β payment-service (card processing)\n"
            " api-gateway β order-service β payment-service\n"
            " order-service β inventory-service\n"
            " order-service β notification-service\n\n"
            "Query logs, metrics, and alerts to identify root cause.\n"
            "Then submit your structured incident report.\n\n"
            "Available services: api-gateway, payment-service, order-service,\n"
            " inventory-service, notification-service\n"
            "Available metrics: error_rate, latency_p99, latency_p50, cpu_usage,\n"
            " memory_usage, db_connections, request_rate, cache_hit_rate"
        ),
        logs_by_service=logs,
        metrics=metrics,
        alerts=alerts,
        _correct_service="payment-service",
        _correct_type="resource_exhaustion",
        _correct_affected=["payment-service", "api-gateway", "order-service"],
        _correct_severity="P2",
        _action_keywords=["memory", "heap", "limit", "jvm"],
    )
|
| 381 |
+
|
| 382 |
+
|
| 383 |
+
# ---------------------------------------------------------------------------
|
| 384 |
+
# TASK 2 - MEDIUM: analytics exhausts shared DB connection pool
|
| 385 |
+
# ---------------------------------------------------------------------------
|
| 386 |
+
|
| 387 |
+
def _build_task_medium() -> Task:
    """Build the medium-difficulty SRE incident scenario (task ``sre-medium-002``).

    Scenario: a long-running analytics report job monopolises the shared
    PostgreSQL connection pool, starving order-service (which crash-loops) and
    cascading 5xx errors up through api-gateway.  The "loudest" victims are
    order-service and api-gateway, but the root cause is analytics-service —
    the grader expects ``analytics-service`` / ``resource_exhaustion``.

    Returns a fully populated :class:`Task` containing synthetic logs, metric
    time series, and alert history that the agent can query.

    NOTE(review): ``_log(offset, service, level, message, ...)`` appears to take
    a minute offset relative to "now"; the trailing ``None, 5`` style arguments
    on some entries presumably control repetition/jitter — confirm against the
    ``_log`` helper's definition.
    """
    # Per-service synthetic log streams.  Offsets tell the incident story in
    # chronological order: analytics job starts (~-80m), pool saturates
    # (~-58m), downstream failures (-46m..-20m), recovery (~-12m onward).
    logs = {
        # Victim #2: gateway sees upstream timeouts, then opens its circuit
        # breaker on order-service until the pool frees up.
        "api-gateway": [
            _log(-75, "api-gateway", "INFO", "All upstreams healthy. Serving 840 req/min. p99=198ms"),
            _log(-60, "api-gateway", "INFO", "GET /api/v2/orders 200 OK latency=198ms"),
            _log(-55, "api-gateway", "INFO", "GET /api/v2/orders 200 OK latency=201ms"),
            _log(-50, "api-gateway", "INFO", "GET /api/v2/orders 200 OK latency=195ms"),
            _log(-48, "api-gateway", "WARN", "GET /api/v2/orders 200 OK latency=890ms β upstream degrading"),
            _log(-45, "api-gateway", "WARN", "GET /api/v2/orders 200 OK latency=1820ms β upstream slow"),
            _log(-43, "api-gateway", "WARN", "GET /api/v2/orders 200 OK latency=4200ms β near timeout"),
            _log(-42, "api-gateway", "ERROR", "GET /api/v2/orders 504 Gateway Timeout upstream=order-service timeout=5000ms"),
            _log(-41, "api-gateway", "ERROR", "GET /api/v2/orders 504 Gateway Timeout upstream=order-service"),
            _log(-40, "api-gateway", "ERROR", "GET /api/v2/orders 504 Gateway Timeout upstream=order-service (3 consecutive)"),
            _log(-39, "api-gateway", "WARN", "Retrying order-service (attempt 1/3) β upstream not responding"),
            _log(-38, "api-gateway", "ERROR", "GET /api/v2/orders 504 β retry 1/3 also timed out"),
            _log(-37, "api-gateway", "ERROR", "GET /api/v2/orders 503 order-service marked unhealthy by load balancer"),
            _log(-36, "api-gateway", "ERROR", "Circuit breaker OPENING for order-service error_rate=61% threshold=50%"),
            _log(-35, "api-gateway", "ERROR", "GET /api/v2/orders 503 circuit_breaker=OPEN upstream=order-service"),
            _log(-30, "api-gateway", "ERROR", "GET /api/v2/orders 503 circuit_breaker=OPEN upstream=order-service"),
            _log(-25, "api-gateway", "ERROR", "GET /api/v2/orders 503 circuit_breaker=OPEN upstream=order-service"),
            _log(-20, "api-gateway", "ERROR", "GET /api/v2/orders 503 circuit_breaker=OPEN upstream=order-service"),
            _log(-18, "api-gateway", "INFO", "Circuit breaker half-open β probing order-service"),
            _log(-18, "api-gateway", "ERROR", "Probe failed β circuit returning to OPEN (order-service still unhealthy)", None, 5),
            _log(-15, "api-gateway", "ERROR", "GET /api/v2/orders 503 circuit_breaker=OPEN"),
            _log(-10, "api-gateway", "INFO", "DB connection alert resolved β analytics-service released connections"),
            _log(-8, "api-gateway", "INFO", "Circuit breaker half-open β reprobing order-service"),
            _log(-7, "api-gateway", "INFO", "order-service responding β circuit breaker CLOSING"),
            _log(-5, "api-gateway", "INFO", "GET /api/v2/orders 200 OK latency=205ms β service recovered"),
        ],
        # Victim #1: cannot acquire DB connections, fails liveness probes and
        # enters CrashLoopBackOff — looks like the culprit but is not.
        "order-service": [
            _log(-75, "order-service", "INFO", "Healthy. db_pool=8/50 threads=24/200"),
            _log(-70, "order-service", "INFO", "Order #ORD-77001 fetched user_id=u-8812 items=4 db_query=12ms"),
            _log(-65, "order-service", "INFO", "Order #ORD-77002 created user_id=u-8813 total=$189.00"),
            _log(-60, "order-service", "INFO", "db_pool=12/50 threads=28/200 β normal"),
            _log(-55, "order-service", "INFO", "Order #ORD-77003 fetched db_query=14ms"),
            _log(-52, "order-service", "WARN", "db_pool=28/50 β filling up (normal threshold <20)"),
            _log(-50, "order-service", "WARN", "db_pool=38/50 β elevated. DB appears saturated."),
            _log(-48, "order-service", "WARN", "db_pool=44/50 β connections not releasing."),
            _log(-46, "order-service", "ERROR", "db_pool=50/50 β POOL EXHAUSTED. Requests queuing."),
            _log(-45, "order-service", "ERROR", "DB connection acquisition timeout 5000ms β request_id=r-9901 order=#ORD-77004"),
            _log(-44, "order-service", "ERROR", "DB connection acquisition timeout 5000ms β request_id=r-9902 order=#ORD-77005"),
            _log(-43, "order-service", "ERROR", "DB connection acquisition timeout 5000ms β request_id=r-9903 order=#ORD-77006"),
            _log(-42, "order-service", "ERROR", "DB connection acquisition timeout 5000ms β request_id=r-9904 order=#ORD-77007"),
            _log(-41, "order-service", "ERROR", "Failed to process #ORD-77005: HikariPool.getConnection β pool exhausted"),
            _log(-40, "order-service", "ERROR", "Failed to process #ORD-77006: HikariPool.getConnection β pool exhausted"),
            _log(-39, "order-service", "ERROR", "Failed to process #ORD-77007: HikariPool.getConnection β pool exhausted"),
            _log(-38, "order-service", "ERROR", "5 requests failed in 60s β DB connection exhaustion"),
            _log(-37, "order-service", "ERROR", "Health check /health cannot reach postgres β returning 503"),
            _log(-36, "order-service", "FATAL", "Kubernetes liveness probe failed 3x consecutively β pod marked unhealthy"),
            _log(-35, "order-service", "INFO", "Pod order-service-6c8d9b-mn4rs restarting (liveness probe failure)"),
            _log(-34, "order-service", "INFO", "Pod starting up β attempting DB connection"),
            _log(-34, "order-service", "ERROR", "DB connection pool exhausted IMMEDIATELY on startup β postgres at limit", None, 10),
            _log(-33, "order-service", "FATAL", "Cannot initialise β DB unavailable. Pod crashing.", None, 20),
            _log(-32, "order-service", "INFO", "Pod restarting (CrashLoopBackOff)"),
            _log(-30, "order-service", "INFO", "Pod starting (backoff=30s)"),
            _log(-30, "order-service", "ERROR", "DB connection pool exhausted on startup β postgres still at limit", None, 5),
            _log(-25, "order-service", "ERROR", "Still cannot acquire DB connections β postgres connection limit"),
            _log(-20, "order-service", "WARN", "Kubernetes CrashLoopBackOff β pod will retry in 60s"),
            _log(-10, "order-service", "INFO", "Postgres connections freed β pool recovering db_pool=5/50"),
            _log(-9, "order-service", "INFO", "DB connection acquired successfully β service starting normally"),
            _log(-8, "order-service", "INFO", "Healthy β db_pool=8/50 all endpoints responding"),
        ],
        # ROOT CAUSE: unbounded 4.2M-row report query with parallel workers
        # holds all pooled connections for ~66 minutes.  Its own logs are only
        # INFO/WARN, so it "looks healthy" at a glance.
        "analytics-service": [
            _log(-80, "analytics-service", "INFO", "Nightly revenue report job scheduled β cron='0 13 * * *' job_id=report-2024-06-15"),
            _log(-78, "analytics-service", "INFO", "Connecting to postgres for report job. Acquiring connection pool."),
            _log(-78, "analytics-service", "INFO", "Running SQL: SELECT o.*, oi.*, p.*, u.* FROM orders o JOIN order_items oi ON o.id=oi.order_id JOIN products p ON oi.product_id=p.id JOIN users u ON o.user_id=u.id WHERE o.created_at > NOW() - INTERVAL '90 days' -- WARNING: no LIMIT clause"),
            _log(-75, "analytics-service", "INFO", "Query running β estimated 4.2M rows. elapsed=2m rows_fetched=180,000"),
            _log(-72, "analytics-service", "WARN", "Query running β elapsed=6m rows_fetched=820,000. Spawning parallel workers."),
            _log(-70, "analytics-service", "INFO", "Parallel workers spawned: 8 workers x 4 conns = 32 DB connections held"),
            _log(-67, "analytics-service", "WARN", "Query running β elapsed=11m rows_fetched=1,840,000. 38 DB connections held."),
            _log(-65, "analytics-service", "WARN", "Aggregation phase: sorting 1.84M rows β spilling to disk (work_mem=64MB insufficient)"),
            _log(-62, "analytics-service", "WARN", "Query running β elapsed=16m rows_fetched=2,610,000. 44 DB connections held."),
            _log(-60, "analytics-service", "WARN", "Query running β elapsed=18m rows_fetched=3,100,000. 48 DB connections held."),
            _log(-58, "analytics-service", "WARN", "Query running β elapsed=20m rows_fetched=3,580,000. 50 DB connections held. Pool near limit."),
            _log(-55, "analytics-service", "WARN", "Postgres max_connections approaching β other services may be starved"),
            _log(-52, "analytics-service", "WARN", "Query running β elapsed=26m rows_fetched=4,020,000. Still holding 50 connections."),
            _log(-48, "analytics-service", "WARN", "Query running β elapsed=30m rows_fetched=4,200,000 (all rows fetched). Aggregation in progress."),
            _log(-45, "analytics-service", "WARN", "Aggregation running β computing GROUP BY across 4.2M rows. 50 connections still locked."),
            _log(-40, "analytics-service", "WARN", "Aggregation running β elapsed=38m. All 50 DB connections locked by this job."),
            _log(-35, "analytics-service", "WARN", "Query running β elapsed=43m. Downstream services reporting connection errors."),
            _log(-30, "analytics-service", "WARN", "Query running β elapsed=48m. order-service CrashLoopBackOff."),
            _log(-25, "analytics-service", "WARN", "Query running β elapsed=53m. api-gateway circuit breaker open."),
            _log(-12, "analytics-service", "INFO", "Query complete β elapsed=66m rows=4,218,441 report_rows=28,441. Releasing connections."),
            _log(-11, "analytics-service", "INFO", "All 50 DB connections released back to pool"),
            _log(-10, "analytics-service", "INFO", "Report written to s3://analytics-reports/revenue-2024-06-15.csv size=142MB"),
            _log(-9, "analytics-service", "INFO", "Job report-2024-06-15 COMPLETE. Duration: 69 minutes."),
        ],
        # The shared resource being exhausted: max_connections=100, with the
        # analytics job eventually holding 94 of the slots.
        "postgres": [
            _log(-80, "postgres", "INFO", "Active connections: 14/100 β healthy"),
            _log(-78, "postgres", "INFO", "New connections from analytics-service. Active: 15/100"),
            _log(-75, "postgres", "INFO", "Active connections: 24/100 β analytics job ramping"),
            _log(-70, "postgres", "INFO", "Active connections: 46/100 β analytics parallel workers"),
            _log(-67, "postgres", "WARN", "Active connections: 68/100 β approaching advisory threshold"),
            _log(-65, "postgres", "WARN", "Active connections: 78/100 β high utilisation"),
            _log(-62, "postgres", "WARN", "Active connections: 88/100 β WARNING: saturation risk"),
            _log(-60, "postgres", "WARN", "Active connections: 96/100 β CRITICAL: 4 superuser slots remaining"),
            _log(-58, "postgres", "ERROR", "FATAL: remaining connection slots reserved for non-replication superuser connections"),
            _log(-58, "postgres", "ERROR", "Connection rejected for order-service β max_connections=100 reached", None, 5),
            _log(-57, "postgres", "ERROR", "Connection rejected for order-service β max_connections=100 reached"),
            _log(-56, "postgres", "ERROR", "Connection rejected for order-service β max_connections=100 reached"),
            _log(-50, "postgres", "ERROR", "Multiple rejections/sec β all application slots occupied by analytics job"),
            _log(-45, "postgres", "ERROR", "Lock wait timeout on table=orders: analytics holding table-level read lock"),
            _log(-40, "postgres", "ERROR", "Connection rejected for order-service (startup probe) β still saturated"),
            _log(-35, "postgres", "WARN", "Active connections: 98/100 β analytics: 94 conns, order-service: 4 conns"),
            _log(-12, "postgres", "INFO", "Active connections: 98 β 14 β analytics job released 84 connections"),
            _log(-11, "postgres", "INFO", "Active connections: 14/100 β healthy"),
            _log(-10, "postgres", "INFO", "order-service reconnecting β all requests accepted"),
        ],
        # Victim #3: downstream consumer of order-service; its alert queue
        # backs up while order lookups are failing, then drains on recovery.
        "notification-service": [
            _log(-45, "notification-service", "WARN", "Order status fetch failed β order-service 504. Delaying shipment alerts."),
            _log(-40, "notification-service", "ERROR", "Cannot fetch order details: GET /api/v2/orders/77004 β 503"),
            _log(-35, "notification-service", "ERROR", "Cannot fetch order details: GET /api/v2/orders/77005 β 503"),
            _log(-30, "notification-service", "WARN", "Notification queue: 340 pending shipment alerts"),
            _log(-20, "notification-service", "WARN", "Notification queue: 820 pending. Customers waiting for confirmations."),
            _log(-10, "notification-service", "INFO", "order-service recovering β draining queue (820 pending)"),
            _log(-5, "notification-service", "INFO", "Queue drain: 820 β 610 sent"),
        ],
    }

    # Metric time series: 17 samples each, 5-minute intervals starting 80
    # minutes ago.  db_connections plateaus at 98 (the saturation window),
    # and the other metrics degrade/recover in lockstep with it.
    metrics = {
        "db_connections": _metric_series("db_connections",
            [14, 18, 24, 38, 52, 68, 82, 96, 98, 98, 98, 98, 98, 98, 98, 14, 14],
            start_offset=-80, interval_minutes=5),
        "latency_p99": _metric_series("latency_p99",
            [198, 205, 210, 420, 1200, 4800, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 4900, 4800, 210, 205],
            start_offset=-80, interval_minutes=5),
        "error_rate": _metric_series("error_rate",
            [0.1, 0.2, 0.3, 0.5, 2.1, 12.0, 42.0, 55.0, 58.0, 56.0, 54.0, 55.0, 57.0, 55.0, 54.0, 0.8, 0.2],
            start_offset=-80, interval_minutes=5),
        "request_rate": _metric_series("request_rate",
            [840, 842, 838, 830, 812, 780, 640, 480, 410, 390, 380, 372, 368, 362, 355, 820, 839],
            start_offset=-80, interval_minutes=5),
        "cpu_usage": _metric_series("cpu_usage",
            [24, 26, 28, 35, 42, 55, 68, 78, 82, 84, 85, 84, 83, 82, 80, 28, 25],
            start_offset=-80, interval_minutes=5),
        "memory_usage": _metric_series("memory_usage",
            [54, 55, 56, 58, 62, 68, 72, 74, 75, 75, 75, 74, 74, 73, 72, 57, 55],
            start_offset=-80, interval_minutes=5),
        "cache_hit_rate": _metric_series("cache_hit_rate",
            [89, 89, 88, 84, 74, 60, 48, 40, 36, 34, 33, 32, 32, 31, 31, 86, 88],
            start_offset=-80, interval_minutes=5),
        "latency_p50": _metric_series("latency_p50",
            [92, 95, 98, 210, 840, 3600, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 4900, 4800, 98, 94],
            start_offset=-80, interval_minutes=5),
    }

    # Alert history (all resolved by "now").  Note the earliest-fired alert,
    # LongRunningQuery at -65m, already points at analytics-service.
    alerts = [
        {"alert_name": "DBConnectionsWarning", "service": "postgres",
         "severity": "warning", "fired_at": _ts(-62),
         "message": "PostgreSQL connections 88/100 (88%). Threshold: 80%.",
         "status": "resolved"},
        {"alert_name": "DBConnectionsCritical","service": "postgres",
         "severity": "critical", "fired_at": _ts(-58),
         "message": "PostgreSQL at max_connections (98/100). New connections rejected.",
         "status": "resolved"},
        {"alert_name": "HighLatency", "service": "api-gateway",
         "severity": "warning", "fired_at": _ts(-45),
         "message": "p99 latency on /api/v2/orders exceeded 1s (current: 4.8s).",
         "status": "resolved"},
        {"alert_name": "ServiceUnhealthy", "service": "order-service",
         "severity": "critical", "fired_at": _ts(-36),
         "message": "order-service liveness probe failing β CrashLoopBackOff.",
         "status": "resolved"},
        {"alert_name": "HighErrorRate", "service": "api-gateway",
         "severity": "critical", "fired_at": _ts(-35),
         "message": "api-gateway error rate 55% on /api/v2/orders. Circuit breaker OPEN.",
         "status": "resolved"},
        {"alert_name": "LongRunningQuery", "service": "analytics-service",
         "severity": "warning", "fired_at": _ts(-65),
         "message": "analytics-service SQL query running >10 minutes. 38 DB connections held.",
         "status": "resolved"},
    ]

    # Assemble the Task.  The underscore-prefixed fields are the grading key
    # (hidden from the agent): expected root-cause service/type, the full
    # blast radius, severity, and keywords expected in the remediation.
    return Task(
        task_id="sre-medium-002",
        difficulty="medium",
        title="Order Service Outage β DB Connection Pool Exhausted by Analytics Job",
        description=(
            "INCIDENT ALERT β P2 β 13:25 UTC\n\n"
            "Order lookups and checkout degraded. api-gateway reporting ~55%\n"
            "error rate on /api/v2/orders. order-service in CrashLoopBackOff.\n"
            "Multiple services showing symptoms. Outage now resolving, but\n"
            "you need to identify the ROOT CAUSE β not just the loudest victim.\n\n"
            "System topology:\n"
            " api-gateway β order-service β postgres (shared pool, 100 max)\n"
            " analytics-service β postgres (same shared pool)\n"
            " order-service β notification-service (async alerts)\n\n"
            "Several services look broken. Find which one CAUSED the incident.\n"
            "Hint: the guilty service may look healthy in some metrics.\n\n"
            "Available services: api-gateway, order-service, analytics-service,\n"
            " notification-service, postgres\n"
            "Available metrics: error_rate, latency_p99, latency_p50, cpu_usage,\n"
            " memory_usage, db_connections, request_rate, cache_hit_rate"
        ),
        logs_by_service=logs,
        metrics=metrics,
        alerts=alerts,
        _correct_service="analytics-service",
        _correct_type="resource_exhaustion",
        _correct_affected=["analytics-service", "order-service", "api-gateway", "notification-service", "postgres"],
        _correct_severity="P2",
        _action_keywords=["connection", "pool", "query", "limit", "analytics"],
    )
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
# ---------------------------------------------------------------------------
|
| 593 |
+
# TASK 3 - HARD: silent revenue loss from bad feature flag
|
| 594 |
+
# ---------------------------------------------------------------------------
|
| 595 |
+
|
| 596 |
+
def _build_task_hard() -> Task:
    """Build the hard scenario: silent revenue loss caused by a bad feature flag.

    Storyline encoded in the fixture data: config-service pushes
    ``feature_flags.use_v2_product_ids=true`` to recommendation-service, which
    then starts emitting 'PRD-'-prefixed *string* product IDs.  product-catalog
    indexes integer IDs only, so every lookup for a recommended item misses and
    cart-service applies a $0.00 fallback price.  Orders keep completing (no
    5xx anywhere, api-gateway stays green), while average cart value and
    revenue collapse.  The agent must find the root cause by correlating logs
    across services rather than from any obviously red metric.

    Returns:
        A fully-populated ``Task`` whose underscore-prefixed fields hold the
        grading ground truth (root-cause service, incident type, blast radius,
        severity, and remediation keywords).
    """
    # Per-service synthetic log streams.  The first _log argument is a time
    # offset in minutes relative to "now" (negative = past); some calls pass
    # trailing (None, <int>) arguments — presumably a payload placeholder plus
    # a same-minute ordering key.  _log is defined elsewhere in this module;
    # confirm its signature there.
    logs = {
        # The only record of the triggering change: the 11:50 UTC config push.
        "config-service": [
            _log(-130, "config-service", "INFO", "Config push initiated β job_id=CI-4821 commit=a3f92c1 author=deploy-bot branch=main"),
            _log(-130, "config-service", "INFO", "Validating config schema for recommendation-service...", None, 5),
            _log(-130, "config-service", "INFO", "Schema validation PASSED (no breaking change detected by linter)", None, 10),
            _log(-129, "config-service", "INFO", "Pushing to recommendation-service: feature_flags.use_v2_product_ids=true (was: false)"),
            _log(-129, "config-service", "INFO", "recommendation-service ACK in 240ms β config applied", None, 5),
            _log(-129, "config-service", "INFO", "Config push complete β job_id=CI-4821 services_updated=1 duration=14s", None, 10),
            _log(-60, "config-service", "INFO", "Routine config sync β no changes. All services up-to-date."),
            _log(-30, "config-service", "INFO", "Routine config sync β no changes. All services up-to-date."),
        ],
        # Root cause: announces the ID-schema switch once, then reports
        # perfectly nominal metrics for the rest of the window.
        "recommendation-service": [
            _log(-135, "recommendation-service", "INFO", "Serving recommendations β model=collab-filter-v3 avg_latency=28ms rps=210"),
            _log(-130, "recommendation-service", "INFO", "Config update received: feature_flags.use_v2_product_ids=true (was: false)"),
            _log(-130, "recommendation-service", "INFO", "Switching product ID schema: integer IDs β 'PRD-' prefixed string IDs", None, 5),
            _log(-130, "recommendation-service", "INFO", "ID format change active: 4421 β 'PRD-4421', 8821 β 'PRD-8821', etc.", None, 8),
            _log(-129, "recommendation-service", "INFO", "Serving recs β req_id=rec-10021 user=u-5512 products=['PRD-4421','PRD-8821','PRD-2291']"),
            _log(-128, "recommendation-service", "INFO", "Serving recs β req_id=rec-10022 user=u-5513 products=['PRD-9912','PRD-3314']"),
            _log(-125, "recommendation-service", "INFO", "Serving recs β req_id=rec-10023 user=u-5514 products=['PRD-4421','PRD-7712']"),
            _log(-120, "recommendation-service", "INFO", "rps=212 avg_latency=29ms β all metrics nominal"),
            _log(-100, "recommendation-service", "INFO", "rps=215 avg_latency=28ms β all metrics nominal"),
            _log(-80, "recommendation-service", "INFO", "rps=209 avg_latency=30ms β all metrics nominal"),
            _log(-60, "recommendation-service", "INFO", "rps=211 avg_latency=29ms β all metrics nominal"),
            _log(-30, "recommendation-service", "INFO", "rps=208 avg_latency=28ms β all metrics nominal"),
        ],
        # Loudest victim: unresolved 'PRD-' IDs, $0.00 fallback prices, and a
        # steadily collapsing average cart value.
        "cart-service": [
            _log(-135, "cart-service", "INFO", "Cart updated user=u-5510 items=[4421,8821] total=$124.98 catalog=HIT"),
            _log(-134, "cart-service", "INFO", "Cart updated user=u-5511 items=[9912,3314] total=$67.50 catalog=HIT"),
            _log(-130, "cart-service", "INFO", "Cart updated user=u-5512 recommended=['PRD-4421','PRD-8821'] β resolving prices"),
            _log(-130, "cart-service", "WARN", "Product lookup FAILED: id='PRD-4421' not found in catalog (expected integer, got string)", None, 5),
            _log(-130, "cart-service", "WARN", "Product lookup FAILED: id='PRD-8821' not found in catalog (expected integer, got string)", None, 6),
            _log(-130, "cart-service", "WARN", "Fallback price $0.00 applied for unresolved product 'PRD-4421' (catalog miss)", None, 7),
            _log(-130, "cart-service", "WARN", "Fallback price $0.00 applied for unresolved product 'PRD-8821' (catalog miss)", None, 8),
            _log(-130, "cart-service", "INFO", "Cart saved user=u-5512 recommended=['PRD-4421','PRD-8821'] total=$0.00 (2 items priced $0)", None, 9),
            _log(-129, "cart-service", "WARN", "Product lookup FAILED: id='PRD-9912' not found in catalog"),
            _log(-129, "cart-service", "WARN", "Product lookup FAILED: id='PRD-3314' not found in catalog"),
            _log(-129, "cart-service", "INFO", "Cart saved user=u-5513 recommended=['PRD-9912','PRD-3314'] total=$0.00"),
            _log(-128, "cart-service", "WARN", "Cache miss rate rising β 'PRD-' prefixed IDs not matching any catalog entries"),
            _log(-125, "cart-service", "WARN", "Product ID format mismatch count: 18 in last 5min"),
            _log(-120, "cart-service", "WARN", "Cart total anomaly: 12 carts with $0.00 line items in last 10min"),
            _log(-115, "cart-service", "WARN", "Product ID format mismatch: 67 in last 15min. All share 'PRD-' prefix."),
            _log(-110, "cart-service", "WARN", "Avg cart value dropping: was $94.20, now $61.40 (-35%). Recommended items pricing at $0."),
            _log(-105, "cart-service", "WARN", "Product ID format mismatch: 142 in last 25min"),
            _log(-100, "cart-service", "WARN", "Avg cart value: $42.10 (-55% from baseline). Recommended items = 100% zero-priced."),
            _log(-90, "cart-service", "WARN", "Mismatch count: 298 in last 40min. All 'PRD-' prefixed. Catalog uses integer IDs only."),
            _log(-80, "cart-service", "WARN", "Cache hit rate 58% (was 91%). Cache misses caused by unresolvable 'PRD-' IDs."),
            _log(-70, "cart-service", "WARN", "Avg cart value: $28.40 (-70% from $94.20 baseline)"),
            _log(-60, "cart-service", "ERROR", "602 carts in last 70min contain at least one $0.00 item from unresolved PRD- IDs"),
            _log(-45, "cart-service", "ERROR", "Avg cart value: $18.90. 71% of recommended items priced at $0.00. Revenue bleeding."),
            _log(-30, "cart-service", "ERROR", "Mismatch count: 1,841 total since 11:50 UTC. All 'PRD-' prefix. Pattern is consistent."),
            _log(-15, "cart-service", "ERROR", "Revenue impact: 1,288 orders with incorrect totals. Estimated loss: $48,200."),
            _log(-5, "cart-service", "ERROR", "CRITICAL: avg cart value $11.20 (-88%). Silent revenue loss ongoing for 125 minutes."),
        ],
        # Secondary victim: WARN-only, but names the schema mismatch precisely.
        "product-catalog": [
            _log(-130, "product-catalog", "INFO", "Catalog healthy β 48,200 products indexed. All integer IDs. Cache warm."),
            _log(-130, "product-catalog", "WARN", "Product lookup miss: id='PRD-4421' β format unrecognised (not in catalog)", None, 5),
            _log(-130, "product-catalog", "WARN", "Product lookup miss: id='PRD-8821' β format unrecognised", None, 6),
            _log(-128, "product-catalog", "WARN", "18 lookups with unrecognised format in last 2min"),
            _log(-125, "product-catalog", "WARN", "67 failed lookups β all 'PRD-' prefixed. Catalog only indexes integers."),
            _log(-120, "product-catalog", "WARN", "Cache miss rate rising: 142 failed lookups. Returning null for unresolved IDs."),
            _log(-110, "product-catalog", "WARN", "298 failed lookups (PRD- IDs). Returning $0.00 fallback to callers."),
            _log(-100, "product-catalog", "WARN", "524 failed lookups in 30min. ID schema mismatch β expected int, got string."),
            _log(-80, "product-catalog", "WARN", "912 failed lookups total since 11:50 UTC."),
            _log(-60, "product-catalog", "WARN", "1,441 failed lookups. Impacting ~71% of recommended-item price resolutions."),
            _log(-30, "product-catalog", "WARN", "1,841 failed lookups. recommendation-service passing 'PRD-' IDs; catalog indexes integers only."),
            _log(-5, "product-catalog", "WARN", "2,291 failed lookups total since feature flag enabled at 11:50 UTC."),
        ],
        # Business-impact victim: all charges "succeed", so only revenue-anomaly
        # heuristics fire — this is where the P1 escalation originates.
        "payment-service": [
            _log(-135, "payment-service", "INFO", "Processed order=#ORD-99001 amount=$124.98 card=****4242 status=SUCCESS"),
            _log(-134, "payment-service", "INFO", "Processed order=#ORD-99002 amount=$67.50 card=****8821 status=SUCCESS"),
            _log(-130, "payment-service", "INFO", "Processed order=#ORD-99003 amount=$0.00 card=****5512 status=SUCCESS β charged $0.00"),
            _log(-129, "payment-service", "INFO", "Processed order=#ORD-99004 amount=$0.00 card=****5513 status=SUCCESS β charged $0.00"),
            _log(-128, "payment-service", "INFO", "Processed order=#ORD-99005 amount=$14.99 card=****5514 status=SUCCESS (1 item resolved, 2 at $0)"),
            _log(-120, "payment-service", "WARN", "Unusual: 8 orders with total < $5.00 in last 10min (baseline: ~0/hr)"),
            _log(-110, "payment-service", "WARN", "Unusual: 28 orders with total < $5.00 in last 20min"),
            _log(-100, "payment-service", "WARN", "Unusual: 72 orders with total < $5.00 in last 30min. Revenue metric anomaly."),
            _log(-90, "payment-service", "WARN", "Revenue anomaly: $0.00 orders now 18% of volume (was <0.1%)"),
            _log(-80, "payment-service", "WARN", "Revenue anomaly: $0.00 orders 31% of volume. Avg order value $28.40 (was $94.20)."),
            _log(-60, "payment-service", "ERROR", "Revenue anomaly: $0.00 orders 44% of volume. Triggering finance alert."),
            _log(-45, "payment-service", "ERROR", "Finance alert: hourly revenue $4,210 vs forecast $18,800 (-78%). Escalating to P1."),
            _log(-30, "payment-service", "ERROR", "1,288 orders processed at incorrect (undercharged) amounts since 11:50 UTC."),
            _log(-15, "payment-service", "ERROR", "Cumulative revenue loss ~$48,200 in 115 minutes. P1 criteria met."),
        ],
        # Red herring killer: the gateway never sees an error, reinforcing that
        # this is a data-corruption incident rather than an availability one.
        "api-gateway": [
            _log(-135, "api-gateway", "INFO", "All services healthy. p99=204ms error_rate=0.2%"),
            _log(-120, "api-gateway", "INFO", "All services healthy. p99=206ms error_rate=0.2%"),
            _log(-90, "api-gateway", "INFO", "All services healthy. p99=208ms error_rate=0.3%"),
            _log(-60, "api-gateway", "INFO", "All services healthy. p99=210ms error_rate=0.3%"),
            _log(-30, "api-gateway", "INFO", "All services healthy. p99=212ms error_rate=0.4%"),
            _log(-5, "api-gateway", "INFO", "All services healthy. p99=214ms error_rate=0.4% β no 5xx errors"),
        ],
    }

    # Metric fixtures: 14 samples each, one every 10 minutes starting 130
    # minutes in the past.  Deliberately benign across the board — only
    # cache_hit_rate degrades (91 β 35), so dashboards alone cannot solve this
    # task.  _metric_series is defined elsewhere in this module.
    metrics = {
        "error_rate": _metric_series("error_rate",
            [0.2, 0.2, 0.2, 0.3, 0.3, 0.3, 0.3, 0.4, 0.4, 0.4, 0.4, 0.5, 0.5, 0.5],
            start_offset=-130, interval_minutes=10),
        "latency_p99": _metric_series("latency_p99",
            [201, 203, 204, 205, 206, 207, 208, 208, 209, 210, 211, 212, 213, 214],
            start_offset=-130, interval_minutes=10),
        "cache_hit_rate": _metric_series("cache_hit_rate",
            [91, 91, 90, 88, 84, 78, 70, 62, 55, 48, 42, 38, 36, 35],
            start_offset=-130, interval_minutes=10),
        "request_rate": _metric_series("request_rate",
            [838, 842, 840, 844, 841, 838, 836, 834, 832, 830, 829, 828, 826, 824],
            start_offset=-130, interval_minutes=10),
        "cpu_usage": _metric_series("cpu_usage",
            [27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 32, 33, 33],
            start_offset=-130, interval_minutes=10),
        "memory_usage": _metric_series("memory_usage",
            [53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 59, 59, 60],
            start_offset=-130, interval_minutes=10),
        "db_connections": _metric_series("db_connections",
            [38, 39, 39, 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45],
            start_offset=-130, interval_minutes=10),
        "latency_p50": _metric_series("latency_p50",
            [86, 87, 88, 88, 89, 90, 90, 91, 91, 92, 93, 93, 94, 94],
            start_offset=-130, interval_minutes=10),
    }

    # Alert fixtures: all three fire late and point at victim services
    # (cart-service, payment-service), never at the root cause.  _ts converts a
    # minute offset into a timestamp — defined elsewhere; confirm there.
    alerts = [
        {"alert_name": "CacheHitRateDegraded", "service": "cart-service",
         "severity": "warning", "fired_at": _ts(-90),
         "message": "cart-service cache hit rate fell from 91% to 55% over 40min. Unusual miss pattern on 'PRD-' keys.",
         "status": "firing"},
        {"alert_name": "RevenueAnomaly", "service": "payment-service",
         "severity": "critical", "fired_at": _ts(-60),
         "message": "Hourly revenue $4,210 vs forecast $18,800 (-78%). High volume of $0.00 orders.",
         "status": "firing"},
        {"alert_name": "FinanceEscalation", "service": "payment-service",
         "severity": "critical", "fired_at": _ts(-45),
         "message": "Finance escalation: cumulative revenue loss ~$48,200. P1 criteria met. CEO notified.",
         "status": "firing"},
    ]

    # Underscore-prefixed kwargs hold the grading ground truth; presumably the
    # environment hides them from the agent and the scorer compares against
    # them (Task is declared elsewhere — verify field semantics there).
    return Task(
        task_id="sre-hard-003",
        difficulty="hard",
        title="Silent Revenue Loss β Feature Flag Breaks Product ID Schema",
        description=(
            "INCIDENT ALERT β P1 β Business Impact β 13:57 UTC\n\n"
            "Finance team escalation: revenue past 2 hours is ~78% below forecast.\n"
            "No services are down. No 5xx errors. No SLA alerts breached.\n"
            "Checkout appears to work β orders are completing. But average order\n"
            "value collapsed from $94 to $11. Customers are being undercharged.\n\n"
            "This is a silent, data-corruption class incident.\n"
            "A config change may have been deployed recently β investigate everything.\n\n"
            "System topology:\n"
            " config-service β recommendation-service (pushes feature flags)\n"
            " recommendation-service β cart-service (adds recommended items)\n"
            " cart-service β product-catalog (resolves prices by product ID)\n"
            " cart-service β payment-service (submits cart total for charge)\n"
            " api-gateway β all services\n\n"
            "Find what changed, which service is the root cause, and what must\n"
            "be done immediately to stop the ongoing revenue loss.\n\n"
            "Available services: api-gateway, recommendation-service, cart-service,\n"
            " product-catalog, payment-service, config-service\n"
            "Available metrics: error_rate, latency_p99, latency_p50, cpu_usage,\n"
            " memory_usage, db_connections, request_rate, cache_hit_rate"
        ),
        logs_by_service=logs,
        metrics=metrics,
        alerts=alerts,
        _correct_service="recommendation-service",
        _correct_type="configuration_error",
        _correct_affected=["recommendation-service", "cart-service", "product-catalog", "payment-service"],
        _correct_severity="P1",
        _action_keywords=["feature flag", "config", "product", "rollback", "revert"],
    )
|
| 766 |
+
|
| 767 |
+
|
| 768 |
+
# ---------------------------------------------------------------------------
|
| 769 |
+
# Registry
|
| 770 |
+
# ---------------------------------------------------------------------------
|
| 771 |
+
|
| 772 |
+
# Registry of every built-in incident scenario, keyed by its task_id.
# Each entry is built eagerly at import time by its difficulty-specific builder.
TASKS: Dict[str, Task] = {
    tid: build()
    for tid, build in (
        ("sre-easy-001", _build_task_easy),
        ("sre-medium-002", _build_task_medium),
        ("sre-hard-003", _build_task_hard),
    )
}
|
| 777 |
+
|
| 778 |
+
# Task IDs grouped by difficulty tier; each tier currently holds one scenario.
TASK_IDS_BY_DIFFICULTY = {
    tier: [scenario_id]
    for tier, scenario_id in (
        ("easy", "sre-easy-001"),
        ("medium", "sre-medium-002"),
        ("hard", "sre-hard-003"),
    )
}
|
| 783 |
+
|
| 784 |
+
# Flat list of every registered task ID, in registry (insertion) order.
ALL_TASK_IDS = list(TASKS)
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|