Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +81 -0
- README.md +122 -5
- __init__.py +16 -0
- client.py +58 -0
- inference.py +279 -0
- models.py +58 -0
- openenv.yaml +7 -0
- pyproject.toml +45 -0
- server/Dockerfile +80 -0
- server/__init__.py +11 -0
- server/app.py +49 -0
- server/postmortem_env_environment.py +239 -0
- server/requirements.txt +5 -0
- server/scenarios.py +170 -0
- uv.lock +0 -0
Dockerfile
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build on top of openenv-base.
# Works both for in-repo environments (local OpenEnv sources vendored into the
# context) and standalone environments (openenv pulled from PyPI/Git); the
# `openenv build` script detects the context and passes the build args below.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# git is needed so uv can resolve VCS dependencies from pyproject.toml.
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Standalone vs in-repo build selector (consumed by the build tooling).
ARG BUILD_MODE=in-repo
ARG ENV_NAME=postmortem_env

# The environment code always sits at the root of the build context.
COPY . /app/env

# In-repo: openenv is vendored alongside; standalone: installed via pyproject.
WORKDIR /app/env

# Some base images ship without uv — bootstrap it for local builds.
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# First pass installs only third-party deps (cache-friendly layer);
# honour uv.lock when present, otherwise resolve on the fly.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

# Second pass installs the project itself into the venv.
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# ---- Final runtime stage ----
FROM ${BASE_IMAGE}

WORKDIR /app

# Bring over the resolved virtual environment and the environment sources.
COPY --from=builder /app/env/.venv /app/.venv
COPY --from=builder /app/env /app/env

# Prefer the venv's binaries and make /app/env importable.
ENV PATH="/app/.venv/bin:$PATH"
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Liveness probe against the FastAPI health endpoint.
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Serve the FastAPI app; the cd keeps the server.* module path resolvable
# under the /app/env layout.
ENV ENABLE_WEB_INTERFACE=true
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
README.md
CHANGED
|
@@ -1,10 +1,127 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PostMortem Incident Triage OpenEnv
|
| 3 |
+
emoji: 🚨
|
| 4 |
+
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
pinned: false
|
| 9 |
+
license: bsd-3-clause
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- rl
|
| 13 |
+
- sre
|
| 14 |
+
- incident-response
|
| 15 |
+
base_path: /web
|
| 16 |
---
|
| 17 |
|
| 18 |
+
# PostMortem — Live Incident Triage Environment
|
| 19 |
+
|
| 20 |
+
An OpenEnv environment where an LLM agent plays an on-call SRE responding to a
|
| 21 |
+
live production incident. Real-world task, typed OpenEnv spec, deterministic
|
| 22 |
+
grader, three difficulty tiers, dense process-reward signal.
|
| 23 |
+
|
| 24 |
+
## The task
|
| 25 |
+
|
| 26 |
+
On each episode the agent receives an alert. It must:
|
| 27 |
+
|
| 28 |
+
1. **ack** the incident (accept ownership)
|
| 29 |
+
2. **query_logs / query_metrics / query_traces** on services to gather evidence
|
| 30 |
+
3. **scope** the blast radius
|
| 31 |
+
4. **hypothesize** the root cause
|
| 32 |
+
5. **mitigate** (propose a concrete remediation)
|
| 33 |
+
6. **write_status** (post a customer-facing update)
|
| 34 |
+
|
| 35 |
+
All six verbs are exposed as a single typed action:
|
| 36 |
+
|
| 37 |
+
```python
|
| 38 |
+
PostmortemAction(tool="query_logs", args={"service": "api"})
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
## Action space
|
| 42 |
+
|
| 43 |
+
| tool | args | effect |
|
| 44 |
+
|----------------|-----------------------------------|-------------------------------------|
|
| 45 |
+
| `ack` | `{}` | accept the incident (sub-goal 1) |
|
| 46 |
+
| `query_logs` | `{"service": str}` | return recent log lines |
|
| 47 |
+
| `query_metrics`| `{"service": str}` | return latest metrics |
|
| 48 |
+
| `query_traces` | `{"trace_id": str}` | return distributed trace spans |
|
| 49 |
+
| `scope` | `{"services": list[str]}` | declare blast radius (sub-goal 2) |
|
| 50 |
+
| `hypothesize` | `{"root_cause": str}` | declare root cause (sub-goal 3) |
|
| 51 |
+
| `mitigate` | `{"action": str}` | apply mitigation (sub-goal 4) |
|
| 52 |
+
| `write_status` | `{"text": str}` | publish update, ends ep (sub-goal 5)|
|
| 53 |
+
|
| 54 |
+
## Observation space
|
| 55 |
+
|
| 56 |
+
Key fields of `PostmortemObservation`:
|
| 57 |
+
|
| 58 |
+
- `task_id`, `task_description`, `available_services`, `available_trace_ids`
|
| 59 |
+
- `tool_result` — free text result of the last tool call
|
| 60 |
+
- `subgoals` — bool dict `{acked, scoped, hypothesized, mitigated, written}`
|
| 61 |
+
- `reward_so_far` — cumulative reward in [0, 1]
|
| 62 |
+
- `steps_remaining`, `last_error`
|
| 63 |
+
- `done`, `reward` (current step)
|
| 64 |
+
|
| 65 |
+
## Tasks (3 difficulty tiers)
|
| 66 |
+
|
| 67 |
+
On each `reset()` the env rotates to the next scenario. Running three resets in
|
| 68 |
+
a row covers all three tiers in order.
|
| 69 |
+
|
| 70 |
+
| task_id | difficulty | incident |
|
| 71 |
+
|-------------------|------------|--------------------------------------------------------------|
|
| 72 |
+
| `easy_oom` | easy | `api` OOM-killed; cause directly visible in logs |
|
| 73 |
+
| `medium_cascade` | medium | checkout latency cascade; must correlate trace across 3 svcs |
|
| 74 |
+
| `hard_dns` | hard | 503s blamed on fresh `api` deploy, real cause is upstream DNS|
|
| 75 |
+
|
| 76 |
+
## Reward design
|
| 77 |
+
|
| 78 |
+
The reward is a **5-stage process-reward ladder** in `[0, 1]`:
|
| 79 |
+
|
| 80 |
+
```
|
| 81 |
+
ack +0.10 (granted on first successful ack)
|
| 82 |
+
scope +0.20 × Jaccard(agent_services, gold_services)
|
| 83 |
+
hypothesize +0.20 × keyword_fraction(agent_text, gold_hypothesis_keywords)
|
| 84 |
+
mitigate +0.20 × keyword_fraction(agent_text, gold_mitigation_keywords)
|
| 85 |
+
write_status +0.30 × keyword_fraction(agent_text, gold_writeup_keywords)
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
Each sub-goal is awarded once. The grader is fully **deterministic** — no LLM
|
| 89 |
+
judge, no randomness. Partial credit gives a smooth gradient. The episode
|
| 90 |
+
terminates when `write_status` fires or after `MAX_STEPS = 12`.
|
| 91 |
+
|
| 92 |
+
## Setup
|
| 93 |
+
|
| 94 |
+
```bash
|
| 95 |
+
pip install openenv-core
|
| 96 |
+
openenv build . # build Docker image
|
| 97 |
+
python inference.py # run baseline (3 scenarios)
|
| 98 |
+
```
|
| 99 |
+
|
| 100 |
+
### Required environment variables
|
| 101 |
+
|
| 102 |
+
| var | default | notes |
|
| 103 |
+
|----------------|------------------------------------------------|-------|
|
| 104 |
+
| `HF_TOKEN` | (required) | HuggingFace token, also used as the OpenAI client API key |
|
| 105 |
+
| `API_BASE_URL` | `https://router.huggingface.co/v1` | any OpenAI-compatible endpoint |
|
| 106 |
+
| `MODEL_NAME` | `Qwen/Qwen2.5-72B-Instruct` | any chat model |
|
| 107 |
+
| `IMAGE_NAME` | `postmortem_env-env:latest` | docker tag of the env image |
|
| 108 |
+
|
| 109 |
+
## Baseline reproduction
|
| 110 |
+
|
| 111 |
+
```bash
|
| 112 |
+
export HF_TOKEN=hf_...
|
| 113 |
+
export IMAGE_NAME=postmortem_env-env:latest
|
| 114 |
+
python inference.py
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
Emits strict `[START] / [STEP] / [END]` lines, one `[END]` per task.
|
| 118 |
+
|
| 119 |
+
## Resource budget
|
| 120 |
+
|
| 121 |
+
Well within the hackathon limits of **2 vCPU / 8 GB RAM**, and completes the
|
| 122 |
+
3-task sweep in **well under 20 minutes** (dominated by LLM latency, ≤ 36 LLM
|
| 123 |
+
calls total).
|
| 124 |
+
|
| 125 |
+
## License
|
| 126 |
+
|
| 127 |
+
BSD-3-Clause (matches OpenEnv core).
|
__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Postmortem Env Environment.

Re-exports the typed action/observation pair and the HTTP client so callers
can simply `from postmortem_env import PostmortemEnv, PostmortemAction`.
"""

from .client import PostmortemEnv
from .models import PostmortemAction, PostmortemObservation

__all__ = [
    "PostmortemAction",
    "PostmortemObservation",
    "PostmortemEnv",
]
|
client.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""PostMortem Environment Client."""

from typing import Dict

from openenv.core import EnvClient
from openenv.core.client_types import StepResult
from openenv.core.env_server.types import State

from .models import PostmortemAction, PostmortemObservation


class PostmortemEnv(EnvClient[PostmortemAction, PostmortemObservation, State]):
    """
    Client for the PostMortem incident-triage environment.

    Example:
        >>> client = PostmortemEnv.from_docker_image("postmortem_env-env:latest")
        >>> try:
        ...     r = client.reset()
        ...     print(r.observation.task_description)
        ...     r = client.step(PostmortemAction(tool="ack"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: PostmortemAction) -> Dict:
        """Serialise an action into the JSON body sent to the env server."""
        return {"tool": action.tool, "args": action.args}

    def _parse_result(self, payload: Dict) -> StepResult[PostmortemObservation]:
        """Deserialise a raw server response into a typed StepResult."""
        data = payload.get("observation", {})
        done = payload.get("done", False)
        reward = payload.get("reward")
        observation = PostmortemObservation(
            task_id=data.get("task_id", ""),
            task_description=data.get("task_description", ""),
            # `or` guards normalise explicit nulls from the server to safe defaults.
            available_services=data.get("available_services") or [],
            available_trace_ids=data.get("available_trace_ids") or [],
            tool_result=data.get("tool_result", ""),
            subgoals=data.get("subgoals") or {},
            reward_so_far=data.get("reward_so_far") or 0.0,
            steps_remaining=data.get("steps_remaining") or 0,
            last_error=data.get("last_error") or "",
            done=done,
            reward=reward,
            metadata=data.get("metadata") or {},
        )
        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: Dict) -> State:
        """Deserialise the /state endpoint payload."""
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
|
inference.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference script for the PostMortem OpenEnv environment.
|
| 3 |
+
|
| 4 |
+
Runs all 3 scenarios (easy/medium/hard) end-to-end against a containerised env,
|
| 5 |
+
calling an LLM via the OpenAI client against a Hugging Face Router endpoint.
|
| 6 |
+
|
| 7 |
+
Environment variables (all read via os.getenv):
|
| 8 |
+
HF_TOKEN -- HuggingFace token used as the OpenAI API key.
|
| 9 |
+
API_BASE_URL -- LLM base URL (default: HF Router).
|
| 10 |
+
MODEL_NAME -- LLM model id (default: Qwen/Qwen2.5-72B-Instruct).
|
| 11 |
+
IMAGE_NAME -- local Docker image tag for the env container.
|
| 12 |
+
|
| 13 |
+
Strict stdout format enforced by the grader:
|
| 14 |
+
[START] task=<id> env=<benchmark> model=<model>
|
| 15 |
+
[STEP] step=<n> action=<tool:args> reward=<0.00> done=<true|false> error=<msg|null>
|
| 16 |
+
[END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import asyncio
|
| 22 |
+
import json
|
| 23 |
+
import os
|
| 24 |
+
import textwrap
|
| 25 |
+
from typing import List, Optional
|
| 26 |
+
|
| 27 |
+
from openai import OpenAI
|
| 28 |
+
|
| 29 |
+
# Support running as `python inference.py` from inside the env dir
|
| 30 |
+
# as well as `python -m postmortem_env.inference` from its parent.
|
| 31 |
+
try:
|
| 32 |
+
from postmortem_env.client import PostmortemEnv
|
| 33 |
+
from postmortem_env.models import PostmortemAction
|
| 34 |
+
except ModuleNotFoundError:
|
| 35 |
+
import os as _os, sys as _sys
|
| 36 |
+
_here = _os.path.dirname(_os.path.abspath(__file__))
|
| 37 |
+
_sys.path.insert(0, _os.path.dirname(_here))
|
| 38 |
+
from postmortem_env.client import PostmortemEnv # type: ignore
|
| 39 |
+
from postmortem_env.models import PostmortemAction # type: ignore
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ---------- Config ----------
|
| 43 |
+
|
| 44 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 45 |
+
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 46 |
+
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 47 |
+
IMAGE_NAME = os.getenv("IMAGE_NAME") or "postmortem_env-env:latest"
|
| 48 |
+
BENCHMARK = "postmortem_env"
|
| 49 |
+
|
| 50 |
+
MAX_STEPS = 12
|
| 51 |
+
TEMPERATURE = 0.2
|
| 52 |
+
MAX_TOKENS = 220
|
| 53 |
+
SUCCESS_SCORE_THRESHOLD = 0.5 # per-task success threshold (normalized [0,1])
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
SYSTEM_PROMPT = textwrap.dedent(
|
| 57 |
+
"""
|
| 58 |
+
You are an on-call Site Reliability Engineer responding to a live production
|
| 59 |
+
incident. You MUST progress through 5 sub-goals in order to maximise reward:
|
| 60 |
+
|
| 61 |
+
1. ack (claim the incident)
|
| 62 |
+
2. scope (declare the affected services)
|
| 63 |
+
3. hypothesize (name the root cause)
|
| 64 |
+
4. mitigate (propose a concrete remediation)
|
| 65 |
+
5. write_status (publish a customer-facing update — ENDS the episode)
|
| 66 |
+
|
| 67 |
+
Between ack and scope, issue AT MOST ONE query_logs call per service and
|
| 68 |
+
AT MOST ONE query_metrics call per service, plus any needed query_traces.
|
| 69 |
+
DO NOT repeat the same query. Efficient investigation is rewarded.
|
| 70 |
+
|
| 71 |
+
Reply on each turn with a SINGLE JSON object on one line, nothing else:
|
| 72 |
+
{"tool": "<verb>", "args": {...}}
|
| 73 |
+
|
| 74 |
+
Valid tools and args:
|
| 75 |
+
- {"tool": "ack", "args": {}}
|
| 76 |
+
- {"tool": "query_logs", "args": {"service": "<name>"}}
|
| 77 |
+
- {"tool": "query_metrics", "args": {"service": "<name>"}}
|
| 78 |
+
- {"tool": "query_traces", "args": {"trace_id": "<id>"}}
|
| 79 |
+
- {"tool": "scope", "args": {"services": ["<name>", ...]}}
|
| 80 |
+
- {"tool": "hypothesize", "args": {"root_cause": "<short sentence with concrete keywords>"}}
|
| 81 |
+
- {"tool": "mitigate", "args": {"action": "<short sentence with a concrete fix>"}}
|
| 82 |
+
- {"tool": "write_status", "args": {"text": "<concise update naming service, cause, and fix>"}}
|
| 83 |
+
|
| 84 |
+
When you hypothesize/mitigate/write_status, INCLUDE the specific technical
|
| 85 |
+
keywords you observed in the logs (e.g. 'OOM', 'heap', 'DNS resolver',
|
| 86 |
+
'connection pool'), because the grader scores keyword coverage.
|
| 87 |
+
|
| 88 |
+
Reply with ONLY the JSON object. No prose, no markdown fences.
|
| 89 |
+
"""
|
| 90 |
+
).strip()
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
# ---------- Strict log format helpers ----------
|
| 94 |
+
|
| 95 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Emit the grader's [START] line announcing one episode."""
    line = f"[START] task={task} env={env} model={model}"
    print(line, flush=True)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Emit one grader-format [STEP] line (always a single physical line)."""

    def _one_line(text: str) -> str:
        # Embedded newlines would break the one-line-per-step contract.
        return text.replace("\n", " ").replace("\r", " ")

    msg = (
        f"[STEP] step={step}"
        f" action={_one_line(action)}"
        f" reward={reward:.2f}"
        f" done={str(done).lower()}"
        f" error={_one_line(error if error else 'null')}"
    )
    print(msg, flush=True)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the grader's [END] summary line for one episode."""
    reward_list = ",".join(f"{r:.2f}" for r in rewards)
    summary = (
        f"[END] success={str(success).lower()}"
        f" steps={steps}"
        f" score={score:.3f}"
        f" rewards={reward_list}"
    )
    print(summary, flush=True)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ---------- Agent ----------
|
| 120 |
+
|
| 121 |
+
def build_user_prompt(step: int, obs_payload: dict, last_reward: float, queried: dict) -> str:
    """Render the per-turn user prompt from the latest observation payload.

    Args:
        step: 1-based index of the current step.
        obs_payload: dict produced by ``obs_to_payload`` (may be partial).
        last_reward: reward from the previous step (kept for interface
            compatibility; not interpolated into the prompt).
        queried: dict of sets keyed "logs"/"metrics"/"traces" of queries
            already issued, surfaced so the model avoids repeats.

    Returns:
        A single prompt string ending with the next-sub-goal hint.
    """
    subgoals = obs_payload.get("subgoals") or {}
    # The first unmet sub-goal (in ladder order) determines the hint shown.
    order = ["acked", "scoped", "hypothesized", "mitigated", "written"]
    nxt = next((g for g in order if not subgoals.get(g, False)), "written")
    hint_map = {
        "acked": "Next: send {\"tool\": \"ack\", \"args\": {}}",
        "scoped": 'Next: gather minimal evidence then send {"tool": "scope", "args": {"services": [...]}}',
        "hypothesized": 'Next: send {"tool": "hypothesize", "args": {"root_cause": "..."}} with concrete keywords from logs',
        "mitigated": 'Next: send {"tool": "mitigate", "args": {"action": "..."}} naming a concrete remediation',
        "written": 'Next: send {"tool": "write_status", "args": {"text": "..."}} to end the episode with a concise status update',
    }
    # Fix: a partial payload may omit reward_so_far (or carry None);
    # formatting None with ":.2f" raises TypeError. Coerce safely first.
    reward_so_far = float(obs_payload.get("reward_so_far") or 0.0)
    return textwrap.dedent(
        f"""
        Step: {step} Steps remaining: {obs_payload.get('steps_remaining')}
        Task: {obs_payload.get('task_id')}
        Incident brief: {obs_payload.get('task_description')}
        Services available: {obs_payload.get('available_services')}
        Trace ids available: {obs_payload.get('available_trace_ids')}
        Sub-goals: {subgoals}
        Total reward so far: {reward_so_far:.2f}
        Services already queried (logs): {sorted(queried.get('logs', set()))}
        Services already queried (metrics): {sorted(queried.get('metrics', set()))}
        Traces already queried: {sorted(queried.get('traces', set()))}

        Last tool result:
        {obs_payload.get('tool_result')}

        {hint_map[nxt]}

        Reply with a single JSON object on one line.
        """
    ).strip()
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
def ask_model(client: OpenAI, user_prompt: str) -> dict:
    """Ask the LLM for the next tool call.

    Returns the parsed {"tool": ..., "args": ...} dict. On ANY failure —
    network error, empty reply, malformed JSON — falls back to a safe ack
    so the episode can keep moving.
    """
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        raw = (completion.choices[0].message.content or "").strip()
        # Models sometimes wrap the JSON in ``` fences — peel them off.
        if raw.startswith("```"):
            raw = raw.strip("`")
            # drop a leading "json" language tag if present
            if raw.lower().startswith("json"):
                raw = raw[4:].strip()
        # Keep only the outermost {...} span, discarding stray prose.
        start, end = raw.find("{"), raw.rfind("}")
        if start != -1 and end != -1:
            raw = raw[start : end + 1]
        return json.loads(raw)
    except Exception as exc:
        print(f"[DEBUG] Model call failed, defaulting to ack: {exc}", flush=True)
        return {"tool": "ack", "args": {}}
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def obs_to_payload(obs) -> dict:
    """Serialise a PostmortemObservation for prompt construction.

    Missing attributes degrade to safe empty defaults, so any object
    (even a bare stub) can be rendered.
    """
    # Defaults are rebuilt per call so mutable values are never shared.
    defaults = {
        "task_id": "",
        "task_description": "",
        "available_services": [],
        "available_trace_ids": [],
        "subgoals": {},
        "reward_so_far": 0.0,
        "steps_remaining": 0,
        "tool_result": "",
    }
    return {name: getattr(obs, name, fallback) for name, fallback in defaults.items()}
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
async def run_one_task(client: OpenAI, env: PostmortemEnv) -> None:
    """Run one full episode, emitting [START]/[STEP]/[END] grader lines.

    Exactly one [END] line is always emitted (via the finally clause),
    even if the reset or a step raises.
    """
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    task_name = "unknown"

    def _note_query(verb: str, call_args: dict, seen: dict) -> None:
        # Remember issued queries so the prompt can discourage repeats.
        if verb == "query_logs":
            seen["logs"].add(call_args.get("service", ""))
        elif verb == "query_metrics":
            seen["metrics"].add(call_args.get("service", ""))
        elif verb == "query_traces":
            seen["traces"].add(call_args.get("trace_id", ""))

    try:
        result = await env.reset()
        obs = result.observation
        task_name = getattr(obs, "task_id", "unknown") or "unknown"
        log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

        last_reward = 0.0
        done = result.done
        queried = {"logs": set(), "metrics": set(), "traces": set()}

        for step in range(1, MAX_STEPS + 1):
            if done:
                break

            prompt = build_user_prompt(step, obs_to_payload(obs), last_reward, queried)
            decision = ask_model(client, prompt)

            tool = str(decision.get("tool", "ack"))
            args = decision.get("args", {}) or {}
            if not isinstance(args, dict):
                args = {}
            _note_query(tool, args, queried)

            result = await env.step(PostmortemAction(tool=tool, args=args))
            obs = result.observation

            reward = float(result.reward or 0.0)
            done = bool(result.done)
            error = getattr(obs, "last_error", "") or None

            rewards.append(reward)
            steps_taken = step
            last_reward = reward

            log_step(
                step=step,
                action=f"{tool}:{json.dumps(args, separators=(',', ':'))}",
                reward=reward,
                done=done,
                error=error,
            )

            if done:
                break

        # Prefer the env's own cumulative total; fall back to the step sum.
        score = float(getattr(obs, "reward_so_far", 0.0) or sum(rewards))
        score = min(max(score, 0.0), 1.0)
        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
async def main() -> None:
    """Entry point: one env container, three back-to-back scenario episodes."""
    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # A single containerised env serves all three scenarios: each reset()
    # rotates to the next one, so three resets cover the full tier sweep.
    env = await PostmortemEnv.from_docker_image(IMAGE_NAME)
    try:
        for _ in range(3):
            await run_one_task(llm, env)
    finally:
        # Best-effort teardown; report (but don't re-raise) cleanup failures.
        try:
            await env.close()
        except Exception as exc:
            print(f"[DEBUG] env.close() error: {exc}", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
|
models.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""
Data models for the PostMortem incident-triage environment.

An agent plays an on-call SRE responding to a live incident. It queries fake
telemetry (logs / metrics / traces), scopes blast radius, hypothesises a root
cause, applies a mitigation, and writes a status-page update. The reward is a
5-stage process-reward ladder in [0, 1].
"""

from typing import Any, Dict, List

from openenv.core.env_server.types import Action, Observation
from pydantic import Field

# The five sub-goals of the triage ladder, in grading order.
_SUBGOAL_NAMES = ("acked", "scoped", "hypothesized", "mitigated", "written")


def _fresh_subgoals() -> Dict[str, bool]:
    """Return a new all-False sub-goal tracker."""
    return dict.fromkeys(_SUBGOAL_NAMES, False)


class PostmortemAction(Action):
    """
    Single action envelope. `tool` selects an investigation / response verb,
    `args` is a tool-specific dict.

    Supported tools:
    - "ack"            args: {}
    - "query_logs"     args: {"service": str}
    - "query_metrics"  args: {"service": str}
    - "query_traces"   args: {"trace_id": str}
    - "scope"          args: {"services": list[str]}
    - "hypothesize"    args: {"root_cause": str}
    - "mitigate"       args: {"action": str}
    - "write_status"   args: {"text": str}
    """

    tool: str = Field(..., description="Investigation/response verb")
    args: Dict[str, Any] = Field(default_factory=dict, description="Tool args")


class PostmortemObservation(Observation):
    """Observation returned after each step."""

    task_id: str = Field(default="", description="Current scenario id")
    task_description: str = Field(default="", description="Incident brief")
    available_services: List[str] = Field(default_factory=list)
    available_trace_ids: List[str] = Field(default_factory=list)
    tool_result: str = Field(default="", description="Result from the last tool call")
    subgoals: Dict[str, bool] = Field(default_factory=_fresh_subgoals)
    reward_so_far: float = Field(default=0.0)
    steps_remaining: int = Field(default=0)
    last_error: str = Field(default="")
|
openenv.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: postmortem_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
|
pyproject.toml
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "openenv-postmortem_env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "Postmortem Env environment for OpenEnv"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
# Core OpenEnv runtime (provides FastAPI server + HTTP client types)
|
| 18 |
+
# install from github
|
| 19 |
+
# "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
|
| 20 |
+
"openenv-core[core]>=0.2.2",
|
| 21 |
+
# Environment-specific dependencies
|
| 22 |
+
# Add all dependencies needed for your environment here
|
| 23 |
+
# Examples:
|
| 24 |
+
# "numpy>=1.19.0",
|
| 25 |
+
# "torch>=2.0.0",
|
| 26 |
+
# "gymnasium>=0.29.0",
|
| 27 |
+
# "openspiel>=1.0.0",
|
| 28 |
+
# "smolagents>=1.22.0,<2",
|
| 29 |
+
]
|
| 30 |
+
|
| 31 |
+
[project.optional-dependencies]
|
| 32 |
+
dev = [
|
| 33 |
+
"pytest>=8.0.0",
|
| 34 |
+
"pytest-cov>=4.0.0",
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
[project.scripts]
|
| 38 |
+
# Server entry point - enables running via: uv run --project . server
|
| 39 |
+
# or: python -m postmortem_env.server.app
|
| 40 |
+
server = "postmortem_env.server.app:main"
|
| 41 |
+
|
| 42 |
+
[tool.setuptools]
|
| 43 |
+
include-package-data = true
|
| 44 |
+
packages = ["postmortem_env", "postmortem_env.server"]
|
| 45 |
+
package-dir = { "postmortem_env" = ".", "postmortem_env.server" = "server" }
|
server/Dockerfile
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
# Multi-stage build using openenv-base
|
| 8 |
+
# This Dockerfile is flexible and works for both:
|
| 9 |
+
# - In-repo environments (with local OpenEnv sources)
|
| 10 |
+
# - Standalone environments (with openenv from PyPI/Git)
|
| 11 |
+
# The build script (openenv build) handles context detection and sets appropriate build args.
|
| 12 |
+
|
| 13 |
+
ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
|
| 14 |
+
FROM ${BASE_IMAGE} AS builder
|
| 15 |
+
|
| 16 |
+
WORKDIR /app
|
| 17 |
+
|
| 18 |
+
# Ensure git is available (required for installing dependencies from VCS)
|
| 19 |
+
RUN apt-get update && \
|
| 20 |
+
apt-get install -y --no-install-recommends git && \
|
| 21 |
+
rm -rf /var/lib/apt/lists/*
|
| 22 |
+
|
| 23 |
+
# Build argument to control whether we're building standalone or in-repo
|
| 24 |
+
ARG BUILD_MODE=in-repo
|
| 25 |
+
ARG ENV_NAME=postmortem_env
|
| 26 |
+
|
| 27 |
+
# Copy environment code (always at root of build context)
|
| 28 |
+
COPY . /app/env
|
| 29 |
+
|
| 30 |
+
# For in-repo builds, openenv is already vendored in the build context
|
| 31 |
+
# For standalone builds, openenv will be installed via pyproject.toml
|
| 32 |
+
WORKDIR /app/env
|
| 33 |
+
|
| 34 |
+
# Ensure uv is available (for local builds where base image lacks it)
|
| 35 |
+
RUN if ! command -v uv >/dev/null 2>&1; then \
|
| 36 |
+
curl -LsSf https://astral.sh/uv/install.sh | sh && \
|
| 37 |
+
mv /root/.local/bin/uv /usr/local/bin/uv && \
|
| 38 |
+
mv /root/.local/bin/uvx /usr/local/bin/uvx; \
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
# Install dependencies using uv sync
|
| 42 |
+
# If uv.lock exists, use it; otherwise resolve on the fly
|
| 43 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 44 |
+
if [ -f uv.lock ]; then \
|
| 45 |
+
uv sync --frozen --no-install-project --no-editable; \
|
| 46 |
+
else \
|
| 47 |
+
uv sync --no-install-project --no-editable; \
|
| 48 |
+
fi
|
| 49 |
+
|
| 50 |
+
RUN --mount=type=cache,target=/root/.cache/uv \
|
| 51 |
+
if [ -f uv.lock ]; then \
|
| 52 |
+
uv sync --frozen --no-editable; \
|
| 53 |
+
else \
|
| 54 |
+
uv sync --no-editable; \
|
| 55 |
+
fi
|
| 56 |
+
|
| 57 |
+
# Final runtime stage
|
| 58 |
+
FROM ${BASE_IMAGE}
|
| 59 |
+
|
| 60 |
+
WORKDIR /app
|
| 61 |
+
|
| 62 |
+
# Copy the virtual environment from builder
|
| 63 |
+
COPY --from=builder /app/env/.venv /app/.venv
|
| 64 |
+
|
| 65 |
+
# Copy the environment code
|
| 66 |
+
COPY --from=builder /app/env /app/env
|
| 67 |
+
|
| 68 |
+
# Set PATH to use the virtual environment
|
| 69 |
+
ENV PATH="/app/.venv/bin:$PATH"
|
| 70 |
+
|
| 71 |
+
# Set PYTHONPATH so imports work correctly
|
| 72 |
+
ENV PYTHONPATH="/app/env:$PYTHONPATH"
|
| 73 |
+
|
| 74 |
+
# Health check
|
| 75 |
+
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
| 76 |
+
CMD curl -f http://localhost:8000/health || exit 1
|
| 77 |
+
|
| 78 |
+
# Run the FastAPI server
|
| 79 |
+
# The module path is constructed to work with the /app/env structure
|
| 80 |
+
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Postmortem Env environment server components."""
|
| 8 |
+
|
| 9 |
+
from .postmortem_env_environment import PostmortemEnvironment
|
| 10 |
+
|
| 11 |
+
__all__ = ["PostmortemEnvironment"]
|
server/app.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""FastAPI application for the PostMortem Environment.

Wires the environment, action, and observation types into the generic
OpenEnv HTTP server. Runnable as a module (``python -m server.app``) or
via the ``server`` console script.
"""

try:
    from openenv.core.env_server.http_server import create_app
except Exception as e:  # pragma: no cover
    raise ImportError(
        "openenv-core is required. Install with 'pip install openenv-core'."
    ) from e

try:
    # Package-relative imports (installed / in-repo layout).
    from ..models import PostmortemAction, PostmortemObservation
    from .postmortem_env_environment import PostmortemEnvironment
except (ImportError, ModuleNotFoundError):  # Docker / direct-run fallback
    import os
    import sys

    # Make the environment root importable when run as a bare script.
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    from models import PostmortemAction, PostmortemObservation  # type: ignore
    from server.postmortem_env_environment import PostmortemEnvironment  # type: ignore


# ASGI application exposing the environment's reset/step API over HTTP.
app = create_app(
    PostmortemEnvironment,
    PostmortemAction,
    PostmortemObservation,
    env_name="postmortem_env",
    max_concurrent_envs=1,
)


def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Entry point for direct execution: serve `app` with uvicorn."""
    import uvicorn

    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", type=str, default="0.0.0.0")
    args, _ = parser.parse_known_args()
    # The original branched on whether host/port equalled the defaults before
    # calling main(), but both branches were behaviorally identical; pass the
    # parsed values through unconditionally.
    main(host=args.host, port=args.port)
|
server/postmortem_env_environment.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
PostMortem Environment — incident triage as an OpenEnv env.
|
| 6 |
+
|
| 7 |
+
Agent plays an on-call SRE. It interacts via typed actions (query_logs,
|
| 8 |
+
query_metrics, query_traces, ack, scope, hypothesize, mitigate, write_status)
|
| 9 |
+
against one of three fixed scenarios that rotate on reset(). The reward is a
|
| 10 |
+
5-stage process-reward ladder in [0, 1]:
|
| 11 |
+
|
| 12 |
+
ack +0.10
|
| 13 |
+
scope +0.20 (Jaccard overlap vs. gold service set)
|
| 14 |
+
hypothesize +0.20 (fraction of gold keywords mentioned)
|
| 15 |
+
mitigate +0.20 (fraction of gold keywords mentioned)
|
| 16 |
+
write_status +0.30 (fraction of gold keywords mentioned)
|
| 17 |
+
|
| 18 |
+
Each sub-goal can only be claimed once. Episodes terminate on `write_status`
|
| 19 |
+
or after MAX_STEPS (12).
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from typing import Any, Dict, List
|
| 23 |
+
from uuid import uuid4
|
| 24 |
+
|
| 25 |
+
from openenv.core.env_server.interfaces import Environment
|
| 26 |
+
from openenv.core.env_server.types import State
|
| 27 |
+
|
| 28 |
+
try:
|
| 29 |
+
from ..models import PostmortemAction, PostmortemObservation
|
| 30 |
+
from .scenarios import SCENARIOS, num_scenarios
|
| 31 |
+
except (ImportError, ModuleNotFoundError): # Docker / direct-run fallback
|
| 32 |
+
import os, sys
|
| 33 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 34 |
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
| 35 |
+
from models import PostmortemAction, PostmortemObservation # type: ignore
|
| 36 |
+
from scenarios import SCENARIOS, num_scenarios # type: ignore
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
MAX_STEPS = 12
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ---------- Reward helpers ----------
|
| 43 |
+
|
| 44 |
+
def _jaccard(a: List[str], b: List[str]) -> float:
|
| 45 |
+
if not a and not b:
|
| 46 |
+
return 1.0
|
| 47 |
+
sa, sb = {x.strip().lower() for x in a}, {x.strip().lower() for x in b}
|
| 48 |
+
if not sa or not sb:
|
| 49 |
+
return 0.0
|
| 50 |
+
return len(sa & sb) / len(sa | sb)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def _keyword_fraction(text: str, keywords: List[str]) -> float:
|
| 54 |
+
if not keywords:
|
| 55 |
+
return 0.0
|
| 56 |
+
t = text.lower()
|
| 57 |
+
hits = sum(1 for k in keywords if k.lower() in t)
|
| 58 |
+
return hits / len(keywords)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ---------- Environment ----------
|
| 62 |
+
|
| 63 |
+
class PostmortemEnvironment(Environment):
    """Incident triage environment.

    The agent plays an on-call SRE working one of the fixed SCENARIOS.
    Reward is a one-shot-per-subgoal ladder (ack/scope/hypothesize/
    mitigate/write_status) summing to at most 1.0; `write_status` or the
    MAX_STEPS budget terminates the episode.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(self) -> None:
        # Fresh episode state; reset() replaces this with a new episode id.
        self._state = State(episode_id=str(uuid4()), step_count=0)
        # Rotation cursor into SCENARIOS, advanced once per reset().
        self._scenario_idx = 0
        self._scenario: Dict[str, Any] = SCENARIOS[0]
        # One flag per reward-ladder stage; each can be claimed exactly once.
        self._subgoals: Dict[str, bool] = {
            "acked": False,
            "scoped": False,
            "hypothesized": False,
            "mitigated": False,
            "written": False,
        }
        self._reward_so_far = 0.0
        self._done = False
        self._last_error = ""

    # ---- env API ----

    def reset(self) -> PostmortemObservation:
        """Start a new episode on the next scenario in the rotation."""
        # Rotate to next scenario on each reset so a run of 3 resets
        # covers all three difficulty tiers in order.
        self._scenario = SCENARIOS[self._scenario_idx % num_scenarios()]
        self._scenario_idx += 1
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._subgoals = {k: False for k in self._subgoals}
        self._reward_so_far = 0.0
        self._done = False
        self._last_error = ""

        return PostmortemObservation(
            task_id=self._scenario["task_id"],
            task_description=self._scenario["description"],
            available_services=list(self._scenario["services"]),
            available_trace_ids=list(self._scenario.get("traces", {}).keys()),
            tool_result="Incident opened. Begin investigation.",
            subgoals=dict(self._subgoals),
            reward_so_far=0.0,
            steps_remaining=MAX_STEPS,
            last_error="",
            done=False,
            reward=0.0,
            metadata={"difficulty": self._scenario.get("difficulty", "")},
        )

    def step(self, action: PostmortemAction) -> PostmortemObservation:  # type: ignore[override]
        """Execute one tool call and return the resulting observation.

        Read-only tools (query_logs/query_metrics/query_traces) never award
        reward; each graded tool awards its slice of the ladder at most once.
        Unknown tools and bad arguments set `last_error` instead of raising.
        """
        self._state.step_count += 1
        tool = (action.tool or "").strip().lower()
        args = action.args or {}
        tool_result = ""
        step_reward = 0.0
        self._last_error = ""

        try:
            if tool == "ack":
                if not self._subgoals["acked"]:
                    self._subgoals["acked"] = True
                    step_reward = 0.10
                    tool_result = "Acknowledged. You now own this incident."
                else:
                    tool_result = "Already acknowledged."

            elif tool == "query_logs":
                service = str(args.get("service", "")).strip()
                logs = self._scenario.get("logs", {}).get(service)
                if logs is None:
                    self._last_error = f"unknown service '{service}'"
                    tool_result = f"ERROR: {self._last_error}"
                else:
                    tool_result = "\n".join(logs)

            elif tool == "query_metrics":
                service = str(args.get("service", "")).strip()
                metrics = self._scenario.get("metrics", {}).get(service)
                if metrics is None:
                    self._last_error = f"unknown service '{service}'"
                    tool_result = f"ERROR: {self._last_error}"
                else:
                    tool_result = ", ".join(f"{k}={v}" for k, v in metrics.items())

            elif tool == "query_traces":
                trace_id = str(args.get("trace_id", "")).strip()
                trace = self._scenario.get("traces", {}).get(trace_id)
                if trace is None:
                    self._last_error = f"unknown trace_id '{trace_id}'"
                    tool_result = f"ERROR: {self._last_error}"
                else:
                    # Render each span as "service:op Nms err=bool" joined by pipes.
                    tool_result = " | ".join(
                        f"{s['service']}:{s['op']} {s['duration_ms']}ms err={s.get('error', False)}"
                        for s in trace
                    )

            elif tool == "scope":
                services = args.get("services", [])
                if not isinstance(services, list):
                    self._last_error = "scope.services must be a list"
                    tool_result = f"ERROR: {self._last_error}"
                elif not self._subgoals["scoped"]:
                    # Partial credit proportional to overlap with the gold scope;
                    # the subgoal is consumed even on a zero-overlap guess.
                    jac = _jaccard(services, self._scenario["gold"]["scope"])
                    gained = 0.20 * jac
                    step_reward = gained
                    self._subgoals["scoped"] = True
                    tool_result = f"Scope recorded. Match vs gold = {jac:.2f}"
                else:
                    tool_result = "Scope already set."

            elif tool == "hypothesize":
                cause = str(args.get("root_cause", ""))
                if not self._subgoals["hypothesized"]:
                    frac = _keyword_fraction(cause, self._scenario["gold"]["hypothesis_keywords"])
                    gained = 0.20 * frac
                    step_reward = gained
                    self._subgoals["hypothesized"] = True
                    tool_result = f"Hypothesis recorded. Keyword match = {frac:.2f}"
                else:
                    tool_result = "Hypothesis already set."

            elif tool == "mitigate":
                mit = str(args.get("action", ""))
                if not self._subgoals["mitigated"]:
                    frac = _keyword_fraction(mit, self._scenario["gold"]["mitigation_keywords"])
                    gained = 0.20 * frac
                    step_reward = gained
                    self._subgoals["mitigated"] = True
                    tool_result = f"Mitigation applied. Keyword match = {frac:.2f}"
                else:
                    tool_result = "Mitigation already applied."

            elif tool == "write_status":
                text = str(args.get("text", ""))
                if not self._subgoals["written"]:
                    frac = _keyword_fraction(text, self._scenario["gold"]["writeup_keywords"])
                    gained = 0.30 * frac
                    step_reward = gained
                    self._subgoals["written"] = True
                    tool_result = f"Status update published. Keyword match = {frac:.2f}"
                    self._done = True  # writeup ends the episode
                else:
                    tool_result = "Status update already published."

            else:
                self._last_error = f"unknown tool '{tool}'"
                tool_result = (
                    f"ERROR: {self._last_error}. Valid: ack, query_logs, query_metrics, "
                    "query_traces, scope, hypothesize, mitigate, write_status."
                )

        except Exception as exc:  # defensive — never crash the server
            self._last_error = f"internal: {exc}"
            tool_result = f"ERROR: {self._last_error}"

        # Clamp cumulative reward to [0, 1]; per-step reward is returned raw.
        self._reward_so_far = min(1.0, max(0.0, self._reward_so_far + step_reward))

        if self._state.step_count >= MAX_STEPS:
            self._done = True

        return PostmortemObservation(
            task_id=self._scenario["task_id"],
            task_description=self._scenario["description"],
            available_services=list(self._scenario["services"]),
            available_trace_ids=list(self._scenario.get("traces", {}).keys()),
            tool_result=tool_result,
            subgoals=dict(self._subgoals),
            reward_so_far=self._reward_so_far,
            steps_remaining=max(0, MAX_STEPS - self._state.step_count),
            last_error=self._last_error,
            done=self._done,
            reward=step_reward,
            metadata={"difficulty": self._scenario.get("difficulty", "")},
        )

    @property
    def state(self) -> State:
        """Current episode state (id + step count) for the server framework."""
        return self._state
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn>=0.24.0
|
| 4 |
+
openai>=1.40.0
|
| 5 |
+
pydantic>=2.0.0
|
server/scenarios.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
|
| 4 |
+
"""
|
| 5 |
+
Three fixed incident scenarios for the PostMortem env.
|
| 6 |
+
|
| 7 |
+
Each scenario is a self-contained dict with:
|
| 8 |
+
- description: the brief an on-call engineer would receive
|
| 9 |
+
- services: list of service names the agent may query
|
| 10 |
+
- logs: {service: [log lines]}
|
| 11 |
+
- metrics: {service: {metric: value}}
|
| 12 |
+
- traces: {trace_id: [span dicts]}
|
| 13 |
+
- gold: the oracle answers for grading
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from typing import Dict, List, Any
|
| 17 |
+
|
| 18 |
+
# Fixed scenario catalogue; the environment rotates through these on reset().
# Each "gold" dict is the grading oracle consumed by the reward ladder.
SCENARIOS: List[Dict[str, Any]] = [
    # ---------- EASY ----------
    # Single obviously-broken service (api OOMing); db and auth are healthy.
    {
        "task_id": "easy_oom",
        "difficulty": "easy",
        "description": (
            "ALERT: Service `api` is returning HTTP 500 on ~80% of requests. "
            "On-call has paged you. Investigate, scope the blast radius, "
            "identify the root cause, mitigate, and write a status-page update."
        ),
        "services": ["api", "db", "auth"],
        "logs": {
            "api": [
                "2026-04-08T19:55:01Z INFO request id=req-001 path=/v1/users",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
                "2026-04-08T19:55:03Z WARN GC overhead limit exceeded",
                "2026-04-08T19:55:03Z ERROR pod api-7f8b restarting OOMKilled",
            ],
            "db": [
                "2026-04-08T19:55:01Z INFO connection accepted from api",
                "2026-04-08T19:55:02Z INFO query completed in 12ms",
                "2026-04-08T19:55:05Z INFO idle connections=45",
            ],
            "auth": [
                "2026-04-08T19:55:01Z INFO token issued",
                "2026-04-08T19:55:03Z INFO token validated",
            ],
        },
        "metrics": {
            "api": {"cpu_pct": 98, "mem_pct": 99, "p99_latency_ms": 12000, "error_rate": 0.82},
            "db": {"cpu_pct": 22, "mem_pct": 34, "p99_latency_ms": 18, "error_rate": 0.0},
            "auth": {"cpu_pct": 8, "mem_pct": 14, "p99_latency_ms": 6, "error_rate": 0.0},
        },
        "traces": {
            "trace_abc": [
                {"service": "api", "op": "GET /v1/users", "duration_ms": 11800, "error": True},
                {"service": "db", "op": "SELECT users", "duration_ms": 14, "error": False},
            ],
        },
        "gold": {
            "scope": ["api"],
            "hypothesis_keywords": ["oom", "memory", "heap"],
            "mitigation_keywords": ["restart", "rollback", "scale", "increase heap", "increase memory"],
            "writeup_keywords": ["api", "memory", "restart", "resolved"],
        },
    },

    # ---------- MEDIUM ----------
    # Latency cascade: checkout -> payments -> inventory; root cause is the
    # inventory connection pool, but all three services are in scope.
    {
        "task_id": "medium_cascade",
        "difficulty": "medium",
        "description": (
            "ALERT: Checkout latency p99 has crossed 5 seconds in the last 10 minutes. "
            "Three services are involved: `checkout`, `payments`, `inventory`. "
            "Correlate across logs, metrics and traces to find the root service, "
            "then scope / hypothesise / mitigate / communicate."
        ),
        "services": ["checkout", "payments", "inventory"],
        "logs": {
            "checkout": [
                "2026-04-08T20:10:01Z INFO POST /checkout trace_id=trace_xyz",
                "2026-04-08T20:10:08Z WARN downstream payments slow (7s)",
                "2026-04-08T20:10:08Z INFO returning 200 to client",
            ],
            "payments": [
                "2026-04-08T20:10:01Z INFO charge_card trace_id=trace_xyz",
                "2026-04-08T20:10:06Z WARN inventory check blocking",
                "2026-04-08T20:10:07Z INFO charge_card success",
            ],
            "inventory": [
                "2026-04-08T20:10:01Z INFO reserve_items trace_id=trace_xyz",
                "2026-04-08T20:10:05Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z ERROR connection pool exhausted (max=20)",
                "2026-04-08T20:10:06Z WARN request queued for 5400ms",
            ],
        },
        "metrics": {
            "checkout": {"cpu_pct": 40, "mem_pct": 55, "p99_latency_ms": 7800, "error_rate": 0.01},
            "payments": {"cpu_pct": 35, "mem_pct": 42, "p99_latency_ms": 6900, "error_rate": 0.0},
            "inventory": {"cpu_pct": 12, "mem_pct": 28, "p99_latency_ms": 5600, "error_rate": 0.0, "conn_pool_waiting": 44},
        },
        "traces": {
            "trace_xyz": [
                {"service": "checkout", "op": "POST /checkout", "duration_ms": 7800, "error": False},
                {"service": "payments", "op": "charge_card", "duration_ms": 6900, "error": False},
                {"service": "inventory", "op": "reserve_items", "duration_ms": 5500, "error": False},
            ],
        },
        "gold": {
            "scope": ["checkout", "payments", "inventory"],
            "hypothesis_keywords": ["inventory", "connection pool", "pool exhaust", "conn"],
            "mitigation_keywords": ["increase pool", "pool size", "restart inventory", "scale inventory"],
            "writeup_keywords": ["inventory", "connection", "pool", "latency"],
        },
    },

    # ---------- HARD ----------
    # Red-herring deploy: api looks suspect but is healthy; the real root
    # cause is DNS resolution, affecting web and worker only.
    {
        "task_id": "hard_dns",
        "difficulty": "hard",
        "description": (
            "ALERT: Intermittent 503s across multiple services (`web`, `api`, `worker`). "
            "A deploy of `api` went out 10 minutes ago and is the obvious suspect. "
            "Correlate carefully — the real root cause may be upstream. "
            "Scope, hypothesise, mitigate, and write a customer-facing status update."
        ),
        "services": ["web", "api", "worker"],
        "logs": {
            "web": [
                "2026-04-08T21:00:01Z INFO GET /home 200",
                "2026-04-08T21:00:07Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:08Z ERROR getaddrinfo ENOTFOUND api.internal",
                "2026-04-08T21:00:09Z ERROR upstream connect timeout",
            ],
            "api": [
                "2026-04-08T20:50:00Z INFO deploy v2.31.0 started",
                "2026-04-08T20:51:10Z INFO deploy v2.31.0 complete, healthy",
                "2026-04-08T21:00:07Z INFO process healthy, listening on :8080",
                "2026-04-08T21:00:07Z INFO request handled 200",
            ],
            "worker": [
                "2026-04-08T21:00:01Z INFO picked up job id=42",
                "2026-04-08T21:00:06Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
                "2026-04-08T21:00:08Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
            ],
        },
        "metrics": {
            "web": {"cpu_pct": 20, "mem_pct": 35, "p99_latency_ms": 9000, "error_rate": 0.45},
            "api": {"cpu_pct": 14, "mem_pct": 31, "p99_latency_ms": 42, "error_rate": 0.0},
            "worker": {"cpu_pct": 18, "mem_pct": 27, "p99_latency_ms": 8800, "error_rate": 0.55},
        },
        "traces": {
            "trace_qqq": [
                {"service": "web", "op": "GET /home", "duration_ms": 9000, "error": True, "note": "dns resolution failed"},
            ],
        },
        "gold": {
            "scope": ["web", "worker"],
            "hypothesis_keywords": ["dns", "resolver", "10.0.0.2", "enotfound", "no such host"],
            "mitigation_keywords": ["restart dns", "restart resolver", "failover dns", "point to backup resolver", "flush dns"],
            "writeup_keywords": ["dns", "resolution", "intermittent", "503", "restored"],
        },
    },
]
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def get_scenario(index: int) -> Dict[str, Any]:
    """Return the scenario at *index*, wrapping around the catalogue length."""
    return SCENARIOS[index % len(SCENARIOS)]
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def num_scenarios() -> int:
    """Return the number of bundled scenarios."""
    return len(SCENARIOS)
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|