yashppawar committed on
Commit b29893e · verified · 1 Parent(s): f13c6d3

Upload folder using huggingface_hub
Dockerfile ADDED
@@ -0,0 +1,81 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
#   - In-repo environments (with local OpenEnv sources)
#   - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=postmortem_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the FastAPI server
# The module path is constructed to work with the /app/env structure
ENV ENABLE_WEB_INTERFACE=true
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
README.md CHANGED
@@ -1,10 +1,127 @@
  ---
- title: Postmortem Env
- emoji: 🏆
- colorFrom: indigo
- colorTo: green
  sdk: docker
  pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
---
title: PostMortem Incident Triage OpenEnv
emoji: 🚨
colorFrom: red
colorTo: yellow
sdk: docker
app_port: 8000
pinned: false
license: bsd-3-clause
tags:
  - openenv
  - rl
  - sre
  - incident-response
base_path: /web
---

# PostMortem Live Incident Triage Environment

An OpenEnv environment where an LLM agent plays an on-call SRE responding to a
live production incident. Real-world task, typed OpenEnv spec, deterministic
grader, three difficulty tiers, dense process-reward signal.

## The task

On each episode the agent receives an alert. It must:

1. **ack** the incident (accept ownership)
2. **query_logs / query_metrics / query_traces** on services to gather evidence
3. **scope** the blast radius
4. **hypothesize** the root cause
5. **mitigate** (propose a concrete remediation)
6. **write_status** (post a customer-facing update)

All six verbs are exposed as a single typed action:

```python
PostmortemAction(tool="query_logs", args={"service": "api"})
```

## Action space

| tool            | args                      | effect                                    |
|-----------------|---------------------------|-------------------------------------------|
| `ack`           | `{}`                      | accept the incident (sub-goal 1)          |
| `query_logs`    | `{"service": str}`        | return recent log lines                   |
| `query_metrics` | `{"service": str}`        | return latest metrics                     |
| `query_traces`  | `{"trace_id": str}`       | return distributed trace spans            |
| `scope`         | `{"services": list[str]}` | declare blast radius (sub-goal 2)         |
| `hypothesize`   | `{"root_cause": str}`     | declare root cause (sub-goal 3)           |
| `mitigate`      | `{"action": str}`         | apply mitigation (sub-goal 4)             |
| `write_status`  | `{"text": str}`           | publish update, ends episode (sub-goal 5) |
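To make the action envelope concrete, here is a scripted happy-path episode for the easy OOM scenario, expressed as the `{"tool": ..., "args": ...}` payloads the client sends on each step. The `PostmortemAction` class below is a minimal dataclass stand-in so the sketch runs without the container (the real class in `models.py` is a pydantic model with the same fields), and the argument text is illustrative, not the gold answer.

```python
from dataclasses import dataclass, field

# Minimal stand-in for models.PostmortemAction (real one is a pydantic model).
@dataclass
class PostmortemAction:
    tool: str
    args: dict = field(default_factory=dict)

# A scripted run through the five sub-goals (hypothetical argument text).
plan = [
    PostmortemAction("ack"),
    PostmortemAction("query_logs", {"service": "api"}),
    PostmortemAction("scope", {"services": ["api"]}),
    PostmortemAction("hypothesize", {"root_cause": "api container OOM-killed, heap exhaustion"}),
    PostmortemAction("mitigate", {"action": "raise the api memory limit and restart the pod"}),
    PostmortemAction("write_status", {"text": "api outage: OOM; memory limit raised, service restored"}),
]

def step_payload(action: PostmortemAction) -> dict:
    # Mirrors the shape produced by PostmortemEnv._step_payload in client.py
    return {"tool": action.tool, "args": action.args}

payloads = [step_payload(a) for a in plan]
print(payloads[0])  # {'tool': 'ack', 'args': {}}
```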

## Observation space

Key fields of `PostmortemObservation`:

- `task_id`, `task_description`, `available_services`, `available_trace_ids`
- `tool_result` — free text result of the last tool call
- `subgoals` — bool dict `{acked, scoped, hypothesized, mitigated, written}`
- `reward_so_far` — cumulative reward in [0, 1]
- `steps_remaining`, `last_error`
- `done`, `reward` (current step)

## Tasks (3 difficulty tiers)

On each `reset()` the env rotates to the next scenario. Running three resets in
a row covers all three tiers in order.

| task_id          | difficulty | incident                                                      |
|------------------|------------|---------------------------------------------------------------|
| `easy_oom`       | easy       | `api` OOM-killed; cause directly visible in logs              |
| `medium_cascade` | medium     | checkout latency cascade; must correlate trace across 3 svcs  |
| `hard_dns`       | hard       | 503s blamed on fresh `api` deploy, real cause is upstream DNS |

## Reward design

The reward is a **5-stage process-reward ladder** in `[0, 1]`:

```
ack          +0.10  (granted on first successful ack)
scope        +0.20  × Jaccard(agent_services, gold_services)
hypothesize  +0.20  × keyword_fraction(agent_text, gold_hypothesis_keywords)
mitigate     +0.20  × keyword_fraction(agent_text, gold_mitigation_keywords)
write_status +0.30  × keyword_fraction(agent_text, gold_writeup_keywords)
```

Each sub-goal is awarded once. The grader is fully **deterministic** — no LLM
judge, no randomness. Partial credit gives a smooth gradient. The episode
terminates when `write_status` fires or after `MAX_STEPS = 12`.
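The ladder above can be sketched in a few lines. This is an illustrative reimplementation of the formulas in this section, not the server's actual grader (which lives in `server/postmortem_env_environment.py`); the function names and the `agent`/`gold` dict layout are assumptions.

```python
def jaccard(a: list[str], b: list[str]) -> float:
    # Set overlap used for the scope sub-goal.
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb) if sa | sb else 0.0

def keyword_fraction(text: str, keywords: list[str]) -> float:
    # Fraction of gold keywords mentioned (case-insensitive substring match).
    t = text.lower()
    return sum(k.lower() in t for k in keywords) / len(keywords) if keywords else 0.0

def episode_score(agent: dict, gold: dict) -> float:
    """Deterministic 5-stage process reward in [0, 1]."""
    score = 0.10 * bool(agent.get("acked"))
    score += 0.20 * jaccard(agent.get("services", []), gold["services"])
    score += 0.20 * keyword_fraction(agent.get("root_cause", ""), gold["hypothesis_keywords"])
    score += 0.20 * keyword_fraction(agent.get("mitigation", ""), gold["mitigation_keywords"])
    score += 0.30 * keyword_fraction(agent.get("status", ""), gold["writeup_keywords"])
    return score
```

A perfect episode sums to exactly 1.0; missing keywords or a partially wrong scope degrade the score smoothly rather than zeroing it.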

## Setup

```bash
pip install openenv-core
openenv build .      # build Docker image
python inference.py  # run baseline (3 scenarios)
```

### Required environment variables

| var            | default                            | notes                                                     |
|----------------|------------------------------------|-----------------------------------------------------------|
| `HF_TOKEN`     | (required)                         | HuggingFace token, also used as the OpenAI client API key |
| `API_BASE_URL` | `https://router.huggingface.co/v1` | any OpenAI-compatible endpoint                            |
| `MODEL_NAME`   | `Qwen/Qwen2.5-72B-Instruct`        | any chat model                                            |
| `IMAGE_NAME`   | `postmortem_env-env:latest`        | docker tag of the env image                               |

## Baseline reproduction

```bash
export HF_TOKEN=hf_...
export IMAGE_NAME=postmortem_env-env:latest
python inference.py
```

Emits strict `[START] / [STEP] / [END]` lines, one `[END]` per task.

## Resource budget

Runs well within the hackathon limits of **2 vCPU / 8 GB RAM** and completes
the 3-task sweep in under 20 minutes (dominated by LLM latency; ≤ 36 LLM calls
total).

## License

BSD-3-Clause (matches OpenEnv core).
__init__.py ADDED
@@ -0,0 +1,16 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Postmortem Env Environment."""

from .client import PostmortemEnv
from .models import PostmortemAction, PostmortemObservation

__all__ = [
    "PostmortemAction",
    "PostmortemObservation",
    "PostmortemEnv",
]
client.py ADDED
@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""PostMortem Environment Client."""

from typing import Dict

from openenv.core import EnvClient
from openenv.core.client_types import StepResult
from openenv.core.env_server.types import State

from .models import PostmortemAction, PostmortemObservation


class PostmortemEnv(EnvClient[PostmortemAction, PostmortemObservation, State]):
    """
    Client for the PostMortem incident-triage environment.

    Example:
        >>> client = PostmortemEnv.from_docker_image("postmortem_env-env:latest")
        >>> try:
        ...     r = client.reset()
        ...     print(r.observation.task_description)
        ...     r = client.step(PostmortemAction(tool="ack"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: PostmortemAction) -> Dict:
        return {"tool": action.tool, "args": action.args}

    def _parse_result(self, payload: Dict) -> StepResult[PostmortemObservation]:
        obs_data = payload.get("observation", {})
        observation = PostmortemObservation(
            task_id=obs_data.get("task_id", ""),
            task_description=obs_data.get("task_description", ""),
            available_services=obs_data.get("available_services", []) or [],
            available_trace_ids=obs_data.get("available_trace_ids", []) or [],
            tool_result=obs_data.get("tool_result", ""),
            subgoals=obs_data.get("subgoals", {}) or {},
            reward_so_far=obs_data.get("reward_so_far", 0.0) or 0.0,
            steps_remaining=obs_data.get("steps_remaining", 0) or 0,
            last_error=obs_data.get("last_error", "") or "",
            done=payload.get("done", False),
            reward=payload.get("reward"),
            metadata=obs_data.get("metadata", {}) or {},
        )
        return StepResult(
            observation=observation,
            reward=payload.get("reward"),
            done=payload.get("done", False),
        )

    def _parse_state(self, payload: Dict) -> State:
        return State(
            episode_id=payload.get("episode_id"),
            step_count=payload.get("step_count", 0),
        )
inference.py ADDED
@@ -0,0 +1,279 @@
"""
Inference script for the PostMortem OpenEnv environment.

Runs all 3 scenarios (easy/medium/hard) end-to-end against a containerised env,
calling an LLM via the OpenAI client against a Hugging Face Router endpoint.

Environment variables (all read via os.getenv):
    HF_TOKEN     -- HuggingFace token used as the OpenAI API key.
    API_BASE_URL -- LLM base URL (default: HF Router).
    MODEL_NAME   -- LLM model id (default: Qwen/Qwen2.5-72B-Instruct).
    IMAGE_NAME   -- local Docker image tag for the env container.

Strict stdout format enforced by the grader:
    [START] task=<id> env=<benchmark> model=<model>
    [STEP] step=<n> action=<tool:args> reward=<0.00> done=<true|false> error=<msg|null>
    [END] success=<true|false> steps=<n> score=<0.000> rewards=<r1,r2,...>
"""

from __future__ import annotations

import asyncio
import json
import os
import textwrap
from typing import List, Optional

from openai import OpenAI

# Support running as `python inference.py` from inside the env dir
# as well as `python -m postmortem_env.inference` from its parent.
try:
    from postmortem_env.client import PostmortemEnv
    from postmortem_env.models import PostmortemAction
except ModuleNotFoundError:
    import os as _os, sys as _sys
    _here = _os.path.dirname(_os.path.abspath(__file__))
    _sys.path.insert(0, _os.path.dirname(_here))
    from postmortem_env.client import PostmortemEnv  # type: ignore
    from postmortem_env.models import PostmortemAction  # type: ignore


# ---------- Config ----------

API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
IMAGE_NAME = os.getenv("IMAGE_NAME") or "postmortem_env-env:latest"
BENCHMARK = "postmortem_env"

MAX_STEPS = 12
TEMPERATURE = 0.2
MAX_TOKENS = 220
SUCCESS_SCORE_THRESHOLD = 0.5  # per-task success threshold (normalized [0,1])


SYSTEM_PROMPT = textwrap.dedent(
    """
    You are an on-call Site Reliability Engineer responding to a live production
    incident. You MUST progress through 5 sub-goals in order to maximise reward:

    1. ack (claim the incident)
    2. scope (declare the affected services)
    3. hypothesize (name the root cause)
    4. mitigate (propose a concrete remediation)
    5. write_status (publish a customer-facing update — ENDS the episode)

    Between ack and scope, issue AT MOST ONE query_logs call per service and
    AT MOST ONE query_metrics call per service, plus any needed query_traces.
    DO NOT repeat the same query. Efficient investigation is rewarded.

    Reply on each turn with a SINGLE JSON object on one line, nothing else:
    {"tool": "<verb>", "args": {...}}

    Valid tools and args:
    - {"tool": "ack", "args": {}}
    - {"tool": "query_logs", "args": {"service": "<name>"}}
    - {"tool": "query_metrics", "args": {"service": "<name>"}}
    - {"tool": "query_traces", "args": {"trace_id": "<id>"}}
    - {"tool": "scope", "args": {"services": ["<name>", ...]}}
    - {"tool": "hypothesize", "args": {"root_cause": "<short sentence with concrete keywords>"}}
    - {"tool": "mitigate", "args": {"action": "<short sentence with a concrete fix>"}}
    - {"tool": "write_status", "args": {"text": "<concise update naming service, cause, and fix>"}}

    When you hypothesize/mitigate/write_status, INCLUDE the specific technical
    keywords you observed in the logs (e.g. 'OOM', 'heap', 'DNS resolver',
    'connection pool'), because the grader scores keyword coverage.

    Reply with ONLY the JSON object. No prose, no markdown fences.
    """
).strip()


# ---------- Strict log format helpers ----------

def log_start(task: str, env: str, model: str) -> None:
    print(f"[START] task={task} env={env} model={model}", flush=True)


def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    error_val = error if error else "null"
    done_val = str(done).lower()
    # Strip newlines to honour the one-line-per-step rule
    action_s = action.replace("\n", " ").replace("\r", " ")
    err_s = error_val.replace("\n", " ").replace("\r", " ")
    print(
        f"[STEP] step={step} action={action_s} reward={reward:.2f} done={done_val} error={err_s}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={str(success).lower()} steps={steps} score={score:.3f} rewards={rewards_str}",
        flush=True,
    )


# ---------- Agent ----------

def build_user_prompt(step: int, obs_payload: dict, last_reward: float, queried: dict) -> str:
    subgoals = obs_payload.get("subgoals") or {}
    # Compute the next sub-goal to target
    order = ["acked", "scoped", "hypothesized", "mitigated", "written"]
    nxt = next((g for g in order if not subgoals.get(g, False)), "written")
    hint_map = {
        "acked": "Next: send {\"tool\": \"ack\", \"args\": {}}",
        "scoped": 'Next: gather minimal evidence then send {"tool": "scope", "args": {"services": [...]}}',
        "hypothesized": 'Next: send {"tool": "hypothesize", "args": {"root_cause": "..."}} with concrete keywords from logs',
        "mitigated": 'Next: send {"tool": "mitigate", "args": {"action": "..."}} naming a concrete remediation',
        "written": 'Next: send {"tool": "write_status", "args": {"text": "..."}} to end the episode with a concise status update',
    }
    return textwrap.dedent(
        f"""
        Step: {step}  Steps remaining: {obs_payload.get('steps_remaining')}
        Task: {obs_payload.get('task_id')}
        Incident brief: {obs_payload.get('task_description')}
        Services available: {obs_payload.get('available_services')}
        Trace ids available: {obs_payload.get('available_trace_ids')}
        Sub-goals: {subgoals}
        Total reward so far: {obs_payload.get('reward_so_far'):.2f}
        Services already queried (logs): {sorted(queried.get('logs', set()))}
        Services already queried (metrics): {sorted(queried.get('metrics', set()))}
        Traces already queried: {sorted(queried.get('traces', set()))}

        Last tool result:
        {obs_payload.get('tool_result')}

        {hint_map[nxt]}

        Reply with a single JSON object on one line.
        """
    ).strip()


def ask_model(client: OpenAI, user_prompt: str) -> dict:
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Strip optional code fences
        if text.startswith("```"):
            text = text.strip("`")
            # drop a leading "json" language tag if present
            if text.lower().startswith("json"):
                text = text[4:].strip()
        # Find the first {...} block
        lb, rb = text.find("{"), text.rfind("}")
        if lb != -1 and rb != -1:
            text = text[lb : rb + 1]
        return json.loads(text)
    except Exception as exc:
        print(f"[DEBUG] Model call failed, defaulting to ack: {exc}", flush=True)
        return {"tool": "ack", "args": {}}


def obs_to_payload(obs) -> dict:
    """Serialise a PostmortemObservation for prompt construction."""
    return {
        "task_id": getattr(obs, "task_id", ""),
        "task_description": getattr(obs, "task_description", ""),
        "available_services": getattr(obs, "available_services", []),
        "available_trace_ids": getattr(obs, "available_trace_ids", []),
        "subgoals": getattr(obs, "subgoals", {}),
        "reward_so_far": getattr(obs, "reward_so_far", 0.0),
        "steps_remaining": getattr(obs, "steps_remaining", 0),
        "tool_result": getattr(obs, "tool_result", ""),
    }


async def run_one_task(client: OpenAI, env: PostmortemEnv) -> None:
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    task_name = "unknown"

    try:
        result = await env.reset()
        obs = result.observation
        task_name = getattr(obs, "task_id", "unknown") or "unknown"
        log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

        last_reward = 0.0
        done = result.done
        queried = {"logs": set(), "metrics": set(), "traces": set()}

        for step in range(1, MAX_STEPS + 1):
            if done:
                break

            payload = obs_to_payload(obs)
            user_prompt = build_user_prompt(step, payload, last_reward, queried)
            tool_call = ask_model(client, user_prompt)

            tool = str(tool_call.get("tool", "ack"))
            args = tool_call.get("args", {}) or {}
            if not isinstance(args, dict):
                args = {}

            # Track queries so the prompt can discourage repeats
            if tool == "query_logs":
                queried["logs"].add(args.get("service", ""))
            elif tool == "query_metrics":
                queried["metrics"].add(args.get("service", ""))
            elif tool == "query_traces":
                queried["traces"].add(args.get("trace_id", ""))

            action = PostmortemAction(tool=tool, args=args)
            result = await env.step(action)
            obs = result.observation

            reward = float(result.reward or 0.0)
            done = bool(result.done)
            error = getattr(obs, "last_error", "") or None
            action_str = f"{tool}:{json.dumps(args, separators=(',', ':'))}"

            rewards.append(reward)
            steps_taken = step
            last_reward = reward

            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        score = float(getattr(obs, "reward_so_far", 0.0) or sum(rewards))
        score = min(max(score, 0.0), 1.0)
        success = score >= SUCCESS_SCORE_THRESHOLD

    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)


async def main() -> None:
    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    # Spin up one containerised env; run the 3 scenarios back-to-back
    # by issuing 3 resets (the env rotates scenarios on each reset).
    env = await PostmortemEnv.from_docker_image(IMAGE_NAME)
    try:
        for _ in range(3):
            await run_one_task(client, env)
    finally:
        try:
            await env.close()
        except Exception as exc:
            print(f"[DEBUG] env.close() error: {exc}", flush=True)


if __name__ == "__main__":
    asyncio.run(main())
models.py ADDED
@@ -0,0 +1,58 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""
Data models for the PostMortem incident-triage environment.

An agent plays an on-call SRE responding to a live incident. It queries fake
telemetry (logs / metrics / traces), scopes blast radius, hypothesises a root
cause, applies a mitigation, and writes a status-page update. The reward is a
5-stage process-reward ladder in [0, 1].
"""

from typing import Any, Dict, List

from openenv.core.env_server.types import Action, Observation
from pydantic import Field


class PostmortemAction(Action):
    """
    Single action envelope. `tool` selects an investigation / response verb,
    `args` is a tool-specific dict.

    Supported tools:
    - "ack"           args: {}
    - "query_logs"    args: {"service": str}
    - "query_metrics" args: {"service": str}
    - "query_traces"  args: {"trace_id": str}
    - "scope"         args: {"services": list[str]}
    - "hypothesize"   args: {"root_cause": str}
    - "mitigate"      args: {"action": str}
    - "write_status"  args: {"text": str}
    """

    tool: str = Field(..., description="Investigation/response verb")
    args: Dict[str, Any] = Field(default_factory=dict, description="Tool args")


class PostmortemObservation(Observation):
    """Observation returned after each step."""

    task_id: str = Field(default="", description="Current scenario id")
    task_description: str = Field(default="", description="Incident brief")
    available_services: List[str] = Field(default_factory=list)
    available_trace_ids: List[str] = Field(default_factory=list)
    tool_result: str = Field(default="", description="Result from the last tool call")
    subgoals: Dict[str, bool] = Field(
        default_factory=lambda: {
            "acked": False,
            "scoped": False,
            "hypothesized": False,
            "mitigated": False,
            "written": False,
        }
    )
    reward_so_far: float = Field(default=0.0)
    steps_remaining: int = Field(default=0)
    last_error: str = Field(default="")
openenv.yaml ADDED
@@ -0,0 +1,7 @@
spec_version: 1
name: postmortem_env
type: space
runtime: fastapi
app: server.app:app
port: 8000
pyproject.toml ADDED
@@ -0,0 +1,45 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

[build-system]
requires = ["setuptools>=45", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "openenv-postmortem_env"
version = "0.1.0"
description = "Postmortem Env environment for OpenEnv"
requires-python = ">=3.10"
dependencies = [
    # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
    # install from github
    # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
    "openenv-core[core]>=0.2.2",
    # Environment-specific dependencies
    # Add all dependencies needed for your environment here
    # Examples:
    # "numpy>=1.19.0",
    # "torch>=2.0.0",
    # "gymnasium>=0.29.0",
    # "openspiel>=1.0.0",
    # "smolagents>=1.22.0,<2",
]

[project.optional-dependencies]
dev = [
    "pytest>=8.0.0",
    "pytest-cov>=4.0.0",
]

[project.scripts]
# Server entry point - enables running via: uv run --project . server
# or: python -m postmortem_env.server.app
server = "postmortem_env.server.app:main"

[tool.setuptools]
include-package-data = true
packages = ["postmortem_env", "postmortem_env.server"]
package-dir = { "postmortem_env" = ".", "postmortem_env.server" = "server" }
server/Dockerfile ADDED
@@ -0,0 +1,80 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Multi-stage build using openenv-base
# This Dockerfile is flexible and works for both:
#   - In-repo environments (with local OpenEnv sources)
#   - Standalone environments (with openenv from PyPI/Git)
# The build script (openenv build) handles context detection and sets appropriate build args.

ARG BASE_IMAGE=ghcr.io/meta-pytorch/openenv-base:latest
FROM ${BASE_IMAGE} AS builder

WORKDIR /app

# Ensure git is available (required for installing dependencies from VCS)
RUN apt-get update && \
    apt-get install -y --no-install-recommends git && \
    rm -rf /var/lib/apt/lists/*

# Build argument to control whether we're building standalone or in-repo
ARG BUILD_MODE=in-repo
ARG ENV_NAME=postmortem_env

# Copy environment code (always at root of build context)
COPY . /app/env

# For in-repo builds, openenv is already vendored in the build context
# For standalone builds, openenv will be installed via pyproject.toml
WORKDIR /app/env

# Ensure uv is available (for local builds where base image lacks it)
RUN if ! command -v uv >/dev/null 2>&1; then \
        curl -LsSf https://astral.sh/uv/install.sh | sh && \
        mv /root/.local/bin/uv /usr/local/bin/uv && \
        mv /root/.local/bin/uvx /usr/local/bin/uvx; \
    fi

# Install dependencies using uv sync
# If uv.lock exists, use it; otherwise resolve on the fly
RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-install-project --no-editable; \
    else \
        uv sync --no-install-project --no-editable; \
    fi

RUN --mount=type=cache,target=/root/.cache/uv \
    if [ -f uv.lock ]; then \
        uv sync --frozen --no-editable; \
    else \
        uv sync --no-editable; \
    fi

# Final runtime stage
FROM ${BASE_IMAGE}

WORKDIR /app

# Copy the virtual environment from builder
COPY --from=builder /app/env/.venv /app/.venv

# Copy the environment code
COPY --from=builder /app/env /app/env

# Set PATH to use the virtual environment
ENV PATH="/app/.venv/bin:$PATH"

# Set PYTHONPATH so imports work correctly
ENV PYTHONPATH="/app/env:$PYTHONPATH"

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/health || exit 1

# Run the FastAPI server
# The module path is constructed to work with the /app/env structure
CMD ["sh", "-c", "cd /app/env && uvicorn server.app:app --host 0.0.0.0 --port 8000"]
server/__init__.py ADDED
@@ -0,0 +1,11 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Postmortem Env environment server components."""

from .postmortem_env_environment import PostmortemEnvironment

__all__ = ["PostmortemEnvironment"]
server/app.py ADDED
@@ -0,0 +1,49 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

"""FastAPI application for the PostMortem Environment."""

try:
    from openenv.core.env_server.http_server import create_app
except Exception as e:  # pragma: no cover
    raise ImportError(
        "openenv-core is required. Install with 'pip install openenv-core'."
    ) from e

try:
    from ..models import PostmortemAction, PostmortemObservation
    from .postmortem_env_environment import PostmortemEnvironment
except (ImportError, ModuleNotFoundError):  # Docker / direct-run fallback
    import os, sys
    sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    from models import PostmortemAction, PostmortemObservation  # type: ignore
    from server.postmortem_env_environment import PostmortemEnvironment  # type: ignore


app = create_app(
    PostmortemEnvironment,
    PostmortemAction,
    PostmortemObservation,
    env_name="postmortem_env",
    max_concurrent_envs=1,
)


def main(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Entry point for direct execution."""
    import uvicorn

    uvicorn.run(app, host=host, port=port)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", type=str, default="0.0.0.0")
    args, _ = parser.parse_known_args()
    if args.host == "0.0.0.0" and args.port == 8000:
        main()
    else:
        main(host=args.host, port=args.port)
server/postmortem_env_environment.py ADDED
@@ -0,0 +1,239 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ """
+ PostMortem Environment — incident triage as an OpenEnv env.
+
+ The agent plays an on-call SRE. It interacts via typed actions (query_logs,
+ query_metrics, query_traces, ack, scope, hypothesize, mitigate, write_status)
+ against one of three fixed scenarios that rotate on reset(). The reward is a
+ 5-stage process-reward ladder in [0, 1]:
+
+     ack          +0.10
+     scope        +0.20  (Jaccard overlap vs. gold service set)
+     hypothesize  +0.20  (fraction of gold keywords mentioned)
+     mitigate     +0.20  (fraction of gold keywords mentioned)
+     write_status +0.30  (fraction of gold keywords mentioned)
+
+ Each sub-goal can only be claimed once. Episodes terminate on `write_status`
+ or after MAX_STEPS (12).
+ """
+
+ from typing import Any, Dict, List
+ from uuid import uuid4
+
+ from openenv.core.env_server.interfaces import Environment
+ from openenv.core.env_server.types import State
+
+ try:
+     from ..models import PostmortemAction, PostmortemObservation
+     from .scenarios import SCENARIOS, num_scenarios
+ except (ImportError, ModuleNotFoundError):  # Docker / direct-run fallback
+     import os, sys
+     sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+     sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+     from models import PostmortemAction, PostmortemObservation  # type: ignore
+     from scenarios import SCENARIOS, num_scenarios  # type: ignore
+
+
+ MAX_STEPS = 12
+
+
+ # ---------- Reward helpers ----------
+
+ def _jaccard(a: List[str], b: List[str]) -> float:
+     if not a and not b:
+         return 1.0
+     sa, sb = {x.strip().lower() for x in a}, {x.strip().lower() for x in b}
+     if not sa or not sb:
+         return 0.0
+     return len(sa & sb) / len(sa | sb)
+
+
+ def _keyword_fraction(text: str, keywords: List[str]) -> float:
+     if not keywords:
+         return 0.0
+     t = text.lower()
+     hits = sum(1 for k in keywords if k.lower() in t)
+     return hits / len(keywords)
+
+
+ # ---------- Environment ----------
+
+ class PostmortemEnvironment(Environment):
+     """Incident triage environment."""
+
+     SUPPORTS_CONCURRENT_SESSIONS: bool = True
+
+     def __init__(self) -> None:
+         self._state = State(episode_id=str(uuid4()), step_count=0)
+         self._scenario_idx = 0
+         self._scenario: Dict[str, Any] = SCENARIOS[0]
+         self._subgoals: Dict[str, bool] = {
+             "acked": False,
+             "scoped": False,
+             "hypothesized": False,
+             "mitigated": False,
+             "written": False,
+         }
+         self._reward_so_far = 0.0
+         self._done = False
+         self._last_error = ""
+
+     # ---- env API ----
+
+     def reset(self) -> PostmortemObservation:
+         # Rotate to the next scenario on each reset so a run of 3 resets
+         # covers all three difficulty tiers in order.
+         self._scenario = SCENARIOS[self._scenario_idx % num_scenarios()]
+         self._scenario_idx += 1
+         self._state = State(episode_id=str(uuid4()), step_count=0)
+         self._subgoals = {k: False for k in self._subgoals}
+         self._reward_so_far = 0.0
+         self._done = False
+         self._last_error = ""
+
+         return PostmortemObservation(
+             task_id=self._scenario["task_id"],
+             task_description=self._scenario["description"],
+             available_services=list(self._scenario["services"]),
+             available_trace_ids=list(self._scenario.get("traces", {}).keys()),
+             tool_result="Incident opened. Begin investigation.",
+             subgoals=dict(self._subgoals),
+             reward_so_far=0.0,
+             steps_remaining=MAX_STEPS,
+             last_error="",
+             done=False,
+             reward=0.0,
+             metadata={"difficulty": self._scenario.get("difficulty", "")},
+         )
+
+     def step(self, action: PostmortemAction) -> PostmortemObservation:  # type: ignore[override]
+         self._state.step_count += 1
+         tool = (action.tool or "").strip().lower()
+         args = action.args or {}
+         tool_result = ""
+         step_reward = 0.0
+         self._last_error = ""
+
+         try:
+             if tool == "ack":
+                 if not self._subgoals["acked"]:
+                     self._subgoals["acked"] = True
+                     step_reward = 0.10
+                     tool_result = "Acknowledged. You now own this incident."
+                 else:
+                     tool_result = "Already acknowledged."
+
+             elif tool == "query_logs":
+                 service = str(args.get("service", "")).strip()
+                 logs = self._scenario.get("logs", {}).get(service)
+                 if logs is None:
+                     self._last_error = f"unknown service '{service}'"
+                     tool_result = f"ERROR: {self._last_error}"
+                 else:
+                     tool_result = "\n".join(logs)
+
+             elif tool == "query_metrics":
+                 service = str(args.get("service", "")).strip()
+                 metrics = self._scenario.get("metrics", {}).get(service)
+                 if metrics is None:
+                     self._last_error = f"unknown service '{service}'"
+                     tool_result = f"ERROR: {self._last_error}"
+                 else:
+                     tool_result = ", ".join(f"{k}={v}" for k, v in metrics.items())
+
+             elif tool == "query_traces":
+                 trace_id = str(args.get("trace_id", "")).strip()
+                 trace = self._scenario.get("traces", {}).get(trace_id)
+                 if trace is None:
+                     self._last_error = f"unknown trace_id '{trace_id}'"
+                     tool_result = f"ERROR: {self._last_error}"
+                 else:
+                     tool_result = " | ".join(
+                         f"{s['service']}:{s['op']} {s['duration_ms']}ms err={s.get('error', False)}"
+                         for s in trace
+                     )
+
+             elif tool == "scope":
+                 services = args.get("services", [])
+                 if not isinstance(services, list):
+                     self._last_error = "scope.services must be a list"
+                     tool_result = f"ERROR: {self._last_error}"
+                 elif not self._subgoals["scoped"]:
+                     jac = _jaccard(services, self._scenario["gold"]["scope"])
+                     gained = 0.20 * jac
+                     step_reward = gained
+                     self._subgoals["scoped"] = True
+                     tool_result = f"Scope recorded. Match vs gold = {jac:.2f}"
+                 else:
+                     tool_result = "Scope already set."
+
+             elif tool == "hypothesize":
+                 cause = str(args.get("root_cause", ""))
+                 if not self._subgoals["hypothesized"]:
+                     frac = _keyword_fraction(cause, self._scenario["gold"]["hypothesis_keywords"])
+                     gained = 0.20 * frac
+                     step_reward = gained
+                     self._subgoals["hypothesized"] = True
+                     tool_result = f"Hypothesis recorded. Keyword match = {frac:.2f}"
+                 else:
+                     tool_result = "Hypothesis already set."
+
+             elif tool == "mitigate":
+                 mit = str(args.get("action", ""))
+                 if not self._subgoals["mitigated"]:
+                     frac = _keyword_fraction(mit, self._scenario["gold"]["mitigation_keywords"])
+                     gained = 0.20 * frac
+                     step_reward = gained
+                     self._subgoals["mitigated"] = True
+                     tool_result = f"Mitigation applied. Keyword match = {frac:.2f}"
+                 else:
+                     tool_result = "Mitigation already applied."
+
+             elif tool == "write_status":
+                 text = str(args.get("text", ""))
+                 if not self._subgoals["written"]:
+                     frac = _keyword_fraction(text, self._scenario["gold"]["writeup_keywords"])
+                     gained = 0.30 * frac
+                     step_reward = gained
+                     self._subgoals["written"] = True
+                     tool_result = f"Status update published. Keyword match = {frac:.2f}"
+                     self._done = True  # writeup ends the episode
+                 else:
+                     tool_result = "Status update already published."
+
+             else:
+                 self._last_error = f"unknown tool '{tool}'"
+                 tool_result = (
+                     f"ERROR: {self._last_error}. Valid: ack, query_logs, query_metrics, "
+                     "query_traces, scope, hypothesize, mitigate, write_status."
+                 )
+
+         except Exception as exc:  # defensive — never crash the server
+             self._last_error = f"internal: {exc}"
+             tool_result = f"ERROR: {self._last_error}"
+
+         self._reward_so_far = min(1.0, max(0.0, self._reward_so_far + step_reward))
+
+         if self._state.step_count >= MAX_STEPS:
+             self._done = True
+
+         return PostmortemObservation(
+             task_id=self._scenario["task_id"],
+             task_description=self._scenario["description"],
+             available_services=list(self._scenario["services"]),
+             available_trace_ids=list(self._scenario.get("traces", {}).keys()),
+             tool_result=tool_result,
+             subgoals=dict(self._subgoals),
+             reward_so_far=self._reward_so_far,
+             steps_remaining=max(0, MAX_STEPS - self._state.step_count),
+             last_error=self._last_error,
+             done=self._done,
+             reward=step_reward,
+             metadata={"difficulty": self._scenario.get("difficulty", "")},
+         )
+
+     @property
+     def state(self) -> State:
+         return self._state
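The two helpers above fully determine partial credit for `scope` and the free-text stages. A runnable sketch of the same scoring logic (re-implemented standalone so it runs on its own; the example inputs are illustrative, not taken from the scenarios):

```python
# Standalone sketch of the reward helpers used by PostmortemEnvironment.

def jaccard(a, b):
    # Set overlap of normalized names; both-empty counts as a perfect match.
    if not a and not b:
        return 1.0
    sa = {x.strip().lower() for x in a}
    sb = {x.strip().lower() for x in b}
    if not sa or not sb:
        return 0.0
    return len(sa & sb) / len(sa | sb)

def keyword_fraction(text, keywords):
    # Fraction of gold keywords found via case-insensitive substring match.
    if not keywords:
        return 0.0
    t = text.lower()
    return sum(1 for k in keywords if k.lower() in t) / len(keywords)

# Over-scoping dilutes the score: {api, db} vs gold {api} gives 1/2.
print(jaccard(["api", "db"], ["api"]))  # 0.5
# Two of three gold keywords ("oom", "heap") appear, so 2/3.
print(keyword_fraction("OOM due to heap pressure", ["oom", "memory", "heap"]))
```

Note the keyword check is plain substring search, so e.g. "oom" would also match inside "bloom"; partial credit is therefore generous rather than strict.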
server/requirements.txt ADDED
@@ -0,0 +1,5 @@
+ openenv-core[core]>=0.2.2
+ fastapi>=0.115.0
+ uvicorn>=0.24.0
+ openai>=1.40.0
+ pydantic>=2.0.0
server/scenarios.py ADDED
@@ -0,0 +1,170 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+
+ """
+ Three fixed incident scenarios for the PostMortem env.
+
+ Each scenario is a self-contained dict with:
+   - description: the brief an on-call engineer would receive
+   - services: list of service names the agent may query
+   - logs: {service: [log lines]}
+   - metrics: {service: {metric: value}}
+   - traces: {trace_id: [span dicts]}
+   - gold: the oracle answers for grading
+ """
+
+ from typing import Dict, List, Any
+
+ SCENARIOS: List[Dict[str, Any]] = [
+     # ---------- EASY ----------
+     {
+         "task_id": "easy_oom",
+         "difficulty": "easy",
+         "description": (
+             "ALERT: Service `api` is returning HTTP 500 on ~80% of requests. "
+             "On-call has paged you. Investigate, scope the blast radius, "
+             "identify the root cause, mitigate, and write a status-page update."
+         ),
+         "services": ["api", "db", "auth"],
+         "logs": {
+             "api": [
+                 "2026-04-08T19:55:01Z INFO request id=req-001 path=/v1/users",
+                 "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
+                 "2026-04-08T19:55:02Z ERROR java.lang.OutOfMemoryError: Java heap space",
+                 "2026-04-08T19:55:03Z WARN GC overhead limit exceeded",
+                 "2026-04-08T19:55:03Z ERROR pod api-7f8b restarting OOMKilled",
+             ],
+             "db": [
+                 "2026-04-08T19:55:01Z INFO connection accepted from api",
+                 "2026-04-08T19:55:02Z INFO query completed in 12ms",
+                 "2026-04-08T19:55:05Z INFO idle connections=45",
+             ],
+             "auth": [
+                 "2026-04-08T19:55:01Z INFO token issued",
+                 "2026-04-08T19:55:03Z INFO token validated",
+             ],
+         },
+         "metrics": {
+             "api": {"cpu_pct": 98, "mem_pct": 99, "p99_latency_ms": 12000, "error_rate": 0.82},
+             "db": {"cpu_pct": 22, "mem_pct": 34, "p99_latency_ms": 18, "error_rate": 0.0},
+             "auth": {"cpu_pct": 8, "mem_pct": 14, "p99_latency_ms": 6, "error_rate": 0.0},
+         },
+         "traces": {
+             "trace_abc": [
+                 {"service": "api", "op": "GET /v1/users", "duration_ms": 11800, "error": True},
+                 {"service": "db", "op": "SELECT users", "duration_ms": 14, "error": False},
+             ],
+         },
+         "gold": {
+             "scope": ["api"],
+             "hypothesis_keywords": ["oom", "memory", "heap"],
+             "mitigation_keywords": ["restart", "rollback", "scale", "increase heap", "increase memory"],
+             "writeup_keywords": ["api", "memory", "restart", "resolved"],
+         },
+     },
+
+     # ---------- MEDIUM ----------
+     {
+         "task_id": "medium_cascade",
+         "difficulty": "medium",
+         "description": (
+             "ALERT: Checkout latency p99 has crossed 5 seconds in the last 10 minutes. "
+             "Three services are involved: `checkout`, `payments`, `inventory`. "
+             "Correlate across logs, metrics and traces to find the root service, "
+             "then scope / hypothesize / mitigate / communicate."
+         ),
+         "services": ["checkout", "payments", "inventory"],
+         "logs": {
+             "checkout": [
+                 "2026-04-08T20:10:01Z INFO POST /checkout trace_id=trace_xyz",
+                 "2026-04-08T20:10:08Z WARN downstream payments slow (7s)",
+                 "2026-04-08T20:10:08Z INFO returning 200 to client",
+             ],
+             "payments": [
+                 "2026-04-08T20:10:01Z INFO charge_card trace_id=trace_xyz",
+                 "2026-04-08T20:10:06Z WARN inventory check blocking",
+                 "2026-04-08T20:10:07Z INFO charge_card success",
+             ],
+             "inventory": [
+                 "2026-04-08T20:10:01Z INFO reserve_items trace_id=trace_xyz",
+                 "2026-04-08T20:10:05Z ERROR connection pool exhausted (max=20)",
+                 "2026-04-08T20:10:06Z ERROR connection pool exhausted (max=20)",
+                 "2026-04-08T20:10:06Z WARN request queued for 5400ms",
+             ],
+         },
+         "metrics": {
+             "checkout": {"cpu_pct": 40, "mem_pct": 55, "p99_latency_ms": 7800, "error_rate": 0.01},
+             "payments": {"cpu_pct": 35, "mem_pct": 42, "p99_latency_ms": 6900, "error_rate": 0.0},
+             "inventory": {"cpu_pct": 12, "mem_pct": 28, "p99_latency_ms": 5600, "error_rate": 0.0, "conn_pool_waiting": 44},
+         },
+         "traces": {
+             "trace_xyz": [
+                 {"service": "checkout", "op": "POST /checkout", "duration_ms": 7800, "error": False},
+                 {"service": "payments", "op": "charge_card", "duration_ms": 6900, "error": False},
+                 {"service": "inventory", "op": "reserve_items", "duration_ms": 5500, "error": False},
+             ],
+         },
+         "gold": {
+             "scope": ["checkout", "payments", "inventory"],
+             "hypothesis_keywords": ["inventory", "connection pool", "pool exhaust", "conn"],
+             "mitigation_keywords": ["increase pool", "pool size", "restart inventory", "scale inventory"],
+             "writeup_keywords": ["inventory", "connection", "pool", "latency"],
+         },
+     },
+
+     # ---------- HARD ----------
+     {
+         "task_id": "hard_dns",
+         "difficulty": "hard",
+         "description": (
+             "ALERT: Intermittent 503s across multiple services (`web`, `api`, `worker`). "
+             "A deploy of `api` went out 10 minutes ago and is the obvious suspect. "
+             "Correlate carefully — the real root cause may be upstream. "
+             "Scope, hypothesize, mitigate, and write a customer-facing status update."
+         ),
+         "services": ["web", "api", "worker"],
+         "logs": {
+             "web": [
+                 "2026-04-08T21:00:01Z INFO GET /home 200",
+                 "2026-04-08T21:00:07Z ERROR getaddrinfo ENOTFOUND api.internal",
+                 "2026-04-08T21:00:08Z ERROR getaddrinfo ENOTFOUND api.internal",
+                 "2026-04-08T21:00:09Z ERROR upstream connect timeout",
+             ],
+             "api": [
+                 "2026-04-08T20:50:00Z INFO deploy v2.31.0 started",
+                 "2026-04-08T20:51:10Z INFO deploy v2.31.0 complete, healthy",
+                 "2026-04-08T21:00:07Z INFO process healthy, listening on :8080",
+                 "2026-04-08T21:00:07Z INFO request handled 200",
+             ],
+             "worker": [
+                 "2026-04-08T21:00:01Z INFO picked up job id=42",
+                 "2026-04-08T21:00:06Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
+                 "2026-04-08T21:00:08Z ERROR dial tcp: lookup api.internal on 10.0.0.2:53: no such host",
+             ],
+         },
+         "metrics": {
+             "web": {"cpu_pct": 20, "mem_pct": 35, "p99_latency_ms": 9000, "error_rate": 0.45},
+             "api": {"cpu_pct": 14, "mem_pct": 31, "p99_latency_ms": 42, "error_rate": 0.0},
+             "worker": {"cpu_pct": 18, "mem_pct": 27, "p99_latency_ms": 8800, "error_rate": 0.55},
+         },
+         "traces": {
+             "trace_qqq": [
+                 {"service": "web", "op": "GET /home", "duration_ms": 9000, "error": True, "note": "dns resolution failed"},
+             ],
+         },
+         "gold": {
+             "scope": ["web", "worker"],
+             "hypothesis_keywords": ["dns", "resolver", "10.0.0.2", "enotfound", "no such host"],
+             "mitigation_keywords": ["restart dns", "restart resolver", "failover dns", "point to backup resolver", "flush dns"],
+             "writeup_keywords": ["dns", "resolution", "intermittent", "503", "restored"],
+         },
+     },
+ ]
+
+
+ def get_scenario(index: int) -> Dict[str, Any]:
+     return SCENARIOS[index % len(SCENARIOS)]
+
+
+ def num_scenarios() -> int:
+     return len(SCENARIOS)
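Because `reset()` serves `SCENARIOS[self._scenario_idx % num_scenarios()]` and then increments the index, consecutive resets cycle easy, medium, hard, then wrap back to easy. A minimal standalone sketch of that rotation (task ids copied from the scenarios above):

```python
# Standalone sketch of the scenario rotation performed by reset():
# each reset serves the next scenario, wrapping modulo the scenario count.
task_ids = ["easy_oom", "medium_cascade", "hard_dns"]

idx = 0
order = []
for _ in range(4):  # four resets in a row
    order.append(task_ids[idx % len(task_ids)])
    idx += 1

print(order)  # ['easy_oom', 'medium_cascade', 'hard_dns', 'easy_oom']
```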
uv.lock ADDED
The diff for this file is too large to render.