Spaces:
Running
Running
deploy sre-gym v2: easy/medium/hard scenarios + skill + verified-runbooks + demo
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .dockerignore +18 -0
- .gitignore +12 -0
- Dockerfile +21 -0
- Makefile +39 -0
- README.md +111 -4
- demo/pitch.md +49 -0
- demo/run_demo.sh +70 -0
- deploy/push_to_hf.sh +58 -0
- execution.md +53 -0
- inference.py +264 -0
- openenv.yaml +21 -0
- pyproject.toml +43 -0
- requirements.txt +1 -0
- run_demo.py +86 -0
- server/Dockerfile +21 -0
- server/__init__.py +1 -0
- server/app.py +14 -0
- server/requirements.txt +8 -0
- skill/SKILL.md +100 -0
- skill/tools/sre_gym_client.py +238 -0
- skill/verified-runbooks/.gitkeep +0 -0
- skill/verified-runbooks/db_config_rollout.md +23 -0
- skill/verified-runbooks/gateway_auth_rollout.md +21 -0
- skill/verified-runbooks/worker_deploy_cascade.md +23 -0
- unified_incident_env/README.md +10 -0
- unified_incident_env/__init__.py +17 -0
- unified_incident_env/client.py +35 -0
- unified_incident_env/interface.py +17 -0
- unified_incident_env/models.py +332 -0
- unified_incident_env/scripts/__init__.py +1 -0
- unified_incident_env/scripts/baseline_agent.py +43 -0
- unified_incident_env/scripts/walkthrough.py +41 -0
- unified_incident_env/server/__init__.py +1 -0
- unified_incident_env/server/app.py +148 -0
- unified_incident_env/server/challenge.py +753 -0
- unified_incident_env/server/environment.py +613 -0
- unified_incident_env/server/grader.py +145 -0
- unified_incident_env/tests/__init__.py +1 -0
- unified_incident_env/tests/test_environment.py +192 -0
- unified_incident_env/tests/test_submission_inference.py +119 -0
- unified_incident_env/tests/test_trainer.py +46 -0
- unified_incident_env/tests/test_trainer_session.py +32 -0
- unified_incident_env/trainer/__init__.py +8 -0
- unified_incident_env/trainer/action_adapter.py +204 -0
- unified_incident_env/trainer/analyze_failures.py +251 -0
- unified_incident_env/trainer/backend.py +165 -0
- unified_incident_env/trainer/build_datasets.py +258 -0
- unified_incident_env/trainer/build_sft_dataset.py +101 -0
- unified_incident_env/trainer/collect_trajectory.py +53 -0
- unified_incident_env/trainer/eval_models.py +145 -0
.dockerignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
.git/
|
| 5 |
+
.pytest_cache/
|
| 6 |
+
outputs/
|
| 7 |
+
.omx/
|
| 8 |
+
.codex/
|
| 9 |
+
AGENTS.md
|
| 10 |
+
sre_env/
|
| 11 |
+
*.egg-info/
|
| 12 |
+
dist/
|
| 13 |
+
build/
|
| 14 |
+
.gemini/
|
| 15 |
+
madhav_trial/
|
| 16 |
+
*.png
|
| 17 |
+
*.npz
|
| 18 |
+
node_modules/
|
.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
.pytest_cache/
|
| 4 |
+
*.pyc
|
| 5 |
+
learning_curve.png
|
| 6 |
+
.omx/
|
| 7 |
+
.codex/
|
| 8 |
+
outputs/
|
| 9 |
+
AGENTS.md
|
| 10 |
+
.sisyphus/
|
| 11 |
+
*.egg-info/
|
| 12 |
+
uv.lock
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
+
PYTHONUNBUFFERED=1 \
|
| 7 |
+
ENABLE_WEB_INTERFACE=true
|
| 8 |
+
|
| 9 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
+
build-essential \
|
| 11 |
+
curl \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
COPY . /app
|
| 15 |
+
|
| 16 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 17 |
+
pip install --no-cache-dir .
|
| 18 |
+
|
| 19 |
+
EXPOSE 8000
|
| 20 |
+
|
| 21 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
Makefile
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Developer entry points for sre-gym. Every target is a command, not a file.
.PHONY: install dev test baseline walkthrough trainer-eval trainer-dataset trainer-session docker-build docker-run validate clean

# Editable install with the dev extras (pytest etc.).
install:
	python3 -m pip install -e ".[dev]"
	@echo "Dependencies installed"

# Local server with auto-reload and the web interface enabled.
dev:
	ENABLE_WEB_INTERFACE=true uvicorn server.app:app --reload --host 0.0.0.0 --port 8000

test:
	pytest unified_incident_env/tests -v --tb=short

baseline:
	python -m unified_incident_env.scripts.baseline_agent

# Uses the easy curriculum scenario; the scenario ids documented for this
# release are worker_deploy_cascade, db_config_rollout and gateway_auth_rollout.
walkthrough:
	python -m unified_incident_env.scripts.walkthrough --scenario worker_deploy_cascade

trainer-eval:
	python -m unified_incident_env.trainer.eval_models --models qwen2.5:0.5b gemma2:2b qwen2.5:7b-instruct-q4_K_M --mode strict

trainer-dataset:
	python -m unified_incident_env.trainer.build_sft_dataset --source combined --output outputs/trainer/sft_dataset.jsonl

trainer-session:
	python -m unified_incident_env.trainer.run_session --model qwen2.5:0.5b --base-url http://127.0.0.1:8000

docker-build:
	docker buildx build --platform linux/amd64 -t sre-env:latest .

docker-run:
	docker run -p 8000:8000 -e ENABLE_WEB_INTERFACE=true sre-env:latest

validate:
	openenv validate .

# Remove generated artefacts, including bytecode caches nested inside packages.
clean:
	rm -rf outputs __pycache__ .pytest_cache
	find . -name "__pycache__" -type d -prune -exec rm -rf {} +
	find . -name "*.pyc" -delete
|
README.md
CHANGED
|
@@ -1,10 +1,117 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: red
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: SRE Gym
|
| 3 |
+
emoji: 🚨
|
| 4 |
colorFrom: red
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 8000
|
| 8 |
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# sre-gym — Fault-injecting SRE training env for OpenEnv
|
| 13 |
+
|
| 14 |
+
Most SRE agent skills are runbooks and good intentions. **sre-gym** is the other half: a fault-injecting environment with deterministic grading where an agent diagnoses a real production-style incident, chooses a safe remediation, verifies recovery, and declares resolved. Grading is deterministic: the same run always receives the same score.
|
| 15 |
+
|
| 16 |
+
- Spec-compliant OpenEnv environment (typed Pydantic action / observation / state, `reset` / `step` / `state`, `openenv validate` green).
|
| 17 |
+
- 3 curriculum scenarios — easy, medium, hard — with decoy services and causal dependencies.
|
| 18 |
+
- 11 bounded actions. Honest state transitions. No hidden oracles.
|
| 19 |
+
- 21 tests passing.
|
| 20 |
+
- Ships a Claude Code skill + verified-runbook loop — successful solves write markdown runbooks that the next run reads back.
|
| 21 |
+
|
| 22 |
+
## 30-second demo
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
./demo/run_demo.sh
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
Starts the env, solves each scenario cold, writes a runbook for each, re-solves to prove the loop. Full transcript takes ~10 seconds.
|
| 29 |
+
|
| 30 |
+
## Curriculum
|
| 31 |
+
|
| 32 |
+
| Difficulty | Scenario | Story | Decoy | Correct path |
|
| 33 |
+
|---|---|---|---|---|
|
| 34 |
+
| easy | `worker_deploy_cascade` | Bad worker deploy → DB crash-loop → login 502s | — | rollback worker → restart db → verify → resolve |
|
| 35 |
+
| medium | `db_config_rollout` | DB config push shrank connection pool from 80→12 | recent worker deploy | rollback **db** → restart db → verify → resolve |
|
| 36 |
+
| hard | `gateway_auth_rollout` | Gateway auth-middleware rollout rejects valid logins | recent worker deploy | rollback **gateway** → verify → resolve (no restart) |
|
| 37 |
+
|
| 38 |
+
Rolling back the wrong service returns a negative reward and `failure_type="wrong_remediation_target"`. Restarting before the cause is removed re-inherits the bad state. `declare_resolved` is rejected until the scenario's resolution check passes against the actual world model.
|
| 39 |
+
|
| 40 |
+
## Install
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
# 1. Create a venv and install
|
| 44 |
+
python3 -m venv .venv && source .venv/bin/activate
|
| 45 |
+
pip install -e '.[dev]'
|
| 46 |
+
|
| 47 |
+
# 2. Start the env
|
| 48 |
+
uvicorn server.app:app --host 127.0.0.1 --port 8000
|
| 49 |
+
|
| 50 |
+
# 3. Run the baseline inference against it
|
| 51 |
+
export HF_TOKEN="…"; export ENV_BASE_URL=http://127.0.0.1:8000
|
| 52 |
+
python inference.py
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Install the Claude Code skill
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
ln -s "$PWD/skill" "$HOME/.claude/skills/sre-gym"
|
| 59 |
+
```
|
| 60 |
+
|
| 61 |
+
Then, in Claude Code, ask: *"Solve the db_config_rollout scenario in sre-gym."* The skill will drive the env via `skill/tools/sre_gym_client.py`, load any existing runbook from `skill/verified-runbooks/`, and append a fresh runbook on any clean solve (score > 0.85).
|
| 62 |
+
|
| 63 |
+
## Architecture
|
| 64 |
+
|
| 65 |
+
```
|
| 66 |
+
┌────────────────────┐ HTTP / WS ┌──────────────────────┐
|
| 67 |
+
│ Claude Code │ ──────────────────▶ │ OpenEnv server │
|
| 68 |
+
│ (with sre-gym │ ◀────────────────── │ (FastAPI, uvicorn) │
|
| 69 |
+
│ skill loaded) │ obs, reward │ unified_incident_env │
|
| 70 |
+
└────────────────────┘ └──────────────────────┘
|
| 71 |
+
│ ▲
|
| 72 |
+
▼ on clean solve (score > 0.85) │
|
| 73 |
+
┌────────────────────┐ │
|
| 74 |
+
│ verified-runbooks/ │ ────── loaded at skill load ──┘
|
| 75 |
+
│ *.md │
|
| 76 |
+
└────────────────────┘
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
## Scoring
|
| 80 |
+
|
| 81 |
+
Deterministic, 5 dimensions, sums to a public score in `[0.01, 0.99]`:
|
| 82 |
+
|
| 83 |
+
- **Recovery** (0–0.4): critical-path services healthy
|
| 84 |
+
- **Containment** (0–0.3): root cause removed or offending service isolated
|
| 85 |
+
- **Verification** (0–0.35): `database_recovery` + `end_to_end` checks passed
|
| 86 |
+
- **Impact** (0–0.15): user-impact reduced
|
| 87 |
+
- **Efficiency** (0–0.10): budget preserved, no wasteful repeats
|
| 88 |
+
|
| 89 |
+
Target **> 0.85** for "clean solve." That's also the runbook-record threshold.
|
| 90 |
+
|
| 91 |
+
## Repo layout
|
| 92 |
+
|
| 93 |
+
```
|
| 94 |
+
unified_incident_env/ # env core: models, environment, grader, challenge, tests
|
| 95 |
+
server/ # OpenEnv entrypoint wrapper
|
| 96 |
+
skill/ # Claude Code skill: SKILL.md, tools/, verified-runbooks/
|
| 97 |
+
demo/ # run_demo.sh + pitch.md
|
| 98 |
+
inference.py # OpenAI-client baseline for OpenEnv hackathon submission
|
| 99 |
+
openenv.yaml # OpenEnv manifest
|
| 100 |
+
Dockerfile # HF Space deployment
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
## Verify
|
| 104 |
+
|
| 105 |
+
```bash
|
| 106 |
+
pytest unified_incident_env/tests -q # 21 tests
|
| 107 |
+
python -m openenv.cli validate . # OpenEnv manifest check
|
| 108 |
+
docker build -t sre-engineer-llm:v2 . # HF Space image
|
| 109 |
+
```
|
| 110 |
+
|
| 111 |
+
## Roadmap — v2
|
| 112 |
+
|
| 113 |
+
Distill the accumulated `verified-runbooks/` corpus into a local 3B reviewer via [OpenClaw-RL](https://github.com/Gen-Verse/OpenClaw-RL)'s async GRPO-on-next-state loop. Same reward contract (`run_check` passes / `failure_type` absent), same grader, but a compact policy that runs without a frontier API.
|
| 114 |
+
|
| 115 |
+
## License
|
| 116 |
+
|
| 117 |
+
Apache 2.0
|
demo/pitch.md
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# sre-gym — 60-second pitch
|
| 2 |
+
|
| 3 |
+
> You can't train SRE agents on production. We built the gym.
|
| 4 |
+
|
| 5 |
+
## The story (00:00–01:00)
|
| 6 |
+
|
| 7 |
+
**[0:00–0:10 · Hook]** "Most SRE agent skills are prompts — a runbook and a good intention. We built the other half: a fault-injecting environment with deterministic grading, where every run is scored the same way twice."
|
| 8 |
+
|
| 9 |
+
**[0:10–0:25 · What it is]**
|
| 10 |
+
- OpenEnv-compliant. `openenv validate` passes.
|
| 11 |
+
- Three curriculum scenarios, easy → hard:
|
| 12 |
+
- **easy** `worker_deploy_cascade` — bad worker deploy cascades to a DB crash.
|
| 13 |
+
- **medium** `db_config_rollout` — DB config shrank the connection pool; a recent worker deploy is a decoy.
|
| 14 |
+
- **hard** `gateway_auth_rollout` — bad auth-middleware rollout; two plausible suspects, one right answer.
|
| 15 |
+
- 11 bounded actions, honest state transitions (rolling back the wrong thing *fails*), deterministic grader across recovery / containment / verification / impact / efficiency.
|
| 16 |
+
- 21 tests passing. One public Space URL.
|
| 17 |
+
|
| 18 |
+
**[0:25–0:55 · Live demo]** `./demo/run_demo.sh`
|
| 19 |
+
- Env starts. Three scenarios visible in `/tasks`.
|
| 20 |
+
- Runbook dir cleared; demo starts cold.
|
| 21 |
+
- Each scenario solves end-to-end (score ≈ 0.99, 8–10 steps).
|
| 22 |
+
- A markdown runbook is written per scenario from the successful trace.
|
| 23 |
+
- Re-solve the easy scenario — this time the skill loads the runbook first. Same score, same path, zero wasted investigation.
|
| 24 |
+
- Point to `skill/verified-runbooks/` — "Every clean solve makes the next one deterministic. No GRPO required for v1."
|
| 25 |
+
|
| 26 |
+
**[0:55–1:00 · Close]** "Install the skill by symlinking `skill/` into `~/.claude/skills/sre-gym`. Open source, Apache 2. v2 is the OpenClaw-RL loop — distill this corpus of verified runbooks into a local 3B reviewer."
|
| 27 |
+
|
| 28 |
+
## The one technical claim you should be ready to defend
|
| 29 |
+
|
| 30 |
+
> "The env is honest."
|
| 31 |
+
|
| 32 |
+
- No hidden oracles. Rolling back the wrong service returns a negative reward and `failure_type="wrong_remediation_target"` — same observation contract as any other action.
|
| 33 |
+
- `declare_resolved` is rejected until the scenario's `resolution_check` passes, verified by actual service states in the world model, not a flag the grader peeks at.
|
| 34 |
+
- Rewards are paid for *effects*, not for evidence-gathering — you can't farm the env by spamming `query_logs`.
|
| 35 |
+
- `restart_service` on the database before the root cause is removed returns a negative reward. Always. Because in the real world, it would crash again.
|
| 36 |
+
|
| 37 |
+
## Judge Q&A cheat sheet
|
| 38 |
+
|
| 39 |
+
**"How is this different from running a real staging env?"**
|
| 40 |
+
Deterministic scoring. Every agent gets graded against the same signatures, same decoys, same tick budget. You can't do that on real infra.
|
| 41 |
+
|
| 42 |
+
**"Why only three scenarios?"**
|
| 43 |
+
Three clears the hackathon disqualification gate (`easy/medium/hard`). Every scenario has a causal chain, and the medium and hard ones add a decoy — building another one is a data-entry exercise, not a design one. Adding scenarios #4–#20 is the v2 data scaling lane.
|
| 44 |
+
|
| 45 |
+
**"Why runbooks instead of GRPO?"**
|
| 46 |
+
For this submission, GRPO means 48 hours of training convergence risk on top of an env we just shipped. Markdown runbooks demonstrate the same loop (verified signal → persisted artefact → next run improves) in an auditable form. The GRPO wiring slots on top of the same traces when we're ready.
|
| 47 |
+
|
| 48 |
+
**"What's the skill actually doing at runtime?"**
|
| 49 |
+
The skill lives in `skill/SKILL.md`. It directs Claude (or any agent) to read `verified-runbooks/{scenario}.md` before the first action, drive the env through `skill/tools/sre_gym_client.py`, and append a fresh runbook on any solve with `final_score > 0.85`.
|
demo/run_demo.sh
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# sre-gym end-to-end demo.
# Spins up the env (or reuses a running one), solves each of the 3 scenarios
# with the baseline policy, records runbooks, shows the artefacts.
#
# Requires: python3.10+ with the package installed (the repo's .venv is used
# when present, otherwise system python3) and curl for the health probe.
#
# Environment overrides:
#   PORT    port for the local env server (default 8013)
#   PYTHON  interpreter path or command name (default .venv/bin/python)
#
# NOTE: step 3 deletes every *.md under skill/verified-runbooks/ so the demo
# starts cold.

set -euo pipefail
cd "$(dirname "$0")/.."

PORT="${PORT:-8013}"
URL="http://127.0.0.1:${PORT}"
PY="${PYTHON:-.venv/bin/python}"
RUNBOOK_DIR="skill/verified-runbooks"
LOG_FILE="/tmp/sre_gym_demo.log"
SERVER_STARTED=0
SERVER_PID=""

banner() { printf '\n\033[1;36m== %s ==\033[0m\n' "$*"; }
ok() { printf '\033[0;32m ✓ %s\033[0m\n' "$*"; }

# Stop the server only if this script launched it. Registered before the
# launch so that a failed start-up does not leave uvicorn running.
cleanup() {
  if [[ "$SERVER_STARTED" -eq 1 && -n "$SERVER_PID" ]]; then
    kill "$SERVER_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT

banner "0 / preflight"
# command -v accepts both an executable path and a bare command name on PATH.
if ! command -v "$PY" > /dev/null 2>&1; then
  echo " note: $PY not found, falling back to system python3" >&2
  PY="python3"
fi
command -v curl > /dev/null 2>&1 || {
  echo " error: curl is required for the health probe" >&2
  exit 1
}
"$PY" -c "import unified_incident_env" 2>/dev/null || {
  echo " error: unified_incident_env not importable; run 'pip install -e .' first" >&2
  exit 1
}
ok "python + package ready"

banner "1 / start env"
if curl -sf "$URL/health" > /dev/null 2>&1; then
  ok "env already running on $URL"
else
  "$PY" -m uvicorn server.app:app --host 127.0.0.1 --port "$PORT" > "$LOG_FILE" 2>&1 &
  SERVER_PID=$!
  SERVER_STARTED=1
  # Poll the health endpoint for up to ~6 seconds (20 attempts x 0.3 s).
  for _ in $(seq 1 20); do
    if curl -sf "$URL/health" > /dev/null 2>&1; then break; fi
    sleep 0.3
  done
  curl -sf "$URL/health" > /dev/null || { echo " error: env failed to start" >&2; cat "$LOG_FILE" >&2; exit 1; }
  ok "env started on $URL (pid $SERVER_PID)"
fi

banner "2 / available scenarios"
SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py list

banner "3 / clear prior runbooks (demo starts cold)"
rm -f "$RUNBOOK_DIR"/*.md
ok "runbook directory cleared"

for scenario in worker_deploy_cascade db_config_rollout gateway_auth_rollout; do
  banner "4 / solve: $scenario"
  SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py solve "$scenario"
  SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py record-runbook "$scenario"
done

banner "5 / verified runbooks now on disk"
ls -1 "$RUNBOOK_DIR"/*.md | sed 's|^| |'

banner "6 / re-solve easy scenario — runbook is loaded this time"
SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py solve worker_deploy_cascade | tail -4

banner "done"
echo " install the skill globally: ln -s \"$PWD/skill\" \"\$HOME/.claude/skills/sre-gym\""
echo " env log: $LOG_FILE"
echo " runbooks: $RUNBOOK_DIR/"
|
deploy/push_to_hf.sh
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Deploy this repo to a Hugging Face Space (Docker SDK).
#
# Required:
#   HF_TOKEN     write-scoped HF access token
#   HF_SPACE_ID  e.g. yourname/sre-gym (create it at huggingface.co/new-space
#                first, SDK=Docker, or let this script try to create it)
#
# Usage:
#   HF_TOKEN=hf_xxx HF_SPACE_ID=yourname/sre-gym ./deploy/push_to_hf.sh
#
# After a successful push, verify from a different network, using the
# subdomain this script prints at the end:
#   curl https://<subdomain>.hf.space/health
#   curl https://<subdomain>.hf.space/tasks | jq '.scenarios[].difficulty'

set -euo pipefail
cd "$(dirname "$0")/.."

: "${HF_TOKEN:?HF_TOKEN is required}"
: "${HF_SPACE_ID:?HF_SPACE_ID is required, e.g. yourname/sre-gym}"

if ! command -v huggingface-cli > /dev/null; then
  echo "error: huggingface-cli not installed. pip install 'huggingface_hub[cli]'" >&2
  exit 1
fi

echo "== syncing openenv.yaml with HF_SPACE_ID =="
# The heredoc is quoted so the shell never interpolates into Python source;
# the space id reaches Python through the environment instead.
HF_SPACE_ID="$HF_SPACE_ID" python3 - <<'PY'
import os
import pathlib
import re
import sys

space_id = os.environ["HF_SPACE_ID"]
path = pathlib.Path("openenv.yaml")
text = path.read_text()
# Keep whatever indentation the key already has. A callable replacement keeps
# any backslashes in the id literal.
text, count = re.subn(
    r"^([ \t]*)space_id:.*$",
    lambda match: f"{match.group(1)}space_id: {space_id}",
    text,
    flags=re.M,
)
if count == 0:
    print("warning: no space_id entry found in openenv.yaml; file left unchanged", file=sys.stderr)
else:
    path.write_text(text)
    print(f"openenv.yaml space_id -> {space_id}")
PY

echo "== ensuring the space exists (idempotent) =="
# Best effort: output mentioning "already created" is filtered out and a
# failure here is ignored, so a real problem (e.g. a bad token) only surfaces
# at the upload step below.
huggingface-cli repo create "$HF_SPACE_ID" \
  --type space \
  --space_sdk docker \
  --token "$HF_TOKEN" \
  --yes 2>&1 | grep -v "already created" || true

echo "== uploading repo =="
huggingface-cli upload "$HF_SPACE_ID" . \
  --repo-type space \
  --token "$HF_TOKEN" \
  --commit-message "deploy sre-gym v2 (easy/medium/hard scenarios)"

# NOTE(review): assumes Space hostnames are the lower-cased "owner-name" with
# other punctuation replaced by dashes — confirm against the Space's Embed panel.
subdomain="$(printf '%s' "$HF_SPACE_ID" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]/-/g')"
echo
echo "== deployment kicked off =="
echo " Logs: https://huggingface.co/spaces/$HF_SPACE_ID"
echo " Public: https://$subdomain.hf.space"
echo
echo "== verify from a different network (phone hotspot) =="
echo " curl https://$subdomain.hf.space/health"
echo " curl https://$subdomain.hf.space/tasks | jq '.scenarios[].difficulty'"
|
execution.md
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# How To Run (v2)
|
| 2 |
+
|
| 3 |
+
## 1. Setup
|
| 4 |
+
|
| 5 |
+
```bash
|
| 6 |
+
python3 -m venv .venv
|
| 7 |
+
source .venv/bin/activate
|
| 8 |
+
pip install -e '.[dev]'
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
## 2. Start the environment
|
| 12 |
+
|
| 13 |
+
```bash
|
| 14 |
+
source .venv/bin/activate
|
| 15 |
+
uvicorn server.app:app --host 127.0.0.1 --port 8000
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
## 3. Manual API smoke test
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
curl -X POST http://127.0.0.1:8000/reset -H 'content-type: application/json' -d '{}'
|
| 22 |
+
curl -X POST http://127.0.0.1:8000/step -H 'content-type: application/json' -d '{"action":{"action_type":"query_deploys","service":"worker"}}'
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
## 4. Run inference
|
| 26 |
+
|
| 27 |
+
```bash
|
| 28 |
+
source .venv/bin/activate
|
| 29 |
+
|
| 30 |
+
export HF_TOKEN="your_hf_token"
|
| 31 |
+
export API_BASE_URL="https://router.huggingface.co/v1"
|
| 32 |
+
export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct:novita"
|
| 33 |
+
export ENV_BASE_URL="http://127.0.0.1:8000"
|
| 34 |
+
|
| 35 |
+
python inference.py
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## 5. Verification
|
| 39 |
+
|
| 40 |
+
```bash
|
| 41 |
+
source .venv/bin/activate
|
| 42 |
+
pytest unified_incident_env/tests -q
|
| 43 |
+
openenv validate .
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
## 6. Reward semantics
|
| 47 |
+
|
| 48 |
+
- queries reveal evidence but do not, by themselves, earn positive reward
|
| 49 |
+
- remediation actions change the world state
|
| 50 |
+
- `run_check` verifies recovery explicitly
|
| 51 |
+
- `declare_resolved` succeeds only after objective checks pass
|
| 52 |
+
|
| 53 |
+
Public benchmark score is deterministic and separate from the per-step training reward.
|
inference.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Submission inference script for the honest narrow incident environment."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
|
| 12 |
+
from unified_incident_env.client import UnifiedIncidentEnv
|
| 13 |
+
from unified_incident_env.models import UnifiedIncidentAction, UnifiedIncidentObservation
|
| 14 |
+
from unified_incident_env.server.challenge import DEFAULT_SCENARIO_ID, SCENARIOS
|
| 15 |
+
|
| 16 |
+
API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
|
| 17 |
+
MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct:novita")
|
| 18 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 19 |
+
ENV_BASE_URL = os.getenv("ENV_BASE_URL") or UnifiedIncidentEnv.DEFAULT_BASE_URL
|
| 20 |
+
ENV_NAME = "unified-incident-env"
|
| 21 |
+
MAX_TOKENS = 260
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def create_client() -> OpenAI | None:
    """Build the OpenAI-compatible client used to query the model.

    Returns:
        An ``OpenAI`` client configured with ``API_BASE_URL`` and
        ``HF_TOKEN``, or ``None`` when ``HF_TOKEN`` is unset or empty.
    """
    if HF_TOKEN:
        return OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    return None
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def log_start(*, task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line for one episode and flush stdout.

    Args:
        task: Value printed after ``task=``.
        env: Value printed after ``env=``.
        model: Value printed after ``model=``.
    """
    fields = f"task={task} env={env} model={model}"
    print(f"[START] {fields}", flush=True)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def log_step(*, step: int, action: str, reward: float, done: bool, error: str | None) -> None:
    """Print the ``[STEP]`` line for one environment step and flush stdout.

    Args:
        step: Value printed after ``step=``.
        action: Value printed after ``action=``.
        reward: Step reward, printed with two decimals.
        done: Printed as the lower-cased ``str()`` of the value.
        error: Error text; a falsy value is printed as ``null``.
    """
    done_text = str(done).lower()
    error_text = error or "null"
    line = f"[STEP] step={step} action={action} reward={reward:.2f} done={done_text} error={error_text}"
    print(line, flush=True)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def log_end(*, success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Print the ``[END]`` summary line for one episode and flush stdout.

    Args:
        success: Printed as the lower-cased ``str()`` of the value.
        steps: Value printed after ``steps=``.
        score: Printed with two decimals.
        rewards: Rewards printed comma-separated, two decimals each.
    """
    success_text = str(success).lower()
    joined_rewards = ",".join(format(value, ".2f") for value in rewards)
    summary = f"[END] success={success_text} steps={steps} score={score:.2f} rewards={joined_rewards}"
    print(summary, flush=True)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _service_order(observation: UnifiedIncidentObservation) -> list[str]:
|
| 47 |
+
services = list(observation.service_health.items())
|
| 48 |
+
services.sort(key=lambda item: (item[1].status != "crashed", item[1].status != "degraded", -item[1].error_rate_pct))
|
| 49 |
+
return [name for name, _payload in services]
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _default_action_for_type(action_type: str, observation: UnifiedIncidentObservation) -> dict[str, Any]:
    """Build a default action payload for ``action_type``.

    Args:
        action_type: The action name to build a payload for.
        observation: Current observation; its ``service_health`` drives the
            default target service and the default check name.

    Returns:
        A dict containing ``action_type`` plus default values for the extra
        fields that action uses. Unrecognised action types get only
        ``action_type``.
    """
    ranked = _service_order(observation)
    # Target the worst-ranked service; fall back to "database" when none are reported.
    target = ranked[0] if ranked else "database"

    service_scoped = {
        "query_logs",
        "query_dependencies",
        "query_deploys",
        "rollback_deploy",
        "restart_service",
        "isolate_service",
    }
    if action_type in service_scoped:
        # Rollbacks always default to the worker, regardless of the ranking.
        chosen = "worker" if action_type == "rollback_deploy" else target
        return {"action_type": action_type, "service": chosen}

    if action_type == "query_metrics":
        return {"action_type": action_type, "service": target, "metric": "cpu"}

    if action_type == "run_check":
        database = observation.service_health.get("database")
        # Once the database reports healthy, move on to the end-to-end check.
        if database and database.status == "healthy":
            check = "end_to_end"
        else:
            check = "database_recovery"
        return {"action_type": action_type, "check_name": check}

    if action_type == "submit_hypothesis":
        hypothesis = {
            "root_cause": "bad_worker_deploy",
            "affected_services": ["worker", "database"],
            "confidence": 0.5,
            "recommended_next_action": "query_deploys",
        }
        return {"action_type": "submit_hypothesis", "hypothesis": hypothesis}

    return {"action_type": action_type}
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def parse_action(raw: str, observation: UnifiedIncidentObservation) -> UnifiedIncidentAction | None:
    """Parse a raw model reply into an action, or return ``None`` if unusable.

    The reply must be a JSON object. A string-valued ``"action"`` key is
    accepted as an alias for ``"action_type"`` when the latter is absent.
    The resulting ``action_type`` must be listed in
    ``observation.allowed_actions``.

    Args:
        raw: Text returned by the model.
        observation: Current observation supplying ``allowed_actions``.

    Returns:
        A ``UnifiedIncidentAction`` built from the decoded fields, or ``None``
        when the text is blank, is not a JSON object, names a disallowed
        action, or the action constructor raises.
    """
    payload = raw.strip()
    if not payload:
        return None

    try:
        decoded = json.loads(payload)
    except Exception:
        return None
    if not isinstance(decoded, dict):
        return None

    alias = decoded.get("action")
    if "action_type" not in decoded and isinstance(alias, str):
        # Move the alias onto "action_type" and drop the "action" key.
        decoded = {key: value for key, value in decoded.items() if key != "action"}
        decoded["action_type"] = alias

    if decoded.get("action_type") not in observation.allowed_actions:
        return None

    try:
        return UnifiedIncidentAction(**decoded)
    except Exception:
        return None
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def build_user_prompt(observation: UnifiedIncidentObservation) -> str:
    """Render the observation as the plain-text user message for the model.

    The prompt lists the workflow stage, incident summary, score, last result,
    tool output, failure details, impact metrics, the allowed actions, the
    extra fields each action requires, and the status of every check. Empty
    or missing text fields are shown as ``none``.
    """
    field_lines = [
        f"- {name}: {', '.join(needed) if needed else '(no extra fields)'}"
        for name, needed in observation.required_fields_by_action.items()
    ]
    check_lines = [
        f"- {item.name}: {'passed' if item.passed else 'pending'} - {item.detail}"
        for item in observation.checks
    ]
    checks_text = "\n".join(check_lines) or "- none"
    sections = [
        "Return exactly one JSON object representing the next action.",
        f"Current stage: {observation.workflow_stage}",
        f"Incident summary: {observation.incident_summary}",
        f"Current score: {observation.final_score:.4f}",
        f"Last action result: {observation.last_action_result or 'none'}",
        f"Tool output: {observation.tool_output or 'none'}",
        f"Failure: {observation.failure_type or 'none'}",
        f"Why failed: {observation.why_failed or 'none'}",
        f"User impact: {observation.user_impact:.2f}",
        f"SLO burn rate: {observation.slo_burn_rate:.2f}",
        "Allowed actions:",
        "\n".join(f"- {name}" for name in observation.allowed_actions),
        "Required fields:",
        "\n".join(field_lines),
        "Checks:",
        checks_text,
    ]
    return "\n".join(sections)
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def _schema(observation: UnifiedIncidentObservation) -> dict[str, Any]:
|
| 130 |
+
properties: dict[str, Any] = {
|
| 131 |
+
"action_type": {"type": "string", "enum": observation.allowed_actions},
|
| 132 |
+
"service": {"type": "string", "enum": sorted(observation.service_health)},
|
| 133 |
+
"metric": {"type": "string", "enum": ["cpu", "error_rate", "latency"]},
|
| 134 |
+
"check_name": {"type": "string", "enum": ["database_recovery", "end_to_end"]},
|
| 135 |
+
"hypothesis": {
|
| 136 |
+
"type": "object",
|
| 137 |
+
"properties": {
|
| 138 |
+
"root_cause": {"type": "string", "enum": ["bad_worker_deploy", "database_only_failure", "api_gateway_fault"]},
|
| 139 |
+
"affected_services": {
|
| 140 |
+
"type": "array",
|
| 141 |
+
"items": {"type": "string", "enum": sorted(observation.service_health)},
|
| 142 |
+
"minItems": 1,
|
| 143 |
+
},
|
| 144 |
+
"confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
|
| 145 |
+
"recommended_next_action": {
|
| 146 |
+
"type": "string",
|
| 147 |
+
"enum": [
|
| 148 |
+
"query_logs",
|
| 149 |
+
"query_metrics",
|
| 150 |
+
"query_dependencies",
|
| 151 |
+
"query_deploys",
|
| 152 |
+
"rollback_deploy",
|
| 153 |
+
"restart_service",
|
| 154 |
+
"run_check",
|
| 155 |
+
"isolate_service",
|
| 156 |
+
"escalate",
|
| 157 |
+
"declare_resolved",
|
| 158 |
+
],
|
| 159 |
+
},
|
| 160 |
+
},
|
| 161 |
+
"required": ["root_cause", "affected_services", "confidence", "recommended_next_action"],
|
| 162 |
+
"additionalProperties": False,
|
| 163 |
+
},
|
| 164 |
+
}
|
| 165 |
+
required = ["action_type"]
|
| 166 |
+
for action, fields in observation.required_fields_by_action.items():
|
| 167 |
+
if action in observation.allowed_actions:
|
| 168 |
+
for field in fields:
|
| 169 |
+
if field not in required:
|
| 170 |
+
required.append(field)
|
| 171 |
+
return {
|
| 172 |
+
"type": "object",
|
| 173 |
+
"properties": properties,
|
| 174 |
+
"required": required,
|
| 175 |
+
"additionalProperties": False,
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def request_action(client: OpenAI, observation: UnifiedIncidentObservation) -> str:
    """Ask the chat model for the next action and return its raw text reply.

    The request pairs a fixed system instruction with the rendered observation
    prompt and asks for a strict ``json_schema`` response built by ``_schema``.
    Returns the first choice's content stripped of surrounding whitespace, or
    an empty string when the content is missing.
    """
    conversation = [
        {"role": "system", "content": "You are an incident responder. Respond with JSON only."},
        {"role": "user", "content": build_user_prompt(observation)},
    ]
    structured_output = {
        "type": "json_schema",
        "json_schema": {
            "name": "incident_action",
            "strict": True,
            "schema": _schema(observation),
        },
    }
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=conversation,
        response_format=structured_output,
        max_tokens=MAX_TOKENS,
        temperature=0.0,
    )
    reply = completion.choices[0].message.content
    return (reply or "").strip()
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def build_fallback_action(observation: UnifiedIncidentObservation) -> UnifiedIncidentAction:
    """Pick a heuristic action when no usable model reply is available.

    Preference order: query the worker's deploys (if allowed and a ``worker``
    service exists), then query logs, then query CPU metrics for the first
    service from ``_service_order`` (``database`` when that list is empty).
    Otherwise the first allowed action is filled in with default arguments.
    """
    ranked = _service_order(observation)
    permitted = observation.allowed_actions
    primary_service = ranked[0] if ranked else "database"

    if "query_deploys" in permitted and "worker" in observation.service_health:
        return UnifiedIncidentAction(action_type="query_deploys", service="worker")
    if "query_logs" in permitted:
        return UnifiedIncidentAction(action_type="query_logs", service=primary_service)
    if "query_metrics" in permitted:
        return UnifiedIncidentAction(action_type="query_metrics", service=primary_service, metric="cpu")

    defaults = _default_action_for_type(permitted[0], observation)
    return UnifiedIncidentAction(**defaults)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def get_model_action(client: OpenAI | None, observation: UnifiedIncidentObservation) -> tuple[UnifiedIncidentAction, str | None]:
    """Return the next action plus a note on how it was obtained.

    The second tuple element is ``None`` when the model produced a valid
    action, ``"model_unavailable"`` when there is no client, and
    ``"fallback_used"`` when the request failed or the reply could not be
    parsed (the heuristic fallback action is returned in both latter cases).
    """
    if client is None:
        return build_fallback_action(observation), "model_unavailable"

    try:
        raw_reply = request_action(client, observation)
        candidate = parse_action(raw_reply, observation)
    except Exception:
        # Best effort by design: any request/parse failure degrades to the
        # heuristic fallback instead of aborting the episode.
        candidate = None

    if candidate is not None:
        return candidate, None
    return build_fallback_action(observation), "fallback_used"
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def run_scenario(client: OpenAI | None, scenario_id: str) -> dict[str, Any]:
    """Play one scenario to completion and return its summary.

    Opens a synchronous environment session, resets it to ``scenario_id`` and
    keeps stepping with actions from ``get_model_action`` until the observation
    reports ``done``. Progress is emitted through ``log_start`` / ``log_step`` /
    ``log_end``.

    Args:
        client: Chat client, or ``None`` to run on fallback actions only.
        scenario_id: Scenario passed to ``env.reset``.

    Returns:
        A dict with ``success`` (episode done and incident resolved), ``score``
        (final score of the last observation), ``steps`` and ``rewards``.
    """
    with UnifiedIncidentEnv(base_url=ENV_BASE_URL).sync() as env:
        observation = env.reset(scenario_id=scenario_id).observation
        rewards: list[float] = []
        step = 0
        log_start(task=scenario_id, env=ENV_NAME, model=MODEL_NAME)
        while not observation.done:
            step += 1
            action, error = get_model_action(client, observation)
            result = env.step(action)
            observation = result.observation
            # NOTE(review): the step result's reward is presumably optional in
            # the client library; treat a missing reward as 0.0 rather than
            # crashing on float(None) — confirm against the client type.
            reward = 0.0 if result.reward is None else float(result.reward)
            rewards.append(reward)
            log_step(
                step=step,
                action=json.dumps(action.model_dump(exclude_none=True), separators=(",", ":")),
                reward=reward,
                done=bool(result.done),
                # Prefer the client-side note (fallback/model unavailable) over
                # the environment's failure type.
                error=error or observation.failure_type,
            )
        success = bool(observation.done and observation.incident_resolved)
        log_end(
            success=success,
            steps=step,
            score=observation.final_score,
            rewards=rewards,
        )
        return {
            "success": success,
            "score": observation.final_score,
            "steps": step,
            "rewards": rewards,
        }
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def main() -> None:
    """Create the model client once and run every scenario in ``SCENARIOS`` in order."""
    client = create_client()
    for scenario_id in SCENARIOS:
        run_scenario(client, scenario_id)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
if __name__ == "__main__":
|
| 264 |
+
main()
|
openenv.yaml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: sre-engineer-llm
|
| 2 |
+
version: 2.0.0
|
| 3 |
+
description: >
|
| 4 |
+
Honest narrow OpenEnv benchmark for incident diagnosis and safe remediation.
|
| 5 |
+
Agents query evidence, choose bounded remediation actions, run explicit checks,
|
| 6 |
+
and declare resolution only after objective recovery succeeds.
|
| 7 |
+
author: Daksh Verma
|
| 8 |
+
license: MIT
|
| 9 |
+
|
| 10 |
+
environment:
|
| 11 |
+
action_type: UnifiedIncidentAction
|
| 12 |
+
observation_type: UnifiedIncidentObservation
|
| 13 |
+
state_type: UnifiedIncidentState
|
| 14 |
+
max_steps: 12
|
| 15 |
+
difficulties: [easy, medium, hard]
|
| 16 |
+
reward_type: dense
|
| 17 |
+
|
| 18 |
+
huggingface:
|
| 19 |
+
space_id: dakshdoesdev/sre-gym
|
| 20 |
+
sdk: docker
|
| 21 |
+
hardware: cpu-basic
|
pyproject.toml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "unified-incident-env"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "Unified OpenEnv benchmark for incident response with causally linked WebSec remediation"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
requires-python = ">=3.10"
|
| 11 |
+
dependencies = [
|
| 12 |
+
"openenv-core>=0.2.1",
|
| 13 |
+
"fastapi>=0.115.0",
|
| 14 |
+
"uvicorn[standard]>=0.30.0",
|
| 15 |
+
"pydantic>=2.8.0",
|
| 16 |
+
"httpx>=0.27.0",
|
| 17 |
+
"openai>=1.0.0",
|
| 18 |
+
"websockets>=12.0",
|
| 19 |
+
"rich>=13.0.0",
|
| 20 |
+
"matplotlib>=3.9.0",
|
| 21 |
+
"numpy>=2.0.0"
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
[project.optional-dependencies]
|
| 25 |
+
dev = [
|
| 26 |
+
"pytest>=8.0.0",
|
| 27 |
+
"pytest-asyncio>=0.23.0"
|
| 28 |
+
]
|
| 29 |
+
|
| 30 |
+
[project.scripts]
|
| 31 |
+
server = "server.app:main"
|
| 32 |
+
baseline = "unified_incident_env.scripts.baseline_agent:main"
|
| 33 |
+
walkthrough = "unified_incident_env.scripts.walkthrough:main"
|
| 34 |
+
trainer-run-episode = "unified_incident_env.trainer.run_episode:main"
|
| 35 |
+
trainer-build-dataset = "unified_incident_env.trainer.build_sft_dataset:main"
|
| 36 |
+
trainer-eval-models = "unified_incident_env.trainer.eval_models:main"
|
| 37 |
+
trainer-build-datasets = "unified_incident_env.trainer.build_datasets:main"
|
| 38 |
+
trainer-update-model = "unified_incident_env.trainer.update_model:main"
|
| 39 |
+
trainer-run-session = "unified_incident_env.trainer.run_session:main"
|
| 40 |
+
trainer-train-external = "unified_incident_env.trainer.train_external:main"
|
| 41 |
+
|
| 42 |
+
[tool.hatch.build.targets.wheel]
|
| 43 |
+
packages = ["unified_incident_env", "server"]
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
-e .
|
run_demo.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Run a local end-to-end benchmark demo against the OpenEnv server."""
|
| 3 |
+
|
| 4 |
+
from __future__ import annotations
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import signal
|
| 8 |
+
import subprocess
|
| 9 |
+
import sys
|
| 10 |
+
import time
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
import httpx
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
REPO_ROOT = Path(__file__).resolve().parent
|
| 17 |
+
BASE_URL = os.getenv("ENV_BASE_URL", "http://127.0.0.1:8000")
|
| 18 |
+
HEALTH_URL = f"{BASE_URL.rstrip('/')}/health"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def server_is_ready() -> bool:
    """Return ``True`` when a GET to ``HEALTH_URL`` answers 200 within two seconds.

    Any error raised by the request is treated as "not ready".
    """
    try:
        status = httpx.get(HEALTH_URL, timeout=2.0).status_code
    except Exception:
        return False
    return status == 200
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def start_server() -> subprocess.Popen[str]:
    """Launch ``uvicorn server.app:app`` as a child process and return its handle.

    The bind address is taken from ``BASE_URL`` so the server listens where the
    readiness probe and the inference run will look for it. When ``BASE_URL``
    carries no usable host or port, the historical ``127.0.0.1`` / ``8000``
    defaults are used. The process runs with the repository root as its
    working directory.
    """
    from urllib.parse import urlsplit

    target = urlsplit(BASE_URL)
    try:
        port = target.port
    except ValueError:
        # urlsplit raises on a malformed or out-of-range port; use the default.
        port = None
    host = target.hostname or "127.0.0.1"

    command = [
        sys.executable,
        "-m",
        "uvicorn",
        "server.app:app",
        "--host",
        host,
        "--port",
        str(port or 8000),
    ]
    return subprocess.Popen(command, cwd=REPO_ROOT, text=True)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def wait_for_server(timeout_s: float = 20.0) -> None:
    """Block until ``server_is_ready()`` succeeds or ``timeout_s`` seconds elapse.

    Polls every half second.

    Args:
        timeout_s: Maximum time to wait, in seconds.

    Raises:
        RuntimeError: If the server is still not ready when the deadline passes.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if server_is_ready():
            return
        time.sleep(0.5)
    raise RuntimeError(f"Server did not become ready at {HEALTH_URL}")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def stop_server(process: subprocess.Popen[str]) -> None:
    """Stop a server child process, escalating from SIGTERM to kill.

    Does nothing if the process has already exited. Otherwise sends SIGTERM and
    waits up to ten seconds; if the process is still alive it is killed and
    then reaped so no zombie is left behind.
    """
    if process.poll() is not None:
        return
    process.send_signal(signal.SIGTERM)
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
        # Collect the exit status after the forced kill so the child is reaped.
        process.wait()
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def main() -> None:
    """Run ``inference.py`` against the environment server.

    A local server is started only when the health probe fails, and only a
    server started here is stopped afterwards. ``ENV_BASE_URL`` is passed to
    the inference process unless the caller's environment already sets it.
    """
    spawned: subprocess.Popen[str] | None = None
    try:
        if not server_is_ready():
            spawned = start_server()
            wait_for_server()

        child_env = dict(os.environ)
        child_env.setdefault("ENV_BASE_URL", BASE_URL)
        inference_command = [sys.executable, "inference.py"]
        subprocess.run(inference_command, cwd=REPO_ROOT, env=child_env, check=True)
    finally:
        if spawned is not None:
            stop_server(spawned)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == "__main__":
|
| 86 |
+
main()
|
server/Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
+
PYTHONUNBUFFERED=1 \
|
| 7 |
+
ENABLE_WEB_INTERFACE=true
|
| 8 |
+
|
| 9 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
+
build-essential \
|
| 11 |
+
curl \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
COPY . /app
|
| 15 |
+
|
| 16 |
+
RUN pip install --no-cache-dir --upgrade pip && \
|
| 17 |
+
pip install --no-cache-dir .
|
| 18 |
+
|
| 19 |
+
EXPOSE 8000
|
| 20 |
+
|
| 21 |
+
CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv server wrapper package."""
|
server/app.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Top-level OpenEnv entrypoint wrapper."""
|
| 2 |
+
|
| 3 |
+
from unified_incident_env.server.app import app, serve
|
| 4 |
+
from unified_incident_env.server.app import main as _main
|
| 5 |
+
|
| 6 |
+
__all__ = ["app", "main", "serve"]
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def main() -> None:
    """Delegate to ``unified_incident_env.server.app.main`` (imported as ``_main``)."""
    _main()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
if __name__ == "__main__":
|
| 14 |
+
main()
|
server/requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
openenv-core>=0.2.1
|
| 2 |
+
fastapi>=0.115.0
|
| 3 |
+
uvicorn[standard]>=0.30.0
|
| 4 |
+
pydantic>=2.8.0
|
| 5 |
+
websockets>=12.0
|
| 6 |
+
openai>=1.0.0
|
| 7 |
+
matplotlib>=3.9.0
|
| 8 |
+
numpy>=2.0.0
|
skill/SKILL.md
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
name: sre-gym
|
| 3 |
+
description: SRE incident-response training environment with fault injection and deterministic grading. Use when the user wants to practice SRE skills, solve an injected production incident, or run one of three scenarios (worker_deploy_cascade / db_config_rollout / gateway_auth_rollout) against the sre-gym HTTP server. Invokes scripts in skill/tools/ to query the env and records verified runbooks after clean solves.
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# SRE Gym — Incident Response Skill
|
| 7 |
+
|
| 8 |
+
You are an SRE agent connected to a running sre-gym environment (HTTP, default `http://127.0.0.1:8000`). The env simulates production incidents with decoy services, deterministic grading, and explicit resolution checks. Your job is to diagnose from evidence, pick the correct remediation, verify recovery, then declare resolved.
|
| 9 |
+
|
| 10 |
+
## When to use this skill
|
| 11 |
+
|
| 12 |
+
- The user names a scenario (`worker_deploy_cascade`, `db_config_rollout`, `gateway_auth_rollout`) or says "solve an incident / run SRE scenario"
|
| 13 |
+
- The user asks you to practice, benchmark, or demo incident response
|
| 14 |
+
- The user points you at an sre-gym URL
|
| 15 |
+
|
| 16 |
+
## Core rules (never break these)
|
| 17 |
+
|
| 18 |
+
1. **Never guess at remediation.** Query evidence (`query_logs`, `query_deploys`, `query_metrics`) before `rollback_deploy` / `restart_service`.
|
| 19 |
+
2. **Root cause before restart.** Restarting a service before rolling back the triggering change re-inherits the bad state.
|
| 20 |
+
3. **Never call `declare_resolved` before the scenario's resolution check passes.** Each scenario specifies which check is required; read it from `observation.checks` and from any loaded runbook.
|
| 21 |
+
4. **Watch for decoys.** Each scenario has a plausible-looking wrong answer. Example: `db_config_rollout` has a recent worker deploy that is *not* the cause. Read logs before committing to a target.
|
| 22 |
+
5. **Repeating the same no-progress action wastes ticks.** The env emits `loop_warning` when you do this — treat it as a hard signal to try a different evidence source.
|
| 23 |
+
|
| 24 |
+
## Workflow
|
| 25 |
+
|
| 26 |
+
### 1. Load prior knowledge
|
| 27 |
+
|
| 28 |
+
Before your first action, check `skill/verified-runbooks/{scenario_id}.md`. If it exists, read it — it's a log of previously-successful solves for this exact scenario, written by earlier runs of this skill. Use the winning path and the decoy list.
|
| 29 |
+
|
| 30 |
+
### 2. Drive the env
|
| 31 |
+
|
| 32 |
+
Use `skill/tools/sre_gym_client.py` to call the env:
|
| 33 |
+
|
| 34 |
+
```bash
|
| 35 |
+
python skill/tools/sre_gym_client.py list # show available scenarios
|
| 36 |
+
python skill/tools/sre_gym_client.py solve <scenario_id> # run a full baseline episode in one process
|
| 37 |
+
python skill/tools/sre_gym_client.py interactive <scenario_id> # stdin: one JSON action per line
|
| 38 |
+
python skill/tools/sre_gym_client.py status # current obs + grader
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
Action JSON matches the env's `UnifiedIncidentAction` model. Examples:
|
| 42 |
+
```json
|
| 43 |
+
{"action_type": "query_logs", "service": "database"}
|
| 44 |
+
{"action_type": "query_deploys", "service": "worker"}
|
| 45 |
+
{"action_type": "rollback_deploy", "service": "database"}
|
| 46 |
+
{"action_type": "run_check", "check_name": "end_to_end"}
|
| 47 |
+
{"action_type": "declare_resolved"}
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### 3. Investigation loop (per tick)
|
| 51 |
+
|
| 52 |
+
1. Read `observation.prompt_text` — services, alerts, last result, failure_type, why_failed.
|
| 53 |
+
2. If `observation.failure_type` is set, your previous action was rejected — **do not repeat it**, read `why_failed` and pick a different evidence source or remediation.
|
| 54 |
+
3. Form a hypothesis with `submit_hypothesis` once you have enough evidence (usually 2–4 queries). Calibrate `confidence`: ≥0.7 only if you're sure.
|
| 55 |
+
4. Remediate (`rollback_deploy` → `restart_service` if scenario requires → `run_check`).
|
| 56 |
+
5. `declare_resolved` only after the required check passes.
|
| 57 |
+
|
| 58 |
+
### 4. Record the runbook
|
| 59 |
+
|
| 60 |
+
If the episode finishes with `incident_resolved=true` and `final_score >= 0.85`, run:
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
python skill/tools/sre_gym_client.py record-runbook <scenario_id>
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
This appends a new entry to `skill/verified-runbooks/{scenario_id}.md`. Future runs of this skill (yours or another Claude's) load it automatically.
|
| 67 |
+
|
| 68 |
+
## Action reference (11 actions)
|
| 69 |
+
|
| 70 |
+
| Action | Required fields | Purpose |
|
| 71 |
+
|---|---|---|
|
| 72 |
+
| `query_logs` | `service` | Read service-level error logs |
|
| 73 |
+
| `query_metrics` | `service`, `metric` (cpu/error_rate/latency) | Read quantitative signals |
|
| 74 |
+
| `query_dependencies` | `service` | Map upstream/downstream |
|
| 75 |
+
| `query_deploys` | `service` | Recent deploy history |
|
| 76 |
+
| `rollback_deploy` | `service` | Revert last deploy — SCENARIO-SPECIFIC TARGET |
|
| 77 |
+
| `restart_service` | `service` | Reboot a service (usually after rollback) |
|
| 78 |
+
| `run_check` | `check_name` (`database_recovery` / `end_to_end`) | Objective recovery check |
|
| 79 |
+
| `isolate_service` | `service` | Containment only, does not resolve |
|
| 80 |
+
| `escalate` | — | Record escalation note |
|
| 81 |
+
| `submit_hypothesis` | `hypothesis` object | Commit RCA with confidence calibration |
|
| 82 |
+
| `declare_resolved` | — | Finalize; rejected if required check has not passed |
|
| 83 |
+
|
| 84 |
+
## Scoring rubric (deterministic from the env)
|
| 85 |
+
|
| 86 |
+
- **Recovery (0–0.4):** services healthy on the critical path
|
| 87 |
+
- **Containment (0–0.3):** root cause removed OR offending service isolated
|
| 88 |
+
- **Verification (0–0.35):** both checks passed
|
| 89 |
+
- **Impact (0–0.15):** user_impact reduced
|
| 90 |
+
- **Efficiency (0–0.10):** budget preserved, no wasteful repeats
|
| 91 |
+
|
| 92 |
+
Clean solve target: **> 0.85**. That's the runbook-record threshold.
|
| 93 |
+
|
| 94 |
+
## Decoy knowledge (read before hypothesizing)
|
| 95 |
+
|
| 96 |
+
- `worker_deploy_cascade`: the only true cause; no decoys.
|
| 97 |
+
- `db_config_rollout`: the recent worker deploy is a **decoy**. Rolling back worker yields `wrong_remediation_target`.
|
| 98 |
+
- `gateway_auth_rollout`: the recent worker deploy (`worker@...-hotfix` — log-format tweak) is a **decoy**. The gateway auth rollout is the cause.
|
| 99 |
+
|
| 100 |
+
If you take a wrong remediation, the env returns `failure_type="wrong_remediation_target"` and a negative reward — **do not retry the same wrong target**, re-read the logs.
|
skill/tools/sre_gym_client.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""CLI client for the sre-gym skill.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
sre_gym_client.py list
|
| 6 |
+
sre_gym_client.py solve <scenario_id> [--policy baseline]
|
| 7 |
+
sre_gym_client.py interactive <scenario_id> # stdin: one JSON action per line
|
| 8 |
+
sre_gym_client.py record-runbook <scenario_id> [session.json]
|
| 9 |
+
|
| 10 |
+
Because OpenEnv's HTTP /reset and /step handlers create a fresh environment per
|
| 11 |
+
call, episode state only persists within a single client session. This CLI wraps
|
| 12 |
+
one episode inside one Python process so the session is preserved.
|
| 13 |
+
|
| 14 |
+
SRE_GYM_URL env var overrides the base URL (default http://127.0.0.1:8000).
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import datetime as _dt
|
| 20 |
+
import json
|
| 21 |
+
import os
|
| 22 |
+
import sys
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
from typing import Any
|
| 25 |
+
|
| 26 |
+
# Make the sibling package importable whether the script is invoked from the
|
| 27 |
+
# repo root or from the skill/ directory directly.
|
| 28 |
+
_REPO_ROOT = Path(__file__).resolve().parent.parent.parent
|
| 29 |
+
if str(_REPO_ROOT) not in sys.path:
|
| 30 |
+
sys.path.insert(0, str(_REPO_ROOT))
|
| 31 |
+
|
| 32 |
+
from unified_incident_env.client import UnifiedIncidentEnv # noqa: E402
|
| 33 |
+
from unified_incident_env.models import UnifiedIncidentAction, UnifiedIncidentObservation # noqa: E402
|
| 34 |
+
from unified_incident_env.server.challenge import SCENARIOS, list_baselines # noqa: E402
|
| 35 |
+
|
| 36 |
+
BASE_URL = os.environ.get("SRE_GYM_URL", "http://127.0.0.1:8000").rstrip("/")
|
| 37 |
+
RUNBOOK_DIR = Path(__file__).resolve().parent.parent / "verified-runbooks"
|
| 38 |
+
SCORE_THRESHOLD = 0.85
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _clean_action(action: UnifiedIncidentAction) -> dict[str, Any]:
|
| 42 |
+
data = action.model_dump(exclude_none=True)
|
| 43 |
+
if data.get("metadata") == {}:
|
| 44 |
+
data.pop("metadata")
|
| 45 |
+
hypothesis = data.get("hypothesis")
|
| 46 |
+
if isinstance(hypothesis, dict) and hypothesis.get("metadata") == {}:
|
| 47 |
+
hypothesis.pop("metadata", None)
|
| 48 |
+
return data
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _summarize_obs(obs: UnifiedIncidentObservation) -> dict[str, Any]:
|
| 52 |
+
return {
|
| 53 |
+
"tick": obs.tick_count,
|
| 54 |
+
"workflow_stage": obs.workflow_stage,
|
| 55 |
+
"last_action_result": obs.last_action_result,
|
| 56 |
+
"tool_output": obs.tool_output,
|
| 57 |
+
"failure_type": obs.failure_type,
|
| 58 |
+
"why_failed": obs.why_failed,
|
| 59 |
+
"loop_warning": obs.loop_warning,
|
| 60 |
+
"checks": [{"name": c.name, "passed": c.passed} for c in obs.checks],
|
| 61 |
+
"final_score": obs.final_score,
|
| 62 |
+
"incident_resolved": obs.incident_resolved,
|
| 63 |
+
}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _session_path(scenario_id: str) -> Path:
|
| 67 |
+
return Path(f"/tmp/sre_gym_session.{scenario_id}.json")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def cmd_list() -> None:
    """Print one line per scenario: difficulty, id and name in aligned columns."""
    for entry in SCENARIOS.values():
        difficulty, scenario_id, title = entry["difficulty"], entry["id"], entry["name"]
        print(f" {difficulty:<6} {scenario_id:<25} {title}")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def cmd_solve(scenario_id: str, policy: str = "baseline") -> None:
    """Replay the first baseline for a scenario inside a single session.

    Exits with status 2 for an unknown scenario or policy. Each step is
    printed as it is taken; the trace and final summary are then written as
    JSON to the scenario's session file.
    """
    if scenario_id not in SCENARIOS:
        print(f"error: unknown scenario {scenario_id!r}", file=sys.stderr)
        sys.exit(2)
    if policy != "baseline":
        print(f"error: unknown policy {policy!r} (only 'baseline' available)", file=sys.stderr)
        sys.exit(2)

    history: list[dict[str, Any]] = []
    with UnifiedIncidentEnv(base_url=BASE_URL).sync() as env:
        obs = env.reset(scenario_id=scenario_id).observation
        print(f"[reset] scenario={scenario_id} difficulty={obs.difficulty}")
        scripted_steps = list_baselines(scenario_id).baselines[0].actions
        for scripted in scripted_steps:
            outcome = env.step(scripted.action)
            obs = outcome.observation
            entry = {
                "step": obs.tick_count,
                "action": _clean_action(scripted.action),
                "rationale": scripted.rationale,
                "reward": outcome.reward,
            }
            entry.update(_summarize_obs(obs))
            history.append(entry)
            compact_action = json.dumps(entry["action"], separators=(",", ":"))
            print(f"[step {obs.tick_count}] action={compact_action} reward={outcome.reward:+.2f} score={obs.final_score:.2f}")
            if outcome.done:
                break
        final = _summarize_obs(obs)

    session_file = _session_path(scenario_id)
    session_payload = {"scenario_id": scenario_id, "trace": history, "final": final}
    session_file.write_text(json.dumps(session_payload, indent=2), encoding="utf-8")
    print(
        f"[done] resolved={final['incident_resolved']} score={final['final_score']:.2f} "
        f"steps={final['tick']} session={session_file}"
    )
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def cmd_interactive(scenario_id: str) -> None:
    """Drive one episode from stdin, one JSON action per line.

    Exits with status 2 for an unknown scenario. Every event (reset, step,
    error, done) is printed as a single flushed JSON line. Blank lines are
    skipped and lines that fail to parse into an action are reported without
    stepping the environment. The trace is written to the scenario's session
    file when stdin ends or the episode finishes.
    """
    if scenario_id not in SCENARIOS:
        print(f"error: unknown scenario {scenario_id!r}", file=sys.stderr)
        sys.exit(2)

    def emit(event: dict[str, Any]) -> None:
        print(json.dumps(event), flush=True)

    history: list[dict[str, Any]] = []
    with UnifiedIncidentEnv(base_url=BASE_URL).sync() as env:
        obs = env.reset(scenario_id=scenario_id).observation
        emit({"event": "reset", "scenario_id": scenario_id, "obs": _summarize_obs(obs)})
        for raw_line in sys.stdin:
            payload_text = raw_line.strip()
            if not payload_text:
                continue
            try:
                action = UnifiedIncidentAction(**json.loads(payload_text))
            except Exception as exc:
                emit({"event": "error", "detail": str(exc)})
                continue
            outcome = env.step(action)
            obs = outcome.observation
            entry = {"step": obs.tick_count, "action": _clean_action(action), "reward": outcome.reward}
            entry.update(_summarize_obs(obs))
            history.append(entry)
            emit({"event": "step", **entry})
            if outcome.done:
                emit({"event": "done", "final": _summarize_obs(obs)})
                break

    session_payload = {"scenario_id": scenario_id, "trace": history, "final": _summarize_obs(obs)}
    _session_path(scenario_id).write_text(json.dumps(session_payload, indent=2), encoding="utf-8")
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _format_runbook_step(entry: dict[str, Any]) -> str:
    """Render one trace entry as a numbered ``Winning path`` line."""
    act = entry["action"]
    extras = ", ".join(
        # A dict-valued field (the hypothesis payload) is collapsed to its root_cause.
        f"{k}={v if not isinstance(v, dict) else v.get('root_cause', v)}"
        for k, v in act.items()
        if k != "action_type" and v not in (None, {})
    )
    extra_str = f" ({extras})" if extras else ""
    rationale = (entry.get("rationale") or "").rstrip(".")
    # Traces written by the interactive command carry no rationale; omit the
    # separator in that case instead of leaving a dangling " — ".
    suffix = f" — {rationale}" if rationale.strip() else ""
    return f"{entry['step']}. `{act.get('action_type')}{extra_str}`{suffix}"


def cmd_record_runbook(scenario_id: str, session_file: str | None = None) -> None:
    """Append a runbook entry for a session that resolved the incident and met the score threshold.

    Args:
        scenario_id: Scenario whose runbook (``RUNBOOK_DIR/<scenario_id>.md``) is updated.
        session_file: Path to a session JSON file; defaults to the scenario's
            session path when omitted.

    Exits with status 2 when the session file does not exist, and with status 1
    when the session is unresolved or its score is below ``SCORE_THRESHOLD``.
    """
    path = Path(session_file) if session_file else _session_path(scenario_id)
    if not path.exists():
        print(f"error: no session file at {path}", file=sys.stderr)
        sys.exit(2)
    session = json.loads(path.read_text(encoding="utf-8"))
    final = session.get("final", {})
    score = float(final.get("final_score", 0.0))

    if not final.get("incident_resolved"):
        print(f"skip: session not resolved (resolved={final.get('incident_resolved')})", file=sys.stderr)
        sys.exit(1)
    # NOTE: a score exactly equal to the threshold is accepted here, while the
    # header text below words the rule as a strict ">".
    if score < SCORE_THRESHOLD:
        print(f"skip: score {score:.2f} below runbook threshold {SCORE_THRESHOLD:.2f}", file=sys.stderr)
        sys.exit(1)

    RUNBOOK_DIR.mkdir(parents=True, exist_ok=True)
    runbook_path = RUNBOOK_DIR / f"{scenario_id}.md"

    timestamp = _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    steps = int(final.get("tick", 0))
    checks_passed = [c["name"] for c in final.get("checks", []) if c.get("passed")]
    trace = session.get("trace", [])

    header = (
        f"# verified-runbooks/{scenario_id}.md\n\n"
        "Runbook entries are written by the sre-gym skill after a successful solve "
        f"(incident_resolved=true and final_score > {SCORE_THRESHOLD:.2f}).\n"
        "Each entry is immutable evidence — treat it as ground truth for the winning path.\n\n---\n"
    )
    lines = [
        f"\n## Run {timestamp} — Score {score:.2f}\n",
        f"- Steps: **{steps}**",
        f"- Checks passed: {', '.join(checks_passed) or 'none'}",
        "",
        "**Winning path:**",
    ]
    lines.extend(_format_runbook_step(entry) for entry in trace)
    lines.append("")  # so the joined entry ends with a newline
    entry_text = "\n".join(lines)

    if not runbook_path.exists():
        # First entry for this scenario: the file starts with the explanatory header.
        runbook_path.write_text(header + entry_text, encoding="utf-8")
    else:
        with runbook_path.open("a", encoding="utf-8") as f:
            f.write(entry_text)
    print(f"recorded runbook entry → {runbook_path} (score {score:.2f}, {steps} steps)")
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def main() -> None:
    """Dispatch the command-line sub-command named by the first argument.

    Recognised commands are ``list``, ``solve``, ``interactive`` and
    ``record-runbook``.  With no arguments, an unknown command, or a missing
    ``<scenario_id>``, a message is written to stderr and the process exits
    with status 2.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print(__doc__, file=sys.stderr)
        sys.exit(2)

    command, params = cli_args[0], cli_args[1:]
    if command == "list":
        cmd_list()
        return

    # Commands that need a scenario id, mapped to the error shown without one.
    missing_scenario = {
        "solve": "error: solve requires <scenario_id>",
        "interactive": "error: interactive requires <scenario_id>",
        "record-runbook": "error: record-runbook requires <scenario_id>",
    }
    if command not in missing_scenario:
        print(f"error: unknown command {command!r}", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        sys.exit(2)
    if not params:
        print(missing_scenario[command], file=sys.stderr)
        sys.exit(2)

    scenario_id = params[0]
    has_second = len(params) > 1
    if command == "solve":
        cmd_solve(scenario_id, params[1] if has_second else "baseline")
    elif command == "interactive":
        cmd_interactive(scenario_id)
    else:
        cmd_record_runbook(scenario_id, params[1] if has_second else None)
|
| 235 |
+
|
| 236 |
+
|
| 237 |
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
skill/verified-runbooks/.gitkeep
ADDED
|
File without changes
|
skill/verified-runbooks/db_config_rollout.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# verified-runbooks/db_config_rollout.md
|
| 2 |
+
|
| 3 |
+
Runbook entries are written by the sre-gym skill after a successful solve (incident_resolved=true and final_score > 0.85).
|
| 4 |
+
Each entry is immutable evidence — treat it as ground truth for the winning path.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Run 2026-04-23T22:01:33Z — Score 0.99
|
| 9 |
+
|
| 10 |
+
- Steps: **10**
|
| 11 |
+
- Checks passed: database_recovery, end_to_end
|
| 12 |
+
|
| 13 |
+
**Winning path:**
|
| 14 |
+
1. `query_logs (service=database)` — Database is the loudest alert; inspect logs for the actual error signature
|
| 15 |
+
2. `query_deploys (service=database)` — Pool-acquire errors suggest a config change; check recent database rollouts
|
| 16 |
+
3. `query_metrics (service=database, metric=error_rate)` — Confirm the error pattern is pool exhaustion rather than compute overload
|
| 17 |
+
4. `query_logs (service=worker)` — Rule out the decoy worker deploy by reading worker logs directly
|
| 18 |
+
5. `submit_hypothesis (hypothesis=database_only_failure)` — Localize the fault to the database config before remediating
|
| 19 |
+
6. `rollback_deploy (service=database)` — Roll back the offending database config rollout
|
| 20 |
+
7. `restart_service (service=database)` — Restart the database cleanly against the restored pool config
|
| 21 |
+
8. `run_check (check_name=database_recovery)` — Verify database pool health and write latency are back within SLO
|
| 22 |
+
9. `run_check (check_name=end_to_end)` — Verify gateway write-path traffic succeeds end-to-end
|
| 23 |
+
10. `declare_resolved` — Declare resolved only after objective checks pass
|
skill/verified-runbooks/gateway_auth_rollout.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# verified-runbooks/gateway_auth_rollout.md
|
| 2 |
+
|
| 3 |
+
Runbook entries are written by the sre-gym skill after a successful solve (incident_resolved=true and final_score > 0.85).
|
| 4 |
+
Each entry is immutable evidence — treat it as ground truth for the winning path.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Run 2026-04-23T22:01:37Z — Score 0.99
|
| 9 |
+
|
| 10 |
+
- Steps: **8**
|
| 11 |
+
- Checks passed: database_recovery, end_to_end
|
| 12 |
+
|
| 13 |
+
**Winning path:**
|
| 14 |
+
1. `query_logs (service=api-gateway)` — Gateway is rejecting logins; read gateway logs to localize the rejection class
|
| 15 |
+
2. `query_deploys (service=api-gateway)` — Login rejection aligns with a recent auth middleware rollout; confirm deploy timing
|
| 16 |
+
3. `query_deploys (service=worker)` — Rule out the worker deploy explicitly rather than assuming
|
| 17 |
+
4. `submit_hypothesis (hypothesis=api_gateway_fault)` — Commit a calibrated hypothesis localizing to the gateway auth rollout
|
| 18 |
+
5. `rollback_deploy (service=api-gateway)` — Roll back the bad auth middleware rollout; no restart needed
|
| 19 |
+
6. `run_check (check_name=end_to_end)` — Verify that gateway login traffic now succeeds end-to-end
|
| 20 |
+
7. `run_check (check_name=database_recovery)` — Confirm the database is (and stayed) healthy throughout
|
| 21 |
+
8. `declare_resolved` — Declare resolved only after objective checks pass
|
skill/verified-runbooks/worker_deploy_cascade.md
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# verified-runbooks/worker_deploy_cascade.md
|
| 2 |
+
|
| 3 |
+
Runbook entries are written by the sre-gym skill after a successful solve (incident_resolved=true and final_score > 0.85).
|
| 4 |
+
Each entry is immutable evidence — treat it as ground truth for the winning path.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## Run 2026-04-23T22:01:29Z — Score 0.99
|
| 9 |
+
|
| 10 |
+
- Steps: **10**
|
| 11 |
+
- Checks passed: database_recovery, end_to_end
|
| 12 |
+
|
| 13 |
+
**Winning path:**
|
| 14 |
+
1. `query_deploys (service=worker)` — Check whether any recent deploy aligns with the incident start
|
| 15 |
+
2. `query_logs (service=worker)` — Inspect worker logs because deploy timing and queue pressure suggest worker-originated harm
|
| 16 |
+
3. `query_metrics (service=database, metric=cpu)` — Confirm that the database is overloaded as a downstream effect
|
| 17 |
+
4. `query_dependencies (service=api-gateway)` — Verify the gateway depends on the worker and database path
|
| 18 |
+
5. `submit_hypothesis (hypothesis=bad_worker_deploy)` — Commit a calibrated hypothesis before taking an invasive mitigation step
|
| 19 |
+
6. `rollback_deploy (service=worker)` — Remove the triggering change before restarting downstream services
|
| 20 |
+
7. `restart_service (service=database)` — Bring the database back cleanly after the root cause is removed
|
| 21 |
+
8. `run_check (check_name=database_recovery)` — Verify the database is no longer crashing
|
| 22 |
+
9. `run_check (check_name=end_to_end)` — Verify gateway traffic succeeds end-to-end
|
| 23 |
+
10. `declare_resolved` — Declare resolved only after objective checks pass
|
unified_incident_env/README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Unified Incident Env
|
| 2 |
+
|
| 3 |
+
The runnable submission surface lives at the project root. This package contains the actual environment implementation:
|
| 4 |
+
|
| 5 |
+
- typed models in `models.py`
|
| 6 |
+
- environment logic in `server/environment.py`
|
| 7 |
+
- scoring in `server/grader.py`
|
| 8 |
+
- scenario catalog in `server/challenge.py`
|
| 9 |
+
|
| 10 |
+
Use the root `README.md` for run commands, scoring, and example interaction.
|
unified_incident_env/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unified incident-response OpenEnv package."""
|
| 2 |
+
|
| 3 |
+
from .interface import (
|
| 4 |
+
UnifiedIncidentAction,
|
| 5 |
+
UnifiedIncidentEnv,
|
| 6 |
+
UnifiedIncidentEnvironment,
|
| 7 |
+
UnifiedIncidentObservation,
|
| 8 |
+
UnifiedIncidentState,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
# Names exported by `from unified_incident_env import *`; each one is imported
# from `.interface` above.
__all__ = [
    "UnifiedIncidentAction",
    "UnifiedIncidentEnv",
    "UnifiedIncidentEnvironment",
    "UnifiedIncidentObservation",
    "UnifiedIncidentState",
]
|
unified_incident_env/client.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed OpenEnv client for the unified incident environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from openenv.core import EnvClient
|
| 8 |
+
from openenv.core.client_types import StepResult
|
| 9 |
+
|
| 10 |
+
from .models import UnifiedIncidentAction, UnifiedIncidentObservation, UnifiedIncidentState
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class UnifiedIncidentEnv(
    EnvClient[UnifiedIncidentAction, UnifiedIncidentObservation, UnifiedIncidentState]
):
    """OpenEnv client specialised to the unified incident action/observation/state models."""

    DEFAULT_BASE_URL = "http://127.0.0.1:8000"

    def _step_payload(self, action: UnifiedIncidentAction) -> dict[str, Any]:
        """Serialise an action for the wire, leaving out fields that are ``None``."""
        return action.model_dump(exclude_none=True)

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[UnifiedIncidentObservation]:
        """Build a ``StepResult`` from a raw step/reset response payload."""
        # Copy so the caller's payload is not mutated by the back-fill below.
        raw_observation = dict(payload.get("observation", {}))
        # The observation model carries reward/done itself; fill them from the
        # envelope only when the nested observation did not supply them.
        if "reward" not in raw_observation:
            raw_observation["reward"] = payload.get("reward", 0.0)
        if "done" not in raw_observation:
            raw_observation["done"] = payload.get("done", False)
        parsed = UnifiedIncidentObservation.model_validate(raw_observation)

        # Envelope values win; the observation's own values are the fallback.
        reward = payload.get("reward", parsed.reward)
        done = payload.get("done", parsed.done)
        return StepResult(observation=parsed, reward=reward, done=done)

    def _parse_state(self, payload: dict[str, Any]) -> UnifiedIncidentState:
        """Validate a raw state payload into ``UnifiedIncidentState``."""
        return UnifiedIncidentState.model_validate(payload)
|
unified_incident_env/interface.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Single public interface surface for the unified incident benchmark."""
|
| 2 |
+
|
| 3 |
+
from .client import UnifiedIncidentEnv
|
| 4 |
+
from .models import (
|
| 5 |
+
UnifiedIncidentAction,
|
| 6 |
+
UnifiedIncidentObservation,
|
| 7 |
+
UnifiedIncidentState,
|
| 8 |
+
)
|
| 9 |
+
from .server.environment import UnifiedIncidentEnvironment
|
| 10 |
+
|
| 11 |
+
# Public surface of this module: the client, the three typed models, and the
# server-side environment class imported above.
__all__ = [
    "UnifiedIncidentAction",
    "UnifiedIncidentEnv",
    "UnifiedIncidentEnvironment",
    "UnifiedIncidentObservation",
    "UnifiedIncidentState",
]
|
unified_incident_env/models.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed models for the honest narrow incident-remediation environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Literal
|
| 6 |
+
|
| 7 |
+
from openenv.core import Action, Observation, State
|
| 8 |
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
| 9 |
+
from pydantic_core import PydanticCustomError
|
| 10 |
+
|
| 11 |
+
# Closed set of values accepted for UnifiedIncidentAction.action_type.
ActionType = Literal[
    "query_logs",
    "query_metrics",
    "query_dependencies",
    "query_deploys",
    "rollback_deploy",
    "restart_service",
    "run_check",
    "isolate_service",
    "escalate",
    "submit_hypothesis",
    "declare_resolved",
]
# Scenario difficulty tiers.
Difficulty = Literal["easy", "medium", "hard"]
# Metric names accepted in the `metric` field of an action.
MetricName = Literal["cpu", "error_rate", "latency"]
# Service names used by actions, alerts and health snapshots.
ServiceName = Literal["api-gateway", "cache", "database", "worker"]
# Health states reported in ServiceHealth.status.
ServiceStatus = Literal["healthy", "degraded", "crashed", "isolated"]
# Stage labels reported in the `workflow_stage` field of observations and state.
WorkflowStage = Literal["triage", "mitigation", "validation", "resolved"]
# Names of the verification checks.
CheckName = Literal["database_recovery", "end_to_end"]
# Root causes a hypothesis may name.
RootCauseType = Literal[
    "bad_worker_deploy",
    "database_only_failure",
    "api_gateway_fault",
]
# Actions a hypothesis may recommend next: the ActionType members except
# "submit_hypothesis".
RecommendedActionType = Literal[
    "query_logs",
    "query_metrics",
    "query_dependencies",
    "query_deploys",
    "rollback_deploy",
    "restart_service",
    "run_check",
    "isolate_service",
    "escalate",
    "declare_resolved",
]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
class PostmortemPayload(BaseModel):
    """Deprecated compatibility shell for the removed v1 postmortem action.

    Every field has an empty default, so the model validates with no input.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    root_cause: str = ""
    attack_vector: str = ""
    # default_factory gives each instance its own fresh list.
    timeline: list[str] = Field(default_factory=list)
    remediation_steps: list[str] = Field(default_factory=list)
    prevention_steps: list[str] = Field(default_factory=list)
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class SecurityContext(BaseModel):
    """Deprecated compatibility shell for the removed v1 security subquest state.

    All fields are optional and default to ``False``/``None``.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    code_visible: bool = False
    selected_vulnerability: str | None = None
    selected_patch: str | None = None
    # Tri-state flags: None is distinct from an explicit True/False.
    exploit_blocked: bool | None = None
    functionality_preserved: bool | None = None
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class HypothesisPayload(BaseModel):
    """Structured hypothesis submitted by the agent.

    Carried in the ``hypothesis`` field of a ``submit_hypothesis`` action.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # Must be one of the RootCauseType literals.
    root_cause: RootCauseType
    # NOTE(review): min_length=1 applies to supplied values, but the
    # default_factory yields an empty list when the key is omitted and pydantic
    # does not validate defaults unless validate_default is enabled — confirm
    # that an omitted list is meant to be accepted.
    affected_services: list[ServiceName] = Field(default_factory=list, min_length=1)
    # Required; inclusive range [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Any action type except "submit_hypothesis" (see RecommendedActionType).
    recommended_next_action: RecommendedActionType
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class ServiceHealth(BaseModel):
    """Health snapshot for a service.

    All fields are required.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    name: ServiceName
    status: ServiceStatus
    # Percentages, validated to the inclusive range [0, 100].
    cpu_pct: float = Field(ge=0.0, le=100.0)
    memory_pct: float = Field(ge=0.0, le=100.0)
    error_rate_pct: float = Field(ge=0.0, le=100.0)
    # Non-negative, with no upper bound.
    latency_ms: float = Field(ge=0.0)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class Alert(BaseModel):
    """Alert exposed to the agent.

    All fields are required.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # Service the alert is attached to.
    service: ServiceName
    # Only these two severity levels are accepted.
    severity: Literal["warning", "critical"]
    message: str
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
class CheckResult(BaseModel):
    """Result of a verification check.

    All fields are required.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # One of the CheckName literals.
    name: CheckName
    passed: bool
    detail: str
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
class UnifiedIncidentAction(Action):
    """A single structured action sent to the environment.

    ``action_type`` selects the operation; the remaining fields are optional at
    the schema level and enforced per action type by ``_validate_payload``.
    """

    # Unlike the other models in this module, unknown keys are ignored here.
    model_config = ConfigDict(extra="ignore")

    action_type: ActionType
    service: ServiceName | None = None
    metric: MetricName | None = None
    check_name: CheckName | None = None
    hypothesis: HypothesisPayload | None = None

    @model_validator(mode="after")
    def _validate_payload(self) -> "UnifiedIncidentAction":
        """Reject actions that lack the field their ``action_type`` depends on.

        Raises:
            PydanticCustomError: ``missing_service``, ``missing_metric``,
                ``missing_check_name`` or ``missing_hypothesis``.
        """
        kind = self.action_type
        context = {"action_type": kind}

        # Every action that targets a service; query_metrics also needs a metric.
        service_scoped = {
            "query_logs",
            "query_dependencies",
            "query_deploys",
            "rollback_deploy",
            "restart_service",
            "isolate_service",
            "query_metrics",
        }
        if kind in service_scoped and not self.service:
            raise PydanticCustomError(
                "missing_service",
                "service is required for {action_type}",
                context,
            )
        if kind == "query_metrics" and not self.metric:
            raise PydanticCustomError(
                "missing_metric",
                "metric is required for {action_type}",
                context,
            )
        if kind == "run_check" and not self.check_name:
            raise PydanticCustomError(
                "missing_check_name",
                "check_name is required for {action_type}",
                context,
            )
        if kind == "submit_hypothesis" and self.hypothesis is None:
            raise PydanticCustomError(
                "missing_hypothesis",
                "hypothesis is required for {action_type}",
                context,
            )
        return self
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class UnifiedIncidentObservation(Observation):
    """Observation returned after reset and each step.

    Fields without a default are required; everything else falls back to an
    empty container, ``None``, ``False`` or zero.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # --- Required episode framing ---
    prompt_text: str
    incident_summary: str
    tick_count: int
    max_ticks: int
    difficulty: Difficulty
    workflow_stage: WorkflowStage

    # --- System view ---
    active_alerts: list[Alert] = Field(default_factory=list)
    # NOTE(review): presumably keyed by service name — confirm against the environment.
    service_health: dict[str, ServiceHealth] = Field(default_factory=dict)
    discovered_evidence: list[str] = Field(default_factory=list)
    recent_deploys: list[str] = Field(default_factory=list)
    checks: list[CheckResult] = Field(default_factory=list)
    # Required; both validated to the inclusive range [0.0, 1.0].
    user_impact: float = Field(ge=0.0, le=1.0)
    slo_burn_rate: float = Field(ge=0.0, le=1.0)
    incident_resolved: bool = False
    containment_applied: bool = False

    # --- Feedback on the most recent action ---
    last_action_result: str = ""
    tool_output: str | None = None
    failure_type: str | None = None
    why_failed: str | None = None

    # --- Guidance for choosing the next action ---
    allowed_actions: list[str] = Field(default_factory=list)
    required_fields_by_action: dict[str, list[str]] = Field(default_factory=dict)
    valid_action_example: dict[str, Any] | None = None
    common_trap: str | None = None
    loop_warning: str | None = None
    # NOTE(review): the security_* fields and blocked_until_security_complete
    # look like leftovers of the removed v1 security subquest — confirm.
    blocked_until_security_complete: bool = False
    security_unlock_reason: str | None = None
    best_recovery_action_family: str | None = None
    progress_flags: dict[str, bool] = Field(default_factory=dict)
    security_subquest_status: str | None = None
    security_context: dict[str, Any] = Field(default_factory=dict)

    # --- Scoring ---
    final_score: float = 0.0
    score_breakdown: dict[str, float] = Field(default_factory=dict)
    # The typed client back-fills these two from the response envelope when the
    # nested observation omits them.
    reward: float = 0.0
    done: bool = False
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
class UnifiedIncidentState(State):
    """Persistent episode state.

    Shares many fields with UnifiedIncidentObservation and adds episode
    bookkeeping (ids, counters, cumulative reward).
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # --- Required episode identity and progress ---
    episode_id: str
    step_count: int
    scenario_id: str
    difficulty: Difficulty
    current_tick: int
    max_ticks: int
    workflow_stage: WorkflowStage

    # --- System view ---
    active_alerts: list[Alert] = Field(default_factory=list)
    service_health: dict[str, ServiceHealth] = Field(default_factory=dict)
    discovered_evidence: list[str] = Field(default_factory=list)
    recent_deploys: list[str] = Field(default_factory=list)
    checks: list[CheckResult] = Field(default_factory=list)
    # Required; both validated to the inclusive range [0.0, 1.0].
    user_impact: float = Field(ge=0.0, le=1.0)
    slo_burn_rate: float = Field(ge=0.0, le=1.0)
    incident_resolved: bool = False
    containment_applied: bool = False

    # --- Action guidance ---
    allowed_actions: list[str] = Field(default_factory=list)
    required_fields_by_action: dict[str, list[str]] = Field(default_factory=dict)
    valid_action_example: dict[str, Any] | None = None
    progress_flags: dict[str, bool] = Field(default_factory=dict)

    # --- Scoring and bookkeeping ---
    final_score: float = 0.0
    score_breakdown: dict[str, float] = Field(default_factory=dict)
    cumulative_reward: float = 0.0
    wasteful_ticks: int = 0

    # --- Feedback on the most recent action ---
    last_action_result: str = ""
    failure_type: str | None = None
    why_failed: str | None = None
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
class ScenarioSummary(BaseModel):
    """Public scenario summary.

    All fields are required.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    id: str
    difficulty: Difficulty
    name: str
    description: str
    # Plain string here, not constrained to RootCauseType.
    root_cause: str
    optimal_ticks: int
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
class ScenarioCatalog(BaseModel):
    """Public scenario catalog.

    Wraps a list of ScenarioSummary entries with catalog-level metadata.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    environment: str = "unified_incident_env"
    default_scenario_id: str
    available_difficulties: list[Difficulty]
    # Optional; defaults to None.
    filtered_difficulty: Difficulty | None = None
    scenarios: list[ScenarioSummary]
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
class BaselineStep(BaseModel):
    """One baseline action paired with an optional free-text rationale."""

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # Validated as a full UnifiedIncidentAction, including its per-type checks.
    action: UnifiedIncidentAction
    rationale: str = ""
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
class BaselineDefinition(BaseModel):
    """One baseline trajectory: an ordered list of steps for a single scenario."""

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    scenario_id: str
    name: str
    description: str
    optimal_ticks: int
    # Defaults to an empty list when omitted.
    actions: list[BaselineStep] = Field(default_factory=list)
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
class BaselineCatalog(BaseModel):
    """Public baseline catalog.

    Wraps a required list of BaselineDefinition entries.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    environment: str = "unified_incident_env"
    baselines: list[BaselineDefinition]
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
class GraderCheck(BaseModel):
    """One normalized grader check.

    All fields are required.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    # Free-form string, unlike CheckResult.name which is limited to CheckName.
    name: str
    passed: bool
    detail: str
    # No bounds are enforced on the weight by this model.
    weight: float
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
class GraderReport(BaseModel):
    """Episode-grade report.

    Combines an overall pass flag and score with per-check detail.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    scenario_id: str
    passed: bool
    # Required; validated to the inclusive range [0.0, 1.0].
    score: float = Field(ge=0.0, le=1.0)
    message: str
    breakdown: dict[str, float] = Field(default_factory=dict)
    checks: list[GraderCheck] = Field(default_factory=list)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
class RuntimeStatus(BaseModel):
    """Runtime status route payload.

    Bundles the current episode state with its grader report.
    """

    # Unknown keys are rejected rather than silently dropped.
    model_config = ConfigDict(extra="forbid")

    environment: str = "unified_incident_env"
    progress: UnifiedIncidentState
    grader: GraderReport
|
unified_incident_env/scripts/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Scripts for the unified incident environment."""
|
unified_incident_env/scripts/baseline_agent.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic scripted baseline for the honest narrow incident environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
from ..client import UnifiedIncidentEnv
|
| 9 |
+
from ..server.challenge import DEFAULT_SCENARIO_ID, SCENARIOS, list_baselines
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def plan_for_scenario(scenario_id: str):
    """Return the actions of the first baseline listed for ``scenario_id``, in order."""
    first_baseline = list_baselines(scenario_id).baselines[0]
    return [step.action for step in first_baseline.actions]
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def run_scenario(base_url: str, scenario_id: str) -> dict[str, object]:
    """Replay the scripted baseline for one scenario and summarise the outcome.

    Args:
        base_url: Base URL of the environment server.
        scenario_id: Scenario to reset the environment to and replay.

    Returns:
        A dict with ``scenario_id``, ``success`` (episode done *and* incident
        resolved), ``final_score`` and ``workflow_stage`` taken from the last
        observation.

    Raises:
        RuntimeError: If the baseline plan contains no actions, so there is no
            final observation to report.
    """
    with UnifiedIncidentEnv(base_url=base_url).sync() as env:
        env.reset(scenario_id=scenario_id)
        final = None
        for action in plan_for_scenario(scenario_id):
            final = env.step(action).observation

    # Explicit check instead of `assert`, which is stripped under `python -O`
    # and would surface later as an AttributeError on None.
    if final is None:
        raise RuntimeError(f"baseline plan for scenario {scenario_id!r} has no actions")

    return {
        "scenario_id": scenario_id,
        "success": bool(final.done and final.incident_resolved),
        "final_score": final.final_score,
        "workflow_stage": final.workflow_stage,
    }
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main() -> None:
    """Parse ``--base-url`` / ``--scenario``, run that scenario's baseline, print the result as JSON."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--base-url", default=UnifiedIncidentEnv.DEFAULT_BASE_URL)
    # Only known scenario ids are accepted; argparse rejects anything else.
    cli.add_argument("--scenario", choices=sorted(SCENARIOS), default=DEFAULT_SCENARIO_ID)
    options = cli.parse_args()

    # Output stays a one-element list, matching the original shape.
    report = run_scenario(options.base_url, options.scenario)
    print(json.dumps([report], indent=2))
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# Run the baseline CLI only when this module is executed directly.
if __name__ == "__main__":
    main()
|
unified_incident_env/scripts/walkthrough.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Simple walkthrough that prints a full episode interaction."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
from ..client import UnifiedIncidentEnv
|
| 9 |
+
from .baseline_agent import plan_for_scenario
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def main() -> None:
    """Replay a scenario's scripted baseline and print every exchange as JSON.

    Prints the reset observation first, then one ``{"action", "observation"}``
    document per baseline step.
    """
    # Imported here so the default matches the sibling baseline_agent CLI.
    # The previous hard-coded default ("easy_sqli_db_outage") appears to be a
    # stale v1 scenario id — TODO confirm it is no longer in SCENARIOS.
    from ..server.challenge import DEFAULT_SCENARIO_ID

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-url",
        default=UnifiedIncidentEnv.DEFAULT_BASE_URL,
    )
    parser.add_argument(
        "--scenario",
        default=DEFAULT_SCENARIO_ID,
    )
    args = parser.parse_args()

    with UnifiedIncidentEnv(base_url=args.base_url).sync() as env:
        reset = env.reset(scenario_id=args.scenario).observation
        print(json.dumps({"reset": reset.model_dump()}, indent=2))
        for action in plan_for_scenario(args.scenario):
            step = env.step(action).observation
            print(
                json.dumps(
                    {
                        # exclude_none keeps unset optional action fields out of the output.
                        "action": action.model_dump(exclude_none=True),
                        "observation": step.model_dump(),
                    },
                    indent=2,
                )
            )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Run the walkthrough only when this module is executed directly.
if __name__ == "__main__":
    main()
|
unified_incident_env/server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Server package for the unified incident environment."""
|
unified_incident_env/server/app.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI app and metadata routes for the honest narrow incident environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import os
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from fastapi import HTTPException
|
| 10 |
+
from fastapi.responses import HTMLResponse, RedirectResponse
|
| 11 |
+
from openenv.core.env_server.http_server import create_fastapi_app
|
| 12 |
+
|
| 13 |
+
from ..models import (
|
| 14 |
+
BaselineCatalog,
|
| 15 |
+
GraderReport,
|
| 16 |
+
RuntimeStatus,
|
| 17 |
+
ScenarioCatalog,
|
| 18 |
+
UnifiedIncidentAction,
|
| 19 |
+
UnifiedIncidentObservation,
|
| 20 |
+
UnifiedIncidentState,
|
| 21 |
+
)
|
| 22 |
+
from .challenge import current_runtime_progress, grade_episode, list_baselines, list_scenarios, set_runtime_progress
|
| 23 |
+
from .environment import UnifiedIncidentEnvironment
|
| 24 |
+
|
| 25 |
+
_BOOTSTRAP_ENV = UnifiedIncidentEnvironment()
|
| 26 |
+
set_runtime_progress(_BOOTSTRAP_ENV.state.model_dump())
|
| 27 |
+
|
| 28 |
+
_SIMPLE_HTML = """<!doctype html>
|
| 29 |
+
<html lang="en">
|
| 30 |
+
<head>
|
| 31 |
+
<meta charset="utf-8" />
|
| 32 |
+
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
| 33 |
+
<title>Unified Incident Env</title>
|
| 34 |
+
<style>
|
| 35 |
+
body { font-family: system-ui, sans-serif; max-width: 900px; margin: 40px auto; padding: 0 20px; line-height: 1.5; }
|
| 36 |
+
code, pre { background: #f4f4f4; padding: 2px 6px; border-radius: 6px; }
|
| 37 |
+
pre { padding: 12px; overflow: auto; }
|
| 38 |
+
</style>
|
| 39 |
+
</head>
|
| 40 |
+
<body>
|
| 41 |
+
<h1>Unified Incident Env</h1>
|
| 42 |
+
<p>This v2 environment exposes an honest bounded-action incident diagnosis and remediation task.</p>
|
| 43 |
+
<ul>
|
| 44 |
+
<li><a href="/docs">API docs</a></li>
|
| 45 |
+
<li><a href="/tasks">Scenario catalog</a></li>
|
| 46 |
+
<li><a href="/baseline">Baseline plan</a></li>
|
| 47 |
+
<li><a href="/status">Runtime status</a></li>
|
| 48 |
+
<li><a href="/health">Health</a></li>
|
| 49 |
+
</ul>
|
| 50 |
+
<h2>Core ideas</h2>
|
| 51 |
+
<ul>
|
| 52 |
+
<li>Queries reveal evidence but do not directly mint positive reward.</li>
|
| 53 |
+
<li>Remediation actions change the world state.</li>
|
| 54 |
+
<li><code>run_check</code> verifies recovery explicitly.</li>
|
| 55 |
+
<li><code>declare_resolved</code> succeeds only after objective checks pass.</li>
|
| 56 |
+
</ul>
|
| 57 |
+
<h2>Manual example</h2>
|
| 58 |
+
<pre>curl -X POST http://127.0.0.1:8000/reset -H 'content-type: application/json' -d '{}'
|
| 59 |
+
curl -X POST http://127.0.0.1:8000/step -H 'content-type: application/json' -d '{"action_type":"query_deploys","service":"worker"}'</pre>
|
| 60 |
+
</body>
|
| 61 |
+
</html>
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def create_compatible_app():
    """Assemble the FastAPI application served by this module.

    The app is produced by ``create_fastapi_app`` with a factory that builds a
    fresh ``UnifiedIncidentEnvironment`` and a limit of one concurrent
    environment. Two unlisted convenience routes are added (``/`` redirects to
    ``/simple``, which serves the static landing page) before the metadata
    routes are attached.
    """

    def _new_environment():
        # Factory handed to the server so it can build environments on demand.
        return UnifiedIncidentEnvironment()

    application = create_fastapi_app(
        _new_environment,
        UnifiedIncidentAction,
        UnifiedIncidentObservation,
        max_concurrent_envs=1,
    )

    @application.get("/", include_in_schema=False)
    async def web_root():
        return RedirectResponse(url="/simple")

    @application.get("/simple", include_in_schema=False)
    async def simple_console():
        return HTMLResponse(_SIMPLE_HTML)

    _attach_metadata_routes(application)
    return application
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _attach_metadata_routes(app):
    """Register the read-only challenge metadata endpoints on ``app``.

    Routes added: ``/tasks``, ``/baseline``, ``/grader``, ``/status`` and
    ``/health``. For the first three, a ``ValueError`` raised by the
    underlying helper is reported to the client as HTTP 404.
    """

    @app.get("/tasks", response_model=ScenarioCatalog, tags=["challenge"])
    def tasks(difficulty: str | None = None) -> ScenarioCatalog:
        """Return the scenario catalog, optionally filtered by difficulty."""
        try:
            return list_scenarios(difficulty=difficulty)
        except ValueError as exc:
            raise HTTPException(status_code=404, detail=str(exc)) from exc

    @app.get("/baseline", response_model=BaselineCatalog, tags=["challenge"])
    def baseline(scenario_id: str | None = None) -> BaselineCatalog:
        """Return baseline plans, optionally restricted to one scenario."""
        try:
            return list_baselines(scenario_id=scenario_id)
        except ValueError as exc:
            raise HTTPException(status_code=404, detail=str(exc)) from exc

    @app.get("/grader", response_model=GraderReport, tags=["challenge"])
    def grader(scenario_id: str | None = None) -> GraderReport:
        """Grade the current runtime progress, optionally against another scenario id."""
        progress = current_runtime_progress()
        if scenario_id is not None:
            # Override on a copy: a GET request must not rewrite the dict
            # handed back by current_runtime_progress(), which may be the
            # live runtime state shared with /status.
            progress = {**progress, "scenario_id": scenario_id}
        try:
            return grade_episode(progress)
        except ValueError as exc:
            raise HTTPException(status_code=404, detail=str(exc)) from exc

    @app.get("/status", response_model=RuntimeStatus, tags=["challenge"])
    def status() -> RuntimeStatus:
        """Return the current runtime progress together with its grade."""
        progress = current_runtime_progress()
        return RuntimeStatus(
            progress=UnifiedIncidentState(**progress),
            grader=grade_episode(progress),
        )

    @app.get("/health", tags=["challenge"])
    def health() -> dict[str, object]:
        """Return a static liveness payload describing this environment."""
        return {
            "status": "ok",
            "environment": "unified_incident_env",
            "version": "2.0.0",
            "stages": ["triage", "mitigation", "validation", "resolved"],
        }
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
app = create_compatible_app()
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def serve(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Run the module-level ``app`` under uvicorn on ``host``:``port``."""
    # Imported lazily so merely importing this module does not require uvicorn.
    from uvicorn import run as run_server

    run_server(app, host=host, port=port)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def main() -> None:
    """Command-line entry point: parse ``--host``/``--port`` and start serving.

    Defaults come from the ``HOST`` and ``PORT`` environment variables, falling
    back to ``0.0.0.0`` and ``8000`` when they are unset.
    """
    cli = argparse.ArgumentParser()
    fallback_host = os.environ.get("HOST", "0.0.0.0")
    fallback_port = int(os.environ.get("PORT", "8000"))
    cli.add_argument("--host", default=fallback_host)
    cli.add_argument("--port", type=int, default=fallback_port)
    options = cli.parse_args()
    serve(host=options.host, port=options.port)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
if __name__ == "__main__":
|
| 148 |
+
main()
|
unified_incident_env/server/challenge.py
ADDED
|
@@ -0,0 +1,753 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Scenario catalog, baselines, and runtime helpers for the honest v2 core."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from copy import deepcopy
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from ..models import (
|
| 9 |
+
BaselineCatalog,
|
| 10 |
+
BaselineDefinition,
|
| 11 |
+
BaselineStep,
|
| 12 |
+
ScenarioCatalog,
|
| 13 |
+
ScenarioSummary,
|
| 14 |
+
UnifiedIncidentAction,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
DEFAULT_SCENARIO_ID = "worker_deploy_cascade"
|
| 18 |
+
|
| 19 |
+
SCENARIOS: dict[str, dict[str, Any]] = {
|
| 20 |
+
"worker_deploy_cascade": {
|
| 21 |
+
"id": "worker_deploy_cascade",
|
| 22 |
+
"difficulty": "easy",
|
| 23 |
+
"name": "Worker Deploy Cascade",
|
| 24 |
+
"description": (
|
| 25 |
+
"A bad worker deploy causes sustained database overload and login 502s at the gateway. "
|
| 26 |
+
"The agent must diagnose from evidence, choose a safe remediation, verify recovery, and declare resolved only after checks pass."
|
| 27 |
+
),
|
| 28 |
+
"root_cause": "A bad worker deploy is driving repeated database overload.",
|
| 29 |
+
"optimal_ticks": 10,
|
| 30 |
+
"max_ticks": 12,
|
| 31 |
+
"critical_service_weights": {
|
| 32 |
+
"worker": 0.4,
|
| 33 |
+
"database": 0.4,
|
| 34 |
+
"api-gateway": 0.2,
|
| 35 |
+
"cache": 0.0,
|
| 36 |
+
},
|
| 37 |
+
"reward_config": {
|
| 38 |
+
"step_cost": 0.01,
|
| 39 |
+
"redundant_action_penalty": 0.02,
|
| 40 |
+
"unsafe_action_penalty": 0.08,
|
| 41 |
+
"premature_resolution_penalty": 0.2,
|
| 42 |
+
"successful_resolution_bonus": 0.25,
|
| 43 |
+
"hypothesis_bonus_scale": 0.12,
|
| 44 |
+
"forbidden_reward_sources": [
|
| 45 |
+
"evidence_discovery",
|
| 46 |
+
"query_success",
|
| 47 |
+
"unlock_events",
|
| 48 |
+
"stage_advancement",
|
| 49 |
+
"patch_id_selection",
|
| 50 |
+
],
|
| 51 |
+
},
|
| 52 |
+
"initial_services": {
|
| 53 |
+
"api-gateway": {
|
| 54 |
+
"status": "degraded",
|
| 55 |
+
"cpu_pct": 61.0,
|
| 56 |
+
"memory_pct": 38.0,
|
| 57 |
+
"error_rate_pct": 24.0,
|
| 58 |
+
"latency_ms": 640.0,
|
| 59 |
+
},
|
| 60 |
+
"cache": {
|
| 61 |
+
"status": "healthy",
|
| 62 |
+
"cpu_pct": 18.0,
|
| 63 |
+
"memory_pct": 24.0,
|
| 64 |
+
"error_rate_pct": 0.0,
|
| 65 |
+
"latency_ms": 14.0,
|
| 66 |
+
},
|
| 67 |
+
"database": {
|
| 68 |
+
"status": "crashed",
|
| 69 |
+
"cpu_pct": 99.0,
|
| 70 |
+
"memory_pct": 97.0,
|
| 71 |
+
"error_rate_pct": 100.0,
|
| 72 |
+
"latency_ms": 0.0,
|
| 73 |
+
},
|
| 74 |
+
"worker": {
|
| 75 |
+
"status": "degraded",
|
| 76 |
+
"cpu_pct": 88.0,
|
| 77 |
+
"memory_pct": 71.0,
|
| 78 |
+
"error_rate_pct": 19.0,
|
| 79 |
+
"latency_ms": 420.0,
|
| 80 |
+
},
|
| 81 |
+
},
|
| 82 |
+
"initial_alerts": [
|
| 83 |
+
{
|
| 84 |
+
"service": "api-gateway",
|
| 85 |
+
"severity": "critical",
|
| 86 |
+
"message": "Login requests are returning sustained 502s.",
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"service": "database",
|
| 90 |
+
"severity": "critical",
|
| 91 |
+
"message": "Database process is crashing under repeated overload.",
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"service": "worker",
|
| 95 |
+
"severity": "warning",
|
| 96 |
+
"message": "Worker queue depth and retry volume spiked after a recent rollout.",
|
| 97 |
+
},
|
| 98 |
+
],
|
| 99 |
+
"logs": {
|
| 100 |
+
"api-gateway": (
|
| 101 |
+
"Gateway upstream errors point to worker timeouts followed by database connection failures. "
|
| 102 |
+
"No recent gateway deploys are recorded."
|
| 103 |
+
),
|
| 104 |
+
"cache": "Cache hit ratio is stable and cache upstream probes remain healthy.",
|
| 105 |
+
"database": (
|
| 106 |
+
"Database logs show repeated bursts of expensive worker-originated writes immediately before each crash."
|
| 107 |
+
),
|
| 108 |
+
"worker": (
|
| 109 |
+
"Worker logs show request fanout amplification and elevated retries beginning right after rollout build worker@2026.04.23-bad."
|
| 110 |
+
),
|
| 111 |
+
},
|
| 112 |
+
"metrics": {
|
| 113 |
+
"api-gateway": {
|
| 114 |
+
"error_rate": "Gateway 502 rate is 24% and closely tracks worker timeout bursts.",
|
| 115 |
+
"latency": "Gateway p95 latency climbed to 640ms while waiting on downstream worker/database calls.",
|
| 116 |
+
},
|
| 117 |
+
"database": {
|
| 118 |
+
"cpu": "Database CPU is pinned at 99% until the process exits.",
|
| 119 |
+
"latency": "Database latency spikes sharply before each crash loop.",
|
| 120 |
+
},
|
| 121 |
+
"worker": {
|
| 122 |
+
"cpu": "Worker CPU is 88% with growing queue pressure.",
|
| 123 |
+
"error_rate": "Worker retry/error rate is elevated after rollout.",
|
| 124 |
+
},
|
| 125 |
+
},
|
| 126 |
+
"dependencies": {
|
| 127 |
+
"api-gateway": "api-gateway -> worker -> database",
|
| 128 |
+
"worker": "worker -> database",
|
| 129 |
+
"database": "database is a terminal dependency for write-heavy worker jobs",
|
| 130 |
+
},
|
| 131 |
+
"deploy_history": {
|
| 132 |
+
"api-gateway": "No gateway deploys in the last 24h.",
|
| 133 |
+
"cache": "No cache deploys in the last 24h.",
|
| 134 |
+
"database": "No database deploys in the last 24h.",
|
| 135 |
+
"worker": "Rolled out worker@2026.04.23-bad 12 minutes ago.",
|
| 136 |
+
},
|
| 137 |
+
"checks": {
|
| 138 |
+
"database_recovery": "Confirms the database is healthy and no longer crashing.",
|
| 139 |
+
"end_to_end": "Confirms login traffic succeeds without worker-induced overload.",
|
| 140 |
+
},
|
| 141 |
+
"truth": {
|
| 142 |
+
"root_cause": "bad_worker_deploy",
|
| 143 |
+
"affected_services": ["worker", "database", "api-gateway"],
|
| 144 |
+
"best_next_action": "rollback_deploy",
|
| 145 |
+
},
|
| 146 |
+
"remediation_recipe": {
|
| 147 |
+
"rollback_target": "worker",
|
| 148 |
+
"restart_target": "database",
|
| 149 |
+
"isolate_target": "worker",
|
| 150 |
+
"restart_requires_cause_removed": True,
|
| 151 |
+
"incident_driver": "worker",
|
| 152 |
+
"resolution_check": "end_to_end",
|
| 153 |
+
},
|
| 154 |
+
"post_rollback_services": {
|
| 155 |
+
"worker": {"status": "healthy", "cpu_pct": 32.0, "memory_pct": 37.0, "error_rate_pct": 2.0, "latency_ms": 40.0},
|
| 156 |
+
},
|
| 157 |
+
"post_rollback_user_impact": 0.55,
|
| 158 |
+
"post_rollback_slo_burn": 0.58,
|
| 159 |
+
"post_restart_services": {
|
| 160 |
+
"database": {"status": "healthy", "cpu_pct": 34.0, "memory_pct": 39.0, "error_rate_pct": 0.0, "latency_ms": 22.0},
|
| 161 |
+
"api-gateway": {"status": "healthy", "cpu_pct": 28.0, "memory_pct": 31.0, "error_rate_pct": 0.0, "latency_ms": 38.0},
|
| 162 |
+
},
|
| 163 |
+
"post_restart_user_impact": 0.14,
|
| 164 |
+
"post_restart_slo_burn": 0.18,
|
| 165 |
+
"post_isolate_services": {
|
| 166 |
+
"worker": {"status": "isolated", "cpu_pct": 8.0, "memory_pct": 18.0, "error_rate_pct": 0.0, "latency_ms": 0.0},
|
| 167 |
+
"database": {"status": "healthy", "cpu_pct": 41.0, "memory_pct": 46.0, "error_rate_pct": 0.0, "latency_ms": 26.0},
|
| 168 |
+
"api-gateway": {"status": "degraded", "cpu_pct": 34.0, "memory_pct": 33.0, "error_rate_pct": 7.0, "latency_ms": 91.0},
|
| 169 |
+
},
|
| 170 |
+
"post_isolate_user_impact": 0.45,
|
| 171 |
+
"post_isolate_slo_burn": 0.47,
|
| 172 |
+
"degraded_services": {
|
| 173 |
+
"worker": {"status": "degraded", "cpu_pct": 88.0, "memory_pct": 71.0, "error_rate_pct": 19.0, "latency_ms": 420.0},
|
| 174 |
+
"database": {"status": "crashed", "cpu_pct": 99.0, "memory_pct": 97.0, "error_rate_pct": 100.0, "latency_ms": 0.0},
|
| 175 |
+
"api-gateway": {"status": "degraded", "cpu_pct": 61.0, "memory_pct": 38.0, "error_rate_pct": 24.0, "latency_ms": 640.0},
|
| 176 |
+
},
|
| 177 |
+
"degraded_user_impact": 0.82,
|
| 178 |
+
"degraded_slo_burn": 0.91,
|
| 179 |
+
"failure_messages": {
|
| 180 |
+
"wrong_rollback_target": "Rolling back a service without a causal link wastes time and risk.",
|
| 181 |
+
"low_value_restart": "Restarting that service is not the safe next remediation step for this incident.",
|
| 182 |
+
"premature_restart": "Restarting before removing the trigger only causes another crash loop.",
|
| 183 |
+
"wrong_isolation_target": "Isolating that service does not contain the dominant failure path.",
|
| 184 |
+
},
|
| 185 |
+
},
|
| 186 |
+
"db_config_rollout": {
|
| 187 |
+
"id": "db_config_rollout",
|
| 188 |
+
"difficulty": "medium",
|
| 189 |
+
"name": "Database Config Rollout Regression",
|
| 190 |
+
"description": (
|
| 191 |
+
"A database config push cut connection pool size and write requests now time out. "
|
| 192 |
+
"A separate worker deploy landed around the same time and looks suspicious but is not the cause. "
|
| 193 |
+
"The agent must avoid the decoy, roll back the database config, restart it, and verify recovery."
|
| 194 |
+
),
|
| 195 |
+
"root_cause": "A bad database config rollout shrank the connection pool and is dropping writes.",
|
| 196 |
+
"optimal_ticks": 10,
|
| 197 |
+
"max_ticks": 12,
|
| 198 |
+
"critical_service_weights": {
|
| 199 |
+
"worker": 0.2,
|
| 200 |
+
"database": 0.5,
|
| 201 |
+
"api-gateway": 0.3,
|
| 202 |
+
"cache": 0.0,
|
| 203 |
+
},
|
| 204 |
+
"reward_config": {
|
| 205 |
+
"step_cost": 0.01,
|
| 206 |
+
"redundant_action_penalty": 0.02,
|
| 207 |
+
"unsafe_action_penalty": 0.08,
|
| 208 |
+
"premature_resolution_penalty": 0.2,
|
| 209 |
+
"successful_resolution_bonus": 0.25,
|
| 210 |
+
"hypothesis_bonus_scale": 0.12,
|
| 211 |
+
"forbidden_reward_sources": [
|
| 212 |
+
"evidence_discovery",
|
| 213 |
+
"query_success",
|
| 214 |
+
"unlock_events",
|
| 215 |
+
"stage_advancement",
|
| 216 |
+
"patch_id_selection",
|
| 217 |
+
],
|
| 218 |
+
},
|
| 219 |
+
"initial_services": {
|
| 220 |
+
"api-gateway": {
|
| 221 |
+
"status": "degraded",
|
| 222 |
+
"cpu_pct": 44.0,
|
| 223 |
+
"memory_pct": 36.0,
|
| 224 |
+
"error_rate_pct": 17.0,
|
| 225 |
+
"latency_ms": 520.0,
|
| 226 |
+
},
|
| 227 |
+
"cache": {
|
| 228 |
+
"status": "healthy",
|
| 229 |
+
"cpu_pct": 20.0,
|
| 230 |
+
"memory_pct": 26.0,
|
| 231 |
+
"error_rate_pct": 0.0,
|
| 232 |
+
"latency_ms": 15.0,
|
| 233 |
+
},
|
| 234 |
+
"database": {
|
| 235 |
+
"status": "degraded",
|
| 236 |
+
"cpu_pct": 62.0,
|
| 237 |
+
"memory_pct": 54.0,
|
| 238 |
+
"error_rate_pct": 48.0,
|
| 239 |
+
"latency_ms": 880.0,
|
| 240 |
+
},
|
| 241 |
+
"worker": {
|
| 242 |
+
"status": "degraded",
|
| 243 |
+
"cpu_pct": 51.0,
|
| 244 |
+
"memory_pct": 44.0,
|
| 245 |
+
"error_rate_pct": 12.0,
|
| 246 |
+
"latency_ms": 310.0,
|
| 247 |
+
},
|
| 248 |
+
},
|
| 249 |
+
"initial_alerts": [
|
| 250 |
+
{
|
| 251 |
+
"service": "database",
|
| 252 |
+
"severity": "critical",
|
| 253 |
+
"message": "Database connection acquire timeouts at 48% and climbing.",
|
| 254 |
+
},
|
| 255 |
+
{
|
| 256 |
+
"service": "api-gateway",
|
| 257 |
+
"severity": "warning",
|
| 258 |
+
"message": "Write-path requests are returning sustained 5xx.",
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"service": "worker",
|
| 262 |
+
"severity": "warning",
|
| 263 |
+
"message": "Worker write latency is elevated; retries are climbing.",
|
| 264 |
+
},
|
| 265 |
+
],
|
| 266 |
+
"logs": {
|
| 267 |
+
"api-gateway": (
|
| 268 |
+
"Gateway upstream errors are downstream-driven: writes to the worker path return pool-exhaustion "
|
| 269 |
+
"errors originating from the database. No gateway deploys recorded in the last 24h."
|
| 270 |
+
),
|
| 271 |
+
"cache": "Cache reads are healthy and unrelated to the current write-path failures.",
|
| 272 |
+
"database": (
|
| 273 |
+
"Database logs show 'could not acquire connection' errors immediately after config rollout "
|
| 274 |
+
"db@2026.04.24-cfg lowered max_connections from 80 to 12."
|
| 275 |
+
),
|
| 276 |
+
"worker": (
|
| 277 |
+
"Worker logs show retries driven by downstream database pool exhaustion, not local faults. "
|
| 278 |
+
"Worker code deploy worker@2026.04.24-refactor is unrelated to the pool error signature."
|
| 279 |
+
),
|
| 280 |
+
},
|
| 281 |
+
"metrics": {
|
| 282 |
+
"api-gateway": {
|
| 283 |
+
"error_rate": "Gateway 5xx rate is 17% and matches the database pool-exhaustion windows one-for-one.",
|
| 284 |
+
"latency": "Gateway p95 climbed to 520ms waiting on database connection acquire.",
|
| 285 |
+
},
|
| 286 |
+
"database": {
|
| 287 |
+
"cpu": "Database CPU is moderate (~62%), so this is not a compute overload pattern.",
|
| 288 |
+
"error_rate": "Database error rate is 48% and dominated by 'connection acquire timeout'.",
|
| 289 |
+
"latency": "Database write latency jumped to 880ms after the config rollout.",
|
| 290 |
+
},
|
| 291 |
+
"worker": {
|
| 292 |
+
"cpu": "Worker CPU is 51% — no local overload; retries are reactive.",
|
| 293 |
+
"error_rate": "Worker errors are retries against the saturated database pool.",
|
| 294 |
+
},
|
| 295 |
+
},
|
| 296 |
+
"dependencies": {
|
| 297 |
+
"api-gateway": "api-gateway -> worker -> database",
|
| 298 |
+
"worker": "worker -> database",
|
| 299 |
+
"database": "database is the terminal dependency; pool exhaustion here starves all upstream writers",
|
| 300 |
+
},
|
| 301 |
+
"deploy_history": {
|
| 302 |
+
"api-gateway": "No gateway deploys in the last 24h.",
|
| 303 |
+
"cache": "No cache deploys in the last 24h.",
|
| 304 |
+
"database": "Applied config db@2026.04.24-cfg 15 minutes ago (max_connections 80 -> 12).",
|
| 305 |
+
"worker": "Rolled out worker@2026.04.24-refactor 22 minutes ago (unrelated code cleanup).",
|
| 306 |
+
},
|
| 307 |
+
"checks": {
|
| 308 |
+
"database_recovery": "Confirms database write latency and pool health are back within SLO.",
|
| 309 |
+
"end_to_end": "Confirms gateway write-path traffic succeeds end-to-end.",
|
| 310 |
+
},
|
| 311 |
+
"truth": {
|
| 312 |
+
"root_cause": "database_only_failure",
|
| 313 |
+
"affected_services": ["database", "api-gateway", "worker"],
|
| 314 |
+
"best_next_action": "rollback_deploy",
|
| 315 |
+
},
|
| 316 |
+
"remediation_recipe": {
|
| 317 |
+
"rollback_target": "database",
|
| 318 |
+
"restart_target": "database",
|
| 319 |
+
"isolate_target": None,
|
| 320 |
+
"restart_requires_cause_removed": True,
|
| 321 |
+
"incident_driver": "database",
|
| 322 |
+
"resolution_check": "end_to_end",
|
| 323 |
+
},
|
| 324 |
+
"post_rollback_services": {
|
| 325 |
+
"database": {"status": "degraded", "cpu_pct": 48.0, "memory_pct": 42.0, "error_rate_pct": 6.0, "latency_ms": 120.0},
|
| 326 |
+
},
|
| 327 |
+
"post_rollback_user_impact": 0.40,
|
| 328 |
+
"post_rollback_slo_burn": 0.45,
|
| 329 |
+
"post_restart_services": {
|
| 330 |
+
"database": {"status": "healthy", "cpu_pct": 36.0, "memory_pct": 40.0, "error_rate_pct": 0.0, "latency_ms": 26.0},
|
| 331 |
+
"api-gateway": {"status": "healthy", "cpu_pct": 29.0, "memory_pct": 30.0, "error_rate_pct": 0.0, "latency_ms": 44.0},
|
| 332 |
+
"worker": {"status": "healthy", "cpu_pct": 33.0, "memory_pct": 36.0, "error_rate_pct": 1.0, "latency_ms": 48.0},
|
| 333 |
+
},
|
| 334 |
+
"post_restart_user_impact": 0.10,
|
| 335 |
+
"post_restart_slo_burn": 0.14,
|
| 336 |
+
"post_isolate_services": {},
|
| 337 |
+
"post_isolate_user_impact": 0.70,
|
| 338 |
+
"post_isolate_slo_burn": 0.75,
|
| 339 |
+
"degraded_services": {
|
| 340 |
+
"database": {"status": "degraded", "cpu_pct": 62.0, "memory_pct": 54.0, "error_rate_pct": 48.0, "latency_ms": 880.0},
|
| 341 |
+
"api-gateway": {"status": "degraded", "cpu_pct": 44.0, "memory_pct": 36.0, "error_rate_pct": 17.0, "latency_ms": 520.0},
|
| 342 |
+
"worker": {"status": "degraded", "cpu_pct": 51.0, "memory_pct": 44.0, "error_rate_pct": 12.0, "latency_ms": 310.0},
|
| 343 |
+
},
|
| 344 |
+
"degraded_user_impact": 0.70,
|
| 345 |
+
"degraded_slo_burn": 0.78,
|
| 346 |
+
"failure_messages": {
|
| 347 |
+
"wrong_rollback_target": "The worker deploy is a decoy; worker errors are reactive to database pool exhaustion.",
|
| 348 |
+
"low_value_restart": "Restarting that service does not address a database-config regression.",
|
| 349 |
+
"premature_restart": "Restarting the database before rolling back the config will re-inherit the 12-connection pool and fail again.",
|
| 350 |
+
"wrong_isolation_target": "Isolation is not useful here: the cause is a config regression, not a runaway service.",
|
| 351 |
+
},
|
| 352 |
+
},
|
| 353 |
+
"gateway_auth_rollout": {
|
| 354 |
+
"id": "gateway_auth_rollout",
|
| 355 |
+
"difficulty": "hard",
|
| 356 |
+
"name": "Gateway Auth Rollout Regression",
|
| 357 |
+
"description": (
|
| 358 |
+
"A new api-gateway auth-middleware rollout is rejecting ~40% of valid logins. "
|
| 359 |
+
"A recent worker deploy and elevated worker queue depth make the worker look like a plausible suspect. "
|
| 360 |
+
"The agent must localize to the gateway, roll back its deploy, and verify recovery without unnecessary restarts."
|
| 361 |
+
),
|
| 362 |
+
"root_cause": "A bad api-gateway auth-middleware rollout is rejecting valid logins.",
|
| 363 |
+
"optimal_ticks": 8,
|
| 364 |
+
"max_ticks": 10,
|
| 365 |
+
"critical_service_weights": {
|
| 366 |
+
"worker": 0.15,
|
| 367 |
+
"database": 0.15,
|
| 368 |
+
"api-gateway": 0.70,
|
| 369 |
+
"cache": 0.0,
|
| 370 |
+
},
|
| 371 |
+
"reward_config": {
|
| 372 |
+
"step_cost": 0.01,
|
| 373 |
+
"redundant_action_penalty": 0.02,
|
| 374 |
+
"unsafe_action_penalty": 0.12,
|
| 375 |
+
"premature_resolution_penalty": 0.3,
|
| 376 |
+
"successful_resolution_bonus": 0.3,
|
| 377 |
+
"hypothesis_bonus_scale": 0.12,
|
| 378 |
+
"forbidden_reward_sources": [
|
| 379 |
+
"evidence_discovery",
|
| 380 |
+
"query_success",
|
| 381 |
+
"unlock_events",
|
| 382 |
+
"stage_advancement",
|
| 383 |
+
"patch_id_selection",
|
| 384 |
+
],
|
| 385 |
+
},
|
| 386 |
+
"initial_services": {
|
| 387 |
+
"api-gateway": {
|
| 388 |
+
"status": "degraded",
|
| 389 |
+
"cpu_pct": 38.0,
|
| 390 |
+
"memory_pct": 42.0,
|
| 391 |
+
"error_rate_pct": 41.0,
|
| 392 |
+
"latency_ms": 180.0,
|
| 393 |
+
},
|
| 394 |
+
"cache": {
|
| 395 |
+
"status": "healthy",
|
| 396 |
+
"cpu_pct": 17.0,
|
| 397 |
+
"memory_pct": 23.0,
|
| 398 |
+
"error_rate_pct": 0.0,
|
| 399 |
+
"latency_ms": 12.0,
|
| 400 |
+
},
|
| 401 |
+
"database": {
|
| 402 |
+
"status": "healthy",
|
| 403 |
+
"cpu_pct": 38.0,
|
| 404 |
+
"memory_pct": 41.0,
|
| 405 |
+
"error_rate_pct": 1.0,
|
| 406 |
+
"latency_ms": 28.0,
|
| 407 |
+
},
|
| 408 |
+
"worker": {
|
| 409 |
+
"status": "degraded",
|
| 410 |
+
"cpu_pct": 63.0,
|
| 411 |
+
"memory_pct": 48.0,
|
| 412 |
+
"error_rate_pct": 4.0,
|
| 413 |
+
"latency_ms": 220.0,
|
| 414 |
+
},
|
| 415 |
+
},
|
| 416 |
+
"initial_alerts": [
|
| 417 |
+
{
|
| 418 |
+
"service": "api-gateway",
|
| 419 |
+
"severity": "critical",
|
| 420 |
+
"message": "Gateway is returning 401 on ~40% of valid login attempts.",
|
| 421 |
+
},
|
| 422 |
+
{
|
| 423 |
+
"service": "worker",
|
| 424 |
+
"severity": "warning",
|
| 425 |
+
"message": "Worker queue depth is elevated from the retry storm upstream.",
|
| 426 |
+
},
|
| 427 |
+
],
|
| 428 |
+
"logs": {
|
| 429 |
+
"api-gateway": (
|
| 430 |
+
"Gateway logs show auth-middleware rejecting tokens with valid signatures. "
|
| 431 |
+
"Rejection rate started exactly at the gateway@2026.04.24-auth rollout boundary."
|
| 432 |
+
),
|
| 433 |
+
"cache": "Cache hit ratio stable and unrelated.",
|
| 434 |
+
"database": "Database logs are clean; no increase in errors or latency.",
|
| 435 |
+
"worker": (
|
| 436 |
+
"Worker logs show client-side retry storms triggered by upstream 401s, not local faults. "
|
| 437 |
+
"Worker deploy worker@2026.04.24-hotfix is a log-format tweak and does not touch auth."
|
| 438 |
+
),
|
| 439 |
+
},
|
| 440 |
+
"metrics": {
|
| 441 |
+
"api-gateway": {
|
| 442 |
+
"error_rate": "Gateway error rate is 41%, dominated by 401 responses (auth failures).",
|
| 443 |
+
"latency": "Gateway latency is normal — errors are fast rejections, not timeouts.",
|
| 444 |
+
},
|
| 445 |
+
"database": {
|
| 446 |
+
"cpu": "Database CPU is 38% (normal).",
|
| 447 |
+
"error_rate": "Database error rate is ~1% and flat.",
|
| 448 |
+
},
|
| 449 |
+
"worker": {
|
| 450 |
+
"cpu": "Worker CPU is 63% from retry volume, not workload.",
|
| 451 |
+
"error_rate": "Worker errors are reactive retries, not primary failures.",
|
| 452 |
+
},
|
| 453 |
+
},
|
| 454 |
+
"dependencies": {
|
| 455 |
+
"api-gateway": "api-gateway -> (auth) -> worker -> database",
|
| 456 |
+
"worker": "worker -> database",
|
| 457 |
+
"database": "database is healthy; it is not on the fault path",
|
| 458 |
+
},
|
| 459 |
+
"deploy_history": {
|
| 460 |
+
"api-gateway": "Rolled out gateway@2026.04.24-auth 9 minutes ago (auth middleware rewrite).",
|
| 461 |
+
"cache": "No cache deploys in the last 24h.",
|
| 462 |
+
"database": "No database deploys in the last 24h.",
|
| 463 |
+
"worker": "Rolled out worker@2026.04.24-hotfix 18 minutes ago (log-format tweak, no auth changes).",
|
| 464 |
+
},
|
| 465 |
+
"checks": {
|
| 466 |
+
"database_recovery": "Confirms the database is healthy (always healthy in this scenario).",
|
| 467 |
+
"end_to_end": "Confirms gateway login traffic succeeds end-to-end.",
|
| 468 |
+
},
|
| 469 |
+
"truth": {
|
| 470 |
+
"root_cause": "api_gateway_fault",
|
| 471 |
+
"affected_services": ["api-gateway", "worker"],
|
| 472 |
+
"best_next_action": "rollback_deploy",
|
| 473 |
+
},
|
| 474 |
+
"remediation_recipe": {
|
| 475 |
+
"rollback_target": "api-gateway",
|
| 476 |
+
"restart_target": None,
|
| 477 |
+
"isolate_target": "api-gateway",
|
| 478 |
+
"restart_requires_cause_removed": True,
|
| 479 |
+
"incident_driver": "api-gateway",
|
| 480 |
+
"resolution_check": "end_to_end",
|
| 481 |
+
},
|
| 482 |
+
"post_rollback_services": {
|
| 483 |
+
"api-gateway": {"status": "healthy", "cpu_pct": 30.0, "memory_pct": 34.0, "error_rate_pct": 1.0, "latency_ms": 38.0},
|
| 484 |
+
"worker": {"status": "healthy", "cpu_pct": 34.0, "memory_pct": 36.0, "error_rate_pct": 1.0, "latency_ms": 52.0},
|
| 485 |
+
},
|
| 486 |
+
"post_rollback_user_impact": 0.12,
|
| 487 |
+
"post_rollback_slo_burn": 0.18,
|
| 488 |
+
"post_restart_services": {},
|
| 489 |
+
"post_restart_user_impact": 0.12,
|
| 490 |
+
"post_restart_slo_burn": 0.18,
|
| 491 |
+
"post_isolate_services": {
|
| 492 |
+
"api-gateway": {"status": "isolated", "cpu_pct": 6.0, "memory_pct": 14.0, "error_rate_pct": 0.0, "latency_ms": 0.0},
|
| 493 |
+
},
|
| 494 |
+
"post_isolate_user_impact": 0.55,
|
| 495 |
+
"post_isolate_slo_burn": 0.60,
|
| 496 |
+
"degraded_services": {
|
| 497 |
+
"api-gateway": {"status": "degraded", "cpu_pct": 38.0, "memory_pct": 42.0, "error_rate_pct": 41.0, "latency_ms": 180.0},
|
| 498 |
+
"worker": {"status": "degraded", "cpu_pct": 63.0, "memory_pct": 48.0, "error_rate_pct": 4.0, "latency_ms": 220.0},
|
| 499 |
+
},
|
| 500 |
+
"degraded_user_impact": 0.65,
|
| 501 |
+
"degraded_slo_burn": 0.72,
|
| 502 |
+
"failure_messages": {
|
| 503 |
+
"wrong_rollback_target": "The worker deploy is a log-format tweak and is not on the auth fault path.",
|
| 504 |
+
"low_value_restart": "Restarting a service does not fix a config/middleware regression rolled out as a deploy.",
|
| 505 |
+
"premature_restart": "Restarting before rolling back the gateway auth change just restarts the same bad middleware.",
|
| 506 |
+
"wrong_isolation_target": "Isolating workers or database cuts healthy traffic without fixing the gateway auth fault.",
|
| 507 |
+
},
|
| 508 |
+
},
|
| 509 |
+
}
|
| 510 |
+
|
| 511 |
+
_RUNTIME_PROGRESS: dict[str, Any] | None = None
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
def get_scenario(scenario_id: str) -> dict[str, Any]:
    """Return an independent deep copy of the scenario registered as ``scenario_id``.

    Raises:
        ValueError: if ``scenario_id`` is not a key of ``SCENARIOS``.
    """
    if scenario_id in SCENARIOS:
        # Copy so callers can mutate the result without touching the registry.
        return deepcopy(SCENARIOS[scenario_id])
    raise ValueError(f"Unknown scenario_id {scenario_id!r}")
|
| 518 |
+
|
| 519 |
+
|
| 520 |
+
# Difficulty labels accepted by list_scenarios() as a filter and reported in
# the catalog's ``available_difficulties`` field.
SUPPORTED_DIFFICULTIES: tuple[str, ...] = ("easy", "medium", "hard")
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
def scenario_for_difficulty(difficulty: str) -> dict[str, Any]:
    """Return a deep copy of the first scenario whose ``difficulty`` matches.

    Scenarios are examined in ``SCENARIOS`` iteration order.

    Raises:
        ValueError: if no registered scenario has the requested difficulty.
    """
    first_match = next(
        (candidate for candidate in SCENARIOS.values() if candidate["difficulty"] == difficulty),
        None,
    )
    if first_match is None:
        raise ValueError(f"Unknown difficulty {difficulty!r}")
    return deepcopy(first_match)
|
| 528 |
+
|
| 529 |
+
|
| 530 |
+
def list_scenarios(difficulty: str | None = None) -> ScenarioCatalog:
    """Build the scenario catalog, optionally restricted to one difficulty.

    Args:
        difficulty: When given, only scenarios with this difficulty are listed.

    Raises:
        ValueError: if ``difficulty`` is given but not in ``SUPPORTED_DIFFICULTIES``.
    """
    filtering = difficulty is not None
    if filtering and difficulty not in SUPPORTED_DIFFICULTIES:
        raise ValueError(f"Unknown difficulty {difficulty!r}")

    summaries = []
    for entry in SCENARIOS.values():
        if filtering and entry["difficulty"] != difficulty:
            continue
        summaries.append(
            ScenarioSummary(
                id=entry["id"],
                difficulty=entry["difficulty"],
                name=entry["name"],
                description=entry["description"],
                root_cause=entry["root_cause"],
                optimal_ticks=entry["optimal_ticks"],
            )
        )

    return ScenarioCatalog(
        default_scenario_id=DEFAULT_SCENARIO_ID,
        available_difficulties=list(SUPPORTED_DIFFICULTIES),
        filtered_difficulty=difficulty,
        scenarios=summaries,
    )
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def _worker_cascade_baseline() -> list[BaselineStep]:
    """Return the scripted baseline for the ``worker_deploy_cascade`` scenario.

    The plan investigates, submits a hypothesis, rolls back the worker deploy,
    restarts the database, runs both checks, and declares resolution.
    """
    hypothesis = {
        "root_cause": "bad_worker_deploy",
        "affected_services": ["worker", "database", "api-gateway"],
        "confidence": 0.82,
        "recommended_next_action": "rollback_deploy",
    }
    # (action fields, rationale) pairs in execution order.
    plan = [
        ({"action_type": "query_deploys", "service": "worker"},
         "Check whether any recent deploy aligns with the incident start."),
        ({"action_type": "query_logs", "service": "worker"},
         "Inspect worker logs because deploy timing and queue pressure suggest worker-originated harm."),
        ({"action_type": "query_metrics", "service": "database", "metric": "cpu"},
         "Confirm that the database is overloaded as a downstream effect."),
        ({"action_type": "query_dependencies", "service": "api-gateway"},
         "Verify the gateway depends on the worker and database path."),
        ({"action_type": "submit_hypothesis", "hypothesis": hypothesis},
         "Commit a calibrated hypothesis before taking an invasive mitigation step."),
        ({"action_type": "rollback_deploy", "service": "worker"},
         "Remove the triggering change before restarting downstream services."),
        ({"action_type": "restart_service", "service": "database"},
         "Bring the database back cleanly after the root cause is removed."),
        ({"action_type": "run_check", "check_name": "database_recovery"},
         "Verify the database is no longer crashing."),
        ({"action_type": "run_check", "check_name": "end_to_end"},
         "Verify gateway traffic succeeds end-to-end."),
        ({"action_type": "declare_resolved"},
         "Declare resolved only after objective checks pass."),
    ]
    return [
        BaselineStep(action=UnifiedIncidentAction(**fields), rationale=why)
        for fields, why in plan
    ]
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def _db_config_rollout_baseline() -> list[BaselineStep]:
    """Return the scripted baseline for the ``db_config_rollout`` scenario.

    The plan inspects the database and the worker, submits a hypothesis, rolls
    back and restarts the database, runs both checks, and declares resolution.
    """
    hypothesis = {
        "root_cause": "database_only_failure",
        "affected_services": ["database", "api-gateway", "worker"],
        "confidence": 0.8,
        "recommended_next_action": "rollback_deploy",
    }
    # (action fields, rationale) pairs in execution order.
    plan = [
        ({"action_type": "query_logs", "service": "database"},
         "Database is the loudest alert; inspect logs for the actual error signature."),
        ({"action_type": "query_deploys", "service": "database"},
         "Pool-acquire errors suggest a config change; check recent database rollouts."),
        ({"action_type": "query_metrics", "service": "database", "metric": "error_rate"},
         "Confirm the error pattern is pool exhaustion rather than compute overload."),
        ({"action_type": "query_logs", "service": "worker"},
         "Rule out the decoy worker deploy by reading worker logs directly."),
        ({"action_type": "submit_hypothesis", "hypothesis": hypothesis},
         "Localize the fault to the database config before remediating."),
        ({"action_type": "rollback_deploy", "service": "database"},
         "Roll back the offending database config rollout."),
        ({"action_type": "restart_service", "service": "database"},
         "Restart the database cleanly against the restored pool config."),
        ({"action_type": "run_check", "check_name": "database_recovery"},
         "Verify database pool health and write latency are back within SLO."),
        ({"action_type": "run_check", "check_name": "end_to_end"},
         "Verify gateway write-path traffic succeeds end-to-end."),
        ({"action_type": "declare_resolved"},
         "Declare resolved only after objective checks pass."),
    ]
    return [
        BaselineStep(action=UnifiedIncidentAction(**fields), rationale=why)
        for fields, why in plan
    ]
|
| 657 |
+
|
| 658 |
+
|
| 659 |
+
def _gateway_auth_rollout_baseline() -> list[BaselineStep]:
    """Return the scripted baseline for the ``gateway_auth_rollout`` scenario.

    The plan inspects the gateway, checks the worker deploy, submits a
    hypothesis, rolls back the gateway (no restart step), runs both checks,
    and declares resolution.
    """
    hypothesis = {
        "root_cause": "api_gateway_fault",
        "affected_services": ["api-gateway", "worker"],
        "confidence": 0.85,
        "recommended_next_action": "rollback_deploy",
    }
    # (action fields, rationale) pairs in execution order.
    plan = [
        ({"action_type": "query_logs", "service": "api-gateway"},
         "Gateway is rejecting logins; read gateway logs to localize the rejection class."),
        ({"action_type": "query_deploys", "service": "api-gateway"},
         "Login rejection aligns with a recent auth middleware rollout; confirm deploy timing."),
        ({"action_type": "query_deploys", "service": "worker"},
         "Rule out the worker deploy explicitly rather than assuming."),
        ({"action_type": "submit_hypothesis", "hypothesis": hypothesis},
         "Commit a calibrated hypothesis localizing to the gateway auth rollout."),
        ({"action_type": "rollback_deploy", "service": "api-gateway"},
         "Roll back the bad auth middleware rollout; no restart needed."),
        ({"action_type": "run_check", "check_name": "end_to_end"},
         "Verify that gateway login traffic now succeeds end-to-end."),
        ({"action_type": "run_check", "check_name": "database_recovery"},
         "Confirm the database is (and stayed) healthy throughout."),
        ({"action_type": "declare_resolved"},
         "Declare resolved only after objective checks pass."),
    ]
    return [
        BaselineStep(action=UnifiedIncidentAction(**fields), rationale=why)
        for fields, why in plan
    ]
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
# Maps a scenario id to the zero-argument function that builds its scripted
# baseline. _baseline_actions() looks builders up here and raises ValueError
# for ids that have no entry.
_BASELINE_BUILDERS = {
    "worker_deploy_cascade": _worker_cascade_baseline,
    "db_config_rollout": _db_config_rollout_baseline,
    "gateway_auth_rollout": _gateway_auth_rollout_baseline,
}
|
| 709 |
+
|
| 710 |
+
|
| 711 |
+
def _baseline_actions(scenario_id: str) -> list[BaselineStep]:
    """Build the baseline step list for ``scenario_id``.

    Raises:
        ValueError: if ``_BASELINE_BUILDERS`` has no builder for the scenario.
    """
    build = _BASELINE_BUILDERS.get(scenario_id)
    if build is not None:
        return build()
    raise ValueError(f"No baseline for scenario_id {scenario_id!r}")
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
def list_baselines(scenario_id: str | None = None) -> BaselineCatalog:
    """Return baseline definitions for one scenario, or for all when none is named.

    Raises:
        ValueError: if ``scenario_id`` is given but is not a key of ``SCENARIOS``
            (or, via ``_baseline_actions``, when a scenario has no builder).
    """
    if scenario_id is None:
        selected = list(SCENARIOS.keys())
    elif scenario_id in SCENARIOS:
        selected = [scenario_id]
    else:
        raise ValueError(f"Unknown scenario_id {scenario_id!r}")

    definitions = []
    for current_id in selected:
        spec = SCENARIOS[current_id]
        definitions.append(
            BaselineDefinition(
                scenario_id=current_id,
                name="deterministic-remediation-baseline",
                description=spec["description"],
                optimal_ticks=spec["optimal_ticks"],
                actions=_baseline_actions(current_id),
            )
        )
    return BaselineCatalog(baselines=definitions)
|
| 736 |
+
|
| 737 |
+
|
| 738 |
+
def set_runtime_progress(progress: dict[str, Any]) -> None:
    """Store a deep copy of ``progress`` as the module-level runtime snapshot."""
    global _RUNTIME_PROGRESS
    # Copy on the way in so later mutation by the caller cannot leak into the snapshot.
    snapshot = deepcopy(progress)
    _RUNTIME_PROGRESS = snapshot
|
| 741 |
+
|
| 742 |
+
|
| 743 |
+
def current_runtime_progress() -> dict[str, Any]:
    """Return a deep copy of the stored runtime snapshot.

    Raises:
        ValueError: if no snapshot has been stored yet.
    """
    stored = _RUNTIME_PROGRESS
    if stored is not None:
        return deepcopy(stored)
    raise ValueError("Runtime progress is not initialized")
|
| 747 |
+
|
| 748 |
+
|
| 749 |
+
def grade_episode(state: dict[str, Any]):
    """Build a grading report for ``state`` using the scenario it references.

    The scenario is taken from ``state["scenario_id"]``, falling back to
    ``DEFAULT_SCENARIO_ID`` when the key is absent.
    """
    # Imported here rather than at module level; presumably to avoid an import cycle with the grader module — confirm.
    from .grader import UnifiedIncidentGrader

    chosen_id = state.get("scenario_id", DEFAULT_SCENARIO_ID)
    grader = UnifiedIncidentGrader()
    return grader.build_report(state, get_scenario(chosen_id))
|
unified_incident_env/server/environment.py
ADDED
|
@@ -0,0 +1,613 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Honest narrow incident-remediation environment core."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import uuid
|
| 7 |
+
from typing import Any
|
| 8 |
+
|
| 9 |
+
from openenv.core.env_server import Environment
|
| 10 |
+
from openenv.core.env_server.types import EnvironmentMetadata
|
| 11 |
+
|
| 12 |
+
from ..models import (
|
| 13 |
+
Alert,
|
| 14 |
+
CheckResult,
|
| 15 |
+
ServiceHealth,
|
| 16 |
+
UnifiedIncidentAction,
|
| 17 |
+
UnifiedIncidentObservation,
|
| 18 |
+
UnifiedIncidentState,
|
| 19 |
+
)
|
| 20 |
+
from .challenge import DEFAULT_SCENARIO_ID, SCENARIOS, get_scenario, scenario_for_difficulty, set_runtime_progress
|
| 21 |
+
from .grader import UnifiedIncidentGrader
|
| 22 |
+
|
| 23 |
+
# Names of the services in this environment, in a fixed order.
SERVICE_ORDER = ("api-gateway", "cache", "database", "worker")
# Every action_type string that step() dispatches on.
ALL_ACTIONS = [
    "query_logs",
    "query_metrics",
    "query_dependencies",
    "query_deploys",
    "rollback_deploy",
    "restart_service",
    "run_check",
    "isolate_service",
    "escalate",
    "submit_hypothesis",
    "declare_resolved",
]
# Action fields each action_type needs; an empty list means no extra field.
REQUIRED_FIELDS_BY_ACTION: dict[str, list[str]] = {
    "query_logs": ["service"],
    "query_metrics": ["service", "metric"],
    "query_dependencies": ["service"],
    "query_deploys": ["service"],
    "rollback_deploy": ["service"],
    "restart_service": ["service"],
    "run_check": ["check_name"],
    "isolate_service": ["service"],
    "escalate": [],
    "submit_hypothesis": ["hypothesis"],
    "declare_resolved": [],
}
# Numeric weight per service status string.
# NOTE(review): presumably consumed by the health-potential computation, which is outside this view — confirm.
STATUS_VALUES = {
    "healthy": 1.0,
    "degraded": 0.4,
    "crashed": 0.0,
    "isolated": 0.2,
}
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class UnifiedIncidentEnvironment(Environment[UnifiedIncidentAction, UnifiedIncidentObservation, UnifiedIncidentState]):
|
| 59 |
+
"""A bounded-action incident diagnosis and safe remediation environment."""
|
| 60 |
+
|
| 61 |
+
SUPPORTS_CONCURRENT_SESSIONS = False
|
| 62 |
+
|
| 63 |
+
def __init__(self) -> None:
    """Create the grader and start an episode on the default scenario."""
    super().__init__()
    self._grader = UnifiedIncidentGrader()
    default_scenario = get_scenario(DEFAULT_SCENARIO_ID)
    self._episode = self._make_episode(default_scenario)
    # Publish the initial state to the module-level runtime snapshot.
    set_runtime_progress(self._state_dict())
|
| 68 |
+
|
| 69 |
+
def get_metadata(self) -> EnvironmentMetadata:
    """Return the static name, description, version, and author of this environment."""
    summary = (
        "A narrow incident diagnosis and safe remediation environment with bounded actions, "
        "world-state transitions, explicit checks, and effect-based rewards."
    )
    return EnvironmentMetadata(
        name="unified_incident_env",
        description=summary,
        version="2.0.0",
        author="Daksh Verma",
    )
|
| 79 |
+
|
| 80 |
+
def reset(self, seed: int | None = None, episode_id: str | None = None, **kwargs: Any) -> UnifiedIncidentObservation:
    """Start a fresh episode and return its first observation.

    Args:
        seed: Accepted but discarded.
        episode_id: Identifier for the new episode; forwarded to ``_make_episode``.
        **kwargs: ``scenario_id`` selects a scenario directly; otherwise
            ``difficulty`` selects one by difficulty; otherwise the default
            scenario is used. ``scenario_id`` wins when both are given.
    """
    del seed
    requested_id = kwargs.get("scenario_id")
    requested_level = kwargs.get("difficulty")

    if requested_id:
        chosen = get_scenario(requested_id)
    elif requested_level:
        chosen = scenario_for_difficulty(requested_level)
    else:
        chosen = get_scenario(DEFAULT_SCENARIO_ID)

    self._episode = self._make_episode(chosen, episode_id=episode_id)
    set_runtime_progress(self._state_dict())

    first_observation = self._build_observation(
        last_action_result="Episode reset.",
        tool_output=None,
        reward=0.0,
        done=False,
    )
    return first_observation
|
| 98 |
+
|
| 99 |
+
def step(self, action: UnifiedIncidentAction | dict[str, Any], timeout_s: float | None = None, **kwargs: Any) -> UnifiedIncidentObservation:
    """Apply one agent action, advance the episode by a tick, and return the observation.

    Args:
        action: The action to run. A plain dict is converted with
            ``UnifiedIncidentAction(**action)``.
        timeout_s: Accepted but discarded.
        **kwargs: Accepted but discarded.

    Returns:
        The observation built after the action, carrying the step reward
        (rounded to 4 places) and the episode's ``done`` flag. If the episode
        was already done, a zero-reward "Episode complete" observation is
        returned and no state is changed.
    """
    del timeout_s, kwargs
    if isinstance(action, dict):
        action = UnifiedIncidentAction(**action)

    # A finished episode ignores further actions: no tick, no reward.
    if self._episode["done"]:
        return self._build_observation(
            last_action_result="Episode complete. Reset to start another run.",
            tool_output=None,
            reward=0.0,
            done=True,
        )

    self._episode["tick"] += 1
    self._episode["step_count"] += 1
    # Potential is sampled before and after the action; the difference feeds the reward.
    before_potential = self._incident_health_potential()
    base_step_cost = float(self._episode["scenario"]["reward_config"]["step_cost"])
    penalty = 0.0
    bonus = 0.0
    tool_output: str | None = None
    state_changed = False
    useful_observation = False

    # Failure and loop feedback describe only the current step, so clear the previous step's values.
    self._episode["failure_type"] = None
    self._episode["why_failed"] = None
    self._episode["loop_warning"] = None

    # Dispatch on action type. Query actions count as useful only the first
    # time their evidence key is recorded (see _mark_evidence_once).
    if action.action_type == "query_logs":
        tool_output = self._query_logs(action.service)
        useful_observation = self._mark_evidence_once(f"logs:{action.service}", tool_output)
        last_action_result = f"Queried logs for {action.service}."
    elif action.action_type == "query_metrics":
        tool_output = self._query_metrics(action.service, action.metric)
        useful_observation = self._mark_evidence_once(f"metrics:{action.service}:{action.metric}", tool_output)
        last_action_result = f"Queried {action.metric} for {action.service}."
    elif action.action_type == "query_dependencies":
        tool_output = self._query_dependencies(action.service)
        useful_observation = self._mark_evidence_once(f"deps:{action.service}", tool_output)
        last_action_result = f"Queried dependencies for {action.service}."
    elif action.action_type == "query_deploys":
        tool_output = self._query_deploys(action.service)
        useful_observation = self._mark_evidence_once(f"deploys:{action.service}", tool_output)
        last_action_result = f"Queried deploy history for {action.service}."
    elif action.action_type == "submit_hypothesis":
        bonus, useful_observation, last_action_result = self._submit_hypothesis(action)
    elif action.action_type == "rollback_deploy":
        state_changed, penalty, last_action_result = self._rollback_deploy(action.service)
    elif action.action_type == "restart_service":
        state_changed, penalty, last_action_result = self._restart_service(action.service)
    elif action.action_type == "isolate_service":
        state_changed, penalty, last_action_result = self._isolate_service(action.service)
    elif action.action_type == "run_check":
        tool_output, useful_observation, last_action_result = self._run_check(action.check_name)
    elif action.action_type == "escalate":
        # The evidence key includes the tick, so every escalation uses a distinct key.
        useful_observation = self._mark_evidence_once(
            f"escalate:{self._episode['tick']}",
            "Escalation note recorded: expert attention requested while keeping the environment state unchanged.",
        )
        last_action_result = "Escalated for human attention."
        tool_output = "Escalation does not fix the incident, but records that expert attention was requested."
    elif action.action_type == "declare_resolved":
        resolved, penalty, bonus, last_action_result = self._declare_resolved()
        state_changed = resolved
    else:
        last_action_result = f"Unsupported action {action.action_type!r}."
        penalty += self._unsafe_penalty()
        self._set_failure("unsupported_action", "That action is not part of this honest narrow environment.")

    self._advance_world()
    self._refresh_alerts()
    self._update_loop_feedback(action, useful_observation or state_changed)
    after_potential = self._incident_health_potential()

    # Reward = potential gained this step + bonus - fixed step cost - penalty.
    reward = -base_step_cost + (after_potential - before_potential) + bonus - penalty
    # A step that learned nothing, changed nothing, and earned no bonus is counted as wasteful.
    if not useful_observation and not state_changed and bonus <= 0.0:
        self._episode["wasteful_ticks"] += 1

    # Running out of ticks ends the episode even if the incident is unresolved.
    if self._episode["tick"] >= self._episode["max_ticks"] and not self._episode["done"]:
        self._episode["done"] = True
        last_action_result = f"{last_action_result} Tick budget exhausted.".strip()

    self._episode["last_action_result"] = last_action_result
    self._episode["workflow_stage"] = self._workflow_stage()
    self._episode["score_breakdown"] = self._grader.compute_breakdown(self._state_dict(), self._episode["scenario"])
    self._episode["final_score"] = self._episode["score_breakdown"]["final_score"]
    self._episode["cumulative_reward"] = round(self._episode["cumulative_reward"] + reward, 4)

    # Publish the post-step state to the module-level runtime snapshot.
    set_runtime_progress(self._state_dict())
    return self._build_observation(
        last_action_result=last_action_result,
        tool_output=tool_output,
        reward=round(reward, 4),
        done=self._episode["done"],
    )
|
| 193 |
+
|
| 194 |
+
@property
def state(self) -> UnifiedIncidentState:
    """Return the current episode state as a ``UnifiedIncidentState`` model."""
    snapshot = self._state_dict()
    return UnifiedIncidentState(**snapshot)
|
| 197 |
+
|
| 198 |
+
def _make_episode(self, scenario: dict[str, Any], episode_id: str | None = None) -> dict[str, Any]:
    """Build the mutable episode dict for ``scenario``.

    Args:
        scenario: Scenario definition. Read here: ``initial_services``,
            ``initial_alerts``, ``deploy_history``, ``max_ticks``,
            ``difficulty``, and optionally ``remediation_recipe``,
            ``degraded_user_impact``, and ``degraded_slo_burn``.
        episode_id: Identifier to use; a random UUID4 string when falsy.

    Returns:
        A fresh episode dict at tick 0 with both checks unpassed and no
        evidence, hypotheses, or remediation recorded.
    """
    services = {
        name: ServiceHealth(name=name, **payload)
        for name, payload in scenario["initial_services"].items()
    }
    # Both checks start unpassed.
    checks = {
        "database_recovery": CheckResult(name="database_recovery", passed=False, detail="Database recovery has not been verified yet."),
        "end_to_end": CheckResult(name="end_to_end", passed=False, detail="End-to-end health has not been verified yet."),
    }
    recipe = scenario.get("remediation_recipe", {})
    rollback_target = recipe.get("rollback_target", "worker")
    # Surface the rollback target's deploy entry when it has one; otherwise fall back to the worker's.
    recent_deploy_service = rollback_target if rollback_target in scenario["deploy_history"] else "worker"
    return {
        "episode_id": episode_id or str(uuid.uuid4()),
        "scenario": scenario,
        "tick": 0,
        "step_count": 0,
        "max_ticks": scenario["max_ticks"],
        "difficulty": scenario["difficulty"],
        "services": services,
        "alerts": [Alert(**payload) for payload in scenario["initial_alerts"]],
        "discovered_evidence": [],
        "evidence_seen": set(),
        # Becomes [""] when the chosen service has no deploy_history entry.
        "recent_deploys": [scenario["deploy_history"].get(recent_deploy_service, "")],
        "checks": checks,
        "user_impact": scenario.get("degraded_user_impact", 0.82),
        "slo_burn_rate": scenario.get("degraded_slo_burn", 0.91),
        "containment_applied": False,
        "cause_removed": False,
        "isolated_service": None,
        "hypothesis_seen": set(),
        "failure_type": None,
        "why_failed": None,
        "loop_warning": None,
        "last_action_key": None,
        "repeat_count": 0,
        "incident_resolved": False,
        "workflow_stage": "triage",
        "cumulative_reward": 0.0,
        "wasteful_ticks": 0,
        # Placeholder breakdown; step() overwrites it with the grader's output.
        "score_breakdown": {
            "recovery_score": 0.0,
            "containment_score": 0.0,
            "verification_score": 0.0,
            "impact_score": 0.0,
            "efficiency_score": 0.10,
            "final_score": 0.10,
        },
        "final_score": 0.10,
        "last_action_result": "",
        "done": False,
    }
|
| 250 |
+
|
| 251 |
+
def _query_logs(self, service: str | None) -> str:
    """Return the scenario's canned log text for ``service`` (KeyError if absent)."""
    assert service is not None
    log_table = self._episode["scenario"]["logs"]
    return log_table[service]
|
| 254 |
+
|
| 255 |
+
def _query_metrics(self, service: str | None, metric: str | None) -> str:
    """Return the scenario's canned reading of ``metric`` for ``service`` (KeyError if absent)."""
    assert service is not None and metric is not None
    per_service = self._episode["scenario"]["metrics"][service]
    return per_service[metric]
|
| 258 |
+
|
| 259 |
+
def _query_dependencies(self, service: str | None) -> str:
    """Return the scenario's canned dependency description for ``service`` (KeyError if absent)."""
    assert service is not None
    dependency_table = self._episode["scenario"]["dependencies"]
    return dependency_table[service]
|
| 262 |
+
|
| 263 |
+
def _query_deploys(self, service: str | None) -> str:
    """Return the scenario's canned deploy history for ``service`` (KeyError if absent)."""
    assert service is not None
    history = self._episode["scenario"]["deploy_history"]
    return history[service]
|
| 266 |
+
|
| 267 |
+
def _submit_hypothesis(self, action: UnifiedIncidentAction) -> tuple[float, bool, str]:
    """Score a submitted hypothesis against the scenario's ground truth.

    Args:
        action: Action whose ``hypothesis`` carries ``root_cause``,
            ``affected_services``, ``confidence``, and ``recommended_next_action``.

    Returns:
        ``(bonus, useful, message)``. A hypothesis that was already submitted
        earns ``(0.0, False, ...)``. Otherwise ``bonus`` is the weighted sum of
        root-cause match (0.04), service overlap (0.03), next-action quality
        (0.03), and confidence calibration (0.02), rounded to 4 places.
    """
    assert action.hypothesis is not None
    payload = action.hypothesis

    # Build the dedup key. ``affected_services`` is scored as a set below, so
    # its order and duplicates must not make a resubmission look new and earn
    # the bonus a second time.
    fingerprint = payload.model_dump()
    listed_services = fingerprint.get("affected_services")
    if listed_services is not None:
        fingerprint["affected_services"] = sorted(set(listed_services))
    normalized = json.dumps(fingerprint, sort_keys=True)
    if normalized in self._episode["hypothesis_seen"]:
        return 0.0, False, "Repeated hypothesis recorded with no additional reward."
    self._episode["hypothesis_seen"].add(normalized)

    truth = self._episode["scenario"]["truth"]
    cause_match = 1.0 if payload.root_cause == truth["root_cause"] else 0.0

    # Fraction of the truly affected services that the hypothesis named.
    # Guard the empty case so a scenario with no listed services cannot divide by zero.
    truth_services = set(truth["affected_services"])
    if truth_services:
        service_match = len(set(payload.affected_services) & truth_services) / len(truth_services)
    else:
        service_match = 0.0

    action_quality = 1.0 if payload.recommended_next_action == truth["best_next_action"] else -0.4

    # High confidence is rewarded when right and punished hardest when wrong.
    if cause_match == 1.0:
        calibration = 1.0 if payload.confidence >= 0.7 else 0.5
    else:
        calibration = -1.0 if payload.confidence >= 0.7 else -0.2

    reward = (0.04 * cause_match) + (0.03 * service_match) + (0.03 * action_quality) + (0.02 * calibration)
    return round(reward, 4), True, "Hypothesis recorded. Reward reflects root-cause accuracy, service localization, confidence calibration, and next-action quality."
|
| 284 |
+
|
| 285 |
+
def _recipe(self) -> dict[str, Any]:
    """Return the active scenario's ``remediation_recipe``, or an empty dict when it has none."""
    active_scenario = self._episode["scenario"]
    return active_scenario.get("remediation_recipe", {})
|
| 287 |
+
|
| 288 |
+
def _failure_message(self, key: str, default: str) -> str:
    """Return the scenario's failure message for ``key``, or ``default`` when none is defined."""
    overrides = self._episode["scenario"].get("failure_messages", {})
    return overrides.get(key, default)
|
| 290 |
+
|
| 291 |
+
def _apply_service_updates(self, updates: dict[str, dict[str, Any]]) -> None:
    """Replace each named service's health record with one built from ``updates``.

    Services not mentioned in ``updates`` are left untouched.
    """
    for service_name, fields in updates.items():
        refreshed = ServiceHealth(name=service_name, **fields)
        self._episode["services"][service_name] = refreshed
|
| 294 |
+
|
| 295 |
+
def _rollback_deploy(self, service: str | None) -> tuple[bool, float, str]:
    """Roll back ``service``'s deploy if it is the recipe's rollback target.

    Returns:
        ``(state_changed, penalty, message)``. A wrong (or undefined) target
        records a failure and costs the unsafe penalty. Repeating a rollback
        that was already applied is a free no-op.
    """
    assert service is not None
    target = self._recipe().get("rollback_target")

    if target is None or service != target:
        why = self._failure_message("wrong_rollback_target", "Rolling back a service without a causal link wastes time and risk.")
        self._set_failure("wrong_remediation_target", why)
        return False, self._unsafe_penalty(), f"Rollback on {service} did not address the incident."

    if self._episode["cause_removed"]:
        return False, 0.0, f"{target} deploy is already rolled back."

    self._episode["cause_removed"] = True
    self._episode["containment_applied"] = True
    self._apply_service_updates(self._episode["scenario"].get("post_rollback_services", {}))

    active_scenario = self._episode["scenario"]
    # Rollback may only lower impact and burn; it never raises them.
    for state_key, scenario_key in (
        ("user_impact", "post_rollback_user_impact"),
        ("slo_burn_rate", "post_rollback_slo_burn"),
    ):
        current = self._episode[state_key]
        self._episode[state_key] = min(current, active_scenario.get(scenario_key, current))

    return True, 0.0, f"Rolled back the {target} deploy; the underlying cause is removed."
|
| 314 |
+
|
| 315 |
+
def _restart_service(self, service: str | None) -> tuple[bool, float, str]:
    """Restart ``service`` if it is the recipe's restart target and the cause is gone.

    Returns:
        ``(state_changed, penalty, message)``. A wrong (or undefined) target
        costs half the unsafe penalty. Restarting before the cause is removed,
        when the recipe requires that (the default), costs the full penalty.
    """
    assert service is not None
    recipe = self._recipe()
    target = recipe.get("restart_target")

    if target is None or service != target:
        why = self._failure_message("low_value_restart", f"Restarting {service} is not the safe next remediation step for this incident.")
        self._set_failure("low_value_restart", why)
        return False, self._unsafe_penalty() / 2, f"Restarting {service} had little or no positive effect."

    needs_cause_removed = recipe.get("restart_requires_cause_removed", True)
    if needs_cause_removed and not self._episode["cause_removed"]:
        why = self._failure_message("premature_restart", f"Restarting {service} before removing the trigger only causes another failure.")
        self._set_failure("premature_restart", why)
        return False, self._unsafe_penalty(), f"Restart of {service} failed because the triggering cause is still present."

    self._apply_service_updates(self._episode["scenario"].get("post_restart_services", {}))
    active_scenario = self._episode["scenario"]
    # Impact and burn take the scenario's post-restart values when it defines them.
    for state_key, scenario_key in (
        ("user_impact", "post_restart_user_impact"),
        ("slo_burn_rate", "post_restart_slo_burn"),
    ):
        self._episode[state_key] = active_scenario.get(scenario_key, self._episode[state_key])

    return True, 0.0, f"{service} restarted cleanly after the triggering cause was removed."
|
| 336 |
+
|
| 337 |
+
def _isolate_service(self, service: str | None) -> tuple[bool, float, str]:
|
| 338 |
+
assert service is not None
|
| 339 |
+
recipe = self._recipe()
|
| 340 |
+
isolate_target = recipe.get("isolate_target")
|
| 341 |
+
if isolate_target is None or service != isolate_target:
|
| 342 |
+
self._set_failure(
|
| 343 |
+
"wrong_isolation_target",
|
| 344 |
+
self._failure_message("wrong_isolation_target", f"Isolating {service} does not contain the dominant failure path."),
|
| 345 |
+
)
|
| 346 |
+
return False, self._unsafe_penalty() / 2, f"Isolation of {service} did not materially reduce blast radius."
|
| 347 |
+
if self._episode["isolated_service"] == isolate_target:
|
| 348 |
+
return False, 0.0, f"{isolate_target} is already isolated."
|
| 349 |
+
self._episode["isolated_service"] = isolate_target
|
| 350 |
+
self._episode["containment_applied"] = True
|
| 351 |
+
self._apply_service_updates(self._episode["scenario"].get("post_isolate_services", {}))
|
| 352 |
+
scenario = self._episode["scenario"]
|
| 353 |
+
self._episode["user_impact"] = scenario.get("post_isolate_user_impact", self._episode["user_impact"])
|
| 354 |
+
self._episode["slo_burn_rate"] = scenario.get("post_isolate_slo_burn", self._episode["slo_burn_rate"])
|
| 355 |
+
return True, 0.0, f"{isolate_target} isolated. Blast radius shrank, but full resolution still requires addressing the root cause."
|
| 356 |
+
|
| 357 |
+
def _run_check(self, check_name: str | None) -> tuple[str, bool, str]:
    """Evaluate a verification check and store its result on the episode.

    ``"database_recovery"`` passes when the database is healthy and, if the
    recipe's ``incident_driver`` is ``"worker"`` or ``"database"``, the cause
    has also been removed. Any other ``check_name`` is evaluated as the
    end-to-end check: gateway, database and worker healthy, cause removed and
    no service isolated.

    Returns ``(detail, useful, message)`` where ``useful`` is whether this
    exact check outcome had not been recorded as evidence before.
    """
    assert check_name is not None
    recipe = self._recipe()
    episode = self._episode
    isolated = episode["isolated_service"]
    cause_removed = episode["cause_removed"]
    services = episode["services"]

    if check_name == "database_recovery":
        db_healthy = services["database"].status == "healthy"
        driver_needs_cause_removed = recipe.get("incident_driver") in {"worker", "database"}
        passed = (db_healthy and cause_removed) if driver_needs_cause_removed else db_healthy
        if passed:
            detail = "Database is healthy and no longer failing."
        else:
            detail = "Database is still unstable or the triggering cause is still present."
    else:
        # All three statuses are read up front, before the combined condition.
        gateway_healthy, db_healthy, worker_healthy = [
            services[name].status == "healthy" for name in ("api-gateway", "database", "worker")
        ]
        passed = gateway_healthy and db_healthy and worker_healthy and cause_removed and isolated is None
        if passed:
            detail = "End-to-end login traffic is healthy."
        else:
            detail = "End-to-end traffic still fails or remains degraded."

    episode["checks"][check_name] = CheckResult(name=check_name, passed=passed, detail=detail)
    # The evidence key includes the outcome, so a changed result counts as new evidence.
    useful = self._mark_evidence_once(f"check:{check_name}:{passed}", detail)
    return detail, useful, f"Ran {check_name} check."
|
| 394 |
+
|
| 395 |
+
def _declare_resolved(self) -> tuple[bool, float, float, str]:
|
| 396 |
+
checks = self._episode["checks"]
|
| 397 |
+
resolution_check = self._recipe().get("resolution_check", "end_to_end")
|
| 398 |
+
safe_to_resolve = bool(checks.get(resolution_check) and checks[resolution_check].passed)
|
| 399 |
+
if not safe_to_resolve:
|
| 400 |
+
self._set_failure("premature_resolution", "The incident is not verified as resolved yet.")
|
| 401 |
+
return False, self._episode["scenario"]["reward_config"]["premature_resolution_penalty"], 0.0, "Resolution declaration rejected: required checks have not passed."
|
| 402 |
+
self._episode["incident_resolved"] = True
|
| 403 |
+
self._episode["done"] = True
|
| 404 |
+
return True, 0.0, self._episode["scenario"]["reward_config"]["successful_resolution_bonus"], "Incident declared resolved after passing objective checks."
|
| 405 |
+
|
| 406 |
+
def _mark_evidence_once(self, key: str, detail: str) -> bool:
|
| 407 |
+
if key in self._episode["evidence_seen"]:
|
| 408 |
+
return False
|
| 409 |
+
self._episode["evidence_seen"].add(key)
|
| 410 |
+
self._episode["discovered_evidence"].append(detail)
|
| 411 |
+
return True
|
| 412 |
+
|
| 413 |
+
def _unsafe_penalty(self) -> float:
|
| 414 |
+
return float(self._episode["scenario"]["reward_config"]["unsafe_action_penalty"])
|
| 415 |
+
|
| 416 |
+
def _set_failure(self, failure_type: str, why_failed: str) -> None:
|
| 417 |
+
self._episode["failure_type"] = failure_type
|
| 418 |
+
self._episode["why_failed"] = why_failed
|
| 419 |
+
|
| 420 |
+
def _advance_world(self) -> None:
|
| 421 |
+
cause_removed = self._episode["cause_removed"]
|
| 422 |
+
isolated = self._episode["isolated_service"]
|
| 423 |
+
if not cause_removed and isolated is None:
|
| 424 |
+
self._apply_service_updates(self._episode["scenario"].get("degraded_services", {}))
|
| 425 |
+
scenario = self._episode["scenario"]
|
| 426 |
+
self._episode["user_impact"] = max(self._episode["user_impact"], scenario.get("degraded_user_impact", self._episode["user_impact"]))
|
| 427 |
+
self._episode["slo_burn_rate"] = max(self._episode["slo_burn_rate"], scenario.get("degraded_slo_burn", self._episode["slo_burn_rate"]))
|
| 428 |
+
if isolated is not None and not cause_removed:
|
| 429 |
+
self._episode["containment_applied"] = True
|
| 430 |
+
self._episode["workflow_stage"] = self._workflow_stage()
|
| 431 |
+
|
| 432 |
+
def _refresh_alerts(self) -> None:
    """Rebuild the episode's alert list from current service health.

    Crashed services raise a critical alert and degraded ones a warning, in
    ``SERVICE_ORDER``. If user impact is at least 0.3 and the gateway has no
    alert yet, a gateway warning about user-visible impact is appended.
    """
    episode = self._episode
    rebuilt: list[Alert] = []

    for service_name in SERVICE_ORDER:
        status = episode["services"][service_name].status
        if status == "crashed":
            alert = Alert(service=service_name, severity="critical", message=f"{service_name} is unavailable.")
        elif status == "degraded":
            alert = Alert(service=service_name, severity="warning", message=f"{service_name} is degraded.")
        else:
            continue
        rebuilt.append(alert)

    if episode["user_impact"] >= 0.3 and all(alert.service != "api-gateway" for alert in rebuilt):
        rebuilt.append(Alert(service="api-gateway", severity="warning", message="User-visible impact remains elevated."))

    episode["alerts"] = rebuilt
|
| 443 |
+
|
| 444 |
+
def _update_loop_feedback(self, action: UnifiedIncidentAction, progressed: bool) -> None:
|
| 445 |
+
action_key = repr(action.model_dump(exclude_none=True))
|
| 446 |
+
if progressed:
|
| 447 |
+
self._episode["last_action_key"] = action_key
|
| 448 |
+
self._episode["repeat_count"] = 0
|
| 449 |
+
return
|
| 450 |
+
if self._episode["last_action_key"] == action_key:
|
| 451 |
+
self._episode["repeat_count"] += 1
|
| 452 |
+
else:
|
| 453 |
+
self._episode["repeat_count"] = 1
|
| 454 |
+
self._episode["last_action_key"] = action_key
|
| 455 |
+
if self._episode["repeat_count"] >= 2:
|
| 456 |
+
self._episode["loop_warning"] = "The same no-progress action has repeated; choose a different evidence source or remediation step."
|
| 457 |
+
|
| 458 |
+
def _workflow_stage(self) -> str:
|
| 459 |
+
if self._episode["incident_resolved"]:
|
| 460 |
+
return "resolved"
|
| 461 |
+
checks = self._episode["checks"]
|
| 462 |
+
if checks["database_recovery"].passed or checks["end_to_end"].passed:
|
| 463 |
+
return "validation"
|
| 464 |
+
if self._episode["containment_applied"] or self._episode["cause_removed"] or self._episode["isolated_service"] is not None:
|
| 465 |
+
return "mitigation"
|
| 466 |
+
return "triage"
|
| 467 |
+
|
| 468 |
+
def _allowed_actions(self) -> list[str]:
    """Return a fresh list containing every entry of ``ALL_ACTIONS``."""
    return [*ALL_ACTIONS]
|
| 470 |
+
|
| 471 |
+
def _required_fields_by_action(self) -> dict[str, list[str]]:
    """Map each allowed action to its entry in ``REQUIRED_FIELDS_BY_ACTION``."""
    required: dict[str, list[str]] = {}
    for action_name in self._allowed_actions():
        required[action_name] = REQUIRED_FIELDS_BY_ACTION[action_name]
    return required
|
| 473 |
+
|
| 474 |
+
def _progress_flags(self) -> dict[str, bool]:
|
| 475 |
+
checks = self._episode["checks"]
|
| 476 |
+
return {
|
| 477 |
+
"containment_applied": self._episode["containment_applied"],
|
| 478 |
+
"cause_removed": self._episode["cause_removed"],
|
| 479 |
+
"database_recovery": checks["database_recovery"].passed,
|
| 480 |
+
"end_to_end": checks["end_to_end"].passed,
|
| 481 |
+
"incident_resolved": self._episode["incident_resolved"],
|
| 482 |
+
"isolation_applied": self._episode["isolated_service"] is not None,
|
| 483 |
+
}
|
| 484 |
+
|
| 485 |
+
def _incident_summary(self) -> str:
|
| 486 |
+
description = self._episode["scenario"].get("description")
|
| 487 |
+
if description:
|
| 488 |
+
return description
|
| 489 |
+
return (
|
| 490 |
+
"An incident is degrading user traffic. Use evidence-gathering actions to diagnose, "
|
| 491 |
+
"then choose a safe remediation and verify with explicit checks."
|
| 492 |
+
)
|
| 493 |
+
|
| 494 |
+
def _prompt_text(self, tool_output: str | None) -> str:
    """Render the episode as a newline-joined, sectioned text prompt.

    Sections, in order: tick and workflow stage, incident summary, active
    alerts, per-service health (in ``SERVICE_ORDER``), impact/burn and the last
    action's outcome, check results, and the allowed actions. Empty values in
    the outcome section are rendered as ``none``.
    """
    episode = self._episode

    lines = [
        f"TICK {episode['tick']}/{episode['max_ticks']}",
        f"WORKFLOW_STAGE: {episode['workflow_stage']}",
        "",
        "INCIDENT_SUMMARY:",
        self._incident_summary(),
        "",
        "ACTIVE_ALERTS:",
    ]

    active_alerts = episode["alerts"]
    if not active_alerts:
        lines.append("- none")
    else:
        for alert in active_alerts:
            lines.append(f"- [{alert.severity.upper()}] {alert.service}: {alert.message}")

    lines += ["", "SERVICES:"]
    for service_name in SERVICE_ORDER:
        health = episode["services"][service_name]
        lines.append(
            f"- {service_name}: {health.status} cpu={health.cpu_pct:.1f} mem={health.memory_pct:.1f} err={health.error_rate_pct:.1f} latency={health.latency_ms:.1f}"
        )

    lines += [
        "",
        f"USER_IMPACT: {episode['user_impact']:.2f}",
        f"SLO_BURN_RATE: {episode['slo_burn_rate']:.2f}",
        f"LAST_ACTION_RESULT: {episode['last_action_result'] or 'none'}",
        f"TOOL_OUTPUT: {tool_output or 'none'}",
        f"FAILURE_TYPE: {episode['failure_type'] or 'none'}",
        f"WHY_FAILED: {episode['why_failed'] or 'none'}",
        "",
        "CHECKS:",
    ]
    # A check that has not passed is shown as "pending", whether or not it has been run.
    lines += [
        f"- {check.name}: {'passed' if check.passed else 'pending'} - {check.detail}"
        for check in episode["checks"].values()
    ]

    lines += ["", "ALLOWED_ACTIONS:"]
    lines += [f"- {action}" for action in self._allowed_actions()]
    return "\n".join(lines)
|
| 536 |
+
|
| 537 |
+
def _incident_health_potential(self) -> float:
    """Compute a weighted health score for the current episode, rounded to 4 places.

    Combines four terms: weighted service status (0.55), relief from user
    impact (0.2), relief from SLO burn (0.15) and whether containment is
    applied (0.10). Service status values come from ``STATUS_VALUES`` and the
    per-service weights from the scenario's ``critical_service_weights``.
    """
    episode = self._episode
    weights = episode["scenario"]["critical_service_weights"]
    services = episode["services"]

    # Kept as a single sum() call so the float result matches the built-in's summation.
    operational = sum(weights.get(name, 0.0) * STATUS_VALUES[services[name].status] for name in weights)
    impact_relief = 1.0 - episode["user_impact"]
    burn_relief = 1.0 - episode["slo_burn_rate"]
    containment = float(bool(episode["containment_applied"]))

    potential = 0.55 * operational
    potential = potential + (0.2 * impact_relief)
    potential = potential + (0.15 * burn_relief)
    potential = potential + (0.10 * containment)
    return round(potential, 4)
|
| 545 |
+
|
| 546 |
+
def _state_dict(self) -> dict[str, Any]:
|
| 547 |
+
return {
|
| 548 |
+
"episode_id": self._episode["episode_id"],
|
| 549 |
+
"step_count": self._episode["step_count"],
|
| 550 |
+
"scenario_id": self._episode["scenario"]["id"],
|
| 551 |
+
"difficulty": self._episode["difficulty"],
|
| 552 |
+
"current_tick": self._episode["tick"],
|
| 553 |
+
"max_ticks": self._episode["max_ticks"],
|
| 554 |
+
"workflow_stage": self._episode["workflow_stage"],
|
| 555 |
+
"active_alerts": [alert.model_dump() for alert in self._episode["alerts"]],
|
| 556 |
+
"service_health": {name: service.model_dump() for name, service in self._episode["services"].items()},
|
| 557 |
+
"discovered_evidence": list(self._episode["discovered_evidence"]),
|
| 558 |
+
"recent_deploys": list(self._episode["recent_deploys"]),
|
| 559 |
+
"checks": [check.model_dump() for check in self._episode["checks"].values()],
|
| 560 |
+
"user_impact": self._episode["user_impact"],
|
| 561 |
+
"slo_burn_rate": self._episode["slo_burn_rate"],
|
| 562 |
+
"incident_resolved": self._episode["incident_resolved"],
|
| 563 |
+
"containment_applied": self._episode["containment_applied"],
|
| 564 |
+
"allowed_actions": self._allowed_actions(),
|
| 565 |
+
"required_fields_by_action": self._required_fields_by_action(),
|
| 566 |
+
"valid_action_example": None,
|
| 567 |
+
"progress_flags": self._progress_flags(),
|
| 568 |
+
"final_score": self._episode["final_score"],
|
| 569 |
+
"score_breakdown": dict(self._episode["score_breakdown"]),
|
| 570 |
+
"cumulative_reward": self._episode["cumulative_reward"],
|
| 571 |
+
"wasteful_ticks": self._episode["wasteful_ticks"],
|
| 572 |
+
"last_action_result": self._episode["last_action_result"],
|
| 573 |
+
"failure_type": self._episode["failure_type"],
|
| 574 |
+
"why_failed": self._episode["why_failed"],
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
def _build_observation(self, last_action_result: str, tool_output: str | None, reward: float, done: bool) -> UnifiedIncidentObservation:
    """Assemble the observation returned to the agent from the episode state.

    ``reward`` is rounded to 4 places. The security-related fields and
    ``valid_action_example`` / ``best_recovery_action_family`` are always
    emitted with fixed empty values, and ``common_trap`` carries the scenario's
    ``description``.
    """
    episode = self._episode
    fields = {
        "prompt_text": self._prompt_text(tool_output),
        "incident_summary": self._incident_summary(),
        "tick_count": episode["tick"],
        "max_ticks": episode["max_ticks"],
        "difficulty": episode["difficulty"],
        "workflow_stage": episode["workflow_stage"],
        # Containers are copied so the observation does not alias episode state.
        "active_alerts": list(episode["alerts"]),
        "service_health": dict(episode["services"]),
        "discovered_evidence": list(episode["discovered_evidence"]),
        "recent_deploys": list(episode["recent_deploys"]),
        "checks": list(episode["checks"].values()),
        "user_impact": episode["user_impact"],
        "slo_burn_rate": episode["slo_burn_rate"],
        "incident_resolved": episode["incident_resolved"],
        "containment_applied": episode["containment_applied"],
        "last_action_result": last_action_result,
        "tool_output": tool_output,
        "failure_type": episode["failure_type"],
        "why_failed": episode["why_failed"],
        "allowed_actions": self._allowed_actions(),
        "required_fields_by_action": self._required_fields_by_action(),
        "valid_action_example": None,
        "common_trap": episode["scenario"].get("description"),
        "loop_warning": episode["loop_warning"],
        "blocked_until_security_complete": False,
        "security_unlock_reason": None,
        "best_recovery_action_family": None,
        "progress_flags": self._progress_flags(),
        "security_subquest_status": None,
        "security_context": {},
        "final_score": episode["final_score"],
        "score_breakdown": dict(episode["score_breakdown"]),
        "reward": round(reward, 4),
        "done": done,
    }
    return UnifiedIncidentObservation(**fields)
|
unified_incident_env/server/grader.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Deterministic public scoring for the honest narrow incident-remediation environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from ..models import GraderCheck, GraderReport
|
| 8 |
+
|
| 9 |
+
# Public scores are clamped to this range, so a reported score is never exactly 0 or 1.
MIN_PUBLIC_SCORE = 0.01
MAX_PUBLIC_SCORE = 0.99

# Credit given to a service for each known health status; unknown statuses earn 0.0.
_SERVICE_STATUS_SCORES = {
    "healthy": 1.0,
    "degraded": 0.4,
    "crashed": 0.0,
    "isolated": 0.2,
}


def _strict_public_score(score: float) -> float:
    """Clamp ``score`` into ``[MIN_PUBLIC_SCORE, MAX_PUBLIC_SCORE]`` and round to 4 places."""
    return round(min(MAX_PUBLIC_SCORE, max(MIN_PUBLIC_SCORE, score)), 4)


def _service_score(status: str) -> float:
    """Return the recovery credit for a service ``status`` (0.0 when unrecognised)."""
    return _SERVICE_STATUS_SCORES.get(status, 0.0)


class UnifiedIncidentGrader:
    """Deterministic scorer focused on executed effects, not scripted clues."""

    @staticmethod
    def _passed_checks(state: dict[str, Any]) -> dict[Any, bool]:
        """Map each check name in ``state["checks"]`` to whether it passed.

        A missing or ``None`` ``checks`` entry yields an empty mapping.
        """
        return {item.get("name"): bool(item.get("passed")) for item in state.get("checks") or []}

    def compute_breakdown(
        self,
        state: dict[str, Any],
        scenario: dict[str, Any],
    ) -> dict[str, float]:
        """Score ``state`` against ``scenario`` and return the per-component breakdown.

        Components, each rounded to 4 places:

        * ``recovery_score`` - status credit per service, weighted by the
          scenario's ``critical_service_weights``; a service missing from
          ``service_health`` counts as crashed.
        * ``containment_score`` - 0.2 when containment is applied, 0.3 when the
          worker is also healthy, else 0.0.
        * ``verification_score`` - 0.15 for a passed ``database_recovery``
          check plus 0.2 for a passed ``end_to_end`` check.
        * ``impact_score`` - up to 0.15, shrinking as ``user_impact`` grows
          (a missing value is treated as 1.0).
        * ``efficiency_score`` - 0.10 minus 0.01 per wasteful tick, floored at 0.
        * ``final_score`` - the sum of the above, clamped to the public range.
        """
        # `or {}` tolerates an explicit None, matching the per-service handling below.
        services = state.get("service_health") or {}
        weights = scenario["critical_service_weights"]
        recovery_score = round(
            sum(
                weights.get(service, 0.0) * _service_score((services.get(service) or {}).get("status", "crashed"))
                for service in weights
            ),
            4,
        )

        containment_score = 0.2 if state.get("containment_applied") else 0.0
        if state.get("containment_applied") and (services.get("worker") or {}).get("status") == "healthy":
            containment_score = 0.3

        checks = self._passed_checks(state)
        verification_score = 0.0
        if checks.get("database_recovery"):
            verification_score += 0.15
        if checks.get("end_to_end"):
            verification_score += 0.2

        user_impact = float(state.get("user_impact", 1.0))
        impact_score = round(max(0.0, 0.15 * (1.0 - user_impact)), 4)

        wasteful_ticks = int(state.get("wasteful_ticks", 0))
        efficiency_score = round(max(0.0, 0.10 - (0.01 * wasteful_ticks)), 4)

        final_score = _strict_public_score(
            recovery_score + containment_score + verification_score + impact_score + efficiency_score
        )

        return {
            "recovery_score": recovery_score,
            "containment_score": round(containment_score, 4),
            "verification_score": round(verification_score, 4),
            "impact_score": impact_score,
            "efficiency_score": efficiency_score,
            "final_score": final_score,
        }

    def build_report(self, state: dict[str, Any], scenario: dict[str, Any]) -> GraderReport:
        """Build the public grading report for ``state``.

        The report carries the clamped final score, the component breakdown and
        five named pass/fail checks. The report as a whole passes only when the
        incident is resolved and both the ``database_recovery`` and
        ``end_to_end`` checks passed.
        """
        breakdown = self.compute_breakdown(state, scenario)
        checks = self._passed_checks(state)
        # NOTE(review): this always requires database_recovery, whereas the
        # environment accepts a resolution on the recipe's `resolution_check`
        # alone — confirm whether scenarios that never run database_recovery
        # are meant to be reported as not passed.
        passed = bool(
            state.get("incident_resolved")
            and checks.get("database_recovery")
            and checks.get("end_to_end")
        )
        services_recovered = breakdown["recovery_score"] >= 0.8
        report_checks = [
            GraderCheck(
                name="root_cause_removed",
                # Driven by the containment flag, as the detail text ("contained or removed") reflects.
                passed=bool(state.get("containment_applied")),
                detail=(
                    "The root cause has been safely contained or removed."
                    if state.get("containment_applied")
                    else "The root cause is still active or only partially contained."
                ),
                weight=0.30,
            ),
            GraderCheck(
                name="database_recovery",
                passed=checks.get("database_recovery", False),
                detail=(
                    "The database recovery check passed."
                    if checks.get("database_recovery")
                    else "The database recovery check has not passed yet."
                ),
                weight=0.20,
            ),
            GraderCheck(
                name="end_to_end_check",
                passed=checks.get("end_to_end", False),
                detail=(
                    "The end-to-end service check passed."
                    if checks.get("end_to_end")
                    else "The end-to-end service check has not passed yet."
                ),
                weight=0.20,
            ),
            GraderCheck(
                name="critical_services_recovered",
                passed=services_recovered,
                detail=(
                    "Critical-path services are recovered."
                    if services_recovered
                    else "Critical-path services are still degraded or crashed."
                ),
                weight=0.20,
            ),
            GraderCheck(
                name="declare_resolved",
                passed=bool(state.get("incident_resolved")),
                detail=(
                    "The agent declared the incident resolved after objective checks passed."
                    if state.get("incident_resolved")
                    else "The incident has not been safely declared resolved."
                ),
                weight=0.10,
            ),
        ]
        return GraderReport(
            scenario_id=scenario["id"],
            passed=passed,
            score=breakdown["final_score"],
            message=(
                "Incident diagnosed, remediated, and verified honestly."
                if passed
                else "Incident is not yet safely resolved."
            ),
            breakdown=breakdown,
            checks=report_checks,
        )
|
unified_incident_env/tests/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the unified incident environment."""
|
unified_incident_env/tests/test_environment.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Behavior and API tests for the honest narrow incident environment."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from fastapi.testclient import TestClient
|
| 6 |
+
|
| 7 |
+
from unified_incident_env.models import HypothesisPayload, UnifiedIncidentAction
|
| 8 |
+
from unified_incident_env.server import app as app_module
|
| 9 |
+
from unified_incident_env.server.challenge import DEFAULT_SCENARIO_ID, list_baselines
|
| 10 |
+
from unified_incident_env.server.environment import UnifiedIncidentEnvironment
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _run_baseline(env: UnifiedIncidentEnvironment):
    """Reset ``env`` to the default scenario, replay its first baseline, and return the last observation.

    Returns ``None`` when the baseline has no actions.
    """
    env.reset(scenario_id=DEFAULT_SCENARIO_ID)
    scripted_steps = list_baselines(DEFAULT_SCENARIO_ID).baselines[0].actions
    observation = None
    for scripted in scripted_steps:
        observation = env.step(scripted.action)
    return observation
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def test_baseline_resolves_honestly() -> None:
    """The default scenario's baseline ends resolved, with both checks passed and a score above 0.7."""
    final_obs = _run_baseline(UnifiedIncidentEnvironment())
    assert final_obs is not None
    assert final_obs.done is True
    assert final_obs.incident_resolved is True

    passed_by_name = dict((check.name, check.passed) for check in final_obs.checks)
    assert passed_by_name["database_recovery"] is True
    assert passed_by_name["end_to_end"] is True
    assert final_obs.final_score > 0.7
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def test_query_deploys_reveals_evidence_but_not_positive_reward() -> None:
    """Querying worker deploys surfaces the bad deploy id without earning positive reward."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id=DEFAULT_SCENARIO_ID)

    query = UnifiedIncidentAction(action_type="query_deploys", service="worker")
    result = environment.step(query)

    assert result.reward <= 0.0
    tool_text = result.tool_output or ""
    assert "worker@2026.04.23-bad" in tool_text
    assert result.incident_resolved is False
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def test_restart_database_before_rollback_is_negative() -> None:
    """Restarting the database before the rollback is penalised and leaves it crashed."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id=DEFAULT_SCENARIO_ID)

    restart = UnifiedIncidentAction(action_type="restart_service", service="database")
    result = environment.step(restart)

    assert result.reward < 0.0
    assert result.failure_type == "premature_restart"
    assert result.incident_resolved is False
    database_health = result.service_health["database"]
    assert database_health.status == "crashed"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def test_duplicate_hypothesis_bonus_is_not_farmable() -> None:
    """Submitting the same hypothesis twice earns less the second time, and nothing positive."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id=DEFAULT_SCENARIO_ID)

    hypothesis = HypothesisPayload(
        root_cause="bad_worker_deploy",
        affected_services=["worker", "database", "api-gateway"],
        confidence=0.82,
        recommended_next_action="rollback_deploy",
    )
    submission = UnifiedIncidentAction(action_type="submit_hypothesis", hypothesis=hypothesis)

    first_result = environment.step(submission)
    repeat_result = environment.step(submission)

    assert first_result.reward > repeat_result.reward
    assert repeat_result.reward <= 0.0
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def test_isolating_worker_contains_but_does_not_resolve() -> None:
    """Isolating the worker applies containment, but the end-to-end check still fails."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id=DEFAULT_SCENARIO_ID)

    after_isolation = environment.step(UnifiedIncidentAction(action_type="isolate_service", service="worker"))
    assert after_isolation.containment_applied is True
    assert after_isolation.incident_resolved is False

    after_check = environment.step(UnifiedIncidentAction(action_type="run_check", check_name="end_to_end"))
    passed_by_name = dict((check.name, check.passed) for check in after_check.checks)
    assert passed_by_name["end_to_end"] is False
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_declare_resolved_requires_checks() -> None:
    """Declaring resolution before any check has passed is rejected with a penalty."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id=DEFAULT_SCENARIO_ID)

    declaration = UnifiedIncidentAction(action_type="declare_resolved")
    result = environment.step(declaration)

    assert result.reward < 0.0
    assert result.done is False
    assert result.failure_type == "premature_resolution"
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def test_observation_exposes_bounded_actions_without_valid_example() -> None:
    """The initial observation lists the fixed action set, in order, and offers no example action."""
    expected_actions = [
        "query_logs",
        "query_metrics",
        "query_dependencies",
        "query_deploys",
        "rollback_deploy",
        "restart_service",
        "run_check",
        "isolate_service",
        "escalate",
        "submit_hypothesis",
        "declare_resolved",
    ]

    initial_obs = UnifiedIncidentEnvironment().reset(scenario_id=DEFAULT_SCENARIO_ID)

    assert initial_obs.allowed_actions == expected_actions
    assert initial_obs.valid_action_example is None
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def test_routes_expose_new_catalog_and_status(monkeypatch) -> None:
    """The HTTP app serves the scenario catalog, baselines, health and status routes."""
    monkeypatch.setenv("ENABLE_WEB_INTERFACE", "false")
    client = TestClient(app_module.create_compatible_app())
    all_difficulties = {"easy", "medium", "hard"}

    # /tasks: catalog with every difficulty represented.
    tasks_response = client.get("/tasks")
    assert tasks_response.status_code == 200
    catalog = tasks_response.json()
    assert catalog["default_scenario_id"] == DEFAULT_SCENARIO_ID
    difficulties_in_catalog = {entry["difficulty"] for entry in catalog["scenarios"]}
    assert difficulties_in_catalog >= all_difficulties
    assert set(catalog["available_difficulties"]) >= all_difficulties

    # /baseline: one baseline per shipped scenario.
    baseline_response = client.get("/baseline")
    assert baseline_response.status_code == 200
    baseline_scenarios = {entry["scenario_id"] for entry in baseline_response.json()["baselines"]}
    assert baseline_scenarios >= {"worker_deploy_cascade", "db_config_rollout", "gateway_auth_rollout"}

    # /health: either status spelling is accepted.
    health_response = client.get("/health")
    assert health_response.status_code == 200
    assert health_response.json()["status"] in {"ok", "healthy"}

    # /status: progress and grader summary for the default scenario.
    status_response = client.get("/status")
    assert status_response.status_code == 200
    status_body = status_response.json()
    assert status_body["progress"]["scenario_id"] == DEFAULT_SCENARIO_ID
    assert status_body["grader"]["score"] > 0.0
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _run_baseline_for_scenario(scenario_id: str):
    """Replay the first baseline of ``scenario_id`` on a fresh environment.

    Returns the last observation, or ``None`` when the baseline has no actions.
    """
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id=scenario_id)
    scripted_steps = list_baselines(scenario_id).baselines[0].actions
    observation = None
    for scripted in scripted_steps:
        observation = environment.step(scripted.action)
    return observation
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def test_medium_baseline_resolves_honestly() -> None:
    """The ``db_config_rollout`` baseline ends resolved with both checks passed."""
    final_obs = _run_baseline_for_scenario("db_config_rollout")
    assert final_obs is not None
    assert final_obs.done is True
    assert final_obs.incident_resolved is True

    passed_by_name = dict((check.name, check.passed) for check in final_obs.checks)
    assert passed_by_name["database_recovery"] is True
    assert passed_by_name["end_to_end"] is True
    assert final_obs.final_score > 0.7
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def test_hard_baseline_resolves_honestly() -> None:
    """The ``gateway_auth_rollout`` baseline ends resolved with the end-to-end check passed."""
    final_obs = _run_baseline_for_scenario("gateway_auth_rollout")
    assert final_obs is not None
    assert final_obs.done is True
    assert final_obs.incident_resolved is True

    passed_by_name = dict((check.name, check.passed) for check in final_obs.checks)
    assert passed_by_name["end_to_end"] is True
    assert final_obs.final_score > 0.7
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def test_medium_wrong_rollback_target_is_penalized() -> None:
    """Rolling back the worker in ``db_config_rollout`` is a penalised wrong target."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id="db_config_rollout")

    wrong_rollback = UnifiedIncidentAction(action_type="rollback_deploy", service="worker")
    result = environment.step(wrong_rollback)

    assert result.reward < 0.0
    assert result.failure_type == "wrong_remediation_target"
    assert result.incident_resolved is False
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def test_hard_wrong_rollback_target_is_penalized() -> None:
    """Rolling back the worker in ``gateway_auth_rollout`` is a penalised wrong target."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id="gateway_auth_rollout")

    wrong_rollback = UnifiedIncidentAction(action_type="rollback_deploy", service="worker")
    result = environment.step(wrong_rollback)

    assert result.reward < 0.0
    assert result.failure_type == "wrong_remediation_target"
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def test_hard_does_not_require_database_recovery_check() -> None:
    """In ``gateway_auth_rollout`` a passed end-to-end check alone permits resolution."""
    environment = UnifiedIncidentEnvironment()
    environment.reset(scenario_id="gateway_auth_rollout")

    environment.step(UnifiedIncidentAction(action_type="rollback_deploy", service="api-gateway"))

    after_check = environment.step(UnifiedIncidentAction(action_type="run_check", check_name="end_to_end"))
    end_to_end_passed = any(check.name == "end_to_end" and check.passed for check in after_check.checks)
    assert end_to_end_passed

    after_declaration = environment.step(UnifiedIncidentAction(action_type="declare_resolved"))
    assert after_declaration.incident_resolved is True
|
unified_incident_env/tests/test_submission_inference.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import inference
|
| 4 |
+
from unified_incident_env.models import Alert, CheckResult, ServiceHealth, UnifiedIncidentObservation
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def make_observation(**overrides: object) -> UnifiedIncidentObservation:
    """Build a fully populated observation for the inference tests.

    The baseline describes the worker-deploy incident at tick 0 in the
    ``triage`` stage. Any keyword argument replaces the baseline field of the
    same name before the model is constructed.
    """
    # (name, status, cpu_pct, memory_pct, error_rate_pct, latency_ms)
    health_rows = [
        ("api-gateway", "degraded", 61.0, 38.0, 24.0, 640.0),
        ("cache", "healthy", 18.0, 24.0, 0.0, 14.0),
        ("database", "crashed", 99.0, 97.0, 100.0, 0.0),
        ("worker", "degraded", 88.0, 71.0, 19.0, 420.0),
    ]
    service_health = {
        name: ServiceHealth(
            name=name,
            status=status,
            cpu_pct=cpu,
            memory_pct=memory,
            error_rate_pct=errors,
            latency_ms=latency,
        )
        for name, status, cpu, memory, errors, latency in health_rows
    }

    required_fields = {
        "query_logs": ["service"],
        "query_metrics": ["service", "metric"],
        "query_dependencies": ["service"],
        "query_deploys": ["service"],
        "rollback_deploy": ["service"],
        "restart_service": ["service"],
        "run_check": ["check_name"],
        "isolate_service": ["service"],
        "escalate": [],
        "submit_hypothesis": ["hypothesis"],
        "declare_resolved": [],
    }

    baseline = dict(
        prompt_text="Honest incident prompt",
        incident_summary="Worker deploy is overloading the database.",
        tick_count=0,
        max_ticks=12,
        difficulty="easy",
        workflow_stage="triage",
        active_alerts=[
            Alert(service="database", severity="critical", message="database crashing"),
            Alert(service="worker", severity="warning", message="worker retry volume elevated"),
        ],
        service_health=service_health,
        discovered_evidence=[],
        recent_deploys=["Rolled out worker@2026.04.23-bad 12 minutes ago."],
        checks=[
            CheckResult(
                name="database_recovery",
                passed=False,
                detail="Database recovery has not been verified yet.",
            ),
            CheckResult(
                name="end_to_end",
                passed=False,
                detail="End-to-end health has not been verified yet.",
            ),
        ],
        user_impact=0.82,
        slo_burn_rate=0.91,
        incident_resolved=False,
        containment_applied=False,
        last_action_result="",
        tool_output=None,
        failure_type=None,
        why_failed=None,
        # The allowed actions are exactly the keys of the required-fields table, in order.
        allowed_actions=list(required_fields),
        required_fields_by_action=required_fields,
        valid_action_example=None,
        common_trap=None,
        loop_warning=None,
        blocked_until_security_complete=False,
        security_unlock_reason=None,
        best_recovery_action_family=None,
        progress_flags={},
        security_subquest_status=None,
        security_context={},
        final_score=0.1,
        score_breakdown={"final_score": 0.1},
        reward=0.0,
        done=False,
    )
    return UnifiedIncidentObservation(**{**baseline, **overrides})
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def test_log_helpers_match_required_format(capsys) -> None:
    """The START/STEP/END helpers must print exactly the expected stdout lines."""
    inference.log_start(
        task="worker_deploy_cascade",
        env="unified-incident-env",
        model="demo-model",
    )
    inference.log_step(
        step=2,
        action='{"action_type":"query_logs","service":"database"}',
        reward=-0.01,
        done=False,
        error=None,
    )
    inference.log_end(success=True, steps=2, score=0.37, rewards=[-0.01, 0.27])

    expected_lines = [
        "[START] task=worker_deploy_cascade env=unified-incident-env model=demo-model",
        '[STEP] step=2 action={"action_type":"query_logs","service":"database"} reward=-0.01 done=false error=null',
        "[END] success=true steps=2 score=0.37 rewards=-0.01,0.27",
    ]
    printed_lines = capsys.readouterr().out.strip().splitlines()
    assert printed_lines == expected_lines
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def test_parse_action_accepts_valid_json() -> None:
    """A complete JSON action payload parses into the equivalent action model."""
    raw_payload = '{"action_type":"query_deploys","service":"worker"}'
    parsed = inference.parse_action(raw_payload, make_observation())
    expected = inference.UnifiedIncidentAction(action_type="query_deploys", service="worker")
    assert parsed == expected
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def test_parse_action_rejects_incomplete_metric_query() -> None:
    """A ``query_metrics`` payload without a ``metric`` field parses to ``None``."""
    missing_metric = '{"action_type":"query_metrics","service":"database"}'
    parsed = inference.parse_action(missing_metric, make_observation())
    assert parsed is None
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def test_build_user_prompt_includes_public_state_without_examples() -> None:
    """The user prompt carries the public headings but no example or deploy identifier."""
    rendered = inference.build_user_prompt(make_observation())

    for heading in ("Incident summary:", "Allowed actions:", "Required fields:"):
        assert heading in rendered

    # Neither a worked example nor the bad deploy version may appear in the prompt.
    for forbidden in ("Valid example", "worker@2026.04.23-bad"):
        assert forbidden not in rendered
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def test_build_fallback_action_prefers_public_deploy_query() -> None:
    """For the baseline observation the fallback is a deploy query on ``worker``."""
    fallback = inference.build_fallback_action(make_observation())
    expected = inference.UnifiedIncidentAction(action_type="query_deploys", service="worker")
    assert fallback == expected
|
unified_incident_env/tests/test_trainer.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke tests for reusable trainer-shell pieces after the v2 pivot."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
from unified_incident_env.trainer.trajectory_memory import CorrectionMemory
|
| 8 |
+
from unified_incident_env.trainer.trajectory_store import TrajectoryStore
|
| 9 |
+
from unified_incident_env.trainer.types import EpisodeRecord, StepRecord
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def test_correction_memory_empty_prompt_is_safe() -> None:
    """A freshly created memory still returns a string prompt addendum."""
    addendum = CorrectionMemory().build_prompt_addendum("worker_deploy_cascade", "triage")
    assert isinstance(addendum, str)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_trajectory_store_roundtrip(tmp_path: Path) -> None:
    """An appended episode can be loaded back from the JSONL store."""
    only_step = StepRecord(
        step_index=1,
        tick=1,
        workflow_stage="triage",
        observation={},
        prompt_text="prompt",
        raw_model_output="{}",
        parse_status="invalid_json",
        reward=None,
    )
    episode = EpisodeRecord(
        run_id="run-1",
        scenario_id="worker_deploy_cascade",
        difficulty="easy",
        model_name="stub",
        mode="strict",
        success=False,
        final_score=0.1,
        steps=1,
        elapsed_s=0.01,
        step_records=[only_step],
    )

    store = TrajectoryStore(tmp_path / "episodes.jsonl")
    store.append_episode(episode)

    reloaded = store.load_episodes()
    assert len(reloaded) == 1
    assert reloaded[0].scenario_id == "worker_deploy_cascade"
|
unified_incident_env/tests/test_trainer_session.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke tests for session/report shells after the v2 pivot."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from unified_incident_env.trainer.reporting import build_phase_deltas
|
| 6 |
+
from unified_incident_env.trainer.types import SessionPhaseReport
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def test_build_phase_deltas_handles_simple_progression() -> None:
    """Two-phase progression: the second delta is for ``final_evaluation`` with a ~0.6 score change."""
    phases = [
        SessionPhaseReport(
            phase_name="probe",
            episode_ids=[1, 2],
            avg_score=0.2,
            success_rate=0.0,
            schema_failures=1,
            loop_failures=1,
            updates_applied=[],
        ),
        SessionPhaseReport(
            phase_name="final_evaluation",
            episode_ids=[3, 4],
            avg_score=0.8,
            success_rate=1.0,
            schema_failures=0,
            loop_failures=0,
            updates_applied=[],
        ),
    ]
    deltas = build_phase_deltas(phases)
    assert deltas[1].phase_name == "final_evaluation"
    # 0.8 - 0.2 evaluates to 0.6000000000000001 in binary floating point, so an
    # exact ``== 0.6`` only passes if the implementation happens to round.
    # Compare with a tolerance instead.
    assert abs(deltas[1].score_delta - 0.6) < 1e-9
|
unified_incident_env/trainer/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Trainer package namespace.
|
| 2 |
+
|
| 3 |
+
This package intentionally avoids eager importing of legacy trainer flows so the
|
| 4 |
+
honest v2 environment can reuse shell utilities without pulling in deprecated
|
| 5 |
+
benchmark-specific modules at import time.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
# Intentionally empty: the package namespace re-exports nothing.
__all__: list[str] = []
|
unified_incident_env/trainer/action_adapter.py
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Strict and lenient action parsers for training and eval."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
from ..models import ActionType, UnifiedIncidentAction
|
| 9 |
+
from .types import ParseResult
|
| 10 |
+
|
| 11 |
+
# Keys that survive payload cleaning; everything else is stripped and the
# result is labelled "extra_keys_stripped".
# ``check_name`` and ``hypothesis`` are required by the v2 ``run_check`` and
# ``submit_hypothesis`` actions; without them those actions can never parse.
# NOTE(review): legacy security keys are kept as-is — confirm the action model
# still declares them.
_ALLOWED_KEYS = {
    "action_type",
    "service",
    "metric",
    "check_name",
    "hypothesis",
    "vulnerability_type",
    "patch_id",
    "postmortem",
}
# Action names accepted for the ``action`` -> ``action_type`` alias and for
# bare-name wrapping in the lenient adapter. Includes the v2 incident actions
# alongside the legacy security/postmortem names.
_KNOWN_ACTIONS: set[str] = {
    "query_logs",
    "query_metrics",
    "query_dependencies",
    "query_deploys",
    "restart_service",
    "rollback_deploy",
    "run_check",
    "isolate_service",
    "escalate",
    "submit_hypothesis",
    "declare_resolved",
    "inspect_code",
    "classify_vulnerability",
    "apply_patch",
    "verify_security_fix",
    "submit_security_fix",
    "submit_postmortem",
}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _extract_json_text(raw_text: str) -> str:
    """Return the most JSON-like substring of a raw model reply.

    If the reply contains a triple-backtick fence, only the text inside the
    first fenced section is considered (minus a leading ``json`` tag). The
    result is then narrowed to the span from the first ``{`` to the last ``}``
    when such a span exists, and stripped of surrounding whitespace.
    """
    candidate = raw_text.strip()

    _, fence, after_fence = candidate.partition("```")
    if fence:
        # Keep what lies between the first and second fence (or to the end).
        candidate = after_fence.partition("```")[0]
        if candidate.startswith("json"):
            candidate = candidate[len("json"):]

    first_brace = candidate.find("{")
    last_brace = candidate.rfind("}")
    if 0 <= first_brace < last_brace:
        candidate = candidate[first_brace : last_brace + 1]
    return candidate.strip()
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _compact_action(action: UnifiedIncidentAction) -> dict[str, Any]:
    """Dump an action to a dict without ``None`` fields or an empty ``metadata`` dict."""
    compact = action.model_dump(exclude_none=True)
    has_empty_metadata = "metadata" in compact and compact["metadata"] == {}
    if has_empty_metadata:
        del compact["metadata"]
    return compact
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class StrictActionParser:
    """Parser used for judge-style evaluation.

    Accepts either one of three bare action names or a JSON object (optionally
    fenced or surrounded by prose). Unknown keys are dropped and a small set of
    aliases is normalised; any such change is reported via
    ``parse_status="repaired"`` together with ``repair_labels``.
    """

    def parse(self, raw_text: str) -> ParseResult:
        """Parse one raw model reply into a ``ParseResult``.

        Args:
            raw_text: The unprocessed model output.

        Returns:
            A ``ParseResult`` whose ``parse_status`` is ``"ok"``, ``"repaired"``,
            ``"invalid_json"`` or ``"invalid_action"``. Failures are always
            reported through the result rather than raised.
        """
        bare = raw_text.strip().strip('"').strip("'")
        if bare in {"inspect_code", "verify_security_fix", "submit_security_fix"}:
            # Guard construction the same way LenientActionAdapter does: if the
            # action model rejects this name the caller must still receive a
            # ParseResult instead of an escaping validation error.
            try:
                action = UnifiedIncidentAction(action_type=bare)
            except Exception as exc:
                return ParseResult(
                    parse_status="invalid_action",
                    error=str(exc),
                    repair_labels=["bare_action_wrapped"],
                )
            return ParseResult(
                parse_status="repaired",
                cleaned_action=_compact_action(action),
                repair_labels=["bare_action_wrapped"],
            )

        try:
            data = json.loads(_extract_json_text(raw_text))
        except Exception as exc:
            return ParseResult(parse_status="invalid_json", error=type(exc).__name__)

        if not isinstance(data, dict):
            return ParseResult(parse_status="invalid_action", error="root must be object")

        repaired_labels: list[str] = []
        # Keep only recognised keys; any difference counts as a repair.
        cleaned: dict[str, Any] = {k: v for k, v in data.items() if k in _ALLOWED_KEYS}
        repaired = cleaned != data
        if repaired:
            repaired_labels.append("extra_keys_stripped")

        # "action" is accepted as an alias for "action_type" for known actions only.
        if "action_type" not in cleaned and isinstance(data.get("action"), str):
            if data["action"] in _KNOWN_ACTIONS:
                cleaned["action_type"] = data["action"]
                repaired = True
                repaired_labels.append("action_alias_normalized")

        # "vulnerability" is accepted as an alias for "vulnerability_type".
        if (
            "vulnerability_type" not in cleaned
            and isinstance(data.get("vulnerability"), str)
        ):
            cleaned["vulnerability_type"] = data["vulnerability"]
            repaired = True
            repaired_labels.append("vulnerability_alias_normalized")

        # A one-element "metrics" list is accepted as an alias for "metric".
        metrics_value = data.get("metrics")
        if "metric" not in cleaned and isinstance(metrics_value, list) and len(metrics_value) == 1:
            cleaned["metric"] = metrics_value[0]
            repaired = True
            repaired_labels.append("metric_list_normalized")

        # Any other shape of "metrics" cannot be mapped to a single metric.
        if "metrics" in data and (
            not isinstance(metrics_value, list) or len(metrics_value) != 1
        ):
            return ParseResult(
                parse_status="invalid_action",
                error="metrics alias is ambiguous",
                repair_labels=repaired_labels,
            )

        try:
            action = UnifiedIncidentAction(**cleaned)
        except Exception as exc:
            return ParseResult(
                parse_status="invalid_action",
                error=str(exc),
                repair_labels=repaired_labels,
            )

        return ParseResult(
            parse_status="repaired" if repaired else "ok",
            cleaned_action=_compact_action(action),
            repair_labels=repaired_labels,
        )
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
class LenientActionAdapter:
    """Training-time parser that only repairs small schema slips.

    Any bare name listed in ``_KNOWN_ACTIONS`` is wrapped into an action; JSON
    payloads get unknown keys stripped and the ``action`` / ``vulnerability`` /
    single-item ``metrics`` aliases normalised.
    """

    def parse(self, raw_text: str) -> ParseResult:
        """Turn one raw model reply into a ``ParseResult`` (never raises on bad input)."""
        unquoted = raw_text.strip().strip('"').strip("'")
        if unquoted in _KNOWN_ACTIONS:
            try:
                wrapped = UnifiedIncidentAction(action_type=unquoted)
            except Exception as exc:
                return ParseResult(
                    parse_status="invalid_action",
                    error=str(exc),
                    repair_labels=["bare_action_wrapped"],
                )
            return ParseResult(
                parse_status="repaired",
                cleaned_action=_compact_action(wrapped),
                repair_labels=["bare_action_wrapped"],
            )

        try:
            payload = json.loads(_extract_json_text(raw_text))
        except Exception as exc:
            return ParseResult(parse_status="invalid_json", error=type(exc).__name__)
        if not isinstance(payload, dict):
            return ParseResult(parse_status="invalid_action", error="root must be object")

        # Every repair appends exactly one label, so a non-empty label list is
        # equivalent to "something was repaired".
        labels: list[str] = []

        fields: dict[str, Any] = {}
        for key, value in payload.items():
            if key in _ALLOWED_KEYS:
                fields[key] = value
        if fields != payload:
            labels.append("extra_keys_stripped")

        action_alias = payload.get("action")
        if (
            "action_type" not in fields
            and isinstance(action_alias, str)
            and action_alias in _KNOWN_ACTIONS
        ):
            fields["action_type"] = action_alias
            labels.append("action_alias_normalized")

        vulnerability_alias = payload.get("vulnerability")
        if "vulnerability_type" not in fields and isinstance(vulnerability_alias, str):
            fields["vulnerability_type"] = vulnerability_alias
            labels.append("vulnerability_alias_normalized")

        metrics_alias = payload.get("metrics")
        is_single_metric = isinstance(metrics_alias, list) and len(metrics_alias) == 1
        if "metric" not in fields and is_single_metric:
            fields["metric"] = metrics_alias[0]
            labels.append("metric_list_normalized")
        if "metrics" in payload and not is_single_metric:
            return ParseResult(
                parse_status="invalid_action",
                error="metrics alias is ambiguous",
                repair_labels=labels,
            )

        try:
            parsed = UnifiedIncidentAction(**fields)
        except Exception as exc:
            return ParseResult(
                parse_status="invalid_action",
                error=str(exc),
                repair_labels=labels,
            )

        status = "repaired" if labels else "ok"
        return ParseResult(
            parse_status=status,
            cleaned_action=_compact_action(parsed),
            repair_labels=labels,
        )
|
unified_incident_env/trainer/analyze_failures.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Failure analysis for episode trajectories."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from collections import Counter
|
| 6 |
+
|
| 7 |
+
from .types import EpisodeRecord, FailureAnalysisReport, FailureBucketEntry, StepRecord
|
| 8 |
+
|
| 9 |
+
# Action types the step classifier treats as infrastructure remediation.
_INFRA_ACTIONS = {"rollback_deploy", "restart_service"}
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def analyze_episode(record: EpisodeRecord) -> FailureAnalysisReport:
    """Bucket one episode's failures into schema, policy, looping and reasoning.

    Step-level findings come first (in step order), followed by the
    episode-level findings. The report lists the distinct failure types per
    bucket (sorted) and the number of entries per bucket.
    """
    findings: list[FailureBucketEntry] = [
        finding
        for step in record.step_records
        for finding in _classify_step(record, step)
    ]
    findings += _classify_episode_level(record)

    def distinct_types(bucket: str) -> list[str]:
        return sorted({f.failure_type for f in findings if f.bucket == bucket})

    per_bucket = Counter(f.bucket for f in findings)
    bucket_names = ("schema", "policy", "looping", "reasoning")

    return FailureAnalysisReport(
        episode_ids=[record.episode_id or 0],
        scenario_ids=[record.scenario_id],
        entries=findings,
        schema_failures=distinct_types("schema"),
        policy_failures=distinct_types("policy"),
        looping_failures=distinct_types("looping"),
        reasoning_failures=distinct_types("reasoning"),
        # Counter yields 0 for buckets that never occurred.
        summary={name: per_bucket[name] for name in bucket_names},
    )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def analyze_block(records: list[EpisodeRecord]) -> FailureAnalysisReport:
    """Merge the per-episode analyses of ``records`` into a single report."""
    merged: list[FailureBucketEntry] = []
    for record in records:
        merged.extend(analyze_episode(record).entries)

    def distinct_types(bucket: str) -> list[str]:
        return sorted({item.failure_type for item in merged if item.bucket == bucket})

    per_bucket = Counter(item.bucket for item in merged)
    bucket_names = ("schema", "policy", "looping", "reasoning")

    return FailureAnalysisReport(
        episode_ids=[record.episode_id or 0 for record in records],
        scenario_ids=[record.scenario_id for record in records],
        entries=merged,
        schema_failures=distinct_types("schema"),
        policy_failures=distinct_types("policy"),
        looping_failures=distinct_types("looping"),
        reasoning_failures=distinct_types("reasoning"),
        # Counter yields 0 for buckets that never occurred.
        summary={name: per_bucket[name] for name in bucket_names},
    )
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def _classify_step(record: EpisodeRecord, step: StepRecord) -> list[FailureBucketEntry]:
    """Classify a single step; returns zero or one failure entry.

    Parse failures become a ``schema`` entry. Otherwise the student's cleaned
    action is compared with the teacher action: if either is missing or they
    are equal nothing is reported, else the first matching rule below decides
    the bucket and failure type.
    """

    def finding(bucket: str, failure_type: str, detail: str) -> FailureBucketEntry:
        return FailureBucketEntry(
            episode_id=record.episode_id or 0,
            scenario_id=record.scenario_id,
            step_index=step.step_index,
            bucket=bucket,
            failure_type=failure_type,
            detail=detail,
        )

    if step.parse_status in {"invalid_json", "invalid_action"}:
        return [
            finding(
                "schema",
                _schema_failure_type(step),
                step.failure_reason or "schema failure",
            )
        ]

    student = step.cleaned_action or {}
    teacher = step.teacher_action or {}
    if not teacher or not student or student == teacher:
        return []

    student_type = student.get("action_type")
    teacher_type = teacher.get("action_type")

    bucket = "policy"
    if student_type == "classify_vulnerability":
        bucket = "reasoning"
        if teacher_type == "classify_vulnerability":
            failure_type = "wrong_vulnerability"
        else:
            failure_type = "fails_to_identify_real_vulnerability"
    elif student_type == "apply_patch" and teacher_type == "apply_patch":
        failure_type = "wrong_patch"
    elif student_type == "verify_security_fix" and teacher_type != "verify_security_fix":
        failure_type = "verify_too_early"
    elif student_type == "submit_security_fix" and teacher_type != "submit_security_fix":
        failure_type = "submit_too_early"
    elif student_type in _INFRA_ACTIONS:
        if teacher_type not in _INFRA_ACTIONS:
            failure_type = "infra_before_security"
        else:
            # Both sides chose an infra action, so they differ in kind or target.
            failure_type = {
                "restart_service": "wrong_restart",
                "rollback_deploy": "wrong_rollback",
            }.get(student_type, "wrong_service")
    else:
        failure_type = "wrong_action_choice"

    return [finding(bucket, failure_type, f"student={student} teacher={teacher}")]
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _classify_episode_level(record: EpisodeRecord) -> list[FailureBucketEntry]:
    """Report looping failures that are only visible across the whole episode.

    Every step whose non-empty cleaned action equals the previous step's
    cleaned action is flagged as ``repeated_same_action``. In addition, a
    ``stopped_reason`` of ``diagnosis`` / ``root_cause_analysis`` or
    ``security_subquest`` adds one "stuck" entry with no step index.
    """

    def finding(step_index, failure_type: str, detail: str) -> FailureBucketEntry:
        return FailureBucketEntry(
            episode_id=record.episode_id or 0,
            scenario_id=record.scenario_id,
            step_index=step_index,
            bucket="looping",
            failure_type=failure_type,
            detail=detail,
        )

    findings: list[FailureBucketEntry] = []

    # Pair each step's action with the action of the step before it
    # (None for the first step).
    actions = [step.cleaned_action for step in record.step_records]
    preceding = [None] + actions[:-1]
    for step, current, previous in zip(record.step_records, actions, preceding):
        if current and current == previous:
            findings.append(
                finding(step.step_index, "repeated_same_action", f"action={current}")
            )

    stopped = record.stopped_reason or ""
    if stopped in {"diagnosis", "root_cause_analysis"}:
        findings.append(finding(None, "stuck_in_diagnosis", f"stopped_reason={stopped}"))
    elif stopped == "security_subquest":
        findings.append(
            finding(None, "stuck_in_security_subquest", f"stopped_reason={stopped}")
        )

    return findings
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _schema_failure_type(step: StepRecord) -> str:
    """Name the kind of schema failure from the raw output and the failure reason.

    Rules are tried in order and the first match wins; when none matches the
    result falls back to the step's parse status.
    """
    output = step.raw_model_output.lower()
    reason = (step.failure_reason or "").lower()

    # (label, markers searched in the raw output, markers searched in the reason)
    rules = (
        ("extra_unsupported_fields", ('"reason"', '"details"'), ("extra_forbidden",)),
        ("wrong_field_names", ('"services"', '"metrics"'), ("field required",)),
        ("missing_required_fields", (), ("required", "missing")),
    )
    for label, output_markers, reason_markers in rules:
        in_output = any(marker in output for marker in output_markers)
        if in_output or any(marker in reason for marker in reason_markers):
            return label

    return "invalid_json" if step.parse_status == "invalid_json" else "invalid_action"
|
unified_incident_env/trainer/backend.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Backend interfaces for model calls."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import time
|
| 7 |
+
from typing import Protocol
|
| 8 |
+
from urllib.parse import urlparse
|
| 9 |
+
|
| 10 |
+
from openai import OpenAI
|
| 11 |
+
|
| 12 |
+
from .types import ModelRequest, ModelResponse
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ModelBackend(Protocol):
    """Structural interface that a trainer model backend has to satisfy."""

    def complete(self, request: ModelRequest) -> ModelResponse:
        """Run a single model request and hand back its raw text and metadata."""
        ...
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class OpenAICompatibleBackend:
    """OpenAI-compatible backend, suitable for Ollama and similar servers.

    Wraps an ``OpenAI`` client pointed at ``base_url`` and turns one
    ``ModelRequest`` into a chat-completions call, optionally constraining the
    output through tool calling or a JSON response format.
    """

    def __init__(
        self,
        *,
        base_url: str,
        api_key: str,
        timeout_s: float = 90.0,
    ) -> None:
        """Remember the endpoint URL and build the client.

        Args:
            base_url: Root URL of the OpenAI-compatible API. It is also
                inspected by ``_is_ollama`` to choose a structured-output
                strategy.
            api_key: Credential handed to the client.
            timeout_s: Value passed to the client as ``timeout``.
        """
        self.base_url = base_url
        self._client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout_s)

    def complete(self, request: ModelRequest) -> ModelResponse:
        """Send one chat-completion request and return its raw text.

        ``request.structured_mode`` selects how the output is constrained:

        * ``"tool_calling"`` - force a function call and return its arguments.
        * ``"response_format_json"`` - request JSON via ``response_format``
          (or the ``format`` field for Ollama-looking endpoints).
        * ``"backend_adaptive"`` - JSON format for Ollama-looking endpoints,
          tool calling otherwise; if that tool-calling attempt raises, retry
          once with the JSON format.
        * anything else - plain completion without extra constraints.

        Args:
            request: Model name, sampling settings, prompts and the optional
                ``tools`` / ``tool_choice`` / ``response_format`` overrides.

        Returns:
            A ``ModelResponse`` carrying the raw text, the elapsed time
            (measured with ``time.perf_counter`` and including any failed
            first attempt) rounded to 4 decimals, and metadata naming the
            model and the mode that was actually used.

        Raises:
            Exception: Whatever the client raises, except for the adaptive
                tool-calling attempt, whose failure triggers the retry.
        """
        started = time.perf_counter()
        create_kwargs = {
            "model": request.model_name,
            "temperature": request.temperature,
            "max_tokens": request.max_tokens,
            "messages": [
                {"role": "system", "content": request.system_prompt},
                {"role": "user", "content": request.user_prompt},
            ],
        }
        raw_text = ""
        actual_mode = request.structured_mode

        if request.structured_mode == "backend_adaptive":
            actual_mode = "response_format_json" if self._is_ollama() else "tool_calling"

        try:
            if actual_mode == "tool_calling":
                create_kwargs["tools"] = request.tools or [
                    self._tool_from_response_format(request.response_format)
                ]
                create_kwargs["tool_choice"] = request.tool_choice or {
                    "type": "function",
                    "function": {"name": "emit_action"},
                }
                response = self._client.chat.completions.create(**create_kwargs)
                raw_text = self._extract_tool_text(response)
            else:
                if actual_mode == "response_format_json":
                    self._apply_json_format(create_kwargs, request.response_format)
                response = self._client.chat.completions.create(**create_kwargs)
                raw_text = response.choices[0].message.content or ""
        except Exception:
            # Deliberate best-effort: only the adaptive tool-calling attempt is
            # retried; every other failure propagates to the caller unchanged.
            adaptive_tool_attempt = (
                request.structured_mode == "backend_adaptive"
                and actual_mode == "tool_calling"
            )
            if not adaptive_tool_attempt:
                raise
            fallback_kwargs = {
                key: value
                for key, value in create_kwargs.items()
                if key not in ("tools", "tool_choice")
            }
            self._apply_json_format(fallback_kwargs, request.response_format)
            response = self._client.chat.completions.create(**fallback_kwargs)
            raw_text = response.choices[0].message.content or ""
            actual_mode = "response_format_json"

        elapsed = time.perf_counter() - started
        return ModelResponse(
            raw_text=raw_text,
            latency_s=round(elapsed, 4),
            metadata={
                "model": request.model_name,
                "structured_mode": actual_mode,
            },
        )

    def _apply_json_format(
        self,
        kwargs: dict[str, object],
        response_format: dict[str, object] | None,
    ) -> None:
        """Add the JSON-output constraint to ``kwargs`` in place.

        Ollama-looking endpoints get the constraint through
        ``extra_body["format"]``; all others get ``response_format``,
        defaulting to ``{"type": "json_object"}``.
        """
        if self._is_ollama():
            kwargs["extra_body"] = {
                "format": self._ollama_format_payload(response_format)
            }
        else:
            kwargs["response_format"] = response_format or {"type": "json_object"}

    def _is_ollama(self) -> bool:
        """Guess from ``base_url`` whether the endpoint is an Ollama server.

        Heuristic only: true when the URL's network location contains
        ``11434``, ``ollama``, ``127.0.0.1`` or ``localhost``. Note that this
        treats every loopback endpoint as Ollama.
        """
        parsed = urlparse(self.base_url)
        host = parsed.netloc.lower()
        return "11434" in host or "ollama" in host or "127.0.0.1" in host or "localhost" in host

    def _ollama_format_payload(
        self,
        response_format: dict[str, object] | None,
    ) -> object:
        """Translate an OpenAI-style response format into an Ollama ``format``.

        Returns the embedded schema for a ``json_schema`` response format
        whose ``json_schema`` entry is a dict (or ``"json"`` if that dict has
        no ``schema`` key); returns the plain string ``"json"`` otherwise.
        """
        if response_format and response_format.get("type") == "json_schema":
            json_schema = response_format.get("json_schema", {})
            if isinstance(json_schema, dict):
                return json_schema.get("schema", "json")
        return "json"

    def _tool_from_response_format(
        self,
        response_format: dict[str, object] | None,
    ) -> dict[str, object]:
        """Build the ``emit_action`` tool definition used for tool calling.

        The tool parameters come from the ``json_schema.schema`` entry of
        ``response_format`` when that entry is a dict. In every other case a
        minimal schema requiring only a string ``action_type`` is used.
        """
        schema: object = {
            "type": "object",
            "properties": {"action_type": {"type": "string"}},
            "required": ["action_type"],
            "additionalProperties": False,
        }
        if response_format and response_format.get("type") == "json_schema":
            json_schema = response_format.get("json_schema", {})
            # Same dict guard as _ollama_format_payload: a malformed
            # response_format must not raise or leak a non-object schema.
            if isinstance(json_schema, dict):
                candidate = json_schema.get("schema")
                if isinstance(candidate, dict):
                    schema = candidate
        return {
            "type": "function",
            "function": {
                "name": "emit_action",
                "description": "Emit exactly one structured environment action.",
                "parameters": schema,
            },
        }

    def _extract_tool_text(self, response) -> str:
        """Pull the action text out of a tool-calling response.

        Prefers the arguments of the first tool call. When there is no usable
        tool call it falls back to the message content: text fragments of a
        list-shaped content are concatenated, a string is returned as is, and
        anything else is JSON-encoded.
        """
        message = response.choices[0].message
        tool_calls = getattr(message, "tool_calls", None) or []
        if tool_calls:
            function = getattr(tool_calls[0], "function", None)
            if function is not None and getattr(function, "arguments", None):
                return function.arguments
        content = message.content or ""
        if isinstance(content, list):
            return "".join(
                item.get("text", "")
                for item in content
                if isinstance(item, dict) and item.get("type") == "text"
            )
        if isinstance(content, str):
            return content
        return json.dumps(content)
|
unified_incident_env/trainer/build_datasets.py
ADDED
|
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build correction datasets from trajectories and failure analyses."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from .build_sft_dataset import build_baseline_records
|
| 9 |
+
from .trajectory_store import TrajectoryStore
|
| 10 |
+
from .types import EpisodeRecord, FailureAnalysisReport, SFTRecord
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def build_schema_repair_records(
    episodes: list[EpisodeRecord],
    analyses: list[FailureAnalysisReport],
) -> list[SFTRecord]:
    """Turn malformed or repaired steps into schema-repair SFT examples.

    A step contributes a record when its ``parse_status`` is one of
    ``invalid_json``, ``invalid_action``, ``repaired`` or ``teacher_override``
    and it carries a teacher action. The user message shows the original
    prompt followed by the model's previous output; the target is the teacher
    action.

    Args:
        episodes: Episodes whose step records are scanned.
        analyses: Failure reports; each is matched to an episode through the
            first element of its ``episode_ids`` (reports without ids are
            ignored, later reports win on duplicate ids).

    Returns:
        One ``SFTRecord`` per qualifying step, in episode/step order.
    """
    repairable_statuses = {"invalid_json", "invalid_action", "repaired", "teacher_override"}
    observation_keys = (
        "failure_type",
        "why_failed",
        "loop_warning",
        "blocked_until_security_complete",
        "security_unlock_reason",
        "progress_flags",
    )

    report_by_episode = {}
    for report in analyses:
        if report.episode_ids:
            report_by_episode[report.episode_ids[0]] = report

    records: list[SFTRecord] = []
    for episode in episodes:
        # Episodes without an id are looked up under key 0.
        report = report_by_episode.get(episode.episode_id or 0)
        failure_labels = set(report.schema_failures) if report else set()

        for step in episode.step_records:
            if step.parse_status not in repairable_statuses:
                continue
            if step.teacher_action is None:
                continue

            user_content = (
                f"{step.prompt_text}\n\n"
                f"Previous invalid output:\n{step.raw_model_output}"
            )
            metadata = {
                "episode_id": episode.episode_id,
                "step_index": step.step_index,
                "repair_retry_used": step.repair_retry_used,
                "teacher_override_used": step.teacher_override_used,
                "normalization_applied": step.normalization_applied,
            }
            for key in observation_keys:
                metadata[key] = step.observation.get(key)

            record = SFTRecord(
                source="schema_repair",
                scenario_id=episode.scenario_id,
                tick=step.tick,
                messages=[
                    {"role": "system", "content": "Repair the action into strict JSON only."},
                    {"role": "user", "content": user_content},
                ],
                target_action=step.teacher_action,
                student_action=step.cleaned_action,
                parse_status=step.parse_status,
                # Fall back to the step's own status when the report lists no
                # schema failure types.
                tags=sorted(failure_labels) or [step.parse_status],
                metadata=metadata,
            )
            records.append(record)
    return records
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def build_next_action_records(
    episodes: list[EpisodeRecord],
    analyses: list[FailureAnalysisReport],
) -> list[SFTRecord]:
    """Build next-action SFT examples from steps flagged by failure analysis.

    A step is used when the matching report has at least one entry for its
    ``step_index`` in the ``policy``, ``reasoning`` or ``looping`` bucket and
    the step carries a teacher action. The record's tags are the distinct
    failure types of those entries.

    Args:
        episodes: Episodes whose step records are scanned.
        analyses: Failure reports, matched to episodes through the first
            element of ``episode_ids``.

    Returns:
        One ``SFTRecord`` per qualifying step, in episode/step order.
    """
    relevant_buckets = {"policy", "reasoning", "looping"}

    entries_by_episode = {}
    for report in analyses:
        if report.episode_ids:
            entries_by_episode[report.episode_ids[0]] = report.entries

    records: list[SFTRecord] = []
    for episode in episodes:
        # Group the relevant failure types by step index in a single pass.
        failure_types_by_step = {}
        for entry in entries_by_episode.get(episode.episode_id or 0, []):
            if entry.bucket in relevant_buckets and entry.step_index is not None:
                failure_types_by_step.setdefault(entry.step_index, set()).add(
                    entry.failure_type
                )

        for step in episode.step_records:
            if step.step_index not in failure_types_by_step:
                continue
            if step.teacher_action is None:
                continue

            observation = step.observation
            record = SFTRecord(
                source="next_action",
                scenario_id=episode.scenario_id,
                tick=step.tick,
                messages=[
                    {"role": "system", "content": "Choose the best next action as strict JSON only."},
                    {"role": "user", "content": step.prompt_text},
                ],
                target_action=step.teacher_action,
                student_action=step.cleaned_action,
                parse_status=step.parse_status,
                tags=sorted(failure_types_by_step[step.step_index]) or ["next_action"],
                metadata={
                    "episode_id": episode.episode_id,
                    "step_index": step.step_index,
                    "workflow_stage": step.workflow_stage,
                    "teacher_override_used": step.teacher_override_used,
                    "failure_type": observation.get("failure_type"),
                    "why_failed": observation.get("why_failed"),
                    "loop_warning": observation.get("loop_warning"),
                    "progress_flags": observation.get("progress_flags"),
                },
            )
            records.append(record)
    return records
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def build_recovery_records(
    episodes: list[EpisodeRecord],
    analyses: list[FailureAnalysisReport],
) -> list[SFTRecord]:
    """Build recovery SFT examples from steps that made a known mistake.

    A step is used when the matching report has an entry for its
    ``step_index`` whose ``failure_type`` is one of the recovery failures
    listed below, and the step has both a teacher action and a non-empty
    ``next_prompt_text``. The user message is the following prompt plus the
    wrong action and its reward; the target is the teacher action.

    Args:
        episodes: Episodes whose step records are scanned.
        analyses: Failure reports, matched to episodes through the first
            element of ``episode_ids``.

    Returns:
        One ``SFTRecord`` per qualifying step, in episode/step order.
    """
    recoverable = {
        "wrong_restart",
        "wrong_rollback",
        "wrong_service",
        "wrong_patch",
        "wrong_vulnerability",
        "verify_too_early",
        "submit_too_early",
        "infra_before_security",
        "repeated_same_action",
    }

    entries_by_episode = {}
    for report in analyses:
        if report.episode_ids:
            entries_by_episode[report.episode_ids[0]] = report.entries

    records: list[SFTRecord] = []
    for episode in episodes:
        # Group the recoverable failure types by step index in a single pass.
        failure_types_by_step = {}
        for entry in entries_by_episode.get(episode.episode_id or 0, []):
            if entry.failure_type in recoverable and entry.step_index is not None:
                failure_types_by_step.setdefault(entry.step_index, set()).add(
                    entry.failure_type
                )

        for step in episode.step_records:
            if step.step_index not in failure_types_by_step:
                continue
            if step.teacher_action is None or not step.next_prompt_text:
                continue

            user_content = (
                f"{step.next_prompt_text}\n\n"
                f"Previous wrong action: {step.cleaned_action}\n"
                f"Penalty or result: reward={step.reward}"
            )
            observation = step.observation
            record = SFTRecord(
                source="recovery",
                scenario_id=episode.scenario_id,
                tick=step.tick,
                messages=[
                    {"role": "system", "content": "Recover from the previous mistake. Return the best next strict JSON action only."},
                    {"role": "user", "content": user_content},
                ],
                target_action=step.teacher_action,
                student_action=step.cleaned_action,
                parse_status=step.parse_status,
                tags=sorted(failure_types_by_step[step.step_index]) or ["recovery"],
                metadata={
                    "episode_id": episode.episode_id,
                    "step_index": step.step_index,
                    "teacher_override_used": step.teacher_override_used,
                    "failure_type": observation.get("failure_type"),
                    "why_failed": observation.get("why_failed"),
                    "loop_warning": observation.get("loop_warning"),
                    "best_recovery_action_family": observation.get("best_recovery_action_family"),
                },
            )
            records.append(record)
    return records
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def combine_sft_records(
    *,
    baseline_records: list[SFTRecord],
    schema_records: list[SFTRecord],
    next_action_records: list[SFTRecord],
    recovery_records: list[SFTRecord],
) -> list[SFTRecord]:
    """Concatenate the four record groups into one new list.

    The order is fixed: baseline, schema repair, next action, recovery. The
    input lists are left unmodified.
    """
    merged: list[SFTRecord] = []
    for group in (
        baseline_records,
        schema_records,
        next_action_records,
        recovery_records,
    ):
        merged.extend(group)
    return merged
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def write_jsonl(records: list[SFTRecord], path: Path) -> None:
    """Write ``records`` to ``path`` as JSON Lines, one record per line.

    Missing parent directories are created and an existing file is
    overwritten. Each line is the record's ``model_dump_json()`` output
    followed by a newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(f"{record.model_dump_json()}\n" for record in records)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def load_episodes(path: Path) -> list[EpisodeRecord]:
    """Load every episode record stored at ``path`` via ``TrajectoryStore``."""
    store = TrajectoryStore(path)
    return store.load_episodes()
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def main() -> None:
    """CLI entry point: build every correction dataset from stored episodes.

    Reads episodes from ``--episodes``, analyzes each one, builds the
    baseline, schema-repair, next-action and recovery record sets plus their
    concatenation, and writes each as a JSONL file under ``--output-dir``.
    A one-line summary of the record counts is printed at the end.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--episodes", default="outputs/trainer/episodes.jsonl")
    cli.add_argument("--output-dir", required=True)
    options = cli.parse_args()

    output_dir = Path(options.output_dir)
    episodes = load_episodes(Path(options.episodes))

    # Imported inside the function, as in the original
    # (presumably to avoid an import cycle - TODO confirm).
    from .analyze_failures import analyze_episode

    analyses = [analyze_episode(episode) for episode in episodes]
    baseline_records = build_baseline_records()
    schema_records = build_schema_repair_records(episodes, analyses)
    next_action_records = build_next_action_records(episodes, analyses)
    recovery_records = build_recovery_records(episodes, analyses)
    combined_records = combine_sft_records(
        baseline_records=baseline_records,
        schema_records=schema_records,
        next_action_records=next_action_records,
        recovery_records=recovery_records,
    )

    # File name -> records, written in this order.
    outputs = (
        ("baseline_teacher_dataset.jsonl", baseline_records),
        ("schema_repair.jsonl", schema_records),
        ("next_action.jsonl", next_action_records),
        ("recovery.jsonl", recovery_records),
        ("sft_dataset.jsonl", combined_records),
    )
    for filename, rows in outputs:
        write_jsonl(rows, output_dir / filename)

    print(
        f"wrote baseline={len(baseline_records)} schema={len(schema_records)} "
        f"next_action={len(next_action_records)} recovery={len(recovery_records)} "
        f"combined={len(combined_records)} to {output_dir}"
    )
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
if __name__ == "__main__":
|
| 258 |
+
main()
|
unified_incident_env/trainer/build_sft_dataset.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build supervised JSONL datasets from baseline and replay trajectories."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from ..scripts.baseline_agent import plan_for_scenario
|
| 9 |
+
from ..server.challenge import SCENARIOS
|
| 10 |
+
from ..server.environment import UnifiedIncidentEnvironment
|
| 11 |
+
from .prompts import TRAINING_SYSTEM_PROMPT
|
| 12 |
+
from .trajectory_store import TrajectoryStore
|
| 13 |
+
from .types import SFTRecord
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def build_baseline_records() -> list[SFTRecord]:
    """Replay the baseline plan of every scenario and record each step.

    For each scenario id in ``SCENARIOS`` a fresh environment is reset and the
    actions from ``plan_for_scenario`` are applied in order. Before each
    action is applied, a record is emitted pairing the current observation's
    prompt with that action as the target.

    Returns:
        All baseline records, grouped by scenario in iteration order.
    """
    records: list[SFTRecord] = []
    for scenario_id in SCENARIOS:
        environment = UnifiedIncidentEnvironment()
        observation = environment.reset(scenario_id=scenario_id)
        plan = plan_for_scenario(scenario_id)
        for position, planned_action in enumerate(plan, start=1):
            messages = [
                {"role": "system", "content": TRAINING_SYSTEM_PROMPT},
                {"role": "user", "content": observation.prompt_text},
            ]
            record = SFTRecord(
                source="baseline",
                scenario_id=scenario_id,
                tick=observation.tick_count,
                messages=messages,
                target_action=planned_action.model_dump(exclude_none=True),
                tags=["teacher", f"step_{position}"],
            )
            records.append(record)
            # Advance only after recording, so the prompt is the one the
            # action responds to.
            observation = environment.step(planned_action)
    return records
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def build_replay_records(episodes_path: Path) -> list[SFTRecord]:
    """Build SFT records from stored episodes that carry teacher actions.

    Every step with a teacher action yields one record tagged with the
    episode mode and the step's parse status, plus ``"failure"`` when the step
    has a failure reason. Steps without a teacher action are skipped.

    Args:
        episodes_path: Location handed to ``TrajectoryStore``.

    Returns:
        The replay records in stored episode/step order.
    """
    records: list[SFTRecord] = []
    stored_episodes = TrajectoryStore(episodes_path).load_episodes()
    for episode in stored_episodes:
        for step in episode.step_records:
            if step.teacher_action is None:
                continue

            labels = [episode.mode, step.parse_status]
            if step.failure_reason:
                labels.append("failure")

            messages = [
                {"role": "system", "content": TRAINING_SYSTEM_PROMPT},
                {"role": "user", "content": step.prompt_text},
            ]
            record = SFTRecord(
                source="replay",
                scenario_id=episode.scenario_id,
                tick=step.tick,
                messages=messages,
                target_action=step.teacher_action,
                student_action=step.cleaned_action,
                parse_status=step.parse_status,
                tags=labels,
            )
            records.append(record)
    return records
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def write_jsonl(records: list[SFTRecord], output_path: Path) -> None:
    """Write ``records`` to ``output_path`` as JSON Lines.

    Missing parent directories are created and an existing file is
    overwritten. Each line is the record's ``model_dump_json()`` output
    followed by a newline.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as sink:
        sink.writelines(f"{record.model_dump_json()}\n" for record in records)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def main() -> None:
    """CLI entry point: write baseline and/or replay records to one JSONL file.

    ``--source`` picks which record sets are built (``baseline``, ``replay``
    or ``combined``, the default), ``--episodes`` is the stored-episode file
    used for replay records, and ``--output`` is the required destination.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--source",
        choices=["baseline", "replay", "combined"],
        default="combined",
    )
    cli.add_argument("--episodes", default="outputs/trainer/episodes.jsonl")
    cli.add_argument("--output", required=True)
    options = cli.parse_args()

    include_baseline = options.source in ("baseline", "combined")
    include_replay = options.source in ("replay", "combined")

    # Baseline rows always precede replay rows in the output.
    records: list[SFTRecord] = []
    if include_baseline:
        records += build_baseline_records()
    if include_replay:
        records += build_replay_records(Path(options.episodes))

    write_jsonl(records, Path(options.output))
    print(f"wrote {len(records)} rows to {options.output}")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
if __name__ == "__main__":
|
| 101 |
+
main()
|
unified_incident_env/trainer/collect_trajectory.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Collection wrapper that turns one episode into trajectory + analysis + summary."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from .analyze_failures import analyze_episode
|
| 6 |
+
from .types import EpisodeRecord, EpisodeSummaryRecord
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def collect_episode(
    *,
    runner,
    scenario_id: str,
    episode_id: int,
    mode: str,
    model_version: str,
) -> tuple[EpisodeRecord, EpisodeSummaryRecord, object]:
    """Run one episode, analyze it, and derive its summary record.

    The episode record returned by ``runner.run`` is updated in place with
    the failure figures from the analysis before the summary is built.

    Args:
        runner: Object whose ``run`` method executes the episode.
        scenario_id: Scenario to run.
        episode_id: Identifier passed to the runner and stored on the summary.
        mode: Run mode forwarded to the runner.
        model_version: Model version label forwarded to the runner.

    Returns:
        ``(episode record, summary record, analysis)``.
    """
    episode = runner.run(
        scenario_id=scenario_id,
        mode=mode,
        episode_id=episode_id,
        model_version=model_version,
    )
    report = analyze_episode(episode)

    # The schema figure comes from the summary mapping (0 when absent); the
    # other three are read straight off the report.
    failure_figures = {
        "schema_failures": report.summary.get("schema", 0),
        "policy_failures": report.policy_failures,
        "looping_failures": report.looping_failures,
        "reasoning_failures": report.reasoning_failures,
    }
    for field_name, value in failure_figures.items():
        setattr(episode, field_name, value)

    # Fields copied unchanged from the episode record onto the summary.
    copied_fields = (
        "run_id",
        "scenario_id",
        "difficulty",
        "model_name",
        "model_version",
        "mode",
        "steps",
        "success",
        "final_score",
        "json_valid_steps",
        "strict_schema_valid_steps",
        "teacher_override_count",
        "repair_retry_count",
        "security_subquest_completed",
        "postmortem_completed",
        "stopped_reason",
        "elapsed_s",
    )
    summary = EpisodeSummaryRecord(
        episode_id=episode_id,
        **{name: getattr(episode, name) for name in copied_fields},
        **failure_figures,
    )
    return episode, summary, report
|
unified_incident_env/trainer/eval_models.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Batch evaluation for one or more models."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
from ..server.challenge import SCENARIOS
|
| 11 |
+
from .action_adapter import LenientActionAdapter, StrictActionParser
|
| 12 |
+
from .backend import OpenAICompatibleBackend
|
| 13 |
+
from .run_episode import EpisodeRunner
|
| 14 |
+
from .trajectory_store import TrajectoryStore
|
| 15 |
+
from .types import EvalScenarioResult, EvalSummary
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def summarize(results: list[EvalScenarioResult], mode: str) -> EvalSummary:
    """Aggregate per-scenario results into overall and grouped statistics.

    Computes the overall success rate, average final score and schema-failure
    rate (all ``0.0`` for an empty result list), plus the same figures grouped
    by model name and - without the schema-failure rate - by scenario id.
    All rates and averages are rounded to 4 decimals.

    Args:
        results: Individual evaluation results.
        mode: Evaluation mode label stored on the summary.

    Returns:
        An ``EvalSummary`` holding the raw results and the aggregates.
    """
    total = len(results)
    if total:
        success_rate = sum(1 for result in results if result.success) / total
        avg_score = sum(result.final_score for result in results) / total
        schema_failure_rate = sum(1 for result in results if result.schema_failure) / total
    else:
        success_rate = avg_score = schema_failure_rate = 0.0

    # Running tallies per model / per scenario, turned into rates below.
    model_tallies: dict[str, dict[str, float]] = {}
    scenario_tallies: dict[str, dict[str, float]] = {}
    for result in results:
        succeeded = 1.0 if result.success else 0.0

        per_model = model_tallies.setdefault(
            result.model_name,
            {"runs": 0.0, "successes": 0.0, "score_sum": 0.0, "schema_failures": 0.0},
        )
        per_model["runs"] += 1
        per_model["successes"] += succeeded
        per_model["score_sum"] += result.final_score
        per_model["schema_failures"] += 1.0 if result.schema_failure else 0.0

        per_scenario = scenario_tallies.setdefault(
            result.scenario_id,
            {"runs": 0.0, "successes": 0.0, "score_sum": 0.0},
        )
        per_scenario["runs"] += 1
        per_scenario["successes"] += succeeded
        per_scenario["score_sum"] += result.final_score

    by_model: dict[str, dict[str, float]] = {}
    for model_name, tally in model_tallies.items():
        divisor = tally["runs"] or 1.0  # never divide by zero
        by_model[model_name] = {
            "runs": tally["runs"],
            "success_rate": round(tally["successes"] / divisor, 4),
            "avg_score": round(tally["score_sum"] / divisor, 4),
            "schema_failure_rate": round(tally["schema_failures"] / divisor, 4),
        }

    by_scenario: dict[str, dict[str, float]] = {}
    for scenario_id, tally in scenario_tallies.items():
        divisor = tally["runs"] or 1.0  # never divide by zero
        by_scenario[scenario_id] = {
            "runs": tally["runs"],
            "success_rate": round(tally["successes"] / divisor, 4),
            "avg_score": round(tally["score_sum"] / divisor, 4),
        }

    return EvalSummary(
        mode=mode,
        results=results,
        success_rate=round(success_rate, 4),
        avg_score=round(avg_score, 4),
        schema_failure_rate=round(schema_failure_rate, 4),
        by_model=by_model,
        by_scenario=by_scenario,
    )
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def main() -> None:
    """Run every scenario against each requested model and persist a summary.

    Command-line flags:
        --models           one or more model names to evaluate (required).
        --mode             "strict" or "lenient"; selects the action parser
                           and is forwarded to each episode run.
        --base-url         URL handed to ``EpisodeRunner``.
        --api-base-url     URL handed to ``OpenAICompatibleBackend``; defaults
                           to ``$API_BASE_URL`` or a local endpoint.
        --api-key          key handed to the backend; defaults to
                           ``$OPENAI_API_KEY``, then ``$HF_TOKEN``, then "local".
        --output           where the summary JSON is written; when omitted a
                           mode-specific file under ``outputs/trainer`` is used.
        --episodes-output  path handed to ``TrajectoryStore``.

    Each episode is appended to the trajectory store as soon as it finishes.
    The aggregated summary is written to disk and echoed to stdout.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--models", nargs="+", required=True)
    cli.add_argument("--mode", choices=["strict", "lenient"], default="strict")
    cli.add_argument("--base-url", default="http://127.0.0.1:8000")

    default_api_base = os.environ.get("API_BASE_URL", "http://127.0.0.1:11434/v1")
    cli.add_argument("--api-base-url", default=default_api_base)

    # First non-empty credential wins; "local" is the placeholder fallback.
    default_api_key = (
        os.environ.get("OPENAI_API_KEY") or os.environ.get("HF_TOKEN") or "local"
    )
    cli.add_argument("--api-key", default=default_api_key)

    cli.add_argument("--output", default=None)
    cli.add_argument("--episodes-output", default="outputs/trainer/episodes.jsonl")
    args = cli.parse_args()

    llm_backend = OpenAICompatibleBackend(
        base_url=args.api_base_url,
        api_key=args.api_key,
    )
    if args.mode == "strict":
        action_parser = StrictActionParser()
    else:
        action_parser = LenientActionAdapter()
    store = TrajectoryStore(Path(args.episodes_output))

    results: list[EvalScenarioResult] = []
    for model_name in args.models:
        # One runner per model; backend and parser are shared across models.
        runner = EpisodeRunner(
            backend=llm_backend,
            parser=action_parser,
            model_name=model_name,
            base_url=args.base_url,
        )
        for scenario_id in SCENARIOS:
            episode = runner.run(scenario_id=scenario_id, mode=args.mode)
            store.append_episode(episode)

            reason = episode.failure_reason
            # A failure reason prefixed with "parse_failure" counts as a
            # schema failure; a missing/empty reason never does.
            is_schema_failure = bool(reason and reason.startswith("parse_failure"))
            outcome = EvalScenarioResult(
                model_name=model_name,
                scenario_id=scenario_id,
                success=episode.success,
                final_score=episode.final_score,
                failure_reason=reason,
                schema_failure=is_schema_failure,
                elapsed_s=episode.elapsed_s,
            )
            results.append(outcome)

    summary = summarize(results, mode=args.mode)

    destination = Path(args.output or f"outputs/trainer/{args.mode}_eval_summary.json")
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Serialize once and reuse the text for both the file and stdout.
    rendered = summary.model_dump_json(indent=2)
    destination.write_text(rendered, encoding="utf-8")
    print(rendered)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|