dakshdoesdev committed on
Commit
dc8501a
·
verified ·
1 Parent(s): 928cc3c

deploy sre-gym v2: easy/medium/hard scenarios + skill + verified-runbooks + demo

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +18 -0
  2. .gitignore +12 -0
  3. Dockerfile +21 -0
  4. Makefile +39 -0
  5. README.md +111 -4
  6. demo/pitch.md +49 -0
  7. demo/run_demo.sh +70 -0
  8. deploy/push_to_hf.sh +58 -0
  9. execution.md +53 -0
  10. inference.py +264 -0
  11. openenv.yaml +21 -0
  12. pyproject.toml +43 -0
  13. requirements.txt +1 -0
  14. run_demo.py +86 -0
  15. server/Dockerfile +21 -0
  16. server/__init__.py +1 -0
  17. server/app.py +14 -0
  18. server/requirements.txt +8 -0
  19. skill/SKILL.md +100 -0
  20. skill/tools/sre_gym_client.py +238 -0
  21. skill/verified-runbooks/.gitkeep +0 -0
  22. skill/verified-runbooks/db_config_rollout.md +23 -0
  23. skill/verified-runbooks/gateway_auth_rollout.md +21 -0
  24. skill/verified-runbooks/worker_deploy_cascade.md +23 -0
  25. unified_incident_env/README.md +10 -0
  26. unified_incident_env/__init__.py +17 -0
  27. unified_incident_env/client.py +35 -0
  28. unified_incident_env/interface.py +17 -0
  29. unified_incident_env/models.py +332 -0
  30. unified_incident_env/scripts/__init__.py +1 -0
  31. unified_incident_env/scripts/baseline_agent.py +43 -0
  32. unified_incident_env/scripts/walkthrough.py +41 -0
  33. unified_incident_env/server/__init__.py +1 -0
  34. unified_incident_env/server/app.py +148 -0
  35. unified_incident_env/server/challenge.py +753 -0
  36. unified_incident_env/server/environment.py +613 -0
  37. unified_incident_env/server/grader.py +145 -0
  38. unified_incident_env/tests/__init__.py +1 -0
  39. unified_incident_env/tests/test_environment.py +192 -0
  40. unified_incident_env/tests/test_submission_inference.py +119 -0
  41. unified_incident_env/tests/test_trainer.py +46 -0
  42. unified_incident_env/tests/test_trainer_session.py +32 -0
  43. unified_incident_env/trainer/__init__.py +8 -0
  44. unified_incident_env/trainer/action_adapter.py +204 -0
  45. unified_incident_env/trainer/analyze_failures.py +251 -0
  46. unified_incident_env/trainer/backend.py +165 -0
  47. unified_incident_env/trainer/build_datasets.py +258 -0
  48. unified_incident_env/trainer/build_sft_dataset.py +101 -0
  49. unified_incident_env/trainer/collect_trajectory.py +53 -0
  50. unified_incident_env/trainer/eval_models.py +145 -0
.dockerignore ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .git/
5
+ .pytest_cache/
6
+ outputs/
7
+ .omx/
8
+ .codex/
9
+ AGENTS.md
10
+ sre_env/
11
+ *.egg-info/
12
+ dist/
13
+ build/
14
+ .gemini/
15
+ madhav_trial/
16
+ *.png
17
+ *.npz
18
+ node_modules/
.gitignore ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ .pytest_cache/
4
+ *.pyc
5
+ learning_curve.png
6
+ .omx/
7
+ .codex/
8
+ outputs/
9
+ AGENTS.md
10
+ .sisyphus/
11
+ *.egg-info/
12
+ uv.lock
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ ENV PYTHONDONTWRITEBYTECODE=1 \
6
+ PYTHONUNBUFFERED=1 \
7
+ ENABLE_WEB_INTERFACE=true
8
+
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ build-essential \
11
+ curl \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ COPY . /app
15
+
16
+ RUN pip install --no-cache-dir --upgrade pip && \
17
+ pip install --no-cache-dir .
18
+
19
+ EXPOSE 8000
20
+
21
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
Makefile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Developer entry points for the sre-gym OpenEnv environment.
# Every target is a command alias, so all of them are declared phony.
.PHONY: install dev test baseline walkthrough trainer-eval trainer-dataset trainer-session docker-build docker-run validate clean

# Editable install including the test dependencies.
install:
	python3 -m pip install -e ".[dev]"
	@echo "Dependencies installed"

# Local server with auto-reload and the web interface enabled.
dev:
	ENABLE_WEB_INTERFACE=true uvicorn server.app:app --reload --host 0.0.0.0 --port 8000

test:
	pytest unified_incident_env/tests -v --tb=short

baseline:
	python -m unified_incident_env.scripts.baseline_agent

# Uses the easy v2 scenario id; the previous id (easy_sqli_db_outage) is not
# one of the three scenarios this release documents.
walkthrough:
	python -m unified_incident_env.scripts.walkthrough --scenario worker_deploy_cascade

trainer-eval:
	python -m unified_incident_env.trainer.eval_models --models qwen2.5:0.5b gemma2:2b qwen2.5:7b-instruct-q4_K_M --mode strict

trainer-dataset:
	python -m unified_incident_env.trainer.build_sft_dataset --source combined --output outputs/trainer/sft_dataset.jsonl

trainer-session:
	python -m unified_incident_env.trainer.run_session --model qwen2.5:0.5b --base-url http://127.0.0.1:8000

# linux/amd64 is pinned so the image matches the Hugging Face Space runtime.
docker-build:
	docker buildx build --platform linux/amd64 -t sre-env:latest .

docker-run:
	docker run -p 8000:8000 -e ENABLE_WEB_INTERFACE=true sre-env:latest

validate:
	openenv validate .

# Removes generated artefacts, including __pycache__ directories nested inside
# packages (a plain `rm -rf __pycache__` only covers the repository root).
clean:
	rm -rf outputs .pytest_cache
	find . -type d -name "__pycache__" -prune -exec rm -rf {} +
	find . -name "*.pyc" -delete
README.md CHANGED
@@ -1,10 +1,117 @@
1
  ---
2
- title: Sre Gym
3
- emoji: 📚
4
  colorFrom: red
5
- colorTo: pink
6
  sdk: docker
 
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: SRE Gym
3
+ emoji: 🚨
4
  colorFrom: red
5
+ colorTo: yellow
6
  sdk: docker
7
+ app_port: 8000
8
  pinned: false
9
+ license: apache-2.0
10
  ---
11
 
12
+ # sre-gym — Fault-injecting SRE training env for OpenEnv
13
+
14
+ Most SRE agent skills are runbooks and good intentions. **sre-gym** is the other half: a fault-injecting environment with deterministic grading where an agent diagnoses a real production-style incident, chooses a safe remediation, verifies recovery, and declares resolved. Every run is scored the same way twice.
15
+
16
+ - Spec-compliant OpenEnv environment (typed Pydantic action / observation / state, `reset` / `step` / `state`, `openenv validate` green).
17
+ - 3 curriculum scenarios — easy, medium, hard — with decoy services and causal dependencies.
18
+ - 11 bounded actions. Honest state transitions. No hidden oracles.
19
+ - 21 tests passing.
20
+ - Ships a Claude Code skill + verified-runbook loop — successful solves write markdown runbooks that the next run reads back.
21
+
22
+ ## 30-second demo
23
+
24
+ ```bash
25
+ ./demo/run_demo.sh
26
+ ```
27
+
28
+ Starts the env, solves each scenario cold, writes a runbook for each, re-solves to prove the loop. Full transcript takes ~10 seconds.
29
+
30
+ ## Curriculum
31
+
32
+ | Difficulty | Scenario | Story | Decoy | Correct path |
33
+ |---|---|---|---|---|
34
+ | easy | `worker_deploy_cascade` | Bad worker deploy → DB crash-loop → login 502s | — | rollback worker → restart db → verify → resolve |
35
+ | medium | `db_config_rollout` | DB config push shrank connection pool from 80→12 | recent worker deploy | rollback **db** → restart db → verify → resolve |
36
+ | hard | `gateway_auth_rollout` | Gateway auth-middleware rollout rejects valid logins | recent worker deploy | rollback **gateway** → verify → resolve (no restart) |
37
+
38
+ Rolling back the wrong service returns a negative reward and `failure_type="wrong_remediation_target"`. Restarting before the cause is removed re-inherits the bad state. `declare_resolved` is rejected until the scenario's resolution check passes against the actual world model.
39
+
40
+ ## Install
41
+
42
+ ```bash
43
+ # 1. Create a venv and install
44
+ python3 -m venv .venv && source .venv/bin/activate
45
+ pip install -e '.[dev]'
46
+
47
+ # 2. Start the env
48
+ uvicorn server.app:app --host 127.0.0.1 --port 8000
49
+
50
+ # 3. Run the baseline inference against it
51
+ export HF_TOKEN="…"; export ENV_BASE_URL=http://127.0.0.1:8000
52
+ python inference.py
53
+ ```
54
+
55
+ ## Install the Claude Code skill
56
+
57
+ ```bash
58
+ ln -s "$PWD/skill" "$HOME/.claude/skills/sre-gym"
59
+ ```
60
+
61
+ Then, in Claude Code, ask: *"Solve the db_config_rollout scenario in sre-gym."* The skill will drive the env via `skill/tools/sre_gym_client.py`, load any existing runbook from `skill/verified-runbooks/`, and append a fresh runbook on any clean solve (score > 0.85).
62
+
63
+ ## Architecture
64
+
65
+ ```
66
+ ┌────────────────────┐ HTTP / WS ┌──────────────────────┐
67
+ │ Claude Code │ ──────────────────▶ │ OpenEnv server │
68
+ │ (with sre-gym │ ◀────────────────── │ (FastAPI, uvicorn) │
69
+ │ skill loaded) │ obs, reward │ unified_incident_env │
70
+ └────────────────────┘ └──────────────────────┘
71
+ │ ▲
72
+ ▼ on clean solve (score > 0.85) │
73
+ ┌────────────────────┐ │
74
+ │ verified-runbooks/ │ ────── loaded at skill load ──┘
75
+ │ *.md │
76
+ └────────────────────┘
77
+ ```
78
+
79
+ ## Scoring
80
+
81
+ Deterministic, 5 dimensions, sums to a public score in `[0.01, 0.99]`:
82
+
83
+ - **Recovery** (0–0.4): critical-path services healthy
84
+ - **Containment** (0–0.3): root cause removed or offending service isolated
85
+ - **Verification** (0–0.35): `database_recovery` + `end_to_end` checks passed
86
+ - **Impact** (0–0.15): user-impact reduced
87
+ - **Efficiency** (0–0.10): budget preserved, no wasteful repeats
88
+
89
+ Target **> 0.85** for "clean solve." That's also the runbook-record threshold.
90
+
91
+ ## Repo layout
92
+
93
+ ```
94
+ unified_incident_env/ # env core: models, environment, grader, challenge, tests
95
+ server/ # OpenEnv entrypoint wrapper
96
+ skill/ # Claude Code skill: SKILL.md, tools/, verified-runbooks/
97
+ demo/ # run_demo.sh + pitch.md
98
+ inference.py # OpenAI-client baseline for OpenEnv hackathon submission
99
+ openenv.yaml # OpenEnv manifest
100
+ Dockerfile # HF Space deployment
101
+ ```
102
+
103
+ ## Verify
104
+
105
+ ```bash
106
+ pytest unified_incident_env/tests -q # 21 tests
107
+ python -m openenv.cli validate . # OpenEnv manifest check
108
+ docker build -t sre-engineer-llm:v2 . # HF Space image
109
+ ```
110
+
111
+ ## Roadmap — v2
112
+
113
+ Distill the accumulated `verified-runbooks/` corpus into a local 3B reviewer via [OpenClaw-RL](https://github.com/Gen-Verse/OpenClaw-RL)'s async GRPO-on-next-state loop. Same reward contract (`run_check` passes / `failure_type` absent), same grader, but a compact policy that runs without a frontier API.
114
+
115
+ ## License
116
+
117
+ Apache 2.0
demo/pitch.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # sre-gym — 60-second pitch
2
+
3
+ > You can't train SRE agents on production. We built the gym.
4
+
5
+ ## The story (00:00–01:00)
6
+
7
+ **[0:00–0:10 · Hook]** "Most SRE agent skills are prompts — a runbook and a good intention. We built the other half: a fault-injecting environment with deterministic grading, where every run is scored the same way twice."
8
+
9
+ **[0:10–0:25 · What it is]**
10
+ - OpenEnv-compliant. `openenv validate` passes.
11
+ - Three curriculum scenarios, easy → hard:
12
+ - **easy** `worker_deploy_cascade` — bad worker deploy cascades to a DB crash.
13
+ - **medium** `db_config_rollout` — DB config shrank the connection pool; a recent worker deploy is a decoy.
14
+ - **hard** `gateway_auth_rollout` — bad auth-middleware rollout; two plausible suspects, one right answer.
15
+ - 11 bounded actions, honest state transitions (rolling back the wrong thing *fails*), deterministic grader across recovery / containment / verification / impact / efficiency.
16
+ - 21 tests passing. One public Space URL.
17
+
18
+ **[0:25–0:55 · Live demo]** `./demo/run_demo.sh`
19
+ - Env starts. Three scenarios visible in `/tasks`.
20
+ - Runbook dir cleared; demo starts cold.
21
+ - Each scenario solves end-to-end (score ≈ 0.99, 8–10 steps).
22
+ - A markdown runbook is written per scenario from the successful trace.
23
+ - Re-solve the easy scenario — this time the skill loads the runbook first. Same score, same path, zero wasted investigation.
24
+ - Point to `skill/verified-runbooks/` — "Every clean solve makes the next one deterministic. No GRPO required for v1."
25
+
26
+ **[0:55–1:00 · Close]** "Install the skill by symlinking `skill/` into `~/.claude/skills/sre-gym`. Open source, Apache 2. v2 is the OpenClaw-RL loop — distill this corpus of verified runbooks into a local 3B reviewer."
27
+
28
+ ## The one technical claim you should be ready to defend
29
+
30
+ > "The env is honest."
31
+
32
+ - No hidden oracles. Rolling back the wrong service returns a negative reward and `failure_type="wrong_remediation_target"` — same observation contract as any other action.
33
+ - `declare_resolved` is rejected until the scenario's `resolution_check` passes, verified by actual service states in the world model, not a flag the grader peeks at.
34
+ - Rewards reward *effects*, not evidence-gathering — you can't farm the env by spamming `query_logs`.
35
+ - `restart_service` on the database before the root cause is removed returns a negative reward. Always. Because in the real world, it would crash again.
36
+
37
+ ## Judge Q&A cheat sheet
38
+
39
+ **"How is this different from running a real staging env?"**
40
+ Deterministic scoring. Every agent gets graded against the same signatures, same decoys, same tick budget. You can't do that on real infra.
41
+
42
+ **"Why only three scenarios?"**
43
+ Three clears the hackathon DQ gate (`easy/medium/hard`). Each has a decoy + causal chain — building another one is a data-entry exercise, not a design one. Adding scenarios #4–#20 is the v2 data scaling lane.
44
+
45
+ **"Why runbooks instead of GRPO?"**
46
+ For this submission, GRPO means 48 hours of training convergence risk on top of an env we just shipped. Markdown runbooks demonstrate the same loop (verified signal → persisted artefact → next run improves) in an auditable form. The GRPO wiring slots on top of the same traces when we're ready.
47
+
48
+ **"What's the skill actually doing at runtime?"**
49
+ The skill lives in `skill/SKILL.md`. It directs Claude (or any agent) to read `verified-runbooks/{scenario}.md` before the first action, drive the env through `skill/tools/sre_gym_client.py`, and append a fresh runbook on any solve with `final_score > 0.85`.
demo/run_demo.sh ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # sre-gym end-to-end demo.
3
+ # Spins up the env (or reuses a running one), solves each of the 3 scenarios
4
+ # with the baseline policy, records runbooks, shows the artefacts.
5
+ #
6
+ # Requires: python3.10+ with this package installed (`pip install -e .`) and curl.
6
+ # Uses the repo's .venv interpreter if present, otherwise the system python3.
8
+
9
+ set -euo pipefail
10
+ cd "$(dirname "$0")/.."
11
+
12
+ PORT="${PORT:-8013}"
13
+ URL="http://127.0.0.1:${PORT}"
14
+ PY="${PYTHON:-.venv/bin/python}"
15
+ RUNBOOK_DIR="skill/verified-runbooks"
16
+
17
+ banner() { printf '\n\033[1;36m== %s ==\033[0m\n' "$*"; }
18
+ ok() { printf '\033[0;32m ✓ %s\033[0m\n' "$*"; }
19
+
20
+ banner "0 / preflight"
21
+ if [[ ! -x "$PY" ]]; then
22
+ echo " note: $PY not found, falling back to system python3" >&2
23
+ PY="python3"
24
+ fi
25
+ "$PY" -c "import unified_incident_env" 2>/dev/null || {
26
+ echo " error: unified_incident_env not importable; run 'pip install -e .' first" >&2
27
+ exit 1
28
+ }
29
+ ok "python + package ready"
30
+
31
+ banner "1 / start env"
32
# Install the cleanup trap BEFORE launching anything, so the background server
# is also stopped when the health check below fails and the script exits early.
SERVER_STARTED=0
SERVER_PID=""
cleanup() {
  if [[ "$SERVER_STARTED" -eq 1 && -n "$SERVER_PID" ]]; then
    kill "$SERVER_PID" 2>/dev/null || true
  fi
}
trap cleanup EXIT

if curl -sf "$URL/health" > /dev/null 2>&1; then
  # Reuse a server somebody else started; it is not ours to kill on exit.
  ok "env already running on $URL"
else
  "$PY" -m uvicorn server.app:app --host 127.0.0.1 --port "$PORT" > /tmp/sre_gym_demo.log 2>&1 &
  SERVER_PID=$!
  SERVER_STARTED=1
  # Poll the health endpoint for up to ~6 seconds (20 attempts x 0.3s).
  for _ in $(seq 1 20); do
    if curl -sf "$URL/health" > /dev/null 2>&1; then break; fi
    sleep 0.3
  done
  curl -sf "$URL/health" > /dev/null || { echo " error: env failed to start" >&2; cat /tmp/sre_gym_demo.log >&2; exit 1; }
  ok "env started on $URL (pid $SERVER_PID)"
fi
47
+
48
+ banner "2 / available scenarios"
49
+ SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py list
50
+
51
+ banner "3 / clear prior runbooks (demo starts cold)"
52
+ rm -f "$RUNBOOK_DIR"/*.md
53
+ ok "runbook directory cleared"
54
+
55
+ for scenario in worker_deploy_cascade db_config_rollout gateway_auth_rollout; do
56
+ banner "4 / solve: $scenario"
57
+ SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py solve "$scenario"
58
+ SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py record-runbook "$scenario"
59
+ done
60
+
61
+ banner "5 / verified runbooks now on disk"
62
+ ls -1 "$RUNBOOK_DIR"/*.md | sed 's|^| |'
63
+
64
+ banner "6 / re-solve easy scenario — runbook is loaded this time"
65
+ SRE_GYM_URL="$URL" "$PY" skill/tools/sre_gym_client.py solve worker_deploy_cascade | tail -4
66
+
67
+ banner "done"
68
+ echo " install the skill globally: ln -s \"$PWD/skill\" \"\$HOME/.claude/skills/sre-gym\""
69
+ echo " env log: /tmp/sre_gym_demo.log"
70
+ echo " runbooks: $RUNBOOK_DIR/"
deploy/push_to_hf.sh ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Deploy this repo to a Hugging Face Space (Docker SDK).
3
+ #
4
+ # Required:
5
+ # HF_TOKEN write-scoped HF access token
6
+ # HF_SPACE_ID e.g. yourname/sre-gym (create it at huggingface.co/new-space
7
+ # first, SDK=Docker, or let this script try to create it)
8
+ #
9
+ # Usage:
10
+ # HF_TOKEN=hf_xxx HF_SPACE_ID=yourname/sre-gym ./deploy/push_to_hf.sh
11
+ #
12
+ # After a successful push, verify from a different network:
13
+ # curl https://${space_subdomain}.hf.space/health
14
+ # curl https://${space_subdomain}.hf.space/tasks | jq '.scenarios[].difficulty'
15
+
16
+ set -euo pipefail
17
+ cd "$(dirname "$0")/.."
18
+
19
+ : "${HF_TOKEN:?HF_TOKEN is required}"
20
+ : "${HF_SPACE_ID:?HF_SPACE_ID is required, e.g. yourname/sre-gym}"
21
+
22
+ if ! command -v huggingface-cli > /dev/null; then
23
+ echo "error: huggingface-cli not installed. pip install 'huggingface_hub[cli]'" >&2
24
+ exit 1
25
+ fi
26
+
27
echo "== syncing openenv.yaml with HF_SPACE_ID =="
# The heredoc delimiter is quoted so the shell expands nothing inside the
# Python source. The space id reaches Python through the environment instead
# of being pasted into string literals, where quotes, braces or backslashes in
# the value would corrupt the program.
HF_SPACE_ID="$HF_SPACE_ID" python3 - <<'PY'
import os
import pathlib
import re

space_id = os.environ["HF_SPACE_ID"]
path = pathlib.Path("openenv.yaml")
text = path.read_text()
# A callable replacement stops re.sub from interpreting backslashes or group
# references in the id; the captured indent is written back unchanged so the
# YAML nesting is preserved whatever its width.
text = re.sub(
    r"^([ \t]*)space_id:.*$",
    lambda match: f"{match.group(1)}space_id: {space_id}",
    text,
    flags=re.M,
)
path.write_text(text)
print(f"openenv.yaml space_id -> {space_id}")
PY
36
+
37
+ echo "== ensuring the space exists (idempotent) =="
38
+ huggingface-cli repo create "$HF_SPACE_ID" \
39
+ --type space \
40
+ --space_sdk docker \
41
+ --token "$HF_TOKEN" \
42
+ --yes 2>&1 | grep -v "already created" || true
43
+
44
+ echo "== uploading repo =="
45
+ huggingface-cli upload "$HF_SPACE_ID" . \
46
+ --repo-type space \
47
+ --token "$HF_TOKEN" \
48
+ --commit-message "deploy sre-gym v2 (easy/medium/hard scenarios)"
49
+
50
+ subdomain="$(echo "$HF_SPACE_ID" | tr '/' '-')"
51
+ echo
52
+ echo "== deployment kicked off =="
53
+ echo " Logs: https://huggingface.co/spaces/$HF_SPACE_ID"
54
+ echo " Public: https://$subdomain.hf.space"
55
+ echo
56
+ echo "== verify from a different network (phone hotspot) =="
57
+ echo " curl https://$subdomain.hf.space/health"
58
+ echo " curl https://$subdomain.hf.space/tasks | jq '.scenarios[].difficulty'"
execution.md ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # How To Run (v2)
2
+
3
+ ## 1. Setup
4
+
5
+ ```bash
6
+ python3 -m venv .venv
7
+ source .venv/bin/activate
8
+ pip install -e '.[dev]'
9
+ ```
10
+
11
+ ## 2. Start the environment
12
+
13
+ ```bash
14
+ source .venv/bin/activate
15
+ uvicorn server.app:app --host 127.0.0.1 --port 8000
16
+ ```
17
+
18
+ ## 3. Manual API smoke test
19
+
20
+ ```bash
21
+ curl -X POST http://127.0.0.1:8000/reset -H 'content-type: application/json' -d '{}'
22
+ curl -X POST http://127.0.0.1:8000/step -H 'content-type: application/json' -d '{"action":{"action_type":"query_deploys","service":"worker"}}'
23
+ ```
24
+
25
+ ## 4. Run inference
26
+
27
+ ```bash
28
+ source .venv/bin/activate
29
+
30
+ export HF_TOKEN="your_hf_token"
31
+ export API_BASE_URL="https://router.huggingface.co/v1"
32
+ export MODEL_NAME="Qwen/Qwen2.5-72B-Instruct:novita"
33
+ export ENV_BASE_URL="http://127.0.0.1:8000"
34
+
35
+ python inference.py
36
+ ```
37
+
38
+ ## 5. Verification
39
+
40
+ ```bash
41
+ source .venv/bin/activate
42
+ pytest unified_incident_env/tests -q
43
+ openenv validate .
44
+ ```
45
+
46
+ ## 6. Reward semantics
47
+
48
+ - queries reveal evidence but do not directly mint positive breadcrumb reward
49
+ - remediation actions change the world state
50
+ - `run_check` verifies recovery explicitly
51
+ - `declare_resolved` succeeds only after objective checks pass
52
+
53
+ Public benchmark score is deterministic and separate from the per-step training reward.
inference.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Submission inference script for the honest narrow incident environment."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import json
7
+ import os
8
+ from typing import Any
9
+
10
+ from openai import OpenAI
11
+
12
+ from unified_incident_env.client import UnifiedIncidentEnv
13
+ from unified_incident_env.models import UnifiedIncidentAction, UnifiedIncidentObservation
14
+ from unified_incident_env.server.challenge import DEFAULT_SCENARIO_ID, SCENARIOS
15
+
16
# Runtime configuration, overridable through environment variables. Every
# lookup with a default uses ``or`` so that a variable exported as an empty
# string falls back to the default instead of yielding an unusable empty
# URL or model name.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct:novita"
# No default on purpose: create_client() returns None when the token is unset
# or empty, and the run then relies on the scripted fallback policy.
HF_TOKEN = os.getenv("HF_TOKEN")
ENV_BASE_URL = os.getenv("ENV_BASE_URL") or UnifiedIncidentEnv.DEFAULT_BASE_URL
ENV_NAME = "unified-incident-env"  # label printed in the [START] log line
MAX_TOKENS = 260  # completion budget for one action JSON object
22
+
23
+
24
def create_client() -> OpenAI | None:
    """Build the OpenAI-compatible client used to query the model.

    Returns:
        A client bound to ``API_BASE_URL`` and authenticated with
        ``HF_TOKEN``, or ``None`` when no token is configured.
    """
    return OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) if HF_TOKEN else None
28
+
29
+
30
def log_start(*, task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line that opens an episode transcript.

    Args:
        task: Scenario identifier being run.
        env: Environment name.
        model: Model name used for action selection.
    """
    fields = ("[START]", f"task={task}", f"env={env}", f"model={model}")
    print(" ".join(fields), flush=True)
32
+
33
+
34
def log_step(*, step: int, action: str, reward: float, done: bool, error: str | None) -> None:
    """Print one ``[STEP]`` transcript line.

    Args:
        step: 1-based index of the step within the episode.
        action: Action rendered as text (the caller passes compact JSON).
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode ended; printed lower-cased.
        error: Error label for the step; an empty or missing value prints
            as ``null``.
    """
    error_text = error if error else "null"
    fields = (
        f"step={step}",
        f"action={action}",
        f"reward={reward:.2f}",
        f"done={str(done).lower()}",
        f"error={error_text}",
    )
    print("[STEP] " + " ".join(fields), flush=True)
39
+
40
+
41
def log_end(*, success: bool, steps: int, score: float, rewards: list[float]) -> None:
    """Print the ``[END]`` line that closes an episode transcript.

    Args:
        success: Whether the episode counts as solved; printed lower-cased.
        steps: Number of steps taken.
        score: Final score, printed with two decimals.
        rewards: Per-step rewards, printed comma-separated with two decimals
            each (an empty list prints nothing after ``rewards=``).
    """
    rewards_csv = ",".join(map("{:.2f}".format, rewards))
    summary = (
        f"[END] success={str(success).lower()}"
        f" steps={steps}"
        f" score={score:.2f}"
        f" rewards={rewards_csv}"
    )
    print(summary, flush=True)
44
+
45
+
46
+ def _service_order(observation: UnifiedIncidentObservation) -> list[str]:
47
+ services = list(observation.service_health.items())
48
+ services.sort(key=lambda item: (item[1].status != "crashed", item[1].status != "degraded", -item[1].error_rate_pct))
49
+ return [name for name, _payload in services]
50
+
51
+
52
def _default_action_for_type(action_type: str, observation: UnifiedIncidentObservation) -> dict[str, Any]:
    """Build a plausible default payload for ``action_type``.

    Args:
        action_type: Name of the action to fill in.
        observation: Current observation; supplies the service ranking and
            the database health used to choose a check.

    Returns:
        A dict of keyword arguments for the action. Action types without a
        dedicated rule get only ``{"action_type": action_type}``.
    """
    ranked = _service_order(observation)
    worst_service = ranked[0] if ranked else "database"

    service_only = frozenset(
        (
            "query_logs",
            "query_dependencies",
            "query_deploys",
            "rollback_deploy",
            "restart_service",
            "isolate_service",
        )
    )
    if action_type in service_only:
        # Rollbacks always target the worker; every other service-scoped
        # action targets the most unhealthy service.
        target = "worker" if action_type == "rollback_deploy" else worst_service
        return {"action_type": action_type, "service": target}

    if action_type == "query_metrics":
        return {"action_type": action_type, "service": worst_service, "metric": "cpu"}

    if action_type == "run_check":
        database = observation.service_health.get("database")
        # Once the database reports healthy, move on to the end-to-end check.
        if database and database.status == "healthy":
            return {"action_type": action_type, "check_name": "end_to_end"}
        return {"action_type": action_type, "check_name": "database_recovery"}

    if action_type == "submit_hypothesis":
        hypothesis = {
            "root_cause": "bad_worker_deploy",
            "affected_services": ["worker", "database"],
            "confidence": 0.5,
            "recommended_next_action": "query_deploys",
        }
        return {"action_type": "submit_hypothesis", "hypothesis": hypothesis}

    return {"action_type": action_type}
77
+
78
+
79
def parse_action(raw: str, observation: UnifiedIncidentObservation) -> UnifiedIncidentAction | None:
    """Turn the model's raw reply into a validated action.

    Args:
        raw: Text returned by the model; expected to be one JSON object.
        observation: Current observation; its ``allowed_actions`` gate which
            action types are accepted.

    Returns:
        The parsed action, or ``None`` when the reply is empty, is not a JSON
        object, names an action that is not currently allowed, or fails
        ``UnifiedIncidentAction`` construction.
    """
    text = raw.strip()
    if not text:
        return None
    try:
        data = json.loads(text)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass: malformed JSON.
        return None
    if not isinstance(data, dict):
        return None

    # Accept "action" as an alias for "action_type" when it carries a name.
    # The alias key is always dropped afterwards so it is never forwarded to
    # the action constructor as an unexpected field.
    if "action_type" not in data and isinstance(data.get("action"), str):
        data["action_type"] = data["action"]
    data.pop("action", None)

    if data.get("action_type") not in observation.allowed_actions:
        return None
    try:
        return UnifiedIncidentAction(**data)
    except (TypeError, ValueError):
        # Presumably pydantic validation errors (ValueError subclasses) or bad
        # keyword arguments; anything else is a programming error that should
        # surface to the caller rather than be masked here.
        return None
99
+
100
+
101
def build_user_prompt(observation: UnifiedIncidentObservation) -> str:
    """Render the observation as the user message sent to the model.

    The prompt lists the workflow stage, incident summary, score, the outcome
    of the previous action, impact figures, the allowed actions, the fields
    each action requires, and the status of every check. Missing or empty
    text fields are rendered as ``none``.
    """
    required_lines = [
        f"- {action}: {', '.join(fields) if fields else '(no extra fields)'}"
        for action, fields in observation.required_fields_by_action.items()
    ]
    check_lines = [
        f"- {check.name}: {'passed' if check.passed else 'pending'} - {check.detail}"
        for check in observation.checks
    ]
    # Each multi-line list is joined into ONE section first, so an empty list
    # still contributes an (empty) line between its heading and the next one.
    sections = [
        "Return exactly one JSON object representing the next action.",
        f"Current stage: {observation.workflow_stage}",
        f"Incident summary: {observation.incident_summary}",
        f"Current score: {observation.final_score:.4f}",
        f"Last action result: {observation.last_action_result or 'none'}",
        f"Tool output: {observation.tool_output or 'none'}",
        f"Failure: {observation.failure_type or 'none'}",
        f"Why failed: {observation.why_failed or 'none'}",
        f"User impact: {observation.user_impact:.2f}",
        f"SLO burn rate: {observation.slo_burn_rate:.2f}",
        "Allowed actions:",
        "\n".join(f"- {action}" for action in observation.allowed_actions),
        "Required fields:",
        "\n".join(required_lines),
        "Checks:",
        "\n".join(check_lines) or "- none",
    ]
    return "\n".join(sections)
127
+
128
+
129
+ def _schema(observation: UnifiedIncidentObservation) -> dict[str, Any]:
130
+ properties: dict[str, Any] = {
131
+ "action_type": {"type": "string", "enum": observation.allowed_actions},
132
+ "service": {"type": "string", "enum": sorted(observation.service_health)},
133
+ "metric": {"type": "string", "enum": ["cpu", "error_rate", "latency"]},
134
+ "check_name": {"type": "string", "enum": ["database_recovery", "end_to_end"]},
135
+ "hypothesis": {
136
+ "type": "object",
137
+ "properties": {
138
+ "root_cause": {"type": "string", "enum": ["bad_worker_deploy", "database_only_failure", "api_gateway_fault"]},
139
+ "affected_services": {
140
+ "type": "array",
141
+ "items": {"type": "string", "enum": sorted(observation.service_health)},
142
+ "minItems": 1,
143
+ },
144
+ "confidence": {"type": "number", "minimum": 0.0, "maximum": 1.0},
145
+ "recommended_next_action": {
146
+ "type": "string",
147
+ "enum": [
148
+ "query_logs",
149
+ "query_metrics",
150
+ "query_dependencies",
151
+ "query_deploys",
152
+ "rollback_deploy",
153
+ "restart_service",
154
+ "run_check",
155
+ "isolate_service",
156
+ "escalate",
157
+ "declare_resolved",
158
+ ],
159
+ },
160
+ },
161
+ "required": ["root_cause", "affected_services", "confidence", "recommended_next_action"],
162
+ "additionalProperties": False,
163
+ },
164
+ }
165
+ required = ["action_type"]
166
+ for action, fields in observation.required_fields_by_action.items():
167
+ if action in observation.allowed_actions:
168
+ for field in fields:
169
+ if field not in required:
170
+ required.append(field)
171
+ return {
172
+ "type": "object",
173
+ "properties": properties,
174
+ "required": required,
175
+ "additionalProperties": False,
176
+ }
177
+
178
+
179
def request_action(client: OpenAI, observation: UnifiedIncidentObservation) -> str:
    """Ask the model for the next action and return its raw text reply.

    The request uses a strict JSON-schema response format derived from the
    observation, temperature 0.0, and at most ``MAX_TOKENS`` tokens.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the content is missing.
    """
    messages = [
        {"role": "system", "content": "You are an incident responder. Respond with JSON only."},
        {"role": "user", "content": build_user_prompt(observation)},
    ]
    response_format = {
        "type": "json_schema",
        "json_schema": {
            "name": "incident_action",
            "strict": True,
            "schema": _schema(observation),
        },
    }
    completion = client.chat.completions.create(
        model=MODEL_NAME,
        messages=messages,
        response_format=response_format,
        max_tokens=MAX_TOKENS,
        temperature=0.0,
    )
    content = completion.choices[0].message.content
    return (content or "").strip()
198
+
199
+
200
def build_fallback_action(observation: UnifiedIncidentObservation) -> UnifiedIncidentAction:
    """Choose a scripted action when no usable model action is available.

    Preference order: inspect the worker's deploys, then query logs of the
    most unhealthy service, then query its CPU metric, and finally the
    default payload for the first allowed action.
    """
    allowed = observation.allowed_actions
    ranked = _service_order(observation)
    target = ranked[0] if ranked else "database"

    if "query_deploys" in allowed and "worker" in observation.service_health:
        return UnifiedIncidentAction(action_type="query_deploys", service="worker")
    if "query_logs" in allowed:
        return UnifiedIncidentAction(action_type="query_logs", service=target)
    if "query_metrics" in allowed:
        return UnifiedIncidentAction(action_type="query_metrics", service=target, metric="cpu")

    # None of the preferred queries is available: take the first allowed action.
    payload = _default_action_for_type(allowed[0], observation)
    return UnifiedIncidentAction(**payload)
210
+
211
+
212
def get_model_action(client: OpenAI | None, observation: UnifiedIncidentObservation) -> tuple[UnifiedIncidentAction, str | None]:
    """Pick the next action, preferring the model and degrading to a fallback.

    Args:
        client: Model client, or ``None`` when no model is configured.
        observation: Current observation.

    Returns:
        ``(action, error_label)``. The label is ``None`` when the model's
        reply was used, ``"model_unavailable"`` when there is no client, and
        ``"fallback_used"`` when the request failed or the reply could not be
        parsed into an allowed action.
    """
    import sys  # local import: diagnostics only, keeps module imports untouched

    if client is None:
        return build_fallback_action(observation), "model_unavailable"

    parsed = None
    try:
        parsed = parse_action(request_action(client, observation), observation)
    except Exception as exc:
        # Deliberate catch-all boundary: any model or transport failure must
        # degrade to the fallback policy rather than abort the episode. The
        # cause is reported on stderr so stdout keeps only the
        # [START]/[STEP]/[END] transcript lines.
        print(
            f"[WARN] model action failed: {type(exc).__name__}: {exc}",
            file=sys.stderr,
            flush=True,
        )
    if parsed is not None:
        return parsed, None
    return build_fallback_action(observation), "fallback_used"
222
+
223
+
224
def run_scenario(client: OpenAI | None, scenario_id: str) -> dict[str, Any]:
    """Run one scenario to completion and print its transcript.

    Args:
        client: Model client, or ``None`` to run on the fallback policy only.
        scenario_id: Identifier of the scenario to reset the environment to.

    Returns:
        A dict with ``success``, ``score``, ``steps`` and ``rewards``.

    Raises:
        Whatever the environment client or action selection raises; the
        ``[END]`` line is still printed first so every ``[START]`` has a
        matching ``[END]``.
    """
    with UnifiedIncidentEnv(base_url=ENV_BASE_URL).sync() as env:
        observation = env.reset(scenario_id=scenario_id).observation
        rewards: list[float] = []
        step = 0
        log_start(task=scenario_id, env=ENV_NAME, model=MODEL_NAME)
        try:
            while not observation.done:
                action, error = get_model_action(client, observation)
                result = env.step(action)
                observation = result.observation
                # NOTE(review): the step result's reward appears to be
                # optional in OpenEnv; treat a missing reward as 0.0 rather
                # than crash in float(None). Confirm against the client API.
                reward = float(result.reward or 0.0)
                rewards.append(reward)
                # Count only completed steps so the [END] total stays
                # accurate when a later step raises.
                step = len(rewards)
                log_step(
                    step=step,
                    action=json.dumps(action.model_dump(exclude_none=True), separators=(",", ":")),
                    reward=reward,
                    done=bool(result.done),
                    error=error or observation.failure_type,
                )
        finally:
            # Runs on success and on failure; on failure it reflects the last
            # observation received before the exception.
            success = bool(observation.done and observation.incident_resolved)
            log_end(
                success=success,
                steps=step,
                score=observation.final_score,
                rewards=rewards,
            )
    return {
        "success": success,
        "score": observation.final_score,
        "steps": step,
        "rewards": rewards,
    }
255
+
256
+
257
def main() -> None:
    """Run every scenario in ``SCENARIOS`` once, sharing one model client."""
    model_client = create_client()
    scenario_ids = list(SCENARIOS)
    for current_id in scenario_ids:
        run_scenario(model_client, current_id)
261
+
262
+
263
+ if __name__ == "__main__":
264
+ main()
openenv.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: sre-engineer-llm
2
+ version: 2.0.0
3
+ description: >
4
+ Honest narrow OpenEnv benchmark for incident diagnosis and safe remediation.
5
+ Agents query evidence, choose bounded remediation actions, run explicit checks,
6
+ and declare resolution only after objective recovery succeeds.
7
+ author: Daksh Verma
8
+ license: MIT
9
+
10
+ environment:
11
+ action_type: UnifiedIncidentAction
12
+ observation_type: UnifiedIncidentObservation
13
+ state_type: UnifiedIncidentState
14
+ max_steps: 12
15
+ difficulties: [easy, medium, hard]
16
+ reward_type: dense
17
+
18
+ huggingface:
19
+ space_id: dakshdoesdev/sre-gym
20
+ sdk: docker
21
+ hardware: cpu-basic
pyproject.toml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "unified-incident-env"
7
+ version = "1.0.0"
8
+ description = "Unified OpenEnv benchmark for incident response with causally linked WebSec remediation"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ dependencies = [
12
+ "openenv-core>=0.2.1",
13
+ "fastapi>=0.115.0",
14
+ "uvicorn[standard]>=0.30.0",
15
+ "pydantic>=2.8.0",
16
+ "httpx>=0.27.0",
17
+ "openai>=1.0.0",
18
+ "websockets>=12.0",
19
+ "rich>=13.0.0",
20
+ "matplotlib>=3.9.0",
21
+ "numpy>=2.0.0"
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ dev = [
26
+ "pytest>=8.0.0",
27
+ "pytest-asyncio>=0.23.0"
28
+ ]
29
+
30
+ [project.scripts]
31
+ server = "server.app:main"
32
+ baseline = "unified_incident_env.scripts.baseline_agent:main"
33
+ walkthrough = "unified_incident_env.scripts.walkthrough:main"
34
+ trainer-run-episode = "unified_incident_env.trainer.run_episode:main"
35
+ trainer-build-dataset = "unified_incident_env.trainer.build_sft_dataset:main"
36
+ trainer-eval-models = "unified_incident_env.trainer.eval_models:main"
37
+ trainer-build-datasets = "unified_incident_env.trainer.build_datasets:main"
38
+ trainer-update-model = "unified_incident_env.trainer.update_model:main"
39
+ trainer-run-session = "unified_incident_env.trainer.run_session:main"
40
+ trainer-train-external = "unified_incident_env.trainer.train_external:main"
41
+
42
+ [tool.hatch.build.targets.wheel]
43
+ packages = ["unified_incident_env", "server"]
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ -e .
run_demo.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Run a local end-to-end benchmark demo against the OpenEnv server."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import os
7
+ import signal
8
+ import subprocess
9
+ import sys
10
+ import time
11
+ from pathlib import Path
12
+
13
+ import httpx
14
+
15
+
16
+ REPO_ROOT = Path(__file__).resolve().parent
17
+ BASE_URL = os.getenv("ENV_BASE_URL", "http://127.0.0.1:8000")
18
+ HEALTH_URL = f"{BASE_URL.rstrip('/')}/health"
19
+
20
+
21
def server_is_ready() -> bool:
    """Return True when a GET on ``HEALTH_URL`` answers with HTTP 200.

    Any exception raised by the request is treated as "not ready".
    """
    try:
        return httpx.get(HEALTH_URL, timeout=2.0).status_code == 200
    except Exception:
        return False
27
+
28
+
29
def start_server() -> subprocess.Popen[str]:
    """Spawn uvicorn serving ``server.app:app`` on 127.0.0.1:8000 from the repo root."""
    command = [sys.executable, "-m", "uvicorn", "server.app:app"]
    command += ["--host", "127.0.0.1"]
    command += ["--port", "8000"]
    return subprocess.Popen(command, cwd=REPO_ROOT, text=True)
44
+
45
+
46
def wait_for_server(timeout_s: float = 20.0) -> None:
    """Poll the health endpoint every 0.5 s until it answers or the timeout elapses.

    Args:
        timeout_s: Maximum number of seconds to keep polling.

    Raises:
        RuntimeError: If the server is still not ready when the deadline passes.
    """
    # Use the monotonic clock: a wall-clock adjustment (NTP step, DST, manual
    # change) must not stretch or cut short the wait.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        if server_is_ready():
            return
        time.sleep(0.5)
    raise RuntimeError(f"Server did not become ready at {HEALTH_URL}")
53
+
54
+
55
def stop_server(process: subprocess.Popen[str]) -> None:
    """Stop a server subprocess, escalating from SIGTERM to kill if needed.

    Does nothing when the process has already exited. Otherwise sends SIGTERM
    and waits up to 10 seconds; if the process is still running it is killed
    and then reaped.
    """
    if process.poll() is not None:
        return
    process.send_signal(signal.SIGTERM)
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        process.kill()
        # Reap the killed child so it does not linger as a zombie and so its
        # return code is recorded on the Popen object.
        process.wait()
63
+
64
+
65
def main() -> None:
    """Run ``inference.py`` against the env server, starting one locally if needed.

    A server that this function started is always stopped on the way out; a
    server that was already running is left alone.
    """
    spawned: subprocess.Popen[str] | None = None
    try:
        already_up = server_is_ready()
        if not already_up:
            spawned = start_server()
            wait_for_server()

        child_env = os.environ.copy()
        # Only fills in ENV_BASE_URL when the caller did not set it already.
        child_env.setdefault("ENV_BASE_URL", BASE_URL)
        inference_cmd = [sys.executable, "inference.py"]
        subprocess.run(inference_cmd, cwd=REPO_ROOT, env=child_env, check=True)
    finally:
        if spawned is not None:
            stop_server(spawned)
83
+
84
+
85
+ if __name__ == "__main__":
86
+ main()
server/Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ WORKDIR /app
4
+
5
+ ENV PYTHONDONTWRITEBYTECODE=1 \
6
+ PYTHONUNBUFFERED=1 \
7
+ ENABLE_WEB_INTERFACE=true
8
+
9
+ RUN apt-get update && apt-get install -y --no-install-recommends \
10
+ build-essential \
11
+ curl \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ COPY . /app
15
+
16
+ RUN pip install --no-cache-dir --upgrade pip && \
17
+ pip install --no-cache-dir .
18
+
19
+ EXPOSE 8000
20
+
21
+ CMD ["uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8000"]
server/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """OpenEnv server wrapper package."""
server/app.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Top-level OpenEnv entrypoint wrapper."""
2
+
3
+ from unified_incident_env.server.app import app, serve
4
+ from unified_incident_env.server.app import main as _main
5
+
6
+ __all__ = ["app", "main", "serve"]
7
+
8
+
9
def main() -> None:
    """Delegate to the packaged server entrypoint imported as ``_main``."""
    _main()
11
+
12
+
13
+ if __name__ == "__main__":
14
+ main()
server/requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ openenv-core>=0.2.1
2
+ fastapi>=0.115.0
3
+ uvicorn[standard]>=0.30.0
4
+ pydantic>=2.8.0
5
+ websockets>=12.0
6
+ openai>=1.0.0
7
+ matplotlib>=3.9.0
8
+ numpy>=2.0.0
skill/SKILL.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ name: sre-gym
3
+ description: SRE incident-response training environment with fault injection and deterministic grading. Use when the user wants to practice SRE skills, solve an injected production incident, or run one of three scenarios (worker_deploy_cascade / db_config_rollout / gateway_auth_rollout) against the sre-gym HTTP server. Invokes scripts in skill/tools/ to query the env and records verified runbooks after clean solves.
4
+ ---
5
+
6
+ # SRE Gym — Incident Response Skill
7
+
8
+ You are an SRE agent connected to a running sre-gym environment (HTTP, default `http://127.0.0.1:8000`). The env simulates production incidents with decoy services, deterministic grading, and explicit resolution checks. Your job is to diagnose from evidence, pick the correct remediation, verify recovery, then declare resolved.
9
+
10
+ ## When to use this skill
11
+
12
+ - The user names a scenario (`worker_deploy_cascade`, `db_config_rollout`, `gateway_auth_rollout`) or says "solve an incident / run SRE scenario"
13
+ - The user asks you to practice, benchmark, or demo incident response
14
+ - The user points you at an sre-gym URL
15
+
16
+ ## Core rules (never break these)
17
+
18
+ 1. **Never guess at remediation.** Query evidence (`query_logs`, `query_deploys`, `query_metrics`) before `rollback_deploy` / `restart_service`.
19
+ 2. **Root cause before restart.** Restarting a service before rolling back the triggering change re-inherits the bad state.
20
+ 3. **Never call `declare_resolved` before the scenario's resolution check passes.** Each scenario specifies which check is required; read it from `observation.checks` and from any loaded runbook.
21
+ 4. **Watch for decoys.** Each scenario has a plausible-looking wrong answer. Example: `db_config_rollout` has a recent worker deploy that is *not* the cause. Read logs before committing to a target.
22
+ 5. **Repeating the same no-progress action wastes ticks.** The env emits `loop_warning` when you do this — treat it as a hard signal to try a different evidence source.
23
+
24
+ ## Workflow
25
+
26
+ ### 1. Load prior knowledge
27
+
28
+ Before your first action, check `skill/verified-runbooks/{scenario_id}.md`. If it exists, read it — it's a log of previously-successful solves for this exact scenario, written by earlier runs of this skill. Use the winning path and the decoy list.
29
+
30
+ ### 2. Drive the env
31
+
32
+ Use `skill/tools/sre_gym_client.py` to call the env:
33
+
34
+ ```bash
35
+ python skill/tools/sre_gym_client.py list # show available scenarios
36
+ python skill/tools/sre_gym_client.py solve <id> # run the baseline policy end-to-end
37
+ python skill/tools/sre_gym_client.py interactive <id> # one episode; one JSON action per stdin line
38
+ python skill/tools/sre_gym_client.py record-runbook <id> # append a runbook entry from the saved session
39
+ ```
40
+
41
+ Action JSON matches the env's `UnifiedIncidentAction` model. Examples:
42
+ ```json
43
+ {"action_type": "query_logs", "service": "database"}
44
+ {"action_type": "query_deploys", "service": "worker"}
45
+ {"action_type": "rollback_deploy", "service": "database"}
46
+ {"action_type": "run_check", "check_name": "end_to_end"}
47
+ {"action_type": "declare_resolved"}
48
+ ```
49
+
50
+ ### 3. Investigation loop (per tick)
51
+
52
+ 1. Read `observation.prompt_text` — services, alerts, last result, failure_type, why_failed.
53
+ 2. If `observation.failure_type` is set, your previous action was rejected — **do not repeat it**, read `why_failed` and pick a different evidence source or remediation.
54
+ 3. Form a hypothesis with `submit_hypothesis` once you have enough evidence (usually 2–4 queries). Calibrate `confidence`: ≥0.7 only if you're sure.
55
+ 4. Remediate (`rollback_deploy` → `restart_service` if scenario requires → `run_check`).
56
+ 5. `declare_resolved` only after the required check passes.
57
+
58
+ ### 4. Record the runbook
59
+
60
+ If the episode finishes with `incident_resolved=true` and `final_score > 0.85`, run:
61
+
62
+ ```bash
63
+ python skill/tools/sre_gym_client.py record-runbook <scenario_id>
64
+ ```
65
+
66
+ This appends a new entry to `skill/verified-runbooks/{scenario_id}.md`. Future runs of this skill (yours or another Claude's) load it automatically.
67
+
68
+ ## Action reference (11 actions)
69
+
70
+ | Action | Required fields | Purpose |
71
+ |---|---|---|
72
+ | `query_logs` | `service` | Read service-level error logs |
73
+ | `query_metrics` | `service`, `metric` (cpu/error_rate/latency) | Read quantitative signals |
74
+ | `query_dependencies` | `service` | Map upstream/downstream |
75
+ | `query_deploys` | `service` | Recent deploy history |
76
+ | `rollback_deploy` | `service` | Revert last deploy — SCENARIO-SPECIFIC TARGET |
77
+ | `restart_service` | `service` | Reboot a service (usually after rollback) |
78
+ | `run_check` | `check_name` (`database_recovery` / `end_to_end`) | Objective recovery check |
79
+ | `isolate_service` | `service` | Containment only, does not resolve |
80
+ | `escalate` | — | Record escalation note |
81
+ | `submit_hypothesis` | `hypothesis` object | Commit RCA with confidence calibration |
82
+ | `declare_resolved` | — | Finalize; rejected if required check has not passed |
83
+
84
+ ## Scoring rubric (deterministic from the env)
85
+
86
+ - **Recovery (0–0.4):** services healthy on the critical path
87
+ - **Containment (0–0.3):** root cause removed OR offending service isolated
88
+ - **Verification (0–0.35):** both checks passed
89
+ - **Impact (0–0.15):** user_impact reduced
90
+ - **Efficiency (0–0.10):** budget preserved, no wasteful repeats
91
+
92
+ Clean solve target: **> 0.85**. That's the runbook-record threshold.
93
+
94
+ ## Decoy knowledge (read before hypothesizing)
95
+
96
+ - `worker_deploy_cascade`: the recent worker deploy is the true cause; there are no decoys.
97
+ - `db_config_rollout`: the recent worker deploy is a **decoy**. Rolling back worker yields `wrong_remediation_target`.
98
+ - `gateway_auth_rollout`: the recent worker deploy (`worker@...-hotfix` — log-format tweak) is a **decoy**. The gateway auth rollout is the cause.
99
+
100
+ If you take a wrong remediation, the env returns `failure_type="wrong_remediation_target"` and a negative reward — **do not retry the same wrong target**, re-read the logs.
skill/tools/sre_gym_client.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """CLI client for the sre-gym skill.
3
+
4
+ Usage:
5
+ sre_gym_client.py list
6
+ sre_gym_client.py solve <scenario_id> [--policy baseline]
7
+ sre_gym_client.py interactive <scenario_id> # stdin: one JSON action per line
8
+ sre_gym_client.py record-runbook <scenario_id> [session.json]
9
+
10
+ Because OpenEnv's HTTP /reset and /step handlers create a fresh environment per
11
+ call, episode state only persists within a single client session. This CLI wraps
12
+ one episode inside one Python process so the session is preserved.
13
+
14
+ SRE_GYM_URL env var overrides the base URL (default http://127.0.0.1:8000).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import datetime as _dt
20
+ import json
21
+ import os
22
+ import sys
23
+ from pathlib import Path
24
+ from typing import Any
25
+
26
+ # Make the sibling package importable whether the script is invoked from the
27
+ # repo root or from the skill/ directory directly.
28
+ _REPO_ROOT = Path(__file__).resolve().parent.parent.parent
29
+ if str(_REPO_ROOT) not in sys.path:
30
+ sys.path.insert(0, str(_REPO_ROOT))
31
+
32
+ from unified_incident_env.client import UnifiedIncidentEnv # noqa: E402
33
+ from unified_incident_env.models import UnifiedIncidentAction, UnifiedIncidentObservation # noqa: E402
34
+ from unified_incident_env.server.challenge import SCENARIOS, list_baselines # noqa: E402
35
+
36
+ BASE_URL = os.environ.get("SRE_GYM_URL", "http://127.0.0.1:8000").rstrip("/")
37
+ RUNBOOK_DIR = Path(__file__).resolve().parent.parent / "verified-runbooks"
38
+ SCORE_THRESHOLD = 0.85
39
+
40
+
41
def _clean_action(action: UnifiedIncidentAction) -> dict[str, Any]:
    """Dump an action to a dict, dropping ``None`` fields and empty ``metadata`` dicts.

    An empty ``metadata`` is removed both at the top level and inside a dict
    valued ``hypothesis``.
    """
    payload = action.model_dump(exclude_none=True)
    if payload.get("metadata") == {}:
        del payload["metadata"]
    nested = payload.get("hypothesis")
    if isinstance(nested, dict) and nested.get("metadata") == {}:
        nested.pop("metadata", None)
    return payload
49
+
50
+
51
def _summarize_obs(obs: UnifiedIncidentObservation) -> dict[str, Any]:
    """Project an observation onto the fields the CLI prints and persists."""
    check_states = [{"name": check.name, "passed": check.passed} for check in obs.checks]
    summary: dict[str, Any] = {"tick": obs.tick_count}
    summary["workflow_stage"] = obs.workflow_stage
    summary["last_action_result"] = obs.last_action_result
    summary["tool_output"] = obs.tool_output
    summary["failure_type"] = obs.failure_type
    summary["why_failed"] = obs.why_failed
    summary["loop_warning"] = obs.loop_warning
    summary["checks"] = check_states
    summary["final_score"] = obs.final_score
    summary["incident_resolved"] = obs.incident_resolved
    return summary
64
+
65
+
66
+ def _session_path(scenario_id: str) -> Path:
67
+ return Path(f"/tmp/sre_gym_session.{scenario_id}.json")
68
+
69
+
70
def cmd_list() -> None:
    """Print one line per scenario: difficulty, id and name, in aligned columns."""
    for entry in SCENARIOS.values():
        difficulty = entry["difficulty"]
        identifier = entry["id"]
        title = entry["name"]
        print(f" {difficulty:<6} {identifier:<25} {title}")
73
+
74
+
75
def cmd_solve(scenario_id: str, policy: str = "baseline") -> None:
    """Replay the first baseline for a scenario within a single client session.

    Exits with status 2 for an unknown scenario or policy. Each step is
    printed, and the full trace plus the final summary are written to the
    scenario's session file.
    """
    if scenario_id not in SCENARIOS:
        print(f"error: unknown scenario {scenario_id!r}", file=sys.stderr)
        sys.exit(2)
    if policy != "baseline":
        print(f"error: unknown policy {policy!r} (only 'baseline' available)", file=sys.stderr)
        sys.exit(2)

    trace: list[dict[str, Any]] = []
    with UnifiedIncidentEnv(base_url=BASE_URL).sync() as env:
        obs = env.reset(scenario_id=scenario_id).observation
        print(f"[reset] scenario={scenario_id} difficulty={obs.difficulty}")
        scripted_steps = list_baselines(scenario_id).baselines[0].actions
        for scripted in scripted_steps:
            result = env.step(scripted.action)
            obs = result.observation
            record: dict[str, Any] = {
                "step": obs.tick_count,
                "action": _clean_action(scripted.action),
                "rationale": scripted.rationale,
                "reward": result.reward,
            }
            # Summary keys are merged last, as in the persisted trace format.
            record.update(_summarize_obs(obs))
            trace.append(record)
            action_repr = json.dumps(record["action"], separators=(",", ":"))
            print(f"[step {obs.tick_count}] action={action_repr} reward={result.reward:+.2f} score={obs.final_score:.2f}")
            if result.done:
                break
        final = _summarize_obs(obs)

    session_file = _session_path(scenario_id)
    session_doc = {"scenario_id": scenario_id, "trace": trace, "final": final}
    session_file.write_text(json.dumps(session_doc, indent=2), encoding="utf-8")
    print(
        f"[done] resolved={final['incident_resolved']} score={final['final_score']:.2f} "
        f"steps={final['tick']} session={_session_path(scenario_id)}"
    )
113
+
114
+
115
def cmd_interactive(scenario_id: str) -> None:
    """Drive one episode from stdin, one JSON action per line.

    Emits a JSON event per line on stdout (``reset``, ``step``, ``error``,
    ``done``). Blank lines are ignored; a line that fails to parse or validate
    produces an ``error`` event and is skipped. The trace and the last
    observation summary are written to the scenario's session file at the end.
    """
    if scenario_id not in SCENARIOS:
        print(f"error: unknown scenario {scenario_id!r}", file=sys.stderr)
        sys.exit(2)

    def emit(event: dict[str, Any]) -> None:
        # Flush every event so a piped consumer sees it immediately.
        print(json.dumps(event), flush=True)

    trace: list[dict[str, Any]] = []
    with UnifiedIncidentEnv(base_url=BASE_URL).sync() as env:
        obs = env.reset(scenario_id=scenario_id).observation
        emit({"event": "reset", "scenario_id": scenario_id, "obs": _summarize_obs(obs)})
        for raw_line in sys.stdin:
            text = raw_line.strip()
            if not text:
                continue
            try:
                action = UnifiedIncidentAction(**json.loads(text))
            except Exception as exc:
                emit({"event": "error", "detail": str(exc)})
                continue
            result = env.step(action)
            obs = result.observation
            record = {
                "step": obs.tick_count,
                "action": _clean_action(action),
                "reward": result.reward,
                **_summarize_obs(obs),
            }
            trace.append(record)
            emit({"event": "step", **record})
            if result.done:
                emit({"event": "done", "final": _summarize_obs(obs)})
                break

    session_doc = {"scenario_id": scenario_id, "trace": trace, "final": _summarize_obs(obs)}
    _session_path(scenario_id).write_text(json.dumps(session_doc, indent=2), encoding="utf-8")
148
+
149
+
150
def cmd_record_runbook(scenario_id: str, session_file: str | None = None) -> None:
    """Append a runbook entry when the referenced session was a clean solve.

    Args:
        scenario_id: Scenario whose runbook file receives the entry.
        session_file: Session JSON to read; defaults to the file that
            ``solve`` / ``interactive`` write for this scenario.

    Exit codes:
        2: unknown scenario, missing session file, or unreadable JSON.
        1: session not resolved, or its score does not exceed the threshold.
    """
    # Validate like the sibling commands do; this also keeps arbitrary strings
    # out of the runbook file name.
    if scenario_id not in SCENARIOS:
        print(f"error: unknown scenario {scenario_id!r}", file=sys.stderr)
        sys.exit(2)

    path = Path(session_file) if session_file else _session_path(scenario_id)
    if not path.exists():
        print(f"error: no session file at {path}", file=sys.stderr)
        sys.exit(2)
    try:
        session = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        print(f"error: session file {path} is not valid JSON: {exc}", file=sys.stderr)
        sys.exit(2)

    final = session.get("final") or {}
    # final_score may be stored as null; treat that as 0.0 instead of crashing.
    raw_score = final.get("final_score")
    score = float(raw_score) if raw_score is not None else 0.0

    if not final.get("incident_resolved"):
        print(f"skip: session not resolved (resolved={final.get('incident_resolved')})", file=sys.stderr)
        sys.exit(1)
    # The documented rule is strictly "final_score > threshold", so a score
    # equal to the threshold is skipped as well.
    if score <= SCORE_THRESHOLD:
        print(f"skip: score {score:.2f} not above runbook threshold {SCORE_THRESHOLD:.2f}", file=sys.stderr)
        sys.exit(1)

    RUNBOOK_DIR.mkdir(parents=True, exist_ok=True)
    runbook_path = RUNBOOK_DIR / f"{scenario_id}.md"

    timestamp = _dt.datetime.now(_dt.timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    steps = int(final.get("tick", 0))
    checks_passed = [c["name"] for c in final.get("checks", []) if c.get("passed")]
    trace = session.get("trace", [])

    header = (
        f"# verified-runbooks/{scenario_id}.md\n\n"
        "Runbook entries are written by the sre-gym skill after a successful solve "
        f"(incident_resolved=true and final_score > {SCORE_THRESHOLD:.2f}).\n"
        "Each entry is immutable evidence — treat it as ground truth for the winning path.\n\n---\n"
    )
    lines = [f"\n## Run {timestamp} — Score {score:.2f}\n"]
    lines.append(f"- Steps: **{steps}**")
    lines.append(f"- Checks passed: {', '.join(checks_passed) or 'none'}")
    lines.append("")
    lines.append("**Winning path:**")
    for entry in trace:
        act = entry["action"]
        action_type = act.get("action_type")
        # Dict-valued fields (the hypothesis) are shown by their root_cause.
        extras = ", ".join(
            f"{k}={v if not isinstance(v, dict) else v.get('root_cause', v)}"
            for k, v in act.items()
            if k != "action_type" and v not in (None, {})
        )
        extra_str = f" ({extras})" if extras else ""
        # Interactive sessions carry no rationale; omit the dash then.
        rationale = (entry.get("rationale") or "").rstrip(".")
        rationale_str = f" — {rationale}" if rationale else ""
        lines.append(f"{entry['step']}. `{action_type}{extra_str}`{rationale_str}")
    lines.append("")
    entry_text = "\n".join(lines)

    if not runbook_path.exists():
        runbook_path.write_text(header + entry_text, encoding="utf-8")
    else:
        with runbook_path.open("a", encoding="utf-8") as f:
            f.write(entry_text)
    print(f"recorded runbook entry → {runbook_path} (score {score:.2f}, {steps} steps)")
206
+
207
+
208
def main() -> None:
    """Dispatch the CLI subcommand named in ``sys.argv``.

    Supported commands are ``list``, ``solve``, ``interactive`` and
    ``record-runbook``. Usage errors print a message to stderr and exit with
    status 2.
    """
    argv = sys.argv[1:]
    if not argv:
        print(__doc__, file=sys.stderr)
        sys.exit(2)
    cmd, *rest = argv
    if cmd == "list":
        cmd_list()
    elif cmd == "solve":
        if not rest:
            print("error: solve requires <scenario_id>", file=sys.stderr)
            sys.exit(2)
        # Accept the documented "--policy NAME" form and "--policy=NAME", and
        # keep the bare positional form working. Previously the literal
        # "--policy" was passed on as the policy name and always rejected.
        policy = "baseline"
        extra = rest[1:]
        if extra:
            if extra[0] == "--policy":
                if len(extra) < 2:
                    print("error: --policy requires a value", file=sys.stderr)
                    sys.exit(2)
                policy = extra[1]
            elif extra[0].startswith("--policy="):
                policy = extra[0].split("=", 1)[1]
            else:
                policy = extra[0]
        cmd_solve(rest[0], policy)
    elif cmd == "interactive":
        if not rest:
            print("error: interactive requires <scenario_id>", file=sys.stderr)
            sys.exit(2)
        cmd_interactive(rest[0])
    elif cmd == "record-runbook":
        if not rest:
            print("error: record-runbook requires <scenario_id>", file=sys.stderr)
            sys.exit(2)
        cmd_record_runbook(rest[0], rest[1] if len(rest) > 1 else None)
    else:
        print(f"error: unknown command {cmd!r}", file=sys.stderr)
        print(__doc__, file=sys.stderr)
        sys.exit(2)
235
+
236
+
237
+ if __name__ == "__main__":
238
+ main()
skill/verified-runbooks/.gitkeep ADDED
File without changes
skill/verified-runbooks/db_config_rollout.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # verified-runbooks/db_config_rollout.md
2
+
3
+ Runbook entries are written by the sre-gym skill after a successful solve (incident_resolved=true and final_score > 0.85).
4
+ Each entry is immutable evidence — treat it as ground truth for the winning path.
5
+
6
+ ---
7
+
8
+ ## Run 2026-04-23T22:01:33Z — Score 0.99
9
+
10
+ - Steps: **10**
11
+ - Checks passed: database_recovery, end_to_end
12
+
13
+ **Winning path:**
14
+ 1. `query_logs (service=database)` — Database is the loudest alert; inspect logs for the actual error signature
15
+ 2. `query_deploys (service=database)` — Pool-acquire errors suggest a config change; check recent database rollouts
16
+ 3. `query_metrics (service=database, metric=error_rate)` — Confirm the error pattern is pool exhaustion rather than compute overload
17
+ 4. `query_logs (service=worker)` — Rule out the decoy worker deploy by reading worker logs directly
18
+ 5. `submit_hypothesis (hypothesis=database_only_failure)` — Localize the fault to the database config before remediating
19
+ 6. `rollback_deploy (service=database)` — Roll back the offending database config rollout
20
+ 7. `restart_service (service=database)` — Restart the database cleanly against the restored pool config
21
+ 8. `run_check (check_name=database_recovery)` — Verify database pool health and write latency are back within SLO
22
+ 9. `run_check (check_name=end_to_end)` — Verify gateway write-path traffic succeeds end-to-end
23
+ 10. `declare_resolved` — Declare resolved only after objective checks pass
skill/verified-runbooks/gateway_auth_rollout.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # verified-runbooks/gateway_auth_rollout.md
2
+
3
+ Runbook entries are written by the sre-gym skill after a successful solve (incident_resolved=true and final_score > 0.85).
4
+ Each entry is immutable evidence — treat it as ground truth for the winning path.
5
+
6
+ ---
7
+
8
+ ## Run 2026-04-23T22:01:37Z — Score 0.99
9
+
10
+ - Steps: **8**
11
+ - Checks passed: database_recovery, end_to_end
12
+
13
+ **Winning path:**
14
+ 1. `query_logs (service=api-gateway)` — Gateway is rejecting logins; read gateway logs to localize the rejection class
15
+ 2. `query_deploys (service=api-gateway)` — Login rejection aligns with a recent auth middleware rollout; confirm deploy timing
16
+ 3. `query_deploys (service=worker)` — Rule out the worker deploy explicitly rather than assuming
17
+ 4. `submit_hypothesis (hypothesis=api_gateway_fault)` — Commit a calibrated hypothesis localizing to the gateway auth rollout
18
+ 5. `rollback_deploy (service=api-gateway)` — Roll back the bad auth middleware rollout; no restart needed
19
+ 6. `run_check (check_name=end_to_end)` — Verify that gateway login traffic now succeeds end-to-end
20
+ 7. `run_check (check_name=database_recovery)` — Confirm the database is (and stayed) healthy throughout
21
+ 8. `declare_resolved` — Declare resolved only after objective checks pass
skill/verified-runbooks/worker_deploy_cascade.md ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # verified-runbooks/worker_deploy_cascade.md
2
+
3
+ Runbook entries are written by the sre-gym skill after a successful solve (incident_resolved=true and final_score > 0.85).
4
+ Each entry is immutable evidence — treat it as ground truth for the winning path.
5
+
6
+ ---
7
+
8
+ ## Run 2026-04-23T22:01:29Z — Score 0.99
9
+
10
+ - Steps: **10**
11
+ - Checks passed: database_recovery, end_to_end
12
+
13
+ **Winning path:**
14
+ 1. `query_deploys (service=worker)` — Check whether any recent deploy aligns with the incident start
15
+ 2. `query_logs (service=worker)` — Inspect worker logs because deploy timing and queue pressure suggest worker-originated harm
16
+ 3. `query_metrics (service=database, metric=cpu)` — Confirm that the database is overloaded as a downstream effect
17
+ 4. `query_dependencies (service=api-gateway)` — Verify the gateway depends on the worker and database path
18
+ 5. `submit_hypothesis (hypothesis=bad_worker_deploy)` — Commit a calibrated hypothesis before taking an invasive mitigation step
19
+ 6. `rollback_deploy (service=worker)` — Remove the triggering change before restarting downstream services
20
+ 7. `restart_service (service=database)` — Bring the database back cleanly after the root cause is removed
21
+ 8. `run_check (check_name=database_recovery)` — Verify the database is no longer crashing
22
+ 9. `run_check (check_name=end_to_end)` — Verify gateway traffic succeeds end-to-end
23
+ 10. `declare_resolved` — Declare resolved only after objective checks pass
unified_incident_env/README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Unified Incident Env
2
+
3
+ The runnable submission surface lives at the project root. This package contains the actual environment implementation:
4
+
5
+ - typed models in `models.py`
6
+ - environment logic in `server/environment.py`
7
+ - scoring in `server/grader.py`
8
+ - scenario catalog in `server/challenge.py`
9
+
10
+ Use the root `README.md` for run commands, scoring, and example interaction.
unified_incident_env/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Unified incident-response OpenEnv package."""
2
+
3
+ from .interface import (
4
+ UnifiedIncidentAction,
5
+ UnifiedIncidentEnv,
6
+ UnifiedIncidentEnvironment,
7
+ UnifiedIncidentObservation,
8
+ UnifiedIncidentState,
9
+ )
10
+
11
+ __all__ = [
12
+ "UnifiedIncidentAction",
13
+ "UnifiedIncidentEnv",
14
+ "UnifiedIncidentEnvironment",
15
+ "UnifiedIncidentObservation",
16
+ "UnifiedIncidentState",
17
+ ]
unified_incident_env/client.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typed OpenEnv client for the unified incident environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from openenv.core import EnvClient
8
+ from openenv.core.client_types import StepResult
9
+
10
+ from .models import UnifiedIncidentAction, UnifiedIncidentObservation, UnifiedIncidentState
11
+
12
+
13
class UnifiedIncidentEnv(
    EnvClient[UnifiedIncidentAction, UnifiedIncidentObservation, UnifiedIncidentState]
):
    """Client for the unified incident environment with typed payload conversion."""

    DEFAULT_BASE_URL = "http://127.0.0.1:8000"

    def _step_payload(self, action: UnifiedIncidentAction) -> dict[str, Any]:
        """Serialize an action for a step request, omitting ``None`` fields."""
        return action.model_dump(exclude_none=True)

    def _parse_result(self, payload: dict[str, Any]) -> StepResult[UnifiedIncidentObservation]:
        """Build a ``StepResult`` from a response payload.

        ``reward`` and ``done`` are copied into the observation data when the
        observation does not already carry them; on the result itself the
        payload's values win over the observation's.
        """
        raw_observation = dict(payload.get("observation", {}))
        if "reward" not in raw_observation:
            raw_observation["reward"] = payload.get("reward", 0.0)
        if "done" not in raw_observation:
            raw_observation["done"] = payload.get("done", False)
        observation = UnifiedIncidentObservation.model_validate(raw_observation)
        reward = payload["reward"] if "reward" in payload else observation.reward
        done = payload["done"] if "done" in payload else observation.done
        return StepResult(observation=observation, reward=reward, done=done)

    def _parse_state(self, payload: dict[str, Any]) -> UnifiedIncidentState:
        """Validate a state payload into ``UnifiedIncidentState``."""
        return UnifiedIncidentState.model_validate(payload)
unified_incident_env/interface.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Single public interface surface for the unified incident benchmark."""
2
+
3
+ from .client import UnifiedIncidentEnv
4
+ from .models import (
5
+ UnifiedIncidentAction,
6
+ UnifiedIncidentObservation,
7
+ UnifiedIncidentState,
8
+ )
9
+ from .server.environment import UnifiedIncidentEnvironment
10
+
11
+ __all__ = [
12
+ "UnifiedIncidentAction",
13
+ "UnifiedIncidentEnv",
14
+ "UnifiedIncidentEnvironment",
15
+ "UnifiedIncidentObservation",
16
+ "UnifiedIncidentState",
17
+ ]
unified_incident_env/models.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Typed models for the honest narrow incident-remediation environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Literal
6
+
7
+ from openenv.core import Action, Observation, State
8
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
9
+ from pydantic_core import PydanticCustomError
10
+
11
# Every action an agent may send to the environment.
ActionType = Literal[
    "query_logs",
    "query_metrics",
    "query_dependencies",
    "query_deploys",
    "rollback_deploy",
    "restart_service",
    "run_check",
    "isolate_service",
    "escalate",
    "submit_hypothesis",
    "declare_resolved",
]

# Closed vocabularies shared by the models below.
Difficulty = Literal["easy", "medium", "hard"]
MetricName = Literal["cpu", "error_rate", "latency"]
ServiceName = Literal["api-gateway", "cache", "database", "worker"]
ServiceStatus = Literal["healthy", "degraded", "crashed", "isolated"]
WorkflowStage = Literal["triage", "mitigation", "validation", "resolved"]
CheckName = Literal["database_recovery", "end_to_end"]

# Root causes a hypothesis may name.
RootCauseType = Literal[
    "bad_worker_deploy",
    "database_only_failure",
    "api_gateway_fault",
]

# Next actions a hypothesis may recommend: the ActionType values except
# "submit_hypothesis".
RecommendedActionType = Literal[
    "query_logs",
    "query_metrics",
    "query_dependencies",
    "query_deploys",
    "rollback_deploy",
    "restart_service",
    "run_check",
    "isolate_service",
    "escalate",
    "declare_resolved",
]
47
+
48
+
49
class PostmortemPayload(BaseModel):
    """Compatibility shell for the v1 postmortem action, which has been removed.

    Deprecated. Every field has an empty default; unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Free-text fields of the former postmortem.
    root_cause: str = ""
    attack_vector: str = ""
    # Ordered free-text entries; each list defaults to empty.
    timeline: list[str] = Field(default_factory=list)
    remediation_steps: list[str] = Field(default_factory=list)
    prevention_steps: list[str] = Field(default_factory=list)
59
+
60
+
61
class SecurityContext(BaseModel):
    """Compatibility shell for the v1 security subquest state, which has been removed.

    Deprecated. Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    code_visible: bool = False
    # ``None`` means "not selected" / "not evaluated".
    selected_vulnerability: str | None = None
    selected_patch: str | None = None
    exploit_blocked: bool | None = None
    functionality_preserved: bool | None = None
71
+
72
+
73
class HypothesisPayload(BaseModel):
    """Root-cause hypothesis an agent submits in a structured form.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Suspected root cause, from the closed RootCauseType vocabulary.
    root_cause: RootCauseType
    # Services believed to be affected; a supplied list needs at least one entry.
    # NOTE(review): pydantic does not validate defaults unless validate_default
    # is enabled, so omitting this field presumably yields an empty list that
    # bypasses min_length=1 — confirm whether the field should be required.
    affected_services: list[ServiceName] = Field(default_factory=list, min_length=1)
    # Agent's confidence, constrained to the closed interval [0, 1].
    confidence: float = Field(ge=0.0, le=1.0)
    # The action the agent proposes to take next.
    recommended_next_action: RecommendedActionType
82
+
83
+
84
class ServiceHealth(BaseModel):
    """Point-in-time health reading of one service.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    name: ServiceName
    status: ServiceStatus
    # Percentages are constrained to 0-100.
    cpu_pct: float = Field(ge=0.0, le=100.0)
    memory_pct: float = Field(ge=0.0, le=100.0)
    error_rate_pct: float = Field(ge=0.0, le=100.0)
    # Latency must be non-negative; no upper bound is enforced.
    latency_ms: float = Field(ge=0.0)
95
+
96
+
97
class Alert(BaseModel):
    """An alert surfaced to the agent for a single service.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Service the alert is attached to.
    service: ServiceName
    # Only two severity levels exist.
    severity: Literal["warning", "critical"]
    # Human-readable alert text.
    message: str
105
+
106
+
107
class CheckResult(BaseModel):
    """Outcome of one verification check.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Which check was run.
    name: CheckName
    # Whether the check succeeded.
    passed: bool
    # Free-text explanation of the outcome.
    detail: str
115
+
116
+
117
class UnifiedIncidentAction(Action):
    """A single structured action sent to the environment.

    Unknown keys are ignored rather than rejected. Which optional fields are
    mandatory depends on ``action_type`` and is enforced by ``_validate_payload``.
    """

    model_config = ConfigDict(extra="ignore")

    action_type: ActionType
    service: ServiceName | None = None
    metric: MetricName | None = None
    check_name: CheckName | None = None
    hypothesis: HypothesisPayload | None = None

    @model_validator(mode="after")
    def _validate_payload(self) -> "UnifiedIncidentAction":
        """Reject actions that lack the field their ``action_type`` requires.

        Raises:
            PydanticCustomError: ``missing_service``, ``missing_metric``,
                ``missing_check_name`` or ``missing_hypothesis``. For
                ``query_metrics`` a missing service is reported before a
                missing metric.
        """
        kind = self.action_type
        targets_service = kind in {
            "query_logs",
            "query_metrics",
            "query_dependencies",
            "query_deploys",
            "rollback_deploy",
            "restart_service",
            "isolate_service",
        }
        # (violated?, error code, message template), checked in order.
        requirements = (
            (
                targets_service and not self.service,
                "missing_service",
                "service is required for {action_type}",
            ),
            (
                kind == "query_metrics" and not self.metric,
                "missing_metric",
                "metric is required for {action_type}",
            ),
            (
                kind == "run_check" and not self.check_name,
                "missing_check_name",
                "check_name is required for {action_type}",
            ),
            (
                kind == "submit_hypothesis" and self.hypothesis is None,
                "missing_hypothesis",
                "hypothesis is required for {action_type}",
            ),
        )
        for violated, code, template in requirements:
            if violated:
                raise PydanticCustomError(code, template, {"action_type": kind})
        return self
169
+
170
+
171
class UnifiedIncidentObservation(Observation):
    """What the agent sees after ``reset`` and after every ``step``.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # --- Episode framing -------------------------------------------------
    prompt_text: str
    incident_summary: str
    tick_count: int
    max_ticks: int
    difficulty: Difficulty
    workflow_stage: WorkflowStage

    # --- Visible world state ---------------------------------------------
    active_alerts: list[Alert] = Field(default_factory=list)
    service_health: dict[str, ServiceHealth] = Field(default_factory=dict)
    discovered_evidence: list[str] = Field(default_factory=list)
    recent_deploys: list[str] = Field(default_factory=list)
    checks: list[CheckResult] = Field(default_factory=list)
    # Both impact figures are constrained to [0, 1].
    user_impact: float = Field(ge=0.0, le=1.0)
    slo_burn_rate: float = Field(ge=0.0, le=1.0)
    incident_resolved: bool = False
    containment_applied: bool = False

    # --- Feedback on the most recent action ------------------------------
    last_action_result: str = ""
    tool_output: str | None = None
    failure_type: str | None = None
    why_failed: str | None = None

    # --- Guidance for choosing the next action ---------------------------
    allowed_actions: list[str] = Field(default_factory=list)
    required_fields_by_action: dict[str, list[str]] = Field(default_factory=dict)
    valid_action_example: dict[str, Any] | None = None
    common_trap: str | None = None
    loop_warning: str | None = None
    # NOTE(review): the security_* fields presumably remain for compatibility
    # with the removed v1 security subquest — confirm before relying on them.
    blocked_until_security_complete: bool = False
    security_unlock_reason: str | None = None
    best_recovery_action_family: str | None = None
    progress_flags: dict[str, bool] = Field(default_factory=dict)
    security_subquest_status: str | None = None
    security_context: dict[str, Any] = Field(default_factory=dict)

    # --- Scoring -----------------------------------------------------------
    final_score: float = 0.0
    score_breakdown: dict[str, float] = Field(default_factory=dict)
    reward: float = 0.0
    done: bool = False
210
+
211
+
212
class UnifiedIncidentState(State):
    """Episode state that persists across steps.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # --- Episode identity and clock --------------------------------------
    episode_id: str
    step_count: int
    scenario_id: str
    difficulty: Difficulty
    current_tick: int
    max_ticks: int
    workflow_stage: WorkflowStage

    # --- World state -------------------------------------------------------
    active_alerts: list[Alert] = Field(default_factory=list)
    service_health: dict[str, ServiceHealth] = Field(default_factory=dict)
    discovered_evidence: list[str] = Field(default_factory=list)
    recent_deploys: list[str] = Field(default_factory=list)
    checks: list[CheckResult] = Field(default_factory=list)
    # Both impact figures are constrained to [0, 1].
    user_impact: float = Field(ge=0.0, le=1.0)
    slo_burn_rate: float = Field(ge=0.0, le=1.0)
    incident_resolved: bool = False
    containment_applied: bool = False

    # --- Action guidance -----------------------------------------------------
    allowed_actions: list[str] = Field(default_factory=list)
    required_fields_by_action: dict[str, list[str]] = Field(default_factory=dict)
    valid_action_example: dict[str, Any] | None = None
    progress_flags: dict[str, bool] = Field(default_factory=dict)

    # --- Scoring and bookkeeping -------------------------------------------
    final_score: float = 0.0
    score_breakdown: dict[str, float] = Field(default_factory=dict)
    cumulative_reward: float = 0.0
    wasteful_ticks: int = 0
    last_action_result: str = ""
    failure_type: str | None = None
    why_failed: str | None = None
244
+
245
+
246
class ScenarioSummary(BaseModel):
    """Publicly listed summary of one scenario.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Scenario identifier.
    id: str
    difficulty: Difficulty
    # Display name and longer description.
    name: str
    description: str
    # Free-text statement of the root cause.
    root_cause: str
    # Tick count of the optimal trajectory.
    optimal_ticks: int
257
+
258
+
259
class ScenarioCatalog(BaseModel):
    """Publicly listed catalog of scenarios.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    environment: str = "unified_incident_env"
    default_scenario_id: str
    available_difficulties: list[Difficulty]
    # Difficulty filter recorded on the catalog; ``None`` by default.
    filtered_difficulty: Difficulty | None = None
    scenarios: list[ScenarioSummary]
269
+
270
+
271
class BaselineStep(BaseModel):
    """A single step of a baseline trajectory.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # The action taken at this step.
    action: UnifiedIncidentAction
    # Optional free-text justification for the action.
    rationale: str = ""
278
+
279
+
280
class BaselineDefinition(BaseModel):
    """A named baseline trajectory for one scenario.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    # Scenario this baseline belongs to.
    scenario_id: str
    name: str
    description: str
    optimal_ticks: int
    # Ordered steps of the trajectory; empty by default.
    actions: list[BaselineStep] = Field(default_factory=list)
290
+
291
+
292
class BaselineCatalog(BaseModel):
    """Publicly listed catalog of baseline trajectories.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    environment: str = "unified_incident_env"
    baselines: list[BaselineDefinition]
299
+
300
+
301
class GraderCheck(BaseModel):
    """A single normalized check inside a grader report.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    name: str
    passed: bool
    # Free-text explanation of the outcome.
    detail: str
    # Weight attached to this check.
    weight: float
310
+
311
+
312
class GraderReport(BaseModel):
    """Grade for one episode.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    scenario_id: str
    passed: bool
    # Overall score, constrained to [0, 1].
    score: float = Field(ge=0.0, le=1.0)
    message: str
    # Named score components; empty by default.
    breakdown: dict[str, float] = Field(default_factory=dict)
    # Individual checks behind the grade; empty by default.
    checks: list[GraderCheck] = Field(default_factory=list)
323
+
324
+
325
class RuntimeStatus(BaseModel):
    """Payload of the runtime status route: current state plus its grade.

    Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid")

    environment: str = "unified_incident_env"
    # Current episode state.
    progress: UnifiedIncidentState
    # Grader report returned alongside the state.
    grader: GraderReport
unified_incident_env/scripts/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Scripts for the unified incident environment."""
unified_incident_env/scripts/baseline_agent.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic scripted baseline for the honest narrow incident environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+
8
+ from ..client import UnifiedIncidentEnv
9
+ from ..server.challenge import DEFAULT_SCENARIO_ID, SCENARIOS, list_baselines
10
+
11
+
12
def plan_for_scenario(scenario_id: str):
    """Return the ordered actions of the first baseline listed for *scenario_id*."""
    first_baseline = list_baselines(scenario_id).baselines[0]
    plan = []
    for baseline_step in first_baseline.actions:
        plan.append(baseline_step.action)
    return plan
15
+
16
+
17
def run_scenario(base_url: str, scenario_id: str) -> dict[str, object]:
    """Replay the scripted baseline for *scenario_id* against a running server.

    Args:
        base_url: Base URL of the environment server.
        scenario_id: Scenario to reset the environment to and replay.

    Returns:
        A summary dict with ``scenario_id``, ``success`` (episode done and
        incident resolved), ``final_score`` and ``workflow_stage``, all taken
        from the observation of the last step.

    Raises:
        RuntimeError: If the baseline plan contains no actions, so there is no
            final observation to report.
    """
    with UnifiedIncidentEnv(base_url=base_url).sync() as env:
        env.reset(scenario_id=scenario_id)
        final = None
        for action in plan_for_scenario(scenario_id):
            final = env.step(action).observation

    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O`` and the failure would surface as an opaque AttributeError.
    if final is None:
        raise RuntimeError(
            f"baseline plan for scenario {scenario_id!r} contains no actions"
        )
    return {
        "scenario_id": scenario_id,
        "success": bool(final.done and final.incident_resolved),
        "final_score": final.final_score,
        "workflow_stage": final.workflow_stage,
    }
30
+
31
+
32
def main() -> None:
    """Run the scripted baseline for one scenario and print the result as JSON.

    The output is a JSON list holding the single scenario summary.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--base-url", default=UnifiedIncidentEnv.DEFAULT_BASE_URL)
    cli.add_argument("--scenario", choices=sorted(SCENARIOS), default=DEFAULT_SCENARIO_ID)
    options = cli.parse_args()

    summary = run_scenario(options.base_url, options.scenario)
    print(json.dumps([summary], indent=2))
40
+
41
+
42
+ if __name__ == "__main__":
43
+ main()
unified_incident_env/scripts/walkthrough.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Simple walkthrough that prints a full episode interaction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+
8
+ from ..client import UnifiedIncidentEnv
9
+ from .baseline_agent import plan_for_scenario
10
+
11
+
12
def main() -> None:
    """Replay the scripted baseline for one scenario and print every exchange.

    Prints the reset observation first, then one JSON document per step with
    the action sent and the observation received.
    """
    # Imported locally so the default tracks the scenario catalog instead of a
    # hard-coded ID; the previous default named a scenario from the removed v1 set.
    from ..server.challenge import DEFAULT_SCENARIO_ID

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-url",
        default=UnifiedIncidentEnv.DEFAULT_BASE_URL,
    )
    parser.add_argument(
        "--scenario",
        default=DEFAULT_SCENARIO_ID,
    )
    args = parser.parse_args()

    with UnifiedIncidentEnv(base_url=args.base_url).sync() as env:
        reset = env.reset(scenario_id=args.scenario).observation
        print(json.dumps({"reset": reset.model_dump()}, indent=2))
        for action in plan_for_scenario(args.scenario):
            step = env.step(action).observation
            print(
                json.dumps(
                    {
                        "action": action.model_dump(exclude_none=True),
                        "observation": step.model_dump(),
                    },
                    indent=2,
                )
            )
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
unified_incident_env/server/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Server package for the unified incident environment."""
unified_incident_env/server/app.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """FastAPI app and metadata routes for the honest narrow incident environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import os
7
+ from typing import Any
8
+
9
+ from fastapi import HTTPException
10
+ from fastapi.responses import HTMLResponse, RedirectResponse
11
+ from openenv.core.env_server.http_server import create_fastapi_app
12
+
13
+ from ..models import (
14
+ BaselineCatalog,
15
+ GraderReport,
16
+ RuntimeStatus,
17
+ ScenarioCatalog,
18
+ UnifiedIncidentAction,
19
+ UnifiedIncidentObservation,
20
+ UnifiedIncidentState,
21
+ )
22
+ from .challenge import current_runtime_progress, grade_episode, list_baselines, list_scenarios, set_runtime_progress
23
+ from .environment import UnifiedIncidentEnvironment
24
+
25
# Import-time bootstrap: build one environment and publish its initial state
# as the runtime progress read by the metadata routes below.
_BOOTSTRAP_ENV = UnifiedIncidentEnvironment()
set_runtime_progress(_BOOTSTRAP_ENV.state.model_dump())
27
+
28
+ _SIMPLE_HTML = """<!doctype html>
29
+ <html lang="en">
30
+ <head>
31
+ <meta charset="utf-8" />
32
+ <meta name="viewport" content="width=device-width, initial-scale=1" />
33
+ <title>Unified Incident Env</title>
34
+ <style>
35
+ body { font-family: system-ui, sans-serif; max-width: 900px; margin: 40px auto; padding: 0 20px; line-height: 1.5; }
36
+ code, pre { background: #f4f4f4; padding: 2px 6px; border-radius: 6px; }
37
+ pre { padding: 12px; overflow: auto; }
38
+ </style>
39
+ </head>
40
+ <body>
41
+ <h1>Unified Incident Env</h1>
42
+ <p>This v2 environment exposes an honest bounded-action incident diagnosis and remediation task.</p>
43
+ <ul>
44
+ <li><a href="/docs">API docs</a></li>
45
+ <li><a href="/tasks">Scenario catalog</a></li>
46
+ <li><a href="/baseline">Baseline plan</a></li>
47
+ <li><a href="/status">Runtime status</a></li>
48
+ <li><a href="/health">Health</a></li>
49
+ </ul>
50
+ <h2>Core ideas</h2>
51
+ <ul>
52
+ <li>Queries reveal evidence but do not directly mint positive reward.</li>
53
+ <li>Remediation actions change the world state.</li>
54
+ <li><code>run_check</code> verifies recovery explicitly.</li>
55
+ <li><code>declare_resolved</code> succeeds only after objective checks pass.</li>
56
+ </ul>
57
+ <h2>Manual example</h2>
58
+ <pre>curl -X POST http://127.0.0.1:8000/reset -H 'content-type: application/json' -d '{}'
59
+ curl -X POST http://127.0.0.1:8000/step -H 'content-type: application/json' -d '{"action_type":"query_deploys","service":"worker"}'</pre>
60
+ </body>
61
+ </html>
62
+ """
63
+
64
+
65
def create_compatible_app():
    """Build the FastAPI app: OpenEnv routes, landing pages and metadata routes."""

    def _new_environment():
        # Factory handed to OpenEnv; each call yields a fresh environment.
        return UnifiedIncidentEnvironment()

    app = create_fastapi_app(
        _new_environment,
        UnifiedIncidentAction,
        UnifiedIncidentObservation,
        max_concurrent_envs=1,
    )

    @app.get("/", include_in_schema=False)
    async def web_root():
        # The bare root forwards to the static landing page.
        return RedirectResponse(url="/simple")

    @app.get("/simple", include_in_schema=False)
    async def simple_console():
        return HTMLResponse(_SIMPLE_HTML)

    _attach_metadata_routes(app)
    return app
85
+
86
+
87
def _attach_metadata_routes(app):
    """Register the read-only challenge routes on *app*.

    Routes added: ``/tasks``, ``/baseline``, ``/grader``, ``/status`` and
    ``/health``. A ``ValueError`` raised by the catalog or grading helpers on
    the first three is reported to the client as HTTP 404.
    """

    @app.get("/tasks", response_model=ScenarioCatalog, tags=["challenge"])
    def tasks(difficulty: str | None = None) -> ScenarioCatalog:
        """List scenarios, optionally filtered by difficulty."""
        try:
            return list_scenarios(difficulty=difficulty)
        except ValueError as exc:
            raise HTTPException(status_code=404, detail=str(exc)) from exc

    @app.get("/baseline", response_model=BaselineCatalog, tags=["challenge"])
    def baseline(scenario_id: str | None = None) -> BaselineCatalog:
        """List baseline trajectories, optionally for a single scenario."""
        try:
            return list_baselines(scenario_id=scenario_id)
        except ValueError as exc:
            raise HTTPException(status_code=404, detail=str(exc)) from exc

    @app.get("/grader", response_model=GraderReport, tags=["challenge"])
    def grader(scenario_id: str | None = None) -> GraderReport:
        """Grade the current runtime progress, optionally as another scenario."""
        # Work on a copy: the scenario override below must not leak into the
        # shared runtime progress if the accessor hands back the live dict.
        progress = dict(current_runtime_progress())
        if scenario_id is not None:
            progress["scenario_id"] = scenario_id
        try:
            return grade_episode(progress)
        except ValueError as exc:
            raise HTTPException(status_code=404, detail=str(exc)) from exc

    @app.get("/status", response_model=RuntimeStatus, tags=["challenge"])
    def status() -> RuntimeStatus:
        """Return the current runtime progress together with its grade."""
        progress = current_runtime_progress()
        return RuntimeStatus(
            progress=UnifiedIncidentState(**progress),
            grader=grade_episode(progress),
        )

    @app.get("/health", tags=["challenge"])
    def health() -> dict[str, object]:
        """Return a static liveness payload."""
        return {
            "status": "ok",
            "environment": "unified_incident_env",
            "version": "2.0.0",
            "stages": ["triage", "mitigation", "validation", "resolved"],
        }
128
+
129
+
130
# Module-level application instance, built once at import time.
app = create_compatible_app()
131
+
132
+
133
def serve(host: str = "0.0.0.0", port: int = 8000) -> None:
    """Run the module-level ``app`` under uvicorn on *host*:*port*."""
    # uvicorn is imported only when the server is actually started.
    from uvicorn import run as run_server

    run_server(app, host=host, port=port)
137
+
138
+
139
def main() -> None:
    """Parse ``--host`` / ``--port`` and start the server.

    The defaults come from the ``HOST`` and ``PORT`` environment variables,
    falling back to ``0.0.0.0`` and ``8000``.
    """
    default_host = os.environ.get("HOST", "0.0.0.0")
    default_port = int(os.environ.get("PORT", "8000"))

    cli = argparse.ArgumentParser()
    cli.add_argument("--host", default=default_host)
    cli.add_argument("--port", type=int, default=default_port)
    options = cli.parse_args()

    serve(host=options.host, port=options.port)
145
+
146
+
147
+ if __name__ == "__main__":
148
+ main()
unified_incident_env/server/challenge.py ADDED
@@ -0,0 +1,753 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scenario catalog, baselines, and runtime helpers for the honest v2 core."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from copy import deepcopy
6
+ from typing import Any
7
+
8
+ from ..models import (
9
+ BaselineCatalog,
10
+ BaselineDefinition,
11
+ BaselineStep,
12
+ ScenarioCatalog,
13
+ ScenarioSummary,
14
+ UnifiedIncidentAction,
15
+ )
16
+
17
+ DEFAULT_SCENARIO_ID = "worker_deploy_cascade"
18
+
19
+ SCENARIOS: dict[str, dict[str, Any]] = {
20
+ "worker_deploy_cascade": {
21
+ "id": "worker_deploy_cascade",
22
+ "difficulty": "easy",
23
+ "name": "Worker Deploy Cascade",
24
+ "description": (
25
+ "A bad worker deploy causes sustained database overload and login 502s at the gateway. "
26
+ "The agent must diagnose from evidence, choose a safe remediation, verify recovery, and declare resolved only after checks pass."
27
+ ),
28
+ "root_cause": "A bad worker deploy is driving repeated database overload.",
29
+ "optimal_ticks": 10,
30
+ "max_ticks": 12,
31
+ "critical_service_weights": {
32
+ "worker": 0.4,
33
+ "database": 0.4,
34
+ "api-gateway": 0.2,
35
+ "cache": 0.0,
36
+ },
37
+ "reward_config": {
38
+ "step_cost": 0.01,
39
+ "redundant_action_penalty": 0.02,
40
+ "unsafe_action_penalty": 0.08,
41
+ "premature_resolution_penalty": 0.2,
42
+ "successful_resolution_bonus": 0.25,
43
+ "hypothesis_bonus_scale": 0.12,
44
+ "forbidden_reward_sources": [
45
+ "evidence_discovery",
46
+ "query_success",
47
+ "unlock_events",
48
+ "stage_advancement",
49
+ "patch_id_selection",
50
+ ],
51
+ },
52
+ "initial_services": {
53
+ "api-gateway": {
54
+ "status": "degraded",
55
+ "cpu_pct": 61.0,
56
+ "memory_pct": 38.0,
57
+ "error_rate_pct": 24.0,
58
+ "latency_ms": 640.0,
59
+ },
60
+ "cache": {
61
+ "status": "healthy",
62
+ "cpu_pct": 18.0,
63
+ "memory_pct": 24.0,
64
+ "error_rate_pct": 0.0,
65
+ "latency_ms": 14.0,
66
+ },
67
+ "database": {
68
+ "status": "crashed",
69
+ "cpu_pct": 99.0,
70
+ "memory_pct": 97.0,
71
+ "error_rate_pct": 100.0,
72
+ "latency_ms": 0.0,
73
+ },
74
+ "worker": {
75
+ "status": "degraded",
76
+ "cpu_pct": 88.0,
77
+ "memory_pct": 71.0,
78
+ "error_rate_pct": 19.0,
79
+ "latency_ms": 420.0,
80
+ },
81
+ },
82
+ "initial_alerts": [
83
+ {
84
+ "service": "api-gateway",
85
+ "severity": "critical",
86
+ "message": "Login requests are returning sustained 502s.",
87
+ },
88
+ {
89
+ "service": "database",
90
+ "severity": "critical",
91
+ "message": "Database process is crashing under repeated overload.",
92
+ },
93
+ {
94
+ "service": "worker",
95
+ "severity": "warning",
96
+ "message": "Worker queue depth and retry volume spiked after a recent rollout.",
97
+ },
98
+ ],
99
+ "logs": {
100
+ "api-gateway": (
101
+ "Gateway upstream errors point to worker timeouts followed by database connection failures. "
102
+ "No recent gateway deploys are recorded."
103
+ ),
104
+ "cache": "Cache hit ratio is stable and cache upstream probes remain healthy.",
105
+ "database": (
106
+ "Database logs show repeated bursts of expensive worker-originated writes immediately before each crash."
107
+ ),
108
+ "worker": (
109
+ "Worker logs show request fanout amplification and elevated retries beginning right after rollout build worker@2026.04.23-bad."
110
+ ),
111
+ },
112
+ "metrics": {
113
+ "api-gateway": {
114
+ "error_rate": "Gateway 502 rate is 24% and closely tracks worker timeout bursts.",
115
+ "latency": "Gateway p95 latency climbed to 640ms while waiting on downstream worker/database calls.",
116
+ },
117
+ "database": {
118
+ "cpu": "Database CPU is pinned at 99% until the process exits.",
119
+ "latency": "Database latency spikes sharply before each crash loop.",
120
+ },
121
+ "worker": {
122
+ "cpu": "Worker CPU is 88% with growing queue pressure.",
123
+ "error_rate": "Worker retry/error rate is elevated after rollout.",
124
+ },
125
+ },
126
+ "dependencies": {
127
+ "api-gateway": "api-gateway -> worker -> database",
128
+ "worker": "worker -> database",
129
+ "database": "database is a terminal dependency for write-heavy worker jobs",
130
+ },
131
+ "deploy_history": {
132
+ "api-gateway": "No gateway deploys in the last 24h.",
133
+ "cache": "No cache deploys in the last 24h.",
134
+ "database": "No database deploys in the last 24h.",
135
+ "worker": "Rolled out worker@2026.04.23-bad 12 minutes ago.",
136
+ },
137
+ "checks": {
138
+ "database_recovery": "Confirms the database is healthy and no longer crashing.",
139
+ "end_to_end": "Confirms login traffic succeeds without worker-induced overload.",
140
+ },
141
+ "truth": {
142
+ "root_cause": "bad_worker_deploy",
143
+ "affected_services": ["worker", "database", "api-gateway"],
144
+ "best_next_action": "rollback_deploy",
145
+ },
146
+ "remediation_recipe": {
147
+ "rollback_target": "worker",
148
+ "restart_target": "database",
149
+ "isolate_target": "worker",
150
+ "restart_requires_cause_removed": True,
151
+ "incident_driver": "worker",
152
+ "resolution_check": "end_to_end",
153
+ },
154
+ "post_rollback_services": {
155
+ "worker": {"status": "healthy", "cpu_pct": 32.0, "memory_pct": 37.0, "error_rate_pct": 2.0, "latency_ms": 40.0},
156
+ },
157
+ "post_rollback_user_impact": 0.55,
158
+ "post_rollback_slo_burn": 0.58,
159
+ "post_restart_services": {
160
+ "database": {"status": "healthy", "cpu_pct": 34.0, "memory_pct": 39.0, "error_rate_pct": 0.0, "latency_ms": 22.0},
161
+ "api-gateway": {"status": "healthy", "cpu_pct": 28.0, "memory_pct": 31.0, "error_rate_pct": 0.0, "latency_ms": 38.0},
162
+ },
163
+ "post_restart_user_impact": 0.14,
164
+ "post_restart_slo_burn": 0.18,
165
+ "post_isolate_services": {
166
+ "worker": {"status": "isolated", "cpu_pct": 8.0, "memory_pct": 18.0, "error_rate_pct": 0.0, "latency_ms": 0.0},
167
+ "database": {"status": "healthy", "cpu_pct": 41.0, "memory_pct": 46.0, "error_rate_pct": 0.0, "latency_ms": 26.0},
168
+ "api-gateway": {"status": "degraded", "cpu_pct": 34.0, "memory_pct": 33.0, "error_rate_pct": 7.0, "latency_ms": 91.0},
169
+ },
170
+ "post_isolate_user_impact": 0.45,
171
+ "post_isolate_slo_burn": 0.47,
172
+ "degraded_services": {
173
+ "worker": {"status": "degraded", "cpu_pct": 88.0, "memory_pct": 71.0, "error_rate_pct": 19.0, "latency_ms": 420.0},
174
+ "database": {"status": "crashed", "cpu_pct": 99.0, "memory_pct": 97.0, "error_rate_pct": 100.0, "latency_ms": 0.0},
175
+ "api-gateway": {"status": "degraded", "cpu_pct": 61.0, "memory_pct": 38.0, "error_rate_pct": 24.0, "latency_ms": 640.0},
176
+ },
177
+ "degraded_user_impact": 0.82,
178
+ "degraded_slo_burn": 0.91,
179
+ "failure_messages": {
180
+ "wrong_rollback_target": "Rolling back a service without a causal link wastes time and risk.",
181
+ "low_value_restart": "Restarting that service is not the safe next remediation step for this incident.",
182
+ "premature_restart": "Restarting before removing the trigger only causes another crash loop.",
183
+ "wrong_isolation_target": "Isolating that service does not contain the dominant failure path.",
184
+ },
185
+ },
186
+ "db_config_rollout": {
187
+ "id": "db_config_rollout",
188
+ "difficulty": "medium",
189
+ "name": "Database Config Rollout Regression",
190
+ "description": (
191
+ "A database config push cut connection pool size and write requests now time out. "
192
+ "A separate worker deploy landed around the same time and looks suspicious but is not the cause. "
193
+ "The agent must avoid the decoy, roll back the database config, restart it, and verify recovery."
194
+ ),
195
+ "root_cause": "A bad database config rollout shrank the connection pool and is dropping writes.",
196
+ "optimal_ticks": 10,
197
+ "max_ticks": 12,
198
+ "critical_service_weights": {
199
+ "worker": 0.2,
200
+ "database": 0.5,
201
+ "api-gateway": 0.3,
202
+ "cache": 0.0,
203
+ },
204
+ "reward_config": {
205
+ "step_cost": 0.01,
206
+ "redundant_action_penalty": 0.02,
207
+ "unsafe_action_penalty": 0.08,
208
+ "premature_resolution_penalty": 0.2,
209
+ "successful_resolution_bonus": 0.25,
210
+ "hypothesis_bonus_scale": 0.12,
211
+ "forbidden_reward_sources": [
212
+ "evidence_discovery",
213
+ "query_success",
214
+ "unlock_events",
215
+ "stage_advancement",
216
+ "patch_id_selection",
217
+ ],
218
+ },
219
+ "initial_services": {
220
+ "api-gateway": {
221
+ "status": "degraded",
222
+ "cpu_pct": 44.0,
223
+ "memory_pct": 36.0,
224
+ "error_rate_pct": 17.0,
225
+ "latency_ms": 520.0,
226
+ },
227
+ "cache": {
228
+ "status": "healthy",
229
+ "cpu_pct": 20.0,
230
+ "memory_pct": 26.0,
231
+ "error_rate_pct": 0.0,
232
+ "latency_ms": 15.0,
233
+ },
234
+ "database": {
235
+ "status": "degraded",
236
+ "cpu_pct": 62.0,
237
+ "memory_pct": 54.0,
238
+ "error_rate_pct": 48.0,
239
+ "latency_ms": 880.0,
240
+ },
241
+ "worker": {
242
+ "status": "degraded",
243
+ "cpu_pct": 51.0,
244
+ "memory_pct": 44.0,
245
+ "error_rate_pct": 12.0,
246
+ "latency_ms": 310.0,
247
+ },
248
+ },
249
+ "initial_alerts": [
250
+ {
251
+ "service": "database",
252
+ "severity": "critical",
253
+ "message": "Database connection acquire timeouts at 48% and climbing.",
254
+ },
255
+ {
256
+ "service": "api-gateway",
257
+ "severity": "warning",
258
+ "message": "Write-path requests are returning sustained 5xx.",
259
+ },
260
+ {
261
+ "service": "worker",
262
+ "severity": "warning",
263
+ "message": "Worker write latency is elevated; retries are climbing.",
264
+ },
265
+ ],
266
+ "logs": {
267
+ "api-gateway": (
268
+ "Gateway upstream errors are downstream-driven: writes to the worker path return pool-exhaustion "
269
+ "errors originating from the database. No gateway deploys recorded in the last 24h."
270
+ ),
271
+ "cache": "Cache reads are healthy and unrelated to the current write-path failures.",
272
+ "database": (
273
+ "Database logs show 'could not acquire connection' errors immediately after config rollout "
274
+ "db@2026.04.24-cfg lowered max_connections from 80 to 12."
275
+ ),
276
+ "worker": (
277
+ "Worker logs show retries driven by downstream database pool exhaustion, not local faults. "
278
+ "Worker code deploy worker@2026.04.24-refactor is unrelated to the pool error signature."
279
+ ),
280
+ },
281
+ "metrics": {
282
+ "api-gateway": {
283
+ "error_rate": "Gateway 5xx rate is 17% and matches the database pool-exhaustion windows one-for-one.",
284
+ "latency": "Gateway p95 climbed to 520ms waiting on database connection acquire.",
285
+ },
286
+ "database": {
287
+ "cpu": "Database CPU is moderate (~62%), so this is not a compute overload pattern.",
288
+ "error_rate": "Database error rate is 48% and dominated by 'connection acquire timeout'.",
289
+ "latency": "Database write latency jumped to 880ms after the config rollout.",
290
+ },
291
+ "worker": {
292
+ "cpu": "Worker CPU is 51% — no local overload; retries are reactive.",
293
+ "error_rate": "Worker errors are retries against the saturated database pool.",
294
+ },
295
+ },
296
+ "dependencies": {
297
+ "api-gateway": "api-gateway -> worker -> database",
298
+ "worker": "worker -> database",
299
+ "database": "database is the terminal dependency; pool exhaustion here starves all upstream writers",
300
+ },
301
+ "deploy_history": {
302
+ "api-gateway": "No gateway deploys in the last 24h.",
303
+ "cache": "No cache deploys in the last 24h.",
304
+ "database": "Applied config db@2026.04.24-cfg 15 minutes ago (max_connections 80 -> 12).",
305
+ "worker": "Rolled out worker@2026.04.24-refactor 22 minutes ago (unrelated code cleanup).",
306
+ },
307
+ "checks": {
308
+ "database_recovery": "Confirms database write latency and pool health are back within SLO.",
309
+ "end_to_end": "Confirms gateway write-path traffic succeeds end-to-end.",
310
+ },
311
+ "truth": {
312
+ "root_cause": "database_only_failure",
313
+ "affected_services": ["database", "api-gateway", "worker"],
314
+ "best_next_action": "rollback_deploy",
315
+ },
316
+ "remediation_recipe": {
317
+ "rollback_target": "database",
318
+ "restart_target": "database",
319
+ "isolate_target": None,
320
+ "restart_requires_cause_removed": True,
321
+ "incident_driver": "database",
322
+ "resolution_check": "end_to_end",
323
+ },
324
+ "post_rollback_services": {
325
+ "database": {"status": "degraded", "cpu_pct": 48.0, "memory_pct": 42.0, "error_rate_pct": 6.0, "latency_ms": 120.0},
326
+ },
327
+ "post_rollback_user_impact": 0.40,
328
+ "post_rollback_slo_burn": 0.45,
329
+ "post_restart_services": {
330
+ "database": {"status": "healthy", "cpu_pct": 36.0, "memory_pct": 40.0, "error_rate_pct": 0.0, "latency_ms": 26.0},
331
+ "api-gateway": {"status": "healthy", "cpu_pct": 29.0, "memory_pct": 30.0, "error_rate_pct": 0.0, "latency_ms": 44.0},
332
+ "worker": {"status": "healthy", "cpu_pct": 33.0, "memory_pct": 36.0, "error_rate_pct": 1.0, "latency_ms": 48.0},
333
+ },
334
+ "post_restart_user_impact": 0.10,
335
+ "post_restart_slo_burn": 0.14,
336
+ "post_isolate_services": {},
337
+ "post_isolate_user_impact": 0.70,
338
+ "post_isolate_slo_burn": 0.75,
339
+ "degraded_services": {
340
+ "database": {"status": "degraded", "cpu_pct": 62.0, "memory_pct": 54.0, "error_rate_pct": 48.0, "latency_ms": 880.0},
341
+ "api-gateway": {"status": "degraded", "cpu_pct": 44.0, "memory_pct": 36.0, "error_rate_pct": 17.0, "latency_ms": 520.0},
342
+ "worker": {"status": "degraded", "cpu_pct": 51.0, "memory_pct": 44.0, "error_rate_pct": 12.0, "latency_ms": 310.0},
343
+ },
344
+ "degraded_user_impact": 0.70,
345
+ "degraded_slo_burn": 0.78,
346
+ "failure_messages": {
347
+ "wrong_rollback_target": "The worker deploy is a decoy; worker errors are reactive to database pool exhaustion.",
348
+ "low_value_restart": "Restarting that service does not address a database-config regression.",
349
+ "premature_restart": "Restarting the database before rolling back the config will re-inherit the 12-connection pool and fail again.",
350
+ "wrong_isolation_target": "Isolation is not useful here: the cause is a config regression, not a runaway service.",
351
+ },
352
+ },
353
+ "gateway_auth_rollout": {
354
+ "id": "gateway_auth_rollout",
355
+ "difficulty": "hard",
356
+ "name": "Gateway Auth Rollout Regression",
357
+ "description": (
358
+ "A new api-gateway auth-middleware rollout is rejecting ~40% of valid logins. "
359
+ "A recent worker deploy and elevated worker queue depth make the worker look like a plausible suspect. "
360
+ "The agent must localize to the gateway, roll back its deploy, and verify recovery without unnecessary restarts."
361
+ ),
362
+ "root_cause": "A bad api-gateway auth-middleware rollout is rejecting valid logins.",
363
+ "optimal_ticks": 8,
364
+ "max_ticks": 10,
365
+ "critical_service_weights": {
366
+ "worker": 0.15,
367
+ "database": 0.15,
368
+ "api-gateway": 0.70,
369
+ "cache": 0.0,
370
+ },
371
+ "reward_config": {
372
+ "step_cost": 0.01,
373
+ "redundant_action_penalty": 0.02,
374
+ "unsafe_action_penalty": 0.12,
375
+ "premature_resolution_penalty": 0.3,
376
+ "successful_resolution_bonus": 0.3,
377
+ "hypothesis_bonus_scale": 0.12,
378
+ "forbidden_reward_sources": [
379
+ "evidence_discovery",
380
+ "query_success",
381
+ "unlock_events",
382
+ "stage_advancement",
383
+ "patch_id_selection",
384
+ ],
385
+ },
386
+ "initial_services": {
387
+ "api-gateway": {
388
+ "status": "degraded",
389
+ "cpu_pct": 38.0,
390
+ "memory_pct": 42.0,
391
+ "error_rate_pct": 41.0,
392
+ "latency_ms": 180.0,
393
+ },
394
+ "cache": {
395
+ "status": "healthy",
396
+ "cpu_pct": 17.0,
397
+ "memory_pct": 23.0,
398
+ "error_rate_pct": 0.0,
399
+ "latency_ms": 12.0,
400
+ },
401
+ "database": {
402
+ "status": "healthy",
403
+ "cpu_pct": 38.0,
404
+ "memory_pct": 41.0,
405
+ "error_rate_pct": 1.0,
406
+ "latency_ms": 28.0,
407
+ },
408
+ "worker": {
409
+ "status": "degraded",
410
+ "cpu_pct": 63.0,
411
+ "memory_pct": 48.0,
412
+ "error_rate_pct": 4.0,
413
+ "latency_ms": 220.0,
414
+ },
415
+ },
416
+ "initial_alerts": [
417
+ {
418
+ "service": "api-gateway",
419
+ "severity": "critical",
420
+ "message": "Gateway is returning 401 on ~40% of valid login attempts.",
421
+ },
422
+ {
423
+ "service": "worker",
424
+ "severity": "warning",
425
+ "message": "Worker queue depth is elevated from the retry storm upstream.",
426
+ },
427
+ ],
428
+ "logs": {
429
+ "api-gateway": (
430
+ "Gateway logs show auth-middleware rejecting tokens with valid signatures. "
431
+ "Rejection rate started exactly at the gateway@2026.04.24-auth rollout boundary."
432
+ ),
433
+ "cache": "Cache hit ratio stable and unrelated.",
434
+ "database": "Database logs are clean; no increase in errors or latency.",
435
+ "worker": (
436
+ "Worker logs show client-side retry storms triggered by upstream 401s, not local faults. "
437
+ "Worker deploy worker@2026.04.24-hotfix is a log-format tweak and does not touch auth."
438
+ ),
439
+ },
440
+ "metrics": {
441
+ "api-gateway": {
442
+ "error_rate": "Gateway error rate is 41%, dominated by 401 responses (auth failures).",
443
+ "latency": "Gateway latency is normal — errors are fast rejections, not timeouts.",
444
+ },
445
+ "database": {
446
+ "cpu": "Database CPU is 38% (normal).",
447
+ "error_rate": "Database error rate is ~1% and flat.",
448
+ },
449
+ "worker": {
450
+ "cpu": "Worker CPU is 63% from retry volume, not workload.",
451
+ "error_rate": "Worker errors are reactive retries, not primary failures.",
452
+ },
453
+ },
454
+ "dependencies": {
455
+ "api-gateway": "api-gateway -> (auth) -> worker -> database",
456
+ "worker": "worker -> database",
457
+ "database": "database is healthy; it is not on the fault path",
458
+ },
459
+ "deploy_history": {
460
+ "api-gateway": "Rolled out gateway@2026.04.24-auth 9 minutes ago (auth middleware rewrite).",
461
+ "cache": "No cache deploys in the last 24h.",
462
+ "database": "No database deploys in the last 24h.",
463
+ "worker": "Rolled out worker@2026.04.24-hotfix 18 minutes ago (log-format tweak, no auth changes).",
464
+ },
465
+ "checks": {
466
+ "database_recovery": "Confirms the database is healthy (always healthy in this scenario).",
467
+ "end_to_end": "Confirms gateway login traffic succeeds end-to-end.",
468
+ },
469
+ "truth": {
470
+ "root_cause": "api_gateway_fault",
471
+ "affected_services": ["api-gateway", "worker"],
472
+ "best_next_action": "rollback_deploy",
473
+ },
474
+ "remediation_recipe": {
475
+ "rollback_target": "api-gateway",
476
+ "restart_target": None,
477
+ "isolate_target": "api-gateway",
478
+ "restart_requires_cause_removed": True,
479
+ "incident_driver": "api-gateway",
480
+ "resolution_check": "end_to_end",
481
+ },
482
+ "post_rollback_services": {
483
+ "api-gateway": {"status": "healthy", "cpu_pct": 30.0, "memory_pct": 34.0, "error_rate_pct": 1.0, "latency_ms": 38.0},
484
+ "worker": {"status": "healthy", "cpu_pct": 34.0, "memory_pct": 36.0, "error_rate_pct": 1.0, "latency_ms": 52.0},
485
+ },
486
+ "post_rollback_user_impact": 0.12,
487
+ "post_rollback_slo_burn": 0.18,
488
+ "post_restart_services": {},
489
+ "post_restart_user_impact": 0.12,
490
+ "post_restart_slo_burn": 0.18,
491
+ "post_isolate_services": {
492
+ "api-gateway": {"status": "isolated", "cpu_pct": 6.0, "memory_pct": 14.0, "error_rate_pct": 0.0, "latency_ms": 0.0},
493
+ },
494
+ "post_isolate_user_impact": 0.55,
495
+ "post_isolate_slo_burn": 0.60,
496
+ "degraded_services": {
497
+ "api-gateway": {"status": "degraded", "cpu_pct": 38.0, "memory_pct": 42.0, "error_rate_pct": 41.0, "latency_ms": 180.0},
498
+ "worker": {"status": "degraded", "cpu_pct": 63.0, "memory_pct": 48.0, "error_rate_pct": 4.0, "latency_ms": 220.0},
499
+ },
500
+ "degraded_user_impact": 0.65,
501
+ "degraded_slo_burn": 0.72,
502
+ "failure_messages": {
503
+ "wrong_rollback_target": "The worker deploy is a log-format tweak and is not on the auth fault path.",
504
+ "low_value_restart": "Restarting a service does not fix a config/middleware regression rolled out as a deploy.",
505
+ "premature_restart": "Restarting before rolling back the gateway auth change just restarts the same bad middleware.",
506
+ "wrong_isolation_target": "Isolating workers or database cuts healthy traffic without fixing the gateway auth fault.",
507
+ },
508
+ },
509
+ }
510
+
511
+ _RUNTIME_PROGRESS: dict[str, Any] | None = None
512
+
513
+
514
def get_scenario(scenario_id: str) -> dict[str, Any]:
    """Return a deep copy of the scenario registered under ``scenario_id``.

    Raises:
        ValueError: if ``scenario_id`` is not a key of ``SCENARIOS``.
    """
    if scenario_id in SCENARIOS:
        # Hand out a copy so the registry entry itself is never shared.
        return deepcopy(SCENARIOS[scenario_id])
    raise ValueError(f"Unknown scenario_id {scenario_id!r}")
518
+
519
+
520
+ SUPPORTED_DIFFICULTIES: tuple[str, ...] = ("easy", "medium", "hard")
521
+
522
+
523
def scenario_for_difficulty(difficulty: str) -> dict[str, Any]:
    """Return a deep copy of the first scenario whose ``difficulty`` matches.

    Scenarios are scanned in ``SCENARIOS`` iteration order.

    Raises:
        ValueError: if no scenario carries the requested difficulty.
    """
    found = next(
        (entry for entry in SCENARIOS.values() if entry["difficulty"] == difficulty),
        None,
    )
    if found is None:
        raise ValueError(f"Unknown difficulty {difficulty!r}")
    return deepcopy(found)
528
+
529
+
530
def list_scenarios(difficulty: str | None = None) -> ScenarioCatalog:
    """Build the scenario catalog, optionally restricted to one difficulty.

    Args:
        difficulty: one of ``SUPPORTED_DIFFICULTIES``, or ``None`` to list
            every scenario.

    Raises:
        ValueError: if ``difficulty`` is given but not supported.
    """
    include_all = difficulty is None
    if not include_all and difficulty not in SUPPORTED_DIFFICULTIES:
        raise ValueError(f"Unknown difficulty {difficulty!r}")

    summaries = []
    for entry in SCENARIOS.values():
        if not (include_all or entry["difficulty"] == difficulty):
            continue
        summaries.append(
            ScenarioSummary(
                id=entry["id"],
                difficulty=entry["difficulty"],
                name=entry["name"],
                description=entry["description"],
                root_cause=entry["root_cause"],
                optimal_ticks=entry["optimal_ticks"],
            )
        )

    return ScenarioCatalog(
        default_scenario_id=DEFAULT_SCENARIO_ID,
        available_difficulties=list(SUPPORTED_DIFFICULTIES),
        filtered_difficulty=difficulty,
        scenarios=summaries,
    )
551
+
552
+
553
def _worker_cascade_baseline() -> list[BaselineStep]:
    """Return the scripted baseline steps for the ``worker_deploy_cascade`` scenario.

    Each table row pairs the keyword arguments of one ``UnifiedIncidentAction``
    with the rationale recorded for that step; rows are emitted in order.
    """
    script: list[tuple[dict[str, Any], str]] = [
        (
            {"action_type": "query_deploys", "service": "worker"},
            "Check whether any recent deploy aligns with the incident start.",
        ),
        (
            {"action_type": "query_logs", "service": "worker"},
            "Inspect worker logs because deploy timing and queue pressure suggest worker-originated harm.",
        ),
        (
            {"action_type": "query_metrics", "service": "database", "metric": "cpu"},
            "Confirm that the database is overloaded as a downstream effect.",
        ),
        (
            {"action_type": "query_dependencies", "service": "api-gateway"},
            "Verify the gateway depends on the worker and database path.",
        ),
        (
            {
                "action_type": "submit_hypothesis",
                "hypothesis": {
                    "root_cause": "bad_worker_deploy",
                    "affected_services": ["worker", "database", "api-gateway"],
                    "confidence": 0.82,
                    "recommended_next_action": "rollback_deploy",
                },
            },
            "Commit a calibrated hypothesis before taking an invasive mitigation step.",
        ),
        (
            {"action_type": "rollback_deploy", "service": "worker"},
            "Remove the triggering change before restarting downstream services.",
        ),
        (
            {"action_type": "restart_service", "service": "database"},
            "Bring the database back cleanly after the root cause is removed.",
        ),
        (
            {"action_type": "run_check", "check_name": "database_recovery"},
            "Verify the database is no longer crashing.",
        ),
        (
            {"action_type": "run_check", "check_name": "end_to_end"},
            "Verify gateway traffic succeeds end-to-end.",
        ),
        (
            {"action_type": "declare_resolved"},
            "Declare resolved only after objective checks pass.",
        ),
    ]
    return [
        BaselineStep(action=UnifiedIncidentAction(**fields), rationale=why)
        for fields, why in script
    ]
604
+
605
+
606
def _db_config_rollout_baseline() -> list[BaselineStep]:
    """Return the scripted baseline steps for the ``db_config_rollout`` scenario."""

    def scripted(rationale: str, **fields: Any) -> BaselineStep:
        # One baseline step: the action built from ``fields`` plus its rationale.
        return BaselineStep(action=UnifiedIncidentAction(**fields), rationale=rationale)

    hypothesis = {
        "root_cause": "database_only_failure",
        "affected_services": ["database", "api-gateway", "worker"],
        "confidence": 0.8,
        "recommended_next_action": "rollback_deploy",
    }
    return [
        scripted(
            "Database is the loudest alert; inspect logs for the actual error signature.",
            action_type="query_logs",
            service="database",
        ),
        scripted(
            "Pool-acquire errors suggest a config change; check recent database rollouts.",
            action_type="query_deploys",
            service="database",
        ),
        scripted(
            "Confirm the error pattern is pool exhaustion rather than compute overload.",
            action_type="query_metrics",
            service="database",
            metric="error_rate",
        ),
        scripted(
            "Rule out the decoy worker deploy by reading worker logs directly.",
            action_type="query_logs",
            service="worker",
        ),
        scripted(
            "Localize the fault to the database config before remediating.",
            action_type="submit_hypothesis",
            hypothesis=hypothesis,
        ),
        scripted(
            "Roll back the offending database config rollout.",
            action_type="rollback_deploy",
            service="database",
        ),
        scripted(
            "Restart the database cleanly against the restored pool config.",
            action_type="restart_service",
            service="database",
        ),
        scripted(
            "Verify database pool health and write latency are back within SLO.",
            action_type="run_check",
            check_name="database_recovery",
        ),
        scripted(
            "Verify gateway write-path traffic succeeds end-to-end.",
            action_type="run_check",
            check_name="end_to_end",
        ),
        scripted(
            "Declare resolved only after objective checks pass.",
            action_type="declare_resolved",
        ),
    ]
657
+
658
+
659
def _gateway_auth_rollout_baseline() -> list[BaselineStep]:
    """Return the scripted baseline steps for the ``gateway_auth_rollout`` scenario.

    The script contains no ``restart_service`` step: it rolls back the
    api-gateway deploy and then runs both checks before declaring resolved.
    """
    plan: list[dict[str, Any]] = [
        {
            "action": {"action_type": "query_logs", "service": "api-gateway"},
            "rationale": "Gateway is rejecting logins; read gateway logs to localize the rejection class.",
        },
        {
            "action": {"action_type": "query_deploys", "service": "api-gateway"},
            "rationale": "Login rejection aligns with a recent auth middleware rollout; confirm deploy timing.",
        },
        {
            "action": {"action_type": "query_deploys", "service": "worker"},
            "rationale": "Rule out the worker deploy explicitly rather than assuming.",
        },
        {
            "action": {
                "action_type": "submit_hypothesis",
                "hypothesis": {
                    "root_cause": "api_gateway_fault",
                    "affected_services": ["api-gateway", "worker"],
                    "confidence": 0.85,
                    "recommended_next_action": "rollback_deploy",
                },
            },
            "rationale": "Commit a calibrated hypothesis localizing to the gateway auth rollout.",
        },
        {
            "action": {"action_type": "rollback_deploy", "service": "api-gateway"},
            "rationale": "Roll back the bad auth middleware rollout; no restart needed.",
        },
        {
            "action": {"action_type": "run_check", "check_name": "end_to_end"},
            "rationale": "Verify that gateway login traffic now succeeds end-to-end.",
        },
        {
            "action": {"action_type": "run_check", "check_name": "database_recovery"},
            "rationale": "Confirm the database is (and stayed) healthy throughout.",
        },
        {
            "action": {"action_type": "declare_resolved"},
            "rationale": "Declare resolved only after objective checks pass.",
        },
    ]
    steps: list[BaselineStep] = []
    for row in plan:
        built_action = UnifiedIncidentAction(**row["action"])
        steps.append(BaselineStep(action=built_action, rationale=row["rationale"]))
    return steps
702
+
703
+
704
+ _BASELINE_BUILDERS = {
705
+ "worker_deploy_cascade": _worker_cascade_baseline,
706
+ "db_config_rollout": _db_config_rollout_baseline,
707
+ "gateway_auth_rollout": _gateway_auth_rollout_baseline,
708
+ }
709
+
710
+
711
def _baseline_actions(scenario_id: str) -> list[BaselineStep]:
    """Build the baseline step list for ``scenario_id`` via ``_BASELINE_BUILDERS``.

    Raises:
        ValueError: if no builder is registered for ``scenario_id``.
    """
    make_steps = _BASELINE_BUILDERS.get(scenario_id)
    if make_steps is not None:
        return make_steps()
    raise ValueError(f"No baseline for scenario_id {scenario_id!r}")
716
+
717
+
718
def list_baselines(scenario_id: str | None = None) -> BaselineCatalog:
    """Return the baseline catalog for one scenario, or for all of them.

    Args:
        scenario_id: a key of ``SCENARIOS``, or ``None`` to include every
            scenario in registry order.

    Raises:
        ValueError: if ``scenario_id`` is given but unknown.
    """
    if scenario_id is None:
        selected_ids = list(SCENARIOS.keys())
    elif scenario_id in SCENARIOS:
        selected_ids = [scenario_id]
    else:
        raise ValueError(f"Unknown scenario_id {scenario_id!r}")

    definitions = []
    for current_id in selected_ids:
        entry = SCENARIOS[current_id]
        definitions.append(
            BaselineDefinition(
                scenario_id=current_id,
                name="deterministic-remediation-baseline",
                description=entry["description"],
                optimal_ticks=entry["optimal_ticks"],
                actions=_baseline_actions(current_id),
            )
        )
    return BaselineCatalog(baselines=definitions)
736
+
737
+
738
def set_runtime_progress(progress: dict[str, Any]) -> None:
    """Store a deep-copied snapshot of ``progress`` in ``_RUNTIME_PROGRESS``."""
    global _RUNTIME_PROGRESS
    snapshot = deepcopy(progress)
    _RUNTIME_PROGRESS = snapshot


def current_runtime_progress() -> dict[str, Any]:
    """Return a deep copy of the snapshot stored by ``set_runtime_progress``.

    Raises:
        ValueError: if the stored snapshot is still ``None``.
    """
    stored = _RUNTIME_PROGRESS
    if stored is not None:
        # Copy on the way out so callers cannot mutate the stored snapshot.
        return deepcopy(stored)
    raise ValueError("Runtime progress is not initialized")
747
+
748
+
749
def grade_episode(state: dict[str, Any]):
    """Build a grading report for ``state`` against its scenario definition.

    The scenario is looked up from ``state["scenario_id"]``, falling back to
    ``DEFAULT_SCENARIO_ID`` when the key is absent.
    """
    # Function-scope import, kept as in the original (presumably to avoid an
    # import cycle with the grader module; confirm before hoisting).
    from .grader import UnifiedIncidentGrader

    grader = UnifiedIncidentGrader()
    active_id = state.get("scenario_id", DEFAULT_SCENARIO_ID)
    return grader.build_report(state, get_scenario(active_id))
unified_incident_env/server/environment.py ADDED
@@ -0,0 +1,613 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Honest narrow incident-remediation environment core."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import uuid
7
+ from typing import Any
8
+
9
+ from openenv.core.env_server import Environment
10
+ from openenv.core.env_server.types import EnvironmentMetadata
11
+
12
+ from ..models import (
13
+ Alert,
14
+ CheckResult,
15
+ ServiceHealth,
16
+ UnifiedIncidentAction,
17
+ UnifiedIncidentObservation,
18
+ UnifiedIncidentState,
19
+ )
20
+ from .challenge import DEFAULT_SCENARIO_ID, SCENARIOS, get_scenario, scenario_for_difficulty, set_runtime_progress
21
+ from .grader import UnifiedIncidentGrader
22
+
23
+ SERVICE_ORDER = ("api-gateway", "cache", "database", "worker")
24
+ ALL_ACTIONS = [
25
+ "query_logs",
26
+ "query_metrics",
27
+ "query_dependencies",
28
+ "query_deploys",
29
+ "rollback_deploy",
30
+ "restart_service",
31
+ "run_check",
32
+ "isolate_service",
33
+ "escalate",
34
+ "submit_hypothesis",
35
+ "declare_resolved",
36
+ ]
37
+ REQUIRED_FIELDS_BY_ACTION: dict[str, list[str]] = {
38
+ "query_logs": ["service"],
39
+ "query_metrics": ["service", "metric"],
40
+ "query_dependencies": ["service"],
41
+ "query_deploys": ["service"],
42
+ "rollback_deploy": ["service"],
43
+ "restart_service": ["service"],
44
+ "run_check": ["check_name"],
45
+ "isolate_service": ["service"],
46
+ "escalate": [],
47
+ "submit_hypothesis": ["hypothesis"],
48
+ "declare_resolved": [],
49
+ }
50
+ STATUS_VALUES = {
51
+ "healthy": 1.0,
52
+ "degraded": 0.4,
53
+ "crashed": 0.0,
54
+ "isolated": 0.2,
55
+ }
56
+
57
+
58
+ class UnifiedIncidentEnvironment(Environment[UnifiedIncidentAction, UnifiedIncidentObservation, UnifiedIncidentState]):
59
+ """A bounded-action incident diagnosis and safe remediation environment."""
60
+
61
+ SUPPORTS_CONCURRENT_SESSIONS = False
62
+
63
def __init__(self) -> None:
    """Create the grader, start an episode on the default scenario, and publish its state."""
    super().__init__()
    self._grader = UnifiedIncidentGrader()
    default_scenario = get_scenario(DEFAULT_SCENARIO_ID)
    self._episode = self._make_episode(default_scenario)
    # Publish the fresh state through the module-level runtime-progress slot.
    initial_state = self._state_dict()
    set_runtime_progress(initial_state)
68
+
69
def get_metadata(self) -> EnvironmentMetadata:
    """Return the static name, description, version and author of this environment."""
    summary = (
        "A narrow incident diagnosis and safe remediation environment with bounded actions, "
        "world-state transitions, explicit checks, and effect-based rewards."
    )
    return EnvironmentMetadata(
        name="unified_incident_env",
        description=summary,
        version="2.0.0",
        author="Daksh Verma",
    )
79
+
80
def reset(self, seed: int | None = None, episode_id: str | None = None, **kwargs: Any) -> UnifiedIncidentObservation:
    """Start a new episode and return its first observation.

    Scenario selection, in priority order: ``kwargs["scenario_id"]``, then
    ``kwargs["difficulty"]``, then ``DEFAULT_SCENARIO_ID``. ``seed`` is
    accepted but not used.
    """
    del seed
    requested_id = kwargs.get("scenario_id")
    requested_difficulty = kwargs.get("difficulty")

    if not requested_id and requested_difficulty:
        chosen = scenario_for_difficulty(requested_difficulty)
    else:
        # An explicit scenario id wins; with neither hint, use the default.
        chosen = get_scenario(requested_id or DEFAULT_SCENARIO_ID)

    self._episode = self._make_episode(chosen, episode_id=episode_id)
    set_runtime_progress(self._state_dict())
    return self._build_observation(
        last_action_result="Episode reset.",
        tool_output=None,
        reward=0.0,
        done=False,
    )
98
+
99
def step(self, action: UnifiedIncidentAction | dict[str, Any], timeout_s: float | None = None, **kwargs: Any) -> UnifiedIncidentObservation:
    """Apply one action, advance the world one tick, and return the resulting observation.

    A ``dict`` action is converted to ``UnifiedIncidentAction`` first. If the
    episode is already done, a terminal observation with zero reward is
    returned and nothing else changes. Otherwise the action is dispatched,
    the world and alerts are refreshed, and the reward is computed as
    ``-step_cost + (potential_after - potential_before) + bonus - penalty``.
    The episode is marked done once ``tick`` reaches ``max_ticks``.
    ``timeout_s`` and extra keyword arguments are accepted but ignored.
    """
    del timeout_s, kwargs
    if isinstance(action, dict):
        action = UnifiedIncidentAction(**action)

    if self._episode["done"]:
        return self._build_observation(
            last_action_result="Episode complete. Reset to start another run.",
            tool_output=None,
            reward=0.0,
            done=True,
        )

    self._episode["tick"] += 1
    self._episode["step_count"] += 1
    potential_before = self._incident_health_potential()
    step_cost = float(self._episode["scenario"]["reward_config"]["step_cost"])
    penalty = 0.0
    bonus = 0.0
    tool_output: str | None = None
    world_changed = False
    learned_something = False

    # Per-step feedback fields start clean on every action.
    for feedback_key in ("failure_type", "why_failed", "loop_warning"):
        self._episode[feedback_key] = None

    kind = action.action_type
    if kind == "query_logs":
        tool_output = self._query_logs(action.service)
        learned_something = self._mark_evidence_once(f"logs:{action.service}", tool_output)
        outcome = f"Queried logs for {action.service}."
    elif kind == "query_metrics":
        tool_output = self._query_metrics(action.service, action.metric)
        learned_something = self._mark_evidence_once(f"metrics:{action.service}:{action.metric}", tool_output)
        outcome = f"Queried {action.metric} for {action.service}."
    elif kind == "query_dependencies":
        tool_output = self._query_dependencies(action.service)
        learned_something = self._mark_evidence_once(f"deps:{action.service}", tool_output)
        outcome = f"Queried dependencies for {action.service}."
    elif kind == "query_deploys":
        tool_output = self._query_deploys(action.service)
        learned_something = self._mark_evidence_once(f"deploys:{action.service}", tool_output)
        outcome = f"Queried deploy history for {action.service}."
    elif kind == "submit_hypothesis":
        bonus, learned_something, outcome = self._submit_hypothesis(action)
    elif kind == "rollback_deploy":
        world_changed, penalty, outcome = self._rollback_deploy(action.service)
    elif kind == "restart_service":
        world_changed, penalty, outcome = self._restart_service(action.service)
    elif kind == "isolate_service":
        world_changed, penalty, outcome = self._isolate_service(action.service)
    elif kind == "run_check":
        tool_output, learned_something, outcome = self._run_check(action.check_name)
    elif kind == "escalate":
        # The evidence key embeds the current tick, so each escalation gets its own key.
        learned_something = self._mark_evidence_once(
            f"escalate:{self._episode['tick']}",
            "Escalation note recorded: expert attention requested while keeping the environment state unchanged.",
        )
        outcome = "Escalated for human attention."
        tool_output = "Escalation does not fix the incident, but records that expert attention was requested."
    elif kind == "declare_resolved":
        resolved, penalty, bonus, outcome = self._declare_resolved()
        world_changed = resolved
    else:
        outcome = f"Unsupported action {action.action_type!r}."
        penalty += self._unsafe_penalty()
        self._set_failure("unsupported_action", "That action is not part of this honest narrow environment.")

    self._advance_world()
    self._refresh_alerts()
    self._update_loop_feedback(action, learned_something or world_changed)
    potential_after = self._incident_health_potential()

    reward = -step_cost + (potential_after - potential_before) + bonus - penalty
    made_progress = learned_something or world_changed
    if not made_progress and bonus <= 0.0:
        self._episode["wasteful_ticks"] += 1

    budget_spent = self._episode["tick"] >= self._episode["max_ticks"]
    if budget_spent and not self._episode["done"]:
        self._episode["done"] = True
        outcome = f"{outcome} Tick budget exhausted.".strip()

    # Order matters: the breakdown is computed from the state dict, so the
    # fields above it are written first.
    self._episode["last_action_result"] = outcome
    self._episode["workflow_stage"] = self._workflow_stage()
    breakdown = self._grader.compute_breakdown(self._state_dict(), self._episode["scenario"])
    self._episode["score_breakdown"] = breakdown
    self._episode["final_score"] = self._episode["score_breakdown"]["final_score"]
    self._episode["cumulative_reward"] = round(self._episode["cumulative_reward"] + reward, 4)

    set_runtime_progress(self._state_dict())
    return self._build_observation(
        last_action_result=outcome,
        tool_output=tool_output,
        reward=round(reward, 4),
        done=self._episode["done"],
    )
193
+
194
@property
def state(self) -> UnifiedIncidentState:
    """Current episode state as a ``UnifiedIncidentState`` built from ``_state_dict()``."""
    fields = self._state_dict()
    return UnifiedIncidentState(**fields)
197
+
198
def _make_episode(self, scenario: dict[str, Any], episode_id: str | None = None) -> dict[str, Any]:
    """Build the mutable bookkeeping dict for a fresh episode of ``scenario``.

    Args:
        scenario: scenario definition supplying services, alerts, deploy
            history, tick budget and the degraded impact numbers.
        episode_id: identifier to use; a random UUID4 string is generated
            when it is missing or empty.
    """
    service_health: dict[str, ServiceHealth] = {}
    for service_name, fields in scenario["initial_services"].items():
        service_health[service_name] = ServiceHealth(name=service_name, **fields)

    # Both checks start unverified.
    pending_checks = {
        check_name: CheckResult(name=check_name, passed=False, detail=detail)
        for check_name, detail in (
            ("database_recovery", "Database recovery has not been verified yet."),
            ("end_to_end", "End-to-end health has not been verified yet."),
        )
    }

    # The deploy shown as "recent" belongs to the rollback target when the
    # scenario has deploy history for it; otherwise the worker's entry is used.
    rollback_target = scenario.get("remediation_recipe", {}).get("rollback_target", "worker")
    if rollback_target in scenario["deploy_history"]:
        deploy_owner = rollback_target
    else:
        deploy_owner = "worker"

    return {
        "episode_id": episode_id or str(uuid.uuid4()),
        "scenario": scenario,
        "tick": 0,
        "step_count": 0,
        "max_ticks": scenario["max_ticks"],
        "difficulty": scenario["difficulty"],
        "services": service_health,
        "alerts": [Alert(**alert_fields) for alert_fields in scenario["initial_alerts"]],
        "discovered_evidence": [],
        "evidence_seen": set(),
        "recent_deploys": [scenario["deploy_history"].get(deploy_owner, "")],
        "checks": pending_checks,
        "user_impact": scenario.get("degraded_user_impact", 0.82),
        "slo_burn_rate": scenario.get("degraded_slo_burn", 0.91),
        "containment_applied": False,
        "cause_removed": False,
        "isolated_service": None,
        "hypothesis_seen": set(),
        "failure_type": None,
        "why_failed": None,
        "loop_warning": None,
        "last_action_key": None,
        "repeat_count": 0,
        "incident_resolved": False,
        "workflow_stage": "triage",
        "cumulative_reward": 0.0,
        "wasteful_ticks": 0,
        "score_breakdown": {
            "recovery_score": 0.0,
            "containment_score": 0.0,
            "verification_score": 0.0,
            "impact_score": 0.0,
            "efficiency_score": 0.10,
            "final_score": 0.10,
        },
        "final_score": 0.10,
        "last_action_result": "",
        "done": False,
    }
250
+
251
+ def _query_logs(self, service: str | None) -> str:
252
+ assert service is not None
253
+ return self._episode["scenario"]["logs"][service]
254
+
255
+ def _query_metrics(self, service: str | None, metric: str | None) -> str:
256
+ assert service is not None and metric is not None
257
+ return self._episode["scenario"]["metrics"][service][metric]
258
+
259
+ def _query_dependencies(self, service: str | None) -> str:
260
+ assert service is not None
261
+ return self._episode["scenario"]["dependencies"][service]
262
+
263
+ def _query_deploys(self, service: str | None) -> str:
264
+ assert service is not None
265
+ return self._episode["scenario"]["deploy_history"][service]
266
+
267
def _submit_hypothesis(self, action: UnifiedIncidentAction) -> tuple[float, bool, str]:
    """Score a submitted hypothesis against the scenario truth.

    Returns ``(reward, recorded, message)``. A hypothesis whose serialized
    payload was already seen in this episode yields ``(0.0, False, ...)`` so
    the reward cannot be collected twice. Otherwise the payload is remembered
    and the reward combines four weighted terms: root-cause match (0.04),
    fraction of the true affected services named (0.03), whether the
    recommended next action is the scenario's best one (0.03, negative when
    wrong), and confidence calibration (0.02, most negative when a wrong
    cause is asserted with confidence >= 0.7).
    """
    assert action.hypothesis is not None
    payload = action.hypothesis
    # Canonical JSON (sorted keys) so field order cannot defeat de-duplication.
    normalized = json.dumps(payload.model_dump(), sort_keys=True)
    seen = self._episode["hypothesis_seen"]
    if normalized in seen:
        return 0.0, False, "Repeated hypothesis recorded with no additional reward."
    seen.add(normalized)

    truth = self._episode["scenario"]["truth"]
    cause_match = 1.0 if payload.root_cause == truth["root_cause"] else 0.0

    true_services = set(truth["affected_services"])
    if true_services:
        named_correctly = set(payload.affected_services) & true_services
        service_match = len(named_correctly) / len(true_services)
    else:
        # A scenario with no listed affected services offers nothing to
        # localize; award no localization credit instead of dividing by zero.
        service_match = 0.0

    action_quality = 1.0 if payload.recommended_next_action == truth["best_next_action"] else -0.4

    confident = payload.confidence >= 0.7
    if cause_match == 1.0:
        calibration = 1.0 if confident else 0.5
    else:
        calibration = -1.0 if confident else -0.2

    reward = (0.04 * cause_match) + (0.03 * service_match) + (0.03 * action_quality) + (0.02 * calibration)
    return round(reward, 4), True, "Hypothesis recorded. Reward reflects root-cause accuracy, service localization, confidence calibration, and next-action quality."
284
+
285
def _recipe(self) -> dict[str, Any]:
    """Return the active scenario's ``remediation_recipe`` (empty dict if absent)."""
    scenario = self._episode["scenario"]
    return scenario.get("remediation_recipe", {})
287
+
288
def _failure_message(self, key: str, default: str) -> str:
    """Return the scenario's custom failure text for ``key``, else ``default``."""
    custom_messages = self._episode["scenario"].get("failure_messages", {})
    return custom_messages.get(key, default)
290
+
291
def _apply_service_updates(self, updates: dict[str, dict[str, Any]]) -> None:
    """Replace the health record of every service named in ``updates``.

    Each value is the keyword payload for a fresh ``ServiceHealth``; the
    service name is supplied from the mapping key.
    """
    for service_name, fields in updates.items():
        replacement = ServiceHealth(name=service_name, **fields)
        self._episode["services"][service_name] = replacement
294
+
295
def _rollback_deploy(self, service: str | None) -> tuple[bool, float, str]:
    """Try to roll back ``service``; return ``(succeeded, penalty, message)``.

    A rollback only succeeds on the recipe's ``rollback_target``. Any other
    service records a ``wrong_remediation_target`` failure and returns the
    unsafe-action penalty. Repeating a successful rollback is a no-op.
    """
    assert service is not None
    target = self._recipe().get("rollback_target")
    on_target = target is not None and service == target
    if not on_target:
        reason = self._failure_message(
            "wrong_rollback_target",
            "Rolling back a service without a causal link wastes time and risk.",
        )
        self._set_failure("wrong_remediation_target", reason)
        return False, self._unsafe_penalty(), f"Rollback on {service} did not address the incident."

    if self._episode["cause_removed"]:
        return False, 0.0, f"{target} deploy is already rolled back."

    episode = self._episode
    scenario = episode["scenario"]
    episode["cause_removed"] = True
    episode["containment_applied"] = True
    self._apply_service_updates(scenario.get("post_rollback_services", {}))
    # Rollback may only lower impact/burn: take the smaller of the current
    # value and the scenario's post-rollback value (when one is configured).
    for field, scenario_key in (
        ("user_impact", "post_rollback_user_impact"),
        ("slo_burn_rate", "post_rollback_slo_burn"),
    ):
        current = episode[field]
        episode[field] = min(current, scenario.get(scenario_key, current))
    return True, 0.0, f"Rolled back the {target} deploy; the underlying cause is removed."
314
+
315
def _restart_service(self, service: str | None) -> tuple[bool, float, str]:
    """Try to restart ``service``; return ``(succeeded, penalty, message)``.

    Restarting anything other than the recipe's ``restart_target`` records a
    ``low_value_restart`` failure at half the unsafe-action penalty. Unless
    the recipe sets ``restart_requires_cause_removed`` to a falsy value, a
    restart attempted while the cause is still present records a
    ``premature_restart`` failure at the full penalty.
    """
    assert service is not None
    recipe = self._recipe()
    target = recipe.get("restart_target")
    on_target = target is not None and service == target
    if not on_target:
        reason = self._failure_message(
            "low_value_restart",
            f"Restarting {service} is not the safe next remediation step for this incident.",
        )
        self._set_failure("low_value_restart", reason)
        return False, self._unsafe_penalty() / 2, f"Restarting {service} had little or no positive effect."

    needs_cause_removed = recipe.get("restart_requires_cause_removed", True)
    if needs_cause_removed and not self._episode["cause_removed"]:
        reason = self._failure_message(
            "premature_restart",
            f"Restarting {service} before removing the trigger only causes another failure.",
        )
        self._set_failure("premature_restart", reason)
        return False, self._unsafe_penalty(), f"Restart of {service} failed because the triggering cause is still present."

    episode = self._episode
    scenario = episode["scenario"]
    self._apply_service_updates(scenario.get("post_restart_services", {}))
    # Unlike rollback, the post-restart values overwrite the current ones.
    episode["user_impact"] = scenario.get("post_restart_user_impact", episode["user_impact"])
    episode["slo_burn_rate"] = scenario.get("post_restart_slo_burn", episode["slo_burn_rate"])
    return True, 0.0, f"{service} restarted cleanly after the triggering cause was removed."
336
+
337
def _isolate_service(self, service: str | None) -> tuple[bool, float, str]:
    """Try to isolate ``service``; return ``(succeeded, penalty, message)``.

    Only the recipe's ``isolate_target`` can be isolated. Any other service
    records a ``wrong_isolation_target`` failure at half the unsafe-action
    penalty. Isolating the target a second time is a no-op.
    """
    assert service is not None
    target = self._recipe().get("isolate_target")
    on_target = target is not None and service == target
    if not on_target:
        reason = self._failure_message(
            "wrong_isolation_target",
            f"Isolating {service} does not contain the dominant failure path.",
        )
        self._set_failure("wrong_isolation_target", reason)
        return False, self._unsafe_penalty() / 2, f"Isolation of {service} did not materially reduce blast radius."

    if self._episode["isolated_service"] == target:
        return False, 0.0, f"{target} is already isolated."

    episode = self._episode
    scenario = episode["scenario"]
    episode["isolated_service"] = target
    episode["containment_applied"] = True
    self._apply_service_updates(scenario.get("post_isolate_services", {}))
    episode["user_impact"] = scenario.get("post_isolate_user_impact", episode["user_impact"])
    episode["slo_burn_rate"] = scenario.get("post_isolate_slo_burn", episode["slo_burn_rate"])
    return True, 0.0, f"{target} isolated. Blast radius shrank, but full resolution still requires addressing the root cause."
356
+
357
def _run_check(self, check_name: str | None) -> tuple[str, bool, str]:
    """Evaluate a verification check; return ``(detail, useful, message)``.

    ``database_recovery`` passes when the database is healthy and, if the
    recipe's ``incident_driver`` is ``worker`` or ``database``, the cause has
    also been removed. Every other check name is evaluated as the end-to-end
    check: api-gateway, database and worker healthy, cause removed, and no
    service isolated. The result is stored under ``check_name`` and ``useful``
    reports whether this (check, outcome) pair is new evidence.
    """
    # NOTE(review): names other than "database_recovery" all fall through to
    # the end-to-end logic; presumably the action model restricts the allowed
    # names upstream - confirm.
    assert check_name is not None
    recipe = self._recipe()
    episode = self._episode
    isolated = episode["isolated_service"]
    cause_removed = episode["cause_removed"]
    services = episode["services"]

    def is_healthy(service_name: str) -> bool:
        return services[service_name].status == "healthy"

    if check_name == "database_recovery":
        db_healthy = is_healthy("database")
        driver_needs_fix = recipe.get("incident_driver") in {"worker", "database"}
        passed = (db_healthy and cause_removed) if driver_needs_fix else db_healthy
        if passed:
            detail = "Database is healthy and no longer failing."
        else:
            detail = "Database is still unstable or the triggering cause is still present."
    else:
        # Evaluate all three eagerly (list, not generator), as the original did.
        path_healthy = all([is_healthy(name) for name in ("api-gateway", "database", "worker")])
        passed = path_healthy and cause_removed and isolated is None
        if passed:
            detail = "End-to-end login traffic is healthy."
        else:
            detail = "End-to-end traffic still fails or remains degraded."

    episode["checks"][check_name] = CheckResult(name=check_name, passed=passed, detail=detail)
    useful = self._mark_evidence_once(f"check:{check_name}:{passed}", detail)
    return detail, useful, f"Ran {check_name} check."
394
+
395
def _declare_resolved(self) -> tuple[bool, float, float, str]:
    """Handle a resolution claim; return ``(accepted, penalty, bonus, message)``.

    The claim is accepted only if the recipe's ``resolution_check`` (default
    ``end_to_end``) has been run and passed. Acceptance marks the episode
    resolved and done. Rejection records a ``premature_resolution`` failure
    and returns the scenario's premature-resolution penalty.
    """
    recorded_checks = self._episode["checks"]
    required_check = self._recipe().get("resolution_check", "end_to_end")
    outcome = recorded_checks.get(required_check)
    verified = bool(outcome and recorded_checks[required_check].passed)
    if verified:
        self._episode["incident_resolved"] = True
        self._episode["done"] = True
        bonus = self._episode["scenario"]["reward_config"]["successful_resolution_bonus"]
        return True, 0.0, bonus, "Incident declared resolved after passing objective checks."

    self._set_failure("premature_resolution", "The incident is not verified as resolved yet.")
    penalty = self._episode["scenario"]["reward_config"]["premature_resolution_penalty"]
    return False, penalty, 0.0, "Resolution declaration rejected: required checks have not passed."
405
+
406
def _mark_evidence_once(self, key: str, detail: str) -> bool:
    """Record ``detail`` as discovered evidence the first time ``key`` is seen.

    Returns ``True`` when the evidence was newly recorded and ``False`` when
    ``key`` had already been seen (nothing is appended in that case).
    """
    already_seen = self._episode["evidence_seen"]
    if key not in already_seen:
        already_seen.add(key)
        self._episode["discovered_evidence"].append(detail)
        return True
    return False
412
+
413
def _unsafe_penalty(self) -> float:
    """Return the scenario's ``unsafe_action_penalty`` coerced to ``float``."""
    reward_config = self._episode["scenario"]["reward_config"]
    return float(reward_config["unsafe_action_penalty"])
415
+
416
def _set_failure(self, failure_type: str, why_failed: str) -> None:
    """Store the latest failure category and its explanation on the episode."""
    self._episode.update(failure_type=failure_type, why_failed=why_failed)
419
+
420
def _advance_world(self) -> None:
    """Advance the simulated incident by one step.

    While the cause is still present: with nothing isolated, the scenario's
    degraded service states are re-applied and impact/burn are raised to at
    least the degraded levels; with a service isolated, containment is
    (re)marked as applied. The workflow stage is recomputed in every case.
    """
    episode = self._episode
    cause_removed = episode["cause_removed"]
    something_isolated = episode["isolated_service"] is not None

    if not cause_removed:
        if something_isolated:
            episode["containment_applied"] = True
        else:
            scenario = episode["scenario"]
            self._apply_service_updates(scenario.get("degraded_services", {}))
            for field, floor_key in (
                ("user_impact", "degraded_user_impact"),
                ("slo_burn_rate", "degraded_slo_burn"),
            ):
                current = episode[field]
                episode[field] = max(current, scenario.get(floor_key, current))

    episode["workflow_stage"] = self._workflow_stage()
431
+
432
def _refresh_alerts(self) -> None:
    """Rebuild the episode's alert list from current service health.

    Crashed services raise a critical alert and degraded ones a warning, in
    ``SERVICE_ORDER``. If user impact is at least 0.3 and no api-gateway
    alert exists yet, one extra api-gateway warning is appended.
    """
    # status -> (severity, word used in the alert message)
    alert_for_status = {
        "crashed": ("critical", "unavailable"),
        "degraded": ("warning", "degraded"),
    }
    alerts: list[Alert] = []
    for service_name in SERVICE_ORDER:
        status = self._episode["services"][service_name].status
        if status in alert_for_status:
            severity, condition = alert_for_status[status]
            alerts.append(Alert(service=service_name, severity=severity, message=f"{service_name} is {condition}."))

    gateway_alerted = any(alert.service == "api-gateway" for alert in alerts)
    if self._episode["user_impact"] >= 0.3 and not gateway_alerted:
        alerts.append(Alert(service="api-gateway", severity="warning", message="User-visible impact remains elevated."))
    self._episode["alerts"] = alerts
443
+
444
def _update_loop_feedback(self, action: UnifiedIncidentAction, progressed: bool) -> None:
    """Track consecutive no-progress repeats of the same action.

    An action that made progress resets the repeat counter. Otherwise the
    counter is incremented when the action equals the previous one and
    restarted at 1 when it differs; at two or more repeats a loop warning is
    stored on the episode.
    """
    # NOTE(review): this method never clears ``loop_warning``; presumably the
    # caller resets it each step - confirm.
    episode = self._episode
    action_key = repr(action.model_dump(exclude_none=True))

    if progressed:
        episode["last_action_key"] = action_key
        episode["repeat_count"] = 0
        return

    is_repeat = episode["last_action_key"] == action_key
    episode["repeat_count"] = episode["repeat_count"] + 1 if is_repeat else 1
    episode["last_action_key"] = action_key
    if episode["repeat_count"] >= 2:
        episode["loop_warning"] = "The same no-progress action has repeated; choose a different evidence source or remediation step."
457
+
458
def _workflow_stage(self) -> str:
    """Derive the workflow stage label from episode progress.

    Precedence: ``resolved`` once the incident is resolved, ``validation``
    once either check has passed, ``mitigation`` once containment, cause
    removal or isolation has happened, otherwise ``triage``.
    """
    episode = self._episode
    if episode["incident_resolved"]:
        return "resolved"

    checks = episode["checks"]
    any_check_passed = checks["database_recovery"].passed or checks["end_to_end"].passed
    if any_check_passed:
        return "validation"

    mitigating = (
        episode["containment_applied"]
        or episode["cause_removed"]
        or episode["isolated_service"] is not None
    )
    return "mitigation" if mitigating else "triage"
467
+
468
def _allowed_actions(self) -> list[str]:
    """Return a fresh list copy of ``ALL_ACTIONS``."""
    return [*ALL_ACTIONS]
470
+
471
def _required_fields_by_action(self) -> dict[str, list[str]]:
    """Map each currently allowed action to its required field names."""
    required: dict[str, list[str]] = {}
    for action_name in self._allowed_actions():
        required[action_name] = REQUIRED_FIELDS_BY_ACTION[action_name]
    return required
473
+
474
def _progress_flags(self) -> dict[str, bool]:
    """Summarize episode progress as a dict of flags.

    Covers containment, cause removal, the outcome of both checks, whether
    the incident is resolved, and whether any service is currently isolated.
    """
    episode = self._episode
    checks = episode["checks"]
    flags = {
        "containment_applied": episode["containment_applied"],
        "cause_removed": episode["cause_removed"],
    }
    flags["database_recovery"] = checks["database_recovery"].passed
    flags["end_to_end"] = checks["end_to_end"].passed
    flags["incident_resolved"] = episode["incident_resolved"]
    flags["isolation_applied"] = episode["isolated_service"] is not None
    return flags
484
+
485
def _incident_summary(self) -> str:
    """Return the scenario's description, or a generic summary when it is missing or empty."""
    generic_summary = (
        "An incident is degrading user traffic. Use evidence-gathering actions to diagnose, "
        "then choose a safe remediation and verify with explicit checks."
    )
    return self._episode["scenario"].get("description") or generic_summary
493
+
494
def _prompt_text(self, tool_output: str | None) -> str:
    """Render the plain-text prompt describing the current episode state.

    Sections appear in a fixed order: tick and stage header, incident
    summary, active alerts, per-service health (in ``SERVICE_ORDER``),
    impact and last-result fields, check status, and the allowed actions.
    Missing optional values are rendered as ``none``.
    """
    episode = self._episode

    header = [
        f"TICK {episode['tick']}/{episode['max_ticks']}",
        f"WORKFLOW_STAGE: {episode['workflow_stage']}",
        "",
        "INCIDENT_SUMMARY:",
        self._incident_summary(),
        "",
        "ACTIVE_ALERTS:",
    ]

    alert_lines = [
        f"- [{alert.severity.upper()}] {alert.service}: {alert.message}"
        for alert in episode["alerts"]
    ] if episode["alerts"] else ["- none"]

    service_lines = ["", "SERVICES:"]
    for service_name in SERVICE_ORDER:
        health = episode["services"][service_name]
        service_lines.append(
            f"- {service_name}: {health.status} cpu={health.cpu_pct:.1f} mem={health.memory_pct:.1f} err={health.error_rate_pct:.1f} latency={health.latency_ms:.1f}"
        )

    status_lines = [
        "",
        f"USER_IMPACT: {episode['user_impact']:.2f}",
        f"SLO_BURN_RATE: {episode['slo_burn_rate']:.2f}",
        f"LAST_ACTION_RESULT: {episode['last_action_result'] or 'none'}",
        f"TOOL_OUTPUT: {tool_output or 'none'}",
        f"FAILURE_TYPE: {episode['failure_type'] or 'none'}",
        f"WHY_FAILED: {episode['why_failed'] or 'none'}",
        "",
        "CHECKS:",
    ]

    check_lines = [
        f"- {check.name}: {'passed' if check.passed else 'pending'} - {check.detail}"
        for check in episode["checks"].values()
    ]

    action_lines = ["", "ALLOWED_ACTIONS:"]
    action_lines += [f"- {action}" for action in self._allowed_actions()]

    sections = (header, alert_lines, service_lines, status_lines, check_lines, action_lines)
    return "\n".join(line for section in sections for line in section)
536
+
537
def _incident_health_potential(self) -> float:
    """Return a weighted health score for the current state, rounded to 4 places.

    Combines weighted service status (0.55), relief from user impact (0.2),
    relief from SLO burn (0.15), and whether containment is applied (0.10).
    """
    episode = self._episode
    weights = episode["scenario"]["critical_service_weights"]
    services = episode["services"]

    operational = sum(
        weights.get(name, 0.0) * STATUS_VALUES[services[name].status]
        for name in weights
    )
    impact_relief = 1.0 - episode["user_impact"]
    burn_relief = 1.0 - episode["slo_burn_rate"]
    containment = 1.0 if episode["containment_applied"] else 0.0

    potential = (0.55 * operational) + (0.2 * impact_relief) + (0.15 * burn_relief) + (0.10 * containment)
    return round(potential, 4)
545
+
546
def _state_dict(self) -> dict[str, Any]:
    """Return a plain-dict snapshot of the episode.

    Model objects (alerts, service health, checks) are converted with
    ``model_dump`` and mutable containers are copied, so the snapshot does
    not alias the containers held on the episode. ``valid_action_example``
    is always ``None``.
    """
    ep = self._episode
    return {
        "episode_id": ep["episode_id"],
        "step_count": ep["step_count"],
        "scenario_id": ep["scenario"]["id"],
        "difficulty": ep["difficulty"],
        "current_tick": ep["tick"],
        "max_ticks": ep["max_ticks"],
        "workflow_stage": ep["workflow_stage"],
        "active_alerts": [alert.model_dump() for alert in ep["alerts"]],
        "service_health": {
            name: health.model_dump() for name, health in ep["services"].items()
        },
        "discovered_evidence": [*ep["discovered_evidence"]],
        "recent_deploys": [*ep["recent_deploys"]],
        "checks": [check.model_dump() for check in ep["checks"].values()],
        "user_impact": ep["user_impact"],
        "slo_burn_rate": ep["slo_burn_rate"],
        "incident_resolved": ep["incident_resolved"],
        "containment_applied": ep["containment_applied"],
        "allowed_actions": self._allowed_actions(),
        "required_fields_by_action": self._required_fields_by_action(),
        "valid_action_example": None,
        "progress_flags": self._progress_flags(),
        "final_score": ep["final_score"],
        "score_breakdown": {**ep["score_breakdown"]},
        "cumulative_reward": ep["cumulative_reward"],
        "wasteful_ticks": ep["wasteful_ticks"],
        "last_action_result": ep["last_action_result"],
        "failure_type": ep["failure_type"],
        "why_failed": ep["why_failed"],
    }
576
+
577
def _build_observation(self, last_action_result: str, tool_output: str | None, reward: float, done: bool) -> UnifiedIncidentObservation:
    """Assemble the observation returned to the agent for the current state.

    Episode containers are shallow-copied into the observation. The
    security-related fields and action hints are fixed placeholders, and
    ``reward`` is rounded to four decimal places.
    """
    ep = self._episode
    # NOTE(review): ``common_trap`` is filled from the scenario's
    # ``description`` (the same text used for the incident summary) -
    # confirm this is intended rather than a dedicated trap hint.
    common_trap = ep["scenario"].get("description")
    return UnifiedIncidentObservation(
        prompt_text=self._prompt_text(tool_output),
        incident_summary=self._incident_summary(),
        tick_count=ep["tick"],
        max_ticks=ep["max_ticks"],
        difficulty=ep["difficulty"],
        workflow_stage=ep["workflow_stage"],
        active_alerts=[*ep["alerts"]],
        service_health={**ep["services"]},
        discovered_evidence=[*ep["discovered_evidence"]],
        recent_deploys=[*ep["recent_deploys"]],
        checks=[*ep["checks"].values()],
        user_impact=ep["user_impact"],
        slo_burn_rate=ep["slo_burn_rate"],
        incident_resolved=ep["incident_resolved"],
        containment_applied=ep["containment_applied"],
        last_action_result=last_action_result,
        tool_output=tool_output,
        failure_type=ep["failure_type"],
        why_failed=ep["why_failed"],
        allowed_actions=self._allowed_actions(),
        required_fields_by_action=self._required_fields_by_action(),
        valid_action_example=None,
        common_trap=common_trap,
        loop_warning=ep["loop_warning"],
        blocked_until_security_complete=False,
        security_unlock_reason=None,
        best_recovery_action_family=None,
        progress_flags=self._progress_flags(),
        security_subquest_status=None,
        security_context={},
        final_score=ep["final_score"],
        score_breakdown={**ep["score_breakdown"]},
        reward=round(reward, 4),
        done=done,
    )
unified_incident_env/server/grader.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Deterministic public scoring for the honest narrow incident-remediation environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from ..models import GraderCheck, GraderReport
8
+
9
# Public scores are kept strictly inside (0, 1).
MIN_PUBLIC_SCORE = 0.01
MAX_PUBLIC_SCORE = 0.99


def _strict_public_score(score: float) -> float:
    """Clamp ``score`` to ``[MIN_PUBLIC_SCORE, MAX_PUBLIC_SCORE]`` and round to 4 places."""
    floored = max(MIN_PUBLIC_SCORE, score)
    clamped = min(MAX_PUBLIC_SCORE, floored)
    return round(clamped, 4)
15
+
16
+
17
def _service_score(status: str) -> float:
    """Map a service status string to its recovery credit (unknown statuses score 0.0)."""
    credit_by_status = dict(healthy=1.0, degraded=0.4, crashed=0.0, isolated=0.2)
    try:
        return credit_by_status[status]
    except KeyError:
        return 0.0
24
+
25
+
26
class UnifiedIncidentGrader:
    """Deterministic scorer focused on executed effects, not scripted clues."""

    def compute_breakdown(
        self,
        state: dict[str, Any],
        scenario: dict[str, Any],
    ) -> dict[str, float]:
        """Compute the per-component scores and the clamped final score.

        Components: weighted service recovery, containment (0.2, or 0.3 when
        the worker is also healthy), verification (0.15 for the database
        check plus 0.2 for the end-to-end check), impact relief (up to 0.15)
        and efficiency (0.10 minus 0.01 per wasteful tick, floored at 0).
        """
        services = state.get("service_health", {})
        weights = scenario["critical_service_weights"]

        def status_of(service: str, default: str | None = None) -> str | None:
            return (services.get(service) or {}).get("status", default)

        # A service missing from the state is scored as crashed.
        recovery_score = round(
            sum(
                weights.get(service, 0.0) * _service_score(status_of(service, "crashed"))
                for service in weights
            ),
            4,
        )

        if not state.get("containment_applied"):
            containment_score = 0.0
        elif status_of("worker") == "healthy":
            containment_score = 0.3
        else:
            containment_score = 0.2

        checks = {item.get("name"): bool(item.get("passed")) for item in state.get("checks", [])}
        verification_score = sum(
            (
                credit
                for check_name, credit in (("database_recovery", 0.15), ("end_to_end", 0.2))
                if checks.get(check_name)
            ),
            0.0,
        )

        user_impact = float(state.get("user_impact", 1.0))
        impact_score = round(max(0.0, 0.15 * (1.0 - user_impact)), 4)

        wasteful_ticks = int(state.get("wasteful_ticks", 0))
        efficiency_score = round(max(0.0, 0.10 - (0.01 * wasteful_ticks)), 4)

        total = recovery_score + containment_score + verification_score + impact_score + efficiency_score
        return {
            "recovery_score": recovery_score,
            "containment_score": round(containment_score, 4),
            "verification_score": round(verification_score, 4),
            "impact_score": impact_score,
            "efficiency_score": efficiency_score,
            "final_score": _strict_public_score(total),
        }

    def build_report(self, state: dict[str, Any], scenario: dict[str, Any]) -> GraderReport:
        """Build the grader report: overall verdict, score, and five weighted checks."""
        breakdown = self.compute_breakdown(state, scenario)
        checks = {item.get("name"): bool(item.get("passed")) for item in state.get("checks", [])}

        # NOTE(review): the verdict requires BOTH checks to have passed, so a
        # scenario resolved after only the end-to-end check would report
        # passed=False - confirm this stricter bar is intended for all scenarios.
        passed = bool(
            state.get("incident_resolved")
            and checks.get("database_recovery")
            and checks.get("end_to_end")
        )

        services_recovered = breakdown["recovery_score"] >= 0.8
        # (name, passed, detail when passed, detail when not passed, weight)
        check_specs = [
            (
                "root_cause_removed",
                bool(state.get("containment_applied")),
                "The root cause has been safely contained or removed.",
                "The root cause is still active or only partially contained.",
                0.30,
            ),
            (
                "database_recovery",
                checks.get("database_recovery", False),
                "The database recovery check passed.",
                "The database recovery check has not passed yet.",
                0.20,
            ),
            (
                "end_to_end_check",
                checks.get("end_to_end", False),
                "The end-to-end service check passed.",
                "The end-to-end service check has not passed yet.",
                0.20,
            ),
            (
                "critical_services_recovered",
                services_recovered,
                "Critical-path services are recovered.",
                "Critical-path services are still degraded or crashed.",
                0.20,
            ),
            (
                "declare_resolved",
                bool(state.get("incident_resolved")),
                "The agent declared the incident resolved after objective checks passed.",
                "The incident has not been safely declared resolved.",
                0.10,
            ),
        ]
        report_checks = [
            GraderCheck(
                name=name,
                passed=check_passed,
                detail=detail_ok if check_passed else detail_pending,
                weight=weight,
            )
            for name, check_passed, detail_ok, detail_pending, weight in check_specs
        ]

        if passed:
            message = "Incident diagnosed, remediated, and verified honestly."
        else:
            message = "Incident is not yet safely resolved."
        return GraderReport(
            scenario_id=scenario["id"],
            passed=passed,
            score=breakdown["final_score"],
            message=message,
            breakdown=breakdown,
            checks=report_checks,
        )
unified_incident_env/tests/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Tests for the unified incident environment."""
unified_incident_env/tests/test_environment.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Behavior and API tests for the honest narrow incident environment."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from fastapi.testclient import TestClient
6
+
7
+ from unified_incident_env.models import HypothesisPayload, UnifiedIncidentAction
8
+ from unified_incident_env.server import app as app_module
9
+ from unified_incident_env.server.challenge import DEFAULT_SCENARIO_ID, list_baselines
10
+ from unified_incident_env.server.environment import UnifiedIncidentEnvironment
11
+
12
+
13
def _run_baseline(env: UnifiedIncidentEnvironment):
    """Replay the first baseline of the default scenario; return the last observation."""
    env.reset(scenario_id=DEFAULT_SCENARIO_ID)
    baseline = list_baselines(DEFAULT_SCENARIO_ID).baselines[0]
    final_obs = None
    for scripted_step in baseline.actions:
        final_obs = env.step(scripted_step.action)
    return final_obs
19
+
20
+
21
def test_baseline_resolves_honestly() -> None:
    """The default-scenario baseline ends resolved with both checks passed."""
    final_obs = _run_baseline(UnifiedIncidentEnvironment())
    assert final_obs is not None
    assert final_obs.done is True
    assert final_obs.incident_resolved is True
    passed_by_name = {check.name: check.passed for check in final_obs.checks}
    assert passed_by_name["database_recovery"] is True
    assert passed_by_name["end_to_end"] is True
    assert final_obs.final_score > 0.7
31
+
32
+
33
def test_query_deploys_reveals_evidence_but_not_positive_reward() -> None:
    """Querying deploys surfaces the bad release without paying a positive reward."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id=DEFAULT_SCENARIO_ID)
    query = UnifiedIncidentAction(action_type="query_deploys", service="worker")
    result = env.step(query)
    assert result.reward <= 0.0
    assert "worker@2026.04.23-bad" in (result.tool_output or "")
    assert result.incident_resolved is False
40
+
41
+
42
def test_restart_database_before_rollback_is_negative() -> None:
    """Restarting the database while the cause persists is penalized and fixes nothing."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id=DEFAULT_SCENARIO_ID)
    restart = UnifiedIncidentAction(action_type="restart_service", service="database")
    result = env.step(restart)
    assert result.reward < 0.0
    assert result.failure_type == "premature_restart"
    assert result.incident_resolved is False
    assert result.service_health["database"].status == "crashed"
50
+
51
+
52
def test_duplicate_hypothesis_bonus_is_not_farmable() -> None:
    """Submitting the identical hypothesis twice earns nothing the second time."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id=DEFAULT_SCENARIO_ID)
    hypothesis = HypothesisPayload(
        root_cause="bad_worker_deploy",
        affected_services=["worker", "database", "api-gateway"],
        confidence=0.82,
        recommended_next_action="rollback_deploy",
    )
    submit = UnifiedIncidentAction(action_type="submit_hypothesis", hypothesis=hypothesis)
    first_obs = env.step(submit)
    repeat_obs = env.step(submit)
    assert first_obs.reward > repeat_obs.reward
    assert repeat_obs.reward <= 0.0
68
+
69
+
70
def test_isolating_worker_contains_but_does_not_resolve() -> None:
    """Isolation counts as containment, yet the end-to-end check still fails."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id=DEFAULT_SCENARIO_ID)

    after_isolation = env.step(UnifiedIncidentAction(action_type="isolate_service", service="worker"))
    assert after_isolation.containment_applied is True
    assert after_isolation.incident_resolved is False

    after_check = env.step(UnifiedIncidentAction(action_type="run_check", check_name="end_to_end"))
    passed_by_name = {check.name: check.passed for check in after_check.checks}
    assert passed_by_name["end_to_end"] is False
79
+
80
+
81
def test_declare_resolved_requires_checks() -> None:
    """Declaring resolution before any check passes is rejected and penalized."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id=DEFAULT_SCENARIO_ID)
    declaration = UnifiedIncidentAction(action_type="declare_resolved")
    result = env.step(declaration)
    assert result.reward < 0.0
    assert result.done is False
    assert result.failure_type == "premature_resolution"
88
+
89
+
90
def test_observation_exposes_bounded_actions_without_valid_example() -> None:
    """The reset observation lists the fixed action set and offers no worked example."""
    expected_actions = [
        "query_logs",
        "query_metrics",
        "query_dependencies",
        "query_deploys",
        "rollback_deploy",
        "restart_service",
        "run_check",
        "isolate_service",
        "escalate",
        "submit_hypothesis",
        "declare_resolved",
    ]
    env = UnifiedIncidentEnvironment()
    initial_obs = env.reset(scenario_id=DEFAULT_SCENARIO_ID)
    assert initial_obs.allowed_actions == expected_actions
    assert initial_obs.valid_action_example is None
107
+
108
+
109
def test_routes_expose_new_catalog_and_status(monkeypatch) -> None:
    """The HTTP routes expose the scenario catalog, baselines, health and status."""
    monkeypatch.setenv("ENABLE_WEB_INTERFACE", "false")
    client = TestClient(app_module.create_compatible_app())
    all_difficulties = {"easy", "medium", "hard"}
    expected_scenarios = {"worker_deploy_cascade", "db_config_rollout", "gateway_auth_rollout"}

    tasks_response = client.get("/tasks")
    assert tasks_response.status_code == 200
    catalog = tasks_response.json()
    assert catalog["default_scenario_id"] == DEFAULT_SCENARIO_ID
    listed_difficulties = {scenario["difficulty"] for scenario in catalog["scenarios"]}
    assert all_difficulties.issubset(listed_difficulties)
    assert all_difficulties.issubset(set(catalog["available_difficulties"]))

    baseline_response = client.get("/baseline")
    assert baseline_response.status_code == 200
    baseline_ids = {item["scenario_id"] for item in baseline_response.json()["baselines"]}
    assert expected_scenarios.issubset(baseline_ids)

    health_response = client.get("/health")
    assert health_response.status_code == 200
    assert health_response.json()["status"] in {"ok", "healthy"}

    status_response = client.get("/status")
    assert status_response.status_code == 200
    status_body = status_response.json()
    assert status_body["progress"]["scenario_id"] == DEFAULT_SCENARIO_ID
    assert status_body["grader"]["score"] > 0.0
136
+
137
+
138
def _run_baseline_for_scenario(scenario_id: str):
    """Replay the first baseline of ``scenario_id`` in a fresh environment; return the last observation."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id=scenario_id)
    baseline = list_baselines(scenario_id).baselines[0]
    final_obs = None
    for scripted_step in baseline.actions:
        final_obs = env.step(scripted_step.action)
    return final_obs
145
+
146
+
147
def test_medium_baseline_resolves_honestly() -> None:
    """The ``db_config_rollout`` baseline ends resolved with both checks passed."""
    final_obs = _run_baseline_for_scenario("db_config_rollout")
    assert final_obs is not None
    assert final_obs.done is True
    assert final_obs.incident_resolved is True
    passed_by_name = {check.name: check.passed for check in final_obs.checks}
    assert passed_by_name["database_recovery"] is True
    assert passed_by_name["end_to_end"] is True
    assert final_obs.final_score > 0.7
156
+
157
+
158
def test_hard_baseline_resolves_honestly() -> None:
    """The ``gateway_auth_rollout`` baseline ends resolved with the end-to-end check passed."""
    final_obs = _run_baseline_for_scenario("gateway_auth_rollout")
    assert final_obs is not None
    assert final_obs.done is True
    assert final_obs.incident_resolved is True
    passed_by_name = {check.name: check.passed for check in final_obs.checks}
    assert passed_by_name["end_to_end"] is True
    assert final_obs.final_score > 0.7
166
+
167
+
168
def test_medium_wrong_rollback_target_is_penalized() -> None:
    """Rolling back the worker in ``db_config_rollout`` is penalized as the wrong target."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id="db_config_rollout")
    wrong_rollback = UnifiedIncidentAction(action_type="rollback_deploy", service="worker")
    result = env.step(wrong_rollback)
    assert result.reward < 0.0
    assert result.failure_type == "wrong_remediation_target"
    assert result.incident_resolved is False
175
+
176
+
177
def test_hard_wrong_rollback_target_is_penalized() -> None:
    """Rolling back the worker in ``gateway_auth_rollout`` is penalized as the wrong target."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id="gateway_auth_rollout")
    wrong_rollback = UnifiedIncidentAction(action_type="rollback_deploy", service="worker")
    result = env.step(wrong_rollback)
    assert result.reward < 0.0
    assert result.failure_type == "wrong_remediation_target"
183
+
184
+
185
def test_hard_does_not_require_database_recovery_check() -> None:
    """``gateway_auth_rollout`` can be resolved after only the end-to-end check."""
    env = UnifiedIncidentEnvironment()
    env.reset(scenario_id="gateway_auth_rollout")
    env.step(UnifiedIncidentAction(action_type="rollback_deploy", service="api-gateway"))

    after_check = env.step(UnifiedIncidentAction(action_type="run_check", check_name="end_to_end"))
    end_to_end_passed = any(
        check.name == "end_to_end" and check.passed for check in after_check.checks
    )
    assert end_to_end_passed

    after_declaration = env.step(UnifiedIncidentAction(action_type="declare_resolved"))
    assert after_declaration.incident_resolved is True
unified_incident_env/tests/test_submission_inference.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import inference
4
+ from unified_incident_env.models import Alert, CheckResult, ServiceHealth, UnifiedIncidentObservation
5
+
6
+
7
def make_observation(**overrides: object) -> UnifiedIncidentObservation:
    """Build an observation fixture for the inference tests.

    Keyword arguments replace the matching default field before the
    observation model is constructed.
    """
    alerts = [
        Alert(service="database", severity="critical", message="database crashing"),
        Alert(service="worker", severity="warning", message="worker retry volume elevated"),
    ]

    # Columns: name, status, cpu_pct, memory_pct, error_rate_pct, latency_ms
    health_rows = [
        ("api-gateway", "degraded", 61.0, 38.0, 24.0, 640.0),
        ("cache", "healthy", 18.0, 24.0, 0.0, 14.0),
        ("database", "crashed", 99.0, 97.0, 100.0, 0.0),
        ("worker", "degraded", 88.0, 71.0, 19.0, 420.0),
    ]
    service_health = {
        name: ServiceHealth(
            name=name,
            status=status,
            cpu_pct=cpu,
            memory_pct=memory,
            error_rate_pct=error_rate,
            latency_ms=latency,
        )
        for name, status, cpu, memory, error_rate, latency in health_rows
    }

    pending_checks = [
        CheckResult(name="database_recovery", passed=False, detail="Database recovery has not been verified yet."),
        CheckResult(name="end_to_end", passed=False, detail="End-to-end health has not been verified yet."),
    ]

    # Insertion order doubles as the order of the allowed-actions list below.
    required_fields = {
        "query_logs": ["service"],
        "query_metrics": ["service", "metric"],
        "query_dependencies": ["service"],
        "query_deploys": ["service"],
        "rollback_deploy": ["service"],
        "restart_service": ["service"],
        "run_check": ["check_name"],
        "isolate_service": ["service"],
        "escalate": [],
        "submit_hypothesis": ["hypothesis"],
        "declare_resolved": [],
    }

    fields: dict[str, object] = dict(
        prompt_text="Honest incident prompt",
        incident_summary="Worker deploy is overloading the database.",
        tick_count=0,
        max_ticks=12,
        difficulty="easy",
        workflow_stage="triage",
        active_alerts=alerts,
        service_health=service_health,
        discovered_evidence=[],
        recent_deploys=["Rolled out worker@2026.04.23-bad 12 minutes ago."],
        checks=pending_checks,
        user_impact=0.82,
        slo_burn_rate=0.91,
        incident_resolved=False,
        containment_applied=False,
        last_action_result="",
        tool_output=None,
        failure_type=None,
        why_failed=None,
        allowed_actions=list(required_fields),
        required_fields_by_action=required_fields,
        valid_action_example=None,
        common_trap=None,
        loop_warning=None,
        blocked_until_security_complete=False,
        security_unlock_reason=None,
        best_recovery_action_family=None,
        progress_flags={},
        security_subquest_status=None,
        security_context={},
        final_score=0.1,
        score_breakdown={"final_score": 0.1},
        reward=0.0,
        done=False,
    )
    fields.update(overrides)
    return UnifiedIncidentObservation(**fields)
81
+
82
+
83
def test_log_helpers_match_required_format(capsys) -> None:
    """The three log helpers must print the exact ``[START]``/``[STEP]``/``[END]`` lines."""
    expected_lines = [
        "[START] task=worker_deploy_cascade env=unified-incident-env model=demo-model",
        '[STEP] step=2 action={"action_type":"query_logs","service":"database"} reward=-0.01 done=false error=null',
        "[END] success=true steps=2 score=0.37 rewards=-0.01,0.27",
    ]
    action_json = '{"action_type":"query_logs","service":"database"}'

    inference.log_start(task="worker_deploy_cascade", env="unified-incident-env", model="demo-model")
    inference.log_step(step=2, action=action_json, reward=-0.01, done=False, error=None)
    inference.log_end(success=True, steps=2, score=0.37, rewards=[-0.01, 0.27])

    stdout_text = capsys.readouterr().out
    assert stdout_text.strip().splitlines() == expected_lines
93
+
94
+
95
def test_parse_action_accepts_valid_json() -> None:
    """A complete JSON action string parses into the equivalent action model."""
    expected = inference.UnifiedIncidentAction(action_type="query_deploys", service="worker")
    parsed = inference.parse_action(
        '{"action_type":"query_deploys","service":"worker"}',
        make_observation(),
    )
    assert parsed == expected
99
+
100
+
101
def test_parse_action_rejects_incomplete_metric_query() -> None:
    """``query_metrics`` without a ``metric`` field must parse to ``None``."""
    incomplete_payload = '{"action_type":"query_metrics","service":"database"}'
    parsed = inference.parse_action(incomplete_payload, make_observation())
    assert parsed is None
104
+
105
+
106
def test_build_user_prompt_includes_public_state_without_examples() -> None:
    """The user prompt carries the section headers but no example or deploy string."""
    prompt = inference.build_user_prompt(make_observation())

    for required_marker in ("Incident summary:", "Allowed actions:", "Required fields:"):
        assert required_marker in prompt

    # Neither a worked example nor the fixture's recent-deploy text may appear.
    for forbidden_marker in ("Valid example", "worker@2026.04.23-bad"):
        assert forbidden_marker not in prompt
114
+
115
+
116
def test_build_fallback_action_prefers_public_deploy_query() -> None:
    """For the default fixture the fallback action is a deploy query on ``worker``."""
    expected = inference.UnifiedIncidentAction(action_type="query_deploys", service="worker")
    fallback = inference.build_fallback_action(make_observation())
    assert fallback == expected
unified_incident_env/tests/test_trainer.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Smoke tests for reusable trainer-shell pieces after the v2 pivot."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from unified_incident_env.trainer.trajectory_memory import CorrectionMemory
8
+ from unified_incident_env.trainer.trajectory_store import TrajectoryStore
9
+ from unified_incident_env.trainer.types import EpisodeRecord, StepRecord
10
+
11
+
12
def test_correction_memory_empty_prompt_is_safe() -> None:
    """A freshly constructed ``CorrectionMemory`` still yields a string addendum."""
    fresh_memory = CorrectionMemory()
    prompt_addendum = fresh_memory.build_prompt_addendum("worker_deploy_cascade", "triage")
    assert isinstance(prompt_addendum, str)
16
+
17
+
18
def test_trajectory_store_roundtrip(tmp_path: Path) -> None:
    """An appended episode can be loaded back from the JSONL-backed store."""
    only_step = StepRecord(
        step_index=1,
        tick=1,
        workflow_stage="triage",
        observation={},
        prompt_text="prompt",
        raw_model_output="{}",
        parse_status="invalid_json",
        reward=None,
    )
    episode = EpisodeRecord(
        run_id="run-1",
        scenario_id="worker_deploy_cascade",
        difficulty="easy",
        model_name="stub",
        mode="strict",
        success=False,
        final_score=0.1,
        steps=1,
        elapsed_s=0.01,
        step_records=[only_step],
    )

    store = TrajectoryStore(tmp_path / "episodes.jsonl")
    store.append_episode(episode)

    reloaded = store.load_episodes()
    assert len(reloaded) == 1
    assert reloaded[0].scenario_id == "worker_deploy_cascade"
unified_incident_env/tests/test_trainer_session.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Smoke tests for session/report shells after the v2 pivot."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from unified_incident_env.trainer.reporting import build_phase_deltas
6
+ from unified_incident_env.trainer.types import SessionPhaseReport
7
+
8
+
9
def test_build_phase_deltas_handles_simple_progression() -> None:
    """Two phases (probe -> final_evaluation) yield a score delta of about 0.6."""
    import math  # local import: only this assertion needs it

    probe_phase = SessionPhaseReport(
        phase_name="probe",
        episode_ids=[1, 2],
        avg_score=0.2,
        success_rate=0.0,
        schema_failures=1,
        loop_failures=1,
        updates_applied=[],
    )
    final_phase = SessionPhaseReport(
        phase_name="final_evaluation",
        episode_ids=[3, 4],
        avg_score=0.8,
        success_rate=1.0,
        schema_failures=0,
        loop_failures=0,
        updates_applied=[],
    )

    deltas = build_phase_deltas([probe_phase, final_phase])

    assert deltas[1].phase_name == "final_evaluation"
    # 0.8 - 0.2 evaluates to 0.6000000000000001 in binary floating point, so an
    # exact ``== 0.6`` only passes if the implementation happens to round.
    # Compare with a tolerance so the test pins the value, not the rounding.
    assert math.isclose(deltas[1].score_delta, 0.6, abs_tol=1e-9)
unified_incident_env/trainer/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """Trainer package namespace.
2
+
3
+ This package intentionally avoids eager importing of legacy trainer flows so the
4
+ honest v2 environment can reuse shell utilities without pulling in deprecated
5
+ benchmark-specific modules at import time.
6
+ """
7
+
8
+ __all__: list[str] = []
unified_incident_env/trainer/action_adapter.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Strict and lenient action parsers for training and eval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any
7
+
8
+ from ..models import ActionType, UnifiedIncidentAction
9
+ from .types import ParseResult
10
+
11
# Keys the parsers keep from a model-emitted JSON object. Any other key is
# dropped and the result is labelled "extra_keys_stripped".
# NOTE(review): "check_name" and "hypothesis" were added because the v2 action
# schema exercised by the tests uses them (run_check / submit_hypothesis);
# without them those fields were stripped before validation. Confirm against
# the UnifiedIncidentAction model.
_ALLOWED_KEYS = {
    "action_type",
    "service",
    "metric",
    "check_name",
    "hypothesis",
    "vulnerability_type",
    "patch_id",
    "postmortem",
}

# Action names accepted for the "action" -> "action_type" alias repair and for
# the lenient bare-string wrapping. The first group is the original set; the
# second group adds the v2 action names listed in the observation fixtures.
_KNOWN_ACTIONS: set[str] = {
    "query_logs",
    "query_metrics",
    "query_dependencies",
    "restart_service",
    "rollback_deploy",
    "inspect_code",
    "classify_vulnerability",
    "apply_patch",
    "verify_security_fix",
    "submit_security_fix",
    "submit_postmortem",
    # v2 actions
    "query_deploys",
    "run_check",
    "isolate_service",
    "escalate",
    "submit_hypothesis",
    "declare_resolved",
}
32
+
33
+
34
def _extract_json_text(raw_text: str) -> str:
    """Pull the most likely JSON object text out of a raw model reply.

    If the reply contains a Markdown code fence, the text after the first
    fence is used (minus a leading ``json`` tag). The result is then narrowed
    to the span from the first ``{`` to the last ``}`` when such a span
    exists, and finally stripped of surrounding whitespace.
    """
    candidate = raw_text.strip()
    if "```" in candidate:
        segments = candidate.split("```")
        if len(segments) >= 2:
            candidate = segments[1]
            if candidate.startswith("json"):
                candidate = candidate[len("json"):]

    first_brace = candidate.find("{")
    last_brace = candidate.rfind("}")
    # ``first < last`` with ``first`` found implies ``last`` was found as well.
    if -1 < first_brace < last_brace:
        candidate = candidate[first_brace : last_brace + 1]
    return candidate.strip()
47
+
48
+
49
def _compact_action(action: UnifiedIncidentAction) -> dict[str, Any]:
    """Dump an action to a dict without ``None`` fields or an empty ``metadata``."""
    dumped = action.model_dump(exclude_none=True)
    # An empty metadata mapping carries no information, so leave it out.
    if "metadata" in dumped and dumped["metadata"] == {}:
        del dumped["metadata"]
    return dumped
54
+
55
+
56
def _wrap_bare_action(bare: str) -> ParseResult:
    """Turn a bare action name into a full action, reported as a repair.

    Returns an ``invalid_action`` result instead of raising when the action
    model rejects the name (for example because other fields are required).
    """
    labels = ["bare_action_wrapped"]
    try:
        action = UnifiedIncidentAction(action_type=bare)
    except Exception as exc:  # model validation failure; surfaced as a parse result
        return ParseResult(
            parse_status="invalid_action",
            error=str(exc),
            repair_labels=labels,
        )
    return ParseResult(
        parse_status="repaired",
        cleaned_action=_compact_action(action),
        repair_labels=labels,
    )


def _parse_json_action(raw_text: str) -> ParseResult:
    """Parse a JSON action object, applying the small alias repairs.

    Repairs applied, each recorded in ``repair_labels``:
    unknown keys are stripped; ``action`` is accepted as an alias for
    ``action_type`` when it names a known action; ``vulnerability`` is
    accepted for ``vulnerability_type``; a one-element ``metrics`` list is
    accepted for ``metric``. Any other ``metrics`` value is rejected as
    ambiguous.
    """
    try:
        data = json.loads(_extract_json_text(raw_text))
    except Exception as exc:
        return ParseResult(parse_status="invalid_json", error=type(exc).__name__)

    if not isinstance(data, dict):
        return ParseResult(parse_status="invalid_action", error="root must be object")

    repaired_labels: list[str] = []
    cleaned: dict[str, Any] = {k: v for k, v in data.items() if k in _ALLOWED_KEYS}
    repaired = cleaned != data
    if repaired:
        repaired_labels.append("extra_keys_stripped")

    action_alias = data.get("action")
    if (
        "action_type" not in cleaned
        and isinstance(action_alias, str)
        and action_alias in _KNOWN_ACTIONS
    ):
        cleaned["action_type"] = action_alias
        repaired = True
        repaired_labels.append("action_alias_normalized")

    vulnerability_alias = data.get("vulnerability")
    if "vulnerability_type" not in cleaned and isinstance(vulnerability_alias, str):
        cleaned["vulnerability_type"] = vulnerability_alias
        repaired = True
        repaired_labels.append("vulnerability_alias_normalized")

    metrics_value = data.get("metrics")
    single_metric = isinstance(metrics_value, list) and len(metrics_value) == 1
    if "metric" not in cleaned and single_metric:
        cleaned["metric"] = metrics_value[0]
        repaired = True
        repaired_labels.append("metric_list_normalized")

    # A "metrics" key that is not exactly a one-element list cannot be mapped
    # onto the single "metric" field without guessing.
    if "metrics" in data and not single_metric:
        return ParseResult(
            parse_status="invalid_action",
            error="metrics alias is ambiguous",
            repair_labels=repaired_labels,
        )

    try:
        action = UnifiedIncidentAction(**cleaned)
    except Exception as exc:
        return ParseResult(
            parse_status="invalid_action",
            error=str(exc),
            repair_labels=repaired_labels,
        )

    return ParseResult(
        parse_status="repaired" if repaired else "ok",
        cleaned_action=_compact_action(action),
        repair_labels=repaired_labels,
    )


class StrictActionParser:
    """Exact parser for judge-style evaluation."""

    # Only these argument-free actions may be given as a bare string.
    _BARE_ACTIONS = frozenset(
        {"inspect_code", "verify_security_fix", "submit_security_fix"}
    )

    def parse(self, raw_text: str) -> ParseResult:
        """Parse ``raw_text`` into a ``ParseResult``; never raises on bad input.

        A bare (optionally quoted) action name from ``_BARE_ACTIONS`` is
        wrapped into a full action. A validation failure on that path is now
        reported as ``invalid_action`` rather than propagating, matching
        ``LenientActionAdapter``.
        """
        bare = raw_text.strip().strip('"').strip("'")
        if bare in self._BARE_ACTIONS:
            return _wrap_bare_action(bare)
        return _parse_json_action(raw_text)


class LenientActionAdapter:
    """Training-time parser that repairs small schema mistakes only."""

    def parse(self, raw_text: str) -> ParseResult:
        """Parse ``raw_text`` into a ``ParseResult``; never raises on bad input.

        Any bare (optionally quoted) name found in ``_KNOWN_ACTIONS`` is
        wrapped into a full action; everything else goes through the shared
        JSON path.
        """
        bare = raw_text.strip().strip('"').strip("'")
        if bare in _KNOWN_ACTIONS:
            return _wrap_bare_action(bare)
        return _parse_json_action(raw_text)
unified_incident_env/trainer/analyze_failures.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Failure analysis for episode trajectories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import Counter
6
+
7
+ from .types import EpisodeRecord, FailureAnalysisReport, FailureBucketEntry, StepRecord
8
+
9
+ _INFRA_ACTIONS = {"restart_service", "rollback_deploy"}
10
+
11
+
12
def analyze_episode(record: EpisodeRecord) -> FailureAnalysisReport:
    """Classify one episode into schema, policy, looping, and reasoning buckets.

    Per-step entries come first (in step order), followed by episode-level
    entries. Each ``*_failures`` list holds the sorted distinct failure types
    of its bucket; ``summary`` holds the entry count per bucket.
    """
    buckets = ("schema", "policy", "looping", "reasoning")

    entries: list[FailureBucketEntry] = [
        entry
        for step in record.step_records
        for entry in _classify_step(record, step)
    ]
    entries += _classify_episode_level(record)

    distinct_types = {
        bucket: sorted({entry.failure_type for entry in entries if entry.bucket == bucket})
        for bucket in buckets
    }
    per_bucket_count = Counter(entry.bucket for entry in entries)

    return FailureAnalysisReport(
        episode_ids=[record.episode_id or 0],
        scenario_ids=[record.scenario_id],
        entries=entries,
        schema_failures=distinct_types["schema"],
        policy_failures=distinct_types["policy"],
        looping_failures=distinct_types["looping"],
        reasoning_failures=distinct_types["reasoning"],
        summary={bucket: per_bucket_count.get(bucket, 0) for bucket in buckets},
    )
42
+
43
+
44
def analyze_block(records: list[EpisodeRecord]) -> FailureAnalysisReport:
    """Combine multiple episode analyses into one block report.

    Entries are concatenated in record order; the per-bucket failure-type
    lists and counts are recomputed over the combined entries.
    """
    buckets = ("schema", "policy", "looping", "reasoning")

    entries = []
    for record in records:
        entries.extend(analyze_episode(record).entries)

    distinct_types = {
        bucket: sorted({entry.failure_type for entry in entries if entry.bucket == bucket})
        for bucket in buckets
    }
    per_bucket_count = Counter(entry.bucket for entry in entries)

    return FailureAnalysisReport(
        episode_ids=[record.episode_id or 0 for record in records],
        scenario_ids=[record.scenario_id for record in records],
        entries=entries,
        schema_failures=distinct_types["schema"],
        policy_failures=distinct_types["policy"],
        looping_failures=distinct_types["looping"],
        reasoning_failures=distinct_types["reasoning"],
        summary={bucket: per_bucket_count.get(bucket, 0) for bucket in buckets},
    )
64
+
65
+
66
def _classify_step(record: EpisodeRecord, step: StepRecord) -> list[FailureBucketEntry]:
    """Return zero or one failure entry describing what went wrong at ``step``.

    Unparseable output yields a ``schema`` entry. Otherwise the student's
    cleaned action is compared with the teacher action; when both exist and
    differ, the first matching rule below decides the bucket and failure type.
    """

    def make_entry(bucket: str, failure_type: str, detail: str) -> FailureBucketEntry:
        return FailureBucketEntry(
            episode_id=record.episode_id or 0,
            scenario_id=record.scenario_id,
            step_index=step.step_index,
            bucket=bucket,
            failure_type=failure_type,
            detail=detail,
        )

    if step.parse_status in {"invalid_json", "invalid_action"}:
        return [
            make_entry(
                "schema",
                _schema_failure_type(step),
                step.failure_reason or "schema failure",
            )
        ]

    student = step.cleaned_action or {}
    teacher = step.teacher_action or {}
    # Nothing to compare, or the student matched the teacher exactly.
    if not teacher or not student or student == teacher:
        return []

    student_type = student.get("action_type")
    teacher_type = teacher.get("action_type")
    student_is_infra = student_type in _INFRA_ACTIONS
    teacher_is_infra = teacher_type in _INFRA_ACTIONS

    bucket = "policy"
    if student_type == "classify_vulnerability":
        bucket = "reasoning"
        if teacher_type == "classify_vulnerability":
            failure_type = "wrong_vulnerability"
        else:
            failure_type = "fails_to_identify_real_vulnerability"
    elif student_type == "apply_patch" and teacher_type == "apply_patch":
        failure_type = "wrong_patch"
    elif student_type == "verify_security_fix" and teacher_type != "verify_security_fix":
        failure_type = "verify_too_early"
    elif student_type == "submit_security_fix" and teacher_type != "submit_security_fix":
        failure_type = "submit_too_early"
    elif student_is_infra and not teacher_is_infra:
        failure_type = "infra_before_security"
    elif student_is_infra and teacher_is_infra:
        failure_type = {
            "restart_service": "wrong_restart",
            "rollback_deploy": "wrong_rollback",
        }.get(student_type, "wrong_service")
    else:
        failure_type = "wrong_action_choice"

    return [make_entry(bucket, failure_type, f"student={student} teacher={teacher}")]
188
+
189
+
190
def _classify_episode_level(record: EpisodeRecord) -> list[FailureBucketEntry]:
    """Return looping entries that only show up across a whole episode.

    One ``repeated_same_action`` entry is emitted for every step whose
    non-empty cleaned action equals the previous step's cleaned action. A
    further entry is added when ``stopped_reason`` names a stage the episode
    ended in (diagnosis/root-cause analysis, or the security subquest).
    """

    def looping_entry(step_index, failure_type: str, detail: str) -> FailureBucketEntry:
        return FailureBucketEntry(
            episode_id=record.episode_id or 0,
            scenario_id=record.scenario_id,
            step_index=step_index,
            bucket="looping",
            failure_type=failure_type,
            detail=detail,
        )

    found: list[FailureBucketEntry] = []

    last_action = None
    for step in record.step_records:
        this_action = step.cleaned_action
        # Every immediate repeat is reported, starting with the first one.
        if this_action and this_action == last_action:
            found.append(
                looping_entry(step.step_index, "repeated_same_action", f"action={this_action}")
            )
        last_action = this_action

    stopped = record.stopped_reason or ""
    stuck_type = None
    if stopped in {"diagnosis", "root_cause_analysis"}:
        stuck_type = "stuck_in_diagnosis"
    elif stopped == "security_subquest":
        stuck_type = "stuck_in_security_subquest"
    if stuck_type is not None:
        found.append(looping_entry(None, stuck_type, f"stopped_reason={stopped}"))

    return found
238
+
239
+
240
def _schema_failure_type(step: StepRecord) -> str:
    """Name the kind of schema failure from the raw output and error text.

    Matching is case-insensitive substring search, checked in priority order:
    unsupported extra fields, wrong field names, missing required fields,
    then a fallback on the parse status.
    """
    output_text = step.raw_model_output.lower()
    reason_text = (step.failure_reason or "").lower()

    def output_mentions(*needles: str) -> bool:
        return any(needle in output_text for needle in needles)

    if output_mentions('"reason"', '"details"') or "extra_forbidden" in reason_text:
        return "extra_unsupported_fields"
    if output_mentions('"services"', '"metrics"') or "field required" in reason_text:
        return "wrong_field_names"
    if any(word in reason_text for word in ("required", "missing")):
        return "missing_required_fields"
    return "invalid_json" if step.parse_status == "invalid_json" else "invalid_action"
unified_incident_env/trainer/backend.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Backend interfaces for model calls."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import time
7
+ from typing import Protocol
8
+ from urllib.parse import urlparse
9
+
10
+ from openai import OpenAI
11
+
12
+ from .types import ModelRequest, ModelResponse
13
+
14
+
15
class ModelBackend(Protocol):
    """Structural interface a model backend must satisfy for trainer use."""

    def complete(self, request: ModelRequest) -> ModelResponse:
        """Run one request and return the raw model text plus metadata."""
        ...
20
+
21
+
22
class OpenAICompatibleBackend:
    """OpenAI-compatible backend, suitable for Ollama and similar servers."""

    def __init__(
        self,
        *,
        base_url: str,
        api_key: str,
        timeout_s: float = 90.0,
    ) -> None:
        """Create the underlying ``OpenAI`` client for ``base_url``."""
        self.base_url = base_url
        self._client = OpenAI(api_key=api_key, base_url=base_url, timeout=timeout_s)

    def complete(self, request: ModelRequest) -> ModelResponse:
        """Send one chat-completion request and return its text and latency.

        ``structured_mode == "backend_adaptive"`` picks JSON response
        formatting for Ollama-looking URLs and tool calling otherwise. If an
        adaptive tool-calling request fails for any reason, it is retried once
        in JSON mode; every other failure propagates to the caller.
        """
        started = time.perf_counter()
        create_kwargs: dict[str, object] = {
            "model": request.model_name,
            "temperature": request.temperature,
            "max_tokens": request.max_tokens,
            "messages": [
                {"role": "system", "content": request.system_prompt},
                {"role": "user", "content": request.user_prompt},
            ],
        }
        raw_text = ""
        actual_mode = request.structured_mode

        if request.structured_mode == "backend_adaptive":
            actual_mode = "response_format_json" if self._is_ollama() else "tool_calling"

        try:
            if actual_mode == "tool_calling":
                create_kwargs["tools"] = request.tools or [
                    self._tool_from_response_format(request.response_format)
                ]
                create_kwargs["tool_choice"] = request.tool_choice or {
                    "type": "function",
                    "function": {"name": "emit_action"},
                }
                response = self._client.chat.completions.create(**create_kwargs)
                raw_text = self._extract_tool_text(response)
            elif actual_mode == "response_format_json":
                self._apply_json_mode(create_kwargs, request.response_format)
                response = self._client.chat.completions.create(**create_kwargs)
                raw_text = response.choices[0].message.content or ""
            else:
                response = self._client.chat.completions.create(**create_kwargs)
                raw_text = response.choices[0].message.content or ""
        except Exception:
            adaptive_tool_call = (
                request.structured_mode == "backend_adaptive"
                and actual_mode == "tool_calling"
            )
            if not adaptive_tool_call:
                raise
            # Best-effort retry without tools: the server may not support
            # tool calling at all, so fall back to plain JSON output.
            fallback_kwargs = {
                key: value
                for key, value in create_kwargs.items()
                if key not in ("tools", "tool_choice")
            }
            self._apply_json_mode(fallback_kwargs, request.response_format)
            response = self._client.chat.completions.create(**fallback_kwargs)
            raw_text = response.choices[0].message.content or ""
            actual_mode = "response_format_json"

        elapsed = time.perf_counter() - started
        return ModelResponse(
            raw_text=raw_text,
            latency_s=round(elapsed, 4),
            metadata={
                "model": request.model_name,
                "structured_mode": actual_mode,
            },
        )

    def _apply_json_mode(
        self,
        kwargs: dict[str, object],
        response_format: dict[str, object] | None,
    ) -> None:
        """Add the JSON-output option to ``kwargs`` in the form the server expects."""
        if self._is_ollama():
            kwargs["extra_body"] = {"format": self._ollama_format_payload(response_format)}
        else:
            kwargs["response_format"] = response_format or {"type": "json_object"}

    def _is_ollama(self) -> bool:
        """Guess from the base URL's host/port whether the server is Ollama.

        This is a substring heuristic: any local host, the port ``11434``, or
        a host containing ``ollama`` counts.
        """
        host = urlparse(self.base_url).netloc.lower()
        return any(marker in host for marker in ("11434", "ollama", "127.0.0.1", "localhost"))

    def _ollama_format_payload(
        self,
        response_format: dict[str, object] | None,
    ) -> object:
        """Return the value for Ollama's ``format`` option.

        The embedded JSON schema is used when ``response_format`` is a
        ``json_schema`` spec carrying one; otherwise the plain ``"json"`` mode.
        """
        if response_format and response_format.get("type") == "json_schema":
            json_schema = response_format.get("json_schema", {})
            if isinstance(json_schema, dict):
                return json_schema.get("schema", "json")
        return "json"

    def _tool_from_response_format(
        self,
        response_format: dict[str, object] | None,
    ) -> dict[str, object]:
        """Build the ``emit_action`` tool definition for tool-calling mode.

        The tool parameters come from the ``json_schema`` spec when it is
        well-formed; otherwise a minimal schema requiring only
        ``action_type`` is used.
        """
        schema: object = {
            "type": "object",
            "properties": {"action_type": {"type": "string"}},
            "required": ["action_type"],
            "additionalProperties": False,
        }
        if response_format and response_format.get("type") == "json_schema":
            json_schema = response_format.get("json_schema", {})
            # Guard against a malformed spec, as _ollama_format_payload does;
            # previously a non-dict value raised AttributeError here.
            if isinstance(json_schema, dict):
                schema = json_schema.get("schema", schema)
        return {
            "type": "function",
            "function": {
                "name": "emit_action",
                "description": "Emit exactly one structured environment action.",
                "parameters": schema,
            },
        }

    def _extract_tool_text(self, response) -> str:
        """Return the first tool call's arguments, else the message content as text."""
        message = response.choices[0].message
        tool_calls = getattr(message, "tool_calls", None) or []
        if tool_calls:
            function = getattr(tool_calls[0], "function", None)
            if function is not None and getattr(function, "arguments", None):
                return function.arguments

        content = message.content or ""
        if isinstance(content, list):
            # Content given as parts: keep only the text fragments.
            return "".join(
                item.get("text", "")
                for item in content
                if isinstance(item, dict) and item.get("type") == "text"
            )
        if isinstance(content, str):
            return content
        return json.dumps(content)
unified_incident_env/trainer/build_datasets.py ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build correction datasets from trajectories and failure analyses."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from .build_sft_dataset import build_baseline_records
9
+ from .trajectory_store import TrajectoryStore
10
+ from .types import EpisodeRecord, FailureAnalysisReport, SFTRecord
11
+
12
+
13
+ def build_schema_repair_records(
14
+ episodes: list[EpisodeRecord],
15
+ analyses: list[FailureAnalysisReport],
16
+ ) -> list[SFTRecord]:
17
+ rows: list[SFTRecord] = []
18
+ analysis_by_episode = {
19
+ analysis.episode_ids[0]: analysis for analysis in analyses if analysis.episode_ids
20
+ }
21
+ for episode in episodes:
22
+ analysis = analysis_by_episode.get(episode.episode_id or 0)
23
+ schema_types = set(analysis.schema_failures if analysis else [])
24
+ for step in episode.step_records:
25
+ if step.parse_status not in {"invalid_json", "invalid_action", "repaired", "teacher_override"}:
26
+ continue
27
+ if step.teacher_action is None:
28
+ continue
29
+ rows.append(
30
+ SFTRecord(
31
+ source="schema_repair",
32
+ scenario_id=episode.scenario_id,
33
+ tick=step.tick,
34
+ messages=[
35
+ {"role": "system", "content": "Repair the action into strict JSON only."},
36
+ {
37
+ "role": "user",
38
+ "content": (
39
+ f"{step.prompt_text}\n\n"
40
+ f"Previous invalid output:\n{step.raw_model_output}"
41
+ ),
42
+ },
43
+ ],
44
+ target_action=step.teacher_action,
45
+ student_action=step.cleaned_action,
46
+ parse_status=step.parse_status,
47
+ tags=sorted(schema_types) or [step.parse_status],
48
+ metadata={
49
+ "episode_id": episode.episode_id,
50
+ "step_index": step.step_index,
51
+ "repair_retry_used": step.repair_retry_used,
52
+ "teacher_override_used": step.teacher_override_used,
53
+ "normalization_applied": step.normalization_applied,
54
+ "failure_type": step.observation.get("failure_type"),
55
+ "why_failed": step.observation.get("why_failed"),
56
+ "loop_warning": step.observation.get("loop_warning"),
57
+ "blocked_until_security_complete": step.observation.get("blocked_until_security_complete"),
58
+ "security_unlock_reason": step.observation.get("security_unlock_reason"),
59
+ "progress_flags": step.observation.get("progress_flags"),
60
+ },
61
+ )
62
+ )
63
+ return rows
64
+
65
+
66
+ def build_next_action_records(
67
+ episodes: list[EpisodeRecord],
68
+ analyses: list[FailureAnalysisReport],
69
+ ) -> list[SFTRecord]:
70
+ rows: list[SFTRecord] = []
71
+ episode_entries = {
72
+ analysis.episode_ids[0]: analysis.entries
73
+ for analysis in analyses
74
+ if analysis.episode_ids
75
+ }
76
+ allowed = {"policy", "reasoning", "looping"}
77
+ for episode in episodes:
78
+ entries = episode_entries.get(episode.episode_id or 0, [])
79
+ step_indices = {
80
+ entry.step_index
81
+ for entry in entries
82
+ if entry.bucket in allowed and entry.step_index is not None
83
+ }
84
+ for step in episode.step_records:
85
+ if step.step_index not in step_indices:
86
+ continue
87
+ if step.teacher_action is None:
88
+ continue
89
+ tags = [
90
+ entry.failure_type
91
+ for entry in entries
92
+ if entry.step_index == step.step_index and entry.bucket in allowed
93
+ ]
94
+ rows.append(
95
+ SFTRecord(
96
+ source="next_action",
97
+ scenario_id=episode.scenario_id,
98
+ tick=step.tick,
99
+ messages=[
100
+ {"role": "system", "content": "Choose the best next action as strict JSON only."},
101
+ {"role": "user", "content": step.prompt_text},
102
+ ],
103
+ target_action=step.teacher_action,
104
+ student_action=step.cleaned_action,
105
+ parse_status=step.parse_status,
106
+ tags=sorted(set(tags)) or ["next_action"],
107
+ metadata={
108
+ "episode_id": episode.episode_id,
109
+ "step_index": step.step_index,
110
+ "workflow_stage": step.workflow_stage,
111
+ "teacher_override_used": step.teacher_override_used,
112
+ "failure_type": step.observation.get("failure_type"),
113
+ "why_failed": step.observation.get("why_failed"),
114
+ "loop_warning": step.observation.get("loop_warning"),
115
+ "progress_flags": step.observation.get("progress_flags"),
116
+ },
117
+ )
118
+ )
119
+ return rows
120
+
121
+
122
+ def build_recovery_records(
123
+ episodes: list[EpisodeRecord],
124
+ analyses: list[FailureAnalysisReport],
125
+ ) -> list[SFTRecord]:
126
+ rows: list[SFTRecord] = []
127
+ episode_entries = {
128
+ analysis.episode_ids[0]: analysis.entries
129
+ for analysis in analyses
130
+ if analysis.episode_ids
131
+ }
132
+ recovery_failures = {
133
+ "wrong_restart",
134
+ "wrong_rollback",
135
+ "wrong_service",
136
+ "wrong_patch",
137
+ "wrong_vulnerability",
138
+ "verify_too_early",
139
+ "submit_too_early",
140
+ "infra_before_security",
141
+ "repeated_same_action",
142
+ }
143
+ for episode in episodes:
144
+ entries = episode_entries.get(episode.episode_id or 0, [])
145
+ step_indices = {
146
+ entry.step_index
147
+ for entry in entries
148
+ if entry.failure_type in recovery_failures and entry.step_index is not None
149
+ }
150
+ for step in episode.step_records:
151
+ if step.step_index not in step_indices:
152
+ continue
153
+ if step.teacher_action is None or not step.next_prompt_text:
154
+ continue
155
+ tags = [
156
+ entry.failure_type
157
+ for entry in entries
158
+ if entry.step_index == step.step_index
159
+ and entry.failure_type in recovery_failures
160
+ ]
161
+ rows.append(
162
+ SFTRecord(
163
+ source="recovery",
164
+ scenario_id=episode.scenario_id,
165
+ tick=step.tick,
166
+ messages=[
167
+ {"role": "system", "content": "Recover from the previous mistake. Return the best next strict JSON action only."},
168
+ {
169
+ "role": "user",
170
+ "content": (
171
+ f"{step.next_prompt_text}\n\n"
172
+ f"Previous wrong action: {step.cleaned_action}\n"
173
+ f"Penalty or result: reward={step.reward}"
174
+ ),
175
+ },
176
+ ],
177
+ target_action=step.teacher_action,
178
+ student_action=step.cleaned_action,
179
+ parse_status=step.parse_status,
180
+ tags=sorted(set(tags)) or ["recovery"],
181
+ metadata={
182
+ "episode_id": episode.episode_id,
183
+ "step_index": step.step_index,
184
+ "teacher_override_used": step.teacher_override_used,
185
+ "failure_type": step.observation.get("failure_type"),
186
+ "why_failed": step.observation.get("why_failed"),
187
+ "loop_warning": step.observation.get("loop_warning"),
188
+ "best_recovery_action_family": step.observation.get("best_recovery_action_family"),
189
+ },
190
+ )
191
+ )
192
+ return rows
193
+
194
+
195
+ def combine_sft_records(
196
+ *,
197
+ baseline_records: list[SFTRecord],
198
+ schema_records: list[SFTRecord],
199
+ next_action_records: list[SFTRecord],
200
+ recovery_records: list[SFTRecord],
201
+ ) -> list[SFTRecord]:
202
+ return [
203
+ *baseline_records,
204
+ *schema_records,
205
+ *next_action_records,
206
+ *recovery_records,
207
+ ]
208
+
209
+
210
+ def write_jsonl(records: list[SFTRecord], path: Path) -> None:
211
+ path.parent.mkdir(parents=True, exist_ok=True)
212
+ with path.open("w", encoding="utf-8") as handle:
213
+ for record in records:
214
+ handle.write(record.model_dump_json())
215
+ handle.write("\n")
216
+
217
+
218
+ def load_episodes(path: Path) -> list[EpisodeRecord]:
219
+ return TrajectoryStore(path).load_episodes()
220
+
221
+
222
+ def main() -> None:
223
+ parser = argparse.ArgumentParser()
224
+ parser.add_argument("--episodes", default="outputs/trainer/episodes.jsonl")
225
+ parser.add_argument("--output-dir", required=True)
226
+ args = parser.parse_args()
227
+
228
+ output_dir = Path(args.output_dir)
229
+ episodes = load_episodes(Path(args.episodes))
230
+
231
+ from .analyze_failures import analyze_episode
232
+
233
+ analyses = [analyze_episode(episode) for episode in episodes]
234
+ baseline_records = build_baseline_records()
235
+ schema_records = build_schema_repair_records(episodes, analyses)
236
+ next_action_records = build_next_action_records(episodes, analyses)
237
+ recovery_records = build_recovery_records(episodes, analyses)
238
+ combined_records = combine_sft_records(
239
+ baseline_records=baseline_records,
240
+ schema_records=schema_records,
241
+ next_action_records=next_action_records,
242
+ recovery_records=recovery_records,
243
+ )
244
+
245
+ write_jsonl(baseline_records, output_dir / "baseline_teacher_dataset.jsonl")
246
+ write_jsonl(schema_records, output_dir / "schema_repair.jsonl")
247
+ write_jsonl(next_action_records, output_dir / "next_action.jsonl")
248
+ write_jsonl(recovery_records, output_dir / "recovery.jsonl")
249
+ write_jsonl(combined_records, output_dir / "sft_dataset.jsonl")
250
+ print(
251
+ f"wrote baseline={len(baseline_records)} schema={len(schema_records)} "
252
+ f"next_action={len(next_action_records)} recovery={len(recovery_records)} "
253
+ f"combined={len(combined_records)} to {output_dir}"
254
+ )
255
+
256
+
257
+ if __name__ == "__main__":
258
+ main()
unified_incident_env/trainer/build_sft_dataset.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Build supervised JSONL datasets from baseline and replay trajectories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from ..scripts.baseline_agent import plan_for_scenario
9
+ from ..server.challenge import SCENARIOS
10
+ from ..server.environment import UnifiedIncidentEnvironment
11
+ from .prompts import TRAINING_SYSTEM_PROMPT
12
+ from .trajectory_store import TrajectoryStore
13
+ from .types import SFTRecord
14
+
15
+
16
+ def build_baseline_records() -> list[SFTRecord]:
17
+ rows: list[SFTRecord] = []
18
+ for scenario_id in SCENARIOS:
19
+ env = UnifiedIncidentEnvironment()
20
+ obs = env.reset(scenario_id=scenario_id)
21
+ for step_index, action in enumerate(plan_for_scenario(scenario_id), start=1):
22
+ rows.append(
23
+ SFTRecord(
24
+ source="baseline",
25
+ scenario_id=scenario_id,
26
+ tick=obs.tick_count,
27
+ messages=[
28
+ {"role": "system", "content": TRAINING_SYSTEM_PROMPT},
29
+ {"role": "user", "content": obs.prompt_text},
30
+ ],
31
+ target_action=action.model_dump(exclude_none=True),
32
+ tags=["teacher", f"step_{step_index}"],
33
+ )
34
+ )
35
+ obs = env.step(action)
36
+ return rows
37
+
38
+
39
+ def build_replay_records(episodes_path: Path) -> list[SFTRecord]:
40
+ rows: list[SFTRecord] = []
41
+ for episode in TrajectoryStore(episodes_path).load_episodes():
42
+ for step in episode.step_records:
43
+ if step.teacher_action is None:
44
+ continue
45
+ tags = [episode.mode, step.parse_status]
46
+ if step.failure_reason:
47
+ tags.append("failure")
48
+ rows.append(
49
+ SFTRecord(
50
+ source="replay",
51
+ scenario_id=episode.scenario_id,
52
+ tick=step.tick,
53
+ messages=[
54
+ {"role": "system", "content": TRAINING_SYSTEM_PROMPT},
55
+ {"role": "user", "content": step.prompt_text},
56
+ ],
57
+ target_action=step.teacher_action,
58
+ student_action=step.cleaned_action,
59
+ parse_status=step.parse_status,
60
+ tags=tags,
61
+ )
62
+ )
63
+ return rows
64
+
65
+
66
+ def write_jsonl(records: list[SFTRecord], output_path: Path) -> None:
67
+ output_path.parent.mkdir(parents=True, exist_ok=True)
68
+ with output_path.open("w", encoding="utf-8") as handle:
69
+ for record in records:
70
+ handle.write(record.model_dump_json())
71
+ handle.write("\n")
72
+
73
+
74
+ def main() -> None:
75
+ parser = argparse.ArgumentParser()
76
+ parser.add_argument(
77
+ "--source",
78
+ choices=["baseline", "replay", "combined"],
79
+ default="combined",
80
+ )
81
+ parser.add_argument(
82
+ "--episodes",
83
+ default="outputs/trainer/episodes.jsonl",
84
+ )
85
+ parser.add_argument(
86
+ "--output",
87
+ required=True,
88
+ )
89
+ args = parser.parse_args()
90
+
91
+ records: list[SFTRecord] = []
92
+ if args.source in {"baseline", "combined"}:
93
+ records.extend(build_baseline_records())
94
+ if args.source in {"replay", "combined"}:
95
+ records.extend(build_replay_records(Path(args.episodes)))
96
+ write_jsonl(records, Path(args.output))
97
+ print(f"wrote {len(records)} rows to {args.output}")
98
+
99
+
100
+ if __name__ == "__main__":
101
+ main()
unified_incident_env/trainer/collect_trajectory.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Collection wrapper that turns one episode into trajectory + analysis + summary."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .analyze_failures import analyze_episode
6
+ from .types import EpisodeRecord, EpisodeSummaryRecord
7
+
8
+
9
+ def collect_episode(
10
+ *,
11
+ runner,
12
+ scenario_id: str,
13
+ episode_id: int,
14
+ mode: str,
15
+ model_version: str,
16
+ ) -> tuple[EpisodeRecord, EpisodeSummaryRecord, object]:
17
+ """Run, analyze, and summarize one episode."""
18
+ record = runner.run(
19
+ scenario_id=scenario_id,
20
+ mode=mode,
21
+ episode_id=episode_id,
22
+ model_version=model_version,
23
+ )
24
+ analysis = analyze_episode(record)
25
+ record.schema_failures = analysis.summary.get("schema", 0)
26
+ record.policy_failures = analysis.policy_failures
27
+ record.looping_failures = analysis.looping_failures
28
+ record.reasoning_failures = analysis.reasoning_failures
29
+ summary = EpisodeSummaryRecord(
30
+ episode_id=episode_id,
31
+ run_id=record.run_id,
32
+ scenario_id=record.scenario_id,
33
+ difficulty=record.difficulty,
34
+ model_name=record.model_name,
35
+ model_version=record.model_version,
36
+ mode=record.mode,
37
+ steps=record.steps,
38
+ success=record.success,
39
+ final_score=record.final_score,
40
+ schema_failures=analysis.summary.get("schema", 0),
41
+ json_valid_steps=record.json_valid_steps,
42
+ strict_schema_valid_steps=record.strict_schema_valid_steps,
43
+ teacher_override_count=record.teacher_override_count,
44
+ repair_retry_count=record.repair_retry_count,
45
+ policy_failures=analysis.policy_failures,
46
+ looping_failures=analysis.looping_failures,
47
+ reasoning_failures=analysis.reasoning_failures,
48
+ security_subquest_completed=record.security_subquest_completed,
49
+ postmortem_completed=record.postmortem_completed,
50
+ stopped_reason=record.stopped_reason,
51
+ elapsed_s=record.elapsed_s,
52
+ )
53
+ return record, summary, analysis
unified_incident_env/trainer/eval_models.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Batch evaluation for one or more models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+
10
+ from ..server.challenge import SCENARIOS
11
+ from .action_adapter import LenientActionAdapter, StrictActionParser
12
+ from .backend import OpenAICompatibleBackend
13
+ from .run_episode import EpisodeRunner
14
+ from .trajectory_store import TrajectoryStore
15
+ from .types import EvalScenarioResult, EvalSummary
16
+
17
+
18
+ def summarize(results: list[EvalScenarioResult], mode: str) -> EvalSummary:
19
+ success_rate = (
20
+ sum(1 for result in results if result.success) / len(results) if results else 0.0
21
+ )
22
+ avg_score = (
23
+ sum(result.final_score for result in results) / len(results) if results else 0.0
24
+ )
25
+ schema_failure_rate = (
26
+ sum(1 for result in results if result.schema_failure) / len(results)
27
+ if results
28
+ else 0.0
29
+ )
30
+
31
+ by_model: dict[str, dict[str, float]] = {}
32
+ by_scenario: dict[str, dict[str, float]] = {}
33
+ for result in results:
34
+ model_bucket = by_model.setdefault(
35
+ result.model_name,
36
+ {"runs": 0.0, "successes": 0.0, "score_sum": 0.0, "schema_failures": 0.0},
37
+ )
38
+ model_bucket["runs"] += 1
39
+ model_bucket["successes"] += 1.0 if result.success else 0.0
40
+ model_bucket["score_sum"] += result.final_score
41
+ model_bucket["schema_failures"] += 1.0 if result.schema_failure else 0.0
42
+
43
+ scenario_bucket = by_scenario.setdefault(
44
+ result.scenario_id,
45
+ {"runs": 0.0, "successes": 0.0, "score_sum": 0.0},
46
+ )
47
+ scenario_bucket["runs"] += 1
48
+ scenario_bucket["successes"] += 1.0 if result.success else 0.0
49
+ scenario_bucket["score_sum"] += result.final_score
50
+
51
+ for bucket in by_model.values():
52
+ runs = bucket["runs"] or 1.0
53
+ bucket["success_rate"] = round(bucket["successes"] / runs, 4)
54
+ bucket["avg_score"] = round(bucket["score_sum"] / runs, 4)
55
+ bucket["schema_failure_rate"] = round(bucket["schema_failures"] / runs, 4)
56
+ del bucket["score_sum"]
57
+ del bucket["successes"]
58
+ del bucket["schema_failures"]
59
+
60
+ for bucket in by_scenario.values():
61
+ runs = bucket["runs"] or 1.0
62
+ bucket["success_rate"] = round(bucket["successes"] / runs, 4)
63
+ bucket["avg_score"] = round(bucket["score_sum"] / runs, 4)
64
+ del bucket["score_sum"]
65
+ del bucket["successes"]
66
+
67
+ return EvalSummary(
68
+ mode=mode,
69
+ results=results,
70
+ success_rate=round(success_rate, 4),
71
+ avg_score=round(avg_score, 4),
72
+ schema_failure_rate=round(schema_failure_rate, 4),
73
+ by_model=by_model,
74
+ by_scenario=by_scenario,
75
+ )
76
+
77
+
78
+ def main() -> None:
79
+ parser = argparse.ArgumentParser()
80
+ parser.add_argument("--models", nargs="+", required=True)
81
+ parser.add_argument("--mode", choices=["strict", "lenient"], default="strict")
82
+ parser.add_argument("--base-url", default="http://127.0.0.1:8000")
83
+ parser.add_argument(
84
+ "--api-base-url",
85
+ default=os.environ.get("API_BASE_URL", "http://127.0.0.1:11434/v1"),
86
+ )
87
+ parser.add_argument(
88
+ "--api-key",
89
+ default=os.environ.get("OPENAI_API_KEY") or os.environ.get("HF_TOKEN") or "local",
90
+ )
91
+ parser.add_argument(
92
+ "--output",
93
+ default=None,
94
+ )
95
+ parser.add_argument(
96
+ "--episodes-output",
97
+ default="outputs/trainer/episodes.jsonl",
98
+ )
99
+ args = parser.parse_args()
100
+
101
+ backend = OpenAICompatibleBackend(
102
+ base_url=args.api_base_url,
103
+ api_key=args.api_key,
104
+ )
105
+ parser_impl = StrictActionParser() if args.mode == "strict" else LenientActionAdapter()
106
+ episode_store = TrajectoryStore(Path(args.episodes_output))
107
+
108
+ results: list[EvalScenarioResult] = []
109
+ for model_name in args.models:
110
+ runner = EpisodeRunner(
111
+ backend=backend,
112
+ parser=parser_impl,
113
+ model_name=model_name,
114
+ base_url=args.base_url,
115
+ )
116
+ for scenario_id in SCENARIOS:
117
+ episode = runner.run(scenario_id=scenario_id, mode=args.mode)
118
+ episode_store.append_episode(episode)
119
+ results.append(
120
+ EvalScenarioResult(
121
+ model_name=model_name,
122
+ scenario_id=scenario_id,
123
+ success=episode.success,
124
+ final_score=episode.final_score,
125
+ failure_reason=episode.failure_reason,
126
+ schema_failure=bool(
127
+ episode.failure_reason
128
+ and episode.failure_reason.startswith("parse_failure")
129
+ ),
130
+ elapsed_s=episode.elapsed_s,
131
+ )
132
+ )
133
+
134
+ summary = summarize(results, mode=args.mode)
135
+ output_path = Path(
136
+ args.output
137
+ or f"outputs/trainer/{args.mode}_eval_summary.json"
138
+ )
139
+ output_path.parent.mkdir(parents=True, exist_ok=True)
140
+ output_path.write_text(summary.model_dump_json(indent=2), encoding="utf-8")
141
+ print(summary.model_dump_json(indent=2))
142
+
143
+
144
+ if __name__ == "__main__":
145
+ main()