Spaces:
Running
Running
Initial SetUp Fixed
Browse files
- .dockerignore +9 -0
- .gitignore +12 -10
- CHANGELOG_AND_RUNBOOK.md +504 -0
- Dockerfile +6 -2
- README.md +279 -0
- Readme.md +0 -375
- __init__.py +0 -16
- app.py +176 -28
- client.py +73 -93
- environment.py +179 -31
- graders.py +53 -15
- incidents.py +8 -8
- inference.py +373 -159
- models.py +81 -33
- openenv.yaml +45 -16
- pyproject.toml +23 -29
- requirements.txt +4 -1
- server/__init__.py +1 -0
- server/app.py +13 -0
- tests/test_env.py +126 -0
- tests/test_graders.py +85 -0
- ui/assets/app.js +290 -0
- ui/assets/styles.css +731 -0
- ui/index.html +117 -0
- ui/playground.html +153 -0
- ui/status.html +118 -0
- uv.lock +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.gitignore
|
| 3 |
+
.venv
|
| 4 |
+
__pycache__
|
| 5 |
+
.pycache
|
| 6 |
+
.DS_Store
|
| 7 |
+
.env
|
| 8 |
+
logs.jsonl
|
| 9 |
+
outputs
|
.gitignore
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
.DS_Store
|
| 2 |
# =========================
|
| 3 |
-
# ENV
|
| 4 |
# =========================
|
| 5 |
.env
|
| 6 |
.env.*
|
| 7 |
*.env
|
| 8 |
|
| 9 |
# =========================
|
| 10 |
-
# PYTHON
|
| 11 |
# =========================
|
| 12 |
__pycache__/
|
| 13 |
*.pyc
|
|
@@ -24,19 +24,20 @@ env/
|
|
| 24 |
.venv/
|
| 25 |
|
| 26 |
# =========================
|
| 27 |
-
# LOG FILES
|
| 28 |
# =========================
|
| 29 |
*.log
|
| 30 |
logs.jsonl
|
|
|
|
| 31 |
|
| 32 |
# =========================
|
| 33 |
-
# OS FILES
|
| 34 |
# =========================
|
| 35 |
.DS_Store
|
| 36 |
Thumbs.db
|
| 37 |
|
| 38 |
# =========================
|
| 39 |
-
# IDE / EDITOR
|
| 40 |
# =========================
|
| 41 |
.vscode/
|
| 42 |
.idea/
|
|
@@ -44,7 +45,7 @@ Thumbs.db
|
|
| 44 |
*.swo
|
| 45 |
|
| 46 |
# =========================
|
| 47 |
-
# MODEL / DATA FILES
|
| 48 |
# =========================
|
| 49 |
*.onnx
|
| 50 |
*.pt
|
|
@@ -57,27 +58,28 @@ data/
|
|
| 57 |
datasets/
|
| 58 |
|
| 59 |
# =========================
|
| 60 |
-
# BUILD / OUTPUT
|
| 61 |
# =========================
|
| 62 |
dist/
|
| 63 |
build/
|
| 64 |
out/
|
| 65 |
|
| 66 |
# =========================
|
| 67 |
-
# TEMP FILES
|
| 68 |
# =========================
|
| 69 |
*.tmp
|
| 70 |
*.temp
|
| 71 |
.cache/
|
|
|
|
| 72 |
|
| 73 |
# =========================
|
| 74 |
-
# TEST / COVERAGE
|
| 75 |
# =========================
|
| 76 |
coverage/
|
| 77 |
.nyc_output/
|
| 78 |
|
| 79 |
# =========================
|
| 80 |
-
# DOCKER
|
| 81 |
# =========================
|
| 82 |
*.pid
|
| 83 |
*.seed
|
|
|
|
| 1 |
.DS_Store
|
| 2 |
# =========================
|
| 3 |
+
# ENV AND SECRETS
|
| 4 |
# =========================
|
| 5 |
.env
|
| 6 |
.env.*
|
| 7 |
*.env
|
| 8 |
|
| 9 |
# =========================
|
| 10 |
+
# PYTHON
|
| 11 |
# =========================
|
| 12 |
__pycache__/
|
| 13 |
*.pyc
|
|
|
|
| 24 |
.venv/
|
| 25 |
|
| 26 |
# =========================
|
| 27 |
+
# LOG FILES
|
| 28 |
# =========================
|
| 29 |
*.log
|
| 30 |
logs.jsonl
|
| 31 |
+
outputs/
|
| 32 |
|
| 33 |
# =========================
|
| 34 |
+
# OS FILES
|
| 35 |
# =========================
|
| 36 |
.DS_Store
|
| 37 |
Thumbs.db
|
| 38 |
|
| 39 |
# =========================
|
| 40 |
+
# IDE / EDITOR
|
| 41 |
# =========================
|
| 42 |
.vscode/
|
| 43 |
.idea/
|
|
|
|
| 45 |
*.swo
|
| 46 |
|
| 47 |
# =========================
|
| 48 |
+
# MODEL / DATA FILES
|
| 49 |
# =========================
|
| 50 |
*.onnx
|
| 51 |
*.pt
|
|
|
|
| 58 |
datasets/
|
| 59 |
|
| 60 |
# =========================
|
| 61 |
+
# BUILD / OUTPUT
|
| 62 |
# =========================
|
| 63 |
dist/
|
| 64 |
build/
|
| 65 |
out/
|
| 66 |
|
| 67 |
# =========================
|
| 68 |
+
# TEMP FILES
|
| 69 |
# =========================
|
| 70 |
*.tmp
|
| 71 |
*.temp
|
| 72 |
.cache/
|
| 73 |
+
.pycache/
|
| 74 |
|
| 75 |
# =========================
|
| 76 |
+
# TEST / COVERAGE
|
| 77 |
# =========================
|
| 78 |
coverage/
|
| 79 |
.nyc_output/
|
| 80 |
|
| 81 |
# =========================
|
| 82 |
+
# DOCKER (optional)
|
| 83 |
# =========================
|
| 84 |
*.pid
|
| 85 |
*.seed
|
CHANGELOG_AND_RUNBOOK.md
ADDED
|
@@ -0,0 +1,504 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Change Log and Runbook
|
| 2 |
+
|
| 3 |
+
This file explains what changed in the project and how to run or test each part.
|
| 4 |
+
|
| 5 |
+
Project path:
|
| 6 |
+
|
| 7 |
+
```bash
|
| 8 |
+
cd /Users/adityagaba/Downloads/incident-triage-env
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
## 1. What changed
|
| 12 |
+
|
| 13 |
+
### Backend and OpenEnv API
|
| 14 |
+
|
| 15 |
+
The backend is still a FastAPI app, but it now behaves like a stronger OpenEnv-style environment.
|
| 16 |
+
|
| 17 |
+
Main files:
|
| 18 |
+
|
| 19 |
+
- `app.py`
|
| 20 |
+
- `environment.py`
|
| 21 |
+
- `models.py`
|
| 22 |
+
- `graders.py`
|
| 23 |
+
- `incidents.py`
|
| 24 |
+
- `inference.py`
|
| 25 |
+
- `openenv.yaml`
|
| 26 |
+
|
| 27 |
+
Important backend changes:
|
| 28 |
+
|
| 29 |
+
- Added typed request and response models for observation, action, reward, state, and reset.
|
| 30 |
+
- Added proper `reset`, `step`, and `state` behavior.
|
| 31 |
+
- Added strict action validation.
|
| 32 |
+
- Added deterministic graders with partial credit.
|
| 33 |
+
- Added runtime-validator helper endpoints:
|
| 34 |
+
- `GET /health`
|
| 35 |
+
- `GET /metadata`
|
| 36 |
+
- `GET /schema`
|
| 37 |
+
- `POST /mcp`
|
| 38 |
+
- Updated `inference.py` to print strict `[START]`, `[STEP]`, and `[END]` logs.
|
| 39 |
+
|
| 40 |
+
### Frontend UI
|
| 41 |
+
|
| 42 |
+
A browser UI was added on top of the same FastAPI app.
|
| 43 |
+
|
| 44 |
+
Main files:
|
| 45 |
+
|
| 46 |
+
- `ui/index.html`
|
| 47 |
+
- `ui/status.html`
|
| 48 |
+
- `ui/playground.html`
|
| 49 |
+
- `ui/assets/styles.css`
|
| 50 |
+
- `ui/assets/app.js`
|
| 51 |
+
|
| 52 |
+
New UI routes:
|
| 53 |
+
|
| 54 |
+
- `/` shows the landing page.
|
| 55 |
+
- `/status` shows live health, schema, tasks, and grader status.
|
| 56 |
+
- `/playground` lets you reset an incident and submit an action from the browser.
|
| 57 |
+
- `/docs` still shows FastAPI API docs.
|
| 58 |
+
|
| 59 |
+
Latest UI improvements:
|
| 60 |
+
|
| 61 |
+
- The playground has quick presets for task1, task2, and task3.
|
| 62 |
+
- The playground loads the real ticket inventory from `/tickets`.
|
| 63 |
+
- Invalid ticket IDs such as `INC-105` are blocked in the UI before calling `/reset`.
|
| 64 |
+
- The playground now shows visible success and error messages.
|
| 65 |
+
- The summary strip shows incident id, expected field, reward, and episode status.
|
| 66 |
+
- Cards, form controls, and output panels have more spacing and padding.
|
| 67 |
+
- Reset and step buttons show loading states while requests are running.
|
| 68 |
+
|
| 69 |
+
The UI is served from `app.py` with:
|
| 70 |
+
|
| 71 |
+
- `app.mount("/assets", ...)`
|
| 72 |
+
- `GET /`
|
| 73 |
+
- `GET /status`
|
| 74 |
+
- `GET /playground`
|
| 75 |
+
|
| 76 |
+
### Docker and Space readiness
|
| 77 |
+
|
| 78 |
+
Main files:
|
| 79 |
+
|
| 80 |
+
- `Dockerfile`
|
| 81 |
+
- `.dockerignore`
|
| 82 |
+
- `README.md`
|
| 83 |
+
- `openenv.yaml`
|
| 84 |
+
- `server/app.py`
|
| 85 |
+
|
| 86 |
+
Important changes:
|
| 87 |
+
|
| 88 |
+
- Docker runs `uvicorn app:app` on port `7860`.
|
| 89 |
+
- `README.md` includes Hugging Face Docker Space metadata.
|
| 90 |
+
- `server/app.py` is present as a compatibility entrypoint.
|
| 91 |
+
- `openenv validate` passes locally.
|
| 92 |
+
- Runtime validation was made compatible by adding `/schema`, `/mcp`, and `{"status":"healthy"}` from `/health`.
|
| 93 |
+
|
| 94 |
+
### Tests
|
| 95 |
+
|
| 96 |
+
Main files:
|
| 97 |
+
|
| 98 |
+
- `tests/test_env.py`
|
| 99 |
+
- `tests/test_graders.py`
|
| 100 |
+
|
| 101 |
+
Test coverage includes:
|
| 102 |
+
|
| 103 |
+
- health, schema, and MCP helper endpoints
|
| 104 |
+
- UI routes and static assets
|
| 105 |
+
- reset, step, and state behavior
|
| 106 |
+
- wrong task-type rejection
|
| 107 |
+
- grader score range checks
|
| 108 |
+
- partial-credit checks
|
| 109 |
+
- non-constant grader behavior
|
| 110 |
+
|
| 111 |
+
### Terminal logs
|
| 112 |
+
|
| 113 |
+
The backend now prints useful logs when the UI or API is used:
|
| 114 |
+
|
| 115 |
+
```text
|
| 116 |
+
[RESET] session_id=... incident_id=INC-014 task_type=task3 expected_field=action
|
| 117 |
+
[STEP] session_id=... incident_id=INC-014 task_type=task3 answer=FAILOVER reward=1.0 done=true
|
| 118 |
+
[STATE] session_id=... incident_id=INC-014 done=true
|
| 119 |
+
[STEP_ERROR] session_id=... incident_id=INC-014 error=...
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
These logs appear in the same terminal where `uvicorn` is running.
|
| 123 |
+
|
| 124 |
+
## 2. Start the backend and UI locally
|
| 125 |
+
|
| 126 |
+
Use port `8000` locally if port `7860` is busy.
|
| 127 |
+
|
| 128 |
+
```bash
|
| 129 |
+
cd /Users/adityagaba/Downloads/incident-triage-env
|
| 130 |
+
source .venv/bin/activate
|
| 131 |
+
.venv/bin/python -m uvicorn app:app --host 127.0.0.1 --port 8000
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
Keep that terminal open.
|
| 135 |
+
|
| 136 |
+
Open these browser URLs:
|
| 137 |
+
|
| 138 |
+
```text
|
| 139 |
+
http://127.0.0.1:8000/
|
| 140 |
+
http://127.0.0.1:8000/status
|
| 141 |
+
http://127.0.0.1:8000/playground
|
| 142 |
+
http://127.0.0.1:8000/docs
|
| 143 |
+
```
|
| 144 |
+
|
| 145 |
+
If the server was already running, stop it with `Ctrl+C` and start it again. Do a hard refresh in the browser if the old UI is still visible.
|
| 146 |
+
|
| 147 |
+
Expected results:
|
| 148 |
+
|
| 149 |
+
- `/` shows the Incident Triage landing page.
|
| 150 |
+
- `/status` shows health and task cards.
|
| 151 |
+
- `/playground` lets you reset and step through an incident.
|
| 152 |
+
- `/docs` shows generated API documentation.
|
| 153 |
+
|
| 154 |
+
## 3. Test the UI manually
|
| 155 |
+
|
| 156 |
+
### Landing page
|
| 157 |
+
|
| 158 |
+
Open:
|
| 159 |
+
|
| 160 |
+
```text
|
| 161 |
+
http://127.0.0.1:8000/
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
Check:
|
| 165 |
+
|
| 166 |
+
- The page title says `Welcome to Incident Triage Environment`.
|
| 167 |
+
- Live snapshot cards load data.
|
| 168 |
+
- Task cards appear.
|
| 169 |
+
- Links to `/status`, `/playground`, and `/docs` work.
|
| 170 |
+
|
| 171 |
+
### Status page
|
| 172 |
+
|
| 173 |
+
Open:
|
| 174 |
+
|
| 175 |
+
```text
|
| 176 |
+
http://127.0.0.1:8000/status
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
Check:
|
| 180 |
+
|
| 181 |
+
- Health shows `healthy`.
|
| 182 |
+
- Total incidents shows `36`.
|
| 183 |
+
- Task cards show task1, task2, and task3.
|
| 184 |
+
- Schema coverage shows available runtime contracts.
|
| 185 |
+
- Grader summary loads.
|
| 186 |
+
|
| 187 |
+
### Playground page
|
| 188 |
+
|
| 189 |
+
Open:
|
| 190 |
+
|
| 191 |
+
```text
|
| 192 |
+
http://127.0.0.1:8000/playground
|
| 193 |
+
```
|
| 194 |
+
|
| 195 |
+
Run a correct hard-task case:
|
| 196 |
+
|
| 197 |
+
1. Click the `Action case` preset, or manually select `task3`.
|
| 198 |
+
2. Confirm ticket id is `INC-014`.
|
| 199 |
+
3. Click `Reset Environment`.
|
| 200 |
+
4. Confirm expected field is `action`.
|
| 201 |
+
5. Select `FAILOVER`.
|
| 202 |
+
6. Click `Submit Step`.
|
| 203 |
+
|
| 204 |
+
Expected result:
|
| 205 |
+
|
| 206 |
+
- `reward.value` is `1.0`.
|
| 207 |
+
- `done` is `true`.
|
| 208 |
+
- `info.correct` is `true`.
|
| 209 |
+
- `info.ground_truth` is `FAILOVER`.
|
| 210 |
+
|
| 211 |
+
Important ticket rule:
|
| 212 |
+
|
| 213 |
+
- Valid tickets are `INC-001` through `INC-036`.
|
| 214 |
+
- `INC-105` is not in this dataset, so reset should fail for that ticket.
|
| 215 |
+
- The updated UI loads valid tickets from `/tickets` and warns before sending an invalid ticket to the backend.
|
| 216 |
+
|
| 217 |
+
Expected terminal logs:
|
| 218 |
+
|
| 219 |
+
```text
|
| 220 |
+
[RESET] session_id=... incident_id=INC-014 task_type=task3 expected_field=action
|
| 221 |
+
[STEP] session_id=... incident_id=INC-014 task_type=task3 answer=FAILOVER reward=1.0 done=true
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
Run a task1 case:
|
| 225 |
+
|
| 226 |
+
1. Click the `Severity case` preset, or manually select `task1`.
|
| 227 |
+
2. Confirm ticket id is `INC-001`.
|
| 228 |
+
3. Click `Reset Environment`.
|
| 229 |
+
4. Confirm expected field is `severity`.
|
| 230 |
+
5. Select `SEV1`.
|
| 231 |
+
6. Click `Submit Step`.
|
| 232 |
+
|
| 233 |
+
Expected result:
|
| 234 |
+
|
| 235 |
+
- reward should be `1.0`.
|
| 236 |
+
|
| 237 |
+
Run a task2 case:
|
| 238 |
+
|
| 239 |
+
1. Click the `Root cause case` preset, or manually select `task2`.
|
| 240 |
+
2. Confirm ticket id is `INC-006`.
|
| 241 |
+
3. Click `Reset Environment`.
|
| 242 |
+
4. Confirm expected field is `root_cause`.
|
| 243 |
+
5. Select `DATABASE`.
|
| 244 |
+
6. Click `Submit Step`.
|
| 245 |
+
|
| 246 |
+
Expected result:
|
| 247 |
+
|
| 248 |
+
- reward should be `1.0`.
|
| 249 |
+
|
| 250 |
+
## 4. Test backend API with curl
|
| 251 |
+
|
| 252 |
+
Use a second terminal while the app is running on port `8000`.
|
| 253 |
+
|
| 254 |
+
Health:
|
| 255 |
+
|
| 256 |
+
```bash
|
| 257 |
+
curl -s http://127.0.0.1:8000/health | python3 -m json.tool
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
Expected:
|
| 261 |
+
|
| 262 |
+
```json
|
| 263 |
+
{
|
| 264 |
+
"status": "healthy"
|
| 265 |
+
}
|
| 266 |
+
```
|
| 267 |
+
|
| 268 |
+
Metadata:
|
| 269 |
+
|
| 270 |
+
```bash
|
| 271 |
+
curl -s http://127.0.0.1:8000/metadata | python3 -m json.tool
|
| 272 |
+
```
|
| 273 |
+
|
| 274 |
+
Schema:
|
| 275 |
+
|
| 276 |
+
```bash
|
| 277 |
+
curl -s http://127.0.0.1:8000/schema | python3 -m json.tool
|
| 278 |
+
```
|
| 279 |
+
|
| 280 |
+
Reset a fixed incident:
|
| 281 |
+
|
| 282 |
+
```bash
|
| 283 |
+
curl -s -X POST http://127.0.0.1:8000/reset \
|
| 284 |
+
-H "Content-Type: application/json" \
|
| 285 |
+
-d '{"task_type":"task3","ticket_id":"INC-014"}' > /tmp/reset.json
|
| 286 |
+
python3 -m json.tool /tmp/reset.json
|
| 287 |
+
```
|
| 288 |
+
|
| 289 |
+
Extract session id:
|
| 290 |
+
|
| 291 |
+
```bash
|
| 292 |
+
SESSION_ID=$(python3 -c 'import json; print(json.load(open("/tmp/reset.json"))["info"]["session_id"])')
|
| 293 |
+
echo $SESSION_ID
|
| 294 |
+
```
|
| 295 |
+
|
| 296 |
+
Submit a correct step:
|
| 297 |
+
|
| 298 |
+
```bash
|
| 299 |
+
curl -s -X POST "http://127.0.0.1:8000/step?session_id=$SESSION_ID" \
|
| 300 |
+
-H "Content-Type: application/json" \
|
| 301 |
+
-d '{"incident_id":"INC-014","task_type":"task3","action":"FAILOVER"}' | python3 -m json.tool
|
| 302 |
+
```
|
| 303 |
+
|
| 304 |
+
Check state:
|
| 305 |
+
|
| 306 |
+
```bash
|
| 307 |
+
curl -s "http://127.0.0.1:8000/state?session_id=$SESSION_ID" | python3 -m json.tool
|
| 308 |
+
```
|
| 309 |
+
|
| 310 |
+
Expected state:
|
| 311 |
+
|
| 312 |
+
- `done` is `true`
|
| 313 |
+
- `status` is `completed`
|
| 314 |
+
- `last_reward` is `1.0`
|
| 315 |
+
|
| 316 |
+
## 5. Test backend edge cases
|
| 317 |
+
|
| 318 |
+
Bad session:
|
| 319 |
+
|
| 320 |
+
```bash
|
| 321 |
+
curl -s -X POST "http://127.0.0.1:8000/step?session_id=bad-session" \
|
| 322 |
+
-H "Content-Type: application/json" \
|
| 323 |
+
-d '{"incident_id":"INC-014","task_type":"task3","action":"FAILOVER"}' | python3 -m json.tool
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
Expected:
|
| 327 |
+
|
| 328 |
+
```json
|
| 329 |
+
{
|
| 330 |
+
"detail": "Session not found. Call /reset first."
|
| 331 |
+
}
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
Bad ticket:
|
| 335 |
+
|
| 336 |
+
```bash
|
| 337 |
+
curl -s -X POST http://127.0.0.1:8000/reset \
|
| 338 |
+
-H "Content-Type: application/json" \
|
| 339 |
+
-d '{"task_type":"task1","ticket_id":"INC-999"}' | python3 -m json.tool
|
| 340 |
+
```
|
| 341 |
+
|
| 342 |
+
Expected:
|
| 343 |
+
|
| 344 |
+
```json
|
| 345 |
+
{
|
| 346 |
+
"detail": "No ticket found for ticket_id: INC-999"
|
| 347 |
+
}
|
| 348 |
+
```
|
| 349 |
+
|
| 350 |
+
Wrong field for task3:
|
| 351 |
+
|
| 352 |
+
```bash
|
| 353 |
+
curl -s -X POST http://127.0.0.1:8000/reset \
|
| 354 |
+
-H "Content-Type: application/json" \
|
| 355 |
+
-d '{"task_type":"task3","ticket_id":"INC-014"}' > /tmp/reset_wrong_field.json
|
| 356 |
+
|
| 357 |
+
SESSION_WRONG_FIELD=$(python3 -c 'import json; print(json.load(open("/tmp/reset_wrong_field.json"))["info"]["session_id"])')
|
| 358 |
+
|
| 359 |
+
curl -s -X POST "http://127.0.0.1:8000/step?session_id=$SESSION_WRONG_FIELD" \
|
| 360 |
+
-H "Content-Type: application/json" \
|
| 361 |
+
-d '{"incident_id":"INC-014","task_type":"task3","root_cause":"NETWORK"}' | python3 -m json.tool
|
| 362 |
+
```
|
| 363 |
+
|
| 364 |
+
Expected:
|
| 365 |
+
|
| 366 |
+
```json
|
| 367 |
+
{
|
| 368 |
+
"detail": "Task 'task3' expects field 'action', but got 'root_cause'."
|
| 369 |
+
}
|
| 370 |
+
```
|
| 371 |
+
|
| 372 |
+
## 6. Run automated tests
|
| 373 |
+
|
| 374 |
+
```bash
|
| 375 |
+
cd /Users/adityagaba/Downloads/incident-triage-env
|
| 376 |
+
.venv/bin/python -m unittest discover -s tests -v
|
| 377 |
+
```
|
| 378 |
+
|
| 379 |
+
Expected:
|
| 380 |
+
|
| 381 |
+
```text
|
| 382 |
+
OK
|
| 383 |
+
```
|
| 384 |
+
|
| 385 |
+
## 7. Run OpenEnv local validation
|
| 386 |
+
|
| 387 |
+
```bash
|
| 388 |
+
cd /Users/adityagaba/Downloads/incident-triage-env
|
| 389 |
+
.venv/bin/openenv validate . --json
|
| 390 |
+
```
|
| 391 |
+
|
| 392 |
+
Expected:
|
| 393 |
+
|
| 394 |
+
```json
|
| 395 |
+
"passed": true
|
| 396 |
+
```
|
| 397 |
+
|
| 398 |
+
## 8. Run the baseline inference script
|
| 399 |
+
|
| 400 |
+
If the local app is running on port `8000`:
|
| 401 |
+
|
| 402 |
+
```bash
|
| 403 |
+
cd /Users/adityagaba/Downloads/incident-triage-env
|
| 404 |
+
ENV_URL=http://127.0.0.1:8000 .venv/bin/python inference.py
|
| 405 |
+
```
|
| 406 |
+
|
| 407 |
+
Expected log format:
|
| 408 |
+
|
| 409 |
+
```text
|
| 410 |
+
[START] task=INC-001 env=incident-triage-env model=...
|
| 411 |
+
[STEP] step=1 action=SEV1 reward=1.00 done=true error=null
|
| 412 |
+
[END] success=true steps=1 score=1.00 rewards=1.00
|
| 413 |
+
```
|
| 414 |
+
|
| 415 |
+
If no server is reachable, `inference.py` falls back to an in-process FastAPI client.
|
| 416 |
+
|
| 417 |
+
## 9. Docker commands
|
| 418 |
+
|
| 419 |
+
If `docker` is available on PATH:
|
| 420 |
+
|
| 421 |
+
```bash
|
| 422 |
+
docker build -t incident-triage-env .
|
| 423 |
+
docker run --rm -p 8001:7860 incident-triage-env
|
| 424 |
+
```
|
| 425 |
+
|
| 426 |
+
If using Docker Desktop on macOS and `docker` is not on PATH:
|
| 427 |
+
|
| 428 |
+
```bash
|
| 429 |
+
export PATH=/Applications/Docker.app/Contents/Resources/bin:$PATH
|
| 430 |
+
/Applications/Docker.app/Contents/Resources/bin/docker build -t incident-triage-env .
|
| 431 |
+
/Applications/Docker.app/Contents/Resources/bin/docker run --rm -p 8001:7860 incident-triage-env
|
| 432 |
+
```
|
| 433 |
+
|
| 434 |
+
Then test:
|
| 435 |
+
|
| 436 |
+
```bash
|
| 437 |
+
curl -s http://127.0.0.1:8001/health | python3 -m json.tool
|
| 438 |
+
curl -s -X POST http://127.0.0.1:8001/reset -H "Content-Type: application/json" -d '{}' | python3 -m json.tool
|
| 439 |
+
```
|
| 440 |
+
|
| 441 |
+
Open Docker UI routes:
|
| 442 |
+
|
| 443 |
+
```text
|
| 444 |
+
http://127.0.0.1:8001/
|
| 445 |
+
http://127.0.0.1:8001/status
|
| 446 |
+
http://127.0.0.1:8001/playground
|
| 447 |
+
http://127.0.0.1:8001/docs
|
| 448 |
+
```
|
| 449 |
+
|
| 450 |
+
Expected:
|
| 451 |
+
|
| 452 |
+
- `/health` returns `{"status": "healthy"}`
|
| 453 |
+
- `/reset` returns `observation`, `reward`, `done`, and `info`
|
| 454 |
+
- `/` shows the landing page
|
| 455 |
+
- `/status` shows the live dashboard
|
| 456 |
+
- `/playground` lets you test incidents from the browser
|
| 457 |
+
|
| 458 |
+
## 10. Live Hugging Face Space validation
|
| 459 |
+
|
| 460 |
+
Replace `<space-url>` with the actual public URL:
|
| 461 |
+
|
| 462 |
+
```bash
|
| 463 |
+
curl -s <space-url>/health | python3 -m json.tool
|
| 464 |
+
curl -s -X POST <space-url>/reset -H "Content-Type: application/json" -d '{}' | python3 -m json.tool
|
| 465 |
+
.venv/bin/openenv validate --url <space-url> --timeout 10
|
| 466 |
+
```
|
| 467 |
+
|
| 468 |
+
Expected:
|
| 469 |
+
|
| 470 |
+
- `/health` returns `{"status": "healthy"}`
|
| 471 |
+
- `/reset` returns `200` with a typed environment response
|
| 472 |
+
- `openenv validate --url` returns `"passed": true`
|
| 473 |
+
|
| 474 |
+
## 11. Common issues
|
| 475 |
+
|
| 476 |
+
### Port 7860 is busy
|
| 477 |
+
|
| 478 |
+
Use port `8000` locally:
|
| 479 |
+
|
| 480 |
+
```bash
|
| 481 |
+
.venv/bin/python -m uvicorn app:app --host 127.0.0.1 --port 8000
|
| 482 |
+
```
|
| 483 |
+
|
| 484 |
+
### Root URL returns Not Found
|
| 485 |
+
|
| 486 |
+
This should no longer happen after the UI change. The root route `/` now serves the landing page.
|
| 487 |
+
|
| 488 |
+
### Playground says session not found
|
| 489 |
+
|
| 490 |
+
Click `Reset Environment` first, then submit a step.
|
| 491 |
+
|
| 492 |
+
### Wrong task errors happen after completion
|
| 493 |
+
|
| 494 |
+
Each episode is single-step. To test validation errors, reset a fresh session first.
|
| 495 |
+
|
| 496 |
+
### Docker credential helper error
|
| 497 |
+
|
| 498 |
+
Run:
|
| 499 |
+
|
| 500 |
+
```bash
|
| 501 |
+
export PATH=/Applications/Docker.app/Contents/Resources/bin:$PATH
|
| 502 |
+
```
|
| 503 |
+
|
| 504 |
+
Then retry the Docker command.
|
Dockerfile
CHANGED
|
@@ -1,12 +1,16 @@
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
COPY requirements.txt .
|
| 6 |
-
RUN pip install -
|
| 7 |
|
| 8 |
COPY . .
|
| 9 |
|
| 10 |
EXPOSE 7860
|
| 11 |
|
| 12 |
-
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
| 3 |
+
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 4 |
+
PYTHONUNBUFFERED=1 \
|
| 5 |
+
PIP_NO_CACHE_DIR=1
|
| 6 |
+
|
| 7 |
WORKDIR /app
|
| 8 |
|
| 9 |
COPY requirements.txt .
|
| 10 |
+
RUN python -m pip install -r requirements.txt
|
| 11 |
|
| 12 |
COPY . .
|
| 13 |
|
| 14 |
EXPOSE 7860
|
| 15 |
|
| 16 |
+
CMD ["python", "-m", "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
README.md
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Incident Triage OpenEnv
|
| 3 |
+
colorFrom: red
|
| 4 |
+
colorTo: blue
|
| 5 |
+
sdk: docker
|
| 6 |
+
app_port: 7860
|
| 7 |
+
pinned: false
|
| 8 |
+
tags:
|
| 9 |
+
- openenv
|
| 10 |
+
- fastapi
|
| 11 |
+
- reinforcement-learning
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
# Production Incident Triage Environment
|
| 15 |
+
|
| 16 |
+
This project is an OpenEnv-compatible evaluation environment for production incident response. An agent receives a typed incident observation and must perform one of three real-world triage tasks: classify severity, identify the most likely root cause, or recommend the best immediate action.
|
| 17 |
+
|
| 18 |
+
The environment is built for the OpenEnv hackathon requirements:
|
| 19 |
+
- real-world utility
|
| 20 |
+
- three graded tasks with easy, medium, and hard difficulty
|
| 21 |
+
- typed observation, action, reward, and state models
|
| 22 |
+
- deterministic reward logic with partial credit
|
| 23 |
+
- root-level `inference.py`
|
| 24 |
+
- Docker-based deployment for Hugging Face Spaces
|
| 25 |
+
|
| 26 |
+
## Overview
|
| 27 |
+
|
| 28 |
+
The dataset contains 36 incidents across three task families:
|
| 29 |
+
|
| 30 |
+
| Task | Difficulty | Count | Objective |
|
| 31 |
+
|---|---|---:|---|
|
| 32 |
+
| `task1` | easy | 11 | Predict incident severity as `SEV1`, `SEV2`, or `SEV3` |
|
| 33 |
+
| `task2` | medium | 12 | Predict the most likely root cause domain |
|
| 34 |
+
| `task3` | hard | 13 | Predict the best immediate operational action |
|
| 35 |
+
|
| 36 |
+
The incidents cover realistic production scenarios such as payment failures, queue backlogs, regional network loss, failed deploys, infrastructure saturation, third-party degradation, and failover decisions.
|
| 37 |
+
|
| 38 |
+
## API
|
| 39 |
+
|
| 40 |
+
The FastAPI app exposes the following endpoints on port `7860`:
|
| 41 |
+
|
| 42 |
+
- `GET /health`
|
| 43 |
+
- `GET /metadata`
|
| 44 |
+
- `GET /tasks`
|
| 45 |
+
- `GET /grader`
|
| 46 |
+
- `GET /schema`
|
| 47 |
+
- `POST /reset`
|
| 48 |
+
- `POST /step`
|
| 49 |
+
- `GET /state`
|
| 50 |
+
- `POST /mcp`
|
| 51 |
+
|
| 52 |
+
### Reset
|
| 53 |
+
|
| 54 |
+
`POST /reset` starts a new single-step episode.
|
| 55 |
+
|
| 56 |
+
Optional request body:
|
| 57 |
+
|
| 58 |
+
```json
|
| 59 |
+
{
|
| 60 |
+
"task_type": "task1",
|
| 61 |
+
"ticket_id": "INC-001",
|
| 62 |
+
"seed": 42
|
| 63 |
+
}
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
Response fields:
|
| 67 |
+
- `observation`
|
| 68 |
+
- `reward`
|
| 69 |
+
- `done`
|
| 70 |
+
- `info`
|
| 71 |
+
|
| 72 |
+
### Step
|
| 73 |
+
|
| 74 |
+
`POST /step?session_id=<id>` accepts an `IncidentAction` and returns a typed `StepResult`.
|
| 75 |
+
|
| 76 |
+
Example request:
|
| 77 |
+
|
| 78 |
+
```json
|
| 79 |
+
{
|
| 80 |
+
"incident_id": "INC-001",
|
| 81 |
+
"task_type": "task1",
|
| 82 |
+
"severity": "SEV1"
|
| 83 |
+
}
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
### State
|
| 87 |
+
|
| 88 |
+
`GET /state?session_id=<id>` returns the current typed `IncidentState`.
|
| 89 |
+
|
| 90 |
+
## Web UI
|
| 91 |
+
|
| 92 |
+
The project also serves a browser-facing UI from the same FastAPI app:
|
| 93 |
+
|
| 94 |
+
- `/` shows the landing page with project overview and task summary
|
| 95 |
+
- `/status` shows live health, schema, and task readiness information
|
| 96 |
+
- `/playground` lets you manually reset a session and submit a step from the browser
|
| 97 |
+
- `/docs` provides the generated FastAPI API reference
|
| 98 |
+
|
| 99 |
+
## Models
|
| 100 |
+
|
| 101 |
+
The core models are defined in [models.py](models.py):
|
| 102 |
+
|
| 103 |
+
- `IncidentObservation`
|
| 104 |
+
- `IncidentAction`
|
| 105 |
+
- `IncidentReward`
|
| 106 |
+
- `StepResult`
|
| 107 |
+
- `IncidentState`
|
| 108 |
+
- `ResetRequest`
|
| 109 |
+
|
| 110 |
+
Validation rules:
|
| 111 |
+
- `incident_id` must match the active ticket
|
| 112 |
+
- `task_type` must match the active ticket
|
| 113 |
+
- exactly one of `severity`, `root_cause`, or `action` must be populated
|
| 114 |
+
- the populated field must match the expected field for the task
|
| 115 |
+
|
| 116 |
+
## Reward Logic
|
| 117 |
+
|
| 118 |
+
Reward calculation is deterministic and implemented in [graders.py](graders.py).
|
| 119 |
+
|
| 120 |
+
- `task1`: `1.0` exact, `0.5` adjacent severity, `0.0` far miss
|
| 121 |
+
- `task2`: `1.0` exact, `0.5` related domain, `0.25` `UNKNOWN`, `0.0` wrong
|
| 122 |
+
- `task3`: `1.0` exact, `0.4` safe `INVESTIGATE` fallback, `0.25` related action, `0.0` wrong
|
| 123 |
+
|
| 124 |
+
This keeps grading reproducible while still giving partial-credit trajectory signal.
|
| 125 |
+
|
| 126 |
+
## Repository Layout
|
| 127 |
+
|
| 128 |
+
```text
|
| 129 |
+
incident-triage-env/
|
| 130 |
+
- app.py
|
| 131 |
+
- client.py
|
| 132 |
+
- environment.py
|
| 133 |
+
- graders.py
|
| 134 |
+
- incidents.py
|
| 135 |
+
- inference.py
|
| 136 |
+
- models.py
|
| 137 |
+
- openenv.yaml
|
| 138 |
+
- pyproject.toml
|
| 139 |
+
- requirements.txt
|
| 140 |
+
- Dockerfile
|
| 141 |
+
- README.md
|
| 142 |
+
- server/
|
| 143 |
+
- tests/
|
| 144 |
+
```
|
| 145 |
+
|
| 146 |
+
Runtime flow:
|
| 147 |
+
1. `incidents.py` stores the ticket dataset.
|
| 148 |
+
2. `environment.py` selects the episode and applies grading.
|
| 149 |
+
3. `app.py` exposes the API surface.
|
| 150 |
+
4. `inference.py` runs the baseline over the environment.
|
| 151 |
+
5. `graders.py` calculates deterministic reward and explanations.
|
| 152 |
+
|
| 153 |
+
## Local Setup
|
| 154 |
+
|
| 155 |
+
Install dependencies:
|
| 156 |
+
|
| 157 |
+
```bash
|
| 158 |
+
pip install -r requirements.txt
|
| 159 |
+
```
|
| 160 |
+
|
| 161 |
+
Optional OpenEnv CLI:
|
| 162 |
+
|
| 163 |
+
```bash
|
| 164 |
+
pip install openenv-core
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
Optional environment variables for `inference.py`:
|
| 168 |
+
|
| 169 |
+
```bash
|
| 170 |
+
export API_BASE_URL="https://your-openai-compatible-endpoint/v1"
|
| 171 |
+
export MODEL_NAME="your-model-name"
|
| 172 |
+
export HF_TOKEN="your-api-key"
|
| 173 |
+
export ENV_URL="http://localhost:7860"
|
| 174 |
+
```
|
| 175 |
+
|
| 176 |
+
If no external environment server is reachable, `inference.py` falls back to an in-process FastAPI client.
|
| 177 |
+
|
| 178 |
+
## Run Locally
|
| 179 |
+
|
| 180 |
+
Start the server:
|
| 181 |
+
|
| 182 |
+
```bash
|
| 183 |
+
uvicorn app:app --host 0.0.0.0 --port 7860
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
Run the baseline:
|
| 187 |
+
|
| 188 |
+
```bash
|
| 189 |
+
python inference.py
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
Run the smoke tests:
|
| 193 |
+
|
| 194 |
+
```bash
|
| 195 |
+
python -m unittest discover -s tests -v
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
## Docker
|
| 199 |
+
|
| 200 |
+
Build the image:
|
| 201 |
+
|
| 202 |
+
```bash
|
| 203 |
+
docker build -t incident-triage-env .
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
Run the container:
|
| 207 |
+
|
| 208 |
+
```bash
|
| 209 |
+
docker run --rm -p 7860:7860 incident-triage-env
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
Check health:
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
curl http://localhost:7860/health
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
## Baseline Logging
|
| 219 |
+
|
| 220 |
+
`inference.py` prints the required structured output:
|
| 221 |
+
|
| 222 |
+
```text
|
| 223 |
+
[START] task=INC-001 env=incident-triage-env model=deterministic-baseline
|
| 224 |
+
[STEP] step=1 action=SEV1 reward=1.00 done=true error=null
|
| 225 |
+
[END] success=true steps=1 score=1.00 rewards=1.00
|
| 226 |
+
```
|
| 227 |
+
|
| 228 |
+
## Baseline Scores
|
| 229 |
+
|
| 230 |
+
Latest local deterministic baseline:
|
| 231 |
+
|
| 232 |
+
| Metric | Value |
|
| 233 |
+
|---|---:|
|
| 234 |
+
| Episodes | 36 |
|
| 235 |
+
| Average score | 0.9861 |
|
| 236 |
+
| `task1` average | 1.0000 |
|
| 237 |
+
| `task2` average | 0.9583 |
|
| 238 |
+
| `task3` average | 1.0000 |
|
| 239 |
+
|
| 240 |
+
These results are written to `outputs/baseline_scores.json`.
|
| 241 |
+
|
| 242 |
+
## Quick API Example
|
| 243 |
+
|
| 244 |
+
Reset:
|
| 245 |
+
|
| 246 |
+
```bash
|
| 247 |
+
curl -X POST http://localhost:7860/reset \
|
| 248 |
+
-H "Content-Type: application/json" \
|
| 249 |
+
-d '{"task_type":"task1","ticket_id":"INC-001"}'
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
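Step (replace `<session-id>` with the `session_id` value returned under `info` in the `/reset` response):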
|
| 253 |
+
|
| 254 |
+
```bash
|
| 255 |
+
curl -X POST "http://localhost:7860/step?session_id=<session-id>" \
|
| 256 |
+
-H "Content-Type: application/json" \
|
| 257 |
+
-d '{
|
| 258 |
+
"incident_id": "INC-001",
|
| 259 |
+
"task_type": "task1",
|
| 260 |
+
"severity": "SEV1"
|
| 261 |
+
}'
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
## Pre-Submission Checklist
|
| 265 |
+
|
| 266 |
+
- `openenv validate . --json` passes
|
| 267 |
+
- `openenv validate --url <space-url>` passes
|
| 268 |
+
- `POST /reset` returns `200`
|
| 269 |
+
- `POST /step` returns typed `reward`, `done`, and `info`
|
| 270 |
+
- `GET /state` works for active sessions
|
| 271 |
+
- `inference.py` runs from the repo root
|
| 272 |
+
- `Dockerfile` serves the app on port `7860`
|
| 273 |
+
- `openenv.yaml` matches the current API and dataset counts
|
| 274 |
+
|
| 275 |
+
## Notes
|
| 276 |
+
|
| 277 |
+
- `models.py` is the source of truth for valid enum labels.
|
| 278 |
+
- `graders.py` is the source of truth for scoring logic.
|
| 279 |
+
- The environment is intentionally single-step per episode, but it still exposes typed state for validation and debugging.
|
Readme.md
DELETED
|
@@ -1,375 +0,0 @@
|
|
| 1 |
-
# 🚨 Production Incident Triage Environment
|
| 2 |
-
|
| 3 |
-
An OpenEnv-compatible backend evaluation system where an AI agent triages production incidents like a real SRE (Site Reliability Engineer). Built for deterministic, RL-style evaluation — no UI, no chatbot, pure backend.
|
| 4 |
-
|
| 5 |
-
---
|
| 6 |
-
|
| 7 |
-
## 📌 What This Is
|
| 8 |
-
|
| 9 |
-
This is **not** a chatbot. It is a structured evaluation environment where:
|
| 10 |
-
|
| 11 |
-
1. Environment returns a production incident (alert + context)
|
| 12 |
-
2. AI agent reads the incident
|
| 13 |
-
3. Agent returns a structured JSON action
|
| 14 |
-
4. Environment sends action to a deterministic grader
|
| 15 |
-
5. Grader compares against ground truth
|
| 16 |
-
6. Returns a score between `0.0` and `1.0`
|
| 17 |
-
|
| 18 |
-
---
|
| 19 |
-
|
| 20 |
-
## 🗂️ Project Structure
|
| 21 |
-
|
| 22 |
-
```
|
| 23 |
-
Incident_Triage/
|
| 24 |
-
│
|
| 25 |
-
├── models.py # Pydantic schemas — source of truth for all types
|
| 26 |
-
├── incidents.py # Dataset of 15 production incidents
|
| 27 |
-
├── inference.py # LLM agent (Mistral via NVIDIA API)
|
| 28 |
-
├── openenv.yaml # OpenEnv submission config
|
| 29 |
-
├── pyproject.toml # Project metadata
|
| 30 |
-
├── requirements.txt # Dependencies
|
| 31 |
-
├── README.md
|
| 32 |
-
│
|
| 33 |
-
└── server/
|
| 34 |
-
├── __init__.py # Empty — do not add imports here
|
| 35 |
-
├── app.py # FastAPI server
|
| 36 |
-
├── environment.py # Core RL-style logic (reset / step)
|
| 37 |
-
├── graders.py # Deterministic scoring functions
|
| 38 |
-
├── Dockerfile
|
| 39 |
-
└── requirements.txt
|
| 40 |
-
```
|
| 41 |
-
|
| 42 |
-
---
|
| 43 |
-
|
| 44 |
-
## ⚙️ Setup
|
| 45 |
-
|
| 46 |
-
### 1. Clone and install dependencies
|
| 47 |
-
|
| 48 |
-
```bash
|
| 49 |
-
git clone <your-repo-url>
|
| 50 |
-
cd Incident_Triage
|
| 51 |
-
pip install -r requirements.txt
|
| 52 |
-
```
|
| 53 |
-
|
| 54 |
-
### 2. Set your NVIDIA / Mistral API key
|
| 55 |
-
|
| 56 |
-
```bash
|
| 57 |
-
# Windows
|
| 58 |
-
set NVIDIA_API_KEY=your_nvidia_api_key_here
|
| 59 |
-
|
| 60 |
-
# Mac / Linux
|
| 61 |
-
export NVIDIA_API_KEY=your_nvidia_api_key_here
|
| 62 |
-
```
|
| 63 |
-
|
| 64 |
-
### 3. Start the server
|
| 65 |
-
|
| 66 |
-
```bash
|
| 67 |
-
uvicorn server.app:app --reload
|
| 68 |
-
```
|
| 69 |
-
|
| 70 |
-
Server runs at: `http://localhost:8000`
|
| 71 |
-
|
| 72 |
-
### 4. Run the agent
|
| 73 |
-
|
| 74 |
-
```bash
|
| 75 |
-
python inference.py
|
| 76 |
-
```
|
| 77 |
-
|
| 78 |
-
---
|
| 79 |
-
|
| 80 |
-
## 🔗 API Endpoints
|
| 81 |
-
|
| 82 |
-
### `GET /tasks`
|
| 83 |
-
Returns available task types and their descriptions.
|
| 84 |
-
|
| 85 |
-
**Response:**
|
| 86 |
-
```json
|
| 87 |
-
{
|
| 88 |
-
"tasks": {
|
| 89 |
-
"task1": "Severity Classification → SeverityLevel enum",
|
| 90 |
-
"task2": "Root Cause Category → RootCauseCategory enum",
|
| 91 |
-
"task3": "Recommended Action → RecommendedAction enum"
|
| 92 |
-
}
|
| 93 |
-
}
|
| 94 |
-
```
|
| 95 |
-
|
| 96 |
-
---
|
| 97 |
-
|
| 98 |
-
### `POST /reset`
|
| 99 |
-
Resets the environment and returns a new incident for the agent to triage.
|
| 100 |
-
|
| 101 |
-
**Query Params:**
|
| 102 |
-
|
| 103 |
-
| Param | Type | Required | Description |
|
| 104 |
-
|---|---|---|---|
|
| 105 |
-
| `task_type` | string | No | Filter by `task1`, `task2`, or `task3`. If omitted, picks any incident randomly. |
|
| 106 |
-
|
| 107 |
-
**Example:**
|
| 108 |
-
```bash
|
| 109 |
-
curl -X POST "http://localhost:8000/reset?task_type=task1"
|
| 110 |
-
```
|
| 111 |
-
|
| 112 |
-
**Response:**
|
| 113 |
-
```json
|
| 114 |
-
{
|
| 115 |
-
"incident_id": "INC-001",
|
| 116 |
-
"task_type": "task1",
|
| 117 |
-
"alert_text": "[CRITICAL] Payment service returning HTTP 503. Error rate: 94%.",
|
| 118 |
-
"context": {
|
| 119 |
-
"service": "payment-service",
|
| 120 |
-
"error_rate_pct": 94,
|
| 121 |
-
"affected_users": 120000,
|
| 122 |
-
"region": "us-east-1"
|
| 123 |
-
}
|
| 124 |
-
}
|
| 125 |
-
```
|
| 126 |
-
|
| 127 |
-
---
|
| 128 |
-
|
| 129 |
-
### `POST /step`
|
| 130 |
-
Submits the agent's action and returns a graded result.
|
| 131 |
-
|
| 132 |
-
**Request Body:**
|
| 133 |
-
```json
|
| 134 |
-
{
|
| 135 |
-
"incident_id": "INC-001",
|
| 136 |
-
"task_type": "task1",
|
| 137 |
-
"severity": "SEV1",
|
| 138 |
-
"root_cause": null,
|
| 139 |
-
"action": null
|
| 140 |
-
}
|
| 141 |
-
```
|
| 142 |
-
|
| 143 |
-
> Only populate the field relevant to the `task_type`. Set others to `null`.
|
| 144 |
-
|
| 145 |
-
**Response:**
|
| 146 |
-
```json
|
| 147 |
-
{
|
| 148 |
-
"incident_id": "INC-001",
|
| 149 |
-
"task_type": "task1",
|
| 150 |
-
"reward": 1.0,
|
| 151 |
-
"correct": true,
|
| 152 |
-
"ground_truth": "SEV1",
|
| 153 |
-
"agent_answer": "SEV1"
|
| 154 |
-
}
|
| 155 |
-
```
|
| 156 |
-
|
| 157 |
-
| Field | Type | Description |
|
| 158 |
-
|---|---|---|
|
| 159 |
-
| `reward` | float | `1.0` = correct, `0.0` = wrong |
|
| 160 |
-
| `correct` | bool | True if reward == 1.0 |
|
| 161 |
-
| `ground_truth` | string | Expected answer |
|
| 162 |
-
| `agent_answer` | string | What agent returned |
|
| 163 |
-
|
| 164 |
-
---
|
| 165 |
-
|
| 166 |
-
### `GET /grader`
|
| 167 |
-
Returns grader configuration for transparency.
|
| 168 |
-
|
| 169 |
-
**Response:**
|
| 170 |
-
```json
|
| 171 |
-
{
|
| 172 |
-
"grading": "deterministic",
|
| 173 |
-
"scoring": "binary (0.0 or 1.0)",
|
| 174 |
-
"tasks": {
|
| 175 |
-
"task1": "action.severity == ground_truth.severity",
|
| 176 |
-
"task2": "action.root_cause == ground_truth.root_cause",
|
| 177 |
-
"task3": "action.action == ground_truth.action"
|
| 178 |
-
}
|
| 179 |
-
}
|
| 180 |
-
```
|
| 181 |
-
|
| 182 |
-
---
|
| 183 |
-
|
| 184 |
-
## 📋 Enum Reference
|
| 185 |
-
|
| 186 |
-
All agent outputs must use **exactly** these enum values (case-sensitive):
|
| 187 |
-
|
| 188 |
-
### Task 1 — Severity Classification (`severity` field)
|
| 189 |
-
|
| 190 |
-
| Value | Meaning |
|
| 191 |
-
|---|---|
|
| 192 |
-
| `SEV1` | Total outage / confirmed revenue impact |
|
| 193 |
-
| `SEV2` | Partial outage / degraded performance |
|
| 194 |
-
| `SEV3` | Minor / cosmetic / internal only |
|
| 195 |
-
|
| 196 |
-
### Task 2 — Root Cause Category (`root_cause` field)
|
| 197 |
-
|
| 198 |
-
| Value | Meaning |
|
| 199 |
-
|---|---|
|
| 200 |
-
| `DATABASE` | DB lag, connection pool, replica issues |
|
| 201 |
-
| `NETWORK` | Packet loss, BGP flap, cross-region failures |
|
| 202 |
-
| `APPLICATION` | Code bug, exception, bad deploy |
|
| 203 |
-
| `INFRASTRUCTURE` | Kubernetes, EC2, spot interruption |
|
| 204 |
-
| `THIRD_PARTY` | Stripe, SendGrid, external vendor |
|
| 205 |
-
| `UNKNOWN` | Cannot determine root cause |
|
| 206 |
-
|
| 207 |
-
### Task 3 — Recommended Action (`action` field)
|
| 208 |
-
|
| 209 |
-
| Value | Meaning |
|
| 210 |
-
|---|---|
|
| 211 |
-
| `ROLLBACK` | Revert to last stable deploy |
|
| 212 |
-
| `SCALE_UP` | Increase replicas / resources |
|
| 213 |
-
| `RESTART_SERVICE` | Restart stuck / deadlocked process |
|
| 214 |
-
| `FAILOVER` | Switch to replica / standby |
|
| 215 |
-
| `NOTIFY_VENDOR` | Escalate to third-party vendor |
|
| 216 |
-
| `INVESTIGATE` | Need more info before acting |
|
| 217 |
-
| `NO_ACTION` | Monitor only, no action needed |
|
| 218 |
-
|
| 219 |
-
---
|
| 220 |
-
|
| 221 |
-
## 🤖 Agent JSON Format
|
| 222 |
-
|
| 223 |
-
The agent must return **strict JSON only** — no markdown, no explanation, no extra text.
|
| 224 |
-
|
| 225 |
-
```json
|
| 226 |
-
{
|
| 227 |
-
"incident_id": "INC-006",
|
| 228 |
-
"task_type": "task2",
|
| 229 |
-
"severity": null,
|
| 230 |
-
"root_cause": "DATABASE",
|
| 231 |
-
"action": null
|
| 232 |
-
}
|
| 233 |
-
```
|
| 234 |
-
|
| 235 |
-
Rules:
|
| 236 |
-
- `incident_id` must match the one returned by `/reset`
|
| 237 |
-
- `task_type` must match the one returned by `/reset`
|
| 238 |
-
- Only one field (`severity`, `root_cause`, or `action`) should be non-null
|
| 239 |
-
- The non-null field must use a valid enum value
|
| 240 |
-
|
| 241 |
-
---
|
| 242 |
-
|
| 243 |
-
## 🧠 How Grading Works
|
| 244 |
-
|
| 245 |
-
Grading is **fully deterministic** — no LLM is used inside the grader.
|
| 246 |
-
|
| 247 |
-
```
|
| 248 |
-
agent_answer == ground_truth → reward: 1.0 (correct)
|
| 249 |
-
agent_answer != ground_truth → reward: 0.0 (wrong)
|
| 250 |
-
missing field (null) → reward: 0.0 (wrong)
|
| 251 |
-
```
|
| 252 |
-
|
| 253 |
-
Scoring is binary because incident triage is a classification task. A wrong severity leads to a wrong on-call response — partial credit would mask bad agent behavior.
|
| 254 |
-
|
| 255 |
-
---
|
| 256 |
-
|
| 257 |
-
## 🧪 Quick Test (curl)
|
| 258 |
-
|
| 259 |
-
```bash
|
| 260 |
-
# 1. Check available tasks
|
| 261 |
-
curl http://localhost:8000/tasks
|
| 262 |
-
|
| 263 |
-
# 2. Get a task1 incident
|
| 264 |
-
curl -X POST "http://localhost:8000/reset?task_type=task1"
|
| 265 |
-
|
| 266 |
-
# 3. Submit agent action (replace incident_id with one from step 2)
|
| 267 |
-
curl -X POST http://localhost:8000/step \
|
| 268 |
-
-H "Content-Type: application/json" \
|
| 269 |
-
-d '{"incident_id": "INC-001", "task_type": "task1", "severity": "SEV1", "root_cause": null, "action": null}'
|
| 270 |
-
|
| 271 |
-
# 4. Check grader config
|
| 272 |
-
curl http://localhost:8000/grader
|
| 273 |
-
```
|
| 274 |
-
|
| 275 |
-
---
|
| 276 |
-
|
| 277 |
-
## 📊 Dataset Overview
|
| 278 |
-
|
| 279 |
-
15 production incidents across 3 task types (5 per task):
|
| 280 |
-
|
| 281 |
-
| Task | Incidents | What agent classifies |
|
| 282 |
-
|---|---|---|
|
| 283 |
-
| `task1` | INC-001 to INC-005 | Severity level |
|
| 284 |
-
| `task2` | INC-006 to INC-010 | Root cause category |
|
| 285 |
-
| `task3` | INC-011 to INC-015 | Recommended action |
|
| 286 |
-
|
| 287 |
-
Incident types include: payment outages, DB replica lag, Kubernetes node failures, BGP flapping, bad deploys, vendor degradations, memory deadlocks, and more.
|
| 288 |
-
|
| 289 |
-
---
|
| 290 |
-
|
| 291 |
-
## 🔧 Inference Script (Mistral via NVIDIA API)
|
| 292 |
-
|
| 293 |
-
`inference.py` uses the Mistral model via NVIDIA's OpenAI-compatible API endpoint.
|
| 294 |
-
|
| 295 |
-
Update the client in `inference.py`:
|
| 296 |
-
|
| 297 |
-
```python
|
| 298 |
-
from openai import OpenAI
|
| 299 |
-
|
| 300 |
-
client = OpenAI(
|
| 301 |
-
base_url="https://integrate.api.nvidia.com/v1",
|
| 302 |
-
api_key=os.environ["NVIDIA_API_KEY"]
|
| 303 |
-
)
|
| 304 |
-
|
| 305 |
-
response = client.chat.completions.create(
|
| 306 |
-
model="mistralai/mistral-7b-instruct-v0.3",
|
| 307 |
-
messages=[
|
| 308 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 309 |
-
{"role": "user", "content": build_user_prompt(observation)}
|
| 310 |
-
],
|
| 311 |
-
max_tokens=256,
|
| 312 |
-
temperature=0.0
|
| 313 |
-
)
|
| 314 |
-
|
| 315 |
-
raw = response.choices[0].message.content.strip()
|
| 316 |
-
```
|
| 317 |
-
|
| 318 |
-
> `temperature=0.0` is critical — keeps outputs deterministic across runs.
|
| 319 |
-
|
| 320 |
-
---
|
| 321 |
-
|
| 322 |
-
## 📦 Requirements
|
| 323 |
-
|
| 324 |
-
```
|
| 325 |
-
fastapi
|
| 326 |
-
uvicorn
|
| 327 |
-
pydantic
|
| 328 |
-
openai
|
| 329 |
-
requests
|
| 330 |
-
```
|
| 331 |
-
|
| 332 |
-
Install:
|
| 333 |
-
```bash
|
| 334 |
-
pip install fastapi uvicorn pydantic openai requests
|
| 335 |
-
```
|
| 336 |
-
|
| 337 |
-
---
|
| 338 |
-
|
| 339 |
-
## 🚀 Run Full Evaluation
|
| 340 |
-
|
| 341 |
-
```bash
|
| 342 |
-
# Terminal 1
|
| 343 |
-
uvicorn server.app:app --reload
|
| 344 |
-
|
| 345 |
-
# Terminal 2
|
| 346 |
-
python inference.py
|
| 347 |
-
```
|
| 348 |
-
|
| 349 |
-
Expected output:
|
| 350 |
-
```
|
| 351 |
-
==================================================
|
| 352 |
-
Incident : INC-003
|
| 353 |
-
Task : task1
|
| 354 |
-
Alert : [INFO] Admin dashboard CSS assets returning 404...
|
| 355 |
-
|
| 356 |
-
LLM Raw : {"incident_id": "INC-003", "task_type": "task1", "severity": "SEV3", "root_cause": null, "action": null}
|
| 357 |
-
Answer : SEV3
|
| 358 |
-
Expected : SEV3
|
| 359 |
-
Correct : True | Reward: 1.0
|
| 360 |
-
|
| 361 |
-
==================================================
|
| 362 |
-
Total Episodes : 15
|
| 363 |
-
Total Correct : 13
|
| 364 |
-
Accuracy : 86.7%
|
| 365 |
-
```
|
| 366 |
-
|
| 367 |
-
---
|
| 368 |
-
|
| 369 |
-
## 📝 Important Rules
|
| 370 |
-
|
| 371 |
-
- Never modify enum values in `models.py` — graders depend on exact string matching
|
| 372 |
-
- Never add LLM calls inside `graders.py` — grading must be deterministic
|
| 373 |
-
- Always call `/reset` before `/step` — environment maintains current incident state
|
| 374 |
-
- `server/__init__.py` must stay empty — do not add imports there
|
| 375 |
-
- Always run uvicorn from the project root: `uvicorn server.app:app --reload`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__init__.py
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""Incident Triage Environment."""
|
| 8 |
-
|
| 9 |
-
from .client import IncidentTriageEnv
|
| 10 |
-
from .models import IncidentTriageAction, IncidentTriageObservation
|
| 11 |
-
|
| 12 |
-
__all__ = [
|
| 13 |
-
"IncidentTriageAction",
|
| 14 |
-
"IncidentTriageObservation",
|
| 15 |
-
"IncidentTriageEnv",
|
| 16 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,77 +1,225 @@
|
|
| 1 |
-
#----- Edited file--------------
|
| 2 |
-
# app.py
|
| 3 |
-
|
| 4 |
import uuid
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from fastapi import FastAPI, HTTPException
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
app = FastAPI(title="Incident Triage Environment")
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Session store: session_id -> IncidentEnv instance
|
| 13 |
sessions: dict[str, IncidentEnv] = {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
@app.get("/tasks")
|
| 17 |
def get_tasks():
|
| 18 |
return {
|
| 19 |
"tasks": {
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
}
|
| 24 |
}
|
| 25 |
|
| 26 |
|
| 27 |
-
@app.
|
| 28 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
session_id = str(uuid.uuid4())
|
| 30 |
env = IncidentEnv()
|
| 31 |
try:
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
except ValueError as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
raise HTTPException(status_code=400, detail=str(e))
|
| 35 |
sessions[session_id] = env
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
@app.post("/step", response_model=StepResult)
|
| 40 |
def step(action: IncidentAction, session_id: str):
|
| 41 |
env = sessions.get(session_id)
|
| 42 |
if not env:
|
|
|
|
| 43 |
raise HTTPException(status_code=404, detail="Session not found. Call /reset first.")
|
| 44 |
try:
|
| 45 |
result = env.step(action)
|
| 46 |
except (RuntimeError, ValueError) as e:
|
|
|
|
| 47 |
raise HTTPException(status_code=400, detail=str(e))
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
return result
|
| 51 |
|
| 52 |
|
| 53 |
-
@app.get("/state")
|
| 54 |
def state(session_id: str):
|
| 55 |
env = sessions.get(session_id)
|
| 56 |
-
if not env
|
|
|
|
| 57 |
raise HTTPException(status_code=404, detail="No active session.")
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
"
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
"
|
| 64 |
-
|
| 65 |
|
| 66 |
|
| 67 |
@app.get("/grader")
|
| 68 |
def get_grader_info():
|
| 69 |
return {
|
| 70 |
"grading": "deterministic",
|
| 71 |
-
"scoring": "task1: partial
|
| 72 |
"tasks": {
|
| 73 |
"task1": "exact=1.0, adjacent=0.5, far=0.0",
|
| 74 |
-
"task2": "
|
| 75 |
-
"task3": "
|
| 76 |
}
|
| 77 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import uuid
|
| 2 |
+
from collections import Counter
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import sys
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
from fastapi import FastAPI, HTTPException
|
| 8 |
+
from fastapi.responses import FileResponse
|
| 9 |
+
from fastapi.staticfiles import StaticFiles
|
| 10 |
+
|
| 11 |
+
from environment import IncidentEnv, TASK_SPECS
|
| 12 |
+
from incidents import TICKETS
|
| 13 |
+
from models import (
|
| 14 |
+
IncidentAction,
|
| 15 |
+
IncidentObservation,
|
| 16 |
+
IncidentReward,
|
| 17 |
+
IncidentState,
|
| 18 |
+
ResetRequest,
|
| 19 |
+
StepResult,
|
| 20 |
+
TaskType,
|
| 21 |
+
)
|
| 22 |
|
| 23 |
app = FastAPI(title="Incident Triage Environment")
|
| 24 |
+
UI_DIR = Path(__file__).parent / "ui"
|
| 25 |
+
ASSETS_DIR = UI_DIR / "assets"
|
| 26 |
|
| 27 |
# Session store: session_id -> IncidentEnv instance
|
| 28 |
sessions: dict[str, IncidentEnv] = {}
|
| 29 |
+
task_counts = Counter(ticket["task_type"] for ticket in TICKETS)
|
| 30 |
+
|
| 31 |
+
app.mount("/assets", StaticFiles(directory=ASSETS_DIR), name="assets")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def log_event(event: str, **fields: Any) -> None:
|
| 35 |
+
details = " ".join(f"{key}={value}" for key, value in fields.items())
|
| 36 |
+
print(f"[{event}] {details}", file=sys.stderr, flush=True)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@app.get("/", include_in_schema=False)
|
| 40 |
+
def home_page():
|
| 41 |
+
return FileResponse(UI_DIR / "index.html")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@app.get("/status", include_in_schema=False)
|
| 45 |
+
def status_page():
|
| 46 |
+
return FileResponse(UI_DIR / "status.html")
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@app.get("/playground", include_in_schema=False)
|
| 50 |
+
def playground_page():
|
| 51 |
+
return FileResponse(UI_DIR / "playground.html")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
@app.get("/health")
|
| 55 |
+
def health():
|
| 56 |
+
return {"status": "healthy"}
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@app.get("/metadata")
|
| 60 |
+
def metadata():
|
| 61 |
+
return {
|
| 62 |
+
"name": "incident-triage-env",
|
| 63 |
+
"description": "Production incident triage environment for severity, root-cause, and remediation decisions.",
|
| 64 |
+
"tasks": {
|
| 65 |
+
task_type.value: {
|
| 66 |
+
"name": spec["name"],
|
| 67 |
+
"difficulty": spec["difficulty"],
|
| 68 |
+
"expected_field": spec["expected_field"],
|
| 69 |
+
"allowed_values": spec["allowed_values"],
|
| 70 |
+
"ticket_count": task_counts[task_type.value],
|
| 71 |
+
}
|
| 72 |
+
for task_type, spec in TASK_SPECS.items()
|
| 73 |
+
},
|
| 74 |
+
"total_tickets": len(TICKETS),
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@app.get("/schema")
|
| 79 |
+
def schema():
|
| 80 |
+
return {
|
| 81 |
+
"action": IncidentAction.model_json_schema(),
|
| 82 |
+
"observation": IncidentObservation.model_json_schema(),
|
| 83 |
+
"reward": IncidentReward.model_json_schema(),
|
| 84 |
+
"state": IncidentState.model_json_schema(),
|
| 85 |
+
"step_result": StepResult.model_json_schema(),
|
| 86 |
+
}
|
| 87 |
|
| 88 |
|
| 89 |
@app.get("/tasks")
|
| 90 |
def get_tasks():
|
| 91 |
return {
|
| 92 |
"tasks": {
|
| 93 |
+
task_type.value: {
|
| 94 |
+
"name": spec["name"],
|
| 95 |
+
"difficulty": spec["difficulty"],
|
| 96 |
+
"expected_field": spec["expected_field"],
|
| 97 |
+
"allowed_values": spec["allowed_values"],
|
| 98 |
+
"ticket_count": task_counts[task_type.value],
|
| 99 |
+
}
|
| 100 |
+
for task_type, spec in TASK_SPECS.items()
|
| 101 |
}
|
| 102 |
}
|
| 103 |
|
| 104 |
|
| 105 |
+
@app.get("/tickets")
|
| 106 |
+
def get_tickets():
|
| 107 |
+
tickets = []
|
| 108 |
+
for ticket in TICKETS:
|
| 109 |
+
task_type = TaskType(ticket["task_type"])
|
| 110 |
+
spec = TASK_SPECS[task_type]
|
| 111 |
+
tickets.append(
|
| 112 |
+
{
|
| 113 |
+
"incident_id": ticket["incident_id"],
|
| 114 |
+
"task_type": ticket["task_type"],
|
| 115 |
+
"difficulty": spec["difficulty"],
|
| 116 |
+
"task_name": spec["name"],
|
| 117 |
+
"expected_field": spec["expected_field"],
|
| 118 |
+
"alert_preview": ticket["alert_text"][:120],
|
| 119 |
+
}
|
| 120 |
+
)
|
| 121 |
+
return {"tickets": tickets, "count": len(tickets)}
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
@app.post("/reset", response_model=StepResult)
|
| 125 |
+
def reset(reset_request: ResetRequest | None = None):
|
| 126 |
+
request = reset_request or ResetRequest()
|
| 127 |
session_id = str(uuid.uuid4())
|
| 128 |
env = IncidentEnv()
|
| 129 |
try:
|
| 130 |
+
result = env.reset(
|
| 131 |
+
task_type=request.task_type,
|
| 132 |
+
ticket_id=request.ticket_id,
|
| 133 |
+
seed=request.seed,
|
| 134 |
+
)
|
| 135 |
except ValueError as e:
|
| 136 |
+
log_event(
|
| 137 |
+
"RESET_ERROR",
|
| 138 |
+
task_type=request.task_type.value if request.task_type else "any",
|
| 139 |
+
ticket_id=request.ticket_id or "random",
|
| 140 |
+
error=str(e),
|
| 141 |
+
)
|
| 142 |
raise HTTPException(status_code=400, detail=str(e))
|
| 143 |
sessions[session_id] = env
|
| 144 |
+
result.info["session_id"] = session_id
|
| 145 |
+
result.info["state"] = env.state(session_id=session_id).model_dump()
|
| 146 |
+
log_event(
|
| 147 |
+
"RESET",
|
| 148 |
+
session_id=session_id,
|
| 149 |
+
incident_id=result.observation.incident_id,
|
| 150 |
+
task_type=result.observation.task_type.value,
|
| 151 |
+
expected_field=result.observation.expected_field,
|
| 152 |
+
)
|
| 153 |
+
return result
|
| 154 |
|
| 155 |
|
| 156 |
@app.post("/step", response_model=StepResult)
|
| 157 |
def step(action: IncidentAction, session_id: str):
|
| 158 |
env = sessions.get(session_id)
|
| 159 |
if not env:
|
| 160 |
+
log_event("STEP_ERROR", session_id=session_id, error="session_not_found")
|
| 161 |
raise HTTPException(status_code=404, detail="Session not found. Call /reset first.")
|
| 162 |
try:
|
| 163 |
result = env.step(action)
|
| 164 |
except (RuntimeError, ValueError) as e:
|
| 165 |
+
log_event("STEP_ERROR", session_id=session_id, incident_id=action.incident_id, error=str(e))
|
| 166 |
raise HTTPException(status_code=400, detail=str(e))
|
| 167 |
+
result.info["session_id"] = session_id
|
| 168 |
+
result.info["state"] = env.state(session_id=session_id).model_dump()
|
| 169 |
+
log_event(
|
| 170 |
+
"STEP",
|
| 171 |
+
session_id=session_id,
|
| 172 |
+
incident_id=action.incident_id,
|
| 173 |
+
task_type=action.task_type.value,
|
| 174 |
+
answer=action.selected_value() or "NONE",
|
| 175 |
+
reward=result.reward.value,
|
| 176 |
+
done=str(result.done).lower(),
|
| 177 |
+
)
|
| 178 |
return result
|
| 179 |
|
| 180 |
|
| 181 |
+
@app.get("/state", response_model=IncidentState)
|
| 182 |
def state(session_id: str):
|
| 183 |
env = sessions.get(session_id)
|
| 184 |
+
if not env:
|
| 185 |
+
log_event("STATE_ERROR", session_id=session_id, error="no_active_session")
|
| 186 |
raise HTTPException(status_code=404, detail="No active session.")
|
| 187 |
+
try:
|
| 188 |
+
current_state = env.state(session_id=session_id)
|
| 189 |
+
log_event("STATE", session_id=session_id, incident_id=current_state.incident_id, done=str(current_state.done).lower())
|
| 190 |
+
return current_state
|
| 191 |
+
except RuntimeError as e:
|
| 192 |
+
log_event("STATE_ERROR", session_id=session_id, error=str(e))
|
| 193 |
+
raise HTTPException(status_code=404, detail=str(e))
|
| 194 |
|
| 195 |
|
| 196 |
@app.get("/grader")
|
| 197 |
def get_grader_info():
|
| 198 |
return {
|
| 199 |
"grading": "deterministic",
|
| 200 |
+
"scoring": "task1: adjacent-severity partial credit; task2/task3: exact match plus conservative near-miss partial credit",
|
| 201 |
"tasks": {
|
| 202 |
"task1": "exact=1.0, adjacent=0.5, far=0.0",
|
| 203 |
+
"task2": "exact=1.0, related-domain=0.5, unknown=0.25, wrong=0.0",
|
| 204 |
+
"task3": "exact=1.0, investigate fallback=0.4, related response=0.25, wrong=0.0",
|
| 205 |
}
|
| 206 |
+
}
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
@app.post("/mcp")
|
| 210 |
+
def mcp(payload: dict[str, Any] | None = None):
|
| 211 |
+
request = payload or {}
|
| 212 |
+
method = request.get("method")
|
| 213 |
+
rpc_id = request.get("id")
|
| 214 |
+
|
| 215 |
+
if method == "ping":
|
| 216 |
+
result: dict[str, Any] = {"status": "ok"}
|
| 217 |
+
elif method == "tools/list":
|
| 218 |
+
result = {"tools": []}
|
| 219 |
+
else:
|
| 220 |
+
result = {
|
| 221 |
+
"status": "ok",
|
| 222 |
+
"message": "Incident triage environment does not expose MCP tools.",
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
return {"jsonrpc": "2.0", "id": rpc_id, "result": result}
|
client.py
CHANGED
|
@@ -1,99 +1,79 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
from
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
Client for the Incident Triage Environment.
|
| 23 |
-
|
| 24 |
-
This client maintains a persistent WebSocket connection to the environment server,
|
| 25 |
-
enabling efficient multi-step interactions with lower latency.
|
| 26 |
-
Each client instance has its own dedicated environment session on the server.
|
| 27 |
-
|
| 28 |
-
Example:
|
| 29 |
-
>>> # Connect to a running server
|
| 30 |
-
>>> with IncidentTriageEnv(base_url="http://localhost:8000") as client:
|
| 31 |
-
... result = client.reset()
|
| 32 |
-
... print(result.observation.echoed_message)
|
| 33 |
-
...
|
| 34 |
-
... result = client.step(IncidentTriageAction(message="Hello!"))
|
| 35 |
-
... print(result.observation.echoed_message)
|
| 36 |
-
|
| 37 |
-
Example with Docker:
|
| 38 |
-
>>> # Automatically start container and connect
|
| 39 |
-
>>> client = IncidentTriageEnv.from_docker_image("Incident_Triage-env:latest")
|
| 40 |
-
>>> try:
|
| 41 |
-
... result = client.reset()
|
| 42 |
-
... result = client.step(IncidentTriageAction(message="Test"))
|
| 43 |
-
... finally:
|
| 44 |
-
... client.close()
|
| 45 |
-
"""
|
| 46 |
-
|
| 47 |
-
def _step_payload(self, action: IncidentTriageAction) -> Dict:
|
| 48 |
-
"""
|
| 49 |
-
Convert IncidentTriageAction to JSON payload for step message.
|
| 50 |
-
|
| 51 |
-
Args:
|
| 52 |
-
action: IncidentTriageAction instance
|
| 53 |
-
|
| 54 |
-
Returns:
|
| 55 |
-
Dictionary representation suitable for JSON encoding
|
| 56 |
-
"""
|
| 57 |
-
return {
|
| 58 |
-
"message": action.message,
|
| 59 |
-
}
|
| 60 |
-
|
| 61 |
-
def _parse_result(self, payload: Dict) -> StepResult[IncidentTriageObservation]:
|
| 62 |
-
"""
|
| 63 |
-
Parse server response into StepResult[IncidentTriageObservation].
|
| 64 |
-
|
| 65 |
-
Args:
|
| 66 |
-
payload: JSON response data from server
|
| 67 |
-
|
| 68 |
-
Returns:
|
| 69 |
-
StepResult with IncidentTriageObservation
|
| 70 |
-
"""
|
| 71 |
-
obs_data = payload.get("observation", {})
|
| 72 |
-
observation = IncidentTriageObservation(
|
| 73 |
-
echoed_message=obs_data.get("echoed_message", ""),
|
| 74 |
-
message_length=obs_data.get("message_length", 0),
|
| 75 |
-
done=payload.get("done", False),
|
| 76 |
-
reward=payload.get("reward"),
|
| 77 |
-
metadata=obs_data.get("metadata", {}),
|
| 78 |
-
)
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
return StepResult(
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
)
|
| 85 |
|
| 86 |
-
def
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
Args:
|
| 91 |
-
payload: JSON response from state request
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
)
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Lightweight HTTP client for the current FastAPI incident triage server."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Any, Dict, Optional
|
| 6 |
+
|
| 7 |
+
import requests
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from .models import IncidentAction, IncidentState, StepResult
|
| 11 |
+
except ImportError:
|
| 12 |
+
from models import IncidentAction, IncidentState, StepResult
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class IncidentTriageClient:
    """Small helper for calling the local FastAPI endpoints from scripts or notebooks.

    All calls go through one ``requests.Session``.  The client is also a
    context manager; leaving the ``with`` block closes the session.
    """

    def __init__(self, base_url: str = "http://localhost:7860", timeout: float = 30.0):
        """Store connection settings and open the HTTP session.

        Args:
            base_url: Server root; trailing slashes are stripped so paths can
                be appended directly.
            timeout: Timeout in seconds passed to every request.
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.session = requests.Session()

    def __enter__(self) -> "IncidentTriageClient":
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()

    def tasks(self) -> Dict[str, Any]:
        """Return the decoded JSON body of ``GET /tasks``."""
        return self._request("GET", "/tasks")

    def grader_info(self) -> Dict[str, Any]:
        """Return the decoded JSON body of ``GET /grader``."""
        return self._request("GET", "/grader")

    def reset(
        self,
        task_type: Optional[str] = None,
        ticket_id: Optional[str] = None,
        seed: Optional[int] = None,
    ) -> StepResult:
        """Start a new episode via ``POST /reset``.

        Args:
            task_type: Optional task filter, sent as-is in the JSON body.
            ticket_id: Optional ticket to load, sent as-is in the JSON body.
            seed: Optional seed, sent as-is in the JSON body.

        Returns:
            The server response parsed into a ``StepResult``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        return StepResult(
            **self._request(
                "POST",
                "/reset",
                json={
                    "task_type": task_type,
                    "ticket_id": ticket_id,
                    "seed": seed,
                },
            )
        )

    def state(self, session_id: str) -> IncidentState:
        """Fetch the state for ``session_id`` via ``GET /state``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        return IncidentState(
            **self._request("GET", "/state", params={"session_id": session_id})
        )

    def step(self, session_id: str, action: IncidentAction | Dict[str, Any]) -> StepResult:
        """Submit one action for ``session_id`` via ``POST /step``.

        Args:
            session_id: Session identifier, sent as a query parameter.
            action: Either a ready-made JSON dict or a model instance.

        Returns:
            The server response parsed into a ``StepResult``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # Branch on ``dict`` rather than on the model class: ``IncidentAction``
        # may be imported through either branch of the module's dual import,
        # and an instance created via the other path would fail ``isinstance``.
        # ``mode="json"`` makes the dump JSON-serialisable (enum members become
        # their values) before ``requests`` encodes it.
        if isinstance(action, dict):
            payload = action
        else:
            payload = action.model_dump(mode="json")

        result = self._request(
            "POST",
            "/step",
            params={"session_id": session_id},
            json=payload,
        )
        return StepResult(**result)

    def _request(self, method: str, path: str, **kwargs: Any) -> Dict[str, Any]:
        """Send one HTTP request and return the decoded JSON body.

        Args:
            method: HTTP verb.
            path: Path appended to ``base_url`` (expected to start with ``/``).
            **kwargs: Forwarded unchanged to ``Session.request``.

        Raises:
            requests.HTTPError: If the response carries an error status.
        """
        response = self.session.request(
            method=method,
            url=f"{self.base_url}{path}",
            timeout=self.timeout,
            **kwargs,
        )
        response.raise_for_status()
        return response.json()
|
environment.py
CHANGED
|
@@ -1,62 +1,210 @@
|
|
| 1 |
-
#----- Edited file--------------
|
| 2 |
-
# environment.py
|
| 3 |
-
|
| 4 |
import random
|
| 5 |
-
|
|
|
|
| 6 |
from incidents import TICKETS
|
| 7 |
from graders import GRADERS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
|
| 10 |
class IncidentEnv:
|
| 11 |
|
| 12 |
def __init__(self):
|
| 13 |
self.current_ticket = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
def reset(
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
self.current_ticket =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
return
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
)
|
| 30 |
|
| 31 |
def step(self, action: IncidentAction) -> StepResult:
|
| 32 |
if self.current_ticket is None:
|
| 33 |
raise RuntimeError("Call reset() before step()")
|
|
|
|
|
|
|
| 34 |
|
| 35 |
if action.incident_id != self.current_ticket["incident_id"]:
|
| 36 |
raise ValueError(
|
| 37 |
f"Action incident_id '{action.incident_id}' does not match "
|
| 38 |
f"current ticket '{self.current_ticket['incident_id']}'"
|
| 39 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
task_type = self.current_ticket["task_type"]
|
| 42 |
ground_truth = self.current_ticket["ground_truth"]
|
| 43 |
grader_fn = GRADERS[task_type]
|
| 44 |
-
|
| 45 |
|
| 46 |
-
agent_answer = (
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
action.action.value if task_type == "task3" and action.action else
|
| 50 |
-
"NONE"
|
| 51 |
-
)
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
return StepResult(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
incident_id=self.current_ticket["incident_id"],
|
| 57 |
-
task_type=task_type,
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import random
|
| 2 |
+
import uuid
|
| 3 |
+
|
| 4 |
from incidents import TICKETS
|
| 5 |
from graders import GRADERS
|
| 6 |
+
from models import (
|
| 7 |
+
IncidentAction,
|
| 8 |
+
IncidentObservation,
|
| 9 |
+
IncidentReward,
|
| 10 |
+
IncidentState,
|
| 11 |
+
RecommendedAction,
|
| 12 |
+
RootCauseCategory,
|
| 13 |
+
SeverityLevel,
|
| 14 |
+
StepResult,
|
| 15 |
+
TaskType,
|
| 16 |
+
)
|
| 17 |
+
|
| 18 |
+
def _build_task_spec(name, difficulty, expected_field, choices, description):
    """Assemble one task descriptor; ``choices`` is the enum whose member values are allowed."""
    return {
        "name": name,
        "difficulty": difficulty,
        "expected_field": expected_field,
        "allowed_values": [member.value for member in choices],
        "description": description,
    }


# Per-task metadata: display name, difficulty, the action field the task
# expects, the values that field may take, and a short description.
TASK_SPECS = {
    TaskType.TASK1: _build_task_spec(
        "Severity Classification",
        "easy",
        "severity",
        SeverityLevel,
        "Classify the severity of the incident using blast radius, user impact, and business risk.",
    ),
    TaskType.TASK2: _build_task_spec(
        "Root Cause Classification",
        "medium",
        "root_cause",
        RootCauseCategory,
        "Identify the most likely failure domain from the incident evidence.",
    ),
    TaskType.TASK3: _build_task_spec(
        "Recommended Action",
        "hard",
        "action",
        RecommendedAction,
        "Choose the best immediate operational response for stabilizing the incident.",
    ),
}

# Lookup of tickets keyed by their incident id.
TICKETS_BY_ID = {entry["incident_id"]: entry for entry in TICKETS}
|
| 42 |
|
| 43 |
|
| 44 |
class IncidentEnv:
    """Runs one incident-triage episode at a time.

    ``reset()`` picks a ticket and starts a fresh episode, ``step()`` grades a
    submitted action against that ticket's ground truth, and ``state()``
    reports the episode bookkeeping.  An episode is finished once
    ``step_count`` reaches ``max_steps`` (initialised to 1).
    """

    def __init__(self):
        self.current_ticket = None  # ticket dict of the active episode, if any
        self.episode_id = ""
        self.step_count = 0
        self.max_steps = 1
        self.total_reward = 0.0
        self.done = False
        self.last_reward = 0.0
        self.last_action_summary = None

    def reset(
        self,
        task_type: TaskType | str | None = None,
        ticket_id: str | None = None,
        seed: int | None = None,
    ) -> StepResult:
        """Start a new episode and return its initial observation.

        Args:
            task_type: Optional task filter; a string is converted with
                ``TaskType(...)``.
            ticket_id: Optional id of a specific ticket to load.
            seed: Optional seed for a reproducible random ticket choice.

        Returns:
            A ``StepResult`` with zero reward, ``done=False`` and episode
            metadata in ``info``.

        Raises:
            ValueError: If ``task_type`` is not a valid ``TaskType`` value or
                no ticket satisfies the request.
        """
        normalized_task = TaskType(task_type) if task_type else None
        self.current_ticket = self._select_ticket(normalized_task, ticket_id, seed)
        self.episode_id = str(uuid.uuid4())
        self.step_count = 0
        self.total_reward = 0.0
        self.done = False
        self.last_reward = 0.0
        self.last_action_summary = None

        spec = self._task_spec()
        return StepResult(
            observation=self._build_observation(),
            reward=IncidentReward(value=0.0, reason="Episode initialized."),
            done=False,
            info={
                "episode_id": self.episode_id,
                "task_name": spec["name"],
                "difficulty": spec["difficulty"],
                "max_steps": self.max_steps,
            },
        )

    def step(self, action: IncidentAction) -> StepResult:
        """Grade ``action`` against the current ticket and advance the episode.

        Raises:
            RuntimeError: If no episode is active or it has already completed.
            ValueError: If the action targets another incident or task type,
                or does not populate exactly the expected field.
        """
        if self.current_ticket is None:
            raise RuntimeError("Call reset() before step()")
        if self.done:
            raise RuntimeError("Episode already completed. Call reset() to start a new one.")

        if action.incident_id != self.current_ticket["incident_id"]:
            raise ValueError(
                f"Action incident_id '{action.incident_id}' does not match "
                f"current ticket '{self.current_ticket['incident_id']}'"
            )
        if action.task_type != TaskType(self.current_ticket["task_type"]):
            raise ValueError(
                f"Action task_type '{action.task_type.value}' does not match "
                f"current ticket task_type '{self.current_ticket['task_type']}'"
            )

        self._validate_action(action)

        task_type = self.current_ticket["task_type"]
        ground_truth = self.current_ticket["ground_truth"]
        grader_fn = GRADERS[task_type]
        reward_value, reward_reason = grader_fn(action, ground_truth)

        agent_answer = action.selected_value() or "NONE"
        selected_field = action.selected_field() or "NONE"

        spec = self._task_spec()
        expected_field = spec["expected_field"]
        # Report the ground truth for the field this task grades.  Taking the
        # first dict value is only correct for single-key ground truths, so it
        # is kept purely as a fallback.
        if expected_field in ground_truth:
            ground_truth_value = ground_truth[expected_field]
        else:
            ground_truth_value = list(ground_truth.values())[0]

        self.step_count += 1
        self.last_reward = reward_value
        self.total_reward += reward_value
        self.done = self.step_count >= self.max_steps
        self.last_action_summary = f"Submitted {selected_field}={agent_answer}"

        return StepResult(
            observation=self._build_observation(),
            reward=IncidentReward(value=reward_value, reason=reward_reason),
            done=self.done,
            info={
                "episode_id": self.episode_id,
                "task_name": spec["name"],
                "difficulty": spec["difficulty"],
                "correct": reward_value == 1.0,
                "ground_truth": ground_truth_value,
                "agent_answer": agent_answer,
                "selected_field": selected_field,
                "max_steps": self.max_steps,
            },
        )

    def state(self, session_id: str | None = None) -> IncidentState:
        """Return a snapshot of the current episode.

        Args:
            session_id: Copied into the returned state unchanged.

        Raises:
            RuntimeError: If ``reset()`` has not been called yet.
        """
        if self.current_ticket is None:
            raise RuntimeError("No active episode. Call reset() first.")

        return IncidentState(
            episode_id=self.episode_id,
            session_id=session_id,
            step_count=self.step_count,
            max_steps=self.max_steps,
            total_reward=self.total_reward,
            done=self.done,
            incident_id=self.current_ticket["incident_id"],
            task_type=TaskType(self.current_ticket["task_type"]),
            difficulty=self._task_spec()["difficulty"],
            status="completed" if self.done else "awaiting_action",
            last_reward=self.last_reward,
        )

    def _select_ticket(
        self,
        task_type: TaskType | None = None,
        ticket_id: str | None = None,
        seed: int | None = None,
    ) -> dict:
        """Pick the ticket for a new episode.

        An explicit ``ticket_id`` wins; otherwise a ticket is drawn at random
        from all tickets, or from those matching ``task_type`` when given.

        Raises:
            ValueError: If the id is unknown, the id conflicts with
                ``task_type``, or no ticket exists for ``task_type``.
        """
        if ticket_id:
            ticket = TICKETS_BY_ID.get(ticket_id)
            if ticket is None:
                raise ValueError(f"No ticket found for ticket_id: {ticket_id}")
            if task_type and ticket["task_type"] != task_type.value:
                raise ValueError(
                    f"Ticket '{ticket_id}' belongs to task_type '{ticket['task_type']}', "
                    f"not '{task_type.value}'"
                )
            return ticket

        pool = TICKETS
        if task_type:
            pool = [ticket for ticket in TICKETS if ticket["task_type"] == task_type.value]
        if not pool:
            raise ValueError(f"No tickets found for task_type: {task_type}")

        # A private generator keeps seeded draws reproducible without touching
        # the module-level random state.
        chooser = random.Random(seed) if seed is not None else random
        return chooser.choice(pool)

    def _task_spec(self) -> dict:
        """Return the ``TASK_SPECS`` entry for the current ticket's task type.

        Raises:
            RuntimeError: If ``reset()`` has not been called yet.
        """
        if self.current_ticket is None:
            raise RuntimeError("No active episode. Call reset() first.")
        return TASK_SPECS[TaskType(self.current_ticket["task_type"])]

    def _build_observation(self) -> IncidentObservation:
        """Build the observation for the current ticket and episode counters."""
        spec = self._task_spec()
        return IncidentObservation(
            incident_id=self.current_ticket["incident_id"],
            task_type=TaskType(self.current_ticket["task_type"]),
            difficulty=spec["difficulty"],
            task_description=spec["description"],
            alert_text=self.current_ticket["alert_text"],
            context=self.current_ticket["context"],
            expected_field=spec["expected_field"],
            allowed_values=spec["allowed_values"],
            step_count=self.step_count,
            max_steps=self.max_steps,
            last_action_summary=self.last_action_summary,
            last_reward=self.last_reward,
            episode_status="completed" if self.done else "awaiting_action",
        )

    def _validate_action(self, action: IncidentAction) -> None:
        """Check that ``action`` fills exactly the field the current task expects.

        Raises:
            ValueError: If zero or several answer fields are populated, or the
                populated field is not the expected one.
        """
        populated = action.populated_fields()
        if len(populated) != 1:
            raise ValueError("Action must populate exactly one of severity, root_cause, or action.")

        expected_field = self._task_spec()["expected_field"]
        if expected_field not in populated:
            raise ValueError(
                f"Task '{self.current_ticket['task_type']}' expects field '{expected_field}', "
                f"but got '{next(iter(populated))}'."
            )
|
graders.py
CHANGED
|
@@ -1,33 +1,71 @@
|
|
| 1 |
-
#----- Edited file--------------
|
| 2 |
-
# graders.py
|
| 3 |
-
|
| 4 |
from models import IncidentAction
|
| 5 |
|
| 6 |
_SEV_ORDER = {"SEV1": 0, "SEV2": 1, "SEV3": 2}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
def grade_task1(action: IncidentAction, ground_truth: dict) -> float:
|
| 9 |
if action.severity is None:
|
| 10 |
-
return 0.0
|
| 11 |
predicted = _SEV_ORDER.get(action.severity.value, -1)
|
| 12 |
-
expected
|
| 13 |
-
distance
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
-
def grade_task2(action: IncidentAction, ground_truth: dict) -> float:
|
| 18 |
if action.root_cause is None:
|
| 19 |
-
return 0.0
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
|
|
|
| 24 |
if action.action is None:
|
| 25 |
-
return 0.0
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
GRADERS = {
|
| 30 |
"task1": grade_task1,
|
| 31 |
"task2": grade_task2,
|
| 32 |
"task3": grade_task3,
|
| 33 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from models import IncidentAction
|
| 2 |
|
| 3 |
# Ordinal position of each severity label; distance between positions drives
# partial credit in grade_task1.
_SEV_ORDER = {"SEV1": 0, "SEV2": 1, "SEV3": 2}

# Unordered pairs of root-cause categories that earn near-miss credit in
# grade_task2.
_TASK2_RELATED_GROUPS = [
    {"DATABASE", "APPLICATION"},
    {"NETWORK", "INFRASTRUCTURE"},
    {"NETWORK", "THIRD_PARTY"},
    {"INFRASTRUCTURE", "THIRD_PARTY"},
]

# (predicted, expected) action pairs that earn partial credit in grade_task3.
# The mapping is directional: only the listed orderings score.
_TASK3_PARTIAL = {
    ("RESTART_SERVICE", "FAILOVER"): 0.25,
    ("FAILOVER", "RESTART_SERVICE"): 0.25,
    ("NOTIFY_VENDOR", "INVESTIGATE"): 0.25,
    ("SCALE_UP", "INVESTIGATE"): 0.25,
    ("RESTART_SERVICE", "INVESTIGATE"): 0.25,
}


def grade_task1(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Score a severity classification.

    Args:
        action: Submitted action; ``action.severity.value`` is the prediction.
        ground_truth: Mapping holding the expected label under ``"severity"``.

    Returns:
        ``(score, reason)``: 1.0 for an exact match, 0.5 for an adjacent
        severity level, otherwise 0.0.  A label that is not in ``_SEV_ORDER``
        (on either side) scores 0.0.

    Raises:
        KeyError: If ``ground_truth`` has no ``"severity"`` entry.
    """
    if action.severity is None:
        return 0.0, "Missing severity classification."

    predicted = _SEV_ORDER.get(action.severity.value)
    expected = _SEV_ORDER.get(ground_truth["severity"])
    if predicted is None or expected is None:
        # Unknown labels used to be mapped to -1, which produced a KeyError at
        # distance 3 and unearned partial credit at distance 1.
        return 0.0, "Severity label is not a recognized level; no credit awarded."

    distance = abs(predicted - expected)
    if distance == 0:
        return 1.0, "Exact severity match."
    if distance == 1:
        return 0.5, "Adjacent severity band: partial credit for a close escalation call."
    return 0.0, "Severity choice is too far from the ground truth."


def grade_task2(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Score a root-cause classification.

    Returns:
        ``(score, reason)``: 1.0 for an exact match, 0.25 for an ``UNKNOWN``
        answer, 0.5 when prediction and expectation form one of the related
        pairs, otherwise 0.0.

    Raises:
        KeyError: If ``ground_truth`` has no ``"root_cause"`` entry.
    """
    if action.root_cause is None:
        return 0.0, "Missing root-cause classification."

    predicted = action.root_cause.value
    expected = ground_truth["root_cause"]

    if predicted == expected:
        return 1.0, "Exact root-cause match."
    if predicted == "UNKNOWN":
        return 0.25, "Conservative fallback: uncertainty recognized, but the failure domain was not isolated."
    if any({predicted, expected} == group for group in _TASK2_RELATED_GROUPS):
        return 0.5, "Related failure domain selected: partial credit for a near-miss diagnosis."
    return 0.0, "Root-cause classification does not match the expected failure domain."


def grade_task3(action: IncidentAction, ground_truth: dict) -> tuple[float, str]:
    """Score a recommended remediation action.

    Returns:
        ``(score, reason)``: 1.0 for an exact match; 0.4 for ``INVESTIGATE``
        unless ``NO_ACTION`` was expected; 0.25 for ``NO_ACTION`` when
        ``INVESTIGATE`` was expected; the ``_TASK3_PARTIAL`` value for listed
        pairs; otherwise 0.0.

    Raises:
        KeyError: If ``ground_truth`` has no ``"action"`` entry.
    """
    if action.action is None:
        return 0.0, "Missing remediation recommendation."

    predicted = action.action.value
    expected = ground_truth["action"]

    if predicted == expected:
        return 1.0, "Exact remediation match."
    if predicted == "INVESTIGATE" and expected != "NO_ACTION":
        return 0.4, "Safe investigative fallback: the incident was recognized, but the optimal action was not taken."
    if predicted == "NO_ACTION" and expected == "INVESTIGATE":
        return 0.25, "Conservative response, but deeper investigation was expected."
    if (predicted, expected) in _TASK3_PARTIAL:
        return _TASK3_PARTIAL[(predicted, expected)], "Related remediation selected: partial credit for a close operational response."
    return 0.0, "Recommended action does not match the expected operator response."


# Grader lookup keyed by the ticket's task_type string.
GRADERS = {
    "task1": grade_task1,
    "task2": grade_task2,
    "task3": grade_task3,
}
|
incidents.py
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
|
| 4 |
TICKETS = [
|
| 5 |
|
| 6 |
-
#
|
| 7 |
|
| 8 |
{
|
| 9 |
"incident_id": "INC-001",
|
|
@@ -72,7 +72,7 @@ TICKETS = [
|
|
| 72 |
"ground_truth": {"severity": "SEV2"}
|
| 73 |
},
|
| 74 |
|
| 75 |
-
#
|
| 76 |
|
| 77 |
{
|
| 78 |
"incident_id": "INC-006",
|
|
@@ -142,7 +142,7 @@ TICKETS = [
|
|
| 142 |
"ground_truth": {"root_cause": "INFRASTRUCTURE"}
|
| 143 |
},
|
| 144 |
|
| 145 |
-
#
|
| 146 |
|
| 147 |
{
|
| 148 |
"incident_id": "INC-011",
|
|
@@ -226,7 +226,7 @@ TICKETS = [
|
|
| 226 |
"ground_truth": {"severity": "SEV1"}
|
| 227 |
},
|
| 228 |
|
| 229 |
-
#
|
| 230 |
|
| 231 |
{
|
| 232 |
"incident_id": "INC-017",
|
|
@@ -263,7 +263,7 @@ TICKETS = [
|
|
| 263 |
"ground_truth": {"severity": "SEV3"}
|
| 264 |
},
|
| 265 |
|
| 266 |
-
#
|
| 267 |
|
| 268 |
{
|
| 269 |
"incident_id": "INC-020",
|
|
@@ -310,7 +310,7 @@ TICKETS = [
|
|
| 310 |
"ground_truth": {"root_cause": "INFRASTRUCTURE"}
|
| 311 |
},
|
| 312 |
|
| 313 |
-
#
|
| 314 |
|
| 315 |
{
|
| 316 |
"incident_id": "INC-024",
|
|
@@ -368,7 +368,7 @@ TICKETS = [
|
|
| 368 |
"ground_truth": {"action": "FAILOVER"}
|
| 369 |
},
|
| 370 |
|
| 371 |
-
#
|
| 372 |
|
| 373 |
{
|
| 374 |
"incident_id": "INC-029",
|
|
@@ -458,4 +458,4 @@ TICKETS = [
|
|
| 458 |
"ground_truth": {"action": "INVESTIGATE"}
|
| 459 |
}
|
| 460 |
|
| 461 |
-
]
|
|
|
|
| 3 |
|
| 4 |
TICKETS = [
|
| 5 |
|
| 6 |
+
# TASK 1: Severity Classification
|
| 7 |
|
| 8 |
{
|
| 9 |
"incident_id": "INC-001",
|
|
|
|
| 72 |
"ground_truth": {"severity": "SEV2"}
|
| 73 |
},
|
| 74 |
|
| 75 |
+
# TASK 2: Root Cause Classification
|
| 76 |
|
| 77 |
{
|
| 78 |
"incident_id": "INC-006",
|
|
|
|
| 142 |
"ground_truth": {"root_cause": "INFRASTRUCTURE"}
|
| 143 |
},
|
| 144 |
|
| 145 |
+
# TASK 3: Recommended Action
|
| 146 |
|
| 147 |
{
|
| 148 |
"incident_id": "INC-011",
|
|
|
|
| 226 |
"ground_truth": {"severity": "SEV1"}
|
| 227 |
},
|
| 228 |
|
| 229 |
+
# TASK 1: Severity (Ambiguous + Edge)
|
| 230 |
|
| 231 |
{
|
| 232 |
"incident_id": "INC-017",
|
|
|
|
| 263 |
"ground_truth": {"severity": "SEV3"}
|
| 264 |
},
|
| 265 |
|
| 266 |
+
# TASK 2: Root Cause (Confusing Signals)
|
| 267 |
|
| 268 |
{
|
| 269 |
"incident_id": "INC-020",
|
|
|
|
| 310 |
"ground_truth": {"root_cause": "INFRASTRUCTURE"}
|
| 311 |
},
|
| 312 |
|
| 313 |
+
# TASK 3: Action (Ambiguous Decisions)
|
| 314 |
|
| 315 |
{
|
| 316 |
"incident_id": "INC-024",
|
|
|
|
| 368 |
"ground_truth": {"action": "FAILOVER"}
|
| 369 |
},
|
| 370 |
|
| 371 |
+
# HARD CASES (REAL THINKING)
|
| 372 |
|
| 373 |
{
|
| 374 |
"incident_id": "INC-029",
|
|
|
|
| 458 |
"ground_truth": {"action": "INVESTIGATE"}
|
| 459 |
}
|
| 460 |
|
| 461 |
+
]
|
inference.py
CHANGED
|
@@ -1,194 +1,408 @@
|
|
| 1 |
-
# inference.py
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
import json
|
|
|
|
| 5 |
import re
|
|
|
|
|
|
|
|
|
|
| 6 |
import requests
|
|
|
|
| 7 |
from openai import OpenAI
|
|
|
|
| 8 |
from incidents import TICKETS
|
| 9 |
-
from dotenv import load_dotenv
|
| 10 |
|
| 11 |
load_dotenv()
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
| 17 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
SYSTEM_PROMPT = """You are an expert SRE
|
| 20 |
-
You will receive an incident alert
|
| 21 |
-
|
|
|
|
| 22 |
|
| 23 |
Rules:
|
| 24 |
-
-
|
| 25 |
-
-
|
| 26 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
Response format (return this exact structure):
|
| 29 |
-
{"incident_id": "<incident_id>", "task_type": "<task_type>", "severity": "<value or null>", "root_cause": "<value or null>", "action": "<value or null>"}
|
| 30 |
|
| 31 |
-
|
| 32 |
-
""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
def build_user_prompt(observation: dict) -> str:
|
| 36 |
-
return f"""Incident ID: {observation['incident_id']}
|
| 37 |
-
Task Type: {observation['task_type']}
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
|
|
|
| 41 |
|
| 42 |
-
|
| 43 |
-
{json.dumps(observation['context'], indent=2)}
|
| 44 |
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
match = re.search(r"\{.*\}", raw, re.DOTALL)
|
| 51 |
if not match:
|
| 52 |
-
raise ValueError("No JSON found in response")
|
| 53 |
-
|
| 54 |
return json.loads(match.group(0))
|
| 55 |
|
| 56 |
-
|
|
|
|
|
|
|
| 57 |
return {
|
| 58 |
-
"incident_id":
|
| 59 |
"task_type": task_type,
|
| 60 |
-
"severity":
|
| 61 |
-
"root_cause":
|
| 62 |
-
"action":
|
| 63 |
}
|
| 64 |
|
| 65 |
|
| 66 |
-
def
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
try:
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
-
for chunk in completion:
|
| 83 |
-
if chunk.choices and chunk.choices[0].delta.content is not None:
|
| 84 |
-
full_response += chunk.choices[0].delta.content
|
| 85 |
-
except Exception as e:
|
| 86 |
-
print(f"Error calling LLM: {e}")
|
| 87 |
-
return ""
|
| 88 |
-
|
| 89 |
-
return full_response.strip()
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
def run_episode(task_type: str = None) -> dict:
|
| 93 |
-
# Step 1 — Reset environment
|
| 94 |
-
params = {"task_type": task_type} if task_type else {}
|
| 95 |
-
reset_response = requests.post(f"{BASE_URL}/reset", params=params)
|
| 96 |
-
reset_response.raise_for_status()
|
| 97 |
-
|
| 98 |
-
reset_data = reset_response.json()
|
| 99 |
-
session_id = reset_data["session_id"]
|
| 100 |
-
observation = reset_data
|
| 101 |
-
|
| 102 |
-
print(f"\n{'='*60}")
|
| 103 |
-
print(f"Incident : {observation['incident_id']}")
|
| 104 |
-
print(f"Task : {observation['task_type']}")
|
| 105 |
-
print(f"Alert : {observation['alert_text'][:80]}...")
|
| 106 |
-
|
| 107 |
-
# Step 2 — LLM with retry
|
| 108 |
-
action = None
|
| 109 |
-
raw = ""
|
| 110 |
-
|
| 111 |
-
for attempt in range(3):
|
| 112 |
-
raw = call_llm(observation)
|
| 113 |
-
print(f"LLM Raw (attempt {attempt+1}): {raw}")
|
| 114 |
|
| 115 |
-
try:
|
| 116 |
-
parsed = extract_json(raw)
|
| 117 |
-
action = normalize_action(parsed, observation["task_type"])
|
| 118 |
-
break
|
| 119 |
-
except Exception as e:
|
| 120 |
-
print(f"Parse failed: {e}")
|
| 121 |
-
|
| 122 |
-
if not action:
|
| 123 |
-
return {"error": "invalid_json", "raw": raw}
|
| 124 |
-
|
| 125 |
-
# Step 3 — Validate schema
|
| 126 |
-
required_keys = {"incident_id", "task_type", "severity", "root_cause", "action"}
|
| 127 |
-
if not required_keys.issubset(action.keys()):
|
| 128 |
-
print("Invalid schema from LLM")
|
| 129 |
-
return {"error": "invalid_schema", "raw": raw}
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
# Step 4 — Submit to /step
|
| 133 |
-
step_response = requests.post(f"{BASE_URL}/step", json=action, params={"session_id": session_id})
|
| 134 |
-
step_response.raise_for_status()
|
| 135 |
-
result = step_response.json()
|
| 136 |
-
# This need to be kept for submission grading, so we print it in a structured way
|
| 137 |
-
print(f"[STEP] task_id={result['task_type']} action={result['agent_answer']} reward={result['reward']}")
|
| 138 |
-
|
| 139 |
-
print(f"Answer : {result['agent_answer']}")
|
| 140 |
-
print(f"Expected : {result['ground_truth']}")
|
| 141 |
-
print(f"Correct : {result['correct']} | Reward: {result['reward']}")
|
| 142 |
-
|
| 143 |
-
# 🔥 Logging
|
| 144 |
-
with open("logs.jsonl", "a") as f:
|
| 145 |
-
f.write(json.dumps({
|
| 146 |
-
"observation": observation,
|
| 147 |
-
"response": action,
|
| 148 |
-
"result": result
|
| 149 |
-
}) + "\n")
|
| 150 |
-
|
| 151 |
-
return result
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
def run_full_eval():
|
| 155 |
-
print("[START]")
|
| 156 |
-
task_types = ["task1", "task2", "task3"]
|
| 157 |
-
|
| 158 |
-
rounds = len(TICKETS) # 🔥 FIXED
|
| 159 |
-
scores = []
|
| 160 |
-
errors = 0
|
| 161 |
-
|
| 162 |
-
task_scores = {
|
| 163 |
-
"task1": [],
|
| 164 |
-
"task2": [],
|
| 165 |
-
"task3": []
|
| 166 |
-
}
|
| 167 |
-
|
| 168 |
-
for i in range(rounds):
|
| 169 |
-
task = task_types[i % 3]
|
| 170 |
-
result = run_episode(task_type=task)
|
| 171 |
-
|
| 172 |
-
if "reward" in result:
|
| 173 |
-
scores.append(result["reward"])
|
| 174 |
-
task_scores[task].append(result["reward"])
|
| 175 |
-
else:
|
| 176 |
-
errors += 1
|
| 177 |
-
|
| 178 |
-
print(f"\n{'='*60}")
|
| 179 |
-
print(f"Total Episodes : {rounds}")
|
| 180 |
-
print(f"Graded : {len(scores)}")
|
| 181 |
-
print(f"JSON Errors : {errors}")
|
| 182 |
-
if scores:
|
| 183 |
-
print(f"Total Reward : {sum(scores)}")
|
| 184 |
-
print(f"Average Reward : {sum(scores)/len(scores):.2f}")
|
| 185 |
-
print(f"Overall Accuracy : {sum(scores)/len(scores)*100:.2f}%")
|
| 186 |
-
|
| 187 |
-
for task in task_scores:
|
| 188 |
-
if task_scores[task]:
|
| 189 |
-
acc = sum(task_scores[task]) / len(task_scores[task]) * 100
|
| 190 |
-
print(f"{task} Accuracy : {acc:.2f}%")
|
| 191 |
-
print("[END]")
|
| 192 |
-
|
| 193 |
if __name__ == "__main__":
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
+
import os
|
| 3 |
import re
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Any, Dict, List, Optional
|
| 6 |
+
|
| 7 |
import requests
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
from openai import OpenAI
|
| 10 |
+
|
| 11 |
from incidents import TICKETS
|
|
|
|
| 12 |
|
| 13 |
load_dotenv()
|
| 14 |
|
| 15 |
+
# Model endpoint settings; an unset or empty variable falls back to the default.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
# First non-empty credential wins: HF_TOKEN, then API_KEY, then OPENAI_API_KEY.
API_KEY = next(
    (
        token
        for token in (
            os.getenv(variable)
            for variable in ("HF_TOKEN", "API_KEY", "OPENAI_API_KEY")
        )
        if token
    ),
    "",
)
# Base URL of the environment server probed before falling back to in-process.
ENV_URL = os.getenv("ENV_URL") or "http://localhost:7860"
BENCHMARK = "incident-triage-env"
# Sampling settings passed to every chat-completion request.
MAX_TOKENS = 300
TEMPERATURE = 0.0
# Where the aggregated baseline summary is written.
OUTPUT_PATH = Path("outputs/baseline_scores.json")
|
| 28 |
|
| 29 |
+
SYSTEM_PROMPT = """You are an expert SRE triaging production incidents.
|
| 30 |
+
You will receive an incident alert, structured context, and the expected output field.
|
| 31 |
+
Return ONLY a valid JSON object with this exact shape:
|
| 32 |
+
{"incident_id":"<id>","task_type":"<task_type>","severity":null,"root_cause":null,"action":null}
|
| 33 |
|
| 34 |
Rules:
|
| 35 |
+
- Populate exactly one of severity, root_cause, or action based on task_type.
|
| 36 |
+
- Allowed severity values: SEV1, SEV2, SEV3
|
| 37 |
+
- Allowed root_cause values: DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN
|
| 38 |
+
- Allowed action values: ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION
|
| 39 |
+
- Keep incident_id and task_type identical to the observation.
|
| 40 |
+
- Do not return markdown, prose, or any extra keys.
|
| 41 |
+
"""
|
| 42 |
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` line that opens an episode's log record."""
    header = f"[START] task={task} env={env} model={model}"
    print(header, flush=True)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
    """Print one single-line ``[STEP]`` record.

    Args:
        step: 1-based step index within the episode.
        action: Short action summary; line breaks are flattened and the text
            is truncated to 100 characters.
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode finished; printed as ``true``/``false``.
        error: Error description, or ``None``/empty to print ``null``.
    """

    def _one_line(text: str) -> str:
        # Same flattening for both fields so the record never spans lines.
        return text.replace("\n", " ").replace("\r", "")

    # Exception messages (e.g. HTTP error bodies) are often multi-line; left
    # as-is they would split the record and break line-based log parsing.
    error_val = _one_line(error) if error else "null"
    done_val = str(done).lower()
    action_clean = _one_line(action)[:100]
    print(
        f"[STEP] step={step} action={action_clean} reward={reward:.2f} done={done_val} error={error_val}",
        flush=True,
    )
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` line that closes an episode's log record.

    ``rewards`` is rendered as a comma-separated list of two-decimal values
    (an empty list yields an empty field).
    """
    formatted_rewards = [f"{value:.2f}" for value in rewards]
    success_flag = str(success).lower()
    summary = (
        f"[END] success={success_flag} steps={steps} "
        f"score={score:.2f} rewards={','.join(formatted_rewards)}"
    )
    print(summary, flush=True)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class EnvironmentTransport:
    """Interface for talking to the incident environment.

    Subclasses implement ``reset`` and ``step``; ``close`` is a no-op unless
    a subclass holds something that needs releasing.
    """

    def reset(self, task_type: str, ticket_id: str) -> Dict[str, Any]:
        """Start an episode for ``ticket_id``; must be overridden."""
        raise NotImplementedError

    def step(self, session_id: str, action: Dict[str, Any]) -> Dict[str, Any]:
        """Submit ``action`` for the given session; must be overridden."""
        raise NotImplementedError

    def close(self) -> None:
        """Release transport resources; the base transport has none."""
        return
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
class HttpEnvironmentTransport(EnvironmentTransport):
    """Transport that reaches the environment over HTTP with a shared session."""

    def __init__(self, base_url: str):
        # Trailing slashes are dropped so endpoint paths can be appended as-is.
        self.base_url = base_url.rstrip("/")
        self.session = requests.Session()

    def probe(self) -> bool:
        """Return True when ``GET /health`` answers with a success status."""
        health_url = f"{self.base_url}/health"
        try:
            reply = self.session.get(health_url, timeout=5)
        except requests.RequestException:
            return False
        return reply.ok

    def reset(self, task_type: str, ticket_id: str) -> Dict[str, Any]:
        """POST ``/reset`` for the given task and ticket; return the JSON body.

        Raises the ``requests`` HTTP error for a non-success status.
        """
        payload = {"task_type": task_type, "ticket_id": ticket_id}
        reply = self.session.post(
            f"{self.base_url}/reset",
            json=payload,
            timeout=30,
        )
        reply.raise_for_status()
        return reply.json()

    def step(self, session_id: str, action: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``/step`` with ``action``; the session id travels as a query parameter.

        Raises the ``requests`` HTTP error for a non-success status.
        """
        query = {"session_id": session_id}
        reply = self.session.post(
            f"{self.base_url}/step",
            params=query,
            json=action,
            timeout=30,
        )
        reply.raise_for_status()
        return reply.json()

    def close(self) -> None:
        """Close the underlying HTTP session."""
        self.session.close()
|
| 110 |
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
+
class LocalEnvironmentTransport(EnvironmentTransport):
    """Transport that drives the FastAPI app in-process through a test client."""

    def __init__(self):
        # Imported lazily so these modules are only loaded when the local
        # transport is actually constructed.
        from fastapi.testclient import TestClient

        import app as incident_app

        self.session = TestClient(incident_app.app)

    def reset(self, task_type: str, ticket_id: str) -> Dict[str, Any]:
        """POST ``/reset`` for the given task and ticket; return the JSON body."""
        payload = {"task_type": task_type, "ticket_id": ticket_id}
        reply = self.session.post("/reset", json=payload)
        reply.raise_for_status()
        return reply.json()

    def step(self, session_id: str, action: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``/step`` with ``action``; the session id travels as a query parameter."""
        query = {"session_id": session_id}
        reply = self.session.post("/step", params=query, json=action)
        reply.raise_for_status()
        return reply.json()

    def close(self) -> None:
        """Close the in-process test client."""
        self.session.close()
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def build_transport() -> EnvironmentTransport:
    """Return an HTTP transport if ``ENV_URL`` is healthy, else an in-process one."""
    remote = HttpEnvironmentTransport(ENV_URL)
    if not remote.probe():
        # Server unreachable: drop the HTTP session and run the app locally.
        remote.close()
        return LocalEnvironmentTransport()
    return remote
|
| 147 |
|
| 148 |
|
| 149 |
+
def create_model_client() -> Optional[OpenAI]:
    """Build the OpenAI-compatible client, or return None when config is incomplete.

    All of ``API_BASE_URL``, ``API_KEY`` and ``MODEL_NAME`` must be non-empty.
    """
    if all((API_BASE_URL, API_KEY, MODEL_NAME)):
        return OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    return None
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def build_user_prompt(observation: Dict[str, Any]) -> str:
    """Render an observation as the plain-text user message sent to the model.

    The header fields come first, then the alert text, then the context as
    key-sorted, 2-space-indented JSON. The result ends with a newline.
    """
    prompt_lines = [
        f"Incident ID: {observation['incident_id']}",
        f"Task Type: {observation['task_type']}",
        f"Difficulty: {observation['difficulty']}",
        f"Task Description: {observation['task_description']}",
        f"Expected Field: {observation['expected_field']}",
        f"Allowed Values: {', '.join(observation['allowed_values'])}",
        "",
        "Alert:",
        f"{observation['alert_text']}",
        "",
        "Context:",
        json.dumps(observation["context"], indent=2, sort_keys=True),
        "",  # produces the trailing newline
    ]
    return "\n".join(prompt_lines)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def extract_json(raw: str) -> Dict[str, Any]:
    """Extract the first JSON object from a model response.

    The contents of a Markdown code fence (with or without a ``json`` tag)
    are searched first, then the whole response. Within each candidate every
    ``{`` is tried as the start of an object, so surrounding prose, a
    malformed earlier attempt, or trailing text with braces does not prevent
    a later valid object from being found.

    Args:
        raw: Raw text returned by the model.

    Returns:
        The first value that decodes to a JSON object (``dict``).

    Raises:
        ValueError: If no JSON object can be decoded from ``raw``.
    """
    candidates: List[str] = []
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", raw, re.DOTALL | re.IGNORECASE)
    if fenced:
        candidates.append(fenced.group(1))
    candidates.append(raw)

    decoder = json.JSONDecoder()
    for text in candidates:
        start = text.find("{")
        while start != -1:
            try:
                # raw_decode parses one value at `start` and ignores what follows.
                value, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                value = None
            if isinstance(value, dict):
                return value
            start = text.find("{", start + 1)

    raise ValueError("No JSON object found in model response.")
|
| 182 |
|
| 183 |
+
|
| 184 |
+
def normalize_action(raw_action: Dict[str, Any], observation: Dict[str, Any]) -> Dict[str, Any]:
    """Shape a raw prediction into the action payload for the observation's task.

    ``incident_id`` and ``task_type`` are always copied from the observation.
    Only the answer field belonging to that task type is taken from
    ``raw_action``; the other answer fields are set to ``None``.
    """
    task_type = observation["task_type"]
    normalized: Dict[str, Any] = {
        "incident_id": observation["incident_id"],
        "task_type": task_type,
    }
    answer_fields = (
        ("task1", "severity"),
        ("task2", "root_cause"),
        ("task3", "action"),
    )
    for owner, field_name in answer_fields:
        normalized[field_name] = raw_action.get(field_name) if task_type == owner else None
    return normalized
|
| 193 |
|
| 194 |
|
| 195 |
+
def _number(value: Any) -> Optional[float]:
|
| 196 |
+
if isinstance(value, (int, float)):
|
| 197 |
+
return float(value)
|
| 198 |
+
if isinstance(value, str):
|
| 199 |
+
match = re.search(r"(\d+(?:\.\d+)?)", value)
|
| 200 |
+
if match:
|
| 201 |
+
return float(match.group(1))
|
| 202 |
+
return None
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def predict_severity(alert_text: str, context: Dict[str, Any]) -> str:
    """Heuristically classify an incident as ``SEV1``, ``SEV2`` or ``SEV3``.

    ``alert_text`` is matched against upper-case markers; ``context`` is the
    raw context dict. SEV1 signals are checked first, then SEV3 signals;
    anything else is SEV2.
    """
    # First non-zero metric among the rate-like keys, in priority order.
    rate_readings = (
        _number(context.get(key))
        for key in ("error_rate_pct", "failure_rate", "affected_users_pct")
    )
    error_rate = next((reading for reading in rate_readings if reading), None)

    revenue_impact = (
        context.get("revenue_impact") is True
        or context.get("revenue_dependency") == "high"
    )

    critical_alert = any(
        marker in alert_text for marker in ("CRITICAL", "100%", "REVENUE IMPACT")
    )
    high_error_rate = error_rate is not None and error_rate >= 40
    if (
        critical_alert
        or context.get("region") == "global"
        or revenue_impact
        or high_error_rate
    ):
        return "SEV1"

    low_impact_alert = any(
        marker in alert_text
        for marker in ("INTERNAL ONLY", "COSMETIC", "NO USER-FACING IMPACT")
    )
    if low_impact_alert:
        return "SEV3"
    if context.get("user_impact") in {"cosmetic", False}:
        return "SEV3"
    if context.get("impact") == "cosmetic":
        return "SEV3"

    return "SEV2"
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def predict_root_cause(alert_text: str, context_text: str) -> str:
    """Pick a root-cause category by keyword match on the alert and context.

    Both arguments are expected in upper case. Categories are tried in the
    order listed below and the first with any matching keyword wins;
    ``UNKNOWN`` is returned when nothing matches.
    """
    rules = (
        ("THIRD_PARTY", ("STRIPE", "SENDGRID", "TWILIO", "VENDOR", "WEBHOOK", "EXTERNAL API")),
        ("NETWORK", ("PACKET LOSS", "BGP", "TRACEROUTE", "ROUTE", "CROSS-REGION", "TRANSIT HOP")),
        (
            "DATABASE",
            ("POSTGRES", "DB ", "DATABASE", "SLOW QUERY", "CONNECTION POOL", "REPLICA", "WRITE QUERIES", "DB_CPU"),
        ),
        (
            "INFRASTRUCTURE",
            ("KUBERNETES", "NODE", "POD", "CLUSTER", "NOTREADY", "MEMORY PRESSURE", "EC2", "SPOT INTERRUPTION"),
        ),
        (
            "APPLICATION",
            ("EXCEPTION", "STACK TRACE", "DEPLOY", "CRASH", "NULLPOINTER", "TIMEOUTEXCEPTION", "CODE"),
        ),
    )
    for category, keywords in rules:
        for keyword in keywords:
            if keyword in alert_text or keyword in context_text:
                return category
    return "UNKNOWN"
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def predict_action(alert_text: str, context_text: str) -> str:
    """Pick a remediation action by keyword match on the alert and context.

    Both arguments are expected in upper case. Actions are tried in the
    order listed below and the first with any matching keyword wins;
    ``INVESTIGATE`` is returned when nothing matches.
    """
    rules = (
        ("ROLLBACK", ("ROLLBACK", "IMMEDIATELY AFTER DEPLOY", "PREVIOUS_STABLE", "RECENT DEPLOY CAUSED")),
        ("SCALE_UP", ("CPU", "QUEUE", "AUTOSCALER", "MAX_REPLICAS", "TRAFFIC SPIKE", "FLASH SALE")),
        ("RESTART_SERVICE", ("DEADLOCK", "HEALTH CHECK", "STUCK", "NO RESPONSE", "PROCESS NOT RESPONDING")),
        ("FAILOVER", ("FAILOVER", "READ REPLICA", "PRIMARY DOWN", "PRIMARY RDS", "WRITES FAILING")),
        ("NOTIFY_VENDOR", ("SENDGRID", "STRIPE", "TWILIO", "VENDOR")),
        ("NO_ACTION", ("COSMETIC", "MINOR UI GLITCH")),
    )
    for remedy, keywords in rules:
        for keyword in keywords:
            if keyword in alert_text or keyword in context_text:
                return remedy
    return "INVESTIGATE"
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
def heuristic_action(observation: Dict[str, Any]) -> Dict[str, Any]:
    """Produce the deterministic, keyword-based action for an observation.

    The alert text and the JSON-serialised context are upper-cased for
    matching. Severity prediction receives the raw context dict instead of
    its text form. Any task type other than ``task1``/``task2`` is treated
    as an action recommendation.
    """
    task_type = observation["task_type"]
    alert_upper = observation["alert_text"].upper()
    context_upper = json.dumps(observation["context"]).upper()

    if task_type == "task1":
        prediction = {"severity": predict_severity(alert_upper, observation["context"])}
    elif task_type == "task2":
        prediction = {"root_cause": predict_root_cause(alert_upper, context_upper)}
    else:
        prediction = {"action": predict_action(alert_upper, context_upper)}
    return normalize_action(prediction, observation)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def get_action(model_client: Optional[OpenAI], observation: Dict[str, Any]) -> Dict[str, Any]:
    """Return the action for ``observation``, preferring the model over heuristics.

    With no model client the deterministic heuristic is used directly.
    Otherwise the model is queried up to two times; if every attempt fails
    (request error, empty or unparseable reply) the heuristic is the fallback.

    Each failed attempt is reported on stderr, so a misconfigured endpoint
    or a model that never returns valid JSON is visible. stdout is left
    untouched so the ``[START]``/``[STEP]``/``[END]`` log lines stay clean.
    """
    import sys  # function-scope import: this module does not import sys at the top

    if model_client is None:
        return heuristic_action(observation)

    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            completion = model_client.chat.completions.create(
                model=MODEL_NAME,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": build_user_prompt(observation)},
                ],
                temperature=TEMPERATURE,
                max_tokens=MAX_TOKENS,
            )
            content = (completion.choices[0].message.content or "").strip()
            return normalize_action(extract_json(content), observation)
        except Exception as exc:  # deliberate best-effort: any failure falls back
            detail = str(exc).replace("\n", " ").replace("\r", "")
            print(
                f"[WARN] model attempt {attempt}/{max_attempts} failed: "
                f"{type(exc).__name__}: {detail}",
                file=sys.stderr,
                flush=True,
            )

    return heuristic_action(observation)
|
| 298 |
+
|
| 299 |
+
|
| 300 |
+
def reward_value(step_data: Dict[str, Any]) -> float:
    """Read the numeric reward from a step response.

    Accepts either a reward object (its ``"value"`` entry is used, default
    0.0) or a bare number; a missing or falsy reward counts as 0.0.
    """
    raw_reward = step_data.get("reward", {})
    if not isinstance(raw_reward, dict):
        return float(raw_reward or 0.0)
    return float(raw_reward.get("value", 0.0))
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def active_model_name(model_client: Optional[OpenAI]) -> str:
    """Name the decision source: the configured model, or the heuristic baseline."""
    if model_client is None:
        return "deterministic-baseline"
    return MODEL_NAME
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def summarize_action(action: Dict[str, Any]) -> str:
    """Return the first populated answer field as a string, for logging.

    Fields are checked in the order severity, root_cause, action;
    ``"no_action"`` is returned when all of them are ``None`` or absent.
    """
    answers = (action.get(name) for name in ("severity", "root_cause", "action"))
    chosen = next((answer for answer in answers if answer is not None), None)
    return "no_action" if chosen is None else str(chosen)
|
| 317 |
+
|
| 318 |
+
|
| 319 |
+
def run_episode(
    transport: EnvironmentTransport,
    model_client: Optional[OpenAI],
    ticket: Dict[str, Any],
) -> Dict[str, Any]:
    """Run one single-step episode for ``ticket`` and return its result record.

    The episode is reset for the ticket's task type and incident id, one
    action is chosen and submitted, and the ``[START]``/``[STEP]``/``[END]``
    log lines are printed. Any exception raised after the start line is
    logged as an error step and turned into a zero-score result carrying an
    ``"error"`` message instead of propagating.
    """
    incident_id = ticket["incident_id"]
    episode_rewards: List[float] = []
    step_count = 0

    log_start(task=incident_id, env=BENCHMARK, model=active_model_name(model_client))

    try:
        reset_payload = transport.reset(ticket["task_type"], incident_id)
        observation = reset_payload["observation"]
        session_id = reset_payload.get("info", {}).get("session_id")
        if not session_id:
            raise RuntimeError("Environment reset did not return a session_id.")

        step_count = 1
        chosen_action = get_action(model_client, observation)
        step_payload = transport.step(session_id=session_id, action=chosen_action)
        episode_score = reward_value(step_payload)
        episode_rewards.append(episode_score)
        # Prefer the environment's own verdict; otherwise treat a near-perfect
        # reward as success.
        solved = bool(step_payload.get("info", {}).get("correct", episode_score >= 0.99))

        log_step(
            step=1,
            action=summarize_action(chosen_action),
            reward=episode_score,
            done=bool(step_payload.get("done", True)),
            error=None,
        )
        log_end(success=solved, steps=step_count, score=episode_score, rewards=episode_rewards)

        step_info = step_payload.get("info", {})
        return {
            "incident_id": incident_id,
            "task_type": ticket["task_type"],
            "difficulty": observation.get("difficulty"),
            "score": episode_score,
            "success": solved,
            "ground_truth": step_info.get("ground_truth"),
            "agent_answer": step_info.get("agent_answer"),
        }
    except Exception as failure:
        reason = str(failure)
        # An error step is always logged, numbered at least 1 even when the
        # failure happened before any step was taken.
        log_step(step=max(step_count, 1), action="error", reward=0.0, done=True, error=reason)
        log_end(success=False, steps=step_count, score=0.0, rewards=episode_rewards)
        return {
            "incident_id": incident_id,
            "task_type": ticket["task_type"],
            "score": 0.0,
            "success": False,
            "error": reason,
        }
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def write_results(results: List[Dict[str, Any]]) -> None:
    """Aggregate episode results and write the JSON summary to ``OUTPUT_PATH``.

    The summary holds the overall and per-task-type average score (a result
    without a ``"score"`` counts as 0.0) plus the raw result records. The
    output directory is created if needed.
    """

    def _mean(values: List[float]) -> float:
        # Empty input averages to 0.0 instead of dividing by zero.
        return (sum(values) / len(values)) if values else 0.0

    scores_by_task: Dict[str, List[float]] = {}
    every_score: List[float] = []
    for record in results:
        bucket = scores_by_task.setdefault(record["task_type"], [])
        record_score = record.get("score", 0.0)
        bucket.append(record_score)
        every_score.append(record_score)

    by_task: Dict[str, Dict[str, Any]] = {}
    for task_type, task_scores in scores_by_task.items():
        by_task[task_type] = {
            "episodes": len(task_scores),
            "average_score": _mean(task_scores),
        }

    summary = {
        "benchmark": BENCHMARK,
        "model": MODEL_NAME,
        "episodes": len(results),
        "average_score": _mean(every_score),
        "by_task": by_task,
        "results": results,
    }

    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.write_text(json.dumps(summary, indent=2))
|
| 397 |
+
|
| 398 |
+
|
| 399 |
+
def main() -> None:
    """Run the baseline over every ticket in ``TICKETS`` and write the summary.

    The transport is closed in a ``finally`` block so its HTTP session or
    in-process test client is released even when client creation, an
    episode, or the results write raises.
    """
    transport = build_transport()
    try:
        model_client = create_model_client()
        results = [run_episode(transport, model_client, ticket) for ticket in TICKETS]
        write_results(results)
    finally:
        transport.close()
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
if __name__ == "__main__":
|
| 408 |
+
main()
|
models.py
CHANGED
|
@@ -1,11 +1,14 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
|
| 3 |
from pydantic import BaseModel, Field
|
| 4 |
-
from enum import Enum
|
| 5 |
-
from typing import Optional, Dict
|
| 6 |
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
class SeverityLevel(str, Enum):
|
| 11 |
SEV1 = "SEV1"
|
|
@@ -14,52 +17,97 @@ class SeverityLevel(str, Enum):
|
|
| 14 |
|
| 15 |
|
| 16 |
class RootCauseCategory(str, Enum):
|
| 17 |
-
DATABASE
|
| 18 |
-
NETWORK
|
| 19 |
-
APPLICATION
|
| 20 |
INFRASTRUCTURE = "INFRASTRUCTURE"
|
| 21 |
-
THIRD_PARTY
|
| 22 |
-
UNKNOWN
|
| 23 |
|
| 24 |
|
| 25 |
class RecommendedAction(str, Enum):
|
| 26 |
-
ROLLBACK
|
| 27 |
-
SCALE_UP
|
| 28 |
RESTART_SERVICE = "RESTART_SERVICE"
|
| 29 |
-
FAILOVER
|
| 30 |
-
NOTIFY_VENDOR
|
| 31 |
-
INVESTIGATE
|
| 32 |
-
NO_ACTION
|
| 33 |
|
| 34 |
|
| 35 |
-
# ── Observation (Input to Agent) ──────────────────────
|
| 36 |
-
|
| 37 |
class IncidentObservation(BaseModel):
|
| 38 |
incident_id: str
|
| 39 |
-
task_type:
|
|
|
|
|
|
|
| 40 |
alert_text: str
|
| 41 |
-
context: Dict
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
# ── Action (Output from Agent) ────────────────────────
|
| 45 |
|
| 46 |
class IncidentAction(BaseModel):
|
| 47 |
incident_id: str
|
| 48 |
-
task_type:
|
| 49 |
-
|
| 50 |
-
severity: Optional[SeverityLevel] = Field(None)
|
| 51 |
root_cause: Optional[RootCauseCategory] = Field(None)
|
| 52 |
-
action:
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
-
# ── Step Result ───────────────────────────────────────
|
| 56 |
|
| 57 |
class StepResult(BaseModel):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
incident_id: str
|
| 59 |
-
task_type:
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
from typing import Any, Dict, List, Optional
|
| 3 |
|
| 4 |
from pydantic import BaseModel, Field
|
|
|
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
+
class TaskType(str, Enum):
    """String-valued identifiers for the three task types."""

    TASK1 = "task1"
    TASK2 = "task2"
    TASK3 = "task3"
|
| 11 |
+
|
| 12 |
|
| 13 |
class SeverityLevel(str, Enum):
|
| 14 |
SEV1 = "SEV1"
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
class RootCauseCategory(str, Enum):
    """Allowed root-cause labels; each member's value equals its name."""

    DATABASE = "DATABASE"
    NETWORK = "NETWORK"
    APPLICATION = "APPLICATION"
    INFRASTRUCTURE = "INFRASTRUCTURE"
    THIRD_PARTY = "THIRD_PARTY"
    # Label for when none of the specific categories is chosen.
    UNKNOWN = "UNKNOWN"
|
| 26 |
|
| 27 |
|
| 28 |
class RecommendedAction(str, Enum):
    """Allowed remediation labels; each member's value equals its name."""

    ROLLBACK = "ROLLBACK"
    SCALE_UP = "SCALE_UP"
    RESTART_SERVICE = "RESTART_SERVICE"
    FAILOVER = "FAILOVER"
    NOTIFY_VENDOR = "NOTIFY_VENDOR"
    INVESTIGATE = "INVESTIGATE"
    NO_ACTION = "NO_ACTION"
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
| 38 |
class IncidentObservation(BaseModel):
    """Observation for one incident: the alert, its context, and episode progress."""

    # --- incident and task description ---
    incident_id: str
    task_type: TaskType
    difficulty: str
    task_description: str
    alert_text: str
    context: Dict[str, Any]
    # Name of the action field to populate, and the labels accepted for it.
    expected_field: str
    allowed_values: List[str] = Field(default_factory=list)
    # --- episode progress ---
    step_count: int = 0
    max_steps: int = 1
    last_action_summary: Optional[str] = None
    last_reward: float = 0.0
    episode_status: str = "awaiting_action"
|
| 52 |
|
|
|
|
| 53 |
|
| 54 |
class IncidentAction(BaseModel):
    """Agent answer for one incident.

    Carries the incident and task identifiers plus three optional answer
    fields; the helper methods report which answer fields are set.
    """

    incident_id: str
    task_type: TaskType
    severity: Optional[SeverityLevel] = Field(default=None)
    root_cause: Optional[RootCauseCategory] = Field(default=None)
    action: Optional[RecommendedAction] = Field(default=None)

    def populated_fields(self) -> Dict[str, str]:
        """Map each non-None answer field to its enum value, in declaration order."""
        answers = (
            ("severity", self.severity),
            ("root_cause", self.root_cause),
            ("action", self.action),
        )
        return {name: member.value for name, member in answers if member is not None}

    def selected_field(self) -> Optional[str]:
        """Return the single populated field name, or None if zero or several are set."""
        names = list(self.populated_fields())
        return names[0] if len(names) == 1 else None

    def selected_value(self) -> Optional[str]:
        """Return the value of the single populated field, or None if there is not exactly one."""
        field_name = self.selected_field()
        return None if field_name is None else self.populated_fields()[field_name]
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
class IncidentReward(BaseModel):
    """Reward for a step: a required score in [0.0, 1.0] and a reason string."""

    # Required; validation rejects values outside the inclusive 0.0-1.0 range.
    value: float = Field(..., ge=0.0, le=1.0)
    reason: str
|
| 87 |
|
|
|
|
| 88 |
|
| 89 |
class StepResult(BaseModel):
    """Response envelope bundling observation, reward, done flag, and extra info."""

    observation: IncidentObservation
    reward: IncidentReward
    done: bool
    # Free-form metadata; defaults to a fresh empty dict per instance.
    info: Dict[str, Any] = Field(default_factory=dict)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class IncidentState(BaseModel):
    """Snapshot of an episode's progress and the incident it is about."""

    # --- episode bookkeeping ---
    episode_id: str
    session_id: Optional[str] = None
    step_count: int
    max_steps: int
    total_reward: float = 0.0
    done: bool
    # --- incident being triaged ---
    incident_id: str
    task_type: TaskType
    difficulty: str
    status: str
    last_reward: float = 0.0
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
class ResetRequest(BaseModel):
    """Body of a reset request; every field is optional and defaults to None."""

    task_type: Optional[TaskType] = None
    ticket_id: Optional[str] = None
    seed: Optional[int] = None
|
openenv.yaml
CHANGED
|
@@ -1,27 +1,45 @@
|
|
| 1 |
spec_version: 1
|
| 2 |
-
name:
|
| 3 |
type: space
|
| 4 |
runtime: fastapi
|
| 5 |
app: app:app
|
| 6 |
port: 7860
|
| 7 |
version: "1.0.0"
|
|
|
|
| 8 |
description: >
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
identify root cause, or recommend
|
|
|
|
| 12 |
|
| 13 |
api:
|
| 14 |
base_url: http://0.0.0.0:7860
|
| 15 |
endpoints:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
reset:
|
| 17 |
method: POST
|
| 18 |
path: /reset
|
| 19 |
-
|
| 20 |
task_type:
|
| 21 |
type: string
|
| 22 |
required: false
|
| 23 |
enum: [task1, task2, task3]
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
step:
|
| 27 |
method: POST
|
|
@@ -31,7 +49,7 @@ api:
|
|
| 31 |
type: string
|
| 32 |
required: true
|
| 33 |
body: IncidentAction
|
| 34 |
-
returns: StepResult
|
| 35 |
|
| 36 |
state:
|
| 37 |
method: GET
|
|
@@ -40,35 +58,46 @@ api:
|
|
| 40 |
session_id:
|
| 41 |
type: string
|
| 42 |
required: true
|
| 43 |
-
returns:
|
| 44 |
|
| 45 |
tasks:
|
| 46 |
task1:
|
| 47 |
name: Severity Classification
|
|
|
|
| 48 |
output_field: severity
|
| 49 |
labels: [SEV1, SEV2, SEV3]
|
| 50 |
-
reward:
|
| 51 |
|
| 52 |
task2:
|
| 53 |
name: Root Cause Classification
|
|
|
|
| 54 |
output_field: root_cause
|
| 55 |
labels: [DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN]
|
| 56 |
-
reward:
|
| 57 |
|
| 58 |
task3:
|
| 59 |
name: Recommended Action
|
|
|
|
| 60 |
output_field: action
|
| 61 |
labels: [ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION]
|
| 62 |
-
reward:
|
| 63 |
|
| 64 |
dataset:
|
| 65 |
total_tickets: 36
|
| 66 |
split:
|
| 67 |
-
task1:
|
| 68 |
task2: 12
|
| 69 |
-
task3:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
reproducibility:
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
| 1 |
spec_version: 1
|
| 2 |
+
name: incident-triage-env
|
| 3 |
type: space
|
| 4 |
runtime: fastapi
|
| 5 |
app: app:app
|
| 6 |
port: 7860
|
| 7 |
version: "1.0.0"
|
| 8 |
+
tags: [openenv]
|
| 9 |
description: >
|
| 10 |
+
Production incident triage environment for evaluating agents on realistic
|
| 11 |
+
SRE workflows. The agent receives a typed incident observation and must
|
| 12 |
+
classify severity, identify the most likely root cause, or recommend the
|
| 13 |
+
best immediate remediation action.
|
| 14 |
|
| 15 |
api:
|
| 16 |
base_url: http://0.0.0.0:7860
|
| 17 |
endpoints:
|
| 18 |
+
health:
|
| 19 |
+
method: GET
|
| 20 |
+
path: /health
|
| 21 |
+
returns: health status
|
| 22 |
+
|
| 23 |
+
metadata:
|
| 24 |
+
method: GET
|
| 25 |
+
path: /metadata
|
| 26 |
+
returns: task metadata and dataset summary
|
| 27 |
+
|
| 28 |
reset:
|
| 29 |
method: POST
|
| 30 |
path: /reset
|
| 31 |
+
body:
|
| 32 |
task_type:
|
| 33 |
type: string
|
| 34 |
required: false
|
| 35 |
enum: [task1, task2, task3]
|
| 36 |
+
ticket_id:
|
| 37 |
+
type: string
|
| 38 |
+
required: false
|
| 39 |
+
seed:
|
| 40 |
+
type: integer
|
| 41 |
+
required: false
|
| 42 |
+
returns: StepResult with initial observation and session_id in info
|
| 43 |
|
| 44 |
step:
|
| 45 |
method: POST
|
|
|
|
| 49 |
type: string
|
| 50 |
required: true
|
| 51 |
body: IncidentAction
|
| 52 |
+
returns: StepResult with reward object, done flag, and episode info
|
| 53 |
|
| 54 |
state:
|
| 55 |
method: GET
|
|
|
|
| 58 |
session_id:
|
| 59 |
type: string
|
| 60 |
required: true
|
| 61 |
+
returns: IncidentState
|
| 62 |
|
| 63 |
tasks:
|
| 64 |
task1:
|
| 65 |
name: Severity Classification
|
| 66 |
+
difficulty: easy
|
| 67 |
output_field: severity
|
| 68 |
labels: [SEV1, SEV2, SEV3]
|
| 69 |
+
reward: "1.0 exact | 0.5 adjacent severity | 0.0 far miss"
|
| 70 |
|
| 71 |
task2:
|
| 72 |
name: Root Cause Classification
|
| 73 |
+
difficulty: medium
|
| 74 |
output_field: root_cause
|
| 75 |
labels: [DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN]
|
| 76 |
+
reward: "1.0 exact | 0.5 related domain | 0.25 UNKNOWN fallback | 0.0 wrong"
|
| 77 |
|
| 78 |
task3:
|
| 79 |
name: Recommended Action
|
| 80 |
+
difficulty: hard
|
| 81 |
output_field: action
|
| 82 |
labels: [ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION]
|
| 83 |
+
reward: "1.0 exact | 0.4 safe investigate fallback | 0.25 related action | 0.0 wrong"
|
| 84 |
|
| 85 |
dataset:
|
| 86 |
total_tickets: 36
|
| 87 |
split:
|
| 88 |
+
task1: 11
|
| 89 |
task2: 12
|
| 90 |
+
task3: 13
|
| 91 |
+
|
| 92 |
+
baseline:
|
| 93 |
+
script: inference.py
|
| 94 |
+
required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
|
| 95 |
+
optional_env_vars: [ENV_URL]
|
| 96 |
+
latest_local_score: 0.9861
|
| 97 |
+
latest_local_episodes: 36
|
| 98 |
|
| 99 |
reproducibility:
|
| 100 |
+
inference_temperature: 0.0
|
| 101 |
+
max_steps_per_episode: 1
|
| 102 |
+
dataset_order: fixed TICKETS list order in incidents.py
|
| 103 |
+
baseline_selection: deterministic ticket_id-driven evaluation across all tickets
|
pyproject.toml
CHANGED
|
@@ -1,45 +1,39 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
[build-system]
|
| 8 |
-
requires = ["setuptools
|
| 9 |
build-backend = "setuptools.build_meta"
|
| 10 |
|
| 11 |
[project]
|
| 12 |
-
name = "
|
| 13 |
version = "0.1.0"
|
| 14 |
-
description = "
|
|
|
|
| 15 |
requires-python = ">=3.10"
|
| 16 |
dependencies = [
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
"
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# "numpy>=1.19.0",
|
| 25 |
-
# "torch>=2.0.0",
|
| 26 |
-
# "gymnasium>=0.29.0",
|
| 27 |
-
# "openspiel>=1.0.0",
|
| 28 |
-
# "smolagents>=1.22.0,<2",
|
| 29 |
]
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
[project.optional-dependencies]
|
| 32 |
dev = [
|
| 33 |
"pytest>=8.0.0",
|
| 34 |
"pytest-cov>=4.0.0",
|
| 35 |
]
|
| 36 |
|
| 37 |
-
[project.scripts]
|
| 38 |
-
# Server entry point - enables running via: uv run --project . server
|
| 39 |
-
# or: python -m Incident_Triage.server.app
|
| 40 |
-
server = "Incident_Triage.server.app:main"
|
| 41 |
-
|
| 42 |
[tool.setuptools]
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
[build-system]
|
| 2 |
+
requires = ["setuptools", "wheel"]
|
| 3 |
build-backend = "setuptools.build_meta"
|
| 4 |
|
| 5 |
[project]
|
| 6 |
+
name = "incident-triage-env"
|
| 7 |
version = "0.1.0"
|
| 8 |
+
description = "FastAPI incident triage environment for production alert classification."
|
| 9 |
+
readme = "README.md"
|
| 10 |
requires-python = ">=3.10"
|
| 11 |
dependencies = [
|
| 12 |
+
"fastapi",
|
| 13 |
+
"uvicorn",
|
| 14 |
+
"pydantic",
|
| 15 |
+
"openai",
|
| 16 |
+
"requests",
|
| 17 |
+
"python-dotenv",
|
| 18 |
+
"openenv-core>=0.2.0",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
]
|
| 20 |
|
| 21 |
+
[project.scripts]
|
| 22 |
+
server = "server.app:main"
|
| 23 |
+
|
| 24 |
[project.optional-dependencies]
|
| 25 |
dev = [
|
| 26 |
"pytest>=8.0.0",
|
| 27 |
"pytest-cov>=4.0.0",
|
| 28 |
]
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
[tool.setuptools]
|
| 31 |
+
py-modules = [
|
| 32 |
+
"app",
|
| 33 |
+
"client",
|
| 34 |
+
"environment",
|
| 35 |
+
"graders",
|
| 36 |
+
"incidents",
|
| 37 |
+
"inference",
|
| 38 |
+
"models",
|
| 39 |
+
]
|
requirements.txt
CHANGED
|
@@ -2,4 +2,7 @@ fastapi
|
|
| 2 |
uvicorn
|
| 3 |
pydantic
|
| 4 |
openai
|
| 5 |
-
requests
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
uvicorn
|
| 3 |
pydantic
|
| 4 |
openai
|
| 5 |
+
requests
|
| 6 |
+
python-dotenv
|
| 7 |
+
setuptools
|
| 8 |
+
wheel
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""OpenEnv compatibility package for the incident triage server."""
|
server/app.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compatibility entrypoint expected by OpenEnv validators and templates."""
|
| 2 |
+
|
| 3 |
+
from app import app
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def main(host: str = "0.0.0.0", port: int = 7860) -> None:
    """Serve the app with uvicorn.

    Args:
        host: Interface to bind; defaults to all interfaces.
        port: TCP port to listen on; defaults to 7860.

    Calling ``main()`` with no arguments behaves as before, so the
    ``__main__`` guard and any no-argument entry point are unaffected.
    """
    # Imported here so importing this module does not require uvicorn.
    import uvicorn

    uvicorn.run("server.app:app", host=host, port=port)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
if __name__ == "__main__":
|
| 13 |
+
main()
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from fastapi.testclient import TestClient
|
| 4 |
+
|
| 5 |
+
from app import app, sessions
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class IncidentEnvApiTests(unittest.TestCase):
    """HTTP-level checks of the incident triage app through ``TestClient``.

    The ``sessions`` object imported from ``app`` is cleared before and after
    every test so that no session leaks from one test into the next.
    """

    def setUp(self) -> None:
        sessions.clear()
        self.client = TestClient(app)

    def tearDown(self) -> None:
        sessions.clear()

    def _reset_task3_inc014(self):
        """POST /reset for ticket INC-014 under task3 and return the response."""
        return self.client.post(
            "/reset",
            json={"task_type": "task3", "ticket_id": "INC-014"},
        )

    def _post_step(self, session_id, action_payload):
        """POST ``action_payload`` to /step for ``session_id`` and return the response."""
        return self.client.post(
            f"/step?session_id={session_id}",
            json=action_payload,
        )

    def test_health_schema_and_mcp_helper_endpoints(self) -> None:
        """/health, /schema and /mcp answer 200 with the expected top-level fields."""
        health = self.client.get("/health")
        self.assertEqual(health.status_code, 200)
        self.assertEqual(health.json()["status"], "healthy")

        schema = self.client.get("/schema")
        self.assertEqual(schema.status_code, 200)
        sections = schema.json()
        for section_name in ("action", "observation", "state"):
            self.assertIn(section_name, sections)

        ping_request = {"jsonrpc": "2.0", "id": 1, "method": "ping"}
        mcp = self.client.post("/mcp", json=ping_request)
        self.assertEqual(mcp.status_code, 200)
        reply = mcp.json()
        self.assertEqual(reply["jsonrpc"], "2.0")
        self.assertEqual(reply["id"], 1)

    def test_tickets_endpoint_returns_safe_ticket_inventory(self) -> None:
        """/tickets lists 36 tickets and does not expose ``ground_truth``."""
        listing = self.client.get("/tickets")
        self.assertEqual(listing.status_code, 200)
        inventory = listing.json()
        self.assertEqual(inventory["count"], 36)
        first_ticket = inventory["tickets"][0]
        self.assertEqual(first_ticket["incident_id"], "INC-001")
        self.assertIn("expected_field", first_ticket)
        self.assertNotIn("ground_truth", first_ticket)

    def test_ui_routes_and_assets_are_served(self) -> None:
        """Each UI route and the JS asset return 200 and contain a marker string."""
        expected_markers = (
            ("/", "Incident Triage Environment"),
            ("/status", "Environment readiness dashboard"),
            ("/playground", "Interactive playground"),
            ("/assets/app.js", "bootstrap"),
        )
        for route, marker in expected_markers:
            page = self.client.get(route)
            self.assertEqual(page.status_code, 200)
            self.assertIn(marker, page.text)

    def test_reset_returns_requested_ticket_and_session_state(self) -> None:
        """/reset echoes the requested ticket and reports a fresh, unfinished session."""
        reply = self._reset_task3_inc014()

        self.assertEqual(reply.status_code, 200)
        data = reply.json()
        observation = data["observation"]
        self.assertEqual(observation["incident_id"], "INC-014")
        self.assertEqual(observation["task_type"], "task3")
        self.assertEqual(data["reward"]["value"], 0.0)
        self.assertFalse(data["done"])
        info = data["info"]
        self.assertIn("session_id", info)
        self.assertEqual(info["state"]["status"], "awaiting_action")

    def test_step_completes_episode_and_state_endpoint_reflects_completion(self) -> None:
        """A correct step finishes the episode and /state then reports completion."""
        session_id = self._reset_task3_inc014().json()["info"]["session_id"]

        step = self._post_step(
            session_id,
            {
                "incident_id": "INC-014",
                "task_type": "task3",
                "action": "FAILOVER",
            },
        )

        self.assertEqual(step.status_code, 200)
        outcome = step.json()
        self.assertTrue(outcome["done"])
        self.assertEqual(outcome["reward"]["value"], 1.0)
        self.assertTrue(outcome["info"]["correct"])
        self.assertEqual(outcome["info"]["ground_truth"], "FAILOVER")

        state = self.client.get(f"/state?session_id={session_id}")
        self.assertEqual(state.status_code, 200)
        snapshot = state.json()
        self.assertTrue(snapshot["done"])
        self.assertEqual(snapshot["status"], "completed")
        self.assertEqual(snapshot["last_reward"], 1.0)

    def test_step_rejects_action_for_wrong_task_type(self) -> None:
        """A step whose task_type differs from the session's is rejected with 400."""
        session_id = self._reset_task3_inc014().json()["info"]["session_id"]

        step = self._post_step(
            session_id,
            {
                "incident_id": "INC-014",
                "task_type": "task2",
                "root_cause": "NETWORK",
            },
        )

        self.assertEqual(step.status_code, 400)
        self.assertIn("does not match", step.json()["detail"])
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
if __name__ == "__main__":
|
| 126 |
+
unittest.main()
|
tests/test_graders.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import unittest
|
| 2 |
+
|
| 3 |
+
from graders import GRADERS, grade_task1, grade_task2, grade_task3
|
| 4 |
+
from incidents import TICKETS
|
| 5 |
+
from models import IncidentAction
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class GraderTests(unittest.TestCase):
    """Unit tests for the task graders exposed by the ``graders`` module."""

    def test_all_ticket_ground_truth_scores_are_bounded(self) -> None:
        """Grading each ticket against its own ground truth stays within [0, 1]."""
        for ticket in TICKETS:
            # subTest lets the loop continue after a failure, so every
            # offending ticket is reported rather than only the first one.
            with self.subTest(incident_id=ticket["incident_id"]):
                action = IncidentAction(
                    incident_id=ticket["incident_id"],
                    task_type=ticket["task_type"],
                    **ticket["ground_truth"],
                )
                score, reason = GRADERS[ticket["task_type"]](action, ticket["ground_truth"])
                self.assertGreaterEqual(score, 0.0, ticket["incident_id"])
                self.assertLessEqual(score, 1.0, ticket["incident_id"])
                self.assertIsInstance(reason, str)

    def test_task1_grader_supports_partial_credit(self) -> None:
        """For truth SEV1, answering SEV1 scores 1.0 and SEV2 scores 0.5."""
        exact = IncidentAction(
            incident_id="INC-TEST-1",
            task_type="task1",
            severity="SEV1",
        )
        adjacent = IncidentAction(
            incident_id="INC-TEST-1",
            task_type="task1",
            severity="SEV2",
        )
        exact_score, _ = grade_task1(exact, {"severity": "SEV1"})
        adjacent_score, _ = grade_task1(adjacent, {"severity": "SEV1"})
        self.assertEqual(exact_score, 1.0)
        self.assertEqual(adjacent_score, 0.5)

    def test_task2_grader_is_not_constant(self) -> None:
        """For truth DATABASE: DATABASE scores 1.0, UNKNOWN 0.25, NETWORK 0.0."""
        exact = IncidentAction(
            incident_id="INC-TEST-2",
            task_type="task2",
            root_cause="DATABASE",
        )
        fallback = IncidentAction(
            incident_id="INC-TEST-2",
            task_type="task2",
            root_cause="UNKNOWN",
        )
        wrong = IncidentAction(
            incident_id="INC-TEST-2",
            task_type="task2",
            root_cause="NETWORK",
        )
        exact_score, _ = grade_task2(exact, {"root_cause": "DATABASE"})
        fallback_score, _ = grade_task2(fallback, {"root_cause": "DATABASE"})
        wrong_score, _ = grade_task2(wrong, {"root_cause": "DATABASE"})
        self.assertEqual(exact_score, 1.0)
        self.assertEqual(fallback_score, 0.25)
        self.assertEqual(wrong_score, 0.0)

    def test_task3_grader_rewards_safe_fallbacks(self) -> None:
        """For truth FAILOVER: FAILOVER scores 1.0, INVESTIGATE 0.4, NO_ACTION 0.0."""
        exact = IncidentAction(
            incident_id="INC-TEST-3",
            task_type="task3",
            action="FAILOVER",
        )
        fallback = IncidentAction(
            incident_id="INC-TEST-3",
            task_type="task3",
            action="INVESTIGATE",
        )
        wrong = IncidentAction(
            incident_id="INC-TEST-3",
            task_type="task3",
            action="NO_ACTION",
        )
        exact_score, _ = grade_task3(exact, {"action": "FAILOVER"})
        fallback_score, _ = grade_task3(fallback, {"action": "FAILOVER"})
        wrong_score, _ = grade_task3(wrong, {"action": "FAILOVER"})
        self.assertEqual(exact_score, 1.0)
        self.assertEqual(fallback_score, 0.4)
        self.assertEqual(wrong_score, 0.0)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
unittest.main()
|
ui/assets/app.js
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Turn a failed response body into a readable one-line description.
 *
 * Strings and other primitives are returned as text. For objects the
 * `detail` field is preferred; a non-string detail (for example a list of
 * validation errors) is serialized as JSON instead of collapsing to
 * "[object Object]". A JSON `null` body is handled without throwing.
 */
function describeErrorBody(body) {
  if (body === null || typeof body !== "object") {
    return String(body);
  }
  const detail = body.detail || body;
  return typeof detail === "string" ? detail : JSON.stringify(detail);
}

/**
 * Fetch `url` and return the parsed body.
 *
 * The body is parsed as JSON when the response content type contains
 * "application/json", otherwise it is read as text.
 *
 * @param {string} url - Request URL.
 * @param {object} [options] - Options passed straight through to `fetch`.
 * @returns {Promise<*>} Parsed JSON value or response text.
 * @throws {Error} When the response status is not ok; the message is
 *   "<status> <statusText>: <description of the body>".
 */
async function fetchJson(url, options = {}) {
  const response = await fetch(url, options);
  const contentType = response.headers.get("content-type") || "";
  const body = contentType.includes("application/json") ? await response.json() : await response.text();
  if (!response.ok) {
    throw new Error(`${response.status} ${response.statusText}: ${describeErrorBody(body)}`);
  }
  return body;
}
|
| 11 |
+
|
| 12 |
+
/**
 * Convert a value to display text, substituting "--" for null or undefined.
 */
function safeText(value) {
  if (value === null || value === undefined) {
    return "--";
  }
  return String(value);
}
|
| 15 |
+
|
| 16 |
+
/**
 * Update every `[data-health-pill]` element to reflect the health status.
 *
 * A status of "healthy" shows "Healthy" and clears the `is-pending` class;
 * any other value shows "Unavailable" and sets it.
 */
function setHealthPill(status) {
  const isHealthy = status === "healthy";
  const label = isHealthy ? "Healthy" : "Unavailable";
  for (const pill of document.querySelectorAll("[data-health-pill]")) {
    pill.textContent = label;
    pill.classList.toggle("is-pending", !isHealthy);
  }
}
|
| 23 |
+
|
| 24 |
+
/**
 * Replace the contents of `target` with one card per entry in `tasks`.
 *
 * Each card shows the task difficulty, name, expected field (falling back
 * to `output_field`), task id, ticket count (0 when absent) and one badge
 * per allowed value (falling back to `labels`). Does nothing when `target`
 * is falsy.
 */
function renderTaskCards(target, tasks) {
  if (!target) {
    return;
  }
  target.innerHTML = "";
  for (const [taskId, task] of Object.entries(tasks)) {
    const fieldName = task.expected_field || task.output_field;
    const incidentCount = task.ticket_count || 0;
    const valueBadges = (task.allowed_values || task.labels || [])
      .map((value) => `<span class="badge">${value}</span>`)
      .join("");

    const card = document.createElement("article");
    card.className = "task-card";
    card.innerHTML = `
      <span class="badge difficulty-${task.difficulty}">${task.difficulty}</span>
      <h3>${task.name}</h3>
      <p>Expected field: <strong>${fieldName}</strong></p>
      <div class="task-meta">
        <span class="badge">${taskId}</span>
        <span class="badge">${incidentCount} incidents</span>
      </div>
      <div class="task-values">
        ${valueBadges}
      </div>
    `;
    target.appendChild(card);
  }
}
|
| 45 |
+
|
| 46 |
+
/**
 * Populate the home page from the /health and /metadata endpoints.
 *
 * Both requests are issued concurrently; the health pill, the incident and
 * task counters and the task card grid are filled once both have resolved.
 */
async function initHome() {
  const healthRequest = fetchJson("/health");
  const metadataRequest = fetchJson("/metadata");
  const [health, metadata] = await Promise.all([healthRequest, metadataRequest]);

  setHealthPill(health.status);

  const totalTarget = document.querySelector("[data-total-incidents]");
  totalTarget.textContent = safeText(metadata.total_tickets);

  const taskCountTarget = document.querySelector("[data-task-count]");
  taskCountTarget.textContent = safeText(Object.keys(metadata.tasks).length);

  renderTaskCards(document.querySelector("[data-task-grid]"), metadata.tasks);
}
|
| 57 |
+
|
| 58 |
+
/**
 * Populate the status page from /health, /metadata, /grader and /schema.
 *
 * All four requests run concurrently. The page then shows the health text,
 * the ticket total, the number and names of the schema sections, the task
 * cards, the grader scoring summary and one list item per grader rule.
 */
async function initStatus() {
  const endpoints = ["/health", "/metadata", "/grader", "/schema"];
  const [health, metadata, grader, schema] = await Promise.all(
    endpoints.map((path) => fetchJson(path)),
  );

  const fill = (selector, text) => {
    document.querySelector(selector).textContent = text;
  };

  fill("[data-health-text]", health.status);
  fill("[data-total-incidents]", safeText(metadata.total_tickets));
  fill("[data-schema-count]", safeText(Object.keys(schema).length));
  renderTaskCards(document.querySelector("[data-task-grid]"), metadata.tasks);

  const schemaGrid = document.querySelector("[data-schema-grid]");
  const schemaBadges = Object.keys(schema).map((name) => `<span class="badge">${name}</span>`);
  schemaGrid.innerHTML = schemaBadges.join("");

  fill("[data-grader-summary]", grader.scoring);
  const graderList = document.querySelector("[data-grader-list]");
  const ruleItems = Object.entries(grader.tasks).map(
    ([task, rule]) => `<li><strong>${task}</strong>: ${rule}</li>`,
  );
  graderList.innerHTML = ruleItems.join("");
}
|
| 82 |
+
|
| 83 |
+
/**
 * Build the /step request body for an observation.
 *
 * The result carries the observation's `incident_id` and `task_type`, plus
 * `selectedValue` stored under the key named by `observation.expected_field`.
 */
function buildActionPayload(observation, selectedValue) {
  const { incident_id, task_type, expected_field } = observation;
  const action = { incident_id, task_type };
  action[expected_field] = selectedValue;
  return action;
}
|
| 91 |
+
|
| 92 |
+
/**
 * Set up the interactive playground page.
 *
 * Looks up the page's form controls and output elements, loads the ticket
 * list from /tickets, and registers handlers for the preset buttons, the
 * reset form (POST /reset) and the step form (POST /step). The session id
 * and observation returned by the most recent successful reset are kept in
 * local variables and reused when a step is submitted.
 */
async function initPlayground() {
  const resetForm = document.getElementById("reset-form");
  const stepForm = document.getElementById("step-form");
  const taskTypeInput = document.getElementById("task-type");
  const ticketIdInput = document.getElementById("ticket-id");
  const ticketOptions = document.getElementById("ticket-options");
  const ticketHelper = document.getElementById("ticket-helper");
  const expectedFieldInput = document.getElementById("expected-field");
  const actionValueSelect = document.getElementById("action-value");
  const stepButton = document.getElementById("step-button");
  const resetButton = document.getElementById("reset-button");
  const sessionIdTarget = document.getElementById("session-id");
  const observationOutput = document.getElementById("observation-output");
  const resultOutput = document.getElementById("result-output");
  const messageTarget = document.getElementById("playground-message");
  const summaryIncident = document.getElementById("summary-incident");
  const summaryField = document.getElementById("summary-field");
  const summaryReward = document.getElementById("summary-reward");
  const summaryStatus = document.getElementById("summary-status");

  // State shared by the handlers below. sessionId and observation are only
  // assigned after a successful /reset; validTickets is filled from /tickets.
  let sessionId = null;
  let observation = null;
  let validTickets = [];

  // Show a string as-is, or any other value as pretty-printed JSON.
  const setOutput = (target, data) => {
    target.textContent = typeof data === "string" ? data : JSON.stringify(data, null, 2);
  };

  // Show a status message and record its mode in the element's data-mode attribute.
  const setMessage = (message, mode = "neutral") => {
    messageTarget.textContent = message;
    messageTarget.dataset.mode = mode;
  };

  // Disable the button and show busyText while busy; otherwise enable it and show idleText.
  const setBusy = (button, isBusy, busyText, idleText) => {
    button.disabled = isBusy;
    button.textContent = isBusy ? busyText : idleText;
  };

  // Reset the summary panel for a freshly started session.
  const updateSummaryFromObservation = (nextObservation) => {
    summaryIncident.textContent = nextObservation.incident_id;
    summaryField.textContent = nextObservation.expected_field;
    summaryReward.textContent = "--";
    summaryStatus.textContent = "Awaiting action";
  };

  // Reflect a step result in the summary panel; "--" when no reward value is present.
  const updateSummaryFromResult = (result) => {
    summaryReward.textContent = result.reward?.value ?? "--";
    summaryStatus.textContent = result.done ? "Completed" : "In progress";
  };

  // Look up a loaded ticket by exact incident id; undefined when not found.
  const findTicket = (ticketId) => validTickets.find((ticket) => ticket.incident_id === ticketId);

  // When the typed ticket id is known, switch the task type to the ticket's own.
  const syncTaskTypeFromTicket = () => {
    const ticket = findTicket(ticketIdInput.value.trim());
    if (!ticket) return;
    taskTypeInput.value = ticket.task_type;
    ticketHelper.textContent = `${ticket.incident_id} is a ${ticket.task_type} ${ticket.difficulty} ticket.`;
  };

  // When a task type is chosen, pre-fill the first loaded ticket of that type.
  const chooseFirstTicketForTask = () => {
    if (!taskTypeInput.value) return;
    const ticket = validTickets.find((item) => item.task_type === taskTypeInput.value);
    if (ticket) {
      ticketIdInput.value = ticket.incident_id;
      ticketHelper.textContent = `${ticket.incident_id} selected for ${taskTypeInput.value}.`;
    }
  };

  // Load the ticket list. A failure is reported in the helper text only, so
  // the rest of the page is still wired up with an empty validTickets list.
  try {
    const ticketData = await fetchJson("/tickets");
    validTickets = ticketData.tickets || [];
    ticketOptions.innerHTML = validTickets
      .map((ticket) => `<option value="${ticket.incident_id}" label="${ticket.task_type} / ${ticket.task_name}"></option>`)
      .join("");
    ticketHelper.textContent = `Valid ticket range: ${validTickets[0]?.incident_id || "--"} to ${validTickets.at(-1)?.incident_id || "--"}.`;
  } catch (error) {
    ticketHelper.textContent = `Could not load ticket list: ${error.message}`;
  }

  // Preset buttons copy their data-preset-task / data-preset-ticket values
  // into the form; they do not submit it.
  document.querySelectorAll("[data-preset-task]").forEach((button) => {
    button.addEventListener("click", () => {
      taskTypeInput.value = button.dataset.presetTask;
      ticketIdInput.value = button.dataset.presetTicket;
      setMessage(`Preset loaded: ${button.dataset.presetTask} / ${button.dataset.presetTicket}. Click Start / Reset Environment.`, "success");
    });
  });

  resetForm.addEventListener("submit", async (event) => {
    event.preventDefault();
    const formData = new FormData(resetForm);
    const payload = {};

    // Send only the fields that were filled in; "seed" is sent as a number.
    for (const [key, value] of formData.entries()) {
      if (value !== "") {
        payload[key] = key === "seed" ? Number(value) : value;
      }
    }

    const requestedTicket = payload.ticket_id;
    const knownTicket = requestedTicket ? findTicket(requestedTicket) : null;
    // Reject an unknown ticket locally, but only when the ticket list was
    // actually loaded; with an empty list the request is sent regardless.
    if (requestedTicket && validTickets.length > 0 && !knownTicket) {
      const message = `Ticket ${requestedTicket} does not exist. Use one of ${validTickets[0].incident_id} to ${validTickets.at(-1).incident_id}, or click a preset.`;
      setOutput(observationOutput, { error: message });
      setMessage(message, "error");
      return;
    }
    // The ticket's own task type overrides a conflicting selection.
    if (knownTicket && payload.task_type && payload.task_type !== knownTicket.task_type) {
      payload.task_type = knownTicket.task_type;
      taskTypeInput.value = knownTicket.task_type;
      ticketHelper.textContent = `Task type changed to ${knownTicket.task_type} because ${knownTicket.incident_id} belongs to that task.`;
    }

    try {
      setBusy(resetButton, true, "Starting...", "Start / Reset Environment");
      setMessage("Reset request sent. Watch the terminal for a [RESET] log.", "neutral");
      const result = await fetchJson("/reset", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
      });

      // Remember the new session for later step submissions.
      sessionId = result.info.session_id;
      observation = result.observation;

      // Enable the step controls and offer the observation's allowed values.
      sessionIdTarget.textContent = sessionId;
      expectedFieldInput.value = observation.expected_field;
      actionValueSelect.disabled = false;
      stepButton.disabled = false;
      actionValueSelect.innerHTML = observation.allowed_values
        .map((value) => `<option value="${value}">${value}</option>`)
        .join("");

      setOutput(observationOutput, result);
      setOutput(resultOutput, "No step submitted yet.");
      updateSummaryFromObservation(observation);
      setMessage(`Session ready for ${observation.incident_id}. Pick a value and submit the step.`, "success");
    } catch (error) {
      setOutput(observationOutput, { error: error.message });
      setMessage(error.message, "error");
    } finally {
      setBusy(resetButton, false, "Starting...", "Start / Reset Environment");
    }
  });

  stepForm.addEventListener("submit", async (event) => {
    event.preventDefault();
    // A step needs the session and observation from an earlier reset.
    if (!sessionId || !observation) {
      setOutput(resultOutput, { error: "Start a session first." });
      setMessage("Start a session before submitting a step.", "error");
      return;
    }

    try {
      setBusy(stepButton, true, "Submitting...", "Submit Step");
      setMessage("Step request sent. Watch the terminal for a [STEP] log.", "neutral");
      const result = await fetchJson(`/step?session_id=${encodeURIComponent(sessionId)}`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(buildActionPayload(observation, actionValueSelect.value)),
      });
      setOutput(resultOutput, result);
      updateSummaryFromResult(result);
      const reward = result.reward?.value ?? "--";
      // Only a reward of exactly 1 is shown in "success" mode.
      setMessage(`Step completed with reward ${reward}.`, reward === 1 ? "success" : "neutral");
    } catch (error) {
      setOutput(resultOutput, { error: error.message });
      setMessage(error.message, "error");
    } finally {
      // observation is non-null on every path that reaches this point
      // (the guard above returns early otherwise), so the button is
      // re-enabled after each attempt, including after a finished episode.
      if (observation) {
        setBusy(stepButton, false, "Submitting...", "Submit Step");
      }
    }
  });

  // Keep the task type and ticket id fields consistent as the user edits them.
  ticketIdInput.addEventListener("change", syncTaskTypeFromTicket);
  ticketIdInput.addEventListener("blur", syncTaskTypeFromTicket);
  taskTypeInput.addEventListener("change", chooseFirstTicketForTask);
}
|
| 270 |
+
|
| 271 |
+
/**
 * Page entry point: run the initializer matching `document.body.dataset.page`.
 *
 * Recognized pages are "home", "status" and "playground"; any other value
 * does nothing. If the initializer throws, a banner with the error message
 * is prepended to the `.page-shell` element (when that element exists).
 */
async function bootstrap() {
  const page = document.body.dataset.page;
  try {
    if (page === "home") {
      await initHome();
    } else if (page === "status") {
      await initStatus();
    } else if (page === "playground") {
      await initPlayground();
    }
  } catch (error) {
    const pageShell = document.querySelector(".page-shell");
    const banner = document.createElement("div");
    banner.className = "floating-panel";

    const heading = document.createElement("strong");
    heading.textContent = "UI data load failed.";

    // The message can contain a raw response body (fetchJson includes the
    // text of non-JSON error responses), so it is inserted as plain text
    // rather than parsed as HTML.
    const detail = document.createElement("p");
    detail.className = "status-helper";
    detail.textContent = error.message;

    banner.append(heading, detail);
    pageShell?.prepend(banner);
  }
}
|
| 289 |
+
|
| 290 |
+
window.addEventListener("DOMContentLoaded", bootstrap);
|
ui/assets/styles.css
ADDED
|
@@ -0,0 +1,731 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@import url("https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;700&family=IBM+Plex+Mono:wght@400;500&display=swap");
|
| 2 |
+
|
| 3 |
+
:root {
|
| 4 |
+
--bg: #f4ede2;
|
| 5 |
+
--bg-accent: #e7f1f0;
|
| 6 |
+
--surface: rgba(255, 251, 246, 0.86);
|
| 7 |
+
--surface-strong: rgba(255, 255, 255, 0.94);
|
| 8 |
+
--border: rgba(24, 46, 56, 0.12);
|
| 9 |
+
--text: #13232c;
|
| 10 |
+
--muted: #5d6a70;
|
| 11 |
+
--accent: #c5532f;
|
| 12 |
+
--accent-deep: #7a2f1a;
|
| 13 |
+
--signal: #0d7c66;
|
| 14 |
+
--signal-soft: rgba(13, 124, 102, 0.14);
|
| 15 |
+
--shadow: 0 24px 50px rgba(33, 48, 55, 0.12);
|
| 16 |
+
--radius-xl: 28px;
|
| 17 |
+
--radius-lg: 20px;
|
| 18 |
+
--radius-md: 14px;
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
* {
|
| 22 |
+
box-sizing: border-box;
|
| 23 |
+
}
|
| 24 |
+
|
| 25 |
+
html {
|
| 26 |
+
scroll-behavior: smooth;
|
| 27 |
+
}
|
| 28 |
+
|
| 29 |
+
body {
|
| 30 |
+
margin: 0;
|
| 31 |
+
min-height: 100vh;
|
| 32 |
+
color: var(--text);
|
| 33 |
+
background:
|
| 34 |
+
radial-gradient(circle at top left, rgba(197, 83, 47, 0.16), transparent 28%),
|
| 35 |
+
radial-gradient(circle at top right, rgba(13, 124, 102, 0.18), transparent 26%),
|
| 36 |
+
linear-gradient(145deg, #f7efe4 0%, #edf6f4 48%, #f5ebdf 100%);
|
| 37 |
+
font-family: "Space Grotesk", "Avenir Next", sans-serif;
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
body::before,
|
| 41 |
+
body::after {
|
| 42 |
+
content: "";
|
| 43 |
+
position: fixed;
|
| 44 |
+
inset: auto;
|
| 45 |
+
width: 320px;
|
| 46 |
+
height: 320px;
|
| 47 |
+
border-radius: 50%;
|
| 48 |
+
filter: blur(12px);
|
| 49 |
+
opacity: 0.4;
|
| 50 |
+
pointer-events: none;
|
| 51 |
+
z-index: 0;
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
body::before {
|
| 55 |
+
right: -80px;
|
| 56 |
+
top: 120px;
|
| 57 |
+
background: rgba(197, 83, 47, 0.18);
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
body::after {
|
| 61 |
+
left: -100px;
|
| 62 |
+
bottom: 40px;
|
| 63 |
+
background: rgba(13, 124, 102, 0.14);
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
a {
|
| 67 |
+
color: inherit;
|
| 68 |
+
text-decoration: none;
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
code,
|
| 72 |
+
pre,
|
| 73 |
+
.mono-label {
|
| 74 |
+
font-family: "IBM Plex Mono", "SFMono-Regular", monospace;
|
| 75 |
+
}
|
| 76 |
+
|
| 77 |
+
.page-shell {
|
| 78 |
+
position: relative;
|
| 79 |
+
z-index: 1;
|
| 80 |
+
width: min(1240px, calc(100vw - 48px));
|
| 81 |
+
margin: 0 auto;
|
| 82 |
+
padding: 24px 0 80px;
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
.topbar {
|
| 86 |
+
display: flex;
|
| 87 |
+
align-items: center;
|
| 88 |
+
justify-content: space-between;
|
| 89 |
+
gap: 20px;
|
| 90 |
+
padding: 14px 0 24px;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
.brand {
|
| 94 |
+
display: inline-flex;
|
| 95 |
+
flex-direction: column;
|
| 96 |
+
gap: 2px;
|
| 97 |
+
}
|
| 98 |
+
|
| 99 |
+
.brand-kicker,
|
| 100 |
+
.eyebrow {
|
| 101 |
+
color: var(--accent-deep);
|
| 102 |
+
font-size: 0.78rem;
|
| 103 |
+
font-weight: 700;
|
| 104 |
+
letter-spacing: 0.18em;
|
| 105 |
+
text-transform: uppercase;
|
| 106 |
+
}
|
| 107 |
+
|
| 108 |
+
.brand-title {
|
| 109 |
+
font-size: 1.35rem;
|
| 110 |
+
font-weight: 700;
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
.nav-links {
|
| 114 |
+
display: flex;
|
| 115 |
+
gap: 18px;
|
| 116 |
+
flex-wrap: wrap;
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
.nav-links a {
|
| 120 |
+
color: var(--muted);
|
| 121 |
+
font-weight: 500;
|
| 122 |
+
transition: color 180ms ease, transform 180ms ease;
|
| 123 |
+
}
|
| 124 |
+
|
| 125 |
+
.nav-links a:hover {
|
| 126 |
+
color: var(--text);
|
| 127 |
+
transform: translateY(-1px);
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
.hero,
|
| 131 |
+
.dual-grid,
|
| 132 |
+
.playground-grid,
|
| 133 |
+
.playground-summary,
|
| 134 |
+
.guide-grid,
|
| 135 |
+
.status-overview,
|
| 136 |
+
.feature-strip,
|
| 137 |
+
.route-grid,
|
| 138 |
+
.task-grid {
|
| 139 |
+
display: grid;
|
| 140 |
+
gap: 22px;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
.hero {
|
| 144 |
+
grid-template-columns: minmax(0, 1.35fr) minmax(320px, 0.95fr);
|
| 145 |
+
align-items: stretch;
|
| 146 |
+
min-height: 520px;
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
.hero-copy,
|
| 150 |
+
.floating-panel,
|
| 151 |
+
.feature-card,
|
| 152 |
+
.task-card,
|
| 153 |
+
.route-card,
|
| 154 |
+
.stat-card {
|
| 155 |
+
background: var(--surface);
|
| 156 |
+
backdrop-filter: blur(18px);
|
| 157 |
+
border: 1px solid var(--border);
|
| 158 |
+
box-shadow: var(--shadow);
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
.hero-copy,
|
| 162 |
+
.floating-panel {
|
| 163 |
+
border-radius: var(--radius-xl);
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
.hero-copy {
|
| 167 |
+
padding: 44px;
|
| 168 |
+
display: flex;
|
| 169 |
+
flex-direction: column;
|
| 170 |
+
justify-content: center;
|
| 171 |
+
animation: rise-in 650ms ease both;
|
| 172 |
+
}
|
| 173 |
+
|
| 174 |
+
.hero-copy h1,
|
| 175 |
+
.section-heading h1 {
|
| 176 |
+
margin: 12px 0;
|
| 177 |
+
font-size: clamp(2.8rem, 6vw, 5.6rem);
|
| 178 |
+
line-height: 0.95;
|
| 179 |
+
letter-spacing: -0.05em;
|
| 180 |
+
}
|
| 181 |
+
|
| 182 |
+
.hero-text,
|
| 183 |
+
.section-copy {
|
| 184 |
+
max-width: 56ch;
|
| 185 |
+
font-size: 1.05rem;
|
| 186 |
+
line-height: 1.7;
|
| 187 |
+
color: var(--muted);
|
| 188 |
+
}
|
| 189 |
+
|
| 190 |
+
.hero-actions {
|
| 191 |
+
display: flex;
|
| 192 |
+
flex-wrap: wrap;
|
| 193 |
+
gap: 14px;
|
| 194 |
+
margin-top: 26px;
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
.button {
|
| 198 |
+
display: inline-flex;
|
| 199 |
+
align-items: center;
|
| 200 |
+
justify-content: center;
|
| 201 |
+
min-height: 46px;
|
| 202 |
+
padding: 0 20px;
|
| 203 |
+
border-radius: 999px;
|
| 204 |
+
border: 1px solid transparent;
|
| 205 |
+
font-weight: 700;
|
| 206 |
+
cursor: pointer;
|
| 207 |
+
transition: transform 180ms ease, box-shadow 180ms ease, background 180ms ease;
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
.button:hover {
|
| 211 |
+
transform: translateY(-1px);
|
| 212 |
+
}
|
| 213 |
+
|
| 214 |
+
.button:disabled {
|
| 215 |
+
cursor: not-allowed;
|
| 216 |
+
opacity: 0.58;
|
| 217 |
+
transform: none;
|
| 218 |
+
}
|
| 219 |
+
|
| 220 |
+
.button-primary {
|
| 221 |
+
background: linear-gradient(135deg, var(--accent), #dd7e36);
|
| 222 |
+
color: #fff;
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
.button-secondary {
|
| 226 |
+
background: rgba(19, 35, 44, 0.06);
|
| 227 |
+
border-color: rgba(19, 35, 44, 0.08);
|
| 228 |
+
}
|
| 229 |
+
|
| 230 |
+
.hero-panel,
|
| 231 |
+
.big-status-card {
|
| 232 |
+
padding: 24px;
|
| 233 |
+
animation: rise-in 800ms ease both;
|
| 234 |
+
}
|
| 235 |
+
|
| 236 |
+
.panel-header {
|
| 237 |
+
display: flex;
|
| 238 |
+
align-items: center;
|
| 239 |
+
justify-content: space-between;
|
| 240 |
+
gap: 16px;
|
| 241 |
+
margin-bottom: 18px;
|
| 242 |
+
font-weight: 700;
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
.live-pill,
|
| 246 |
+
.badge,
|
| 247 |
+
.status-chip {
|
| 248 |
+
display: inline-flex;
|
| 249 |
+
align-items: center;
|
| 250 |
+
gap: 8px;
|
| 251 |
+
min-height: 32px;
|
| 252 |
+
padding: 0 12px;
|
| 253 |
+
border-radius: 999px;
|
| 254 |
+
background: var(--signal-soft);
|
| 255 |
+
color: var(--signal);
|
| 256 |
+
font-size: 0.86rem;
|
| 257 |
+
font-weight: 700;
|
| 258 |
+
}
|
| 259 |
+
|
| 260 |
+
.live-pill.is-pending,
|
| 261 |
+
.status-chip.is-pending {
|
| 262 |
+
background: rgba(197, 83, 47, 0.12);
|
| 263 |
+
color: var(--accent-deep);
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
.stats-grid,
|
| 267 |
+
.feature-strip,
|
| 268 |
+
.status-overview {
|
| 269 |
+
grid-template-columns: repeat(2, minmax(0, 1fr));
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
.feature-strip {
|
| 273 |
+
margin-top: 24px;
|
| 274 |
+
}
|
| 275 |
+
|
| 276 |
+
.guide-grid {
|
| 277 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
.guide-card {
|
| 281 |
+
min-height: 168px;
|
| 282 |
+
padding: 24px;
|
| 283 |
+
border: 1px solid rgba(19, 35, 44, 0.1);
|
| 284 |
+
border-radius: var(--radius-lg);
|
| 285 |
+
background: rgba(255, 255, 255, 0.68);
|
| 286 |
+
box-shadow: 0 16px 34px rgba(33, 48, 55, 0.08);
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
.guide-card span {
|
| 290 |
+
display: inline-flex;
|
| 291 |
+
align-items: center;
|
| 292 |
+
justify-content: center;
|
| 293 |
+
min-width: 34px;
|
| 294 |
+
min-height: 34px;
|
| 295 |
+
padding: 0 10px;
|
| 296 |
+
border-radius: 999px;
|
| 297 |
+
background: rgba(197, 83, 47, 0.12);
|
| 298 |
+
color: var(--accent-deep);
|
| 299 |
+
font-weight: 700;
|
| 300 |
+
}
|
| 301 |
+
|
| 302 |
+
.guide-card strong {
|
| 303 |
+
display: block;
|
| 304 |
+
margin-top: 16px;
|
| 305 |
+
font-size: 1.2rem;
|
| 306 |
+
}
|
| 307 |
+
|
| 308 |
+
.guide-card p {
|
| 309 |
+
margin: 10px 0 0;
|
| 310 |
+
color: var(--muted);
|
| 311 |
+
line-height: 1.65;
|
| 312 |
+
}
|
| 313 |
+
|
| 314 |
+
.stat-card,
|
| 315 |
+
.feature-card,
|
| 316 |
+
.task-card,
|
| 317 |
+
.route-card {
|
| 318 |
+
border-radius: var(--radius-lg);
|
| 319 |
+
padding: 24px;
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
+
.stat-label,
|
| 323 |
+
.status-caption {
|
| 324 |
+
display: block;
|
| 325 |
+
color: var(--muted);
|
| 326 |
+
font-size: 0.84rem;
|
| 327 |
+
text-transform: uppercase;
|
| 328 |
+
letter-spacing: 0.08em;
|
| 329 |
+
}
|
| 330 |
+
|
| 331 |
+
.stat-value,
|
| 332 |
+
.status-display {
|
| 333 |
+
display: block;
|
| 334 |
+
margin-top: 12px;
|
| 335 |
+
font-size: clamp(1.8rem, 4vw, 3.2rem);
|
| 336 |
+
line-height: 1;
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
.feature-card {
|
| 340 |
+
min-height: 220px;
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
.feature-index {
|
| 344 |
+
color: var(--accent);
|
| 345 |
+
font-size: 0.9rem;
|
| 346 |
+
font-weight: 700;
|
| 347 |
+
letter-spacing: 0.16em;
|
| 348 |
+
}
|
| 349 |
+
|
| 350 |
+
.feature-card h2,
|
| 351 |
+
.task-card h3,
|
| 352 |
+
.route-card strong {
|
| 353 |
+
margin: 16px 0 12px;
|
| 354 |
+
font-size: 1.4rem;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
.feature-card p,
|
| 358 |
+
.task-card p,
|
| 359 |
+
.route-card span,
|
| 360 |
+
.status-helper,
|
| 361 |
+
.copy-block p,
|
| 362 |
+
.bullet-list {
|
| 363 |
+
color: var(--muted);
|
| 364 |
+
line-height: 1.7;
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
.tasks-section,
|
| 368 |
+
.stack-layout {
|
| 369 |
+
display: grid;
|
| 370 |
+
gap: 20px;
|
| 371 |
+
margin-top: 24px;
|
| 372 |
+
}
|
| 373 |
+
|
| 374 |
+
.section-heading {
|
| 375 |
+
padding-top: 8px;
|
| 376 |
+
}
|
| 377 |
+
|
| 378 |
+
.section-heading h2 {
|
| 379 |
+
margin: 8px 0 0;
|
| 380 |
+
font-size: clamp(1.9rem, 4vw, 3.4rem);
|
| 381 |
+
line-height: 1.02;
|
| 382 |
+
}
|
| 383 |
+
|
| 384 |
+
.section-heading.compact h2 {
|
| 385 |
+
font-size: clamp(1.35rem, 3vw, 2rem);
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
.task-grid,
|
| 389 |
+
.route-grid {
|
| 390 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 391 |
+
}
|
| 392 |
+
|
| 393 |
+
.task-card {
|
| 394 |
+
position: relative;
|
| 395 |
+
overflow: hidden;
|
| 396 |
+
display: flex;
|
| 397 |
+
min-height: 245px;
|
| 398 |
+
flex-direction: column;
|
| 399 |
+
justify-content: space-between;
|
| 400 |
+
gap: 14px;
|
| 401 |
+
}
|
| 402 |
+
|
| 403 |
+
.task-card::after {
|
| 404 |
+
content: "";
|
| 405 |
+
position: absolute;
|
| 406 |
+
inset: auto -20% -60% auto;
|
| 407 |
+
width: 180px;
|
| 408 |
+
height: 180px;
|
| 409 |
+
border-radius: 50%;
|
| 410 |
+
background: radial-gradient(circle, rgba(13, 124, 102, 0.12), transparent 70%);
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
.task-meta,
|
| 414 |
+
.task-values {
|
| 415 |
+
display: flex;
|
| 416 |
+
flex-wrap: wrap;
|
| 417 |
+
gap: 8px;
|
| 418 |
+
margin-top: 14px;
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
.badge {
|
| 422 |
+
background: rgba(19, 35, 44, 0.06);
|
| 423 |
+
color: var(--text);
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
.difficulty-easy {
|
| 427 |
+
background: rgba(13, 124, 102, 0.14);
|
| 428 |
+
color: var(--signal);
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
.difficulty-medium {
|
| 432 |
+
background: rgba(227, 154, 52, 0.16);
|
| 433 |
+
color: #8b5a12;
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
.difficulty-hard {
|
| 437 |
+
background: rgba(197, 83, 47, 0.14);
|
| 438 |
+
color: var(--accent-deep);
|
| 439 |
+
}
|
| 440 |
+
|
| 441 |
+
.routes-section {
|
| 442 |
+
margin-top: 24px;
|
| 443 |
+
padding: 26px;
|
| 444 |
+
}
|
| 445 |
+
|
| 446 |
+
.route-card {
|
| 447 |
+
min-height: 150px;
|
| 448 |
+
transition: transform 180ms ease, border-color 180ms ease;
|
| 449 |
+
}
|
| 450 |
+
|
| 451 |
+
.route-card:hover {
|
| 452 |
+
transform: translateY(-4px);
|
| 453 |
+
border-color: rgba(197, 83, 47, 0.2);
|
| 454 |
+
}
|
| 455 |
+
|
| 456 |
+
.status-overview {
|
| 457 |
+
grid-template-columns: 1.2fr 1fr 1fr;
|
| 458 |
+
}
|
| 459 |
+
|
| 460 |
+
.big-status-card {
|
| 461 |
+
background: linear-gradient(145deg, rgba(255, 247, 241, 0.92), rgba(243, 255, 251, 0.82));
|
| 462 |
+
}
|
| 463 |
+
|
| 464 |
+
.dual-grid {
|
| 465 |
+
grid-template-columns: repeat(2, minmax(0, 1fr));
|
| 466 |
+
}
|
| 467 |
+
|
| 468 |
+
.floating-panel {
|
| 469 |
+
padding: 30px;
|
| 470 |
+
}
|
| 471 |
+
|
| 472 |
+
.playground-grid {
|
| 473 |
+
grid-template-columns: minmax(320px, 0.9fr) minmax(360px, 1fr);
|
| 474 |
+
align-items: start;
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
.control-panel {
|
| 478 |
+
min-height: 100%;
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
.preset-row {
|
| 482 |
+
display: grid;
|
| 483 |
+
grid-template-columns: repeat(3, minmax(0, 1fr));
|
| 484 |
+
gap: 10px;
|
| 485 |
+
margin: 20px 0;
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
.preset-button {
|
| 489 |
+
min-height: 44px;
|
| 490 |
+
border: 1px solid rgba(19, 35, 44, 0.1);
|
| 491 |
+
border-radius: 999px;
|
| 492 |
+
background: rgba(255, 255, 255, 0.72);
|
| 493 |
+
color: var(--text);
|
| 494 |
+
font: inherit;
|
| 495 |
+
font-size: 0.92rem;
|
| 496 |
+
font-weight: 700;
|
| 497 |
+
cursor: pointer;
|
| 498 |
+
transition: transform 180ms ease, border-color 180ms ease, background 180ms ease;
|
| 499 |
+
}
|
| 500 |
+
|
| 501 |
+
.preset-button:hover {
|
| 502 |
+
transform: translateY(-1px);
|
| 503 |
+
border-color: rgba(197, 83, 47, 0.28);
|
| 504 |
+
background: rgba(255, 255, 255, 0.94);
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
.ui-message {
|
| 508 |
+
margin-top: 18px;
|
| 509 |
+
padding: 14px 16px;
|
| 510 |
+
border-radius: var(--radius-md);
|
| 511 |
+
background: rgba(19, 35, 44, 0.05);
|
| 512 |
+
color: var(--muted);
|
| 513 |
+
line-height: 1.5;
|
| 514 |
+
}
|
| 515 |
+
|
| 516 |
+
.ui-message[data-mode="success"] {
|
| 517 |
+
background: var(--signal-soft);
|
| 518 |
+
color: var(--signal);
|
| 519 |
+
}
|
| 520 |
+
|
| 521 |
+
.ui-message[data-mode="error"] {
|
| 522 |
+
background: rgba(197, 83, 47, 0.14);
|
| 523 |
+
color: var(--accent-deep);
|
| 524 |
+
}
|
| 525 |
+
|
| 526 |
+
.playground-summary {
|
| 527 |
+
grid-template-columns: repeat(4, minmax(0, 1fr));
|
| 528 |
+
}
|
| 529 |
+
|
| 530 |
+
.summary-card {
|
| 531 |
+
padding: 18px 20px;
|
| 532 |
+
border: 1px solid var(--border);
|
| 533 |
+
border-radius: var(--radius-lg);
|
| 534 |
+
background: var(--surface-strong);
|
| 535 |
+
box-shadow: 0 14px 30px rgba(33, 48, 55, 0.08);
|
| 536 |
+
}
|
| 537 |
+
|
| 538 |
+
.summary-card span {
|
| 539 |
+
display: block;
|
| 540 |
+
color: var(--muted);
|
| 541 |
+
font-size: 0.8rem;
|
| 542 |
+
font-weight: 700;
|
| 543 |
+
letter-spacing: 0.08em;
|
| 544 |
+
text-transform: uppercase;
|
| 545 |
+
}
|
| 546 |
+
|
| 547 |
+
.summary-card strong {
|
| 548 |
+
display: block;
|
| 549 |
+
margin-top: 10px;
|
| 550 |
+
font-size: clamp(1.2rem, 3vw, 2rem);
|
| 551 |
+
}
|
| 552 |
+
|
| 553 |
+
.badge-grid {
|
| 554 |
+
display: flex;
|
| 555 |
+
flex-wrap: wrap;
|
| 556 |
+
gap: 10px;
|
| 557 |
+
}
|
| 558 |
+
|
| 559 |
+
.copy-block,
|
| 560 |
+
.bullet-list {
|
| 561 |
+
margin: 0;
|
| 562 |
+
padding: 0;
|
| 563 |
+
}
|
| 564 |
+
|
| 565 |
+
.bullet-list {
|
| 566 |
+
list-style: none;
|
| 567 |
+
display: grid;
|
| 568 |
+
gap: 10px;
|
| 569 |
+
margin-top: 14px;
|
| 570 |
+
}
|
| 571 |
+
|
| 572 |
+
.bullet-list li {
|
| 573 |
+
padding: 12px 14px;
|
| 574 |
+
border-radius: var(--radius-md);
|
| 575 |
+
background: rgba(19, 35, 44, 0.04);
|
| 576 |
+
}
|
| 577 |
+
|
| 578 |
+
.form-grid {
|
| 579 |
+
display: grid;
|
| 580 |
+
gap: 18px;
|
| 581 |
+
}
|
| 582 |
+
|
| 583 |
+
.form-grid label {
|
| 584 |
+
display: grid;
|
| 585 |
+
gap: 8px;
|
| 586 |
+
color: var(--muted);
|
| 587 |
+
font-weight: 500;
|
| 588 |
+
}
|
| 589 |
+
|
| 590 |
+
.form-grid input,
|
| 591 |
+
.form-grid select,
|
| 592 |
+
.form-grid small,
|
| 593 |
+
.form-grid button {
|
| 594 |
+
width: 100%;
|
| 595 |
+
}
|
| 596 |
+
|
| 597 |
+
.form-grid input,
|
| 598 |
+
.form-grid select {
|
| 599 |
+
min-height: 50px;
|
| 600 |
+
padding: 0 16px;
|
| 601 |
+
border-radius: 16px;
|
| 602 |
+
border: 1px solid rgba(19, 35, 44, 0.12);
|
| 603 |
+
background: rgba(255, 255, 255, 0.9);
|
| 604 |
+
color: var(--text);
|
| 605 |
+
font: inherit;
|
| 606 |
+
}
|
| 607 |
+
|
| 608 |
+
.form-grid input:focus,
|
| 609 |
+
.form-grid select:focus {
|
| 610 |
+
border-color: rgba(197, 83, 47, 0.45);
|
| 611 |
+
box-shadow: 0 0 0 4px rgba(197, 83, 47, 0.1);
|
| 612 |
+
outline: none;
|
| 613 |
+
}
|
| 614 |
+
|
| 615 |
+
.form-grid small {
|
| 616 |
+
display: block;
|
| 617 |
+
color: var(--muted);
|
| 618 |
+
line-height: 1.45;
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
.form-grid input:disabled {
|
| 622 |
+
color: var(--muted);
|
| 623 |
+
}
|
| 624 |
+
|
| 625 |
+
.inline-status {
|
| 626 |
+
display: flex;
|
| 627 |
+
align-items: center;
|
| 628 |
+
gap: 10px;
|
| 629 |
+
margin-top: 20px;
|
| 630 |
+
color: var(--muted);
|
| 631 |
+
overflow-wrap: anywhere;
|
| 632 |
+
}
|
| 633 |
+
|
| 634 |
+
.code-panel {
|
| 635 |
+
min-height: 340px;
|
| 636 |
+
margin: 0;
|
| 637 |
+
padding: 22px;
|
| 638 |
+
border-radius: var(--radius-lg);
|
| 639 |
+
background: #101a21;
|
| 640 |
+
color: #e7f3f0;
|
| 641 |
+
overflow: auto;
|
| 642 |
+
font-size: 0.92rem;
|
| 643 |
+
line-height: 1.6;
|
| 644 |
+
}
|
| 645 |
+
|
| 646 |
+
.output-grid {
|
| 647 |
+
align-items: stretch;
|
| 648 |
+
}
|
| 649 |
+
|
| 650 |
+
.skeleton-card {
|
| 651 |
+
min-height: 220px;
|
| 652 |
+
background:
|
| 653 |
+
linear-gradient(90deg, rgba(255, 255, 255, 0.55), rgba(255, 255, 255, 0.9), rgba(255, 255, 255, 0.55));
|
| 654 |
+
background-size: 200% 100%;
|
| 655 |
+
animation: shimmer 1.2s infinite linear;
|
| 656 |
+
}
|
| 657 |
+
|
| 658 |
+
@keyframes shimmer {
|
| 659 |
+
from { background-position: 200% 0; }
|
| 660 |
+
to { background-position: -200% 0; }
|
| 661 |
+
}
|
| 662 |
+
|
| 663 |
+
@keyframes rise-in {
|
| 664 |
+
from {
|
| 665 |
+
opacity: 0;
|
| 666 |
+
transform: translateY(18px);
|
| 667 |
+
}
|
| 668 |
+
to {
|
| 669 |
+
opacity: 1;
|
| 670 |
+
transform: translateY(0);
|
| 671 |
+
}
|
| 672 |
+
}
|
| 673 |
+
|
| 674 |
+
@media (max-width: 1080px) {
|
| 675 |
+
.hero,
|
| 676 |
+
.dual-grid,
|
| 677 |
+
.playground-grid,
|
| 678 |
+
.playground-summary,
|
| 679 |
+
.guide-grid,
|
| 680 |
+
.status-overview,
|
| 681 |
+
.task-grid,
|
| 682 |
+
.route-grid {
|
| 683 |
+
grid-template-columns: 1fr;
|
| 684 |
+
}
|
| 685 |
+
|
| 686 |
+
.stats-grid,
|
| 687 |
+
.feature-strip {
|
| 688 |
+
grid-template-columns: 1fr 1fr;
|
| 689 |
+
}
|
| 690 |
+
}
|
| 691 |
+
|
| 692 |
+
@media (max-width: 720px) {
|
| 693 |
+
.page-shell {
|
| 694 |
+
width: min(100vw - 24px, 1180px);
|
| 695 |
+
padding-top: 12px;
|
| 696 |
+
padding-bottom: 40px;
|
| 697 |
+
}
|
| 698 |
+
|
| 699 |
+
.topbar {
|
| 700 |
+
flex-direction: column;
|
| 701 |
+
align-items: flex-start;
|
| 702 |
+
}
|
| 703 |
+
|
| 704 |
+
.hero-copy,
|
| 705 |
+
.floating-panel,
|
| 706 |
+
.routes-section {
|
| 707 |
+
padding: 24px;
|
| 708 |
+
}
|
| 709 |
+
|
| 710 |
+
.preset-row {
|
| 711 |
+
grid-template-columns: 1fr;
|
| 712 |
+
}
|
| 713 |
+
|
| 714 |
+
.stats-grid,
|
| 715 |
+
.feature-strip {
|
| 716 |
+
grid-template-columns: 1fr;
|
| 717 |
+
}
|
| 718 |
+
|
| 719 |
+
.hero-copy h1,
|
| 720 |
+
.section-heading h1 {
|
| 721 |
+
font-size: clamp(2.4rem, 14vw, 3.6rem);
|
| 722 |
+
}
|
| 723 |
+
|
| 724 |
+
.section-heading h2 {
|
| 725 |
+
font-size: clamp(1.6rem, 8vw, 2.4rem);
|
| 726 |
+
}
|
| 727 |
+
|
| 728 |
+
.hero-actions {
|
| 729 |
+
flex-direction: column;
|
| 730 |
+
}
|
| 731 |
+
}
|
ui/index.html
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>Incident Triage Environment</title>
|
| 7 |
+
<link rel="stylesheet" href="/assets/styles.css?v=3">
|
| 8 |
+
</head>
|
| 9 |
+
<body data-page="home">
|
| 10 |
+
<div class="page-shell">
|
| 11 |
+
<header class="topbar">
|
| 12 |
+
<a class="brand" href="/">
|
| 13 |
+
<span class="brand-kicker">OpenEnv Environment</span>
|
| 14 |
+
<span class="brand-title">Incident Triage</span>
|
| 15 |
+
</a>
|
| 16 |
+
<nav class="nav-links">
|
| 17 |
+
<a href="/status">Status</a>
|
| 18 |
+
<a href="/playground">Playground</a>
|
| 19 |
+
<a href="/docs">API Docs</a>
|
| 20 |
+
</nav>
|
| 21 |
+
</header>
|
| 22 |
+
|
| 23 |
+
<main>
|
| 24 |
+
<section class="hero">
|
| 25 |
+
<div class="hero-copy">
|
| 26 |
+
<p class="eyebrow">Production Incident Response</p>
|
| 27 |
+
<h1>Welcome to the Incident Triage Environment</h1>
|
| 28 |
+
<p class="hero-text">
|
| 29 |
+
A real-world OpenEnv environment for severity classification, root-cause analysis,
|
| 30 |
+
and next-action recommendation across production incidents.
|
| 31 |
+
</p>
|
| 32 |
+
<div class="hero-actions">
|
| 33 |
+
<a class="button button-primary" href="/playground">Launch Playground</a>
|
| 34 |
+
<a class="button button-secondary" href="/status">View Live Status</a>
|
| 35 |
+
</div>
|
| 36 |
+
</div>
|
| 37 |
+
<div class="hero-panel floating-panel">
|
| 38 |
+
<div class="panel-header">
|
| 39 |
+
<span>Live Snapshot</span>
|
| 40 |
+
<span class="live-pill" data-health-pill>Checking</span>
|
| 41 |
+
</div>
|
| 42 |
+
<div class="stats-grid">
|
| 43 |
+
<article class="stat-card">
|
| 44 |
+
<span class="stat-label">Total Incidents</span>
|
| 45 |
+
<strong class="stat-value" data-total-incidents>--</strong>
|
| 46 |
+
</article>
|
| 47 |
+
<article class="stat-card">
|
| 48 |
+
<span class="stat-label">Task Families</span>
|
| 49 |
+
<strong class="stat-value" data-task-count>--</strong>
|
| 50 |
+
</article>
|
| 51 |
+
<article class="stat-card">
|
| 52 |
+
<span class="stat-label">API Mode</span>
|
| 53 |
+
<strong class="stat-value">FastAPI</strong>
|
| 54 |
+
</article>
|
| 55 |
+
<article class="stat-card">
|
| 56 |
+
<span class="stat-label">Episode Shape</span>
|
| 57 |
+
<strong class="stat-value">Single Step</strong>
|
| 58 |
+
</article>
|
| 59 |
+
</div>
|
| 60 |
+
</div>
|
| 61 |
+
</section>
|
| 62 |
+
|
| 63 |
+
<section class="feature-strip">
|
| 64 |
+
<article class="feature-card">
|
| 65 |
+
<span class="feature-index">01</span>
|
| 66 |
+
<h2>Typed contracts</h2>
|
| 67 |
+
<p>Observation, action, reward, state, and reset inputs are all strongly modeled for repeatable evaluation.</p>
|
| 68 |
+
</article>
|
| 69 |
+
<article class="feature-card">
|
| 70 |
+
<span class="feature-index">02</span>
|
| 71 |
+
<h2>Deterministic graders</h2>
|
| 72 |
+
<p>Each task returns scores in the 0.0 to 1.0 range with exact-match and partial-credit rules.</p>
|
| 73 |
+
</article>
|
| 74 |
+
<article class="feature-card">
|
| 75 |
+
<span class="feature-index">03</span>
|
| 76 |
+
<h2>Deployed workflow</h2>
|
| 77 |
+
<p>Docker packaging, Space-ready metadata, runtime validation, and a root-level baseline script are all included.</p>
|
| 78 |
+
</article>
|
| 79 |
+
</section>
|
| 80 |
+
|
| 81 |
+
<section class="tasks-section">
|
| 82 |
+
<div class="section-heading">
|
| 83 |
+
<p class="eyebrow">Task Ladder</p>
|
| 84 |
+
<h2>Three escalating operator decisions</h2>
|
| 85 |
+
</div>
|
| 86 |
+
<div class="task-grid" data-task-grid>
|
| 87 |
+
<article class="task-card skeleton-card"></article>
|
| 88 |
+
<article class="task-card skeleton-card"></article>
|
| 89 |
+
<article class="task-card skeleton-card"></article>
|
| 90 |
+
</div>
|
| 91 |
+
</section>
|
| 92 |
+
|
| 93 |
+
<section class="routes-section floating-panel">
|
| 94 |
+
<div class="section-heading compact">
|
| 95 |
+
<p class="eyebrow">Quick Links</p>
|
| 96 |
+
<h2>Use the environment your way</h2>
|
| 97 |
+
</div>
|
| 98 |
+
<div class="route-grid">
|
| 99 |
+
<a class="route-card" href="/status">
|
| 100 |
+
<strong>/status</strong>
|
| 101 |
+
<span>Live environment health, task inventory, and schema coverage.</span>
|
| 102 |
+
</a>
|
| 103 |
+
<a class="route-card" href="/playground">
|
| 104 |
+
<strong>/playground</strong>
|
| 105 |
+
<span>Manually reset a session, submit an action, and inspect the typed response.</span>
|
| 106 |
+
</a>
|
| 107 |
+
<a class="route-card" href="/docs">
|
| 108 |
+
<strong>/docs</strong>
|
| 109 |
+
<span>FastAPI-generated API reference for every endpoint and schema.</span>
|
| 110 |
+
</a>
|
| 111 |
+
</div>
|
| 112 |
+
</section>
|
| 113 |
+
</main>
|
| 114 |
+
</div>
|
| 115 |
+
<script src="/assets/app.js?v=3" defer></script>
|
| 116 |
+
</body>
|
| 117 |
+
</html>
|
ui/playground.html
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>Incident Triage Playground</title>
|
| 7 |
+
<link rel="stylesheet" href="/assets/styles.css?v=3">
|
| 8 |
+
</head>
|
| 9 |
+
<body data-page="playground">
|
| 10 |
+
<div class="page-shell">
|
| 11 |
+
<header class="topbar">
|
| 12 |
+
<a class="brand" href="/">
|
| 13 |
+
<span class="brand-kicker">OpenEnv Environment</span>
|
| 14 |
+
<span class="brand-title">Incident Triage</span>
|
| 15 |
+
</a>
|
| 16 |
+
<nav class="nav-links">
|
| 17 |
+
<a href="/">Home</a>
|
| 18 |
+
<a href="/status">Status</a>
|
| 19 |
+
<a href="/docs">API Docs</a>
|
| 20 |
+
</nav>
|
| 21 |
+
</header>
|
| 22 |
+
|
| 23 |
+
<main class="stack-layout">
|
| 24 |
+
<section class="section-heading">
|
| 25 |
+
<p class="eyebrow">Manual Evaluation</p>
|
| 26 |
+
<h1>Interactive playground</h1>
|
| 27 |
+
<p class="section-copy">
|
| 28 |
+
This page is a browser version of the OpenEnv flow. Reset starts one evaluation episode,
|
| 29 |
+
Step submits one agent answer, and the result shows the reward returned by the grader.
|
| 30 |
+
</p>
|
| 31 |
+
</section>
|
| 32 |
+
|
| 33 |
+
<section class="guide-grid">
|
| 34 |
+
<article class="guide-card">
|
| 35 |
+
<span>1</span>
|
| 36 |
+
<strong>Start / Reset Environment</strong>
|
| 37 |
+
<p>Starts a new incident episode and returns the observation. No grading happens yet.</p>
|
| 38 |
+
</article>
|
| 39 |
+
<article class="guide-card">
|
| 40 |
+
<span>2</span>
|
| 41 |
+
<strong>Read Observation</strong>
|
| 42 |
+
<p>Check the incident, expected field, allowed values, and context.</p>
|
| 43 |
+
</article>
|
| 44 |
+
<article class="guide-card">
|
| 45 |
+
<span>3</span>
|
| 46 |
+
<strong>Submit Step</strong>
|
| 47 |
+
<p>Send one answer. The backend grades it and prints a terminal log.</p>
|
| 48 |
+
</article>
|
| 49 |
+
</section>
|
| 50 |
+
|
| 51 |
+
<section class="playground-grid">
|
| 52 |
+
<article class="floating-panel control-panel">
|
| 53 |
+
<div class="section-heading compact">
|
| 54 |
+
<p class="eyebrow">Step 1</p>
|
| 55 |
+
<h2>Start a session</h2>
|
| 56 |
+
</div>
|
| 57 |
+
<div class="preset-row" role="group" aria-label="Quick presets">
|
| 58 |
+
<button class="preset-button" type="button" data-preset-task="task1" data-preset-ticket="INC-001">Severity case</button>
|
| 59 |
+
<button class="preset-button" type="button" data-preset-task="task2" data-preset-ticket="INC-006">Root cause case</button>
|
| 60 |
+
<button class="preset-button" type="button" data-preset-task="task3" data-preset-ticket="INC-014">Action case</button>
|
| 61 |
+
</div>
|
| 62 |
+
<form id="reset-form" class="form-grid">
|
| 63 |
+
<label>
|
| 64 |
+
<span>Task type</span>
|
| 65 |
+
<select name="task_type" id="task-type">
|
| 66 |
+
<option value="">Any task</option>
|
| 67 |
+
<option value="task1">task1</option>
|
| 68 |
+
<option value="task2">task2</option>
|
| 69 |
+
<option value="task3">task3</option>
|
| 70 |
+
</select>
|
| 71 |
+
</label>
|
| 72 |
+
<label>
|
| 73 |
+
<span>Ticket ID</span>
|
| 74 |
+
<input type="text" name="ticket_id" id="ticket-id" list="ticket-options" placeholder="INC-014">
|
| 75 |
+
<datalist id="ticket-options"></datalist>
|
| 76 |
+
<small id="ticket-helper">Loading valid tickets from the backend.</small>
|
| 77 |
+
</label>
|
| 78 |
+
<label>
|
| 79 |
+
<span>Seed</span>
|
| 80 |
+
<input type="number" name="seed" placeholder="42">
|
| 81 |
+
</label>
|
| 82 |
+
<button class="button button-primary" type="submit" id="reset-button">Start / Reset Environment</button>
|
| 83 |
+
</form>
|
| 84 |
+
<div class="inline-status">
|
| 85 |
+
<span class="mono-label">Session</span>
|
| 86 |
+
<code id="session-id">Not started</code>
|
| 87 |
+
</div>
|
| 88 |
+
<div class="ui-message" id="playground-message">Pick a preset or enter a task and ticket manually.</div>
|
| 89 |
+
</article>
|
| 90 |
+
|
| 91 |
+
<article class="floating-panel control-panel">
|
| 92 |
+
<div class="section-heading compact">
|
| 93 |
+
<p class="eyebrow">Step 2</p>
|
| 94 |
+
<h2>Submit an action</h2>
|
| 95 |
+
</div>
|
| 96 |
+
<form id="step-form" class="form-grid">
|
| 97 |
+
<label>
|
| 98 |
+
<span>Expected field</span>
|
| 99 |
+
<input id="expected-field" type="text" disabled value="Start a session first">
|
| 100 |
+
</label>
|
| 101 |
+
<label>
|
| 102 |
+
<span>Allowed values</span>
|
| 103 |
+
<select id="action-value" disabled>
|
| 104 |
+
<option>Start a session first</option>
|
| 105 |
+
</select>
|
| 106 |
+
</label>
|
| 107 |
+
<button class="button button-secondary" type="submit" disabled id="step-button">Submit Step</button>
|
| 108 |
+
</form>
|
| 109 |
+
<p class="status-helper">The playground automatically maps your choice to <code>severity</code>, <code>root_cause</code>, or <code>action</code>. If you choose a known ticket, it also sets the matching task type for you.</p>
|
| 110 |
+
</article>
|
| 111 |
+
</section>
|
| 112 |
+
|
| 113 |
+
<section class="playground-summary" id="summary-strip">
|
| 114 |
+
<article class="summary-card">
|
| 115 |
+
<span>Incident</span>
|
| 116 |
+
<strong id="summary-incident">--</strong>
|
| 117 |
+
</article>
|
| 118 |
+
<article class="summary-card">
|
| 119 |
+
<span>Expected field</span>
|
| 120 |
+
<strong id="summary-field">--</strong>
|
| 121 |
+
</article>
|
| 122 |
+
<article class="summary-card">
|
| 123 |
+
<span>Reward</span>
|
| 124 |
+
<strong id="summary-reward">--</strong>
|
| 125 |
+
</article>
|
| 126 |
+
<article class="summary-card">
|
| 127 |
+
<span>Status</span>
|
| 128 |
+
<strong id="summary-status">Waiting</strong>
|
| 129 |
+
</article>
|
| 130 |
+
</section>
|
| 131 |
+
|
| 132 |
+
<section class="dual-grid output-grid">
|
| 133 |
+
<article class="floating-panel">
|
| 134 |
+
<div class="section-heading compact">
|
| 135 |
+
<p class="eyebrow">Observation</p>
|
| 136 |
+
<h2>Latest reset payload</h2>
|
| 137 |
+
</div>
|
| 138 |
+
<pre class="code-panel" id="observation-output">No observation yet.</pre>
|
| 139 |
+
</article>
|
| 140 |
+
|
| 141 |
+
<article class="floating-panel">
|
| 142 |
+
<div class="section-heading compact">
|
| 143 |
+
<p class="eyebrow">Result</p>
|
| 144 |
+
<h2>Latest step payload</h2>
|
| 145 |
+
</div>
|
| 146 |
+
<pre class="code-panel" id="result-output">No step submitted yet.</pre>
|
| 147 |
+
</article>
|
| 148 |
+
</section>
|
| 149 |
+
</main>
|
| 150 |
+
</div>
|
| 151 |
+
<script src="/assets/app.js?v=3" defer></script>
|
| 152 |
+
</body>
|
| 153 |
+
</html>
|
ui/status.html
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="utf-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
+
<title>Incident Triage Status</title>
|
| 7 |
+
<link rel="stylesheet" href="/assets/styles.css?v=3">
|
| 8 |
+
</head>
|
| 9 |
+
<body data-page="status">
|
| 10 |
+
<div class="page-shell">
|
| 11 |
+
<header class="topbar">
|
| 12 |
+
<a class="brand" href="/">
|
| 13 |
+
<span class="brand-kicker">OpenEnv Environment</span>
|
| 14 |
+
<span class="brand-title">Incident Triage</span>
|
| 15 |
+
</a>
|
| 16 |
+
<nav class="nav-links">
|
| 17 |
+
<a href="/">Home</a>
|
| 18 |
+
<a href="/playground">Playground</a>
|
| 19 |
+
<a href="/docs">API Docs</a>
|
| 20 |
+
</nav>
|
| 21 |
+
</header>
|
| 22 |
+
|
| 23 |
+
<main class="stack-layout">
|
| 24 |
+
<section class="section-heading">
|
| 25 |
+
<p class="eyebrow">Live Status</p>
|
| 26 |
+
<h1>Environment readiness dashboard</h1>
|
| 27 |
+
<p class="section-copy">
|
| 28 |
+
This page does not start an episode. It reads the running API and confirms the environment
|
| 29 |
+
is ready: health, dataset size, schemas, tasks, and grader rules.
|
| 30 |
+
</p>
|
| 31 |
+
</section>
|
| 32 |
+
|
| 33 |
+
<section class="guide-grid">
|
| 34 |
+
<article class="guide-card">
|
| 35 |
+
<span>Health</span>
|
| 36 |
+
<strong>Server is reachable</strong>
|
| 37 |
+
<p>Reads <code>/health</code>. The validator expects the value <code>healthy</code>.</p>
|
| 38 |
+
</article>
|
| 39 |
+
<article class="guide-card">
|
| 40 |
+
<span>Schema</span>
|
| 41 |
+
<strong>Contracts are exposed</strong>
|
| 42 |
+
<p>Reads <code>/schema</code> to verify action, observation, reward, and state shapes.</p>
|
| 43 |
+
</article>
|
| 44 |
+
<article class="guide-card">
|
| 45 |
+
<span>Tasks</span>
|
| 46 |
+
<strong>Dataset is loaded</strong>
|
| 47 |
+
<p>Reads <code>/metadata</code> and <code>/grader</code> to show task counts and scoring rules.</p>
|
| 48 |
+
</article>
|
| 49 |
+
</section>
|
| 50 |
+
|
| 51 |
+
<section class="status-overview">
|
| 52 |
+
<article class="floating-panel big-status-card">
|
| 53 |
+
<span class="status-caption">Health</span>
|
| 54 |
+
<strong class="status-display" data-health-text>Checking</strong>
|
| 55 |
+
<p class="status-helper">Expected validator value: <code>healthy</code></p>
|
| 56 |
+
</article>
|
| 57 |
+
<article class="floating-panel">
|
| 58 |
+
<span class="status-caption">Dataset</span>
|
| 59 |
+
<strong class="status-display" data-total-incidents>--</strong>
|
| 60 |
+
<p class="status-helper">Total incidents currently exposed by the environment.</p>
|
| 61 |
+
</article>
|
| 62 |
+
<article class="floating-panel">
|
| 63 |
+
<span class="status-caption">Schemas</span>
|
| 64 |
+
<strong class="status-display" data-schema-count>--</strong>
|
| 65 |
+
<p class="status-helper">Typed schemas available through <code>/schema</code>.</p>
|
| 66 |
+
</article>
|
| 67 |
+
</section>
|
| 68 |
+
|
| 69 |
+
<section class="floating-panel">
|
| 70 |
+
<div class="section-heading compact">
|
| 71 |
+
<p class="eyebrow">Task Inventory</p>
|
| 72 |
+
<h2>Difficulty progression and label space</h2>
|
| 73 |
+
</div>
|
| 74 |
+
<div class="task-grid" data-task-grid>
|
| 75 |
+
<article class="task-card skeleton-card"></article>
|
| 76 |
+
<article class="task-card skeleton-card"></article>
|
| 77 |
+
<article class="task-card skeleton-card"></article>
|
| 78 |
+
</div>
|
| 79 |
+
</section>
|
| 80 |
+
|
| 81 |
+
<section class="dual-grid">
|
| 82 |
+
<article class="floating-panel">
|
| 83 |
+
<div class="section-heading compact">
|
| 84 |
+
<p class="eyebrow">Schema Coverage</p>
|
| 85 |
+
<h2>Runtime contracts</h2>
|
| 86 |
+
</div>
|
| 87 |
+
<div class="badge-grid" data-schema-grid></div>
|
| 88 |
+
</article>
|
| 89 |
+
|
| 90 |
+
<article class="floating-panel">
|
| 91 |
+
<div class="section-heading compact">
|
| 92 |
+
<p class="eyebrow">Grader Summary</p>
|
| 93 |
+
<h2>Deterministic scoring rules</h2>
|
| 94 |
+
</div>
|
| 95 |
+
<div class="copy-block">
|
| 96 |
+
<p data-grader-summary>Loading grader details.</p>
|
| 97 |
+
<ul class="bullet-list" data-grader-list></ul>
|
| 98 |
+
</div>
|
| 99 |
+
</article>
|
| 100 |
+
</section>
|
| 101 |
+
|
| 102 |
+
<section class="floating-panel">
|
| 103 |
+
<div class="section-heading compact">
|
| 104 |
+
<p class="eyebrow">Endpoint Surface</p>
|
| 105 |
+
<h2>Available routes</h2>
|
| 106 |
+
</div>
|
| 107 |
+
<div class="route-grid">
|
| 108 |
+
<a class="route-card" href="/health"><strong>/health</strong><span>Validator-facing health endpoint.</span></a>
|
| 109 |
+
<a class="route-card" href="/metadata"><strong>/metadata</strong><span>Environment name, description, and task counts.</span></a>
|
| 110 |
+
<a class="route-card" href="/schema"><strong>/schema</strong><span>Action, observation, reward, and state schemas.</span></a>
|
| 111 |
+
<a class="route-card" href="/openapi.json"><strong>/openapi.json</strong><span>FastAPI OpenAPI contract for external tooling.</span></a>
|
| 112 |
+
</div>
|
| 113 |
+
</section>
|
| 114 |
+
</main>
|
| 115 |
+
</div>
|
| 116 |
+
<script src="/assets/app.js?v=3" defer></script>
|
| 117 |
+
</body>
|
| 118 |
+
</html>
|
uv.lock
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|