Xcode_Addy committed on
Commit
25a72c2
·
unverified ·
2 Parent(s): c43e3979347ce5

Merge pull request #1 from ADITYAGABA1322/development

Browse files
Files changed (14) hide show
  1. .gitignore +89 -1
  2. Dockerfile +12 -0
  3. Readme.md +375 -0
  4. __init__.py +16 -0
  5. app.py +77 -0
  6. client.py +99 -0
  7. environment.py +62 -0
  8. graders.py +33 -0
  9. incidents.py +461 -0
  10. inference.py +194 -0
  11. models.py +65 -0
  12. openenv.yaml +74 -0
  13. pyproject.toml +45 -0
  14. requirements.txt +5 -0
.gitignore CHANGED
@@ -1 +1,89 @@
1
- .DS_Store
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .DS_Store
2
+ # =========================
3
+ # ENV & SECRETS 🔐
4
+ # =========================
5
+ .env
6
+ .env.*
7
+ *.env
8
+
9
+ # =========================
10
+ # PYTHON 🐍
11
+ # =========================
12
+ __pycache__/
13
+ *.pyc
14
+ *.pyo
15
+ *.pyd
16
+ *.pyc.*
17
+ *.egg-info/
18
+ dist/
19
+ build/
20
+ .eggs/
21
+ *.egg
22
+ venv/
23
+ env/
24
+ .venv/
25
+
26
+ # =========================
27
+ # LOG FILES 📄
28
+ # =========================
29
+ *.log
30
+ logs.jsonl
31
+
32
+ # =========================
33
+ # OS FILES 💻
34
+ # =========================
35
+ .DS_Store
36
+ Thumbs.db
37
+
38
+ # =========================
39
+ # IDE / EDITOR ⚙️
40
+ # =========================
41
+ .vscode/
42
+ .idea/
43
+ *.swp
44
+ *.swo
45
+
46
+ # =========================
47
+ # MODEL / DATA FILES 🤖
48
+ # =========================
49
+ *.onnx
50
+ *.pt
51
+ *.pth
52
+ *.ckpt
53
+ *.h5
54
+
55
+ # Large datasets (customize if needed)
56
+ data/
57
+ datasets/
58
+
59
+ # =========================
60
+ # BUILD / OUTPUT 🚀
61
+ # =========================
62
+ dist/
63
+ build/
64
+ out/
65
+
66
+ # =========================
67
+ # TEMP FILES 🗑️
68
+ # =========================
69
+ *.tmp
70
+ *.temp
71
+ .cache/
72
+
73
+ # =========================
74
+ # TEST / COVERAGE 🧪
75
+ # =========================
76
+ coverage/
77
+ .nyc_output/
78
+
79
+ # =========================
80
+ # DOCKER 🐳 (optional)
81
+ # =========================
82
+ *.pid
83
+ *.seed
84
+
85
+ # =========================
86
+ # MISC
87
+ # =========================
88
+ *.bak
89
+ *.old
Dockerfile ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ WORKDIR /app
4
+
5
+ COPY requirements.txt .
6
+ RUN pip install --no-cache-dir -r requirements.txt
7
+
8
+ COPY . .
9
+
10
+ EXPOSE 7860
11
+
12
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
Readme.md ADDED
@@ -0,0 +1,375 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚨 Production Incident Triage Environment
2
+
3
+ An OpenEnv-compatible backend evaluation system where an AI agent triages production incidents like a real SRE (Site Reliability Engineer). Built for deterministic, RL-style evaluation — no UI, no chatbot, pure backend.
4
+
5
+ ---
6
+
7
+ ## 📌 What This Is
8
+
9
+ This is **not** a chatbot. It is a structured evaluation environment where:
10
+
11
+ 1. Environment returns a production incident (alert + context)
12
+ 2. AI agent reads the incident
13
+ 3. Agent returns a structured JSON action
14
+ 4. Environment sends action to a deterministic grader
15
+ 5. Grader compares against ground truth
16
+ 6. Returns a score between `0.0` and `1.0`
17
+
18
+ ---
19
+
20
+ ## 🗂️ Project Structure
21
+
22
+ ```
23
+ Incident_Triage/
24
+
25
+ ├── models.py # Pydantic schemas — source of truth for all types
26
+ ├── incidents.py # Dataset of 15 production incidents
27
+ ├── inference.py # LLM agent (Mistral via NVIDIA API)
28
+ ├── openenv.yaml # OpenEnv submission config
29
+ ├── pyproject.toml # Project metadata
30
+ ├── requirements.txt # Dependencies
31
+ ├── README.md
32
+
33
+ └── server/
34
+ ├── __init__.py # Empty — do not add imports here
35
+ ├── app.py # FastAPI server
36
+ ├── environment.py # Core RL-style logic (reset / step)
37
+ ├── graders.py # Deterministic scoring functions
38
+ ├── Dockerfile
39
+ └── requirements.txt
40
+ ```
41
+
42
+ ---
43
+
44
+ ## ⚙️ Setup
45
+
46
+ ### 1. Clone and install dependencies
47
+
48
+ ```bash
49
+ git clone <your-repo-url>
50
+ cd Incident_Triage
51
+ pip install -r requirements.txt
52
+ ```
53
+
54
+ ### 2. Set your NVIDIA / Mistral API key
55
+
56
+ ```bash
57
+ # Windows
58
+ set NVIDIA_API_KEY=your_nvidia_api_key_here
59
+
60
+ # Mac / Linux
61
+ export NVIDIA_API_KEY=your_nvidia_api_key_here
62
+ ```
63
+
64
+ ### 3. Start the server
65
+
66
+ ```bash
67
+ uvicorn server.app:app --reload
68
+ ```
69
+
70
+ Server runs at: `http://localhost:8000` (the Docker image built from the `Dockerfile` serves on port `7860` instead)
71
+
72
+ ### 4. Run the agent
73
+
74
+ ```bash
75
+ python inference.py
76
+ ```
77
+
78
+ ---
79
+
80
+ ## 🔗 API Endpoints
81
+
82
+ ### `GET /tasks`
83
+ Returns available task types and their descriptions.
84
+
85
+ **Response:**
86
+ ```json
87
+ {
88
+ "tasks": {
89
+ "task1": "Severity Classification → SeverityLevel enum",
90
+ "task2": "Root Cause Category → RootCauseCategory enum",
91
+ "task3": "Recommended Action → RecommendedAction enum"
92
+ }
93
+ }
94
+ ```
95
+
96
+ ---
97
+
98
+ ### `POST /reset`
99
+ Resets the environment and returns a new incident for the agent to triage. The response also contains a `session_id`, which must be passed to `/step` as a query parameter.
100
+
101
+ **Query Params:**
102
+
103
+ | Param | Type | Required | Description |
104
+ |---|---|---|---|
105
+ | `task_type` | string | No | Filter by `task1`, `task2`, or `task3`. If omitted, picks any incident randomly. |
106
+
107
+ **Example:**
108
+ ```bash
109
+ curl -X POST "http://localhost:8000/reset?task_type=task1"
110
+ ```
111
+
112
+ **Response:**
113
+ ```json
114
+ {
115
+ "incident_id": "INC-001",
116
+ "task_type": "task1",
117
+ "alert_text": "[CRITICAL] Payment service returning HTTP 503. Error rate: 94%.",
118
+ "context": {
119
+ "service": "payment-service",
120
+ "error_rate_pct": 94,
121
+ "affected_users": 120000,
122
+ "region": "us-east-1"
123
+ }
124
+ }
125
+ ```
126
+
127
+ ---
128
+
129
+ ### `POST /step`
130
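+ Submits the agent's action and returns a graded result. Requires the `session_id` query parameter returned by `/reset`; the session is closed after a successful step (one action per episode).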
131
+
132
+ **Request Body:**
133
+ ```json
134
+ {
135
+ "incident_id": "INC-001",
136
+ "task_type": "task1",
137
+ "severity": "SEV1",
138
+ "root_cause": null,
139
+ "action": null
140
+ }
141
+ ```
142
+
143
+ > Only populate the field relevant to the `task_type`. Set others to `null`.
144
+
145
+ **Response:**
146
+ ```json
147
+ {
148
+ "incident_id": "INC-001",
149
+ "task_type": "task1",
150
+ "reward": 1.0,
151
+ "correct": true,
152
+ "ground_truth": "SEV1",
153
+ "agent_answer": "SEV1"
154
+ }
155
+ ```
156
+
157
+ | Field | Type | Description |
158
+ |---|---|---|
159
+ | `reward` | float | `1.0` = correct, `0.0` = wrong; task1 also returns `0.5` when the severity is one level off |
160
+ | `correct` | bool | True if reward == 1.0 |
161
+ | `ground_truth` | string | Expected answer |
162
+ | `agent_answer` | string | What agent returned |
163
+
164
+ ---
165
+
166
+ ### `GET /grader`
167
+ Returns grader configuration for transparency.
168
+
169
+ **Response:**
170
+ ```json
171
+ {
172
+ "grading": "deterministic",
173
+ "scoring": "task1: partial (1.0/0.5/0.0), task2/task3: binary (1.0/0.0)",
174
+ "tasks": {
175
+ "task1": "exact=1.0, adjacent=0.5, far=0.0",
176
+ "task2": "action.root_cause == ground_truth.root_cause",
177
+ "task3": "action.action == ground_truth.action"
178
+ }
179
+ }
180
+ ```
181
+
182
+ ---
183
+
184
+ ## 📋 Enum Reference
185
+
186
+ All agent outputs must use **exactly** these enum values (case-sensitive):
187
+
188
+ ### Task 1 — Severity Classification (`severity` field)
189
+
190
+ | Value | Meaning |
191
+ |---|---|
192
+ | `SEV1` | Total outage / confirmed revenue impact |
193
+ | `SEV2` | Partial outage / degraded performance |
194
+ | `SEV3` | Minor / cosmetic / internal only |
195
+
196
+ ### Task 2 — Root Cause Category (`root_cause` field)
197
+
198
+ | Value | Meaning |
199
+ |---|---|
200
+ | `DATABASE` | DB lag, connection pool, replica issues |
201
+ | `NETWORK` | Packet loss, BGP flap, cross-region failures |
202
+ | `APPLICATION` | Code bug, exception, bad deploy |
203
+ | `INFRASTRUCTURE` | Kubernetes, EC2, spot interruption |
204
+ | `THIRD_PARTY` | Stripe, SendGrid, external vendor |
205
+ | `UNKNOWN` | Cannot determine root cause |
206
+
207
+ ### Task 3 — Recommended Action (`action` field)
208
+
209
+ | Value | Meaning |
210
+ |---|---|
211
+ | `ROLLBACK` | Revert to last stable deploy |
212
+ | `SCALE_UP` | Increase replicas / resources |
213
+ | `RESTART_SERVICE` | Restart stuck / deadlocked process |
214
+ | `FAILOVER` | Switch to replica / standby |
215
+ | `NOTIFY_VENDOR` | Escalate to third-party vendor |
216
+ | `INVESTIGATE` | Need more info before acting |
217
+ | `NO_ACTION` | Monitor only, no action needed |
218
+
219
+ ---
220
+
221
+ ## 🤖 Agent JSON Format
222
+
223
+ The agent must return **strict JSON only** — no markdown, no explanation, no extra text.
224
+
225
+ ```json
226
+ {
227
+ "incident_id": "INC-006",
228
+ "task_type": "task2",
229
+ "severity": null,
230
+ "root_cause": "DATABASE",
231
+ "action": null
232
+ }
233
+ ```
234
+
235
+ Rules:
236
+ - `incident_id` must match the one returned by `/reset`
237
+ - `task_type` must match the one returned by `/reset`
238
+ - Only one field (`severity`, `root_cause`, or `action`) should be non-null
239
+ - The non-null field must use a valid enum value
240
+
241
+ ---
242
+
243
+ ## 🧠 How Grading Works
244
+
245
+ Grading is **fully deterministic** — no LLM is used inside the grader.
246
+
247
+ ```
248
+ agent_answer == ground_truth → reward: 1.0 (correct)
249
+ agent_answer != ground_truth → reward: 0.0 (wrong; task1 gives 0.5 if exactly one severity level off)
250
+ missing field (null) → reward: 0.0 (wrong)
251
+ ```
252
+
253
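+ Task 2 and task 3 are scored as binary because they are plain classification tasks — partial credit would mask bad agent behavior. Task 1 (severity) is ordinal, so an answer exactly one level away from the ground truth (e.g. `SEV2` when `SEV1` is expected) earns `0.5`, while an answer two levels away earns `0.0`. `correct` is `true` only when the reward is `1.0`.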
254
+
255
+ ---
256
+
257
+ ## 🧪 Quick Test (curl)
258
+
259
+ ```bash
260
+ # 1. Check available tasks
261
+ curl http://localhost:8000/tasks
262
+
263
+ # 2. Get a task1 incident
264
+ curl -X POST "http://localhost:8000/reset?task_type=task1"
265
+
266
+ # 3. Submit agent action (replace session_id and incident_id with the values returned in step 2)
267
+ curl -X POST "http://localhost:8000/step?session_id=<session_id-from-step-2>" \
268
+ -H "Content-Type: application/json" \
269
+ -d '{"incident_id": "INC-001", "task_type": "task1", "severity": "SEV1", "root_cause": null, "action": null}'
270
+
271
+ # 4. Check grader config
272
+ curl http://localhost:8000/grader
273
+ ```
274
+
275
+ ---
276
+
277
+ ## 📊 Dataset Overview
278
+
279
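+ The core dataset has 15 production incidents across 3 task types (5 per task); `incidents.py` also contains additional ambiguous / edge-case incidents from `INC-016` onward: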
280
+
281
+ | Task | Incidents | What agent classifies |
282
+ |---|---|---|
283
+ | `task1` | INC-001 to INC-005 | Severity level |
284
+ | `task2` | INC-006 to INC-010 | Root cause category |
285
+ | `task3` | INC-011 to INC-015 | Recommended action |
286
+
287
+ Incident types include: payment outages, DB replica lag, Kubernetes node failures, BGP flapping, bad deploys, vendor degradations, memory deadlocks, and more.
288
+
289
+ ---
290
+
291
+ ## 🔧 Inference Script (Mistral via NVIDIA API)
292
+
293
+ `inference.py` uses the Mistral model via NVIDIA's OpenAI-compatible API endpoint.
294
+
295
+ Update the client in `inference.py`:
296
+
297
+ ```python
298
+ from openai import OpenAI
299
+
300
+ client = OpenAI(
301
+ base_url="https://integrate.api.nvidia.com/v1",
302
+ api_key=os.environ["NVIDIA_API_KEY"]
303
+ )
304
+
305
+ response = client.chat.completions.create(
306
+ model="mistralai/mistral-7b-instruct-v0.3",
307
+ messages=[
308
+ {"role": "system", "content": SYSTEM_PROMPT},
309
+ {"role": "user", "content": build_user_prompt(observation)}
310
+ ],
311
+ max_tokens=256,
312
+ temperature=0.0
313
+ )
314
+
315
+ raw = response.choices[0].message.content.strip()
316
+ ```
317
+
318
+ > `temperature=0.0` is critical — keeps outputs deterministic across runs.
319
+
320
+ ---
321
+
322
+ ## 📦 Requirements
323
+
324
+ ```
325
+ fastapi
326
+ uvicorn
327
+ pydantic
328
+ openai
329
+ requests
330
+ ```
331
+
332
+ Install:
333
+ ```bash
334
+ pip install fastapi uvicorn pydantic openai requests
335
+ ```
336
+
337
+ ---
338
+
339
+ ## 🚀 Run Full Evaluation
340
+
341
+ ```bash
342
+ # Terminal 1
343
+ uvicorn server.app:app --reload
344
+
345
+ # Terminal 2
346
+ python inference.py
347
+ ```
348
+
349
+ Expected output:
350
+ ```
351
+ ==================================================
352
+ Incident : INC-003
353
+ Task : task1
354
+ Alert : [INFO] Admin dashboard CSS assets returning 404...
355
+
356
+ LLM Raw : {"incident_id": "INC-003", "task_type": "task1", "severity": "SEV3", "root_cause": null, "action": null}
357
+ Answer : SEV3
358
+ Expected : SEV3
359
+ Correct : True | Reward: 1.0
360
+
361
+ ==================================================
362
+ Total Episodes : 15
363
+ Total Correct : 13
364
+ Accuracy : 86.7%
365
+ ```
366
+
367
+ ---
368
+
369
+ ## 📝 Important Rules
370
+
371
+ - Never modify enum values in `models.py` — graders depend on exact string matching
372
+ - Never add LLM calls inside `graders.py` — grading must be deterministic
373
+ - Always call `/reset` before `/step` — each `/reset` creates a session that holds the current incident, and `/step` must be given that session's `session_id`
374
+ - `server/__init__.py` must stay empty — do not add imports there
375
+ - Always run uvicorn from the project root: `uvicorn server.app:app --reload`
__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Incident Triage Environment."""
8
+
9
+ from .client import IncidentTriageEnv
10
+ from .models import IncidentTriageAction, IncidentTriageObservation
11
+
12
+ __all__ = [
13
+ "IncidentTriageAction",
14
+ "IncidentTriageObservation",
15
+ "IncidentTriageEnv",
16
+ ]
app.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #----- Edited file--------------
2
+ # app.py
3
+
4
+ import uuid
5
+ from fastapi import FastAPI, HTTPException
6
+ from models import IncidentAction, StepResult
7
+ from environment import IncidentEnv
8
+ from graders import GRADERS
9
+
10
+ app = FastAPI(title="Incident Triage Environment")
11
+
12
+ # Session store: session_id -> IncidentEnv instance
13
+ sessions: dict[str, IncidentEnv] = {}
14
+
15
+
16
@app.get("/tasks")
def get_tasks():
    """Describe the three task types and the labels each one accepts.

    Returns:
        A dict with a single ``"tasks"`` key mapping each task id
        (``task1``..``task3``) to a human-readable description string.
    """
    task_catalog = {
        "task1": "Severity Classification → SEV1, SEV2, SEV3",
        "task2": "Root Cause Category → DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN",
        "task3": "Recommended Action → ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION",
    }
    return {"tasks": task_catalog}
25
+
26
+
27
@app.post("/reset")
def reset(task_type: str = None):
    """Start a new episode and register it under a fresh session id.

    Args:
        task_type: Optional task filter forwarded to ``IncidentEnv.reset``.

    Returns:
        The observation's fields (via ``model_dump()``) together with the
        ``session_id`` under which the new environment was stored.

    Raises:
        HTTPException: 400 when the environment rejects ``task_type`` with a
            ``ValueError``.
    """
    new_env = IncidentEnv()
    try:
        first_observation = new_env.reset(task_type=task_type)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc))

    # Only environments that were reset successfully are stored as sessions.
    new_session_id = str(uuid.uuid4())
    sessions[new_session_id] = new_env

    response = {"session_id": new_session_id}
    response.update(first_observation.model_dump())
    return response
37
+
38
+
39
@app.post("/step", response_model=StepResult)
def step(action: IncidentAction, session_id: str):
    """Grade one action for an open session, then close that session.

    Args:
        action: The agent's answer for the session's current incident.
        session_id: Id handed out by ``/reset``.

    Returns:
        The ``StepResult`` produced by the session's environment.

    Raises:
        HTTPException: 404 when ``session_id`` is not a stored session;
            400 when the environment raises ``RuntimeError`` or ``ValueError``.
            In the 400 case the session is left in place.
    """
    active_env = sessions.get(session_id)
    if not active_env:
        raise HTTPException(status_code=404, detail="Session not found. Call /reset first.")

    try:
        graded = active_env.step(action)
    except (RuntimeError, ValueError) as exc:
        raise HTTPException(status_code=400, detail=str(exc))
    else:
        # One action per episode: drop the session once it has been graded.
        sessions.pop(session_id, None)
        return graded
51
+
52
+
53
@app.get("/state")
def state(session_id: str):
    """Report which incident an open session is currently waiting on.

    Args:
        session_id: Id handed out by ``/reset``.

    Returns:
        A dict with the session id, the current ticket's ``incident_id`` and
        ``task_type``, and the fixed status ``"awaiting_action"``.

    Raises:
        HTTPException: 404 when the session is unknown or has no current ticket.
    """
    session_env = sessions.get(session_id)
    ticket = session_env.current_ticket if session_env else None
    if ticket is None:
        raise HTTPException(status_code=404, detail="No active session.")

    return dict(
        session_id=session_id,
        incident_id=ticket["incident_id"],
        task_type=ticket["task_type"],
        status="awaiting_action",
    )
65
+
66
+
67
@app.get("/grader")
def get_grader_info():
    """Return a static, human-readable description of the scoring rules.

    Returns:
        A dict with the grading mode, a one-line scoring summary, and a
        per-task rule description.
    """
    per_task_rules = {
        "task1": "exact=1.0, adjacent=0.5, far=0.0",
        "task2": "action.root_cause == ground_truth.root_cause",
        "task3": "action.action == ground_truth.action",
    }
    summary = {
        "grading": "deterministic",
        "scoring": "task1: partial (1.0/0.5/0.0), task2/task3: binary (1.0/0.0)",
    }
    summary["tasks"] = per_task_rules
    return summary
client.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Incident Triage Environment Client."""
8
+
9
+ from typing import Dict
10
+
11
+ from openenv.core import EnvClient
12
+ from openenv.core.client_types import StepResult
13
+ from openenv.core.env_server.types import State
14
+
15
+ from .models import IncidentTriageAction, IncidentTriageObservation
16
+
17
+
18
class IncidentTriageEnv(
    EnvClient[IncidentTriageAction, IncidentTriageObservation, State]
):
    """
    Typed client for the Incident Triage Environment.

    Specialises openenv's ``EnvClient`` with this package's action and
    observation models by supplying the three serialisation hooks below.
    Connection and session handling are inherited from ``EnvClient`` and are
    not defined in this class.

    NOTE(review): the hooks read and write ``message``, ``echoed_message`` and
    ``message_length``. These look like fields from an echo-style template
    rather than incident-triage fields — confirm they exist on
    ``IncidentTriageAction`` / ``IncidentTriageObservation`` in ``models.py``.

    Example (carried over from the original documentation):
        >>> with IncidentTriageEnv(base_url="http://localhost:8000") as client:
        ...     result = client.reset()
        ...     print(result.observation.echoed_message)
        ...
        ...     result = client.step(IncidentTriageAction(message="Hello!"))
        ...     print(result.observation.echoed_message)

    Example with Docker (carried over from the original documentation):
        >>> client = IncidentTriageEnv.from_docker_image("Incident_Triage-env:latest")
        >>> try:
        ...     result = client.reset()
        ...     result = client.step(IncidentTriageAction(message="Test"))
        ... finally:
        ...     client.close()
    """

    def _step_payload(self, action: IncidentTriageAction) -> Dict:
        """
        Serialise an action into the dict sent with a step request.

        Args:
            action: The action to send; only its ``message`` attribute is read.

        Returns:
            A one-key dict ``{"message": ...}``.
        """
        return dict(message=action.message)

    def _parse_result(self, payload: Dict) -> StepResult[IncidentTriageObservation]:
        """
        Build a ``StepResult`` from a decoded server response.

        Missing keys fall back to defaults: empty observation dict, ``""`` for
        the echoed message, ``0`` for its length, ``False`` for ``done``,
        ``None`` for ``reward`` and ``{}`` for metadata.

        Args:
            payload: Decoded JSON response from the server.

        Returns:
            StepResult wrapping an IncidentTriageObservation.
        """
        raw_observation = payload.get("observation", {})
        reward = payload.get("reward")
        is_done = payload.get("done", False)

        parsed_observation = IncidentTriageObservation(
            echoed_message=raw_observation.get("echoed_message", ""),
            message_length=raw_observation.get("message_length", 0),
            done=is_done,
            reward=reward,
            metadata=raw_observation.get("metadata", {}),
        )
        return StepResult(
            observation=parsed_observation,
            reward=reward,
            done=is_done,
        )

    def _parse_state(self, payload: Dict) -> State:
        """
        Build a ``State`` from a decoded state response.

        Args:
            payload: Decoded JSON response from a state request.

        Returns:
            State carrying ``episode_id`` (``None`` if absent) and
            ``step_count`` (``0`` if absent).
        """
        state_fields = {
            "episode_id": payload.get("episode_id"),
            "step_count": payload.get("step_count", 0),
        }
        return State(**state_fields)
environment.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #----- Edited file--------------
2
+ # environment.py
3
+
4
+ import random
5
+ from models import IncidentAction, IncidentObservation, StepResult
6
+ from incidents import TICKETS
7
+ from graders import GRADERS
8
+
9
+
10
class IncidentEnv:
    """Single-incident episode: ``reset()`` picks a ticket, ``step()`` grades one answer.

    Attributes:
        current_ticket: The ticket dict chosen by the last ``reset()``, or
            ``None`` before the first reset. ``step()`` does not clear it.
    """

    def __init__(self):
        self.current_ticket = None

    def reset(self, task_type: str = None) -> IncidentObservation:
        """Pick a random ticket and return it as an observation.

        Args:
            task_type: When truthy, only tickets whose ``"task_type"`` equals
                this value are eligible; otherwise every ticket is eligible.

        Returns:
            IncidentObservation built from the chosen ticket's id, task type,
            alert text and context (the ground truth is not included).

        Raises:
            ValueError: If a ``task_type`` was given and no ticket matches it.
        """
        if task_type:
            candidates = [
                ticket for ticket in TICKETS if ticket["task_type"] == task_type
            ]
            if not candidates:
                raise ValueError(f"No tickets found for task_type: {task_type}")
        else:
            candidates = TICKETS

        chosen = random.choice(candidates)
        self.current_ticket = chosen

        return IncidentObservation(
            incident_id=chosen["incident_id"],
            task_type=chosen["task_type"],
            alert_text=chosen["alert_text"],
            context=chosen["context"],
        )

    def step(self, action: IncidentAction) -> StepResult:
        """Grade ``action`` against the current ticket.

        Args:
            action: The agent's answer; its ``incident_id`` must match the
                current ticket.

        Returns:
            StepResult with the reward from the task's grader, a ``correct``
            flag (true only for a reward of exactly 1.0), the ground-truth
            label and the agent's answer (``"NONE"`` when the field relevant
            to the task was not supplied).

        Raises:
            RuntimeError: If ``reset()`` has not been called yet.
            ValueError: If ``action.incident_id`` differs from the current ticket's.
        """
        ticket = self.current_ticket
        if ticket is None:
            raise RuntimeError("Call reset() before step()")

        if action.incident_id != ticket["incident_id"]:
            raise ValueError(
                f"Action incident_id '{action.incident_id}' does not match "
                f"current ticket '{ticket['incident_id']}'"
            )

        task_type = ticket["task_type"]
        ground_truth = ticket["ground_truth"]
        reward = GRADERS[task_type](action, ground_truth)

        # Only the action field that belongs to this task is reported back.
        field_for_task = {
            "task1": "severity",
            "task2": "root_cause",
            "task3": "action",
        }
        answer_field = field_for_task.get(task_type)
        submitted = getattr(action, answer_field) if answer_field else None
        agent_answer = submitted.value if submitted else "NONE"

        # The ground-truth dict's first value is taken as the expected label.
        expected_label = list(ground_truth.values())[0]

        return StepResult(
            incident_id=ticket["incident_id"],
            task_type=task_type,
            reward=reward,
            correct=reward == 1.0,
            ground_truth=expected_label,
            agent_answer=agent_answer,
        )
graders.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #----- Edited file--------------
2
+ # graders.py
3
+
4
+ from models import IncidentAction
5
+
6
# Ordinal position of each severity label, used to measure how far apart two
# labels are (SEV1 and SEV3 are two steps apart).
_SEV_ORDER = {"SEV1": 0, "SEV2": 1, "SEV3": 2}

# Reward for each ordinal distance between predicted and expected severity.
_SEV_REWARD_BY_DISTANCE = {0: 1.0, 1: 0.5, 2: 0.0}


def grade_task1(action: IncidentAction, ground_truth: dict) -> float:
    """Score a severity prediction with partial credit for near misses.

    Args:
        action: The agent's action; only ``action.severity`` (an object with a
            ``.value`` label, or ``None``) is read.
        ground_truth: Dict holding the expected label under ``"severity"``.

    Returns:
        1.0 for an exact match, 0.5 when the prediction is one severity level
        away, 0.0 otherwise — including when the severity is missing or when
        either label is not a known severity.

    Raises:
        KeyError: If ``ground_truth`` has no ``"severity"`` key.
    """
    if action.severity is None:
        return 0.0

    predicted = _SEV_ORDER.get(action.severity.value)
    expected = _SEV_ORDER.get(ground_truth["severity"])

    # Unknown labels must not take part in the distance arithmetic: a numeric
    # sentinel would sit "next to" a real level and earn partial credit, or
    # produce a distance with no reward entry.
    if predicted is None or expected is None:
        return 0.0

    return _SEV_REWARD_BY_DISTANCE.get(abs(predicted - expected), 0.0)
15
+
16
+
17
def grade_task2(action: IncidentAction, ground_truth: dict) -> float:
    """Score a root-cause prediction as an exact match.

    Args:
        action: The agent's action; only ``action.root_cause`` is read.
        ground_truth: Dict holding the expected label under ``"root_cause"``.

    Returns:
        1.0 when ``action.root_cause.value`` equals the expected label,
        0.0 when it differs or when no root cause was supplied.
    """
    predicted_cause = action.root_cause
    if predicted_cause is None:
        return 0.0
    if predicted_cause.value == ground_truth["root_cause"]:
        return 1.0
    return 0.0
21
+
22
+
23
def grade_task3(action: IncidentAction, ground_truth: dict) -> float:
    """Score a recommended-action prediction as an exact match.

    Args:
        action: The agent's action; only ``action.action`` is read.
        ground_truth: Dict holding the expected label under ``"action"``.

    Returns:
        1.0 when ``action.action.value`` equals the expected label,
        0.0 when it differs or when no action was supplied.
    """
    recommended = action.action
    if recommended is None:
        return 0.0
    if recommended.value == ground_truth["action"]:
        return 1.0
    return 0.0
27
+
28
+
29
# Dispatch table: task id -> grading function taking (action, ground_truth)
# and returning a float reward.
GRADERS = dict(
    task1=grade_task1,
    task2=grade_task2,
    task3=grade_task3,
)
incidents.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #----- Edited file--------------
2
+ # incidents.py
3
+
4
+ TICKETS = [
5
+
6
+ # ── TASK 1: Severity Classification ───────────────────────────────────────
7
+
8
+ {
9
+ "incident_id": "INC-001",
10
+ "task_type": "task1",
11
+ "alert_text": "[CRITICAL] Payment service returning HTTP 503. Error rate: 94%. Affected users: ~120,000. Revenue impact confirmed.",
12
+ "context": {
13
+ "service": "payment-service",
14
+ "error_rate_pct": 94,
15
+ "affected_users": 120000,
16
+ "region": "us-east-1",
17
+ "last_deploy": "2h ago",
18
+ "on_call_notified": True
19
+ },
20
+ "ground_truth": {"severity": "SEV1"}
21
+ },
22
+ {
23
+ "incident_id": "INC-002",
24
+ "task_type": "task1",
25
+ "alert_text": "[WARNING] Checkout latency elevated. p99 response time: 4800ms (threshold: 2000ms). 18% of requests timing out.",
26
+ "context": {
27
+ "service": "checkout-service",
28
+ "p99_latency_ms": 4800,
29
+ "timeout_rate_pct": 18,
30
+ "db_connections": "82/100",
31
+ "region": "eu-west-1"
32
+ },
33
+ "ground_truth": {"severity": "SEV2"}
34
+ },
35
+ {
36
+ "incident_id": "INC-003",
37
+ "task_type": "task1",
38
+ "alert_text": "[INFO] Admin dashboard CSS assets returning 404. Static file path misconfigured after deploy.",
39
+ "context": {
40
+ "service": "admin-ui",
41
+ "affected_users": "internal only",
42
+ "error_type": "404 on /static/main.css",
43
+ "last_deploy": "30m ago",
44
+ "user_impact": "cosmetic"
45
+ },
46
+ "ground_truth": {"severity": "SEV3"}
47
+ },
48
+ {
49
+ "incident_id": "INC-004",
50
+ "task_type": "task1",
51
+ "alert_text": "[CRITICAL] Auth service down. All login attempts failing with 500. SSO token validation endpoint unreachable.",
52
+ "context": {
53
+ "service": "auth-service",
54
+ "http_500_rate": "100%",
55
+ "affected_flows": ["login", "token_refresh", "SSO"],
56
+ "pod_status": "CrashLoopBackOff",
57
+ "region": "global"
58
+ },
59
+ "ground_truth": {"severity": "SEV1"}
60
+ },
61
+ {
62
+ "incident_id": "INC-005",
63
+ "task_type": "task1",
64
+ "alert_text": "[WARNING] Notification service email queue backlog growing. 14,000 emails pending. Delivery delay: ~22 minutes.",
65
+ "context": {
66
+ "service": "notification-service",
67
+ "queue_backlog": 14000,
68
+ "avg_delay_min": 22,
69
+ "consumer_lag": "high",
70
+ "revenue_impact": False
71
+ },
72
+ "ground_truth": {"severity": "SEV2"}
73
+ },
74
+
75
+ # ── TASK 2: Root Cause Classification ─────────────────────────────────────
76
+
77
+ {
78
+ "incident_id": "INC-006",
79
+ "task_type": "task2",
80
+ "alert_text": "[CRITICAL] PostgreSQL replica lag: 94 seconds. Write queries spilling to disk. Connection pool exhausted on primary.",
81
+ "context": {
82
+ "db": "postgres-primary",
83
+ "replica_lag_sec": 94,
84
+ "connection_pool": "500/500",
85
+ "disk_spill": True,
86
+ "slow_query_count": 312
87
+ },
88
+ "ground_truth": {"root_cause": "DATABASE"}
89
+ },
90
+ {
91
+ "incident_id": "INC-007",
92
+ "task_type": "task2",
93
+ "alert_text": "[CRITICAL] Packet loss 38% between us-east-1 and eu-west-1. Cross-region API calls failing. BGP route flapping detected.",
94
+ "context": {
95
+ "packet_loss_pct": 38,
96
+ "affected_regions": ["us-east-1", "eu-west-1"],
97
+ "bgp_flap": True,
98
+ "provider": "AWS",
99
+ "traceroute": "drops at transit hop 7"
100
+ },
101
+ "ground_truth": {"root_cause": "NETWORK"}
102
+ },
103
+ {
104
+ "incident_id": "INC-008",
105
+ "task_type": "task2",
106
+ "alert_text": "[ERROR] NullPointerException in order-processing-service. Stack trace points to discount_calculator.py line 84. Deploy happened 40min ago.",
107
+ "context": {
108
+ "service": "order-processing",
109
+ "exception": "NullPointerException",
110
+ "file": "discount_calculator.py",
111
+ "line": 84,
112
+ "last_deploy": "40min ago",
113
+ "git_commit": "a3f9c21"
114
+ },
115
+ "ground_truth": {"root_cause": "APPLICATION"}
116
+ },
117
+ {
118
+ "incident_id": "INC-009",
119
+ "task_type": "task2",
120
+ "alert_text": "[WARNING] Stripe webhook delivery failures spiking. 503s from Stripe API. Stripe status page shows degraded payment processing.",
121
+ "context": {
122
+ "vendor": "Stripe",
123
+ "webhook_failures": 840,
124
+ "stripe_status": "degraded",
125
+ "our_service_health": "healthy",
126
+ "stripe_status_url": "https://status.stripe.com"
127
+ },
128
+ "ground_truth": {"root_cause": "THIRD_PARTY"}
129
+ },
130
+ {
131
+ "incident_id": "INC-010",
132
+ "task_type": "task2",
133
+ "alert_text": "[CRITICAL] Node group in Kubernetes cluster terminated. 6/10 worker nodes NotReady. Pods evicted across analytics namespace.",
134
+ "context": {
135
+ "cluster": "prod-k8s-us-east",
136
+ "nodes_not_ready": 6,
137
+ "total_nodes": 10,
138
+ "evicted_pods": 47,
139
+ "namespace": "analytics",
140
+ "cause": "EC2 spot interruption"
141
+ },
142
+ "ground_truth": {"root_cause": "INFRASTRUCTURE"}
143
+ },
144
+
145
+ # ── TASK 3: Recommended Action ────────────────────────────────────────────
146
+
147
+ {
148
+ "incident_id": "INC-011",
149
+ "task_type": "task3",
150
+ "alert_text": "[CRITICAL] API error rate jumped from 0.2% to 67% immediately after deploy v2.4.1. Rollback candidate identified.",
151
+ "context": {
152
+ "service": "api-gateway",
153
+ "error_rate_before": "0.2%",
154
+ "error_rate_after": "67%",
155
+ "deploy_version": "v2.4.1",
156
+ "previous_stable": "v2.4.0",
157
+ "rollback_tested": True
158
+ },
159
+ "ground_truth": {"action": "ROLLBACK"}
160
+ },
161
+ {
162
+ "incident_id": "INC-012",
163
+ "task_type": "task3",
164
+ "alert_text": "[WARNING] Search service CPU at 98%. Request queue growing. Pod autoscaler at max replicas. Flash sale traffic spike ongoing.",
165
+ "context": {
166
+ "service": "search-service",
167
+ "cpu_pct": 98,
168
+ "current_replicas": 20,
169
+ "max_replicas_configured": 20,
170
+ "queue_depth": 9400,
171
+ "event": "flash sale"
172
+ },
173
+ "ground_truth": {"action": "SCALE_UP"}
174
+ },
175
+ {
176
+ "incident_id": "INC-013",
177
+ "task_type": "task3",
178
+ "alert_text": "[ERROR] Worker service stuck in deadlock. Memory usage flat at 99%. Process not responding to health checks. No deploy in 6 days.",
179
+ "context": {
180
+ "service": "background-worker",
181
+ "memory_pct": 99,
182
+ "health_check": "failing",
183
+ "last_deploy_days_ago": 6,
184
+ "deadlock_detected": True
185
+ },
186
+ "ground_truth": {"action": "RESTART_SERVICE"}
187
+ },
188
+ {
189
+ "incident_id": "INC-014",
190
+ "task_type": "task3",
191
+ "alert_text": "[CRITICAL] Primary RDS instance unresponsive. Failover to read replica not yet triggered. Data writes failing across all services.",
192
+ "context": {
193
+ "db": "rds-postgres-primary",
194
+ "status": "unresponsive",
195
+ "read_replica": "healthy",
196
+ "auto_failover": "disabled",
197
+ "write_failure_rate": "100%"
198
+ },
199
+ "ground_truth": {"action": "FAILOVER"}
200
+ },
201
+ {
202
+ "incident_id": "INC-015",
203
+ "task_type": "task3",
204
+ "alert_text": "[WARNING] SendGrid bounce rate at 34% for transactional emails. Delivery failures concentrated on @yahoo.com domains. No infra changes.",
205
+ "context": {
206
+ "vendor": "SendGrid",
207
+ "bounce_rate_pct": 34,
208
+ "affected_domains": ["yahoo.com"],
209
+ "our_infra_changes": False,
210
+ "sendgrid_status": "investigating"
211
+ },
212
+ "ground_truth": {"action": "NOTIFY_VENDOR"}
213
+ },
214
+
215
+ {
216
+ "incident_id": "INC-016",
217
+ "task_type": "task1",
218
+ "alert_text": "[INFO] Cart service intermittently failing for premium users only. Error rate: 12%.",
219
+ "context": {
220
+ "service": "cart-service",
221
+ "error_rate_pct": 12,
222
+ "affected_segment": "premium users",
223
+ "revenue_dependency": "high",
224
+ "region": "global"
225
+ },
226
+ "ground_truth": {"severity": "SEV1"}
227
+ },
228
+
229
+ # ── TASK 1: Severity (Ambiguous + Edge) ─────────────────────────────
230
+
231
+ {
232
+ "incident_id": "INC-017",
233
+ "task_type": "task1",
234
+ "alert_text": "[WARNING] API latency increased to 3.2s. Error rate low (2%) but affecting checkout flow.",
235
+ "context": {
236
+ "service": "api-service",
237
+ "latency_ms": 3200,
238
+ "error_rate_pct": 2,
239
+ "business_impact": "checkout delay"
240
+ },
241
+ "ground_truth": {"severity": "SEV2"}
242
+ },
243
+ {
244
+ "incident_id": "INC-018",
245
+ "task_type": "task1",
246
+ "alert_text": "[CRITICAL] Cart service failing for 40% users. Premium users impacted more. Revenue drop observed.",
247
+ "context": {
248
+ "error_rate_pct": 40,
249
+ "affected_segment": "premium",
250
+ "revenue_impact": True
251
+ },
252
+ "ground_truth": {"severity": "SEV1"}
253
+ },
254
+ {
255
+ "incident_id": "INC-019",
256
+ "task_type": "task1",
257
+ "alert_text": "[INFO] Logging service delay in ingestion pipeline. No user-facing impact.",
258
+ "context": {
259
+ "service": "logging",
260
+ "delay_sec": 120,
261
+ "user_impact": False
262
+ },
263
+ "ground_truth": {"severity": "SEV3"}
264
+ },
265
+
266
+ # ── TASK 2: Root Cause (Confusing Signals) ───────────────────────────
267
+
268
+ {
269
+ "incident_id": "INC-020",
270
+ "task_type": "task2",
271
+ "alert_text": "[CRITICAL] API failures with DB latency high and packet loss observed.",
272
+ "context": {
273
+ "db_latency_ms": 2800,
274
+ "packet_loss_pct": 15,
275
+ "recent_deploy": False
276
+ },
277
+ "ground_truth": {"root_cause": "NETWORK"}
278
+ },
279
+ {
280
+ "incident_id": "INC-021",
281
+ "task_type": "task2",
282
+ "alert_text": "[ERROR] Service throwing timeout exceptions. No infra alerts. Code deployed 10 mins ago.",
283
+ "context": {
284
+ "exception": "TimeoutException",
285
+ "deploy_time": "10m ago",
286
+ "infra_health": "normal"
287
+ },
288
+ "ground_truth": {"root_cause": "APPLICATION"}
289
+ },
290
+ {
291
+ "incident_id": "INC-022",
292
+ "task_type": "task2",
293
+ "alert_text": "[WARNING] DB CPU high and slow queries increasing gradually.",
294
+ "context": {
295
+ "db_cpu_pct": 92,
296
+ "slow_queries": 210,
297
+ "replica_lag": 5
298
+ },
299
+ "ground_truth": {"root_cause": "DATABASE"}
300
+ },
301
+ {
302
+ "incident_id": "INC-023",
303
+ "task_type": "task2",
304
+ "alert_text": "[CRITICAL] Multiple pods evicted. Node memory pressure warnings.",
305
+ "context": {
306
+ "pods_evicted": 30,
307
+ "node_memory_pressure": True,
308
+ "cluster_health": "degraded"
309
+ },
310
+ "ground_truth": {"root_cause": "INFRASTRUCTURE"}
311
+ },
312
+
313
+ # ── TASK 3: Action (Ambiguous Decisions) ─────────────────────────────
314
+
315
+ {
316
+ "incident_id": "INC-024",
317
+ "task_type": "task3",
318
+ "alert_text": "[WARNING] CPU high but traffic spike detected. Autoscaling already active.",
319
+ "context": {
320
+ "cpu_pct": 90,
321
+ "traffic_spike": True,
322
+ "autoscaling": "active"
323
+ },
324
+ "ground_truth": {"action": "SCALE_UP"}
325
+ },
326
+ {
327
+ "incident_id": "INC-025",
328
+ "task_type": "task3",
329
+ "alert_text": "[ERROR] New deploy caused minor errors (5%). System stable otherwise.",
330
+ "context": {
331
+ "error_rate": 5,
332
+ "deploy": "recent",
333
+ "system_stability": "mostly stable"
334
+ },
335
+ "ground_truth": {"action": "INVESTIGATE"}
336
+ },
337
+ {
338
+ "incident_id": "INC-026",
339
+ "task_type": "task3",
340
+ "alert_text": "[CRITICAL] Service stuck. No response. Health checks failing continuously.",
341
+ "context": {
342
+ "health_check": "failing",
343
+ "response": "none",
344
+ "deploy": "old"
345
+ },
346
+ "ground_truth": {"action": "RESTART_SERVICE"}
347
+ },
348
+ {
349
+ "incident_id": "INC-027",
350
+ "task_type": "task3",
351
+ "alert_text": "[WARNING] Vendor API returning intermittent failures.",
352
+ "context": {
353
+ "vendor": "Twilio",
354
+ "failure_rate": 18,
355
+ "our_system": "healthy"
356
+ },
357
+ "ground_truth": {"action": "NOTIFY_VENDOR"}
358
+ },
359
+ {
360
+ "incident_id": "INC-028",
361
+ "task_type": "task3",
362
+ "alert_text": "[CRITICAL] DB primary down, replica healthy.",
363
+ "context": {
364
+ "primary_status": "down",
365
+ "replica": "healthy",
366
+ "writes": "failing"
367
+ },
368
+ "ground_truth": {"action": "FAILOVER"}
369
+ },
370
+
371
+ # ── HARD CASES (REAL THINKING) ──────────────────────────────────────
372
+
373
+ {
374
+ "incident_id": "INC-029",
375
+ "task_type": "task3",
376
+ "alert_text": "[WARNING] Latency increased after deploy but no errors observed.",
377
+ "context": {
378
+ "latency": 2500,
379
+ "error_rate": 0,
380
+ "deploy": "recent"
381
+ },
382
+ "ground_truth": {"action": "INVESTIGATE"}
383
+ },
384
+ {
385
+ "incident_id": "INC-030",
386
+ "task_type": "task2",
387
+ "alert_text": "[CRITICAL] Failures observed. External API slow and DB connections also high.",
388
+ "context": {
389
+ "external_api_latency": 3000,
390
+ "db_connections": "95%",
391
+ "recent_deploy": False
392
+ },
393
+ "ground_truth": {"root_cause": "THIRD_PARTY"}
394
+ },
395
+ {
396
+ "incident_id": "INC-031",
397
+ "task_type": "task1",
398
+ "alert_text": "[WARNING] Partial outage in recommendation engine. Affects 10% users.",
399
+ "context": {
400
+ "affected_users_pct": 10,
401
+ "service": "recommendation",
402
+ "revenue_impact": "low"
403
+ },
404
+ "ground_truth": {"severity": "SEV2"}
405
+ },
406
+ {
407
+ "incident_id": "INC-032",
408
+ "task_type": "task2",
409
+ "alert_text": "[ERROR] Random crashes in service. No infra issues. No recent deploy.",
410
+ "context": {
411
+ "crash_logs": True,
412
+ "infra_health": "good",
413
+ "deploy": "none"
414
+ },
415
+ "ground_truth": {"root_cause": "APPLICATION"}
416
+ },
417
+ {
418
+ "incident_id": "INC-033",
419
+ "task_type": "task3",
420
+ "alert_text": "[INFO] Minor UI glitch reported by users.",
421
+ "context": {
422
+ "impact": "cosmetic",
423
+ "users_affected": 50
424
+ },
425
+ "ground_truth": {"action": "NO_ACTION"}
426
+ },
427
+ {
428
+ "incident_id": "INC-034",
429
+ "task_type": "task1",
430
+ "alert_text": "[CRITICAL] Login failures spike to 70% but only in one region.",
431
+ "context": {
432
+ "failure_rate": 70,
433
+ "region": "ap-south-1",
434
+ "global_impact": False
435
+ },
436
+ "ground_truth": {"severity": "SEV1"}
437
+ },
438
+ {
439
+ "incident_id": "INC-035",
440
+ "task_type": "task2",
441
+ "alert_text": "[WARNING] Increased retries and timeouts. Network stable. DB stable.",
442
+ "context": {
443
+ "timeouts": True,
444
+ "network": "stable",
445
+ "db": "stable"
446
+ },
447
+ "ground_truth": {"root_cause": "APPLICATION"}
448
+ },
449
+ {
450
+ "incident_id": "INC-036",
451
+ "task_type": "task3",
452
+ "alert_text": "[WARNING] Memory leak suspected. Service degrading slowly.",
453
+ "context": {
454
+ "memory_growth": True,
455
+ "crash": False,
456
+ "impact": "gradual"
457
+ },
458
+ "ground_truth": {"action": "INVESTIGATE"}
459
+ }
460
+
461
+ ]
inference.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # inference.py
2
+
3
+ import os
4
+ import json
5
+ import re
6
+ import requests
7
+ from openai import OpenAI
8
+ from incidents import TICKETS
9
+ from dotenv import load_dotenv
10
+
11
+ load_dotenv()
12
+
13
+ BASE_URL = "http://localhost:8000"
14
+ client = OpenAI(
15
+ base_url=os.getenv("API_BASE_URL"),
16
+ api_key=os.getenv("HF_TOKEN")
17
+ )
18
+
19
+ SYSTEM_PROMPT = """You are an expert SRE (Site Reliability Engineer) triaging production incidents.
20
+ You will receive an incident alert and context.
21
+ You must respond with ONLY a valid JSON object. No explanation. No markdown. No extra text. No code blocks.
22
+
23
+ Rules:
24
+ - For task1: classify severity. Choose ONLY from: SEV1, SEV2, SEV3
25
+ - For task2: classify root cause. Choose ONLY from: DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN
26
+ - For task3: recommend action. Choose ONLY from: ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION
27
+
28
+ Response format (return this exact structure):
29
+ {"incident_id": "<incident_id>", "task_type": "<task_type>", "severity": "<value or null>", "root_cause": "<value or null>", "action": "<value or null>"}
30
+
31
+ Only populate the field relevant to the task_type. Set others to null.
32
+ """
33
+
34
+
35
+ def build_user_prompt(observation: dict) -> str:
36
+ return f"""Incident ID: {observation['incident_id']}
37
+ Task Type: {observation['task_type']}
38
+
39
+ Alert:
40
+ {observation['alert_text']}
41
+
42
+ Context:
43
+ {json.dumps(observation['context'], indent=2)}
44
+
45
+ Respond with JSON only. No markdown. No explanation."""
46
+
47
+
48
+ # 🔥 Robust JSON extractor
49
+ def extract_json(raw: str) -> dict:
50
+ match = re.search(r"\{.*\}", raw, re.DOTALL)
51
+ if not match:
52
+ raise ValueError("No JSON found in response")
53
+
54
+ return json.loads(match.group(0))
55
+
56
+ def normalize_action(action: dict, task_type: str) -> dict:
57
+ return {
58
+ "incident_id": action.get("incident_id"),
59
+ "task_type": task_type,
60
+ "severity": action.get("severity") if task_type == "task1" else None,
61
+ "root_cause": action.get("root_cause") if task_type == "task2" else None,
62
+ "action": action.get("action") if task_type == "task3" else None,
63
+ }
64
+
65
+
66
+ def call_llm(observation: dict) -> str:
67
+ full_response = ""
68
+ try:
69
+ completion = client.chat.completions.create(
70
+ model=os.getenv("MODEL_NAME"),
71
+ messages=[
72
+ {"role": "system", "content": SYSTEM_PROMPT},
73
+ {"role": "user", "content": build_user_prompt(observation)}
74
+ ],
75
+ temperature=0.1,
76
+ top_p=0.9,
77
+ max_tokens=200,
78
+ seed=42,
79
+ stream=True
80
+ )
81
+
82
+ for chunk in completion:
83
+ if chunk.choices and chunk.choices[0].delta.content is not None:
84
+ full_response += chunk.choices[0].delta.content
85
+ except Exception as e:
86
+ print(f"Error calling LLM: {e}")
87
+ return ""
88
+
89
+ return full_response.strip()
90
+
91
+
92
+ def run_episode(task_type: str = None) -> dict:
93
+ # Step 1 — Reset environment
94
+ params = {"task_type": task_type} if task_type else {}
95
+ reset_response = requests.post(f"{BASE_URL}/reset", params=params)
96
+ reset_response.raise_for_status()
97
+
98
+ reset_data = reset_response.json()
99
+ session_id = reset_data["session_id"]
100
+ observation = reset_data
101
+
102
+ print(f"\n{'='*60}")
103
+ print(f"Incident : {observation['incident_id']}")
104
+ print(f"Task : {observation['task_type']}")
105
+ print(f"Alert : {observation['alert_text'][:80]}...")
106
+
107
+ # Step 2 — LLM with retry
108
+ action = None
109
+ raw = ""
110
+
111
+ for attempt in range(3):
112
+ raw = call_llm(observation)
113
+ print(f"LLM Raw (attempt {attempt+1}): {raw}")
114
+
115
+ try:
116
+ parsed = extract_json(raw)
117
+ action = normalize_action(parsed, observation["task_type"])
118
+ break
119
+ except Exception as e:
120
+ print(f"Parse failed: {e}")
121
+
122
+ if not action:
123
+ return {"error": "invalid_json", "raw": raw}
124
+
125
+ # Step 3 — Validate schema
126
+ required_keys = {"incident_id", "task_type", "severity", "root_cause", "action"}
127
+ if not required_keys.issubset(action.keys()):
128
+ print("Invalid schema from LLM")
129
+ return {"error": "invalid_schema", "raw": raw}
130
+
131
+
132
+ # Step 4 — Submit to /step
133
+ step_response = requests.post(f"{BASE_URL}/step", json=action, params={"session_id": session_id})
134
+ step_response.raise_for_status()
135
+ result = step_response.json()
136
+ # The [STEP] line below must be kept for submission grading, so it is printed in a structured format
137
+ print(f"[STEP] task_id={result['task_type']} action={result['agent_answer']} reward={result['reward']}")
138
+
139
+ print(f"Answer : {result['agent_answer']}")
140
+ print(f"Expected : {result['ground_truth']}")
141
+ print(f"Correct : {result['correct']} | Reward: {result['reward']}")
142
+
143
+ # 🔥 Logging
144
+ with open("logs.jsonl", "a") as f:
145
+ f.write(json.dumps({
146
+ "observation": observation,
147
+ "response": action,
148
+ "result": result
149
+ }) + "\n")
150
+
151
+ return result
152
+
153
+
154
+ def run_full_eval():
155
+ print("[START]")
156
+ task_types = ["task1", "task2", "task3"]
157
+
158
+ rounds = len(TICKETS) # run as many episodes as there are tickets in the dataset
159
+ scores = []
160
+ errors = 0
161
+
162
+ task_scores = {
163
+ "task1": [],
164
+ "task2": [],
165
+ "task3": []
166
+ }
167
+
168
+ for i in range(rounds):
169
+ task = task_types[i % 3]
170
+ result = run_episode(task_type=task)
171
+
172
+ if "reward" in result:
173
+ scores.append(result["reward"])
174
+ task_scores[task].append(result["reward"])
175
+ else:
176
+ errors += 1
177
+
178
+ print(f"\n{'='*60}")
179
+ print(f"Total Episodes : {rounds}")
180
+ print(f"Graded : {len(scores)}")
181
+ print(f"JSON Errors : {errors}")
182
+ if scores:
183
+ print(f"Total Reward : {sum(scores)}")
184
+ print(f"Average Reward : {sum(scores)/len(scores):.2f}")
185
+ print(f"Overall Accuracy : {sum(scores)/len(scores)*100:.2f}%")
186
+
187
+ for task in task_scores:
188
+ if task_scores[task]:
189
+ acc = sum(task_scores[task]) / len(task_scores[task]) * 100
190
+ print(f"{task} Accuracy : {acc:.2f}%")
191
+ print("[END]")
192
+
193
+ if __name__ == "__main__":
194
+ run_full_eval()
models.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # models.py - enums and Pydantic schemas for the Incident Triage environment
2
+
3
+ from pydantic import BaseModel, Field
4
+ from enum import Enum
5
+ from typing import Optional, Dict
6
+
7
+
8
+ # ── Enums ─────────────────────────────────────────────
9
+
10
+ class SeverityLevel(str, Enum):
11
+ SEV1 = "SEV1"
12
+ SEV2 = "SEV2"
13
+ SEV3 = "SEV3"
14
+
15
+
16
+ class RootCauseCategory(str, Enum):
17
+ DATABASE = "DATABASE"
18
+ NETWORK = "NETWORK"
19
+ APPLICATION = "APPLICATION"
20
+ INFRASTRUCTURE = "INFRASTRUCTURE"
21
+ THIRD_PARTY = "THIRD_PARTY"
22
+ UNKNOWN = "UNKNOWN"
23
+
24
+
25
+ class RecommendedAction(str, Enum):
26
+ ROLLBACK = "ROLLBACK"
27
+ SCALE_UP = "SCALE_UP"
28
+ RESTART_SERVICE = "RESTART_SERVICE"
29
+ FAILOVER = "FAILOVER"
30
+ NOTIFY_VENDOR = "NOTIFY_VENDOR"
31
+ INVESTIGATE = "INVESTIGATE"
32
+ NO_ACTION = "NO_ACTION"
33
+
34
+
35
+ # ── Observation (Input to Agent) ──────────────────────
36
+
37
+ class IncidentObservation(BaseModel):
38
+ incident_id: str
39
+ task_type: str # "task1" | "task2" | "task3"
40
+ alert_text: str
41
+ context: Dict
42
+
43
+
44
+ # ── Action (Output from Agent) ────────────────────────
45
+
46
+ class IncidentAction(BaseModel):
47
+ incident_id: str
48
+ task_type: str
49
+
50
+ severity: Optional[SeverityLevel] = Field(None)
51
+ root_cause: Optional[RootCauseCategory] = Field(None)
52
+ action: Optional[RecommendedAction] = Field(None)
53
+
54
+
55
+ # ── Step Result ───────────────────────────────────────
56
+
57
+ class StepResult(BaseModel):
58
+ incident_id: str
59
+ task_type: str
60
+ reward: float
61
+ correct: bool
62
+ ground_truth: str
63
+ agent_answer: str
64
+
65
+
openenv.yaml ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ spec_version: 1
2
+ name: Incident_Triage
3
+ type: space
4
+ runtime: fastapi
5
+ app: app:app
6
+ port: 7860
7
+ version: "1.0.0"
8
+ description: >
9
+ RL-style environment for SRE incident triage.
10
+ An LLM agent receives production alerts and must classify severity,
11
+ identify root cause, or recommend remediation actions.
12
+
13
+ api:
14
+ base_url: http://0.0.0.0:7860
15
+ endpoints:
16
+ reset:
17
+ method: POST
18
+ path: /reset
19
+ params:
20
+ task_type:
21
+ type: string
22
+ required: false
23
+ enum: [task1, task2, task3]
24
+ returns: IncidentObservation + session_id
25
+
26
+ step:
27
+ method: POST
28
+ path: /step
29
+ params:
30
+ session_id:
31
+ type: string
32
+ required: true
33
+ body: IncidentAction
34
+ returns: StepResult
35
+
36
+ state:
37
+ method: GET
38
+ path: /state
39
+ params:
40
+ session_id:
41
+ type: string
42
+ required: true
43
+ returns: current episode state
44
+
45
+ tasks:
46
+ task1:
47
+ name: Severity Classification
48
+ output_field: severity
49
+ labels: [SEV1, SEV2, SEV3]
50
+ reward: partial # 1.0 exact | 0.5 adjacent | 0.0 far
51
+
52
+ task2:
53
+ name: Root Cause Classification
54
+ output_field: root_cause
55
+ labels: [DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN]
56
+ reward: binary # 1.0 correct | 0.0 incorrect
57
+
58
+ task3:
59
+ name: Recommended Action
60
+ output_field: action
61
+ labels: [ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION]
62
+ reward: binary # 1.0 correct | 0.0 incorrect
63
+
64
+ dataset:
65
+ total_tickets: 36
66
+ split:
67
+ task1: 13
68
+ task2: 12
69
+ task3: 11
70
+
71
+ reproducibility:
72
+ llm_seed: 42
73
+ llm_temperature: 0.15
74
+ selection: random per task_type pool
pyproject.toml ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "openenv-Incident_Triage"
13
+ version = "0.1.0"
14
+ description = "Incident Triage environment for OpenEnv"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
+ # install from github
19
+ # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
+ "openenv-core[core]>=0.2.2",
21
+ # Environment-specific dependencies
22
+ # Add all dependencies needed for your environment here
23
+ # Examples:
24
+ # "numpy>=1.19.0",
25
+ # "torch>=2.0.0",
26
+ # "gymnasium>=0.29.0",
27
+ # "openspiel>=1.0.0",
28
+ # "smolagents>=1.22.0,<2",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "pytest>=8.0.0",
34
+ "pytest-cov>=4.0.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ # Server entry point - enables running via: uv run --project . server
39
+ # or: python -m Incident_Triage.server.app
40
+ server = "Incident_Triage.server.app:main"
41
+
42
+ [tool.setuptools]
43
+ include-package-data = true
44
+ packages = ["Incident_Triage", "Incident_Triage.server"]
45
+ package-dir = { "Incident_Triage" = ".", "Incident_Triage.server" = "server" }
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pydantic
4
+ openai
5
+ requests