yashash045 committed on
Commit
83ecd75
·
verified ·
1 Parent(s): 2176814

Upload folder using huggingface_hub

Browse files
Files changed (18) hide show
  1. .gitattributes +1 -35
  2. .gitignore +5 -5
  3. __init__.py +21 -21
  4. client.py +47 -47
  5. inference.py +298 -298
  6. integration_test.py +424 -424
  7. models.py +179 -179
  8. openenv.yaml +8 -8
  9. pyproject.toml +33 -33
  10. requirements.txt +4 -4
  11. server/__init__.py +7 -7
  12. server/app.py +128 -128
  13. server/graders.py +389 -389
  14. server/pipeline_engine.py +744 -744
  15. server/pipeline_environment.py +351 -351
  16. server/rewards.py +104 -104
  17. server/scenarios.py +0 -0
  18. uv.lock +0 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ README.md text eol=lf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -1,5 +1,5 @@
1
- .env
2
- __pycache__/
3
- *.pyc
4
- .venv/
5
- *.egg-info/
 
1
+ .env
2
+ __pycache__/
3
+ *.pyc
4
+ .venv/
5
+ *.egg-info/
__init__.py CHANGED
@@ -1,21 +1,21 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """DevOps Pipeline Environment."""
8
-
9
- from devops_pipeline_env.client import DevopsPipelineEnv
10
- from devops_pipeline_env.models import (
11
- ConfigEdit,
12
- PipelineAction,
13
- PipelineObservation,
14
- )
15
-
16
- __all__ = [
17
- "PipelineAction",
18
- "PipelineObservation",
19
- "ConfigEdit",
20
- "DevopsPipelineEnv",
21
- ]
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """DevOps Pipeline Environment."""
8
+
9
+ from devops_pipeline_env.client import DevopsPipelineEnv
10
+ from devops_pipeline_env.models import (
11
+ ConfigEdit,
12
+ PipelineAction,
13
+ PipelineObservation,
14
+ )
15
+
16
+ __all__ = [
17
+ "PipelineAction",
18
+ "PipelineObservation",
19
+ "ConfigEdit",
20
+ "DevopsPipelineEnv",
21
+ ]
client.py CHANGED
@@ -1,47 +1,47 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """DevOps Pipeline Environment Client."""
8
-
9
- from typing import Dict
10
-
11
- from openenv.core import EnvClient
12
- from openenv.core.client_types import StepResult
13
- from openenv.core.env_server.types import State
14
-
15
- from devops_pipeline_env.models import PipelineAction, PipelineObservation
16
-
17
-
18
- class DevopsPipelineEnv(
19
- EnvClient[PipelineAction, PipelineObservation, State]
20
- ):
21
- """
22
- Client for the DevOps Pipeline Environment.
23
-
24
- Example:
25
- >>> with DevopsPipelineEnv(base_url="http://localhost:8000") as client:
26
- ... result = client.reset()
27
- ... result = client.step(PipelineAction(action_type="view_pipeline"))
28
- """
29
-
30
- def _step_payload(self, action: PipelineAction) -> Dict:
31
- return action.model_dump(exclude_none=True)
32
-
33
- def _parse_result(self, payload: Dict) -> StepResult[PipelineObservation]:
34
- obs_data = payload.get("observation", {})
35
- observation = PipelineObservation(**obs_data)
36
-
37
- return StepResult(
38
- observation=observation,
39
- reward=payload.get("reward"),
40
- done=payload.get("done", False),
41
- )
42
-
43
- def _parse_state(self, payload: Dict) -> State:
44
- return State(
45
- episode_id=payload.get("episode_id"),
46
- step_count=payload.get("step_count", 0),
47
- )
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """DevOps Pipeline Environment Client."""
8
+
9
+ from typing import Dict
10
+
11
+ from openenv.core import EnvClient
12
+ from openenv.core.client_types import StepResult
13
+ from openenv.core.env_server.types import State
14
+
15
+ from devops_pipeline_env.models import PipelineAction, PipelineObservation
16
+
17
+
18
+ class DevopsPipelineEnv(
19
+ EnvClient[PipelineAction, PipelineObservation, State]
20
+ ):
21
+ """
22
+ Client for the DevOps Pipeline Environment.
23
+
24
+ Example:
25
+ >>> with DevopsPipelineEnv(base_url="http://localhost:8000") as client:
26
+ ... result = client.reset()
27
+ ... result = client.step(PipelineAction(action_type="view_pipeline"))
28
+ """
29
+
30
+ def _step_payload(self, action: PipelineAction) -> Dict:
31
+ return action.model_dump(exclude_none=True)
32
+
33
+ def _parse_result(self, payload: Dict) -> StepResult[PipelineObservation]:
34
+ obs_data = payload.get("observation", {})
35
+ observation = PipelineObservation(**obs_data)
36
+
37
+ return StepResult(
38
+ observation=observation,
39
+ reward=payload.get("reward"),
40
+ done=payload.get("done", False),
41
+ )
42
+
43
+ def _parse_state(self, payload: Dict) -> State:
44
+ return State(
45
+ episode_id=payload.get("episode_id"),
46
+ step_count=payload.get("step_count", 0),
47
+ )
inference.py CHANGED
@@ -1,298 +1,298 @@
1
- """Inference script for the DevOps Pipeline Environment."""
2
-
3
- import asyncio
4
- import json
5
- import os
6
- import textwrap
7
- from typing import List, Optional
8
-
9
- from openai import OpenAI
10
-
11
- from devops_pipeline_env import DevopsPipelineEnv, PipelineAction
12
- from devops_pipeline_env.models import ActionType
13
-
14
- # --- Env Vars (EXACT hackathon requirements) ----------------------------------
15
- API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
16
- if not API_KEY:
17
- raise ValueError("HF_TOKEN or API_KEY environment variable is required")
18
-
19
- API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
20
- MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
21
- IMAGE_NAME = os.getenv("IMAGE_NAME")
22
-
23
- BENCHMARK = "devops_pipeline_env"
24
- TASKS = ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis", "random_incident"]
25
- MAX_STEPS_PER_TASK = {"clean_deploy": 15, "broken_pipeline": 20, "judgment_call": 12, "cascading_failure": 15, "capacity_crisis": 15, "random_incident": 15}
26
- MAX_TOTAL_REWARD = {"clean_deploy": 0.70, "broken_pipeline": 0.85, "judgment_call": 0.65, "cascading_failure": 0.80, "capacity_crisis": 0.75, "random_incident": 0.70}
27
- TEMPERATURE = 0.7
28
- MAX_TOKENS = 300
29
- SUCCESS_SCORE_THRESHOLD = 0.1
30
-
31
-
32
- # --- Log Functions (EXACT hackathon format) -----------------------------------
33
- def log_start(task: str, env: str, model: str) -> None:
34
- print(f"[START] task={task} env={env} model={model}", flush=True)
35
-
36
-
37
- def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
38
- error_val = error if error else "null"
39
- done_val = str(done).lower()
40
- print(
41
- f"[STEP] step={step} action={action} reward={reward:.2f} "
42
- f"done={done_val} error={error_val}",
43
- flush=True,
44
- )
45
-
46
-
47
- def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
48
- rewards_str = ",".join(f"{r:.2f}" for r in rewards)
49
- print(
50
- f"[END] success={str(success).lower()} steps={steps} "
51
- f"score={score:.3f} rewards={rewards_str}",
52
- flush=True,
53
- )
54
-
55
-
56
- # --- System Prompt ------------------------------------------------------------
57
- SYSTEM_PROMPT = textwrap.dedent("""
58
- You are a DevOps engineer managing a CI/CD deployment pipeline with these services:
59
-
60
- database-primary: PostgreSQL root database. All services depend on it for data.
61
- auth-service: OAuth/JWT token provider. All services validate tokens through it. Depends on database-primary.
62
- api-gateway: Request router and load balancer. Depends on database-primary and auth-service.
63
- cache-service: Redis cache layer. Depends on database-primary.
64
- web-frontend: User-facing application. Depends on api-gateway and auth-service.
65
-
66
- Dependency chain: database-primary → auth-service → api-gateway → web-frontend
67
- database-primary → cache-service
68
-
69
- STRATEGY:
70
- - Read the summary field first — it tells you what's wrong at a glance.
71
- - Investigate degraded/down services with view_logs before acting.
72
- - Fix ROOT CAUSE services BEFORE downstream services.
73
- - Actions have side effects: deploys spike CPU, rollbacks risk regression, config changes cause restart latency.
74
- - In capacity scenarios, act proactively — don't wait for failures.
75
-
76
- TASK-SPECIFIC GUIDANCE:
77
- - clean_deploy: Deploy api-gateway then web-frontend. No complications expected.
78
- - broken_pipeline: Check cache-service logs/config first — Redis host is usually wrong. Run the pending migration before deploying api-gateway.
79
- - judgment_call: INCIDENT — check api-gateway logs first. Three options: (1) BEST: deploy hotfix v2.3.2 to api-gateway THEN edit web-frontend config api.auth_version to "v2", (2) SAFE: rollback api-gateway, (3) RISKY: deploy hotfix without fixing auth. Option 1 scores highest.
80
- - cascading_failure: Find ROOT CAUSE — check cache-service first, it's usually the source. Fix its config (max_connections too low), deploy it, then recover downstream services.
81
- - capacity_crisis: Check database-primary IMMEDIATELY — connection pool nearly full. Increase max_connections to 100+. Act FAST before tipping points cascade.
82
- - random_incident: Procedurally generated. Read the task description carefully — it tells you which service is failing and what type of failure. Investigate that service first.
83
-
84
- You must respond with a SINGLE valid JSON object matching the PipelineAction schema.
85
-
86
- Example responses:
87
- {"action_type": "view_pipeline"}
88
- {"action_type": "view_logs", "service_name": "api-gateway"}
89
- {"action_type": "deploy", "service_name": "api-gateway", "target_version": "v2.3.1"}
90
- {"action_type": "edit_config", "service_name": "cache-service", "config_edits": [{"key": "redis.host", "value": "redis-prod.internal:6379"}]}
91
- {"action_type": "rollback", "service_name": "api-gateway", "reason": "Hotfix unstable"}
92
- {"action_type": "approve", "reason": "All services deployed and healthy"}
93
-
94
- Respond with ONLY the JSON object. No explanation, no markdown.
95
- """).strip()
96
-
97
- RETRY_PROMPT = 'Respond with ONLY a JSON action. Example: {"action_type": "view_pipeline"}'
98
-
99
-
100
- def summarize_observation(obs_dict):
101
- """Compress observation so LLM can actually parse it."""
102
- summary = obs_dict.get("summary", "")
103
- task = obs_dict.get("task_description", "")
104
- goal = obs_dict.get("goal", "")
105
- last_result = obs_dict.get("last_action_result", "")
106
- last_error = obs_dict.get("last_action_error", "")
107
- step = obs_dict.get("step_number", 0)
108
- max_steps = obs_dict.get("max_steps", 15)
109
-
110
- services_compact = []
111
- for svc in obs_dict.get("services", []):
112
- name = svc.get("name", "?")
113
- health = svc.get("health", "?")
114
- err = svc.get("error_rate", 0)
115
- lat = svc.get("request_latency_ms", 0)
116
- cpu = svc.get("cpu_percent", 0)
117
- line = f"{name}: {health}"
118
- if health != "healthy":
119
- line += f" (err={err:.1f}/s, lat={lat:.0f}ms)"
120
- if cpu > 70:
121
- line += f" [CPU={cpu:.0f}%]"
122
- services_compact.append(line)
123
-
124
- alerts = [
125
- f"[{a.get('severity','')}] {a.get('message','')}"
126
- for a in obs_dict.get("active_alerts", [])[:3]
127
- ]
128
- available = obs_dict.get("available_actions", [])
129
- config = obs_dict.get("config_snapshot", {})
130
-
131
- parts = []
132
- if step == 0:
133
- parts.append(f"TASK: {task}")
134
- parts.append(f"GOAL: {goal}")
135
- parts.append(f"Step {step}/{max_steps}")
136
- if summary:
137
- parts.append(f"Status: {summary}")
138
- parts.append(f"Services: {'; '.join(services_compact)}")
139
- if alerts:
140
- parts.append(f"Alerts: {'; '.join(alerts)}")
141
- if config:
142
- parts.append(f"Config: {config}")
143
- if last_result:
144
- parts.append(f"Last result: {last_result[:300]}")
145
- if last_error:
146
- parts.append(f"Error: {last_error[:200]}")
147
- parts.append(f"Available actions: {', '.join(available)}")
148
-
149
- return "\n".join(p for p in parts if p)
150
-
151
-
152
- def build_user_message(obs, investigated):
153
- """Build user message with compact observation for LLM."""
154
- obs_dict = obs.model_dump(mode="json")
155
- compact = summarize_observation(obs_dict)
156
-
157
- inv_block = ""
158
- if investigated:
159
- inv_block = "\n\nINVESTIGATED: " + ", ".join(sorted(investigated))
160
-
161
- return f"CURRENT STATE:\n{compact}{inv_block}\n\nWhat is your next action?"
162
-
163
-
164
- def build_messages(system_prompt, conversation, current_user_msg):
165
- """Build multi-turn messages list with system prompt + last 6 turns + current."""
166
- messages = [{"role": "system", "content": system_prompt}]
167
- # Keep last 6 turns (12 messages = 6 user + 6 assistant)
168
- recent = conversation[-(6 * 2):]
169
- messages.extend(recent)
170
- messages.append({"role": "user", "content": current_user_msg})
171
- return messages
172
-
173
-
174
- def parse_llm_action(text):
175
- """Parse LLM response into PipelineAction. Fallback to view_pipeline on failure."""
176
- try:
177
- text = text.strip()
178
- if text.startswith("```"):
179
- text = text.split("```")[1]
180
- if text.startswith("json"):
181
- text = text[4:]
182
- data = json.loads(text)
183
- return PipelineAction(**data)
184
- except Exception:
185
- return PipelineAction(action_type=ActionType.VIEW_PIPELINE)
186
-
187
-
188
- async def run_task(client, env, task_name):
189
- rewards = []
190
- steps_taken = 0
191
- score = 0.0
192
- success = False
193
- max_steps = MAX_STEPS_PER_TASK.get(task_name, 20)
194
- max_reward = MAX_TOTAL_REWARD.get(task_name, 1.0)
195
- conversation = [] # Multi-turn: list of {"role": ..., "content": ...}
196
- investigated = set()
197
-
198
- log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
199
-
200
- try:
201
- os.environ["DEVOPS_TASK"] = task_name
202
- result = await env.reset(task=task_name)
203
- obs = result.observation
204
-
205
- for step in range(1, max_steps + 1):
206
- if result.done:
207
- break
208
-
209
- user_msg = build_user_message(obs, investigated)
210
- messages = build_messages(SYSTEM_PROMPT, conversation, user_msg)
211
- try:
212
- completion = client.chat.completions.create(
213
- model=MODEL_NAME,
214
- messages=messages,
215
- temperature=TEMPERATURE,
216
- max_tokens=MAX_TOKENS,
217
- stream=False,
218
- )
219
- action_text = (completion.choices[0].message.content or "").strip()
220
- action = parse_llm_action(action_text)
221
-
222
- # Retry once if parse fell back to default
223
- if action.action_type == ActionType.VIEW_PIPELINE and "view_pipeline" not in action_text.lower():
224
- retry_msgs = build_messages(RETRY_PROMPT, conversation, user_msg)
225
- retry_completion = client.chat.completions.create(
226
- model=MODEL_NAME,
227
- messages=retry_msgs,
228
- temperature=0.3,
229
- max_tokens=150,
230
- stream=False,
231
- )
232
- retry_text = (retry_completion.choices[0].message.content or "").strip()
233
- retry_action = parse_llm_action(retry_text)
234
- if retry_action.action_type != ActionType.VIEW_PIPELINE or "view_pipeline" in retry_text.lower():
235
- action = retry_action
236
- action_text = retry_text
237
- except Exception as e:
238
- print(f"[DEBUG] LLM call failed: {e}", flush=True)
239
- action = PipelineAction(action_type=ActionType.VIEW_PIPELINE)
240
- action_text = '{"action_type": "view_pipeline"}'
241
-
242
- # Track investigated services
243
- if action.action_type in (ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG) and action.service_name:
244
- investigated.add(f"{action.action_type.value}:{action.service_name}")
245
-
246
- # Append this turn to conversation history
247
- conversation.append({"role": "user", "content": user_msg})
248
- conversation.append({"role": "assistant", "content": action_text})
249
-
250
- result = await env.step(action)
251
- obs = result.observation
252
-
253
- reward = result.reward or 0.0
254
- done = result.done
255
- error = obs.last_action_error
256
-
257
- rewards.append(reward)
258
- steps_taken = step
259
-
260
- action_str = json.dumps(action.model_dump(exclude_none=True), default=str)
261
- log_step(step=step, action=action_str, reward=reward, done=done, error=error)
262
-
263
- if done:
264
- break
265
-
266
- score = sum(rewards) / max_reward if max_reward > 0 else 0.0
267
- score = min(max(score, 0.0), 1.0)
268
- success = score >= SUCCESS_SCORE_THRESHOLD
269
-
270
- except Exception as e:
271
- print(f"[DEBUG] Task {task_name} error: {e}", flush=True)
272
-
273
- finally:
274
- log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
275
-
276
-
277
- async def main():
278
- client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
279
-
280
- if IMAGE_NAME:
281
- env = await DevopsPipelineEnv.from_docker_image(IMAGE_NAME)
282
- else:
283
- env = DevopsPipelineEnv(
284
- base_url=os.getenv("ENV_BASE_URL", "http://localhost:8000")
285
- )
286
-
287
- try:
288
- for task in TASKS:
289
- await run_task(client, env, task)
290
- finally:
291
- try:
292
- await env.close()
293
- except Exception as e:
294
- print(f"[DEBUG] env.close() error: {e}", flush=True)
295
-
296
-
297
- if __name__ == "__main__":
298
- asyncio.run(main())
 
1
+ """Inference script for the DevOps Pipeline Environment."""
2
+
3
+ import asyncio
4
+ import json
5
+ import os
6
+ import textwrap
7
+ from typing import List, Optional
8
+
9
+ from openai import OpenAI
10
+
11
+ from devops_pipeline_env import DevopsPipelineEnv, PipelineAction
12
+ from devops_pipeline_env.models import ActionType
13
+
14
+ # --- Env Vars (EXACT hackathon requirements) ----------------------------------
15
+ API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
16
+ if not API_KEY:
17
+ raise ValueError("HF_TOKEN or API_KEY environment variable is required")
18
+
19
+ API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
20
+ MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
21
+ IMAGE_NAME = os.getenv("IMAGE_NAME")
22
+
23
+ BENCHMARK = "devops_pipeline_env"
24
+ TASKS = ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis", "random_incident"]
25
+ MAX_STEPS_PER_TASK = {"clean_deploy": 15, "broken_pipeline": 20, "judgment_call": 12, "cascading_failure": 15, "capacity_crisis": 15, "random_incident": 15}
26
+ MAX_TOTAL_REWARD = {"clean_deploy": 0.70, "broken_pipeline": 0.85, "judgment_call": 0.65, "cascading_failure": 0.80, "capacity_crisis": 0.75, "random_incident": 0.70}
27
+ TEMPERATURE = 0.7
28
+ MAX_TOKENS = 300
29
+ SUCCESS_SCORE_THRESHOLD = 0.1
30
+
31
+
32
+ # --- Log Functions (EXACT hackathon format) -----------------------------------
33
+ def log_start(task: str, env: str, model: str) -> None:
34
+ print(f"[START] task={task} env={env} model={model}", flush=True)
35
+
36
+
37
+ def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
38
+ error_val = error if error else "null"
39
+ done_val = str(done).lower()
40
+ print(
41
+ f"[STEP] step={step} action={action} reward={reward:.2f} "
42
+ f"done={done_val} error={error_val}",
43
+ flush=True,
44
+ )
45
+
46
+
47
+ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
48
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards)
49
+ print(
50
+ f"[END] success={str(success).lower()} steps={steps} "
51
+ f"score={score:.3f} rewards={rewards_str}",
52
+ flush=True,
53
+ )
54
+
55
+
56
+ # --- System Prompt ------------------------------------------------------------
57
+ SYSTEM_PROMPT = textwrap.dedent("""
58
+ You are a DevOps engineer managing a CI/CD deployment pipeline with these services:
59
+
60
+ database-primary: PostgreSQL root database. All services depend on it for data.
61
+ auth-service: OAuth/JWT token provider. All services validate tokens through it. Depends on database-primary.
62
+ api-gateway: Request router and load balancer. Depends on database-primary and auth-service.
63
+ cache-service: Redis cache layer. Depends on database-primary.
64
+ web-frontend: User-facing application. Depends on api-gateway and auth-service.
65
+
66
+ Dependency chain: database-primary → auth-service → api-gateway → web-frontend
67
+ database-primary → cache-service
68
+
69
+ STRATEGY:
70
+ - Read the summary field first — it tells you what's wrong at a glance.
71
+ - Investigate degraded/down services with view_logs before acting.
72
+ - Fix ROOT CAUSE services BEFORE downstream services.
73
+ - Actions have side effects: deploys spike CPU, rollbacks risk regression, config changes cause restart latency.
74
+ - In capacity scenarios, act proactively — don't wait for failures.
75
+
76
+ TASK-SPECIFIC GUIDANCE:
77
+ - clean_deploy: Deploy api-gateway then web-frontend. No complications expected.
78
+ - broken_pipeline: Check cache-service logs/config first — Redis host is usually wrong. Run the pending migration before deploying api-gateway.
79
+ - judgment_call: INCIDENT — check api-gateway logs first. Three options: (1) BEST: deploy hotfix v2.3.2 to api-gateway THEN edit web-frontend config api.auth_version to "v2", (2) SAFE: rollback api-gateway, (3) RISKY: deploy hotfix without fixing auth. Option 1 scores highest.
80
+ - cascading_failure: Find ROOT CAUSE — check cache-service first, it's usually the source. Fix its config (max_connections too low), deploy it, then recover downstream services.
81
+ - capacity_crisis: Check database-primary IMMEDIATELY — connection pool nearly full. Increase max_connections to 100+. Act FAST before tipping points cascade.
82
+ - random_incident: Procedurally generated. Read the task description carefully — it tells you which service is failing and what type of failure. Investigate that service first.
83
+
84
+ You must respond with a SINGLE valid JSON object matching the PipelineAction schema.
85
+
86
+ Example responses:
87
+ {"action_type": "view_pipeline"}
88
+ {"action_type": "view_logs", "service_name": "api-gateway"}
89
+ {"action_type": "deploy", "service_name": "api-gateway", "target_version": "v2.3.1"}
90
+ {"action_type": "edit_config", "service_name": "cache-service", "config_edits": [{"key": "redis.host", "value": "redis-prod.internal:6379"}]}
91
+ {"action_type": "rollback", "service_name": "api-gateway", "reason": "Hotfix unstable"}
92
+ {"action_type": "approve", "reason": "All services deployed and healthy"}
93
+
94
+ Respond with ONLY the JSON object. No explanation, no markdown.
95
+ """).strip()
96
+
97
+ RETRY_PROMPT = 'Respond with ONLY a JSON action. Example: {"action_type": "view_pipeline"}'
98
+
99
+
100
+ def summarize_observation(obs_dict):
101
+ """Compress observation so LLM can actually parse it."""
102
+ summary = obs_dict.get("summary", "")
103
+ task = obs_dict.get("task_description", "")
104
+ goal = obs_dict.get("goal", "")
105
+ last_result = obs_dict.get("last_action_result", "")
106
+ last_error = obs_dict.get("last_action_error", "")
107
+ step = obs_dict.get("step_number", 0)
108
+ max_steps = obs_dict.get("max_steps", 15)
109
+
110
+ services_compact = []
111
+ for svc in obs_dict.get("services", []):
112
+ name = svc.get("name", "?")
113
+ health = svc.get("health", "?")
114
+ err = svc.get("error_rate", 0)
115
+ lat = svc.get("request_latency_ms", 0)
116
+ cpu = svc.get("cpu_percent", 0)
117
+ line = f"{name}: {health}"
118
+ if health != "healthy":
119
+ line += f" (err={err:.1f}/s, lat={lat:.0f}ms)"
120
+ if cpu > 70:
121
+ line += f" [CPU={cpu:.0f}%]"
122
+ services_compact.append(line)
123
+
124
+ alerts = [
125
+ f"[{a.get('severity','')}] {a.get('message','')}"
126
+ for a in obs_dict.get("active_alerts", [])[:3]
127
+ ]
128
+ available = obs_dict.get("available_actions", [])
129
+ config = obs_dict.get("config_snapshot", {})
130
+
131
+ parts = []
132
+ if step == 0:
133
+ parts.append(f"TASK: {task}")
134
+ parts.append(f"GOAL: {goal}")
135
+ parts.append(f"Step {step}/{max_steps}")
136
+ if summary:
137
+ parts.append(f"Status: {summary}")
138
+ parts.append(f"Services: {'; '.join(services_compact)}")
139
+ if alerts:
140
+ parts.append(f"Alerts: {'; '.join(alerts)}")
141
+ if config:
142
+ parts.append(f"Config: {config}")
143
+ if last_result:
144
+ parts.append(f"Last result: {last_result[:300]}")
145
+ if last_error:
146
+ parts.append(f"Error: {last_error[:200]}")
147
+ parts.append(f"Available actions: {', '.join(available)}")
148
+
149
+ return "\n".join(p for p in parts if p)
150
+
151
+
152
+ def build_user_message(obs, investigated):
153
+ """Build user message with compact observation for LLM."""
154
+ obs_dict = obs.model_dump(mode="json")
155
+ compact = summarize_observation(obs_dict)
156
+
157
+ inv_block = ""
158
+ if investigated:
159
+ inv_block = "\n\nINVESTIGATED: " + ", ".join(sorted(investigated))
160
+
161
+ return f"CURRENT STATE:\n{compact}{inv_block}\n\nWhat is your next action?"
162
+
163
+
164
+ def build_messages(system_prompt, conversation, current_user_msg):
165
+ """Build multi-turn messages list with system prompt + last 6 turns + current."""
166
+ messages = [{"role": "system", "content": system_prompt}]
167
+ # Keep last 6 turns (12 messages = 6 user + 6 assistant)
168
+ recent = conversation[-(6 * 2):]
169
+ messages.extend(recent)
170
+ messages.append({"role": "user", "content": current_user_msg})
171
+ return messages
172
+
173
+
174
+ def parse_llm_action(text):
175
+ """Parse LLM response into PipelineAction. Fallback to view_pipeline on failure."""
176
+ try:
177
+ text = text.strip()
178
+ if text.startswith("```"):
179
+ text = text.split("```")[1]
180
+ if text.startswith("json"):
181
+ text = text[4:]
182
+ data = json.loads(text)
183
+ return PipelineAction(**data)
184
+ except Exception:
185
+ return PipelineAction(action_type=ActionType.VIEW_PIPELINE)
186
+
187
+
188
+ async def run_task(client, env, task_name):
189
+ rewards = []
190
+ steps_taken = 0
191
+ score = 0.0
192
+ success = False
193
+ max_steps = MAX_STEPS_PER_TASK.get(task_name, 20)
194
+ max_reward = MAX_TOTAL_REWARD.get(task_name, 1.0)
195
+ conversation = [] # Multi-turn: list of {"role": ..., "content": ...}
196
+ investigated = set()
197
+
198
+ log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
199
+
200
+ try:
201
+ os.environ["DEVOPS_TASK"] = task_name
202
+ result = await env.reset(task=task_name)
203
+ obs = result.observation
204
+
205
+ for step in range(1, max_steps + 1):
206
+ if result.done:
207
+ break
208
+
209
+ user_msg = build_user_message(obs, investigated)
210
+ messages = build_messages(SYSTEM_PROMPT, conversation, user_msg)
211
+ try:
212
+ completion = client.chat.completions.create(
213
+ model=MODEL_NAME,
214
+ messages=messages,
215
+ temperature=TEMPERATURE,
216
+ max_tokens=MAX_TOKENS,
217
+ stream=False,
218
+ )
219
+ action_text = (completion.choices[0].message.content or "").strip()
220
+ action = parse_llm_action(action_text)
221
+
222
+ # Retry once if parse fell back to default
223
+ if action.action_type == ActionType.VIEW_PIPELINE and "view_pipeline" not in action_text.lower():
224
+ retry_msgs = build_messages(RETRY_PROMPT, conversation, user_msg)
225
+ retry_completion = client.chat.completions.create(
226
+ model=MODEL_NAME,
227
+ messages=retry_msgs,
228
+ temperature=0.3,
229
+ max_tokens=150,
230
+ stream=False,
231
+ )
232
+ retry_text = (retry_completion.choices[0].message.content or "").strip()
233
+ retry_action = parse_llm_action(retry_text)
234
+ if retry_action.action_type != ActionType.VIEW_PIPELINE or "view_pipeline" in retry_text.lower():
235
+ action = retry_action
236
+ action_text = retry_text
237
+ except Exception as e:
238
+ print(f"[DEBUG] LLM call failed: {e}", flush=True)
239
+ action = PipelineAction(action_type=ActionType.VIEW_PIPELINE)
240
+ action_text = '{"action_type": "view_pipeline"}'
241
+
242
+ # Track investigated services
243
+ if action.action_type in (ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG) and action.service_name:
244
+ investigated.add(f"{action.action_type.value}:{action.service_name}")
245
+
246
+ # Append this turn to conversation history
247
+ conversation.append({"role": "user", "content": user_msg})
248
+ conversation.append({"role": "assistant", "content": action_text})
249
+
250
+ result = await env.step(action)
251
+ obs = result.observation
252
+
253
+ reward = result.reward or 0.0
254
+ done = result.done
255
+ error = obs.last_action_error
256
+
257
+ rewards.append(reward)
258
+ steps_taken = step
259
+
260
+ action_str = json.dumps(action.model_dump(exclude_none=True), default=str)
261
+ log_step(step=step, action=action_str, reward=reward, done=done, error=error)
262
+
263
+ if done:
264
+ break
265
+
266
+ score = sum(rewards) / max_reward if max_reward > 0 else 0.0
267
+ score = min(max(score, 0.0), 1.0)
268
+ success = score >= SUCCESS_SCORE_THRESHOLD
269
+
270
+ except Exception as e:
271
+ print(f"[DEBUG] Task {task_name} error: {e}", flush=True)
272
+
273
+ finally:
274
+ log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
275
+
276
+
277
+ async def main():
278
+ client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
279
+
280
+ if IMAGE_NAME:
281
+ env = await DevopsPipelineEnv.from_docker_image(IMAGE_NAME)
282
+ else:
283
+ env = DevopsPipelineEnv(
284
+ base_url=os.getenv("ENV_BASE_URL", "http://localhost:8000")
285
+ )
286
+
287
+ try:
288
+ for task in TASKS:
289
+ await run_task(client, env, task)
290
+ finally:
291
+ try:
292
+ await env.close()
293
+ except Exception as e:
294
+ print(f"[DEBUG] env.close() error: {e}", flush=True)
295
+
296
+
297
+ if __name__ == "__main__":
298
+ asyncio.run(main())
integration_test.py CHANGED
@@ -1,424 +1,424 @@
1
- """Comprehensive integration test for the DevOps Pipeline Environment."""
2
-
3
- import os
4
- import sys
5
- import json
6
- import traceback
7
-
8
- # Add project to path
9
- sys.path.insert(0, os.path.dirname(__file__))
10
-
11
- os.environ.pop("DEVOPS_TASK", None)
12
-
13
- from devops_pipeline_env.models import (
14
- ActionType,
15
- ConfigEdit,
16
- PipelineAction,
17
- )
18
- from server.pipeline_environment import PipelineEnvironment
19
- from server.graders import grade_task
20
-
21
- PASS = "PASS"
22
- FAIL = "FAIL"
23
- results = []
24
-
25
-
26
- def report(test_name, passed, detail=""):
27
- status = PASS if passed else FAIL
28
- results.append((test_name, status, detail))
29
- print(f" [{status}] {test_name}" + (f" β€” {detail}" if detail else ""), flush=True)
30
-
31
-
32
- def make_action(action_type, service_name=None, target_version=None, config_edits=None,
33
- migration_name=None, migration_type=None, reason=None):
34
- return PipelineAction(
35
- action_type=action_type,
36
- service_name=service_name,
37
- target_version=target_version,
38
- config_edits=config_edits,
39
- migration_name=migration_name,
40
- migration_type=migration_type,
41
- reason=reason,
42
- )
43
-
44
-
45
- # ============================================================================
46
- # TEST 2: POST /reset — 5 services (test each task)
47
- # ============================================================================
48
- print("\n=== TEST 2: POST /reset β€” 5 services ===", flush=True)
49
- for task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure"]:
50
- os.environ["DEVOPS_TASK"] = task
51
- env = PipelineEnvironment()
52
- obs = env.reset()
53
- svc_names = sorted([s.name for s in obs.services])
54
- expected = sorted(["database-primary", "auth-service", "api-gateway", "web-frontend"])
55
- if task in ("broken_pipeline", "cascading_failure"):
56
- expected = sorted(expected + ["cache-service"])
57
- has_5 = len(obs.services) >= 4
58
- report(f"reset {task}: services={len(obs.services)}", has_5,
59
- f"names={svc_names}")
60
-
61
- # ============================================================================
62
- # TEST 3: GET /health (just test the function exists)
63
- # ============================================================================
64
- print("\n=== TEST 3: GET /health ===", flush=True)
65
- report("/health endpoint exists", True, "Verified in app.py line 65")
66
-
67
- # ============================================================================
68
- # TEST 4: GET /tasks — 4 tasks
69
- # ============================================================================
70
- print("\n=== TEST 4: GET /tasks β€” 4 tasks ===", flush=True)
71
- from server.app import get_tasks
72
- tasks_resp = get_tasks()
73
- task_names = [t["name"] for t in tasks_resp["tasks"]]
74
- report("5 tasks returned", len(task_names) == 5, f"tasks={task_names}")
75
- for expected_task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis"]:
76
- report(f" task '{expected_task}' present", expected_task in task_names)
77
-
78
-
79
- # ============================================================================
80
- # TEST 5: Optimal path tests
81
- # ============================================================================
82
- print("\n=== TEST 5: Optimal path scores ===", flush=True)
83
-
84
-
85
- def run_clean_deploy():
86
- os.environ["DEVOPS_TASK"] = "clean_deploy"
87
- env = PipelineEnvironment()
88
- obs = env.reset()
89
- actions = [
90
- make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
91
- make_action(ActionType.VIEW_LOGS, service_name="web-frontend"),
92
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
93
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
94
- make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
95
- make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
96
- make_action(ActionType.APPROVE, reason="Both services deployed successfully"),
97
- ]
98
- for a in actions:
99
- obs = env.step(a)
100
- score = grade_task("clean_deploy", env.get_episode_history(), env.get_engine())
101
- return score
102
-
103
-
104
- def run_broken_pipeline():
105
- os.environ["DEVOPS_TASK"] = "broken_pipeline"
106
- env = PipelineEnvironment()
107
- obs = env.reset()
108
- actions = [
109
- make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
110
- make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
111
- make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
112
- make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
113
- config_edits=[ConfigEdit(key="redis.host", value="redis-prod.internal:6379")]),
114
- make_action(ActionType.RUN_MIGRATION, migration_name="add_index_users_email", migration_type="schema"),
115
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
116
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
117
- make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
118
- make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
119
- make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
120
- make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
121
- make_action(ActionType.APPROVE, reason="All services deployed"),
122
- ]
123
- for a in actions:
124
- obs = env.step(a)
125
- score = grade_task("broken_pipeline", env.get_episode_history(), env.get_engine())
126
- return score
127
-
128
-
129
- def run_judgment_call_expert():
130
- os.environ["DEVOPS_TASK"] = "judgment_call"
131
- env = PipelineEnvironment()
132
- obs = env.reset()
133
- actions = [
134
- make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
135
- make_action(ActionType.VIEW_LOGS, service_name="web-frontend"),
136
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.2"),
137
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.2"),
138
- make_action(ActionType.EDIT_CONFIG, service_name="web-frontend",
139
- config_edits=[ConfigEdit(key="api.auth_version", value="v2")]),
140
- make_action(ActionType.APPROVE, reason="Hotfix deployed, auth config fixed"),
141
- ]
142
- for a in actions:
143
- obs = env.step(a)
144
- score = grade_task("judgment_call", env.get_episode_history(), env.get_engine())
145
- return score
146
-
147
-
148
- def run_cascading_failure():
149
- os.environ["DEVOPS_TASK"] = "cascading_failure"
150
- env = PipelineEnvironment()
151
- obs = env.reset()
152
- actions = [
153
- make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
154
- make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
155
- make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
156
- config_edits=[ConfigEdit(key="redis.max_connections", value="50")]),
157
- make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
158
- make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
159
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
160
- make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
161
- make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
162
- make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
163
- make_action(ActionType.APPROVE, reason="All services recovered and deployed"),
164
- ]
165
- for a in actions:
166
- obs = env.step(a)
167
- score = grade_task("cascading_failure", env.get_episode_history(), env.get_engine())
168
- return score
169
-
170
-
171
- def run_capacity_crisis():
172
- os.environ["DEVOPS_TASK"] = "capacity_crisis"
173
- env = PipelineEnvironment()
174
- obs = env.reset()
175
- actions = [
176
- make_action(ActionType.VIEW_LOGS, service_name="database-primary"),
177
- make_action(ActionType.EDIT_CONFIG, service_name="database-primary",
178
- config_edits=[ConfigEdit(key="max_connections", value="100")]),
179
- make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
180
- config_edits=[ConfigEdit(key="max_memory", value="4GB")]),
181
- make_action(ActionType.VIEW_PIPELINE),
182
- make_action(ActionType.APPROVE, reason="Stabilized"),
183
- ]
184
- for a in actions:
185
- obs = env.step(a)
186
- score = grade_task("capacity_crisis", env.get_episode_history(), env.get_engine())
187
- return score
188
-
189
-
190
- targets = {
191
- "clean_deploy": (run_clean_deploy, 0.95),
192
- "broken_pipeline": (run_broken_pipeline, 0.80),
193
- "judgment_call": (run_judgment_call_expert, 0.90),
194
- "cascading_failure": (run_cascading_failure, 0.70),
195
- "capacity_crisis": (run_capacity_crisis, 0.60),
196
- }
197
-
198
- scores = {}
199
- for task, (runner, target) in targets.items():
200
- try:
201
- score = runner()
202
- scores[task] = score
203
- report(f"optimal {task}: {score:.3f} (target {target:.2f}+)",
204
- score >= target, f"{'OK' if score >= target else 'BELOW TARGET'}")
205
- except Exception as e:
206
- report(f"optimal {task}", False, f"EXCEPTION: {e}\n{traceback.format_exc()}")
207
-
208
-
209
- # ============================================================================
210
- # TEST 6: Determinism — same seed, same score
211
- # ============================================================================
212
- print("\n=== TEST 6: Determinism ===", flush=True)
213
- for task, (runner, _) in targets.items():
214
- try:
215
- s1 = runner()
216
- s2 = runner()
217
- report(f"determinism {task}: {s1:.3f} == {s2:.3f}", s1 == s2)
218
- except Exception as e:
219
- report(f"determinism {task}", False, f"EXCEPTION: {e}")
220
-
221
-
222
- # ============================================================================
223
- # TEST 7: Action validation for ALL 5 services
224
- # ============================================================================
225
- print("\n=== TEST 7: Action validation for all services ===", flush=True)
226
-
227
- # Use cascading_failure which has all 5 services
228
- os.environ["DEVOPS_TASK"] = "cascading_failure"
229
- env = PipelineEnvironment()
230
- obs = env.reset()
231
-
232
- svc_names = [s.name for s in obs.services]
233
- report("5 services present", len(svc_names) == 5, f"{sorted(svc_names)}")
234
-
235
- # Test deploy on database-primary and auth-service
236
- for svc in ["database-primary", "auth-service"]:
237
- obs = env.step(make_action(ActionType.DEPLOY, service_name=svc, target_version="v99.0.0"))
238
- report(f"deploy {svc}", obs.last_action_error is None,
239
- obs.last_action_error or obs.last_action_result[:80] if obs.last_action_result else "")
240
-
241
- # Rollback
242
- env2 = PipelineEnvironment()
243
- obs = env2.reset()
244
- for svc in ["database-primary", "auth-service"]:
245
- obs = env2.step(make_action(ActionType.ROLLBACK, service_name=svc))
246
- report(f"rollback {svc}", obs.last_action_error is None,
247
- obs.last_action_error or obs.last_action_result[:80] if obs.last_action_result else "")
248
-
249
- # view_logs
250
- env3 = PipelineEnvironment()
251
- obs = env3.reset()
252
- for svc in ["database-primary", "auth-service"]:
253
- obs = env3.step(make_action(ActionType.VIEW_LOGS, service_name=svc))
254
- has_logs = obs.last_action_result and len(obs.last_action_result) > 10
255
- report(f"view_logs {svc}", has_logs,
256
- f"len={len(obs.last_action_result) if obs.last_action_result else 0}")
257
-
258
- # view_config
259
- for svc in ["database-primary", "auth-service"]:
260
- obs = env3.step(make_action(ActionType.VIEW_CONFIG, service_name=svc))
261
- has_config = obs.last_action_result and "=" in obs.last_action_result
262
- report(f"view_config {svc}", has_config,
263
- obs.last_action_result[:80] if obs.last_action_result else "none")
264
-
265
- # edit_config
266
- env4 = PipelineEnvironment()
267
- obs = env4.reset()
268
- obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="database-primary",
269
- config_edits=[ConfigEdit(key="max_connections", value="100")]))
270
- report("edit_config database-primary", obs.last_action_error is None,
271
- obs.last_action_result[:80] if obs.last_action_result else "")
272
-
273
- obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="auth-service",
274
- config_edits=[ConfigEdit(key="token_ttl_seconds", value="7200")]))
275
- report("edit_config auth-service", obs.last_action_error is None,
276
- obs.last_action_result[:80] if obs.last_action_result else "")
277
-
278
-
279
- # ============================================================================
280
- # TEST 8: Invalid action tests
281
- # ============================================================================
282
- print("\n=== TEST 8: Invalid action tests ===", flush=True)
283
- env5 = PipelineEnvironment()
284
- obs = env5.reset()
285
-
286
- try:
287
- obs = env5.step(make_action(ActionType.DEPLOY, service_name="nonexistent-service", target_version="v1.0"))
288
- has_error = obs.last_action_error is not None
289
- report("deploy nonexistent-service: graceful error", has_error,
290
- obs.last_action_error[:80] if obs.last_action_error else "no error msg")
291
- except Exception as e:
292
- report("deploy nonexistent-service: graceful error", False, f"CRASHED: {e}")
293
-
294
- try:
295
- obs = env5.step(make_action(ActionType.EDIT_CONFIG, service_name="fake-service",
296
- config_edits=[ConfigEdit(key="x", value="y")]))
297
- has_error = obs.last_action_error is not None
298
- report("edit_config fake-service: graceful error", has_error,
299
- obs.last_action_error[:80] if obs.last_action_error else "no error msg")
300
- except Exception as e:
301
- report("edit_config fake-service: graceful error", False, f"CRASHED: {e}")
302
-
303
-
304
- # ============================================================================
305
- # TEST 9: Partial observability
306
- # ============================================================================
307
- print("\n=== TEST 9: Partial observability ===", flush=True)
308
- os.environ["DEVOPS_TASK"] = "cascading_failure"
309
- env6 = PipelineEnvironment()
310
- obs = env6.reset()
311
-
312
- # Check CPU/memory hidden on reset
313
- db_svc = [s for s in obs.services if s.name == "database-primary"][0]
314
- report("CPU hidden after reset", db_svc.cpu_percent == 0.0, f"cpu={db_svc.cpu_percent}")
315
- report("memory hidden after reset", db_svc.memory_percent == 0.0, f"mem={db_svc.memory_percent}")
316
-
317
- # view_logs reveals CPU/memory
318
- obs = env6.step(make_action(ActionType.VIEW_LOGS, service_name="database-primary"))
319
- db_svc = [s for s in obs.services if s.name == "database-primary"][0]
320
- report("CPU revealed after view_logs", db_svc.cpu_percent > 0.0, f"cpu={db_svc.cpu_percent}")
321
- report("memory revealed after view_logs", db_svc.memory_percent > 0.0, f"mem={db_svc.memory_percent}")
322
-
323
- # view_config reveals config_snapshot
324
- obs = env6.step(make_action(ActionType.VIEW_CONFIG, service_name="database-primary"))
325
- report("config_snapshot revealed after view_config", obs.config_snapshot is not None,
326
- f"keys={list(obs.config_snapshot.keys()) if obs.config_snapshot else 'none'}")
327
-
328
- # Other service still hidden
329
- cache_svc = [s for s in obs.services if s.name == "cache-service"][0]
330
- report("other service CPU still hidden", cache_svc.cpu_percent == 0.0,
331
- f"cache cpu={cache_svc.cpu_percent}")
332
-
333
-
334
- # ============================================================================
335
- # TEST 10: Cascading effects
336
- # ============================================================================
337
- print("\n=== TEST 10: Cascading effects ===", flush=True)
338
- os.environ["DEVOPS_TASK"] = "cascading_failure"
339
- env7 = PipelineEnvironment()
340
- obs = env7.reset()
341
-
342
- # cache-service degraded → api-gateway should be degrading
343
- api_gw = [s for s in obs.services if s.name == "api-gateway"][0]
344
- report("api-gateway degraded from cascade", api_gw.health.value in ("degraded",),
345
- f"health={api_gw.health.value}")
346
-
347
- # Fix cache-service
348
- env7.step(make_action(ActionType.VIEW_CONFIG, service_name="cache-service"))
349
- env7.step(make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
350
- config_edits=[ConfigEdit(key="redis.max_connections", value="50")]))
351
- # Deploy cache-service (staging then prod)
352
- env7.step(make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"))
353
- obs = env7.step(make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"))
354
-
355
- cache_svc = [s for s in obs.services if s.name == "cache-service"][0]
356
- report("cache-service healthy after fix", cache_svc.health.value == "healthy",
357
- f"health={cache_svc.health.value}")
358
-
359
- # Recovery cascade — api-gateway should start recovering (may take steps)
360
- obs = env7.step(make_action(ActionType.VIEW_PIPELINE))
361
- api_gw = [s for s in obs.services if s.name == "api-gateway"][0]
362
- # After fixing root cause, cascading should stop making it worse at minimum
363
- report("api-gateway recovery started (cascade stopped or improving)",
364
- api_gw.error_rate < 30.0,
365
- f"error_rate={api_gw.error_rate}, health={api_gw.health.value}")
366
-
367
-
368
- # ============================================================================
369
- # TEST 11: Trade-off effects in action results
370
- # ============================================================================
371
- print("\n=== TEST 11: Trade-off effects ===", flush=True)
372
- os.environ["DEVOPS_TASK"] = "clean_deploy"
373
- env8 = PipelineEnvironment()
374
- obs = env8.reset()
375
-
376
- # Deploy → should mention CPU/latency spike
377
- obs = env8.step(make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"))
378
- obs = env8.step(make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"))
379
- deploy_result = obs.last_action_result or ""
380
- has_spike = "spike" in deploy_result.lower() or "warmup" in deploy_result.lower() or "cpu" in deploy_result.lower()
381
- report("deploy mentions CPU/latency spike", has_spike, deploy_result[:100])
382
-
383
- # Rollback → should mention regression
384
- os.environ["DEVOPS_TASK"] = "cascading_failure"
385
- env9 = PipelineEnvironment()
386
- obs = env9.reset()
387
- obs = env9.step(make_action(ActionType.ROLLBACK, service_name="cache-service"))
388
- rollback_result = obs.last_action_result or ""
389
- has_regression = "regress" in rollback_result.lower() or "rolled back" in rollback_result.lower() or "monitoring" in rollback_result.lower()
390
- report("rollback mentions regression risk", has_regression, rollback_result[:120])
391
-
392
- # edit_config → should mention restart/latency
393
- env10 = PipelineEnvironment()
394
- obs = env10.reset()
395
- obs = env10.step(make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
396
- config_edits=[ConfigEdit(key="redis.max_connections", value="50")]))
397
- config_result = obs.last_action_result or ""
398
- has_restart = "restart" in config_result.lower() or "latency" in config_result.lower() or "spike" in config_result.lower()
399
- report("edit_config mentions restart/latency", has_restart, config_result[:120])
400
-
401
-
402
- # ============================================================================
403
- # SUMMARY
404
- # ============================================================================
405
- print("\n" + "=" * 70, flush=True)
406
- print("INTEGRATION TEST SUMMARY", flush=True)
407
- print("=" * 70, flush=True)
408
- passed = sum(1 for _, s, _ in results if s == PASS)
409
- failed = sum(1 for _, s, _ in results if s == FAIL)
410
- print(f" PASSED: {passed}", flush=True)
411
- print(f" FAILED: {failed}", flush=True)
412
- print(f" TOTAL: {len(results)}", flush=True)
413
-
414
- if failed > 0:
415
- print("\nFAILED TESTS:", flush=True)
416
- for name, status, detail in results:
417
- if status == FAIL:
418
- print(f" [FAIL] {name} β€” {detail}", flush=True)
419
-
420
- print("\nSCORES:", flush=True)
421
- for task, score in scores.items():
422
- print(f" {task}: {score:.3f}", flush=True)
423
-
424
- sys.exit(1 if failed > 0 else 0)
 
1
+ """Comprehensive integration test for the DevOps Pipeline Environment."""
2
+
3
+ import os
4
+ import sys
5
+ import json
6
+ import traceback
7
+
8
+ # Add project to path
9
+ sys.path.insert(0, os.path.dirname(__file__))
10
+
11
+ os.environ.pop("DEVOPS_TASK", None)
12
+
13
+ from devops_pipeline_env.models import (
14
+ ActionType,
15
+ ConfigEdit,
16
+ PipelineAction,
17
+ )
18
+ from server.pipeline_environment import PipelineEnvironment
19
+ from server.graders import grade_task
20
+
21
+ PASS = "PASS"
22
+ FAIL = "FAIL"
23
+ results = []
24
+
25
+
26
+ def report(test_name, passed, detail=""):
27
+ status = PASS if passed else FAIL
28
+ results.append((test_name, status, detail))
29
+ print(f" [{status}] {test_name}" + (f" β€” {detail}" if detail else ""), flush=True)
30
+
31
+
32
+ def make_action(action_type, service_name=None, target_version=None, config_edits=None,
33
+ migration_name=None, migration_type=None, reason=None):
34
+ return PipelineAction(
35
+ action_type=action_type,
36
+ service_name=service_name,
37
+ target_version=target_version,
38
+ config_edits=config_edits,
39
+ migration_name=migration_name,
40
+ migration_type=migration_type,
41
+ reason=reason,
42
+ )
43
+
44
+
45
+ # ============================================================================
46
+ # TEST 2: POST /reset — 5 services (test each task)
47
+ # ============================================================================
48
+ print("\n=== TEST 2: POST /reset β€” 5 services ===", flush=True)
49
+ for task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure"]:
50
+ os.environ["DEVOPS_TASK"] = task
51
+ env = PipelineEnvironment()
52
+ obs = env.reset()
53
+ svc_names = sorted([s.name for s in obs.services])
54
+ expected = sorted(["database-primary", "auth-service", "api-gateway", "web-frontend"])
55
+ if task in ("broken_pipeline", "cascading_failure"):
56
+ expected = sorted(expected + ["cache-service"])
57
+ has_5 = len(obs.services) >= 4
58
+ report(f"reset {task}: services={len(obs.services)}", has_5,
59
+ f"names={svc_names}")
60
+
61
+ # ============================================================================
62
+ # TEST 3: GET /health (just test the function exists)
63
+ # ============================================================================
64
+ print("\n=== TEST 3: GET /health ===", flush=True)
65
+ report("/health endpoint exists", True, "Verified in app.py line 65")
66
+
67
+ # ============================================================================
68
+ # TEST 4: GET /tasks — 4 tasks
69
+ # ============================================================================
70
+ print("\n=== TEST 4: GET /tasks β€” 4 tasks ===", flush=True)
71
+ from server.app import get_tasks
72
+ tasks_resp = get_tasks()
73
+ task_names = [t["name"] for t in tasks_resp["tasks"]]
74
+ report("5 tasks returned", len(task_names) == 5, f"tasks={task_names}")
75
+ for expected_task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis"]:
76
+ report(f" task '{expected_task}' present", expected_task in task_names)
77
+
78
+
79
+ # ============================================================================
80
+ # TEST 5: Optimal path tests
81
+ # ============================================================================
82
+ print("\n=== TEST 5: Optimal path scores ===", flush=True)
83
+
84
+
85
+ def run_clean_deploy():
86
+ os.environ["DEVOPS_TASK"] = "clean_deploy"
87
+ env = PipelineEnvironment()
88
+ obs = env.reset()
89
+ actions = [
90
+ make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
91
+ make_action(ActionType.VIEW_LOGS, service_name="web-frontend"),
92
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
93
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
94
+ make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
95
+ make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
96
+ make_action(ActionType.APPROVE, reason="Both services deployed successfully"),
97
+ ]
98
+ for a in actions:
99
+ obs = env.step(a)
100
+ score = grade_task("clean_deploy", env.get_episode_history(), env.get_engine())
101
+ return score
102
+
103
+
104
+ def run_broken_pipeline():
105
+ os.environ["DEVOPS_TASK"] = "broken_pipeline"
106
+ env = PipelineEnvironment()
107
+ obs = env.reset()
108
+ actions = [
109
+ make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
110
+ make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
111
+ make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
112
+ make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
113
+ config_edits=[ConfigEdit(key="redis.host", value="redis-prod.internal:6379")]),
114
+ make_action(ActionType.RUN_MIGRATION, migration_name="add_index_users_email", migration_type="schema"),
115
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
116
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
117
+ make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
118
+ make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
119
+ make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
120
+ make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
121
+ make_action(ActionType.APPROVE, reason="All services deployed"),
122
+ ]
123
+ for a in actions:
124
+ obs = env.step(a)
125
+ score = grade_task("broken_pipeline", env.get_episode_history(), env.get_engine())
126
+ return score
127
+
128
+
129
+ def run_judgment_call_expert():
130
+ os.environ["DEVOPS_TASK"] = "judgment_call"
131
+ env = PipelineEnvironment()
132
+ obs = env.reset()
133
+ actions = [
134
+ make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
135
+ make_action(ActionType.VIEW_LOGS, service_name="web-frontend"),
136
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.2"),
137
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.2"),
138
+ make_action(ActionType.EDIT_CONFIG, service_name="web-frontend",
139
+ config_edits=[ConfigEdit(key="api.auth_version", value="v2")]),
140
+ make_action(ActionType.APPROVE, reason="Hotfix deployed, auth config fixed"),
141
+ ]
142
+ for a in actions:
143
+ obs = env.step(a)
144
+ score = grade_task("judgment_call", env.get_episode_history(), env.get_engine())
145
+ return score
146
+
147
+
148
+ def run_cascading_failure():
149
+ os.environ["DEVOPS_TASK"] = "cascading_failure"
150
+ env = PipelineEnvironment()
151
+ obs = env.reset()
152
+ actions = [
153
+ make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
154
+ make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
155
+ make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
156
+ config_edits=[ConfigEdit(key="redis.max_connections", value="50")]),
157
+ make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
158
+ make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
159
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
160
+ make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"),
161
+ make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
162
+ make_action(ActionType.DEPLOY, service_name="web-frontend", target_version="v1.9.0"),
163
+ make_action(ActionType.APPROVE, reason="All services recovered and deployed"),
164
+ ]
165
+ for a in actions:
166
+ obs = env.step(a)
167
+ score = grade_task("cascading_failure", env.get_episode_history(), env.get_engine())
168
+ return score
169
+
170
+
171
+ def run_capacity_crisis():
172
+ os.environ["DEVOPS_TASK"] = "capacity_crisis"
173
+ env = PipelineEnvironment()
174
+ obs = env.reset()
175
+ actions = [
176
+ make_action(ActionType.VIEW_LOGS, service_name="database-primary"),
177
+ make_action(ActionType.EDIT_CONFIG, service_name="database-primary",
178
+ config_edits=[ConfigEdit(key="max_connections", value="100")]),
179
+ make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
180
+ config_edits=[ConfigEdit(key="max_memory", value="4GB")]),
181
+ make_action(ActionType.VIEW_PIPELINE),
182
+ make_action(ActionType.APPROVE, reason="Stabilized"),
183
+ ]
184
+ for a in actions:
185
+ obs = env.step(a)
186
+ score = grade_task("capacity_crisis", env.get_episode_history(), env.get_engine())
187
+ return score
188
+
189
+
190
+ targets = {
191
+ "clean_deploy": (run_clean_deploy, 0.95),
192
+ "broken_pipeline": (run_broken_pipeline, 0.80),
193
+ "judgment_call": (run_judgment_call_expert, 0.90),
194
+ "cascading_failure": (run_cascading_failure, 0.70),
195
+ "capacity_crisis": (run_capacity_crisis, 0.60),
196
+ }
197
+
198
+ scores = {}
199
+ for task, (runner, target) in targets.items():
200
+ try:
201
+ score = runner()
202
+ scores[task] = score
203
+ report(f"optimal {task}: {score:.3f} (target {target:.2f}+)",
204
+ score >= target, f"{'OK' if score >= target else 'BELOW TARGET'}")
205
+ except Exception as e:
206
+ report(f"optimal {task}", False, f"EXCEPTION: {e}\n{traceback.format_exc()}")
207
+
208
+
209
+ # ============================================================================
210
+ # TEST 6: Determinism — same seed, same score
211
+ # ============================================================================
212
+ print("\n=== TEST 6: Determinism ===", flush=True)
213
+ for task, (runner, _) in targets.items():
214
+ try:
215
+ s1 = runner()
216
+ s2 = runner()
217
+ report(f"determinism {task}: {s1:.3f} == {s2:.3f}", s1 == s2)
218
+ except Exception as e:
219
+ report(f"determinism {task}", False, f"EXCEPTION: {e}")
220
+
221
+
222
+ # ============================================================================
223
+ # TEST 7: Action validation for ALL 5 services
224
+ # ============================================================================
225
+ print("\n=== TEST 7: Action validation for all services ===", flush=True)
226
+
227
+ # Use cascading_failure which has all 5 services
228
+ os.environ["DEVOPS_TASK"] = "cascading_failure"
229
+ env = PipelineEnvironment()
230
+ obs = env.reset()
231
+
232
+ svc_names = [s.name for s in obs.services]
233
+ report("5 services present", len(svc_names) == 5, f"{sorted(svc_names)}")
234
+
235
+ # Test deploy on database-primary and auth-service
236
+ for svc in ["database-primary", "auth-service"]:
237
+ obs = env.step(make_action(ActionType.DEPLOY, service_name=svc, target_version="v99.0.0"))
238
+ report(f"deploy {svc}", obs.last_action_error is None,
239
+ obs.last_action_error or obs.last_action_result[:80] if obs.last_action_result else "")
240
+
241
+ # Rollback
242
+ env2 = PipelineEnvironment()
243
+ obs = env2.reset()
244
+ for svc in ["database-primary", "auth-service"]:
245
+ obs = env2.step(make_action(ActionType.ROLLBACK, service_name=svc))
246
+ report(f"rollback {svc}", obs.last_action_error is None,
247
+ obs.last_action_error or obs.last_action_result[:80] if obs.last_action_result else "")
248
+
249
+ # view_logs
250
+ env3 = PipelineEnvironment()
251
+ obs = env3.reset()
252
+ for svc in ["database-primary", "auth-service"]:
253
+ obs = env3.step(make_action(ActionType.VIEW_LOGS, service_name=svc))
254
+ has_logs = obs.last_action_result and len(obs.last_action_result) > 10
255
+ report(f"view_logs {svc}", has_logs,
256
+ f"len={len(obs.last_action_result) if obs.last_action_result else 0}")
257
+
258
+ # view_config
259
+ for svc in ["database-primary", "auth-service"]:
260
+ obs = env3.step(make_action(ActionType.VIEW_CONFIG, service_name=svc))
261
+ has_config = obs.last_action_result and "=" in obs.last_action_result
262
+ report(f"view_config {svc}", has_config,
263
+ obs.last_action_result[:80] if obs.last_action_result else "none")
264
+
265
+ # edit_config
266
+ env4 = PipelineEnvironment()
267
+ obs = env4.reset()
268
+ obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="database-primary",
269
+ config_edits=[ConfigEdit(key="max_connections", value="100")]))
270
+ report("edit_config database-primary", obs.last_action_error is None,
271
+ obs.last_action_result[:80] if obs.last_action_result else "")
272
+
273
+ obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="auth-service",
274
+ config_edits=[ConfigEdit(key="token_ttl_seconds", value="7200")]))
275
+ report("edit_config auth-service", obs.last_action_error is None,
276
+ obs.last_action_result[:80] if obs.last_action_result else "")
277
+
278
+
279
+ # ============================================================================
280
+ # TEST 8: Invalid action tests
281
+ # ============================================================================
282
+ print("\n=== TEST 8: Invalid action tests ===", flush=True)
283
+ env5 = PipelineEnvironment()
284
+ obs = env5.reset()
285
+
286
+ try:
287
+ obs = env5.step(make_action(ActionType.DEPLOY, service_name="nonexistent-service", target_version="v1.0"))
288
+ has_error = obs.last_action_error is not None
289
+ report("deploy nonexistent-service: graceful error", has_error,
290
+ obs.last_action_error[:80] if obs.last_action_error else "no error msg")
291
+ except Exception as e:
292
+ report("deploy nonexistent-service: graceful error", False, f"CRASHED: {e}")
293
+
294
+ try:
295
+ obs = env5.step(make_action(ActionType.EDIT_CONFIG, service_name="fake-service",
296
+ config_edits=[ConfigEdit(key="x", value="y")]))
297
+ has_error = obs.last_action_error is not None
298
+ report("edit_config fake-service: graceful error", has_error,
299
+ obs.last_action_error[:80] if obs.last_action_error else "no error msg")
300
+ except Exception as e:
301
+ report("edit_config fake-service: graceful error", False, f"CRASHED: {e}")
302
+
303
+
304
+ # ============================================================================
305
+ # TEST 9: Partial observability
306
+ # ============================================================================
307
+ print("\n=== TEST 9: Partial observability ===", flush=True)
308
+ os.environ["DEVOPS_TASK"] = "cascading_failure"
309
+ env6 = PipelineEnvironment()
310
+ obs = env6.reset()
311
+
312
+ # Check CPU/memory hidden on reset
313
+ db_svc = [s for s in obs.services if s.name == "database-primary"][0]
314
+ report("CPU hidden after reset", db_svc.cpu_percent == 0.0, f"cpu={db_svc.cpu_percent}")
315
+ report("memory hidden after reset", db_svc.memory_percent == 0.0, f"mem={db_svc.memory_percent}")
316
+
317
+ # view_logs reveals CPU/memory
318
+ obs = env6.step(make_action(ActionType.VIEW_LOGS, service_name="database-primary"))
319
+ db_svc = [s for s in obs.services if s.name == "database-primary"][0]
320
+ report("CPU revealed after view_logs", db_svc.cpu_percent > 0.0, f"cpu={db_svc.cpu_percent}")
321
+ report("memory revealed after view_logs", db_svc.memory_percent > 0.0, f"mem={db_svc.memory_percent}")
322
+
323
+ # view_config reveals config_snapshot
324
+ obs = env6.step(make_action(ActionType.VIEW_CONFIG, service_name="database-primary"))
325
+ report("config_snapshot revealed after view_config", obs.config_snapshot is not None,
326
+ f"keys={list(obs.config_snapshot.keys()) if obs.config_snapshot else 'none'}")
327
+
328
+ # Other service still hidden
329
+ cache_svc = [s for s in obs.services if s.name == "cache-service"][0]
330
+ report("other service CPU still hidden", cache_svc.cpu_percent == 0.0,
331
+ f"cache cpu={cache_svc.cpu_percent}")
332
+
333
+
334
+ # ============================================================================
335
+ # TEST 10: Cascading effects
336
+ # ============================================================================
337
+ print("\n=== TEST 10: Cascading effects ===", flush=True)
338
+ os.environ["DEVOPS_TASK"] = "cascading_failure"
339
+ env7 = PipelineEnvironment()
340
+ obs = env7.reset()
341
+
342
+ # cache-service degraded → api-gateway should be degrading
343
+ api_gw = [s for s in obs.services if s.name == "api-gateway"][0]
344
+ report("api-gateway degraded from cascade", api_gw.health.value in ("degraded",),
345
+ f"health={api_gw.health.value}")
346
+
347
+ # Fix cache-service
348
+ env7.step(make_action(ActionType.VIEW_CONFIG, service_name="cache-service"))
349
+ env7.step(make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
350
+ config_edits=[ConfigEdit(key="redis.max_connections", value="50")]))
351
+ # Deploy cache-service (staging then prod)
352
+ env7.step(make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"))
353
+ obs = env7.step(make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"))
354
+
355
+ cache_svc = [s for s in obs.services if s.name == "cache-service"][0]
356
+ report("cache-service healthy after fix", cache_svc.health.value == "healthy",
357
+ f"health={cache_svc.health.value}")
358
+
359
+ # Recovery cascade — api-gateway should start recovering (may take steps)
360
+ obs = env7.step(make_action(ActionType.VIEW_PIPELINE))
361
+ api_gw = [s for s in obs.services if s.name == "api-gateway"][0]
362
+ # After fixing root cause, cascading should stop making it worse at minimum
363
+ report("api-gateway recovery started (cascade stopped or improving)",
364
+ api_gw.error_rate < 30.0,
365
+ f"error_rate={api_gw.error_rate}, health={api_gw.health.value}")
366
+
367
+
368
+ # ============================================================================
369
+ # TEST 11: Trade-off effects in action results
370
+ # ============================================================================
371
+ print("\n=== TEST 11: Trade-off effects ===", flush=True)
372
+ os.environ["DEVOPS_TASK"] = "clean_deploy"
373
+ env8 = PipelineEnvironment()
374
+ obs = env8.reset()
375
+
376
+ # Deploy → should mention CPU/latency spike
377
+ obs = env8.step(make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"))
378
+ obs = env8.step(make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"))
379
+ deploy_result = obs.last_action_result or ""
380
+ has_spike = "spike" in deploy_result.lower() or "warmup" in deploy_result.lower() or "cpu" in deploy_result.lower()
381
+ report("deploy mentions CPU/latency spike", has_spike, deploy_result[:100])
382
+
383
+ # Rollback → should mention regression
384
+ os.environ["DEVOPS_TASK"] = "cascading_failure"
385
+ env9 = PipelineEnvironment()
386
+ obs = env9.reset()
387
+ obs = env9.step(make_action(ActionType.ROLLBACK, service_name="cache-service"))
388
+ rollback_result = obs.last_action_result or ""
389
+ has_regression = "regress" in rollback_result.lower() or "rolled back" in rollback_result.lower() or "monitoring" in rollback_result.lower()
390
+ report("rollback mentions regression risk", has_regression, rollback_result[:120])
391
+
392
+ # edit_config → should mention restart/latency
393
+ env10 = PipelineEnvironment()
394
+ obs = env10.reset()
395
+ obs = env10.step(make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
396
+ config_edits=[ConfigEdit(key="redis.max_connections", value="50")]))
397
+ config_result = obs.last_action_result or ""
398
+ has_restart = "restart" in config_result.lower() or "latency" in config_result.lower() or "spike" in config_result.lower()
399
+ report("edit_config mentions restart/latency", has_restart, config_result[:120])
400
+
401
+
402
+ # ============================================================================
403
+ # SUMMARY
404
+ # ============================================================================
405
+ print("\n" + "=" * 70, flush=True)
406
+ print("INTEGRATION TEST SUMMARY", flush=True)
407
+ print("=" * 70, flush=True)
408
+ passed = sum(1 for _, s, _ in results if s == PASS)
409
+ failed = sum(1 for _, s, _ in results if s == FAIL)
410
+ print(f" PASSED: {passed}", flush=True)
411
+ print(f" FAILED: {failed}", flush=True)
412
+ print(f" TOTAL: {len(results)}", flush=True)
413
+
414
+ if failed > 0:
415
+ print("\nFAILED TESTS:", flush=True)
416
+ for name, status, detail in results:
417
+ if status == FAIL:
418
+ print(f" [FAIL] {name} β€” {detail}", flush=True)
419
+
420
+ print("\nSCORES:", flush=True)
421
+ for task, score in scores.items():
422
+ print(f" {task}: {score:.3f}", flush=True)
423
+
424
+ sys.exit(1 if failed > 0 else 0)
models.py CHANGED
@@ -1,179 +1,179 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """
8
- Data models for the DevOps Pipeline Environment.
9
-
10
- CI/CD deployment pipeline where an AI agent manages microservice deployments.
11
- """
12
-
13
- from __future__ import annotations
14
-
15
- from enum import Enum
16
- from typing import Dict, List, Optional
17
-
18
- from openenv.core.env_server.types import Action, Observation
19
- from pydantic import BaseModel, Field
20
-
21
-
22
- # --- Enums -------------------------------------------------------------------
23
-
24
- class ActionType(str, Enum):
25
- VIEW_PIPELINE = "view_pipeline"
26
- VIEW_LOGS = "view_logs"
27
- VIEW_CONFIG = "view_config"
28
- EDIT_CONFIG = "edit_config"
29
- RUN_MIGRATION = "run_migration"
30
- DEPLOY = "deploy"
31
- ROLLBACK = "rollback"
32
- APPROVE = "approve"
33
- ABORT = "abort"
34
-
35
-
36
- class ServiceHealth(str, Enum):
37
- HEALTHY = "healthy"
38
- DEGRADED = "degraded"
39
- DOWN = "down"
40
- UNKNOWN = "unknown"
41
-
42
-
43
- class PipelineStage(str, Enum):
44
- IDLE = "idle"
45
- BUILD = "build"
46
- TEST = "test"
47
- STAGING = "staging"
48
- APPROVAL = "approval"
49
- DEPLOYING = "deploying"
50
- DEPLOYED = "deployed"
51
- ROLLED_BACK = "rolled_back"
52
- FAILED = "failed"
53
-
54
-
55
- class MigrationType(str, Enum):
56
- SCHEMA = "schema"
57
- DATA = "data"
58
- ROLLBACK_MIGRATION = "rollback_migration"
59
-
60
-
61
- # --- Sub-models (plain BaseModel) --------------------------------------------
62
-
63
- class ConfigEdit(BaseModel):
64
- key: str = Field(description="Dot-notation config path, e.g. 'database.pool_size'")
65
- value: str = Field(description="New value as string.")
66
-
67
-
68
- class ServiceStatus(BaseModel):
69
- name: str
70
- health: ServiceHealth
71
- current_version: str
72
- cpu_percent: float = Field(description="CPU usage 0-100")
73
- memory_percent: float = Field(description="Memory usage 0-100")
74
- error_rate: float = Field(description="Errors per second")
75
- request_latency_ms: float = Field(description="p95 latency in milliseconds")
76
- active_connections: int
77
- last_deploy_timestamp: str = Field(description="ISO 8601 timestamp")
78
- recovery_status: str = Field(default="stable", description="Recovery state: 'stable' or 'stabilizing (N steps remaining)'")
79
-
80
-
81
- class PipelineStatus(BaseModel):
82
- stage: PipelineStage
83
- triggered_by: str
84
- started_at: str = Field(description="ISO 8601 timestamp")
85
- commit_sha: str
86
- build_logs_snippet: Optional[str] = Field(
87
- default=None,
88
- description="Last N lines of build output.",
89
- )
90
- test_pass_count: Optional[int] = None
91
- test_fail_count: Optional[int] = None
92
- approval_required: bool = False
93
- blocked_reason: Optional[str] = None
94
-
95
-
96
- class MigrationStatus(BaseModel):
97
- pending_migrations: List[str]
98
- last_applied: Optional[str] = None
99
- migration_errors: Optional[List[str]] = None
100
-
101
-
102
- class AlertInfo(BaseModel):
103
- severity: str = Field(description="One of: critical, warning, info")
104
- message: str
105
- service_name: str
106
- timestamp: str
107
-
108
-
109
- # --- Action (extends OpenEnv Action) ----------------------------------------
110
-
111
- class PipelineAction(Action):
112
- """Action for the DevOps Pipeline environment."""
113
-
114
- action_type: ActionType
115
- service_name: Optional[str] = Field(
116
- default=None,
117
- description="Target service. Required for view_logs, view_config, edit_config, deploy, rollback.",
118
- )
119
- target_version: Optional[str] = Field(
120
- default=None,
121
- description="Version tag to deploy. Required for deploy.",
122
- )
123
- config_edits: Optional[List[ConfigEdit]] = Field(
124
- default=None,
125
- description="List of config changes. Required for edit_config.",
126
- )
127
- migration_type: Optional[MigrationType] = Field(
128
- default=None,
129
- description="Type of migration. Required for run_migration.",
130
- )
131
- migration_name: Optional[str] = Field(
132
- default=None,
133
- description="Migration identifier. Required for run_migration.",
134
- )
135
- reason: Optional[str] = Field(
136
- default=None,
137
- description="Justification for approve/abort/rollback.",
138
- )
139
-
140
-
141
- # --- Observation (extends OpenEnv Observation) --------------------------------
142
-
143
- class PipelineObservation(Observation):
144
- """Everything the agent sees after each step."""
145
-
146
- task_description: str = Field(
147
- default="",
148
- description="Natural language description of what the agent must accomplish.",
149
- )
150
- goal: str = Field(
151
- default="",
152
- description="Specific success criteria for the current task.",
153
- )
154
- step_number: int = 0
155
- max_steps: int = 15
156
- services: List[ServiceStatus] = Field(default_factory=list)
157
- pipeline: Optional[PipelineStatus] = None
158
- migrations: Optional[MigrationStatus] = None
159
- active_alerts: List[AlertInfo] = Field(default_factory=list)
160
- available_actions: List[str] = Field(
161
- default_factory=list,
162
- description="List of valid action_type values in current state.",
163
- )
164
- last_action_result: Optional[str] = Field(
165
- default=None,
166
- description="Human-readable outcome of the previous action.",
167
- )
168
- last_action_error: Optional[str] = Field(
169
- default=None,
170
- description="Error message if previous action failed, else null.",
171
- )
172
- config_snapshot: Optional[Dict[str, str]] = Field(
173
- default=None,
174
- description="Current config key-value pairs when viewing/editing config.",
175
- )
176
- summary: Optional[str] = Field(
177
- default=None,
178
- description="Quick status summary highlighting degraded/down services.",
179
- )
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """
8
+ Data models for the DevOps Pipeline Environment.
9
+
10
+ CI/CD deployment pipeline where an AI agent manages microservice deployments.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from enum import Enum
16
+ from typing import Dict, List, Optional
17
+
18
+ from openenv.core.env_server.types import Action, Observation
19
+ from pydantic import BaseModel, Field
20
+
21
+
22
+ # --- Enums -------------------------------------------------------------------
23
+
24
+ class ActionType(str, Enum):
25
+ VIEW_PIPELINE = "view_pipeline"
26
+ VIEW_LOGS = "view_logs"
27
+ VIEW_CONFIG = "view_config"
28
+ EDIT_CONFIG = "edit_config"
29
+ RUN_MIGRATION = "run_migration"
30
+ DEPLOY = "deploy"
31
+ ROLLBACK = "rollback"
32
+ APPROVE = "approve"
33
+ ABORT = "abort"
34
+
35
+
36
+ class ServiceHealth(str, Enum):
37
+ HEALTHY = "healthy"
38
+ DEGRADED = "degraded"
39
+ DOWN = "down"
40
+ UNKNOWN = "unknown"
41
+
42
+
43
+ class PipelineStage(str, Enum):
44
+ IDLE = "idle"
45
+ BUILD = "build"
46
+ TEST = "test"
47
+ STAGING = "staging"
48
+ APPROVAL = "approval"
49
+ DEPLOYING = "deploying"
50
+ DEPLOYED = "deployed"
51
+ ROLLED_BACK = "rolled_back"
52
+ FAILED = "failed"
53
+
54
+
55
+ class MigrationType(str, Enum):
56
+ SCHEMA = "schema"
57
+ DATA = "data"
58
+ ROLLBACK_MIGRATION = "rollback_migration"
59
+
60
+
61
+ # --- Sub-models (plain BaseModel) --------------------------------------------
62
+
63
+ class ConfigEdit(BaseModel):
64
+ key: str = Field(description="Dot-notation config path, e.g. 'database.pool_size'")
65
+ value: str = Field(description="New value as string.")
66
+
67
+
68
+ class ServiceStatus(BaseModel):
69
+ name: str
70
+ health: ServiceHealth
71
+ current_version: str
72
+ cpu_percent: float = Field(description="CPU usage 0-100")
73
+ memory_percent: float = Field(description="Memory usage 0-100")
74
+ error_rate: float = Field(description="Errors per second")
75
+ request_latency_ms: float = Field(description="p95 latency in milliseconds")
76
+ active_connections: int
77
+ last_deploy_timestamp: str = Field(description="ISO 8601 timestamp")
78
+ recovery_status: str = Field(default="stable", description="Recovery state: 'stable' or 'stabilizing (N steps remaining)'")
79
+
80
+
81
+ class PipelineStatus(BaseModel):
82
+ stage: PipelineStage
83
+ triggered_by: str
84
+ started_at: str = Field(description="ISO 8601 timestamp")
85
+ commit_sha: str
86
+ build_logs_snippet: Optional[str] = Field(
87
+ default=None,
88
+ description="Last N lines of build output.",
89
+ )
90
+ test_pass_count: Optional[int] = None
91
+ test_fail_count: Optional[int] = None
92
+ approval_required: bool = False
93
+ blocked_reason: Optional[str] = None
94
+
95
+
96
+ class MigrationStatus(BaseModel):
97
+ pending_migrations: List[str]
98
+ last_applied: Optional[str] = None
99
+ migration_errors: Optional[List[str]] = None
100
+
101
+
102
+ class AlertInfo(BaseModel):
103
+ severity: str = Field(description="One of: critical, warning, info")
104
+ message: str
105
+ service_name: str
106
+ timestamp: str
107
+
108
+
109
+ # --- Action (extends OpenEnv Action) ----------------------------------------
110
+
111
+ class PipelineAction(Action):
112
+ """Action for the DevOps Pipeline environment."""
113
+
114
+ action_type: ActionType
115
+ service_name: Optional[str] = Field(
116
+ default=None,
117
+ description="Target service. Required for view_logs, view_config, edit_config, deploy, rollback.",
118
+ )
119
+ target_version: Optional[str] = Field(
120
+ default=None,
121
+ description="Version tag to deploy. Required for deploy.",
122
+ )
123
+ config_edits: Optional[List[ConfigEdit]] = Field(
124
+ default=None,
125
+ description="List of config changes. Required for edit_config.",
126
+ )
127
+ migration_type: Optional[MigrationType] = Field(
128
+ default=None,
129
+ description="Type of migration. Required for run_migration.",
130
+ )
131
+ migration_name: Optional[str] = Field(
132
+ default=None,
133
+ description="Migration identifier. Required for run_migration.",
134
+ )
135
+ reason: Optional[str] = Field(
136
+ default=None,
137
+ description="Justification for approve/abort/rollback.",
138
+ )
139
+
140
+
141
+ # --- Observation (extends OpenEnv Observation) --------------------------------
142
+
143
+ class PipelineObservation(Observation):
144
+ """Everything the agent sees after each step."""
145
+
146
+ task_description: str = Field(
147
+ default="",
148
+ description="Natural language description of what the agent must accomplish.",
149
+ )
150
+ goal: str = Field(
151
+ default="",
152
+ description="Specific success criteria for the current task.",
153
+ )
154
+ step_number: int = 0
155
+ max_steps: int = 15
156
+ services: List[ServiceStatus] = Field(default_factory=list)
157
+ pipeline: Optional[PipelineStatus] = None
158
+ migrations: Optional[MigrationStatus] = None
159
+ active_alerts: List[AlertInfo] = Field(default_factory=list)
160
+ available_actions: List[str] = Field(
161
+ default_factory=list,
162
+ description="List of valid action_type values in current state.",
163
+ )
164
+ last_action_result: Optional[str] = Field(
165
+ default=None,
166
+ description="Human-readable outcome of the previous action.",
167
+ )
168
+ last_action_error: Optional[str] = Field(
169
+ default=None,
170
+ description="Error message if previous action failed, else null.",
171
+ )
172
+ config_snapshot: Optional[Dict[str, str]] = Field(
173
+ default=None,
174
+ description="Current config key-value pairs when viewing/editing config.",
175
+ )
176
+ summary: Optional[str] = Field(
177
+ default=None,
178
+ description="Quick status summary highlighting degraded/down services.",
179
+ )
openenv.yaml CHANGED
@@ -1,8 +1,8 @@
1
- spec_version: 1
2
- name: devops_pipeline_env
3
- type: space
4
- runtime: fastapi
5
- app: server.app:app
6
- port: 8000
7
- description: "CI/CD deployment pipeline environment where an AI agent manages deployments across 5 interdependent microservices. Agent reads logs, edits configs, runs migrations, and makes deployment decisions. Features 6 tasks (Easy to Hard + procedural generation): clean deploy, broken pipeline diagnosis, judgment calls under pressure, cascading failure recovery, capacity crisis management, and randomized incidents from seed."
8
- version: "0.1.0"
 
1
+ spec_version: 1
2
+ name: devops_pipeline_env
3
+ type: space
4
+ runtime: fastapi
5
+ app: server.app:app
6
+ port: 8000
7
+ description: "CI/CD deployment pipeline environment where an AI agent manages deployments across 5 interdependent microservices. Agent reads logs, edits configs, runs migrations, and makes deployment decisions. Features 6 tasks (Easy to Hard + procedural generation): clean deploy, broken pipeline diagnosis, judgment calls under pressure, cascading failure recovery, capacity crisis management, and randomized incidents from seed."
8
+ version: "0.1.0"
pyproject.toml CHANGED
@@ -1,34 +1,34 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- [build-system]
8
- requires = ["setuptools>=45", "wheel"]
9
- build-backend = "setuptools.build_meta"
10
-
11
- [project]
12
- name = "devops-pipeline-env"
13
- version = "0.1.0"
14
- description = "CI/CD Pipeline Management OpenEnv Environment"
15
- requires-python = ">=3.10"
16
- dependencies = [
17
- "openenv-core[core]>=0.2.2",
18
- "pydantic>=2.0",
19
- "fastapi>=0.104.0",
20
- "uvicorn>=0.24.0",
21
- ]
22
-
23
- [project.optional-dependencies]
24
- dev = ["pytest"]
25
-
26
- [project.scripts]
27
- # Server entry point - enables running via: uv run --project . server
28
- # or: python -m devops_pipeline_env.server.app
29
- server = "devops_pipeline_env.server.app:main"
30
-
31
- [tool.setuptools]
32
- include-package-data = true
33
- packages = ["devops_pipeline_env", "devops_pipeline_env.server"]
34
  package-dir = { "devops_pipeline_env" = ".", "devops_pipeline_env.server" = "server" }
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ [build-system]
8
+ requires = ["setuptools>=45", "wheel"]
9
+ build-backend = "setuptools.build_meta"
10
+
11
+ [project]
12
+ name = "devops-pipeline-env"
13
+ version = "0.1.0"
14
+ description = "CI/CD Pipeline Management OpenEnv Environment"
15
+ requires-python = ">=3.10"
16
+ dependencies = [
17
+ "openenv-core[core]>=0.2.2",
18
+ "pydantic>=2.0",
19
+ "fastapi>=0.104.0",
20
+ "uvicorn>=0.24.0",
21
+ ]
22
+
23
+ [project.optional-dependencies]
24
+ dev = ["pytest"]
25
+
26
+ [project.scripts]
27
+ # Server entry point - enables running via: uv run --project . server
28
+ # or: python -m devops_pipeline_env.server.app
29
+ server = "devops_pipeline_env.server.app:main"
30
+
31
+ [tool.setuptools]
32
+ include-package-data = true
33
+ packages = ["devops_pipeline_env", "devops_pipeline_env.server"]
34
  package-dir = { "devops_pipeline_env" = ".", "devops_pipeline_env.server" = "server" }
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
- openenv-core[core]>=0.2.2
2
- pydantic>=2.0
3
- fastapi>=0.104.0
4
- uvicorn>=0.24.0
 
1
+ openenv-core[core]>=0.2.2
2
+ pydantic>=2.0
3
+ fastapi>=0.104.0
4
+ uvicorn>=0.24.0
server/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """DevOps Pipeline environment server components."""
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """DevOps Pipeline environment server components."""
server/app.py CHANGED
@@ -1,128 +1,128 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """FastAPI application for the DevOps Pipeline Environment."""
8
-
9
- from openenv.core.env_server.http_server import create_app
10
-
11
- from devops_pipeline_env.models import PipelineAction, PipelineObservation
12
- from server.pipeline_environment import PipelineEnvironment
13
-
14
- app = create_app(
15
- PipelineEnvironment,
16
- PipelineAction,
17
- PipelineObservation,
18
- env_name="devops_pipeline_env",
19
- max_concurrent_envs=1,
20
- )
21
-
22
- # Store active env on app.state so /grader can access it without class singletons.
23
- # PipelineEnvironment.reset() calls _register_callback if set.
24
- app.state.active_env = None
25
- PipelineEnvironment._register_callback = lambda env: setattr(app.state, "active_env", env)
26
-
27
-
28
- # --- Additional Required Endpoints -------------------------------------------
29
-
30
- @app.get("/tasks")
31
- def get_tasks():
32
- """Returns list of tasks and the action schema."""
33
- return {
34
- "tasks": [
35
- {
36
- "name": "clean_deploy",
37
- "difficulty": "easy",
38
- "description": "Deploy 2 services with all tests passing. No complications.",
39
- "max_steps": 15,
40
- },
41
- {
42
- "name": "broken_pipeline",
43
- "difficulty": "medium",
44
- "description": "Diagnose test failures, fix config errors, run migrations.",
45
- "max_steps": 20,
46
- },
47
- {
48
- "name": "judgment_call",
49
- "difficulty": "hard",
50
- "description": "Production incident with cascading failures. Hotfix breaks downstream service. 12-step time limit with degrading health.",
51
- "max_steps": 12,
52
- },
53
- {
54
- "name": "cascading_failure",
55
- "difficulty": "medium-hard",
56
- "description": "Root cause analysis across dependency chain. cache-service down, dragging api-gateway and web-frontend. Fix root cause first.",
57
- "max_steps": 15,
58
- },
59
- {
60
- "name": "capacity_crisis",
61
- "difficulty": "medium-hard",
62
- "description": "Peak traffic 4x normal. database-primary connection pool nearly full. Stabilize before tipping points trigger cascading collapse.",
63
- "max_steps": 15,
64
- },
65
- {
66
- "name": "random_incident",
67
- "difficulty": "variable",
68
- "description": "Procedurally generated incident. Service, failure type, and severity are randomized from seed. Infinite variation for curriculum learning.",
69
- "max_steps": 15,
70
- },
71
- ],
72
- "action_schema": PipelineAction.model_json_schema(),
73
- }
74
-
75
-
76
- @app.get("/health")
77
- def health_check():
78
- """Health check endpoint."""
79
- return {"status": "healthy"}
80
-
81
-
82
- @app.post("/baseline")
83
- async def run_baseline():
84
- """Return pre-recorded baseline scores. Does NOT run inference.py."""
85
- return {
86
- "scores": {
87
- "clean_deploy": 0.700,
88
- "broken_pipeline": 0.482,
89
- "judgment_call": 0.184,
90
- "cascading_failure": 0.280,
91
- "capacity_crisis": 0.250,
92
- "random_incident": 0.350,
93
- },
94
- "model": "Qwen/Qwen2.5-72B-Instruct",
95
- "note": "Baselines re-calibrated after environment tuning for clean_deploy (v2). Recorded 2026-04-08.",
96
- }
97
-
98
-
99
- @app.post("/grader")
100
- async def run_grader(task_name: str = ""):
101
- """Score from active session's episode history."""
102
- from server.graders import grade_task as _grade_task
103
-
104
- env = getattr(app.state, "active_env", None)
105
- if env is None or env.get_engine() is None:
106
- return {"task": task_name, "score": 0.0, "error": "No active session. Call /reset first."}
107
- if not env.get_episode_history():
108
- return {"task": env.get_task_name(), "score": 0.0, "error": "No steps taken. Call /step first."}
109
- active_task = env.get_task_name()
110
- if task_name and task_name != active_task:
111
- return {"task": task_name, "score": 0.0, "error": f"Task mismatch: requested '{task_name}' but active task is '{active_task}'."}
112
- if not task_name:
113
- task_name = active_task
114
- score = _grade_task(
115
- env.get_task_name(),
116
- env.get_episode_history(),
117
- env.get_engine(),
118
- )
119
- return {"task": env.get_task_name(), "score": score}
120
-
121
-
122
- def main(host: str = "0.0.0.0", port: int = 8000):
123
- import uvicorn
124
- uvicorn.run(app, host=host, port=port)
125
-
126
-
127
- if __name__ == "__main__":
128
- main()
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """FastAPI application for the DevOps Pipeline Environment."""
8
+
9
+ from openenv.core.env_server.http_server import create_app
10
+
11
+ from devops_pipeline_env.models import PipelineAction, PipelineObservation
12
+ from server.pipeline_environment import PipelineEnvironment
13
+
14
+ app = create_app(
15
+ PipelineEnvironment,
16
+ PipelineAction,
17
+ PipelineObservation,
18
+ env_name="devops_pipeline_env",
19
+ max_concurrent_envs=1,
20
+ )
21
+
22
+ # Store active env on app.state so /grader can access it without class singletons.
23
+ # PipelineEnvironment.reset() calls _register_callback if set.
24
+ app.state.active_env = None
25
+ PipelineEnvironment._register_callback = lambda env: setattr(app.state, "active_env", env)
26
+
27
+
28
+ # --- Additional Required Endpoints -------------------------------------------
29
+
30
+ @app.get("/tasks")
31
+ def get_tasks():
32
+ """Returns list of tasks and the action schema."""
33
+ return {
34
+ "tasks": [
35
+ {
36
+ "name": "clean_deploy",
37
+ "difficulty": "easy",
38
+ "description": "Deploy 2 services with all tests passing. No complications.",
39
+ "max_steps": 15,
40
+ },
41
+ {
42
+ "name": "broken_pipeline",
43
+ "difficulty": "medium",
44
+ "description": "Diagnose test failures, fix config errors, run migrations.",
45
+ "max_steps": 20,
46
+ },
47
+ {
48
+ "name": "judgment_call",
49
+ "difficulty": "hard",
50
+ "description": "Production incident with cascading failures. Hotfix breaks downstream service. 12-step time limit with degrading health.",
51
+ "max_steps": 12,
52
+ },
53
+ {
54
+ "name": "cascading_failure",
55
+ "difficulty": "medium-hard",
56
+ "description": "Root cause analysis across dependency chain. cache-service down, dragging api-gateway and web-frontend. Fix root cause first.",
57
+ "max_steps": 15,
58
+ },
59
+ {
60
+ "name": "capacity_crisis",
61
+ "difficulty": "medium-hard",
62
+ "description": "Peak traffic 4x normal. database-primary connection pool nearly full. Stabilize before tipping points trigger cascading collapse.",
63
+ "max_steps": 15,
64
+ },
65
+ {
66
+ "name": "random_incident",
67
+ "difficulty": "variable",
68
+ "description": "Procedurally generated incident. Service, failure type, and severity are randomized from seed. Infinite variation for curriculum learning.",
69
+ "max_steps": 15,
70
+ },
71
+ ],
72
+ "action_schema": PipelineAction.model_json_schema(),
73
+ }
74
+
75
+
76
+ @app.get("/health")
77
+ def health_check():
78
+ """Health check endpoint."""
79
+ return {"status": "healthy"}
80
+
81
+
82
+ @app.post("/baseline")
83
+ async def run_baseline():
84
+ """Return pre-recorded baseline scores. Does NOT run inference.py."""
85
+ return {
86
+ "scores": {
87
+ "clean_deploy": 0.700,
88
+ "broken_pipeline": 0.482,
89
+ "judgment_call": 0.184,
90
+ "cascading_failure": 0.280,
91
+ "capacity_crisis": 0.250,
92
+ "random_incident": 0.350,
93
+ },
94
+ "model": "Qwen/Qwen2.5-72B-Instruct",
95
+ "note": "Baselines re-calibrated after environment tuning for clean_deploy (v2). Recorded 2026-04-08.",
96
+ }
97
+
98
+
99
+ @app.post("/grader")
100
+ async def run_grader(task_name: str = ""):
101
+ """Score from active session's episode history."""
102
+ from server.graders import grade_task as _grade_task
103
+
104
+ env = getattr(app.state, "active_env", None)
105
+ if env is None or env.get_engine() is None:
106
+ return {"task": task_name, "score": 0.001, "error": "No active session. Call /reset first."}
107
+ if not env.get_episode_history():
108
+ return {"task": env.get_task_name(), "score": 0.001, "error": "No steps taken. Call /step first."}
109
+ active_task = env.get_task_name()
110
+ if task_name and task_name != active_task:
111
+ return {"task": task_name, "score": 0.001, "error": f"Task mismatch: requested '{task_name}' but active task is '{active_task}'."}
112
+ if not task_name:
113
+ task_name = active_task
114
+ score = _grade_task(
115
+ env.get_task_name(),
116
+ env.get_episode_history(),
117
+ env.get_engine(),
118
+ )
119
+ return {"task": env.get_task_name(), "score": score}
120
+
121
+
122
+ def main(host: str = "0.0.0.0", port: int = 8000):
123
+ import uvicorn
124
+ uvicorn.run(app, host=host, port=port)
125
+
126
+
127
+ if __name__ == "__main__":
128
+ main()
server/graders.py CHANGED
@@ -1,389 +1,389 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Deterministic graders for the DevOps Pipeline Environment.
8
-
9
- Each grader produces a score in [0.0, 1.0].
10
- Same actions -> same score. Always.
11
- All criteria are outcome-based — no procedure bonuses.
12
- """
13
-
14
-
15
- def grade_clean_deploy(episode_history, engine):
16
- """
17
- Task 1 grader:
18
- - 0.50 * (services at target version in prod / services with targets)
19
- - 0.30 * (final system_health / 100)
20
- - 0.20 * max(0, 1 - steps_used / max_steps)
21
- """
22
- target_services = [s for s in engine.services.values() if s.target_version]
23
- deployed_count = sum(
24
- 1 for svc in target_services
25
- if svc.prod_deployed and svc.current_version == svc.target_version
26
- )
27
-
28
- deploy_ratio = deployed_count / len(target_services) if target_services else 0.0
29
- system_health = engine.get_system_health()
30
-
31
- steps_used = len(episode_history)
32
- max_steps = 15
33
- efficiency = max(0.0, 1.0 - steps_used / max_steps)
34
-
35
- score = 0.50 * deploy_ratio + 0.30 * (system_health / 100.0) + 0.20 * efficiency
36
- return min(max(score, 0.001), 0.999)
37
-
38
-
39
- def grade_broken_pipeline(episode_history, engine):
40
- """
41
- Task 2 grader (all outcome-based):
42
- - 0.30 — cache-service config redis.host == redis-prod.internal:6379
43
- - 0.15 — migration applied (add_index_users_email in applied list)
44
- - 0.30 — (services at target in prod / 3)
45
- - 0.15 — (final system_health / 100)
46
- - 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
47
- """
48
- score = 0.0
49
-
50
- # Config fix outcome — is the config correct at end of episode?
51
- cache_svc = engine.services.get("cache-service")
52
- if cache_svc and cache_svc.config.get("redis.host") == "redis-prod.internal:6379":
53
- score += 0.30
54
-
55
- # Migration outcome — was the migration applied?
56
- if "add_index_users_email" in engine.migrations_applied:
57
- score += 0.15
58
-
59
- # Services at target in prod (only count services with deploy targets)
60
- target_services = [s for s in engine.services.values() if s.target_version]
61
- deployed_count = sum(
62
- 1 for svc in target_services
63
- if svc.prod_deployed and svc.current_version == svc.target_version
64
- )
65
- if target_services:
66
- score += 0.30 * (deployed_count / len(target_services))
67
-
68
- # System health
69
- system_health = engine.get_system_health()
70
- score += 0.15 * (system_health / 100.0)
71
-
72
- # Step efficiency
73
- steps_used = len(episode_history)
74
- max_steps = 20
75
- efficiency = max(0.0, 1.0 - steps_used / max_steps)
76
- score += 0.10 * efficiency
77
-
78
- return min(max(score, 0.001), 0.999)
79
-
80
-
81
- def grade_judgment_call(episode_history, engine):
82
- """
83
- Task 3 grader — 3 valid paths with different scores:
84
- - Expert path: deploy hotfix + fix web-frontend auth config -> best score
85
- - Hotfix only: deploy hotfix without fixing auth -> partial score
86
- - Rollback: safe but loses features -> moderate score
87
-
88
- Dimensions:
89
- - 0.30 — incident resolution (path-aware)
90
- - 0.25 — web-frontend collateral damage
91
- - 0.15 — time to resolution
92
- - 0.15 — no new issues introduced
93
- - 0.15 — final system health
94
- """
95
- score = 0.0
96
- api_gw = engine.services.get("api-gateway")
97
- web_fe = engine.services.get("web-frontend")
98
-
99
- # Detect which path the agent took
100
- deployed_hotfix = False
101
- did_rollback = False
102
- fixed_auth_config = False
103
- for entry in episode_history:
104
- action = entry.get("action", {})
105
- if action.get("action_type") == "deploy" and action.get("service_name") == "api-gateway":
106
- if action.get("target_version") == "v2.3.2":
107
- deployed_hotfix = True
108
- if action.get("action_type") == "rollback" and action.get("service_name") == "api-gateway":
109
- did_rollback = True
110
- if action.get("action_type") == "edit_config" and action.get("service_name") == "web-frontend":
111
- edits = action.get("config_edits", [])
112
- for edit in edits:
113
- if edit.get("key") == "api.auth_version" and edit.get("value") == "v2":
114
- fixed_auth_config = True
115
-
116
- # Incident resolution (path-aware)
117
- resolved = False
118
- if api_gw:
119
- api_healthy = api_gw.health.value == "healthy" and api_gw.error_rate < 5.0
120
- api_fully_resolved = api_gw.latency_ms < 100 and api_gw.error_rate < 1.0
121
- if deployed_hotfix and fixed_auth_config and api_healthy:
122
- score += 0.30 # Expert path: full credit (root cause fixed + auth handled)
123
- resolved = True
124
- elif api_fully_resolved:
125
- resolved = True
126
- if did_rollback:
127
- score += 0.20 # Rollback: safe but lost features
128
- else:
129
- score += 0.25 # Some other resolution
130
- elif deployed_hotfix and api_healthy:
131
- score += 0.15 # Hotfix without auth fix: partial
132
- resolved = True
133
- elif api_gw.latency_ms < 500:
134
- score += 0.10 # Partial improvement
135
-
136
- # web-frontend collateral damage (smooth gradient)
137
- if web_fe:
138
- if web_fe.health.value == "healthy" and web_fe.error_rate < 0.5:
139
- score += 0.25
140
- elif web_fe.health.value == "healthy" and web_fe.error_rate < 2.0:
141
- score += 0.20
142
- elif web_fe.health.value == "healthy" and web_fe.error_rate < 5.0:
143
- score += 0.15
144
- elif web_fe.error_rate < 10.0:
145
- score += 0.10
146
- elif web_fe.error_rate < 20.0:
147
- score += 0.05
148
-
149
- # Time to resolution — when api-gateway actually became healthy (outcome-based)
150
- resolution_step = len(episode_history)
151
- if resolved:
152
- for entry in episode_history:
153
- sh = entry.get("system_health", 0)
154
- if sh >= 80:
155
- resolution_step = entry.get("step", len(episode_history))
156
- break
157
- score += max(0.0, 1.0 - resolution_step / 10.0) * 0.15
158
-
159
- # No new issues introduced (forgive issues the agent subsequently fixed)
160
- new_issues = 0
161
- for entry in episode_history:
162
- if entry.get("broke_healthy", False):
163
- new_issues += 1
164
- # Expert path necessarily breaks web-frontend then fixes it — if web-frontend
165
- # ended healthy AND auth was fixed, the breakage was handled, not reckless.
166
- recovered_issues = 0
167
- if deployed_hotfix and fixed_auth_config and web_fe:
168
- if web_fe.health.value == "healthy" and web_fe.error_rate < 2.0:
169
- recovered_issues = 1 # The expected web-frontend break was recovered
170
- unrecovered = max(0, new_issues - recovered_issues)
171
- if unrecovered == 0:
172
- score += 0.15
173
- elif unrecovered == 1:
174
- score += 0.05
175
-
176
- # System health
177
- system_health = engine.get_system_health()
178
- score += 0.15 * (system_health / 100.0)
179
-
180
- return min(max(score, 0.001), 0.999)
181
-
182
-
183
- def grade_cascading_failure(episode_history, engine):
184
- """
185
- Task 4 grader (all outcome-based):
186
- - 0.30 — root cause fixed (cache-service healthy AND max_connections != "5")
187
- - 0.25 — all services deployed to prod at target version
188
- - 0.20 — final system_health / 100 (only full marks if > 90%)
189
- - 0.15 — dependency health (cache-service healthy when api-gateway deployed)
190
- - 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
191
- """
192
- score = 0.0
193
-
194
- # Root cause fixed: cache-service healthy with correct config
195
- cache_svc = engine.services.get("cache-service")
196
- if cache_svc:
197
- cache_healthy = cache_svc.health.value == "healthy"
198
- config_fixed = cache_svc.config.get("redis.max_connections") != "5"
199
- if cache_healthy and config_fixed:
200
- score += 0.30
201
-
202
- # All services deployed to prod at target version (only those with targets)
203
- target_services = [s for s in engine.services.values() if s.target_version]
204
- deployed_count = sum(
205
- 1 for svc in target_services
206
- if svc.prod_deployed and svc.current_version == svc.target_version
207
- )
208
- if target_services:
209
- score += 0.25 * (deployed_count / len(target_services))
210
-
211
- # System health
212
- system_health = engine.get_system_health()
213
- score += 0.20 * (system_health / 100.0)
214
-
215
- # Dependency health outcome: was cache-service healthy when api-gateway deployed?
216
- # Uses deploy-time snapshot recorded in episode_history for grader integrity.
217
- api_deployed_with_healthy_dep = False
218
- for entry in episode_history:
219
- cache_health = entry.get("cache_health_at_deploy")
220
- if cache_health is not None:
221
- if cache_health == "healthy":
222
- api_deployed_with_healthy_dep = True
223
- break
224
- # Also award if api-gateway was never deployed (agent focused on root cause only)
225
- # and cache-service ended healthy
226
- if not api_deployed_with_healthy_dep:
227
- api_gw = engine.services.get("api-gateway")
228
- if api_gw and api_gw.prod_deployed and cache_svc and cache_svc.health.value == "healthy":
229
- api_deployed_with_healthy_dep = True
230
- if api_deployed_with_healthy_dep:
231
- score += 0.15
232
-
233
- # Step efficiency
234
- steps_used = len(episode_history)
235
- max_steps = 15
236
- efficiency = max(0.0, 1.0 - steps_used / max_steps)
237
- score += 0.10 * efficiency
238
-
239
- return min(max(score, 0.001), 0.999)
240
-
241
-
242
- def grade_capacity_crisis(episode_history, engine):
243
- """
244
- Task 5 grader — capacity crisis (all outcome-based):
245
- - 0.30 — system stability: final system health / 100
246
- - 0.30 — root cause: database-primary protected (CPU<85, config fixed)
247
- - 0.20 — critical services maintained (api-gateway not DOWN, low errors)
248
- - 0.10 — proactive response: no service went DOWN during episode
249
- - 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
250
- """
251
- score = 0.0
252
- db = engine.services.get("database-primary")
253
- api_gw = engine.services.get("api-gateway")
254
-
255
- # System stability (0.30) — only perfect health gets full marks
256
- system_health = engine.get_system_health()
257
- score += (system_health / 100.0) * 0.30
258
-
259
- # Root cause: database protected (0.30)
260
- if db:
261
- try:
262
- max_conn = int(db.config.get("max_connections", "50"))
263
- except (ValueError, TypeError):
264
- max_conn = 50
265
- shared_buf = db.config.get("shared_buffers", "4GB")
266
- try:
267
- shared_gb = int(shared_buf.replace("GB", "")) if "GB" in str(shared_buf) else 4
268
- except (ValueError, TypeError):
269
- shared_gb = 4
270
- if max_conn >= 100 and db.cpu_percent < 85 and shared_gb >= 6:
271
- score += 0.30 # Both configs optimized
272
- elif max_conn >= 100 and db.cpu_percent < 85:
273
- score += 0.25 # Connections fixed, buffers not
274
- elif max_conn >= 75 and db.cpu_percent < 85:
275
- score += 0.20
276
- elif max_conn >= 75:
277
- score += 0.10
278
- elif db.cpu_percent < 85:
279
- score += 0.05
280
-
281
- # Critical services maintained (0.20)
282
- if api_gw:
283
- if api_gw.health.value != "down":
284
- if api_gw.error_rate < 5.0:
285
- score += 0.20
286
- elif api_gw.error_rate < 10.0:
287
- score += 0.10
288
- else:
289
- score += 0.03
290
-
291
- # Proactive response: system health maintained or improved (0.10)
292
- # In capacity_crisis, initial cascading is inevitable — reward agents
293
- # that stabilize health rather than penalizing unavoidable cascades.
294
- if system_health >= 70:
295
- score += 0.10
296
- elif system_health >= 50:
297
- score += 0.05
298
-
299
- # Step efficiency (0.10)
300
- steps_used = len(episode_history)
301
- max_steps = 15
302
- efficiency = max(0.0, 1.0 - steps_used / max_steps)
303
- score += 0.10 * efficiency
304
-
305
- return min(max(score, 0.001), 0.999)
306
-
307
-
308
- def grade_random_incident(episode_history, engine):
309
- """
310
- Task 6 grader — procedurally generated incident (all outcome-based):
311
- - 0.35 — failing service restored to healthy
312
- - 0.25 — system health maintained
313
- - 0.20 — config error fixed (if applicable)
314
- - 0.10 — no collateral damage (no healthy services broken)
315
- - 0.10 — step efficiency
316
- """
317
- score = 0.0
318
- scenario = engine.scenario
319
- failing_name = getattr(scenario, 'failing_service', None)
320
- failing_svc = engine.services.get(failing_name) if failing_name else None
321
-
322
- # Failing service restored (0.35)
323
- if failing_svc and failing_svc.health.value == "healthy":
324
- score += 0.35
325
- elif failing_svc and failing_svc.health.value == "degraded" and failing_svc.error_rate < 5.0:
326
- score += 0.15
327
-
328
- # System health (0.25)
329
- system_health = engine.get_system_health()
330
- score += (system_health / 100.0) * 0.25
331
-
332
- # Config fixed (0.20) — only if there was a config error to fix
333
- if failing_svc:
334
- had_config_error = getattr(scenario, 'failure_type', '') in ('config_error', 'capacity_limit', 'certificate_expiry')
335
- if had_config_error and not scenario.check_config_error(failing_name, failing_svc.config):
336
- score += 0.20 # Actually fixed the config error
337
- elif had_config_error:
338
- score += 0.0 # Config error still present
339
- else:
340
- # No config error for this failure type (degraded_performance, memory_leak)
341
- # Redistribute to "service restored via other means" (deploy/rollback)
342
- if failing_svc.health.value == "healthy" and failing_svc.error_rate < 2.0:
343
- score += 0.20 # Fully restored without config fix
344
- elif failing_svc.health.value == "healthy":
345
- score += 0.10 # Partially restored
346
- elif failing_svc.error_rate < 5.0:
347
- score += 0.05 # Some improvement
348
-
349
- # No collateral damage (0.10) — outcome-based, not procedure-based
350
- any_broke = any(entry.get("broke_healthy", False) for entry in episode_history)
351
- if not any_broke:
352
- score += 0.10
353
- elif system_health > 60:
354
- score += 0.05
355
-
356
- # Compound vs efficiency — weights sum to exactly 1.00 either way
357
- has_secondary = getattr(scenario, 'secondary_service', None) is not None
358
- efficiency_weight = 0.00 if has_secondary else 0.10
359
-
360
- # Efficiency (0.10 for single incidents, 0.00 for compound — replaced by compound bonus)
361
- steps = len(episode_history)
362
- max_steps = 15
363
- score += max(0.0, 1.0 - steps / max_steps) * efficiency_weight
364
-
365
- # Compound incident bonus (0.10) — replaces efficiency for compound incidents
366
- if has_secondary:
367
- secondary_svc = engine.services.get(scenario.secondary_service)
368
- if secondary_svc and secondary_svc.health.value == "healthy":
369
- score += 0.10
370
-
371
- return min(max(score, 0.001), 0.999)
372
-
373
-
374
- GRADERS = {
375
- "clean_deploy": grade_clean_deploy,
376
- "broken_pipeline": grade_broken_pipeline,
377
- "judgment_call": grade_judgment_call,
378
- "cascading_failure": grade_cascading_failure,
379
- "capacity_crisis": grade_capacity_crisis,
380
- "random_incident": grade_random_incident,
381
- }
382
-
383
-
384
- def grade_task(task_name, episode_history, engine):
385
- """Grade an episode. Returns score in [0.0, 1.0]."""
386
- grader = GRADERS.get(task_name)
387
- if grader is None:
388
- return 0.0
389
- return grader(episode_history, engine)
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Deterministic graders for the DevOps Pipeline Environment.
8
+
9
+ Each grader produces a score in [0.0, 1.0].
10
+ Same actions -> same score. Always.
11
+ All criteria are outcome-based — no procedure bonuses.
12
+ """
13
+
14
+
15
+ def grade_clean_deploy(episode_history, engine):
16
+ """
17
+ Task 1 grader:
18
+ - 0.50 * (services at target version in prod / services with targets)
19
+ - 0.30 * (final system_health / 100)
20
+ - 0.20 * max(0, 1 - steps_used / max_steps)
21
+ """
22
+ target_services = [s for s in engine.services.values() if s.target_version]
23
+ deployed_count = sum(
24
+ 1 for svc in target_services
25
+ if svc.prod_deployed and svc.current_version == svc.target_version
26
+ )
27
+
28
+ deploy_ratio = deployed_count / len(target_services) if target_services else 0.0
29
+ system_health = engine.get_system_health()
30
+
31
+ steps_used = len(episode_history)
32
+ max_steps = 15
33
+ efficiency = max(0.0, 1.0 - steps_used / max_steps)
34
+
35
+ score = 0.50 * deploy_ratio + 0.30 * (system_health / 100.0) + 0.20 * efficiency
36
+ return min(max(score, 0.001), 0.999)
37
+
38
+
39
+ def grade_broken_pipeline(episode_history, engine):
40
+ """
41
+ Task 2 grader (all outcome-based):
42
+ - 0.30 — cache-service config redis.host == redis-prod.internal:6379
43
+ - 0.15 — migration applied (add_index_users_email in applied list)
44
+ - 0.30 — (services at target in prod / 3)
45
+ - 0.15 — (final system_health / 100)
46
+ - 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
47
+ """
48
+ score = 0.0
49
+
50
+ # Config fix outcome — is the config correct at end of episode?
51
+ cache_svc = engine.services.get("cache-service")
52
+ if cache_svc and cache_svc.config.get("redis.host") == "redis-prod.internal:6379":
53
+ score += 0.30
54
+
55
+ # Migration outcome — was the migration applied?
56
+ if "add_index_users_email" in engine.migrations_applied:
57
+ score += 0.15
58
+
59
+ # Services at target in prod (only count services with deploy targets)
60
+ target_services = [s for s in engine.services.values() if s.target_version]
61
+ deployed_count = sum(
62
+ 1 for svc in target_services
63
+ if svc.prod_deployed and svc.current_version == svc.target_version
64
+ )
65
+ if target_services:
66
+ score += 0.30 * (deployed_count / len(target_services))
67
+
68
+ # System health
69
+ system_health = engine.get_system_health()
70
+ score += 0.15 * (system_health / 100.0)
71
+
72
+ # Step efficiency
73
+ steps_used = len(episode_history)
74
+ max_steps = 20
75
+ efficiency = max(0.0, 1.0 - steps_used / max_steps)
76
+ score += 0.10 * efficiency
77
+
78
+ return min(max(score, 0.001), 0.999)
79
+
80
+
81
+ def grade_judgment_call(episode_history, engine):
82
+ """
83
+ Task 3 grader — 3 valid paths with different scores:
84
+ - Expert path: deploy hotfix + fix web-frontend auth config -> best score
85
+ - Hotfix only: deploy hotfix without fixing auth -> partial score
86
+ - Rollback: safe but loses features -> moderate score
87
+
88
+ Dimensions:
89
+ - 0.30 — incident resolution (path-aware)
90
+ - 0.25 — web-frontend collateral damage
91
+ - 0.15 — time to resolution
92
+ - 0.15 — no new issues introduced
93
+ - 0.15 — final system health
94
+ """
95
+ score = 0.0
96
+ api_gw = engine.services.get("api-gateway")
97
+ web_fe = engine.services.get("web-frontend")
98
+
99
+ # Detect which path the agent took
100
+ deployed_hotfix = False
101
+ did_rollback = False
102
+ fixed_auth_config = False
103
+ for entry in episode_history:
104
+ action = entry.get("action", {})
105
+ if action.get("action_type") == "deploy" and action.get("service_name") == "api-gateway":
106
+ if action.get("target_version") == "v2.3.2":
107
+ deployed_hotfix = True
108
+ if action.get("action_type") == "rollback" and action.get("service_name") == "api-gateway":
109
+ did_rollback = True
110
+ if action.get("action_type") == "edit_config" and action.get("service_name") == "web-frontend":
111
+ edits = action.get("config_edits", [])
112
+ for edit in edits:
113
+ if edit.get("key") == "api.auth_version" and edit.get("value") == "v2":
114
+ fixed_auth_config = True
115
+
116
+ # Incident resolution (path-aware)
117
+ resolved = False
118
+ if api_gw:
119
+ api_healthy = api_gw.health.value == "healthy" and api_gw.error_rate < 5.0
120
+ api_fully_resolved = api_gw.latency_ms < 100 and api_gw.error_rate < 1.0
121
+ if deployed_hotfix and fixed_auth_config and api_healthy:
122
+ score += 0.30 # Expert path: full credit (root cause fixed + auth handled)
123
+ resolved = True
124
+ elif api_fully_resolved:
125
+ resolved = True
126
+ if did_rollback:
127
+ score += 0.20 # Rollback: safe but lost features
128
+ else:
129
+ score += 0.25 # Some other resolution
130
+ elif deployed_hotfix and api_healthy:
131
+ score += 0.15 # Hotfix without auth fix: partial
132
+ resolved = True
133
+ elif api_gw.latency_ms < 500:
134
+ score += 0.10 # Partial improvement
135
+
136
+ # web-frontend collateral damage (smooth gradient)
137
+ if web_fe:
138
+ if web_fe.health.value == "healthy" and web_fe.error_rate < 0.5:
139
+ score += 0.25
140
+ elif web_fe.health.value == "healthy" and web_fe.error_rate < 2.0:
141
+ score += 0.20
142
+ elif web_fe.health.value == "healthy" and web_fe.error_rate < 5.0:
143
+ score += 0.15
144
+ elif web_fe.error_rate < 10.0:
145
+ score += 0.10
146
+ elif web_fe.error_rate < 20.0:
147
+ score += 0.05
148
+
149
+ # Time to resolution — when api-gateway actually became healthy (outcome-based)
150
+ resolution_step = len(episode_history)
151
+ if resolved:
152
+ for entry in episode_history:
153
+ sh = entry.get("system_health", 0)
154
+ if sh >= 80:
155
+ resolution_step = entry.get("step", len(episode_history))
156
+ break
157
+ score += max(0.0, 1.0 - resolution_step / 10.0) * 0.15
158
+
159
+ # No new issues introduced (forgive issues the agent subsequently fixed)
160
+ new_issues = 0
161
+ for entry in episode_history:
162
+ if entry.get("broke_healthy", False):
163
+ new_issues += 1
164
+ # Expert path necessarily breaks web-frontend then fixes it — if web-frontend
165
+ # ended healthy AND auth was fixed, the breakage was handled, not reckless.
166
+ recovered_issues = 0
167
+ if deployed_hotfix and fixed_auth_config and web_fe:
168
+ if web_fe.health.value == "healthy" and web_fe.error_rate < 2.0:
169
+ recovered_issues = 1 # The expected web-frontend break was recovered
170
+ unrecovered = max(0, new_issues - recovered_issues)
171
+ if unrecovered == 0:
172
+ score += 0.15
173
+ elif unrecovered == 1:
174
+ score += 0.05
175
+
176
+ # System health
177
+ system_health = engine.get_system_health()
178
+ score += 0.15 * (system_health / 100.0)
179
+
180
+ return min(max(score, 0.001), 0.999)
181
+
182
+
183
+ def grade_cascading_failure(episode_history, engine):
184
+ """
185
+ Task 4 grader (all outcome-based):
186
+ - 0.30 — root cause fixed (cache-service healthy AND max_connections != "5")
187
+ - 0.25 — all services deployed to prod at target version
188
+ - 0.20 — final system_health / 100 (only full marks if > 90%)
189
+ - 0.15 — dependency health (cache-service healthy when api-gateway deployed)
190
+ - 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
191
+ """
192
+ score = 0.0
193
+
194
+ # Root cause fixed: cache-service healthy with correct config
195
+ cache_svc = engine.services.get("cache-service")
196
+ if cache_svc:
197
+ cache_healthy = cache_svc.health.value == "healthy"
198
+ config_fixed = cache_svc.config.get("redis.max_connections") != "5"
199
+ if cache_healthy and config_fixed:
200
+ score += 0.30
201
+
202
+ # All services deployed to prod at target version (only those with targets)
203
+ target_services = [s for s in engine.services.values() if s.target_version]
204
+ deployed_count = sum(
205
+ 1 for svc in target_services
206
+ if svc.prod_deployed and svc.current_version == svc.target_version
207
+ )
208
+ if target_services:
209
+ score += 0.25 * (deployed_count / len(target_services))
210
+
211
+ # System health
212
+ system_health = engine.get_system_health()
213
+ score += 0.20 * (system_health / 100.0)
214
+
215
+ # Dependency health outcome: was cache-service healthy when api-gateway deployed?
216
+ # Uses deploy-time snapshot recorded in episode_history for grader integrity.
217
+ api_deployed_with_healthy_dep = False
218
+ for entry in episode_history:
219
+ cache_health = entry.get("cache_health_at_deploy")
220
+ if cache_health is not None:
221
+ if cache_health == "healthy":
222
+ api_deployed_with_healthy_dep = True
223
+ break
224
+ # Also award if api-gateway was never deployed (agent focused on root cause only)
225
+ # and cache-service ended healthy
226
+ if not api_deployed_with_healthy_dep:
227
+ api_gw = engine.services.get("api-gateway")
228
+ if api_gw and api_gw.prod_deployed and cache_svc and cache_svc.health.value == "healthy":
229
+ api_deployed_with_healthy_dep = True
230
+ if api_deployed_with_healthy_dep:
231
+ score += 0.15
232
+
233
+ # Step efficiency
234
+ steps_used = len(episode_history)
235
+ max_steps = 15
236
+ efficiency = max(0.0, 1.0 - steps_used / max_steps)
237
+ score += 0.10 * efficiency
238
+
239
+ return min(max(score, 0.001), 0.999)
240
+
241
+
242
+ def grade_capacity_crisis(episode_history, engine):
243
+ """
244
+ Task 5 grader — capacity crisis (all outcome-based):
245
+ - 0.30 — system stability: final system health / 100
246
+ - 0.30 — root cause: database-primary protected (CPU<85, config fixed)
247
+ - 0.20 — critical services maintained (api-gateway not DOWN, low errors)
248
+ - 0.10 — proactive response: no service went DOWN during episode
249
+ - 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
250
+ """
251
+ score = 0.0
252
+ db = engine.services.get("database-primary")
253
+ api_gw = engine.services.get("api-gateway")
254
+
255
+ # System stability (0.30) — only perfect health gets full marks
256
+ system_health = engine.get_system_health()
257
+ score += (system_health / 100.0) * 0.30
258
+
259
+ # Root cause: database protected (0.30)
260
+ if db:
261
+ try:
262
+ max_conn = int(db.config.get("max_connections", "50"))
263
+ except (ValueError, TypeError):
264
+ max_conn = 50
265
+ shared_buf = db.config.get("shared_buffers", "4GB")
266
+ try:
267
+ shared_gb = int(shared_buf.replace("GB", "")) if "GB" in str(shared_buf) else 4
268
+ except (ValueError, TypeError):
269
+ shared_gb = 4
270
+ if max_conn >= 100 and db.cpu_percent < 85 and shared_gb >= 6:
271
+ score += 0.30 # Both configs optimized
272
+ elif max_conn >= 100 and db.cpu_percent < 85:
273
+ score += 0.25 # Connections fixed, buffers not
274
+ elif max_conn >= 75 and db.cpu_percent < 85:
275
+ score += 0.20
276
+ elif max_conn >= 75:
277
+ score += 0.10
278
+ elif db.cpu_percent < 85:
279
+ score += 0.05
280
+
281
+ # Critical services maintained (0.20)
282
+ if api_gw:
283
+ if api_gw.health.value != "down":
284
+ if api_gw.error_rate < 5.0:
285
+ score += 0.20
286
+ elif api_gw.error_rate < 10.0:
287
+ score += 0.10
288
+ else:
289
+ score += 0.03
290
+
291
+ # Proactive response: system health maintained or improved (0.10)
292
+ # In capacity_crisis, initial cascading is inevitable — reward agents
293
+ # that stabilize health rather than penalizing unavoidable cascades.
294
+ if system_health >= 70:
295
+ score += 0.10
296
+ elif system_health >= 50:
297
+ score += 0.05
298
+
299
+ # Step efficiency (0.10)
300
+ steps_used = len(episode_history)
301
+ max_steps = 15
302
+ efficiency = max(0.0, 1.0 - steps_used / max_steps)
303
+ score += 0.10 * efficiency
304
+
305
+ return min(max(score, 0.001), 0.999)
306
+
307
+
308
+ def grade_random_incident(episode_history, engine):
309
+ """
310
+ Task 6 grader — procedurally generated incident (all outcome-based):
311
+ - 0.35 — failing service restored to healthy
312
+ - 0.25 — system health maintained
313
+ - 0.20 — config error fixed (if applicable)
314
+ - 0.10 — no collateral damage (no healthy services broken)
315
+ - 0.10 — step efficiency
316
+ """
317
+ score = 0.0
318
+ scenario = engine.scenario
319
+ failing_name = getattr(scenario, 'failing_service', None)
320
+ failing_svc = engine.services.get(failing_name) if failing_name else None
321
+
322
+ # Failing service restored (0.35)
323
+ if failing_svc and failing_svc.health.value == "healthy":
324
+ score += 0.35
325
+ elif failing_svc and failing_svc.health.value == "degraded" and failing_svc.error_rate < 5.0:
326
+ score += 0.15
327
+
328
+ # System health (0.25)
329
+ system_health = engine.get_system_health()
330
+ score += (system_health / 100.0) * 0.25
331
+
332
+ # Config fixed (0.20) — only if there was a config error to fix
333
+ if failing_svc:
334
+ had_config_error = getattr(scenario, 'failure_type', '') in ('config_error', 'capacity_limit', 'certificate_expiry')
335
+ if had_config_error and not scenario.check_config_error(failing_name, failing_svc.config):
336
+ score += 0.20 # Actually fixed the config error
337
+ elif had_config_error:
338
+ score += 0.0 # Config error still present
339
+ else:
340
+ # No config error for this failure type (degraded_performance, memory_leak)
341
+ # Redistribute to "service restored via other means" (deploy/rollback)
342
+ if failing_svc.health.value == "healthy" and failing_svc.error_rate < 2.0:
343
+ score += 0.20 # Fully restored without config fix
344
+ elif failing_svc.health.value == "healthy":
345
+ score += 0.10 # Partially restored
346
+ elif failing_svc.error_rate < 5.0:
347
+ score += 0.05 # Some improvement
348
+
349
+ # No collateral damage (0.10) — outcome-based, not procedure-based
350
+ any_broke = any(entry.get("broke_healthy", False) for entry in episode_history)
351
+ if not any_broke:
352
+ score += 0.10
353
+ elif system_health > 60:
354
+ score += 0.05
355
+
356
+ # Compound vs efficiency — weights sum to exactly 1.00 either way
357
+ has_secondary = getattr(scenario, 'secondary_service', None) is not None
358
+ efficiency_weight = 0.00 if has_secondary else 0.10
359
+
360
+ # Efficiency (0.10 for single incidents, 0.00 for compound — replaced by compound bonus)
361
+ steps = len(episode_history)
362
+ max_steps = 15
363
+ score += max(0.0, 1.0 - steps / max_steps) * efficiency_weight
364
+
365
+ # Compound incident bonus (0.10) — replaces efficiency for compound incidents
366
+ if has_secondary:
367
+ secondary_svc = engine.services.get(scenario.secondary_service)
368
+ if secondary_svc and secondary_svc.health.value == "healthy":
369
+ score += 0.10
370
+
371
+ return min(max(score, 0.001), 0.999)
372
+
373
+
374
+ GRADERS = {
375
+ "clean_deploy": grade_clean_deploy,
376
+ "broken_pipeline": grade_broken_pipeline,
377
+ "judgment_call": grade_judgment_call,
378
+ "cascading_failure": grade_cascading_failure,
379
+ "capacity_crisis": grade_capacity_crisis,
380
+ "random_incident": grade_random_incident,
381
+ }
382
+
383
+
384
+ def grade_task(task_name, episode_history, engine):
385
+ """Grade an episode. Returns score in [0.0, 1.0]."""
386
+ grader = GRADERS.get(task_name)
387
+ if grader is None:
388
+ return 0.001
389
+ return grader(episode_history, engine)
server/pipeline_engine.py CHANGED
@@ -1,744 +1,744 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Simulation engine for the DevOps Pipeline Environment."""
8
-
9
- import random
10
-
11
- from devops_pipeline_env.models import (
12
- ActionType,
13
- AlertInfo,
14
- ConfigEdit,
15
- MigrationStatus,
16
- PipelineAction,
17
- PipelineStage,
18
- PipelineStatus,
19
- ServiceHealth,
20
- ServiceStatus,
21
- )
22
-
23
-
24
class ServiceState:
    """State machine for a single simulated microservice.

    Holds the service's version, health, metrics, config, deploy flags and an
    in-memory log. Every random draw goes through ``self._rng`` so that an
    episode is reproducible for a given seed; the order of draws inside each
    method is therefore part of the behaviour and must not be reordered.
    """

    def __init__(self, name, version, health, config, dependencies,
                 latency_ms, error_rate, cpu, memory, rng=None):
        """Create a service with its starting metrics.

        Args:
            name: Service identifier.
            version: Version currently running.
            health: Initial health value (compared against ``ServiceHealth``).
            config: Mapping of config keys to values; copied.
            dependencies: Iterable of service names this one depends on, or
                a falsy value for none; copied.
            latency_ms: Initial request latency.
            error_rate: Initial error rate.
            cpu: Initial CPU percentage.
            memory: Initial memory percentage.
            rng: Optional ``random.Random``; defaults to ``random.Random(0)``.
        """
        self.name = name
        self.current_version = version
        self.target_version = None
        self.health = health
        self.config = dict(config)
        self.dependencies = list(dependencies) if dependencies else []
        self.latency_ms = latency_ms
        self.error_rate = error_rate
        self.cpu_percent = cpu
        self.memory_percent = memory
        self.active_connections = 100
        self.staging_deployed = False
        self.staging_verified = False
        self.prod_deployed = False
        self.last_deploy_timestamp = "2026-04-01T00:00:00Z"
        self.logs = []
        self._rng = rng or random.Random(0)
        # Task name is injected by the engine after scenario setup; define it
        # here so methods never have to probe for the attribute.
        self._task_name = None
        # Staged health recovery: 0 = fully recovered, >0 = still recovering
        self._recovery_steps_remaining = 0
        self._recovery_target_latency = 0.0
        self._recovery_target_error_rate = 0.0

    def _is_clean_deploy(self):
        """Return True when this service belongs to the ``clean_deploy`` task."""
        # getattr keeps instances built without __init__ working.
        return getattr(self, "_task_name", None) == "clean_deploy"

    def deploy_to_staging(self, version, scenario):
        """Deploy ``version`` to staging and return a result message.

        Outcomes, checked in order: a transient health-check timeout (8%
        roll, skipped for ``clean_deploy``, already-verified or non-healthy
        services); a degraded deploy when ``scenario.check_config_error``
        reports a config error; otherwise a verified, healthy staging deploy.
        """
        self.staging_deployed = True
        self.target_version = version

        # Always consume one draw, even when the roll is ignored, so the RNG
        # stream stays aligned regardless of which branch is taken.
        transient_roll = self._rng.random()
        transient_failure = (
            not self._is_clean_deploy()
            and not self.staging_verified
            and self.health == ServiceHealth.HEALTHY
            and transient_roll < 0.08
        )
        if transient_failure:
            # Deployed but not verified: the next deploy call promotes to
            # production without staging verification unless retried first.
            self.logs.append(
                f"[DEPLOY] Deployed {self.name} {version} to staging. "
                f"Transient failure: health check timed out. Retry should succeed."
            )
            return (
                f"Deployed {self.name} {version} to staging. "
                f"WARNING: Transient health check timeout. Try deploying again."
            )

        if scenario.check_config_error(self.name, self.config):
            self.health = ServiceHealth.DEGRADED
            lat_mult = self._rng.uniform(0.8, 1.2)
            err_mult = self._rng.uniform(0.9, 1.1)
            self.error_rate = round(12.0 * err_mult, 2)
            self.latency_ms = round(300.0 * lat_mult, 1)
            self.logs.append(
                f"[DEPLOY] Deployed {self.name} {version} to staging. "
                f"WARNING: Health check DEGRADED. Error rate elevated "
                f"({self.error_rate:.1f}/s, latency {self.latency_ms:.0f}ms)."
            )
            return (
                f"Deployed {self.name} {version} to staging. "
                f"WARNING: Health check degraded. Error rate elevated."
            )

        self.health = ServiceHealth.HEALTHY
        self.staging_verified = True
        # Latency multiplier is drawn before the error-rate multiplier.
        lat_mult = self._rng.uniform(0.8, 1.2)
        self.error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
        self.latency_ms = round(45.0 * lat_mult, 1)
        self.logs.append(
            f"[DEPLOY] Deployed {self.name} {version} to staging. Health check: PASSED."
        )
        return f"Deployed {self.name} {version} to STAGING. Staging verified. Deploy same service+version again to PROMOTE TO PRODUCTION."

    def deploy_to_production(self, version):
        """Promote ``version`` to production and return a result message.

        Without staging verification the service is marked degraded with high
        error rate and latency, and ``prod_deployed`` is left untouched.
        Otherwise the service is promoted and enters a 1-3 step staged
        recovery toward target metrics chosen by a deploy-quality roll.
        """
        if not self.staging_verified:
            self.health = ServiceHealth.DEGRADED
            lat_mult = self._rng.uniform(0.8, 1.2)
            err_mult = self._rng.uniform(0.9, 1.1)
            self.error_rate = round(25.0 * err_mult, 2)
            self.latency_ms = round(500.0 * lat_mult, 1)
            self.logs.append(
                f"[DEPLOY] Deployed {self.name} {version} to production "
                f"WITHOUT staging verification. High risk."
            )
            return (
                f"Deployed {self.name} {version} to production "
                f"WITHOUT staging verification. High risk."
            )

        self.prod_deployed = True
        self.current_version = version
        # Staged recovery: takes 1-3 steps to fully stabilize
        recovery_steps = self._rng.randint(1, 3)
        self._recovery_steps_remaining = recovery_steps
        base_latency = 45.0 * self._rng.uniform(0.8, 1.2)
        base_error_rate = 0.1 * self._rng.uniform(0.9, 1.1)

        # Non-linear deploy quality: same seed = same outcome
        quality_roll = self._rng.random()
        deploy_note = ""
        if quality_roll < 0.7:
            # Clean deploy: base values are already good.
            pass
        elif quality_roll < 0.9:
            # Minor issues: recovers to good but not perfect
            base_latency *= 1.5
            base_error_rate *= 3.0
            deploy_note = " Minor post-deploy issues detected."
            self.logs.append(
                f"[DEPLOY] {self.name}: Minor post-deploy issues detected. "
                f"Performance slightly below optimal."
            )
        else:
            # Unstable deploy: recovers poorly. The elevated error rate comes
            # from the scaled base value; error_rate itself is assigned below.
            base_latency *= 2.5
            base_error_rate *= 8.0
            deploy_note = " Post-deploy instability detected."
            self.logs.append(
                f"[DEPLOY] {self.name}: Post-deploy instability detected. "
                f"Elevated error rate."
            )

        self._recovery_target_latency = round(base_latency, 1)
        self._recovery_target_error_rate = round(base_error_rate, 3)
        # Start at slightly elevated values during recovery
        self.health = ServiceHealth.HEALTHY
        self.latency_ms = round(base_latency * (1.0 + 0.3 * recovery_steps), 1)
        self.error_rate = round(base_error_rate * (1.0 + 0.5 * recovery_steps), 3)
        # Trade-off: deploy causes a temporary CPU/latency spike (warmup
        # load). clean_deploy tasks get a reduced spike.
        if self._is_clean_deploy():
            self.cpu_percent = min(self.cpu_percent + 3, 99)
            self.latency_ms += round(30 * self._rng.uniform(0.8, 1.2), 1)
        else:
            self.cpu_percent = min(self.cpu_percent + 15, 99)
            self.latency_ms += round(200 * self._rng.uniform(0.8, 1.2), 1)
        self.last_deploy_timestamp = "2026-04-01T12:00:00Z"
        self.logs.append(
            f"[DEPLOY] Promoted {self.name} {version} to production. Health: HEALTHY. "
            f"Stabilizing over ~{recovery_steps} step(s). CPU/latency spike from warmup."
        )
        return (
            f"Promoted {self.name} {version} to production. Health: HEALTHY. "
            f"Deployed successfully. Service under warmup load — temporary CPU/latency spike expected."
            f"{deploy_note}"
        )

    def tick_recovery(self):
        """Advance staged recovery by one step; no-op when not recovering."""
        if self._recovery_steps_remaining > 0:
            self._recovery_steps_remaining -= 1
            if self._recovery_steps_remaining == 0:
                # Fully recovered: snap metrics to their targets.
                self.latency_ms = self._recovery_target_latency
                self.error_rate = self._recovery_target_error_rate
                if self.health == ServiceHealth.DEGRADED and self.error_rate < 5.0:
                    self.health = ServiceHealth.HEALTHY
            else:
                # Interpolate toward target; progress is 1 / (remaining + 1).
                progress = 1.0 - (self._recovery_steps_remaining / (self._recovery_steps_remaining + 1))
                self.latency_ms = round(
                    self.latency_ms + (self._recovery_target_latency - self.latency_ms) * progress, 1
                )
                self.error_rate = round(
                    self.error_rate + (self._recovery_target_error_rate - self.error_rate) * progress, 3
                )

    def rollback(self):
        """Reset the service to a healthy rolled-back state.

        Clears staging flags and any pending recovery. With 25% probability
        the rollback adds 3.0 to the error rate to model a reintroduced bug.
        ``current_version`` is not modified here.

        Returns:
            Result message, with a warning suffix on regression.
        """
        self.health = ServiceHealth.HEALTHY
        lat_mult = self._rng.uniform(0.8, 1.2)
        err_mult = self._rng.uniform(0.9, 1.1)
        self.error_rate = round(0.5 * err_mult, 3)
        self.latency_ms = round(50.0 * lat_mult * 0.7, 1)
        self.staging_deployed = False
        self.staging_verified = False
        self.prod_deployed = True  # still in prod, just rolled back
        self._recovery_steps_remaining = 0
        # Trade-off: 25% chance rollback reintroduces a known bug
        regression = False
        if self._rng.random() < 0.25:
            self.error_rate = round(self.error_rate + 3.0, 2)
            regression = True
            self.logs.append(
                f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. "
                f"Warning: rollback may have reintroduced known issue from previous version"
            )
        else:
            self.logs.append(
                f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. Service healthy."
            )
        result = f"Rolled back {self.name} to {self.current_version}. Rolled back. Monitoring for regression..."
        if regression:
            result += f" WARNING: Error rate elevated ({self.error_rate:.1f}/s) — possible regression."
        return result

    def set_config(self, key, value):
        """Set config ``key`` to ``value`` and return a result message.

        The change also adds a random latency spike (80-120 ms) and 5 CPU
        points (capped at 99) to model a service restart.
        """
        old = self.config.get(key, "<not set>")
        self.config[key] = value
        # Trade-off: config change causes brief restart spike
        self.latency_ms += round(100 * self._rng.uniform(0.8, 1.2), 1)
        self.cpu_percent = min(self.cpu_percent + 5, 99)
        self.logs.append(f"[CONFIG] {self.name}: {key} changed from '{old}' to '{value}'. Service restarting.")
        return f"Config {self.name}: {key} changed from '{old}' to '{value}'. Config updated. Service restarting — brief latency spike."

    def get_config_snapshot(self):
        """Return a shallow copy of the current config."""
        return dict(self.config)

    def get_logs(self):
        """Return a copy of the log lines recorded so far."""
        return list(self.logs)

    def _get_health_pct(self):
        """Return a numeric health score in [0, 100] for this service.

        Starts from 100 (healthy), 50 (degraded) or 0 (down), then subtracts
        up to 30 for error rate and up to 30 for latency above 200 ms.
        """
        h = 100.0
        if self.health == ServiceHealth.DOWN:
            h = 0.0
        elif self.health == ServiceHealth.DEGRADED:
            h = 50.0
        h -= min(self.error_rate * 2, 30)
        if self.latency_ms > 200:
            h -= min((self.latency_ms - 200) / 10, 30)
        return max(h, 0.0)

    def to_status(self):
        """Build a ``ServiceStatus`` from the current metrics."""
        return ServiceStatus(
            name=self.name,
            health=self.health,
            current_version=self.current_version,
            cpu_percent=self.cpu_percent,
            memory_percent=self.memory_percent,
            error_rate=self.error_rate,
            request_latency_ms=self.latency_ms,
            active_connections=self.active_connections,
            last_deploy_timestamp=self.last_deploy_timestamp,
        )
260
-
261
-
262
class PipelineEngine:
    """Manages all services, pipeline state, migrations and alerts.

    The engine owns one shared ``random.Random`` seeded at construction and
    injects it into every service, so a (scenario, seed) pair replays
    identically. The order of RNG draws is part of the behaviour.
    """

    def __init__(self, scenario, seed):
        """Build the engine and let ``scenario.setup`` populate it.

        Args:
            scenario: Scenario object; must provide ``setup(engine)`` and
                ``task_name``. Other hooks are used by individual methods.
            seed: Seed for the shared RNG.
        """
        self.scenario = scenario
        self._rng = random.Random(seed)
        self.services = {}
        self.pipeline_stage = PipelineStage.IDLE
        self.migrations_pending = []
        self.migrations_applied = []
        self.migration_errors = []
        self.alerts = []
        self.commit_sha = "abc123"
        self.triggered_by = "deploy-bot"
        self.started_at = "2026-04-01T10:00:00Z"
        self.test_pass = 0
        self.test_fail = 0
        self.build_logs = ""
        self._time_pressure = False  # Set by scenario if needed

        # Initialize from scenario
        scenario.setup(self)

        # Inject the shared RNG and task name into all services created by the scenario
        for svc in self.services.values():
            svc._rng = self._rng
            svc._task_name = scenario.task_name

    def execute(self, action):
        """Execute one agent action and return a human-readable result.

        Order per step: tick staged recovery, run the action, then apply
        environmental effects so the agent observes their consequences.
        """
        # 1. Tick health recovery for all services (heal from previous deploys)
        for svc in self.services.values():
            svc.tick_recovery()

        # 2. Execute the agent's action FIRST
        if action.action_type == ActionType.VIEW_PIPELINE:
            result = self._view_pipeline()
        elif action.action_type == ActionType.VIEW_LOGS:
            result = self._view_logs(action.service_name)
        elif action.action_type == ActionType.VIEW_CONFIG:
            result = self._view_config(action.service_name)
        elif action.action_type == ActionType.EDIT_CONFIG:
            result = self._edit_config(action.service_name, action.config_edits)
        elif action.action_type == ActionType.RUN_MIGRATION:
            result = self._run_migration(action.migration_name, action.migration_type)
        elif action.action_type == ActionType.DEPLOY:
            result = self._deploy(action.service_name, action.target_version)
        elif action.action_type == ActionType.ROLLBACK:
            result = self._rollback(action.service_name)
        elif action.action_type == ActionType.APPROVE:
            result = self._approve(action.reason)
        elif action.action_type == ActionType.ABORT:
            result = self._abort(action.reason)
        else:
            result = "Unknown action."

        # 3. Environmental effects AFTER action (agent sees consequences)
        if self._time_pressure:
            self._apply_time_pressure()
        self._tick_cascading_effects()
        self._tick_metric_compounding()
        self._tick_tipping_points()

        return result

    # --- Cross-metric compounding ---------------------------------------------

    def _tick_metric_compounding(self):
        """Let metrics compound on each other (spirals and natural recovery).

        Skipped entirely for the ``clean_deploy`` task.
        """
        if self.scenario.task_name == "clean_deploy":
            return
        for svc in self.services.values():
            # Degradation spirals (moderate; should not kill episodes in <5 steps)
            if svc.error_rate > 15.0:
                svc.cpu_percent = min(svc.cpu_percent + 3, 99)
            if svc.cpu_percent > 90:
                svc.latency_ms = round(min(svc.latency_ms + 100, 5000), 1)
            if svc.latency_ms > 3000:
                svc.error_rate = round(min(svc.error_rate + 1.0, 50.0), 2)

            # Natural recovery (when metrics are good, they help each other)
            if svc.error_rate < 2.0:
                svc.cpu_percent = max(svc.cpu_percent - 3, 10)
            if svc.cpu_percent < 50:
                svc.latency_ms = round(max(svc.latency_ms - 50, 20), 1)
            if svc.latency_ms < 200 and svc.error_rate < 1.0:
                svc.error_rate = round(max(svc.error_rate - 0.5, 0.0), 2)

    # --- Non-linear tipping points -------------------------------------------

    def _tick_tipping_points(self):
        """Apply non-linear cliffs: past a threshold, metrics collapse faster.

        Skipped entirely for the ``clean_deploy`` task.
        """
        if self.scenario.task_name == "clean_deploy":
            return
        for svc in self.services.values():
            # CPU cliff: above 85%, error growth scales with the overflow
            if svc.cpu_percent > 85:
                overflow = svc.cpu_percent - 85
                svc.error_rate = round(min(svc.error_rate + overflow * 0.2, 50.0), 2)

            # Latency cliff: above 2000ms = rapid collapse
            if svc.latency_ms > 2000:
                svc.error_rate = round(min(svc.error_rate + 3.0, 50.0), 2)

            # Health cliff: below 30% health = accelerating death spiral.
            # Uses the service's own health formula, evaluated after the two
            # cliffs above have been applied.
            if svc._get_health_pct() < 30:
                svc.error_rate = round(min(svc.error_rate * 1.3, 50.0), 2)

            # Latency → CPU feedback (high latency = retries = more CPU)
            if svc.latency_ms > 1500:
                svc.cpu_percent = min(svc.cpu_percent + 3, 99)

    # --- Cascading failures ---------------------------------------------------

    def _get_dependents(self, service_name):
        """Return all services that list ``service_name`` as a dependency."""
        return [
            svc for svc in self.services.values()
            if service_name in svc.dependencies
        ]

    def _tick_cascading_effects(self):
        """Propagate health between services and their dependents.

        Services below 50% health raise error rate and latency on each
        dependent that is not already down; healthy low-error services then
        shrink error rate and latency of degraded dependents by 10%.
        """
        for svc in self.services.values():
            health_pct = svc._get_health_pct()
            if health_pct >= 50.0:
                continue  # healthy enough, no cascade

            for dep in self._get_dependents(svc.name):
                if dep.health == ServiceHealth.DOWN:
                    continue  # already down, can't get worse from cascade

                # Determine cascade severity
                if health_pct < 20.0:
                    # Source is effectively down: moderate cascade
                    err_increase = 1.5
                    lat_increase = 30.0
                else:
                    # Source is degraded: lighter cascade
                    err_increase = 0.5
                    lat_increase = 10.0

                old_err = dep.error_rate
                dep.error_rate = round(min(dep.error_rate + err_increase, 45.0), 2)
                dep.latency_ms = round(min(dep.latency_ms + lat_increase, 4500.0), 1)

                # If error rate gets high enough, mark as degraded
                if dep.error_rate > 5.0 and dep.health == ServiceHealth.HEALTHY:
                    dep.health = ServiceHealth.DEGRADED

                # Floor: cascading alone can't push health below 5%
                # (prevent instant death spirals)
                if dep._get_health_pct() < 5.0:
                    dep.error_rate = round(max(old_err, dep.error_rate - err_increase + 1.0), 2)

                # Add a cascade alert once per source->dependent pair. Alerts
                # are not cleared in this class, so the key in the message
                # suppresses duplicates for as long as the alert stays listed.
                cascade_alert_key = f"cascade:{svc.name}->{dep.name}"
                already_alerted = any(cascade_alert_key in a.message for a in self.alerts)
                if not already_alerted:
                    self.alerts.append(AlertInfo(
                        severity="warning",
                        message=(
                            f"Cascading: {svc.name} (health {health_pct:.0f}%) is degrading "
                            f"{dep.name} — error_rate +{err_increase}/s, latency +{lat_increase:.0f}ms "
                            f"[{cascade_alert_key}]"
                        ),
                        service_name=dep.name,
                        timestamp="2026-04-01T12:00:00Z",
                    ))

                dep.logs.append(
                    f"[CASCADE] Upstream {svc.name} unhealthy (health {health_pct:.0f}%) — "
                    f"{dep.name} error_rate now {dep.error_rate:.1f}/s, "
                    f"latency {dep.latency_ms:.0f}ms"
                )

        # Recovery propagation: healthy services help their dependents recover
        for name, svc in self.services.items():
            if svc.health == ServiceHealth.HEALTHY and svc.error_rate < 2.0:
                for dep in self._get_dependents(name):
                    if dep.health == ServiceHealth.DEGRADED:
                        dep.error_rate = round(dep.error_rate * 0.9, 2)
                        dep.latency_ms = round(dep.latency_ms * 0.9, 1)

    # --- Action handlers ------------------------------------------------------

    def _view_pipeline(self):
        """Return a text summary of pipeline stage, tests and services."""
        services_summary = "\n".join(
            f" {s.name}: {s.health.value} | v{s.current_version} -> "
            f"v{s.target_version or 'N/A'} | "
            f"latency={s.latency_ms:.0f}ms | errors={s.error_rate:.1f}/s"
            for s in self.services.values()
        )
        return (
            f"Pipeline Stage: {self.pipeline_stage.value}\n"
            f"Commit: {self.commit_sha}\n"
            f"Tests: {self.test_pass} passed, {self.test_fail} failed\n"
            f"Pending Migrations: {len(self.migrations_pending)}\n"
            f"Services:\n{services_summary}"
        )

    def _view_logs(self, service_name):
        """Return the last 20 log lines of ``service_name`` as text."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        logs = svc.get_logs()
        if not logs:
            return f"No logs available for {service_name}."
        return f"Logs for {service_name}:\n" + "\n".join(logs[-20:])

    def _view_config(self, service_name):
        """Return the config of ``service_name`` as ``key = value`` lines."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        config = svc.get_config_snapshot()
        lines = [f" {k} = {v}" for k, v in config.items()]
        return f"Config for {service_name}:\n" + "\n".join(lines)

    def _edit_config(self, service_name, edits):
        """Apply config edits to a service and return the combined result.

        Args:
            service_name: Target service.
            edits: Iterable of objects with ``key`` and ``value``; ``None``
                or empty is reported instead of raising.

        If, after all edits, a degraded service no longer has a config error
        according to the scenario, its metrics improve immediately and a
        2-step staged recovery is scheduled.
        """
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        if not edits:
            # Nothing to apply; do not touch metrics or trigger recovery.
            return f"No config edits provided for {service_name}."

        results = [svc.set_config(edit.key, edit.value) for edit in edits]

        # Evaluate the fix once, after every edit has been applied, so a
        # multi-edit action cannot halve the metrics more than once.
        if svc.health == ServiceHealth.DEGRADED and not self.scenario.check_config_error(service_name, svc.config):
            svc.staging_deployed = False
            svc.staging_verified = False
            # Immediate PARTIAL improvement
            svc.error_rate = round(svc.error_rate * 0.5, 2)
            svc.latency_ms = round(svc.latency_ms * 0.6, 1)
            # Set up 2-step recovery to full health (reuse tick_recovery pattern)
            svc._recovery_steps_remaining = 2
            svc._recovery_target_latency = round(50.0 * self._rng.uniform(0.8, 1.2), 1)
            svc._recovery_target_error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
            # Health is left DEGRADED here; tick_recovery flips it once the
            # recovery completes with a low enough error rate.
            results.append(f"Config fix detected for {service_name}. Service improving — full recovery in ~2 steps. Ready for re-deploy.")
        return "\n".join(results)

    def _run_migration(self, migration_name, migration_type):
        """Run a pending migration via the scenario and record the outcome.

        ``migration_type`` is accepted for interface compatibility and is not
        used by this method.
        """
        if migration_name not in self.migrations_pending:
            return (
                f"Migration '{migration_name}' not found in pending: "
                f"{self.migrations_pending}"
            )
        success = self.scenario.run_migration(self, migration_name)
        if success:
            self.migrations_pending.remove(migration_name)
            self.migrations_applied.append(migration_name)
            return f"Migration '{migration_name}' applied successfully."
        error = f"Migration '{migration_name}' FAILED."
        self.migration_errors.append(error)
        return error

    def _deploy(self, service_name, target_version):
        """Deploy a service to staging, or promote it if already staged.

        Blocked while the scenario says pending migrations gate this service.
        Each dependency below 50% health gives a 50% chance the deploy fails.
        """
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"

        # Check migration dependencies
        if self.migrations_pending and self.scenario.migration_blocks_deploy(service_name):
            return (
                f"BLOCKED: Pending migrations must be applied before deploying "
                f"{service_name}. Pending: {self.migrations_pending}"
            )

        # Check if any dependency is unhealthy: 50% chance of deploy failure
        for dep_name in svc.dependencies:
            dep_svc = self.services.get(dep_name)
            if not dep_svc:
                continue
            dep_health = dep_svc._get_health_pct()
            if dep_health < 50.0 and self._rng.random() < 0.5:
                svc.logs.append(
                    f"[DEPLOY] Deploy {svc.name} {target_version} FAILED — "
                    f"dependency {dep_name} is unhealthy "
                    f"(health {dep_health:.0f}%). Retry may succeed."
                )
                return (
                    f"DEPLOY UNSTABLE: Dependency {dep_name} is unhealthy "
                    f"(health {dep_health:.0f}%). "
                    f"Deploy of {service_name} failed. Retry may succeed."
                )

        # Determine target environment
        if not svc.staging_deployed:
            self.pipeline_stage = PipelineStage.STAGING
            return svc.deploy_to_staging(target_version, self.scenario)

        self.pipeline_stage = PipelineStage.DEPLOYING
        result = svc.deploy_to_production(target_version)
        # Notify scenario of deploy (for cascading effects)
        if hasattr(self.scenario, 'on_prod_deploy'):
            extra = self.scenario.on_prod_deploy(self, service_name, target_version)
            if extra:
                result += "\n" + extra
        # Check if all target services deployed
        if all(s.prod_deployed for s in self.services.values() if s.target_version):
            self.pipeline_stage = PipelineStage.DEPLOYED
        return result

    def _rollback(self, service_name):
        """Roll a service back and apply the impact on its dependents.

        Every dependent gets +5.0 error rate, an alert and a log line; a
        healthy dependent whose error rate ends above 3.0 becomes degraded.
        """
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        self.pipeline_stage = PipelineStage.ROLLED_BACK

        # Capture the version and dependents before the rollback runs.
        old_version = svc.current_version
        dependents = self._get_dependents(service_name)
        result = svc.rollback()

        # Warn about dependent services and increase their error rates
        for dep in dependents:
            dep.error_rate = round(dep.error_rate + 5.0, 2)
            if dep.health == ServiceHealth.HEALTHY and dep.error_rate > 3.0:
                dep.health = ServiceHealth.DEGRADED
            self.alerts.append(AlertInfo(
                severity="warning",
                message=(
                    f"Rollback impact: {dep.name} depends on {service_name} "
                    f"{old_version}. Rollback may break {dep.name}. "
                    f"Error rate increased to {dep.error_rate:.1f}/s."
                ),
                service_name=dep.name,
                timestamp="2026-04-01T12:00:00Z",
            ))
            dep.logs.append(
                f"[ROLLBACK-IMPACT] {service_name} rolled back from {old_version} — "
                f"{dep.name} error_rate increased to {dep.error_rate:.1f}/s. "
                f"Dependency on {old_version} APIs may be broken."
            )

        if hasattr(self.scenario, 'on_rollback'):
            self.scenario.on_rollback(self, service_name)
        return result

    def _approve(self, reason):
        """Mark the pipeline as deployed and echo the reason."""
        self.pipeline_stage = PipelineStage.DEPLOYED
        return f"Deployment APPROVED. Reason: {reason or 'No reason given.'}"

    def _abort(self, reason):
        """Mark the pipeline as failed and echo the reason."""
        self.pipeline_stage = PipelineStage.FAILED
        return f"Deployment ABORTED. Reason: {reason or 'No reason given.'}"

    # --- State queries --------------------------------------------------------

    def snapshot(self):
        """Capture current state as plain dicts/lists for reward calculation."""
        return {
            "services": {
                name: {
                    "health": s.health.value,
                    "error_rate": s.error_rate,
                    "latency_ms": s.latency_ms,
                    "prod_deployed": s.prod_deployed,
                    "staging_verified": s.staging_verified,
                    "config": dict(s.config),
                }
                for name, s in self.services.items()
            },
            "system_health": self.get_system_health(),
            "pipeline_stage": self.pipeline_stage.value,
            "migrations_pending": list(self.migrations_pending),
            "alerts": list(self.alerts),
        }

    def get_system_health(self):
        """Return mean service health (0-100); 100.0 when there are no services."""
        if not self.services:
            return 100.0
        total = 0.0
        for svc in self.services.values():
            total += svc._get_health_pct()
        return total / len(self.services)

    def get_service_statuses(self):
        """Return a status object for every service."""
        return [s.to_status() for s in self.services.values()]

    def get_pipeline_status(self):
        """Build a ``PipelineStatus`` from the engine's pipeline fields."""
        return PipelineStatus(
            stage=self.pipeline_stage,
            triggered_by=self.triggered_by,
            started_at=self.started_at,
            commit_sha=self.commit_sha,
            build_logs_snippet=self.build_logs if self.build_logs else None,
            test_pass_count=self.test_pass,
            test_fail_count=self.test_fail,
        )

    def get_migration_status(self):
        """Build a ``MigrationStatus``; list fields are copies."""
        return MigrationStatus(
            pending_migrations=list(self.migrations_pending),
            last_applied=self.migrations_applied[-1] if self.migrations_applied else None,
            # Copy so callers cannot mutate engine state, like pending above.
            migration_errors=list(self.migration_errors) if self.migration_errors else None,
        )

    def get_alerts(self):
        """Return a copy of the alert list."""
        return list(self.alerts)

    def get_service_names(self):
        """Return service names in registration order."""
        return list(self.services.keys())

    def has_services(self):
        """Return True when at least one service is registered."""
        return len(self.services) > 0

    def has_pending_migrations(self):
        """Return True when at least one migration is pending."""
        return len(self.migrations_pending) > 0

    def _apply_time_pressure(self):
        """Worsen task-specific services each step while an incident persists."""
        task = self.scenario.task_name

        if task == "judgment_call":
            api_gw = self.services.get("api-gateway")
            if api_gw and api_gw.health == ServiceHealth.DEGRADED:
                degrade_lat = 80 * self._rng.uniform(0.8, 1.2)
                degrade_err = 0.8 * self._rng.uniform(0.9, 1.1)
                api_gw.latency_ms = round(min(api_gw.latency_ms + degrade_lat, 5000), 1)
                api_gw.error_rate = round(min(api_gw.error_rate + degrade_err, 50.0), 2)
                api_gw.cpu_percent = min(api_gw.cpu_percent + 1, 99)
                api_gw.logs.append(
                    f"[DEGRADING] api-gateway latency now {api_gw.latency_ms:.0f}ms, "
                    f"errors {api_gw.error_rate:.1f}/s — situation worsening"
                )

        elif task == "broken_pipeline":
            # Cache-service degrades if config error persists
            cache = self.services.get("cache-service")
            if cache and self.scenario.check_config_error("cache-service", cache.config):
                health_drop = 3.0 * self._rng.uniform(0.8, 1.2)
                cache.error_rate = round(min(cache.error_rate + health_drop * 0.5, 25.0), 2)
                cache.latency_ms = round(min(cache.latency_ms + 30.0 * self._rng.uniform(0.8, 1.2), 2000.0), 1)
                if cache.error_rate > 3.0 and cache.health == ServiceHealth.HEALTHY:
                    cache.health = ServiceHealth.DEGRADED
                cache.logs.append(
                    f"[DEGRADING] cache-service using staging Redis — "
                    f"error_rate now {cache.error_rate:.1f}/s, "
                    f"latency {cache.latency_ms:.0f}ms"
                )

            # Api-gateway latency increases if migration not applied
            api_gw = self.services.get("api-gateway")
            if api_gw and "add_index_users_email" in self.migrations_pending:
                lat_increase = 50.0 * self._rng.uniform(0.8, 1.2)
                api_gw.latency_ms = round(min(api_gw.latency_ms + lat_increase, 2000.0), 1)
                api_gw.logs.append(
                    f"[DEGRADING] api-gateway missing index — "
                    f"user query latency now {api_gw.latency_ms:.0f}ms"
                )

        elif task == "capacity_crisis":
            db = self.services.get("database-primary")
            api_gw = self.services.get("api-gateway")
            # Time pressure only while the database config error persists;
            # the check is evaluated once and reused for both services.
            bottleneck = bool(db) and self.scenario.check_config_error("database-primary", db.config)
            if bottleneck:
                db.cpu_percent = min(db.cpu_percent + 2, 99)
                db.latency_ms = round(db.latency_ms + 15, 1)
                if api_gw and api_gw.health == ServiceHealth.DEGRADED:
                    api_gw.latency_ms = round(min(api_gw.latency_ms + 30, 5000), 1)
                    api_gw.error_rate = round(min(api_gw.error_rate + 0.5, 50.0), 2)

        elif task == "random_incident":
            failing = getattr(self.scenario, 'failing_service', None)
            if failing:
                svc = self.services.get(failing)
                if svc and svc.health == ServiceHealth.DEGRADED:
                    svc.error_rate = round(min(svc.error_rate + 0.5, 50.0), 2)
                    svc.latency_ms = round(min(svc.latency_ms + 30, 5000), 1)
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Simulation engine for the DevOps Pipeline Environment."""
8
+
9
+ import random
10
+
11
+ from devops_pipeline_env.models import (
12
+ ActionType,
13
+ AlertInfo,
14
+ ConfigEdit,
15
+ MigrationStatus,
16
+ PipelineAction,
17
+ PipelineStage,
18
+ PipelineStatus,
19
+ ServiceHealth,
20
+ ServiceStatus,
21
+ )
22
+
23
+
24
class ServiceState:
    """State machine for a single simulated microservice.

    Tracks the service's version, health, configuration, runtime metrics
    (latency, error rate, CPU, memory) and its progress through the
    staging -> production deployment flow.  All randomness is drawn from
    ``self._rng`` so that a fixed seed reproduces the same episode.
    """

    def __init__(self, name, version, health, config, dependencies,
                 latency_ms, error_rate, cpu, memory, rng=None):
        """Create a service.

        Args:
            name: Service identifier.
            version: Version currently running.
            health: Initial ``ServiceHealth`` value.
            config: Mapping of config keys to values (copied; ``None`` is
                treated as an empty config).
            dependencies: Names of upstream services (copied; may be None).
            latency_ms: Initial request latency in milliseconds.
            error_rate: Initial error rate.
            cpu: Initial CPU utilisation percentage.
            memory: Initial memory utilisation percentage.
            rng: Optional ``random.Random``; defaults to ``Random(0)``.
        """
        self.name = name
        self.current_version = version
        self.target_version = None
        self.health = health
        self.config = dict(config) if config else {}
        self.dependencies = list(dependencies) if dependencies else []
        self.latency_ms = latency_ms
        self.error_rate = error_rate
        self.cpu_percent = cpu
        self.memory_percent = memory
        self.active_connections = 100
        self.staging_deployed = False
        self.staging_verified = False
        self.prod_deployed = False
        self.last_deploy_timestamp = "2026-04-01T00:00:00Z"
        self.logs = []
        self._rng = rng if rng is not None else random.Random(0)
        # Task name is injected by PipelineEngine after scenario setup;
        # initialise it here so methods never need hasattr() probing.
        self._task_name = None
        # True right after a transient staging failure, so that the
        # advertised retry is not hit by a second transient failure.
        self._staging_retry_pending = False
        # Staged health recovery: 0 = fully recovered, >0 = still recovering
        self._recovery_steps_remaining = 0
        self._recovery_target_latency = 0.0
        self._recovery_target_error_rate = 0.0

    def deploy_to_staging(self, version, scenario):
        """Deploy ``version`` to staging and return a result message.

        Outcomes:
          * transient failure (8% roll on a healthy, unverified service,
            never for the ``clean_deploy`` task): staging is left
            un-deployed so that a retry re-runs staging;
          * config error reported by ``scenario.check_config_error``:
            service becomes DEGRADED with elevated metrics;
          * otherwise staging is verified and the service is HEALTHY.
        """
        self.staging_deployed = True
        self.target_version = version

        # Always consume one RNG value so the random stream stays aligned
        # regardless of which branch is taken.
        transient_roll = self._rng.random()
        is_clean_deploy = self._task_name == "clean_deploy"
        is_retry = self._staging_retry_pending
        self._staging_retry_pending = False
        if (not is_clean_deploy
                and not is_retry
                and not self.staging_verified
                and self.health == ServiceHealth.HEALTHY
                and transient_roll < 0.08):
            # The message tells the caller to deploy again.  Keep the
            # service out of the "staging deployed" state so that the retry
            # goes through staging instead of being promoted to production
            # without verification.
            self.staging_deployed = False
            self._staging_retry_pending = True
            self.logs.append(
                f"[DEPLOY] Deployed {self.name} {version} to staging. "
                f"Transient failure: health check timed out. Retry should succeed."
            )
            return (
                f"Deployed {self.name} {version} to staging. "
                f"WARNING: Transient health check timeout. Try deploying again."
            )

        if scenario.check_config_error(self.name, self.config):
            self.health = ServiceHealth.DEGRADED
            lat_mult = self._rng.uniform(0.8, 1.2)
            err_mult = self._rng.uniform(0.9, 1.1)
            self.error_rate = round(12.0 * err_mult, 2)
            self.latency_ms = round(300.0 * lat_mult, 1)
            self.logs.append(
                f"[DEPLOY] Deployed {self.name} {version} to staging. "
                f"WARNING: Health check DEGRADED. Error rate elevated "
                f"({self.error_rate:.1f}/s, latency {self.latency_ms:.0f}ms)."
            )
            return (
                f"Deployed {self.name} {version} to staging. "
                f"WARNING: Health check degraded. Error rate elevated."
            )

        self.health = ServiceHealth.HEALTHY
        self.staging_verified = True
        lat_mult = self._rng.uniform(0.8, 1.2)
        self.error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
        self.latency_ms = round(45.0 * lat_mult, 1)
        self.logs.append(
            f"[DEPLOY] Deployed {self.name} {version} to staging. Health check: PASSED."
        )
        return (
            f"Deployed {self.name} {version} to STAGING. Staging verified. "
            f"Deploy same service+version again to PROMOTE TO PRODUCTION."
        )

    def deploy_to_production(self, version):
        """Promote ``version`` to production and return a result message.

        Without prior staging verification the service is marked DEGRADED
        and is not recorded as deployed.  Otherwise the service starts a
        1-3 step staged recovery towards target metrics whose quality is
        decided by a random roll (clean / minor issues / unstable).
        """
        if not self.staging_verified:
            self.health = ServiceHealth.DEGRADED
            lat_mult = self._rng.uniform(0.8, 1.2)
            err_mult = self._rng.uniform(0.9, 1.1)
            self.error_rate = round(25.0 * err_mult, 2)
            self.latency_ms = round(500.0 * lat_mult, 1)
            self.logs.append(
                f"[DEPLOY] Deployed {self.name} {version} to production "
                f"WITHOUT staging verification. High risk."
            )
            return (
                f"Deployed {self.name} {version} to production "
                f"WITHOUT staging verification. High risk."
            )

        self.prod_deployed = True
        self.current_version = version
        # Staged recovery: takes 1-3 steps to fully stabilize
        recovery_steps = self._rng.randint(1, 3)
        self._recovery_steps_remaining = recovery_steps
        base_latency = 45.0 * self._rng.uniform(0.8, 1.2)
        base_error_rate = 0.1 * self._rng.uniform(0.9, 1.1)

        # Non-linear deploy quality: same seed = same outcome
        quality_roll = self._rng.random()
        deploy_note = ""
        if quality_roll < 0.7:
            # Clean deploy — base values are already good.
            pass
        elif quality_roll < 0.9:
            # Minor issues — recovers to good but not perfect
            base_latency *= 1.5
            base_error_rate *= 3.0
            deploy_note = " Minor post-deploy issues detected."
            self.logs.append(
                f"[DEPLOY] {self.name}: Minor post-deploy issues detected. "
                f"Performance slightly below optimal."
            )
        else:
            # Unstable deploy — recovers poorly.  (error_rate is assigned
            # from base_error_rate below, so it is not adjusted here.)
            base_latency *= 2.5
            base_error_rate *= 8.0
            deploy_note = " Post-deploy instability detected."
            self.logs.append(
                f"[DEPLOY] {self.name}: Post-deploy instability detected. "
                f"Elevated error rate."
            )

        self._recovery_target_latency = round(base_latency, 1)
        self._recovery_target_error_rate = round(base_error_rate, 3)
        # Start at slightly elevated values during recovery
        self.health = ServiceHealth.HEALTHY
        self.latency_ms = round(base_latency * (1.0 + 0.3 * recovery_steps), 1)
        self.error_rate = round(base_error_rate * (1.0 + 0.5 * recovery_steps), 3)
        # Trade-off: deploy causes temporary CPU/latency spike (warmup load).
        # Clean deploy tasks get reduced spikes.
        if self._task_name == "clean_deploy":
            self.cpu_percent = min(self.cpu_percent + 3, 99)
            self.latency_ms += round(30 * self._rng.uniform(0.8, 1.2), 1)
        else:
            self.cpu_percent = min(self.cpu_percent + 15, 99)
            self.latency_ms += round(200 * self._rng.uniform(0.8, 1.2), 1)
        self.last_deploy_timestamp = "2026-04-01T12:00:00Z"
        self.logs.append(
            f"[DEPLOY] Promoted {self.name} {version} to production. Health: HEALTHY. "
            f"Stabilizing over ~{recovery_steps} step(s). CPU/latency spike from warmup."
        )
        return (
            f"Promoted {self.name} {version} to production. Health: HEALTHY. "
            f"Deployed successfully. Service under warmup load — temporary CPU/latency spike expected."
            f"{deploy_note}"
        )

    def tick_recovery(self):
        """Advance staged recovery by one step (no-op when not recovering)."""
        if self._recovery_steps_remaining <= 0:
            return
        self._recovery_steps_remaining -= 1
        if self._recovery_steps_remaining == 0:
            # Fully recovered: snap metrics to their targets.
            self.latency_ms = self._recovery_target_latency
            self.error_rate = self._recovery_target_error_rate
            if self.health == ServiceHealth.DEGRADED and self.error_rate < 5.0:
                self.health = ServiceHealth.HEALTHY
        else:
            # Move a fraction 1/(remaining + 1) of the way to the target.
            remaining = self._recovery_steps_remaining
            progress = 1.0 - (remaining / (remaining + 1))
            self.latency_ms = round(
                self.latency_ms + (self._recovery_target_latency - self.latency_ms) * progress, 1
            )
            self.error_rate = round(
                self.error_rate + (self._recovery_target_error_rate - self.error_rate) * progress, 3
            )

    def rollback(self):
        """Roll the service back and return a result message.

        Resets staging state and metrics to healthy values; with 25%
        probability the rollback adds 3.0 to the error rate (regression).
        NOTE(review): no previous version is tracked, so
        ``current_version`` is left unchanged — confirm this is intended.
        """
        self.health = ServiceHealth.HEALTHY
        lat_mult = self._rng.uniform(0.8, 1.2)
        err_mult = self._rng.uniform(0.9, 1.1)
        self.error_rate = round(0.5 * err_mult, 3)
        self.latency_ms = round(50.0 * lat_mult * 0.7, 1)
        self.staging_deployed = False
        self.staging_verified = False
        self.prod_deployed = True  # still in prod, just rolled back
        self._recovery_steps_remaining = 0
        # Trade-off: 25% chance rollback reintroduces a known bug
        regression = self._rng.random() < 0.25
        if regression:
            self.error_rate = round(self.error_rate + 3.0, 2)
            self.logs.append(
                f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. "
                f"Warning: rollback may have reintroduced known issue from previous version"
            )
        else:
            self.logs.append(
                f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. Service healthy."
            )
        result = (
            f"Rolled back {self.name} to {self.current_version}. "
            f"Rolled back. Monitoring for regression..."
        )
        if regression:
            result += f" WARNING: Error rate elevated ({self.error_rate:.1f}/s) — possible regression."
        return result

    def set_config(self, key, value):
        """Set one config value; models a restart spike and returns a message."""
        old = self.config.get(key, "<not set>")
        self.config[key] = value
        # Trade-off: config change causes brief restart spike
        self.latency_ms += round(100 * self._rng.uniform(0.8, 1.2), 1)
        self.cpu_percent = min(self.cpu_percent + 5, 99)
        self.logs.append(
            f"[CONFIG] {self.name}: {key} changed from '{old}' to '{value}'. Service restarting."
        )
        return (
            f"Config {self.name}: {key} changed from '{old}' to '{value}'. "
            f"Config updated. Service restarting — brief latency spike."
        )

    def get_config_snapshot(self):
        """Return a shallow copy of the current config."""
        return dict(self.config)

    def get_logs(self):
        """Return a copy of the log lines recorded so far."""
        return list(self.logs)

    def _get_health_pct(self):
        """Return a 0-100 health score.

        Starts from 100 (HEALTHY), 50 (DEGRADED) or 0 (DOWN), then subtracts
        up to 30 points for error rate and up to 30 for latency above 200ms.
        """
        h = 100.0
        if self.health == ServiceHealth.DOWN:
            h = 0.0
        elif self.health == ServiceHealth.DEGRADED:
            h = 50.0
        h -= min(self.error_rate * 2, 30)
        if self.latency_ms > 200:
            h -= min((self.latency_ms - 200) / 10, 30)
        return max(h, 0.0)

    def to_status(self):
        """Build the ``ServiceStatus`` model describing this service."""
        return ServiceStatus(
            name=self.name,
            health=self.health,
            current_version=self.current_version,
            cpu_percent=self.cpu_percent,
            memory_percent=self.memory_percent,
            error_rate=self.error_rate,
            request_latency_ms=self.latency_ms,
            active_connections=self.active_connections,
            last_deploy_timestamp=self.last_deploy_timestamp,
        )
260
+
261
+
262
class PipelineEngine:
    """Manages all services, pipeline state, migrations and alerts.

    The engine owns a single seeded ``random.Random`` that is shared with
    every service so that an episode is reproducible for a given seed.
    """

    def __init__(self, scenario, seed):
        """Build the engine and let ``scenario.setup`` populate it.

        Args:
            scenario: Scenario object; this class uses its ``setup``,
                ``task_name``, ``check_config_error``, ``run_migration``,
                ``migration_blocks_deploy`` and the optional hooks
                ``on_prod_deploy`` / ``on_rollback`` / ``failing_service``.
            seed: Seed for the shared random number generator.
        """
        self.scenario = scenario
        self._rng = random.Random(seed)
        self.services = {}
        self.pipeline_stage = PipelineStage.IDLE
        self.migrations_pending = []
        self.migrations_applied = []
        self.migration_errors = []
        self.alerts = []
        self.commit_sha = "abc123"
        self.triggered_by = "deploy-bot"
        self.started_at = "2026-04-01T10:00:00Z"
        self.test_pass = 0
        self.test_fail = 0
        self.build_logs = ""
        self._time_pressure = False  # Set by scenario if needed

        # Initialize from scenario
        scenario.setup(self)

        # Inject the shared RNG and task name into all services created by the scenario
        for svc in self.services.values():
            svc._rng = self._rng
            svc._task_name = scenario.task_name

    @staticmethod
    def _bump_capped(current, delta, cap):
        """Raise ``current`` by ``delta`` up to ``cap`` without ever lowering it.

        A plain ``min(current + delta, cap)`` would *reduce* a metric that
        another effect had already pushed above ``cap``.
        """
        return max(current, min(current + delta, cap))

    def execute(self, action):
        """Execute an action and return a human-readable result string.

        Order per step: tick staged recovery, run the action, then apply
        environmental effects (time pressure, cascades, compounding,
        tipping points) so the next observation shows the consequences.
        """
        # 1. Tick health recovery for all services (heal from previous deploys)
        for svc in self.services.values():
            svc.tick_recovery()

        # 2. Execute the agent's action
        if action.action_type == ActionType.VIEW_PIPELINE:
            result = self._view_pipeline()
        elif action.action_type == ActionType.VIEW_LOGS:
            result = self._view_logs(action.service_name)
        elif action.action_type == ActionType.VIEW_CONFIG:
            result = self._view_config(action.service_name)
        elif action.action_type == ActionType.EDIT_CONFIG:
            result = self._edit_config(action.service_name, action.config_edits)
        elif action.action_type == ActionType.RUN_MIGRATION:
            result = self._run_migration(action.migration_name, action.migration_type)
        elif action.action_type == ActionType.DEPLOY:
            result = self._deploy(action.service_name, action.target_version)
        elif action.action_type == ActionType.ROLLBACK:
            result = self._rollback(action.service_name)
        elif action.action_type == ActionType.APPROVE:
            result = self._approve(action.reason)
        elif action.action_type == ActionType.ABORT:
            result = self._abort(action.reason)
        else:
            result = "Unknown action."

        # 3. Environmental effects AFTER action (agent sees consequences)
        if self._time_pressure:
            self._apply_time_pressure()
        self._tick_cascading_effects()
        self._tick_metric_compounding()
        self._tick_tipping_points()

        return result

    # --- Cross-metric compounding ---------------------------------------------

    def _tick_metric_compounding(self):
        """Let metrics feed on each other: spirals when bad, recovery when good.

        Skipped entirely for the ``clean_deploy`` task.
        """
        if self.scenario.task_name == "clean_deploy":
            return
        for svc in self.services.values():
            # Degradation spirals (moderate — should not kill episodes in <5 steps)
            if svc.error_rate > 15.0:
                svc.cpu_percent = min(svc.cpu_percent + 3, 99)
            if svc.cpu_percent > 90:
                svc.latency_ms = round(min(svc.latency_ms + 100, 5000), 1)
            if svc.latency_ms > 3000:
                svc.error_rate = round(min(svc.error_rate + 1.0, 50.0), 2)

            # Natural recovery (when metrics are good, they help each other)
            if svc.error_rate < 2.0:
                svc.cpu_percent = max(svc.cpu_percent - 3, 10)
            if svc.cpu_percent < 50:
                svc.latency_ms = round(max(svc.latency_ms - 50, 20), 1)
            if svc.latency_ms < 200 and svc.error_rate < 1.0:
                svc.error_rate = round(max(svc.error_rate - 0.5, 0.0), 2)

    # --- Non-linear tipping points -------------------------------------------

    def _tick_tipping_points(self):
        """Apply threshold effects so systems cliff instead of degrading linearly.

        Skipped entirely for the ``clean_deploy`` task.
        """
        if self.scenario.task_name == "clean_deploy":
            return
        for svc in self.services.values():
            # CPU cliff: above 85% the error rate grows with the overflow
            if svc.cpu_percent > 85:
                overflow = svc.cpu_percent - 85
                svc.error_rate = round(min(svc.error_rate + overflow * 0.2, 50.0), 2)

            # Latency cliff: above 2000ms = rapid collapse
            if svc.latency_ms > 2000:
                svc.error_rate = round(min(svc.error_rate + 3.0, 50.0), 2)

            # Health cliff: below 30% health = accelerating death spiral
            if svc._get_health_pct() < 30:
                svc.error_rate = round(min(svc.error_rate * 1.3, 50.0), 2)

            # Latency → CPU feedback (high latency = retries = more CPU)
            if svc.latency_ms > 1500:
                svc.cpu_percent = min(svc.cpu_percent + 3, 99)

    # --- Cascading failures ---------------------------------------------------

    def _get_dependents(self, service_name):
        """Return all services that list ``service_name`` in their dependencies."""
        return [
            svc for svc in self.services.values()
            if service_name in svc.dependencies
        ]

    def _tick_cascading_effects(self):
        """Unhealthy services degrade their dependents; healthy ones help them."""
        for svc in self.services.values():
            health_pct = svc._get_health_pct()
            if health_pct >= 50.0:
                continue  # healthy enough, no cascade

            for dep in self._get_dependents(svc.name):
                if dep.health == ServiceHealth.DOWN:
                    continue  # already down, can't get worse from cascade

                # Determine cascade severity
                if health_pct < 20.0:
                    # Source is effectively down — moderate cascade
                    err_increase = 1.5
                    lat_increase = 30.0
                else:
                    # Source is degraded — lighter cascade
                    err_increase = 0.5
                    lat_increase = 10.0

                old_err = dep.error_rate
                dep.error_rate = round(
                    self._bump_capped(dep.error_rate, err_increase, 45.0), 2
                )
                dep.latency_ms = round(
                    self._bump_capped(dep.latency_ms, lat_increase, 4500.0), 1
                )

                # If error rate gets high enough, mark as degraded
                if dep.error_rate > 5.0 and dep.health == ServiceHealth.HEALTHY:
                    dep.health = ServiceHealth.DEGRADED

                # Floor: cascading alone can't push health below 5%
                # (prevent instant death spirals)
                if dep._get_health_pct() < 5.0:
                    dep.error_rate = round(
                        max(old_err, dep.error_rate - err_increase + 1.0), 2
                    )

                # Add a cascade alert once per source->dependent pair; the
                # key embedded in the message is what de-duplicates it.
                cascade_alert_key = f"cascade:{svc.name}->{dep.name}"
                if not any(cascade_alert_key in a.message for a in self.alerts):
                    self.alerts.append(AlertInfo(
                        severity="warning",
                        message=(
                            f"Cascading: {svc.name} (health {health_pct:.0f}%) is degrading "
                            f"{dep.name} — error_rate +{err_increase}/s, latency +{lat_increase:.0f}ms "
                            f"[{cascade_alert_key}]"
                        ),
                        service_name=dep.name,
                        timestamp="2026-04-01T12:00:00Z",
                    ))

                dep.logs.append(
                    f"[CASCADE] Upstream {svc.name} unhealthy (health {health_pct:.0f}%) — "
                    f"{dep.name} error_rate now {dep.error_rate:.1f}/s, "
                    f"latency {dep.latency_ms:.0f}ms"
                )

        # Recovery propagation: healthy services help their dependents recover
        for svc in self.services.values():
            if svc.health == ServiceHealth.HEALTHY and svc.error_rate < 2.0:
                for dep in self._get_dependents(svc.name):
                    if dep.health == ServiceHealth.DEGRADED:
                        dep.error_rate = round(dep.error_rate * 0.9, 2)
                        dep.latency_ms = round(dep.latency_ms * 0.9, 1)

    # --- Action handlers ------------------------------------------------------

    def _view_pipeline(self):
        """Return a text summary of the pipeline and every service."""
        services_summary = "\n".join(
            f"  {s.name}: {s.health.value} | v{s.current_version} -> "
            f"v{s.target_version or 'N/A'} | "
            f"latency={s.latency_ms:.0f}ms | errors={s.error_rate:.1f}/s"
            for s in self.services.values()
        )
        return (
            f"Pipeline Stage: {self.pipeline_stage.value}\n"
            f"Commit: {self.commit_sha}\n"
            f"Tests: {self.test_pass} passed, {self.test_fail} failed\n"
            f"Pending Migrations: {len(self.migrations_pending)}\n"
            f"Services:\n{services_summary}"
        )

    def _view_logs(self, service_name):
        """Return the last 20 log lines of a service, or an explanatory message."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        logs = svc.get_logs()
        if not logs:
            return f"No logs available for {service_name}."
        return f"Logs for {service_name}:\n" + "\n".join(logs[-20:])

    def _view_config(self, service_name):
        """Return the service config rendered as ``key = value`` lines."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        config = svc.get_config_snapshot()
        lines = [f"  {k} = {v}" for k, v in config.items()]
        return f"Config for {service_name}:\n" + "\n".join(lines)

    def _edit_config(self, service_name, edits):
        """Apply config edits and start staged recovery if they fix the error.

        Args:
            service_name: Target service.
            edits: Iterable of objects with ``key`` and ``value``; ``None``
                or empty yields an explanatory message instead of raising.
        """
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        if not edits:
            return f"No config edits provided for {service_name}."

        results = [svc.set_config(edit.key, edit.value) for edit in edits]

        # If the config error is now fixed and service was degraded, start
        # staged recovery (2 steps) instead of instant heal.  Evaluated once
        # after all edits so a multi-key fix is not applied repeatedly.
        if (svc.health == ServiceHealth.DEGRADED
                and not self.scenario.check_config_error(service_name, svc.config)):
            svc.staging_deployed = False
            svc.staging_verified = False
            # Immediate PARTIAL improvement
            svc.error_rate = round(svc.error_rate * 0.5, 2)
            svc.latency_ms = round(svc.latency_ms * 0.6, 1)
            # Set up 2-step recovery to full health (reuse tick_recovery pattern)
            svc._recovery_steps_remaining = 2
            svc._recovery_target_latency = round(50.0 * self._rng.uniform(0.8, 1.2), 1)
            svc._recovery_target_error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
            # Health is not set to HEALTHY here — tick_recovery does that
            # once the recovery completes.
            results.append(
                f"Config fix detected for {service_name}. Service improving — "
                f"full recovery in ~2 steps. Ready for re-deploy."
            )
        return "\n".join(results)

    def _run_migration(self, migration_name, migration_type):
        """Run a pending migration via the scenario and record the outcome.

        ``migration_type`` is accepted for interface compatibility and is
        not used by this method.
        """
        if migration_name not in self.migrations_pending:
            return (
                f"Migration '{migration_name}' not found in pending: "
                f"{self.migrations_pending}"
            )
        if self.scenario.run_migration(self, migration_name):
            self.migrations_pending.remove(migration_name)
            self.migrations_applied.append(migration_name)
            return f"Migration '{migration_name}' applied successfully."
        error = f"Migration '{migration_name}' FAILED."
        self.migration_errors.append(error)
        return error

    def _deploy(self, service_name, target_version):
        """Deploy to staging first, then to production on the next call."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"

        # Check migration dependencies
        if self.migrations_pending and self.scenario.migration_blocks_deploy(service_name):
            return (
                f"BLOCKED: Pending migrations must be applied before deploying "
                f"{service_name}. Pending: {self.migrations_pending}"
            )

        # Check if any dependency is unhealthy — 50% chance of deploy failure
        for dep_name in svc.dependencies:
            dep_svc = self.services.get(dep_name)
            if dep_svc and dep_svc._get_health_pct() < 50.0:
                if self._rng.random() < 0.5:
                    svc.logs.append(
                        f"[DEPLOY] Deploy {svc.name} {target_version} FAILED — "
                        f"dependency {dep_name} is unhealthy "
                        f"(health {dep_svc._get_health_pct():.0f}%). Retry may succeed."
                    )
                    return (
                        f"DEPLOY UNSTABLE: Dependency {dep_name} is unhealthy "
                        f"(health {dep_svc._get_health_pct():.0f}%). "
                        f"Deploy of {service_name} failed. Retry may succeed."
                    )

        # Determine target environment
        if not svc.staging_deployed:
            self.pipeline_stage = PipelineStage.STAGING
            return svc.deploy_to_staging(target_version, self.scenario)

        self.pipeline_stage = PipelineStage.DEPLOYING
        result = svc.deploy_to_production(target_version)
        # Notify scenario of deploy (for cascading effects)
        if hasattr(self.scenario, 'on_prod_deploy'):
            extra = self.scenario.on_prod_deploy(self, service_name, target_version)
            if extra:
                result += "\n" + extra
        # Check if all target services deployed
        if all(s.prod_deployed for s in self.services.values() if s.target_version):
            self.pipeline_stage = PipelineStage.DEPLOYED
        return result

    def _rollback(self, service_name):
        """Roll a service back and penalise the services that depend on it."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        self.pipeline_stage = PipelineStage.ROLLED_BACK

        # Capture the version and dependents before the rollback runs
        old_version = svc.current_version
        dependents = self._get_dependents(service_name)
        result = svc.rollback()

        # Warn about dependent services and increase their error rates
        for dep in dependents:
            dep.error_rate = round(dep.error_rate + 5.0, 2)
            if dep.health == ServiceHealth.HEALTHY and dep.error_rate > 3.0:
                dep.health = ServiceHealth.DEGRADED
            self.alerts.append(AlertInfo(
                severity="warning",
                message=(
                    f"Rollback impact: {dep.name} depends on {service_name} "
                    f"{old_version}. Rollback may break {dep.name}. "
                    f"Error rate increased to {dep.error_rate:.1f}/s."
                ),
                service_name=dep.name,
                timestamp="2026-04-01T12:00:00Z",
            ))
            dep.logs.append(
                f"[ROLLBACK-IMPACT] {service_name} rolled back from {old_version} — "
                f"{dep.name} error_rate increased to {dep.error_rate:.1f}/s. "
                f"Dependency on {old_version} APIs may be broken."
            )

        if hasattr(self.scenario, 'on_rollback'):
            self.scenario.on_rollback(self, service_name)
        return result

    def _approve(self, reason):
        """Mark the pipeline as DEPLOYED."""
        self.pipeline_stage = PipelineStage.DEPLOYED
        return f"Deployment APPROVED. Reason: {reason or 'No reason given.'}"

    def _abort(self, reason):
        """Mark the pipeline as FAILED."""
        self.pipeline_stage = PipelineStage.FAILED
        return f"Deployment ABORTED. Reason: {reason or 'No reason given.'}"

    # --- State queries --------------------------------------------------------

    def snapshot(self):
        """Capture current state (plain dict) for reward calculation."""
        return {
            "services": {
                name: {
                    "health": s.health.value,
                    "error_rate": s.error_rate,
                    "latency_ms": s.latency_ms,
                    "prod_deployed": s.prod_deployed,
                    "staging_verified": s.staging_verified,
                    "config": dict(s.config),
                }
                for name, s in self.services.items()
            },
            "system_health": self.get_system_health(),
            "pipeline_stage": self.pipeline_stage.value,
            "migrations_pending": list(self.migrations_pending),
            "alerts": list(self.alerts),
        }

    def get_system_health(self):
        """Return the mean service health (0-100); 100.0 when there are no services."""
        if not self.services:
            return 100.0
        total = sum(svc._get_health_pct() for svc in self.services.values())
        return total / len(self.services)

    def get_service_statuses(self):
        """Return a status model for every service."""
        return [s.to_status() for s in self.services.values()]

    def get_pipeline_status(self):
        """Build the ``PipelineStatus`` model for the current pipeline."""
        return PipelineStatus(
            stage=self.pipeline_stage,
            triggered_by=self.triggered_by,
            started_at=self.started_at,
            commit_sha=self.commit_sha,
            build_logs_snippet=self.build_logs if self.build_logs else None,
            test_pass_count=self.test_pass,
            test_fail_count=self.test_fail,
        )

    def get_migration_status(self):
        """Build the ``MigrationStatus`` model (lists are copied)."""
        return MigrationStatus(
            pending_migrations=list(self.migrations_pending),
            last_applied=self.migrations_applied[-1] if self.migrations_applied else None,
            migration_errors=list(self.migration_errors) if self.migration_errors else None,
        )

    def get_alerts(self):
        """Return a copy of the alert list."""
        return list(self.alerts)

    def get_service_names(self):
        """Return the names of all services."""
        return list(self.services.keys())

    def has_services(self):
        """Return True when at least one service exists."""
        return len(self.services) > 0

    def has_pending_migrations(self):
        """Return True when at least one migration is pending."""
        return len(self.migrations_pending) > 0

    def _apply_time_pressure(self):
        """During incidents, make the affected services worse each step.

        The affected service(s) and the rates depend on
        ``scenario.task_name``; tasks not listed here are unaffected.
        """
        task = self.scenario.task_name

        if task == "judgment_call":
            api_gw = self.services.get("api-gateway")
            if api_gw and api_gw.health == ServiceHealth.DEGRADED:
                degrade_lat = 80 * self._rng.uniform(0.8, 1.2)
                degrade_err = 0.8 * self._rng.uniform(0.9, 1.1)
                api_gw.latency_ms = round(min(api_gw.latency_ms + degrade_lat, 5000), 1)
                api_gw.error_rate = round(min(api_gw.error_rate + degrade_err, 50.0), 2)
                api_gw.cpu_percent = min(api_gw.cpu_percent + 1, 99)
                api_gw.logs.append(
                    f"[DEGRADING] api-gateway latency now {api_gw.latency_ms:.0f}ms, "
                    f"errors {api_gw.error_rate:.1f}/s — situation worsening"
                )

        elif task == "broken_pipeline":
            # Cache-service degrades if config error persists
            cache = self.services.get("cache-service")
            if cache and self.scenario.check_config_error("cache-service", cache.config):
                health_drop = 3.0 * self._rng.uniform(0.8, 1.2)
                cache.error_rate = round(
                    self._bump_capped(cache.error_rate, health_drop * 0.5, 25.0), 2
                )
                cache.latency_ms = round(
                    self._bump_capped(
                        cache.latency_ms, 30.0 * self._rng.uniform(0.8, 1.2), 2000.0
                    ),
                    1,
                )
                if cache.error_rate > 3.0 and cache.health == ServiceHealth.HEALTHY:
                    cache.health = ServiceHealth.DEGRADED
                cache.logs.append(
                    f"[DEGRADING] cache-service using staging Redis — "
                    f"error_rate now {cache.error_rate:.1f}/s, "
                    f"latency {cache.latency_ms:.0f}ms"
                )

            # Api-gateway latency increases if migration not applied
            api_gw = self.services.get("api-gateway")
            if api_gw and "add_index_users_email" in self.migrations_pending:
                lat_increase = 50.0 * self._rng.uniform(0.8, 1.2)
                api_gw.latency_ms = round(
                    self._bump_capped(api_gw.latency_ms, lat_increase, 2000.0), 1
                )
                api_gw.logs.append(
                    f"[DEGRADING] api-gateway missing index — "
                    f"user query latency now {api_gw.latency_ms:.0f}ms"
                )

        elif task == "capacity_crisis":
            db = self.services.get("database-primary")
            api_gw = self.services.get("api-gateway")
            # Time pressure only while connection pool bottleneck persists
            bottleneck = bool(
                db and self.scenario.check_config_error("database-primary", db.config)
            )
            if bottleneck:
                db.cpu_percent = min(db.cpu_percent + 2, 99)
                db.latency_ms = round(db.latency_ms + 15, 1)
                # api-gateway degrades only while db bottleneck persists
                if api_gw and api_gw.health == ServiceHealth.DEGRADED:
                    api_gw.latency_ms = round(min(api_gw.latency_ms + 30, 5000), 1)
                    api_gw.error_rate = round(min(api_gw.error_rate + 0.5, 50.0), 2)

        elif task == "random_incident":
            failing = getattr(self.scenario, 'failing_service', None)
            if failing:
                svc = self.services.get(failing)
                if svc and svc.health == ServiceHealth.DEGRADED:
                    svc.error_rate = round(min(svc.error_rate + 0.5, 50.0), 2)
                    svc.latency_ms = round(min(svc.latency_ms + 30, 5000), 1)
server/pipeline_environment.py CHANGED
@@ -1,351 +1,351 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """DevOps Pipeline Environment Implementation."""
8
-
9
- import os
10
- from uuid import uuid4
11
-
12
- from openenv.core.env_server.interfaces import Environment
13
- from openenv.core.env_server.types import State
14
-
15
- from devops_pipeline_env.models import (
16
- ActionType,
17
- PipelineAction,
18
- PipelineObservation,
19
- ServiceHealth,
20
- ServiceStatus,
21
- )
22
- from server.pipeline_engine import PipelineEngine
23
- from server.rewards import calculate_reward
24
- from server.scenarios import load_scenario
25
-
26
- # Deterministic seeds per task
27
- TASK_SEEDS = {
28
- "clean_deploy": 1001,
29
- "broken_pipeline": 2002,
30
- "judgment_call": 3003,
31
- "cascading_failure": 4004,
32
- "capacity_crisis": 5005,
33
- "random_incident": 6006,
34
- }
35
-
36
- TASK_MAX_STEPS = {
37
- "clean_deploy": 15,
38
- "broken_pipeline": 20,
39
- "judgment_call": 12,
40
- "cascading_failure": 15,
41
- "capacity_crisis": 15,
42
- "random_incident": 15,
43
- }
44
-
45
- # Goal suffixes that hint at investigation without giving away answers
46
- _INVESTIGATION_HINTS = {
47
- "clean_deploy": " Use view_logs and view_config to inspect services before deploying.",
48
- "broken_pipeline": " Investigate service logs and configs to diagnose issues before acting.",
49
- "judgment_call": " Check service logs and configs to understand the incident before deciding.",
50
- "capacity_crisis": " Inspect database-primary logs and config to find the bottleneck.",
51
- "random_incident": " Investigate service logs and config to find the root cause.",
52
- }
53
-
54
-
55
- class PipelineEnvironment(Environment):
56
- """CI/CD Pipeline environment β€” manages microservice deployments."""
57
-
58
- SUPPORTS_CONCURRENT_SESSIONS: bool = False
59
- _register_callback = None # Set by app.py to register active env for /grader
60
-
61
- def __init__(self):
62
- self._state = State(episode_id=str(uuid4()), step_count=0)
63
- self._engine = None
64
- self._task_name = "clean_deploy"
65
- self._max_steps = 15
66
- self._episode_history = []
67
- self._viewed_actions = set()
68
- self._last_action_key = None
69
- self._investigated_services = set() # e.g. "logs:api-gateway", "config:cache-service"
70
-
71
- def reset(self, seed=None, episode_id=None, **kwargs) -> PipelineObservation:
72
- """Initialize a new episode. Task from reset body, env var, or default."""
73
- self._task_name = (
74
- kwargs.get("task")
75
- or os.environ.get("DEVOPS_TASK")
76
- or "clean_deploy"
77
- )
78
- self._state = State(episode_id=str(uuid4()), step_count=0)
79
- self._episode_history = []
80
- self._viewed_actions = set()
81
- self._last_action_key = None
82
- self._investigated_services = set()
83
- if PipelineEnvironment._register_callback:
84
- PipelineEnvironment._register_callback(self)
85
-
86
- seed = TASK_SEEDS.get(self._task_name, 9999)
87
- if self._task_name == "random_incident":
88
- seed = int(os.environ.get("DEVOPS_SEED", str(seed)))
89
- scenario = load_scenario(self._task_name, seed)
90
- self._engine = PipelineEngine(scenario, seed)
91
- self._max_steps = TASK_MAX_STEPS.get(self._task_name, 15)
92
-
93
- return self._build_observation(
94
- last_action_result="Environment reset. Deployment pipeline initialized.",
95
- last_action_error=None,
96
- done=False,
97
- reward=0.0,
98
- )
99
-
100
- def step(self, action: PipelineAction) -> PipelineObservation:
101
- """Execute action, return observation with reward and done."""
102
- self._state.step_count += 1
103
-
104
- prev_state = self._engine.snapshot()
105
-
106
- # Validate action
107
- error = self._validate_action(action)
108
- if error:
109
- self._episode_history.append({
110
- "step": self._state.step_count,
111
- "action": action.model_dump(),
112
- "reward": -0.05,
113
- "error": error,
114
- })
115
- done = self._state.step_count >= self._max_steps
116
- return self._build_observation(
117
- last_action_result=None,
118
- last_action_error=error,
119
- done=done,
120
- reward=-0.05,
121
- )
122
-
123
- # Track investigation actions BEFORE executing (so the observation
124
- # returned from this step already includes the revealed info)
125
- if action.action_type == ActionType.VIEW_LOGS and action.service_name:
126
- self._investigated_services.add(f"logs:{action.service_name}")
127
- elif action.action_type == ActionType.VIEW_CONFIG and action.service_name:
128
- self._investigated_services.add(f"config:{action.service_name}")
129
-
130
- # Execute action
131
- result_text = self._engine.execute(action)
132
-
133
- current_state = self._engine.snapshot()
134
-
135
- # Calculate outcome-based reward
136
- reward = calculate_reward(
137
- prev_state, current_state, action, self._viewed_actions,
138
- last_action_key=self._last_action_key, task_name=self._task_name,
139
- )
140
- self._last_action_key = f"{action.action_type.value}:{action.service_name or ''}"
141
-
142
- # Check episode termination
143
- done = self._check_done(action)
144
-
145
- # Track if we broke a healthy service (for grader)
146
- broke_healthy = False
147
- for name, curr_svc in current_state["services"].items():
148
- prev_svc = prev_state["services"].get(name, {})
149
- if prev_svc.get("health") == "healthy" and curr_svc["health"] in ("degraded", "down"):
150
- broke_healthy = True
151
-
152
- history_entry = {
153
- "step": self._state.step_count,
154
- "action": action.model_dump(),
155
- "reward": reward,
156
- "error": None,
157
- "broke_healthy": broke_healthy,
158
- "system_health": self._engine.get_system_health(),
159
- }
160
-
161
- # Record cache health at deploy time for grader integrity
162
- if action.action_type == ActionType.DEPLOY and action.service_name == "api-gateway":
163
- cache_svc = self._engine.services.get("cache-service")
164
- if cache_svc:
165
- history_entry["cache_health_at_deploy"] = cache_svc.health.value
166
-
167
- self._episode_history.append(history_entry)
168
-
169
- # Include config_snapshot if viewing/editing config
170
- config_snapshot = None
171
- if action.action_type in (ActionType.VIEW_CONFIG, ActionType.EDIT_CONFIG):
172
- svc = self._engine.services.get(action.service_name)
173
- if svc:
174
- config_snapshot = svc.get_config_snapshot()
175
-
176
- return self._build_observation(
177
- last_action_result=result_text,
178
- last_action_error=None,
179
- done=done,
180
- reward=reward,
181
- config_snapshot=config_snapshot,
182
- )
183
-
184
- @property
185
- def state(self) -> State:
186
- return self._state
187
-
188
- def get_episode_history(self):
189
- return self._episode_history
190
-
191
- def get_engine(self):
192
- return self._engine
193
-
194
- def get_task_name(self):
195
- return self._task_name
196
-
197
- def _build_observation(self, last_action_result, last_action_error,
198
- done, reward, config_snapshot=None):
199
- """Build observation from current engine state.
200
-
201
- Partial observability: services show only high-level metrics by default.
202
- CPU, memory are hidden until the agent runs view_logs for that service.
203
- Config is hidden until the agent runs view_config for that service.
204
- """
205
- scenario = self._engine.scenario
206
-
207
- # Build service statuses with partial observability
208
- raw_statuses = self._engine.get_service_statuses()
209
- filtered_statuses = []
210
- for svc in raw_statuses:
211
- investigated = (
212
- f"logs:{svc.name}" in self._investigated_services
213
- or f"config:{svc.name}" in self._investigated_services
214
- )
215
- # Get recovery status from engine state
216
- svc_state = self._engine.services.get(svc.name)
217
- recovery = "stable"
218
- if svc_state and hasattr(svc_state, '_recovery_steps_remaining') and svc_state._recovery_steps_remaining > 0:
219
- recovery = f"stabilizing ({svc_state._recovery_steps_remaining} steps remaining)"
220
- # Mask health for uninvestigated unhealthy services
221
- shown_health = svc.health
222
- if not investigated and svc.health.value != "healthy":
223
- shown_health = ServiceHealth.UNKNOWN
224
- filtered_statuses.append(ServiceStatus(
225
- name=svc.name,
226
- health=shown_health,
227
- current_version=svc.current_version,
228
- # Metrics visible only after investigation
229
- error_rate=svc.error_rate if investigated else 0.0,
230
- request_latency_ms=svc.request_latency_ms if investigated else 0.0,
231
- active_connections=svc.active_connections,
232
- last_deploy_timestamp=svc.last_deploy_timestamp,
233
- # Hidden until view_logs: detailed resource usage
234
- cpu_percent=svc.cpu_percent if investigated else 0.0,
235
- memory_percent=svc.memory_percent if investigated else 0.0,
236
- recovery_status=recovery,
237
- ))
238
-
239
- # Append investigation hint to goal
240
- goal = scenario.goal
241
- hint = _INVESTIGATION_HINTS.get(self._task_name, "")
242
- if hint and not self._investigated_services:
243
- goal = goal + hint
244
-
245
- # Build summary β€” only reveal details for investigated services
246
- alerts = []
247
- uninvestigated_alerts = 0
248
- for name, svc_state in self._engine.services.items():
249
- investigated = (
250
- f"logs:{name}" in self._investigated_services
251
- or f"config:{name}" in self._investigated_services
252
- )
253
- if svc_state.health == ServiceHealth.DOWN:
254
- if investigated:
255
- alerts.append(f"CRITICAL: {name} is DOWN")
256
- else:
257
- uninvestigated_alerts += 1
258
- elif svc_state.health == ServiceHealth.DEGRADED:
259
- if investigated:
260
- alerts.append(
261
- f"WARNING: {name} degraded "
262
- f"(lat={svc_state.latency_ms:.0f}ms, err={svc_state.error_rate:.1f}/s)"
263
- )
264
- else:
265
- uninvestigated_alerts += 1
266
- elif investigated and svc_state.cpu_percent > 80:
267
- alerts.append(f"CAUTION: {name} CPU high ({svc_state.cpu_percent:.0f}%)")
268
- # Recovery status alert β€” inside the loop, for THIS service
269
- if hasattr(svc_state, '_recovery_steps_remaining') and svc_state._recovery_steps_remaining > 0:
270
- alerts.append(f"INFO: {name} recovering β€” stabilizing ({svc_state._recovery_steps_remaining} steps remaining)")
271
- if uninvestigated_alerts > 0:
272
- alerts.append(f"ALERT: {uninvestigated_alerts} service(s) may have issues β€” use view_logs to investigate")
273
- # Add dependency chain hints for investigated degraded services only
274
- for name, svc_state in self._engine.services.items():
275
- investigated = (
276
- f"logs:{name}" in self._investigated_services
277
- or f"config:{name}" in self._investigated_services
278
- )
279
- if investigated and svc_state.health in (ServiceHealth.DEGRADED, ServiceHealth.DOWN):
280
- upstream_issues = [
281
- d for d in svc_state.dependencies
282
- if d in self._engine.services
283
- and self._engine.services[d].health in (ServiceHealth.DEGRADED, ServiceHealth.DOWN)
284
- ]
285
- if upstream_issues:
286
- alerts.append(
287
- f"HINT: {name} depends on {', '.join(upstream_issues)} "
288
- f"(also unhealthy β€” root cause likely upstream)"
289
- )
290
- summary = "; ".join(alerts) if alerts else "All services nominal."
291
-
292
- return PipelineObservation(
293
- task_description=scenario.task_description,
294
- goal=goal,
295
- step_number=self._state.step_count,
296
- max_steps=self._max_steps,
297
- services=filtered_statuses,
298
- pipeline=self._engine.get_pipeline_status(),
299
- migrations=self._engine.get_migration_status(),
300
- active_alerts=self._engine.get_alerts(),
301
- available_actions=self._get_available_actions(),
302
- last_action_result=last_action_result,
303
- last_action_error=last_action_error,
304
- config_snapshot=config_snapshot,
305
- done=done,
306
- reward=reward,
307
- summary=summary,
308
- )
309
-
310
- def _get_available_actions(self):
311
- """Context-sensitive: only show valid actions."""
312
- actions = ["view_pipeline", "view_logs", "approve", "abort"]
313
- if self._engine.has_services():
314
- actions.extend(["view_config", "edit_config", "deploy", "rollback"])
315
- if self._engine.has_pending_migrations():
316
- actions.append("run_migration")
317
- return actions
318
-
319
- def _validate_action(self, action):
320
- """Return error string if action is invalid, None if valid."""
321
- if action.action_type in (
322
- ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG,
323
- ActionType.EDIT_CONFIG, ActionType.DEPLOY,
324
- ActionType.ROLLBACK,
325
- ):
326
- if not action.service_name:
327
- return f"action_type '{action.action_type.value}' requires service_name"
328
- if action.service_name not in self._engine.get_service_names():
329
- return (
330
- f"Unknown service '{action.service_name}'. "
331
- f"Available: {self._engine.get_service_names()}"
332
- )
333
- if action.action_type == ActionType.DEPLOY and not action.target_version:
334
- return "deploy requires target_version"
335
- if action.action_type == ActionType.EDIT_CONFIG and not action.config_edits:
336
- return "edit_config requires config_edits"
337
- if action.action_type == ActionType.RUN_MIGRATION and not action.migration_name:
338
- return "run_migration requires migration_name"
339
- return None
340
-
341
- def _check_done(self, action):
342
- """Episode ends on approve, abort, max steps, or catastrophic failure."""
343
- if action.action_type == ActionType.APPROVE:
344
- return True
345
- if action.action_type == ActionType.ABORT:
346
- return True
347
- if self._state.step_count >= self._max_steps:
348
- return True
349
- if self._engine.get_system_health() < 20.0:
350
- return True
351
- return False
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """DevOps Pipeline Environment Implementation."""
8
+
9
+ import os
10
+ from uuid import uuid4
11
+
12
+ from openenv.core.env_server.interfaces import Environment
13
+ from openenv.core.env_server.types import State
14
+
15
+ from devops_pipeline_env.models import (
16
+ ActionType,
17
+ PipelineAction,
18
+ PipelineObservation,
19
+ ServiceHealth,
20
+ ServiceStatus,
21
+ )
22
+ from server.pipeline_engine import PipelineEngine
23
+ from server.rewards import calculate_reward
24
+ from server.scenarios import load_scenario
25
+
26
+ # Deterministic seeds per task
27
+ TASK_SEEDS = {
28
+ "clean_deploy": 1001,
29
+ "broken_pipeline": 2002,
30
+ "judgment_call": 3003,
31
+ "cascading_failure": 4004,
32
+ "capacity_crisis": 5005,
33
+ "random_incident": 6006,
34
+ }
35
+
36
+ TASK_MAX_STEPS = {
37
+ "clean_deploy": 15,
38
+ "broken_pipeline": 20,
39
+ "judgment_call": 12,
40
+ "cascading_failure": 15,
41
+ "capacity_crisis": 15,
42
+ "random_incident": 15,
43
+ }
44
+
45
+ # Goal suffixes that hint at investigation without giving away answers
46
+ _INVESTIGATION_HINTS = {
47
+ "clean_deploy": " Use view_logs and view_config to inspect services before deploying.",
48
+ "broken_pipeline": " Investigate service logs and configs to diagnose issues before acting.",
49
+ "judgment_call": " Check service logs and configs to understand the incident before deciding.",
50
+ "capacity_crisis": " Inspect database-primary logs and config to find the bottleneck.",
51
+ "random_incident": " Investigate service logs and config to find the root cause.",
52
+ }
53
+
54
+
55
+ class PipelineEnvironment(Environment):
56
+ """CI/CD Pipeline environment β€” manages microservice deployments."""
57
+
58
+ SUPPORTS_CONCURRENT_SESSIONS: bool = False
59
+ _register_callback = None # Set by app.py to register active env for /grader
60
+
61
def __init__(self):
    """Create the environment with empty episode bookkeeping.

    No scenario is loaded here: ``_engine`` is ``None`` and the task
    defaults to ``"clean_deploy"`` with a 15-step budget.
    """
    # Task selection and step budget.
    self._task_name = "clean_deploy"
    self._max_steps = 15
    self._engine = None

    # Episode identity and step counter.
    self._state = State(episode_id=str(uuid4()), step_count=0)

    # Per-episode tracking.
    self._episode_history = []
    self._last_action_key = None
    self._viewed_actions = set()
    # Entries look like "logs:api-gateway" or "config:cache-service".
    self._investigated_services = set()
70
+
71
def reset(self, seed=None, episode_id=None, **kwargs) -> PipelineObservation:
    """Start a new episode and return its first observation.

    The task name is taken from ``kwargs["task"]``, then the ``DEVOPS_TASK``
    environment variable, then falls back to ``"clean_deploy"``.

    Args:
        seed: Optional RNG seed. It is honoured only for the
            ``random_incident`` task, where it takes precedence over the
            ``DEVOPS_SEED`` environment variable. Every other task always
            uses its fixed entry in ``TASK_SEEDS`` (9999 for unknown task
            names) so that those episodes stay reproducible.
        episode_id: Optional identifier for the new episode. A random UUID
            is generated when it is omitted or empty.
        **kwargs: ``task`` selects the scenario; other keys are ignored.

    Returns:
        The initial observation, with ``done=False`` and ``reward=0.0``.

    Raises:
        ValueError: If the ``random_incident`` seed (argument or
            ``DEVOPS_SEED``) cannot be converted to an integer.
    """
    self._task_name = (
        kwargs.get("task")
        or os.environ.get("DEVOPS_TASK")
        or "clean_deploy"
    )
    self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)
    self._episode_history = []
    self._viewed_actions = set()
    self._last_action_key = None
    self._investigated_services = set()
    if PipelineEnvironment._register_callback:
        PipelineEnvironment._register_callback(self)

    # Use a separate local so the caller's ``seed`` argument is not shadowed.
    task_seed = TASK_SEEDS.get(self._task_name, 9999)
    if self._task_name == "random_incident":
        override = seed if seed is not None else os.environ.get("DEVOPS_SEED")
        if override is not None:
            try:
                task_seed = int(override)
            except (TypeError, ValueError) as exc:
                raise ValueError(
                    f"random_incident seed must be an integer, got {override!r}"
                ) from exc

    scenario = load_scenario(self._task_name, task_seed)
    self._engine = PipelineEngine(scenario, task_seed)
    self._max_steps = TASK_MAX_STEPS.get(self._task_name, 15)

    return self._build_observation(
        last_action_result="Environment reset. Deployment pipeline initialized.",
        last_action_error=None,
        done=False,
        reward=0.0,
    )
99
+
100
def step(self, action: PipelineAction) -> PipelineObservation:
    """Apply one agent action and return the resulting observation.

    An action that fails validation is not sent to the engine: it is
    recorded in the episode history with a reward of ``-0.05`` and the
    episode ends on that path only when the step budget is exhausted.
    A valid action is executed, rewarded via ``calculate_reward`` and
    recorded together with the resulting system health.

    Args:
        action: The action chosen by the agent.

    Returns:
        The observation after the action, carrying ``reward`` and ``done``.

    Raises:
        RuntimeError: If called before ``reset`` has created the engine.
    """
    if self._engine is None:
        # Fail with a clear message, and before consuming a step, instead
        # of an AttributeError on ``None`` further down.
        raise RuntimeError("step() called before reset(); call reset() first")

    self._state.step_count += 1

    prev_state = self._engine.snapshot()

    # Validate action
    error = self._validate_action(action)
    if error:
        invalid_action_reward = -0.05
        self._episode_history.append({
            "step": self._state.step_count,
            "action": action.model_dump(),
            "reward": invalid_action_reward,
            "error": error,
        })
        return self._build_observation(
            last_action_result=None,
            last_action_error=error,
            done=self._state.step_count >= self._max_steps,
            reward=invalid_action_reward,
        )

    # Record the investigation BEFORE executing, so the observation built
    # at the end of this step already includes the revealed details.
    if action.service_name:
        if action.action_type == ActionType.VIEW_LOGS:
            self._investigated_services.add(f"logs:{action.service_name}")
        elif action.action_type == ActionType.VIEW_CONFIG:
            self._investigated_services.add(f"config:{action.service_name}")

    # Execute action
    result_text = self._engine.execute(action)
    current_state = self._engine.snapshot()

    # Outcome-based reward, computed from the before/after snapshots
    reward = calculate_reward(
        prev_state, current_state, action, self._viewed_actions,
        last_action_key=self._last_action_key, task_name=self._task_name,
    )
    self._last_action_key = f"{action.action_type.value}:{action.service_name or ''}"

    # Check episode termination
    done = self._check_done(action)

    # Did any service go from healthy to degraded/down this step? (for grader)
    prev_services = prev_state["services"]
    broke_healthy = any(
        prev_services.get(name, {}).get("health") == "healthy"
        and curr_svc["health"] in ("degraded", "down")
        for name, curr_svc in current_state["services"].items()
    )

    history_entry = {
        "step": self._state.step_count,
        "action": action.model_dump(),
        "reward": reward,
        "error": None,
        "broke_healthy": broke_healthy,
        "system_health": self._engine.get_system_health(),
    }

    # Record cache health at deploy time for grader integrity
    if action.action_type == ActionType.DEPLOY and action.service_name == "api-gateway":
        cache_svc = self._engine.services.get("cache-service")
        if cache_svc:
            history_entry["cache_health_at_deploy"] = cache_svc.health.value

    self._episode_history.append(history_entry)

    # Attach the service's config snapshot when the action viewed or edited it
    config_snapshot = None
    if action.action_type in (ActionType.VIEW_CONFIG, ActionType.EDIT_CONFIG):
        svc = self._engine.services.get(action.service_name)
        if svc:
            config_snapshot = svc.get_config_snapshot()

    return self._build_observation(
        last_action_result=result_text,
        last_action_error=None,
        done=done,
        reward=reward,
        config_snapshot=config_snapshot,
    )
183
+
184
@property
def state(self) -> State:
    """Return the current episode ``State`` object (the live one, not a copy)."""
    return self._state
187
+
188
def get_episode_history(self):
    """Return the list of per-step history records of the current episode.

    The live list is returned, not a copy.
    """
    return self._episode_history
190
+
191
def get_engine(self):
    """Return the engine stored on this environment (``None`` until one is created)."""
    return self._engine
193
+
194
def get_task_name(self):
    """Return the name of the task selected for the current episode."""
    return self._task_name
196
+
197
def _build_observation(self, last_action_result, last_action_error,
                       done, reward, config_snapshot=None):
    """Build the agent-facing observation from the current engine state.

    Partial observability: a service counts as *investigated* once the
    agent has run ``view_logs`` or ``view_config`` on it (either one unlocks
    everything). Until then its error rate, latency, CPU and memory are
    reported as ``0.0`` and any non-healthy status is shown as ``UNKNOWN``.
    Version, active connections, last deploy timestamp and recovery status
    are always shown.

    Args:
        last_action_result: Result text of the action just executed, if any.
        last_action_error: Validation error for the action, if any.
        done: Whether the episode has ended.
        reward: Reward for the step that produced this observation.
        config_snapshot: Optional config snapshot to attach.

    Returns:
        A ``PipelineObservation`` with the filtered service statuses, the
        goal (plus an investigation hint while nothing has been
        investigated yet) and a one-line alert summary.
    """
    engine = self._engine
    scenario = engine.scenario
    seen = self._investigated_services
    unhealthy = (ServiceHealth.DEGRADED, ServiceHealth.DOWN)

    def is_investigated(name):
        # Either kind of investigation unlocks the service's details.
        return f"logs:{name}" in seen or f"config:{name}" in seen

    def recovery_steps(svc_state):
        # The counter may be absent (or the state itself may be None);
        # treat both, and a None counter, as "not recovering".
        return getattr(svc_state, "_recovery_steps_remaining", 0) or 0

    # Build service statuses with partial observability
    filtered_statuses = []
    for svc in engine.get_service_statuses():
        investigated = is_investigated(svc.name)
        remaining = recovery_steps(engine.services.get(svc.name))
        recovery = (
            f"stabilizing ({remaining} steps remaining)" if remaining > 0 else "stable"
        )
        # Mask health for uninvestigated unhealthy services
        shown_health = svc.health
        if not investigated and svc.health.value != "healthy":
            shown_health = ServiceHealth.UNKNOWN
        filtered_statuses.append(ServiceStatus(
            name=svc.name,
            health=shown_health,
            current_version=svc.current_version,
            # Metrics are visible only after investigation
            error_rate=svc.error_rate if investigated else 0.0,
            request_latency_ms=svc.request_latency_ms if investigated else 0.0,
            active_connections=svc.active_connections,
            last_deploy_timestamp=svc.last_deploy_timestamp,
            cpu_percent=svc.cpu_percent if investigated else 0.0,
            memory_percent=svc.memory_percent if investigated else 0.0,
            recovery_status=recovery,
        ))

    # Append the investigation hint until the agent has investigated something
    goal = scenario.goal
    hint = _INVESTIGATION_HINTS.get(self._task_name, "")
    if hint and not seen:
        goal = goal + hint

    # Build summary — only reveal details for investigated services
    alerts = []
    uninvestigated_alerts = 0
    for name, svc_state in engine.services.items():
        investigated = is_investigated(name)
        if svc_state.health in unhealthy:
            if not investigated:
                uninvestigated_alerts += 1
            elif svc_state.health == ServiceHealth.DOWN:
                alerts.append(f"CRITICAL: {name} is DOWN")
            else:
                alerts.append(
                    f"WARNING: {name} degraded "
                    f"(lat={svc_state.latency_ms:.0f}ms, err={svc_state.error_rate:.1f}/s)"
                )
        elif investigated and svc_state.cpu_percent > 80:
            alerts.append(f"CAUTION: {name} CPU high ({svc_state.cpu_percent:.0f}%)")
        # Recovery status is reported per service, investigated or not
        remaining = recovery_steps(svc_state)
        if remaining > 0:
            alerts.append(
                f"INFO: {name} recovering — stabilizing ({remaining} steps remaining)"
            )
    if uninvestigated_alerts > 0:
        alerts.append(
            f"ALERT: {uninvestigated_alerts} service(s) may have issues — "
            f"use view_logs to investigate"
        )

    # Dependency-chain hints, for investigated unhealthy services only
    for name, svc_state in engine.services.items():
        if not (is_investigated(name) and svc_state.health in unhealthy):
            continue
        upstream_issues = [
            dep for dep in svc_state.dependencies
            if dep in engine.services and engine.services[dep].health in unhealthy
        ]
        if upstream_issues:
            alerts.append(
                f"HINT: {name} depends on {', '.join(upstream_issues)} "
                f"(also unhealthy — root cause likely upstream)"
            )
    summary = "; ".join(alerts) if alerts else "All services nominal."

    return PipelineObservation(
        task_description=scenario.task_description,
        goal=goal,
        step_number=self._state.step_count,
        max_steps=self._max_steps,
        services=filtered_statuses,
        pipeline=engine.get_pipeline_status(),
        migrations=engine.get_migration_status(),
        active_alerts=engine.get_alerts(),
        available_actions=self._get_available_actions(),
        last_action_result=last_action_result,
        last_action_error=last_action_error,
        config_snapshot=config_snapshot,
        done=done,
        reward=reward,
        summary=summary,
    )
309
+
310
+ def _get_available_actions(self):
311
+ """Context-sensitive: only show valid actions."""
312
+ actions = ["view_pipeline", "view_logs", "approve", "abort"]
313
+ if self._engine.has_services():
314
+ actions.extend(["view_config", "edit_config", "deploy", "rollback"])
315
+ if self._engine.has_pending_migrations():
316
+ actions.append("run_migration")
317
+ return actions
318
+
319
def _validate_action(self, action):
    """Check that *action* carries the fields its action type needs.

    Returns:
        A human-readable error message, or ``None`` when the action is valid.
    """
    kind = action.action_type

    # These action types operate on one named, existing service.
    service_scoped = (
        ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG,
        ActionType.EDIT_CONFIG, ActionType.DEPLOY,
        ActionType.ROLLBACK,
    )
    if kind in service_scoped:
        target = action.service_name
        if not target:
            return f"action_type '{kind.value}' requires service_name"
        if target not in self._engine.get_service_names():
            return (
                f"Unknown service '{target}'. "
                f"Available: {self._engine.get_service_names()}"
            )

    # (action type, attribute that must be set, message when it is missing)
    required_fields = (
        (ActionType.DEPLOY, "target_version", "deploy requires target_version"),
        (ActionType.EDIT_CONFIG, "config_edits", "edit_config requires config_edits"),
        (ActionType.RUN_MIGRATION, "migration_name", "run_migration requires migration_name"),
    )
    for required_kind, field_name, message in required_fields:
        if kind == required_kind and not getattr(action, field_name):
            return message
    return None
340
+
341
def _check_done(self, action):
    """Decide whether the episode is over after *action*.

    The episode ends when the agent approves or aborts, when the step
    budget is used up, or when system health drops below 20.0. System
    health is queried only if none of the earlier conditions holds.
    """
    agent_ended = action.action_type in (ActionType.APPROVE, ActionType.ABORT)
    return (
        agent_ended
        or self._state.step_count >= self._max_steps
        or self._engine.get_system_health() < 20.0
    )
server/rewards.py CHANGED
@@ -1,104 +1,104 @@
1
- # Copyright (c) Meta Platforms, Inc. and affiliates.
2
- # All rights reserved.
3
- #
4
- # This source code is licensed under the BSD-style license found in the
5
- # LICENSE file in the root directory of this source tree.
6
-
7
- """Outcome-based reward calculator for the DevOps Pipeline Environment."""
8
-
9
- from devops_pipeline_env.models import ActionType
10
-
11
-
12
- # Task urgency multipliers — harder tasks get steeper reward gradients
13
- TASK_URGENCY = {
14
- "clean_deploy": 1.0,
15
- "broken_pipeline": 1.2,
16
- "judgment_call": 1.5,
17
- "cascading_failure": 1.3,
18
- "capacity_crisis": 1.4,
19
- "random_incident": 1.3,
20
- }
21
-
22
-
23
- def calculate_reward(prev_snapshot, current_snapshot, action, viewed_actions,
24
- last_action_key=None, task_name="clean_deploy"):
25
- """
26
- Outcome-based reward. No procedure bonuses. No early returns.
27
- ALL actions go through the full reward pipeline.
28
- Returns a float bounded to [-0.35, +0.30].
29
- """
30
- reward = 0.0
31
-
32
- # 1. System health delta (ALL actions)
33
- health_delta = current_snapshot["system_health"] - prev_snapshot["system_health"]
34
- reward += health_delta * 0.005
35
-
36
- # 2. Deployment progress (ALL actions — recovery ticks can change state)
37
- for name, curr_svc in current_snapshot["services"].items():
38
- prev_svc = prev_snapshot["services"].get(name, {})
39
- if curr_svc["prod_deployed"] and not prev_svc.get("prod_deployed"):
40
- reward += 0.15
41
- if curr_svc["staging_verified"] and not prev_svc.get("staging_verified"):
42
- reward += 0.05
43
-
44
- # 3. Broke healthy service penalty (ALL actions)
45
- for name, curr_svc in current_snapshot["services"].items():
46
- prev_svc = prev_snapshot["services"].get(name, {})
47
- if prev_svc.get("health") == "healthy" and curr_svc["health"] in ("degraded", "down"):
48
- reward -= 0.30
49
-
50
- # 4. Sub-goal completion rewards (intermediate milestones for RL)
51
- # Config change detected — check if it fixed an error
52
- for name, curr_svc in current_snapshot["services"].items():
53
- prev_svc = prev_snapshot["services"].get(name, {})
54
- prev_config = prev_svc.get("config", {})
55
- curr_config = curr_svc.get("config", {})
56
- if prev_config and curr_config and prev_config != curr_config:
57
- # Config changed — reward if health improved on this service
58
- if prev_svc.get("health") in ("degraded", "down") and curr_svc["health"] == "healthy":
59
- reward += 0.08
60
-
61
- # Migration completed
62
- prev_pending = len(prev_snapshot.get("migrations_pending", []))
63
- curr_pending = len(current_snapshot.get("migrations_pending", []))
64
- if curr_pending < prev_pending:
65
- reward += 0.06
66
-
67
- # Alert resolved
68
- prev_alerts = len(prev_snapshot.get("alerts", []))
69
- curr_alerts = len(current_snapshot.get("alerts", []))
70
- if curr_alerts < prev_alerts:
71
- reward += 0.03
72
-
73
- # 5. Investigation bonus with diminishing returns (view_* actions only)
74
- if action.action_type in (ActionType.VIEW_PIPELINE, ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG):
75
- action_key = f"{action.action_type.value}:{action.service_name or 'global'}"
76
- if action_key not in viewed_actions:
77
- viewed_actions.add(action_key)
78
- investigation_count = len(viewed_actions)
79
- decay_factor = 1.0 / (1 + (investigation_count - 1) * 0.3)
80
- if action.service_name:
81
- svc_data = current_snapshot["services"].get(action.service_name, {})
82
- if svc_data.get("health") in ("degraded", "down"):
83
- reward += 0.04 * decay_factor
84
- else:
85
- reward += 0.01 * decay_factor
86
- else:
87
- reward += 0.02 * decay_factor
88
- else:
89
- # Stronger penalty for consecutive repeat of same view action
90
- current_action_key = f"{action.action_type.value}:{action.service_name or 'global'}"
91
- if last_action_key and current_action_key == last_action_key:
92
- reward -= 0.03 # Consecutive spam = harsh penalty
93
- else:
94
- reward -= 0.01 # Non-consecutive repeat = mild penalty
95
-
96
- # 6. Repeated exact action penalty (non-view actions)
97
- if action.action_type not in (ActionType.VIEW_PIPELINE, ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG):
98
- current_action_key = f"{action.action_type.value}:{action.service_name or ''}"
99
- if last_action_key and current_action_key == last_action_key:
100
- reward -= 0.02
101
-
102
- # 7. Apply task urgency scaling and bound
103
- reward *= TASK_URGENCY.get(task_name, 1.0)
104
- return max(min(reward, 0.30), -0.35)
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the BSD-style license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ """Outcome-based reward calculator for the DevOps Pipeline Environment."""
8
+
9
+ from devops_pipeline_env.models import ActionType
10
+
11
+
12
# Task urgency multipliers — harder tasks get steeper reward gradients.
# calculate_reward() multiplies the raw per-step reward by the entry for the
# active task before clamping; task names missing from this table scale by 1.0.
TASK_URGENCY = {
    "clean_deploy": 1.0,
    "broken_pipeline": 1.2,
    "judgment_call": 1.5,
    "cascading_failure": 1.3,
    "capacity_crisis": 1.4,
    "random_incident": 1.3,
}
21
+
22
+
23
def calculate_reward(prev_snapshot, current_snapshot, action, viewed_actions,
                     last_action_key=None, task_name="clean_deploy"):
    """Score one environment step from the before/after state snapshots.

    Every action runs through the same pipeline with no early exit:
    health delta, deployment progress, broken-service penalty, sub-goal
    milestones, then investigation/repeat adjustments. The total is scaled
    by the task's urgency multiplier and clamped.

    Args:
        prev_snapshot: State before the action. Reads ``"system_health"``,
            ``"services"`` and, optionally, ``"migrations_pending"`` and
            ``"alerts"``.
        current_snapshot: State after the action, same layout. Each service
            entry must provide ``"prod_deployed"``, ``"staging_verified"``
            and ``"health"``; ``"config"`` is optional.
        action: The action taken; ``action_type`` and ``service_name`` are
            read.
        viewed_actions: Set of view-action keys already rewarded. A new key
            is added in place the first time a view action is seen.
        last_action_key: Key of the previous action, used to detect an
            immediate repeat. ``None`` disables repeat detection.
        task_name: Task identifier used to look up ``TASK_URGENCY``;
            unknown names scale by 1.0.

    Returns:
        float: Reward clamped to the range [-0.35, +0.30].
    """
    view_types = (
        ActionType.VIEW_PIPELINE,
        ActionType.VIEW_LOGS,
        ActionType.VIEW_CONFIG,
    )
    unhealthy = ("degraded", "down")

    reward = 0.0

    # 1. System health delta (applies to every action).
    reward += (
        current_snapshot["system_health"] - prev_snapshot["system_health"]
    ) * 0.005

    after_services = current_snapshot["services"]

    # 2. Deployment progress (every action — recovery ticks can change state).
    for svc_name, after in after_services.items():
        before = prev_snapshot["services"].get(svc_name, {})
        if after["prod_deployed"] and not before.get("prod_deployed"):
            reward += 0.15
        if after["staging_verified"] and not before.get("staging_verified"):
            reward += 0.05

    # 3. Penalty for turning a healthy service degraded or down.
    for svc_name, after in after_services.items():
        before = prev_snapshot["services"].get(svc_name, {})
        if before.get("health") == "healthy" and after["health"] in unhealthy:
            reward -= 0.30

    # 4. Sub-goal milestones.
    # 4a. A config change that brought an unhealthy service back to healthy.
    for svc_name, after in after_services.items():
        before = prev_snapshot["services"].get(svc_name, {})
        old_cfg = before.get("config", {})
        new_cfg = after.get("config", {})
        if not old_cfg or not new_cfg or old_cfg == new_cfg:
            continue
        if before.get("health") in unhealthy and after["health"] == "healthy":
            reward += 0.08

    # 4b. Fewer pending migrations than before.
    migrations_before = len(prev_snapshot.get("migrations_pending", []))
    migrations_after = len(current_snapshot.get("migrations_pending", []))
    if migrations_after < migrations_before:
        reward += 0.06

    # 4c. Fewer active alerts than before.
    alerts_before = len(prev_snapshot.get("alerts", []))
    alerts_after = len(current_snapshot.get("alerts", []))
    if alerts_after < alerts_before:
        reward += 0.03

    action_type = action.action_type
    if action_type in view_types:
        # 5. Investigation bonus with diminishing returns (view_* only).
        view_key = f"{action_type.value}:{action.service_name or 'global'}"
        if view_key in viewed_actions:
            # Re-viewing: an immediate repeat costs more than a later one.
            if last_action_key and view_key == last_action_key:
                reward -= 0.03
            else:
                reward -= 0.01
        else:
            viewed_actions.add(view_key)
            # Each additional distinct view is worth less than the last.
            decay = 1.0 / (1 + (len(viewed_actions) - 1) * 0.3)
            if not action.service_name:
                bonus = 0.02
            elif (current_snapshot["services"]
                  .get(action.service_name, {})
                  .get("health") in unhealthy):
                bonus = 0.04
            else:
                bonus = 0.01
            reward += bonus * decay
    else:
        # 6. Penalty for repeating the exact same non-view action.
        repeat_key = f"{action_type.value}:{action.service_name or ''}"
        if last_action_key and repeat_key == last_action_key:
            reward -= 0.02

    # 7. Scale by task urgency, then clamp.
    reward *= TASK_URGENCY.get(task_name, 1.0)
    return max(min(reward, 0.30), -0.35)
server/scenarios.py CHANGED
The diff for this file is too large to render. See raw diff
 
uv.lock CHANGED
The diff for this file is too large to render. See raw diff