Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitattributes +1 -35
- .gitignore +5 -5
- __init__.py +21 -21
- client.py +47 -47
- inference.py +298 -298
- integration_test.py +424 -424
- models.py +179 -179
- openenv.yaml +8 -8
- pyproject.toml +33 -33
- requirements.txt +4 -4
- server/__init__.py +7 -7
- server/app.py +128 -128
- server/graders.py +389 -389
- server/pipeline_engine.py +744 -744
- server/pipeline_environment.py +351 -351
- server/rewards.py +104 -104
- server/scenarios.py +0 -0
- uv.lock +0 -0
.gitattributes
CHANGED
|
@@ -1,35 +1 @@
|
|
| 1 |
-
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
README.md text eol=lf
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
.env
|
| 2 |
-
__pycache__/
|
| 3 |
-
*.pyc
|
| 4 |
-
.venv/
|
| 5 |
-
*.egg-info/
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.pyc
|
| 4 |
+
.venv/
|
| 5 |
+
*.egg-info/
|
__init__.py
CHANGED
|
@@ -1,21 +1,21 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""DevOps Pipeline Environment."""
|
| 8 |
-
|
| 9 |
-
from devops_pipeline_env.client import DevopsPipelineEnv
|
| 10 |
-
from devops_pipeline_env.models import (
|
| 11 |
-
ConfigEdit,
|
| 12 |
-
PipelineAction,
|
| 13 |
-
PipelineObservation,
|
| 14 |
-
)
|
| 15 |
-
|
| 16 |
-
__all__ = [
|
| 17 |
-
"PipelineAction",
|
| 18 |
-
"PipelineObservation",
|
| 19 |
-
"ConfigEdit",
|
| 20 |
-
"DevopsPipelineEnv",
|
| 21 |
-
]
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""DevOps Pipeline Environment."""
|
| 8 |
+
|
| 9 |
+
from devops_pipeline_env.client import DevopsPipelineEnv
|
| 10 |
+
from devops_pipeline_env.models import (
|
| 11 |
+
ConfigEdit,
|
| 12 |
+
PipelineAction,
|
| 13 |
+
PipelineObservation,
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
__all__ = [
|
| 17 |
+
"PipelineAction",
|
| 18 |
+
"PipelineObservation",
|
| 19 |
+
"ConfigEdit",
|
| 20 |
+
"DevopsPipelineEnv",
|
| 21 |
+
]
|
client.py
CHANGED
|
@@ -1,47 +1,47 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""DevOps Pipeline Environment Client."""
|
| 8 |
-
|
| 9 |
-
from typing import Dict
|
| 10 |
-
|
| 11 |
-
from openenv.core import EnvClient
|
| 12 |
-
from openenv.core.client_types import StepResult
|
| 13 |
-
from openenv.core.env_server.types import State
|
| 14 |
-
|
| 15 |
-
from devops_pipeline_env.models import PipelineAction, PipelineObservation
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
class DevopsPipelineEnv(
|
| 19 |
-
EnvClient[PipelineAction, PipelineObservation, State]
|
| 20 |
-
):
|
| 21 |
-
"""
|
| 22 |
-
Client for the DevOps Pipeline Environment.
|
| 23 |
-
|
| 24 |
-
Example:
|
| 25 |
-
>>> with DevopsPipelineEnv(base_url="http://localhost:8000") as client:
|
| 26 |
-
... result = client.reset()
|
| 27 |
-
... result = client.step(PipelineAction(action_type="view_pipeline"))
|
| 28 |
-
"""
|
| 29 |
-
|
| 30 |
-
def _step_payload(self, action: PipelineAction) -> Dict:
|
| 31 |
-
return action.model_dump(exclude_none=True)
|
| 32 |
-
|
| 33 |
-
def _parse_result(self, payload: Dict) -> StepResult[PipelineObservation]:
|
| 34 |
-
obs_data = payload.get("observation", {})
|
| 35 |
-
observation = PipelineObservation(**obs_data)
|
| 36 |
-
|
| 37 |
-
return StepResult(
|
| 38 |
-
observation=observation,
|
| 39 |
-
reward=payload.get("reward"),
|
| 40 |
-
done=payload.get("done", False),
|
| 41 |
-
)
|
| 42 |
-
|
| 43 |
-
def _parse_state(self, payload: Dict) -> State:
|
| 44 |
-
return State(
|
| 45 |
-
episode_id=payload.get("episode_id"),
|
| 46 |
-
step_count=payload.get("step_count", 0),
|
| 47 |
-
)
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""DevOps Pipeline Environment Client."""
|
| 8 |
+
|
| 9 |
+
from typing import Dict
|
| 10 |
+
|
| 11 |
+
from openenv.core import EnvClient
|
| 12 |
+
from openenv.core.client_types import StepResult
|
| 13 |
+
from openenv.core.env_server.types import State
|
| 14 |
+
|
| 15 |
+
from devops_pipeline_env.models import PipelineAction, PipelineObservation
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class DevopsPipelineEnv(
|
| 19 |
+
EnvClient[PipelineAction, PipelineObservation, State]
|
| 20 |
+
):
|
| 21 |
+
"""
|
| 22 |
+
Client for the DevOps Pipeline Environment.
|
| 23 |
+
|
| 24 |
+
Example:
|
| 25 |
+
>>> with DevopsPipelineEnv(base_url="http://localhost:8000") as client:
|
| 26 |
+
... result = client.reset()
|
| 27 |
+
... result = client.step(PipelineAction(action_type="view_pipeline"))
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def _step_payload(self, action: PipelineAction) -> Dict:
|
| 31 |
+
return action.model_dump(exclude_none=True)
|
| 32 |
+
|
| 33 |
+
def _parse_result(self, payload: Dict) -> StepResult[PipelineObservation]:
|
| 34 |
+
obs_data = payload.get("observation", {})
|
| 35 |
+
observation = PipelineObservation(**obs_data)
|
| 36 |
+
|
| 37 |
+
return StepResult(
|
| 38 |
+
observation=observation,
|
| 39 |
+
reward=payload.get("reward"),
|
| 40 |
+
done=payload.get("done", False),
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
def _parse_state(self, payload: Dict) -> State:
|
| 44 |
+
return State(
|
| 45 |
+
episode_id=payload.get("episode_id"),
|
| 46 |
+
step_count=payload.get("step_count", 0),
|
| 47 |
+
)
|
inference.py
CHANGED
|
@@ -1,298 +1,298 @@
|
|
| 1 |
-
"""Inference script for the DevOps Pipeline Environment."""
|
| 2 |
-
|
| 3 |
-
import asyncio
|
| 4 |
-
import json
|
| 5 |
-
import os
|
| 6 |
-
import textwrap
|
| 7 |
-
from typing import List, Optional
|
| 8 |
-
|
| 9 |
-
from openai import OpenAI
|
| 10 |
-
|
| 11 |
-
from devops_pipeline_env import DevopsPipelineEnv, PipelineAction
|
| 12 |
-
from devops_pipeline_env.models import ActionType
|
| 13 |
-
|
| 14 |
-
# --- Env Vars (EXACT hackathon requirements) ----------------------------------
|
| 15 |
-
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 16 |
-
if not API_KEY:
|
| 17 |
-
raise ValueError("HF_TOKEN or API_KEY environment variable is required")
|
| 18 |
-
|
| 19 |
-
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 20 |
-
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 21 |
-
IMAGE_NAME = os.getenv("IMAGE_NAME")
|
| 22 |
-
|
| 23 |
-
BENCHMARK = "devops_pipeline_env"
|
| 24 |
-
TASKS = ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis", "random_incident"]
|
| 25 |
-
MAX_STEPS_PER_TASK = {"clean_deploy": 15, "broken_pipeline": 20, "judgment_call": 12, "cascading_failure": 15, "capacity_crisis": 15, "random_incident": 15}
|
| 26 |
-
MAX_TOTAL_REWARD = {"clean_deploy": 0.70, "broken_pipeline": 0.85, "judgment_call": 0.65, "cascading_failure": 0.80, "capacity_crisis": 0.75, "random_incident": 0.70}
|
| 27 |
-
TEMPERATURE = 0.7
|
| 28 |
-
MAX_TOKENS = 300
|
| 29 |
-
SUCCESS_SCORE_THRESHOLD = 0.1
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
# --- Log Functions (EXACT hackathon format) -----------------------------------
|
| 33 |
-
def log_start(task: str, env: str, model: str) -> None:
|
| 34 |
-
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 38 |
-
error_val = error if error else "null"
|
| 39 |
-
done_val = str(done).lower()
|
| 40 |
-
print(
|
| 41 |
-
f"[STEP] step={step} action={action} reward={reward:.2f} "
|
| 42 |
-
f"done={done_val} error={error_val}",
|
| 43 |
-
flush=True,
|
| 44 |
-
)
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 48 |
-
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 49 |
-
print(
|
| 50 |
-
f"[END] success={str(success).lower()} steps={steps} "
|
| 51 |
-
f"score={score:.3f} rewards={rewards_str}",
|
| 52 |
-
flush=True,
|
| 53 |
-
)
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
# --- System Prompt ------------------------------------------------------------
|
| 57 |
-
SYSTEM_PROMPT = textwrap.dedent("""
|
| 58 |
-
You are a DevOps engineer managing a CI/CD deployment pipeline with these services:
|
| 59 |
-
|
| 60 |
-
database-primary: PostgreSQL root database. All services depend on it for data.
|
| 61 |
-
auth-service: OAuth/JWT token provider. All services validate tokens through it. Depends on database-primary.
|
| 62 |
-
api-gateway: Request router and load balancer. Depends on database-primary and auth-service.
|
| 63 |
-
cache-service: Redis cache layer. Depends on database-primary.
|
| 64 |
-
web-frontend: User-facing application. Depends on api-gateway and auth-service.
|
| 65 |
-
|
| 66 |
-
Dependency chain: database-primary → auth-service → api-gateway → web-frontend
|
| 67 |
-
database-primary → cache-service
|
| 68 |
-
|
| 69 |
-
STRATEGY:
|
| 70 |
-
- Read the summary field first — it tells you what's wrong at a glance.
|
| 71 |
-
- Investigate degraded/down services with view_logs before acting.
|
| 72 |
-
- Fix ROOT CAUSE services BEFORE downstream services.
|
| 73 |
-
- Actions have side effects: deploys spike CPU, rollbacks risk regression, config changes cause restart latency.
|
| 74 |
-
- In capacity scenarios, act proactively — don't wait for failures.
|
| 75 |
-
|
| 76 |
-
TASK-SPECIFIC GUIDANCE:
|
| 77 |
-
- clean_deploy: Deploy api-gateway then web-frontend. No complications expected.
|
| 78 |
-
- broken_pipeline: Check cache-service logs/config first — Redis host is usually wrong. Run the pending migration before deploying api-gateway.
|
| 79 |
-
- judgment_call: INCIDENT — check api-gateway logs first. Three options: (1) BEST: deploy hotfix v2.3.2 to api-gateway THEN edit web-frontend config api.auth_version to "v2", (2) SAFE: rollback api-gateway, (3) RISKY: deploy hotfix without fixing auth. Option 1 scores highest.
|
| 80 |
-
- cascading_failure: Find ROOT CAUSE — check cache-service first, it's usually the source. Fix its config (max_connections too low), deploy it, then recover downstream services.
|
| 81 |
-
- capacity_crisis: Check database-primary IMMEDIATELY — connection pool nearly full. Increase max_connections to 100+. Act FAST before tipping points cascade.
|
| 82 |
-
- random_incident: Procedurally generated. Read the task description carefully — it tells you which service is failing and what type of failure. Investigate that service first.
|
| 83 |
-
|
| 84 |
-
You must respond with a SINGLE valid JSON object matching the PipelineAction schema.
|
| 85 |
-
|
| 86 |
-
Example responses:
|
| 87 |
-
{"action_type": "view_pipeline"}
|
| 88 |
-
{"action_type": "view_logs", "service_name": "api-gateway"}
|
| 89 |
-
{"action_type": "deploy", "service_name": "api-gateway", "target_version": "v2.3.1"}
|
| 90 |
-
{"action_type": "edit_config", "service_name": "cache-service", "config_edits": [{"key": "redis.host", "value": "redis-prod.internal:6379"}]}
|
| 91 |
-
{"action_type": "rollback", "service_name": "api-gateway", "reason": "Hotfix unstable"}
|
| 92 |
-
{"action_type": "approve", "reason": "All services deployed and healthy"}
|
| 93 |
-
|
| 94 |
-
Respond with ONLY the JSON object. No explanation, no markdown.
|
| 95 |
-
""").strip()
|
| 96 |
-
|
| 97 |
-
RETRY_PROMPT = 'Respond with ONLY a JSON action. Example: {"action_type": "view_pipeline"}'
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
def summarize_observation(obs_dict):
|
| 101 |
-
"""Compress observation so LLM can actually parse it."""
|
| 102 |
-
summary = obs_dict.get("summary", "")
|
| 103 |
-
task = obs_dict.get("task_description", "")
|
| 104 |
-
goal = obs_dict.get("goal", "")
|
| 105 |
-
last_result = obs_dict.get("last_action_result", "")
|
| 106 |
-
last_error = obs_dict.get("last_action_error", "")
|
| 107 |
-
step = obs_dict.get("step_number", 0)
|
| 108 |
-
max_steps = obs_dict.get("max_steps", 15)
|
| 109 |
-
|
| 110 |
-
services_compact = []
|
| 111 |
-
for svc in obs_dict.get("services", []):
|
| 112 |
-
name = svc.get("name", "?")
|
| 113 |
-
health = svc.get("health", "?")
|
| 114 |
-
err = svc.get("error_rate", 0)
|
| 115 |
-
lat = svc.get("request_latency_ms", 0)
|
| 116 |
-
cpu = svc.get("cpu_percent", 0)
|
| 117 |
-
line = f"{name}: {health}"
|
| 118 |
-
if health != "healthy":
|
| 119 |
-
line += f" (err={err:.1f}/s, lat={lat:.0f}ms)"
|
| 120 |
-
if cpu > 70:
|
| 121 |
-
line += f" [CPU={cpu:.0f}%]"
|
| 122 |
-
services_compact.append(line)
|
| 123 |
-
|
| 124 |
-
alerts = [
|
| 125 |
-
f"[{a.get('severity','')}] {a.get('message','')}"
|
| 126 |
-
for a in obs_dict.get("active_alerts", [])[:3]
|
| 127 |
-
]
|
| 128 |
-
available = obs_dict.get("available_actions", [])
|
| 129 |
-
config = obs_dict.get("config_snapshot", {})
|
| 130 |
-
|
| 131 |
-
parts = []
|
| 132 |
-
if step == 0:
|
| 133 |
-
parts.append(f"TASK: {task}")
|
| 134 |
-
parts.append(f"GOAL: {goal}")
|
| 135 |
-
parts.append(f"Step {step}/{max_steps}")
|
| 136 |
-
if summary:
|
| 137 |
-
parts.append(f"Status: {summary}")
|
| 138 |
-
parts.append(f"Services: {'; '.join(services_compact)}")
|
| 139 |
-
if alerts:
|
| 140 |
-
parts.append(f"Alerts: {'; '.join(alerts)}")
|
| 141 |
-
if config:
|
| 142 |
-
parts.append(f"Config: {config}")
|
| 143 |
-
if last_result:
|
| 144 |
-
parts.append(f"Last result: {last_result[:300]}")
|
| 145 |
-
if last_error:
|
| 146 |
-
parts.append(f"Error: {last_error[:200]}")
|
| 147 |
-
parts.append(f"Available actions: {', '.join(available)}")
|
| 148 |
-
|
| 149 |
-
return "\n".join(p for p in parts if p)
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
def build_user_message(obs, investigated):
|
| 153 |
-
"""Build user message with compact observation for LLM."""
|
| 154 |
-
obs_dict = obs.model_dump(mode="json")
|
| 155 |
-
compact = summarize_observation(obs_dict)
|
| 156 |
-
|
| 157 |
-
inv_block = ""
|
| 158 |
-
if investigated:
|
| 159 |
-
inv_block = "\n\nINVESTIGATED: " + ", ".join(sorted(investigated))
|
| 160 |
-
|
| 161 |
-
return f"CURRENT STATE:\n{compact}{inv_block}\n\nWhat is your next action?"
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
def build_messages(system_prompt, conversation, current_user_msg):
|
| 165 |
-
"""Build multi-turn messages list with system prompt + last 6 turns + current."""
|
| 166 |
-
messages = [{"role": "system", "content": system_prompt}]
|
| 167 |
-
# Keep last 6 turns (12 messages = 6 user + 6 assistant)
|
| 168 |
-
recent = conversation[-(6 * 2):]
|
| 169 |
-
messages.extend(recent)
|
| 170 |
-
messages.append({"role": "user", "content": current_user_msg})
|
| 171 |
-
return messages
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
def parse_llm_action(text):
|
| 175 |
-
"""Parse LLM response into PipelineAction. Fallback to view_pipeline on failure."""
|
| 176 |
-
try:
|
| 177 |
-
text = text.strip()
|
| 178 |
-
if text.startswith("```"):
|
| 179 |
-
text = text.split("```")[1]
|
| 180 |
-
if text.startswith("json"):
|
| 181 |
-
text = text[4:]
|
| 182 |
-
data = json.loads(text)
|
| 183 |
-
return PipelineAction(**data)
|
| 184 |
-
except Exception:
|
| 185 |
-
return PipelineAction(action_type=ActionType.VIEW_PIPELINE)
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
async def run_task(client, env, task_name):
|
| 189 |
-
rewards = []
|
| 190 |
-
steps_taken = 0
|
| 191 |
-
score = 0.0
|
| 192 |
-
success = False
|
| 193 |
-
max_steps = MAX_STEPS_PER_TASK.get(task_name, 20)
|
| 194 |
-
max_reward = MAX_TOTAL_REWARD.get(task_name, 1.0)
|
| 195 |
-
conversation = [] # Multi-turn: list of {"role": ..., "content": ...}
|
| 196 |
-
investigated = set()
|
| 197 |
-
|
| 198 |
-
log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)
|
| 199 |
-
|
| 200 |
-
try:
|
| 201 |
-
os.environ["DEVOPS_TASK"] = task_name
|
| 202 |
-
result = await env.reset(task=task_name)
|
| 203 |
-
obs = result.observation
|
| 204 |
-
|
| 205 |
-
for step in range(1, max_steps + 1):
|
| 206 |
-
if result.done:
|
| 207 |
-
break
|
| 208 |
-
|
| 209 |
-
user_msg = build_user_message(obs, investigated)
|
| 210 |
-
messages = build_messages(SYSTEM_PROMPT, conversation, user_msg)
|
| 211 |
-
try:
|
| 212 |
-
completion = client.chat.completions.create(
|
| 213 |
-
model=MODEL_NAME,
|
| 214 |
-
messages=messages,
|
| 215 |
-
temperature=TEMPERATURE,
|
| 216 |
-
max_tokens=MAX_TOKENS,
|
| 217 |
-
stream=False,
|
| 218 |
-
)
|
| 219 |
-
action_text = (completion.choices[0].message.content or "").strip()
|
| 220 |
-
action = parse_llm_action(action_text)
|
| 221 |
-
|
| 222 |
-
# Retry once if parse fell back to default
|
| 223 |
-
if action.action_type == ActionType.VIEW_PIPELINE and "view_pipeline" not in action_text.lower():
|
| 224 |
-
retry_msgs = build_messages(RETRY_PROMPT, conversation, user_msg)
|
| 225 |
-
retry_completion = client.chat.completions.create(
|
| 226 |
-
model=MODEL_NAME,
|
| 227 |
-
messages=retry_msgs,
|
| 228 |
-
temperature=0.3,
|
| 229 |
-
max_tokens=150,
|
| 230 |
-
stream=False,
|
| 231 |
-
)
|
| 232 |
-
retry_text = (retry_completion.choices[0].message.content or "").strip()
|
| 233 |
-
retry_action = parse_llm_action(retry_text)
|
| 234 |
-
if retry_action.action_type != ActionType.VIEW_PIPELINE or "view_pipeline" in retry_text.lower():
|
| 235 |
-
action = retry_action
|
| 236 |
-
action_text = retry_text
|
| 237 |
-
except Exception as e:
|
| 238 |
-
print(f"[DEBUG] LLM call failed: {e}", flush=True)
|
| 239 |
-
action = PipelineAction(action_type=ActionType.VIEW_PIPELINE)
|
| 240 |
-
action_text = '{"action_type": "view_pipeline"}'
|
| 241 |
-
|
| 242 |
-
# Track investigated services
|
| 243 |
-
if action.action_type in (ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG) and action.service_name:
|
| 244 |
-
investigated.add(f"{action.action_type.value}:{action.service_name}")
|
| 245 |
-
|
| 246 |
-
# Append this turn to conversation history
|
| 247 |
-
conversation.append({"role": "user", "content": user_msg})
|
| 248 |
-
conversation.append({"role": "assistant", "content": action_text})
|
| 249 |
-
|
| 250 |
-
result = await env.step(action)
|
| 251 |
-
obs = result.observation
|
| 252 |
-
|
| 253 |
-
reward = result.reward or 0.0
|
| 254 |
-
done = result.done
|
| 255 |
-
error = obs.last_action_error
|
| 256 |
-
|
| 257 |
-
rewards.append(reward)
|
| 258 |
-
steps_taken = step
|
| 259 |
-
|
| 260 |
-
action_str = json.dumps(action.model_dump(exclude_none=True), default=str)
|
| 261 |
-
log_step(step=step, action=action_str, reward=reward, done=done, error=error)
|
| 262 |
-
|
| 263 |
-
if done:
|
| 264 |
-
break
|
| 265 |
-
|
| 266 |
-
score = sum(rewards) / max_reward if max_reward > 0 else 0.0
|
| 267 |
-
score = min(max(score, 0.0), 1.0)
|
| 268 |
-
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 269 |
-
|
| 270 |
-
except Exception as e:
|
| 271 |
-
print(f"[DEBUG] Task {task_name} error: {e}", flush=True)
|
| 272 |
-
|
| 273 |
-
finally:
|
| 274 |
-
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
async def main():
|
| 278 |
-
client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
|
| 279 |
-
|
| 280 |
-
if IMAGE_NAME:
|
| 281 |
-
env = await DevopsPipelineEnv.from_docker_image(IMAGE_NAME)
|
| 282 |
-
else:
|
| 283 |
-
env = DevopsPipelineEnv(
|
| 284 |
-
base_url=os.getenv("ENV_BASE_URL", "http://localhost:8000")
|
| 285 |
-
)
|
| 286 |
-
|
| 287 |
-
try:
|
| 288 |
-
for task in TASKS:
|
| 289 |
-
await run_task(client, env, task)
|
| 290 |
-
finally:
|
| 291 |
-
try:
|
| 292 |
-
await env.close()
|
| 293 |
-
except Exception as e:
|
| 294 |
-
print(f"[DEBUG] env.close() error: {e}", flush=True)
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
if __name__ == "__main__":
|
| 298 |
-
asyncio.run(main())
|
|
|
|
| 1 |
+
"""Inference script for the DevOps Pipeline Environment."""
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
import textwrap
|
| 7 |
+
from typing import List, Optional
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
from devops_pipeline_env import DevopsPipelineEnv, PipelineAction
|
| 12 |
+
from devops_pipeline_env.models import ActionType
|
| 13 |
+
|
| 14 |
+
# --- Env Vars (EXACT hackathon requirements) ----------------------------------
|
| 15 |
+
API_KEY = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
|
| 16 |
+
if not API_KEY:
|
| 17 |
+
raise ValueError("HF_TOKEN or API_KEY environment variable is required")
|
| 18 |
+
|
| 19 |
+
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
|
| 20 |
+
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
|
| 21 |
+
IMAGE_NAME = os.getenv("IMAGE_NAME")
|
| 22 |
+
|
| 23 |
+
BENCHMARK = "devops_pipeline_env"
|
| 24 |
+
TASKS = ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis", "random_incident"]
|
| 25 |
+
MAX_STEPS_PER_TASK = {"clean_deploy": 15, "broken_pipeline": 20, "judgment_call": 12, "cascading_failure": 15, "capacity_crisis": 15, "random_incident": 15}
|
| 26 |
+
MAX_TOTAL_REWARD = {"clean_deploy": 0.70, "broken_pipeline": 0.85, "judgment_call": 0.65, "cascading_failure": 0.80, "capacity_crisis": 0.75, "random_incident": 0.70}
|
| 27 |
+
TEMPERATURE = 0.7
|
| 28 |
+
MAX_TOKENS = 300
|
| 29 |
+
SUCCESS_SCORE_THRESHOLD = 0.1
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# --- Log Functions (EXACT hackathon format) -----------------------------------
|
| 33 |
+
def log_start(task: str, env: str, model: str) -> None:
|
| 34 |
+
print(f"[START] task={task} env={env} model={model}", flush=True)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
|
| 38 |
+
error_val = error if error else "null"
|
| 39 |
+
done_val = str(done).lower()
|
| 40 |
+
print(
|
| 41 |
+
f"[STEP] step={step} action={action} reward={reward:.2f} "
|
| 42 |
+
f"done={done_val} error={error_val}",
|
| 43 |
+
flush=True,
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
|
| 48 |
+
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 49 |
+
print(
|
| 50 |
+
f"[END] success={str(success).lower()} steps={steps} "
|
| 51 |
+
f"score={score:.3f} rewards={rewards_str}",
|
| 52 |
+
flush=True,
|
| 53 |
+
)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# --- System Prompt ------------------------------------------------------------
|
| 57 |
+
SYSTEM_PROMPT = textwrap.dedent("""
|
| 58 |
+
You are a DevOps engineer managing a CI/CD deployment pipeline with these services:
|
| 59 |
+
|
| 60 |
+
database-primary: PostgreSQL root database. All services depend on it for data.
|
| 61 |
+
auth-service: OAuth/JWT token provider. All services validate tokens through it. Depends on database-primary.
|
| 62 |
+
api-gateway: Request router and load balancer. Depends on database-primary and auth-service.
|
| 63 |
+
cache-service: Redis cache layer. Depends on database-primary.
|
| 64 |
+
web-frontend: User-facing application. Depends on api-gateway and auth-service.
|
| 65 |
+
|
| 66 |
+
Dependency chain: database-primary → auth-service → api-gateway → web-frontend
|
| 67 |
+
database-primary → cache-service
|
| 68 |
+
|
| 69 |
+
STRATEGY:
|
| 70 |
+
- Read the summary field first — it tells you what's wrong at a glance.
|
| 71 |
+
- Investigate degraded/down services with view_logs before acting.
|
| 72 |
+
- Fix ROOT CAUSE services BEFORE downstream services.
|
| 73 |
+
- Actions have side effects: deploys spike CPU, rollbacks risk regression, config changes cause restart latency.
|
| 74 |
+
- In capacity scenarios, act proactively — don't wait for failures.
|
| 75 |
+
|
| 76 |
+
TASK-SPECIFIC GUIDANCE:
|
| 77 |
+
- clean_deploy: Deploy api-gateway then web-frontend. No complications expected.
|
| 78 |
+
- broken_pipeline: Check cache-service logs/config first — Redis host is usually wrong. Run the pending migration before deploying api-gateway.
|
| 79 |
+
- judgment_call: INCIDENT — check api-gateway logs first. Three options: (1) BEST: deploy hotfix v2.3.2 to api-gateway THEN edit web-frontend config api.auth_version to "v2", (2) SAFE: rollback api-gateway, (3) RISKY: deploy hotfix without fixing auth. Option 1 scores highest.
|
| 80 |
+
- cascading_failure: Find ROOT CAUSE — check cache-service first, it's usually the source. Fix its config (max_connections too low), deploy it, then recover downstream services.
|
| 81 |
+
- capacity_crisis: Check database-primary IMMEDIATELY — connection pool nearly full. Increase max_connections to 100+. Act FAST before tipping points cascade.
|
| 82 |
+
- random_incident: Procedurally generated. Read the task description carefully — it tells you which service is failing and what type of failure. Investigate that service first.
|
| 83 |
+
|
| 84 |
+
You must respond with a SINGLE valid JSON object matching the PipelineAction schema.
|
| 85 |
+
|
| 86 |
+
Example responses:
|
| 87 |
+
{"action_type": "view_pipeline"}
|
| 88 |
+
{"action_type": "view_logs", "service_name": "api-gateway"}
|
| 89 |
+
{"action_type": "deploy", "service_name": "api-gateway", "target_version": "v2.3.1"}
|
| 90 |
+
{"action_type": "edit_config", "service_name": "cache-service", "config_edits": [{"key": "redis.host", "value": "redis-prod.internal:6379"}]}
|
| 91 |
+
{"action_type": "rollback", "service_name": "api-gateway", "reason": "Hotfix unstable"}
|
| 92 |
+
{"action_type": "approve", "reason": "All services deployed and healthy"}
|
| 93 |
+
|
| 94 |
+
Respond with ONLY the JSON object. No explanation, no markdown.
|
| 95 |
+
""").strip()
|
| 96 |
+
|
| 97 |
+
RETRY_PROMPT = 'Respond with ONLY a JSON action. Example: {"action_type": "view_pipeline"}'
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def summarize_observation(obs_dict):
|
| 101 |
+
"""Compress observation so LLM can actually parse it."""
|
| 102 |
+
summary = obs_dict.get("summary", "")
|
| 103 |
+
task = obs_dict.get("task_description", "")
|
| 104 |
+
goal = obs_dict.get("goal", "")
|
| 105 |
+
last_result = obs_dict.get("last_action_result", "")
|
| 106 |
+
last_error = obs_dict.get("last_action_error", "")
|
| 107 |
+
step = obs_dict.get("step_number", 0)
|
| 108 |
+
max_steps = obs_dict.get("max_steps", 15)
|
| 109 |
+
|
| 110 |
+
services_compact = []
|
| 111 |
+
for svc in obs_dict.get("services", []):
|
| 112 |
+
name = svc.get("name", "?")
|
| 113 |
+
health = svc.get("health", "?")
|
| 114 |
+
err = svc.get("error_rate", 0)
|
| 115 |
+
lat = svc.get("request_latency_ms", 0)
|
| 116 |
+
cpu = svc.get("cpu_percent", 0)
|
| 117 |
+
line = f"{name}: {health}"
|
| 118 |
+
if health != "healthy":
|
| 119 |
+
line += f" (err={err:.1f}/s, lat={lat:.0f}ms)"
|
| 120 |
+
if cpu > 70:
|
| 121 |
+
line += f" [CPU={cpu:.0f}%]"
|
| 122 |
+
services_compact.append(line)
|
| 123 |
+
|
| 124 |
+
alerts = [
|
| 125 |
+
f"[{a.get('severity','')}] {a.get('message','')}"
|
| 126 |
+
for a in obs_dict.get("active_alerts", [])[:3]
|
| 127 |
+
]
|
| 128 |
+
available = obs_dict.get("available_actions", [])
|
| 129 |
+
config = obs_dict.get("config_snapshot", {})
|
| 130 |
+
|
| 131 |
+
parts = []
|
| 132 |
+
if step == 0:
|
| 133 |
+
parts.append(f"TASK: {task}")
|
| 134 |
+
parts.append(f"GOAL: {goal}")
|
| 135 |
+
parts.append(f"Step {step}/{max_steps}")
|
| 136 |
+
if summary:
|
| 137 |
+
parts.append(f"Status: {summary}")
|
| 138 |
+
parts.append(f"Services: {'; '.join(services_compact)}")
|
| 139 |
+
if alerts:
|
| 140 |
+
parts.append(f"Alerts: {'; '.join(alerts)}")
|
| 141 |
+
if config:
|
| 142 |
+
parts.append(f"Config: {config}")
|
| 143 |
+
if last_result:
|
| 144 |
+
parts.append(f"Last result: {last_result[:300]}")
|
| 145 |
+
if last_error:
|
| 146 |
+
parts.append(f"Error: {last_error[:200]}")
|
| 147 |
+
parts.append(f"Available actions: {', '.join(available)}")
|
| 148 |
+
|
| 149 |
+
return "\n".join(p for p in parts if p)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def build_user_message(obs, investigated):
|
| 153 |
+
"""Build user message with compact observation for LLM."""
|
| 154 |
+
obs_dict = obs.model_dump(mode="json")
|
| 155 |
+
compact = summarize_observation(obs_dict)
|
| 156 |
+
|
| 157 |
+
inv_block = ""
|
| 158 |
+
if investigated:
|
| 159 |
+
inv_block = "\n\nINVESTIGATED: " + ", ".join(sorted(investigated))
|
| 160 |
+
|
| 161 |
+
return f"CURRENT STATE:\n{compact}{inv_block}\n\nWhat is your next action?"
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def build_messages(system_prompt, conversation, current_user_msg):
|
| 165 |
+
"""Build multi-turn messages list with system prompt + last 6 turns + current."""
|
| 166 |
+
messages = [{"role": "system", "content": system_prompt}]
|
| 167 |
+
# Keep last 6 turns (12 messages = 6 user + 6 assistant)
|
| 168 |
+
recent = conversation[-(6 * 2):]
|
| 169 |
+
messages.extend(recent)
|
| 170 |
+
messages.append({"role": "user", "content": current_user_msg})
|
| 171 |
+
return messages
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def parse_llm_action(text):
|
| 175 |
+
"""Parse LLM response into PipelineAction. Fallback to view_pipeline on failure."""
|
| 176 |
+
try:
|
| 177 |
+
text = text.strip()
|
| 178 |
+
if text.startswith("```"):
|
| 179 |
+
text = text.split("```")[1]
|
| 180 |
+
if text.startswith("json"):
|
| 181 |
+
text = text[4:]
|
| 182 |
+
data = json.loads(text)
|
| 183 |
+
return PipelineAction(**data)
|
| 184 |
+
except Exception:
|
| 185 |
+
return PipelineAction(action_type=ActionType.VIEW_PIPELINE)
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def _is_parse_fallback(action, raw_text):
    """Tell a parser fallback apart from a view_pipeline the model really asked for."""
    return (
        action.action_type == ActionType.VIEW_PIPELINE
        and "view_pipeline" not in raw_text.lower()
    )


async def _chat(client, messages, temperature, max_tokens):
    """Run one blocking chat-completion request in an executor and return its stripped text."""
    loop = asyncio.get_running_loop()
    # The client call is synchronous; running it in the default executor keeps
    # the event loop free while the request is in flight.
    completion = await loop.run_in_executor(
        None,
        lambda: client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
            stream=False,
        ),
    )
    return (completion.choices[0].message.content or "").strip()


async def _choose_action(client, conversation, user_msg):
    """Ask the LLM for the next action; return ``(action, text recorded in history)``."""
    messages = build_messages(SYSTEM_PROMPT, conversation, user_msg)
    try:
        action_text = await _chat(client, messages, TEMPERATURE, MAX_TOKENS)
        action = parse_llm_action(action_text)

        # Retry once if parse fell back to default
        if _is_parse_fallback(action, action_text):
            retry_msgs = build_messages(RETRY_PROMPT, conversation, user_msg)
            retry_text = await _chat(client, retry_msgs, 0.3, 150)
            retry_action = parse_llm_action(retry_text)
            if not _is_parse_fallback(retry_action, retry_text):
                return retry_action, retry_text
        return action, action_text
    except Exception as e:
        print(f"[DEBUG] LLM call failed: {e}", flush=True)
        fallback = PipelineAction(action_type=ActionType.VIEW_PIPELINE)
        return fallback, '{"action_type": "view_pipeline"}'


async def run_task(client, env, task_name):
    """Run one episode of ``task_name`` with the LLM choosing every action.

    Args:
        client: Chat client exposing ``chat.completions.create``.
        env: Environment client with awaitable ``reset`` and ``step``.
        task_name: Task identifier; also exported as ``DEVOPS_TASK``.

    The episode is reported through ``log_start``, one ``log_step`` per action
    and a final ``log_end``. Errors raised while running the episode are
    printed and swallowed; in that case ``score`` and ``success`` keep their
    initial values (0.0 / False) even if some rewards were already collected.
    """
    rewards = []
    steps_taken = 0
    score = 0.0
    success = False
    max_steps = MAX_STEPS_PER_TASK.get(task_name, 20)
    max_reward = MAX_TOTAL_REWARD.get(task_name, 1.0)
    conversation = []  # Multi-turn: list of {"role": ..., "content": ...}
    investigated = set()

    log_start(task=task_name, env=BENCHMARK, model=MODEL_NAME)

    try:
        os.environ["DEVOPS_TASK"] = task_name
        result = await env.reset(task=task_name)
        obs = result.observation

        for step in range(1, max_steps + 1):
            if result.done:
                break

            user_msg = build_user_message(obs, investigated)
            action, action_text = await _choose_action(client, conversation, user_msg)

            # Track investigated services
            if action.action_type in (ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG) and action.service_name:
                investigated.add(f"{action.action_type.value}:{action.service_name}")

            # Append this turn to conversation history
            conversation.append({"role": "user", "content": user_msg})
            conversation.append({"role": "assistant", "content": action_text})

            result = await env.step(action)
            obs = result.observation

            reward = result.reward or 0.0
            done = result.done
            error = obs.last_action_error

            rewards.append(reward)
            steps_taken = step

            action_str = json.dumps(action.model_dump(exclude_none=True), default=str)
            log_step(step=step, action=action_str, reward=reward, done=done, error=error)

            if done:
                break

        # Normalise the summed reward by the task's maximum and clamp to [0, 1].
        score = sum(rewards) / max_reward if max_reward > 0 else 0.0
        score = min(max(score, 0.0), 1.0)
        success = score >= SUCCESS_SCORE_THRESHOLD

    except Exception as e:
        print(f"[DEBUG] Task {task_name} error: {e}", flush=True)

    finally:
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
async def main():
    """Create the LLM client and environment, run every task, then close the environment.

    The environment is started from a Docker image when ``IMAGE_NAME`` is set;
    otherwise it connects to ``ENV_BASE_URL`` (default http://localhost:8000).
    Failures while closing the environment are printed, not raised.
    """
    llm_client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    if not IMAGE_NAME:
        server_url = os.getenv("ENV_BASE_URL", "http://localhost:8000")
        env = DevopsPipelineEnv(base_url=server_url)
    else:
        env = await DevopsPipelineEnv.from_docker_image(IMAGE_NAME)

    try:
        for task_name in TASKS:
            await run_task(llm_client, env, task_name)
    finally:
        try:
            await env.close()
        except Exception as close_error:
            print(f"[DEBUG] env.close() error: {close_error}", flush=True)


# Script entry point: only run the episodes when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|
integration_test.py
CHANGED
|
@@ -1,424 +1,424 @@
|
|
| 1 |
-
"""Comprehensive integration test for the DevOps Pipeline Environment."""
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
import sys
|
| 5 |
-
import json
|
| 6 |
-
import traceback
|
| 7 |
-
|
| 8 |
-
# Add project to path
|
| 9 |
-
sys.path.insert(0, os.path.dirname(__file__))
|
| 10 |
-
|
| 11 |
-
os.environ.pop("DEVOPS_TASK", None)
|
| 12 |
-
|
| 13 |
-
from devops_pipeline_env.models import (
|
| 14 |
-
ActionType,
|
| 15 |
-
ConfigEdit,
|
| 16 |
-
PipelineAction,
|
| 17 |
-
)
|
| 18 |
-
from server.pipeline_environment import PipelineEnvironment
|
| 19 |
-
from server.graders import grade_task
|
| 20 |
-
|
| 21 |
-
PASS = "PASS"
FAIL = "FAIL"
# Every reported check as a (test_name, status, detail) tuple, in call order.
results = []


def report(test_name, passed, detail=""):
    """Record one check in ``results`` and print a one-line status for it.

    Args:
        test_name: Label shown in the output and stored in ``results``.
        passed: Truthy when the check succeeded.
        detail: Optional extra text, printed after the label when non-empty.
    """
    status = PASS if passed else FAIL
    results.append((test_name, status, detail))
    # ASCII separator: the original non-ASCII character was garbled.
    suffix = f" - {detail}" if detail else ""
    print(f" [{status}] {test_name}{suffix}", flush=True)
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
def make_action(action_type, service_name=None, target_version=None, config_edits=None,
                migration_name=None, migration_type=None, reason=None):
    """Build a PipelineAction; every field the caller omits is passed as None."""
    fields = dict(
        action_type=action_type,
        service_name=service_name,
        target_version=target_version,
        config_edits=config_edits,
        migration_name=migration_name,
        migration_type=migration_type,
        reason=reason,
    )
    return PipelineAction(**fields)
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
# ============================================================================
# TEST 2: POST /reset -> 5 services (test each task)
# ============================================================================
print("\n=== TEST 2: POST /reset -> 5 services ===", flush=True)
# NOTE(review): capacity_crisis is not exercised here although TEST 4 expects
# it to be registered - confirm whether that omission is intentional.
for task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure"]:
    os.environ["DEVOPS_TASK"] = task
    env = PipelineEnvironment()
    obs = env.reset()
    svc_names = sorted(s.name for s in obs.services)
    # Despite the "5 services" banner, a task passes with four or more
    # services; the names are only reported, not compared.
    has_enough_services = len(obs.services) >= 4
    report(f"reset {task}: services={len(obs.services)}", has_enough_services,
           f"names={svc_names}")

# ============================================================================
# TEST 3: GET /health (just test the function exists)
# ============================================================================
print("\n=== TEST 3: GET /health ===", flush=True)
# NOTE(review): this records an unconditional pass; nothing is actually called.
report("/health endpoint exists", True, "Verified in app.py line 65")

# ============================================================================
# TEST 4: GET /tasks -> 5 tasks
# ============================================================================
print("\n=== TEST 4: GET /tasks -> 5 tasks ===", flush=True)
from server.app import get_tasks
tasks_resp = get_tasks()
task_names = [t["name"] for t in tasks_resp["tasks"]]
report("5 tasks returned", len(task_names) == 5, f"tasks={task_names}")
for expected_task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis"]:
    report(f" task '{expected_task}' present", expected_task in task_names)


# ============================================================================
# TEST 5: Optimal path tests
# ============================================================================
print("\n=== TEST 5: Optimal path scores ===", flush=True)
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
def run_clean_deploy():
    """Replay the scripted clean_deploy episode and return its graded score."""
    task_id = "clean_deploy"
    os.environ["DEVOPS_TASK"] = task_id
    environment = PipelineEnvironment()
    environment.reset()

    script = [
        make_action(ActionType.VIEW_LOGS, service_name=name)
        for name in ("api-gateway", "web-frontend")
    ]
    # Each deploy is issued twice (presumably staging, then production).
    for name, version in (("api-gateway", "v2.3.1"), ("web-frontend", "v1.9.0")):
        for _ in range(2):
            script.append(make_action(ActionType.DEPLOY, service_name=name, target_version=version))
    script.append(make_action(ActionType.APPROVE, reason="Both services deployed successfully"))

    for scripted_action in script:
        environment.step(scripted_action)
    return grade_task(task_id, environment.get_episode_history(), environment.get_engine())
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
def run_broken_pipeline():
    """Replay the scripted broken_pipeline episode and return its graded score."""
    task_id = "broken_pipeline"
    os.environ["DEVOPS_TASK"] = task_id
    environment = PipelineEnvironment()
    environment.reset()

    redis_fix = ConfigEdit(key="redis.host", value="redis-prod.internal:6379")
    script = [
        make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
        make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
        make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
        make_action(ActionType.EDIT_CONFIG, service_name="cache-service", config_edits=[redis_fix]),
        make_action(ActionType.RUN_MIGRATION, migration_name="add_index_users_email",
                    migration_type="schema"),
    ]
    # Each deploy is issued twice (presumably staging, then production).
    rollout = (("api-gateway", "v2.3.1"), ("cache-service", "v1.2.1"), ("web-frontend", "v1.9.0"))
    for name, version in rollout:
        for _ in range(2):
            script.append(make_action(ActionType.DEPLOY, service_name=name, target_version=version))
    script.append(make_action(ActionType.APPROVE, reason="All services deployed"))

    for scripted_action in script:
        environment.step(scripted_action)
    return grade_task(task_id, environment.get_episode_history(), environment.get_engine())
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
def run_judgment_call_expert():
    """Replay the scripted judgment_call episode and return its graded score."""
    task_id = "judgment_call"
    os.environ["DEVOPS_TASK"] = task_id
    environment = PipelineEnvironment()
    environment.reset()

    script = [
        make_action(ActionType.VIEW_LOGS, service_name=name)
        for name in ("api-gateway", "web-frontend")
    ]
    # The deploy is issued twice (presumably staging, then production).
    for _ in range(2):
        script.append(make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.2"))
    auth_fix = ConfigEdit(key="api.auth_version", value="v2")
    script.append(make_action(ActionType.EDIT_CONFIG, service_name="web-frontend", config_edits=[auth_fix]))
    script.append(make_action(ActionType.APPROVE, reason="Hotfix deployed, auth config fixed"))

    for scripted_action in script:
        environment.step(scripted_action)
    return grade_task(task_id, environment.get_episode_history(), environment.get_engine())
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
def run_cascading_failure():
    """Replay the scripted cascading_failure episode and return its graded score."""
    task_id = "cascading_failure"
    os.environ["DEVOPS_TASK"] = task_id
    environment = PipelineEnvironment()
    environment.reset()

    pool_fix = ConfigEdit(key="redis.max_connections", value="50")
    script = [
        make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
        make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
        make_action(ActionType.EDIT_CONFIG, service_name="cache-service", config_edits=[pool_fix]),
    ]
    # Each deploy is issued twice (presumably staging, then production).
    rollout = (("cache-service", "v1.2.1"), ("api-gateway", "v2.3.1"), ("web-frontend", "v1.9.0"))
    for name, version in rollout:
        for _ in range(2):
            script.append(make_action(ActionType.DEPLOY, service_name=name, target_version=version))
    script.append(make_action(ActionType.APPROVE, reason="All services recovered and deployed"))

    for scripted_action in script:
        environment.step(scripted_action)
    return grade_task(task_id, environment.get_episode_history(), environment.get_engine())
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
def run_capacity_crisis():
    """Replay the scripted capacity_crisis episode and return its graded score."""
    task_id = "capacity_crisis"
    os.environ["DEVOPS_TASK"] = task_id
    environment = PipelineEnvironment()
    environment.reset()

    script = [make_action(ActionType.VIEW_LOGS, service_name="database-primary")]
    # One config edit per service, applied in this order.
    tuning = (
        ("database-primary", "max_connections", "100"),
        ("cache-service", "max_memory", "4GB"),
    )
    for service, key, value in tuning:
        script.append(make_action(ActionType.EDIT_CONFIG, service_name=service,
                                  config_edits=[ConfigEdit(key=key, value=value)]))
    script.append(make_action(ActionType.VIEW_PIPELINE))
    script.append(make_action(ActionType.APPROVE, reason="Stabilized"))

    for scripted_action in script:
        environment.step(scripted_action)
    return grade_task(task_id, environment.get_episode_history(), environment.get_engine())
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
# Scripted runner and minimum acceptable score for each task.
targets = dict(
    clean_deploy=(run_clean_deploy, 0.95),
    broken_pipeline=(run_broken_pipeline, 0.80),
    judgment_call=(run_judgment_call_expert, 0.90),
    cascading_failure=(run_cascading_failure, 0.70),
    capacity_crisis=(run_capacity_crisis, 0.60),
)

# Scores of the optimal-path runs, keyed by task; printed in the summary.
scores = {}
for task, (runner, target) in targets.items():
    try:
        score = runner()
        scores[task] = score
        met_target = score >= target
        verdict = "OK" if met_target else "BELOW TARGET"
        report(f"optimal {task}: {score:.3f} (target {target:.2f}+)", met_target, verdict)
    except Exception as e:
        report(f"optimal {task}", False, f"EXCEPTION: {e}\n{traceback.format_exc()}")


# ============================================================================
# TEST 6: Determinism - same seed, same score
# ============================================================================
print("\n=== TEST 6: Determinism ===", flush=True)
for task, (runner, _) in targets.items():
    try:
        # Two back-to-back runs of the same script must grade identically.
        s1, s2 = runner(), runner()
        report(f"determinism {task}: {s1:.3f} == {s2:.3f}", s1 == s2)
    except Exception as e:
        report(f"determinism {task}", False, f"EXCEPTION: {e}")
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
# ============================================================================
# TEST 7: Action validation for ALL 5 services
# ============================================================================
print("\n=== TEST 7: Action validation for all services ===", flush=True)


def _action_detail(obs, limit=80):
    """Return the action's error message if any, else its result truncated to ``limit`` chars.

    The previous inline expression ``err or result[:80] if result else ""``
    parsed as ``(err or result[:80]) if result else ""`` and therefore dropped
    the error message whenever the result was empty.
    """
    return obs.last_action_error or (obs.last_action_result or "")[:limit]


# Use cascading_failure which has all 5 services. DEVOPS_TASK is left at this
# value for the other environments created in this section.
os.environ["DEVOPS_TASK"] = "cascading_failure"
env = PipelineEnvironment()
obs = env.reset()

svc_names = [s.name for s in obs.services]
report("5 services present", len(svc_names) == 5, f"{sorted(svc_names)}")

# Test deploy on database-primary and auth-service
for svc in ["database-primary", "auth-service"]:
    obs = env.step(make_action(ActionType.DEPLOY, service_name=svc, target_version="v99.0.0"))
    report(f"deploy {svc}", obs.last_action_error is None, _action_detail(obs))

# Rollback
env2 = PipelineEnvironment()
obs = env2.reset()
for svc in ["database-primary", "auth-service"]:
    obs = env2.step(make_action(ActionType.ROLLBACK, service_name=svc))
    report(f"rollback {svc}", obs.last_action_error is None, _action_detail(obs))

# view_logs
env3 = PipelineEnvironment()
obs = env3.reset()
for svc in ["database-primary", "auth-service"]:
    obs = env3.step(make_action(ActionType.VIEW_LOGS, service_name=svc))
    has_logs = obs.last_action_result and len(obs.last_action_result) > 10
    report(f"view_logs {svc}", has_logs,
           f"len={len(obs.last_action_result) if obs.last_action_result else 0}")

# view_config
for svc in ["database-primary", "auth-service"]:
    obs = env3.step(make_action(ActionType.VIEW_CONFIG, service_name=svc))
    has_config = obs.last_action_result and "=" in obs.last_action_result
    report(f"view_config {svc}", has_config,
           obs.last_action_result[:80] if obs.last_action_result else "none")

# edit_config
env4 = PipelineEnvironment()
obs = env4.reset()
obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="database-primary",
                            config_edits=[ConfigEdit(key="max_connections", value="100")]))
report("edit_config database-primary", obs.last_action_error is None, _action_detail(obs))

obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="auth-service",
                            config_edits=[ConfigEdit(key="token_ttl_seconds", value="7200")]))
report("edit_config auth-service", obs.last_action_error is None, _action_detail(obs))
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
# ============================================================================
# TEST 8: Invalid action tests
# ============================================================================
print("\n=== TEST 8: Invalid action tests ===", flush=True)
env5 = PipelineEnvironment()
obs = env5.reset()

# Each case is (report label, factory for the invalid action). Actions are
# built lazily so that a construction failure is also caught and reported.
invalid_cases = [
    (
        "deploy nonexistent-service: graceful error",
        lambda: make_action(ActionType.DEPLOY, service_name="nonexistent-service",
                            target_version="v1.0"),
    ),
    (
        "edit_config fake-service: graceful error",
        lambda: make_action(ActionType.EDIT_CONFIG, service_name="fake-service",
                            config_edits=[ConfigEdit(key="x", value="y")]),
    ),
]
for label, build_action in invalid_cases:
    try:
        obs = env5.step(build_action())
        has_error = obs.last_action_error is not None
        error_text = obs.last_action_error[:80] if obs.last_action_error else "no error msg"
        report(label, has_error, error_text)
    except Exception as e:
        report(label, False, f"CRASHED: {e}")
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
# ============================================================================
# TEST 9: Partial observability
# ============================================================================
print("\n=== TEST 9: Partial observability ===", flush=True)


def _service_by_name(observation, service_name):
    """Return the first service in ``observation`` called ``service_name``."""
    return [svc for svc in observation.services if svc.name == service_name][0]


os.environ["DEVOPS_TASK"] = "cascading_failure"
env6 = PipelineEnvironment()
obs = env6.reset()

# Right after reset the CPU/memory readings are expected to be zeroed out.
db_svc = _service_by_name(obs, "database-primary")
report("CPU hidden after reset", db_svc.cpu_percent == 0.0, f"cpu={db_svc.cpu_percent}")
report("memory hidden after reset", db_svc.memory_percent == 0.0, f"mem={db_svc.memory_percent}")

# Viewing the logs should expose non-zero CPU/memory for that service.
obs = env6.step(make_action(ActionType.VIEW_LOGS, service_name="database-primary"))
db_svc = _service_by_name(obs, "database-primary")
report("CPU revealed after view_logs", db_svc.cpu_percent > 0.0, f"cpu={db_svc.cpu_percent}")
report("memory revealed after view_logs", db_svc.memory_percent > 0.0, f"mem={db_svc.memory_percent}")

# Viewing the config should populate config_snapshot.
obs = env6.step(make_action(ActionType.VIEW_CONFIG, service_name="database-primary"))
snapshot_keys = list(obs.config_snapshot.keys()) if obs.config_snapshot else 'none'
report("config_snapshot revealed after view_config", obs.config_snapshot is not None,
       f"keys={snapshot_keys}")

# A service that was never inspected should still read as zero.
cache_svc = _service_by_name(obs, "cache-service")
report("other service CPU still hidden", cache_svc.cpu_percent == 0.0,
       f"cache cpu={cache_svc.cpu_percent}")
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
# ============================================================================
# TEST 10: Cascading effects
# ============================================================================
print("\n=== TEST 10: Cascading effects ===", flush=True)


def _service_by_name(observation, service_name):
    """Return the first service in ``observation`` called ``service_name``."""
    return [svc for svc in observation.services if svc.name == service_name][0]


os.environ["DEVOPS_TASK"] = "cascading_failure"
env7 = PipelineEnvironment()
obs = env7.reset()

# cache-service degraded -> api-gateway should be degrading
api_gw = _service_by_name(obs, "api-gateway")
report("api-gateway degraded from cascade", api_gw.health.value in ("degraded",),
       f"health={api_gw.health.value}")

# Fix cache-service, then deploy it twice (staging then prod); only the
# observation from the last step is inspected.
pool_fix = ConfigEdit(key="redis.max_connections", value="50")
env7.step(make_action(ActionType.VIEW_CONFIG, service_name="cache-service"))
env7.step(make_action(ActionType.EDIT_CONFIG, service_name="cache-service", config_edits=[pool_fix]))
for _ in range(2):
    obs = env7.step(make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"))

cache_svc = _service_by_name(obs, "cache-service")
report("cache-service healthy after fix", cache_svc.health.value == "healthy",
       f"health={cache_svc.health.value}")

# Recovery cascade - api-gateway should start recovering (may take steps).
# After fixing the root cause, cascading should at minimum stop making it worse.
obs = env7.step(make_action(ActionType.VIEW_PIPELINE))
api_gw = _service_by_name(obs, "api-gateway")
report("api-gateway recovery started (cascade stopped or improving)",
       api_gw.error_rate < 30.0,
       f"error_rate={api_gw.error_rate}, health={api_gw.health.value}")
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
# ============================================================================
# TEST 11: Trade-off effects in action results
# ============================================================================
print("\n=== TEST 11: Trade-off effects ===", flush=True)
os.environ["DEVOPS_TASK"] = "clean_deploy"
env8 = PipelineEnvironment()
obs = env8.reset()

# Deploy - the result text of the second deploy should mention a CPU/latency spike
for _ in range(2):
    obs = env8.step(make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"))
deploy_result = obs.last_action_result or ""
has_spike = any(word in deploy_result.lower() for word in ("spike", "warmup", "cpu"))
report("deploy mentions CPU/latency spike", has_spike, deploy_result[:100])

# Rollback - should mention regression
os.environ["DEVOPS_TASK"] = "cascading_failure"
env9 = PipelineEnvironment()
obs = env9.reset()
obs = env9.step(make_action(ActionType.ROLLBACK, service_name="cache-service"))
rollback_result = obs.last_action_result or ""
has_regression = any(word in rollback_result.lower() for word in ("regress", "rolled back", "monitoring"))
report("rollback mentions regression risk", has_regression, rollback_result[:120])

# edit_config - should mention restart/latency
env10 = PipelineEnvironment()
obs = env10.reset()
pool_fix = ConfigEdit(key="redis.max_connections", value="50")
obs = env10.step(make_action(ActionType.EDIT_CONFIG, service_name="cache-service", config_edits=[pool_fix]))
config_result = obs.last_action_result or ""
has_restart = any(word in config_result.lower() for word in ("restart", "latency", "spike"))
report("edit_config mentions restart/latency", has_restart, config_result[:120])
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 70, flush=True)
print("INTEGRATION TEST SUMMARY", flush=True)
print("=" * 70, flush=True)
passed = sum(1 for _, s, _ in results if s == PASS)
failed = sum(1 for _, s, _ in results if s == FAIL)
print(f" PASSED: {passed}", flush=True)
print(f" FAILED: {failed}", flush=True)
print(f" TOTAL: {len(results)}", flush=True)

if failed > 0:
    print("\nFAILED TESTS:", flush=True)
    for name, status, detail in results:
        if status == FAIL:
            # ASCII separator (the original character was garbled); omitted
            # when there is no detail so no dangling separator is printed.
            suffix = f" - {detail}" if detail else ""
            print(f" [FAIL] {name}{suffix}", flush=True)

print("\nSCORES:", flush=True)
for task, score in scores.items():
    print(f" {task}: {score:.3f}", flush=True)

# Non-zero exit status when any check failed, so callers can detect failure.
sys.exit(1 if failed > 0 else 0)
|
|
|
|
| 1 |
+
"""Comprehensive integration test for the DevOps Pipeline Environment."""
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import json
|
| 6 |
+
import traceback
|
| 7 |
+
|
| 8 |
+
# Add project to path
|
| 9 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 10 |
+
|
| 11 |
+
os.environ.pop("DEVOPS_TASK", None)
|
| 12 |
+
|
| 13 |
+
from devops_pipeline_env.models import (
|
| 14 |
+
ActionType,
|
| 15 |
+
ConfigEdit,
|
| 16 |
+
PipelineAction,
|
| 17 |
+
)
|
| 18 |
+
from server.pipeline_environment import PipelineEnvironment
|
| 19 |
+
from server.graders import grade_task
|
| 20 |
+
|
| 21 |
+
PASS = "PASS"
FAIL = "FAIL"
# Every reported check as a (test_name, status, detail) tuple, in call order.
results = []


def report(test_name, passed, detail=""):
    """Record one check in ``results`` and print a one-line status for it.

    Args:
        test_name: Label shown in the output and stored in ``results``.
        passed: Truthy when the check succeeded.
        detail: Optional extra text, printed after the label when non-empty.
    """
    status = PASS if passed else FAIL
    results.append((test_name, status, detail))
    # ASCII separator: the original non-ASCII character was garbled.
    suffix = f" - {detail}" if detail else ""
    print(f" [{status}] {test_name}{suffix}", flush=True)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def make_action(action_type, service_name=None, target_version=None, config_edits=None,
                migration_name=None, migration_type=None, reason=None):
    """Build a PipelineAction; every field the caller omits is passed as None."""
    fields = dict(
        action_type=action_type,
        service_name=service_name,
        target_version=target_version,
        config_edits=config_edits,
        migration_name=migration_name,
        migration_type=migration_type,
        reason=reason,
    )
    return PipelineAction(**fields)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ============================================================================
# TEST 2: POST /reset -> 5 services (test each task)
# ============================================================================
print("\n=== TEST 2: POST /reset -> 5 services ===", flush=True)
# NOTE(review): capacity_crisis is not exercised here although TEST 4 expects
# it to be registered - confirm whether that omission is intentional.
for task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure"]:
    os.environ["DEVOPS_TASK"] = task
    env = PipelineEnvironment()
    obs = env.reset()
    svc_names = sorted(s.name for s in obs.services)
    # Despite the "5 services" banner, a task passes with four or more
    # services; the names are only reported, not compared.
    has_enough_services = len(obs.services) >= 4
    report(f"reset {task}: services={len(obs.services)}", has_enough_services,
           f"names={svc_names}")

# ============================================================================
# TEST 3: GET /health (just test the function exists)
# ============================================================================
print("\n=== TEST 3: GET /health ===", flush=True)
# NOTE(review): this records an unconditional pass; nothing is actually called.
report("/health endpoint exists", True, "Verified in app.py line 65")

# ============================================================================
# TEST 4: GET /tasks -> 5 tasks
# ============================================================================
print("\n=== TEST 4: GET /tasks -> 5 tasks ===", flush=True)
from server.app import get_tasks
tasks_resp = get_tasks()
task_names = [t["name"] for t in tasks_resp["tasks"]]
report("5 tasks returned", len(task_names) == 5, f"tasks={task_names}")
for expected_task in ["clean_deploy", "broken_pipeline", "judgment_call", "cascading_failure", "capacity_crisis"]:
    report(f" task '{expected_task}' present", expected_task in task_names)


# ============================================================================
# TEST 5: Optimal path tests
# ============================================================================
print("\n=== TEST 5: Optimal path scores ===", flush=True)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def run_clean_deploy():
    """Replay the scripted ``clean_deploy`` episode and return its grade.

    The task is selected through the ``DEVOPS_TASK`` environment variable
    before a fresh ``PipelineEnvironment`` is created and reset. The script
    views logs for both services, deploys each one, approves, and then hands
    the episode history and engine to ``grade_task``.
    """
    task_name = "clean_deploy"
    os.environ["DEVOPS_TASK"] = task_name
    env = PipelineEnvironment()
    env.reset()

    rollout = (("api-gateway", "v2.3.1"), ("web-frontend", "v1.9.0"))

    script = [make_action(ActionType.VIEW_LOGS, service_name=service) for service, _ in rollout]
    for service, version in rollout:
        # Every deploy is issued twice in a row (presumably staging, then
        # production -- confirm against the pipeline engine).
        script.extend(
            make_action(ActionType.DEPLOY, service_name=service, target_version=version)
            for _ in range(2)
        )
    script.append(make_action(ActionType.APPROVE, reason="Both services deployed successfully"))

    for action in script:
        env.step(action)
    return grade_task(task_name, env.get_episode_history(), env.get_engine())
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def run_broken_pipeline():
    """Replay the scripted ``broken_pipeline`` episode and return its grade.

    After selecting the task via ``DEVOPS_TASK`` and resetting a fresh
    environment, the script inspects api-gateway and cache-service, rewrites
    ``redis.host`` on cache-service, runs the ``add_index_users_email`` schema
    migration, deploys the three services, and approves. The result of
    ``grade_task`` for that episode is returned.
    """
    task_name = "broken_pipeline"
    os.environ["DEVOPS_TASK"] = task_name
    env = PipelineEnvironment()
    env.reset()

    def deploy_twice(service, version):
        # Two back-to-back deploy actions for the same service/version.
        return [
            make_action(ActionType.DEPLOY, service_name=service, target_version=version)
            for _ in range(2)
        ]

    script = [
        make_action(ActionType.VIEW_LOGS, service_name="api-gateway"),
        make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
        make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
        make_action(
            ActionType.EDIT_CONFIG,
            service_name="cache-service",
            config_edits=[ConfigEdit(key="redis.host", value="redis-prod.internal:6379")],
        ),
        make_action(
            ActionType.RUN_MIGRATION,
            migration_name="add_index_users_email",
            migration_type="schema",
        ),
        *deploy_twice("api-gateway", "v2.3.1"),
        *deploy_twice("cache-service", "v1.2.1"),
        *deploy_twice("web-frontend", "v1.9.0"),
        make_action(ActionType.APPROVE, reason="All services deployed"),
    ]

    for action in script:
        env.step(action)
    return grade_task(task_name, env.get_episode_history(), env.get_engine())
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def run_judgment_call_expert():
    """Replay the scripted expert path for ``judgment_call`` and return its grade.

    The script views logs for api-gateway and web-frontend, deploys
    api-gateway ``v2.3.2`` (two consecutive deploy actions), sets
    ``api.auth_version`` to ``v2`` on web-frontend, and approves. The episode
    is then scored with ``grade_task``.
    """
    task_name = "judgment_call"
    os.environ["DEVOPS_TASK"] = task_name
    env = PipelineEnvironment()
    env.reset()

    script = []
    for service in ("api-gateway", "web-frontend"):
        script.append(make_action(ActionType.VIEW_LOGS, service_name=service))
    for _ in range(2):
        script.append(
            make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.2")
        )
    script.append(
        make_action(
            ActionType.EDIT_CONFIG,
            service_name="web-frontend",
            config_edits=[ConfigEdit(key="api.auth_version", value="v2")],
        )
    )
    script.append(make_action(ActionType.APPROVE, reason="Hotfix deployed, auth config fixed"))

    for action in script:
        env.step(action)
    return grade_task(task_name, env.get_episode_history(), env.get_engine())
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def run_cascading_failure():
    """Replay the scripted ``cascading_failure`` episode and return its grade.

    The script inspects cache-service (logs, then config), sets
    ``redis.max_connections`` to ``50``, deploys cache-service, api-gateway
    and web-frontend in that order (two deploy actions each), and approves.
    The episode history and engine are passed to ``grade_task``.
    """
    task_name = "cascading_failure"
    os.environ["DEVOPS_TASK"] = task_name
    env = PipelineEnvironment()
    env.reset()

    deploy_order = (
        ("cache-service", "v1.2.1"),
        ("api-gateway", "v2.3.1"),
        ("web-frontend", "v1.9.0"),
    )

    script = [
        make_action(ActionType.VIEW_LOGS, service_name="cache-service"),
        make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
        make_action(
            ActionType.EDIT_CONFIG,
            service_name="cache-service",
            config_edits=[ConfigEdit(key="redis.max_connections", value="50")],
        ),
    ]
    for service, version in deploy_order:
        for _ in range(2):
            script.append(
                make_action(ActionType.DEPLOY, service_name=service, target_version=version)
            )
    script.append(make_action(ActionType.APPROVE, reason="All services recovered and deployed"))

    for action in script:
        env.step(action)
    return grade_task(task_name, env.get_episode_history(), env.get_engine())
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def run_capacity_crisis():
    """Replay the scripted ``capacity_crisis`` episode and return its grade.

    The script views database-primary logs, sets ``max_connections`` to
    ``100`` on database-primary and ``max_memory`` to ``4GB`` on
    cache-service, views the pipeline, and approves. The episode is scored
    with ``grade_task``.
    """
    task_name = "capacity_crisis"
    os.environ["DEVOPS_TASK"] = task_name
    env = PipelineEnvironment()
    env.reset()

    # (service, config key, new value) applied in this order.
    tuning = (
        ("database-primary", "max_connections", "100"),
        ("cache-service", "max_memory", "4GB"),
    )

    script = [make_action(ActionType.VIEW_LOGS, service_name="database-primary")]
    script += [
        make_action(
            ActionType.EDIT_CONFIG,
            service_name=service,
            config_edits=[ConfigEdit(key=key, value=value)],
        )
        for service, key, value in tuning
    ]
    script.append(make_action(ActionType.VIEW_PIPELINE))
    script.append(make_action(ActionType.APPROVE, reason="Stabilized"))

    for action in script:
        env.step(action)
    return grade_task(task_name, env.get_episode_history(), env.get_engine())
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# Task name -> (scripted runner, minimum grade the runner must reach).
targets = dict(
    clean_deploy=(run_clean_deploy, 0.95),
    broken_pipeline=(run_broken_pipeline, 0.80),
    judgment_call=(run_judgment_call_expert, 0.90),
    cascading_failure=(run_cascading_failure, 0.70),
    capacity_crisis=(run_capacity_crisis, 0.60),
)

# Grades gathered here are printed again in the final summary.
scores = {}
for task, (runner, target) in targets.items():
    try:
        score = runner()
        scores[task] = score
        headline = f"optimal {task}: {score:.3f} (target {target:.2f}+)"
        reached = score >= target
        report(headline, reached, "OK" if reached else "BELOW TARGET")
    except Exception as e:
        # A crashing runner is recorded as a failed check, with its traceback,
        # so the remaining tasks still run.
        report(f"optimal {task}", False, f"EXCEPTION: {e}\n{traceback.format_exc()}")
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ============================================================================
# TEST 6: Determinism - same seed, same score
# ============================================================================
print("\n=== TEST 6: Determinism ===", flush=True)
for task, (runner, _) in targets.items():
    try:
        # Run the same scripted episode twice; the grades must match exactly.
        first = runner()
        second = runner()
        report(f"determinism {task}: {first:.3f} == {second:.3f}", first == second)
    except Exception as e:
        report(f"determinism {task}", False, f"EXCEPTION: {e}")
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
# ============================================================================
# TEST 7: Action validation for ALL 5 services
# ============================================================================
print("\n=== TEST 7: Action validation for all services ===", flush=True)

# Use cascading_failure which has all 5 services
os.environ["DEVOPS_TASK"] = "cascading_failure"
env = PipelineEnvironment()
obs = env.reset()

svc_names = [s.name for s in obs.services]
report("5 services present", len(svc_names) == 5, f"{sorted(svc_names)}")

# Test deploy on database-primary and auth-service
for svc in ["database-primary", "auth-service"]:
    obs = env.step(make_action(ActionType.DEPLOY, service_name=svc, target_version="v99.0.0"))
    # The conditional is parenthesised on purpose: without the parentheses
    # `a or b if c else d` parses as `(a or b) if c else d`, which discarded
    # the error text whenever there was no action result.
    report(f"deploy {svc}", obs.last_action_error is None,
           obs.last_action_error
           or (obs.last_action_result[:80] if obs.last_action_result else ""))

# Rollback (fresh environment so the deploys above do not interfere)
env2 = PipelineEnvironment()
obs = env2.reset()
for svc in ["database-primary", "auth-service"]:
    obs = env2.step(make_action(ActionType.ROLLBACK, service_name=svc))
    # Same precedence fix as for the deploy check above.
    report(f"rollback {svc}", obs.last_action_error is None,
           obs.last_action_error
           or (obs.last_action_result[:80] if obs.last_action_result else ""))

# view_logs: expect more than 10 characters of output
env3 = PipelineEnvironment()
obs = env3.reset()
for svc in ["database-primary", "auth-service"]:
    obs = env3.step(make_action(ActionType.VIEW_LOGS, service_name=svc))
    has_logs = obs.last_action_result and len(obs.last_action_result) > 10
    report(f"view_logs {svc}", has_logs,
           f"len={len(obs.last_action_result) if obs.last_action_result else 0}")

# view_config: expect at least one "=" in the rendered config
for svc in ["database-primary", "auth-service"]:
    obs = env3.step(make_action(ActionType.VIEW_CONFIG, service_name=svc))
    has_config = obs.last_action_result and "=" in obs.last_action_result
    report(f"view_config {svc}", has_config,
           obs.last_action_result[:80] if obs.last_action_result else "none")

# edit_config
env4 = PipelineEnvironment()
obs = env4.reset()
obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="database-primary",
                            config_edits=[ConfigEdit(key="max_connections", value="100")]))
report("edit_config database-primary", obs.last_action_error is None,
       obs.last_action_result[:80] if obs.last_action_result else "")

obs = env4.step(make_action(ActionType.EDIT_CONFIG, service_name="auth-service",
                            config_edits=[ConfigEdit(key="token_ttl_seconds", value="7200")]))
report("edit_config auth-service", obs.last_action_error is None,
       obs.last_action_result[:80] if obs.last_action_result else "")
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
# ============================================================================
# TEST 8: Invalid action tests
# ============================================================================
print("\n=== TEST 8: Invalid action tests ===", flush=True)
env5 = PipelineEnvironment()
obs = env5.reset()

# Each case pairs a report label with a factory for the invalid action. The
# action is built inside the ``try`` so a failure while constructing it is
# reported as a crash as well.
invalid_cases = [
    (
        "deploy nonexistent-service: graceful error",
        lambda: make_action(ActionType.DEPLOY, service_name="nonexistent-service", target_version="v1.0"),
    ),
    (
        "edit_config fake-service: graceful error",
        lambda: make_action(ActionType.EDIT_CONFIG, service_name="fake-service",
                            config_edits=[ConfigEdit(key="x", value="y")]),
    ),
]

for case_label, build_action in invalid_cases:
    try:
        obs = env5.step(build_action())
        has_error = obs.last_action_error is not None
        report(case_label, has_error,
               obs.last_action_error[:80] if obs.last_action_error else "no error msg")
    except Exception as e:
        # An exception here means the environment did not degrade gracefully.
        report(case_label, False, f"CRASHED: {e}")
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# ============================================================================
# TEST 9: Partial observability
# ============================================================================
print("\n=== TEST 9: Partial observability ===", flush=True)
os.environ["DEVOPS_TASK"] = "cascading_failure"
env6 = PipelineEnvironment()
obs = env6.reset()

# Straight after reset the CPU/memory readings are expected to be 0.0 (hidden).
db_matches = [s for s in obs.services if s.name == "database-primary"]
db_svc = db_matches[0]
report("CPU hidden after reset", db_svc.cpu_percent == 0.0, f"cpu={db_svc.cpu_percent}")
report("memory hidden after reset", db_svc.memory_percent == 0.0, f"mem={db_svc.memory_percent}")

# Viewing the service's logs should expose non-zero CPU/memory readings.
obs = env6.step(make_action(ActionType.VIEW_LOGS, service_name="database-primary"))
db_matches = [s for s in obs.services if s.name == "database-primary"]
db_svc = db_matches[0]
report("CPU revealed after view_logs", db_svc.cpu_percent > 0.0, f"cpu={db_svc.cpu_percent}")
report("memory revealed after view_logs", db_svc.memory_percent > 0.0, f"mem={db_svc.memory_percent}")

# Viewing the config should populate config_snapshot on the observation.
obs = env6.step(make_action(ActionType.VIEW_CONFIG, service_name="database-primary"))
snapshot_keys = list(obs.config_snapshot.keys()) if obs.config_snapshot else 'none'
report("config_snapshot revealed after view_config", obs.config_snapshot is not None,
       f"keys={snapshot_keys}")

# A service that was never inspected must still report 0.0.
cache_matches = [s for s in obs.services if s.name == "cache-service"]
cache_svc = cache_matches[0]
report("other service CPU still hidden", cache_svc.cpu_percent == 0.0,
       f"cache cpu={cache_svc.cpu_percent}")
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
# ============================================================================
# TEST 10: Cascading effects
# ============================================================================
print("\n=== TEST 10: Cascading effects ===", flush=True)
os.environ["DEVOPS_TASK"] = "cascading_failure"
env7 = PipelineEnvironment()
obs = env7.reset()

# cache-service degraded -> api-gateway should be degrading
api_gw = [s for s in obs.services if s.name == "api-gateway"][0]
gateway_health = api_gw.health.value
report("api-gateway degraded from cascade", gateway_health == "degraded",
       f"health={gateway_health}")

# Fix cache-service, then deploy it (staging then prod). Only the observation
# from the final deploy is inspected.
cache_fix_steps = [
    make_action(ActionType.VIEW_CONFIG, service_name="cache-service"),
    make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
                config_edits=[ConfigEdit(key="redis.max_connections", value="50")]),
    make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
    make_action(ActionType.DEPLOY, service_name="cache-service", target_version="v1.2.1"),
]
for fix_step in cache_fix_steps:
    obs = env7.step(fix_step)

cache_svc = [s for s in obs.services if s.name == "cache-service"][0]
report("cache-service healthy after fix", cache_svc.health.value == "healthy",
       f"health={cache_svc.health.value}")

# Recovery cascade - api-gateway should start recovering (may take steps)
obs = env7.step(make_action(ActionType.VIEW_PIPELINE))
api_gw = [s for s in obs.services if s.name == "api-gateway"][0]
# After fixing root cause, cascading should stop making it worse at minimum
report("api-gateway recovery started (cascade stopped or improving)",
       api_gw.error_rate < 30.0,
       f"error_rate={api_gw.error_rate}, health={api_gw.health.value}")
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
# ============================================================================
# TEST 11: Trade-off effects in action results
# ============================================================================
print("\n=== TEST 11: Trade-off effects ===", flush=True)
os.environ["DEVOPS_TASK"] = "clean_deploy"
env8 = PipelineEnvironment()
obs = env8.reset()

# Deploy - should mention CPU/latency spike (the second deploy's result is checked)
for _ in range(2):
    obs = env8.step(make_action(ActionType.DEPLOY, service_name="api-gateway", target_version="v2.3.1"))
deploy_result = obs.last_action_result or ""
deploy_text = deploy_result.lower()
has_spike = any(word in deploy_text for word in ("spike", "warmup", "cpu"))
report("deploy mentions CPU/latency spike", has_spike, deploy_result[:100])

# Rollback - should mention regression
os.environ["DEVOPS_TASK"] = "cascading_failure"
env9 = PipelineEnvironment()
obs = env9.reset()
obs = env9.step(make_action(ActionType.ROLLBACK, service_name="cache-service"))
rollback_result = obs.last_action_result or ""
rollback_text = rollback_result.lower()
has_regression = any(word in rollback_text for word in ("regress", "rolled back", "monitoring"))
report("rollback mentions regression risk", has_regression, rollback_result[:120])

# edit_config - should mention restart/latency
env10 = PipelineEnvironment()
obs = env10.reset()
obs = env10.step(make_action(ActionType.EDIT_CONFIG, service_name="cache-service",
                             config_edits=[ConfigEdit(key="redis.max_connections", value="50")]))
config_result = obs.last_action_result or ""
config_text = config_result.lower()
has_restart = any(word in config_text for word in ("restart", "latency", "spike"))
report("edit_config mentions restart/latency", has_restart, config_result[:120])
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 70, flush=True)
print("INTEGRATION TEST SUMMARY", flush=True)
print("=" * 70, flush=True)
# Each entry of `results` unpacks as (name, status, detail).
passed = sum(1 for _, s, _ in results if s == PASS)
failed = sum(1 for _, s, _ in results if s == FAIL)
print(f" PASSED: {passed}", flush=True)
print(f" FAILED: {failed}", flush=True)
print(f" TOTAL: {len(results)}", flush=True)

if failed > 0:
    print("\nFAILED TESTS:", flush=True)
    for name, status, detail in results:
        if status == FAIL:
            # Plain ASCII separator: the previous separator was a garbled
            # non-ASCII character.
            print(f" [FAIL] {name} - {detail}", flush=True)

print("\nSCORES:", flush=True)
for task, score in scores.items():
    print(f" {task}: {score:.3f}", flush=True)

# Non-zero exit status when any check failed, so callers can detect failure.
sys.exit(1 if failed > 0 else 0)
|
models.py
CHANGED
|
@@ -1,179 +1,179 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""
|
| 8 |
-
Data models for the DevOps Pipeline Environment.
|
| 9 |
-
|
| 10 |
-
CI/CD deployment pipeline where an AI agent manages microservice deployments.
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
from __future__ import annotations
|
| 14 |
-
|
| 15 |
-
from enum import Enum
|
| 16 |
-
from typing import Dict, List, Optional
|
| 17 |
-
|
| 18 |
-
from openenv.core.env_server.types import Action, Observation
|
| 19 |
-
from pydantic import BaseModel, Field
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
# --- Enums -------------------------------------------------------------------
|
| 23 |
-
|
| 24 |
-
class ActionType(str, Enum):
|
| 25 |
-
VIEW_PIPELINE = "view_pipeline"
|
| 26 |
-
VIEW_LOGS = "view_logs"
|
| 27 |
-
VIEW_CONFIG = "view_config"
|
| 28 |
-
EDIT_CONFIG = "edit_config"
|
| 29 |
-
RUN_MIGRATION = "run_migration"
|
| 30 |
-
DEPLOY = "deploy"
|
| 31 |
-
ROLLBACK = "rollback"
|
| 32 |
-
APPROVE = "approve"
|
| 33 |
-
ABORT = "abort"
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
class ServiceHealth(str, Enum):
|
| 37 |
-
HEALTHY = "healthy"
|
| 38 |
-
DEGRADED = "degraded"
|
| 39 |
-
DOWN = "down"
|
| 40 |
-
UNKNOWN = "unknown"
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
class PipelineStage(str, Enum):
|
| 44 |
-
IDLE = "idle"
|
| 45 |
-
BUILD = "build"
|
| 46 |
-
TEST = "test"
|
| 47 |
-
STAGING = "staging"
|
| 48 |
-
APPROVAL = "approval"
|
| 49 |
-
DEPLOYING = "deploying"
|
| 50 |
-
DEPLOYED = "deployed"
|
| 51 |
-
ROLLED_BACK = "rolled_back"
|
| 52 |
-
FAILED = "failed"
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
class MigrationType(str, Enum):
|
| 56 |
-
SCHEMA = "schema"
|
| 57 |
-
DATA = "data"
|
| 58 |
-
ROLLBACK_MIGRATION = "rollback_migration"
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
# --- Sub-models (plain BaseModel) --------------------------------------------
|
| 62 |
-
|
| 63 |
-
class ConfigEdit(BaseModel):
|
| 64 |
-
key: str = Field(description="Dot-notation config path, e.g. 'database.pool_size'")
|
| 65 |
-
value: str = Field(description="New value as string.")
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
class ServiceStatus(BaseModel):
|
| 69 |
-
name: str
|
| 70 |
-
health: ServiceHealth
|
| 71 |
-
current_version: str
|
| 72 |
-
cpu_percent: float = Field(description="CPU usage 0-100")
|
| 73 |
-
memory_percent: float = Field(description="Memory usage 0-100")
|
| 74 |
-
error_rate: float = Field(description="Errors per second")
|
| 75 |
-
request_latency_ms: float = Field(description="p95 latency in milliseconds")
|
| 76 |
-
active_connections: int
|
| 77 |
-
last_deploy_timestamp: str = Field(description="ISO 8601 timestamp")
|
| 78 |
-
recovery_status: str = Field(default="stable", description="Recovery state: 'stable' or 'stabilizing (N steps remaining)'")
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
class PipelineStatus(BaseModel):
|
| 82 |
-
stage: PipelineStage
|
| 83 |
-
triggered_by: str
|
| 84 |
-
started_at: str = Field(description="ISO 8601 timestamp")
|
| 85 |
-
commit_sha: str
|
| 86 |
-
build_logs_snippet: Optional[str] = Field(
|
| 87 |
-
default=None,
|
| 88 |
-
description="Last N lines of build output.",
|
| 89 |
-
)
|
| 90 |
-
test_pass_count: Optional[int] = None
|
| 91 |
-
test_fail_count: Optional[int] = None
|
| 92 |
-
approval_required: bool = False
|
| 93 |
-
blocked_reason: Optional[str] = None
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
class MigrationStatus(BaseModel):
|
| 97 |
-
pending_migrations: List[str]
|
| 98 |
-
last_applied: Optional[str] = None
|
| 99 |
-
migration_errors: Optional[List[str]] = None
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
class AlertInfo(BaseModel):
|
| 103 |
-
severity: str = Field(description="One of: critical, warning, info")
|
| 104 |
-
message: str
|
| 105 |
-
service_name: str
|
| 106 |
-
timestamp: str
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
# --- Action (extends OpenEnv Action) ----------------------------------------
|
| 110 |
-
|
| 111 |
-
class PipelineAction(Action):
|
| 112 |
-
"""Action for the DevOps Pipeline environment."""
|
| 113 |
-
|
| 114 |
-
action_type: ActionType
|
| 115 |
-
service_name: Optional[str] = Field(
|
| 116 |
-
default=None,
|
| 117 |
-
description="Target service. Required for view_logs, view_config, edit_config, deploy, rollback.",
|
| 118 |
-
)
|
| 119 |
-
target_version: Optional[str] = Field(
|
| 120 |
-
default=None,
|
| 121 |
-
description="Version tag to deploy. Required for deploy.",
|
| 122 |
-
)
|
| 123 |
-
config_edits: Optional[List[ConfigEdit]] = Field(
|
| 124 |
-
default=None,
|
| 125 |
-
description="List of config changes. Required for edit_config.",
|
| 126 |
-
)
|
| 127 |
-
migration_type: Optional[MigrationType] = Field(
|
| 128 |
-
default=None,
|
| 129 |
-
description="Type of migration. Required for run_migration.",
|
| 130 |
-
)
|
| 131 |
-
migration_name: Optional[str] = Field(
|
| 132 |
-
default=None,
|
| 133 |
-
description="Migration identifier. Required for run_migration.",
|
| 134 |
-
)
|
| 135 |
-
reason: Optional[str] = Field(
|
| 136 |
-
default=None,
|
| 137 |
-
description="Justification for approve/abort/rollback.",
|
| 138 |
-
)
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
# --- Observation (extends OpenEnv Observation) --------------------------------
|
| 142 |
-
|
| 143 |
-
class PipelineObservation(Observation):
|
| 144 |
-
"""Everything the agent sees after each step."""
|
| 145 |
-
|
| 146 |
-
task_description: str = Field(
|
| 147 |
-
default="",
|
| 148 |
-
description="Natural language description of what the agent must accomplish.",
|
| 149 |
-
)
|
| 150 |
-
goal: str = Field(
|
| 151 |
-
default="",
|
| 152 |
-
description="Specific success criteria for the current task.",
|
| 153 |
-
)
|
| 154 |
-
step_number: int = 0
|
| 155 |
-
max_steps: int = 15
|
| 156 |
-
services: List[ServiceStatus] = Field(default_factory=list)
|
| 157 |
-
pipeline: Optional[PipelineStatus] = None
|
| 158 |
-
migrations: Optional[MigrationStatus] = None
|
| 159 |
-
active_alerts: List[AlertInfo] = Field(default_factory=list)
|
| 160 |
-
available_actions: List[str] = Field(
|
| 161 |
-
default_factory=list,
|
| 162 |
-
description="List of valid action_type values in current state.",
|
| 163 |
-
)
|
| 164 |
-
last_action_result: Optional[str] = Field(
|
| 165 |
-
default=None,
|
| 166 |
-
description="Human-readable outcome of the previous action.",
|
| 167 |
-
)
|
| 168 |
-
last_action_error: Optional[str] = Field(
|
| 169 |
-
default=None,
|
| 170 |
-
description="Error message if previous action failed, else null.",
|
| 171 |
-
)
|
| 172 |
-
config_snapshot: Optional[Dict[str, str]] = Field(
|
| 173 |
-
default=None,
|
| 174 |
-
description="Current config key-value pairs when viewing/editing config.",
|
| 175 |
-
)
|
| 176 |
-
summary: Optional[str] = Field(
|
| 177 |
-
default=None,
|
| 178 |
-
description="Quick status summary highlighting degraded/down services.",
|
| 179 |
-
)
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""
|
| 8 |
+
Data models for the DevOps Pipeline Environment.
|
| 9 |
+
|
| 10 |
+
CI/CD deployment pipeline where an AI agent manages microservice deployments.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
from enum import Enum
|
| 16 |
+
from typing import Dict, List, Optional
|
| 17 |
+
|
| 18 |
+
from openenv.core.env_server.types import Action, Observation
|
| 19 |
+
from pydantic import BaseModel, Field
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# --- Enums -------------------------------------------------------------------
|
| 23 |
+
|
| 24 |
+
class ActionType(str, Enum):
    # Action kinds an agent can submit. Members are also plain strings because
    # of the ``str`` mixin, so each compares equal to its value.
    VIEW_PIPELINE = "view_pipeline"
    VIEW_LOGS = "view_logs"
    VIEW_CONFIG = "view_config"
    EDIT_CONFIG = "edit_config"
    RUN_MIGRATION = "run_migration"
    DEPLOY = "deploy"
    ROLLBACK = "rollback"
    APPROVE = "approve"
    ABORT = "abort"
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class ServiceHealth(str, Enum):
    # Health states a service can report; members double as plain strings.
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    DOWN = "down"
    UNKNOWN = "unknown"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class PipelineStage(str, Enum):
    # Stages a pipeline run can be in; members double as plain strings.
    IDLE = "idle"
    BUILD = "build"
    TEST = "test"
    STAGING = "staging"
    APPROVAL = "approval"
    DEPLOYING = "deploying"
    DEPLOYED = "deployed"
    ROLLED_BACK = "rolled_back"
    FAILED = "failed"
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class MigrationType(str, Enum):
    # Kinds of migration accepted by a run_migration action.
    SCHEMA = "schema"
    DATA = "data"
    ROLLBACK_MIGRATION = "rollback_migration"
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# --- Sub-models (plain BaseModel) --------------------------------------------
|
| 62 |
+
|
| 63 |
+
class ConfigEdit(BaseModel):
    # One key/value change carried by an edit_config action. Both parts are
    # strings; the key uses dot-notation.
    key: str = Field(
        description="Dot-notation config path, e.g. 'database.pool_size'",
    )
    value: str = Field(
        description="New value as string.",
    )
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class ServiceStatus(BaseModel):
    # Snapshot of one service as it appears in an observation.

    # Identity and coarse state.
    name: str
    health: ServiceHealth
    current_version: str

    # Resource and traffic metrics.
    cpu_percent: float = Field(description="CPU usage 0-100")
    memory_percent: float = Field(description="Memory usage 0-100")
    error_rate: float = Field(description="Errors per second")
    request_latency_ms: float = Field(description="p95 latency in milliseconds")
    active_connections: int

    # Deployment bookkeeping.
    last_deploy_timestamp: str = Field(description="ISO 8601 timestamp")
    recovery_status: str = Field(
        default="stable",
        description="Recovery state: 'stable' or 'stabilizing (N steps remaining)'",
    )
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class PipelineStatus(BaseModel):
    # State of the current pipeline run.

    # Required identification of the run.
    stage: PipelineStage
    triggered_by: str
    started_at: str = Field(description="ISO 8601 timestamp")
    commit_sha: str

    # Optional build/test details; None when not available.
    build_logs_snippet: Optional[str] = Field(default=None, description="Last N lines of build output.")
    test_pass_count: Optional[int] = None
    test_fail_count: Optional[int] = None

    # Gating information.
    approval_required: bool = False
    blocked_reason: Optional[str] = None
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class MigrationStatus(BaseModel):
    # Migration bookkeeping: what is still pending, what was applied last,
    # and any errors. The two optional fields default to None.
    pending_migrations: List[str]
    last_applied: Optional[str] = None
    migration_errors: Optional[List[str]] = None
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class AlertInfo(BaseModel):
    # A single alert attached to a service. ``severity`` is a free-form string
    # here; the accepted values are listed only in its description.
    severity: str = Field(
        description="One of: critical, warning, info",
    )
    message: str
    service_name: str
    timestamp: str
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# --- Action (extends OpenEnv Action) ----------------------------------------
|
| 110 |
+
|
| 111 |
+
class PipelineAction(Action):
    """Action for the DevOps Pipeline environment."""

    # Only ``action_type`` is mandatory. Every other field defaults to None,
    # and its description states which action types need it.
    action_type: ActionType

    # Targeting.
    service_name: Optional[str] = Field(default=None, description="Target service. Required for view_logs, view_config, edit_config, deploy, rollback.")
    target_version: Optional[str] = Field(default=None, description="Version tag to deploy. Required for deploy.")

    # Payload for edit_config.
    config_edits: Optional[List[ConfigEdit]] = Field(default=None, description="List of config changes. Required for edit_config.")

    # Payload for run_migration.
    migration_type: Optional[MigrationType] = Field(default=None, description="Type of migration. Required for run_migration.")
    migration_name: Optional[str] = Field(default=None, description="Migration identifier. Required for run_migration.")

    # Free-text justification.
    reason: Optional[str] = Field(default=None, description="Justification for approve/abort/rollback.")
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# --- Observation (extends OpenEnv Observation) --------------------------------
|
| 142 |
+
|
| 143 |
+
class PipelineObservation(Observation):
|
| 144 |
+
"""Everything the agent sees after each step."""
|
| 145 |
+
|
| 146 |
+
task_description: str = Field(
|
| 147 |
+
default="",
|
| 148 |
+
description="Natural language description of what the agent must accomplish.",
|
| 149 |
+
)
|
| 150 |
+
goal: str = Field(
|
| 151 |
+
default="",
|
| 152 |
+
description="Specific success criteria for the current task.",
|
| 153 |
+
)
|
| 154 |
+
step_number: int = 0
|
| 155 |
+
max_steps: int = 15
|
| 156 |
+
services: List[ServiceStatus] = Field(default_factory=list)
|
| 157 |
+
pipeline: Optional[PipelineStatus] = None
|
| 158 |
+
migrations: Optional[MigrationStatus] = None
|
| 159 |
+
active_alerts: List[AlertInfo] = Field(default_factory=list)
|
| 160 |
+
available_actions: List[str] = Field(
|
| 161 |
+
default_factory=list,
|
| 162 |
+
description="List of valid action_type values in current state.",
|
| 163 |
+
)
|
| 164 |
+
last_action_result: Optional[str] = Field(
|
| 165 |
+
default=None,
|
| 166 |
+
description="Human-readable outcome of the previous action.",
|
| 167 |
+
)
|
| 168 |
+
last_action_error: Optional[str] = Field(
|
| 169 |
+
default=None,
|
| 170 |
+
description="Error message if previous action failed, else null.",
|
| 171 |
+
)
|
| 172 |
+
config_snapshot: Optional[Dict[str, str]] = Field(
|
| 173 |
+
default=None,
|
| 174 |
+
description="Current config key-value pairs when viewing/editing config.",
|
| 175 |
+
)
|
| 176 |
+
summary: Optional[str] = Field(
|
| 177 |
+
default=None,
|
| 178 |
+
description="Quick status summary highlighting degraded/down services.",
|
| 179 |
+
)
|
openenv.yaml
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
-
spec_version: 1
|
| 2 |
-
name: devops_pipeline_env
|
| 3 |
-
type: space
|
| 4 |
-
runtime: fastapi
|
| 5 |
-
app: server.app:app
|
| 6 |
-
port: 8000
|
| 7 |
-
description: "CI/CD deployment pipeline environment where an AI agent manages deployments across 5 interdependent microservices. Agent reads logs, edits configs, runs migrations, and makes deployment decisions. Features 6 tasks (Easy to Hard + procedural generation): clean deploy, broken pipeline diagnosis, judgment calls under pressure, cascading failure recovery, capacity crisis management, and randomized incidents from seed."
|
| 8 |
-
version: "0.1.0"
|
|
|
|
| 1 |
+
spec_version: 1
|
| 2 |
+
name: devops_pipeline_env
|
| 3 |
+
type: space
|
| 4 |
+
runtime: fastapi
|
| 5 |
+
app: server.app:app
|
| 6 |
+
port: 8000
|
| 7 |
+
description: "CI/CD deployment pipeline environment where an AI agent manages deployments across 5 interdependent microservices. Agent reads logs, edits configs, runs migrations, and makes deployment decisions. Features 6 tasks (Easy to Hard + procedural generation): clean deploy, broken pipeline diagnosis, judgment calls under pressure, cascading failure recovery, capacity crisis management, and randomized incidents from seed."
|
| 8 |
+
version: "0.1.0"
|
pyproject.toml
CHANGED
|
@@ -1,34 +1,34 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
[build-system]
|
| 8 |
-
requires = ["setuptools>=45", "wheel"]
|
| 9 |
-
build-backend = "setuptools.build_meta"
|
| 10 |
-
|
| 11 |
-
[project]
|
| 12 |
-
name = "devops-pipeline-env"
|
| 13 |
-
version = "0.1.0"
|
| 14 |
-
description = "CI/CD Pipeline Management OpenEnv Environment"
|
| 15 |
-
requires-python = ">=3.10"
|
| 16 |
-
dependencies = [
|
| 17 |
-
"openenv-core[core]>=0.2.2",
|
| 18 |
-
"pydantic>=2.0",
|
| 19 |
-
"fastapi>=0.104.0",
|
| 20 |
-
"uvicorn>=0.24.0",
|
| 21 |
-
]
|
| 22 |
-
|
| 23 |
-
[project.optional-dependencies]
|
| 24 |
-
dev = ["pytest"]
|
| 25 |
-
|
| 26 |
-
[project.scripts]
|
| 27 |
-
# Server entry point - enables running via: uv run --project . server
|
| 28 |
-
# or: python -m devops_pipeline_env.server.app
|
| 29 |
-
server = "devops_pipeline_env.server.app:main"
|
| 30 |
-
|
| 31 |
-
[tool.setuptools]
|
| 32 |
-
include-package-data = true
|
| 33 |
-
packages = ["devops_pipeline_env", "devops_pipeline_env.server"]
|
| 34 |
package-dir = { "devops_pipeline_env" = ".", "devops_pipeline_env.server" = "server" }
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
[build-system]
|
| 8 |
+
requires = ["setuptools>=45", "wheel"]
|
| 9 |
+
build-backend = "setuptools.build_meta"
|
| 10 |
+
|
| 11 |
+
[project]
|
| 12 |
+
name = "devops-pipeline-env"
|
| 13 |
+
version = "0.1.0"
|
| 14 |
+
description = "CI/CD Pipeline Management OpenEnv Environment"
|
| 15 |
+
requires-python = ">=3.10"
|
| 16 |
+
dependencies = [
|
| 17 |
+
"openenv-core[core]>=0.2.2",
|
| 18 |
+
"pydantic>=2.0",
|
| 19 |
+
"fastapi>=0.104.0",
|
| 20 |
+
"uvicorn>=0.24.0",
|
| 21 |
+
]
|
| 22 |
+
|
| 23 |
+
[project.optional-dependencies]
|
| 24 |
+
dev = ["pytest"]
|
| 25 |
+
|
| 26 |
+
[project.scripts]
|
| 27 |
+
# Server entry point - enables running via: uv run --project . server
|
| 28 |
+
# or: python -m devops_pipeline_env.server.app
|
| 29 |
+
server = "devops_pipeline_env.server.app:main"
|
| 30 |
+
|
| 31 |
+
[tool.setuptools]
|
| 32 |
+
include-package-data = true
|
| 33 |
+
packages = ["devops_pipeline_env", "devops_pipeline_env.server"]
|
| 34 |
package-dir = { "devops_pipeline_env" = ".", "devops_pipeline_env.server" = "server" }
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
openenv-core[core]>=0.2.2
|
| 2 |
-
pydantic>=2.0
|
| 3 |
-
fastapi>=0.104.0
|
| 4 |
-
uvicorn>=0.24.0
|
|
|
|
| 1 |
+
openenv-core[core]>=0.2.2
|
| 2 |
+
pydantic>=2.0
|
| 3 |
+
fastapi>=0.104.0
|
| 4 |
+
uvicorn>=0.24.0
|
server/__init__.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""DevOps Pipeline environment server components."""
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""DevOps Pipeline environment server components."""
|
server/app.py
CHANGED
|
@@ -1,128 +1,128 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""FastAPI application for the DevOps Pipeline Environment."""
|
| 8 |
-
|
| 9 |
-
from openenv.core.env_server.http_server import create_app
|
| 10 |
-
|
| 11 |
-
from devops_pipeline_env.models import PipelineAction, PipelineObservation
|
| 12 |
-
from server.pipeline_environment import PipelineEnvironment
|
| 13 |
-
|
| 14 |
-
app = create_app(
|
| 15 |
-
PipelineEnvironment,
|
| 16 |
-
PipelineAction,
|
| 17 |
-
PipelineObservation,
|
| 18 |
-
env_name="devops_pipeline_env",
|
| 19 |
-
max_concurrent_envs=1,
|
| 20 |
-
)
|
| 21 |
-
|
| 22 |
-
# Store active env on app.state so /grader can access it without class singletons.
|
| 23 |
-
# PipelineEnvironment.reset() calls _register_callback if set.
|
| 24 |
-
app.state.active_env = None
|
| 25 |
-
PipelineEnvironment._register_callback = lambda env: setattr(app.state, "active_env", env)
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
# --- Additional Required Endpoints -------------------------------------------
|
| 29 |
-
|
| 30 |
-
@app.get("/tasks")
|
| 31 |
-
def get_tasks():
|
| 32 |
-
"""Returns list of tasks and the action schema."""
|
| 33 |
-
return {
|
| 34 |
-
"tasks": [
|
| 35 |
-
{
|
| 36 |
-
"name": "clean_deploy",
|
| 37 |
-
"difficulty": "easy",
|
| 38 |
-
"description": "Deploy 2 services with all tests passing. No complications.",
|
| 39 |
-
"max_steps": 15,
|
| 40 |
-
},
|
| 41 |
-
{
|
| 42 |
-
"name": "broken_pipeline",
|
| 43 |
-
"difficulty": "medium",
|
| 44 |
-
"description": "Diagnose test failures, fix config errors, run migrations.",
|
| 45 |
-
"max_steps": 20,
|
| 46 |
-
},
|
| 47 |
-
{
|
| 48 |
-
"name": "judgment_call",
|
| 49 |
-
"difficulty": "hard",
|
| 50 |
-
"description": "Production incident with cascading failures. Hotfix breaks downstream service. 12-step time limit with degrading health.",
|
| 51 |
-
"max_steps": 12,
|
| 52 |
-
},
|
| 53 |
-
{
|
| 54 |
-
"name": "cascading_failure",
|
| 55 |
-
"difficulty": "medium-hard",
|
| 56 |
-
"description": "Root cause analysis across dependency chain. cache-service down, dragging api-gateway and web-frontend. Fix root cause first.",
|
| 57 |
-
"max_steps": 15,
|
| 58 |
-
},
|
| 59 |
-
{
|
| 60 |
-
"name": "capacity_crisis",
|
| 61 |
-
"difficulty": "medium-hard",
|
| 62 |
-
"description": "Peak traffic 4x normal. database-primary connection pool nearly full. Stabilize before tipping points trigger cascading collapse.",
|
| 63 |
-
"max_steps": 15,
|
| 64 |
-
},
|
| 65 |
-
{
|
| 66 |
-
"name": "random_incident",
|
| 67 |
-
"difficulty": "variable",
|
| 68 |
-
"description": "Procedurally generated incident. Service, failure type, and severity are randomized from seed. Infinite variation for curriculum learning.",
|
| 69 |
-
"max_steps": 15,
|
| 70 |
-
},
|
| 71 |
-
],
|
| 72 |
-
"action_schema": PipelineAction.model_json_schema(),
|
| 73 |
-
}
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
@app.get("/health")
|
| 77 |
-
def health_check():
|
| 78 |
-
"""Health check endpoint."""
|
| 79 |
-
return {"status": "healthy"}
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
@app.post("/baseline")
|
| 83 |
-
async def run_baseline():
|
| 84 |
-
"""Return pre-recorded baseline scores. Does NOT run inference.py."""
|
| 85 |
-
return {
|
| 86 |
-
"scores": {
|
| 87 |
-
"clean_deploy": 0.700,
|
| 88 |
-
"broken_pipeline": 0.482,
|
| 89 |
-
"judgment_call": 0.184,
|
| 90 |
-
"cascading_failure": 0.280,
|
| 91 |
-
"capacity_crisis": 0.250,
|
| 92 |
-
"random_incident": 0.350,
|
| 93 |
-
},
|
| 94 |
-
"model": "Qwen/Qwen2.5-72B-Instruct",
|
| 95 |
-
"note": "Baselines re-calibrated after environment tuning for clean_deploy (v2). Recorded 2026-04-08.",
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
@app.post("/grader")
|
| 100 |
-
async def run_grader(task_name: str = ""):
|
| 101 |
-
"""Score from active session's episode history."""
|
| 102 |
-
from server.graders import grade_task as _grade_task
|
| 103 |
-
|
| 104 |
-
env = getattr(app.state, "active_env", None)
|
| 105 |
-
if env is None or env.get_engine() is None:
|
| 106 |
-
return {"task": task_name, "score": 0.
|
| 107 |
-
if not env.get_episode_history():
|
| 108 |
-
return {"task": env.get_task_name(), "score": 0.
|
| 109 |
-
active_task = env.get_task_name()
|
| 110 |
-
if task_name and task_name != active_task:
|
| 111 |
-
return {"task": task_name, "score": 0.
|
| 112 |
-
if not task_name:
|
| 113 |
-
task_name = active_task
|
| 114 |
-
score = _grade_task(
|
| 115 |
-
env.get_task_name(),
|
| 116 |
-
env.get_episode_history(),
|
| 117 |
-
env.get_engine(),
|
| 118 |
-
)
|
| 119 |
-
return {"task": env.get_task_name(), "score": score}
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 123 |
-
import uvicorn
|
| 124 |
-
uvicorn.run(app, host=host, port=port)
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
if __name__ == "__main__":
|
| 128 |
-
main()
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""FastAPI application for the DevOps Pipeline Environment."""
|
| 8 |
+
|
| 9 |
+
from openenv.core.env_server.http_server import create_app
|
| 10 |
+
|
| 11 |
+
from devops_pipeline_env.models import PipelineAction, PipelineObservation
|
| 12 |
+
from server.pipeline_environment import PipelineEnvironment
|
| 13 |
+
|
| 14 |
+
app = create_app(
|
| 15 |
+
PipelineEnvironment,
|
| 16 |
+
PipelineAction,
|
| 17 |
+
PipelineObservation,
|
| 18 |
+
env_name="devops_pipeline_env",
|
| 19 |
+
max_concurrent_envs=1,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
# Store active env on app.state so /grader can access it without class singletons.
|
| 23 |
+
# PipelineEnvironment.reset() calls _register_callback if set.
|
| 24 |
+
app.state.active_env = None
|
| 25 |
+
PipelineEnvironment._register_callback = lambda env: setattr(app.state, "active_env", env)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# --- Additional Required Endpoints -------------------------------------------
|
| 29 |
+
|
| 30 |
+
@app.get("/tasks")
|
| 31 |
+
def get_tasks():
|
| 32 |
+
"""Returns list of tasks and the action schema."""
|
| 33 |
+
return {
|
| 34 |
+
"tasks": [
|
| 35 |
+
{
|
| 36 |
+
"name": "clean_deploy",
|
| 37 |
+
"difficulty": "easy",
|
| 38 |
+
"description": "Deploy 2 services with all tests passing. No complications.",
|
| 39 |
+
"max_steps": 15,
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"name": "broken_pipeline",
|
| 43 |
+
"difficulty": "medium",
|
| 44 |
+
"description": "Diagnose test failures, fix config errors, run migrations.",
|
| 45 |
+
"max_steps": 20,
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"name": "judgment_call",
|
| 49 |
+
"difficulty": "hard",
|
| 50 |
+
"description": "Production incident with cascading failures. Hotfix breaks downstream service. 12-step time limit with degrading health.",
|
| 51 |
+
"max_steps": 12,
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"name": "cascading_failure",
|
| 55 |
+
"difficulty": "medium-hard",
|
| 56 |
+
"description": "Root cause analysis across dependency chain. cache-service down, dragging api-gateway and web-frontend. Fix root cause first.",
|
| 57 |
+
"max_steps": 15,
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"name": "capacity_crisis",
|
| 61 |
+
"difficulty": "medium-hard",
|
| 62 |
+
"description": "Peak traffic 4x normal. database-primary connection pool nearly full. Stabilize before tipping points trigger cascading collapse.",
|
| 63 |
+
"max_steps": 15,
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "random_incident",
|
| 67 |
+
"difficulty": "variable",
|
| 68 |
+
"description": "Procedurally generated incident. Service, failure type, and severity are randomized from seed. Infinite variation for curriculum learning.",
|
| 69 |
+
"max_steps": 15,
|
| 70 |
+
},
|
| 71 |
+
],
|
| 72 |
+
"action_schema": PipelineAction.model_json_schema(),
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
@app.get("/health")
|
| 77 |
+
def health_check():
|
| 78 |
+
"""Health check endpoint."""
|
| 79 |
+
return {"status": "healthy"}
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
@app.post("/baseline")
|
| 83 |
+
async def run_baseline():
|
| 84 |
+
"""Return pre-recorded baseline scores. Does NOT run inference.py."""
|
| 85 |
+
return {
|
| 86 |
+
"scores": {
|
| 87 |
+
"clean_deploy": 0.700,
|
| 88 |
+
"broken_pipeline": 0.482,
|
| 89 |
+
"judgment_call": 0.184,
|
| 90 |
+
"cascading_failure": 0.280,
|
| 91 |
+
"capacity_crisis": 0.250,
|
| 92 |
+
"random_incident": 0.350,
|
| 93 |
+
},
|
| 94 |
+
"model": "Qwen/Qwen2.5-72B-Instruct",
|
| 95 |
+
"note": "Baselines re-calibrated after environment tuning for clean_deploy (v2). Recorded 2026-04-08.",
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
@app.post("/grader")
|
| 100 |
+
async def run_grader(task_name: str = ""):
|
| 101 |
+
"""Score from active session's episode history."""
|
| 102 |
+
from server.graders import grade_task as _grade_task
|
| 103 |
+
|
| 104 |
+
env = getattr(app.state, "active_env", None)
|
| 105 |
+
if env is None or env.get_engine() is None:
|
| 106 |
+
return {"task": task_name, "score": 0.001, "error": "No active session. Call /reset first."}
|
| 107 |
+
if not env.get_episode_history():
|
| 108 |
+
return {"task": env.get_task_name(), "score": 0.001, "error": "No steps taken. Call /step first."}
|
| 109 |
+
active_task = env.get_task_name()
|
| 110 |
+
if task_name and task_name != active_task:
|
| 111 |
+
return {"task": task_name, "score": 0.001, "error": f"Task mismatch: requested '{task_name}' but active task is '{active_task}'."}
|
| 112 |
+
if not task_name:
|
| 113 |
+
task_name = active_task
|
| 114 |
+
score = _grade_task(
|
| 115 |
+
env.get_task_name(),
|
| 116 |
+
env.get_episode_history(),
|
| 117 |
+
env.get_engine(),
|
| 118 |
+
)
|
| 119 |
+
return {"task": env.get_task_name(), "score": score}
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def main(host: str = "0.0.0.0", port: int = 8000):
|
| 123 |
+
import uvicorn
|
| 124 |
+
uvicorn.run(app, host=host, port=port)
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
if __name__ == "__main__":
|
| 128 |
+
main()
|
server/graders.py
CHANGED
|
@@ -1,389 +1,389 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""Deterministic graders for the DevOps Pipeline Environment.
|
| 8 |
-
|
| 9 |
-
Each grader produces a score in [0.0, 1.0].
|
| 10 |
-
Same actions -> same score. Always.
|
| 11 |
-
All criteria are outcome-based — no procedure bonuses.
|
| 12 |
-
"""
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def grade_clean_deploy(episode_history, engine):
|
| 16 |
-
"""
|
| 17 |
-
Task 1 grader:
|
| 18 |
-
- 0.50 * (services at target version in prod / services with targets)
|
| 19 |
-
- 0.30 * (final system_health / 100)
|
| 20 |
-
- 0.20 * max(0, 1 - steps_used / max_steps)
|
| 21 |
-
"""
|
| 22 |
-
target_services = [s for s in engine.services.values() if s.target_version]
|
| 23 |
-
deployed_count = sum(
|
| 24 |
-
1 for svc in target_services
|
| 25 |
-
if svc.prod_deployed and svc.current_version == svc.target_version
|
| 26 |
-
)
|
| 27 |
-
|
| 28 |
-
deploy_ratio = deployed_count / len(target_services) if target_services else 0.0
|
| 29 |
-
system_health = engine.get_system_health()
|
| 30 |
-
|
| 31 |
-
steps_used = len(episode_history)
|
| 32 |
-
max_steps = 15
|
| 33 |
-
efficiency = max(0.0, 1.0 - steps_used / max_steps)
|
| 34 |
-
|
| 35 |
-
score = 0.50 * deploy_ratio + 0.30 * (system_health / 100.0) + 0.20 * efficiency
|
| 36 |
-
return min(max(score, 0.001), 0.999)
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def grade_broken_pipeline(episode_history, engine):
|
| 40 |
-
"""
|
| 41 |
-
Task 2 grader (all outcome-based):
|
| 42 |
-
- 0.30 — cache-service config redis.host == redis-prod.internal:6379
|
| 43 |
-
- 0.15 — migration applied (add_index_users_email in applied list)
|
| 44 |
-
- 0.30 — (services at target in prod / 3)
|
| 45 |
-
- 0.15 — (final system_health / 100)
|
| 46 |
-
- 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
|
| 47 |
-
"""
|
| 48 |
-
score = 0.0
|
| 49 |
-
|
| 50 |
-
# Config fix outcome — is the config correct at end of episode?
|
| 51 |
-
cache_svc = engine.services.get("cache-service")
|
| 52 |
-
if cache_svc and cache_svc.config.get("redis.host") == "redis-prod.internal:6379":
|
| 53 |
-
score += 0.30
|
| 54 |
-
|
| 55 |
-
# Migration outcome — was the migration applied?
|
| 56 |
-
if "add_index_users_email" in engine.migrations_applied:
|
| 57 |
-
score += 0.15
|
| 58 |
-
|
| 59 |
-
# Services at target in prod (only count services with deploy targets)
|
| 60 |
-
target_services = [s for s in engine.services.values() if s.target_version]
|
| 61 |
-
deployed_count = sum(
|
| 62 |
-
1 for svc in target_services
|
| 63 |
-
if svc.prod_deployed and svc.current_version == svc.target_version
|
| 64 |
-
)
|
| 65 |
-
if target_services:
|
| 66 |
-
score += 0.30 * (deployed_count / len(target_services))
|
| 67 |
-
|
| 68 |
-
# System health
|
| 69 |
-
system_health = engine.get_system_health()
|
| 70 |
-
score += 0.15 * (system_health / 100.0)
|
| 71 |
-
|
| 72 |
-
# Step efficiency
|
| 73 |
-
steps_used = len(episode_history)
|
| 74 |
-
max_steps = 20
|
| 75 |
-
efficiency = max(0.0, 1.0 - steps_used / max_steps)
|
| 76 |
-
score += 0.10 * efficiency
|
| 77 |
-
|
| 78 |
-
return min(max(score, 0.001), 0.999)
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
def grade_judgment_call(episode_history, engine):
|
| 82 |
-
"""
|
| 83 |
-
Task 3 grader — 3 valid paths with different scores:
|
| 84 |
-
- Expert path: deploy hotfix + fix web-frontend auth config -> best score
|
| 85 |
-
- Hotfix only: deploy hotfix without fixing auth -> partial score
|
| 86 |
-
- Rollback: safe but loses features -> moderate score
|
| 87 |
-
|
| 88 |
-
Dimensions:
|
| 89 |
-
- 0.30 — incident resolution (path-aware)
|
| 90 |
-
- 0.25 — web-frontend collateral damage
|
| 91 |
-
- 0.15 — time to resolution
|
| 92 |
-
- 0.15 — no new issues introduced
|
| 93 |
-
- 0.15 — final system health
|
| 94 |
-
"""
|
| 95 |
-
score = 0.0
|
| 96 |
-
api_gw = engine.services.get("api-gateway")
|
| 97 |
-
web_fe = engine.services.get("web-frontend")
|
| 98 |
-
|
| 99 |
-
# Detect which path the agent took
|
| 100 |
-
deployed_hotfix = False
|
| 101 |
-
did_rollback = False
|
| 102 |
-
fixed_auth_config = False
|
| 103 |
-
for entry in episode_history:
|
| 104 |
-
action = entry.get("action", {})
|
| 105 |
-
if action.get("action_type") == "deploy" and action.get("service_name") == "api-gateway":
|
| 106 |
-
if action.get("target_version") == "v2.3.2":
|
| 107 |
-
deployed_hotfix = True
|
| 108 |
-
if action.get("action_type") == "rollback" and action.get("service_name") == "api-gateway":
|
| 109 |
-
did_rollback = True
|
| 110 |
-
if action.get("action_type") == "edit_config" and action.get("service_name") == "web-frontend":
|
| 111 |
-
edits = action.get("config_edits", [])
|
| 112 |
-
for edit in edits:
|
| 113 |
-
if edit.get("key") == "api.auth_version" and edit.get("value") == "v2":
|
| 114 |
-
fixed_auth_config = True
|
| 115 |
-
|
| 116 |
-
# Incident resolution (path-aware)
|
| 117 |
-
resolved = False
|
| 118 |
-
if api_gw:
|
| 119 |
-
api_healthy = api_gw.health.value == "healthy" and api_gw.error_rate < 5.0
|
| 120 |
-
api_fully_resolved = api_gw.latency_ms < 100 and api_gw.error_rate < 1.0
|
| 121 |
-
if deployed_hotfix and fixed_auth_config and api_healthy:
|
| 122 |
-
score += 0.30 # Expert path: full credit (root cause fixed + auth handled)
|
| 123 |
-
resolved = True
|
| 124 |
-
elif api_fully_resolved:
|
| 125 |
-
resolved = True
|
| 126 |
-
if did_rollback:
|
| 127 |
-
score += 0.20 # Rollback: safe but lost features
|
| 128 |
-
else:
|
| 129 |
-
score += 0.25 # Some other resolution
|
| 130 |
-
elif deployed_hotfix and api_healthy:
|
| 131 |
-
score += 0.15 # Hotfix without auth fix: partial
|
| 132 |
-
resolved = True
|
| 133 |
-
elif api_gw.latency_ms < 500:
|
| 134 |
-
score += 0.10 # Partial improvement
|
| 135 |
-
|
| 136 |
-
# web-frontend collateral damage (smooth gradient)
|
| 137 |
-
if web_fe:
|
| 138 |
-
if web_fe.health.value == "healthy" and web_fe.error_rate < 0.5:
|
| 139 |
-
score += 0.25
|
| 140 |
-
elif web_fe.health.value == "healthy" and web_fe.error_rate < 2.0:
|
| 141 |
-
score += 0.20
|
| 142 |
-
elif web_fe.health.value == "healthy" and web_fe.error_rate < 5.0:
|
| 143 |
-
score += 0.15
|
| 144 |
-
elif web_fe.error_rate < 10.0:
|
| 145 |
-
score += 0.10
|
| 146 |
-
elif web_fe.error_rate < 20.0:
|
| 147 |
-
score += 0.05
|
| 148 |
-
|
| 149 |
-
# Time to resolution — when api-gateway actually became healthy (outcome-based)
|
| 150 |
-
resolution_step = len(episode_history)
|
| 151 |
-
if resolved:
|
| 152 |
-
for entry in episode_history:
|
| 153 |
-
sh = entry.get("system_health", 0)
|
| 154 |
-
if sh >= 80:
|
| 155 |
-
resolution_step = entry.get("step", len(episode_history))
|
| 156 |
-
break
|
| 157 |
-
score += max(0.0, 1.0 - resolution_step / 10.0) * 0.15
|
| 158 |
-
|
| 159 |
-
# No new issues introduced (forgive issues the agent subsequently fixed)
|
| 160 |
-
new_issues = 0
|
| 161 |
-
for entry in episode_history:
|
| 162 |
-
if entry.get("broke_healthy", False):
|
| 163 |
-
new_issues += 1
|
| 164 |
-
# Expert path necessarily breaks web-frontend then fixes it — if web-frontend
|
| 165 |
-
# ended healthy AND auth was fixed, the breakage was handled, not reckless.
|
| 166 |
-
recovered_issues = 0
|
| 167 |
-
if deployed_hotfix and fixed_auth_config and web_fe:
|
| 168 |
-
if web_fe.health.value == "healthy" and web_fe.error_rate < 2.0:
|
| 169 |
-
recovered_issues = 1 # The expected web-frontend break was recovered
|
| 170 |
-
unrecovered = max(0, new_issues - recovered_issues)
|
| 171 |
-
if unrecovered == 0:
|
| 172 |
-
score += 0.15
|
| 173 |
-
elif unrecovered == 1:
|
| 174 |
-
score += 0.05
|
| 175 |
-
|
| 176 |
-
# System health
|
| 177 |
-
system_health = engine.get_system_health()
|
| 178 |
-
score += 0.15 * (system_health / 100.0)
|
| 179 |
-
|
| 180 |
-
return min(max(score, 0.001), 0.999)
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
def grade_cascading_failure(episode_history, engine):
|
| 184 |
-
"""
|
| 185 |
-
Task 4 grader (all outcome-based):
|
| 186 |
-
- 0.30 — root cause fixed (cache-service healthy AND max_connections != "5")
|
| 187 |
-
- 0.25 — all services deployed to prod at target version
|
| 188 |
-
- 0.20 — final system_health / 100 (only full marks if > 90%)
|
| 189 |
-
- 0.15 — dependency health (cache-service healthy when api-gateway deployed)
|
| 190 |
-
- 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
|
| 191 |
-
"""
|
| 192 |
-
score = 0.0
|
| 193 |
-
|
| 194 |
-
# Root cause fixed: cache-service healthy with correct config
|
| 195 |
-
cache_svc = engine.services.get("cache-service")
|
| 196 |
-
if cache_svc:
|
| 197 |
-
cache_healthy = cache_svc.health.value == "healthy"
|
| 198 |
-
config_fixed = cache_svc.config.get("redis.max_connections") != "5"
|
| 199 |
-
if cache_healthy and config_fixed:
|
| 200 |
-
score += 0.30
|
| 201 |
-
|
| 202 |
-
# All services deployed to prod at target version (only those with targets)
|
| 203 |
-
target_services = [s for s in engine.services.values() if s.target_version]
|
| 204 |
-
deployed_count = sum(
|
| 205 |
-
1 for svc in target_services
|
| 206 |
-
if svc.prod_deployed and svc.current_version == svc.target_version
|
| 207 |
-
)
|
| 208 |
-
if target_services:
|
| 209 |
-
score += 0.25 * (deployed_count / len(target_services))
|
| 210 |
-
|
| 211 |
-
# System health
|
| 212 |
-
system_health = engine.get_system_health()
|
| 213 |
-
score += 0.20 * (system_health / 100.0)
|
| 214 |
-
|
| 215 |
-
# Dependency health outcome: was cache-service healthy when api-gateway deployed?
|
| 216 |
-
# Uses deploy-time snapshot recorded in episode_history for grader integrity.
|
| 217 |
-
api_deployed_with_healthy_dep = False
|
| 218 |
-
for entry in episode_history:
|
| 219 |
-
cache_health = entry.get("cache_health_at_deploy")
|
| 220 |
-
if cache_health is not None:
|
| 221 |
-
if cache_health == "healthy":
|
| 222 |
-
api_deployed_with_healthy_dep = True
|
| 223 |
-
break
|
| 224 |
-
# Also award if api-gateway was never deployed (agent focused on root cause only)
|
| 225 |
-
# and cache-service ended healthy
|
| 226 |
-
if not api_deployed_with_healthy_dep:
|
| 227 |
-
api_gw = engine.services.get("api-gateway")
|
| 228 |
-
if api_gw and api_gw.prod_deployed and cache_svc and cache_svc.health.value == "healthy":
|
| 229 |
-
api_deployed_with_healthy_dep = True
|
| 230 |
-
if api_deployed_with_healthy_dep:
|
| 231 |
-
score += 0.15
|
| 232 |
-
|
| 233 |
-
# Step efficiency
|
| 234 |
-
steps_used = len(episode_history)
|
| 235 |
-
max_steps = 15
|
| 236 |
-
efficiency = max(0.0, 1.0 - steps_used / max_steps)
|
| 237 |
-
score += 0.10 * efficiency
|
| 238 |
-
|
| 239 |
-
return min(max(score, 0.001), 0.999)
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
def grade_capacity_crisis(episode_history, engine):
|
| 243 |
-
"""
|
| 244 |
-
Task 5 grader — capacity crisis (all outcome-based):
|
| 245 |
-
- 0.30 — system stability: final system health / 100
|
| 246 |
-
- 0.30 — root cause: database-primary protected (CPU<85, config fixed)
|
| 247 |
-
- 0.20 — critical services maintained (api-gateway not DOWN, low errors)
|
| 248 |
-
- 0.10 — proactive response: no service went DOWN during episode
|
| 249 |
-
- 0.10 — step efficiency: max(0, 1 - steps_used / max_steps)
|
| 250 |
-
"""
|
| 251 |
-
score = 0.0
|
| 252 |
-
db = engine.services.get("database-primary")
|
| 253 |
-
api_gw = engine.services.get("api-gateway")
|
| 254 |
-
|
| 255 |
-
# System stability (0.30) — only perfect health gets full marks
|
| 256 |
-
system_health = engine.get_system_health()
|
| 257 |
-
score += (system_health / 100.0) * 0.30
|
| 258 |
-
|
| 259 |
-
# Root cause: database protected (0.30)
|
| 260 |
-
if db:
|
| 261 |
-
try:
|
| 262 |
-
max_conn = int(db.config.get("max_connections", "50"))
|
| 263 |
-
except (ValueError, TypeError):
|
| 264 |
-
max_conn = 50
|
| 265 |
-
shared_buf = db.config.get("shared_buffers", "4GB")
|
| 266 |
-
try:
|
| 267 |
-
shared_gb = int(shared_buf.replace("GB", "")) if "GB" in str(shared_buf) else 4
|
| 268 |
-
except (ValueError, TypeError):
|
| 269 |
-
shared_gb = 4
|
| 270 |
-
if max_conn >= 100 and db.cpu_percent < 85 and shared_gb >= 6:
|
| 271 |
-
score += 0.30 # Both configs optimized
|
| 272 |
-
elif max_conn >= 100 and db.cpu_percent < 85:
|
| 273 |
-
score += 0.25 # Connections fixed, buffers not
|
| 274 |
-
elif max_conn >= 75 and db.cpu_percent < 85:
|
| 275 |
-
score += 0.20
|
| 276 |
-
elif max_conn >= 75:
|
| 277 |
-
score += 0.10
|
| 278 |
-
elif db.cpu_percent < 85:
|
| 279 |
-
score += 0.05
|
| 280 |
-
|
| 281 |
-
# Critical services maintained (0.20)
|
| 282 |
-
if api_gw:
|
| 283 |
-
if api_gw.health.value != "down":
|
| 284 |
-
if api_gw.error_rate < 5.0:
|
| 285 |
-
score += 0.20
|
| 286 |
-
elif api_gw.error_rate < 10.0:
|
| 287 |
-
score += 0.10
|
| 288 |
-
else:
|
| 289 |
-
score += 0.03
|
| 290 |
-
|
| 291 |
-
# Proactive response: system health maintained or improved (0.10)
|
| 292 |
-
# In capacity_crisis, initial cascading is inevitable — reward agents
|
| 293 |
-
# that stabilize health rather than penalizing unavoidable cascades.
|
| 294 |
-
if system_health >= 70:
|
| 295 |
-
score += 0.10
|
| 296 |
-
elif system_health >= 50:
|
| 297 |
-
score += 0.05
|
| 298 |
-
|
| 299 |
-
# Step efficiency (0.10)
|
| 300 |
-
steps_used = len(episode_history)
|
| 301 |
-
max_steps = 15
|
| 302 |
-
efficiency = max(0.0, 1.0 - steps_used / max_steps)
|
| 303 |
-
score += 0.10 * efficiency
|
| 304 |
-
|
| 305 |
-
return min(max(score, 0.001), 0.999)
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
def grade_random_incident(episode_history, engine):
    """Grade Task 6: a procedurally generated incident.

    Weights (every criterion is read from the final engine state or from
    flags recorded in ``episode_history``):

    - 0.35  primary failing service is healthy (0.15 if degraded with a
            low error rate)
    - 0.25  final system health / 100
    - 0.20  config error fixed; for failure types without a config error
            this weight rewards restoring the service by other means
    - 0.10  no healthy service was broken (0.05 if one was but system
            health still ended above 60)
    - 0.10  step efficiency for single incidents, or the secondary service
            ending healthy for compound incidents

    Args:
        episode_history: sequence of per-step dicts; only the
            ``broke_healthy`` flag and the number of entries are used.
        engine: object exposing ``scenario``, ``services`` (a mapping) and
            ``get_system_health()``.

    Returns:
        The score clamped to [0.001, 0.999].
    """
    step_budget = 15
    config_failure_types = ('config_error', 'capacity_limit', 'certificate_expiry')

    total = 0.0
    scenario = engine.scenario
    primary_name = getattr(scenario, 'failing_service', None)
    primary = engine.services.get(primary_name) if primary_name else None

    # Primary service restored (0.35)
    if primary:
        if primary.health.value == "healthy":
            total += 0.35
        elif primary.health.value == "degraded" and primary.error_rate < 5.0:
            total += 0.15

    # System health (0.25)
    health_pct = engine.get_system_health()
    total += (health_pct / 100.0) * 0.25

    # Config fixed (0.20), or restored by other means when the failure type
    # has no config error to fix.
    if primary:
        if getattr(scenario, 'failure_type', '') in config_failure_types:
            if not scenario.check_config_error(primary_name, primary.config):
                total += 0.20
            # A config error that is still present earns nothing here.
        elif primary.health.value == "healthy":
            total += 0.20 if primary.error_rate < 2.0 else 0.10
        elif primary.error_rate < 5.0:
            total += 0.05

    # Collateral damage (0.10)
    broke_something = any(
        step.get("broke_healthy", False) for step in episode_history
    )
    if not broke_something:
        total += 0.10
    elif health_pct > 60:
        total += 0.05

    # Last 0.10: compound incidents swap the efficiency term for a bonus on
    # the secondary service, so the weights sum to 1.00 either way.
    if getattr(scenario, 'secondary_service', None) is not None:
        secondary = engine.services.get(scenario.secondary_service)
        if secondary and secondary.health.value == "healthy":
            total += 0.10
    else:
        total += max(0.0, 1.0 - len(episode_history) / step_budget) * 0.10

    return min(max(total, 0.001), 0.999)
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
# Registry mapping each task name to the grader function that scores it.
GRADERS = dict(
    clean_deploy=grade_clean_deploy,
    broken_pipeline=grade_broken_pipeline,
    judgment_call=grade_judgment_call,
    cascading_failure=grade_cascading_failure,
    capacity_crisis=grade_capacity_crisis,
    random_incident=grade_random_incident,
)
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
def grade_task(task_name, episode_history, engine):
    """Grade an episode with the grader registered for ``task_name``.

    Returns the registered grader's score, or 0.0 when ``task_name`` has no
    entry in ``GRADERS``.
    """
    try:
        grader = GRADERS[task_name]
    except KeyError:
        return 0.0
    return grader(episode_history, engine)
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Deterministic graders for the DevOps Pipeline Environment.
|
| 8 |
+
|
| 9 |
+
Each grader produces a score in [0.0, 1.0].
|
| 10 |
+
Same actions -> same score. Always.
|
| 11 |
+
All criteria are outcome-based — no procedure bonuses.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def grade_clean_deploy(episode_history, engine):
    """Grade Task 1: a clean deploy.

    Weights:

    - 0.50 * (services at their target version in prod / services that
      have a target version)
    - 0.30 * (final system health / 100)
    - 0.20 * max(0, 1 - steps_used / 15)

    Args:
        episode_history: sequence of per-step records; only its length is
            used.
        engine: object exposing ``services`` (a mapping) and
            ``get_system_health()``.

    Returns:
        The score clamped to [0.001, 0.999].
    """
    step_budget = 15

    wanted = [svc for svc in engine.services.values() if svc.target_version]
    landed = [
        svc for svc in wanted
        if svc.prod_deployed and svc.current_version == svc.target_version
    ]
    if wanted:
        deploy_ratio = len(landed) / len(wanted)
    else:
        # No service has a deploy target, so the deploy term earns nothing.
        deploy_ratio = 0.0

    health_pct = engine.get_system_health()
    efficiency = max(0.0, 1.0 - len(episode_history) / step_budget)

    deploy_part = 0.50 * deploy_ratio
    health_part = 0.30 * (health_pct / 100.0)
    speed_part = 0.20 * efficiency
    raw = deploy_part + health_part + speed_part
    return min(max(raw, 0.001), 0.999)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def grade_broken_pipeline(episode_history, engine):
    """Grade Task 2: a broken pipeline.

    Weights:

    - 0.30  cache-service config has
            ``redis.host == "redis-prod.internal:6379"`` at episode end
    - 0.15  migration ``add_index_users_email`` is in the applied list
    - 0.30 * (services at target version in prod / services with a target)
    - 0.15 * (final system health / 100)
    - 0.10 * max(0, 1 - steps_used / 20)

    Args:
        episode_history: sequence of per-step records; only its length is
            used.
        engine: object exposing ``services``, ``migrations_applied`` and
            ``get_system_health()``.

    Returns:
        The score clamped to [0.001, 0.999].
    """
    step_budget = 20
    total = 0.0

    # Config outcome: is the Redis host correct at the end of the episode?
    cache = engine.services.get("cache-service")
    if cache:
        if cache.config.get("redis.host") == "redis-prod.internal:6379":
            total += 0.30

    # Migration outcome.
    if "add_index_users_email" in engine.migrations_applied:
        total += 0.15

    # Deploy outcome, counted only over services that have a deploy target.
    wanted = [svc for svc in engine.services.values() if svc.target_version]
    if wanted:
        landed = 0
        for svc in wanted:
            if svc.prod_deployed and svc.current_version == svc.target_version:
                landed += 1
        total += 0.30 * (landed / len(wanted))

    # Final system health, then step efficiency.
    total += 0.15 * (engine.get_system_health() / 100.0)
    total += 0.10 * max(0.0, 1.0 - len(episode_history) / step_budget)

    return min(max(total, 0.001), 0.999)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def grade_judgment_call(episode_history, engine):
    """Grade Task 3: three valid resolution paths with different scores.

    Paths:

    - Expert: deploy hotfix v2.3.2 to api-gateway and set web-frontend
      ``api.auth_version`` to ``v2``. Best score.
    - Hotfix only: deploy the hotfix without the auth fix. Partial score.
    - Rollback: safe but loses features. Moderate score.

    Dimensions:

    - 0.30  incident resolution (path-aware)
    - 0.25  web-frontend collateral damage
    - 0.15  time to resolution
    - 0.15  no new issues introduced
    - 0.15  final system health

    History entries are read defensively: a missing or ``None`` ``action``,
    ``config_edits``, ``system_health`` or ``step`` is treated as absent
    instead of raising.

    Args:
        episode_history: sequence of per-step dicts. Keys read: ``action``
            (dict with ``action_type``, ``service_name``, ``target_version``,
            ``config_edits``), ``system_health``, ``step``, ``broke_healthy``.
        engine: object exposing ``services`` (a mapping) and
            ``get_system_health()``.

    Returns:
        The score clamped to [0.001, 0.999].
    """
    score = 0.0
    api_gw = engine.services.get("api-gateway")
    web_fe = engine.services.get("web-frontend")

    # Detect which path the agent took from the recorded actions.
    deployed_hotfix = False
    did_rollback = False
    fixed_auth_config = False
    for entry in episode_history:
        # "action" may be absent or explicitly None (e.g. a no-op step).
        action = entry.get("action") or {}
        action_type = action.get("action_type")
        service_name = action.get("service_name")
        if action_type == "deploy" and service_name == "api-gateway":
            if action.get("target_version") == "v2.3.2":
                deployed_hotfix = True
        if action_type == "rollback" and service_name == "api-gateway":
            did_rollback = True
        if action_type == "edit_config" and service_name == "web-frontend":
            # "config_edits" may be None when the field was left unset.
            for edit in action.get("config_edits") or []:
                if edit.get("key") == "api.auth_version" and edit.get("value") == "v2":
                    fixed_auth_config = True

    # Incident resolution (path-aware).
    resolved = False
    if api_gw:
        api_healthy = api_gw.health.value == "healthy" and api_gw.error_rate < 5.0
        api_fully_resolved = api_gw.latency_ms < 100 and api_gw.error_rate < 1.0
        if deployed_hotfix and fixed_auth_config and api_healthy:
            score += 0.30  # Expert path: root cause fixed and auth handled
            resolved = True
        elif api_fully_resolved:
            resolved = True
            if did_rollback:
                score += 0.20  # Rollback: safe but lost features
            else:
                score += 0.25  # Some other resolution
        elif deployed_hotfix and api_healthy:
            score += 0.15  # Hotfix without auth fix: partial
            resolved = True
        elif api_gw.latency_ms < 500:
            score += 0.10  # Partial improvement

    # web-frontend collateral damage (graded in steps by error rate).
    if web_fe:
        web_healthy = web_fe.health.value == "healthy"
        if web_healthy and web_fe.error_rate < 0.5:
            score += 0.25
        elif web_healthy and web_fe.error_rate < 2.0:
            score += 0.20
        elif web_healthy and web_fe.error_rate < 5.0:
            score += 0.15
        elif web_fe.error_rate < 10.0:
            score += 0.10
        elif web_fe.error_rate < 20.0:
            score += 0.05

    # Time to resolution: the first step whose recorded system health
    # reached 80. Falls back to the episode length when unresolved or when
    # no step reached that level.
    # NOTE(review): an unresolved episode still earns time credit if it is
    # short; confirm this is intended.
    history_len = len(episode_history)
    resolution_step = history_len
    if resolved:
        for entry in episode_history:
            if (entry.get("system_health") or 0) >= 80:
                step = entry.get("step")
                resolution_step = history_len if step is None else step
                break
    score += max(0.0, 1.0 - resolution_step / 10.0) * 0.15

    # No new issues introduced (forgive the one the expert path must cause).
    new_issues = sum(
        1 for entry in episode_history if entry.get("broke_healthy", False)
    )
    # The expert path necessarily breaks web-frontend and then fixes it. If
    # web-frontend ended healthy and auth was fixed, that breakage counts
    # as handled, not reckless.
    recovered_issues = 0
    if deployed_hotfix and fixed_auth_config and web_fe:
        if web_fe.health.value == "healthy" and web_fe.error_rate < 2.0:
            recovered_issues = 1
    unrecovered = max(0, new_issues - recovered_issues)
    if unrecovered == 0:
        score += 0.15
    elif unrecovered == 1:
        score += 0.05

    # Final system health.
    system_health = engine.get_system_health()
    score += 0.15 * (system_health / 100.0)

    return min(max(score, 0.001), 0.999)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def grade_cascading_failure(episode_history, engine):
    """Grade Task 4: a cascading failure.

    Weights:

    - 0.30  root cause fixed: cache-service is healthy AND its
            ``redis.max_connections`` config value is not ``"5"``
    - 0.25 * (services at target version in prod / services with a target)
    - 0.20 * (final system health / 100)
    - 0.15  dependency health: cache-service was healthy when api-gateway
            was deployed (see the fallback rule in the body)
    - 0.10 * max(0, 1 - steps_used / 15)

    Args:
        episode_history: sequence of per-step dicts; the
            ``cache_health_at_deploy`` key and the number of entries are
            used.
        engine: object exposing ``services`` (a mapping) and
            ``get_system_health()``.

    Returns:
        The score clamped to [0.001, 0.999].
    """
    step_budget = 15
    total = 0.0
    services = engine.services

    # Root cause: cache-service healthy with the connection limit changed.
    cache = services.get("cache-service")
    if cache:
        is_healthy = cache.health.value == "healthy"
        limit_changed = cache.config.get("redis.max_connections") != "5"
        if is_healthy and limit_changed:
            total += 0.30

    # Deploy outcome, counted only over services that have a deploy target.
    wanted = [svc for svc in services.values() if svc.target_version]
    if wanted:
        landed = [
            svc for svc in wanted
            if svc.prod_deployed and svc.current_version == svc.target_version
        ]
        total += 0.25 * (len(landed) / len(wanted))

    # Final system health.
    total += 0.20 * (engine.get_system_health() / 100.0)

    # Dependency health: the first deploy-time snapshot recorded in the
    # history decides whether cache-service was healthy at deploy time.
    snapshots = (step.get("cache_health_at_deploy") for step in episode_history)
    first_snapshot = next((snap for snap in snapshots if snap is not None), None)
    dependency_ok = first_snapshot == "healthy"
    if not dependency_ok:
        # Fallback: credit is also given when api-gateway is in prod and
        # cache-service ended the episode healthy.
        # NOTE(review): an earlier comment described this fallback as "api-gateway
        # was never deployed", which does not match the prod_deployed check
        # below. Confirm which is intended.
        gateway = services.get("api-gateway")
        if gateway and gateway.prod_deployed and cache and cache.health.value == "healthy":
            dependency_ok = True
    if dependency_ok:
        total += 0.15

    # Step efficiency.
    total += 0.10 * max(0.0, 1.0 - len(episode_history) / step_budget)

    return min(max(total, 0.001), 0.999)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
def grade_capacity_crisis(episode_history, engine):
    """Grade Task 5: a capacity crisis.

    Weights:

    - 0.30 * (final system health / 100)
    - 0.30  root cause: database-primary protected (CPU below 85 and
            ``max_connections`` / ``shared_buffers`` raised), with partial
            tiers
    - 0.20  api-gateway not DOWN with a low error rate, with partial tiers
    - 0.10  final system health at least 70 (0.05 if at least 50)
    - 0.10 * max(0, 1 - steps_used / 15)

    Args:
        episode_history: sequence of per-step records; only its length is
            used.
        engine: object exposing ``services`` (a mapping) and
            ``get_system_health()``.

    Returns:
        The score clamped to [0.001, 0.999].
    """
    step_budget = 15
    database = engine.services.get("database-primary")
    gateway = engine.services.get("api-gateway")

    # System stability (0.30): scales linearly with final system health.
    health_pct = engine.get_system_health()
    total = 0.0
    total += (health_pct / 100.0) * 0.30

    # Root cause: database protected (0.30)
    if database:
        # Unparseable config values fall back to the defaults (50 connections, 4 GB).
        try:
            max_conn = int(database.config.get("max_connections", "50"))
        except (ValueError, TypeError):
            max_conn = 50
        raw_buffers = database.config.get("shared_buffers", "4GB")
        try:
            if "GB" in str(raw_buffers):
                shared_gb = int(raw_buffers.replace("GB", ""))
            else:
                shared_gb = 4
        except (ValueError, TypeError):
            shared_gb = 4

        cpu_ok = database.cpu_percent < 85
        if max_conn >= 100 and cpu_ok:
            # Both settings raised earns full credit; connections only earns 0.25.
            total += 0.30 if shared_gb >= 6 else 0.25
        elif max_conn >= 75:
            total += 0.20 if cpu_ok else 0.10
        elif cpu_ok:
            total += 0.05

    # Critical services maintained (0.20)
    if gateway and gateway.health.value != "down":
        gateway_errors = gateway.error_rate
        if gateway_errors < 5.0:
            total += 0.20
        elif gateway_errors < 10.0:
            total += 0.10
        else:
            total += 0.03

    # Proactive response (0.10): in this scenario the initial cascade is
    # treated as unavoidable, so stabilised health is rewarded.
    if health_pct >= 70:
        total += 0.10
    elif health_pct >= 50:
        total += 0.05

    # Step efficiency (0.10)
    total += 0.10 * max(0.0, 1.0 - len(episode_history) / step_budget)

    return min(max(total, 0.001), 0.999)
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def grade_random_incident(episode_history, engine):
    """Grade Task 6: a procedurally generated incident.

    Weights (every criterion is read from the final engine state or from
    flags recorded in ``episode_history``):

    - 0.35  primary failing service is healthy (0.15 if degraded with a
            low error rate)
    - 0.25  final system health / 100
    - 0.20  config error fixed; for failure types without a config error
            this weight rewards restoring the service by other means
    - 0.10  no healthy service was broken (0.05 if one was but system
            health still ended above 60)
    - 0.10  step efficiency for single incidents, or the secondary service
            ending healthy for compound incidents

    Args:
        episode_history: sequence of per-step dicts; only the
            ``broke_healthy`` flag and the number of entries are used.
        engine: object exposing ``scenario``, ``services`` (a mapping) and
            ``get_system_health()``.

    Returns:
        The score clamped to [0.001, 0.999].
    """
    step_budget = 15
    config_failure_types = ('config_error', 'capacity_limit', 'certificate_expiry')

    total = 0.0
    scenario = engine.scenario
    primary_name = getattr(scenario, 'failing_service', None)
    primary = engine.services.get(primary_name) if primary_name else None

    # Primary service restored (0.35)
    if primary:
        if primary.health.value == "healthy":
            total += 0.35
        elif primary.health.value == "degraded" and primary.error_rate < 5.0:
            total += 0.15

    # System health (0.25)
    health_pct = engine.get_system_health()
    total += (health_pct / 100.0) * 0.25

    # Config fixed (0.20), or restored by other means when the failure type
    # has no config error to fix.
    if primary:
        if getattr(scenario, 'failure_type', '') in config_failure_types:
            if not scenario.check_config_error(primary_name, primary.config):
                total += 0.20
            # A config error that is still present earns nothing here.
        elif primary.health.value == "healthy":
            total += 0.20 if primary.error_rate < 2.0 else 0.10
        elif primary.error_rate < 5.0:
            total += 0.05

    # Collateral damage (0.10)
    broke_something = any(
        step.get("broke_healthy", False) for step in episode_history
    )
    if not broke_something:
        total += 0.10
    elif health_pct > 60:
        total += 0.05

    # Last 0.10: compound incidents swap the efficiency term for a bonus on
    # the secondary service, so the weights sum to 1.00 either way.
    if getattr(scenario, 'secondary_service', None) is not None:
        secondary = engine.services.get(scenario.secondary_service)
        if secondary and secondary.health.value == "healthy":
            total += 0.10
    else:
        total += max(0.0, 1.0 - len(episode_history) / step_budget) * 0.10

    return min(max(total, 0.001), 0.999)
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
# Registry mapping each task name to the grader function that scores it.
GRADERS = dict(
    clean_deploy=grade_clean_deploy,
    broken_pipeline=grade_broken_pipeline,
    judgment_call=grade_judgment_call,
    cascading_failure=grade_cascading_failure,
    capacity_crisis=grade_capacity_crisis,
    random_incident=grade_random_incident,
)
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
def grade_task(task_name, episode_history, engine):
    """Grade an episode with the grader registered for ``task_name``.

    Returns the registered grader's score, or the floor value 0.001 when
    ``task_name`` has no entry in ``GRADERS``. The graders defined in this
    module clamp their result to [0.001, 0.999].
    """
    try:
        grader = GRADERS[task_name]
    except KeyError:
        return 0.001
    return grader(episode_history, engine)
|
server/pipeline_engine.py
CHANGED
|
@@ -1,744 +1,744 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""Simulation engine for the DevOps Pipeline Environment."""
|
| 8 |
-
|
| 9 |
-
import random
|
| 10 |
-
|
| 11 |
-
from devops_pipeline_env.models import (
|
| 12 |
-
ActionType,
|
| 13 |
-
AlertInfo,
|
| 14 |
-
ConfigEdit,
|
| 15 |
-
MigrationStatus,
|
| 16 |
-
PipelineAction,
|
| 17 |
-
PipelineStage,
|
| 18 |
-
PipelineStatus,
|
| 19 |
-
ServiceHealth,
|
| 20 |
-
ServiceStatus,
|
| 21 |
-
)
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
class ServiceState:
|
| 25 |
-
"""State machine for a single microservice."""
|
| 26 |
-
|
| 27 |
-
def __init__(self, name, version, health, config, dependencies,
|
| 28 |
-
latency_ms, error_rate, cpu, memory, rng=None):
|
| 29 |
-
self.name = name
|
| 30 |
-
self.current_version = version
|
| 31 |
-
self.target_version = None
|
| 32 |
-
self.health = health
|
| 33 |
-
self.config = dict(config)
|
| 34 |
-
self.dependencies = list(dependencies) if dependencies else []
|
| 35 |
-
self.latency_ms = latency_ms
|
| 36 |
-
self.error_rate = error_rate
|
| 37 |
-
self.cpu_percent = cpu
|
| 38 |
-
self.memory_percent = memory
|
| 39 |
-
self.active_connections = 100
|
| 40 |
-
self.staging_deployed = False
|
| 41 |
-
self.staging_verified = False
|
| 42 |
-
self.prod_deployed = False
|
| 43 |
-
self.last_deploy_timestamp = "2026-04-01T00:00:00Z"
|
| 44 |
-
self.logs = []
|
| 45 |
-
self._rng = rng or random.Random(0)
|
| 46 |
-
# Staged health recovery: 0 = fully recovered, >0 = still recovering
|
| 47 |
-
self._recovery_steps_remaining = 0
|
| 48 |
-
self._recovery_target_latency = 0.0
|
| 49 |
-
self._recovery_target_error_rate = 0.0
|
| 50 |
-
|
| 51 |
-
def deploy_to_staging(self, version, scenario):
|
| 52 |
-
"""Deploy version to staging. Returns result text."""
|
| 53 |
-
self.staging_deployed = True
|
| 54 |
-
self.target_version = version
|
| 55 |
-
|
| 56 |
-
# 8% chance of transient staging failure on first attempt
|
| 57 |
-
# Skip for clean_deploy (easy task) and during incidents (health already degraded/down)
|
| 58 |
-
transient_roll = self._rng.random() # always consume RNG for determinism
|
| 59 |
-
is_clean_deploy = hasattr(self, '_task_name') and self._task_name == "clean_deploy"
|
| 60 |
-
if not is_clean_deploy and not self.staging_verified and self.health == ServiceHealth.HEALTHY and transient_roll < 0.08:
|
| 61 |
-
self.staging_deployed = True # deployed but not verified
|
| 62 |
-
self.logs.append(
|
| 63 |
-
f"[DEPLOY] Deployed {self.name} {version} to staging. "
|
| 64 |
-
f"Transient failure: health check timed out. Retry should succeed."
|
| 65 |
-
)
|
| 66 |
-
return (
|
| 67 |
-
f"Deployed {self.name} {version} to staging. "
|
| 68 |
-
f"WARNING: Transient health check timeout. Try deploying again."
|
| 69 |
-
)
|
| 70 |
-
|
| 71 |
-
if scenario.check_config_error(self.name, self.config):
|
| 72 |
-
self.health = ServiceHealth.DEGRADED
|
| 73 |
-
lat_mult = self._rng.uniform(0.8, 1.2)
|
| 74 |
-
err_mult = self._rng.uniform(0.9, 1.1)
|
| 75 |
-
self.error_rate = round(12.0 * err_mult, 2)
|
| 76 |
-
self.latency_ms = round(300.0 * lat_mult, 1)
|
| 77 |
-
self.logs.append(
|
| 78 |
-
f"[DEPLOY] Deployed {self.name} {version} to staging. "
|
| 79 |
-
f"WARNING: Health check DEGRADED. Error rate elevated "
|
| 80 |
-
f"({self.error_rate:.1f}/s, latency {self.latency_ms:.0f}ms)."
|
| 81 |
-
)
|
| 82 |
-
return (
|
| 83 |
-
f"Deployed {self.name} {version} to staging. "
|
| 84 |
-
f"WARNING: Health check degraded. Error rate elevated."
|
| 85 |
-
)
|
| 86 |
-
self.health = ServiceHealth.HEALTHY
|
| 87 |
-
self.staging_verified = True
|
| 88 |
-
lat_mult = self._rng.uniform(0.8, 1.2)
|
| 89 |
-
self.error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
|
| 90 |
-
self.latency_ms = round(45.0 * lat_mult, 1)
|
| 91 |
-
self.logs.append(
|
| 92 |
-
f"[DEPLOY] Deployed {self.name} {version} to staging. Health check: PASSED."
|
| 93 |
-
)
|
| 94 |
-
return f"Deployed {self.name} {version} to STAGING. Staging verified. Deploy same service+version again to PROMOTE TO PRODUCTION."
|
| 95 |
-
|
| 96 |
-
def deploy_to_production(self, version):
|
| 97 |
-
"""Promote to production."""
|
| 98 |
-
if not self.staging_verified:
|
| 99 |
-
self.health = ServiceHealth.DEGRADED
|
| 100 |
-
lat_mult = self._rng.uniform(0.8, 1.2)
|
| 101 |
-
err_mult = self._rng.uniform(0.9, 1.1)
|
| 102 |
-
self.error_rate = round(25.0 * err_mult, 2)
|
| 103 |
-
self.latency_ms = round(500.0 * lat_mult, 1)
|
| 104 |
-
self.logs.append(
|
| 105 |
-
f"[DEPLOY] Deployed {self.name} {version} to production "
|
| 106 |
-
f"WITHOUT staging verification. High risk."
|
| 107 |
-
)
|
| 108 |
-
return (
|
| 109 |
-
f"Deployed {self.name} {version} to production "
|
| 110 |
-
f"WITHOUT staging verification. High risk."
|
| 111 |
-
)
|
| 112 |
-
self.prod_deployed = True
|
| 113 |
-
self.current_version = version
|
| 114 |
-
# Staged recovery: takes 1-3 steps to fully stabilize
|
| 115 |
-
recovery_steps = self._rng.randint(1, 3)
|
| 116 |
-
self._recovery_steps_remaining = recovery_steps
|
| 117 |
-
base_latency = 45.0 * self._rng.uniform(0.8, 1.2)
|
| 118 |
-
base_error_rate = 0.1 * self._rng.uniform(0.9, 1.1)
|
| 119 |
-
|
| 120 |
-
# Non-linear deploy quality: same seed = same outcome
|
| 121 |
-
quality_roll = self._rng.random()
|
| 122 |
-
deploy_note = ""
|
| 123 |
-
if quality_roll < 0.7:
|
| 124 |
-
# Clean deploy β recovers to near-perfect
|
| 125 |
-
pass # base values are already good
|
| 126 |
-
elif quality_roll < 0.9:
|
| 127 |
-
# Minor issues β recovers to good but not perfect
|
| 128 |
-
base_latency *= 1.5
|
| 129 |
-
base_error_rate *= 3.0
|
| 130 |
-
deploy_note = " Minor post-deploy issues detected."
|
| 131 |
-
self.logs.append(
|
| 132 |
-
f"[DEPLOY] {self.name}: Minor post-deploy issues detected. "
|
| 133 |
-
f"Performance slightly below optimal."
|
| 134 |
-
)
|
| 135 |
-
else:
|
| 136 |
-
# Unstable deploy β recovers poorly
|
| 137 |
-
base_latency *= 2.5
|
| 138 |
-
base_error_rate *= 8.0
|
| 139 |
-
self.error_rate += 1.5
|
| 140 |
-
deploy_note = " Post-deploy instability detected."
|
| 141 |
-
self.logs.append(
|
| 142 |
-
f"[DEPLOY] {self.name}: Post-deploy instability detected. "
|
| 143 |
-
f"Elevated error rate."
|
| 144 |
-
)
|
| 145 |
-
|
| 146 |
-
self._recovery_target_latency = round(base_latency, 1)
|
| 147 |
-
self._recovery_target_error_rate = round(base_error_rate, 3)
|
| 148 |
-
# Start at slightly elevated values during recovery
|
| 149 |
-
self.health = ServiceHealth.HEALTHY
|
| 150 |
-
self.latency_ms = round(base_latency * (1.0 + 0.3 * recovery_steps), 1)
|
| 151 |
-
self.error_rate = round(base_error_rate * (1.0 + 0.5 * recovery_steps), 3)
|
| 152 |
-
# Trade-off: deploy causes temporary CPU/latency spike (warmup load)
|
| 153 |
-
# Clean deploy tasks get reduced spikes β they should be clean
|
| 154 |
-
if hasattr(self, '_task_name') and self._task_name == "clean_deploy":
|
| 155 |
-
self.cpu_percent = min(self.cpu_percent + 3, 99)
|
| 156 |
-
self.latency_ms += round(30 * self._rng.uniform(0.8, 1.2), 1)
|
| 157 |
-
else:
|
| 158 |
-
self.cpu_percent = min(self.cpu_percent + 15, 99)
|
| 159 |
-
self.latency_ms += round(200 * self._rng.uniform(0.8, 1.2), 1)
|
| 160 |
-
self.last_deploy_timestamp = "2026-04-01T12:00:00Z"
|
| 161 |
-
self.logs.append(
|
| 162 |
-
f"[DEPLOY] Promoted {self.name} {version} to production. Health: HEALTHY. "
|
| 163 |
-
f"Stabilizing over ~{recovery_steps} step(s). CPU/latency spike from warmup."
|
| 164 |
-
)
|
| 165 |
-
return (
|
| 166 |
-
f"Promoted {self.name} {version} to production. Health: HEALTHY. "
|
| 167 |
-
f"Deployed successfully. Service under warmup load β temporary CPU/latency spike expected."
|
| 168 |
-
f"{deploy_note}"
|
| 169 |
-
)
|
| 170 |
-
|
| 171 |
-
def tick_recovery(self):
|
| 172 |
-
"""Called each step to progress staged health recovery."""
|
| 173 |
-
if self._recovery_steps_remaining > 0:
|
| 174 |
-
self._recovery_steps_remaining -= 1
|
| 175 |
-
if self._recovery_steps_remaining == 0:
|
| 176 |
-
# Fully recovered
|
| 177 |
-
self.latency_ms = self._recovery_target_latency
|
| 178 |
-
self.error_rate = self._recovery_target_error_rate
|
| 179 |
-
if self.health == ServiceHealth.DEGRADED and self.error_rate < 5.0:
|
| 180 |
-
self.health = ServiceHealth.HEALTHY
|
| 181 |
-
else:
|
| 182 |
-
# Interpolate toward target
|
| 183 |
-
progress = 1.0 - (self._recovery_steps_remaining / (self._recovery_steps_remaining + 1))
|
| 184 |
-
self.latency_ms = round(
|
| 185 |
-
self.latency_ms + (self._recovery_target_latency - self.latency_ms) * progress, 1
|
| 186 |
-
)
|
| 187 |
-
self.error_rate = round(
|
| 188 |
-
self.error_rate + (self._recovery_target_error_rate - self.error_rate) * progress, 3
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
def rollback(self):
|
| 192 |
-
"""Rollback to previous version."""
|
| 193 |
-
self.health = ServiceHealth.HEALTHY
|
| 194 |
-
lat_mult = self._rng.uniform(0.8, 1.2)
|
| 195 |
-
err_mult = self._rng.uniform(0.9, 1.1)
|
| 196 |
-
self.error_rate = round(0.5 * err_mult, 3)
|
| 197 |
-
self.latency_ms = round(50.0 * lat_mult * 0.7, 1)
|
| 198 |
-
self.staging_deployed = False
|
| 199 |
-
self.staging_verified = False
|
| 200 |
-
self.prod_deployed = True # still in prod, just rolled back
|
| 201 |
-
self._recovery_steps_remaining = 0
|
| 202 |
-
# Trade-off: 25% chance rollback reintroduces a known bug
|
| 203 |
-
regression = False
|
| 204 |
-
if self._rng.random() < 0.25:
|
| 205 |
-
self.error_rate = round(self.error_rate + 3.0, 2)
|
| 206 |
-
regression = True
|
| 207 |
-
self.logs.append(
|
| 208 |
-
f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. "
|
| 209 |
-
f"Warning: rollback may have reintroduced known issue from previous version"
|
| 210 |
-
)
|
| 211 |
-
else:
|
| 212 |
-
self.logs.append(
|
| 213 |
-
f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. Service healthy."
|
| 214 |
-
)
|
| 215 |
-
result = f"Rolled back {self.name} to {self.current_version}. Rolled back. Monitoring for regression..."
|
| 216 |
-
if regression:
|
| 217 |
-
result += f" WARNING: Error rate elevated ({self.error_rate:.1f}/s) β possible regression."
|
| 218 |
-
return result
|
| 219 |
-
|
| 220 |
-
def set_config(self, key, value):
    """Store ``value`` under ``key`` and apply the simulated restart cost.

    Every edit adds a randomised latency spike of roughly 80-120 ms and
    raises CPU by 5 points (capped at 99). A log entry is appended.

    Args:
        key: Config key to write.
        value: New value for that key.

    Returns:
        str: Description of the change, showing the previous value or
        ``<not set>`` when the key did not exist.
    """
    previous = self.config.get(key, "<not set>")
    self.config[key] = value

    # Trade-off: a config change restarts the service, which costs latency/CPU.
    restart_spike = round(100 * self._rng.uniform(0.8, 1.2), 1)
    self.latency_ms += restart_spike
    self.cpu_percent = min(self.cpu_percent + 5, 99)

    change = f"{self.name}: {key} changed from '{previous}' to '{value}'."
    self.logs.append(f"[CONFIG] {change} Service restarting.")
    return f"Config {change} Config updated. Service restarting β brief latency spike."
|
| 229 |
-
|
| 230 |
-
def get_config_snapshot(self):
    """Return a shallow copy of the config so callers cannot mutate the live dict."""
    return {key: value for key, value in self.config.items()}
|
| 232 |
-
|
| 233 |
-
def get_logs(self):
    """Return a copy of the log lines, oldest first."""
    return [*self.logs]
|
| 235 |
-
|
| 236 |
-
def _get_health_pct(self):
    """Collapse health state, error rate and latency into a 0-100 score.

    The base score is 0 for DOWN, 50 for DEGRADED and 100 otherwise. Up to 30
    points are deducted for errors (2 points per unit of error rate) and up
    to 30 more for latency above 200 ms (1 point per 10 ms).

    Returns:
        float: Score, never below 0.0.
    """
    if self.health == ServiceHealth.DOWN:
        score = 0.0
    elif self.health == ServiceHealth.DEGRADED:
        score = 50.0
    else:
        score = 100.0

    error_penalty = min(self.error_rate * 2, 30)
    score -= error_penalty

    if self.latency_ms > 200:
        latency_penalty = min((self.latency_ms - 200) / 10, 30)
        score -= latency_penalty

    return max(score, 0.0)
|
| 247 |
-
|
| 248 |
-
def to_status(self):
    """Build the ``ServiceStatus`` model describing this service's current metrics."""
    fields = {
        "name": self.name,
        "health": self.health,
        "current_version": self.current_version,
        "cpu_percent": self.cpu_percent,
        "memory_percent": self.memory_percent,
        "error_rate": self.error_rate,
        # The model calls this field request_latency_ms; internally it is latency_ms.
        "request_latency_ms": self.latency_ms,
        "active_connections": self.active_connections,
        "last_deploy_timestamp": self.last_deploy_timestamp,
    }
    return ServiceStatus(**fields)
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
class PipelineEngine:
|
| 263 |
-
"""Manages all services, pipeline state, migrations, alerts."""
|
| 264 |
-
|
| 265 |
-
def __init__(self, scenario, seed):
    """Create an engine for ``scenario`` with a reproducible random stream.

    Args:
        scenario: Scenario object. Its ``setup(engine)`` is called once the
            defaults below exist, and its ``task_name`` is copied onto every
            service.
        seed: Seed for the engine's ``random.Random`` instance.
    """
    self.scenario = scenario
    self._rng = random.Random(seed)

    # Service registry and pipeline progress.
    self.services = {}
    self.pipeline_stage = PipelineStage.IDLE

    # Migration bookkeeping.
    self.migrations_pending = []
    self.migrations_applied = []
    self.migration_errors = []

    self.alerts = []

    # Static pipeline metadata; a scenario may overwrite these in setup().
    self.commit_sha = "abc123"
    self.triggered_by = "deploy-bot"
    self.started_at = "2026-04-01T10:00:00Z"
    self.test_pass = 0
    self.test_fail = 0
    self.build_logs = ""
    self._time_pressure = False  # set by the scenario if needed

    # Let the scenario populate services, migrations, alerts, ...
    scenario.setup(self)

    # Every service shares the engine's RNG (single seeded stream) and is
    # told which task it belongs to.
    task_name = scenario.task_name
    for service in self.services.values():
        service._rng = self._rng
        service._task_name = task_name
|
| 289 |
-
|
| 290 |
-
def execute(self, action):
    """Run one environment step for ``action`` and return its result text.

    Order of operations:
      1. every service advances its staged recovery;
      2. the handler matching ``action.action_type`` runs;
      3. environmental effects run (time pressure if enabled, cascades,
         metric compounding, tipping points).

    Args:
        action: Object exposing ``action_type`` plus whichever fields the
            chosen handler reads (``service_name``, ``config_edits``, ...).

    Returns:
        str: Human-readable result, or ``"Unknown action."`` when no handler
        matches.
    """
    # 1. Heal from previous deploys / config fixes.
    for service in self.services.values():
        service.tick_recovery()

    # 2. Dispatch. The handlers are lazy so only the selected one touches the
    #    action's fields; matching uses ==, exactly like an if/elif chain.
    handlers = (
        (ActionType.VIEW_PIPELINE, lambda: self._view_pipeline()),
        (ActionType.VIEW_LOGS, lambda: self._view_logs(action.service_name)),
        (ActionType.VIEW_CONFIG, lambda: self._view_config(action.service_name)),
        (ActionType.EDIT_CONFIG,
         lambda: self._edit_config(action.service_name, action.config_edits)),
        (ActionType.RUN_MIGRATION,
         lambda: self._run_migration(action.migration_name, action.migration_type)),
        (ActionType.DEPLOY,
         lambda: self._deploy(action.service_name, action.target_version)),
        (ActionType.ROLLBACK, lambda: self._rollback(action.service_name)),
        (ActionType.APPROVE, lambda: self._approve(action.reason)),
        (ActionType.ABORT, lambda: self._abort(action.reason)),
    )
    result = "Unknown action."
    for action_type, handler in handlers:
        if action.action_type == action_type:
            result = handler()
            break

    # 3. Environmental effects come after the action so the next observation
    #    shows its consequences.
    if self._time_pressure:
        self._apply_time_pressure()
    self._tick_cascading_effects()
    self._tick_metric_compounding()
    self._tick_tipping_points()

    return result
|
| 326 |
-
|
| 327 |
-
# --- Cross-metric compounding ---------------------------------------------
|
| 328 |
-
|
| 329 |
-
def _tick_metric_compounding(self):
    """Let each service's metrics feed on one another for one step.

    Bad metrics drag the others down (errors -> CPU -> latency -> errors) and
    good metrics pull them back up. The six rules run in a fixed order, so a
    value changed by one rule can trigger the next within the same step.
    Nothing happens for the ``clean_deploy`` task.
    """
    if self.scenario.task_name == "clean_deploy":
        return

    for service in self.services.values():
        # Degradation spiral (moderate: should not kill an episode in <5 steps).
        if service.error_rate > 15.0:
            service.cpu_percent = min(service.cpu_percent + 3, 99)
        if service.cpu_percent > 90:
            service.latency_ms = round(min(service.latency_ms + 100, 5000), 1)
        if service.latency_ms > 3000:
            service.error_rate = round(min(service.error_rate + 1.0, 50.0), 2)

        # Natural recovery: healthy metrics reinforce each other.
        if service.error_rate < 2.0:
            service.cpu_percent = max(service.cpu_percent - 3, 10)
        if service.cpu_percent < 50:
            service.latency_ms = round(max(service.latency_ms - 50, 20), 1)
        if service.latency_ms < 200 and service.error_rate < 1.0:
            service.error_rate = round(max(service.error_rate - 0.5, 0.0), 2)
|
| 349 |
-
|
| 350 |
-
# --- Non-linear tipping points -------------------------------------------
|
| 351 |
-
|
| 352 |
-
def _tick_tipping_points(self):
    """Apply the non-linear "cliff" rules to every service for one step.

    Once a metric crosses a threshold the service deteriorates much faster
    than the linear rules alone would cause:

    * CPU above 85%: error rate grows by 0.2 per point of overflow.
    * Latency above 2000 ms: error rate grows by 3.0.
    * Health score below 30: error rate is multiplied by 1.3.
    * Latency above 1500 ms: CPU grows by 3 (retries add load).

    Error rate is capped at 50.0 and CPU at 99. Nothing happens for the
    ``clean_deploy`` task.

    The health score comes from ``ServiceState._get_health_pct`` — the same
    helper used for cascades, deploy checks and system health — rather than
    a private copy of the formula, so the rules cannot drift apart. It is
    evaluated after the CPU and latency cliffs, so it sees their updates.
    """
    if self.scenario.task_name == "clean_deploy":
        return

    for svc in self.services.values():
        # CPU cliff: above 85% the error rate grows with the overflow.
        if svc.cpu_percent > 85:
            overflow = svc.cpu_percent - 85
            svc.error_rate = round(min(svc.error_rate + overflow * 0.2, 50.0), 2)

        # Latency cliff: above 2000 ms the service collapses rapidly.
        if svc.latency_ms > 2000:
            svc.error_rate = round(min(svc.error_rate + 3.0, 50.0), 2)

        # Health cliff: below 30% health the death spiral accelerates.
        if svc._get_health_pct() < 30:
            svc.error_rate = round(min(svc.error_rate * 1.3, 50.0), 2)

        # Latency -> CPU feedback (high latency = retries = more CPU).
        if svc.latency_ms > 1500:
            svc.cpu_percent = min(svc.cpu_percent + 3, 99)
|
| 379 |
-
|
| 380 |
-
# --- Cascading failures ---------------------------------------------------
|
| 381 |
-
|
| 382 |
-
def _get_dependents(self, service_name):
    """Return the services whose ``dependencies`` contain ``service_name``.

    The result follows the iteration order of ``self.services`` and is empty
    when nothing depends on the given service.
    """
    dependents = []
    for candidate in self.services.values():
        if service_name in candidate.dependencies:
            dependents.append(candidate)
    return dependents
|
| 388 |
-
|
| 389 |
-
def _tick_cascading_effects(self):
    """Spread upstream trouble to dependents, then let healthy upstreams help.

    Pass 1 (cascade): every service whose health score is below 50 raises the
    error rate and latency of each dependent that is not DOWN — by 1.5 / 30 ms
    when the upstream score is below 20, otherwise by 0.5 / 10 ms. A HEALTHY
    dependent whose error rate passes 5.0 becomes DEGRADED. One warning alert
    is recorded per (upstream, dependent) pair for as long as it stays in
    ``self.alerts``; a log line is appended to the dependent on every step.

    Pass 2 (recovery): every HEALTHY service with error rate below 2.0 shaves
    10% off the error rate and latency of its DEGRADED dependents.

    Cascading only ever pushes metrics up, and only up to 45.0 errors/s and
    4500 ms: a dependent already above those ceilings is left where it is.
    """
    for svc in self.services.values():
        health_pct = svc._get_health_pct()
        if health_pct >= 50.0:
            continue  # healthy enough, no cascade

        # Severity depends only on the upstream service, so decide it once.
        if health_pct < 20.0:
            # Upstream is effectively down: moderate cascade.
            err_increase, lat_increase = 1.5, 30.0
        else:
            # Upstream is degraded: lighter cascade.
            err_increase, lat_increase = 0.5, 10.0

        for dep in self._get_dependents(svc.name):
            if dep.health == ServiceHealth.DOWN:
                continue  # already down, cascade cannot make it worse

            old_err = dep.error_rate
            old_lat = dep.latency_ms
            # max(old, ...) keeps the ceiling from lowering a metric that
            # other effects already pushed above it.
            dep.error_rate = round(max(old_err, min(old_err + err_increase, 45.0)), 2)
            dep.latency_ms = round(max(old_lat, min(old_lat + lat_increase, 4500.0)), 1)

            # A high enough error rate marks the dependent as degraded.
            if dep.error_rate > 5.0 and dep.health == ServiceHealth.HEALTHY:
                dep.health = ServiceHealth.DEGRADED

            # Death-spiral guard: when the dependent is nearly dead, soften
            # this step's increase to at most +1.0. min() guarantees the guard
            # never raises the rate above what the cascade itself applied
            # (relevant when the increment was only 0.5).
            if dep._get_health_pct() < 5.0:
                softened = max(old_err, dep.error_rate - err_increase + 1.0)
                dep.error_rate = round(min(dep.error_rate, softened), 2)

            # Record one alert per upstream->dependent pair; the key embedded
            # in the message is what later steps look for.
            cascade_alert_key = f"cascade:{svc.name}->{dep.name}"
            if not any(cascade_alert_key in alert.message for alert in self.alerts):
                self.alerts.append(AlertInfo(
                    severity="warning",
                    message=(
                        f"Cascading: {svc.name} (health {health_pct:.0f}%) is degrading "
                        f"{dep.name} β error_rate +{err_increase}/s, latency +{lat_increase:.0f}ms "
                        f"[{cascade_alert_key}]"
                    ),
                    service_name=dep.name,
                    timestamp="2026-04-01T12:00:00Z",
                ))

            dep.logs.append(
                f"[CASCADE] Upstream {svc.name} unhealthy (health {health_pct:.0f}%) β "
                f"{dep.name} error_rate now {dep.error_rate:.1f}/s, "
                f"latency {dep.latency_ms:.0f}ms"
            )

    # Recovery propagation: healthy services help their dependents recover.
    for name, svc in self.services.items():
        if svc.health == ServiceHealth.HEALTHY and svc.error_rate < 2.0:
            for dep in self._get_dependents(name):
                if dep.health == ServiceHealth.DEGRADED:
                    dep.error_rate = round(dep.error_rate * 0.9, 2)
                    dep.latency_ms = round(dep.latency_ms * 0.9, 1)
|
| 454 |
-
|
| 455 |
-
# --- Action handlers ------------------------------------------------------
|
| 456 |
-
|
| 457 |
-
def _view_pipeline(self):
    """Render the pipeline overview: stage, commit, tests, migrations, services.

    Returns:
        str: Multi-line text with one line per service after the
        ``Services:`` heading (nothing follows the heading when there are no
        services).
    """
    service_lines = []
    for svc in self.services.values():
        target = svc.target_version or 'N/A'
        service_lines.append(
            f" {svc.name}: {svc.health.value} | v{svc.current_version} -> "
            f"v{target} | "
            f"latency={svc.latency_ms:.0f}ms | errors={svc.error_rate:.1f}/s"
        )

    header_lines = [
        f"Pipeline Stage: {self.pipeline_stage.value}",
        f"Commit: {self.commit_sha}",
        f"Tests: {self.test_pass} passed, {self.test_fail} failed",
        f"Pending Migrations: {len(self.migrations_pending)}",
    ]
    return "\n".join(header_lines) + "\nServices:\n" + "\n".join(service_lines)
|
| 471 |
-
|
| 472 |
-
def _view_logs(self, service_name):
    """Return up to the 20 most recent log lines of ``service_name``.

    Returns:
        str: The log excerpt, or an explanatory message when the service is
        unknown or has no logs yet.
    """
    svc = self.services.get(service_name)
    if not svc:
        return f"No service named '{service_name}'"

    entries = svc.get_logs()
    if entries:
        recent = entries[-20:]
        return f"Logs for {service_name}:\n" + "\n".join(recent)
    return f"No logs available for {service_name}."
|
| 480 |
-
|
| 481 |
-
def _view_config(self, service_name):
    """Return the config of ``service_name`` as one ``key = value`` line per entry.

    Returns:
        str: The rendered config, or an explanatory message when the service
        is unknown.
    """
    svc = self.services.get(service_name)
    if not svc:
        return f"No service named '{service_name}'"

    rendered = []
    for key, value in svc.get_config_snapshot().items():
        rendered.append(f" {key} = {value}")
    return f"Config for {service_name}:\n" + "\n".join(rendered)
|
| 488 |
-
|
| 489 |
-
def _edit_config(self, service_name, edits):
    """Apply config edits to a service and start recovery if they fix it.

    Each edit is applied through ``ServiceState.set_config``. Afterwards, if
    the service is DEGRADED and the scenario no longer reports a config error
    for it, the service gets an immediate partial improvement and a two-step
    staged recovery; its staging flags are cleared so it must go through
    staging again.

    Args:
        service_name: Name of the service to edit.
        edits: Iterable of objects with ``key`` and ``value`` attributes.
            ``None`` (an EDIT_CONFIG action sent without ``config_edits``) is
            treated as "no edits" instead of raising ``TypeError``.

    Returns:
        str: One line per applied edit, plus a recovery notice when a fix was
        detected; an explanatory message when the service is unknown.
    """
    svc = self.services.get(service_name)
    if not svc:
        return f"No service named '{service_name}'"

    results = []
    for edit in edits or ():
        results.append(svc.set_config(edit.key, edit.value))

    # If the config error is now fixed and the service was degraded, start a
    # staged recovery (2 steps) instead of healing instantly.
    if svc.health == ServiceHealth.DEGRADED and not self.scenario.check_config_error(service_name, svc.config):
        svc.staging_deployed = False
        svc.staging_verified = False
        # Immediate PARTIAL improvement.
        svc.error_rate = round(svc.error_rate * 0.5, 2)
        svc.latency_ms = round(svc.latency_ms * 0.6, 1)
        # tick_recovery() drives the rest of the way over the next 2 steps
        # and flips health back to HEALTHY once the target is reached.
        svc._recovery_steps_remaining = 2
        svc._recovery_target_latency = round(50.0 * self._rng.uniform(0.8, 1.2), 1)
        svc._recovery_target_error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
        results.append(f"Config fix detected for {service_name}. Service improving β full recovery in ~2 steps. Ready for re-deploy.")

    return "\n".join(results)
|
| 513 |
-
|
| 514 |
-
def _run_migration(self, migration_name, migration_type):
    """Ask the scenario to run a pending migration and record the outcome.

    Args:
        migration_name: Must be present in ``self.migrations_pending``.
        migration_type: Accepted for interface compatibility; not used here.

    Returns:
        str: Success message, failure message (also appended to
        ``self.migration_errors``), or a "not found" message listing the
        pending migrations.
    """
    if migration_name not in self.migrations_pending:
        return (
            f"Migration '{migration_name}' not found in pending: "
            f"{self.migrations_pending}"
        )

    if not self.scenario.run_migration(self, migration_name):
        failure = f"Migration '{migration_name}' FAILED."
        self.migration_errors.append(failure)
        return failure

    # Success: move the migration from pending to applied.
    self.migrations_pending.remove(migration_name)
    self.migrations_applied.append(migration_name)
    return f"Migration '{migration_name}' applied successfully."
|
| 529 |
-
|
| 530 |
-
def _deploy(self, service_name, target_version):
    """Deploy a service to staging first, then to production on the next call.

    Gate checks, in order:
      * unknown service -> message, nothing changes;
      * pending migrations that the scenario says block this service ->
        ``BLOCKED`` message;
      * for each dependency scoring below 50 health there is a 50% chance
        the deploy fails with a ``DEPLOY UNSTABLE`` message.

    If the service has not been deployed to staging yet, this call deploys to
    staging. Otherwise it promotes to production, lets the scenario add text
    via its optional ``on_prod_deploy`` hook, and marks the pipeline DEPLOYED
    once every service that has a target version is in production.

    Returns:
        str: Human-readable result of the attempt.
    """
    svc = self.services.get(service_name)
    if not svc:
        return f"No service named '{service_name}'"

    # Migrations may have to land before this service can be deployed.
    if self.migrations_pending and self.scenario.migration_blocks_deploy(service_name):
        return (
            f"BLOCKED: Pending migrations must be applied before deploying "
            f"{service_name}. Pending: {self.migrations_pending}"
        )

    # An unhealthy dependency gives the deploy a coin-flip chance of failing.
    for dep_name in svc.dependencies:
        upstream = self.services.get(dep_name)
        if not upstream:
            continue
        upstream_health = upstream._get_health_pct()
        if upstream_health < 50.0 and self._rng.random() < 0.5:
            svc.logs.append(
                f"[DEPLOY] Deploy {svc.name} {target_version} FAILED β "
                f"dependency {dep_name} is unhealthy "
                f"(health {upstream_health:.0f}%). Retry may succeed."
            )
            return (
                f"DEPLOY UNSTABLE: Dependency {dep_name} is unhealthy "
                f"(health {upstream_health:.0f}%). "
                f"Deploy of {service_name} failed. Retry may succeed."
            )

    # First deploy of a service always targets staging.
    if not svc.staging_deployed:
        self.pipeline_stage = PipelineStage.STAGING
        return svc.deploy_to_staging(target_version, self.scenario)

    self.pipeline_stage = PipelineStage.DEPLOYING
    outcome = svc.deploy_to_production(target_version)

    # Give the scenario a chance to react to the production deploy.
    if hasattr(self.scenario, 'on_prod_deploy'):
        extra = self.scenario.on_prod_deploy(self, service_name, target_version)
        if extra:
            outcome += "\n" + extra

    # The pipeline is done once every targeted service is in production.
    targeted = (s for s in self.services.values() if s.target_version)
    if all(s.prod_deployed for s in targeted):
        self.pipeline_stage = PipelineStage.DEPLOYED
    return outcome
|
| 574 |
-
|
| 575 |
-
def _rollback(self, service_name):
    """Roll a service back and charge the cost to everything that depends on it.

    The pipeline stage becomes ROLLED_BACK and the service performs its own
    ``rollback()``. Each dependent gets +5.0 error rate (turning DEGRADED if
    it was HEALTHY and now exceeds 3.0), a warning alert and a log line. The
    scenario's optional ``on_rollback`` hook is called last.

    Returns:
        str: The message produced by the service's ``rollback()``, or an
        explanatory message when the service is unknown.
    """
    svc = self.services.get(service_name)
    if not svc:
        return f"No service named '{service_name}'"
    self.pipeline_stage = PipelineStage.ROLLED_BACK

    # Capture the version and the dependents before the rollback runs; the
    # messages below refer to the version dependents were built against.
    version_before = svc.current_version
    affected = self._get_dependents(service_name)
    outcome = svc.rollback()

    for dependent in affected:
        dependent.error_rate = round(dependent.error_rate + 5.0, 2)
        if dependent.health == ServiceHealth.HEALTHY and dependent.error_rate > 3.0:
            dependent.health = ServiceHealth.DEGRADED

        alert_text = (
            f"Rollback impact: {dependent.name} depends on {service_name} "
            f"{version_before}. Rollback may break {dependent.name}. "
            f"Error rate increased to {dependent.error_rate:.1f}/s."
        )
        self.alerts.append(AlertInfo(
            severity="warning",
            message=alert_text,
            service_name=dependent.name,
            timestamp="2026-04-01T12:00:00Z",
        ))
        dependent.logs.append(
            f"[ROLLBACK-IMPACT] {service_name} rolled back from {version_before} β "
            f"{dependent.name} error_rate increased to {dependent.error_rate:.1f}/s. "
            f"Dependency on {version_before} APIs may be broken."
        )

    if hasattr(self.scenario, 'on_rollback'):
        self.scenario.on_rollback(self, service_name)
    return outcome
|
| 610 |
-
|
| 611 |
-
def _approve(self, reason):
    """Mark the pipeline DEPLOYED and return a confirmation quoting ``reason``."""
    self.pipeline_stage = PipelineStage.DEPLOYED
    stated_reason = reason or 'No reason given.'
    return f"Deployment APPROVED. Reason: {stated_reason}"
|
| 614 |
-
|
| 615 |
-
def _abort(self, reason):
    """Mark the pipeline FAILED and return a confirmation quoting ``reason``."""
    self.pipeline_stage = PipelineStage.FAILED
    stated_reason = reason or 'No reason given.'
    return f"Deployment ABORTED. Reason: {stated_reason}"
|
| 618 |
-
|
| 619 |
-
# --- State queries --------------------------------------------------------
|
| 620 |
-
|
| 621 |
-
def snapshot(self):
    """Capture the state that reward calculation compares between steps.

    Returns:
        dict: Per-service metrics and flags (with a copy of each config), the
        aggregate system health, the pipeline stage value, and copies of the
        pending-migration and alert lists.
    """
    per_service = {}
    for name, svc in self.services.items():
        per_service[name] = {
            "health": svc.health.value,
            "error_rate": svc.error_rate,
            "latency_ms": svc.latency_ms,
            "prod_deployed": svc.prod_deployed,
            "staging_verified": svc.staging_verified,
            # Copied so later config edits do not rewrite this snapshot.
            "config": dict(svc.config),
        }

    return {
        "services": per_service,
        "system_health": self.get_system_health(),
        "pipeline_stage": self.pipeline_stage.value,
        "migrations_pending": list(self.migrations_pending),
        "alerts": list(self.alerts),
    }
|
| 640 |
-
|
| 641 |
-
def get_system_health(self):
    """Return the mean per-service health score (0-100); 100.0 with no services."""
    service_count = len(self.services)
    if service_count == 0:
        return 100.0

    # Plain left-to-right accumulation keeps float results reproducible.
    running_total = 0.0
    for svc in self.services.values():
        running_total += svc._get_health_pct()
    return running_total / service_count
|
| 649 |
-
|
| 650 |
-
def get_service_statuses(self):
    """Return one status object per service, in registry order."""
    statuses = []
    for svc in self.services.values():
        statuses.append(svc.to_status())
    return statuses
|
| 652 |
-
|
| 653 |
-
def get_pipeline_status(self):
    """Build the ``PipelineStatus`` model for the current pipeline run.

    An empty ``build_logs`` string is reported as ``None``.
    """
    fields = {
        "stage": self.pipeline_stage,
        "triggered_by": self.triggered_by,
        "started_at": self.started_at,
        "commit_sha": self.commit_sha,
        "build_logs_snippet": self.build_logs or None,
        "test_pass_count": self.test_pass,
        "test_fail_count": self.test_fail,
    }
    return PipelineStatus(**fields)
|
| 663 |
-
|
| 664 |
-
def get_migration_status(self):
    """Build the ``MigrationStatus`` model.

    ``last_applied`` is the most recently applied migration, or ``None`` when
    none has been applied; ``migration_errors`` is ``None`` when the error
    list is empty.
    """
    applied = self.migrations_applied
    latest = applied[-1] if applied else None
    return MigrationStatus(
        pending_migrations=list(self.migrations_pending),
        last_applied=latest,
        migration_errors=self.migration_errors or None,
    )
|
| 670 |
-
|
| 671 |
-
def get_alerts(self):
    """Return a copy of the recorded alerts, oldest first."""
    return [*self.alerts]
|
| 673 |
-
|
| 674 |
-
def get_service_names(self):
    """Return the registered service names as a new list, in registry order."""
    return [name for name in self.services]
|
| 676 |
-
|
| 677 |
-
def has_services(self):
    """Return True when at least one service is registered."""
    return len(self.services) != 0
|
| 679 |
-
|
| 680 |
-
def has_pending_migrations(self):
    """Return True when at least one migration is still pending."""
    return len(self.migrations_pending) != 0
|
| 682 |
-
|
| 683 |
-
def _apply_time_pressure(self):
    """Make the active incident worse by one step, depending on the task.

    * ``judgment_call``: a DEGRADED api-gateway gains latency, errors and CPU.
    * ``broken_pipeline``: cache-service degrades while its config error
      persists; api-gateway latency grows while the
      ``add_index_users_email`` migration is pending.
    * ``capacity_crisis``: database-primary gains CPU and latency while its
      config error persists, and a DEGRADED api-gateway degrades with it.
    * ``random_incident``: the scenario's ``failing_service`` (if any)
      degrades further while it is DEGRADED.

    Any other task is left untouched. Random draws happen in a fixed order so
    seeded runs stay reproducible.
    """
    task = self.scenario.task_name
    rng = self._rng

    if task == "judgment_call":
        gateway = self.services.get("api-gateway")
        if gateway and gateway.health == ServiceHealth.DEGRADED:
            extra_latency = 80 * rng.uniform(0.8, 1.2)
            extra_errors = 0.8 * rng.uniform(0.9, 1.1)
            gateway.latency_ms = round(min(gateway.latency_ms + extra_latency, 5000), 1)
            gateway.error_rate = round(min(gateway.error_rate + extra_errors, 50.0), 2)
            gateway.cpu_percent = min(gateway.cpu_percent + 1, 99)
            gateway.logs.append(
                f"[DEGRADING] api-gateway latency now {gateway.latency_ms:.0f}ms, "
                f"errors {gateway.error_rate:.1f}/s β situation worsening"
            )
        return

    if task == "broken_pipeline":
        # cache-service keeps degrading while its config error persists.
        cache = self.services.get("cache-service")
        if cache and self.scenario.check_config_error("cache-service", cache.config):
            health_drop = 3.0 * rng.uniform(0.8, 1.2)
            cache.error_rate = round(min(cache.error_rate + health_drop * 0.5, 25.0), 2)
            cache.latency_ms = round(min(cache.latency_ms + 30.0 * rng.uniform(0.8, 1.2), 2000.0), 1)
            if cache.error_rate > 3.0 and cache.health == ServiceHealth.HEALTHY:
                cache.health = ServiceHealth.DEGRADED
            cache.logs.append(
                f"[DEGRADING] cache-service using staging Redis β "
                f"error_rate now {cache.error_rate:.1f}/s, "
                f"latency {cache.latency_ms:.0f}ms"
            )

        # api-gateway slows down until the index migration is applied.
        gateway = self.services.get("api-gateway")
        if gateway and "add_index_users_email" in self.migrations_pending:
            extra_latency = 50.0 * rng.uniform(0.8, 1.2)
            gateway.latency_ms = round(min(gateway.latency_ms + extra_latency, 2000.0), 1)
            gateway.logs.append(
                f"[DEGRADING] api-gateway missing index β "
                f"user query latency now {gateway.latency_ms:.0f}ms"
            )
        return

    if task == "capacity_crisis":
        database = self.services.get("database-primary")
        gateway = self.services.get("api-gateway")
        # Pressure applies only while the database config error persists.
        if database and self.scenario.check_config_error("database-primary", database.config):
            database.cpu_percent = min(database.cpu_percent + 2, 99)
            database.latency_ms = round(database.latency_ms + 15, 1)
        # api-gateway degrades only while that same bottleneck persists.
        if (gateway and gateway.health == ServiceHealth.DEGRADED
                and database
                and self.scenario.check_config_error("database-primary", database.config)):
            gateway.latency_ms = round(min(gateway.latency_ms + 30, 5000), 1)
            gateway.error_rate = round(min(gateway.error_rate + 0.5, 50.0), 2)
        return

    if task == "random_incident":
        failing_name = getattr(self.scenario, 'failing_service', None)
        if failing_name:
            failing = self.services.get(failing_name)
            if failing and failing.health == ServiceHealth.DEGRADED:
                failing.error_rate = round(min(failing.error_rate + 0.5, 50.0), 2)
                failing.latency_ms = round(min(failing.latency_ms + 30, 5000), 1)
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Simulation engine for the DevOps Pipeline Environment."""
|
| 8 |
+
|
| 9 |
+
import random
|
| 10 |
+
|
| 11 |
+
from devops_pipeline_env.models import (
|
| 12 |
+
ActionType,
|
| 13 |
+
AlertInfo,
|
| 14 |
+
ConfigEdit,
|
| 15 |
+
MigrationStatus,
|
| 16 |
+
PipelineAction,
|
| 17 |
+
PipelineStage,
|
| 18 |
+
PipelineStatus,
|
| 19 |
+
ServiceHealth,
|
| 20 |
+
ServiceStatus,
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class ServiceState:
|
| 25 |
+
"""State machine for a single microservice."""
|
| 26 |
+
|
| 27 |
+
def __init__(self, name, version, health, config, dependencies,
             latency_ms, error_rate, cpu, memory, rng=None):
    """Initialise the simulated state of one microservice.

    Args:
        name: Service name.
        version: Version currently running.
        health: Initial health state.
        config: Initial config mapping; copied, so the caller's dict is not
            shared.
        dependencies: Names of services this one depends on; ``None`` or
            empty means no dependencies. Copied into a new list.
        latency_ms: Initial request latency in milliseconds.
        error_rate: Initial error rate.
        cpu: Initial CPU percentage.
        memory: Initial memory percentage.
        rng: Random generator to use; a ``random.Random(0)`` is created when
            none is given.
    """
    # Identity and versions.
    self.name = name
    self.current_version = version
    self.target_version = None

    # Health and live metrics.
    self.health = health
    self.latency_ms = latency_ms
    self.error_rate = error_rate
    self.cpu_percent = cpu
    self.memory_percent = memory
    self.active_connections = 100

    # Private copies of the caller's collections.
    self.config = dict(config)
    self.dependencies = list(dependencies) if dependencies else []

    # Deployment progress.
    self.staging_deployed = False
    self.staging_verified = False
    self.prod_deployed = False
    self.last_deploy_timestamp = "2026-04-01T00:00:00Z"

    self.logs = []
    self._rng = rng or random.Random(0)

    # Staged health recovery: 0 = fully recovered, >0 = steps still to go.
    self._recovery_steps_remaining = 0
    self._recovery_target_latency = 0.0
    self._recovery_target_error_rate = 0.0
|
| 50 |
+
|
| 51 |
+
def deploy_to_staging(self, version, scenario):
    """Deploy ``version`` to staging and run the simulated health check.

    Three outcomes are possible:

    * Transient failure (8% chance; only for a HEALTHY, not-yet-verified
      service outside the ``clean_deploy`` task): nothing is verified and
      ``staging_deployed`` is reset, so the retry that the message asks for
      goes through staging again. Leaving the flag set would make the engine
      route the retry to production, where it is treated as an unverified
      deploy.
    * Config error reported by ``scenario.check_config_error``: the service
      becomes DEGRADED with elevated errors/latency and stays unverified.
      ``staging_deployed`` stays set; the config fix path in the engine is
      what clears it.
    * Success: the service is HEALTHY and ``staging_verified`` is set.

    Args:
        version: Version being deployed; stored as ``target_version``.
        scenario: Object providing ``check_config_error(name, config)``.

    Returns:
        str: Human-readable result of the staging deploy.
    """
    self.staging_deployed = True
    self.target_version = version

    # Always consume one draw, even when the roll cannot apply, so seeded
    # runs stay aligned regardless of task or health.
    transient_roll = self._rng.random()
    is_clean_deploy = getattr(self, '_task_name', None) == "clean_deploy"
    if (not is_clean_deploy and not self.staging_verified
            and self.health == ServiceHealth.HEALTHY and transient_roll < 0.08):
        # The health check never completed: allow a genuine staging retry.
        self.staging_deployed = False
        self.logs.append(
            f"[DEPLOY] Deployed {self.name} {version} to staging. "
            f"Transient failure: health check timed out. Retry should succeed."
        )
        return (
            f"Deployed {self.name} {version} to staging. "
            f"WARNING: Transient health check timeout. Try deploying again."
        )

    if scenario.check_config_error(self.name, self.config):
        self.health = ServiceHealth.DEGRADED
        lat_mult = self._rng.uniform(0.8, 1.2)
        err_mult = self._rng.uniform(0.9, 1.1)
        self.error_rate = round(12.0 * err_mult, 2)
        self.latency_ms = round(300.0 * lat_mult, 1)
        self.logs.append(
            f"[DEPLOY] Deployed {self.name} {version} to staging. "
            f"WARNING: Health check DEGRADED. Error rate elevated "
            f"({self.error_rate:.1f}/s, latency {self.latency_ms:.0f}ms)."
        )
        return (
            f"Deployed {self.name} {version} to staging. "
            f"WARNING: Health check degraded. Error rate elevated."
        )

    self.health = ServiceHealth.HEALTHY
    self.staging_verified = True
    lat_mult = self._rng.uniform(0.8, 1.2)
    self.error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
    self.latency_ms = round(45.0 * lat_mult, 1)
    self.logs.append(
        f"[DEPLOY] Deployed {self.name} {version} to staging. Health check: PASSED."
    )
    return f"Deployed {self.name} {version} to STAGING. Staging verified. Deploy same service+version again to PROMOTE TO PRODUCTION."
|
| 95 |
+
|
| 96 |
+
def deploy_to_production(self, version):
    """Promote ``version`` to production and start its stabilisation period.

    Without staging verification the service is marked DEGRADED with high
    error rate and latency, and the method returns early (``prod_deployed``
    and ``current_version`` are not touched).

    Otherwise the service becomes HEALTHY on the new version and a staged
    recovery of 1-3 steps is scheduled toward targets that depend on a
    "deploy quality" roll: 70% clean, 20% minor issues (latency target x1.5,
    error target x3), 10% unstable (latency target x2.5, error target x8,
    plus an immediate +1.5 error-rate bump). Metrics start above their
    targets and a warmup spike is added to CPU and latency; the spike is
    smaller for the ``clean_deploy`` task.

    Args:
        version: Version being promoted.

    Returns:
        str: Human-readable result of the deploy.
    """
    if not self.staging_verified:
        self.health = ServiceHealth.DEGRADED
        lat_mult = self._rng.uniform(0.8, 1.2)
        err_mult = self._rng.uniform(0.9, 1.1)
        self.error_rate = round(25.0 * err_mult, 2)
        self.latency_ms = round(500.0 * lat_mult, 1)
        self.logs.append(
            f"[DEPLOY] Deployed {self.name} {version} to production "
            f"WITHOUT staging verification. High risk."
        )
        return (
            f"Deployed {self.name} {version} to production "
            f"WITHOUT staging verification. High risk."
        )

    self.prod_deployed = True
    self.current_version = version

    # Staged recovery: takes 1-3 steps to fully stabilise. The order of the
    # draws below is part of the seeded behaviour.
    recovery_steps = self._rng.randint(1, 3)
    self._recovery_steps_remaining = recovery_steps
    base_latency = 45.0 * self._rng.uniform(0.8, 1.2)
    base_error_rate = 0.1 * self._rng.uniform(0.9, 1.1)

    # Non-linear deploy quality: same seed = same outcome.
    quality_roll = self._rng.random()
    deploy_note = ""
    instability_bump = 0.0
    if quality_roll < 0.7:
        pass  # clean deploy: the base targets are already good
    elif quality_roll < 0.9:
        # Minor issues: recovers to good but not perfect.
        base_latency *= 1.5
        base_error_rate *= 3.0
        deploy_note = " Minor post-deploy issues detected."
        self.logs.append(
            f"[DEPLOY] {self.name}: Minor post-deploy issues detected. "
            f"Performance slightly below optimal."
        )
    else:
        # Unstable deploy: recovers poorly and starts with extra errors.
        base_latency *= 2.5
        base_error_rate *= 8.0
        # Applied when the starting error rate is computed below; adding it
        # to self.error_rate here would be overwritten by that assignment.
        instability_bump = 1.5
        deploy_note = " Post-deploy instability detected."
        self.logs.append(
            f"[DEPLOY] {self.name}: Post-deploy instability detected. "
            f"Elevated error rate."
        )

    self._recovery_target_latency = round(base_latency, 1)
    self._recovery_target_error_rate = round(base_error_rate, 3)

    # Start above the targets; tick_recovery() walks the metrics down.
    self.health = ServiceHealth.HEALTHY
    self.latency_ms = round(base_latency * (1.0 + 0.3 * recovery_steps), 1)
    self.error_rate = round(
        base_error_rate * (1.0 + 0.5 * recovery_steps) + instability_bump, 3
    )

    # Trade-off: a deploy causes a temporary CPU/latency spike (warmup load).
    # clean_deploy tasks get a reduced spike.
    if getattr(self, '_task_name', None) == "clean_deploy":
        self.cpu_percent = min(self.cpu_percent + 3, 99)
        self.latency_ms += round(30 * self._rng.uniform(0.8, 1.2), 1)
    else:
        self.cpu_percent = min(self.cpu_percent + 15, 99)
        self.latency_ms += round(200 * self._rng.uniform(0.8, 1.2), 1)

    self.last_deploy_timestamp = "2026-04-01T12:00:00Z"
    self.logs.append(
        f"[DEPLOY] Promoted {self.name} {version} to production. Health: HEALTHY. "
        f"Stabilizing over ~{recovery_steps} step(s). CPU/latency spike from warmup."
    )
    return (
        f"Promoted {self.name} {version} to production. Health: HEALTHY. "
        f"Deployed successfully. Service under warmup load β temporary CPU/latency spike expected."
        f"{deploy_note}"
    )
|
| 170 |
+
|
| 171 |
+
def tick_recovery(self):
    """Advance staged health recovery by one step.

    Does nothing when no recovery is in progress. Otherwise the
    remaining-step counter is decremented. On the final step latency and
    error rate are set to their recovery targets, and a DEGRADED service
    whose error rate is then below 5.0 is marked HEALTHY. On earlier steps
    both metrics move part of the way toward their targets.
    """
    if self._recovery_steps_remaining <= 0:
        return

    self._recovery_steps_remaining -= 1
    remaining = self._recovery_steps_remaining

    if remaining == 0:
        # Fully recovered: snap to the targets.
        self.latency_ms = self._recovery_target_latency
        self.error_rate = self._recovery_target_error_rate
        if self.health == ServiceHealth.DEGRADED and self.error_rate < 5.0:
            self.health = ServiceHealth.HEALTHY
        return

    # Interpolate toward the target. The fraction closed this step grows as
    # fewer steps remain (1/2 with one step left, 1/3 with two, ...).
    progress = 1.0 - (remaining / (remaining + 1))
    self.latency_ms = round(
        self.latency_ms + (self._recovery_target_latency - self.latency_ms) * progress, 1
    )
    self.error_rate = round(
        self.error_rate + (self._recovery_target_error_rate - self.error_rate) * progress, 3
    )
|
| 190 |
+
|
| 191 |
+
def rollback(self):
    """Roll the service back and reset it to baseline metrics.

    Marks the service HEALTHY, draws fresh baseline latency and error-rate
    values from the service RNG, clears the staging flags and cancels any
    staged recovery. With probability 0.25 the rollback "reintroduces a
    known bug" and 3.0 is added to the error rate.

    This method does not modify ``current_version``; the messages report
    whatever value it holds when the method is called.

    Returns:
        A human-readable result string, with a warning appended when the
        regression branch was taken.
    """
    self.health = ServiceHealth.HEALTHY
    # Draw order (latency, error, regression roll) is kept fixed so seeded
    # episodes stay reproducible.
    lat_mult = self._rng.uniform(0.8, 1.2)
    err_mult = self._rng.uniform(0.9, 1.1)
    self.error_rate = round(0.5 * err_mult, 3)
    self.latency_ms = round(50.0 * lat_mult * 0.7, 1)
    self.staging_deployed = False
    self.staging_verified = False
    self.prod_deployed = True  # still in prod, just rolled back
    self._recovery_steps_remaining = 0

    # Trade-off: 25% chance rollback reintroduces a known bug.
    regression = self._rng.random() < 0.25
    if regression:
        self.error_rate = round(self.error_rate + 3.0, 2)
        self.logs.append(
            f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. "
            f"Warning: rollback may have reintroduced known issue from previous version"
        )
    else:
        self.logs.append(
            f"[ROLLBACK] Rolled back {self.name} to {self.current_version}. Service healthy."
        )

    result = f"Rolled back {self.name} to {self.current_version}. Rolled back. Monitoring for regression..."
    if regression:
        result += f" WARNING: Error rate elevated ({self.error_rate:.1f}/s) — possible regression."
    return result
|
| 219 |
+
|
| 220 |
+
def set_config(self, key, value):
    """Set one config value and apply the simulated restart cost.

    Args:
        key: Config key to write.
        value: New value stored under ``key``.

    Returns:
        A human-readable description of the change. The previous value is
        shown as ``<not set>`` when the key did not exist.
    """
    old = self.config.get(key, "<not set>")
    self.config[key] = value
    # Trade-off: a config change causes a brief restart spike.
    self.latency_ms += round(100 * self._rng.uniform(0.8, 1.2), 1)
    self.cpu_percent = min(self.cpu_percent + 5, 99)
    self.logs.append(f"[CONFIG] {self.name}: {key} changed from '{old}' to '{value}'. Service restarting.")
    return (
        f"Config {self.name}: {key} changed from '{old}' to '{value}'. "
        f"Config updated. Service restarting — brief latency spike."
    )
|
| 229 |
+
|
| 230 |
+
def get_config_snapshot(self):
    """Return a shallow copy of the service config, safe for callers to mutate."""
    return dict(self.config)
|
| 232 |
+
|
| 233 |
+
def get_logs(self):
    """Return a copy of the service log lines, oldest first."""
    return list(self.logs)
|
| 235 |
+
|
| 236 |
+
def _get_health_pct(self):
    """Get numeric health percentage (0-100) for this service.

    Starts from a base of 0 (DOWN), 50 (DEGRADED) or 100 (any other state),
    then subtracts an error penalty (2 points per unit of error rate, capped
    at 30) and a latency penalty (1 point per 10 ms above 200 ms, capped at
    30). The result is floored at 0.
    """
    if self.health == ServiceHealth.DOWN:
        h = 0.0
    elif self.health == ServiceHealth.DEGRADED:
        h = 50.0
    else:
        h = 100.0

    h -= min(self.error_rate * 2, 30)
    if self.latency_ms > 200:
        h -= min((self.latency_ms - 200) / 10, 30)
    return max(h, 0.0)
|
| 247 |
+
|
| 248 |
+
def to_status(self):
    """Build a ``ServiceStatus`` record from the service's current metrics."""
    return ServiceStatus(
        name=self.name,
        health=self.health,
        current_version=self.current_version,
        cpu_percent=self.cpu_percent,
        memory_percent=self.memory_percent,
        error_rate=self.error_rate,
        request_latency_ms=self.latency_ms,
        active_connections=self.active_connections,
        last_deploy_timestamp=self.last_deploy_timestamp,
    )
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
class PipelineEngine:
    """Manages all services, pipeline state, migrations, alerts.

    The engine owns a seeded ``random.Random`` that it shares with every
    service, so all stochastic effects in an episode are reproducible from
    the seed. For that reason the order of RNG draws inside each method must
    not be changed.
    """

    def __init__(self, scenario, seed):
        """Create the engine and let ``scenario`` populate it.

        Args:
            scenario: Scenario object; its ``setup(engine)`` is called to fill
                in services, migrations, etc., and its ``task_name`` is
                copied onto every service.
            seed: Seed for the engine's shared random generator.
        """
        self.scenario = scenario
        self._rng = random.Random(seed)
        self.services = {}
        self.pipeline_stage = PipelineStage.IDLE
        self.migrations_pending = []
        self.migrations_applied = []
        self.migration_errors = []
        self.alerts = []
        self.commit_sha = "abc123"
        self.triggered_by = "deploy-bot"
        self.started_at = "2026-04-01T10:00:00Z"
        self.test_pass = 0
        self.test_fail = 0
        self.build_logs = ""
        self._time_pressure = False  # Set by scenario if needed

        # Initialize from scenario
        scenario.setup(self)

        # Inject the shared RNG and task name into all services created by the scenario
        for svc in self.services.values():
            svc._rng = self._rng
            svc._task_name = scenario.task_name

    def execute(self, action):
        """Execute an action. Returns human-readable result string.

        Order of operations per step: (1) tick staged recovery on every
        service, (2) run the agent's action, (3) apply environmental effects
        (time pressure, cascades, metric compounding, tipping points).
        """
        # 1. Tick health recovery for all services (heal from previous deploys)
        for svc in self.services.values():
            svc.tick_recovery()

        # 2. Execute the agent's action FIRST
        kind = action.action_type
        if kind == ActionType.VIEW_PIPELINE:
            result = self._view_pipeline()
        elif kind == ActionType.VIEW_LOGS:
            result = self._view_logs(action.service_name)
        elif kind == ActionType.VIEW_CONFIG:
            result = self._view_config(action.service_name)
        elif kind == ActionType.EDIT_CONFIG:
            result = self._edit_config(action.service_name, action.config_edits)
        elif kind == ActionType.RUN_MIGRATION:
            result = self._run_migration(action.migration_name, action.migration_type)
        elif kind == ActionType.DEPLOY:
            result = self._deploy(action.service_name, action.target_version)
        elif kind == ActionType.ROLLBACK:
            result = self._rollback(action.service_name)
        elif kind == ActionType.APPROVE:
            result = self._approve(action.reason)
        elif kind == ActionType.ABORT:
            result = self._abort(action.reason)
        else:
            result = "Unknown action."

        # 3. Environmental effects AFTER action (agent sees consequences)
        if self._time_pressure:
            self._apply_time_pressure()
        self._tick_cascading_effects()
        self._tick_metric_compounding()
        self._tick_tipping_points()

        return result

    # --- Cross-metric compounding ---------------------------------------------

    def _tick_metric_compounding(self):
        """Metrics compound on each other — creates realistic spirals and recovery.

        Skipped entirely for the ``clean_deploy`` task. Each rule reads the
        value the previous rule may just have written, so the rule order
        matters.
        """
        if self.scenario.task_name == "clean_deploy":
            return
        for svc in self.services.values():
            # Degradation spirals (moderate — should not kill episodes in <5 steps)
            if svc.error_rate > 15.0:
                svc.cpu_percent = min(svc.cpu_percent + 3, 99)
            if svc.cpu_percent > 90:
                svc.latency_ms = round(min(svc.latency_ms + 100, 5000), 1)
            if svc.latency_ms > 3000:
                svc.error_rate = round(min(svc.error_rate + 1.0, 50.0), 2)

            # Natural recovery (when metrics are good, they help each other)
            if svc.error_rate < 2.0:
                svc.cpu_percent = max(svc.cpu_percent - 3, 10)
            if svc.cpu_percent < 50:
                svc.latency_ms = round(max(svc.latency_ms - 50, 20), 1)
            if svc.latency_ms < 200 and svc.error_rate < 1.0:
                svc.error_rate = round(max(svc.error_rate - 0.5, 0.0), 2)

    # --- Non-linear tipping points -------------------------------------------

    def _tick_tipping_points(self):
        """Non-linear tipping points — systems cliff instead of degrading linearly.

        Skipped entirely for the ``clean_deploy`` task.
        """
        if self.scenario.task_name == "clean_deploy":
            return
        for svc in self.services.values():
            # CPU cliff: above 85%, errors grow with the overflow
            if svc.cpu_percent > 85:
                overflow = svc.cpu_percent - 85
                svc.error_rate = round(min(svc.error_rate + overflow * 0.2, 50.0), 2)

            # Latency cliff: above 2000ms = rapid collapse
            if svc.latency_ms > 2000:
                svc.error_rate = round(min(svc.error_rate + 3.0, 50.0), 2)

            # Health cliff: below 30% health = accelerating death spiral.
            # Same formula as the per-service health percentage, evaluated on
            # the metrics as just updated above.
            if svc.health == ServiceHealth.DEGRADED:
                base = 50.0
            elif svc.health == ServiceHealth.HEALTHY:
                base = 100.0
            else:
                base = 0.0
            err_penalty = min(svc.error_rate * 2, 30)
            lat_penalty = min(max(0, svc.latency_ms - 200) / 10, 30)
            health_pct = max(0, base - err_penalty - lat_penalty)
            if health_pct < 30:
                svc.error_rate = round(min(svc.error_rate * 1.3, 50.0), 2)

            # Latency -> CPU feedback (high latency = retries = more CPU)
            if svc.latency_ms > 1500:
                svc.cpu_percent = min(svc.cpu_percent + 3, 99)

    # --- Cascading failures ---------------------------------------------------

    def _get_dependents(self, service_name):
        """Find all services that list service_name in their dependencies."""
        return [
            svc for svc in self.services.values()
            if service_name in svc.dependencies
        ]

    def _tick_cascading_effects(self):
        """Unhealthy services degrade their dependents each step.

        A source service below 50% health raises the error rate and latency
        of each dependent that is not DOWN (more strongly when the source is
        below 20%). Afterwards, healthy low-error services pull the metrics
        of their DEGRADED dependents down by 10%.
        """
        for svc in self.services.values():
            health_pct = svc._get_health_pct()
            if health_pct >= 50.0:
                continue  # healthy enough, no cascade

            for dep in self._get_dependents(svc.name):
                if dep.health == ServiceHealth.DOWN:
                    continue  # already down, can't get worse from cascade

                # Determine cascade severity
                if health_pct < 20.0:
                    # Source is effectively down — moderate cascade
                    err_increase = 1.5
                    lat_increase = 30.0
                else:
                    # Source is degraded — lighter cascade
                    err_increase = 0.5
                    lat_increase = 10.0

                old_err = dep.error_rate
                dep.error_rate = round(min(dep.error_rate + err_increase, 45.0), 2)
                dep.latency_ms = round(min(dep.latency_ms + lat_increase, 4500.0), 1)

                # If error rate gets high enough, mark as degraded
                if dep.error_rate > 5.0 and dep.health == ServiceHealth.HEALTHY:
                    dep.health = ServiceHealth.DEGRADED

                # Floor: cascading alone can't push health below 5%
                # (prevent instant death spirals)
                if dep._get_health_pct() < 5.0:
                    dep.error_rate = round(max(old_err, dep.error_rate - err_increase + 1.0), 2)

                # Add a cascade alert once per source->dependent pair. Alerts
                # are never removed by this class, so the key check suppresses
                # repeats for as long as the first alert stays in the list.
                cascade_alert_key = f"cascade:{svc.name}->{dep.name}"
                if not any(cascade_alert_key in a.message for a in self.alerts):
                    self.alerts.append(AlertInfo(
                        severity="warning",
                        message=(
                            f"Cascading: {svc.name} (health {health_pct:.0f}%) is degrading "
                            f"{dep.name} — error_rate +{err_increase}/s, latency +{lat_increase:.0f}ms "
                            f"[{cascade_alert_key}]"
                        ),
                        service_name=dep.name,
                        timestamp="2026-04-01T12:00:00Z",
                    ))

                dep.logs.append(
                    f"[CASCADE] Upstream {svc.name} unhealthy (health {health_pct:.0f}%) — "
                    f"{dep.name} error_rate now {dep.error_rate:.1f}/s, "
                    f"latency {dep.latency_ms:.0f}ms"
                )

        # Recovery propagation: healthy services help their dependents recover
        for name, svc in self.services.items():
            if svc.health == ServiceHealth.HEALTHY and svc.error_rate < 2.0:
                for dep in self._get_dependents(name):
                    if dep.health == ServiceHealth.DEGRADED:
                        dep.error_rate = round(dep.error_rate * 0.9, 2)
                        dep.latency_ms = round(dep.latency_ms * 0.9, 1)

    # --- Action handlers ------------------------------------------------------

    def _view_pipeline(self):
        """Return a multi-line text summary of the pipeline and every service."""
        services_summary = "\n".join(
            f" {s.name}: {s.health.value} | v{s.current_version} -> "
            f"v{s.target_version or 'N/A'} | "
            f"latency={s.latency_ms:.0f}ms | errors={s.error_rate:.1f}/s"
            for s in self.services.values()
        )
        return (
            f"Pipeline Stage: {self.pipeline_stage.value}\n"
            f"Commit: {self.commit_sha}\n"
            f"Tests: {self.test_pass} passed, {self.test_fail} failed\n"
            f"Pending Migrations: {len(self.migrations_pending)}\n"
            f"Services:\n{services_summary}"
        )

    def _view_logs(self, service_name):
        """Return the last 20 log lines of a service, or a not-found message."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        logs = svc.get_logs()
        if not logs:
            return f"No logs available for {service_name}."
        return f"Logs for {service_name}:\n" + "\n".join(logs[-20:])

    def _view_config(self, service_name):
        """Return a service's config as ``key = value`` lines, or a not-found message."""
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        config = svc.get_config_snapshot()
        lines = [f" {k} = {v}" for k, v in config.items()]
        return f"Config for {service_name}:\n" + "\n".join(lines)

    def _edit_config(self, service_name, edits):
        """Apply config edits to a service and start recovery if they fix it.

        Args:
            service_name: Name of the service to edit.
            edits: Iterable of objects with ``key`` and ``value`` attributes;
                ``None`` is treated as no edits.

        Returns:
            Newline-joined result messages, or a not-found message.
        """
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"

        results = [svc.set_config(edit.key, edit.value) for edit in edits or []]

        # If the config error is now fixed and service was degraded, start
        # staged recovery (2 steps) instead of instant heal.
        # NOTE(review): the check is placed after all edits are applied;
        # confirm against the upstream file, whose indentation was lost.
        if svc.health == ServiceHealth.DEGRADED and not self.scenario.check_config_error(service_name, svc.config):
            svc.staging_deployed = False
            svc.staging_verified = False
            # Immediate PARTIAL improvement
            svc.error_rate = round(svc.error_rate * 0.5, 2)
            svc.latency_ms = round(svc.latency_ms * 0.6, 1)
            # Set up 2-step recovery to full health (reuse tick_recovery pattern)
            svc._recovery_steps_remaining = 2
            svc._recovery_target_latency = round(50.0 * self._rng.uniform(0.8, 1.2), 1)
            svc._recovery_target_error_rate = round(0.1 * self._rng.uniform(0.9, 1.1), 3)
            # Don't set health to HEALTHY yet — let tick_recovery handle it
            # once error_rate drops below threshold on next steps
            results.append(
                f"Config fix detected for {service_name}. Service improving — "
                f"full recovery in ~2 steps. Ready for re-deploy."
            )
        return "\n".join(results)

    def _run_migration(self, migration_name, migration_type):
        """Run a pending migration through the scenario and record the outcome.

        ``migration_type`` is accepted for interface compatibility but is not
        used by this method.
        """
        if migration_name not in self.migrations_pending:
            return (
                f"Migration '{migration_name}' not found in pending: "
                f"{self.migrations_pending}"
            )
        if self.scenario.run_migration(self, migration_name):
            self.migrations_pending.remove(migration_name)
            self.migrations_applied.append(migration_name)
            return f"Migration '{migration_name}' applied successfully."

        error = f"Migration '{migration_name}' FAILED."
        self.migration_errors.append(error)
        return error

    def _deploy(self, service_name, target_version):
        """Deploy a service to staging (first call) or production (afterwards).

        The deploy is refused while blocking migrations are pending, and has
        a 50% chance of failing for each dependency below 50% health.
        """
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"

        # Check migration dependencies
        if self.migrations_pending and self.scenario.migration_blocks_deploy(service_name):
            return (
                f"BLOCKED: Pending migrations must be applied before deploying "
                f"{service_name}. Pending: {self.migrations_pending}"
            )

        # Check if any dependency is unhealthy — 50% chance of deploy failure.
        # The RNG is only consulted for unhealthy dependencies.
        for dep_name in svc.dependencies:
            dep_svc = self.services.get(dep_name)
            if not dep_svc:
                continue
            dep_health = dep_svc._get_health_pct()
            if dep_health < 50.0 and self._rng.random() < 0.5:
                svc.logs.append(
                    f"[DEPLOY] Deploy {svc.name} {target_version} FAILED — "
                    f"dependency {dep_name} is unhealthy "
                    f"(health {dep_health:.0f}%). Retry may succeed."
                )
                return (
                    f"DEPLOY UNSTABLE: Dependency {dep_name} is unhealthy "
                    f"(health {dep_health:.0f}%). "
                    f"Deploy of {service_name} failed. Retry may succeed."
                )

        # Determine target environment
        if not svc.staging_deployed:
            self.pipeline_stage = PipelineStage.STAGING
            return svc.deploy_to_staging(target_version, self.scenario)

        self.pipeline_stage = PipelineStage.DEPLOYING
        result = svc.deploy_to_production(target_version)
        # Notify scenario of deploy (for cascading effects)
        if hasattr(self.scenario, 'on_prod_deploy'):
            extra = self.scenario.on_prod_deploy(self, service_name, target_version)
            if extra:
                result += "\n" + extra
        # Check if all target services deployed
        if all(s.prod_deployed for s in self.services.values() if s.target_version):
            self.pipeline_stage = PipelineStage.DEPLOYED
        return result

    def _rollback(self, service_name):
        """Roll a service back and penalise every service that depends on it.

        Each dependent gets +5.0 error rate, may become DEGRADED, and receives
        a warning alert and a log line.
        """
        svc = self.services.get(service_name)
        if not svc:
            return f"No service named '{service_name}'"
        self.pipeline_stage = PipelineStage.ROLLED_BACK

        # Check if dependents rely on current version's APIs
        old_version = svc.current_version
        dependents = self._get_dependents(service_name)
        result = svc.rollback()

        # Warn about dependent services and increase their error rates
        for dep in dependents:
            dep.error_rate = round(dep.error_rate + 5.0, 2)
            if dep.health == ServiceHealth.HEALTHY and dep.error_rate > 3.0:
                dep.health = ServiceHealth.DEGRADED
            self.alerts.append(AlertInfo(
                severity="warning",
                message=(
                    f"Rollback impact: {dep.name} depends on {service_name} "
                    f"{old_version}. Rollback may break {dep.name}. "
                    f"Error rate increased to {dep.error_rate:.1f}/s."
                ),
                service_name=dep.name,
                timestamp="2026-04-01T12:00:00Z",
            ))
            dep.logs.append(
                f"[ROLLBACK-IMPACT] {service_name} rolled back from {old_version} — "
                f"{dep.name} error_rate increased to {dep.error_rate:.1f}/s. "
                f"Dependency on {old_version} APIs may be broken."
            )

        if hasattr(self.scenario, 'on_rollback'):
            self.scenario.on_rollback(self, service_name)
        return result

    def _approve(self, reason):
        """Mark the pipeline DEPLOYED and return the approval message."""
        self.pipeline_stage = PipelineStage.DEPLOYED
        return f"Deployment APPROVED. Reason: {reason or 'No reason given.'}"

    def _abort(self, reason):
        """Mark the pipeline FAILED and return the abort message."""
        self.pipeline_stage = PipelineStage.FAILED
        return f"Deployment ABORTED. Reason: {reason or 'No reason given.'}"

    # --- State queries --------------------------------------------------------

    def snapshot(self):
        """Capture current state for reward calculation."""
        services = {
            name: {
                "health": s.health.value,
                "error_rate": s.error_rate,
                "latency_ms": s.latency_ms,
                "prod_deployed": s.prod_deployed,
                "staging_verified": s.staging_verified,
                "config": dict(s.config),
            }
            for name, s in self.services.items()
        }
        return {
            "services": services,
            "system_health": self.get_system_health(),
            "pipeline_stage": self.pipeline_stage.value,
            "migrations_pending": list(self.migrations_pending),
            "alerts": list(self.alerts),
        }

    def get_system_health(self):
        """Aggregate health 0-100: the mean service health, or 100.0 with no services."""
        if not self.services:
            return 100.0
        total = 0.0
        for svc in self.services.values():
            total += svc._get_health_pct()
        return total / len(self.services)

    def get_service_statuses(self):
        """Return one status record per service, in registration order."""
        return [s.to_status() for s in self.services.values()]

    def get_pipeline_status(self):
        """Build a ``PipelineStatus`` record from the current pipeline fields."""
        return PipelineStatus(
            stage=self.pipeline_stage,
            triggered_by=self.triggered_by,
            started_at=self.started_at,
            commit_sha=self.commit_sha,
            build_logs_snippet=self.build_logs if self.build_logs else None,
            test_pass_count=self.test_pass,
            test_fail_count=self.test_fail,
        )

    def get_migration_status(self):
        """Build a ``MigrationStatus`` record; list fields are copies."""
        return MigrationStatus(
            pending_migrations=list(self.migrations_pending),
            last_applied=self.migrations_applied[-1] if self.migrations_applied else None,
            migration_errors=list(self.migration_errors) if self.migration_errors else None,
        )

    def get_alerts(self):
        """Return a copy of the alert list."""
        return list(self.alerts)

    def get_service_names(self):
        """Return the service names in registration order."""
        return list(self.services.keys())

    def has_services(self):
        """Return True when at least one service is registered."""
        return len(self.services) > 0

    def has_pending_migrations(self):
        """Return True when at least one migration is still pending."""
        return len(self.migrations_pending) > 0

    def _apply_time_pressure(self):
        """During incidents, degraded services get worse each step.

        The effect depends on the scenario's task name; tasks not listed
        below are unaffected.
        """
        task = self.scenario.task_name

        if task == "judgment_call":
            api_gw = self.services.get("api-gateway")
            if api_gw and api_gw.health == ServiceHealth.DEGRADED:
                degrade_lat = 80 * self._rng.uniform(0.8, 1.2)
                degrade_err = 0.8 * self._rng.uniform(0.9, 1.1)
                api_gw.latency_ms = round(min(api_gw.latency_ms + degrade_lat, 5000), 1)
                api_gw.error_rate = round(min(api_gw.error_rate + degrade_err, 50.0), 2)
                api_gw.cpu_percent = min(api_gw.cpu_percent + 1, 99)
                api_gw.logs.append(
                    f"[DEGRADING] api-gateway latency now {api_gw.latency_ms:.0f}ms, "
                    f"errors {api_gw.error_rate:.1f}/s — situation worsening"
                )

        elif task == "broken_pipeline":
            # Cache-service degrades if config error persists
            cache = self.services.get("cache-service")
            if cache and self.scenario.check_config_error("cache-service", cache.config):
                health_drop = 3.0 * self._rng.uniform(0.8, 1.2)
                cache.error_rate = round(min(cache.error_rate + health_drop * 0.5, 25.0), 2)
                cache.latency_ms = round(min(cache.latency_ms + 30.0 * self._rng.uniform(0.8, 1.2), 2000.0), 1)
                if cache.error_rate > 3.0 and cache.health == ServiceHealth.HEALTHY:
                    cache.health = ServiceHealth.DEGRADED
                cache.logs.append(
                    f"[DEGRADING] cache-service using staging Redis — "
                    f"error_rate now {cache.error_rate:.1f}/s, "
                    f"latency {cache.latency_ms:.0f}ms"
                )

            # Api-gateway latency increases if migration not applied
            api_gw = self.services.get("api-gateway")
            if api_gw and "add_index_users_email" in self.migrations_pending:
                lat_increase = 50.0 * self._rng.uniform(0.8, 1.2)
                api_gw.latency_ms = round(min(api_gw.latency_ms + lat_increase, 2000.0), 1)
                api_gw.logs.append(
                    f"[DEGRADING] api-gateway missing index — "
                    f"user query latency now {api_gw.latency_ms:.0f}ms"
                )

        elif task == "capacity_crisis":
            db = self.services.get("database-primary")
            api_gw = self.services.get("api-gateway")
            # Time pressure only while connection pool bottleneck persists
            if db and self.scenario.check_config_error("database-primary", db.config):
                db.cpu_percent = min(db.cpu_percent + 2, 99)
                db.latency_ms = round(db.latency_ms + 15, 1)
            # api-gateway degrades only while db bottleneck persists
            if (api_gw and api_gw.health == ServiceHealth.DEGRADED
                    and db and self.scenario.check_config_error("database-primary", db.config)):
                api_gw.latency_ms = round(min(api_gw.latency_ms + 30, 5000), 1)
                api_gw.error_rate = round(min(api_gw.error_rate + 0.5, 50.0), 2)

        elif task == "random_incident":
            failing = getattr(self.scenario, 'failing_service', None)
            if failing:
                svc = self.services.get(failing)
                if svc and svc.health == ServiceHealth.DEGRADED:
                    svc.error_rate = round(min(svc.error_rate + 0.5, 50.0), 2)
                    svc.latency_ms = round(min(svc.latency_ms + 30, 5000), 1)
|
server/pipeline_environment.py
CHANGED
|
@@ -1,351 +1,351 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""DevOps Pipeline Environment Implementation."""
|
| 8 |
-
|
| 9 |
-
import os
|
| 10 |
-
from uuid import uuid4
|
| 11 |
-
|
| 12 |
-
from openenv.core.env_server.interfaces import Environment
|
| 13 |
-
from openenv.core.env_server.types import State
|
| 14 |
-
|
| 15 |
-
from devops_pipeline_env.models import (
|
| 16 |
-
ActionType,
|
| 17 |
-
PipelineAction,
|
| 18 |
-
PipelineObservation,
|
| 19 |
-
ServiceHealth,
|
| 20 |
-
ServiceStatus,
|
| 21 |
-
)
|
| 22 |
-
from server.pipeline_engine import PipelineEngine
|
| 23 |
-
from server.rewards import calculate_reward
|
| 24 |
-
from server.scenarios import load_scenario
|
| 25 |
-
|
| 26 |
-
# Deterministic seeds per task
|
| 27 |
-
TASK_SEEDS = {
    "clean_deploy": 1001,
    "broken_pipeline": 2002,
    "judgment_call": 3003,
    "cascading_failure": 4004,
    "capacity_crisis": 5005,
    "random_incident": 6006,
}

# Maximum number of steps per episode, keyed by task name.
TASK_MAX_STEPS = {
    "clean_deploy": 15,
    "broken_pipeline": 20,
    "judgment_call": 12,
    "cascading_failure": 15,
    "capacity_crisis": 15,
    "random_incident": 15,
}

# Goal suffixes that hint at investigation without giving away answers.
# NOTE(review): there is no entry for "cascading_failure" — confirm whether
# that omission is intentional.
_INVESTIGATION_HINTS = {
    "clean_deploy": " Use view_logs and view_config to inspect services before deploying.",
    "broken_pipeline": " Investigate service logs and configs to diagnose issues before acting.",
    "judgment_call": " Check service logs and configs to understand the incident before deciding.",
    "capacity_crisis": " Inspect database-primary logs and config to find the bottleneck.",
    "random_incident": " Investigate service logs and config to find the root cause.",
}
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
class PipelineEnvironment(Environment):
|
| 56 |
-
"""CI/CD Pipeline environment β manages microservice deployments."""
|
| 57 |
-
|
| 58 |
-
SUPPORTS_CONCURRENT_SESSIONS: bool = False
|
| 59 |
-
_register_callback = None # Set by app.py to register active env for /grader
|
| 60 |
-
|
| 61 |
-
def __init__(self):
    """Create an environment with no engine; call ``reset`` to start an episode."""
    self._state = State(episode_id=str(uuid4()), step_count=0)
    self._engine = None
    self._task_name = "clean_deploy"
    self._max_steps = 15
    self._episode_history = []
    self._viewed_actions = set()
    self._last_action_key = None
    self._investigated_services = set()  # e.g. "logs:api-gateway", "config:cache-service"
|
| 70 |
-
|
| 71 |
-
def reset(self, seed=None, episode_id=None, **kwargs) -> PipelineObservation:
    """Initialize a new episode. Task from reset body, env var, or default.

    The task name is taken from ``kwargs["task"]``, then the ``DEVOPS_TASK``
    environment variable, then ``"clean_deploy"``.

    NOTE(review): the ``seed`` and ``episode_id`` arguments are currently
    ignored. The seed always comes from ``TASK_SEEDS`` (9999 for unknown
    tasks; overridable through ``DEVOPS_SEED`` for ``random_incident`` only)
    and a fresh episode id is always generated. Confirm this is intended
    before relying on either argument.
    """
    self._task_name = (
        kwargs.get("task")
        or os.environ.get("DEVOPS_TASK")
        or "clean_deploy"
    )
    self._state = State(episode_id=str(uuid4()), step_count=0)
    self._episode_history = []
    self._viewed_actions = set()
    self._last_action_key = None
    self._investigated_services = set()
    if PipelineEnvironment._register_callback:
        PipelineEnvironment._register_callback(self)

    seed = TASK_SEEDS.get(self._task_name, 9999)
    if self._task_name == "random_incident":
        seed = int(os.environ.get("DEVOPS_SEED", str(seed)))
    scenario = load_scenario(self._task_name, seed)
    self._engine = PipelineEngine(scenario, seed)
    self._max_steps = TASK_MAX_STEPS.get(self._task_name, 15)

    return self._build_observation(
        last_action_result="Environment reset. Deployment pipeline initialized.",
        last_action_error=None,
        done=False,
        reward=0.0,
    )
|
| 99 |
-
|
| 100 |
-
def step(self, action: PipelineAction) -> PipelineObservation:
|
| 101 |
-
"""Execute action, return observation with reward and done."""
|
| 102 |
-
self._state.step_count += 1
|
| 103 |
-
|
| 104 |
-
prev_state = self._engine.snapshot()
|
| 105 |
-
|
| 106 |
-
# Validate action
|
| 107 |
-
error = self._validate_action(action)
|
| 108 |
-
if error:
|
| 109 |
-
self._episode_history.append({
|
| 110 |
-
"step": self._state.step_count,
|
| 111 |
-
"action": action.model_dump(),
|
| 112 |
-
"reward": -0.05,
|
| 113 |
-
"error": error,
|
| 114 |
-
})
|
| 115 |
-
done = self._state.step_count >= self._max_steps
|
| 116 |
-
return self._build_observation(
|
| 117 |
-
last_action_result=None,
|
| 118 |
-
last_action_error=error,
|
| 119 |
-
done=done,
|
| 120 |
-
reward=-0.05,
|
| 121 |
-
)
|
| 122 |
-
|
| 123 |
-
# Track investigation actions BEFORE executing (so the observation
|
| 124 |
-
# returned from this step already includes the revealed info)
|
| 125 |
-
if action.action_type == ActionType.VIEW_LOGS and action.service_name:
|
| 126 |
-
self._investigated_services.add(f"logs:{action.service_name}")
|
| 127 |
-
elif action.action_type == ActionType.VIEW_CONFIG and action.service_name:
|
| 128 |
-
self._investigated_services.add(f"config:{action.service_name}")
|
| 129 |
-
|
| 130 |
-
# Execute action
|
| 131 |
-
result_text = self._engine.execute(action)
|
| 132 |
-
|
| 133 |
-
current_state = self._engine.snapshot()
|
| 134 |
-
|
| 135 |
-
# Calculate outcome-based reward
|
| 136 |
-
reward = calculate_reward(
|
| 137 |
-
prev_state, current_state, action, self._viewed_actions,
|
| 138 |
-
last_action_key=self._last_action_key, task_name=self._task_name,
|
| 139 |
-
)
|
| 140 |
-
self._last_action_key = f"{action.action_type.value}:{action.service_name or ''}"
|
| 141 |
-
|
| 142 |
-
# Check episode termination
|
| 143 |
-
done = self._check_done(action)
|
| 144 |
-
|
| 145 |
-
# Track if we broke a healthy service (for grader)
|
| 146 |
-
broke_healthy = False
|
| 147 |
-
for name, curr_svc in current_state["services"].items():
|
| 148 |
-
prev_svc = prev_state["services"].get(name, {})
|
| 149 |
-
if prev_svc.get("health") == "healthy" and curr_svc["health"] in ("degraded", "down"):
|
| 150 |
-
broke_healthy = True
|
| 151 |
-
|
| 152 |
-
history_entry = {
|
| 153 |
-
"step": self._state.step_count,
|
| 154 |
-
"action": action.model_dump(),
|
| 155 |
-
"reward": reward,
|
| 156 |
-
"error": None,
|
| 157 |
-
"broke_healthy": broke_healthy,
|
| 158 |
-
"system_health": self._engine.get_system_health(),
|
| 159 |
-
}
|
| 160 |
-
|
| 161 |
-
# Record cache health at deploy time for grader integrity
|
| 162 |
-
if action.action_type == ActionType.DEPLOY and action.service_name == "api-gateway":
|
| 163 |
-
cache_svc = self._engine.services.get("cache-service")
|
| 164 |
-
if cache_svc:
|
| 165 |
-
history_entry["cache_health_at_deploy"] = cache_svc.health.value
|
| 166 |
-
|
| 167 |
-
self._episode_history.append(history_entry)
|
| 168 |
-
|
| 169 |
-
# Include config_snapshot if viewing/editing config
|
| 170 |
-
config_snapshot = None
|
| 171 |
-
if action.action_type in (ActionType.VIEW_CONFIG, ActionType.EDIT_CONFIG):
|
| 172 |
-
svc = self._engine.services.get(action.service_name)
|
| 173 |
-
if svc:
|
| 174 |
-
config_snapshot = svc.get_config_snapshot()
|
| 175 |
-
|
| 176 |
-
return self._build_observation(
|
| 177 |
-
last_action_result=result_text,
|
| 178 |
-
last_action_error=None,
|
| 179 |
-
done=done,
|
| 180 |
-
reward=reward,
|
| 181 |
-
config_snapshot=config_snapshot,
|
| 182 |
-
)
|
| 183 |
-
|
| 184 |
-
@property
|
| 185 |
-
def state(self) -> State:
|
| 186 |
-
return self._state
|
| 187 |
-
|
| 188 |
-
def get_episode_history(self):
|
| 189 |
-
return self._episode_history
|
| 190 |
-
|
| 191 |
-
def get_engine(self):
|
| 192 |
-
return self._engine
|
| 193 |
-
|
| 194 |
-
def get_task_name(self):
|
| 195 |
-
return self._task_name
|
| 196 |
-
|
| 197 |
-
def _build_observation(self, last_action_result, last_action_error,
|
| 198 |
-
done, reward, config_snapshot=None):
|
| 199 |
-
"""Build observation from current engine state.
|
| 200 |
-
|
| 201 |
-
Partial observability: services show only high-level metrics by default.
|
| 202 |
-
CPU, memory are hidden until the agent runs view_logs for that service.
|
| 203 |
-
Config is hidden until the agent runs view_config for that service.
|
| 204 |
-
"""
|
| 205 |
-
scenario = self._engine.scenario
|
| 206 |
-
|
| 207 |
-
# Build service statuses with partial observability
|
| 208 |
-
raw_statuses = self._engine.get_service_statuses()
|
| 209 |
-
filtered_statuses = []
|
| 210 |
-
for svc in raw_statuses:
|
| 211 |
-
investigated = (
|
| 212 |
-
f"logs:{svc.name}" in self._investigated_services
|
| 213 |
-
or f"config:{svc.name}" in self._investigated_services
|
| 214 |
-
)
|
| 215 |
-
# Get recovery status from engine state
|
| 216 |
-
svc_state = self._engine.services.get(svc.name)
|
| 217 |
-
recovery = "stable"
|
| 218 |
-
if svc_state and hasattr(svc_state, '_recovery_steps_remaining') and svc_state._recovery_steps_remaining > 0:
|
| 219 |
-
recovery = f"stabilizing ({svc_state._recovery_steps_remaining} steps remaining)"
|
| 220 |
-
# Mask health for uninvestigated unhealthy services
|
| 221 |
-
shown_health = svc.health
|
| 222 |
-
if not investigated and svc.health.value != "healthy":
|
| 223 |
-
shown_health = ServiceHealth.UNKNOWN
|
| 224 |
-
filtered_statuses.append(ServiceStatus(
|
| 225 |
-
name=svc.name,
|
| 226 |
-
health=shown_health,
|
| 227 |
-
current_version=svc.current_version,
|
| 228 |
-
# Metrics visible only after investigation
|
| 229 |
-
error_rate=svc.error_rate if investigated else 0.0,
|
| 230 |
-
request_latency_ms=svc.request_latency_ms if investigated else 0.0,
|
| 231 |
-
active_connections=svc.active_connections,
|
| 232 |
-
last_deploy_timestamp=svc.last_deploy_timestamp,
|
| 233 |
-
# Hidden until view_logs: detailed resource usage
|
| 234 |
-
cpu_percent=svc.cpu_percent if investigated else 0.0,
|
| 235 |
-
memory_percent=svc.memory_percent if investigated else 0.0,
|
| 236 |
-
recovery_status=recovery,
|
| 237 |
-
))
|
| 238 |
-
|
| 239 |
-
# Append investigation hint to goal
|
| 240 |
-
goal = scenario.goal
|
| 241 |
-
hint = _INVESTIGATION_HINTS.get(self._task_name, "")
|
| 242 |
-
if hint and not self._investigated_services:
|
| 243 |
-
goal = goal + hint
|
| 244 |
-
|
| 245 |
-
# Build summary β only reveal details for investigated services
|
| 246 |
-
alerts = []
|
| 247 |
-
uninvestigated_alerts = 0
|
| 248 |
-
for name, svc_state in self._engine.services.items():
|
| 249 |
-
investigated = (
|
| 250 |
-
f"logs:{name}" in self._investigated_services
|
| 251 |
-
or f"config:{name}" in self._investigated_services
|
| 252 |
-
)
|
| 253 |
-
if svc_state.health == ServiceHealth.DOWN:
|
| 254 |
-
if investigated:
|
| 255 |
-
alerts.append(f"CRITICAL: {name} is DOWN")
|
| 256 |
-
else:
|
| 257 |
-
uninvestigated_alerts += 1
|
| 258 |
-
elif svc_state.health == ServiceHealth.DEGRADED:
|
| 259 |
-
if investigated:
|
| 260 |
-
alerts.append(
|
| 261 |
-
f"WARNING: {name} degraded "
|
| 262 |
-
f"(lat={svc_state.latency_ms:.0f}ms, err={svc_state.error_rate:.1f}/s)"
|
| 263 |
-
)
|
| 264 |
-
else:
|
| 265 |
-
uninvestigated_alerts += 1
|
| 266 |
-
elif investigated and svc_state.cpu_percent > 80:
|
| 267 |
-
alerts.append(f"CAUTION: {name} CPU high ({svc_state.cpu_percent:.0f}%)")
|
| 268 |
-
# Recovery status alert β inside the loop, for THIS service
|
| 269 |
-
if hasattr(svc_state, '_recovery_steps_remaining') and svc_state._recovery_steps_remaining > 0:
|
| 270 |
-
alerts.append(f"INFO: {name} recovering β stabilizing ({svc_state._recovery_steps_remaining} steps remaining)")
|
| 271 |
-
if uninvestigated_alerts > 0:
|
| 272 |
-
alerts.append(f"ALERT: {uninvestigated_alerts} service(s) may have issues β use view_logs to investigate")
|
| 273 |
-
# Add dependency chain hints for investigated degraded services only
|
| 274 |
-
for name, svc_state in self._engine.services.items():
|
| 275 |
-
investigated = (
|
| 276 |
-
f"logs:{name}" in self._investigated_services
|
| 277 |
-
or f"config:{name}" in self._investigated_services
|
| 278 |
-
)
|
| 279 |
-
if investigated and svc_state.health in (ServiceHealth.DEGRADED, ServiceHealth.DOWN):
|
| 280 |
-
upstream_issues = [
|
| 281 |
-
d for d in svc_state.dependencies
|
| 282 |
-
if d in self._engine.services
|
| 283 |
-
and self._engine.services[d].health in (ServiceHealth.DEGRADED, ServiceHealth.DOWN)
|
| 284 |
-
]
|
| 285 |
-
if upstream_issues:
|
| 286 |
-
alerts.append(
|
| 287 |
-
f"HINT: {name} depends on {', '.join(upstream_issues)} "
|
| 288 |
-
f"(also unhealthy β root cause likely upstream)"
|
| 289 |
-
)
|
| 290 |
-
summary = "; ".join(alerts) if alerts else "All services nominal."
|
| 291 |
-
|
| 292 |
-
return PipelineObservation(
|
| 293 |
-
task_description=scenario.task_description,
|
| 294 |
-
goal=goal,
|
| 295 |
-
step_number=self._state.step_count,
|
| 296 |
-
max_steps=self._max_steps,
|
| 297 |
-
services=filtered_statuses,
|
| 298 |
-
pipeline=self._engine.get_pipeline_status(),
|
| 299 |
-
migrations=self._engine.get_migration_status(),
|
| 300 |
-
active_alerts=self._engine.get_alerts(),
|
| 301 |
-
available_actions=self._get_available_actions(),
|
| 302 |
-
last_action_result=last_action_result,
|
| 303 |
-
last_action_error=last_action_error,
|
| 304 |
-
config_snapshot=config_snapshot,
|
| 305 |
-
done=done,
|
| 306 |
-
reward=reward,
|
| 307 |
-
summary=summary,
|
| 308 |
-
)
|
| 309 |
-
|
| 310 |
-
def _get_available_actions(self):
|
| 311 |
-
"""Context-sensitive: only show valid actions."""
|
| 312 |
-
actions = ["view_pipeline", "view_logs", "approve", "abort"]
|
| 313 |
-
if self._engine.has_services():
|
| 314 |
-
actions.extend(["view_config", "edit_config", "deploy", "rollback"])
|
| 315 |
-
if self._engine.has_pending_migrations():
|
| 316 |
-
actions.append("run_migration")
|
| 317 |
-
return actions
|
| 318 |
-
|
| 319 |
-
def _validate_action(self, action):
|
| 320 |
-
"""Return error string if action is invalid, None if valid."""
|
| 321 |
-
if action.action_type in (
|
| 322 |
-
ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG,
|
| 323 |
-
ActionType.EDIT_CONFIG, ActionType.DEPLOY,
|
| 324 |
-
ActionType.ROLLBACK,
|
| 325 |
-
):
|
| 326 |
-
if not action.service_name:
|
| 327 |
-
return f"action_type '{action.action_type.value}' requires service_name"
|
| 328 |
-
if action.service_name not in self._engine.get_service_names():
|
| 329 |
-
return (
|
| 330 |
-
f"Unknown service '{action.service_name}'. "
|
| 331 |
-
f"Available: {self._engine.get_service_names()}"
|
| 332 |
-
)
|
| 333 |
-
if action.action_type == ActionType.DEPLOY and not action.target_version:
|
| 334 |
-
return "deploy requires target_version"
|
| 335 |
-
if action.action_type == ActionType.EDIT_CONFIG and not action.config_edits:
|
| 336 |
-
return "edit_config requires config_edits"
|
| 337 |
-
if action.action_type == ActionType.RUN_MIGRATION and not action.migration_name:
|
| 338 |
-
return "run_migration requires migration_name"
|
| 339 |
-
return None
|
| 340 |
-
|
| 341 |
-
def _check_done(self, action):
|
| 342 |
-
"""Episode ends on approve, abort, max steps, or catastrophic failure."""
|
| 343 |
-
if action.action_type == ActionType.APPROVE:
|
| 344 |
-
return True
|
| 345 |
-
if action.action_type == ActionType.ABORT:
|
| 346 |
-
return True
|
| 347 |
-
if self._state.step_count >= self._max_steps:
|
| 348 |
-
return True
|
| 349 |
-
if self._engine.get_system_health() < 20.0:
|
| 350 |
-
return True
|
| 351 |
-
return False
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""DevOps Pipeline Environment Implementation."""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
from uuid import uuid4
|
| 11 |
+
|
| 12 |
+
from openenv.core.env_server.interfaces import Environment
|
| 13 |
+
from openenv.core.env_server.types import State
|
| 14 |
+
|
| 15 |
+
from devops_pipeline_env.models import (
|
| 16 |
+
ActionType,
|
| 17 |
+
PipelineAction,
|
| 18 |
+
PipelineObservation,
|
| 19 |
+
ServiceHealth,
|
| 20 |
+
ServiceStatus,
|
| 21 |
+
)
|
| 22 |
+
from server.pipeline_engine import PipelineEngine
|
| 23 |
+
from server.rewards import calculate_reward
|
| 24 |
+
from server.scenarios import load_scenario
|
| 25 |
+
|
| 26 |
+
# Deterministic seeds per task
|
| 27 |
+
TASK_SEEDS = {
|
| 28 |
+
"clean_deploy": 1001,
|
| 29 |
+
"broken_pipeline": 2002,
|
| 30 |
+
"judgment_call": 3003,
|
| 31 |
+
"cascading_failure": 4004,
|
| 32 |
+
"capacity_crisis": 5005,
|
| 33 |
+
"random_incident": 6006,
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
TASK_MAX_STEPS = {
|
| 37 |
+
"clean_deploy": 15,
|
| 38 |
+
"broken_pipeline": 20,
|
| 39 |
+
"judgment_call": 12,
|
| 40 |
+
"cascading_failure": 15,
|
| 41 |
+
"capacity_crisis": 15,
|
| 42 |
+
"random_incident": 15,
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
# Goal suffixes that hint at investigation without giving away answers
|
| 46 |
+
_INVESTIGATION_HINTS = {
|
| 47 |
+
"clean_deploy": " Use view_logs and view_config to inspect services before deploying.",
|
| 48 |
+
"broken_pipeline": " Investigate service logs and configs to diagnose issues before acting.",
|
| 49 |
+
"judgment_call": " Check service logs and configs to understand the incident before deciding.",
|
| 50 |
+
"capacity_crisis": " Inspect database-primary logs and config to find the bottleneck.",
|
| 51 |
+
"random_incident": " Investigate service logs and config to find the root cause.",
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class PipelineEnvironment(Environment):
    """CI/CD Pipeline environment — manages microservice deployments.

    One instance drives one episode at a time: ``reset`` loads a scenario
    into a fresh ``PipelineEngine`` and ``step`` applies a single
    ``PipelineAction`` to it, returning a partially observable
    ``PipelineObservation``.
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = False
    _register_callback = None  # Set by app.py to register active env for /grader

    def __init__(self):
        self._state = State(episode_id=str(uuid4()), step_count=0)
        self._engine = None  # created by reset()
        self._task_name = "clean_deploy"
        self._max_steps = 15
        self._episode_history = []
        self._viewed_actions = set()  # mutated by calculate_reward()
        self._last_action_key = None
        # Entries look like "logs:api-gateway" or "config:cache-service".
        self._investigated_services = set()

    def reset(self, seed=None, episode_id=None, **kwargs) -> PipelineObservation:
        """Initialize a new episode. Task from reset body, env var, or default.

        Args:
            seed: Optional explicit seed. Only honoured for the
                ``random_incident`` task, where it takes precedence over the
                ``DEVOPS_SEED`` environment variable; every other task keeps
                its fixed seed from ``TASK_SEEDS``.
            episode_id: Optional id for the new episode; a fresh UUID is
                generated when omitted.
            **kwargs: ``task`` selects the scenario name.

        Returns:
            The initial observation (``done=False``, ``reward=0.0``).
        """
        task_name = (
            kwargs.get("task")
            or os.environ.get("DEVOPS_TASK")
            or "clean_deploy"
        )

        task_seed = TASK_SEEDS.get(task_name, 9999)
        if task_name == "random_incident":
            if seed is not None:
                task_seed = int(seed)
            else:
                task_seed = int(os.environ.get("DEVOPS_SEED", str(task_seed)))

        # Build the scenario and engine BEFORE touching any episode state, so
        # a failure here leaves the previous episode intact instead of a
        # half-reset environment.
        scenario = load_scenario(task_name, task_seed)
        engine = PipelineEngine(scenario, task_seed)

        self._task_name = task_name
        self._engine = engine
        self._max_steps = TASK_MAX_STEPS.get(task_name, 15)
        self._state = State(episode_id=episode_id or str(uuid4()), step_count=0)
        self._episode_history = []
        self._viewed_actions = set()
        self._last_action_key = None
        self._investigated_services = set()

        # Register only once the environment is fully initialised.
        if PipelineEnvironment._register_callback:
            PipelineEnvironment._register_callback(self)

        return self._build_observation(
            last_action_result="Environment reset. Deployment pipeline initialized.",
            last_action_error=None,
            done=False,
            reward=0.0,
        )

    def step(self, action: PipelineAction) -> PipelineObservation:
        """Execute action, return observation with reward and done.

        An action that fails validation is not executed: it is recorded in
        the episode history with a fixed -0.05 reward and still consumes a
        step.

        Raises:
            RuntimeError: if called before ``reset``.
        """
        if self._engine is None:
            raise RuntimeError("step() called before reset(); call reset() first")

        self._state.step_count += 1

        prev_state = self._engine.snapshot()

        # Validate action
        error = self._validate_action(action)
        if error:
            self._episode_history.append({
                "step": self._state.step_count,
                "action": action.model_dump(),
                "reward": -0.05,
                "error": error,
            })
            done = self._state.step_count >= self._max_steps
            return self._build_observation(
                last_action_result=None,
                last_action_error=error,
                done=done,
                reward=-0.05,
            )

        # Track investigation actions BEFORE executing (so the observation
        # returned from this step already includes the revealed info)
        if action.action_type == ActionType.VIEW_LOGS and action.service_name:
            self._investigated_services.add(f"logs:{action.service_name}")
        elif action.action_type == ActionType.VIEW_CONFIG and action.service_name:
            self._investigated_services.add(f"config:{action.service_name}")

        # Execute action
        result_text = self._engine.execute(action)

        current_state = self._engine.snapshot()

        # Calculate outcome-based reward
        reward = calculate_reward(
            prev_state, current_state, action, self._viewed_actions,
            last_action_key=self._last_action_key, task_name=self._task_name,
        )
        self._last_action_key = f"{action.action_type.value}:{action.service_name or ''}"

        # Check episode termination
        done = self._check_done(action)

        # Track if we broke a healthy service (for grader)
        broke_healthy = any(
            prev_state["services"].get(name, {}).get("health") == "healthy"
            and curr_svc["health"] in ("degraded", "down")
            for name, curr_svc in current_state["services"].items()
        )

        history_entry = {
            "step": self._state.step_count,
            "action": action.model_dump(),
            "reward": reward,
            "error": None,
            "broke_healthy": broke_healthy,
            "system_health": self._engine.get_system_health(),
        }

        # Record cache health at deploy time for grader integrity
        if action.action_type == ActionType.DEPLOY and action.service_name == "api-gateway":
            cache_svc = self._engine.services.get("cache-service")
            if cache_svc:
                history_entry["cache_health_at_deploy"] = cache_svc.health.value

        self._episode_history.append(history_entry)

        # Include config_snapshot if viewing/editing config
        config_snapshot = None
        if action.action_type in (ActionType.VIEW_CONFIG, ActionType.EDIT_CONFIG):
            svc = self._engine.services.get(action.service_name)
            if svc:
                config_snapshot = svc.get_config_snapshot()

        return self._build_observation(
            last_action_result=result_text,
            last_action_error=None,
            done=done,
            reward=reward,
            config_snapshot=config_snapshot,
        )

    @property
    def state(self) -> State:
        return self._state

    def get_episode_history(self):
        return self._episode_history

    def get_engine(self):
        return self._engine

    def get_task_name(self):
        return self._task_name

    def _is_investigated(self, service_name):
        """Return True once view_logs or view_config has been run on the service."""
        investigated = self._investigated_services
        return (
            f"logs:{service_name}" in investigated
            or f"config:{service_name}" in investigated
        )

    @staticmethod
    def _recovery_steps(svc_state):
        """Return the service's remaining recovery steps, 0 if it tracks none."""
        return getattr(svc_state, "_recovery_steps_remaining", 0)

    def _build_observation(self, last_action_result, last_action_error,
                           done, reward, config_snapshot=None):
        """Build observation from current engine state.

        Partial observability: until a service has been investigated (by
        either view_logs or view_config) its error rate, latency, CPU and
        memory are reported as 0.0, a non-healthy health is shown as
        UNKNOWN, and it only contributes to an aggregate "may have issues"
        count in the summary. The config snapshot is included only when the
        caller passes one.
        """
        scenario = self._engine.scenario
        services = self._engine.services
        unhealthy = (ServiceHealth.DEGRADED, ServiceHealth.DOWN)

        # Build service statuses with partial observability
        filtered_statuses = []
        for svc in self._engine.get_service_statuses():
            investigated = self._is_investigated(svc.name)

            # Get recovery status from engine state
            svc_state = services.get(svc.name)
            remaining = self._recovery_steps(svc_state) if svc_state else 0
            recovery = "stable"
            if remaining > 0:
                recovery = f"stabilizing ({remaining} steps remaining)"

            # Mask health for uninvestigated unhealthy services
            shown_health = svc.health
            if not investigated and svc.health.value != "healthy":
                shown_health = ServiceHealth.UNKNOWN

            filtered_statuses.append(ServiceStatus(
                name=svc.name,
                health=shown_health,
                current_version=svc.current_version,
                # Metrics visible only after investigation
                error_rate=svc.error_rate if investigated else 0.0,
                request_latency_ms=svc.request_latency_ms if investigated else 0.0,
                active_connections=svc.active_connections,
                last_deploy_timestamp=svc.last_deploy_timestamp,
                # Hidden until investigated: detailed resource usage
                cpu_percent=svc.cpu_percent if investigated else 0.0,
                memory_percent=svc.memory_percent if investigated else 0.0,
                recovery_status=recovery,
            ))

        # Append investigation hint to goal (only until the first investigation)
        goal = scenario.goal
        hint = _INVESTIGATION_HINTS.get(self._task_name, "")
        if hint and not self._investigated_services:
            goal = goal + hint

        # Build summary — only reveal details for investigated services
        alerts = []
        uninvestigated_alerts = 0
        for name, svc_state in services.items():
            investigated = self._is_investigated(name)
            if svc_state.health == ServiceHealth.DOWN:
                if investigated:
                    alerts.append(f"CRITICAL: {name} is DOWN")
                else:
                    uninvestigated_alerts += 1
            elif svc_state.health == ServiceHealth.DEGRADED:
                if investigated:
                    alerts.append(
                        f"WARNING: {name} degraded "
                        f"(lat={svc_state.latency_ms:.0f}ms, err={svc_state.error_rate:.1f}/s)"
                    )
                else:
                    uninvestigated_alerts += 1
            elif investigated and svc_state.cpu_percent > 80:
                alerts.append(f"CAUTION: {name} CPU high ({svc_state.cpu_percent:.0f}%)")

            # Recovery status alert — inside the loop, for THIS service
            remaining = self._recovery_steps(svc_state)
            if remaining > 0:
                alerts.append(
                    f"INFO: {name} recovering — stabilizing ({remaining} steps remaining)"
                )

        if uninvestigated_alerts > 0:
            alerts.append(
                f"ALERT: {uninvestigated_alerts} service(s) may have issues — "
                f"use view_logs to investigate"
            )

        # Add dependency chain hints for investigated degraded services only
        for name, svc_state in services.items():
            if not (self._is_investigated(name) and svc_state.health in unhealthy):
                continue
            upstream_issues = [
                dep for dep in svc_state.dependencies
                if dep in services and services[dep].health in unhealthy
            ]
            if upstream_issues:
                alerts.append(
                    f"HINT: {name} depends on {', '.join(upstream_issues)} "
                    f"(also unhealthy — root cause likely upstream)"
                )

        summary = "; ".join(alerts) if alerts else "All services nominal."

        return PipelineObservation(
            task_description=scenario.task_description,
            goal=goal,
            step_number=self._state.step_count,
            max_steps=self._max_steps,
            services=filtered_statuses,
            pipeline=self._engine.get_pipeline_status(),
            migrations=self._engine.get_migration_status(),
            active_alerts=self._engine.get_alerts(),
            available_actions=self._get_available_actions(),
            last_action_result=last_action_result,
            last_action_error=last_action_error,
            config_snapshot=config_snapshot,
            done=done,
            reward=reward,
            summary=summary,
        )

    def _get_available_actions(self):
        """Context-sensitive: only show valid actions."""
        actions = ["view_pipeline", "view_logs", "approve", "abort"]
        if self._engine.has_services():
            actions.extend(["view_config", "edit_config", "deploy", "rollback"])
        if self._engine.has_pending_migrations():
            actions.append("run_migration")
        return actions

    def _validate_action(self, action):
        """Return error string if action is invalid, None if valid."""
        if action.action_type in (
            ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG,
            ActionType.EDIT_CONFIG, ActionType.DEPLOY,
            ActionType.ROLLBACK,
        ):
            # Service-scoped actions need a service the engine knows about.
            if not action.service_name:
                return f"action_type '{action.action_type.value}' requires service_name"
            known = self._engine.get_service_names()
            if action.service_name not in known:
                return (
                    f"Unknown service '{action.service_name}'. "
                    f"Available: {known}"
                )
        if action.action_type == ActionType.DEPLOY and not action.target_version:
            return "deploy requires target_version"
        if action.action_type == ActionType.EDIT_CONFIG and not action.config_edits:
            return "edit_config requires config_edits"
        if action.action_type == ActionType.RUN_MIGRATION and not action.migration_name:
            return "run_migration requires migration_name"
        return None

    def _check_done(self, action):
        """Episode ends on approve, abort, max steps, or catastrophic failure."""
        if action.action_type in (ActionType.APPROVE, ActionType.ABORT):
            return True
        if self._state.step_count >= self._max_steps:
            return True
        # System health below 20 is treated as a catastrophic failure.
        return self._engine.get_system_health() < 20.0
|
server/rewards.py
CHANGED
|
@@ -1,104 +1,104 @@
|
|
| 1 |
-
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
-
# All rights reserved.
|
| 3 |
-
#
|
| 4 |
-
# This source code is licensed under the BSD-style license found in the
|
| 5 |
-
# LICENSE file in the root directory of this source tree.
|
| 6 |
-
|
| 7 |
-
"""Outcome-based reward calculator for the DevOps Pipeline Environment."""
|
| 8 |
-
|
| 9 |
-
from devops_pipeline_env.models import ActionType
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
# Task urgency multipliers — harder tasks get steeper reward gradients.
# calculate_reward() scales the raw reward by this factor before clamping;
# tasks without an entry are scaled by 1.0.
TASK_URGENCY = {
    "clean_deploy": 1.0,
    "broken_pipeline": 1.2,
    "judgment_call": 1.5,
    "cascading_failure": 1.3,
    "capacity_crisis": 1.4,
    "random_incident": 1.3,
}
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def calculate_reward(prev_snapshot, current_snapshot, action, viewed_actions,
                     last_action_key=None, task_name="clean_deploy"):
    """
    Outcome-based reward. No procedure bonuses. No early returns.
    ALL actions go through the full reward pipeline.

    Args:
        prev_snapshot: Engine snapshot taken before the action. Reads
            "system_health", "services" and, when present,
            "migrations_pending" and "alerts".
        current_snapshot: Engine snapshot taken after the action (same keys).
        action: The executed action; ``action_type`` and ``service_name``
            are read.
        viewed_actions: Set of "<view action>:<service or 'global'>" keys
            already rewarded this episode. MUTATED: first-time view actions
            are added to it.
        last_action_key: Key of the previous action as stored by the
            environment, "<action_type>:<service_name or ''>".
        task_name: Selects the urgency multiplier from TASK_URGENCY.

    Returns a float bounded to [-0.35, +0.30].
    """
    reward = 0.0
    prev_services = prev_snapshot["services"]
    curr_services = current_snapshot["services"]
    unhealthy = ("degraded", "down")

    # 1. System health delta (ALL actions)
    health_delta = current_snapshot["system_health"] - prev_snapshot["system_health"]
    reward += health_delta * 0.005

    # 2. Deployment progress (ALL actions — recovery ticks can change state)
    for name, curr_svc in curr_services.items():
        prev_svc = prev_services.get(name, {})
        if curr_svc["prod_deployed"] and not prev_svc.get("prod_deployed"):
            reward += 0.15
        if curr_svc["staging_verified"] and not prev_svc.get("staging_verified"):
            reward += 0.05

    # 3. Broke healthy service penalty (ALL actions)
    for name, curr_svc in curr_services.items():
        prev_svc = prev_services.get(name, {})
        if prev_svc.get("health") == "healthy" and curr_svc["health"] in unhealthy:
            reward -= 0.30

    # 4. Sub-goal completion rewards (intermediate milestones for RL)
    # Config change detected — check if it fixed an error
    for name, curr_svc in curr_services.items():
        prev_svc = prev_services.get(name, {})
        prev_config = prev_svc.get("config", {})
        curr_config = curr_svc.get("config", {})
        if prev_config and curr_config and prev_config != curr_config:
            # Config changed — reward if health improved on this service
            if prev_svc.get("health") in unhealthy and curr_svc["health"] == "healthy":
                reward += 0.08

    # Migration completed
    prev_pending = len(prev_snapshot.get("migrations_pending", []))
    curr_pending = len(current_snapshot.get("migrations_pending", []))
    if curr_pending < prev_pending:
        reward += 0.06

    # Alert resolved
    prev_alerts = len(prev_snapshot.get("alerts", []))
    curr_alerts = len(current_snapshot.get("alerts", []))
    if curr_alerts < prev_alerts:
        reward += 0.03

    is_view_action = action.action_type in (
        ActionType.VIEW_PIPELINE, ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG,
    )
    # Key used to remember which investigations were already rewarded.
    view_key = f"{action.action_type.value}:{action.service_name or 'global'}"
    # Key in the format the caller stores in last_action_key (empty string,
    # not 'global', when there is no service). Comparing view_key alone
    # meant a repeated service-less view (view_pipeline) was never
    # recognised as consecutive spam. Both spellings are accepted so callers
    # passing the 'global' form keep working.
    repeat_key = f"{action.action_type.value}:{action.service_name or ''}"
    is_consecutive_repeat = bool(last_action_key) and (
        last_action_key == repeat_key
        or (is_view_action and last_action_key == view_key)
    )

    if is_view_action:
        # 5. Investigation bonus with diminishing returns (view_* actions only)
        if view_key not in viewed_actions:
            viewed_actions.add(view_key)
            investigation_count = len(viewed_actions)
            decay_factor = 1.0 / (1 + (investigation_count - 1) * 0.3)
            if action.service_name:
                svc_data = curr_services.get(action.service_name, {})
                if svc_data.get("health") in unhealthy:
                    reward += 0.04 * decay_factor
                else:
                    reward += 0.01 * decay_factor
            else:
                reward += 0.02 * decay_factor
        elif is_consecutive_repeat:
            reward -= 0.03  # Consecutive spam = harsh penalty
        else:
            reward -= 0.01  # Non-consecutive repeat = mild penalty
    elif is_consecutive_repeat:
        # 6. Repeated exact action penalty (non-view actions)
        reward -= 0.02

    # 7. Apply task urgency scaling and bound
    reward *= TASK_URGENCY.get(task_name, 1.0)
    return max(min(reward, 0.30), -0.35)
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
# All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# This source code is licensed under the BSD-style license found in the
|
| 5 |
+
# LICENSE file in the root directory of this source tree.
|
| 6 |
+
|
| 7 |
+
"""Outcome-based reward calculator for the DevOps Pipeline Environment."""
|
| 8 |
+
|
| 9 |
+
from devops_pipeline_env.models import ActionType
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Task urgency multipliers — harder tasks get steeper reward gradients.
# calculate_reward multiplies the raw step reward by the entry for the active
# task before clamping it; a task name that is not listed here falls back to
# a multiplier of 1.0.
TASK_URGENCY = {
    "clean_deploy": 1.0,
    "broken_pipeline": 1.2,
    "judgment_call": 1.5,
    "cascading_failure": 1.3,
    "capacity_crisis": 1.4,
    "random_incident": 1.3,
}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def calculate_reward(prev_snapshot, current_snapshot, action, viewed_actions,
                     last_action_key=None, task_name="clean_deploy"):
    """Score a single step from the difference between two state snapshots.

    Every action runs through the same sequence of outcome checks; there are
    no early returns. The accumulated value is scaled by the task's urgency
    multiplier and then clamped to [-0.35, +0.30].

    Args:
        prev_snapshot: Mapping describing the state before the action. Read
            keys: "system_health", "services" (name -> per-service mapping)
            and, optionally, "migrations_pending" and "alerts".
        current_snapshot: Same layout, describing the state after the action.
            Each per-service mapping is read for "prod_deployed",
            "staging_verified", "health" and, optionally, "config".
        action: Object exposing ``action_type`` (with a ``.value``) and
            ``service_name``.
        viewed_actions: Set of view-action keys already rewarded. A new key
            is added in place the first time a view action is seen.
        last_action_key: Key of the previous action, compared against
            ``"<action_type.value>:<service_name>"``. A missing service name
            is rendered as ``"global"`` for view actions and as an empty
            string for every other action.
        task_name: Key into ``TASK_URGENCY``; unknown names scale by 1.0.

    Returns:
        The step reward as a float within [-0.35, +0.30].
    """
    unhealthy_states = ("degraded", "down")
    view_types = (ActionType.VIEW_PIPELINE, ActionType.VIEW_LOGS, ActionType.VIEW_CONFIG)

    def _before(service_name):
        # Resolved lazily, per service: a service absent from the previous
        # snapshot is treated as an empty record.
        return prev_snapshot["services"].get(service_name, {})

    # 1. Overall health movement counts for every action.
    reward = 0.0
    reward += (current_snapshot["system_health"] - prev_snapshot["system_health"]) * 0.005

    services_now = current_snapshot["services"]

    # 2. Deployment milestones newly reached in this step (recovery ticks can
    #    flip these even when the action itself was not a deploy).
    milestone_bonuses = (("prod_deployed", 0.15), ("staging_verified", 0.05))
    for svc_name, svc_now in services_now.items():
        svc_before = _before(svc_name)
        for flag, bonus in milestone_bonuses:
            if svc_now[flag] and not svc_before.get(flag):
                reward += bonus

    # 3. Penalty for every service that was healthy and no longer is.
    for svc_name, svc_now in services_now.items():
        if _before(svc_name).get("health") == "healthy" and svc_now["health"] in unhealthy_states:
            reward -= 0.30

    # 4a. Sub-goal: a config change that coincides with the service recovering.
    for svc_name, svc_now in services_now.items():
        svc_before = _before(svc_name)
        config_before = svc_before.get("config", {})
        config_now = svc_now.get("config", {})
        if not (config_before and config_now and config_before != config_now):
            continue
        if svc_before.get("health") in unhealthy_states and svc_now["health"] == "healthy":
            reward += 0.08

    # 4b. Sub-goal: fewer pending migrations than before.
    migrations_before = len(prev_snapshot.get("migrations_pending", []))
    migrations_now = len(current_snapshot.get("migrations_pending", []))
    if migrations_now < migrations_before:
        reward += 0.06

    # 4c. Sub-goal: fewer open alerts than before.
    alerts_before = len(prev_snapshot.get("alerts", []))
    alerts_now = len(current_snapshot.get("alerts", []))
    if alerts_now < alerts_before:
        reward += 0.03

    if action.action_type in view_types:
        # 5. Investigation: first-time views earn a bonus that shrinks as more
        #    distinct views accumulate; repeated views are penalised.
        view_key = f"{action.action_type.value}:{action.service_name or 'global'}"
        if view_key in viewed_actions:
            # Back-to-back repeats cost more than revisiting after other steps.
            reward -= 0.03 if (last_action_key and view_key == last_action_key) else 0.01
        else:
            viewed_actions.add(view_key)
            decay = 1.0 / (1 + (len(viewed_actions) - 1) * 0.3)
            if not action.service_name:
                base_bonus = 0.02
            elif services_now.get(action.service_name, {}).get("health") in unhealthy_states:
                base_bonus = 0.04
            else:
                base_bonus = 0.01
            reward += base_bonus * decay
    else:
        # 6. Non-view actions: penalise repeating the exact previous action.
        repeat_key = f"{action.action_type.value}:{action.service_name or ''}"
        if last_action_key and repeat_key == last_action_key:
            reward -= 0.02

    # 7. Scale by task urgency, then clamp to the documented bounds.
    scaled = reward * TASK_URGENCY.get(task_name, 1.0)
    return max(min(scaled, 0.30), -0.35)
|
server/scenarios.py
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|