Add core simulation engine, environment, grader, and app wiring
Browse files- server/traces.py: distributed trace generation (Jaeger/Zipkin-style spans)
- server/simulator.py: discrete-event simulation engine with tick-based
failure evolution, propagation, action processing, SLO scoring, dense
reward computation, and pending remediation effects
- server/scenarios.py: procedural scenario generation with 3 task
definitions (easy/medium/hard) and intelligent failure placement
- server/environment.py: SevZeroEnvironment(Environment) bridging
OpenEnv SDK contract with the simulator
- server/grader.py: deterministic grading (0.0-1.0) with SLO recovery,
action efficiency, and time efficiency components
- server/app.py: FastAPI app via create_app() + custom /tasks, /grader routes
- .gitignore: added __pycache__ exclusion
- .gitignore +9 -1
- server/app.py +94 -0
- server/environment.py +121 -0
- server/grader.py +108 -0
- server/scenarios.py +207 -0
- server/simulator.py +965 -0
- server/traces.py +157 -0
.gitignore
CHANGED
|
@@ -2,4 +2,12 @@
|
|
| 2 |
Docs/
|
| 3 |
|
| 4 |
# OpenEnv preparatory course (dev reference only, not part of submission)
|
| 5 |
-
openenv-course/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
Docs/
|
| 3 |
|
| 4 |
# OpenEnv preparatory course (dev reference only, not part of submission)
|
| 5 |
+
openenv-course/
|
| 6 |
+
|
| 7 |
+
# Python
|
| 8 |
+
__pycache__/
|
| 9 |
+
*.pyc
|
| 10 |
+
*.pyo
|
| 11 |
+
|
| 12 |
+
# Environment
|
| 13 |
+
.env
|
server/app.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/app.py — FastAPI application wiring.
|
| 3 |
+
|
| 4 |
+
Uses OpenEnv SDK's create_app() for core endpoints (/reset, /step, /state, /ws, /health),
|
| 5 |
+
then adds custom routes for /tasks, /grader, and /baseline.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from typing import Any, Dict, List, Optional
|
| 11 |
+
|
| 12 |
+
from fastapi import FastAPI
|
| 13 |
+
from openenv.core.env_server import create_app
|
| 14 |
+
from pydantic import BaseModel
|
| 15 |
+
|
| 16 |
+
from models import SevZeroAction, SevZeroObservation
|
| 17 |
+
from server.environment import SevZeroEnvironment
|
| 18 |
+
from server.grader import grade_episode
|
| 19 |
+
from server.scenarios import TASK_DEFINITIONS
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Create the OpenEnv app (wires /reset, /step, /state, /ws, /health, /schema, /metadata)
|
| 23 |
+
app = create_app(
|
| 24 |
+
SevZeroEnvironment,
|
| 25 |
+
SevZeroAction,
|
| 26 |
+
SevZeroObservation,
|
| 27 |
+
env_name="sevzero",
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
# Custom routes
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@app.get("/tasks")
|
| 37 |
+
async def list_tasks() -> List[Dict[str, Any]]:
|
| 38 |
+
"""Return the 3 task definitions (easy, medium, hard)."""
|
| 39 |
+
return [
|
| 40 |
+
{
|
| 41 |
+
"task_id": t["task_id"],
|
| 42 |
+
"name": t["name"],
|
| 43 |
+
"difficulty": t["difficulty"],
|
| 44 |
+
"description": t["description"],
|
| 45 |
+
"max_steps": t["max_steps"],
|
| 46 |
+
}
|
| 47 |
+
for t in TASK_DEFINITIONS
|
| 48 |
+
]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class GraderRequest(BaseModel):
|
| 52 |
+
final_slo_score: float
|
| 53 |
+
steps_taken: int
|
| 54 |
+
max_steps: int
|
| 55 |
+
actions_taken: List[Dict[str, Any]]
|
| 56 |
+
terminated: bool
|
| 57 |
+
termination_reason: Optional[str] = None
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
@app.post("/grader")
|
| 61 |
+
async def grade(request: GraderRequest) -> Dict[str, Any]:
|
| 62 |
+
"""
|
| 63 |
+
Deterministic grading endpoint.
|
| 64 |
+
Accepts episode results and returns a score 0.0–1.0 with breakdown.
|
| 65 |
+
"""
|
| 66 |
+
result = grade_episode(
|
| 67 |
+
final_slo_score=request.final_slo_score,
|
| 68 |
+
steps_taken=request.steps_taken,
|
| 69 |
+
max_steps=request.max_steps,
|
| 70 |
+
actions_taken=request.actions_taken,
|
| 71 |
+
terminated=request.terminated,
|
| 72 |
+
termination_reason=request.termination_reason,
|
| 73 |
+
)
|
| 74 |
+
return {
|
| 75 |
+
"score": result.score,
|
| 76 |
+
"slo_recovery": result.slo_recovery,
|
| 77 |
+
"action_efficiency": result.action_efficiency,
|
| 78 |
+
"time_efficiency": result.time_efficiency,
|
| 79 |
+
"details": result.details,
|
| 80 |
+
}
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
# Entry point
|
| 85 |
+
# ---------------------------------------------------------------------------
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def main() -> None:
|
| 89 |
+
import uvicorn
|
| 90 |
+
uvicorn.run(app, host="0.0.0.0", port=7860)
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
if __name__ == "__main__":
|
| 94 |
+
main()
|
server/environment.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/environment.py — SevZeroEnvironment: OpenEnv Environment subclass.
|
| 3 |
+
|
| 4 |
+
Bridges the OpenEnv SDK contract (reset/step/state) with the Simulator engine.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import uuid
|
| 10 |
+
from typing import Any, Optional
|
| 11 |
+
|
| 12 |
+
from openenv.core.env_server import Environment
|
| 13 |
+
from openenv.core.env_server.types import EnvironmentMetadata
|
| 14 |
+
|
| 15 |
+
from models import SevZeroAction, SevZeroObservation, SevZeroState
|
| 16 |
+
from server.scenarios import generate_scenario
|
| 17 |
+
from server.simulator import Simulator
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class SevZeroEnvironment(Environment[SevZeroAction, SevZeroObservation, SevZeroState]):
|
| 21 |
+
"""
|
| 22 |
+
SRE Incident Response Environment.
|
| 23 |
+
|
| 24 |
+
The agent observes service metrics, alerts, and logs, then issues
|
| 25 |
+
remediation commands to restore SLO compliance across a microservice cluster.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
def __init__(self) -> None:
|
| 29 |
+
super().__init__()
|
| 30 |
+
self._sim = Simulator()
|
| 31 |
+
self._episode_id: Optional[str] = None
|
| 32 |
+
self._task_id: str = "easy"
|
| 33 |
+
self._seed: Optional[int] = None
|
| 34 |
+
self._step_count: int = 0
|
| 35 |
+
|
| 36 |
+
def get_metadata(self) -> EnvironmentMetadata:
|
| 37 |
+
return EnvironmentMetadata(
|
| 38 |
+
name="sevzero",
|
| 39 |
+
description=(
|
| 40 |
+
"SRE Incident Response Environment — an autonomous on-call SRE "
|
| 41 |
+
"managing a microservice cluster undergoing cascading failures"
|
| 42 |
+
),
|
| 43 |
+
version="1.0.0",
|
| 44 |
+
)
|
| 45 |
+
|
| 46 |
+
def reset(
|
| 47 |
+
self,
|
| 48 |
+
seed: Optional[int] = None,
|
| 49 |
+
episode_id: Optional[str] = None,
|
| 50 |
+
**kwargs: Any,
|
| 51 |
+
) -> SevZeroObservation:
|
| 52 |
+
self._episode_id = episode_id or str(uuid.uuid4())
|
| 53 |
+
self._task_id = kwargs.get("task_id", "easy")
|
| 54 |
+
self._seed = seed if seed is not None else 42
|
| 55 |
+
self._step_count = 0
|
| 56 |
+
|
| 57 |
+
# Generate scenario and reset simulator
|
| 58 |
+
scenario = generate_scenario(self._seed, self._task_id)
|
| 59 |
+
self._sim.reset(
|
| 60 |
+
seed=self._seed,
|
| 61 |
+
difficulty=scenario.difficulty,
|
| 62 |
+
failure_specs=scenario.failure_specs,
|
| 63 |
+
)
|
| 64 |
+
|
| 65 |
+
return self._build_observation(reward=None, done=False)
|
| 66 |
+
|
| 67 |
+
def step(
|
| 68 |
+
self,
|
| 69 |
+
action: SevZeroAction,
|
| 70 |
+
timeout_s: Optional[float] = None,
|
| 71 |
+
**kwargs: Any,
|
| 72 |
+
) -> SevZeroObservation:
|
| 73 |
+
self._step_count += 1
|
| 74 |
+
|
| 75 |
+
reward = self._sim.step(action.action_type, action.params)
|
| 76 |
+
done = self._sim.terminated
|
| 77 |
+
|
| 78 |
+
return self._build_observation(reward=reward, done=done)
|
| 79 |
+
|
| 80 |
+
@property
|
| 81 |
+
def state(self) -> SevZeroState:
|
| 82 |
+
return SevZeroState(
|
| 83 |
+
episode_id=self._episode_id,
|
| 84 |
+
step_count=self._step_count,
|
| 85 |
+
task_id=self._task_id,
|
| 86 |
+
seed=self._seed,
|
| 87 |
+
global_slo_score=self._sim.get_slo_score(),
|
| 88 |
+
terminated=self._sim.terminated,
|
| 89 |
+
termination_reason=self._sim.termination_reason,
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
def _build_observation(
|
| 93 |
+
self, reward: Optional[float], done: bool,
|
| 94 |
+
) -> SevZeroObservation:
|
| 95 |
+
sim = self._sim
|
| 96 |
+
return SevZeroObservation(
|
| 97 |
+
done=done,
|
| 98 |
+
reward=reward,
|
| 99 |
+
# Episode context
|
| 100 |
+
tick=sim.tick,
|
| 101 |
+
episode_id=self._episode_id,
|
| 102 |
+
task_id=self._task_id,
|
| 103 |
+
status=sim.termination_reason or "playing",
|
| 104 |
+
max_steps=sim.max_steps,
|
| 105 |
+
# Health summary
|
| 106 |
+
global_slo_score=round(sim.get_slo_score(), 4),
|
| 107 |
+
observation_summary=sim.get_observation_summary(),
|
| 108 |
+
# Per-service state
|
| 109 |
+
services=sim.get_service_observations(),
|
| 110 |
+
# Alerts
|
| 111 |
+
alerts=sim.get_alerts(),
|
| 112 |
+
# Context
|
| 113 |
+
recent_deploys=[d for d in sim.deploys if d["ticks_ago"] <= 10],
|
| 114 |
+
actions_taken=sim.actions_taken[-10:],
|
| 115 |
+
# Action space
|
| 116 |
+
legal_actions=sim.get_legal_actions(),
|
| 117 |
+
# Diagnostics
|
| 118 |
+
logs=sim.last_logs,
|
| 119 |
+
metric_history=sim.last_metric_history,
|
| 120 |
+
traces=sim.last_traces,
|
| 121 |
+
)
|
server/grader.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/grader.py — Deterministic grading for SevZero episodes.
|
| 3 |
+
|
| 4 |
+
Score formula:
|
| 5 |
+
score = slo_recovery * 0.70 + action_efficiency * 0.15 + time_efficiency * 0.15
|
| 6 |
+
|
| 7 |
+
All inputs are derived from the episode state — fully deterministic.
|
| 8 |
+
Score is continuous 0.0–1.0 with partial credit.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from dataclasses import dataclass
|
| 14 |
+
from typing import Any, Dict, List, Optional
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@dataclass
|
| 18 |
+
class GradeResult:
|
| 19 |
+
"""Grading result with breakdown."""
|
| 20 |
+
score: float
|
| 21 |
+
slo_recovery: float
|
| 22 |
+
action_efficiency: float
|
| 23 |
+
time_efficiency: float
|
| 24 |
+
details: Dict[str, Any]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def grade_episode(
|
| 28 |
+
final_slo_score: float,
|
| 29 |
+
steps_taken: int,
|
| 30 |
+
max_steps: int,
|
| 31 |
+
actions_taken: List[Dict[str, Any]],
|
| 32 |
+
terminated: bool,
|
| 33 |
+
termination_reason: Optional[str],
|
| 34 |
+
) -> GradeResult:
|
| 35 |
+
"""
|
| 36 |
+
Grade a completed episode.
|
| 37 |
+
|
| 38 |
+
Args:
|
| 39 |
+
final_slo_score: fraction of services meeting SLO at episode end (0.0–1.0)
|
| 40 |
+
steps_taken: number of steps the agent took
|
| 41 |
+
max_steps: maximum allowed steps for this task
|
| 42 |
+
actions_taken: list of action records
|
| 43 |
+
terminated: whether the episode ended
|
| 44 |
+
termination_reason: "resolved" | "timeout" | "failed" | None
|
| 45 |
+
"""
|
| 46 |
+
# --- SLO recovery (70%) ---
|
| 47 |
+
# Direct fraction of services recovered
|
| 48 |
+
slo_recovery = final_slo_score
|
| 49 |
+
|
| 50 |
+
# Bonus for full resolution
|
| 51 |
+
if termination_reason == "resolved":
|
| 52 |
+
slo_recovery = 1.0
|
| 53 |
+
|
| 54 |
+
# --- Action efficiency (15%) ---
|
| 55 |
+
# Penalize wasted actions (noops when degraded, failed actions, redundant inspects)
|
| 56 |
+
total_actions = len(actions_taken)
|
| 57 |
+
if total_actions == 0:
|
| 58 |
+
action_efficiency = 0.0
|
| 59 |
+
else:
|
| 60 |
+
successful = sum(1 for a in actions_taken if a.get("success", False))
|
| 61 |
+
remediation_actions = sum(
|
| 62 |
+
1 for a in actions_taken
|
| 63 |
+
if a.get("action") not in ("inspect_logs", "inspect_metrics", "inspect_traces", "noop")
|
| 64 |
+
and a.get("success", False)
|
| 65 |
+
)
|
| 66 |
+
inspect_actions = sum(
|
| 67 |
+
1 for a in actions_taken
|
| 68 |
+
if a.get("action") in ("inspect_logs", "inspect_metrics", "inspect_traces")
|
| 69 |
+
)
|
| 70 |
+
|
| 71 |
+
# Good ratio: some inspection + targeted remediation
|
| 72 |
+
success_rate = successful / total_actions
|
| 73 |
+
# Penalize excessive inspections (>50% of budget is too much looking, not enough doing)
|
| 74 |
+
inspect_penalty = max(0.0, (inspect_actions / total_actions) - 0.5) if total_actions > 0 else 0.0
|
| 75 |
+
action_efficiency = max(0.0, success_rate - inspect_penalty)
|
| 76 |
+
|
| 77 |
+
# --- Time efficiency (15%) ---
|
| 78 |
+
# Faster resolution = higher score
|
| 79 |
+
if max_steps == 0:
|
| 80 |
+
time_efficiency = 0.0
|
| 81 |
+
elif termination_reason == "resolved":
|
| 82 |
+
# Resolved: reward faster resolution
|
| 83 |
+
time_efficiency = max(0.1, 1.0 - (steps_taken / max_steps))
|
| 84 |
+
else:
|
| 85 |
+
# Not resolved: partial credit based on how close we got
|
| 86 |
+
time_efficiency = final_slo_score * 0.3
|
| 87 |
+
|
| 88 |
+
# --- Final score ---
|
| 89 |
+
score = (
|
| 90 |
+
slo_recovery * 0.70
|
| 91 |
+
+ action_efficiency * 0.15
|
| 92 |
+
+ time_efficiency * 0.15
|
| 93 |
+
)
|
| 94 |
+
score = max(0.0, min(1.0, round(score, 4)))
|
| 95 |
+
|
| 96 |
+
return GradeResult(
|
| 97 |
+
score=score,
|
| 98 |
+
slo_recovery=round(slo_recovery, 4),
|
| 99 |
+
action_efficiency=round(action_efficiency, 4),
|
| 100 |
+
time_efficiency=round(time_efficiency, 4),
|
| 101 |
+
details={
|
| 102 |
+
"final_slo_score": round(final_slo_score, 4),
|
| 103 |
+
"steps_taken": steps_taken,
|
| 104 |
+
"max_steps": max_steps,
|
| 105 |
+
"termination_reason": termination_reason,
|
| 106 |
+
"total_actions": len(actions_taken),
|
| 107 |
+
},
|
| 108 |
+
)
|
server/scenarios.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/scenarios.py — Procedural scenario generation from seed + difficulty.
|
| 3 |
+
|
| 4 |
+
Maps difficulty to graph topology, failure count, and failure placement.
|
| 5 |
+
Same seed + same difficulty = identical scenario every time.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import random
|
| 11 |
+
from dataclasses import dataclass, field
|
| 12 |
+
from typing import List, Optional
|
| 13 |
+
|
| 14 |
+
from server.failures import (
|
| 15 |
+
FailureSpec,
|
| 16 |
+
FailureType,
|
| 17 |
+
make_failure_spec,
|
| 18 |
+
select_failure_type,
|
| 19 |
+
select_multi_root_failures,
|
| 20 |
+
)
|
| 21 |
+
from server.graph import ServiceGraph, generate_graph
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
|
| 25 |
+
class ScenarioConfig:
|
| 26 |
+
"""Complete scenario definition for one episode."""
|
| 27 |
+
difficulty: str
|
| 28 |
+
seed: int
|
| 29 |
+
graph: ServiceGraph
|
| 30 |
+
failure_specs: List[FailureSpec]
|
| 31 |
+
max_steps: int
|
| 32 |
+
description: str
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
# Task definitions (the 3 required tasks)
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
|
| 39 |
+
TASK_DEFINITIONS = [
|
| 40 |
+
{
|
| 41 |
+
"task_id": "easy",
|
| 42 |
+
"name": "Single Service Outage",
|
| 43 |
+
"difficulty": "easy",
|
| 44 |
+
"description": (
|
| 45 |
+
"A single service in a small linear microservice chain is experiencing failures. "
|
| 46 |
+
"Diagnose the root cause and apply the correct remediation within 10 steps."
|
| 47 |
+
),
|
| 48 |
+
"max_steps": 10,
|
| 49 |
+
"num_failures": 1,
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"task_id": "medium",
|
| 53 |
+
"name": "Cascading Failure",
|
| 54 |
+
"difficulty": "medium",
|
| 55 |
+
"description": (
|
| 56 |
+
"A failure in a shared infrastructure service is cascading through a branching "
|
| 57 |
+
"dependency graph. Trace the root cause upstream from symptomatic services and "
|
| 58 |
+
"remediate within 20 steps."
|
| 59 |
+
),
|
| 60 |
+
"max_steps": 20,
|
| 61 |
+
"num_failures": 1,
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"task_id": "hard",
|
| 65 |
+
"name": "Multi-Root Sev-0 Incident",
|
| 66 |
+
"difficulty": "hard",
|
| 67 |
+
"description": (
|
| 68 |
+
"Multiple simultaneous failures across a multi-region microservice architecture. "
|
| 69 |
+
"Failures may have conflicting mitigations. Triage, diagnose, and resolve all "
|
| 70 |
+
"root causes within 50 steps."
|
| 71 |
+
),
|
| 72 |
+
"max_steps": 50,
|
| 73 |
+
"num_failures": 3,
|
| 74 |
+
},
|
| 75 |
+
]
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def get_task_definition(task_id: str) -> dict:
|
| 79 |
+
"""Get a task definition by ID."""
|
| 80 |
+
for t in TASK_DEFINITIONS:
|
| 81 |
+
if t["task_id"] == task_id:
|
| 82 |
+
return t
|
| 83 |
+
raise ValueError(f"Unknown task_id: {task_id!r}. Must be one of: easy, medium, hard")
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# Failure placement logic
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _pick_failure_target(
|
| 92 |
+
graph: ServiceGraph,
|
| 93 |
+
failure_type: FailureType,
|
| 94 |
+
rng: random.Random,
|
| 95 |
+
exclude: set,
|
| 96 |
+
) -> Optional[str]:
|
| 97 |
+
"""Pick an appropriate service to inject this failure type into."""
|
| 98 |
+
candidates = []
|
| 99 |
+
|
| 100 |
+
for node in graph.nodes:
|
| 101 |
+
if node.id in exclude:
|
| 102 |
+
continue
|
| 103 |
+
|
| 104 |
+
# Cache failures only on cache services
|
| 105 |
+
if failure_type == FailureType.CACHE_FAILURE:
|
| 106 |
+
if node.is_cache:
|
| 107 |
+
candidates.append(node.id)
|
| 108 |
+
continue
|
| 109 |
+
|
| 110 |
+
# DB degradation on infra services (postgres, etc.)
|
| 111 |
+
if failure_type == FailureType.DB_DEGRADATION:
|
| 112 |
+
if node.layer == "infra" and "postgres" in node.id:
|
| 113 |
+
candidates.append(node.id)
|
| 114 |
+
continue
|
| 115 |
+
|
| 116 |
+
# Network errors prefer non-edge services
|
| 117 |
+
if failure_type == FailureType.NETWORK_ERROR:
|
| 118 |
+
if node.layer != "edge":
|
| 119 |
+
candidates.append(node.id)
|
| 120 |
+
continue
|
| 121 |
+
|
| 122 |
+
# Config errors on any non-edge service
|
| 123 |
+
if failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
|
| 124 |
+
if node.layer != "edge":
|
| 125 |
+
candidates.append(node.id)
|
| 126 |
+
continue
|
| 127 |
+
|
| 128 |
+
# Bad deploy on business or identity services
|
| 129 |
+
if failure_type == FailureType.BAD_DEPLOY:
|
| 130 |
+
if node.layer in ("business", "identity"):
|
| 131 |
+
candidates.append(node.id)
|
| 132 |
+
continue
|
| 133 |
+
|
| 134 |
+
# Resource leak on business services
|
| 135 |
+
if failure_type == FailureType.RESOURCE_LEAK:
|
| 136 |
+
if node.layer in ("business", "identity"):
|
| 137 |
+
candidates.append(node.id)
|
| 138 |
+
continue
|
| 139 |
+
|
| 140 |
+
# Crash on any non-edge service
|
| 141 |
+
if failure_type == FailureType.CRASH:
|
| 142 |
+
if node.layer != "edge":
|
| 143 |
+
candidates.append(node.id)
|
| 144 |
+
continue
|
| 145 |
+
|
| 146 |
+
# Cascading latency: prefer hotspot infra or busy business
|
| 147 |
+
if failure_type == FailureType.CASCADING_LATENCY:
|
| 148 |
+
if node.is_hotspot or node.layer == "business":
|
| 149 |
+
candidates.append(node.id)
|
| 150 |
+
continue
|
| 151 |
+
|
| 152 |
+
if not candidates:
|
| 153 |
+
# Fallback: any non-edge service
|
| 154 |
+
candidates = [n.id for n in graph.nodes if n.layer != "edge" and n.id not in exclude]
|
| 155 |
+
|
| 156 |
+
if not candidates:
|
| 157 |
+
return None
|
| 158 |
+
|
| 159 |
+
return rng.choice(candidates)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
# ---------------------------------------------------------------------------
|
| 163 |
+
# Scenario generation
|
| 164 |
+
# ---------------------------------------------------------------------------
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def generate_scenario(seed: int, task_id: str) -> ScenarioConfig:
|
| 168 |
+
"""
|
| 169 |
+
Generate a complete scenario for the given task and seed.
|
| 170 |
+
Deterministic: same seed + same task_id = identical scenario.
|
| 171 |
+
"""
|
| 172 |
+
task = get_task_definition(task_id)
|
| 173 |
+
rng = random.Random(seed)
|
| 174 |
+
|
| 175 |
+
# Generate graph
|
| 176 |
+
difficulty = task["difficulty"]
|
| 177 |
+
graph = generate_graph(difficulty, rng)
|
| 178 |
+
|
| 179 |
+
# Select and place failures
|
| 180 |
+
num_failures = task["num_failures"]
|
| 181 |
+
used_services: set = set()
|
| 182 |
+
failure_specs: List[FailureSpec] = []
|
| 183 |
+
|
| 184 |
+
if num_failures == 1:
|
| 185 |
+
ft = select_failure_type(rng)
|
| 186 |
+
target = _pick_failure_target(graph, ft, rng, used_services)
|
| 187 |
+
if target:
|
| 188 |
+
spec = make_failure_spec(target, ft, rng)
|
| 189 |
+
failure_specs.append(spec)
|
| 190 |
+
used_services.add(target)
|
| 191 |
+
else:
|
| 192 |
+
failure_types = select_multi_root_failures(rng, count=num_failures)
|
| 193 |
+
for ft in failure_types:
|
| 194 |
+
target = _pick_failure_target(graph, ft, rng, used_services)
|
| 195 |
+
if target:
|
| 196 |
+
spec = make_failure_spec(target, ft, rng)
|
| 197 |
+
failure_specs.append(spec)
|
| 198 |
+
used_services.add(target)
|
| 199 |
+
|
| 200 |
+
return ScenarioConfig(
|
| 201 |
+
difficulty=difficulty,
|
| 202 |
+
seed=seed,
|
| 203 |
+
graph=graph,
|
| 204 |
+
failure_specs=failure_specs,
|
| 205 |
+
max_steps=task["max_steps"],
|
| 206 |
+
description=task["description"],
|
| 207 |
+
)
|
server/simulator.py
ADDED
|
@@ -0,0 +1,965 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/simulator.py — Core discrete-event simulation engine.
|
| 3 |
+
|
| 4 |
+
Orchestrates the service graph, failure injection, metric evolution,
|
| 5 |
+
propagation, log generation, and trace generation into a coherent
|
| 6 |
+
per-tick simulation loop.
|
| 7 |
+
|
| 8 |
+
Fully deterministic: random.Random(seed) exclusively.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import random
|
| 14 |
+
from dataclasses import dataclass, field
|
| 15 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 16 |
+
|
| 17 |
+
from server.failures import (
|
| 18 |
+
FailureSpec,
|
| 19 |
+
FailureType,
|
| 20 |
+
apply_failure_to_metrics,
|
| 21 |
+
make_failure_spec,
|
| 22 |
+
)
|
| 23 |
+
from server.graph import ServiceGraph, ServiceNode, generate_graph
|
| 24 |
+
from server.logs import generate_healthy_log, generate_log_message
|
| 25 |
+
from server.propagation import (
|
| 26 |
+
CircuitBreaker,
|
| 27 |
+
ServiceRuntimeState,
|
| 28 |
+
propagate_failures,
|
| 29 |
+
)
|
| 30 |
+
from server.traces import generate_trace
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
# SLO targets
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
|
| 37 |
+
# Per-difficulty SLO thresholds: a service is "meeting SLO" if ALL conditions hold
|
| 38 |
+
SLO_TARGETS = {
|
| 39 |
+
"easy": {"max_error_rate": 0.05, "max_p99_ms": 500, "max_cpu": 85, "max_memory": 90},
|
| 40 |
+
"medium": {"max_error_rate": 0.05, "max_p99_ms": 1000, "max_cpu": 90, "max_memory": 90},
|
| 41 |
+
"hard": {"max_error_rate": 0.05, "max_p99_ms": 2000, "max_cpu": 95, "max_memory": 95},
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _service_meets_slo(state: ServiceRuntimeState, difficulty: str) -> bool:
|
| 46 |
+
targets = SLO_TARGETS[difficulty]
|
| 47 |
+
return (
|
| 48 |
+
state.error_rate <= targets["max_error_rate"]
|
| 49 |
+
and state.latency_p99_ms <= targets["max_p99_ms"]
|
| 50 |
+
and state.cpu_pct <= targets["max_cpu"]
|
| 51 |
+
and state.memory_pct <= targets["max_memory"]
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# Pending action effects (delayed remediation)
|
| 57 |
+
# ---------------------------------------------------------------------------
|
| 58 |
+
|
| 59 |
+
@dataclass
|
| 60 |
+
class PendingEffect:
|
| 61 |
+
"""A remediation action effect that resolves after a delay."""
|
| 62 |
+
action_type: str
|
| 63 |
+
target_service: str
|
| 64 |
+
params: Dict[str, Any]
|
| 65 |
+
resolve_tick: int # Tick at which this effect takes place
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ---------------------------------------------------------------------------
|
| 69 |
+
# Simulator
|
| 70 |
+
# ---------------------------------------------------------------------------
|
| 71 |
+
|
| 72 |
+
@dataclass
|
| 73 |
+
class Simulator:
|
| 74 |
+
"""
|
| 75 |
+
Core simulation engine.
|
| 76 |
+
|
| 77 |
+
Usage:
|
| 78 |
+
sim = Simulator()
|
| 79 |
+
obs_data = sim.reset(seed=42, difficulty="easy")
|
| 80 |
+
obs_data = sim.step(action_type="inspect_logs", params={"service_id": "order-service"})
|
| 81 |
+
"""
|
| 82 |
+
|
| 83 |
+
# --- Graph and topology ---
|
| 84 |
+
graph: Optional[ServiceGraph] = None
|
| 85 |
+
difficulty: str = "easy"
|
| 86 |
+
|
| 87 |
+
# --- Mutable per-service state ---
|
| 88 |
+
services: Dict[str, ServiceRuntimeState] = field(default_factory=dict)
|
| 89 |
+
|
| 90 |
+
# --- Failure injection ---
|
| 91 |
+
failures: List[FailureSpec] = field(default_factory=list)
|
| 92 |
+
failure_onset_tick: Dict[str, int] = field(default_factory=dict) # service_id → tick failure started
|
| 93 |
+
|
| 94 |
+
# --- Simulation state ---
|
| 95 |
+
tick: int = 0
|
| 96 |
+
max_steps: int = 10
|
| 97 |
+
terminated: bool = False
|
| 98 |
+
termination_reason: Optional[str] = None
|
| 99 |
+
|
| 100 |
+
# --- Pending remediation effects ---
|
| 101 |
+
pending_effects: List[PendingEffect] = field(default_factory=list)
|
| 102 |
+
|
| 103 |
+
# --- Action history ---
|
| 104 |
+
actions_taken: List[Dict[str, Any]] = field(default_factory=list)
|
| 105 |
+
|
| 106 |
+
# --- Deploy history ---
|
| 107 |
+
deploys: List[Dict[str, Any]] = field(default_factory=list)
|
| 108 |
+
|
| 109 |
+
# --- Diagnostic output (from inspect_* actions, consumed by observation builder) ---
|
| 110 |
+
last_logs: Optional[str] = None
|
| 111 |
+
last_metric_history: Optional[List[Dict[str, Any]]] = None
|
| 112 |
+
last_traces: Optional[Dict[str, Any]] = None
|
| 113 |
+
|
| 114 |
+
# --- Metric history per service (for inspect_metrics) ---
|
| 115 |
+
metric_history: Dict[str, List[Dict[str, Any]]] = field(default_factory=dict)
|
| 116 |
+
|
| 117 |
+
# --- RNG ---
|
| 118 |
+
rng: random.Random = field(default_factory=random.Random)
|
| 119 |
+
|
| 120 |
+
# --- Remediation tracking ---
|
| 121 |
+
remediated_services: Dict[str, int] = field(default_factory=dict) # service_id → tick remediated
|
| 122 |
+
|
| 123 |
+
def reset(
|
| 124 |
+
self,
|
| 125 |
+
seed: int,
|
| 126 |
+
difficulty: str,
|
| 127 |
+
failure_specs: Optional[List[FailureSpec]] = None,
|
| 128 |
+
) -> None:
|
| 129 |
+
"""Initialize a new episode. Call get_observation() after this."""
|
| 130 |
+
self.rng = random.Random(seed)
|
| 131 |
+
self.difficulty = difficulty
|
| 132 |
+
self.tick = 0
|
| 133 |
+
self.terminated = False
|
| 134 |
+
self.termination_reason = None
|
| 135 |
+
self.pending_effects = []
|
| 136 |
+
self.actions_taken = []
|
| 137 |
+
self.deploys = []
|
| 138 |
+
self.last_logs = None
|
| 139 |
+
self.last_metric_history = None
|
| 140 |
+
self.last_traces = None
|
| 141 |
+
self.metric_history = {}
|
| 142 |
+
self.remediated_services = {}
|
| 143 |
+
|
| 144 |
+
# Step budgets
|
| 145 |
+
budgets = {"easy": 10, "medium": 20, "hard": 50}
|
| 146 |
+
self.max_steps = budgets.get(difficulty, 10)
|
| 147 |
+
|
| 148 |
+
# Generate graph
|
| 149 |
+
self.graph = generate_graph(difficulty, self.rng)
|
| 150 |
+
|
| 151 |
+
# Initialize runtime state for each service
|
| 152 |
+
self.services = {}
|
| 153 |
+
for node in self.graph.nodes:
|
| 154 |
+
state = ServiceRuntimeState(
|
| 155 |
+
service_id=node.id,
|
| 156 |
+
arrival_rate=node.base_arrival_rate,
|
| 157 |
+
service_time_local=node.base_service_time_local,
|
| 158 |
+
thread_pool_size=node.thread_pool_size,
|
| 159 |
+
replicas=node.default_replicas,
|
| 160 |
+
version=node.default_version,
|
| 161 |
+
timeout_ms=node.default_timeout_ms,
|
| 162 |
+
retry_max=node.default_retry_max,
|
| 163 |
+
retry_backoff=node.default_retry_backoff,
|
| 164 |
+
pool_size=node.default_pool_size,
|
| 165 |
+
)
|
| 166 |
+
# Initialize circuit breakers for dependencies
|
| 167 |
+
for dep_id in self.graph.adjacency.get(node.id, []):
|
| 168 |
+
state.circuit_breakers[dep_id] = CircuitBreaker(
|
| 169 |
+
error_threshold=node.default_circuit_breaker_threshold,
|
| 170 |
+
)
|
| 171 |
+
self.services[state.service_id] = state
|
| 172 |
+
self.metric_history[state.service_id] = []
|
| 173 |
+
|
| 174 |
+
# Inject failures
|
| 175 |
+
self.failures = failure_specs or []
|
| 176 |
+
self.failure_onset_tick = {}
|
| 177 |
+
for spec in self.failures:
|
| 178 |
+
self.failure_onset_tick[spec.service_id] = 0
|
| 179 |
+
svc = self.services.get(spec.service_id)
|
| 180 |
+
if svc:
|
| 181 |
+
svc.has_active_failure = True
|
| 182 |
+
# Apply bad deploy version
|
| 183 |
+
if spec.failure_type == FailureType.BAD_DEPLOY and spec.bad_version:
|
| 184 |
+
svc.previous_version = svc.version
|
| 185 |
+
svc.version = spec.bad_version
|
| 186 |
+
self.deploys.append({
|
| 187 |
+
"service": spec.service_id,
|
| 188 |
+
"version": spec.bad_version,
|
| 189 |
+
"ticks_ago": 0,
|
| 190 |
+
})
|
| 191 |
+
|
| 192 |
+
# Run initial tick of failure evolution
|
| 193 |
+
self._evolve_failures()
|
| 194 |
+
self._run_propagation()
|
| 195 |
+
self._record_metrics()
|
| 196 |
+
|
| 197 |
+
def step(self, action_type: str, params: Dict[str, Any]) -> float:
|
| 198 |
+
"""
|
| 199 |
+
Execute one agent action and advance the simulation by one tick.
|
| 200 |
+
Returns the step reward (dense Δ-SLO shaping).
|
| 201 |
+
"""
|
| 202 |
+
if self.terminated:
|
| 203 |
+
return 0.0
|
| 204 |
+
|
| 205 |
+
prev_slo = self.get_slo_score()
|
| 206 |
+
|
| 207 |
+
# Clear diagnostic output from previous step
|
| 208 |
+
self.last_logs = None
|
| 209 |
+
self.last_metric_history = None
|
| 210 |
+
self.last_traces = None
|
| 211 |
+
|
| 212 |
+
# Process the action
|
| 213 |
+
action_record = self._process_action(action_type, params)
|
| 214 |
+
self.actions_taken.append(action_record)
|
| 215 |
+
|
| 216 |
+
# Advance tick
|
| 217 |
+
self.tick += 1
|
| 218 |
+
|
| 219 |
+
# Resolve pending effects
|
| 220 |
+
self._resolve_pending_effects()
|
| 221 |
+
|
| 222 |
+
# Evolve failures (for non-remediated services)
|
| 223 |
+
self._evolve_failures()
|
| 224 |
+
|
| 225 |
+
# Run propagation
|
| 226 |
+
self._run_propagation()
|
| 227 |
+
|
| 228 |
+
# Record metric history
|
| 229 |
+
self._record_metrics()
|
| 230 |
+
|
| 231 |
+
# Update deploy ticks_ago
|
| 232 |
+
for d in self.deploys:
|
| 233 |
+
d["ticks_ago"] += 1
|
| 234 |
+
|
| 235 |
+
# Compute reward
|
| 236 |
+
new_slo = self.get_slo_score()
|
| 237 |
+
reward = self._compute_reward(prev_slo, new_slo, action_type, action_record)
|
| 238 |
+
|
| 239 |
+
# Check termination
|
| 240 |
+
self._check_termination()
|
| 241 |
+
|
| 242 |
+
return reward
|
| 243 |
+
|
| 244 |
+
# -------------------------------------------------------------------
|
| 245 |
+
# Action processing
|
| 246 |
+
# -------------------------------------------------------------------
|
| 247 |
+
|
| 248 |
+
def _process_action(self, action_type: str, params: Dict[str, Any]) -> Dict[str, Any]:
|
| 249 |
+
"""Process an agent action. Returns an action record dict."""
|
| 250 |
+
service_id = params.get("service_id")
|
| 251 |
+
record = {
|
| 252 |
+
"tick": self.tick,
|
| 253 |
+
"action": action_type,
|
| 254 |
+
"target": service_id,
|
| 255 |
+
"success": False,
|
| 256 |
+
"note": None,
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
if action_type == "noop":
|
| 260 |
+
record["success"] = True
|
| 261 |
+
record["note"] = "Waited and observed"
|
| 262 |
+
return record
|
| 263 |
+
|
| 264 |
+
if action_type == "inspect_logs":
|
| 265 |
+
return self._do_inspect_logs(service_id, record)
|
| 266 |
+
elif action_type == "inspect_metrics":
|
| 267 |
+
return self._do_inspect_metrics(service_id, record)
|
| 268 |
+
elif action_type == "inspect_traces":
|
| 269 |
+
return self._do_inspect_traces(service_id, record)
|
| 270 |
+
elif action_type == "restart_service":
|
| 271 |
+
return self._do_restart(service_id, record)
|
| 272 |
+
elif action_type == "rollback_service":
|
| 273 |
+
return self._do_rollback(service_id, record)
|
| 274 |
+
elif action_type == "scale_service":
|
| 275 |
+
return self._do_scale(service_id, params, record)
|
| 276 |
+
elif action_type == "tune_config":
|
| 277 |
+
return self._do_tune_config(service_id, params, record)
|
| 278 |
+
elif action_type == "clear_cache":
|
| 279 |
+
return self._do_clear_cache(params, record)
|
| 280 |
+
elif action_type == "rebalance_traffic":
|
| 281 |
+
return self._do_rebalance_traffic(params, record)
|
| 282 |
+
elif action_type == "pause_job":
|
| 283 |
+
return self._do_pause_job(params, record)
|
| 284 |
+
else:
|
| 285 |
+
record["note"] = f"Unknown action type: {action_type}"
|
| 286 |
+
return record
|
| 287 |
+
|
| 288 |
+
def _do_inspect_logs(self, service_id: Optional[str], record: Dict) -> Dict:
|
| 289 |
+
svc = self.services.get(service_id or "")
|
| 290 |
+
if not svc:
|
| 291 |
+
record["note"] = f"Service '{service_id}' not found"
|
| 292 |
+
return record
|
| 293 |
+
|
| 294 |
+
record["success"] = True
|
| 295 |
+
# Generate log output based on service state
|
| 296 |
+
logs_lines = []
|
| 297 |
+
failure = self._get_failure_for_service(service_id)
|
| 298 |
+
if failure and svc.error_rate > 0.01:
|
| 299 |
+
dep = self._get_primary_dependency(service_id)
|
| 300 |
+
for _ in range(self.rng.randint(3, 6)):
|
| 301 |
+
logs_lines.append(generate_log_message(
|
| 302 |
+
failure.failure_type, service_id, self.rng,
|
| 303 |
+
dependency=dep,
|
| 304 |
+
error_rate=svc.error_rate,
|
| 305 |
+
memory_pct=svc.memory_pct,
|
| 306 |
+
p99_ms=svc.latency_p99_ms,
|
| 307 |
+
pool_pct=svc.connection_pool_usage_pct,
|
| 308 |
+
version=svc.version,
|
| 309 |
+
config_key=failure.broken_config_key or "unknown",
|
| 310 |
+
config_value=failure.broken_config_value or "unknown",
|
| 311 |
+
region=self.graph.node_map[service_id].region if self.graph and service_id in self.graph.node_map else "us-east-1",
|
| 312 |
+
throughput=svc.throughput_rps,
|
| 313 |
+
))
|
| 314 |
+
elif svc.error_rate > 0.01:
|
| 315 |
+
# Propagated errors — show upstream dependency issues
|
| 316 |
+
dep = self._get_primary_dependency(service_id)
|
| 317 |
+
logs_lines.append(f"WARN {service_id} Elevated error rate: {svc.error_rate*100:.1f}%. Upstream dependency {dep} may be degraded.")
|
| 318 |
+
logs_lines.append(f"ERROR {service_id} Request to {dep} failed: timeout after {svc.timeout_ms}ms. Retry 1/{svc.retry_max}.")
|
| 319 |
+
else:
|
| 320 |
+
logs_lines.append(generate_healthy_log(service_id, self.rng))
|
| 321 |
+
|
| 322 |
+
self.last_logs = "\n".join(logs_lines)
|
| 323 |
+
return record
|
| 324 |
+
|
| 325 |
+
def _do_inspect_metrics(self, service_id: Optional[str], record: Dict) -> Dict:
|
| 326 |
+
svc = self.services.get(service_id or "")
|
| 327 |
+
if not svc:
|
| 328 |
+
record["note"] = f"Service '{service_id}' not found"
|
| 329 |
+
return record
|
| 330 |
+
|
| 331 |
+
record["success"] = True
|
| 332 |
+
self.last_metric_history = self.metric_history.get(service_id, [])[-10:]
|
| 333 |
+
return record
|
| 334 |
+
|
| 335 |
+
def _do_inspect_traces(self, service_id: Optional[str], record: Dict) -> Dict:
|
| 336 |
+
svc = self.services.get(service_id or "")
|
| 337 |
+
if not svc or not self.graph:
|
| 338 |
+
record["note"] = f"Service '{service_id}' not found"
|
| 339 |
+
return record
|
| 340 |
+
|
| 341 |
+
record["success"] = True
|
| 342 |
+
errors = {sid: s.error_rate for sid, s in self.services.items()}
|
| 343 |
+
latencies = {sid: s.latency_p99_ms for sid, s in self.services.items()}
|
| 344 |
+
self.last_traces = generate_trace(
|
| 345 |
+
service_id, self.graph, errors, latencies, self.rng,
|
| 346 |
+
)
|
| 347 |
+
return record
|
| 348 |
+
|
| 349 |
+
def _do_restart(self, service_id: Optional[str], record: Dict) -> Dict:
|
| 350 |
+
svc = self.services.get(service_id or "")
|
| 351 |
+
if not svc:
|
| 352 |
+
record["note"] = f"Service '{service_id}' not found"
|
| 353 |
+
return record
|
| 354 |
+
|
| 355 |
+
failure = self._get_failure_for_service(service_id)
|
| 356 |
+
# Restart fixes: CRASH, RESOURCE_LEAK (temporarily), CONFIG_STARTUP (if config was fixed)
|
| 357 |
+
if failure and failure.failure_type in (FailureType.CRASH, FailureType.RESOURCE_LEAK):
|
| 358 |
+
delay = self.rng.randint(1, 2)
|
| 359 |
+
self.pending_effects.append(PendingEffect(
|
| 360 |
+
action_type="restart_service",
|
| 361 |
+
target_service=service_id,
|
| 362 |
+
params={},
|
| 363 |
+
resolve_tick=self.tick + delay,
|
| 364 |
+
))
|
| 365 |
+
record["success"] = True
|
| 366 |
+
record["note"] = f"Restarting {service_id}, effect in {delay} tick(s)"
|
| 367 |
+
elif failure and failure.failure_type == FailureType.CONFIG_STARTUP:
|
| 368 |
+
# Config startup: restart alone doesn't fix it (need tune_config first)
|
| 369 |
+
record["success"] = True
|
| 370 |
+
record["note"] = f"Restarted {service_id} but config error persists — fix config first"
|
| 371 |
+
elif failure:
|
| 372 |
+
# Restart gives temporary relief for other failures
|
| 373 |
+
delay = self.rng.randint(1, 2)
|
| 374 |
+
self.pending_effects.append(PendingEffect(
|
| 375 |
+
action_type="restart_partial",
|
| 376 |
+
target_service=service_id,
|
| 377 |
+
params={},
|
| 378 |
+
resolve_tick=self.tick + delay,
|
| 379 |
+
))
|
| 380 |
+
record["success"] = True
|
| 381 |
+
record["note"] = f"Restarting {service_id}, partial recovery expected in {delay} tick(s)"
|
| 382 |
+
else:
|
| 383 |
+
record["success"] = True
|
| 384 |
+
record["note"] = f"{service_id} is healthy, restart had no effect"
|
| 385 |
+
return record
|
| 386 |
+
|
| 387 |
+
def _do_rollback(self, service_id: Optional[str], record: Dict) -> Dict:
|
| 388 |
+
svc = self.services.get(service_id or "")
|
| 389 |
+
if not svc:
|
| 390 |
+
record["note"] = f"Service '{service_id}' not found"
|
| 391 |
+
return record
|
| 392 |
+
|
| 393 |
+
if not svc.previous_version:
|
| 394 |
+
record["note"] = f"No previous version to rollback to for {service_id}"
|
| 395 |
+
return record
|
| 396 |
+
|
| 397 |
+
failure = self._get_failure_for_service(service_id)
|
| 398 |
+
if failure and failure.failure_type == FailureType.BAD_DEPLOY:
|
| 399 |
+
delay = self.rng.randint(2, 3)
|
| 400 |
+
self.pending_effects.append(PendingEffect(
|
| 401 |
+
action_type="rollback_service",
|
| 402 |
+
target_service=service_id,
|
| 403 |
+
params={"version": svc.previous_version},
|
| 404 |
+
resolve_tick=self.tick + delay,
|
| 405 |
+
))
|
| 406 |
+
record["success"] = True
|
| 407 |
+
record["note"] = f"Rolling back {service_id} to {svc.previous_version}, effect in {delay} tick(s)"
|
| 408 |
+
else:
|
| 409 |
+
record["success"] = True
|
| 410 |
+
record["note"] = f"Rollback queued for {service_id} but issue may not be deploy-related"
|
| 411 |
+
delay = self.rng.randint(2, 3)
|
| 412 |
+
self.pending_effects.append(PendingEffect(
|
| 413 |
+
action_type="rollback_service",
|
| 414 |
+
target_service=service_id,
|
| 415 |
+
params={"version": svc.previous_version},
|
| 416 |
+
resolve_tick=self.tick + delay,
|
| 417 |
+
))
|
| 418 |
+
return record
|
| 419 |
+
|
| 420 |
+
def _do_scale(self, service_id: Optional[str], params: Dict, record: Dict) -> Dict:
|
| 421 |
+
svc = self.services.get(service_id or "")
|
| 422 |
+
if not svc:
|
| 423 |
+
record["note"] = f"Service '{service_id}' not found"
|
| 424 |
+
return record
|
| 425 |
+
|
| 426 |
+
target_replicas = params.get("replicas", svc.replicas + 1)
|
| 427 |
+
node = self.graph.node_map.get(service_id) if self.graph else None
|
| 428 |
+
max_r = node.max_replicas if node else 8
|
| 429 |
+
target_replicas = max(1, min(target_replicas, max_r))
|
| 430 |
+
|
| 431 |
+
delay = self.rng.randint(2, 4)
|
| 432 |
+
self.pending_effects.append(PendingEffect(
|
| 433 |
+
action_type="scale_service",
|
| 434 |
+
target_service=service_id,
|
| 435 |
+
params={"replicas": target_replicas},
|
| 436 |
+
resolve_tick=self.tick + delay,
|
| 437 |
+
))
|
| 438 |
+
record["success"] = True
|
| 439 |
+
record["note"] = f"Scaling {service_id} to {target_replicas} replicas, effect in {delay} tick(s)"
|
| 440 |
+
return record
|
| 441 |
+
|
| 442 |
+
def _do_tune_config(self, service_id: Optional[str], params: Dict, record: Dict) -> Dict:
|
| 443 |
+
svc = self.services.get(service_id or "")
|
| 444 |
+
if not svc:
|
| 445 |
+
record["note"] = f"Service '{service_id}' not found"
|
| 446 |
+
return record
|
| 447 |
+
|
| 448 |
+
key = params.get("key", "")
|
| 449 |
+
value = params.get("value", "")
|
| 450 |
+
record["success"] = True
|
| 451 |
+
record["target"] = service_id
|
| 452 |
+
|
| 453 |
+
failure = self._get_failure_for_service(service_id)
|
| 454 |
+
if failure and failure.failure_type in (FailureType.CONFIG_STARTUP, FailureType.CONFIG_RUNTIME):
|
| 455 |
+
if key == failure.broken_config_key:
|
| 456 |
+
# Correct fix!
|
| 457 |
+
self.pending_effects.append(PendingEffect(
|
| 458 |
+
action_type="tune_config_fix",
|
| 459 |
+
target_service=service_id,
|
| 460 |
+
params={"key": key, "value": value},
|
| 461 |
+
resolve_tick=self.tick + 1,
|
| 462 |
+
))
|
| 463 |
+
record["note"] = f"Config key '{key}' updated on {service_id}. Fix takes effect next tick."
|
| 464 |
+
else:
|
| 465 |
+
record["note"] = f"Config key '{key}' updated on {service_id}, but this may not be the broken key."
|
| 466 |
+
else:
|
| 467 |
+
# General config tune (e.g., timeout, retry)
|
| 468 |
+
self._apply_config_immediately(svc, key, value)
|
| 469 |
+
record["note"] = f"Config '{key}'={value} applied to {service_id}"
|
| 470 |
+
return record
|
| 471 |
+
|
| 472 |
+
def _do_clear_cache(self, params: Dict, record: Dict) -> Dict:
|
| 473 |
+
cache_name = params.get("cache_name") or params.get("service_id", "")
|
| 474 |
+
record["target"] = cache_name
|
| 475 |
+
|
| 476 |
+
if not self.graph or cache_name not in self.graph.cache_services:
|
| 477 |
+
record["note"] = f"'{cache_name}' is not a cache service"
|
| 478 |
+
return record
|
| 479 |
+
|
| 480 |
+
failure = self._get_failure_for_service(cache_name)
|
| 481 |
+
if failure and failure.failure_type == FailureType.CACHE_FAILURE:
|
| 482 |
+
self.pending_effects.append(PendingEffect(
|
| 483 |
+
action_type="clear_cache",
|
| 484 |
+
target_service=cache_name,
|
| 485 |
+
params={},
|
| 486 |
+
resolve_tick=self.tick + 1,
|
| 487 |
+
))
|
| 488 |
+
record["success"] = True
|
| 489 |
+
record["note"] = f"Flushing cache {cache_name}, recovery in 1 tick"
|
| 490 |
+
else:
|
| 491 |
+
record["success"] = True
|
| 492 |
+
record["note"] = f"Cache {cache_name} flushed (was not failing)"
|
| 493 |
+
return record
|
| 494 |
+
|
| 495 |
+
def _do_rebalance_traffic(self, params: Dict, record: Dict) -> Dict:
|
| 496 |
+
from_region = params.get("from_region", "")
|
| 497 |
+
to_region = params.get("to_region", "")
|
| 498 |
+
pct = params.get("pct", 50)
|
| 499 |
+
record["target"] = f"{from_region}->{to_region}"
|
| 500 |
+
|
| 501 |
+
if not self.graph or not self.graph.has_multiple_regions:
|
| 502 |
+
record["note"] = "Traffic rebalancing only available in multi-region (hard) mode"
|
| 503 |
+
return record
|
| 504 |
+
|
| 505 |
+
delay = self.rng.randint(2, 3)
|
| 506 |
+
self.pending_effects.append(PendingEffect(
|
| 507 |
+
action_type="rebalance_traffic",
|
| 508 |
+
target_service="",
|
| 509 |
+
params={"from_region": from_region, "to_region": to_region, "pct": pct},
|
| 510 |
+
resolve_tick=self.tick + delay,
|
| 511 |
+
))
|
| 512 |
+
record["success"] = True
|
| 513 |
+
record["note"] = f"Shifting {pct}% traffic from {from_region} to {to_region}, effect in {delay} tick(s)"
|
| 514 |
+
return record
|
| 515 |
+
|
| 516 |
+
def _do_pause_job(self, params: Dict, record: Dict) -> Dict:
|
| 517 |
+
job_name = params.get("job_name") or params.get("service_id", "")
|
| 518 |
+
record["target"] = job_name
|
| 519 |
+
|
| 520 |
+
if not self.graph or job_name not in self.graph.background_jobs:
|
| 521 |
+
record["note"] = f"'{job_name}' is not a background job service"
|
| 522 |
+
return record
|
| 523 |
+
|
| 524 |
+
svc = self.services.get(job_name)
|
| 525 |
+
if svc:
|
| 526 |
+
svc.arrival_rate *= 0.3 # Reduce load significantly
|
| 527 |
+
record["success"] = True
|
| 528 |
+
record["note"] = f"Background job on {job_name} paused, load reduced"
|
| 529 |
+
return record
|
| 530 |
+
|
| 531 |
+
# -------------------------------------------------------------------
|
| 532 |
+
# Effect resolution
|
| 533 |
+
# -------------------------------------------------------------------
|
| 534 |
+
|
| 535 |
+
def _resolve_pending_effects(self) -> None:
|
| 536 |
+
"""Resolve pending effects that have reached their tick."""
|
| 537 |
+
still_pending = []
|
| 538 |
+
for effect in self.pending_effects:
|
| 539 |
+
if self.tick >= effect.resolve_tick:
|
| 540 |
+
self._apply_effect(effect)
|
| 541 |
+
else:
|
| 542 |
+
still_pending.append(effect)
|
| 543 |
+
self.pending_effects = still_pending
|
| 544 |
+
|
| 545 |
+
def _apply_effect(self, effect: PendingEffect) -> None:
|
| 546 |
+
svc = self.services.get(effect.target_service)
|
| 547 |
+
|
| 548 |
+
if effect.action_type == "restart_service":
|
| 549 |
+
# Full restart: clears crash/leak failures
|
| 550 |
+
if svc:
|
| 551 |
+
self._remediate_service(effect.target_service)
|
| 552 |
+
svc.memory_pct = 30.0 # Reset memory (leak fix)
|
| 553 |
+
|
| 554 |
+
elif effect.action_type == "restart_partial":
|
| 555 |
+
# Partial: temporary relief
|
| 556 |
+
if svc:
|
| 557 |
+
svc.error_rate *= 0.5
|
| 558 |
+
svc.memory_pct = max(30.0, svc.memory_pct * 0.7)
|
| 559 |
+
|
| 560 |
+
elif effect.action_type == "rollback_service":
|
| 561 |
+
if svc:
|
| 562 |
+
version = effect.params.get("version", svc.previous_version)
|
| 563 |
+
svc.version = version
|
| 564 |
+
svc.previous_version = None
|
| 565 |
+
self._remediate_service(effect.target_service)
|
| 566 |
+
self.deploys.append({
|
| 567 |
+
"service": effect.target_service,
|
| 568 |
+
"version": version,
|
| 569 |
+
"ticks_ago": 0,
|
| 570 |
+
})
|
| 571 |
+
|
| 572 |
+
elif effect.action_type == "scale_service":
|
| 573 |
+
if svc:
|
| 574 |
+
svc.replicas = effect.params.get("replicas", svc.replicas)
|
| 575 |
+
|
| 576 |
+
elif effect.action_type == "tune_config_fix":
|
| 577 |
+
self._remediate_service(effect.target_service)
|
| 578 |
+
# If config_startup, also need a restart — but we apply partial fix
|
| 579 |
+
failure = self._get_failure_for_service(effect.target_service)
|
| 580 |
+
if failure and failure.failure_type == FailureType.CONFIG_STARTUP:
|
| 581 |
+
# Config fixed + implicit restart
|
| 582 |
+
if svc:
|
| 583 |
+
svc.error_rate = 0.02 # Near-zero while restarting
|
| 584 |
+
|
| 585 |
+
elif effect.action_type == "clear_cache":
|
| 586 |
+
self._remediate_service(effect.target_service)
|
| 587 |
+
|
| 588 |
+
elif effect.action_type == "rebalance_traffic":
|
| 589 |
+
# Reduce arrival rate in from_region, increase in to_region
|
| 590 |
+
from_region = effect.params.get("from_region", "")
|
| 591 |
+
to_region = effect.params.get("to_region", "")
|
| 592 |
+
pct = effect.params.get("pct", 50) / 100.0
|
| 593 |
+
if self.graph:
|
| 594 |
+
for node in self.graph.nodes:
|
| 595 |
+
s = self.services.get(node.id)
|
| 596 |
+
if not s:
|
| 597 |
+
continue
|
| 598 |
+
if node.region == from_region:
|
| 599 |
+
s.arrival_rate *= (1 - pct)
|
| 600 |
+
elif node.region == to_region:
|
| 601 |
+
s.arrival_rate *= (1 + pct * 0.5) # Some traffic absorbed
|
| 602 |
+
|
| 603 |
+
def _remediate_service(self, service_id: str) -> None:
|
| 604 |
+
"""Mark a service as remediated — stop failure evolution."""
|
| 605 |
+
self.remediated_services[service_id] = self.tick
|
| 606 |
+
svc = self.services.get(service_id)
|
| 607 |
+
if svc:
|
| 608 |
+
svc.has_active_failure = False
|
| 609 |
+
svc.failure_ticks = 0
|
| 610 |
+
|
| 611 |
+
def _apply_config_immediately(self, svc: ServiceRuntimeState, key: str, value: Any) -> None:
|
| 612 |
+
"""Apply a config change that takes effect immediately."""
|
| 613 |
+
if key == "timeout_ms":
|
| 614 |
+
svc.timeout_ms = int(value)
|
| 615 |
+
elif key == "retry_max":
|
| 616 |
+
svc.retry_max = int(value)
|
| 617 |
+
elif key == "pool_size":
|
| 618 |
+
svc.pool_size = int(value)
|
| 619 |
+
elif key == "retry_backoff":
|
| 620 |
+
svc.retry_backoff = bool(value)
|
| 621 |
+
|
| 622 |
+
# -------------------------------------------------------------------
|
| 623 |
+
# Failure evolution
|
| 624 |
+
# -------------------------------------------------------------------
|
| 625 |
+
|
| 626 |
+
def _evolve_failures(self) -> None:
|
| 627 |
+
"""Evolve all active failures by one tick."""
|
| 628 |
+
for spec in self.failures:
|
| 629 |
+
sid = spec.service_id
|
| 630 |
+
if sid in self.remediated_services:
|
| 631 |
+
# Remediated — gradually recover
|
| 632 |
+
svc = self.services.get(sid)
|
| 633 |
+
if svc:
|
| 634 |
+
svc.error_rate = max(0.0, svc.error_rate * 0.5)
|
| 635 |
+
svc.latency_p99_ms = max(50.0, svc.latency_p99_ms * 0.7)
|
| 636 |
+
svc.cpu_pct = max(10.0, svc.cpu_pct * 0.8)
|
| 637 |
+
svc.memory_pct = max(25.0, svc.memory_pct * 0.9)
|
| 638 |
+
svc.connection_pool_usage_pct = max(5.0, svc.connection_pool_usage_pct * 0.7)
|
| 639 |
+
svc.status = svc.compute_status()
|
| 640 |
+
continue
|
| 641 |
+
|
| 642 |
+
svc = self.services.get(sid)
|
| 643 |
+
if not svc:
|
| 644 |
+
continue
|
| 645 |
+
|
| 646 |
+
onset = self.failure_onset_tick.get(sid, 0)
|
| 647 |
+
ticks_since = self.tick - onset
|
| 648 |
+
|
| 649 |
+
node = self.graph.node_map.get(sid) if self.graph else None
|
| 650 |
+
base_p99 = 100.0
|
| 651 |
+
base_cpu = 15.0
|
| 652 |
+
base_memory = 30.0
|
| 653 |
+
base_pool = 10.0
|
| 654 |
+
|
| 655 |
+
error_rate, p99_ms, cpu_pct, memory_pct, pool_pct = apply_failure_to_metrics(
|
| 656 |
+
spec, ticks_since,
|
| 657 |
+
base_error_rate=0.0,
|
| 658 |
+
base_p99_ms=base_p99,
|
| 659 |
+
base_cpu=base_cpu,
|
| 660 |
+
base_memory=base_memory,
|
| 661 |
+
base_pool=base_pool,
|
| 662 |
+
rng=self.rng,
|
| 663 |
+
)
|
| 664 |
+
|
| 665 |
+
svc.error_rate = error_rate
|
| 666 |
+
svc.update_latency_percentiles(base_p99, p99_ms / base_p99, self.rng)
|
| 667 |
+
svc.cpu_pct = cpu_pct
|
| 668 |
+
svc.memory_pct = memory_pct
|
| 669 |
+
svc.connection_pool_usage_pct = pool_pct
|
| 670 |
+
svc.failure_ticks = ticks_since
|
| 671 |
+
svc.status = svc.compute_status()
|
| 672 |
+
|
| 673 |
+
def _run_propagation(self) -> None:
|
| 674 |
+
"""Run propagation engine to cascade failures through the graph."""
|
| 675 |
+
if not self.graph:
|
| 676 |
+
return
|
| 677 |
+
|
| 678 |
+
edge_activation = {}
|
| 679 |
+
for edge in self.graph.edges:
|
| 680 |
+
edge_activation[(edge.source, edge.target)] = edge.activation_probability
|
| 681 |
+
|
| 682 |
+
propagate_failures(
|
| 683 |
+
self.services,
|
| 684 |
+
self.graph.adjacency,
|
| 685 |
+
self.graph.reverse_adjacency,
|
| 686 |
+
edge_activation,
|
| 687 |
+
self.rng,
|
| 688 |
+
current_tick=self.tick,
|
| 689 |
+
)
|
| 690 |
+
|
| 691 |
+
# -------------------------------------------------------------------
|
| 692 |
+
# Metric recording
|
| 693 |
+
# -------------------------------------------------------------------
|
| 694 |
+
|
| 695 |
+
def _record_metrics(self) -> None:
|
| 696 |
+
"""Record current metrics snapshot for all services."""
|
| 697 |
+
for sid, svc in self.services.items():
|
| 698 |
+
self.metric_history[sid].append({
|
| 699 |
+
"tick": self.tick,
|
| 700 |
+
"error_rate": round(svc.error_rate, 4),
|
| 701 |
+
"latency_p99_ms": round(svc.latency_p99_ms, 1),
|
| 702 |
+
"cpu_pct": round(svc.cpu_pct, 1),
|
| 703 |
+
"memory_pct": round(svc.memory_pct, 1),
|
| 704 |
+
"pool_pct": round(svc.connection_pool_usage_pct, 1),
|
| 705 |
+
"throughput_rps": round(svc.throughput_rps, 1),
|
| 706 |
+
"status": svc.status,
|
| 707 |
+
})
|
| 708 |
+
|
| 709 |
+
# -------------------------------------------------------------------
|
| 710 |
+
# Reward computation
|
| 711 |
+
# -------------------------------------------------------------------
|
| 712 |
+
|
| 713 |
+
def _compute_reward(
|
| 714 |
+
self, prev_slo: float, new_slo: float,
|
| 715 |
+
action_type: str, record: Dict,
|
| 716 |
+
) -> float:
|
| 717 |
+
"""Dense Δ-SLO reward with action-type penalties."""
|
| 718 |
+
# Base: delta SLO (positive = improvement)
|
| 719 |
+
delta = new_slo - prev_slo
|
| 720 |
+
reward = delta * 10.0 # Scale up for signal strength
|
| 721 |
+
|
| 722 |
+
# Bonus for reaching full recovery
|
| 723 |
+
if new_slo >= 1.0:
|
| 724 |
+
reward += 5.0
|
| 725 |
+
|
| 726 |
+
# Penalty for invalid/failed actions
|
| 727 |
+
if not record.get("success", False):
|
| 728 |
+
reward -= 0.5
|
| 729 |
+
|
| 730 |
+
# Small penalty for non-diagnostic actions (encourage efficiency)
|
| 731 |
+
if action_type not in ("inspect_logs", "inspect_metrics", "inspect_traces", "noop"):
|
| 732 |
+
reward -= 0.1 # Small cost for remediation actions
|
| 733 |
+
|
| 734 |
+
# Penalty for redundant noops when system is degraded
|
| 735 |
+
if action_type == "noop" and new_slo < 0.9:
|
| 736 |
+
reward -= 0.2
|
| 737 |
+
|
| 738 |
+
return round(reward, 4)
|
| 739 |
+
|
| 740 |
+
# -------------------------------------------------------------------
|
| 741 |
+
# Termination
|
| 742 |
+
# -------------------------------------------------------------------
|
| 743 |
+
|
| 744 |
+
def _check_termination(self) -> None:
|
| 745 |
+
"""Check if the episode should end."""
|
| 746 |
+
slo = self.get_slo_score()
|
| 747 |
+
|
| 748 |
+
# Success: all SLOs met
|
| 749 |
+
if slo >= 1.0:
|
| 750 |
+
self.terminated = True
|
| 751 |
+
self.termination_reason = "resolved"
|
| 752 |
+
return
|
| 753 |
+
|
| 754 |
+
# Timeout: exceeded step budget
|
| 755 |
+
if self.tick >= self.max_steps:
|
| 756 |
+
self.terminated = True
|
| 757 |
+
self.termination_reason = "timeout"
|
| 758 |
+
return
|
| 759 |
+
|
| 760 |
+
# System collapse: all services down
|
| 761 |
+
down_count = sum(1 for s in self.services.values() if s.status == "down")
|
| 762 |
+
if down_count == len(self.services) and len(self.services) > 0:
|
| 763 |
+
self.terminated = True
|
| 764 |
+
self.termination_reason = "failed"
|
| 765 |
+
|
| 766 |
+
# -------------------------------------------------------------------
|
| 767 |
+
# Observation helpers
|
| 768 |
+
# -------------------------------------------------------------------
|
| 769 |
+
|
| 770 |
+
def get_slo_score(self) -> float:
|
| 771 |
+
"""Fraction of services meeting SLO targets."""
|
| 772 |
+
if not self.services:
|
| 773 |
+
return 0.0
|
| 774 |
+
meeting = sum(1 for s in self.services.values() if _service_meets_slo(s, self.difficulty))
|
| 775 |
+
return meeting / len(self.services)
|
| 776 |
+
|
| 777 |
+
def get_observation_summary(self) -> str:
|
| 778 |
+
"""Generate a natural-language summary of the current state."""
|
| 779 |
+
slo = self.get_slo_score()
|
| 780 |
+
total = len(self.services)
|
| 781 |
+
healthy = sum(1 for s in self.services.values() if s.status == "healthy")
|
| 782 |
+
degraded = sum(1 for s in self.services.values() if s.status == "degraded")
|
| 783 |
+
critical = sum(1 for s in self.services.values() if s.status == "critical")
|
| 784 |
+
down = sum(1 for s in self.services.values() if s.status == "down")
|
| 785 |
+
|
| 786 |
+
parts = []
|
| 787 |
+
if down > 0:
|
| 788 |
+
parts.append(f"{down} service(s) DOWN")
|
| 789 |
+
if critical > 0:
|
| 790 |
+
parts.append(f"{critical} CRITICAL")
|
| 791 |
+
if degraded > 0:
|
| 792 |
+
parts.append(f"{degraded} degraded")
|
| 793 |
+
if healthy > 0:
|
| 794 |
+
parts.append(f"{healthy} healthy")
|
| 795 |
+
|
| 796 |
+
status_str = ", ".join(parts) if parts else "all nominal"
|
| 797 |
+
return f"Tick {self.tick}/{self.max_steps}: SLO compliance {slo*100:.0f}% ({status_str}). {total} services total."
|
| 798 |
+
|
| 799 |
+
def get_alerts(self) -> List[Dict[str, Any]]:
|
| 800 |
+
"""Generate active alerts from current service states."""
|
| 801 |
+
alerts = []
|
| 802 |
+
for sid, svc in self.services.items():
|
| 803 |
+
if svc.error_rate >= 0.50:
|
| 804 |
+
alerts.append({
|
| 805 |
+
"severity": "critical",
|
| 806 |
+
"service": sid,
|
| 807 |
+
"type": "error_rate_high",
|
| 808 |
+
"message": f"{sid} error rate at {svc.error_rate*100:.0f}%",
|
| 809 |
+
"first_seen_tick": max(0, self.tick - svc.failure_ticks),
|
| 810 |
+
})
|
| 811 |
+
elif svc.error_rate >= 0.05:
|
| 812 |
+
alerts.append({
|
| 813 |
+
"severity": "warning",
|
| 814 |
+
"service": sid,
|
| 815 |
+
"type": "error_rate_high",
|
| 816 |
+
"message": f"{sid} error rate elevated at {svc.error_rate*100:.1f}%",
|
| 817 |
+
"first_seen_tick": max(0, self.tick - svc.failure_ticks),
|
| 818 |
+
})
|
| 819 |
+
|
| 820 |
+
if svc.latency_p99_ms >= 5000:
|
| 821 |
+
alerts.append({
|
| 822 |
+
"severity": "critical",
|
| 823 |
+
"service": sid,
|
| 824 |
+
"type": "latency_high",
|
| 825 |
+
"message": f"{sid} p99 latency {svc.latency_p99_ms:.0f}ms",
|
| 826 |
+
"first_seen_tick": max(0, self.tick - svc.failure_ticks),
|
| 827 |
+
})
|
| 828 |
+
elif svc.latency_p99_ms >= 1000:
|
| 829 |
+
alerts.append({
|
| 830 |
+
"severity": "warning",
|
| 831 |
+
"service": sid,
|
| 832 |
+
"type": "latency_high",
|
| 833 |
+
"message": f"{sid} p99 latency elevated at {svc.latency_p99_ms:.0f}ms",
|
| 834 |
+
"first_seen_tick": max(0, self.tick - svc.failure_ticks),
|
| 835 |
+
})
|
| 836 |
+
|
| 837 |
+
if svc.status == "down":
|
| 838 |
+
alerts.append({
|
| 839 |
+
"severity": "critical",
|
| 840 |
+
"service": sid,
|
| 841 |
+
"type": "service_down",
|
| 842 |
+
"message": f"{sid} is DOWN",
|
| 843 |
+
"first_seen_tick": max(0, self.tick - svc.failure_ticks),
|
| 844 |
+
})
|
| 845 |
+
|
| 846 |
+
if svc.memory_pct >= 90:
|
| 847 |
+
alerts.append({
|
| 848 |
+
"severity": "warning",
|
| 849 |
+
"service": sid,
|
| 850 |
+
"type": "memory_high",
|
| 851 |
+
"message": f"{sid} memory at {svc.memory_pct:.0f}%",
|
| 852 |
+
"first_seen_tick": max(0, self.tick - svc.failure_ticks),
|
| 853 |
+
})
|
| 854 |
+
|
| 855 |
+
if svc.connection_pool_usage_pct >= 80:
|
| 856 |
+
alerts.append({
|
| 857 |
+
"severity": "warning",
|
| 858 |
+
"service": sid,
|
| 859 |
+
"type": "connection_pool_saturated",
|
| 860 |
+
"message": f"{sid} connection pool at {svc.connection_pool_usage_pct:.0f}%",
|
| 861 |
+
"first_seen_tick": max(0, self.tick - svc.failure_ticks),
|
| 862 |
+
})
|
| 863 |
+
|
| 864 |
+
# Circuit breaker alerts
|
| 865 |
+
for dep_id, breaker in svc.circuit_breakers.items():
|
| 866 |
+
if breaker.state.value == "OPEN":
|
| 867 |
+
alerts.append({
|
| 868 |
+
"severity": "warning",
|
| 869 |
+
"service": sid,
|
| 870 |
+
"type": "circuit_breaker_open",
|
| 871 |
+
"message": f"{sid} circuit breaker OPEN for {dep_id}",
|
| 872 |
+
"first_seen_tick": max(0, self.tick - breaker.ticks_in_current_state),
|
| 873 |
+
})
|
| 874 |
+
|
| 875 |
+
# Sort by severity (critical first)
|
| 876 |
+
severity_order = {"critical": 0, "warning": 1, "info": 2}
|
| 877 |
+
alerts.sort(key=lambda a: severity_order.get(a["severity"], 9))
|
| 878 |
+
return alerts
|
| 879 |
+
|
| 880 |
+
def get_legal_actions(self) -> List[Dict[str, Any]]:
|
| 881 |
+
"""Return the set of currently legal actions with valid targets."""
|
| 882 |
+
service_ids = list(self.services.keys())
|
| 883 |
+
actions = [
|
| 884 |
+
{"action_type": "noop", "valid_targets": []},
|
| 885 |
+
{"action_type": "inspect_logs", "valid_targets": service_ids},
|
| 886 |
+
{"action_type": "inspect_metrics", "valid_targets": service_ids},
|
| 887 |
+
{"action_type": "inspect_traces", "valid_targets": service_ids},
|
| 888 |
+
{"action_type": "restart_service", "valid_targets": service_ids},
|
| 889 |
+
]
|
| 890 |
+
|
| 891 |
+
# Rollback: only services with previous versions
|
| 892 |
+
rollback_targets = [sid for sid, s in self.services.items() if s.previous_version]
|
| 893 |
+
if rollback_targets:
|
| 894 |
+
actions.append({"action_type": "rollback_service", "valid_targets": rollback_targets})
|
| 895 |
+
|
| 896 |
+
# Scale: all services
|
| 897 |
+
actions.append({"action_type": "scale_service", "valid_targets": service_ids})
|
| 898 |
+
|
| 899 |
+
# Tune config: all services
|
| 900 |
+
actions.append({"action_type": "tune_config", "valid_targets": service_ids})
|
| 901 |
+
|
| 902 |
+
# Clear cache: only cache services
|
| 903 |
+
if self.graph and self.graph.cache_services:
|
| 904 |
+
actions.append({"action_type": "clear_cache", "valid_targets": self.graph.cache_services})
|
| 905 |
+
|
| 906 |
+
# Rebalance traffic: only in multi-region
|
| 907 |
+
if self.graph and self.graph.has_multiple_regions:
|
| 908 |
+
actions.append({
|
| 909 |
+
"action_type": "rebalance_traffic",
|
| 910 |
+
"valid_targets": self.graph.regions,
|
| 911 |
+
})
|
| 912 |
+
|
| 913 |
+
# Pause job: only background job services
|
| 914 |
+
if self.graph and self.graph.background_jobs:
|
| 915 |
+
actions.append({"action_type": "pause_job", "valid_targets": self.graph.background_jobs})
|
| 916 |
+
|
| 917 |
+
return actions
|
| 918 |
+
|
| 919 |
+
def get_service_observations(self) -> List[Dict[str, Any]]:
|
| 920 |
+
"""Build per-service observation dicts."""
|
| 921 |
+
result = []
|
| 922 |
+
for sid, svc in self.services.items():
|
| 923 |
+
node = self.graph.node_map.get(sid) if self.graph else None
|
| 924 |
+
deps = self.graph.adjacency.get(sid, []) if self.graph else []
|
| 925 |
+
cb_states = {
|
| 926 |
+
dep: breaker.state.value
|
| 927 |
+
for dep, breaker in svc.circuit_breakers.items()
|
| 928 |
+
}
|
| 929 |
+
result.append({
|
| 930 |
+
"id": sid,
|
| 931 |
+
"layer": node.layer if node else "unknown",
|
| 932 |
+
"status": svc.status,
|
| 933 |
+
"error_rate": round(svc.error_rate, 4),
|
| 934 |
+
"latency_p50_ms": round(svc.latency_p50_ms, 1),
|
| 935 |
+
"latency_p95_ms": round(svc.latency_p95_ms, 1),
|
| 936 |
+
"latency_p99_ms": round(svc.latency_p99_ms, 1),
|
| 937 |
+
"throughput_rps": round(svc.throughput_rps, 1),
|
| 938 |
+
"cpu_pct": round(svc.cpu_pct, 1),
|
| 939 |
+
"memory_pct": round(svc.memory_pct, 1),
|
| 940 |
+
"connection_pool_usage_pct": round(svc.connection_pool_usage_pct, 1),
|
| 941 |
+
"replicas": svc.replicas,
|
| 942 |
+
"version": svc.version,
|
| 943 |
+
"previous_version": svc.previous_version,
|
| 944 |
+
"depends_on": deps,
|
| 945 |
+
"circuit_breakers": cb_states,
|
| 946 |
+
})
|
| 947 |
+
return result
|
| 948 |
+
|
| 949 |
+
# -------------------------------------------------------------------
|
| 950 |
+
# Internal helpers
|
| 951 |
+
# -------------------------------------------------------------------
|
| 952 |
+
|
| 953 |
+
def _get_failure_for_service(self, service_id: Optional[str]) -> Optional[FailureSpec]:
|
| 954 |
+
if not service_id:
|
| 955 |
+
return None
|
| 956 |
+
for spec in self.failures:
|
| 957 |
+
if spec.service_id == service_id and service_id not in self.remediated_services:
|
| 958 |
+
return spec
|
| 959 |
+
return None
|
| 960 |
+
|
| 961 |
+
def _get_primary_dependency(self, service_id: Optional[str]) -> str:
|
| 962 |
+
if not service_id or not self.graph:
|
| 963 |
+
return "unknown"
|
| 964 |
+
deps = self.graph.adjacency.get(service_id, [])
|
| 965 |
+
return deps[0] if deps else "unknown"
|
server/traces.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/traces.py — Distributed trace generation for inspect_traces action.
|
| 3 |
+
|
| 4 |
+
Generates realistic Jaeger/Zipkin-style trace trees showing request flow
|
| 5 |
+
through the service dependency graph. Healthy services show normal latencies;
|
| 6 |
+
failing services show errors, timeouts, and cascading delays.
|
| 7 |
+
|
| 8 |
+
Each trace is a tree of spans rooted at the inspected service.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import random
|
| 14 |
+
from typing import Any, Dict, List, Optional
|
| 15 |
+
|
| 16 |
+
from server.graph import ServiceGraph
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _make_span_id(rng: random.Random) -> str:
|
| 20 |
+
return f"{rng.randint(0, 0xFFFFFFFF):08x}"
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _make_trace_id(rng: random.Random) -> str:
|
| 24 |
+
return f"{rng.randint(0, 0xFFFFFFFFFFFFFFFF):016x}"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def generate_trace(
|
| 28 |
+
service_id: str,
|
| 29 |
+
graph: ServiceGraph,
|
| 30 |
+
service_errors: Dict[str, float],
|
| 31 |
+
service_latencies: Dict[str, float],
|
| 32 |
+
rng: random.Random,
|
| 33 |
+
max_depth: int = 4,
|
| 34 |
+
) -> Dict[str, Any]:
|
| 35 |
+
"""
|
| 36 |
+
Generate a distributed trace tree rooted at service_id.
|
| 37 |
+
|
| 38 |
+
Returns a dict with trace_id, root_span, and flat spans list.
|
| 39 |
+
service_errors: service_id → error_rate (0.0–1.0)
|
| 40 |
+
service_latencies: service_id → p99_ms
|
| 41 |
+
"""
|
| 42 |
+
trace_id = _make_trace_id(rng)
|
| 43 |
+
spans: List[Dict[str, Any]] = []
|
| 44 |
+
|
| 45 |
+
def _build_span(
|
| 46 |
+
svc_id: str,
|
| 47 |
+
parent_span_id: Optional[str],
|
| 48 |
+
depth: int,
|
| 49 |
+
start_offset_ms: float,
|
| 50 |
+
) -> Dict[str, Any]:
|
| 51 |
+
span_id = _make_span_id(rng)
|
| 52 |
+
error_rate = service_errors.get(svc_id, 0.0)
|
| 53 |
+
base_latency = service_latencies.get(svc_id, rng.uniform(5, 50))
|
| 54 |
+
has_error = rng.random() < error_rate
|
| 55 |
+
|
| 56 |
+
# Span duration: base latency + noise
|
| 57 |
+
if has_error and error_rate > 0.8:
|
| 58 |
+
# Fast fail or timeout
|
| 59 |
+
duration_ms = rng.choice([
|
| 60 |
+
rng.uniform(0.5, 5), # Fast fail
|
| 61 |
+
rng.uniform(3000, 10000), # Timeout
|
| 62 |
+
])
|
| 63 |
+
elif has_error:
|
| 64 |
+
duration_ms = base_latency * rng.uniform(1.5, 5.0)
|
| 65 |
+
else:
|
| 66 |
+
duration_ms = base_latency * rng.uniform(0.3, 1.2)
|
| 67 |
+
|
| 68 |
+
duration_ms = max(0.1, duration_ms)
|
| 69 |
+
|
| 70 |
+
span = {
|
| 71 |
+
"span_id": span_id,
|
| 72 |
+
"parent_span_id": parent_span_id,
|
| 73 |
+
"service": svc_id,
|
| 74 |
+
"operation": _operation_name(svc_id, rng),
|
| 75 |
+
"start_ms": round(start_offset_ms, 1),
|
| 76 |
+
"duration_ms": round(duration_ms, 1),
|
| 77 |
+
"status": "ERROR" if has_error else "OK",
|
| 78 |
+
"tags": {},
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
if has_error:
|
| 82 |
+
span["tags"]["error"] = True
|
| 83 |
+
span["tags"]["error.message"] = _error_message(svc_id, error_rate, rng)
|
| 84 |
+
|
| 85 |
+
node = graph.node_map.get(svc_id)
|
| 86 |
+
if node:
|
| 87 |
+
span["tags"]["service.layer"] = node.layer
|
| 88 |
+
span["tags"]["service.region"] = node.region
|
| 89 |
+
|
| 90 |
+
spans.append(span)
|
| 91 |
+
|
| 92 |
+
# Recurse into downstream dependencies
|
| 93 |
+
if depth < max_depth:
|
| 94 |
+
deps = graph.adjacency.get(svc_id, [])
|
| 95 |
+
child_offset = start_offset_ms + rng.uniform(0.1, 2.0)
|
| 96 |
+
for dep_id in deps:
|
| 97 |
+
# Check edge activation (probabilistic)
|
| 98 |
+
edge = next(
|
| 99 |
+
(e for e in graph.edges if e.source == svc_id and e.target == dep_id),
|
| 100 |
+
None,
|
| 101 |
+
)
|
| 102 |
+
if edge and rng.random() > edge.activation_probability:
|
| 103 |
+
continue
|
| 104 |
+
|
| 105 |
+
child_span = _build_span(dep_id, span_id, depth + 1, child_offset)
|
| 106 |
+
child_offset += child_span["duration_ms"] + rng.uniform(0.1, 1.0)
|
| 107 |
+
|
| 108 |
+
return span
|
| 109 |
+
|
| 110 |
+
root_span = _build_span(service_id, None, 0, 0.0)
|
| 111 |
+
|
| 112 |
+
# Compute total trace duration
|
| 113 |
+
if spans:
|
| 114 |
+
total_duration = max(s["start_ms"] + s["duration_ms"] for s in spans)
|
| 115 |
+
else:
|
| 116 |
+
total_duration = 0.0
|
| 117 |
+
|
| 118 |
+
return {
|
| 119 |
+
"trace_id": trace_id,
|
| 120 |
+
"root_service": service_id,
|
| 121 |
+
"span_count": len(spans),
|
| 122 |
+
"total_duration_ms": round(total_duration, 1),
|
| 123 |
+
"spans": spans,
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _operation_name(service_id: str, rng: random.Random) -> str:
|
| 128 |
+
"""Generate a realistic operation name based on service type."""
|
| 129 |
+
if "gateway" in service_id or "bff" in service_id:
|
| 130 |
+
return rng.choice(["HTTP GET /api/v1/resource", "HTTP POST /api/v1/action", "HTTP GET /health"])
|
| 131 |
+
if "auth" in service_id or "identity" in service_id or "session" in service_id:
|
| 132 |
+
return rng.choice(["validateToken", "authenticate", "refreshSession"])
|
| 133 |
+
if "postgres" in service_id:
|
| 134 |
+
return rng.choice(["SELECT", "INSERT", "UPDATE", "pg_pool.checkout"])
|
| 135 |
+
if "redis" in service_id:
|
| 136 |
+
return rng.choice(["GET", "SET", "MGET", "EXPIRE"])
|
| 137 |
+
if "kafka" in service_id:
|
| 138 |
+
return rng.choice(["produce", "consume", "commitOffset"])
|
| 139 |
+
if "elasticsearch" in service_id:
|
| 140 |
+
return rng.choice(["search", "index", "bulk"])
|
| 141 |
+
return rng.choice(["processRequest", "handleMessage", "execute"])
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def _error_message(service_id: str, error_rate: float, rng: random.Random) -> str:
|
| 145 |
+
"""Generate a trace-level error message."""
|
| 146 |
+
if error_rate > 0.8:
|
| 147 |
+
return rng.choice([
|
| 148 |
+
f"{service_id}: Connection refused",
|
| 149 |
+
f"{service_id}: Service unavailable (HTTP 503)",
|
| 150 |
+
f"{service_id}: Timeout after 5000ms",
|
| 151 |
+
])
|
| 152 |
+
return rng.choice([
|
| 153 |
+
f"{service_id}: Internal server error (HTTP 500)",
|
| 154 |
+
f"{service_id}: Upstream dependency timeout",
|
| 155 |
+
f"{service_id}: Rate limited (HTTP 429)",
|
| 156 |
+
f"{service_id}: Bad gateway (HTTP 502)",
|
| 157 |
+
])
|