Commit ·
64d38cb
0
Parent(s):
Checkpoint: existing implementation (pre-cleanup)
Browse files- .gitignore +5 -0
- models.py +300 -0
- openenv.yaml +10 -0
- pyproject.toml +32 -0
- sdk_info.txt +24 -0
- server/__init__.py +1 -0
- server/failures.py +437 -0
- server/graph.py +470 -0
.gitignore
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Documentation and research (not part of the submission)
|
| 2 |
+
Docs/
|
| 3 |
+
|
| 4 |
+
# OpenEnv preparatory course (dev reference only, not part of submission)
|
| 5 |
+
openenv-course/
|
models.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SevZero — Typed Pydantic models for Action, Observation, and State.
|
| 3 |
+
|
| 4 |
+
These are the public API contracts at the package root (OpenEnv requirement).
|
| 5 |
+
Every field is documented because the observation JSON must be self-explanatory
|
| 6 |
+
to any LLM evaluator without additional context.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from typing import Any, Dict, List, Optional, Union
|
| 12 |
+
|
| 13 |
+
from pydantic import Field
|
| 14 |
+
|
| 15 |
+
from openenv.core.env_server import Action, Observation, State
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
# Sub-models: nested inside SevZeroObservation
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class CircuitBreakerInfo(dict):
|
| 24 |
+
"""Maps dependency name -> breaker state ('CLOSED' | 'OPEN' | 'HALF_OPEN')."""
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class ServiceInfo(object):
|
| 28 |
+
"""Per-service observable state — declared as plain dict in observation for
|
| 29 |
+
JSON-serialisability; structured via ServiceInfoModel for validation."""
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
class ServiceInfoModel:
|
| 33 |
+
"""Pydantic model for a single service's metrics (used internally)."""
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
from pydantic import BaseModel
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class ServiceInfoModel(BaseModel):
|
| 40 |
+
"""
|
| 41 |
+
All observable per-service metrics, ordered by SRE triage priority:
|
| 42 |
+
symptoms first, traffic second, saturation third, context last.
|
| 43 |
+
"""
|
| 44 |
+
|
| 45 |
+
# Identity
|
| 46 |
+
id: str = Field(description="Service identifier, e.g. 'payment-service'")
|
| 47 |
+
layer: str = Field(
|
| 48 |
+
description="Service layer: 'edge' | 'domain' | 'infra' | 'cross-cutting'"
|
| 49 |
+
)
|
| 50 |
+
status: str = Field(
|
| 51 |
+
description="Aggregate health: 'healthy' | 'degraded' | 'critical' | 'down'"
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
# --- Symptoms (error + latency) ---
|
| 55 |
+
error_rate: float = Field(
|
| 56 |
+
description="Fraction of requests failing this tick (0.0–1.0)"
|
| 57 |
+
)
|
| 58 |
+
latency_p50_ms: float = Field(description="Median request latency in milliseconds")
|
| 59 |
+
latency_p95_ms: float = Field(description="95th-percentile latency in milliseconds")
|
| 60 |
+
latency_p99_ms: float = Field(description="99th-percentile latency in milliseconds")
|
| 61 |
+
|
| 62 |
+
# --- Traffic ---
|
| 63 |
+
throughput_rps: float = Field(
|
| 64 |
+
description="Successful requests served per tick"
|
| 65 |
+
)
|
| 66 |
+
|
| 67 |
+
# --- Saturation ---
|
| 68 |
+
cpu_pct: float = Field(description="CPU utilisation 0–100")
|
| 69 |
+
memory_pct: float = Field(description="Memory utilisation 0–100")
|
| 70 |
+
connection_pool_usage_pct: float = Field(
|
| 71 |
+
description="DB connection pool saturation 0–100; high = I/O bottleneck"
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
# --- Deployment context ---
|
| 75 |
+
replicas: int = Field(description="Number of running replicas")
|
| 76 |
+
version: str = Field(description="Currently deployed version tag")
|
| 77 |
+
previous_version: Optional[str] = Field(
|
| 78 |
+
default=None,
|
| 79 |
+
description="Previous version available for rollback; null if never changed",
|
| 80 |
+
)
|
| 81 |
+
|
| 82 |
+
# --- Dependency graph ---
|
| 83 |
+
depends_on: List[str] = Field(
|
| 84 |
+
default_factory=list,
|
| 85 |
+
description="Direct service dependencies (downstream calls)",
|
| 86 |
+
)
|
| 87 |
+
circuit_breakers: Dict[str, str] = Field(
|
| 88 |
+
default_factory=dict,
|
| 89 |
+
description=(
|
| 90 |
+
"Per-dependency circuit breaker state. "
|
| 91 |
+
"Keys are dependency IDs; values are 'CLOSED' | 'OPEN' | 'HALF_OPEN'."
|
| 92 |
+
),
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
class AlertInfo(BaseModel):
|
| 97 |
+
"""A structured active alert, ordered by severity."""
|
| 98 |
+
|
| 99 |
+
severity: str = Field(description="'critical' | 'warning' | 'info'")
|
| 100 |
+
service: str = Field(description="Service ID that triggered the alert")
|
| 101 |
+
type: str = Field(
|
| 102 |
+
description=(
|
| 103 |
+
"Alert category: 'error_rate_high' | 'latency_high' | "
|
| 104 |
+
"'circuit_breaker_open' | 'connection_pool_saturated' | "
|
| 105 |
+
"'memory_high' | 'cpu_high' | 'service_down'"
|
| 106 |
+
)
|
| 107 |
+
)
|
| 108 |
+
message: str = Field(description="Human-readable alert description with metric values")
|
| 109 |
+
first_seen_tick: int = Field(description="Tick at which this alert first fired")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class DeployInfo(BaseModel):
|
| 113 |
+
"""A recent deployment event visible in the observation."""
|
| 114 |
+
|
| 115 |
+
service: str = Field(description="Service that was deployed")
|
| 116 |
+
version: str = Field(description="New version deployed")
|
| 117 |
+
ticks_ago: int = Field(description="How many ticks ago the deploy happened")
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
class ActionRecord(BaseModel):
|
| 121 |
+
"""A previously taken action, shown in the observation for agent context."""
|
| 122 |
+
|
| 123 |
+
tick: int = Field(description="Tick at which the action was executed")
|
| 124 |
+
action: str = Field(description="Action type, e.g. 'restart_service'")
|
| 125 |
+
target: Optional[str] = Field(default=None, description="Primary target service/resource")
|
| 126 |
+
success: bool = Field(description="Whether the action completed successfully")
|
| 127 |
+
note: Optional[str] = Field(
|
| 128 |
+
default=None,
|
| 129 |
+
description="Extra context, e.g. 'service already healthy' or error reason",
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class LegalAction(BaseModel):
|
| 134 |
+
"""One type of action the agent is currently allowed to take."""
|
| 135 |
+
|
| 136 |
+
action_type: str = Field(
|
| 137 |
+
description=(
|
| 138 |
+
"One of: inspect_logs | inspect_metrics | inspect_traces | "
|
| 139 |
+
"restart_service | rollback_service | scale_service | tune_config | "
|
| 140 |
+
"clear_cache | rebalance_traffic | pause_job | noop"
|
| 141 |
+
)
|
| 142 |
+
)
|
| 143 |
+
valid_targets: List[str] = Field(
|
| 144 |
+
description="Service IDs (or other resource names) this action can target right now"
|
| 145 |
+
)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# ---------------------------------------------------------------------------
|
| 149 |
+
# Top-level OpenEnv models
|
| 150 |
+
# ---------------------------------------------------------------------------
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class SevZeroAction(Action):
|
| 154 |
+
"""
|
| 155 |
+
An action the agent takes in SevZero.
|
| 156 |
+
|
| 157 |
+
Choose exactly one action_type and provide the required params for it:
|
| 158 |
+
|
| 159 |
+
inspect_logs(service_id) -> logs: str in next observation
|
| 160 |
+
inspect_metrics(service_id) -> metric_history in next observation
|
| 161 |
+
inspect_traces(service_id) -> traces in next observation
|
| 162 |
+
restart_service(service_id) -> restarts pod; 1-2 tick delay
|
| 163 |
+
rollback_service(service_id) -> reverts to previous_version; 2-3 tick delay
|
| 164 |
+
scale_service(service_id, replicas=N) -> adjusts replica count; 2-4 tick delay
|
| 165 |
+
tune_config(service_id, key, value) -> updates config param; 1 tick delay
|
| 166 |
+
clear_cache(cache_name) -> flushes cache; 1 tick delay
|
| 167 |
+
rebalance_traffic(from_region, to_region, pct) -> shifts traffic; 2-3 tick delay
|
| 168 |
+
pause_job(job_name) -> pauses background job; 1 tick delay
|
| 169 |
+
noop() -> wait and observe; 0 ticks
|
| 170 |
+
"""
|
| 171 |
+
|
| 172 |
+
action_type: str = Field(
|
| 173 |
+
description=(
|
| 174 |
+
"Which operation to perform. Must be one of the 11 action types. "
|
| 175 |
+
"Must appear in legal_actions from the previous observation."
|
| 176 |
+
)
|
| 177 |
+
)
|
| 178 |
+
params: Dict[str, Any] = Field(
|
| 179 |
+
default_factory=dict,
|
| 180 |
+
description=(
|
| 181 |
+
"Action parameters. Examples: "
|
| 182 |
+
"{'service_id': 'payment-service'}, "
|
| 183 |
+
"{'service_id': 'payment-service', 'replicas': 4}, "
|
| 184 |
+
"{'service_id': 'payment-service', 'key': 'timeout_ms', 'value': 2000}"
|
| 185 |
+
),
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
class SevZeroObservation(Observation):
|
| 190 |
+
"""
|
| 191 |
+
Full observation returned by reset() and step().
|
| 192 |
+
|
| 193 |
+
Fields are ordered by SRE triage priority: incident summary first,
|
| 194 |
+
then per-service metrics, then alerts, then context, then agent state.
|
| 195 |
+
|
| 196 |
+
The `done` and `reward` fields are inherited from Observation base.
|
| 197 |
+
"""
|
| 198 |
+
|
| 199 |
+
# --- Episode context ---
|
| 200 |
+
tick: int = Field(default=0, description="Current simulation tick (0-indexed)")
|
| 201 |
+
episode_id: Optional[str] = Field(
|
| 202 |
+
default=None, description="Unique ID for this episode"
|
| 203 |
+
)
|
| 204 |
+
task_id: str = Field(
|
| 205 |
+
default="easy",
|
| 206 |
+
description="Which task is running: 'easy' | 'medium' | 'hard'",
|
| 207 |
+
)
|
| 208 |
+
status: str = Field(
|
| 209 |
+
default="playing",
|
| 210 |
+
description=(
|
| 211 |
+
"Episode status: 'playing' | 'resolved' (all SLOs met) | "
|
| 212 |
+
"'failed' (system collapse) | 'timeout' (max steps exceeded)"
|
| 213 |
+
),
|
| 214 |
+
)
|
| 215 |
+
max_steps: int = Field(
|
| 216 |
+
default=10, description="Step budget for this task (Easy=10, Medium=20, Hard=50)"
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
# --- Health summary ---
|
| 220 |
+
global_slo_score: float = Field(
|
| 221 |
+
default=0.0,
|
| 222 |
+
description="Fraction of services currently meeting all SLO targets (0.0–1.0)",
|
| 223 |
+
)
|
| 224 |
+
observation_summary: str = Field(
|
| 225 |
+
default="",
|
| 226 |
+
description=(
|
| 227 |
+
"One-sentence natural-language summary of the current situation. "
|
| 228 |
+
"Read this first — it gives you the critical context for your next action."
|
| 229 |
+
),
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
# --- Per-service state ---
|
| 233 |
+
services: List[Dict[str, Any]] = Field(
|
| 234 |
+
default_factory=list,
|
| 235 |
+
description=(
|
| 236 |
+
"Full state for every service in the cluster. "
|
| 237 |
+
"See ServiceInfoModel for field definitions."
|
| 238 |
+
),
|
| 239 |
+
)
|
| 240 |
+
|
| 241 |
+
# --- Active alerts ---
|
| 242 |
+
alerts: List[Dict[str, Any]] = Field(
|
| 243 |
+
default_factory=list,
|
| 244 |
+
description="Active alerts sorted by severity (critical first). See AlertInfo.",
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
# --- Context ---
|
| 248 |
+
recent_deploys: List[Dict[str, Any]] = Field(
|
| 249 |
+
default_factory=list,
|
| 250 |
+
description="Deployments in the last 10 ticks. Correlate with error onset.",
|
| 251 |
+
)
|
| 252 |
+
actions_taken: List[Dict[str, Any]] = Field(
|
| 253 |
+
default_factory=list,
|
| 254 |
+
description="Last 10 actions taken in this episode, for agent context.",
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
# --- Action space ---
|
| 258 |
+
legal_actions: List[Dict[str, Any]] = Field(
|
| 259 |
+
default_factory=list,
|
| 260 |
+
description=(
|
| 261 |
+
"Exactly what actions are available right now with valid targets. "
|
| 262 |
+
"Only use actions listed here. Invalid actions return a -0.5 penalty."
|
| 263 |
+
),
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
# --- Diagnostic output from inspect_* actions ---
|
| 267 |
+
logs: Optional[str] = Field(
|
| 268 |
+
default=None,
|
| 269 |
+
description="Log output from the most recent inspect_logs action, if any.",
|
| 270 |
+
)
|
| 271 |
+
metric_history: Optional[List[Dict[str, Any]]] = Field(
|
| 272 |
+
default=None,
|
| 273 |
+
description="Per-tick metric history from the most recent inspect_metrics action.",
|
| 274 |
+
)
|
| 275 |
+
traces: Optional[Dict[str, Any]] = Field(
|
| 276 |
+
default=None,
|
| 277 |
+
description="Distributed trace from the most recent inspect_traces action.",
|
| 278 |
+
)
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
class SevZeroState(State):
|
| 282 |
+
"""
|
| 283 |
+
Episode metadata returned by the state property.
|
| 284 |
+
`episode_id` and `step_count` are inherited from State base.
|
| 285 |
+
"""
|
| 286 |
+
|
| 287 |
+
task_id: str = Field(default="easy", description="Which task: 'easy' | 'medium' | 'hard'")
|
| 288 |
+
seed: Optional[int] = Field(
|
| 289 |
+
default=None, description="Seed used for this episode (for reproducibility)"
|
| 290 |
+
)
|
| 291 |
+
global_slo_score: float = Field(
|
| 292 |
+
default=0.0, description="Current fraction of services meeting SLO targets"
|
| 293 |
+
)
|
| 294 |
+
terminated: bool = Field(
|
| 295 |
+
default=False, description="Whether the episode has ended for any reason"
|
| 296 |
+
)
|
| 297 |
+
termination_reason: Optional[str] = Field(
|
| 298 |
+
default=None,
|
| 299 |
+
description="Why the episode ended: 'resolved' | 'failed' | 'timeout' | None",
|
| 300 |
+
)
|
openenv.yaml
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: sevzero
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: "SRE Incident Response Environment — an autonomous on-call SRE managing a microservice cluster undergoing cascading failures"
|
| 4 |
+
tags:
|
| 5 |
+
- openenv
|
| 6 |
+
- sre
|
| 7 |
+
- incident-response
|
| 8 |
+
- reinforcement-learning
|
| 9 |
+
- microservices
|
| 10 |
+
- agentic
|
pyproject.toml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "sevzero"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "SRE Incident Response Environment for OpenEnv"
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.11"
|
| 7 |
+
dependencies = [
|
| 8 |
+
"openenv-core>=0.2.2",
|
| 9 |
+
"fastapi>=0.104.0",
|
| 10 |
+
"uvicorn>=0.24.0",
|
| 11 |
+
"pydantic>=2.0.0",
|
| 12 |
+
"openai>=1.0.0",
|
| 13 |
+
]
|
| 14 |
+
|
| 15 |
+
[project.optional-dependencies]
|
| 16 |
+
dev = [
|
| 17 |
+
"pytest>=7.0.0",
|
| 18 |
+
"httpx>=0.24.0",
|
| 19 |
+
]
|
| 20 |
+
|
| 21 |
+
[build-system]
|
| 22 |
+
requires = ["hatchling"]
|
| 23 |
+
build-backend = "hatchling.build"
|
| 24 |
+
|
| 25 |
+
[tool.hatch.build.targets.wheel]
|
| 26 |
+
packages = ["server"]
|
| 27 |
+
|
| 28 |
+
[tool.uv]
|
| 29 |
+
dev-dependencies = [
|
| 30 |
+
"pytest>=7.0.0",
|
| 31 |
+
"httpx>=0.24.0",
|
| 32 |
+
]
|
sdk_info.txt
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
=== Observation fields ===
|
| 2 |
+
done: annotation=bool required=False default=False description='Whether the episode has terminated'
|
| 3 |
+
reward: annotation=Union[bool, int, float, NoneType] required=False default=None description='Reward signal from the last action'
|
| 4 |
+
metadata: annotation=Dict[str, Any] required=False default_factory=dict description='Additional metadata for the observation'
|
| 5 |
+
|
| 6 |
+
=== State fields ===
|
| 7 |
+
episode_id: annotation=Union[str, NoneType] required=False default=None description='Unique identifier for the current episode'
|
| 8 |
+
step_count: annotation=int required=False default=0 description='Number of steps taken in the current episode' metadata=[Ge(ge=0)]
|
| 9 |
+
|
| 10 |
+
=== Action fields ===
|
| 11 |
+
metadata: annotation=Dict[str, Any] required=False default_factory=dict description='Additional metadata for the action'
|
| 12 |
+
|
| 13 |
+
=== Environment methods ===
|
| 14 |
+
_apply_rubric(self, action: ~ActT, observation: ~ObsT) -> float
|
| 15 |
+
_apply_rubric_async(self, action: ~ActT, observation: ~ObsT) -> float
|
| 16 |
+
_apply_transform(self, observation: ~ObsT) -> ~ObsT
|
| 17 |
+
_reset_rubric(self) -> None
|
| 18 |
+
_reset_rubric_async(self) -> None
|
| 19 |
+
close(self) -> None
|
| 20 |
+
get_metadata(self) -> openenv.core.env_server.types.EnvironmentMetadata
|
| 21 |
+
reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any) -> ~ObsT
|
| 22 |
+
reset_async(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any) -> ~ObsT
|
| 23 |
+
step(self, action: ~ActT, timeout_s: Optional[float] = None, **kwargs: Any) -> ~ObsT
|
| 24 |
+
step_async(self, action: ~ActT, timeout_s: Optional[float] = None, **kwargs: Any) -> ~ObsT
|
server/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""server/__init__.py — marks server/ as a Python package."""
|
server/failures.py
ADDED
|
@@ -0,0 +1,437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/failures.py — 8 failure types with injection logic and metric evolution patterns.
|
| 3 |
+
|
| 4 |
+
Each failure type has:
|
| 5 |
+
- A distinctive metric temporal shape (how metrics evolve per tick)
|
| 6 |
+
- Config error subtypes (startup vs runtime)
|
| 7 |
+
- Weighted distribution matching real-world incident data
|
| 8 |
+
|
| 9 |
+
Sources: Google SRE postmortems, Netflix Hystrix, AWS incident reports.
|
| 10 |
+
See Docs/DataResearch.md for full citation.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import random
|
| 16 |
+
from dataclasses import dataclass, field
|
| 17 |
+
from enum import Enum
|
| 18 |
+
from typing import Dict, List, Optional, Tuple
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# ---------------------------------------------------------------------------
|
| 22 |
+
# Failure taxonomy
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class FailureType(str, Enum):
|
| 27 |
+
CRASH = "crash"
|
| 28 |
+
BAD_DEPLOY = "bad_deploy"
|
| 29 |
+
CONFIG_STARTUP = "config_startup" # Service can't boot
|
| 30 |
+
CONFIG_RUNTIME = "config_runtime" # Service runs but specific paths fail
|
| 31 |
+
CASCADING_LATENCY = "cascading_latency"
|
| 32 |
+
RESOURCE_LEAK = "resource_leak"
|
| 33 |
+
DB_DEGRADATION = "db_degradation"
|
| 34 |
+
CACHE_FAILURE = "cache_failure"
|
| 35 |
+
NETWORK_ERROR = "network_error"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Weighted distribution matching Google empirical incident data
|
| 39 |
+
# config=32%, deploy=25%, cascade=15%, crash=10%, leak=8%, DB=5%, cache=3%, network=2%
|
| 40 |
+
_FAILURE_WEIGHTS: Dict[FailureType, float] = {
|
| 41 |
+
FailureType.CONFIG_STARTUP: 0.16,
|
| 42 |
+
FailureType.CONFIG_RUNTIME: 0.16,
|
| 43 |
+
FailureType.BAD_DEPLOY: 0.25,
|
| 44 |
+
FailureType.CASCADING_LATENCY: 0.15,
|
| 45 |
+
FailureType.CRASH: 0.10,
|
| 46 |
+
FailureType.RESOURCE_LEAK: 0.08,
|
| 47 |
+
FailureType.DB_DEGRADATION: 0.05,
|
| 48 |
+
FailureType.CACHE_FAILURE: 0.03,
|
| 49 |
+
FailureType.NETWORK_ERROR: 0.02,
|
| 50 |
+
}
|
| 51 |
+
|
| 52 |
+
# For multi-root incidents: avoid unlikely combinations
|
| 53 |
+
_INCOMPATIBLE_PAIRS = {
|
| 54 |
+
(FailureType.NETWORK_ERROR, FailureType.NETWORK_ERROR), # Two network errors is unrealistic
|
| 55 |
+
(FailureType.CACHE_FAILURE, FailureType.CACHE_FAILURE), # Two cache failures is unrealistic
|
| 56 |
+
}
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@dataclass
|
| 60 |
+
class FailureSpec:
|
| 61 |
+
"""Describes a single injected failure and its evolution parameters."""
|
| 62 |
+
|
| 63 |
+
service_id: str
|
| 64 |
+
failure_type: FailureType
|
| 65 |
+
|
| 66 |
+
# Error rates at various stages (used by metric evolution)
|
| 67 |
+
base_error_rate: float = 0.0 # Healthy baseline
|
| 68 |
+
peak_error_rate: float = 0.0 # At full failure
|
| 69 |
+
onset_ticks: int = 1 # Ticks to reach peak (1=instant, 5=gradual)
|
| 70 |
+
|
| 71 |
+
# Latency impact at peak
|
| 72 |
+
latency_multiplier: float = 1.0 # How much p99 multiplies at peak
|
| 73 |
+
|
| 74 |
+
# Resource impact at peak
|
| 75 |
+
cpu_impact: float = 0.0 # CPU increase (0–1)
|
| 76 |
+
memory_impact: float = 0.0 # Memory increase per tick (for leaks)
|
| 77 |
+
pool_saturation: float = 0.0 # Connection pool impact
|
| 78 |
+
|
| 79 |
+
# Config error subtype metadata
|
| 80 |
+
broken_config_key: Optional[str] = None # Which config key is wrong
|
| 81 |
+
broken_config_value: Optional[str] = None # What the wrong value is
|
| 82 |
+
|
| 83 |
+
# Deployment metadata (for bad_deploy)
|
| 84 |
+
bad_version: Optional[str] = None
|
| 85 |
+
good_version: Optional[str] = None
|
| 86 |
+
|
| 87 |
+
# Network error metadata
|
| 88 |
+
affected_region: Optional[str] = None
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# ---------------------------------------------------------------------------
|
| 92 |
+
# Failure selection
|
| 93 |
+
# ---------------------------------------------------------------------------
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def select_failure_type(
|
| 97 |
+
rng: random.Random,
|
| 98 |
+
exclude: Optional[List[FailureType]] = None,
|
| 99 |
+
) -> FailureType:
|
| 100 |
+
"""Sample a failure type from the empirically-weighted distribution."""
|
| 101 |
+
population = list(_FAILURE_WEIGHTS.keys())
|
| 102 |
+
weights = [_FAILURE_WEIGHTS[f] for f in population]
|
| 103 |
+
|
| 104 |
+
# Remove excluded types
|
| 105 |
+
if exclude:
|
| 106 |
+
filtered = [(f, w) for f, w in zip(population, weights) if f not in exclude]
|
| 107 |
+
if filtered:
|
| 108 |
+
population, weights = zip(*filtered)
|
| 109 |
+
population, weights = list(population), list(weights)
|
| 110 |
+
|
| 111 |
+
return rng.choices(population, weights=weights, k=1)[0]
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def select_multi_root_failures(
|
| 115 |
+
rng: random.Random, count: int = 2
|
| 116 |
+
) -> List[FailureType]:
|
| 117 |
+
"""Select multiple failure types with incompatibility constraints."""
|
| 118 |
+
selected: List[FailureType] = []
|
| 119 |
+
for _ in range(count):
|
| 120 |
+
exclude = selected[:]
|
| 121 |
+
# Also exclude incompatible pairs
|
| 122 |
+
for s in selected:
|
| 123 |
+
for a, b in _INCOMPATIBLE_PAIRS:
|
| 124 |
+
if s == a:
|
| 125 |
+
exclude.append(b)
|
| 126 |
+
elif s == b:
|
| 127 |
+
exclude.append(a)
|
| 128 |
+
ft = select_failure_type(rng, exclude=exclude)
|
| 129 |
+
selected.append(ft)
|
| 130 |
+
return selected
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# ---------------------------------------------------------------------------
|
| 134 |
+
# Failure specification factories
|
| 135 |
+
# ---------------------------------------------------------------------------
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def make_crash_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 139 |
+
"""Service Crash: sudden 5xx spike then drop (service is dead)."""
|
| 140 |
+
return FailureSpec(
|
| 141 |
+
service_id=service_id,
|
| 142 |
+
failure_type=FailureType.CRASH,
|
| 143 |
+
base_error_rate=0.0,
|
| 144 |
+
peak_error_rate=rng.uniform(0.85, 1.0),
|
| 145 |
+
onset_ticks=1, # Instant
|
| 146 |
+
latency_multiplier=0.1, # Latency drops (fast fails, no waiting)
|
| 147 |
+
cpu_impact=0.0, # CPU near zero (process dead)
|
| 148 |
+
memory_impact=0.0,
|
| 149 |
+
)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def make_bad_deploy_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 153 |
+
"""Bad Deployment: step-function error increase after version change."""
|
| 154 |
+
return FailureSpec(
|
| 155 |
+
service_id=service_id,
|
| 156 |
+
failure_type=FailureType.BAD_DEPLOY,
|
| 157 |
+
base_error_rate=0.0,
|
| 158 |
+
peak_error_rate=rng.uniform(0.30, 0.70),
|
| 159 |
+
onset_ticks=1, # Step function — appears at deploy tick
|
| 160 |
+
latency_multiplier=rng.uniform(1.5, 3.0),
|
| 161 |
+
cpu_impact=rng.uniform(0.1, 0.3),
|
| 162 |
+
memory_impact=rng.uniform(0.05, 0.15),
|
| 163 |
+
bad_version="v" + str(rng.randint(2, 9)) + "." + str(rng.randint(0, 9)) + "." + str(rng.randint(1, 9)),
|
| 164 |
+
good_version="v1.0.0",
|
| 165 |
+
)
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def make_config_startup_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 169 |
+
"""Config Error (Startup): service can't boot — zero traffic, health checks fail."""
|
| 170 |
+
config_keys = ["db_password", "db_host", "api_endpoint", "env_var", "config_file"]
|
| 171 |
+
return FailureSpec(
|
| 172 |
+
service_id=service_id,
|
| 173 |
+
failure_type=FailureType.CONFIG_STARTUP,
|
| 174 |
+
base_error_rate=0.0,
|
| 175 |
+
peak_error_rate=1.0, # 100% — service is completely down
|
| 176 |
+
onset_ticks=1,
|
| 177 |
+
latency_multiplier=0.0, # No latency, no traffic
|
| 178 |
+
cpu_impact=-0.9, # CPU near zero (process exited immediately)
|
| 179 |
+
memory_impact=-0.9,
|
| 180 |
+
broken_config_key=rng.choice(config_keys),
|
| 181 |
+
broken_config_value="WRONG_VALUE",
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def make_config_runtime_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 186 |
+
"""Config Error (Runtime): service runs but specific code paths fail."""
|
| 187 |
+
config_keys = ["api_endpoint", "feature_flag", "timeout_ms", "retry_max"]
|
| 188 |
+
return FailureSpec(
|
| 189 |
+
service_id=service_id,
|
| 190 |
+
failure_type=FailureType.CONFIG_RUNTIME,
|
| 191 |
+
base_error_rate=0.0,
|
| 192 |
+
peak_error_rate=rng.uniform(0.20, 0.60),
|
| 193 |
+
onset_ticks=1,
|
| 194 |
+
latency_multiplier=rng.uniform(1.2, 2.0),
|
| 195 |
+
cpu_impact=0.0, # Normal resource usage
|
| 196 |
+
memory_impact=0.0,
|
| 197 |
+
broken_config_key=rng.choice(config_keys),
|
| 198 |
+
broken_config_value="MISCONFIGURED",
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def make_cascading_latency_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 203 |
+
"""
|
| 204 |
+
Cascading Latency: gradual latency ramp → thread pool exhaustion.
|
| 205 |
+
KEY signature: p99 ramps BEFORE errors appear. CPU rises from blocked threads.
|
| 206 |
+
"""
|
| 207 |
+
return FailureSpec(
|
| 208 |
+
service_id=service_id,
|
| 209 |
+
failure_type=FailureType.CASCADING_LATENCY,
|
| 210 |
+
base_error_rate=0.0,
|
| 211 |
+
peak_error_rate=rng.uniform(0.40, 0.85),
|
| 212 |
+
onset_ticks=rng.randint(3, 6), # Gradual ramp
|
| 213 |
+
latency_multiplier=rng.uniform(8.0, 20.0),
|
| 214 |
+
cpu_impact=rng.uniform(0.30, 0.60), # Rising CPU from blocked threads
|
| 215 |
+
memory_impact=rng.uniform(0.10, 0.25),
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
def make_resource_leak_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 220 |
+
"""Resource Leak: steady memory/CPU climb; sawtooth pattern on restarts."""
|
| 221 |
+
return FailureSpec(
|
| 222 |
+
service_id=service_id,
|
| 223 |
+
failure_type=FailureType.RESOURCE_LEAK,
|
| 224 |
+
base_error_rate=0.0,
|
| 225 |
+
peak_error_rate=rng.uniform(0.20, 0.50),
|
| 226 |
+
onset_ticks=rng.randint(5, 10), # Slow burn
|
| 227 |
+
latency_multiplier=rng.uniform(2.0, 5.0),
|
| 228 |
+
cpu_impact=0.05, # Grows per tick (applied in evolution)
|
| 229 |
+
memory_impact=0.06, # LINEAR RAMP — key signature
|
| 230 |
+
)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def make_db_degradation_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 234 |
+
"""DB Degradation: rising DB latency, pool saturation, app CPU paradoxically LOW."""
|
| 235 |
+
return FailureSpec(
|
| 236 |
+
service_id=service_id,
|
| 237 |
+
failure_type=FailureType.DB_DEGRADATION,
|
| 238 |
+
base_error_rate=0.0,
|
| 239 |
+
peak_error_rate=rng.uniform(0.30, 0.70),
|
| 240 |
+
onset_ticks=rng.randint(2, 4),
|
| 241 |
+
latency_multiplier=rng.uniform(5.0, 15.0),
|
| 242 |
+
cpu_impact=-0.2, # PARADOXICALLY LOW (waiting on I/O)
|
| 243 |
+
memory_impact=0.05,
|
| 244 |
+
pool_saturation=0.90, # Connection pool hits 90%+
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def make_cache_failure_spec(service_id: str, rng: random.Random) -> FailureSpec:
|
| 249 |
+
"""Cache Failure: hit-rate cliff → backend QPS 10-50x spike → DB overload."""
|
| 250 |
+
return FailureSpec(
|
| 251 |
+
service_id=service_id,
|
| 252 |
+
failure_type=FailureType.CACHE_FAILURE,
|
| 253 |
+
base_error_rate=0.0,
|
| 254 |
+
peak_error_rate=rng.uniform(0.20, 0.50),
|
| 255 |
+
onset_ticks=1, # CLIFF — simultaneous, not gradual
|
| 256 |
+
latency_multiplier=rng.uniform(3.0, 8.0),
|
| 257 |
+
cpu_impact=0.20,
|
| 258 |
+
memory_impact=0.0,
|
| 259 |
+
)
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def make_network_error_spec(service_id: str, rng: random.Random, region: str = "us-east-1") -> FailureSpec:
|
| 263 |
+
"""Network/Routing Error: connection failures affecting all services to this region."""
|
| 264 |
+
return FailureSpec(
|
| 265 |
+
service_id=service_id,
|
| 266 |
+
failure_type=FailureType.NETWORK_ERROR,
|
| 267 |
+
base_error_rate=0.0,
|
| 268 |
+
peak_error_rate=rng.uniform(0.80, 1.0),
|
| 269 |
+
onset_ticks=1, # Simultaneous, not hop-by-hop
|
| 270 |
+
latency_multiplier=0.2, # Timeout values — fixed high, then drop
|
| 271 |
+
cpu_impact=-0.3, # Low CPU (nothing getting through)
|
| 272 |
+
memory_impact=0.0,
|
| 273 |
+
affected_region=region,
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
_SPEC_FACTORIES = {
|
| 278 |
+
FailureType.CRASH: make_crash_spec,
|
| 279 |
+
FailureType.BAD_DEPLOY: make_bad_deploy_spec,
|
| 280 |
+
FailureType.CONFIG_STARTUP: make_config_startup_spec,
|
| 281 |
+
FailureType.CONFIG_RUNTIME: make_config_runtime_spec,
|
| 282 |
+
FailureType.CASCADING_LATENCY: make_cascading_latency_spec,
|
| 283 |
+
FailureType.RESOURCE_LEAK: make_resource_leak_spec,
|
| 284 |
+
FailureType.DB_DEGRADATION: make_db_degradation_spec,
|
| 285 |
+
FailureType.CACHE_FAILURE: make_cache_failure_spec,
|
| 286 |
+
FailureType.NETWORK_ERROR: make_network_error_spec,
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def make_failure_spec(
|
| 291 |
+
service_id: str,
|
| 292 |
+
failure_type: FailureType,
|
| 293 |
+
rng: random.Random,
|
| 294 |
+
**kwargs,
|
| 295 |
+
) -> FailureSpec:
|
| 296 |
+
"""Create a FailureSpec for the given service and failure type."""
|
| 297 |
+
factory = _SPEC_FACTORIES[failure_type]
|
| 298 |
+
return factory(service_id, rng, **kwargs)
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
# ---------------------------------------------------------------------------
|
| 302 |
+
# Metric evolution: per-type temporal shapes
|
| 303 |
+
# ---------------------------------------------------------------------------
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def compute_failure_magnitude(spec: FailureSpec, ticks_since_failure: int) -> float:
|
| 307 |
+
"""
|
| 308 |
+
Return a 0.0–1.0 magnitude factor for how fully the failure has manifested.
|
| 309 |
+
- Instant failures (onset_ticks=1): full magnitude from tick 1
|
| 310 |
+
- Gradual failures: linear ramp over onset_ticks
|
| 311 |
+
- Resource leaks: continues growing after onset (handled separately)
|
| 312 |
+
"""
|
| 313 |
+
if spec.onset_ticks <= 1:
|
| 314 |
+
return 1.0
|
| 315 |
+
return min(1.0, ticks_since_failure / spec.onset_ticks)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def apply_failure_to_metrics(
|
| 319 |
+
spec: FailureSpec,
|
| 320 |
+
ticks_since_failure: int,
|
| 321 |
+
base_error_rate: float,
|
| 322 |
+
base_p99_ms: float,
|
| 323 |
+
base_cpu: float,
|
| 324 |
+
base_memory: float,
|
| 325 |
+
base_pool: float,
|
| 326 |
+
rng: random.Random,
|
| 327 |
+
) -> Tuple[float, float, float, float, float]:
|
| 328 |
+
"""
|
| 329 |
+
Apply failure evolution to metrics.
|
| 330 |
+
Returns: (error_rate, p99_ms, cpu_pct, memory_pct, pool_pct)
|
| 331 |
+
|
| 332 |
+
Each failure type produces a DISTINCTIVE temporal shape:
|
| 333 |
+
- crash: instant spike → drop (service dead)
|
| 334 |
+
- bad_deploy: step function up at deploy tick
|
| 335 |
+
- config_startup: 100% error, zero traffic
|
| 336 |
+
- config_runtime: partial errors on affected paths
|
| 337 |
+
- cascading_latency: p99 ramps BEFORE errors (early warning)
|
| 338 |
+
- resource_leak: memory linear ramp, sawtooth CPU
|
| 339 |
+
- db_degradation: pool saturation, CPU paradoxically LOW
|
| 340 |
+
- cache_failure: cliff drop simultaneous
|
| 341 |
+
- network_error: cliff, then fixed-high timeout values
|
| 342 |
+
"""
|
| 343 |
+
mag = compute_failure_magnitude(spec, ticks_since_failure)
|
| 344 |
+
|
| 345 |
+
# Add natural stochastic variance (±5%) — Bernoulli trial model
|
| 346 |
+
noise = rng.uniform(-0.03, 0.03)
|
| 347 |
+
|
| 348 |
+
ft = spec.failure_type
|
| 349 |
+
|
| 350 |
+
if ft == FailureType.CRASH:
|
| 351 |
+
error_rate = spec.peak_error_rate * mag + noise
|
| 352 |
+
p99_ms = base_p99_ms * 0.1 * mag + base_p99_ms * (1 - mag) # Drops fast
|
| 353 |
+
cpu_pct = max(0.0, base_cpu * (1 - 0.9 * mag))
|
| 354 |
+
memory_pct = base_memory
|
| 355 |
+
pool_pct = base_pool
|
| 356 |
+
|
| 357 |
+
elif ft == FailureType.BAD_DEPLOY:
|
| 358 |
+
error_rate = spec.peak_error_rate * mag + noise
|
| 359 |
+
p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
|
| 360 |
+
cpu_pct = min(100.0, base_cpu * (1 + spec.cpu_impact * mag))
|
| 361 |
+
memory_pct = min(100.0, base_memory * (1 + spec.memory_impact * mag))
|
| 362 |
+
pool_pct = base_pool
|
| 363 |
+
|
| 364 |
+
elif ft == FailureType.CONFIG_STARTUP:
|
| 365 |
+
error_rate = 1.0 # Always 100% — service won't start
|
| 366 |
+
p99_ms = 0.0 # No traffic = no latency
|
| 367 |
+
cpu_pct = max(0.0, base_cpu * 0.02) # Near zero
|
| 368 |
+
memory_pct = max(0.0, base_memory * 0.02)
|
| 369 |
+
pool_pct = 0.0
|
| 370 |
+
|
| 371 |
+
elif ft == FailureType.CONFIG_RUNTIME:
|
| 372 |
+
error_rate = spec.peak_error_rate * mag + noise
|
| 373 |
+
p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
|
| 374 |
+
cpu_pct = base_cpu # Normal — only specific paths fail
|
| 375 |
+
memory_pct = base_memory
|
| 376 |
+
pool_pct = base_pool
|
| 377 |
+
|
| 378 |
+
elif ft == FailureType.CASCADING_LATENCY:
|
| 379 |
+
# p99 ramps BEFORE errors — the key diagnostic signature
|
| 380 |
+
latency_onset_fraction = min(1.0, ticks_since_failure / max(1, spec.onset_ticks - 1))
|
| 381 |
+
error_onset_fraction = min(1.0, max(0.0, (ticks_since_failure - 1) / spec.onset_ticks))
|
| 382 |
+
|
| 383 |
+
error_rate = spec.peak_error_rate * error_onset_fraction + noise
|
| 384 |
+
p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * latency_onset_fraction)
|
| 385 |
+
cpu_pct = min(100.0, base_cpu * (1 + spec.cpu_impact * latency_onset_fraction))
|
| 386 |
+
memory_pct = min(100.0, base_memory * (1 + spec.memory_impact * latency_onset_fraction))
|
| 387 |
+
pool_pct = base_pool
|
| 388 |
+
|
| 389 |
+
elif ft == FailureType.RESOURCE_LEAK:
|
| 390 |
+
# Memory: LINEAR RAMP to limit (key signature)
|
| 391 |
+
# CPU: Growing GC thrash
|
| 392 |
+
leak_fraction = min(1.0, ticks_since_failure * 0.08) # ~12 ticks to peak
|
| 393 |
+
error_rate = spec.peak_error_rate * min(1.0, leak_fraction * 1.5) + noise
|
| 394 |
+
p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * leak_fraction)
|
| 395 |
+
cpu_pct = min(100.0, base_cpu * (1 + leak_fraction * 0.8)) # GC pressure
|
| 396 |
+
memory_pct = min(100.0, base_memory + leak_fraction * (100 - base_memory))
|
| 397 |
+
pool_pct = base_pool
|
| 398 |
+
|
| 399 |
+
elif ft == FailureType.DB_DEGRADATION:
|
| 400 |
+
error_rate = spec.peak_error_rate * mag + noise
|
| 401 |
+
p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
|
| 402 |
+
# CPU paradoxically LOW — waiting on I/O, not computing
|
| 403 |
+
cpu_pct = max(5.0, base_cpu * (1 + spec.cpu_impact * mag))
|
| 404 |
+
memory_pct = min(100.0, base_memory * (1 + spec.memory_impact * mag))
|
| 405 |
+
pool_pct = min(100.0, base_pool + spec.pool_saturation * mag * 100)
|
| 406 |
+
|
| 407 |
+
elif ft == FailureType.CACHE_FAILURE:
|
| 408 |
+
# CLIFF: simultaneous, not gradual (onset_ticks=1)
|
| 409 |
+
error_rate = spec.peak_error_rate * mag + noise
|
| 410 |
+
p99_ms = base_p99_ms * (1 + (spec.latency_multiplier - 1) * mag)
|
| 411 |
+
cpu_pct = min(100.0, base_cpu * (1 + spec.cpu_impact * mag))
|
| 412 |
+
memory_pct = base_memory
|
| 413 |
+
pool_pct = base_pool
|
| 414 |
+
|
| 415 |
+
elif ft == FailureType.NETWORK_ERROR:
|
| 416 |
+
# Cliff: all fails simultaneously; latency = timeout values then 0
|
| 417 |
+
error_rate = spec.peak_error_rate * mag + noise
|
| 418 |
+
# Latency spikes to timeout then drops (nothing gets through)
|
| 419 |
+
p99_ms = base_p99_ms * 10.0 * max(0.1, 1 - ticks_since_failure * 0.3)
|
| 420 |
+
cpu_pct = max(2.0, base_cpu * (1 + spec.cpu_impact * mag))
|
| 421 |
+
memory_pct = base_memory
|
| 422 |
+
pool_pct = base_pool
|
| 423 |
+
|
| 424 |
+
else:
|
| 425 |
+
error_rate = base_error_rate
|
| 426 |
+
p99_ms = base_p99_ms
|
| 427 |
+
cpu_pct = base_cpu
|
| 428 |
+
memory_pct = base_memory
|
| 429 |
+
pool_pct = base_pool
|
| 430 |
+
|
| 431 |
+
return (
|
| 432 |
+
max(0.0, min(1.0, error_rate)),
|
| 433 |
+
max(1.0, p99_ms),
|
| 434 |
+
max(0.0, min(100.0, cpu_pct)),
|
| 435 |
+
max(0.0, min(100.0, memory_pct)),
|
| 436 |
+
max(0.0, min(100.0, pool_pct)),
|
| 437 |
+
)
|
server/graph.py
ADDED
|
@@ -0,0 +1,470 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
server/graph.py — Service dependency graph generation.
|
| 3 |
+
|
| 4 |
+
Builds layered tree-like DAGs matching real production microservice topologies,
|
| 5 |
+
grounded in Alibaba trace analysis (depth ~3, 5% hotspot services, sparse edges).
|
| 6 |
+
|
| 7 |
+
Design principles:
|
| 8 |
+
- Services chosen from realistic role pools (not generic names)
|
| 9 |
+
- Layered: edge → identity → business → infra; edge → leaf dependencies
|
| 10 |
+
- Dependency edges are directed (A depends_on B = A calls B)
|
| 11 |
+
- ~5% of services are high-in-degree hotspots (shared cache, DB, auth)
|
| 12 |
+
- Sparse and tree-like; most nodes have in-degree 1
|
| 13 |
+
- Conditional edges have activation_probability < 1.0 (Easy: all 1.0)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import random
|
| 19 |
+
from dataclasses import dataclass, field
|
| 20 |
+
from typing import Dict, List, Optional, Tuple
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
# Service role pools (realistic names, not generic)
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
|
| 27 |
+
_EDGE_POOL = [
|
| 28 |
+
"api-gateway",
|
| 29 |
+
"graphql-gateway",
|
| 30 |
+
"bff-web",
|
| 31 |
+
"bff-mobile",
|
| 32 |
+
"cdn-edge",
|
| 33 |
+
]
|
| 34 |
+
|
| 35 |
+
_IDENTITY_POOL = [
|
| 36 |
+
"auth-service",
|
| 37 |
+
"identity-provider",
|
| 38 |
+
"session-service",
|
| 39 |
+
"oauth-service",
|
| 40 |
+
"token-service",
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
_BUSINESS_POOL = [
|
| 44 |
+
"order-service",
|
| 45 |
+
"payment-service",
|
| 46 |
+
"inventory-service",
|
| 47 |
+
"catalog-service",
|
| 48 |
+
"pricing-service",
|
| 49 |
+
"cart-service",
|
| 50 |
+
"checkout-service",
|
| 51 |
+
"shipping-service",
|
| 52 |
+
"recommendation-service",
|
| 53 |
+
"search-service",
|
| 54 |
+
"review-service",
|
| 55 |
+
"subscription-service",
|
| 56 |
+
"billing-service",
|
| 57 |
+
"refund-service",
|
| 58 |
+
"notification-service",
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
_INFRA_POOL = [
|
| 62 |
+
"postgres-primary",
|
| 63 |
+
"postgres-replica",
|
| 64 |
+
"redis-cache",
|
| 65 |
+
"redis-session",
|
| 66 |
+
"kafka-broker",
|
| 67 |
+
"elasticsearch",
|
| 68 |
+
"object-storage",
|
| 69 |
+
"config-service",
|
| 70 |
+
]
|
| 71 |
+
|
| 72 |
+
_CROSS_CUTTING_POOL = [
|
| 73 |
+
"email-service",
|
| 74 |
+
"sms-service",
|
| 75 |
+
"metrics-collector",
|
| 76 |
+
"fraud-service",
|
| 77 |
+
"audit-service",
|
| 78 |
+
"feature-flags",
|
| 79 |
+
"rate-limiter",
|
| 80 |
+
]
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ---------------------------------------------------------------------------
|
| 84 |
+
# Data structures
|
| 85 |
+
# ---------------------------------------------------------------------------
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
@dataclass
|
| 89 |
+
class ServiceNode:
|
| 90 |
+
"""A service node in the dependency graph."""
|
| 91 |
+
|
| 92 |
+
id: str
|
| 93 |
+
layer: str # "edge" | "identity" | "business" | "infra" | "cross-cutting"
|
| 94 |
+
|
| 95 |
+
# Queueing theory baseline parameters (modified by failures at runtime)
|
| 96 |
+
base_arrival_rate: float = 100.0 # λ — requests/tick at baseline
|
| 97 |
+
base_service_time_local: float = 0.05 # S_local — seconds per request (local work)
|
| 98 |
+
thread_pool_size: int = 50 # T — max concurrent in-flight requests
|
| 99 |
+
|
| 100 |
+
# Default config (tunable by agent)
|
| 101 |
+
default_timeout_ms: int = 5000
|
| 102 |
+
default_retry_max: int = 3
|
| 103 |
+
default_retry_backoff: bool = False
|
| 104 |
+
default_circuit_breaker_threshold: float = 0.5
|
| 105 |
+
default_pool_size: int = 20
|
| 106 |
+
|
| 107 |
+
# Deployment defaults
|
| 108 |
+
default_replicas: int = 2
|
| 109 |
+
default_version: str = "v1.0.0"
|
| 110 |
+
|
| 111 |
+
# Whether this node is a "hotspot" (high in-degree shared infra)
|
| 112 |
+
is_hotspot: bool = False
|
| 113 |
+
|
| 114 |
+
# Whether this is a background-job node (can be pause_job target)
|
| 115 |
+
has_background_job: bool = False
|
| 116 |
+
|
| 117 |
+
# Whether this is a cache node (can be clear_cache target)
|
| 118 |
+
is_cache: bool = False
|
| 119 |
+
|
| 120 |
+
# Max replicas the agent can scale to
|
| 121 |
+
max_replicas: int = 8
|
| 122 |
+
|
| 123 |
+
# Region (for Hard mode multi-region topologies)
|
| 124 |
+
region: str = "us-east-1"
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
@dataclass
|
| 128 |
+
class DependencyEdge:
|
| 129 |
+
"""A directed dependency edge: source depends on (calls) target."""
|
| 130 |
+
|
| 131 |
+
source: str # service that makes the call
|
| 132 |
+
target: str # service that receives the call
|
| 133 |
+
|
| 134 |
+
# Fraction of ticks this edge is active (1.0 = always; 0.2 = ~20% of ticks)
|
| 135 |
+
activation_probability: float = 1.0
|
| 136 |
+
|
| 137 |
+
# Edge type for documentation
|
| 138 |
+
edge_type: str = "sync" # "sync" | "async" | "optional"
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
@dataclass
|
| 142 |
+
class ServiceGraph:
|
| 143 |
+
"""Complete service dependency graph for one episode."""
|
| 144 |
+
|
| 145 |
+
nodes: List[ServiceNode] = field(default_factory=list)
|
| 146 |
+
edges: List[DependencyEdge] = field(default_factory=list)
|
| 147 |
+
|
| 148 |
+
# Derived lookup structures (populated after build)
|
| 149 |
+
node_map: Dict[str, ServiceNode] = field(default_factory=dict)
|
| 150 |
+
adjacency: Dict[str, List[str]] = field(default_factory=dict) # source → [targets]
|
| 151 |
+
reverse_adjacency: Dict[str, List[str]] = field(default_factory=dict) # target → [callers]
|
| 152 |
+
|
| 153 |
+
# Metadata
|
| 154 |
+
difficulty: str = "easy"
|
| 155 |
+
has_multiple_regions: bool = False
|
| 156 |
+
regions: List[str] = field(default_factory=lambda: ["us-east-1"])
|
| 157 |
+
cache_services: List[str] = field(default_factory=list)
|
| 158 |
+
background_jobs: List[str] = field(default_factory=list)
|
| 159 |
+
|
| 160 |
+
def build_indices(self) -> None:
|
| 161 |
+
"""Build lookup maps after nodes/edges are populated."""
|
| 162 |
+
self.node_map = {n.id: n for n in self.nodes}
|
| 163 |
+
self.adjacency = {n.id: [] for n in self.nodes}
|
| 164 |
+
self.reverse_adjacency = {n.id: [] for n in self.nodes}
|
| 165 |
+
for edge in self.edges:
|
| 166 |
+
self.adjacency[edge.source].append(edge.target)
|
| 167 |
+
self.reverse_adjacency[edge.target].append(edge.source)
|
| 168 |
+
self.cache_services = [n.id for n in self.nodes if n.is_cache]
|
| 169 |
+
self.background_jobs = [n.id for n in self.nodes if n.has_background_job]
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
# ---------------------------------------------------------------------------
|
| 173 |
+
# Graph generation functions
|
| 174 |
+
# ---------------------------------------------------------------------------
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def _pick(pool: List[str], rng: random.Random, exclude: set) -> Optional[str]:
|
| 178 |
+
"""Pick a random name from pool not already in exclude set."""
|
| 179 |
+
choices = [x for x in pool if x not in exclude]
|
| 180 |
+
if not choices:
|
| 181 |
+
return None
|
| 182 |
+
return rng.choice(choices)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _make_node(
|
| 186 |
+
service_id: str,
|
| 187 |
+
layer: str,
|
| 188 |
+
is_hotspot: bool = False,
|
| 189 |
+
is_cache: bool = False,
|
| 190 |
+
has_background_job: bool = False,
|
| 191 |
+
arrival_rate: float = 100.0,
|
| 192 |
+
service_time: float = 0.05,
|
| 193 |
+
thread_pool: int = 50,
|
| 194 |
+
) -> ServiceNode:
|
| 195 |
+
"""Create a ServiceNode with sensible per-layer defaults."""
|
| 196 |
+
# Infra nodes handle more concurrency, edge nodes get more traffic
|
| 197 |
+
if layer == "edge":
|
| 198 |
+
arrival_rate = 500.0
|
| 199 |
+
thread_pool = 100
|
| 200 |
+
elif layer == "infra":
|
| 201 |
+
arrival_rate = 200.0
|
| 202 |
+
service_time = 0.02 # DBs are fast per-query
|
| 203 |
+
thread_pool = 30
|
| 204 |
+
if is_cache:
|
| 205 |
+
service_time = 0.001
|
| 206 |
+
thread_pool = 200
|
| 207 |
+
|
| 208 |
+
return ServiceNode(
|
| 209 |
+
id=service_id,
|
| 210 |
+
layer=layer,
|
| 211 |
+
base_arrival_rate=arrival_rate,
|
| 212 |
+
base_service_time_local=service_time,
|
| 213 |
+
thread_pool_size=thread_pool,
|
| 214 |
+
is_hotspot=is_hotspot,
|
| 215 |
+
is_cache=is_cache,
|
| 216 |
+
has_background_job=has_background_job,
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
|
| 220 |
+
def generate_easy_graph(rng: random.Random) -> ServiceGraph:
|
| 221 |
+
"""
|
| 222 |
+
Easy: 3-5 services, linear chain.
|
| 223 |
+
api-gateway → order-service → postgres-primary
|
| 224 |
+
Agent must identify and fix one failing service in this simple topology.
|
| 225 |
+
"""
|
| 226 |
+
graph = ServiceGraph(difficulty="easy")
|
| 227 |
+
used: set = set()
|
| 228 |
+
|
| 229 |
+
# Always have a gateway at the edge
|
| 230 |
+
gateway_id = "api-gateway"
|
| 231 |
+
used.add(gateway_id)
|
| 232 |
+
|
| 233 |
+
# Pick 1-2 business services
|
| 234 |
+
biz_count = rng.randint(1, 2)
|
| 235 |
+
biz_nodes = []
|
| 236 |
+
for _ in range(biz_count):
|
| 237 |
+
svc = _pick(_BUSINESS_POOL, rng, used)
|
| 238 |
+
if svc:
|
| 239 |
+
used.add(svc)
|
| 240 |
+
biz_nodes.append(svc)
|
| 241 |
+
|
| 242 |
+
# Always have one DB at the leaf
|
| 243 |
+
db_id = "postgres-primary"
|
| 244 |
+
used.add(db_id)
|
| 245 |
+
|
| 246 |
+
# Optionally add a cache
|
| 247 |
+
add_cache = rng.random() > 0.4
|
| 248 |
+
cache_id = "redis-cache" if add_cache else None
|
| 249 |
+
if cache_id:
|
| 250 |
+
used.add(cache_id)
|
| 251 |
+
|
| 252 |
+
# Build nodes
|
| 253 |
+
graph.nodes.append(_make_node(gateway_id, "edge"))
|
| 254 |
+
for biz in biz_nodes:
|
| 255 |
+
graph.nodes.append(_make_node(biz, "business"))
|
| 256 |
+
graph.nodes.append(
|
| 257 |
+
_make_node(db_id, "infra", is_hotspot=True, arrival_rate=200.0)
|
| 258 |
+
)
|
| 259 |
+
if cache_id:
|
| 260 |
+
graph.nodes.append(
|
| 261 |
+
_make_node(cache_id, "infra", is_hotspot=True, is_cache=True)
|
| 262 |
+
)
|
| 263 |
+
|
| 264 |
+
# Build linear dependency chain: gateway → biz[0] → biz[1]? → db
|
| 265 |
+
chain = [gateway_id] + biz_nodes + [db_id]
|
| 266 |
+
for i in range(len(chain) - 1):
|
| 267 |
+
graph.edges.append(DependencyEdge(source=chain[i], target=chain[i + 1]))
|
| 268 |
+
|
| 269 |
+
# If cache exists, business services call it (optional edge for realism)
|
| 270 |
+
if cache_id and biz_nodes:
|
| 271 |
+
for biz in biz_nodes:
|
| 272 |
+
graph.edges.append(
|
| 273 |
+
DependencyEdge(source=biz, target=cache_id, activation_probability=0.9)
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
graph.build_indices()
|
| 277 |
+
return graph
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def generate_medium_graph(rng: random.Random) -> ServiceGraph:
|
| 281 |
+
"""
|
| 282 |
+
Medium: 8-15 services, branching DAG.
|
| 283 |
+
gateway → auth + 3-4 domain services → shared DB + cache + kafka.
|
| 284 |
+
Agent must trace through the graph to find a root cause that's upstream
|
| 285 |
+
of the service showing the worst symptoms.
|
| 286 |
+
"""
|
| 287 |
+
graph = ServiceGraph(difficulty="medium")
|
| 288 |
+
used: set = set()
|
| 289 |
+
|
| 290 |
+
# Edge layer: 1 gateway
|
| 291 |
+
gateway_id = "api-gateway"
|
| 292 |
+
used.add(gateway_id)
|
| 293 |
+
graph.nodes.append(_make_node(gateway_id, "edge"))
|
| 294 |
+
|
| 295 |
+
# Identity layer: auth (gateway always calls auth)
|
| 296 |
+
auth_id = "auth-service"
|
| 297 |
+
used.add(auth_id)
|
| 298 |
+
graph.nodes.append(_make_node(auth_id, "identity"))
|
| 299 |
+
graph.edges.append(DependencyEdge(source=gateway_id, target=auth_id))
|
| 300 |
+
|
| 301 |
+
# Business layer: 4-6 domain services fanning out from gateway
|
| 302 |
+
biz_count = rng.randint(4, 6)
|
| 303 |
+
biz_nodes = []
|
| 304 |
+
for _ in range(biz_count):
|
| 305 |
+
svc = _pick(_BUSINESS_POOL, rng, used)
|
| 306 |
+
if svc:
|
| 307 |
+
used.add(svc)
|
| 308 |
+
biz_nodes.append(svc)
|
| 309 |
+
graph.nodes.append(_make_node(svc, "business"))
|
| 310 |
+
graph.edges.append(DependencyEdge(source=gateway_id, target=svc))
|
| 311 |
+
|
| 312 |
+
# Infra layer: shared DB + cache (hotspot nodes)
|
| 313 |
+
db_id = "postgres-primary"
|
| 314 |
+
cache_id = "redis-cache"
|
| 315 |
+
used.update([db_id, cache_id])
|
| 316 |
+
graph.nodes.append(_make_node(db_id, "infra", is_hotspot=True, arrival_rate=300.0))
|
| 317 |
+
graph.nodes.append(_make_node(cache_id, "infra", is_hotspot=True, is_cache=True))
|
| 318 |
+
|
| 319 |
+
# Business services call the shared DB and cache
|
| 320 |
+
for biz in biz_nodes:
|
| 321 |
+
graph.edges.append(DependencyEdge(source=biz, target=db_id))
|
| 322 |
+
# Cache: most biz services call it, but with high-freq optional
|
| 323 |
+
graph.edges.append(
|
| 324 |
+
DependencyEdge(source=biz, target=cache_id, activation_probability=0.8)
|
| 325 |
+
)
|
| 326 |
+
|
| 327 |
+
# Optionally add kafka as an async edge (1-2 business services produce to it)
|
| 328 |
+
if rng.random() > 0.4:
|
| 329 |
+
kafka_id = "kafka-broker"
|
| 330 |
+
used.add(kafka_id)
|
| 331 |
+
graph.nodes.append(
|
| 332 |
+
_make_node(kafka_id, "infra", has_background_job=True)
|
| 333 |
+
)
|
| 334 |
+
producers = rng.sample(biz_nodes, min(2, len(biz_nodes)))
|
| 335 |
+
for p in producers:
|
| 336 |
+
graph.edges.append(
|
| 337 |
+
DependencyEdge(source=p, target=kafka_id, edge_type="async", activation_probability=0.6)
|
| 338 |
+
)
|
| 339 |
+
|
| 340 |
+
# Cross-cutting: add 1-2 optional services (fraud, notification) called by some biz
|
| 341 |
+
cross_count = rng.randint(1, 2)
|
| 342 |
+
for _ in range(cross_count):
|
| 343 |
+
svc = _pick(_CROSS_CUTTING_POOL, rng, used)
|
| 344 |
+
if svc and biz_nodes:
|
| 345 |
+
used.add(svc)
|
| 346 |
+
caller = rng.choice(biz_nodes)
|
| 347 |
+
graph.nodes.append(_make_node(svc, "cross-cutting"))
|
| 348 |
+
graph.edges.append(
|
| 349 |
+
DependencyEdge(source=caller, target=svc, activation_probability=0.3)
|
| 350 |
+
)
|
| 351 |
+
|
| 352 |
+
graph.build_indices()
|
| 353 |
+
return graph
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
def generate_hard_graph(rng: random.Random) -> ServiceGraph:
|
| 357 |
+
"""
|
| 358 |
+
Hard: 15-30 services, complex multi-region DAG with hotspots,
|
| 359 |
+
conditional edges, multiple infra tiers, and background jobs.
|
| 360 |
+
Agent must manage a Sev-0 multi-root incident with conflicting mitigations.
|
| 361 |
+
"""
|
| 362 |
+
graph = ServiceGraph(difficulty="hard", has_multiple_regions=True)
|
| 363 |
+
graph.regions = ["us-east-1", "us-west-2"]
|
| 364 |
+
used: set = set()
|
| 365 |
+
|
| 366 |
+
all_biz_nodes: List[str] = []
|
| 367 |
+
|
| 368 |
+
# Build per-region sub-graphs, then connect them
|
| 369 |
+
for region in graph.regions:
|
| 370 |
+
suffix = "-east" if "east" in region else "-west"
|
| 371 |
+
|
| 372 |
+
# Edge: one gateway per region
|
| 373 |
+
gw = f"api-gateway{suffix}"
|
| 374 |
+
used.add(gw)
|
| 375 |
+
node = _make_node(gw, "edge")
|
| 376 |
+
node.region = region
|
| 377 |
+
graph.nodes.append(node)
|
| 378 |
+
|
| 379 |
+
# Identity: auth per region
|
| 380 |
+
auth = f"auth-service{suffix}"
|
| 381 |
+
used.add(auth)
|
| 382 |
+
node = _make_node(auth, "identity")
|
| 383 |
+
node.region = region
|
| 384 |
+
graph.nodes.append(node)
|
| 385 |
+
graph.edges.append(DependencyEdge(source=gw, target=auth))
|
| 386 |
+
|
| 387 |
+
# Business: 4-6 services per region
|
| 388 |
+
region_biz: List[str] = []
|
| 389 |
+
for _ in range(rng.randint(4, 6)):
|
| 390 |
+
svc_base = _pick(_BUSINESS_POOL, rng, used)
|
| 391 |
+
if svc_base:
|
| 392 |
+
svc = f"{svc_base}{suffix}"
|
| 393 |
+
used.add(svc)
|
| 394 |
+
region_biz.append(svc)
|
| 395 |
+
node = _make_node(svc, "business")
|
| 396 |
+
node.region = region
|
| 397 |
+
graph.nodes.append(node)
|
| 398 |
+
graph.edges.append(DependencyEdge(source=gw, target=svc))
|
| 399 |
+
|
| 400 |
+
all_biz_nodes.extend(region_biz)
|
| 401 |
+
|
| 402 |
+
# Infra: per-region replicas (postgres-replica is a hotspot)
|
| 403 |
+
pg_replica = f"postgres-replica{suffix}"
|
| 404 |
+
redis_svc = f"redis-cache{suffix}"
|
| 405 |
+
used.update([pg_replica, redis_svc])
|
| 406 |
+
node = _make_node(pg_replica, "infra", is_hotspot=True)
|
| 407 |
+
node.region = region
|
| 408 |
+
graph.nodes.append(node)
|
| 409 |
+
node = _make_node(redis_svc, "infra", is_hotspot=True, is_cache=True)
|
| 410 |
+
node.region = region
|
| 411 |
+
graph.nodes.append(node)
|
| 412 |
+
|
| 413 |
+
for biz in region_biz:
|
| 414 |
+
graph.edges.append(DependencyEdge(source=biz, target=pg_replica))
|
| 415 |
+
graph.edges.append(
|
| 416 |
+
DependencyEdge(source=biz, target=redis_svc, activation_probability=0.85)
|
| 417 |
+
)
|
| 418 |
+
|
| 419 |
+
# Shared global infra (hotspots called by both regions)
|
| 420 |
+
pg_primary = "postgres-primary"
|
| 421 |
+
kafka = "kafka-broker"
|
| 422 |
+
config_svc = "config-service"
|
| 423 |
+
used.update([pg_primary, kafka, config_svc])
|
| 424 |
+
|
| 425 |
+
graph.nodes.append(_make_node(pg_primary, "infra", is_hotspot=True, arrival_rate=500.0))
|
| 426 |
+
graph.nodes.append(_make_node(kafka, "infra", has_background_job=True))
|
| 427 |
+
graph.nodes.append(_make_node(config_svc, "infra", is_hotspot=True))
|
| 428 |
+
|
| 429 |
+
# Replicas call primary (replication)
|
| 430 |
+
for region in graph.regions:
|
| 431 |
+
suffix = "-east" if "east" in region else "-west"
|
| 432 |
+
graph.edges.append(
|
| 433 |
+
DependencyEdge(source=f"postgres-replica{suffix}", target=pg_primary)
|
| 434 |
+
)
|
| 435 |
+
|
| 436 |
+
# Business services use kafka for async events and config-service for feature flags
|
| 437 |
+
for biz in all_biz_nodes:
|
| 438 |
+
if rng.random() > 0.5:
|
| 439 |
+
graph.edges.append(
|
| 440 |
+
DependencyEdge(source=biz, target=kafka, edge_type="async", activation_probability=0.5)
|
| 441 |
+
)
|
| 442 |
+
graph.edges.append(
|
| 443 |
+
DependencyEdge(source=biz, target=config_svc, activation_probability=0.2)
|
| 444 |
+
)
|
| 445 |
+
|
| 446 |
+
# Cross-cutting services (low-freq optional edges)
|
| 447 |
+
for _ in range(rng.randint(2, 3)):
|
| 448 |
+
svc = _pick(_CROSS_CUTTING_POOL, rng, used)
|
| 449 |
+
if svc and all_biz_nodes:
|
| 450 |
+
used.add(svc)
|
| 451 |
+
caller = rng.choice(all_biz_nodes)
|
| 452 |
+
graph.nodes.append(_make_node(svc, "cross-cutting"))
|
| 453 |
+
graph.edges.append(
|
| 454 |
+
DependencyEdge(source=caller, target=svc, activation_probability=0.25)
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
graph.build_indices()
|
| 458 |
+
return graph
|
| 459 |
+
|
| 460 |
+
|
| 461 |
+
def generate_graph(difficulty: str, rng: random.Random) -> ServiceGraph:
|
| 462 |
+
"""Generate a service dependency graph for the given difficulty level."""
|
| 463 |
+
if difficulty == "easy":
|
| 464 |
+
return generate_easy_graph(rng)
|
| 465 |
+
elif difficulty == "medium":
|
| 466 |
+
return generate_medium_graph(rng)
|
| 467 |
+
elif difficulty == "hard":
|
| 468 |
+
return generate_hard_graph(rng)
|
| 469 |
+
else:
|
| 470 |
+
raise ValueError(f"Unknown difficulty: {difficulty!r}. Must be easy|medium|hard.")
|