Spaces:

Mist-ic
/

sevzero

Sleeping

App Files Files Community

Mist-ic committed on Mar 29

Commit

64d38cb

0 Parent(s):

Checkpoint: existing implementation (pre-cleanup)

Browse files

Files changed (8) hide show

.gitignore +5 -0
models.py +300 -0
openenv.yaml +10 -0
pyproject.toml +32 -0
sdk_info.txt +24 -0
server/__init__.py +1 -0
server/failures.py +437 -0
server/graph.py +470 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,5 @@

+# Documentation and research (not part of the submission)
+Docs/
+# OpenEnv preparatory course (dev reference only, not part of submission)
+openenv-course/

models.py ADDED Viewed

	@@ -0,0 +1,300 @@

+"""
+SevZero — Typed Pydantic models for Action, Observation, and State.
+These are the public API contracts at the package root (OpenEnv requirement).
+Every field is documented because the observation JSON must be self-explanatory
+to any LLM evaluator without additional context.
+"""
+from __future__ import annotations
+from typing import Any, Dict, List, Optional, Union
+from pydantic import Field
+from openenv.core.env_server import Action, Observation, State
+# ---------------------------------------------------------------------------
+# Sub-models: nested inside SevZeroObservation
+# ---------------------------------------------------------------------------
from pydantic import BaseModel


class CircuitBreakerInfo(dict):
    """Maps dependency name -> breaker state ('CLOSED' | 'OPEN' | 'HALF_OPEN')."""


class ServiceInfo(object):
    """Per-service observable state — declared as plain dict in observation for
    JSON-serialisability; structured via ServiceInfoModel for validation."""


# Single definition of the per-service schema. It is a pydantic model so that
# every field carries a description and is validated on construction.
class ServiceInfoModel(BaseModel):
    """
    All observable per-service metrics, ordered by SRE triage priority:
    symptoms first, traffic second, saturation third, context last.
    """
    # Identity
    id: str = Field(description="Service identifier, e.g. 'payment-service'")
    layer: str = Field(
        description="Service layer: 'edge' | 'domain' | 'infra' | 'cross-cutting'"
    )
    status: str = Field(
        description="Aggregate health: 'healthy' | 'degraded' | 'critical' | 'down'"
    )
    # --- Symptoms (error + latency) ---
    error_rate: float = Field(
        description="Fraction of requests failing this tick (0.0–1.0)"
    )
    latency_p50_ms: float = Field(description="Median request latency in milliseconds")
    latency_p95_ms: float = Field(description="95th-percentile latency in milliseconds")
    latency_p99_ms: float = Field(description="99th-percentile latency in milliseconds")
    # --- Traffic ---
    throughput_rps: float = Field(
        description="Successful requests served per tick"
    )
    # --- Saturation ---
    cpu_pct: float = Field(description="CPU utilisation 0–100")
    memory_pct: float = Field(description="Memory utilisation 0–100")
    connection_pool_usage_pct: float = Field(
        description="DB connection pool saturation 0–100; high = I/O bottleneck"
    )
    # --- Deployment context ---
    replicas: int = Field(description="Number of running replicas")
    version: str = Field(description="Currently deployed version tag")
    previous_version: Optional[str] = Field(
        default=None,
        description="Previous version available for rollback; null if never changed",
    )
    # --- Dependency graph ---
    depends_on: List[str] = Field(
        default_factory=list,
        description="Direct service dependencies (downstream calls)",
    )
    circuit_breakers: Dict[str, str] = Field(
        default_factory=dict,
        description=(
            "Per-dependency circuit breaker state. "
            "Keys are dependency IDs; values are 'CLOSED' | 'OPEN' | 'HALF_OPEN'."
        ),
    )
class AlertInfo(BaseModel):
    """A structured active alert, ordered by severity."""

    # Urgency bucket of the alert.
    severity: str = Field(
        description="'critical' | 'warning' | 'info'",
    )
    # Originating service.
    service: str = Field(
        description="Service ID that triggered the alert",
    )
    # Machine-readable category; the description enumerates the known values.
    type: str = Field(
        description=(
            "Alert category: "
            "'error_rate_high' | 'latency_high' | "
            "'circuit_breaker_open' | 'connection_pool_saturated' | "
            "'memory_high' | 'cpu_high' | 'service_down'"
        ),
    )
    # Free-text explanation intended for the reader of the observation.
    message: str = Field(
        description="Human-readable alert description with metric values",
    )
    # Lets a consumer tell long-running alerts from fresh ones.
    first_seen_tick: int = Field(
        description="Tick at which this alert first fired",
    )
class DeployInfo(BaseModel):
    """A recent deployment event visible in the observation."""

    # Target of the rollout.
    service: str = Field(
        description="Service that was deployed",
    )
    # Version tag that the rollout installed.
    version: str = Field(
        description="New version deployed",
    )
    # Age of the event, measured relative to the current tick.
    ticks_ago: int = Field(
        description="How many ticks ago the deploy happened",
    )
class ActionRecord(BaseModel):
    """A previously taken action, shown in the observation for agent context."""

    # When and what.
    tick: int = Field(
        description="Tick at which the action was executed",
    )
    action: str = Field(
        description="Action type, e.g. 'restart_service'",
    )
    # Optional because not every action has a target.
    target: Optional[str] = Field(
        description="Primary target service/resource",
        default=None,
    )
    # Outcome, plus an optional explanation of that outcome.
    success: bool = Field(
        description="Whether the action completed successfully",
    )
    note: Optional[str] = Field(
        description="Extra context, e.g. 'service already healthy' or error reason",
        default=None,
    )
class LegalAction(BaseModel):
    """One type of action the agent is currently allowed to take."""

    # Name of the operation; the description lists every accepted value.
    action_type: str = Field(
        description=(
            "One of: "
            "inspect_logs | inspect_metrics | inspect_traces | "
            "restart_service | rollback_service | scale_service | tune_config | "
            "clear_cache | rebalance_traffic | pause_job | noop"
        ),
    )
    # Required field (no default): the targets this operation accepts right now.
    valid_targets: List[str] = Field(
        description=(
            "Service IDs (or other resource names) this action can target right now"
        ),
    )
+# ---------------------------------------------------------------------------
+# Top-level OpenEnv models
+# ---------------------------------------------------------------------------
class SevZeroAction(Action):
    """
    An action the agent takes in SevZero.
    Choose exactly one action_type and provide the required params for it:
      inspect_logs(service_id)         -> logs: str in next observation
      inspect_metrics(service_id)      -> metric_history in next observation
      inspect_traces(service_id)       -> traces in next observation
      restart_service(service_id)      -> restarts pod; 1-2 tick delay
      rollback_service(service_id)     -> reverts to previous_version; 2-3 tick delay
      scale_service(service_id, replicas=N)   -> adjusts replica count; 2-4 tick delay
      tune_config(service_id, key, value)     -> updates config param; 1 tick delay
      clear_cache(cache_name)          -> flushes cache; 1 tick delay
      rebalance_traffic(from_region, to_region, pct)  -> shifts traffic; 2-3 tick delay
      pause_job(job_name)              -> pauses background job; 1 tick delay
      noop()                           -> wait and observe; 0 ticks
    """

    # Required: the operation name (free-form str; not validated against a
    # closed set at the model level).
    action_type: str = Field(
        description=(
            "Which operation to perform. Must be one of the 11 action types. "
            "Must appear in legal_actions from the previous observation."
        ),
    )

    # Optional: per-operation arguments; a fresh empty dict is created per instance.
    params: Dict[str, Any] = Field(
        description=(
            "Action parameters. Examples: "
            "{'service_id': 'payment-service'}, "
            "{'service_id': 'payment-service', 'replicas': 4}, "
            "{'service_id': 'payment-service', 'key': 'timeout_ms', 'value': 2000}"
        ),
        default_factory=dict,
    )
class SevZeroObservation(Observation):
    """
    Full observation returned by reset() and step().
    Fields are ordered by SRE triage priority: incident summary first,
    then per-service metrics, then alerts, then context, then agent state.
    The `done` and `reward` fields are inherited from Observation base.
    """

    # ------------------------------------------------------------------
    # Episode context
    # ------------------------------------------------------------------
    tick: int = Field(
        description="Current simulation tick (0-indexed)",
        default=0,
    )
    episode_id: Optional[str] = Field(
        description="Unique ID for this episode",
        default=None,
    )
    task_id: str = Field(
        description="Which task is running: 'easy' | 'medium' | 'hard'",
        default="easy",
    )
    status: str = Field(
        description=(
            "Episode status: 'playing' | 'resolved' (all SLOs met) | "
            "'failed' (system collapse) | 'timeout' (max steps exceeded)"
        ),
        default="playing",
    )
    max_steps: int = Field(
        description="Step budget for this task (Easy=10, Medium=20, Hard=50)",
        default=10,
    )

    # ------------------------------------------------------------------
    # Health summary
    # ------------------------------------------------------------------
    global_slo_score: float = Field(
        description="Fraction of services currently meeting all SLO targets (0.0–1.0)",
        default=0.0,
    )
    observation_summary: str = Field(
        description=(
            "One-sentence natural-language summary of the current situation. "
            "Read this first — it gives you the critical context for your next action."
        ),
        default="",
    )

    # ------------------------------------------------------------------
    # Per-service state (plain dicts rather than nested models)
    # ------------------------------------------------------------------
    services: List[Dict[str, Any]] = Field(
        description=(
            "Full state for every service in the cluster. "
            "See ServiceInfoModel for field definitions."
        ),
        default_factory=list,
    )

    # ------------------------------------------------------------------
    # Active alerts
    # ------------------------------------------------------------------
    alerts: List[Dict[str, Any]] = Field(
        description="Active alerts sorted by severity (critical first). See AlertInfo.",
        default_factory=list,
    )

    # ------------------------------------------------------------------
    # Context
    # ------------------------------------------------------------------
    recent_deploys: List[Dict[str, Any]] = Field(
        description="Deployments in the last 10 ticks. Correlate with error onset.",
        default_factory=list,
    )
    actions_taken: List[Dict[str, Any]] = Field(
        description="Last 10 actions taken in this episode, for agent context.",
        default_factory=list,
    )

    # ------------------------------------------------------------------
    # Action space
    # ------------------------------------------------------------------
    legal_actions: List[Dict[str, Any]] = Field(
        description=(
            "Exactly what actions are available right now with valid targets. "
            "Only use actions listed here. Invalid actions return a -0.5 penalty."
        ),
        default_factory=list,
    )

    # ------------------------------------------------------------------
    # Diagnostic output from inspect_* actions (None until requested)
    # ------------------------------------------------------------------
    logs: Optional[str] = Field(
        description="Log output from the most recent inspect_logs action, if any.",
        default=None,
    )
    metric_history: Optional[List[Dict[str, Any]]] = Field(
        description="Per-tick metric history from the most recent inspect_metrics action.",
        default=None,
    )
    traces: Optional[Dict[str, Any]] = Field(
        description="Distributed trace from the most recent inspect_traces action.",
        default=None,
    )
class SevZeroState(State):
    """
    Episode metadata returned by the state property.
    `episode_id` and `step_count` are inherited from State base.
    """

    # Which scenario is being played and how it was seeded.
    task_id: str = Field(
        description="Which task: 'easy' | 'medium' | 'hard'",
        default="easy",
    )
    seed: Optional[int] = Field(
        description="Seed used for this episode (for reproducibility)",
        default=None,
    )

    # Latest health score.
    global_slo_score: float = Field(
        description="Current fraction of services meeting SLO targets",
        default=0.0,
    )

    # Termination bookkeeping; the reason defaults to None.
    terminated: bool = Field(
        description="Whether the episode has ended for any reason",
        default=False,
    )
    termination_reason: Optional[str] = Field(
        description="Why the episode ended: 'resolved' | 'failed' | 'timeout' | None",
        default=None,
    )

openenv.yaml ADDED Viewed

	@@ -0,0 +1,10 @@

+name: sevzero
+version: "1.0.0"
+description: "SRE Incident Response Environment — an autonomous on-call SRE managing a microservice cluster undergoing cascading failures"
+tags:
+  - openenv
+  - sre
+  - incident-response
+  - reinforcement-learning
+  - microservices
+  - agentic

pyproject.toml ADDED Viewed

	@@ -0,0 +1,32 @@

+[project]
+name = "sevzero"
+version = "1.0.0"
+description = "SRE Incident Response Environment for OpenEnv"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "openenv-core>=0.2.2",
+    "fastapi>=0.104.0",
+    "uvicorn>=0.24.0",
+    "pydantic>=2.0.0",
+    "openai>=1.0.0",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=7.0.0",
+    "httpx>=0.24.0",
+]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["server"]
+[tool.uv]
+dev-dependencies = [
+    "pytest>=7.0.0",
+    "httpx>=0.24.0",
+]

sdk_info.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+=== Observation fields ===
+done: annotation=bool required=False default=False description='Whether the episode has terminated'
+reward: annotation=Union[bool, int, float, NoneType] required=False default=None description='Reward signal from the last action'
+metadata: annotation=Dict[str, Any] required=False default_factory=dict description='Additional metadata for the observation'
+=== State fields ===
+episode_id: annotation=Union[str, NoneType] required=False default=None description='Unique identifier for the current episode'
+step_count: annotation=int required=False default=0 description='Number of steps taken in the current episode' metadata=[Ge(ge=0)]
+=== Action fields ===
+metadata: annotation=Dict[str, Any] required=False default_factory=dict description='Additional metadata for the action'
+=== Environment methods ===
+_apply_rubric(self, action: ~ActT, observation: ~ObsT) -> float
+_apply_rubric_async(self, action: ~ActT, observation: ~ObsT) -> float
+_apply_transform(self, observation: ~ObsT) -> ~ObsT
+_reset_rubric(self) -> None
+_reset_rubric_async(self) -> None
+close(self) -> None
+get_metadata(self) -> openenv.core.env_server.types.EnvironmentMetadata
+reset(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any) -> ~ObsT
+reset_async(self, seed: Optional[int] = None, episode_id: Optional[str] = None, **kwargs: Any) -> ~ObsT
+step(self, action: ~ActT, timeout_s: Optional[float] = None, **kwargs: Any) -> ~ObsT
+step_async(self, action: ~ActT, timeout_s: Optional[float] = None, **kwargs: Any) -> ~ObsT

server/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


+"""server/__init__.py — marks server/ as a Python package."""

server/failures.py ADDED Viewed

	@@ -0,0 +1,437 @@

+"""
+server/failures.py — 8 failure types with injection logic and metric evolution patterns.
+Each failure type has:
+  - A distinctive metric temporal shape (how metrics evolve per tick)
+  - Config error subtypes (startup vs runtime)
+  - Weighted distribution matching real-world incident data
+Sources: Google SRE postmortems, Netflix Hystrix, AWS incident reports.
+See Docs/DataResearch.md for full citation.
+"""
+from __future__ import annotations
+import random
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Dict, List, Optional, Tuple
+# ---------------------------------------------------------------------------
+# Failure taxonomy
+# ---------------------------------------------------------------------------
class FailureType(str, Enum):
    """String-valued enumeration of the failure categories that can be injected.

    Members compare equal to their plain string values because the enum
    also derives from ``str``.
    """

    # Process / release problems
    CRASH = "crash"
    BAD_DEPLOY = "bad_deploy"

    # Configuration errors, split by when they surface
    CONFIG_STARTUP = "config_startup"  # Service can't boot
    CONFIG_RUNTIME = "config_runtime"  # Service runs but specific paths fail

    # Degradations
    CASCADING_LATENCY = "cascading_latency"
    RESOURCE_LEAK = "resource_leak"
    DB_DEGRADATION = "db_degradation"
    CACHE_FAILURE = "cache_failure"
    NETWORK_ERROR = "network_error"
# Sampling weights per failure type. The two config subtypes split the 32%
# "config" share evenly; the remaining shares are
# deploy=25%, cascade=15%, crash=10%, leak=8%, DB=5%, cache=3%, network=2%.
# Insertion order is kept as-is because it defines the population order that
# is handed to the weighted sampler.
_FAILURE_WEIGHTS: Dict[FailureType, float] = {
    FailureType.CONFIG_STARTUP: 0.16,
    FailureType.CONFIG_RUNTIME: 0.16,
    FailureType.BAD_DEPLOY: 0.25,
    FailureType.CASCADING_LATENCY: 0.15,
    FailureType.CRASH: 0.10,
    FailureType.RESOURCE_LEAK: 0.08,
    FailureType.DB_DEGRADATION: 0.05,
    FailureType.CACHE_FAILURE: 0.03,
    FailureType.NETWORK_ERROR: 0.02,
}

# Pairs that must not co-occur in a multi-root incident. Each listed type is
# paired with itself: two network errors, or two cache failures, are
# considered unrealistic.
_INCOMPATIBLE_PAIRS = {
    (failure, failure)
    for failure in (FailureType.NETWORK_ERROR, FailureType.CACHE_FAILURE)
}
@dataclass
class FailureSpec:
    """One injected failure together with the knobs that shape its evolution.

    Only ``service_id`` and ``failure_type`` are required; every other field
    has a neutral default. Field order is part of the positional constructor
    signature and must not be rearranged.
    """

    # -- What failed, and where ------------------------------------------
    service_id: str
    failure_type: FailureType

    # -- Error-rate envelope (consumed by metric evolution) --------------
    base_error_rate: float = 0.0  # healthy baseline
    peak_error_rate: float = 0.0  # value once the failure is fully developed
    onset_ticks: int = 1  # ticks needed to reach the peak (1 = instant, 5 = gradual)

    # -- Latency ---------------------------------------------------------
    latency_multiplier: float = 1.0  # factor applied to p99 at peak

    # -- Resources at peak -----------------------------------------------
    cpu_impact: float = 0.0  # CPU increase (0–1)
    memory_impact: float = 0.0  # memory increase per tick (for leaks)
    pool_saturation: float = 0.0  # connection pool impact

    # -- Config-error details --------------------------------------------
    broken_config_key: Optional[str] = None  # which config key is wrong
    broken_config_value: Optional[str] = None  # what the wrong value is

    # -- Deployment details (for bad_deploy) -----------------------------
    bad_version: Optional[str] = None
    good_version: Optional[str] = None

    # -- Network-error details -------------------------------------------
    affected_region: Optional[str] = None
+# ---------------------------------------------------------------------------
+# Failure selection
+# ---------------------------------------------------------------------------
def select_failure_type(
    rng: random.Random,
    exclude: Optional[List[FailureType]] = None,
) -> FailureType:
    """Draw one failure type at random, weighted by ``_FAILURE_WEIGHTS``.

    Args:
        rng: Random source; exactly one ``rng.choices`` call is made.
        exclude: Failure types to leave out of the draw. If excluding them
            would leave nothing to choose from, the exclusion is ignored and
            the full distribution is used instead.

    Returns:
        The sampled failure type.
    """
    banned = exclude or ()
    candidates = [ft for ft in _FAILURE_WEIGHTS if ft not in banned]
    if not candidates:
        # Everything was excluded: fall back to the unfiltered population.
        candidates = list(_FAILURE_WEIGHTS)
    candidate_weights = [_FAILURE_WEIGHTS[ft] for ft in candidates]
    (picked,) = rng.choices(candidates, weights=candidate_weights, k=1)
    return picked
def select_multi_root_failures(
    rng: random.Random, count: int = 2
) -> List[FailureType]:
    """Pick ``count`` failure types for a multi-root incident, one at a time.

    Each draw excludes every type already chosen, plus the partner of any
    chosen type that appears in ``_INCOMPATIBLE_PAIRS``.

    Args:
        rng: Random source forwarded to ``select_failure_type``.
        count: How many failure types to draw.

    Returns:
        The chosen failure types, in draw order.
    """
    chosen: List[FailureType] = []
    for _ in range(count):
        banned = list(chosen)
        # For every already-chosen type, ban the other half of each
        # incompatible pair that it belongs to.
        banned.extend(
            right if prior == left else left
            for prior in chosen
            for left, right in _INCOMPATIBLE_PAIRS
            if prior == left or prior == right
        )
        chosen.append(select_failure_type(rng, exclude=banned))
    return chosen
+# ---------------------------------------------------------------------------
+# Failure specification factories
+# ---------------------------------------------------------------------------
def make_crash_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for a service crash (instant onset, near-total errors).

    Consumes one ``rng.uniform`` draw for the peak error rate (0.85 to 1.0).
    """
    peak = rng.uniform(0.85, 1.0)
    spec = FailureSpec(
        service_id=service_id,
        failure_type=FailureType.CRASH,
        base_error_rate=0.0,
        peak_error_rate=peak,
    )
    spec.onset_ticks = 1  # instant
    spec.latency_multiplier = 0.1  # latency drops: requests fail fast, no waiting
    spec.cpu_impact = 0.0  # process is dead
    spec.memory_impact = 0.0
    return spec
def make_bad_deploy_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for a bad deployment (step increase in errors).

    Random draws happen in a fixed order: peak error rate, latency
    multiplier, CPU impact, memory impact, then the three components of the
    bad version tag. The good version is always ``v1.0.0``.
    """
    peak = rng.uniform(0.30, 0.70)
    slowdown = rng.uniform(1.5, 3.0)
    cpu = rng.uniform(0.1, 0.3)
    memory = rng.uniform(0.05, 0.15)
    major = rng.randint(2, 9)
    minor = rng.randint(0, 9)
    patch = rng.randint(1, 9)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.BAD_DEPLOY,
        base_error_rate=0.0,
        peak_error_rate=peak,
        onset_ticks=1,  # step function: appears at the deploy tick
        latency_multiplier=slowdown,
        cpu_impact=cpu,
        memory_impact=memory,
        bad_version=f"v{major}.{minor}.{patch}",
        good_version="v1.0.0",
    )
def make_config_startup_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for a startup config error (service cannot boot).

    Consumes one ``rng.choice`` draw to pick which config key is broken.
    """
    candidate_keys = ["db_password", "db_host", "api_endpoint", "env_var", "config_file"]
    broken_key = rng.choice(candidate_keys)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.CONFIG_STARTUP,
        base_error_rate=0.0,
        peak_error_rate=1.0,  # service is completely down
        onset_ticks=1,
        latency_multiplier=0.0,  # no traffic, hence no latency
        cpu_impact=-0.9,  # process exited immediately
        memory_impact=-0.9,
        broken_config_key=broken_key,
        broken_config_value="WRONG_VALUE",
    )
def make_config_runtime_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for a runtime config error (only some paths fail).

    Random draws happen in a fixed order: peak error rate, latency
    multiplier, then the broken config key.
    """
    candidate_keys = ["api_endpoint", "feature_flag", "timeout_ms", "retry_max"]
    peak = rng.uniform(0.20, 0.60)
    slowdown = rng.uniform(1.2, 2.0)
    broken_key = rng.choice(candidate_keys)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.CONFIG_RUNTIME,
        base_error_rate=0.0,
        peak_error_rate=peak,
        onset_ticks=1,
        latency_multiplier=slowdown,
        cpu_impact=0.0,  # resource usage stays normal
        memory_impact=0.0,
        broken_config_key=broken_key,
        broken_config_value="MISCONFIGURED",
    )
def make_cascading_latency_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for cascading latency (gradual ramp, large p99 multiplier).

    Random draws happen in a fixed order: peak error rate, onset length
    (3 to 6 ticks), latency multiplier, CPU impact, memory impact.
    """
    peak = rng.uniform(0.40, 0.85)
    ramp_ticks = rng.randint(3, 6)  # gradual ramp
    slowdown = rng.uniform(8.0, 20.0)
    cpu = rng.uniform(0.30, 0.60)  # rising CPU from blocked threads
    memory = rng.uniform(0.10, 0.25)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.CASCADING_LATENCY,
        base_error_rate=0.0,
        peak_error_rate=peak,
        onset_ticks=ramp_ticks,
        latency_multiplier=slowdown,
        cpu_impact=cpu,
        memory_impact=memory,
    )
def make_resource_leak_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for a resource leak (slow onset, fixed per-tick impacts).

    Random draws happen in a fixed order: peak error rate, onset length
    (5 to 10 ticks), latency multiplier. CPU and memory impacts are constants.
    """
    peak = rng.uniform(0.20, 0.50)
    ramp_ticks = rng.randint(5, 10)  # slow burn
    slowdown = rng.uniform(2.0, 5.0)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.RESOURCE_LEAK,
        base_error_rate=0.0,
        peak_error_rate=peak,
        onset_ticks=ramp_ticks,
        latency_multiplier=slowdown,
        cpu_impact=0.05,
        memory_impact=0.06,
    )
def make_db_degradation_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for database degradation (high latency, saturated pool).

    Random draws happen in a fixed order: peak error rate, onset length
    (2 to 4 ticks), latency multiplier. CPU impact is negative: the app is
    waiting on I/O rather than computing.
    """
    peak = rng.uniform(0.30, 0.70)
    ramp_ticks = rng.randint(2, 4)
    slowdown = rng.uniform(5.0, 15.0)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.DB_DEGRADATION,
        base_error_rate=0.0,
        peak_error_rate=peak,
        onset_ticks=ramp_ticks,
        latency_multiplier=slowdown,
        cpu_impact=-0.2,
        memory_impact=0.05,
        pool_saturation=0.90,  # connection pool pushed towards 90%+
    )
def make_cache_failure_spec(service_id: str, rng: random.Random) -> FailureSpec:
    """Build the spec for a cache failure (instant cliff, moderate errors).

    Random draws happen in a fixed order: peak error rate, latency multiplier.
    """
    peak = rng.uniform(0.20, 0.50)
    slowdown = rng.uniform(3.0, 8.0)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.CACHE_FAILURE,
        base_error_rate=0.0,
        peak_error_rate=peak,
        onset_ticks=1,  # cliff: simultaneous, not gradual
        latency_multiplier=slowdown,
        cpu_impact=0.20,
        memory_impact=0.0,
    )
def make_network_error_spec(service_id: str, rng: random.Random, region: str = "us-east-1") -> FailureSpec:
    """Build the spec for a network/routing error tied to one region.

    Consumes one ``rng.uniform`` draw for the peak error rate (0.80 to 1.0).

    Args:
        service_id: Service the failure is attached to.
        rng: Random source.
        region: Stored on the spec as ``affected_region``.
    """
    peak = rng.uniform(0.80, 1.0)
    return FailureSpec(
        service_id=service_id,
        failure_type=FailureType.NETWORK_ERROR,
        base_error_rate=0.0,
        peak_error_rate=peak,
        onset_ticks=1,  # simultaneous, not hop-by-hop
        latency_multiplier=0.2,
        cpu_impact=-0.3,  # low CPU: nothing is getting through
        memory_impact=0.0,
        affected_region=region,
    )
# Dispatch table: failure type -> factory that builds its FailureSpec.
_SPEC_FACTORIES = {
    FailureType.CRASH: make_crash_spec,
    FailureType.BAD_DEPLOY: make_bad_deploy_spec,
    FailureType.CONFIG_STARTUP: make_config_startup_spec,
    FailureType.CONFIG_RUNTIME: make_config_runtime_spec,
    FailureType.CASCADING_LATENCY: make_cascading_latency_spec,
    FailureType.RESOURCE_LEAK: make_resource_leak_spec,
    FailureType.DB_DEGRADATION: make_db_degradation_spec,
    FailureType.CACHE_FAILURE: make_cache_failure_spec,
    FailureType.NETWORK_ERROR: make_network_error_spec,
}


def make_failure_spec(
    service_id: str,
    failure_type: FailureType,
    rng: random.Random,
    **kwargs,
) -> FailureSpec:
    """Build a FailureSpec by dispatching to the factory for ``failure_type``.

    Args:
        service_id: Service the failure is attached to.
        failure_type: Key into ``_SPEC_FACTORIES``.
        rng: Random source handed to the factory.
        **kwargs: Forwarded unchanged to the factory; the factory must accept
            them (in this table only the network-error factory takes an extra
            ``region`` keyword).

    Raises:
        KeyError: If ``failure_type`` has no registered factory.
    """
    build = _SPEC_FACTORIES[failure_type]
    spec = build(service_id, rng, **kwargs)
    return spec
+# ---------------------------------------------------------------------------
+# Metric evolution: per-type temporal shapes
+# ---------------------------------------------------------------------------
def compute_failure_magnitude(spec: FailureSpec, ticks_since_failure: int) -> float:
    """
    Return a 0.0–1.0 magnitude factor for how fully the failure has manifested.

    - Instant failures (onset_ticks <= 1): always full magnitude (1.0).
    - Gradual failures: linear ramp ``ticks_since_failure / onset_ticks``,
      clamped to the closed interval [0.0, 1.0].

    Args:
        spec: Failure description; only ``onset_ticks`` is read.
        ticks_since_failure: Ticks elapsed since the failure was injected.
            Negative values (failure not started yet) yield 0.0 for gradual
            failures instead of a negative magnitude.

    Returns:
        The magnitude as a float in [0.0, 1.0].
    """
    if spec.onset_ticks <= 1:
        return 1.0
    ramp = ticks_since_failure / spec.onset_ticks
    # Clamp on both sides so the documented 0.0–1.0 contract always holds.
    return max(0.0, min(1.0, ramp))
def apply_failure_to_metrics(
    spec: FailureSpec,
    ticks_since_failure: int,
    base_error_rate: float,
    base_p99_ms: float,
    base_cpu: float,
    base_memory: float,
    base_pool: float,
    rng: random.Random,
) -> Tuple[float, float, float, float, float]:
    """
    Evolve a service's baseline metrics under an injected failure.

    Args:
        spec: The failure being applied.
        ticks_since_failure: Ticks elapsed since injection.
        base_error_rate: Healthy error rate; used only for unknown failure types.
        base_p99_ms: Healthy p99 latency in milliseconds.
        base_cpu: Healthy CPU percentage.
        base_memory: Healthy memory percentage.
        base_pool: Healthy connection-pool usage percentage.
        rng: Random source; exactly one ``uniform`` draw is made per call,
            whether or not the selected branch uses it.

    Returns:
        ``(error_rate, p99_ms, cpu_pct, memory_pct, pool_pct)`` where the error
        rate is clamped to [0, 1], p99 is at least 1.0 ms, and the three
        percentages are clamped to [0, 100].

    Shape per failure type:
    - crash: errors jump, latency and CPU collapse
    - bad_deploy: errors, latency, CPU and memory all scale up with magnitude
    - config_startup: 100% errors, near-zero resources, empty pool
    - config_runtime: errors and latency rise, resources untouched
    - cascading_latency: latency/CPU/memory ramp one tick ahead of errors
    - resource_leak: memory climbs linearly towards 100, CPU follows
    - db_degradation: pool saturates, CPU scaled by a negative impact
    - cache_failure: errors, latency and CPU rise together
    - network_error: errors jump, latency starts at 10x and decays
    """
    severity = compute_failure_magnitude(spec, ticks_since_failure)
    # Additive jitter on the error rate, uniform in [-0.03, +0.03].
    jitter = rng.uniform(-0.03, 0.03)
    kind = spec.failure_type

    def grow(baseline, impact, fraction):
        # baseline scaled by (1 + impact * fraction)
        return baseline * (1 + impact * fraction)

    # Values shared by several branches below.
    stepped_error = spec.peak_error_rate * severity + jitter
    ramped_latency = grow(base_p99_ms, spec.latency_multiplier - 1, severity)

    # Start from the healthy baseline; each branch overrides what it changes.
    error_rate = base_error_rate
    p99_ms = base_p99_ms
    cpu_pct = base_cpu
    memory_pct = base_memory
    pool_pct = base_pool

    if kind == FailureType.CRASH:
        error_rate = stepped_error
        # Blend from the baseline towards one tenth of it (fast failures).
        p99_ms = base_p99_ms * 0.1 * severity + base_p99_ms * (1 - severity)
        cpu_pct = max(0.0, base_cpu * (1 - 0.9 * severity))

    elif kind == FailureType.BAD_DEPLOY:
        error_rate = stepped_error
        p99_ms = ramped_latency
        cpu_pct = min(100.0, grow(base_cpu, spec.cpu_impact, severity))
        memory_pct = min(100.0, grow(base_memory, spec.memory_impact, severity))

    elif kind == FailureType.CONFIG_STARTUP:
        # Service never starts: total errors, no traffic, almost no resources.
        error_rate = 1.0
        p99_ms = 0.0
        cpu_pct = max(0.0, base_cpu * 0.02)
        memory_pct = max(0.0, base_memory * 0.02)
        pool_pct = 0.0

    elif kind == FailureType.CONFIG_RUNTIME:
        error_rate = stepped_error
        p99_ms = ramped_latency

    elif kind == FailureType.CASCADING_LATENCY:
        # Latency ramps over (onset_ticks - 1) ticks, while errors start one
        # tick late and ramp over onset_ticks, so p99 leads the error rate.
        latency_progress = min(1.0, ticks_since_failure / max(1, spec.onset_ticks - 1))
        error_progress = min(1.0, max(0.0, (ticks_since_failure - 1) / spec.onset_ticks))
        error_rate = spec.peak_error_rate * error_progress + jitter
        p99_ms = grow(base_p99_ms, spec.latency_multiplier - 1, latency_progress)
        cpu_pct = min(100.0, grow(base_cpu, spec.cpu_impact, latency_progress))
        memory_pct = min(100.0, grow(base_memory, spec.memory_impact, latency_progress))

    elif kind == FailureType.RESOURCE_LEAK:
        # Fixed growth of 0.08 per tick, capped at 1.0 (about 12 ticks to peak).
        leaked = min(1.0, ticks_since_failure * 0.08)
        error_rate = spec.peak_error_rate * min(1.0, leaked * 1.5) + jitter
        p99_ms = grow(base_p99_ms, spec.latency_multiplier - 1, leaked)
        cpu_pct = min(100.0, grow(base_cpu, leaked, 0.8))
        # Memory moves linearly from the baseline towards 100%.
        memory_pct = min(100.0, base_memory + leaked * (100 - base_memory))

    elif kind == FailureType.DB_DEGRADATION:
        error_rate = stepped_error
        p99_ms = ramped_latency
        # Floor of 5% CPU; the factory supplies a negative cpu_impact here.
        cpu_pct = max(5.0, grow(base_cpu, spec.cpu_impact, severity))
        memory_pct = min(100.0, grow(base_memory, spec.memory_impact, severity))
        pool_pct = min(100.0, base_pool + spec.pool_saturation * severity * 100)

    elif kind == FailureType.CACHE_FAILURE:
        error_rate = stepped_error
        p99_ms = ramped_latency
        cpu_pct = min(100.0, grow(base_cpu, spec.cpu_impact, severity))

    elif kind == FailureType.NETWORK_ERROR:
        error_rate = stepped_error
        # Starts at 10x baseline and decays by 30% of that per tick, with a
        # floor of 10% of the 10x value.
        p99_ms = base_p99_ms * 10.0 * max(0.1, 1 - ticks_since_failure * 0.3)
        cpu_pct = max(2.0, grow(base_cpu, spec.cpu_impact, severity))

    # Unknown failure types fall through with the baseline values untouched.

    clamped_error = max(0.0, min(1.0, error_rate))
    clamped_p99 = max(1.0, p99_ms)
    clamped_cpu = max(0.0, min(100.0, cpu_pct))
    clamped_memory = max(0.0, min(100.0, memory_pct))
    clamped_pool = max(0.0, min(100.0, pool_pct))
    return (clamped_error, clamped_p99, clamped_cpu, clamped_memory, clamped_pool)

server/graph.py ADDED Viewed

	@@ -0,0 +1,470 @@

+"""
+server/graph.py — Service dependency graph generation.
+Builds layered tree-like DAGs matching real production microservice topologies,
+grounded in Alibaba trace analysis (depth ~3, 5% hotspot services, sparse edges).
+Design principles:
+- Services chosen from realistic role pools (not generic names)
+- Layered: edge → identity → business → infra; edge → leaf dependencies
+- Dependency edges are directed (A depends_on B = A calls B)
+- ~5% of services are high-in-degree hotspots (shared cache, DB, auth)
+- Sparse and tree-like; most nodes have in-degree 1
+- Conditional edges have activation_probability < 1.0 (Easy: only the optional cache edges, at 0.9)
+"""
+from __future__ import annotations
+import random
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional, Tuple
+# ---------------------------------------------------------------------------
+# Service role pools (realistic names, not generic)
+# Each pool lists candidate service ids for one layer of the graph.
+# ---------------------------------------------------------------------------
+# Edge-layer candidates.
+# NOTE(review): the generators in this file hardcode "api-gateway" and never
+# sample this pool — confirm whether another module uses it.
+_EDGE_POOL = [
+    "api-gateway",
+    "graphql-gateway",
+    "bff-web",
+    "bff-mobile",
+    "cdn-edge",
+]
+# Identity-layer candidates.
+# NOTE(review): the generators in this file hardcode "auth-service" and never
+# sample this pool — confirm whether another module uses it.
+_IDENTITY_POOL = [
+    "auth-service",
+    "identity-provider",
+    "session-service",
+    "oauth-service",
+    "token-service",
+]
+# Business-layer candidates; sampled through _pick() by the easy, medium and
+# hard generators.
+_BUSINESS_POOL = [
+    "order-service",
+    "payment-service",
+    "inventory-service",
+    "catalog-service",
+    "pricing-service",
+    "cart-service",
+    "checkout-service",
+    "shipping-service",
+    "recommendation-service",
+    "search-service",
+    "review-service",
+    "subscription-service",
+    "billing-service",
+    "refund-service",
+    "notification-service",
+]
+# Infra-layer candidates.
+# NOTE(review): the generators in this file use hardcoded infra ids
+# (postgres-primary, postgres-replica, redis-cache, kafka-broker,
+# config-service) instead of sampling this pool — confirm intended use.
+_INFRA_POOL = [
+    "postgres-primary",
+    "postgres-replica",
+    "redis-cache",
+    "redis-session",
+    "kafka-broker",
+    "elasticsearch",
+    "object-storage",
+    "config-service",
+]
+# Cross-cutting candidates; sampled through _pick() by the medium and hard
+# generators and attached to a random business service with a low
+# activation probability.
+_CROSS_CUTTING_POOL = [
+    "email-service",
+    "sms-service",
+    "metrics-collector",
+    "fraud-service",
+    "audit-service",
+    "feature-flags",
+    "rate-limiter",
+]
+# ---------------------------------------------------------------------------
+# Data structures
+# ---------------------------------------------------------------------------
+@dataclass
+class ServiceNode:
+    """
+    A service node in the dependency graph.
+    Groups the node's identity (id, layer, region), its baseline load
+    parameters, its default config/deployment values and its role flags.
+    The generators in this file create nodes through _make_node(), which
+    sets the load parameters and role flags and leaves every other field at
+    the default declared here.
+    """
+    # Unique service identifier; used as the key in ServiceGraph lookups
+    id: str
+    layer: str  # "edge" | "identity" | "business" | "infra" | "cross-cutting"
+    # Queueing theory baseline parameters (modified by failures at runtime)
+    base_arrival_rate: float = 100.0       # λ — requests/tick at baseline
+    base_service_time_local: float = 0.05  # S_local — seconds per request (local work)
+    thread_pool_size: int = 50             # T — max concurrent in-flight requests
+    # Default config (tunable by agent)
+    default_timeout_ms: int = 5000
+    default_retry_max: int = 3
+    default_retry_backoff: bool = False
+    default_circuit_breaker_threshold: float = 0.5
+    default_pool_size: int = 20
+    # Deployment defaults
+    default_replicas: int = 2
+    default_version: str = "v1.0.0"
+    # Whether this node is a "hotspot" (high in-degree shared infra)
+    is_hotspot: bool = False
+    # Whether this is a background-job node (can be pause_job target);
+    # ServiceGraph.build_indices() collects these ids into background_jobs
+    has_background_job: bool = False
+    # Whether this is a cache node (can be clear_cache target);
+    # ServiceGraph.build_indices() collects these ids into cache_services
+    is_cache: bool = False
+    # Max replicas the agent can scale to
+    max_replicas: int = 8
+    # Region (for Hard mode multi-region topologies); generate_hard_graph
+    # overwrites this on the per-region nodes it creates
+    region: str = "us-east-1"
+@dataclass
+class DependencyEdge:
+    """
+    A directed dependency edge: source depends on (calls) target.
+    ServiceGraph.build_indices() records each edge as source → target in
+    `adjacency` and as target → source in `reverse_adjacency`.
+    """
+    source: str   # service that makes the call
+    target: str   # service that receives the call
+    # Fraction of ticks this edge is active (1.0 = always; 0.2 = ~20% of ticks)
+    activation_probability: float = 1.0
+    # Edge type for documentation; the generators in this file only ever set
+    # the default "sync" or "async"
+    edge_type: str = "sync"  # "sync" | "async" | "optional"
+@dataclass
+class ServiceGraph:
+    """Complete service dependency graph for one episode."""
+    nodes: List[ServiceNode] = field(default_factory=list)
+    edges: List[DependencyEdge] = field(default_factory=list)
+    # Derived lookup structures (populated after build)
+    node_map: Dict[str, ServiceNode] = field(default_factory=dict)
+    adjacency: Dict[str, List[str]] = field(default_factory=dict)  # source → [targets]
+    reverse_adjacency: Dict[str, List[str]] = field(default_factory=dict)  # target → [callers]
+    # Metadata
+    difficulty: str = "easy"
+    has_multiple_regions: bool = False
+    regions: List[str] = field(default_factory=lambda: ["us-east-1"])
+    cache_services: List[str] = field(default_factory=list)
+    background_jobs: List[str] = field(default_factory=list)
+    def build_indices(self) -> None:
+        """Build lookup maps after nodes/edges are populated."""
+        self.node_map = {n.id: n for n in self.nodes}
+        self.adjacency = {n.id: [] for n in self.nodes}
+        self.reverse_adjacency = {n.id: [] for n in self.nodes}
+        for edge in self.edges:
+            self.adjacency[edge.source].append(edge.target)
+            self.reverse_adjacency[edge.target].append(edge.source)
+        self.cache_services = [n.id for n in self.nodes if n.is_cache]
+        self.background_jobs = [n.id for n in self.nodes if n.has_background_job]
+# ---------------------------------------------------------------------------
+# Graph generation functions
+# ---------------------------------------------------------------------------
+def _pick(pool: List[str], rng: random.Random, exclude: set) -> Optional[str]:
+    """Pick a random name from pool not already in exclude set."""
+    choices = [x for x in pool if x not in exclude]
+    if not choices:
+        return None
+    return rng.choice(choices)
+def _make_node(
+    service_id: str,
+    layer: str,
+    is_hotspot: bool = False,
+    is_cache: bool = False,
+    has_background_job: bool = False,
+    arrival_rate: float = 100.0,
+    service_time: float = 0.05,
+    thread_pool: int = 50,
+) -> ServiceNode:
+    """Create a ServiceNode with sensible per-layer defaults."""
+    # Infra nodes handle more concurrency, edge nodes get more traffic
+    if layer == "edge":
+        arrival_rate = 500.0
+        thread_pool = 100
+    elif layer == "infra":
+        arrival_rate = 200.0
+        service_time = 0.02   # DBs are fast per-query
+        thread_pool = 30
+        if is_cache:
+            service_time = 0.001
+            thread_pool = 200
+    return ServiceNode(
+        id=service_id,
+        layer=layer,
+        base_arrival_rate=arrival_rate,
+        base_service_time_local=service_time,
+        thread_pool_size=thread_pool,
+        is_hotspot=is_hotspot,
+        is_cache=is_cache,
+        has_background_job=has_background_job,
+    )
+def generate_easy_graph(rng: random.Random) -> ServiceGraph:
+    """
+    Easy: 3-5 services, linear chain.
+    api-gateway → order-service → postgres-primary
+    Agent must identify and fix one failing service in this simple topology.
+    """
+    graph = ServiceGraph(difficulty="easy")
+    used: set = set()
+    # Always have a gateway at the edge
+    gateway_id = "api-gateway"
+    used.add(gateway_id)
+    # Pick 1-2 business services
+    biz_count = rng.randint(1, 2)
+    biz_nodes = []
+    for _ in range(biz_count):
+        svc = _pick(_BUSINESS_POOL, rng, used)
+        if svc:
+            used.add(svc)
+            biz_nodes.append(svc)
+    # Always have one DB at the leaf
+    db_id = "postgres-primary"
+    used.add(db_id)
+    # Optionally add a cache
+    add_cache = rng.random() > 0.4
+    cache_id = "redis-cache" if add_cache else None
+    if cache_id:
+        used.add(cache_id)
+    # Build nodes
+    graph.nodes.append(_make_node(gateway_id, "edge"))
+    for biz in biz_nodes:
+        graph.nodes.append(_make_node(biz, "business"))
+    graph.nodes.append(
+        _make_node(db_id, "infra", is_hotspot=True, arrival_rate=200.0)
+    )
+    if cache_id:
+        graph.nodes.append(
+            _make_node(cache_id, "infra", is_hotspot=True, is_cache=True)
+        )
+    # Build linear dependency chain: gateway → biz[0] → biz[1]? → db
+    chain = [gateway_id] + biz_nodes + [db_id]
+    for i in range(len(chain) - 1):
+        graph.edges.append(DependencyEdge(source=chain[i], target=chain[i + 1]))
+    # If cache exists, business services call it (optional edge for realism)
+    if cache_id and biz_nodes:
+        for biz in biz_nodes:
+            graph.edges.append(
+                DependencyEdge(source=biz, target=cache_id, activation_probability=0.9)
+            )
+    graph.build_indices()
+    return graph
+def generate_medium_graph(rng: random.Random) -> ServiceGraph:
+    """
+    Medium: 8-15 services, branching DAG.
+    gateway → auth + 3-4 domain services → shared DB + cache + kafka.
+    Agent must trace through the graph to find a root cause that's upstream
+    of the service showing the worst symptoms.
+    """
+    graph = ServiceGraph(difficulty="medium")
+    used: set = set()
+    # Edge layer: 1 gateway
+    gateway_id = "api-gateway"
+    used.add(gateway_id)
+    graph.nodes.append(_make_node(gateway_id, "edge"))
+    # Identity layer: auth (gateway always calls auth)
+    auth_id = "auth-service"
+    used.add(auth_id)
+    graph.nodes.append(_make_node(auth_id, "identity"))
+    graph.edges.append(DependencyEdge(source=gateway_id, target=auth_id))
+    # Business layer: 4-6 domain services fanning out from gateway
+    biz_count = rng.randint(4, 6)
+    biz_nodes = []
+    for _ in range(biz_count):
+        svc = _pick(_BUSINESS_POOL, rng, used)
+        if svc:
+            used.add(svc)
+            biz_nodes.append(svc)
+            graph.nodes.append(_make_node(svc, "business"))
+            graph.edges.append(DependencyEdge(source=gateway_id, target=svc))
+    # Infra layer: shared DB + cache (hotspot nodes)
+    db_id = "postgres-primary"
+    cache_id = "redis-cache"
+    used.update([db_id, cache_id])
+    graph.nodes.append(_make_node(db_id, "infra", is_hotspot=True, arrival_rate=300.0))
+    graph.nodes.append(_make_node(cache_id, "infra", is_hotspot=True, is_cache=True))
+    # Business services call the shared DB and cache
+    for biz in biz_nodes:
+        graph.edges.append(DependencyEdge(source=biz, target=db_id))
+        # Cache: most biz services call it, but with high-freq optional
+        graph.edges.append(
+            DependencyEdge(source=biz, target=cache_id, activation_probability=0.8)
+        )
+    # Optionally add kafka as an async edge (1-2 business services produce to it)
+    if rng.random() > 0.4:
+        kafka_id = "kafka-broker"
+        used.add(kafka_id)
+        graph.nodes.append(
+            _make_node(kafka_id, "infra", has_background_job=True)
+        )
+        producers = rng.sample(biz_nodes, min(2, len(biz_nodes)))
+        for p in producers:
+            graph.edges.append(
+                DependencyEdge(source=p, target=kafka_id, edge_type="async", activation_probability=0.6)
+            )
+    # Cross-cutting: add 1-2 optional services (fraud, notification) called by some biz
+    cross_count = rng.randint(1, 2)
+    for _ in range(cross_count):
+        svc = _pick(_CROSS_CUTTING_POOL, rng, used)
+        if svc and biz_nodes:
+            used.add(svc)
+            caller = rng.choice(biz_nodes)
+            graph.nodes.append(_make_node(svc, "cross-cutting"))
+            graph.edges.append(
+                DependencyEdge(source=caller, target=svc, activation_probability=0.3)
+            )
+    graph.build_indices()
+    return graph
+def generate_hard_graph(rng: random.Random) -> ServiceGraph:
+    """
+    Hard: 15-30 services, complex multi-region DAG with hotspots,
+    conditional edges, multiple infra tiers, and background jobs.
+    Agent must manage a Sev-0 multi-root incident with conflicting mitigations.
+    """
+    graph = ServiceGraph(difficulty="hard", has_multiple_regions=True)
+    graph.regions = ["us-east-1", "us-west-2"]
+    used: set = set()
+    all_biz_nodes: List[str] = []
+    # Build per-region sub-graphs, then connect them
+    for region in graph.regions:
+        suffix = "-east" if "east" in region else "-west"
+        # Edge: one gateway per region
+        gw = f"api-gateway{suffix}"
+        used.add(gw)
+        node = _make_node(gw, "edge")
+        node.region = region
+        graph.nodes.append(node)
+        # Identity: auth per region
+        auth = f"auth-service{suffix}"
+        used.add(auth)
+        node = _make_node(auth, "identity")
+        node.region = region
+        graph.nodes.append(node)
+        graph.edges.append(DependencyEdge(source=gw, target=auth))
+        # Business: 4-6 services per region
+        region_biz: List[str] = []
+        for _ in range(rng.randint(4, 6)):
+            svc_base = _pick(_BUSINESS_POOL, rng, used)
+            if svc_base:
+                svc = f"{svc_base}{suffix}"
+                used.add(svc)
+                region_biz.append(svc)
+                node = _make_node(svc, "business")
+                node.region = region
+                graph.nodes.append(node)
+                graph.edges.append(DependencyEdge(source=gw, target=svc))
+        all_biz_nodes.extend(region_biz)
+        # Infra: per-region replicas (postgres-replica is a hotspot)
+        pg_replica = f"postgres-replica{suffix}"
+        redis_svc = f"redis-cache{suffix}"
+        used.update([pg_replica, redis_svc])
+        node = _make_node(pg_replica, "infra", is_hotspot=True)
+        node.region = region
+        graph.nodes.append(node)
+        node = _make_node(redis_svc, "infra", is_hotspot=True, is_cache=True)
+        node.region = region
+        graph.nodes.append(node)
+        for biz in region_biz:
+            graph.edges.append(DependencyEdge(source=biz, target=pg_replica))
+            graph.edges.append(
+                DependencyEdge(source=biz, target=redis_svc, activation_probability=0.85)
+            )
+    # Shared global infra (hotspots called by both regions)
+    pg_primary = "postgres-primary"
+    kafka = "kafka-broker"
+    config_svc = "config-service"
+    used.update([pg_primary, kafka, config_svc])
+    graph.nodes.append(_make_node(pg_primary, "infra", is_hotspot=True, arrival_rate=500.0))
+    graph.nodes.append(_make_node(kafka, "infra", has_background_job=True))
+    graph.nodes.append(_make_node(config_svc, "infra", is_hotspot=True))
+    # Replicas call primary (replication)
+    for region in graph.regions:
+        suffix = "-east" if "east" in region else "-west"
+        graph.edges.append(
+            DependencyEdge(source=f"postgres-replica{suffix}", target=pg_primary)
+        )
+    # Business services use kafka for async events and config-service for feature flags
+    for biz in all_biz_nodes:
+        if rng.random() > 0.5:
+            graph.edges.append(
+                DependencyEdge(source=biz, target=kafka, edge_type="async", activation_probability=0.5)
+            )
+        graph.edges.append(
+            DependencyEdge(source=biz, target=config_svc, activation_probability=0.2)
+        )
+    # Cross-cutting services (low-freq optional edges)
+    for _ in range(rng.randint(2, 3)):
+        svc = _pick(_CROSS_CUTTING_POOL, rng, used)
+        if svc and all_biz_nodes:
+            used.add(svc)
+            caller = rng.choice(all_biz_nodes)
+            graph.nodes.append(_make_node(svc, "cross-cutting"))
+            graph.edges.append(
+                DependencyEdge(source=caller, target=svc, activation_probability=0.25)
+            )
+    graph.build_indices()
+    return graph
+def generate_graph(difficulty: str, rng: random.Random) -> ServiceGraph:
+    """Generate a service dependency graph for the given difficulty level."""
+    if difficulty == "easy":
+        return generate_easy_graph(rng)
+    elif difficulty == "medium":
+        return generate_medium_graph(rng)
+    elif difficulty == "hard":
+        return generate_hard_graph(rng)
+    else:
+        raise ValueError(f"Unknown difficulty: {difficulty!r}. Must be easy|medium|hard.")