"""
server/environment.py — SevZeroEnvironment: OpenEnv Environment subclass.
Bridges the OpenEnv SDK contract (reset/step/state) with the Simulator engine.
"""
from __future__ import annotations
import uuid
from typing import Any, Optional
from openenv.core.env_server import Environment
from openenv.core.env_server.types import EnvironmentMetadata
from models import SevZeroAction, SevZeroObservation, SevZeroState
from server.scenarios import generate_scenario
from server.simulator import Simulator
class SevZeroEnvironment(Environment[SevZeroAction, SevZeroObservation, SevZeroState]):
    """
    SRE Incident Response Environment.

    The agent observes service metrics, alerts, and logs, then issues
    remediation commands to restore SLO compliance across a microservice cluster.

    The class keeps per-episode bookkeeping (episode id, task id, seed, step
    count) and delegates all simulation work to a ``Simulator`` instance.
    """

    # Defaults applied by reset() when the caller supplies no value (or None).
    _DEFAULT_TASK_ID = "easy"
    _DEFAULT_SEED = 42
    # Deploys whose "ticks_ago" exceeds this value are left out of observations.
    _RECENT_DEPLOY_MAX_TICKS_AGO = 10
    # Number of trailing entries of the simulator's action log put in observations.
    _ACTION_HISTORY_LIMIT = 10

    def __init__(self) -> None:
        """Create the simulator and initialise episode bookkeeping to defaults."""
        super().__init__()
        self._sim = Simulator()
        self._episode_id: Optional[str] = None
        self._task_id: str = self._DEFAULT_TASK_ID
        self._seed: Optional[int] = None
        self._step_count: int = 0

    def close(self) -> None:
        """Intentionally do nothing so episode state survives between requests."""
        # No-op: the SDK calls close() after every HTTP request, but we need
        # state to persist between reset() and step() calls in HTTP mode.
        # WebSocket sessions manage their own lifecycle.
        pass

    def get_metadata(self) -> EnvironmentMetadata:
        """Return the static name, description, and version of this environment."""
        return EnvironmentMetadata(
            name="sevzero",
            description=(
                "SRE Incident Response Environment — an autonomous on-call SRE "
                "managing a microservice cluster undergoing cascading failures"
            ),
            version="1.0.0",
        )

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> SevZeroObservation:
        """
        Start a new episode and return its initial observation.

        Args:
            seed: Seed passed to scenario generation and to the simulator.
                ``None`` selects the default seed (42).
            episode_id: Identifier for the episode. A falsy value (``None`` or
                an empty string) is replaced by a freshly generated UUID4 string.
            **kwargs: Only ``task_id`` is read. When it is absent or ``None``
                the default task (``"easy"``) is used.

        Returns:
            The observation built right after the simulator reset, with
            ``reward=None`` and ``done=False``.
        """
        self._episode_id = episode_id or str(uuid.uuid4())

        # Treat an explicit None exactly like a missing key, mirroring how
        # `seed` is handled below; dict.get's default alone would let None
        # through and violate the `str` annotation on `_task_id`.
        task_id = kwargs.get("task_id")
        self._task_id = self._DEFAULT_TASK_ID if task_id is None else task_id

        self._seed = self._DEFAULT_SEED if seed is None else seed
        self._step_count = 0

        # Generate scenario and reset simulator
        scenario = generate_scenario(self._seed, self._task_id)
        self._sim.reset(
            seed=self._seed,
            difficulty=scenario.difficulty,
            failure_specs=scenario.failure_specs,
        )
        return self._build_observation(reward=None, done=False)

    def step(
        self,
        action: SevZeroAction,
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> SevZeroObservation:
        """
        Apply one action to the simulator and return the resulting observation.

        Args:
            action: The action to apply; its ``action_type`` and ``params``
                are forwarded to ``Simulator.step``.
            timeout_s: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            An observation carrying the value returned by ``Simulator.step`` as
            ``reward`` and the simulator's ``terminated`` flag as ``done``.
        """
        self._step_count += 1
        reward = self._sim.step(action.action_type, action.params)
        done = self._sim.terminated
        return self._build_observation(reward=reward, done=done)

    @property
    def state(self) -> SevZeroState:
        """
        Return a snapshot of episode bookkeeping and simulator status.

        Note that ``global_slo_score`` is reported here as returned by the
        simulator, whereas observations round it to four decimal places.
        """
        return SevZeroState(
            episode_id=self._episode_id,
            step_count=self._step_count,
            task_id=self._task_id,
            seed=self._seed,
            global_slo_score=self._sim.get_slo_score(),
            terminated=self._sim.terminated,
            termination_reason=self._sim.termination_reason,
        )

    def _build_observation(
        self, reward: Optional[float], done: bool,
    ) -> SevZeroObservation:
        """
        Assemble a ``SevZeroObservation`` from the simulator's current state.

        Args:
            reward: Reward to attach to the observation (``None`` on reset).
            done: Whether the episode has ended.

        Returns:
            The observation combining episode context, health summary,
            per-service state, alerts, recent context, legal actions, and the
            simulator's latest diagnostics.
        """
        sim = self._sim
        return SevZeroObservation(
            done=done,
            reward=reward,
            # Episode context
            tick=sim.tick,
            episode_id=self._episode_id,
            task_id=self._task_id,
            # While no termination reason is set the episode is reported as
            # still "playing".
            status=sim.termination_reason or "playing",
            max_steps=sim.max_steps,
            # Health summary
            global_slo_score=round(sim.get_slo_score(), 4),
            observation_summary=sim.get_observation_summary(),
            # Per-service state
            services=sim.get_service_observations(),
            # Alerts
            alerts=sim.get_alerts(),
            # Context
            recent_deploys=[
                d
                for d in sim.deploys
                if d["ticks_ago"] <= self._RECENT_DEPLOY_MAX_TICKS_AGO
            ],
            actions_taken=sim.actions_taken[-self._ACTION_HISTORY_LIMIT:],
            # Action space
            legal_actions=sim.get_legal_actions(),
            # Diagnostics
            logs=sim.last_logs,
            metric_history=sim.last_metric_history,
            traces=sim.last_traces,
        )
|