Spaces:

Escanor-playground
/

cloud-sre-arbiter

Sleeping

App Files Files Community

cloud-sre-arbiter / server /environment.py

Escanor925

fix: clamp scores to (0.001, 0.999) for Phase 2 strict range validation

d1583fb 7 days ago

raw

history blame contribute delete

15.4 kB

	"""
	Cloud SRE Arbiter — Core Environment Engine
	============================================
	Implements a multi-step, RL-style environment where an AI agent must
	simultaneously contain live incidents (Ops) and investigate root causes
	(Sec/Data). The grader is fully deterministic and penalizes reckless
	actions such as guessing the root cause without gathering evidence first.
	"""

	import json
	from pathlib import Path
	from pydantic import BaseModel, Field
	from typing import Dict, Any, Tuple, Optional, Literal, List


	# ---------------------------------------------------------------------------
	# 1. PYDANTIC MODELS (strict schemas required by OpenEnv)
	# ---------------------------------------------------------------------------

	class Observation(BaseModel):
	    """What the agent sees each turn."""
	    # NOTE: the class docstring is kept verbatim; pydantic may surface it
	    # as the model description in the generated schema — TODO confirm.

	    # --- Incident description ---------------------------------------
	    incident_id: str = Field(
	        default=...,
	        description="Unique incident identifier",
	    )
	    severity: str = Field(
	        default=...,
	        description="Incident severity level (P1/P2/P3)",
	    )
	    initial_observation: str = Field(
	        default=...,
	        description="Human-readable summary of what is happening",
	    )
	    active_alerts: List[str] = Field(
	        default=...,
	        description="List of active alert names",
	    )
	    system_metrics: Dict[str, str] = Field(
	        default=...,
	        description="Current system metric readings",
	    )
	    timeline: List[str] = Field(
	        default=...,
	        description="Recent event timeline",
	    )

	    # --- Evidence accumulated so far (optional; empty dict if omitted) ---
	    investigation_results: Dict[str, str] = Field(
	        default_factory=dict,
	        description="Results from investigation queries run so far",
	    )

	    # --- Episode progress (all range-validated) ----------------------
	    system_health: float = Field(
	        default=...,
	        ge=0.0,
	        le=100.0,
	        description="Current system health 0-100",
	    )
	    budget_spent: float = Field(
	        default=...,
	        ge=0.0,
	        description="Total budget consumed so far ($)",
	    )
	    turn_number: int = Field(
	        default=...,
	        ge=0,
	        description="Current turn in this episode",
	    )
	    turns_remaining: int = Field(
	        default=...,
	        ge=0,
	        description="Turns left before forced resolution",
	    )

	    # --- Menu of legal choices, keyed by Action field name -----------
	    available_actions: Dict[str, List[str]] = Field(
	        default=...,
	        description="Available action choices for each action type",
	    )


	# Closed choice sets for each prong of an Action. They are named here so
	# the model body below reads as a plain list of fields.
	_ContainmentChoice = Literal[
	    "scale_up_nodes",
	    "rate_limit_all",
	    "rollback_last_deploy",
	    "do_nothing",
	]
	_InvestigationChoice = Literal[
	    "analyze_ip_traffic",
	    "query_db_locks",
	    "check_commit_diffs",
	    "check_service_mesh",
	    "check_resource_utilization",
	    "none",
	]
	_RootCauseChoice = Literal[
	    "ddos_attack",
	    "viral_traffic",
	    "bad_code",
	    "database_lock",
	    "unknown",
	]


	class Action(BaseModel):
	    """The agent's two-pronged decision each turn."""
	    # NOTE: the class docstring is kept verbatim; pydantic may surface it
	    # as the model description in the generated schema — TODO confirm.

	    # Ops prong: what to do right now to keep the system up.
	    containment_action: _ContainmentChoice = Field(
	        default=...,
	        description="Immediate ops action to keep the system online",
	    )

	    # Investigation prong: which query to run this turn ("none" to skip).
	    investigation_query: _InvestigationChoice = Field(
	        default=...,
	        description="Query to run for root-cause investigation",
	    )

	    # Verdict: a concrete cause, or "unknown" to leave it undeclared.
	    declare_root_cause: _RootCauseChoice = Field(
	        default=...,
	        description="Declare the root cause or 'unknown' to keep investigating",
	    )

	    # Free-text rationale; must be non-empty.
	    justification: str = Field(
	        default=...,
	        min_length=1,
	        description="A short explanation for this decision, citing evidence gathered",
	    )


	class Reward(BaseModel):
	    """Deterministic grading result returned after each step."""

	    # Strictly inside the open interval (0, 1): both bounds are exclusive,
	    # so exact 0.0 or 1.0 fails validation.
	    total_score: float = Field(
	        default=...,
	        gt=0.0,
	        lt=1.0,
	        description="Final score in (0, 1) exclusive",
	    )

	    # Named float components that accompany the score.
	    breakdown: Dict[str, float] = Field(
	        default=...,
	        description="Score breakdown by category",
	    )


	class State(BaseModel):
	    """Metadata about the current episode."""

	    # Only task_name is required; every other field has a default.
	    task_name: str = Field(
	        default=...,
	        description="Current task difficulty level",
	    )
	    incident_id: str = Field(
	        default="",
	        description="Current incident ID",
	    )
	    turn_number: int = Field(
	        default=0,
	        description="Current turn",
	    )
	    max_turns: int = Field(
	        default=0,
	        description="Maximum turns allowed",
	    )
	    system_health: float = Field(
	        default=100.0,
	        description="Current system health",
	    )
	    budget_spent: float = Field(
	        default=0.0,
	        description="Budget consumed",
	    )
	    is_done: bool = Field(
	        default=False,
	        description="Whether the episode has ended",
	    )


	# ---------------------------------------------------------------------------
	# 2. COST & REWARD CONSTANTS
	# ---------------------------------------------------------------------------

	# Dollar cost charged to the episode budget for each containment action.
	CONTAINMENT_COSTS = dict(
	    scale_up_nodes=500.0,
	    rate_limit_all=100.0,
	    rollback_last_deploy=200.0,
	    do_nothing=0.0,
	)

	# Health points added (or removed) by each containment action per turn.
	# Standing by while the system is on fire costs health.
	CONTAINMENT_HEALTH_EFFECTS = dict(
	    scale_up_nodes=15.0,
	    rate_limit_all=10.0,
	    rollback_last_deploy=20.0,
	    do_nothing=-15.0,
	)

	# Flat dollar cost of running any single investigation query.
	INVESTIGATION_COST = 50.0

	# Grading weights; the five values below must add up to 1.0.
	W_ROOT_CAUSE = 0.40   # naming the correct root cause
	W_CONTAINMENT = 0.25  # using the ideal containment action
	W_EVIDENCE = 0.15     # collecting the required evidence before declaring
	W_EFFICIENCY = 0.10   # keeping spend low relative to MAX_BUDGET
	W_HEALTH = 0.10       # keeping system health up

	# Deductions applied on top of the weighted score.
	PREMATURE_GUESS_PENALTY = 0.30  # root cause declared without evidence
	SYSTEM_CRASH_PENALTY = 0.50     # system health dropped to 0

	# Episode limits.
	MAX_BUDGET = 5000.0  # spend at which the efficiency score reaches zero
	MAX_TURNS = 6        # turns allowed per incident


	# ---------------------------------------------------------------------------
	# 3. ENVIRONMENT ENGINE
	# ---------------------------------------------------------------------------

	class CloudSREEnv:
	    """
	    Gymnasium-style environment for the Cloud SRE Arbiter.

	    The agent loops through reset() -> step() -> step() -> ... until done.
	    Each task (easy/medium/hard) contains one incident scenario.

	    An episode ends when the agent declares a root cause other than
	    "unknown", when MAX_TURNS turns have been played, or when system
	    health is at 0 after the turn's containment action is applied.
	    """

	    # Health at the start of every episode — the system is already degraded.
	    INITIAL_HEALTH: float = 50.0
	    # Health lost at the end of every non-terminal turn (incident is ongoing).
	    HEALTH_DECAY_PER_TURN: float = 5.0

	    def __init__(self, data_path: str = "data.json"):
	        """
	        Load the scenario dataset and put the environment in the
	        "no active episode" state.

	        Args:
	            data_path: JSON file to load. It is looked up next to this
	                module first; if no such file exists there, the path is
	                used as given (relative to the working directory).

	        Raises:
	            OSError: if the file cannot be opened.
	            json.JSONDecodeError: if the file is not valid JSON.
	        """
	        candidate = Path(__file__).parent / data_path
	        if not candidate.exists():
	            candidate = Path(data_path)
	        with open(candidate, "r", encoding="utf-8") as f:
	            # Expected shape (as used by reset/step): task name -> list of
	            # case dicts.
	            self.dataset: Dict[str, list] = json.load(f)

	        # No episode is running until reset() is called, so step() refuses
	        # to run in this state.
	        self._task_name: str = ""
	        self._case: Optional[dict] = None
	        self._done: bool = True
	        self._reset_episode_state()

	    # ------------------------------------------------------------------
	    # PUBLIC API
	    # ------------------------------------------------------------------

	    def reset(self, task_name: str = "easy") -> Observation:
	        """
	        Start a new episode for the given task difficulty.

	        Args:
	            task_name: key into the loaded dataset.

	        Returns:
	            The initial Observation for the task's incident.

	        Raises:
	            ValueError: if the task is unknown, or if it has no cases.
	        """
	        if task_name not in self.dataset:
	            raise ValueError(
	                f"Task '{task_name}' not found. Available: {list(self.dataset.keys())}"
	            )

	        cases = self.dataset[task_name]
	        if not cases:
	            # Fail with a clear message instead of an opaque IndexError.
	            raise ValueError(f"Task '{task_name}' has no incident cases.")

	        # All validation passed — only now touch the episode state.
	        self._task_name = task_name
	        self._case = cases[0]  # one case per difficulty
	        self._reset_episode_state()
	        self._done = False

	        return self._build_observation()

	    def step(self, action: Action) -> Tuple[Optional[Observation], Reward, bool, Dict[str, Any]]:
	        """
	        Process one agent turn.

	        The containment action is applied first (cost + health effect),
	        then the investigation query (cost + recorded result), and only
	        then are the end conditions evaluated.

	        Returns: (observation, reward, done, info)
	            - On a terminal turn, observation is None, reward is the final
	              grade and info carries "grading_detail".
	            - Otherwise, observation is the next Observation and reward is
	              a placeholder with total_score 0.001.

	        Raises:
	            RuntimeError: if no episode is active (never reset, or already
	                finished).
	        """
	        if self._done or self._case is None:
	            raise RuntimeError("Episode is over. Call reset() to start a new one.")

	        self._turn += 1
	        ground_truth = self._case["ground_truth"]
	        hidden_data = self._case["hidden_data"]
	        info: Dict[str, Any] = {"justification": action.justification, "turn": self._turn}

	        # --- A) PROCESS CONTAINMENT ---
	        containment = action.containment_action
	        self._budget += CONTAINMENT_COSTS.get(containment, 0.0)
	        health_delta = CONTAINMENT_HEALTH_EFFECTS.get(containment, 0.0)
	        self._health = max(0.0, min(100.0, self._health + health_delta))
	        # "do_nothing" is deliberately not recorded as a containment used.
	        if containment != "do_nothing":
	            self._containment_used.append(containment)

	        # --- B) PROCESS INVESTIGATION ---
	        query = action.investigation_query
	        if query != "none":
	            # The query is charged and counted as evidence even when the
	            # scenario has no data for it.
	            self._budget += INVESTIGATION_COST
	            self._investigation_results[query] = hidden_data.get(
	                query, "Query returned no anomalies."
	            )
	            if query not in self._evidence_gathered:
	                self._evidence_gathered.append(query)

	        # --- C) CHECK END CONDITIONS ---
	        declared = action.declare_root_cause != "unknown"
	        timed_out = self._turn >= MAX_TURNS
	        system_crashed = self._health <= 0.0

	        if declared or timed_out or system_crashed:
	            self._done = True
	            reward = self._grade(action, ground_truth, timed_out, system_crashed)
	            info["grading_detail"] = reward.breakdown
	            return None, reward, True, info

	        # --- D) CONTINUE INVESTIGATING ---
	        # Natural health decay each turn (the incident is ongoing).
	        # NOTE(review): a decay to 0 here does not end the episode; the
	        # crash check only runs after the next turn's containment effect
	        # has been applied — confirm this is intended.
	        self._health = max(0.0, self._health - self.HEALTH_DECAY_PER_TURN)

	        # Placeholder reward: 0.001 is the smallest score used in this file
	        # (Reward.total_score must be strictly greater than 0).
	        reward = Reward(
	            total_score=0.001,
	            breakdown={
	                "status": 0.0,
	                "message_investigating": 0.0,
	                "budget_spent": self._budget,
	                "system_health": self._health,
	            },
	        )
	        return self._build_observation(), reward, False, info

	    def get_state(self) -> State:
	        """Return metadata about the current episode.

	        Safe to call before reset(): task_name is then "none" and
	        incident_id is the empty string.
	        """
	        return State(
	            task_name=self._task_name or "none",
	            incident_id=self._case["incident_id"] if self._case else "",
	            turn_number=self._turn,
	            max_turns=MAX_TURNS,
	            system_health=self._health,
	            budget_spent=self._budget,
	            is_done=self._done,
	        )

	    # ------------------------------------------------------------------
	    # DETERMINISTIC GRADER
	    # ------------------------------------------------------------------

	    def _grade(
	        self,
	        action: Action,
	        ground_truth: dict,
	        timed_out: bool,
	        system_crashed: bool,
	    ) -> Reward:
	        """
	        Score the agent's performance for the finished episode.

	        Args:
	            action: the final action of the episode.
	            ground_truth: the case's answer key; reads "root_cause",
	                "ideal_containment" and the optional "required_evidence".
	            timed_out: whether the turn limit was reached. It does not
	                change the score: an undeclared ("unknown") root cause
	                earns 0 for that component either way.
	            system_crashed: whether health was at 0 when the episode ended.

	        Returns:
	            A Reward whose total_score is clamped to [0.001, 0.999] and
	            whose breakdown lists every component, any penalties (as
	            negative values) and a few episode statistics.

	        Scoring breakdown:
	        - Root cause identification (40%)
	        - Containment quality (25%)
	        - Evidence gathering (15%)
	        - Budget efficiency (10%)
	        - System health maintenance (10%)

	        Penalties:
	        - Premature guess (required evidence incomplete) → −0.30
	        - System crash (health → 0) → −0.50
	        """
	        breakdown: Dict[str, float] = {}

	        # 1. Root cause (40%) — all or nothing. A wrong guess and an
	        #    undeclared ("unknown") cause both score zero.
	        correct_cause = action.declare_root_cause == ground_truth["root_cause"]
	        breakdown["root_cause"] = W_ROOT_CAUSE if correct_cause else 0.0

	        # 2. Containment (25%) — ideal action used at any point. The final
	        #    action is also compared directly because "do_nothing" is
	        #    never recorded in _containment_used.
	        ideal = ground_truth["ideal_containment"]
	        used_ideal = ideal in self._containment_used or action.containment_action == ideal
	        breakdown["containment"] = W_CONTAINMENT if used_ideal else 0.0

	        # 3. Evidence (15%) — full credit if nothing is required or all of
	        #    it was gathered; otherwise proportional partial credit.
	        required = set(ground_truth.get("required_evidence", []))
	        gathered = set(self._evidence_gathered)
	        evidence_complete = required.issubset(gathered)
	        if not required or evidence_complete:
	            breakdown["evidence"] = W_EVIDENCE
	        else:
	            overlap = len(required & gathered) / len(required)
	            breakdown["evidence"] = round(W_EVIDENCE * overlap, 4)

	        # 4. Budget efficiency (10%) — linear in spend, zero at MAX_BUDGET
	        #    and beyond.
	        if self._budget <= 0:
	            breakdown["efficiency"] = W_EFFICIENCY
	        else:
	            breakdown["efficiency"] = round(
	                max(0.0, W_EFFICIENCY * (1.0 - self._budget / MAX_BUDGET)), 4
	            )

	        # 5. System health (10%) — proportional to final health.
	        breakdown["health"] = round(W_HEALTH * (self._health / 100.0), 4)

	        # Sum the earned components now, before penalties and statistics
	        # are added to the same dict.
	        earned = sum(breakdown.values())

	        # --- Penalties ---
	        penalty = 0.0

	        # Premature guess: a root cause was declared while at least one
	        # piece of required evidence was still missing.
	        # NOTE(review): nearby wording ("without evidence") suggests this
	        # may have been meant to fire only when *no* required evidence was
	        # gathered; the existing stricter behaviour is kept — confirm.
	        if action.declare_root_cause != "unknown" and required and not evidence_complete:
	            penalty += PREMATURE_GUESS_PENALTY
	            breakdown["penalty_premature_guess"] = -PREMATURE_GUESS_PENALTY

	        # System crash penalty
	        if system_crashed:
	            penalty += SYSTEM_CRASH_PENALTY
	            breakdown["penalty_system_crash"] = -SYSTEM_CRASH_PENALTY

	        # Clamp strictly inside (0, 1) so Reward validation always passes.
	        total = max(0.001, min(0.999, round(earned - penalty, 4)))

	        # Episode statistics (informational; not part of the score).
	        breakdown["budget_spent"] = self._budget
	        breakdown["final_health"] = self._health
	        breakdown["turns_used"] = float(self._turn)

	        return Reward(total_score=total, breakdown=breakdown)

	    # ------------------------------------------------------------------
	    # HELPERS
	    # ------------------------------------------------------------------

	    def _reset_episode_state(self) -> None:
	        """Reset the per-episode counters and evidence logs to their start values."""
	        self._turn: int = 0
	        self._budget: float = 0.0
	        self._health: float = self.INITIAL_HEALTH
	        self._investigation_results: Dict[str, str] = {}
	        self._evidence_gathered: List[str] = []
	        self._containment_used: List[str] = []

	    def _build_observation(self) -> Observation:
	        """Build an Observation from the current case + internal state.

	        Raises:
	            RuntimeError: if no case is active.
	        """
	        case = self._case
	        if case is None:
	            raise RuntimeError("No active case — call reset() first.")

	        return Observation(
	            incident_id=case["incident_id"],
	            severity=case.get("severity", "P1"),
	            initial_observation=case.get("initial_observation", ""),
	            active_alerts=case["active_alerts"],
	            system_metrics=case["system_metrics"],
	            timeline=case.get("timeline", []),
	            # Copy so the observation is not affected by later queries.
	            investigation_results=dict(self._investigation_results),
	            system_health=round(self._health, 2),
	            budget_spent=round(self._budget, 2),
	            turn_number=self._turn,
	            turns_remaining=MAX_TURNS - self._turn,
	            # Mirrors the Literal choices declared on the Action model.
	            available_actions={
	                "containment_action": [
	                    "scale_up_nodes", "rate_limit_all",
	                    "rollback_last_deploy", "do_nothing",
	                ],
	                "investigation_query": [
	                    "analyze_ip_traffic", "query_db_locks",
	                    "check_commit_diffs", "check_service_mesh",
	                    "check_resource_utilization", "none",
	                ],
	                "declare_root_cause": [
	                    "ddos_attack", "viral_traffic",
	                    "bad_code", "database_lock", "unknown",
	                ],
	            },
	        )