| """ |
| Typed models for the SRE Incident Response environment. |
| |
| Action space: hierarchical β select action_type first, then target + params. |
| Observation space: POMDP β agent never sees fault_type, only symptoms. |
| """ |
|
|
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, field |
| from enum import Enum |
| from typing import Any, Dict, List, Optional |
|
|
|
|
| |
| |
| |
|
|
| class ActionType(str, Enum): |
| """Level-1 action categories β what kind of operation.""" |
| VIEW_ALERTS = "view_alerts" |
| QUERY_LOGS = "query_logs" |
| CHECK_METRICS = "check_metrics" |
| CHECK_DEPENDENCIES = "check_dependencies" |
| CHECK_DEPLOY_HISTORY = "check_deploy_history" |
| RUN_HEALTH_CHECK = "run_health_check" |
| RESTART_SERVICE = "restart_service" |
| ROLLBACK_DEPLOY = "rollback_deploy" |
| SCALE_SERVICE = "scale_service" |
| DECLARE_ROOT_CAUSE = "declare_root_cause" |
|
|
|
|
| |
| TARGETED_ACTIONS = { |
| ActionType.QUERY_LOGS, |
| ActionType.CHECK_METRICS, |
| ActionType.CHECK_DEPENDENCIES, |
| ActionType.CHECK_DEPLOY_HISTORY, |
| ActionType.RUN_HEALTH_CHECK, |
| ActionType.RESTART_SERVICE, |
| ActionType.ROLLBACK_DEPLOY, |
| ActionType.SCALE_SERVICE, |
| } |
|
|
| |
| DIAGNOSTIC_ACTIONS = { |
| ActionType.VIEW_ALERTS, |
| ActionType.QUERY_LOGS, |
| ActionType.CHECK_METRICS, |
| ActionType.CHECK_DEPENDENCIES, |
| ActionType.CHECK_DEPLOY_HISTORY, |
| ActionType.RUN_HEALTH_CHECK, |
| } |
|
|
| |
| REMEDIATION_ACTIONS = { |
| ActionType.RESTART_SERVICE, |
| ActionType.ROLLBACK_DEPLOY, |
| ActionType.SCALE_SERVICE, |
| } |
|
|
|
|
| @dataclass |
| class IncidentAction: |
| """ |
| Agent action β hierarchical: action_type β target_service β parameters. |
| |
| The LLM emits JSON with these three fields. The action mask in the |
| observation tells it which (action_type, target_service) pairs are legal. |
| """ |
| action_type: str |
| target_service: Optional[str] = None |
| parameters: Dict[str, Any] = field(default_factory=dict) |
|
|
| def parsed_type(self) -> ActionType: |
| return ActionType(self.action_type) |
|
|
|
|
| |
| |
| |
|
|
| @dataclass |
| class AlertInfo: |
| """A single firing alert β what fired, not why.""" |
| alert_id: str |
| severity: str |
| source_service: str |
| description: str |
| firing_since: str |
|
|
| @dataclass |
| class MetricSnapshot: |
| """Time-series metrics for a single service β temporal pattern visible.""" |
| service_name: str |
| timestamps: List[str] |
| cpu_percent: List[float] |
| memory_percent: List[float] |
| error_rate_percent: List[float] |
| latency_p50_ms: List[float] |
| latency_p95_ms: List[float] |
| latency_p99_ms: List[float] |
| requests_per_sec: List[float] |
|
|
| @dataclass |
| class LogEntry: |
| """A single structured log entry β error semantics visible.""" |
| timestamp: str |
| level: str |
| service: str |
| message: str |
| trace_id: Optional[str] = None |
| extra: Dict[str, Any] = field(default_factory=dict) |
|
|
| @dataclass |
| class DeployRecord: |
| """A single deploy β evidence trail for rollback decisions.""" |
| version: str |
| timestamp: str |
| author: str |
| commit_hash: str |
| description: str |
|
|
| @dataclass |
| class DependencyInfo: |
| """Upstream/downstream dependency map for a service.""" |
| service_name: str |
| depends_on: List[str] |
| depended_by: List[str] |
|
|
|
|
| @dataclass |
| class IncidentObservation: |
| """ |
| What the agent sees after each step. |
| |
| This is a PARTIAL observation β the agent never sees fault_type, |
| fault_target, or the internal simulation state. It must infer |
| the root cause from the five observation modalities. |
| """ |
| |
| incident_summary: str |
| severity: str |
| time_elapsed_minutes: int |
| time_budget_minutes: int |
|
|
| |
| action_result: Dict[str, Any] = field(default_factory=dict) |
| action_success: bool = True |
| action_message: str = "" |
|
|
| |
| service_statuses: Dict[str, str] = field(default_factory=dict) |
| active_alerts_count: int = 0 |
|
|
| |
| valid_actions: List[str] = field(default_factory=list) |
| available_services: List[str] = field(default_factory=list) |
|
|
| |
| current_reward: float = 0.0 |
| cumulative_reward: float = 0.0 |
| steps_taken: int = 0 |
| max_steps: int = 20 |
| done: bool = False |
|
|
|
|
| |
| |
| |
|
|
| @dataclass |
| class IncidentState: |
| """Episode metadata β returned by state() for monitoring/debugging.""" |
| episode_id: str = "" |
| task_name: str = "" |
| step_count: int = 0 |
| time_elapsed_minutes: int = 0 |
| done: bool = False |
| cumulative_reward: float = 0.0 |
| declared_root_cause: Optional[str] = None |
|
|
|
|
| |
| |
| |
|
|
| @dataclass |
| class StepRecord: |
| """ |
| Immutable record of a single step β used by the grader. |
| The grader receives List[StepRecord] and scores WITHOUT hidden state. |
| """ |
| step_number: int |
| action: IncidentAction |
| reward: float |
| observation_summary: Dict[str, Any] |
| service_statuses_after: Dict[str, str] |
| timestamp_minutes: int |
|
|