| |
| """ |
| Pydantic models for OpenOps environment |
| """ |
|
|
from typing import Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
class IncidentAction(BaseModel):
    """Action taken by the agent.

    Represents a single action in the incident management workflow.

    Attributes:
        action_id: Required action identifier; validated to lie in 0..20
            inclusive.
        task_id: Task selector, validated to lie in 1..3 inclusive
            (1=easy, 2=medium, 3=hard). Defaults to 1.
    """

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning at class creation;
    # ``model_config`` yields the same JSON-schema example without it.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "action_id": 0,
                "task_id": 1,
            }
        }
    )

    action_id: int = Field(..., ge=0, le=20, description="Action ID (0-20)")
    task_id: int = Field(
        default=1,
        ge=1,
        le=3,
        description="Task ID (1=easy, 2=medium, 3=hard)",
    )
|
|
|
|
class IncidentObservation(BaseModel):
    """Observation returned to agent after each step.

    Contains partial information about the system state (investigation
    reveals more).

    Every field has a default (or a ``default_factory`` for containers), so
    an empty observation can be constructed with no arguments. Container
    defaults use factories so instances never share a mutable list/dict.
    """

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning at class creation;
    # ``model_config`` yields the same JSON-schema example without it.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "active_alerts": ["CRITICAL: API service down"],
                "service_status": {
                    "api": "down",
                    "database": "healthy",
                },
                "recent_logs": {
                    "api": ["ERROR: Out of memory"],
                },
                "customer_complaints": 45,
                "time_elapsed": 5,
                "revenue_loss": 5000.0,
                "teams_notified": False,
                "status_page_updated": False,
                "reward": 0.05,
                "done": False,
            }
        }
    )

    # --- What the agent can currently see of the system ---
    active_alerts: List[str] = Field(
        default_factory=list,
        description="List of active system alerts",
    )
    service_status: Dict[str, str] = Field(
        default_factory=dict,
        description="Status of each service (healthy/degraded/down)",
    )
    recent_logs: Dict[str, List[str]] = Field(
        default_factory=dict,
        description="Logs from inspected services only",
    )
    metrics_summary: Dict[str, Dict[str, float]] = Field(
        default_factory=dict,
        description="Metrics for checked services (CPU, memory, latency)",
    )

    # --- Incident impact counters ---
    customer_complaints: int = Field(
        default=0,
        description="Number of customer complaints received",
    )
    time_elapsed: int = Field(
        default=0,
        description="Minutes since incident started",
    )
    revenue_loss: float = Field(
        default=0.0,
        description="Estimated revenue loss in USD",
    )

    # --- Communication flags ---
    teams_notified: bool = Field(
        default=False,
        description="Whether engineering team has been notified",
    )
    status_page_updated: bool = Field(
        default=False,
        description="Whether public status page has been updated",
    )

    # --- Step outcome ---
    reward: float = Field(
        default=0.0,
        description="Reward received for this step",
    )
    done: bool = Field(
        default=False,
        description="Whether episode is complete",
    )
|
|
|
|
class IncidentState(BaseModel):
    """Internal environment state (hidden from agent).

    Contains ground truth about the incident for evaluation.

    ``task_id``, ``incident_type`` and ``root_cause`` are required; all other
    fields fall back to an empty container or zero. ``task_id`` is validated
    to lie in 1..3 inclusive.
    """

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning at class creation;
    # ``model_config`` yields the same JSON-schema example without it.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "task_id": 1,
                "incident_type": "api_crash",
                "affected_services": ["api"],
                "root_cause": "out_of_memory",
                "service_status": {
                    "api": "down",
                    "database": "healthy",
                    "auth": "healthy",
                    "frontend": "degraded",
                },
                "correct_mitigation": ["restart_api"],
                "revenue_loss": 0.0,
                "customer_complaints": 0,
            }
        }
    )

    task_id: int = Field(..., ge=1, le=3, description="Task difficulty level")
    incident_type: str = Field(..., description="Type of incident")
    affected_services: List[str] = Field(
        default_factory=list,
        description="Services affected by the incident",
    )
    root_cause: str = Field(..., description="Root cause of the incident")
    service_status: Dict[str, str] = Field(
        default_factory=dict,
        description="Current status of all services",
    )
    correct_mitigation: List[str] = Field(
        default_factory=list,
        description="Correct mitigation actions for this incident",
    )

    # Running totals accumulated over the episode.
    revenue_loss: float = Field(
        default=0.0,
        description="Accumulated revenue loss",
    )
    customer_complaints: int = Field(
        default=0,
        description="Accumulated customer complaints",
    )