File size: 4,567 Bytes
d02897f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Copyright (c) Meta Platforms, Inc.
"""
Pydantic models for OpenOps environment
"""

from typing import Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field


class IncidentAction(BaseModel):
    """
    Action taken by the agent.

    Represents a single action in the incident management workflow.

    Attributes:
        action_id: Required integer, validated to the inclusive range 0-20.
        task_id: Integer validated to the inclusive range 1-3
            (1=easy, 2=medium, 3=hard); defaults to 1.
    """
    action_id: int = Field(..., ge=0, le=20, description="Action ID (0-20)")
    task_id: int = Field(default=1, ge=1, le=3, description="Task ID (1=easy, 2=medium, 3=hard)")

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning at class creation;
    # ``json_schema_extra`` attaches the example to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "action_id": 0,
                "task_id": 1,
            }
        }
    )


class IncidentObservation(BaseModel):
    """
    Observation returned to agent after each step.

    Contains partial information about the system state (investigation
    reveals more). Every field has a default, so an observation can be
    constructed with no arguments: collections default to empty, counters
    and amounts to zero, and flags to False.
    """
    active_alerts: List[str] = Field(
        default_factory=list,
        description="List of active system alerts"
    )
    service_status: Dict[str, str] = Field(
        default_factory=dict,
        description="Status of each service (healthy/degraded/down)"
    )
    recent_logs: Dict[str, List[str]] = Field(
        default_factory=dict,
        description="Logs from inspected services only"
    )
    metrics_summary: Dict[str, Dict[str, float]] = Field(
        default_factory=dict,
        description="Metrics for checked services (CPU, memory, latency)"
    )
    customer_complaints: int = Field(
        default=0,
        description="Number of customer complaints received"
    )
    time_elapsed: int = Field(
        default=0,
        description="Minutes since incident started"
    )
    revenue_loss: float = Field(
        default=0.0,
        description="Estimated revenue loss in USD"
    )
    teams_notified: bool = Field(
        default=False,
        description="Whether engineering team has been notified"
    )
    status_page_updated: bool = Field(
        default=False,
        description="Whether public status page has been updated"
    )
    reward: float = Field(
        default=0.0,
        description="Reward received for this step"
    )
    done: bool = Field(
        default=False,
        description="Whether episode is complete"
    )

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning at class creation;
    # ``json_schema_extra`` attaches the example to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "active_alerts": ["CRITICAL: API service down"],
                "service_status": {
                    "api": "down",
                    "database": "healthy"
                },
                "recent_logs": {
                    "api": ["ERROR: Out of memory"]
                },
                "customer_complaints": 45,
                "time_elapsed": 5,
                "revenue_loss": 5000.0,
                "teams_notified": False,
                "status_page_updated": False,
                "reward": 0.05,
                "done": False
            }
        }
    )


class IncidentState(BaseModel):
    """
    Internal environment state (hidden from agent).

    Contains ground truth about the incident for evaluation.

    ``task_id``, ``incident_type`` and ``root_cause`` are required; the
    remaining fields default to empty collections or zero.
    """
    task_id: int = Field(..., ge=1, le=3, description="Task difficulty level")
    incident_type: str = Field(..., description="Type of incident")
    affected_services: List[str] = Field(
        default_factory=list,
        description="Services affected by the incident"
    )
    root_cause: str = Field(..., description="Root cause of the incident")
    service_status: Dict[str, str] = Field(
        default_factory=dict,
        description="Current status of all services"
    )
    correct_mitigation: List[str] = Field(
        default_factory=list,
        description="Correct mitigation actions for this incident"
    )
    revenue_loss: float = Field(
        default=0.0,
        description="Accumulated revenue loss"
    )
    customer_complaints: int = Field(
        default=0,
        description="Accumulated customer complaints"
    )

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2 and emits a deprecation warning at class creation;
    # ``json_schema_extra`` attaches the example to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "task_id": 1,
                "incident_type": "api_crash",
                "affected_services": ["api"],
                "root_cause": "out_of_memory",
                "service_status": {
                    "api": "down",
                    "database": "healthy",
                    "auth": "healthy",
                    "frontend": "degraded"
                },
                "correct_mitigation": ["restart_api"],
                "revenue_loss": 0.0,
                "customer_complaints": 0
            }
        }
    )