File size: 4,370 Bytes
bc2ead7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b2a96ac
 
 
 
 
 
 
 
 
 
 
 
 
bc2ead7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Typed models for the hardened NovaTech OpenEnv environment.
"""
from __future__ import annotations

from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field

# ---------------------------------------------------------------------------
# Closed vocabularies shared by the models below. Each alias is a
# ``typing.Literal``, so any value outside the listed strings is rejected
# when a model field annotated with it is validated.
# ---------------------------------------------------------------------------

# Identifiers of the internal services.
ServiceName = Literal[
    "auth-service",
    "payment-api",
    "order-service",
    "notification-service",
    "reporting-service",
    "user-service",
]

# Identifiers of the hosts.
ServerName = Literal[
    "server_01",
    "server_02",
    "server_03",
    "server_04",
]

# Log severities.
LogLevel = Literal[
    "INFO",
    "WARN",
    "ERROR",
    "CRITICAL",
]

# Failure categories a root-cause hypothesis may name.
FailureMode = Literal[
    "resource_exhaustion",
    "dependency_outage",
    "storage_saturation",
    "certificate_expiry",
    "application_bug",
    "traffic_abuse",
]

# Anything a hypothesis may point at as the implicated dependency: the
# sentinel "none", every internal service, plus four further names.
# Nesting ``ServiceName`` keeps the two vocabularies in sync; ``Literal``
# flattens nested literals, so the accepted values and their order are
# exactly: "none", the six services, then the four names listed here.
DependencyName = Literal[
    "none",
    ServiceName,
    "payment-gateway",
    "mysql",
    "email-relay",
    "ldap-directory",
]

# Customer-facing impact categories.
CustomerImpact = Literal[
    "login_failures",
    "checkout_delays",
    "order_write_failures",
    "notification_delivery_failure",
    "cross_service_major_incident",
]

# Names of the containment actions that can be placed in a plan.
ContainmentActionName = Literal[
    "increase_auth_heap",
    "enable_login_rate_limiting",
    "restore_payment_gateway_connectivity",
    "reduce_checkout_retry_pressure",
    "free_order_log_disk",
    "reset_mysql_connection_pool",
    "renew_smtp_certificate",
    "reroute_notification_traffic",
    "page_major_incident_team",
    "block_all_login_traffic",
    "wipe_application_logs",
    "restart_everything",
]


class LogEntry(BaseModel):
    # A single structured log record. All nine fields are required (none has
    # a default).
    log_id: int
    # Plain string; no timestamp format is enforced by this model.
    timestamp: str
    # The next three fields are restricted to the ServerName / LogLevel /
    # ServiceName literal vocabularies respectively.
    server_id: ServerName
    log_level: LogLevel
    service_name: ServiceName
    message: str
    # Numeric metrics attached to the record; no range constraints are
    # declared here (e.g. the percentages are not bounded to 0-100).
    response_time_ms: int
    cpu_usage_percent: float
    memory_usage_percent: float


class IncidentBriefing(BaseModel):
    # Descriptive briefing for one incident. Every field is required.
    incident_id: str
    title: str
    objective: str
    # Window bounds are plain strings; this model neither parses them nor
    # checks that start precedes end.
    incident_window_start: str
    incident_window_end: str
    # Each entry must be one of the ServiceName literals; the list may be empty.
    suspected_services: List[ServiceName]
    customer_statement: str
    # Free-text constraints; the model places no limit on count or length.
    operational_constraints: List[str]


class RootCauseHypothesis(BaseModel):
    # A root-cause claim: which service, which failure category, which
    # dependency (if any) and what customer impact, plus a confidence score.
    primary_service: ServiceName
    failure_mode: FailureMode
    # Optional; falls back to the "none" member of DependencyName when omitted.
    dependency: DependencyName = "none"
    customer_impact: CustomerImpact
    # Required (``...`` marks no default) and bounded to 0.0 <= value <= 1.0.
    confidence: float = Field(..., ge=0.0, le=1.0)


class LogQuery(BaseModel):
    # Filter specification for a log lookup. Every filter is optional and
    # defaults to None; how None and the individual filters are interpreted
    # is decided by the code that consumes this model, not shown here.
    service_name: Optional[ServiceName] = None
    server_id: Optional[ServerName] = None
    levels: Optional[List[LogLevel]] = None
    # Time bounds are plain strings; no format is enforced by this model.
    start_time: Optional[str] = None
    end_time: Optional[str] = None
    # Substring filter, capped at 80 characters.
    text_contains: Optional[str] = Field(default=None, max_length=80)
    # Result cap: defaults to 6 and must satisfy 1 <= limit <= 6, so the
    # default is also the maximum.
    limit: int = Field(default=6, ge=1, le=6)


class IncidentReport(BaseModel):
    # Final incident report: cited evidence, impacted services, the
    # root-cause hypothesis, an optional containment plan and a summary.
    #
    # ``evidence_log_ids`` and ``impacted_services`` are required and must
    # each contain at least one item. They were previously declared with
    # ``default_factory=list`` alongside ``min_length=1``; because pydantic
    # does not validate default values, simply omitting either field yielded
    # an empty list that bypassed the constraint, while an explicit ``[]``
    # was rejected. Making them required closes that gap.
    evidence_log_ids: List[int] = Field(..., min_length=1)
    impacted_services: List[ServiceName] = Field(..., min_length=1)
    root_cause: RootCauseHypothesis
    # Optional; an omitted plan becomes an empty list (no minimum length).
    containment_plan: List[ContainmentActionName] = Field(default_factory=list)
    # Required free text, between 20 and 600 characters inclusive.
    summary: str = Field(..., min_length=20, max_length=600)


class Action(BaseModel):
    # One action request. ``action_type`` is the only required field; every
    # other field is an optional payload that defaults to None.
    # NOTE(review): this model does not enforce that the payload matching
    # ``action_type`` is present (presumably ``query`` goes with
    # "query_logs", ``report`` with "submit_report", etc. - confirm against
    # the code that handles actions).
    session_id: Optional[str] = None
    action_type: Literal[
        "query_logs",
        "inspect_dependencies",
        "update_hypothesis",
        "execute_containment",
        "submit_report",
        "request_more",
        "no_anomalies",
    ]
    query: Optional[LogQuery] = None
    target_service: Optional[ServiceName] = None
    hypothesis: Optional[RootCauseHypothesis] = None
    containment_plan: Optional[List[ContainmentActionName]] = None
    report: Optional[IncidentReport] = None


class Observation(BaseModel):
    # Snapshot of session state. Fields without a default are required.
    session_id: str
    task_id: str
    task_title: str
    briefing: IncidentBriefing
    # Keys must be ServiceName literals; values are lists of unconstrained
    # strings (not restricted to DependencyName).
    dependency_graph: Dict[ServiceName, List[str]]
    visible_logs: List[LogEntry]
    revealed_log_count: int
    visited_services: List[ServiceName]
    submitted_containment: List[ContainmentActionName]
    # None when omitted.
    last_hypothesis: Optional[RootCauseHypothesis] = None
    # Step counters default to 0 and 8; no bounds or ordering between them
    # are validated by this model.
    step_number: int = 0
    max_steps: int = 8
    feedback: Optional[str] = None
    done: bool = False


class Reward(BaseModel):
    # Reward record with a headline value and named components. Every
    # numeric field is constrained to 0.0 <= x <= 1.0. This model does not
    # define how ``value`` relates to the components.
    # Required (``...`` marks no default).
    value: float = Field(..., ge=0.0, le=1.0)
    # Components below each default to 0.0 when omitted.
    signal_reward: float = Field(default=0.0, ge=0.0, le=1.0)
    hypothesis_reward: float = Field(default=0.0, ge=0.0, le=1.0)
    efficiency_reward: float = Field(default=0.0, ge=0.0, le=1.0)
    # Stored as a non-negative magnitude (same 0.0-1.0 bounds).
    penalty: float = Field(default=0.0, ge=0.0, le=1.0)
    # Free-form extra data; a fresh empty dict is created per instance.
    info: Dict[str, Any] = Field(default_factory=dict)