from __future__ import annotations
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any, Literal
from enum import Enum
class ActionType(str, Enum):
    """String-valued enumeration of the action kinds an ``Action`` can carry.

    The ``str`` mixin makes each member compare equal to its plain string
    value, and ``ActionType("<value>")`` looks a member up by that value.
    """

    DIAGNOSE = "diagnose"
    READ_LOGS = "read_logs"
    READ_METRICS = "read_metrics"
    READ_RUNBOOK = "read_runbook"
    RESTART_SERVICE = "restart_service"
    ROLLBACK = "rollback"
    SCALE_UP = "scale_up"
    ALERT_ONCALL = "alert_oncall"
    ACKNOWLEDGE = "acknowledge"
    NOOP = "noop"
    SEARCH_LOGS = "search_logs"
    BLOCK_IP_RANGE = "block_ip_range"
    CREATE_INDEX = "create_index"
    FAILOVER = "failover"
class Action(BaseModel):
    """One action request: an action kind plus its optional arguments.

    Only ``action_type`` is required; every other field defaults to ``None``.
    """

    action_type: ActionType
    service: Optional[str] = None
    root_cause: Optional[str] = None
    runbook: Optional[str] = None
    version: Optional[str] = None
    reason: Optional[str] = None
    query: Optional[str] = None  # used with search_logs
    # NOTE(review): the fields below presumably pair with block_ip_range,
    # create_index and failover respectively — confirm against the handler.
    ip_range: Optional[str] = None
    table: Optional[str] = None
    column: Optional[str] = None
    target_region: Optional[str] = None
class Alert(BaseModel):
    """An alert raised against a service.

    ``severity`` is restricted to "critical", "warning" or "info";
    ``acknowledged`` starts out ``False``.
    """

    id: str
    severity: Literal["critical", "warning", "info"]
    service: str
    message: str
    timestamp: str  # kept as a string; format is not fixed by this model
    acknowledged: bool = False
class ServiceStatus(BaseModel):
    """Health and deployment status of one service.

    ``status`` is restricted to "healthy", "degraded", "down" or "unknown".
    """

    name: str
    status: Literal["healthy", "degraded", "down", "unknown"]
    cpu_percent: float
    memory_percent: float
    error_rate: float
    latency_p99_ms: float
    replicas_running: int
    replicas_desired: int
    current_version: str
    last_deployed: str
    # SLA tracking — updated each step if unresolved
    sla_breach: bool = False
    minutes_degraded: int = 0
class ServiceDependency(BaseModel):
    """Describes which services call which — critical for cascade diagnosis."""

    service: str
    calls: List[str]  # services this one depends on
    called_by: List[str]  # services that depend on this one
class EvidenceEntry(BaseModel):
    """One piece of gathered evidence — accumulated across steps."""

    step: int
    source: str  # e.g. "logs:payment-service" or "metrics:inventory-service"
    summary: str  # short digest of what was found
    raw: str  # full content returned by read action
class Observation(BaseModel):
    """Observation payload for one step of a task.

    Carries the step counters, the task identity and description, per-service
    status, active alerts, recent logs keyed by string, the available
    runbooks, and the outcome of the previous action.
    """

    step: int
    max_steps: int
    task_id: str
    task_description: str
    services: List[ServiceStatus]
    active_alerts: List[Alert]
    recent_logs: Dict[str, List[str]]  # presumably keyed by service name — confirm
    available_runbooks: List[str]
    # NOTE: the mutable literals below are safe as defaults here because
    # pydantic copies field defaults for each model instance.
    # NEW: dependency topology so agent can reason about cascades
    service_dependencies: List[ServiceDependency] = []
    # NEW: accumulated evidence from all previous read actions
    evidence_log: List[EvidenceEntry] = []
    # NEW: SLA status — shows urgency
    sla_status: Dict[str, str] = {}  # service -> "ok" | "warning" | "breached"
    last_action_result: Optional[str] = None
    last_action_error: Optional[str] = None
    incident_start_time: str
    elapsed_minutes: int
class StepResult(BaseModel):
    """Result of one step: the observation, a reward, a done flag and extra info."""

    observation: Observation
    reward: float
    done: bool
    # Free-form extras; pydantic copies this default for each instance.
    info: Dict[str, Any] = {}
class State(BaseModel):
    """Episode state: progress, action history, reward total and ground truth.

    Unlike ``Observation``, this model also holds the ground-truth root cause
    and fix for the task.
    """

    episode_id: str
    task_id: str
    step: int
    current_observation: Observation
    action_history: List[Dict[str, Any]]  # one dict per action; schema not fixed here
    total_reward: float
    incident_resolved: bool
    ground_truth_root_cause: str
    ground_truth_fix: str
    # Free-form extras; pydantic copies this default for each instance.
    info: Dict[str, Any] = {}
|