Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from pydantic import BaseModel, Field | |
| from typing import List, Optional, Dict, Any, Literal | |
| from enum import Enum | |
class ActionType(str, Enum):
    """Closed set of action identifiers an ``Action`` may carry.

    Members mix in ``str``, so each one compares equal to its lowercase
    string value and can be looked up from it, e.g. ``ActionType("noop")``.
    """

    # Diagnosis and read-style actions.
    DIAGNOSE = "diagnose"
    READ_LOGS = "read_logs"
    READ_METRICS = "read_metrics"
    READ_RUNBOOK = "read_runbook"

    # Remediation-style actions.
    RESTART_SERVICE = "restart_service"
    ROLLBACK = "rollback"
    SCALE_UP = "scale_up"

    # Alert handling and the explicit do-nothing action.
    ALERT_ONCALL = "alert_oncall"
    ACKNOWLEDGE = "acknowledge"
    NOOP = "noop"

    # Remaining actions, kept in their original declaration order.
    SEARCH_LOGS = "search_logs"
    BLOCK_IP_RANGE = "block_ip_range"
    CREATE_INDEX = "create_index"
    FAILOVER = "failover"
class Action(BaseModel):
    """One action request: an ``action_type`` plus optional arguments.

    Only ``action_type`` is required; every other field defaults to
    ``None``. This model does not itself check which arguments accompany
    which action type.
    """

    # The kind of action being requested (required).
    action_type: ActionType

    # Optional arguments; each defaults to None when omitted.
    service: Optional[str] = None
    root_cause: Optional[str] = None
    runbook: Optional[str] = None
    version: Optional[str] = None
    reason: Optional[str] = None
    query: Optional[str] = None  # used with search_logs
    ip_range: Optional[str] = None
    table: Optional[str] = None
    column: Optional[str] = None
    target_region: Optional[str] = None
class Alert(BaseModel):
    """A single alert attached to a service.

    ``severity`` is restricted to ``"critical"``, ``"warning"`` or
    ``"info"``; ``acknowledged`` starts out ``False``.
    """

    id: str
    severity: Literal["critical", "warning", "info"]
    service: str
    message: str
    timestamp: str  # kept as a plain string; no format is enforced here
    acknowledged: bool = False
class ServiceStatus(BaseModel):
    """Snapshot of one service's health, load, replica and deploy data.

    ``status`` is restricted to ``"healthy"``, ``"degraded"``, ``"down"``
    or ``"unknown"``. The two SLA fields have defaults; all other fields
    are required.
    """

    # Identity and coarse health.
    name: str
    status: Literal["healthy", "degraded", "down", "unknown"]

    # Load and quality metrics.
    cpu_percent: float
    memory_percent: float
    error_rate: float
    latency_p99_ms: float

    # Replica counts.
    replicas_running: int
    replicas_desired: int

    # Deployment information (plain strings; no format is enforced here).
    current_version: str
    last_deployed: str

    # SLA tracking — updated each step if unresolved
    sla_breach: bool = False
    minutes_degraded: int = 0
class ServiceDependency(BaseModel):
    """Describes which services call which — critical for cascade diagnosis.

    One entry records a single service together with both directions of
    its call relationships.
    """

    service: str
    calls: List[str]  # services this one depends on
    called_by: List[str]  # services that depend on this one
class EvidenceEntry(BaseModel):
    """One piece of gathered evidence — accumulated across steps.

    All four fields are required.
    """

    step: int
    source: str  # e.g. "logs:payment-service" or "metrics:inventory-service"
    summary: str  # short digest of what was found
    raw: str  # full content returned by read action
class Observation(BaseModel):
    """What is reported for one step of a task.

    Combines step counters, task identification, per-service status,
    active alerts, recent logs, available runbooks, dependency topology,
    accumulated evidence, SLA status, the outcome of the last action and
    incident timing.
    """

    # Step progress and task identification.
    step: int
    max_steps: int
    task_id: str
    task_description: str

    # Current system view.
    services: List[ServiceStatus]
    active_alerts: List[Alert]
    recent_logs: Dict[str, List[str]]
    available_runbooks: List[str]

    # NOTE: the literal [] / {} defaults below are declared directly on the
    # fields; pydantic copies field defaults for each instance.

    # Dependency topology so the agent can reason about cascades.
    service_dependencies: List[ServiceDependency] = []

    # Accumulated evidence from all previous read actions.
    evidence_log: List[EvidenceEntry] = []

    # SLA status — shows urgency.
    sla_status: Dict[str, str] = {}  # service -> "ok" | "warning" | "breached"

    # Outcome of the most recent action; both default to None.
    last_action_result: Optional[str] = None
    last_action_error: Optional[str] = None

    # Incident timing (required).
    incident_start_time: str
    elapsed_minutes: int
class StepResult(BaseModel):
    """Result of one step: observation, reward, done flag and extra info."""

    observation: Observation
    reward: float
    done: bool
    # Free-form extra data; defaults to an empty dict.
    info: Dict[str, Any] = {}
class State(BaseModel):
    """Episode-level state record.

    Holds the episode and task identifiers, the current step and
    observation, the action history, the running reward total, the
    resolution flag and the ground-truth root cause and fix.
    """

    # Identification and progress.
    episode_id: str
    task_id: str
    step: int

    # Latest observation and the actions taken so far (as plain dicts).
    current_observation: Observation
    action_history: List[Dict[str, Any]]

    # Outcome tracking.
    total_reward: float
    incident_resolved: bool

    # Ground-truth answers for the episode.
    ground_truth_root_cause: str
    ground_truth_fix: str

    # Free-form extra data; defaults to an empty dict.
    info: Dict[str, Any] = {}