Spaces:

Arijit-07
/

devops-incident-response

Sleeping

App Files Files Community

devops-incident-response / models.py

Arijit-07

Add Task 7: Multi-Region Failover (partial failover with compliance constraints)

d59268c 29 days ago

raw

history blame contribute delete

3.35 kB

	from __future__ import annotations
	from pydantic import BaseModel, Field
	from typing import List, Optional, Dict, Any, Literal
	from enum import Enum


	class ActionType(str, Enum):
	    """Closed set of action identifiers accepted by ``Action.action_type``.

	    The ``str`` mixin makes each member compare equal to its lowercase
	    wire value, e.g. ``ActionType.NOOP == "noop"``.
	    """

	    DIAGNOSE = "diagnose"
	    READ_LOGS = "read_logs"
	    READ_METRICS = "read_metrics"
	    READ_RUNBOOK = "read_runbook"
	    RESTART_SERVICE = "restart_service"
	    ROLLBACK = "rollback"
	    SCALE_UP = "scale_up"
	    ALERT_ONCALL = "alert_oncall"
	    ACKNOWLEDGE = "acknowledge"
	    NOOP = "noop"
	    SEARCH_LOGS = "search_logs"
	    BLOCK_IP_RANGE = "block_ip_range"
	    CREATE_INDEX = "create_index"
	    FAILOVER = "failover"


	class Action(BaseModel):
	    """A single action request: a required ``action_type`` plus optional arguments.

	    Every argument defaults to ``None``; this model itself does not enforce
	    which arguments a given action type needs.
	    """

	    action_type: ActionType
	    service: Optional[str] = None
	    root_cause: Optional[str] = None
	    runbook: Optional[str] = None
	    version: Optional[str] = None
	    reason: Optional[str] = None
	    query: Optional[str] = None  # used with search_logs
	    ip_range: Optional[str] = None  # NOTE(review): presumably for block_ip_range — confirm
	    table: Optional[str] = None  # NOTE(review): presumably for create_index — confirm
	    column: Optional[str] = None  # NOTE(review): presumably for create_index — confirm
	    target_region: Optional[str] = None  # NOTE(review): presumably for failover — confirm


	class Alert(BaseModel):
	    """An alert raised against one service.

	    ``severity`` is restricted to "critical", "warning" or "info";
	    ``acknowledged`` starts out ``False``.
	    """

	    id: str
	    severity: Literal["critical", "warning", "info"]
	    service: str
	    message: str
	    timestamp: str  # NOTE(review): string format is not fixed here — confirm with producer
	    acknowledged: bool = False


	class ServiceStatus(BaseModel):
	    """Health and deployment snapshot for one service.

	    ``status`` is restricted to "healthy", "degraded", "down" or "unknown".
	    """

	    name: str
	    status: Literal["healthy", "degraded", "down", "unknown"]
	    cpu_percent: float
	    memory_percent: float
	    error_rate: float  # NOTE(review): fraction vs. percent is not stated here — confirm
	    latency_p99_ms: float
	    replicas_running: int
	    replicas_desired: int
	    current_version: str
	    last_deployed: str
	    # SLA tracking — updated each step if unresolved
	    sla_breach: bool = False
	    minutes_degraded: int = 0


	class ServiceDependency(BaseModel):
	    """Describes which services call which — critical for cascade diagnosis."""

	    service: str
	    calls: List[str]  # services this one depends on
	    called_by: List[str]  # services that depend on this one


	class EvidenceEntry(BaseModel):
	    """One piece of gathered evidence — accumulated across steps."""

	    step: int
	    source: str  # e.g. "logs:payment-service" or "metrics:inventory-service"
	    summary: str  # short digest of what was found
	    raw: str  # full content returned by read action


	class Observation(BaseModel):
	    """Snapshot of the incident as presented to the agent at one step.

	    Bundles the task description, per-service status, active alerts, recent
	    logs and available runbooks with the optional dependency topology,
	    accumulated evidence and SLA status, plus the result or error of the
	    last action.
	    """

	    step: int
	    max_steps: int
	    task_id: str
	    task_description: str
	    services: List[ServiceStatus]
	    active_alerts: List[Alert]
	    recent_logs: Dict[str, List[str]]  # NOTE(review): keys look like service names — confirm
	    available_runbooks: List[str]
	    # Dependency topology so the agent can reason about cascades.
	    service_dependencies: List[ServiceDependency] = []
	    # Evidence accumulated from all previous read actions.
	    evidence_log: List[EvidenceEntry] = []
	    # SLA status (shows urgency): service -> "ok" | "warning" | "breached"
	    sla_status: Dict[str, str] = {}
	    last_action_result: Optional[str] = None
	    last_action_error: Optional[str] = None
	    incident_start_time: str
	    elapsed_minutes: int


	class StepResult(BaseModel):
	    """Result of one step: the new observation, a reward, a ``done`` flag
	    and a free-form ``info`` mapping (empty by default)."""

	    observation: Observation
	    reward: float
	    done: bool
	    info: Dict[str, Any] = {}


	class State(BaseModel):
	    """Full episode state: identifiers, the current observation, the action
	    history, cumulative reward, resolution flag and ground-truth answers."""

	    episode_id: str
	    task_id: str
	    step: int
	    current_observation: Observation
	    action_history: List[Dict[str, Any]]
	    total_reward: float
	    incident_resolved: bool
	    # NOTE(review): the ground-truth fields hold the expected answer; presumably
	    # they should not be exposed to the agent being evaluated — confirm.
	    ground_truth_root_cause: str
	    ground_truth_fix: str
	    info: Dict[str, Any] = {}