Spaces:

arya89
/

openops

No application file

App Files Files Community

openops / models.py

arya89

Upload folder using huggingface_hub

d02897f verified about 2 months ago

raw

history blame contribute delete

4.57 kB

	# Copyright (c) Meta Platforms, Inc.
	"""
	Pydantic models for the OpenOps environment: agent actions, observations, and internal state.
	"""

	from typing import Dict, List, Optional

	from pydantic import BaseModel, ConfigDict, Field


	class IncidentAction(BaseModel):
	    """
	    Action taken by the agent.

	    Represents a single action in the incident management workflow.
	    """

	    # Required discrete action selector, constrained to the range 0..20.
	    action_id: int = Field(..., ge=0, le=20, description="Action ID (0-20)")
	    # Task selector constrained to 1..3; falls back to 1 when omitted.
	    task_id: int = Field(
	        default=1,
	        ge=1,
	        le=3,
	        description="Task ID (1=easy, 2=medium, 3=hard)",
	    )

	    # Pydantic v2 style configuration: `model_config` replaces the deprecated
	    # nested `class Config`. The example is attached to the JSON schema.
	    model_config = ConfigDict(
	        json_schema_extra={
	            "example": {
	                "action_id": 0,
	                "task_id": 1,
	            }
	        }
	    )


	class IncidentObservation(BaseModel):
	    """
	    Observation returned to agent after each step.

	    Contains partial information about the system state (investigation reveals more).
	    """

	    # Container fields use `default_factory` so every instance gets its own
	    # fresh list/dict instead of sharing one mutable default.
	    active_alerts: List[str] = Field(
	        default_factory=list,
	        description="List of active system alerts",
	    )
	    service_status: Dict[str, str] = Field(
	        default_factory=dict,
	        description="Status of each service (healthy/degraded/down)",
	    )
	    recent_logs: Dict[str, List[str]] = Field(
	        default_factory=dict,
	        description="Logs from inspected services only",
	    )
	    metrics_summary: Dict[str, Dict[str, float]] = Field(
	        default_factory=dict,
	        description="Metrics for checked services (CPU, memory, latency)",
	    )

	    # Scalar counters describing the incident so far; all start at zero.
	    customer_complaints: int = Field(
	        default=0,
	        description="Number of customer complaints received",
	    )
	    time_elapsed: int = Field(
	        default=0,
	        description="Minutes since incident started",
	    )
	    revenue_loss: float = Field(
	        default=0.0,
	        description="Estimated revenue loss in USD",
	    )

	    # Communication flags; both default to False.
	    teams_notified: bool = Field(
	        default=False,
	        description="Whether engineering team has been notified",
	    )
	    status_page_updated: bool = Field(
	        default=False,
	        description="Whether public status page has been updated",
	    )

	    # Per-step reward signal and episode-termination flag.
	    reward: float = Field(
	        default=0.0,
	        description="Reward received for this step",
	    )
	    done: bool = Field(
	        default=False,
	        description="Whether episode is complete",
	    )

	    # Pydantic v2 style configuration: `model_config` replaces the deprecated
	    # nested `class Config`. The example is attached to the JSON schema.
	    # NOTE(review): the example does not include `metrics_summary`; add a
	    # representative entry once the metric key names are confirmed.
	    model_config = ConfigDict(
	        json_schema_extra={
	            "example": {
	                "active_alerts": ["CRITICAL: API service down"],
	                "service_status": {
	                    "api": "down",
	                    "database": "healthy",
	                },
	                "recent_logs": {
	                    "api": ["ERROR: Out of memory"],
	                },
	                "customer_complaints": 45,
	                "time_elapsed": 5,
	                "revenue_loss": 5000.0,
	                "teams_notified": False,
	                "status_page_updated": False,
	                "reward": 0.05,
	                "done": False,
	            }
	        }
	    )


	class IncidentState(BaseModel):
	    """
	    Internal environment state (hidden from agent).

	    Contains ground truth about the incident for evaluation.
	    """

	    # Required fields (`...`): a state cannot be built without them.
	    task_id: int = Field(..., ge=1, le=3, description="Task difficulty level")
	    incident_type: str = Field(..., description="Type of incident")
	    # `default_factory` gives each instance its own empty container.
	    affected_services: List[str] = Field(
	        default_factory=list,
	        description="Services affected by the incident",
	    )
	    root_cause: str = Field(..., description="Root cause of the incident")
	    service_status: Dict[str, str] = Field(
	        default_factory=dict,
	        description="Current status of all services",
	    )
	    correct_mitigation: List[str] = Field(
	        default_factory=list,
	        description="Correct mitigation actions for this incident",
	    )

	    # Running totals; both start at zero.
	    revenue_loss: float = Field(
	        default=0.0,
	        description="Accumulated revenue loss",
	    )
	    customer_complaints: int = Field(
	        default=0,
	        description="Accumulated customer complaints",
	    )

	    # Pydantic v2 style configuration: `model_config` replaces the deprecated
	    # nested `class Config`. The example is attached to the JSON schema.
	    model_config = ConfigDict(
	        json_schema_extra={
	            "example": {
	                "task_id": 1,
	                "incident_type": "api_crash",
	                "affected_services": ["api"],
	                "root_cause": "out_of_memory",
	                "service_status": {
	                    "api": "down",
	                    "database": "healthy",
	                    "auth": "healthy",
	                    "frontend": "degraded",
	                },
	                "correct_mitigation": ["restart_api"],
	                "revenue_loss": 0.0,
	                "customer_complaints": 0,
	            }
	        }
	    )