openops / models.py
arya89's picture
Upload folder using huggingface_hub
d02897f verified
# Copyright (c) Meta Platforms, Inc.
"""
Pydantic models for OpenOps environment
"""
from typing import Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field
class IncidentAction(BaseModel):
    """
    Action taken by the agent.

    Represents a single action in the incident management workflow.

    ``action_id`` is required and must lie in the inclusive range 0-20.
    ``task_id`` is optional, defaults to 1, and must lie in the inclusive
    range 1-3 (1=easy, 2=medium, 3=hard).
    """

    # Pydantic v2 configuration; the nested ``class Config`` form is
    # deprecated in v2. The example is attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "action_id": 0,
                "task_id": 1,
            }
        }
    )

    action_id: int = Field(..., ge=0, le=20, description="Action ID (0-20)")
    task_id: int = Field(
        default=1,
        ge=1,
        le=3,
        description="Task ID (1=easy, 2=medium, 3=hard)",
    )
class IncidentObservation(BaseModel):
    """
    Observation returned to agent after each step.

    Contains partial information about the system state (investigation
    reveals more).

    Every field has a default, so an empty observation can be built with no
    arguments. Container fields use ``default_factory`` so that each instance
    receives its own fresh list/dict rather than a shared one.
    """

    # Pydantic v2 configuration; the nested ``class Config`` form is
    # deprecated in v2. The example is attached to the generated JSON schema.
    # NOTE: the example does not include a ``metrics_summary`` entry.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "active_alerts": ["CRITICAL: API service down"],
                "service_status": {
                    "api": "down",
                    "database": "healthy",
                },
                "recent_logs": {
                    "api": ["ERROR: Out of memory"],
                },
                "customer_complaints": 45,
                "time_elapsed": 5,
                "revenue_loss": 5000.0,
                "teams_notified": False,
                "status_page_updated": False,
                "reward": 0.05,
                "done": False,
            }
        }
    )

    # --- What the agent can currently see of the system ---
    active_alerts: List[str] = Field(
        default_factory=list,
        description="List of active system alerts",
    )
    service_status: Dict[str, str] = Field(
        default_factory=dict,
        description="Status of each service (healthy/degraded/down)",
    )
    recent_logs: Dict[str, List[str]] = Field(
        default_factory=dict,
        description="Logs from inspected services only",
    )
    metrics_summary: Dict[str, Dict[str, float]] = Field(
        default_factory=dict,
        description="Metrics for checked services (CPU, memory, latency)",
    )

    # --- Business impact counters ---
    customer_complaints: int = Field(
        default=0,
        description="Number of customer complaints received",
    )
    time_elapsed: int = Field(
        default=0,
        description="Minutes since incident started",
    )
    revenue_loss: float = Field(
        default=0.0,
        description="Estimated revenue loss in USD",
    )

    # --- Communication flags ---
    teams_notified: bool = Field(
        default=False,
        description="Whether engineering team has been notified",
    )
    status_page_updated: bool = Field(
        default=False,
        description="Whether public status page has been updated",
    )

    # --- Step outcome ---
    reward: float = Field(
        default=0.0,
        description="Reward received for this step",
    )
    done: bool = Field(
        default=False,
        description="Whether episode is complete",
    )
class IncidentState(BaseModel):
    """
    Internal environment state (hidden from agent).

    Contains ground truth about the incident for evaluation.

    ``task_id``, ``incident_type`` and ``root_cause`` are required; all other
    fields fall back to empty containers or zero. ``task_id`` must lie in the
    inclusive range 1-3.
    """

    # Pydantic v2 configuration; the nested ``class Config`` form is
    # deprecated in v2. The example is attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "task_id": 1,
                "incident_type": "api_crash",
                "affected_services": ["api"],
                "root_cause": "out_of_memory",
                "service_status": {
                    "api": "down",
                    "database": "healthy",
                    "auth": "healthy",
                    "frontend": "degraded",
                },
                "correct_mitigation": ["restart_api"],
                "revenue_loss": 0.0,
                "customer_complaints": 0,
            }
        }
    )

    # --- Incident ground truth ---
    task_id: int = Field(..., ge=1, le=3, description="Task difficulty level")
    incident_type: str = Field(..., description="Type of incident")
    affected_services: List[str] = Field(
        default_factory=list,
        description="Services affected by the incident",
    )
    root_cause: str = Field(..., description="Root cause of the incident")
    service_status: Dict[str, str] = Field(
        default_factory=dict,
        description="Current status of all services",
    )
    correct_mitigation: List[str] = Field(
        default_factory=list,
        description="Correct mitigation actions for this incident",
    )

    # --- Accumulated impact ---
    revenue_loss: float = Field(
        default=0.0,
        description="Accumulated revenue loss",
    )
    customer_complaints: int = Field(
        default=0,
        description="Accumulated customer complaints",
    )