"""
Data models for the SRE Incident Investigation Environment.

An agent receives realistic system telemetry (logs, metrics, alerts) and must
investigate, diagnose the root cause, and submit a structured incident report.
"""
|
|
| from typing import Any, Dict, List, Literal, Optional |
|
|
| from openenv.core.env_server.types import Action, Observation, State |
| from pydantic import Field |
|
|
|
|
| |
| |
| |
|
|
class SREAction(Action):
    """
    One investigative step issued by the SRE agent.

    ``action_type`` is the only required field and selects the operation:

    - query_logs    : filter logs by service/level/time
    - query_metrics : fetch a named metric time-series
    - query_alerts  : list active / recent alerts
    - annotate      : add a free-text hypothesis note (no new data revealed)
    - submit        : submit the final incident report (ends episode)

    Every other field is optional. This class does not check that the
    populated fields match the chosen ``action_type``; which fields are read
    for which action is decided by the environment, not here.
    """

    # Required discriminator for the action.
    action_type: Literal[
        "query_logs",
        "query_metrics",
        "query_alerts",
        "annotate",
        "submit",
    ] = Field(description="Type of investigative action")

    # ------------------------------------------------------------------
    # Log filters
    # ------------------------------------------------------------------
    service: Optional[str] = Field(
        default=None,
        description="Service name to filter logs (e.g. 'payment-service'). None = all services.",
    )
    log_level: Optional[Literal["DEBUG", "INFO", "WARN", "ERROR", "FATAL"]] = Field(
        default=None,
        description="Minimum log level to return",
    )
    # NOTE(review): the "max 120" in the description is not enforced by this
    # field (no ``le=`` bound) — presumably the environment caps it; confirm.
    time_window_minutes: Optional[int] = Field(
        default=30,
        description="How many minutes of logs to retrieve (max 120)",
    )
    log_query: Optional[str] = Field(
        default=None,
        description="Optional keyword to search within log messages",
    )

    # ------------------------------------------------------------------
    # Metric selection
    # ------------------------------------------------------------------
    # Typed as a free string: the names listed in the description are
    # advisory and are not validated by this model.
    metric_name: Optional[str] = Field(
        default=None,
        description=(
            "Metric to fetch. Available: error_rate, latency_p99, latency_p50, "
            "cpu_usage, memory_usage, db_connections, request_rate, cache_hit_rate"
        ),
    )

    # ------------------------------------------------------------------
    # Annotation text
    # ------------------------------------------------------------------
    note: Optional[str] = Field(
        default=None,
        description="Free-text annotation or hypothesis",
    )

    # ------------------------------------------------------------------
    # Incident report fields
    # ------------------------------------------------------------------
    root_cause_service: Optional[str] = Field(
        default=None,
        description="Service identified as root cause",
    )
    root_cause_type: Optional[
        Literal[
            "resource_exhaustion",
            "dependency_failure",
            "configuration_error",
            "code_bug",
            "data_corruption",
            "network_partition",
            "cascading_failure",
            "traffic_spike",
        ]
    ] = Field(
        default=None,
        description="Category of root cause",
    )
    affected_services: Optional[List[str]] = Field(
        default=None,
        description="List of services affected by the incident",
    )
    severity: Optional[Literal["P1", "P2", "P3", "P4"]] = Field(
        default=None,
        description="Incident severity level",
    )
    # NOTE(review): the 500-character limit is stated in the description only;
    # no ``max_length`` is set on the field.
    recommended_action: Optional[str] = Field(
        default=None,
        description="Recommended remediation (free text, ≤500 chars)",
    )
    # The only numerically bounded field: validated to lie in [0.0, 1.0].
    confidence: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Agent's confidence in diagnosis (0.0–1.0)",
    )
|
|
|
|
| |
| |
| |
|
|
class LogEntry(State):
    """
    One log line as returned by a log query.

    NOTE(review): this derives from the environment ``State`` base rather than
    a plain model, so it inherits whatever fields that base declares — confirm
    this is intended.
    """

    # Keep any keys beyond the declared fields instead of rejecting them.
    model_config = {"extra": "allow"}

    # Required fields.
    timestamp: str = Field(description="ISO-8601 timestamp")
    service: str = Field(description="Emitting service name")
    # Plain string: the level is not restricted to a fixed set here.
    level: str = Field(description="Log level")
    message: str = Field(description="Log message body")

    # Optional trace identifier; absent unless supplied.
    trace_id: Optional[str] = None
|
|
|
|
class MetricPoint(State):
    """
    One sample of a metric time-series: a timestamp paired with a value.

    Both fields are required.
    """

    # Keep any keys beyond the declared fields instead of rejecting them.
    model_config = {"extra": "allow"}

    timestamp: str = Field(
        description="ISO-8601 timestamp",
    )
    value: float = Field(
        description="Metric value",
    )
|
|
|
|
class AlertEntry(State):
    """
    One alert record, either currently firing or already resolved.

    All six fields are required; none carries a default.
    """

    # Keep any keys beyond the declared fields instead of rejecting them.
    model_config = {"extra": "allow"}

    # Identity of the alert and the service it is attached to.
    alert_name: str
    service: str

    # Free-form string; not restricted to a fixed set of severities here.
    severity: str

    # Stored as a string — presumably an ISO-8601 timestamp like the other
    # models in this file; confirm against the data source.
    fired_at: str

    message: str

    # The only constrained field: must be exactly "firing" or "resolved".
    status: Literal["firing", "resolved"]
|
|
|
|
class SREObservation(Observation):
    """
    What the environment hands back to the agent after each SRE action.

    Every field has a default, so an observation can be built with only the
    parts relevant to the action that produced it.
    """

    # ------------------------------------------------------------------
    # Echo of the request
    # ------------------------------------------------------------------
    action_taken: str = Field(
        default="",
        description="Echo of the action type",
    )

    # ------------------------------------------------------------------
    # Query results (carried as plain dicts; each list defaults to empty)
    # ------------------------------------------------------------------
    logs: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Log entries matching the query",
    )
    metrics: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Metric time-series points",
    )
    metric_name: Optional[str] = Field(
        default=None,
        description="Name of the metric that was queried",
    )
    alerts: List[Dict[str, Any]] = Field(
        default_factory=list,
        description="Active/recent alerts",
    )

    # ------------------------------------------------------------------
    # Annotation feedback
    # ------------------------------------------------------------------
    annotation_accepted: bool = False

    # ------------------------------------------------------------------
    # Grader output (both None until populated)
    # ------------------------------------------------------------------
    # The 0.0–1.0 range is stated in the description only; the field itself
    # carries no numeric bounds.
    grader_score: Optional[float] = Field(
        default=None,
        description="Score 0.0–1.0 returned by the deterministic grader after submit",
    )
    grader_breakdown: Optional[Dict[str, Any]] = Field(
        default=None,
        description="Per-criterion breakdown of the grader score",
    )

    # ------------------------------------------------------------------
    # Status text
    # ------------------------------------------------------------------
    message: str = Field(
        default="",
        description="Human-readable status message",
    )

    # ------------------------------------------------------------------
    # Remaining query budget
    # ------------------------------------------------------------------
    queries_remaining: int = Field(
        default=10,
        description="Number of query actions remaining before forced submit",
    )
|
|
|
|
| |
| |
| |
|
|
class SREState(State):
    """
    Environment-side bookkeeping for a single SRE episode.

    Every field has a default, so a fresh state can be created with no
    arguments.
    """

    # Which task is loaded and how hard it is. ``difficulty`` is a plain
    # string; the three values named in its description are not enforced.
    task_id: str = Field(
        default="",
        description="Identifier of the current task",
    )
    difficulty: str = Field(
        default="easy",
        description="Task difficulty: easy | medium | hard",
    )

    # Progress counters and the query budget. The budget default (10) matches
    # the default of ``SREObservation.queries_remaining``.
    step_count: int = 0
    queries_used: int = 0
    max_queries: int = 10

    # Notes accumulated during the episode; starts as a fresh empty list.
    annotations: List[str] = Field(default_factory=list)

    # Outcome: whether a report was submitted, and its score once known.
    submitted: bool = False
    final_score: Optional[float] = None
|
|