---
# Environment manifest for the incident-triage-env space.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file; confirm the hierarchy against the consumer's schema
# (in particular that `reproducibility` is a top-level key, not part of
# `baseline`).
spec_version: 1
name: incident-triage-env
type: space
runtime: fastapi
# presumably a "module:attribute" reference to the FastAPI app — confirm
app: app:app
port: 7860
# Quoted so the version stays a string.
version: "1.0.0"
tags: [openenv]
description: >
  Production incident triage environment for evaluating agents on realistic
  SRE workflows. The agent receives a typed incident observation and must
  classify severity, identify the most likely root cause, or recommend the
  best immediate remediation action.

# HTTP API: one entry per endpoint with its method, path, inputs and result.
api:
  base_url: http://0.0.0.0:7860
  endpoints:
    health:
      method: GET
      path: /health
      returns: health status
    metadata:
      method: GET
      path: /metadata
      returns: task metadata and dataset summary
    reset:
      method: POST
      path: /reset
      # Every body field is optional.
      body:
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3]
        ticket_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
      returns: StepResult with initial observation and session_id in info
    step:
      method: POST
      path: /step
      params:
        session_id:
          type: string
          required: true
      body: IncidentAction
      returns: StepResult with reward object, done flag, and episode info
    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      returns: IncidentState

# Task definitions: the action field the agent fills in, the labels accepted
# for it, and the reward tiers ("score condition" pairs separated by "|").
tasks:
  task1:
    name: Severity Classification
    difficulty: easy
    output_field: severity
    labels: [SEV1, SEV2, SEV3]
    reward: "0.99 exact | 0.5 adjacent severity | 0.01 far miss"
  task2:
    name: Root Cause Classification
    difficulty: medium
    output_field: root_cause
    labels:
      - DATABASE
      - NETWORK
      - APPLICATION
      - INFRASTRUCTURE
      - THIRD_PARTY
      - UNKNOWN
    reward: "0.99 exact | 0.5 related domain | 0.25 UNKNOWN fallback | 0.01 wrong"
  task3:
    name: Recommended Action
    difficulty: hard
    output_field: action
    labels:
      - ROLLBACK
      - SCALE_UP
      - RESTART_SERVICE
      - FAILOVER
      - NOTIFY_VENDOR
      - INVESTIGATE
      - NO_ACTION
    reward: "0.99 exact | 0.4 safe investigate fallback | 0.25 related action | 0.01 wrong"

# Ticket counts: the per-task split (3 x 36) sums to total_tickets.
dataset:
  total_tickets: 108
  split:
    task1: 36
    task2: 36
    task3: 36

# Baseline run: the script, the environment variables it uses, and the most
# recent locally recorded result.
baseline:
  script: inference.py
  required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
  optional_env_vars: [ENV_URL]
  latest_local_score: 0.9855
  latest_local_episodes: 108

# Settings that keep runs repeatable.
reproducibility:
  inference_temperature: 0.0
  max_steps_per_episode: 1
  dataset_order: fixed TICKETS list order in incidents.py
  baseline_selection: deterministic ticket_id-driven evaluation across all tickets
  default_reset_seed: 42
  reset_without_ticket_id: deterministic fixed-seed selection within the requested task pool