# Manifest for the incident-triage-env environment.
# NOTE(review): the meaning of each top-level key is defined by the tool that
# consumes this file; confirm against that tool's schema before editing.
spec_version: 1
name: incident-triage-env
type: space
runtime: fastapi
# Presumably a "<module>:<attribute>" application reference — TODO confirm.
app: app:app
# Same port number that api.base_url in this file points at.
port: 7860
# Kept quoted so the version is always read as a string.
version: "1.0.0"
tags: [openenv]
# Folded scalar (>): the wrapped lines below are joined with single spaces
# into one paragraph. Do not place comments inside the indented text; they
# would become part of the value.
description: >
  Production incident triage environment for evaluating agents on realistic
  SRE workflows. The agent receives a typed incident observation and must
  classify severity, identify the most likely root cause, or recommend the
  best immediate remediation action.

# HTTP API exposed by the environment. Each endpoint entry lists its HTTP
# method, its path, its inputs (params and/or body) and a short description
# of what it returns.
api:
  # NOTE(review): 0.0.0.0 is normally a listen/bind address rather than a
  # destination clients can reliably connect to. Confirm whether consumers
  # expect localhost/127.0.0.1 or a deployed URL here.
  base_url: http://0.0.0.0:7860
  endpoints:
    health:
      method: GET
      path: /health
      returns: health status

    metadata:
      method: GET
      path: /metadata
      returns: task metadata and dataset summary

    # Starts an episode. Every body field is optional (required: false).
    reset:
      method: POST
      path: /reset
      body:
        # Allowed values match the task keys under `tasks` in this file.
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3]
        ticket_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
      returns: StepResult with initial observation and session_id in info

    # Submits an action for an existing session.
    step:
      method: POST
      path: /step
      # Presumably sent as query parameters — TODO confirm against the server.
      params:
        # The /reset `returns` text says session_id is reported in `info`.
        session_id:
          type: string
          required: true
      # IncidentAction is a type name; its fields are not defined in this file.
      body: IncidentAction
      returns: StepResult with reward object, done flag, and episode info

    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      # IncidentState is a type name; its fields are not defined in this file.
      returns: IncidentState

# Task catalogue. The keys (task1..task3) are the same values listed in the
# /reset `task_type` enum. Each task names the output field the agent fills
# in, the labels allowed for it, and a one-line summary of the reward tiers.
tasks:
  # Easy: pick one of three severity levels.
  task1:
    name: Severity Classification
    difficulty: easy
    output_field: severity
    labels:
      - SEV1
      - SEV2
      - SEV3
    reward: '0.99 exact | 0.5 adjacent severity | 0.01 far miss'

  # Medium: pick the root-cause category.
  task2:
    name: Root Cause Classification
    difficulty: medium
    output_field: root_cause
    labels:
      - DATABASE
      - NETWORK
      - APPLICATION
      - INFRASTRUCTURE
      - THIRD_PARTY
      - UNKNOWN
    reward: '0.99 exact | 0.5 related domain | 0.25 UNKNOWN fallback | 0.01 wrong'

  # Hard: pick the immediate remediation action.
  task3:
    name: Recommended Action
    difficulty: hard
    output_field: action
    labels:
      - ROLLBACK
      - SCALE_UP
      - RESTART_SERVICE
      - FAILOVER
      - NOTIFY_VENDOR
      - INVESTIGATE
      - NO_ACTION
    reward: '0.99 exact | 0.4 safe investigate fallback | 0.25 related action | 0.01 wrong'

# Dataset size summary. The three per-task counts under `split` add up to
# total_tickets (36 + 36 + 36 = 108); keep them in sync when tickets change.
dataset:
  total_tickets: 108
  split:
    task1: 36
    task2: 36
    task3: 36

# Baseline run description: the script to execute, the environment variables
# it needs, and the most recently recorded local result.
baseline:
  script: inference.py
  required_env_vars:
    - API_BASE_URL
    - MODEL_NAME
    - HF_TOKEN
  optional_env_vars:
    - ENV_URL
  latest_local_score: 0.9855
  # Equal to dataset.total_tickets in this file (108).
  latest_local_episodes: 108

# Settings recorded so that baseline runs can be repeated. The free-text
# values describe behaviour implemented elsewhere (incidents.py and the
# server code are not part of this file).
reproducibility:
  inference_temperature: 0.0
  # Each episode consists of a single step.
  max_steps_per_episode: 1
  dataset_order: fixed TICKETS list order in incidents.py
  baseline_selection: deterministic ticket_id-driven evaluation across all tickets
  # Presumably the seed applied when /reset is called without `seed` —
  # TODO confirm against the server code.
  default_reset_seed: 42
  reset_without_ticket_id: deterministic fixed-seed selection within the requested task pool