# incident-triage-env / openenv.yaml
# XcodeAddy: Keep grader rewards strictly within unit interval (commit 18aa055)
# Manifest header: identity and launch settings for the environment.
spec_version: 1
name: incident-triage-env
# NOTE(review): "space" presumably refers to a hosted Space deployment — confirm.
type: space
runtime: fastapi
# Presumably a "module:attribute" import string for the FastAPI app — confirm.
app: 'app:app'
# Same port number that appears in api.base_url.
port: 7860
# Quoted so the version stays a string.
version: '1.0.0'
tags:
  - openenv
# Human-readable summary of the environment. Folded scalar (`>`): the indented
# lines below are joined with single spaces into one paragraph. The body MUST
# be indented deeper than the key, otherwise the scalar is empty and the prose
# lines are a syntax error.
description: >
  Production incident triage environment for evaluating agents on realistic
  SRE workflows. The agent receives a typed incident observation and must
  classify severity, identify the most likely root cause, or recommend the
  best immediate remediation action.
# HTTP API surface of the environment. Each entry under `endpoints` describes
# one route: its HTTP method, path, inputs (`params` / `body`) and what it
# returns.
api:
  # NOTE(review): 0.0.0.0 is normally a listen address rather than a client
  # destination — confirm consumers substitute the real host.
  base_url: http://0.0.0.0:7860
  endpoints:
    health:
      method: GET
      path: /health
      returns: health status
    metadata:
      method: GET
      path: /metadata
      returns: task metadata and dataset summary
    # Starts an episode. Every body field is optional.
    reset:
      method: POST
      path: /reset
      body:
        task_type:
          type: string
          required: false
          # Values match the keys of the top-level `tasks` mapping.
          enum: [task1, task2, task3]
        ticket_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
      returns: StepResult with initial observation and session_id in info
    # Submits the agent's answer for the session identified by session_id.
    step:
      method: POST
      path: /step
      params:
        session_id:
          type: string
          required: true
      body: IncidentAction
      returns: StepResult with reward object, done flag, and episode info
    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      returns: IncidentState
# Task catalogue. Each task names the action field the agent must fill
# (`output_field`), the allowed label set, and a human-readable summary of the
# reward tiers. All listed reward values lie strictly between 0 and 1
# (highest 0.99, lowest 0.01).
tasks:
  task1:
    name: Severity Classification
    difficulty: easy
    output_field: severity
    labels:
      - SEV1
      - SEV2
      - SEV3
    reward: "0.99 exact | 0.5 adjacent severity | 0.01 far miss"
  task2:
    name: Root Cause Classification
    difficulty: medium
    output_field: root_cause
    labels:
      - DATABASE
      - NETWORK
      - APPLICATION
      - INFRASTRUCTURE
      - THIRD_PARTY
      - UNKNOWN
    reward: "0.99 exact | 0.5 related domain | 0.25 UNKNOWN fallback | 0.01 wrong"
  task3:
    name: Recommended Action
    difficulty: hard
    output_field: action
    labels:
      - ROLLBACK
      - SCALE_UP
      - RESTART_SERVICE
      - FAILOVER
      - NOTIFY_VENDOR
      - INVESTIGATE
      - NO_ACTION
    reward: "0.99 exact | 0.4 safe investigate fallback | 0.25 related action | 0.01 wrong"
# Ticket counts. The three per-task pools under `split` add up to
# `total_tickets` (36 + 36 + 36 = 108).
dataset:
  total_tickets: 108
  split:
    task1: 36
    task2: 36
    task3: 36
# Reference baseline: the script that produces it, the environment variables
# it reads, and the most recent locally recorded result.
baseline:
  script: inference.py
  required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
  optional_env_vars: [ENV_URL]
  latest_local_score: 0.9855
  # Equals dataset.total_tickets (108).
  latest_local_episodes: 108
# Settings and conventions that keep evaluation runs repeatable.
reproducibility:
  inference_temperature: 0.0
  max_steps_per_episode: 1
  dataset_order: fixed TICKETS list order in incidents.py
  baseline_selection: deterministic ticket_id-driven evaluation across all tickets
  default_reset_seed: 42
  reset_without_ticket_id: deterministic fixed-seed selection within the requested task pool