---
# Manifest for the incident-triage-env space: identity, runtime and
# a human-readable description.
spec_version: 1
name: incident-triage-env
type: space
runtime: fastapi
# NOTE(review): presumably a "module:attribute" entry point for the
# FastAPI application -- confirm against the runtime loader.
app: app:app
# Same port number that api.base_url uses.
port: 7860
# Quoted so the version stays a string.
version: "1.0.0"
tags: [openenv]
# Folded scalar: the indented lines below are joined into one paragraph.
description: >
  Production incident triage environment for evaluating agents on realistic
  SRE workflows. The agent receives a typed incident observation and must
  classify severity, identify the most likely root cause, or recommend the
  best immediate remediation action.
# HTTP API of the environment: one entry per endpoint, each with its
# method, path, inputs (body and/or params) and a summary of what it returns.
# NOTE(review): nesting reconstructed from key semantics -- confirm against
# the consumer's schema.
api:
  base_url: http://0.0.0.0:7860
  endpoints:
    health:
      method: GET
      path: /health
      returns: health status

    metadata:
      method: GET
      path: /metadata
      returns: task metadata and dataset summary

    reset:
      method: POST
      path: /reset
      # Every body field is optional (required: false).
      body:
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3]
        ticket_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
      returns: StepResult with initial observation and session_id in info

    step:
      method: POST
      path: /step
      params:
        session_id:
          type: string
          required: true
      body: IncidentAction
      returns: StepResult with reward object, done flag, and episode info

    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      returns: IncidentState
# Task catalogue: for each task, the action field the agent fills in, the
# allowed labels for that field, and a summary of the reward tiers.
tasks:
  task1:
    name: Severity Classification
    difficulty: easy
    output_field: severity
    labels: [SEV1, SEV2, SEV3]
    reward: "0.99 exact | 0.5 adjacent severity | 0.01 far miss"

  task2:
    name: Root Cause Classification
    difficulty: medium
    output_field: root_cause
    labels: [DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN]
    reward: "0.99 exact | 0.5 related domain | 0.25 UNKNOWN fallback | 0.01 wrong"

  task3:
    name: Recommended Action
    difficulty: hard
    output_field: action
    labels: [ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION]
    reward: "0.99 exact | 0.4 safe investigate fallback | 0.25 related action | 0.01 wrong"
# Dataset size; the three per-task counts under split add up to
# total_tickets (36 + 36 + 36 = 108).
dataset:
  total_tickets: 108
  split:
    task1: 36
    task2: 36
    task3: 36
# Baseline run: the script, the environment variables it uses, and the
# most recent locally recorded score and episode count.
baseline:
  script: inference.py
  required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
  optional_env_vars: [ENV_URL]
  latest_local_score: 0.9855
  latest_local_episodes: 108
# Settings that keep evaluation runs repeatable.
# NOTE(review): kept as a top-level section; confirm it is not meant to be
# nested under another key.
reproducibility:
  inference_temperature: 0.0
  max_steps_per_episode: 1
  dataset_order: fixed TICKETS list order in incidents.py
  baseline_selection: deterministic ticket_id-driven evaluation across all tickets
  default_reset_seed: 42
  reset_without_ticket_id: deterministic fixed-seed selection within the requested task pool