File size: 2,735 Bytes
250ab26
35ea9cd
250ab26
 
9347ce5
 
250ab26
35ea9cd
250ab26
35ea9cd
 
 
 
250ab26
 
9347ce5
250ab26
35ea9cd
 
 
 
 
 
 
 
 
 
250ab26
 
 
35ea9cd
250ab26
 
 
 
35ea9cd
 
 
 
 
 
 
250ab26
 
 
 
 
 
 
 
 
35ea9cd
250ab26
 
 
 
 
 
 
 
35ea9cd
250ab26
 
 
 
35ea9cd
250ab26
 
18aa055
250ab26
 
 
35ea9cd
250ab26
 
18aa055
250ab26
 
 
35ea9cd
250ab26
 
18aa055
250ab26
 
4b84bac
250ab26
4b84bac
 
 
35ea9cd
 
 
 
 
18aa055
4b84bac
250ab26
 
35ea9cd
 
 
 
b6d1ff0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Manifest identity and runtime settings for the environment.
spec_version: 1
name: incident-triage-env
type: space
runtime: fastapi
# NOTE(review): presumably a module:attribute import path for the web app —
# confirm against whatever loads this manifest.
app: 'app:app'
port: 7860
# Quoted so the version stays a string.
version: '1.0.0'
tags:
  - openenv
# Folded scalar: the line breaks below are joined with spaces when parsed.
description: >
  Production incident triage environment for evaluating agents on
  realistic SRE workflows. The agent receives a typed incident
  observation and must classify severity, identify the most likely
  root cause, or recommend the best immediate remediation action.

# HTTP API of the environment: one entry per endpoint listing its method,
# path, inputs (query `params` and/or request `body`) and what it returns.
api:
  # Loopback address on the same port as the top-level `port` key.
  # 0.0.0.0 is a listen/bind wildcard, not a destination clients can
  # portably connect to, so it must not be advertised as the base URL.
  base_url: http://127.0.0.1:7860
  endpoints:
    health:
      method: GET
      path: /health
      returns: health status

    metadata:
      method: GET
      path: /metadata
      returns: task metadata and dataset summary

    # Every body field is optional (`required: false`).
    reset:
      method: POST
      path: /reset
      body:
        task_type:
          type: string
          required: false
          enum: [task1, task2, task3]
        ticket_id:
          type: string
          required: false
        seed:
          type: integer
          required: false
      returns: StepResult with initial observation and session_id in info

    # `session_id` is mandatory here; per the `reset` entry it is delivered
    # in the `info` of the reset result.
    step:
      method: POST
      path: /step
      params:
        session_id:
          type: string
          required: true
      # NOTE(review): IncidentAction is presumably a schema defined by the
      # application code; its fields are not described in this file.
      body: IncidentAction
      returns: StepResult with reward object, done flag, and episode info

    state:
      method: GET
      path: /state
      params:
        session_id:
          type: string
          required: true
      returns: IncidentState

# Task catalogue. Each task names the output field the agent fills in, the
# labels allowed for that field, and a textual summary of the reward tiers.
tasks:
  task1:
    name: Severity Classification
    difficulty: easy
    output_field: severity
    labels:
      - SEV1
      - SEV2
      - SEV3
    reward: '0.99 exact | 0.5 adjacent severity | 0.01 far miss'

  task2:
    name: Root Cause Classification
    difficulty: medium
    output_field: root_cause
    labels:
      - DATABASE
      - NETWORK
      - APPLICATION
      - INFRASTRUCTURE
      - THIRD_PARTY
      - UNKNOWN
    reward: '0.99 exact | 0.5 related domain | 0.25 UNKNOWN fallback | 0.01 wrong'

  task3:
    name: Recommended Action
    difficulty: hard
    output_field: action
    labels:
      - ROLLBACK
      - SCALE_UP
      - RESTART_SERVICE
      - FAILOVER
      - NOTIFY_VENDOR
      - INVESTIGATE
      - NO_ACTION
    reward: '0.99 exact | 0.4 safe investigate fallback | 0.25 related action | 0.01 wrong'

# Dataset size. The three per-task counts under `split` add up to
# `total_tickets` (36 + 36 + 36 = 108); keep them in sync when editing.
dataset:
  total_tickets: 108
  split:
    task1: 36
    task2: 36
    task3: 36

# Baseline run: the script to execute, the environment variables it lists as
# required and optional, and the most recent locally recorded result.
baseline:
  script: inference.py
  required_env_vars:
    - API_BASE_URL
    - MODEL_NAME
    - HF_TOKEN
  optional_env_vars:
    - ENV_URL
  latest_local_score: 0.9855
  # Same count as `dataset.total_tickets`.
  latest_local_episodes: 108

# Settings recorded so that runs can be repeated. The free-text entries are
# descriptive strings, not machine-readable switches.
reproducibility:
  inference_temperature: 0.0
  max_steps_per_episode: 1
  dataset_order: 'fixed TICKETS list order in incidents.py'
  baseline_selection: >-
    deterministic ticket_id-driven evaluation across all tickets
  default_reset_seed: 42
  reset_without_ticket_id: >-
    deterministic fixed-seed selection within the requested task pool