Spaces:

XcodeAddy
/

incident-triage-env

Running

App Files Files Community

incident-triage-env / openenv.yaml

XcodeAddy

Keep grader rewards strictly within unit interval

18aa055 about 1 month ago

raw

history blame contribute delete

2.74 kB

	# Manifest header: identity and serving settings for this environment.
	spec_version: 1
	name: incident-triage-env
	type: space
	runtime: fastapi
	# NOTE(review): presumably a "module:attribute" import string for the
	# FastAPI application object - confirm against the runtime's loader.
	app: app:app
	port: 7860
	# Quoted so the version stays a string rather than being retyped.
	version: "1.0.0"
	tags: [openenv]
	# Folded scalar: the indented lines below are joined into one paragraph.
	description: >
	  Production incident triage environment for evaluating agents on realistic
	  SRE workflows. The agent receives a typed incident observation and must
	  classify severity, identify the most likely root cause, or recommend the
	  best immediate remediation action.

	# HTTP API exposed by the environment: one entry per endpoint, each with
	# its method, path, optional params/body schema, and a short description
	# of what it returns.
	api:
	  # NOTE(review): 0.0.0.0 is a bind-all address; clients will likely need
	  # the host's reachable address instead - confirm how consumers use this.
	  base_url: http://0.0.0.0:7860
	  endpoints:
	    health:
	      method: GET
	      path: /health
	      returns: health status

	    metadata:
	      method: GET
	      path: /metadata
	      returns: task metadata and dataset summary

	    # Starts an episode; every body field is optional.
	    reset:
	      method: POST
	      path: /reset
	      body:
	        task_type:
	          type: string
	          required: false
	          enum: [task1, task2, task3]
	        ticket_id:
	          type: string
	          required: false
	        seed:
	          type: integer
	          required: false
	      returns: StepResult with initial observation and session_id in info

	    # Submits an action for the session identified by session_id.
	    step:
	      method: POST
	      path: /step
	      params:
	        session_id:
	          type: string
	          required: true
	      body: IncidentAction
	      returns: StepResult with reward object, done flag, and episode info

	    state:
	      method: GET
	      path: /state
	      params:
	        session_id:
	          type: string
	          required: true
	      returns: IncidentState

	# Task catalogue. Each task names the action field the agent must fill
	# (output_field), the allowed labels, and a summary of the reward tiers.
	# All listed reward values lie strictly between 0 and 1.
	tasks:
	  task1:
	    name: Severity Classification
	    difficulty: easy
	    output_field: severity
	    labels: [SEV1, SEV2, SEV3]
	    # Single-quoted: "|" is literal text here, and "\|" is not a valid
	    # escape sequence inside a double-quoted YAML scalar.
	    reward: '0.99 exact | 0.5 adjacent severity | 0.01 far miss'

	  task2:
	    name: Root Cause Classification
	    difficulty: medium
	    output_field: root_cause
	    labels: [DATABASE, NETWORK, APPLICATION, INFRASTRUCTURE, THIRD_PARTY, UNKNOWN]
	    reward: '0.99 exact | 0.5 related domain | 0.25 UNKNOWN fallback | 0.01 wrong'

	  task3:
	    name: Recommended Action
	    difficulty: hard
	    output_field: action
	    labels: [ROLLBACK, SCALE_UP, RESTART_SERVICE, FAILOVER, NOTIFY_VENDOR, INVESTIGATE, NO_ACTION]
	    reward: '0.99 exact | 0.4 safe investigate fallback | 0.25 related action | 0.01 wrong'

	# Dataset size. The per-task counts under split add up to total_tickets
	# (36 + 36 + 36 = 108).
	dataset:
	  total_tickets: 108
	  split:
	    task1: 36
	    task2: 36
	    task3: 36

	# Baseline run: the script, the environment variables it reads, and the
	# most recent locally recorded result.
	baseline:
	  script: inference.py
	  required_env_vars: [API_BASE_URL, MODEL_NAME, HF_TOKEN]
	  optional_env_vars: [ENV_URL]
	  latest_local_score: 0.9855
	  latest_local_episodes: 108

	# Settings and conventions recorded so that evaluation runs are repeatable.
	reproducibility:
	  inference_temperature: 0.0
	  max_steps_per_episode: 1
	  dataset_order: fixed TICKETS list order in incidents.py
	  baseline_selection: deterministic ticket_id-driven evaluation across all tickets
	  default_reset_seed: 42
	  reset_without_ticket_id: deterministic fixed-seed selection within the requested task pool