Spaces:

ccyloopss
/

HPCOpenenv

Paused

App Files Files Community

HPCOpenenv / openenv.yaml

huggingmenfordays

deploy: ccyloopss/HPCOpenenv — with OPENENV_API_KEY auth guard

bc35a94 18 days ago

raw

history blame contribute delete

3.06 kB

	# identity of the environment described by this openenv manifest.
	name: sysadmin-env
	# kept quoted so the version always loads as a string.
	version: "0.2.0"
	# one-line summary of what the environment is for.
	description: reinforcement learning environment for linux server auto remediation

	# how the environment is launched and which routes it exposes.
	# the keys below are children of `runtime`; they must be indented under it,
	# otherwise `runtime` loads as null and these settings leak to the top level.
	runtime:
	  # quoted so the interpreter version stays the string "3.11", not a float.
	  python: "3.11"
	  entry_point: inference.py
	  # presumably a module:attribute import path for the server app; verify
	  # against the server package.
	  server_entry_point: server.app:app
	  # NOTE(review): confirm this url points at the space this manifest is
	  # actually deployed to.
	  live_url: https://huggingmenfordays-enterprise-hpc-openenv.hf.space
	  # route paths for the environment api; presumably relative to live_url.
	  reset_endpoint: /reset
	  step_endpoint: /step
	  state_endpoint: /state
	  websocket_endpoint: /ws
	  healthcheck_endpoint: /health
	  tasks_endpoint: /tasks

	# compute budget requested for the environment.
	# the keys below are children of `resources` and must be indented under it.
	resources:
	  vcpus: 2
	  memory_gb: 8
	  # plain scalar `none` loads as the string "none", not as a yaml null.
	  gpu: none
	  # NOTE(review): 20 minutes is 1200 s, while
	  # evaluation.max_total_runtime_seconds is 4200 s; confirm which scope
	  # this cap applies to.
	  max_runtime_minutes: 20

	# catalogue of scenarios. each list item is one mapping with the fields
	# id, tier, difficulty, description, max_steps and time_limit_seconds, so
	# every field must be indented under its own `- id:` entry.
	tasks:
	  # warm-up curriculum tier (round 1 legacy): single-app remediations
	  # used as a difficulty ramp so a freshly initialized policy can
	  # accumulate non-zero reward before the multi-app hpc scenarios kick
	  # in. this tier is not the focus of the round 2 submission.
	  - id: nginx_crash
	    tier: warmup
	    difficulty: easy
	    description: nginx crash with stale pid and config syntax error (warm-up tier)
	    max_steps: 40
	    time_limit_seconds: 300

	  - id: disk_full
	    tier: warmup
	    difficulty: medium
	    description: hidden sparse log file filling a loopback mount (warm-up tier)
	    max_steps: 55
	    time_limit_seconds: 420

	  - id: network_broken
	    tier: warmup
	    difficulty: hard
	    description: broken network namespace with corrupted routing tables (warm-up tier)
	    max_steps: 70
	    time_limit_seconds: 480

	  # round 2 hpc tier: multi-app enterprise incident response scenarios.
	  # this is the tier the grpo trainer samples from by default and the
	  # tier judges should score on for theme #3.1 (scaler ai labs multi-app
	  # rl environment for enterprise workflows).
	  - id: hpc_outage
	    tier: hpc
	    difficulty: hard
	    description: multi node hpc cluster outage with drained compute and broken ood portal
	    max_steps: 90
	    time_limit_seconds: 600

	  - id: hpc_munge
	    tier: hpc
	    difficulty: hard
	    description: compute node draining due to a munge key permission fault and broken route
	    max_steps: 90
	    time_limit_seconds: 600

	  - id: hpc_pid_stale
	    tier: hpc
	    difficulty: hard
	    description: slurmd refuses to restart after reboot because a stale pid file is still on disk
	    max_steps: 90
	    time_limit_seconds: 600

	  - id: hpc_gpu_ecc
	    tier: hpc
	    difficulty: hard
	    description: compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors
	    max_steps: 90
	    time_limit_seconds: 600

	  - id: hpc_nfs_stale
	    tier: hpc
	    difficulty: hard
	    description: compute node drained because the nfs share at /mnt/shared reports stale file handle
	    max_steps: 90
	    time_limit_seconds: 600

	  - id: hpc_ood_apache
	    tier: hpc
	    difficulty: medium
	    description: open ondemand apache portal on :8081 returns 500 due to a one character typo in httpd.conf
	    max_steps: 80
	    time_limit_seconds: 540

	# how a full evaluation run walks the task catalogue.
	# the keys below are children of `evaluation`, and the task ids are items
	# of `tasks_order`; both levels must be indented to load as intended.
	evaluation:
	  protocol: sequential
	  # NOTE(review): the per-task time_limit_seconds values under `tasks` sum
	  # to 4740 s, which is more than this 4200 s budget; confirm that tasks
	  # are expected to finish before their individual limits.
	  max_total_runtime_seconds: 4200
	  # every id listed here matches an `id` declared under `tasks`:
	  # the three warm-up tasks first, then the six hpc tasks.
	  tasks_order:
	    - nginx_crash
	    - disk_full
	    - network_broken
	    - hpc_outage
	    - hpc_munge
	    - hpc_pid_stale
	    - hpc_gpu_ecc
	    - hpc_nfs_stale
	    - hpc_ood_apache