# HPCOpenenv / openenv.yaml
# huggingmenfordays's picture
# deploy: ccyloopss/HPCOpenenv — with OPENENV_API_KEY auth guard
# bc35a94
# environment name, version string, and one-line summary
name: sysadmin-env
# quoted so the version stays a string instead of being type-inferred
version: '0.2.0'
description: >-
  reinforcement learning environment for linux server
  auto remediation
# runtime settings: python version, entry points, and the http/websocket
# routes exposed by the deployed server.
# NOTE(review): nesting of every key below under `runtime` is
# reconstructed — confirm against the consumer's schema.
runtime:
  # quoted so "3.11" is read as a string, not a float
  python: "3.11"
  entry_point: inference.py
  # "module:attribute" form; the colon is not followed by a space, so the
  # value stays a single plain scalar
  server_entry_point: server.app:app
  live_url: https://huggingmenfordays-enterprise-hpc-openenv.hf.space
  reset_endpoint: /reset
  step_endpoint: /step
  state_endpoint: /state
  websocket_endpoint: /ws
  healthcheck_endpoint: /health
  tasks_endpoint: /tasks
# compute resources requested for the environment
resources:
  vcpus: 2
  memory_gb: 8
  # plain `none` is the string "none", not a yaml null
  gpu: none
  # NOTE(review): 20 minutes (1200 s) is shorter than
  # evaluation.max_total_runtime_seconds (4200) — confirm which scope
  # this limit applies to.
  max_runtime_minutes: 20
# task catalogue: each entry carries an id, a tier (warmup or hpc), a
# difficulty label, a description, and max_steps / time_limit_seconds
# budgets.
tasks:
  # warm-up curriculum tier (round 1 legacy): single-app remediations
  # used as a difficulty ramp so a freshly initialized policy can
  # accumulate non-zero reward before the multi-app hpc scenarios kick
  # in. not the story of the round 2 submission.
  - id: nginx_crash
    tier: warmup
    difficulty: easy
    description: nginx crash with stale pid and config syntax error (warm-up tier)
    max_steps: 40
    time_limit_seconds: 300
  - id: disk_full
    tier: warmup
    difficulty: medium
    description: hidden sparse log file filling a loopback mount (warm-up tier)
    max_steps: 55
    time_limit_seconds: 420
  - id: network_broken
    tier: warmup
    difficulty: hard
    description: broken network namespace with corrupted routing tables (warm-up tier)
    max_steps: 70
    time_limit_seconds: 480
  # round 2 hpc tier: multi-app enterprise incident response scenarios.
  # this is the tier the grpo trainer samples from by default and the
  # tier judges should score on for theme #3.1 (scaler ai labs multi-app
  # rl environment for enterprise workflows).
  - id: hpc_outage
    tier: hpc
    difficulty: hard
    description: multi node hpc cluster outage with drained compute and broken ood portal
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_munge
    tier: hpc
    difficulty: hard
    description: compute node draining due to a munge key permission fault and broken route
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_pid_stale
    tier: hpc
    difficulty: hard
    description: slurmd refuses to restart after reboot because a stale pid file is still on disk
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_gpu_ecc
    tier: hpc
    difficulty: hard
    description: compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_nfs_stale
    tier: hpc
    difficulty: hard
    description: compute node drained because the nfs share at /mnt/shared reports stale file handle
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_ood_apache
    tier: hpc
    difficulty: medium
    # quoted: the value contains " :", which is easy to misread as a
    # mapping indicator
    description: "open ondemand apache portal on :8081 returns 500 due to a one character typo in httpd.conf"
    max_steps: 80
    time_limit_seconds: 540
# evaluation settings: protocol, overall runtime budget, and task order
evaluation:
  protocol: sequential
  # NOTE(review): the per-task time_limit_seconds values listed under
  # `tasks` add up to 4740 s, which exceeds this 4200 s budget — confirm
  # that is intended.
  max_total_runtime_seconds: 4200
  # lists all nine task ids: the three warm-up tasks first, then the six
  # hpc tasks
  tasks_order:
    - nginx_crash
    - disk_full
    - network_broken
    - hpc_outage
    - hpc_munge
    - hpc_pid_stale
    - hpc_gpu_ecc
    - hpc_nfs_stale
    - hpc_ood_apache