# NOTE(review): the text that preceded the manifest in this copy ("Spaces:",
# "Paused", "File size: 3,059 Bytes", "bc35a94", and a 1-108 line-number
# gutter) looks like file-viewer page chrome rather than manifest data, so it
# is kept only in this comment. bc35a94 is presumably a revision id - confirm.
---
# environment identity
name: sysadmin-env
# quoted so the version is always read as a string
version: "0.2.0"
description: reinforcement learning environment for linux server auto remediation
# runtime: interpreter version, entry points, and service endpoints.
# NOTE(review): nesting was lost in this copy; every key between `runtime`
# and `resources` is assumed to belong under `runtime` - confirm against the
# consumer's schema.
runtime:
  # quoted so the version stays a string instead of being typed as a float
  python: "3.11"
  entry_point: inference.py
  server_entry_point: server.app:app
  live_url: https://huggingmenfordays-enterprise-hpc-openenv.hf.space
  # endpoint paths; presumably resolved against live_url - confirm
  reset_endpoint: /reset
  step_endpoint: /step
  state_endpoint: /state
  websocket_endpoint: /ws
  healthcheck_endpoint: /health
  tasks_endpoint: /tasks
# resources: compute budget declared for the environment.
# NOTE(review): nesting was lost in this copy; the four keys below are assumed
# to belong under `resources` - confirm against the consumer's schema.
resources:
  vcpus: 2
  memory_gb: 8
  # the string "none" (not YAML null); quoted to make that explicit
  gpu: "none"
  # NOTE(review): 20 min = 1200 s, while evaluation.max_total_runtime_seconds
  # is 4200 - confirm that these two limits bound different things.
  max_runtime_minutes: 20
# tasks: one mapping per scenario. every entry carries the same six keys:
# id, tier (warmup | hpc), difficulty (easy | medium | hard), description,
# max_steps, and time_limit_seconds.
# NOTE(review): max_steps and time_limit_seconds are presumably per-episode
# budgets - confirm against the consumer.
tasks:
  # warm-up curriculum tier (round 1 legacy): single-app remediations.
  # used as a difficulty ramp so a freshly initialized policy can
  # accumulate non-zero reward before the multi-app hpc scenarios kick
  # in. this tier is not the focus of the round 2 submission.
  - id: nginx_crash
    tier: warmup
    difficulty: easy
    description: nginx crash with stale pid and config syntax error (warm-up tier)
    max_steps: 40
    time_limit_seconds: 300
  - id: disk_full
    tier: warmup
    difficulty: medium
    description: hidden sparse log file filling a loopback mount (warm-up tier)
    max_steps: 55
    time_limit_seconds: 420
  - id: network_broken
    tier: warmup
    difficulty: hard
    description: broken network namespace with corrupted routing tables (warm-up tier)
    max_steps: 70
    time_limit_seconds: 480
  # round 2 hpc tier: multi-app enterprise incident response scenarios.
  # this is the tier the grpo trainer samples from by default and the
  # tier judges should score on for theme #3.1 (scaler ai labs multi-app
  # rl environment for enterprise workflows).
  - id: hpc_outage
    tier: hpc
    difficulty: hard
    description: multi node hpc cluster outage with drained compute and broken ood portal
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_munge
    tier: hpc
    difficulty: hard
    description: compute node draining due to a munge key permission fault and broken route
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_pid_stale
    tier: hpc
    difficulty: hard
    description: slurmd refuses to restart after reboot because a stale pid file is still on disk
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_gpu_ecc
    tier: hpc
    difficulty: hard
    description: compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_nfs_stale
    tier: hpc
    difficulty: hard
    description: compute node drained because the nfs share at /mnt/shared reports stale file handle
    max_steps: 90
    time_limit_seconds: 600
  - id: hpc_ood_apache
    tier: hpc
    difficulty: medium
    # quoted defensively because the text contains a colon (" :8081")
    description: 'open ondemand apache portal on :8081 returns 500 due to a one character typo in httpd.conf'
    max_steps: 80
    time_limit_seconds: 540
# evaluation: how the tasks are run when the environment is scored.
# NOTE(review): nesting was lost in this copy; the keys below are assumed to
# belong under `evaluation` - confirm against the consumer's schema.
evaluation:
  protocol: sequential
  # NOTE(review): the time_limit_seconds values under `tasks` sum to 4740 s,
  # which is more than this 4200 s cap - confirm the cap is intentional.
  max_total_runtime_seconds: 4200
  # lists all nine task ids, in the same order they are declared under `tasks`
  tasks_order:
    - nginx_crash
    - disk_full
    - network_broken
    - hpc_outage
    - hpc_munge
    - hpc_pid_stale
    - hpc_gpu_ecc
    - hpc_nfs_stale
    - hpc_ood_apache