File size: 3,059 Bytes
bc35a94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# manifest identity: environment name, manifest version and a one-line
# summary of what the environment is for.
name: sysadmin-env
# kept as a quoted string
version: "0.2.0"
description: reinforcement learning environment for linux server auto remediation

# runtime settings: interpreter version, entry points, the deployed url
# and the endpoint paths declared for the environment server.
runtime:
  # quoted so yaml keeps "3.11" as a string instead of reading a float
  python: "3.11"
  entry_point: inference.py
  # NOTE(review): looks like a "module:attribute" application reference
  # - confirm against whatever launches the server
  server_entry_point: server.app:app
  # NOTE(review): presumably the public deployment that serves the
  # endpoint paths below - confirm
  live_url: https://huggingmenfordays-enterprise-hpc-openenv.hf.space
  # endpoint paths; all are absolute paths starting with "/"
  reset_endpoint: /reset
  step_endpoint: /step
  state_endpoint: /state
  websocket_endpoint: /ws
  healthcheck_endpoint: /health
  tasks_endpoint: /tasks

# resource figures declared by this manifest.
resources:
  vcpus: 2
  memory_gb: 8
  # the plain scalar none is the string "none", not a yaml null
  gpu: none
  # NOTE(review): 20 minutes is shorter than
  # evaluation.max_total_runtime_seconds - confirm the two limits apply
  # to different scopes
  max_runtime_minutes: 20

# task catalogue. every entry carries the same six keys: id, tier,
# difficulty, description, max_steps and time_limit_seconds.
# NOTE(review): max_steps / time_limit_seconds are presumably per-episode
# caps enforced by the environment server - confirm against the loader.
tasks:
  # warm-up curriculum tier (round 1 legacy): single-app remediations
  # that act as a difficulty ramp, so a freshly initialized policy can
  # accumulate non-zero reward before the multi-app hpc scenarios start.
  # these tasks are not the focus of the round 2 submission.
  # budgets grow with difficulty: 40/55/70 steps and 300/420/480 seconds.
  - id: nginx_crash
    tier: warmup
    difficulty: easy
    description: nginx crash with stale pid and config syntax error (warm-up tier)
    max_steps: 40
    time_limit_seconds: 300

  - id: disk_full
    tier: warmup
    difficulty: medium
    description: hidden sparse log file filling a loopback mount (warm-up tier)
    max_steps: 55
    time_limit_seconds: 420

  - id: network_broken
    tier: warmup
    difficulty: hard
    description: broken network namespace with corrupted routing tables (warm-up tier)
    max_steps: 70
    time_limit_seconds: 480

  # round 2 hpc tier: multi-app enterprise incident response scenarios.
  # this is the tier the grpo trainer samples from by default, and the
  # tier judges should score for theme #3.1 (scaler ai labs multi-app
  # rl environment for enterprise workflows).
  # all hard hpc tasks share a budget of 90 steps / 600 seconds; the one
  # medium task (hpc_ood_apache) uses 80 steps / 540 seconds.
  - id: hpc_outage
    tier: hpc
    difficulty: hard
    description: multi node hpc cluster outage with drained compute and broken ood portal
    max_steps: 90
    time_limit_seconds: 600

  - id: hpc_munge
    tier: hpc
    difficulty: hard
    description: compute node draining due to a munge key permission fault and broken route
    max_steps: 90
    time_limit_seconds: 600

  - id: hpc_pid_stale
    tier: hpc
    difficulty: hard
    description: slurmd refuses to restart after reboot because a stale pid file is still on disk
    max_steps: 90
    time_limit_seconds: 600

  - id: hpc_gpu_ecc
    tier: hpc
    difficulty: hard
    description: compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors
    max_steps: 90
    time_limit_seconds: 600

  - id: hpc_nfs_stale
    tier: hpc
    difficulty: hard
    description: compute node drained because the nfs share at /mnt/shared reports stale file handle
    max_steps: 90
    time_limit_seconds: 600

  # the ":8081" below is safe in a plain scalar because the colon is not
  # followed by a space.
  - id: hpc_ood_apache
    tier: hpc
    difficulty: medium
    description: open ondemand apache portal on :8081 returns 500 due to a one character typo in httpd.conf
    max_steps: 80
    time_limit_seconds: 540

# evaluation plan: protocol, overall time budget and the order in which
# the tasks are listed for a run.
evaluation:
  protocol: sequential
  # overall budget = sum of time_limit_seconds over every task named in
  # tasks_order: 300 + 420 + 480 + (5 * 600) + 540 = 4740.
  # the earlier value of 4200 matched the list without hpc_ood_apache
  # (540 s), so a full sequential run could exceed the budget.
  max_total_runtime_seconds: 4740
  # warm-up tier first, then the hpc tier; every id here corresponds to
  # an entry under tasks.
  tasks_order:
    - nginx_crash
    - disk_full
    - network_broken
    - hpc_outage
    - hpc_munge
    - hpc_pid_stale
    - hpc_gpu_ecc
    - hpc_nfs_stale
    - hpc_ood_apache