File size: 4,002 Bytes
bc35a94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from __future__ import annotations

import re
from typing import Iterable

# Captures the body of a <bash>...</bash> action block. DOTALL lets "." span
# newlines so multi-line blocks match; the lazy ".*?" stops at the first
# closing tag so several blocks in one reply are matched separately.
BASH_BLOCK_RE = re.compile(
    r"<bash>(.*?)</bash>",
    flags=re.DOTALL,
)
# Matches the end-of-episode marker in any letter case, written as <done>,
# <done/> or <done />.
FINAL_ANSWER_RE = re.compile(
    r"<done\s*/?>",
    flags=re.IGNORECASE,
)

# System-role prompt that render_messages sends to the model. It describes the
# sandbox, lists the available command stubs, and defines the reply grammar:
# one <bash>...</bash> block per turn, with <done/> ending the episode.
# The text is runtime input to the model; editing it changes model behaviour.
# NOTE(review): the embedded example stops after `systemctl restart slurmd`
# and never shows the exit / curl verification / <done/> steps that the rules
# ask for -- confirm whether that truncation is intentional.
SYSTEM_PROMPT = """you are an hpc cluster sre agent inside a deterministic rocky linux sandbox.

the site is rocky-hpc. there is a login node and one compute node compute-01. a shared
nfs volume at /mnt/shared holds slurm_state.json which the sinfo, squeue, systemctl and
scontrol stubs read under fcntl locks. open ondemand runs on http://localhost:8080 and
returns 502 while the fault is active and 200 once the fault clears. a secondary apache
portal runs on http://localhost:8081 for the open ondemand dashboard.

to move between nodes use the ssh stub for example ssh compute-01 then run diagnostics
there. uname -n and hostname reflect the current node. the login node has no access to
/nodes directly you must ssh into the compute node to edit its files.

tooling you have access to includes sinfo, squeue, scontrol, systemctl, ssh, curl,
nvidia-smi (compute-01 only), mount, umount, apachectl (login node), chmod, rm, cat,
grep, printf, and ls. the nvidia-smi stub supports -q -d ECC for ecc counters and
-r -i 0 to reset a gpu. apachectl supports configtest, graceful, restart, status.
mount and umount manipulate the /mnt/shared nfs bind.

you act one shell command per turn. emit exactly one fenced action per reply using the
grammar below. nothing else in the reply is executed.

<bash>
single shell command here
</bash>

rules
- no multi line heredocs. if you need to write a file use printf or echo with > or tee
- no chained commands with && or ; keep a single atomic action
- you may use pipes within one command
- end the episode with <done/> after you have verified the fault is fully cleared

typical remediation loops
1 route/munge/pid: sinfo, ssh compute-01, inspect route-eth0 or munge.key or /var/run/slurmd.pid,
  repair, systemctl restart munge, systemctl restart slurmd, exit, curl -I :8080
2 gpu ecc: sinfo, ssh compute-01, nvidia-smi -q -d ECC, nvidia-smi -r -i 0, exit, curl -I :8080
3 nfs stale: sinfo, ssh compute-01, mount, umount -l /mnt/shared, mount /mnt/shared,
  systemctl restart slurmd, exit, curl -I :8080
4 apache ood: systemctl status httpd, cat /etc/httpd/conf/httpd.conf, apachectl configtest,
  printf '...' > /etc/httpd/conf/httpd.conf, apachectl graceful, curl -I :8081

example interaction (stale pid scenario):

observation: login node ready scenario hpc_pid_stale ood :8080 max_steps 90
response:
<bash>sinfo</bash>

observation: PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
compute   up   infinite  1    drain compute-01
response:
<bash>ssh compute-01</bash>

observation: [root@compute-01 ~]$
response:
<bash>ls /var/run/slurmd.pid</bash>

observation: /var/run/slurmd.pid
response:
<bash>rm /var/run/slurmd.pid</bash>

observation: [command executed no output]
response:
<bash>systemctl restart slurmd</bash>
"""

# User-role preamble; render_messages places it ahead of the current
# observation in the user message. It states the incident, enumerates the
# candidate fault types, and repeats the one-action-per-reply requirement.
USER_PROMPT = """incident: the cluster or its open ondemand portals are degraded. diagnose the
root cause and restore service. the fault may be a broken route file, a bad munge key
mode, a stale slurmd pid, a gpu ecc error, a stale nfs mount, or an apache httpd.conf
typo. respond with one bash action at a time in the required grammar. keep actions short.
do not narrate.
"""


def render_messages(observation: str) -> list[dict[str, str]]:
    """Build the two-message chat payload for a single turn.

    Args:
        observation: Latest environment output to show the model.

    Returns:
        A list holding a system message that carries ``SYSTEM_PROMPT`` and a
        user message made of ``USER_PROMPT`` followed by the observation under
        a ``current observation:`` label.
    """
    user_content = f"{USER_PROMPT}\n\ncurrent observation:\n{observation}"
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]


def parse_action(completion: str) -> tuple[str | None, bool]:
    """Extract the command to run and the end-of-episode flag from a reply.

    Args:
        completion: Raw model reply text.

    Returns:
        ``(command, done)``. ``command`` is the stripped body of the first
        non-empty ``<bash>`` block, or ``None`` when the reply contains no
        block with content. ``done`` is ``True`` when a ``<done>`` /
        ``<done/>`` marker appears anywhere in the reply.
    """
    # Skip blank blocks the same way iter_actions does, so an empty
    # <bash></bash> is reported as "no command" (None) instead of "".
    bodies = (m.group(1).strip() for m in BASH_BLOCK_RE.finditer(completion))
    command = next((body for body in bodies if body), None)
    done = FINAL_ANSWER_RE.search(completion) is not None
    return command, done


def iter_actions(completion: str) -> Iterable[str]:
    """Lazily yield the stripped body of every non-empty ``<bash>`` block.

    Args:
        completion: Raw model reply text.

    Yields:
        Each block body with surrounding whitespace removed, in order of
        appearance; blocks that are empty after stripping are skipped.
    """
    stripped_bodies = (
        found.group(1).strip() for found in BASH_BLOCK_RE.finditer(completion)
    )
    # filter(None, ...) drops the empty strings left by blank blocks.
    yield from filter(None, stripped_bodies)