# HPCOpenenv / training / agent_prompt.py
# huggingmenfordays's picture
# deploy: ccyloopss/HPCOpenenv — with OPENENV_API_KEY auth guard
# bc35a94
from __future__ import annotations
import re
from typing import Iterable
# Captures the body of a <bash>...</bash> action block (non-greedy). DOTALL lets
# "." span newlines, so the tags may sit on their own lines. Tag match is
# case-sensitive.
BASH_BLOCK_RE = re.compile(
    r"<bash>(.*?)</bash>",
    flags=re.DOTALL,
)

# Matches the episode-terminating marker in any letter case, with or without
# the self-closing slash: <done>, <done/>, <done />.
FINAL_ANSWER_RE = re.compile(
    r"<done\s*/?>",
    flags=re.IGNORECASE,
)
# System-role message content (see render_messages). The text describes the
# sandbox topology, the available tool stubs, the one-command-per-turn
# <bash>...</bash> action grammar, the <done/> terminator, typical remediation
# loops, and a worked example. The <bash> and <done/> tags described here are
# the ones BASH_BLOCK_RE and FINAL_ANSWER_RE match.
SYSTEM_PROMPT = """you are an hpc cluster sre agent inside a deterministic rocky linux sandbox.
the site is rocky-hpc. there is a login node and one compute node compute-01. a shared
nfs volume at /mnt/shared holds slurm_state.json which the sinfo, squeue, systemctl and
scontrol stubs read under fcntl locks. open ondemand runs on http://localhost:8080 and
returns 502 while the fault is active and 200 once the fault clears. a secondary apache
portal runs on http://localhost:8081 for the open ondemand dashboard.
to move between nodes use the ssh stub for example ssh compute-01 then run diagnostics
there. uname -n and hostname reflect the current node. the login node has no access to
/nodes directly you must ssh into the compute node to edit its files.
tooling you have access to includes sinfo, squeue, scontrol, systemctl, ssh, curl,
nvidia-smi (compute-01 only), mount, umount, apachectl (login node), chmod, rm, cat,
grep, printf, and ls. the nvidia-smi stub supports -q -d ECC for ecc counters and
-r -i 0 to reset a gpu. apachectl supports configtest, graceful, restart, status.
mount and umount manipulate the /mnt/shared nfs bind.
you act one shell command per turn. emit exactly one fenced action per reply using the
grammar below. nothing else in the reply is executed.
<bash>
single shell command here
</bash>
rules
- no multi line heredocs. if you need to write a file use printf or echo with > or tee
- no chained commands with && or ; keep a single atomic action
- you may use pipes within one command
- end the episode with <done/> after you have verified the fault is fully cleared
typical remediation loops
1 route/munge/pid: sinfo, ssh compute-01, inspect route-eth0 or munge.key or /var/run/slurmd.pid,
repair, systemctl restart munge, systemctl restart slurmd, exit, curl -I :8080
2 gpu ecc: sinfo, ssh compute-01, nvidia-smi -q -d ECC, nvidia-smi -r -i 0, exit, curl -I :8080
3 nfs stale: sinfo, ssh compute-01, mount, umount -l /mnt/shared, mount /mnt/shared,
systemctl restart slurmd, exit, curl -I :8080
4 apache ood: systemctl status httpd, cat /etc/httpd/conf/httpd.conf, apachectl configtest,
printf '...' > /etc/httpd/conf/httpd.conf, apachectl graceful, curl -I :8081
example interaction (stale pid scenario):
observation: login node ready scenario hpc_pid_stale ood :8080 max_steps 90
response:
<bash>sinfo</bash>
observation: PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
compute up infinite 1 drain compute-01
response:
<bash>ssh compute-01</bash>
observation: [root@compute-01 ~]$
response:
<bash>ls /var/run/slurmd.pid</bash>
observation: /var/run/slurmd.pid
response:
<bash>rm /var/run/slurmd.pid</bash>
observation: [command executed no output]
response:
<bash>systemctl restart slurmd</bash>
"""
# User-role preamble: states the incident, lists the candidate fault types and
# restates the one-action-per-turn format. render_messages appends the current
# observation after this text.
USER_PROMPT = """incident: the cluster or its open ondemand portals are degraded. diagnose the
root cause and restore service. the fault may be a broken route file, a bad munge key
mode, a stale slurmd pid, a gpu ecc error, a stale nfs mount, or an apache httpd.conf
typo. respond with one bash action at a time in the required grammar. keep actions short.
do not narrate.
"""
def render_messages(observation: str) -> list[dict[str, str]]:
    """Build the two-message chat (system, then user) for one agent turn.

    Args:
        observation: Text of the latest environment observation.

    Returns:
        A list with the system message carrying ``SYSTEM_PROMPT`` and a user
        message made of ``USER_PROMPT``, a blank-line separator, the
        ``current observation:`` label and the observation itself.
    """
    user_content = f"{USER_PROMPT}\n\ncurrent observation:\n{observation}"
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": user_content}
    return [system_message, user_message]
def parse_action(completion: str) -> tuple[str | None, bool]:
    """Extract the first bash action and the done flag from a model reply.

    Args:
        completion: Raw text generated by the model for one turn.

    Returns:
        A ``(command, done)`` pair. ``command`` is the whitespace-stripped
        body of the first ``<bash>...</bash>`` block, or ``None`` when the
        reply has no block or the block is empty/whitespace-only. ``done`` is
        ``True`` when ``FINAL_ANSWER_RE`` matches anywhere in the reply.
    """
    match = BASH_BLOCK_RE.search(completion)
    # Collapse an empty block to None so callers checking ``is None`` never
    # execute a blank command; iter_actions skips empty blocks the same way.
    command = (match.group(1).strip() or None) if match else None
    done = FINAL_ANSWER_RE.search(completion) is not None
    return command, done
def iter_actions(completion: str) -> Iterable[str]:
    """Lazily yield every non-empty bash action found in a model reply.

    Each ``<bash>...</bash>`` block is taken in order of appearance; its body
    is stripped of surrounding whitespace and skipped if nothing remains.
    """
    stripped_bodies = (
        found.group(1).strip() for found in BASH_BLOCK_RE.finditer(completion)
    )
    yield from (body for body in stripped_bodies if body)