"""Prompt templates and completion parsing for the HPC cluster SRE agent.

The module holds the system/user prompts sent to the model and the helpers
that turn a model completion back into shell commands and an end-of-episode
flag.
"""

from __future__ import annotations

import re
from typing import Iterable

# The action grammar needs literal delimiters.  Without them ``(.*?)`` matches
# the empty string at offset 0 and an empty pattern matches any text, so every
# completion parsed as ("", True) and no action was ever extracted.
# NOTE(review): the tag names are reconstructed -- SYSTEM_PROMPT below is kept
# in sync with them, but confirm against any other consumer of this grammar.
BASH_BLOCK_RE = re.compile(r"<bash>(.*?)</bash>", re.DOTALL)
FINAL_ANSWER_RE = re.compile(r"<final_answer\s*/?>", re.IGNORECASE)

# Built from adjacent literals so the source stays readable; apart from the
# restored grammar tokens the text (including its single embedded newline and
# trailing space) is unchanged.
SYSTEM_PROMPT = (
    "you are an hpc cluster sre agent inside a deterministic rocky linux "
    "sandbox. "
    "the site is rocky-hpc. "
    "there is a login node and one compute node compute-01. "
    "a shared nfs volume at /mnt/shared holds slurm_state.json which the "
    "sinfo, squeue, systemctl and scontrol stubs read under fcntl locks. "
    "open ondemand runs on http://localhost:8080 and returns 502 while the "
    "fault is active and 200 once the fault clears. "
    "a secondary apache portal runs on http://localhost:8081 for the open "
    "ondemand dashboard. "
    "to move between nodes use the ssh stub for example ssh compute-01 then "
    "run diagnostics there. "
    "uname -n and hostname reflect the current node. "
    "the login node has no access to /nodes directly you must ssh into the "
    "compute node to edit its files. "
    "tooling you have access to includes sinfo, squeue, scontrol, systemctl, "
    "ssh, curl, nvidia-smi (compute-01 only), mount, umount, apachectl "
    "(login node), chmod, rm, cat, grep, printf, and ls. "
    "the nvidia-smi stub supports -q -d ECC for ecc counters and -r -i 0 to "
    "reset a gpu. "
    "apachectl supports configtest, graceful, restart, status. "
    "mount and umount manipulate the /mnt/shared nfs bind. "
    "you act one shell command per turn. "
    "emit exactly one fenced action per reply using the grammar below. "
    "nothing else in the reply is executed. "
    "<bash>single shell command here</bash> "
    "rules "
    "- no multi line heredocs. \n"
    "if you need to write a file use printf or echo with > or tee "
    "- no chained commands with && or ; keep a single atomic action "
    "- you may use pipes within one command "
    "- end the episode with <final_answer/> after you have verified the "
    "fault is fully cleared "
    "typical remediation loops "
    "1 route/munge/pid: sinfo, ssh compute-01, inspect route-eth0 or "
    "munge.key or /var/run/slurmd.pid, repair, systemctl restart munge, "
    "systemctl restart slurmd, exit, curl -I :8080 "
    "2 gpu ecc: sinfo, ssh compute-01, nvidia-smi -q -d ECC, "
    "nvidia-smi -r -i 0, exit, curl -I :8080 "
    "3 nfs stale: sinfo, ssh compute-01, mount, umount -l /mnt/shared, "
    "mount /mnt/shared, systemctl restart slurmd, exit, curl -I :8080 "
    "4 apache ood: systemctl status httpd, cat /etc/httpd/conf/httpd.conf, "
    "apachectl configtest, printf '...' > /etc/httpd/conf/httpd.conf, "
    "apachectl graceful, curl -I :8081 "
    "example interaction (stale pid scenario): "
    "observation: login node ready scenario hpc_pid_stale ood :8080 "
    "max_steps 90 "
    "response: <bash>sinfo</bash> "
    "observation: PARTITION AVAIL TIMELIMIT NODES STATE NODELIST compute up "
    "infinite 1 drain compute-01 "
    "response: <bash>ssh compute-01</bash> "
    "observation: [root@compute-01 ~]$ "
    "response: <bash>ls /var/run/slurmd.pid</bash> "
    "observation: /var/run/slurmd.pid "
    "response: <bash>rm /var/run/slurmd.pid</bash> "
    "observation: [command executed no output] "
    "response: <bash>systemctl restart slurmd</bash> "
)

USER_PROMPT = (
    "incident: the cluster or its open ondemand portals are degraded. "
    "diagnose the root cause and restore service. "
    "the fault may be a broken route file, a bad munge key mode, a stale "
    "slurmd pid, a gpu ecc error, a stale nfs mount, or an apache httpd.conf "
    "typo. "
    "respond with one bash action at a time in the required grammar. "
    "keep actions short. "
    "do not narrate. \n"
)


def render_messages(observation: str) -> list[dict[str, str]]:
    """Build the chat messages for one turn.

    Args:
        observation: Text of the latest observation to show the model.

    Returns:
        A two-element list: the system message carrying ``SYSTEM_PROMPT`` and
        a user message carrying ``USER_PROMPT`` followed by the observation.
    """
    user_content = f"{USER_PROMPT}\n\ncurrent observation:\n{observation}"
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]


def parse_action(completion: str) -> tuple[str | None, bool]:
    """Extract the first action and the end-of-episode flag from a completion.

    Args:
        completion: Raw model output.

    Returns:
        ``(command, done)``.  ``command`` is the stripped body of the first
        ``<bash>`` block, or ``None`` when there is no block or the block is
        blank.  ``done`` is ``True`` when the completion contains the
        final-answer marker anywhere.
    """
    match = BASH_BLOCK_RE.search(completion)
    command = match.group(1).strip() if match else None
    done = FINAL_ANSWER_RE.search(completion) is not None
    # A blank block carries no action; report it as None, matching
    # iter_actions, which also skips blank blocks.
    return command or None, done


def iter_actions(completion: str) -> Iterable[str]:
    """Yield every non-blank action in a completion, in order of appearance.

    Args:
        completion: Raw model output.

    Yields:
        The stripped body of each ``<bash>`` block that is not empty.
    """
    for match in BASH_BLOCK_RE.finditer(completion):
        text = match.group(1).strip()
        if text:
            yield text