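"""Task scenario: a compute node drained by GPU ECC errors.

compute-01 is drained because nvidia-smi reports uncorrectable ECC errors on
gpu-0. The agent is expected to inspect the queue, ssh into the node, query
the ECC counters, and reset them via the stubbed nvidia-smi, which restores
the shared cluster state and returns the node to idle.
"""
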
from __future__ import annotations
import json
import re
from pathlib import Path
from sysadmin_env.models import (
    DiagnosticTrigger,
    DifficultyTier,
    TaskMetadata,
    TaskScenarioDefinition,
    TaskScenarioState,
)
from sysadmin_env.tasks import hpc_outage

TASK_ID = "hpc_gpu_ecc"
COMPLETION_HEALTH = 1.0
SHARED_STATE_PATH = hpc_outage.SHARED_STATE_PATH
NODES_ROOT = hpc_outage.NODES_ROOT
COMPUTE_ROOT = hpc_outage.COMPUTE_ROOT
ECC_RESET_RELATIVE = Path("var/lib/nvidia/ecc_reset.flag")
ECC_RESET_PATH = COMPUTE_ROOT / ECC_RESET_RELATIVE
NVIDIA_SMI_RELATIVE = Path("usr/local/bin/nvidia-smi")

INITIAL_STATE: dict = {
"cluster": "rocky-hpc",
"cores_total": hpc_outage.CLUSTER_CORES_TOTAL,
"cores_per_node": hpc_outage.CLUSTER_CORES_PER_NODE,
"partitions": {
"compute": {"nodes": ["compute-01"], "default": True},
},
"nodes": {
"login": {
"state": "up",
"reason": "",
"cores": hpc_outage.CLUSTER_CORES_PER_NODE,
},
"compute-01": {
"state": "drain",
"reason": "gpu-0 uncorrectable ecc errors",
"cores": hpc_outage.CLUSTER_CORES_PER_NODE,
},
},
"services": {
"slurmd@login": "active",
"slurmd@compute-01": "failed",
"slurmctld@login": "active",
"nvidia-persistenced@compute-01": "active",
},
"gpus": {
"compute-01:gpu-0": {
"model": "NVIDIA H100 80GB HBM3",
"state": "ecc_error",
"ecc_vol_total": 47,
"ecc_agg_total": 213,
},
},
"jobs": [
{
"id": 11301,
"name": "protein_fold",
"user": "biogrid",
"state": "PD",
"partition": "compute",
"nodes": "(NodeDown)",
"time": "0:00",
},
],
}
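
# A plausible remediation path (illustrative; exact flags may vary):
#   sinfo && squeue              # spot the drained node and the pending job
#   ssh compute-01
#   nvidia-smi -q -d ECC         # inspect volatile/aggregate ECC counters
#   nvidia-smi -r -i 0           # reset ECC; the stub also revives slurmd
#   systemctl status slurmd      # confirm the node's service recovered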


def build_definition(base_filesystem_path: str) -> TaskScenarioDefinition:
metadata = TaskMetadata(
task_id=TASK_ID,
difficulty=DifficultyTier.hard,
description="compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors",
max_steps=90,
time_limit=600.0,
base_filesystem_path=base_filesystem_path,
)
return TaskScenarioDefinition(
metadata=metadata,
requires_network_isolation=False,
allows_nested_sandbox=True,
diagnostic_triggers=diagnostic_triggers(),
)


def diagnostic_triggers() -> list[DiagnosticTrigger]:
return [
DiagnosticTrigger(
fact_id="cluster_queue_inspected",
command_patterns=[r"\bsinfo\b", r"\bsqueue\b"],
reward=0.06,
),
DiagnosticTrigger(
fact_id="compute_node_entered",
command_patterns=[r"\bssh\s+compute-01\b"],
reward=0.07,
),
DiagnosticTrigger(
fact_id="gpu_status_inspected",
command_patterns=[r"\bnvidia-smi\b(?!\s+-r)"],
reward=0.06,
),
DiagnosticTrigger(
fact_id="ecc_counters_queried",
command_patterns=[r"nvidia-smi\s+(-q|--query).*ecc", r"nvidia-smi\s+.*ecc"],
reward=0.05,
),
DiagnosticTrigger(
fact_id="slurmd_service_checked",
command_patterns=[r"systemctl\s+status\s+slurmd", r"systemctl\s+is-failed\s+slurmd"],
reward=0.05,
),
]


def prepare_filesystem(root: str | Path) -> None:
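    """Lay down the task tree: the base hpc_outage filesystem, the fixed
    route file, a clean (absent) ECC-reset sentinel, the shared state
    document, and nvidia-smi stubs for the login node and compute-01."""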
root_path = Path(root)
hpc_outage.prepare_filesystem(root_path)
route_path = root_path / hpc_outage.COMPUTE_ROUTE_PATH
route_path.parent.mkdir(parents=True, exist_ok=True)
route_path.write_text(hpc_outage.FIXED_ROUTE)
ecc_path = root_path / ECC_RESET_PATH
ecc_path.parent.mkdir(parents=True, exist_ok=True)
if ecc_path.exists():
ecc_path.unlink()
_write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)
_write_executable(root_path / NVIDIA_SMI_RELATIVE, _login_nvidia_smi_stub())
compute_bin = root_path / COMPUTE_ROOT / "usr/local/bin"
compute_bin.mkdir(parents=True, exist_ok=True)
_write_executable(compute_bin / "nvidia-smi", _compute_nvidia_smi_stub())


def inject_fault(root: str | Path) -> None:
    # The fault ships with the initial filesystem/state, so injecting it is
    # the same as (re)preparing the tree.
    prepare_filesystem(root)


def observe_command(root: str | Path, command: str, _result) -> None:
    # No-op hook: the stubbed nvidia-smi on compute-01 applies its own state
    # changes, so there is nothing to record per command here.
    del root, command


def synchronize(root: str | Path) -> None:
root_path = Path(root)
if not (root_path / SHARED_STATE_PATH).exists():
_write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)


def grade(root: str | Path) -> TaskScenarioState:
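    """Grade the scenario from the shared state document plus the on-disk
    ECC-reset sentinel written by compute-01's nvidia-smi stub."""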
root_path = Path(root)
state_doc = _read_state(root_path / SHARED_STATE_PATH)
ecc_reset = (root_path / ECC_RESET_PATH).exists()
gpu_state = (
state_doc.get("gpus", {})
.get("compute-01:gpu-0", {})
.get("state", "")
)
gpu_healthy = gpu_state == "healthy"
slurmd_service = state_doc.get("services", {}).get("slurmd@compute-01", "")
slurmd_active = slurmd_service == "active"
node_state = state_doc.get("nodes", {}).get("compute-01", {}).get("state", "")
node_idle = node_state == "idle"
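    # Partial credit sums to 0.25 + 0.25 + 0.2 = 0.7; health only snaps to
    # COMPLETION_HEALTH once all four checks (including node_idle) pass.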
    done = ecc_reset and gpu_healthy and slurmd_active and node_idle
    health = 0.0
    if ecc_reset:
        health += 0.25
    if gpu_healthy:
        health += 0.25
    if slurmd_active:
        health += 0.2
    if done:
        health = COMPLETION_HEALTH
return TaskScenarioState(
health=health,
done=done,
details={
"ecc_reset_sentinel_present": ecc_reset,
"gpu_healthy": gpu_healthy,
"slurmd_service_active": slurmd_active,
"compute_node_idle": node_idle,
"gpu_state": gpu_state or "unknown",
"expected_sentinel_path": str(ECC_RESET_RELATIVE),
},
)


def command_reveals_fact(command: str, trigger: DiagnosticTrigger) -> bool:
    return any(
        re.search(pattern, command, flags=re.IGNORECASE)
        for pattern in trigger.command_patterns
    )
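
# Illustrative matches for the gpu_status_inspected trigger above, whose
# (?!\s+-r) lookahead accepts plain invocations but not resets:
#   command_reveals_fact("ssh compute-01 nvidia-smi", trigger) -> True
#   command_reveals_fact("nvidia-smi -r -i 0", trigger)        -> False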


def _write_executable(path: Path, content: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content)
path.chmod(0o755)


def _write_state(path: Path, doc: dict) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")


def _read_state(path: Path) -> dict:
if not path.exists():
return {}
try:
return json.loads(path.read_text() or "{}")
except json.JSONDecodeError:
return {}


def _login_nvidia_smi_stub() -> str:
    # The login node has no GPU; the agent must ssh into compute-01.
return """#!/bin/sh
echo "nvidia-smi: no devices were found" >&2
exit 9
"""


def _compute_nvidia_smi_stub() -> str:
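    # Stateful stub for compute-01: it reads/writes the shared cluster-state
    # JSON under an fcntl advisory lock, and `-r` creates the ECC-reset
    # sentinel while marking gpu-0 healthy, slurmd active, and the node idle.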
return """#!/usr/bin/env python3
import argparse
import fcntl
import json
import os
import sys
STATE_PATH = "/mnt/shared/slurm_state.json"
ECC_SENTINEL = "/var/lib/nvidia/ecc_reset.flag"
GPU_KEY = "compute-01:gpu-0"
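# NOTE: these absolute paths assume the stub executes inside the compute-01
# sandbox, where the harness-prepared shared state file and sentinel
# directory appear at these locations.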

def read_state():
try:
with open(STATE_PATH, "r", encoding="utf-8") as fh:
fcntl.flock(fh.fileno(), fcntl.LOCK_SH)
try:
raw = fh.read()
finally:
fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
return json.loads(raw or "{}")
except FileNotFoundError:
return {}

def mutate_state(mutator):
with open(STATE_PATH, "r+", encoding="utf-8") as fh:
fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
try:
raw = fh.read()
doc = json.loads(raw or "{}")
mutator(doc)
fh.seek(0)
fh.truncate()
fh.write(json.dumps(doc, indent=2, sort_keys=True) + "\\n")
fh.flush()
os.fsync(fh.fileno())
finally:
fcntl.flock(fh.fileno(), fcntl.LOCK_UN)

def render_query(doc):
gpu = doc.get("gpus", {}).get(GPU_KEY, {})
model = gpu.get("model", "unknown")
state = gpu.get("state", "unknown")
vol = gpu.get("ecc_vol_total", 0)
agg = gpu.get("ecc_agg_total", 0)
print(f"==============NVSMI LOG==============")
print(f"GPU 00000000:17:00.0 {model}")
print(f" Product State : {state}")
print(f" ECC Errors")
print(f" Volatile")
print(f" Total : {vol}")
print(f" Aggregate")
print(f" Total : {agg}")

def render_summary(doc):
gpu = doc.get("gpus", {}).get(GPU_KEY, {})
state = gpu.get("state", "unknown")
note = "ECC" if state != "healthy" else "OK"
print(f"+-----------------------------------------------------------------------------+")
print(f"| NVIDIA-SMI 555.42.02 Driver Version: 555.42.02 CUDA Version: 12.5 |")
print(f"|-----------------------------------------------------------------------------|")
print(f"| GPU Name Bus-Id Pwr:Usage/Cap | Memory {note:<4} |")
print(f"| 0 {gpu.get('model','unknown'):<24} 0000:17:00.0 78W / 700W | 0MiB {note:<5} |")
print(f"+-----------------------------------------------------------------------------+")

def handle_reset(gpu_id):
open(ECC_SENTINEL, "w").close()
def apply(doc):
gpus = doc.setdefault("gpus", {})
entry = gpus.setdefault(GPU_KEY, {})
entry["state"] = "healthy"
entry["ecc_vol_total"] = 0
services = doc.setdefault("services", {})
services["slurmd@compute-01"] = "active"
nodes = doc.setdefault("nodes", {})
compute = nodes.setdefault("compute-01", {})
compute["state"] = "idle"
compute["reason"] = ""
mutate_state(apply)
print(f"GPU {gpu_id}: ECC error counters reset. Node returned to idle.")
return 0

def main(argv):
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("-r", "--reset", action="store_true")
parser.add_argument("-i", "--id", default="0")
parser.add_argument("-q", "--query", action="store_true")
parser.add_argument("-d", "--display", default="")
parser.add_argument("--help", action="store_true")
try:
args, extra = parser.parse_known_args(argv[1:])
except SystemExit:
return 2
if args.help:
print("nvidia-smi [-q] [-d ECC] [-r -i <gpu>]")
return 0
os.makedirs(os.path.dirname(ECC_SENTINEL), exist_ok=True)
doc = read_state()
if args.reset:
return handle_reset(args.id)
if args.query:
render_query(doc)
return 0
render_summary(doc)
return 0

if __name__ == "__main__":
sys.exit(main(sys.argv))
"""