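"""Task scenario: a compute node drained by GPU ECC errors.

compute-01 is drained because nvidia-smi reports uncorrectable ECC errors on
gpu-0. The agent is expected to inspect the queue, ssh into the node, query
the ECC counters, and reset them via the stubbed nvidia-smi, which restores
the shared cluster state and returns the node to idle.
"""
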
from __future__ import annotations
import json
import re
from pathlib import Path
from sysadmin_env.models import (
    DiagnosticTrigger,
    DifficultyTier,
    TaskMetadata,
    TaskScenarioDefinition,
    TaskScenarioState,
)
from sysadmin_env.tasks import hpc_outage

TASK_ID = "hpc_gpu_ecc"
COMPLETION_HEALTH = 1.0
SHARED_STATE_PATH = hpc_outage.SHARED_STATE_PATH
NODES_ROOT = hpc_outage.NODES_ROOT
COMPUTE_ROOT = hpc_outage.COMPUTE_ROOT
ECC_RESET_RELATIVE = Path("var/lib/nvidia/ecc_reset.flag")
ECC_RESET_PATH = COMPUTE_ROOT / ECC_RESET_RELATIVE
NVIDIA_SMI_RELATIVE = Path("usr/local/bin/nvidia-smi")

INITIAL_STATE: dict = {
"cluster": "rocky-hpc",
"cores_total": hpc_outage.CLUSTER_CORES_TOTAL,
"cores_per_node": hpc_outage.CLUSTER_CORES_PER_NODE,
"partitions": {
"compute": {"nodes": ["compute-01"], "default": True},
},
"nodes": {
"login": {
"state": "up",
"reason": "",
"cores": hpc_outage.CLUSTER_CORES_PER_NODE,
},
"compute-01": {
"state": "drain",
"reason": "gpu-0 uncorrectable ecc errors",
"cores": hpc_outage.CLUSTER_CORES_PER_NODE,
},
},
"services": {
"slurmd@login": "active",
"slurmd@compute-01": "failed",
"slurmctld@login": "active",
"nvidia-persistenced@compute-01": "active",
},
"gpus": {
"compute-01:gpu-0": {
"model": "NVIDIA H100 80GB HBM3",
"state": "ecc_error",
"ecc_vol_total": 47,
"ecc_agg_total": 213,
},
},
"jobs": [
{
"id": 11301,
"name": "protein_fold",
"user": "biogrid",
"state": "PD",
"partition": "compute",
"nodes": "(NodeDown)",
"time": "0:00",
},
],
}
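
# A plausible remediation path (illustrative; exact flags may vary):
#   sinfo && squeue              # spot the drained node and the pending job
#   ssh compute-01
#   nvidia-smi -q -d ECC         # inspect volatile/aggregate ECC counters
#   nvidia-smi -r -i 0           # reset ECC; the stub also revives slurmd
#   systemctl status slurmd      # confirm the node's service recovered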


def build_definition(base_filesystem_path: str) -> TaskScenarioDefinition:
metadata = TaskMetadata(
task_id=TASK_ID,
difficulty=DifficultyTier.hard,
description="compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors",
max_steps=90,
time_limit=600.0,
base_filesystem_path=base_filesystem_path,
)
return TaskScenarioDefinition(
metadata=metadata,
requires_network_isolation=False,
allows_nested_sandbox=True,
diagnostic_triggers=diagnostic_triggers(),
)


def diagnostic_triggers() -> list[DiagnosticTrigger]:
return [
DiagnosticTrigger(
fact_id="cluster_queue_inspected",
command_patterns=[r"\bsinfo\b", r"\bsqueue\b"],
reward=0.06,
),
DiagnosticTrigger(
fact_id="compute_node_entered",
command_patterns=[r"\bssh\s+compute-01\b"],
reward=0.07,
),
DiagnosticTrigger(
fact_id="gpu_status_inspected",
command_patterns=[r"\bnvidia-smi\b(?!\s+-r)"],
reward=0.06,
),
DiagnosticTrigger(
fact_id="ecc_counters_queried",
command_patterns=[r"nvidia-smi\s+(-q|--query).*ecc", r"nvidia-smi\s+.*ecc"],
reward=0.05,
),
DiagnosticTrigger(
fact_id="slurmd_service_checked",
command_patterns=[r"systemctl\s+status\s+slurmd", r"systemctl\s+is-failed\s+slurmd"],
reward=0.05,
),
]


def prepare_filesystem(root: str | Path) -> None:
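    """Lay down the task tree: the base hpc_outage filesystem, the fixed
    route file, a clean (absent) ECC-reset sentinel, the shared state
    document, and nvidia-smi stubs for the login node and compute-01."""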
root_path = Path(root)
hpc_outage.prepare_filesystem(root_path)
route_path = root_path / hpc_outage.COMPUTE_ROUTE_PATH
route_path.parent.mkdir(parents=True, exist_ok=True)
route_path.write_text(hpc_outage.FIXED_ROUTE)
ecc_path = root_path / ECC_RESET_PATH
ecc_path.parent.mkdir(parents=True, exist_ok=True)
if ecc_path.exists():
ecc_path.unlink()
_write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)
_write_executable(root_path / NVIDIA_SMI_RELATIVE, _login_nvidia_smi_stub())
compute_bin = root_path / COMPUTE_ROOT / "usr/local/bin"
compute_bin.mkdir(parents=True, exist_ok=True)
_write_executable(compute_bin / "nvidia-smi", _compute_nvidia_smi_stub())


def inject_fault(root: str | Path) -> None:
    # The fault ships with the initial filesystem/state, so injecting it is
    # the same as (re)preparing the tree.
    prepare_filesystem(root)


def observe_command(root: str | Path, command: str, _result) -> None:
    # No-op hook: the stubbed nvidia-smi on compute-01 applies its own state
    # changes, so there is nothing to record per command here.
    del root, command


def synchronize(root: str | Path) -> None:
root_path = Path(root)
if not (root_path / SHARED_STATE_PATH).exists():
_write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)


def grade(root: str | Path) -> TaskScenarioState:
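    """Grade the scenario from the shared state document plus the on-disk
    ECC-reset sentinel written by compute-01's nvidia-smi stub."""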
root_path = Path(root)
state_doc = _read_state(root_path / SHARED_STATE_PATH)
ecc_reset = (root_path / ECC_RESET_PATH).exists()
gpu_state = (
state_doc.get("gpus", {})
.get("compute-01:gpu-0", {})
.get("state", "")
)
gpu_healthy = gpu_state == "healthy"
slurmd_service = state_doc.get("services", {}).get("slurmd@compute-01", "")
slurmd_active = slurmd_service == "active"
node_state = state_doc.get("nodes", {}).get("compute-01", {}).get("state", "")
node_idle = node_state == "idle"
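    # Partial credit sums to 0.25 + 0.25 + 0.2 = 0.7; health only snaps to
    # COMPLETION_HEALTH once all four checks (including node_idle) pass.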
    done = ecc_reset and gpu_healthy and slurmd_active and node_idle
    health = 0.0
    if ecc_reset:
        health += 0.25
    if gpu_healthy:
        health += 0.25
    if slurmd_active:
        health += 0.2
    if done:
        health = COMPLETION_HEALTH
return TaskScenarioState(
health=health,
done=done,
details={
"ecc_reset_sentinel_present": ecc_reset,
"gpu_healthy": gpu_healthy,
"slurmd_service_active": slurmd_active,
"compute_node_idle": node_idle,
"gpu_state": gpu_state or "unknown",
"expected_sentinel_path": str(ECC_RESET_RELATIVE),
},
)


def command_reveals_fact(command: str, trigger: DiagnosticTrigger) -> bool:
    return any(
        re.search(pattern, command, flags=re.IGNORECASE)
        for pattern in trigger.command_patterns
    )
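
# Illustrative matches for the gpu_status_inspected trigger above, whose
# (?!\s+-r) lookahead accepts plain invocations but not resets:
#   command_reveals_fact("ssh compute-01 nvidia-smi", trigger) -> True
#   command_reveals_fact("nvidia-smi -r -i 0", trigger)        -> False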


def _write_executable(path: Path, content: str) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(content)
path.chmod(0o755)


def _write_state(path: Path, doc: dict) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")


def _read_state(path: Path) -> dict:
if not path.exists():
return {}
try:
return json.loads(path.read_text() or "{}")
except json.JSONDecodeError:
return {}


def _login_nvidia_smi_stub() -> str:
    # The login node has no GPU; the agent must ssh into compute-01.
return """#!/bin/sh
echo "nvidia-smi: no devices were found" >&2
exit 9
"""


def _compute_nvidia_smi_stub() -> str:
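    # Stateful stub for compute-01: it reads/writes the shared cluster-state
    # JSON under an fcntl advisory lock, and `-r` creates the ECC-reset
    # sentinel while marking gpu-0 healthy, slurmd active, and the node idle.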
return """#!/usr/bin/env python3
import argparse
import fcntl
import json
import os
import sys
STATE_PATH = "/mnt/shared/slurm_state.json"
ECC_SENTINEL = "/var/lib/nvidia/ecc_reset.flag"
GPU_KEY = "compute-01:gpu-0"
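# NOTE: these absolute paths assume the stub executes inside the compute-01
# sandbox, where the harness-prepared shared state file and sentinel
# directory appear at these locations.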

def read_state():
try:
with open(STATE_PATH, "r", encoding="utf-8") as fh:
fcntl.flock(fh.fileno(), fcntl.LOCK_SH)
try:
raw = fh.read()
finally:
fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
return json.loads(raw or "{}")
except FileNotFoundError:
return {}

def mutate_state(mutator):
with open(STATE_PATH, "r+", encoding="utf-8") as fh:
fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
try:
raw = fh.read()
doc = json.loads(raw or "{}")
mutator(doc)
fh.seek(0)
fh.truncate()
fh.write(json.dumps(doc, indent=2, sort_keys=True) + "\\n")
fh.flush()
os.fsync(fh.fileno())
finally:
fcntl.flock(fh.fileno(), fcntl.LOCK_UN)

def render_query(doc):
gpu = doc.get("gpus", {}).get(GPU_KEY, {})
model = gpu.get("model", "unknown")
state = gpu.get("state", "unknown")
vol = gpu.get("ecc_vol_total", 0)
agg = gpu.get("ecc_agg_total", 0)
print(f"==============NVSMI LOG==============")
print(f"GPU 00000000:17:00.0 {model}")
print(f" Product State : {state}")
print(f" ECC Errors")
print(f" Volatile")
print(f" Total : {vol}")
print(f" Aggregate")
print(f" Total : {agg}")

def render_summary(doc):
gpu = doc.get("gpus", {}).get(GPU_KEY, {})
state = gpu.get("state", "unknown")
note = "ECC" if state != "healthy" else "OK"
print(f"+-----------------------------------------------------------------------------+")
print(f"| NVIDIA-SMI 555.42.02 Driver Version: 555.42.02 CUDA Version: 12.5 |")
print(f"|-----------------------------------------------------------------------------|")
print(f"| GPU Name Bus-Id Pwr:Usage/Cap | Memory {note:<4} |")
print(f"| 0 {gpu.get('model','unknown'):<24} 0000:17:00.0 78W / 700W | 0MiB {note:<5} |")
print(f"+-----------------------------------------------------------------------------+")

def handle_reset(gpu_id):
open(ECC_SENTINEL, "w").close()
def apply(doc):
gpus = doc.setdefault("gpus", {})
entry = gpus.setdefault(GPU_KEY, {})
entry["state"] = "healthy"
entry["ecc_vol_total"] = 0
services = doc.setdefault("services", {})
services["slurmd@compute-01"] = "active"
nodes = doc.setdefault("nodes", {})
compute = nodes.setdefault("compute-01", {})
compute["state"] = "idle"
compute["reason"] = ""
mutate_state(apply)
print(f"GPU {gpu_id}: ECC error counters reset. Node returned to idle.")
return 0

def main(argv):
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("-r", "--reset", action="store_true")
parser.add_argument("-i", "--id", default="0")
parser.add_argument("-q", "--query", action="store_true")
parser.add_argument("-d", "--display", default="")
parser.add_argument("--help", action="store_true")
try:
args, extra = parser.parse_known_args(argv[1:])
except SystemExit:
return 2
if args.help:
print("nvidia-smi [-q] [-d ECC] [-r -i <gpu>]")
return 0
os.makedirs(os.path.dirname(ECC_SENTINEL), exist_ok=True)
doc = read_state()
if args.reset:
return handle_reset(args.id)
if args.query:
render_query(doc)
return 0
render_summary(doc)
return 0

if __name__ == "__main__":
sys.exit(main(sys.argv))
"""