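"""Task scenario: an HPC compute node drained by GPU ECC errors.

compute-01 is drained ("gpu-0 uncorrectable ecc errors") and its slurmd
service has failed. The intended path is: inspect the queue (sinfo/squeue),
ssh into compute-01, inspect the GPU (e.g. nvidia-smi -q -d ECC), reset the
ECC counters with `nvidia-smi -r`, and confirm the node returns to idle.
The compute-node nvidia-smi stub performs the state transition and drops a
sentinel file that grade() verifies.
"""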

from __future__ import annotations

import json
import re
from pathlib import Path

from sysadmin_env.models import DiagnosticTrigger
from sysadmin_env.models import DifficultyTier
from sysadmin_env.models import TaskMetadata
from sysadmin_env.models import TaskScenarioDefinition
from sysadmin_env.models import TaskScenarioState
from sysadmin_env.tasks import hpc_outage

TASK_ID = "hpc_gpu_ecc"
COMPLETION_HEALTH = 1.0

SHARED_STATE_PATH = hpc_outage.SHARED_STATE_PATH
NODES_ROOT = hpc_outage.NODES_ROOT
COMPUTE_ROOT = hpc_outage.COMPUTE_ROOT
ECC_RESET_RELATIVE = Path("var/lib/nvidia/ecc_reset.flag")
ECC_RESET_PATH = COMPUTE_ROOT / ECC_RESET_RELATIVE
NVIDIA_SMI_RELATIVE = Path("usr/local/bin/nvidia-smi")
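
# The paths above are relative to the sandbox root. ECC_RESET_PATH is the
# sentinel the compute-node nvidia-smi stub touches on `nvidia-smi -r`
# (/var/lib/nvidia/ecc_reset.flag as seen from inside the node); grade()
# treats its presence as evidence that the reset was actually run.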

INITIAL_STATE: dict = {
    "cluster": "rocky-hpc",
    "cores_total": hpc_outage.CLUSTER_CORES_TOTAL,
    "cores_per_node": hpc_outage.CLUSTER_CORES_PER_NODE,
    "partitions": {
        "compute": {"nodes": ["compute-01"], "default": True},
    },
    "nodes": {
        "login": {
            "state": "up",
            "reason": "",
            "cores": hpc_outage.CLUSTER_CORES_PER_NODE,
        },
        "compute-01": {
            "state": "drain",
            "reason": "gpu-0 uncorrectable ecc errors",
            "cores": hpc_outage.CLUSTER_CORES_PER_NODE,
        },
    },
    "services": {
        "slurmd@login": "active",
        "slurmd@compute-01": "failed",
        "slurmctld@login": "active",
        "nvidia-persistenced@compute-01": "active",
    },
    "gpus": {
        "compute-01:gpu-0": {
            "model": "NVIDIA H100 80GB HBM3",
            "state": "ecc_error",
            "ecc_vol_total": 47,
            "ecc_agg_total": 213,
        },
    },
    "jobs": [
        {
            "id": 11301,
            "name": "protein_fold",
            "user": "biogrid",
            "state": "PD",
            "partition": "compute",
            "nodes": "(NodeDown)",
            "time": "0:00",
        },
    ],
}
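
# Scenario hooks: build_definition() describes the task, prepare_filesystem()
# and inject_fault() lay out the sandbox, synchronize() restores missing
# shared state between steps, and grade() scores the resulting filesystem.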
def build_definition(base_filesystem_path: str) -> TaskScenarioDefinition:
    metadata = TaskMetadata(
        task_id=TASK_ID,
        difficulty=DifficultyTier.hard,
        description="compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors",
        max_steps=90,
        time_limit=600.0,
        base_filesystem_path=base_filesystem_path,
    )
    return TaskScenarioDefinition(
        metadata=metadata,
        requires_network_isolation=False,
        allows_nested_sandbox=True,
        diagnostic_triggers=diagnostic_triggers(),
    )
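
# Shaped diagnostic rewards for investigation commands. Matching is done via
# command_reveals_fact() below; whether a fact pays out once or repeatedly is
# presumably enforced by the harness, not here.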
def diagnostic_triggers() -> list[DiagnosticTrigger]:
    return [
        DiagnosticTrigger(
            fact_id="cluster_queue_inspected",
            command_patterns=[r"\bsinfo\b", r"\bsqueue\b"],
            reward=0.06,
        ),
        DiagnosticTrigger(
            fact_id="compute_node_entered",
            command_patterns=[r"\bssh\s+compute-01\b"],
            reward=0.07,
        ),
        DiagnosticTrigger(
            fact_id="gpu_status_inspected",
            # Do not reward the fix itself (`nvidia-smi -r`), only inspection.
            command_patterns=[r"\bnvidia-smi\b(?!\s+-r)"],
            reward=0.06,
        ),
        DiagnosticTrigger(
            fact_id="ecc_counters_queried",
            # The second pattern is a broad fallback; it also covers the first.
            command_patterns=[r"nvidia-smi\s+(-q|--query).*ecc", r"nvidia-smi\s+.*ecc"],
            reward=0.05,
        ),
        DiagnosticTrigger(
            fact_id="slurmd_service_checked",
            command_patterns=[r"systemctl\s+status\s+slurmd", r"systemctl\s+is-failed\s+slurmd"],
            reward=0.05,
        ),
    ]
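
# Builds on the shared hpc_outage layout: fixes the compute route, clears any
# stale ECC sentinel, writes the initial shared state, and installs two
# nvidia-smi stubs (a failing one on the login node, a stateful one on
# compute-01).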
def prepare_filesystem(root: str | Path) -> None:
    root_path = Path(root)
    hpc_outage.prepare_filesystem(root_path)
    route_path = root_path / hpc_outage.COMPUTE_ROUTE_PATH
    route_path.parent.mkdir(parents=True, exist_ok=True)
    route_path.write_text(hpc_outage.FIXED_ROUTE)
    ecc_path = root_path / ECC_RESET_PATH
    ecc_path.parent.mkdir(parents=True, exist_ok=True)
    if ecc_path.exists():
        ecc_path.unlink()
    _write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)
    _write_executable(root_path / NVIDIA_SMI_RELATIVE, _login_nvidia_smi_stub())
    compute_bin = root_path / COMPUTE_ROOT / "usr/local/bin"
    compute_bin.mkdir(parents=True, exist_ok=True)
    _write_executable(compute_bin / "nvidia-smi", _compute_nvidia_smi_stub())


def inject_fault(root: str | Path) -> None:
    # The fault is already encoded in INITIAL_STATE, so injection is just setup.
    prepare_filesystem(root)


def observe_command(root: str | Path, command: str, _result) -> None:
    # Intentionally a no-op; diagnostic rewards flow through
    # diagnostic_triggers() and command_reveals_fact() instead.
    _ = Path(root)
    _ = command


def synchronize(root: str | Path) -> None:
    # Recreate the shared state document if it disappeared mid-run.
    root_path = Path(root)
    if not (root_path / SHARED_STATE_PATH).exists():
        _write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)
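
# Grading rubric: partial credit for the reset sentinel (0.25), a healthy GPU
# (0.25), and an active slurmd on compute-01 (0.2); health reaches 1.0 only
# when all of those hold and the node is back to "idle".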
def grade(root: str | Path) -> TaskScenarioState:
    root_path = Path(root)
    state_doc = _read_state(root_path / SHARED_STATE_PATH)
    ecc_reset = (root_path / ECC_RESET_PATH).exists()
    gpu_state = (
        state_doc.get("gpus", {})
        .get("compute-01:gpu-0", {})
        .get("state", "")
    )
    gpu_healthy = gpu_state == "healthy"
    slurmd_service = state_doc.get("services", {}).get("slurmd@compute-01", "")
    slurmd_active = slurmd_service == "active"
    node_state = state_doc.get("nodes", {}).get("compute-01", {}).get("state", "")
    node_idle = node_state == "idle"
    health = 0.0
    if ecc_reset:
        health += 0.25
    if gpu_healthy:
        health += 0.25
    if slurmd_active:
        health += 0.2
    done = ecc_reset and gpu_healthy and slurmd_active and node_idle
    if done:
        health = COMPLETION_HEALTH
    return TaskScenarioState(
        health=health,
        done=done,
        details={
            "ecc_reset_sentinel_present": ecc_reset,
            "gpu_healthy": gpu_healthy,
            "slurmd_service_active": slurmd_active,
            "compute_node_idle": node_idle,
            "gpu_state": gpu_state or "unknown",
            "expected_sentinel_path": str(ECC_RESET_RELATIVE),
        },
    )
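
# True when `command` matches any of the trigger's regexes, case-insensitively.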
def command_reveals_fact(command: str, trigger: DiagnosticTrigger) -> bool:
    return any(
        re.search(pattern, command, flags=re.IGNORECASE)
        for pattern in trigger.command_patterns
    )


def _write_executable(path: Path, content: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content)
    path.chmod(0o755)


def _write_state(path: Path, doc: dict) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")


def _read_state(path: Path) -> dict:
    if not path.exists():
        return {}
    try:
        return json.loads(path.read_text() or "{}")
    except json.JSONDecodeError:
        return {}


def _login_nvidia_smi_stub() -> str:
    # On the login node there is no GPU; the agent must ssh into compute-01.
    return """#!/bin/sh
echo "nvidia-smi: no devices were found" >&2
exit 9
"""
def _compute_nvidia_smi_stub() -> str:
    return """#!/usr/bin/env python3
import argparse
import fcntl
import json
import os
import sys

STATE_PATH = "/mnt/shared/slurm_state.json"
ECC_SENTINEL = "/var/lib/nvidia/ecc_reset.flag"
GPU_KEY = "compute-01:gpu-0"


def read_state():
    # Shared lock for readers; a missing state file is treated as empty.
    try:
        with open(STATE_PATH, "r", encoding="utf-8") as fh:
            fcntl.flock(fh.fileno(), fcntl.LOCK_SH)
            try:
                raw = fh.read()
            finally:
                fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
        return json.loads(raw or "{}")
    except FileNotFoundError:
        return {}


def mutate_state(mutator):
    # Read-modify-write under an exclusive lock so concurrent invocations
    # cannot interleave partial writes.
    with open(STATE_PATH, "r+", encoding="utf-8") as fh:
        fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
        try:
            raw = fh.read()
            doc = json.loads(raw or "{}")
            mutator(doc)
            fh.seek(0)
            fh.truncate()
            fh.write(json.dumps(doc, indent=2, sort_keys=True) + "\\n")
            fh.flush()
            os.fsync(fh.fileno())
        finally:
            fcntl.flock(fh.fileno(), fcntl.LOCK_UN)


def render_query(doc):
    # Approximates `nvidia-smi -q` output for the single simulated GPU.
    gpu = doc.get("gpus", {}).get(GPU_KEY, {})
    model = gpu.get("model", "unknown")
    state = gpu.get("state", "unknown")
    vol = gpu.get("ecc_vol_total", 0)
    agg = gpu.get("ecc_agg_total", 0)
    print("==============NVSMI LOG==============")
    print(f"GPU 00000000:17:00.0 {model}")
    print(f"    Product State               : {state}")
    print("    ECC Errors")
    print("        Volatile")
    print(f"            Total               : {vol}")
    print("        Aggregate")
    print(f"            Total               : {agg}")


def render_summary(doc):
    # Approximates the default nvidia-smi summary table.
    gpu = doc.get("gpus", {}).get(GPU_KEY, {})
    state = gpu.get("state", "unknown")
    note = "ECC" if state != "healthy" else "OK"
    print("+-----------------------------------------------------------------------------+")
    print("| NVIDIA-SMI 555.42.02      Driver Version: 555.42.02      CUDA Version: 12.5 |")
    print("|-----------------------------------------------------------------------------|")
    print(f"| GPU  Name                      Bus-Id        Pwr:Usage/Cap |  Memory  {note:<4} |")
    print(f"|   0  {gpu.get('model', 'unknown'):<24}  0000:17:00.0   78W / 700W |   0MiB  {note:<5} |")
    print("+-----------------------------------------------------------------------------+")


def handle_reset(gpu_id):
    # `nvidia-smi -r`: touch the sentinel grade() looks for, then flip the
    # shared state back to healthy (GPU cleared, slurmd active, node idle).
    open(ECC_SENTINEL, "w").close()

    def apply(doc):
        gpus = doc.setdefault("gpus", {})
        entry = gpus.setdefault(GPU_KEY, {})
        entry["state"] = "healthy"
        entry["ecc_vol_total"] = 0
        services = doc.setdefault("services", {})
        services["slurmd@compute-01"] = "active"
        nodes = doc.setdefault("nodes", {})
        compute = nodes.setdefault("compute-01", {})
        compute["state"] = "idle"
        compute["reason"] = ""

    mutate_state(apply)
    print(f"GPU {gpu_id}: ECC error counters reset. Node returned to idle.")
    return 0


def main(argv):
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-r", "--reset", action="store_true")
    parser.add_argument("-i", "--id", default="0")
    parser.add_argument("-q", "--query", action="store_true")
    parser.add_argument("-d", "--display", default="")
    parser.add_argument("--help", action="store_true")
    try:
        args, _extra = parser.parse_known_args(argv[1:])
    except SystemExit:
        return 2
    if args.help:
        print("nvidia-smi [-q] [-d ECC] [-r -i <gpu>]")
        return 0
    os.makedirs(os.path.dirname(ECC_SENTINEL), exist_ok=True)
    doc = read_state()
    if args.reset:
        return handle_reset(args.id)
    if args.query:
        render_query(doc)
        return 0
    render_summary(doc)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
"""