Spaces:
Paused
Paused
File size: 6,299 Bytes
bc35a94 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 | from __future__ import annotations
import argparse
import json
import stat
import sys
import tempfile
from pathlib import Path
from typing import Callable
from sysadmin_env.tasks import hpc_gpu_ecc
from sysadmin_env.tasks import hpc_munge
from sysadmin_env.tasks import hpc_nfs_stale
from sysadmin_env.tasks import hpc_ood_apache
from sysadmin_env.tasks import hpc_outage
from sysadmin_env.tasks import hpc_pid_stale
def fix_hpc_outage(root: Path) -> None:
    """Apply the gold fix for the hpc_outage scenario beneath ``root``.

    Writes ``hpc_outage.FIXED_ROUTE`` to the compute route file (creating its
    parent directories when missing), then rewrites the shared state JSON so
    that node ``compute-01`` is ``idle`` with an empty reason and the
    ``slurmd@compute-01`` service is ``active``.

    Args:
        root: Directory that holds the scenario's prepared filesystem.
    """
    route_file = root / hpc_outage.COMPUTE_ROUTE_PATH
    route_file.parent.mkdir(parents=True, exist_ok=True)
    route_file.write_text(hpc_outage.FIXED_ROUTE)

    state_file = root / hpc_outage.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    node = state["nodes"]["compute-01"]
    node["state"] = "idle"
    node["reason"] = ""
    state["services"]["slurmd@compute-01"] = "active"

    # Sorted keys, two-space indent and a trailing newline for the state file.
    serialized = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(serialized + "\n")
def fix_hpc_munge(root: Path) -> None:
    """Apply the gold fix for the hpc_munge scenario beneath ``root``.

    Runs the hpc_outage fix first, then installs
    ``hpc_munge.EXPECTED_KEY_BYTES`` at the munge key path with mode
    ``hpc_munge.EXPECTED_KEY_MODE`` and marks ``munge@compute-01`` as
    ``active`` in the shared state JSON.

    Args:
        root: Directory that holds the scenario's prepared filesystem.
    """
    fix_hpc_outage(root)

    key_path = root / hpc_munge.MUNGE_KEY_PATH
    # Like the other fixers, do not assume the target directory exists yet.
    key_path.parent.mkdir(parents=True, exist_ok=True)
    if key_path.exists():
        # A key without owner write permission would make write_bytes raise
        # PermissionError; restore it first so the fix can be re-applied.
        current_mode = stat.S_IMODE(key_path.stat().st_mode)
        key_path.chmod(current_mode | stat.S_IWUSR)
    key_path.write_bytes(hpc_munge.EXPECTED_KEY_BYTES)
    key_path.chmod(hpc_munge.EXPECTED_KEY_MODE)

    state_path = root / hpc_munge.SHARED_STATE_PATH
    doc = json.loads(state_path.read_text())
    doc["services"]["munge@compute-01"] = "active"
    state_path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")
def fix_hpc_pid_stale(root: Path) -> None:
    """Apply the gold fix for the hpc_pid_stale scenario beneath ``root``.

    Removes the file at ``hpc_pid_stale.STALE_PID_PATH`` when it is present,
    then rewrites the shared state JSON so that ``slurmd@compute-01`` is
    ``active`` and node ``compute-01`` is ``idle`` with an empty reason.

    Args:
        root: Directory that holds the scenario's prepared filesystem.
    """
    pid_path = root / hpc_pid_stale.STALE_PID_PATH
    try:
        # Unlink directly instead of checking exists() first: exists()
        # follows symlinks, so a dangling link would be left behind, and the
        # check-then-act sequence is race-prone.
        pid_path.unlink()
    except (FileNotFoundError, NotADirectoryError):
        # Nothing to remove; the stale file is already gone.
        pass

    state_path = root / hpc_pid_stale.SHARED_STATE_PATH
    doc = json.loads(state_path.read_text())
    doc["services"]["slurmd@compute-01"] = "active"
    doc["nodes"]["compute-01"]["state"] = "idle"
    doc["nodes"]["compute-01"]["reason"] = ""
    state_path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")
def fix_hpc_gpu_ecc(root: Path) -> None:
    """Apply the gold fix for the hpc_gpu_ecc scenario beneath ``root``.

    Writes the ``"reset ok\\n"`` sentinel at ``hpc_gpu_ecc.ECC_RESET_PATH``
    (creating parent directories when missing), then rewrites the shared
    state JSON: GPU ``compute-01:gpu-0`` becomes ``healthy`` with
    ``ecc_vol_total`` 0, ``slurmd@compute-01`` becomes ``active`` and node
    ``compute-01`` becomes ``idle`` with an empty reason.

    Args:
        root: Directory that holds the scenario's prepared filesystem.
    """
    reset_marker = root / hpc_gpu_ecc.ECC_RESET_PATH
    reset_marker.parent.mkdir(parents=True, exist_ok=True)
    reset_marker.write_text("reset ok\n")

    state_file = root / hpc_gpu_ecc.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    # The "gpus" table and the per-GPU record are created when absent.
    gpu = state.setdefault("gpus", {}).setdefault("compute-01:gpu-0", {})
    gpu["state"] = "healthy"
    gpu["ecc_vol_total"] = 0

    state["services"]["slurmd@compute-01"] = "active"
    node = state["nodes"]["compute-01"]
    node["state"] = "idle"
    node["reason"] = ""

    serialized = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(serialized + "\n")
def fix_hpc_nfs_stale(root: Path) -> None:
    """Apply the gold fix for the hpc_nfs_stale scenario beneath ``root``.

    Removes the file at ``hpc_nfs_stale.MOUNT_STALE_RELATIVE`` when it is
    present, writes ``"fresh mount handle\\n"`` to
    ``hpc_nfs_stale.MOUNT_VALID_RELATIVE`` (creating parent directories when
    missing), then rewrites the shared state JSON so that
    ``slurmd@compute-01`` is ``active`` and node ``compute-01`` is ``idle``
    with an empty reason.

    Args:
        root: Directory that holds the scenario's prepared filesystem.
    """
    stale = root / hpc_nfs_stale.MOUNT_STALE_RELATIVE
    try:
        # Unlink directly instead of checking exists() first: exists()
        # follows symlinks, so a dangling link would be left behind, and the
        # check-then-act sequence is race-prone.
        stale.unlink()
    except (FileNotFoundError, NotADirectoryError):
        # Nothing to remove; the stale handle is already gone.
        pass

    valid = root / hpc_nfs_stale.MOUNT_VALID_RELATIVE
    valid.parent.mkdir(parents=True, exist_ok=True)
    valid.write_text("fresh mount handle\n")

    state_path = root / hpc_nfs_stale.SHARED_STATE_PATH
    doc = json.loads(state_path.read_text())
    doc["services"]["slurmd@compute-01"] = "active"
    doc["nodes"]["compute-01"]["state"] = "idle"
    doc["nodes"]["compute-01"]["reason"] = ""
    state_path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")
def fix_hpc_ood_apache(root: Path) -> None:
    """Apply the gold fix for the hpc_ood_apache scenario beneath ``root``.

    Writes ``hpc_ood_apache.FIXED_HTTPD_CONF`` to the httpd configuration
    path (creating parent directories when missing), then rewrites the shared
    state JSON so that ``httpd@login`` is ``active`` and the ``apache_ood``
    portal is ``healthy`` with an empty ``last_error``.

    Args:
        root: Directory that holds the scenario's prepared filesystem.
    """
    httpd_conf = root / hpc_ood_apache.HTTPD_CONF_PATH
    httpd_conf.parent.mkdir(parents=True, exist_ok=True)
    httpd_conf.write_text(hpc_ood_apache.FIXED_HTTPD_CONF)

    state_file = root / hpc_ood_apache.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    state["services"]["httpd@login"] = "active"
    # The "portals" table and the portal record are created when absent.
    portal = state.setdefault("portals", {}).setdefault("apache_ood", {})
    portal["state"] = "healthy"
    portal["last_error"] = ""

    serialized = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(serialized + "\n")
# Registry keyed by task id; each value pairs the scenario module with the
# function that applies that scenario's gold fix.
SCENARIOS: dict[str, tuple[object, Callable[[Path], None]]] = {
    module.TASK_ID: (module, fixer)
    for module, fixer in (
        (hpc_outage, fix_hpc_outage),
        (hpc_munge, fix_hpc_munge),
        (hpc_pid_stale, fix_hpc_pid_stale),
        (hpc_gpu_ecc, fix_hpc_gpu_ecc),
        (hpc_nfs_stale, fix_hpc_nfs_stale),
        (hpc_ood_apache, fix_hpc_ood_apache),
    )
}
def verify_scenario(scenario_id: str, *, verbose: bool) -> bool:
    """Check that one scenario starts broken and is solved by its gold fix.

    The scenario filesystem is prepared in a fresh temporary directory and
    graded, the registered fix function is applied, and the result is graded
    again. A ``PASS`` or ``FAIL`` line is printed for the scenario.

    Args:
        scenario_id: Key into ``SCENARIOS``.
        verbose: When true, also print the health and details of the broken
            and fixed grades.

    Returns:
        ``True`` when the initial grade is not done with health below 1.0 and
        the grade after the fix is done with health of at least 1.0;
        ``False`` otherwise.

    Raises:
        KeyError: If ``scenario_id`` is not registered in ``SCENARIOS``.
    """
    entry = SCENARIOS.get(scenario_id)
    if entry is None:
        raise KeyError(f"unknown scenario {scenario_id}")
    scenario, apply_fix = entry

    with tempfile.TemporaryDirectory(prefix=f"verify_{scenario_id}_") as workdir:
        root = Path(workdir)
        scenario.prepare_filesystem(root)

        # The freshly prepared filesystem must not already grade as solved.
        before = scenario.grade(root)
        if before.done or before.health >= 1.0:
            print(f"FAIL {scenario_id} initial state already solved health {before.health}")
            return False
        if verbose:
            print(f" {scenario_id} broken health {before.health:.2f} details {before.details}")

        apply_fix(root)

        after = scenario.grade(root)
        if not after.done or after.health < 1.0:
            print(f"FAIL {scenario_id} gold fix did not reach done health {after.health} details {after.details}")
            return False
        if verbose:
            print(f" {scenario_id} fixed health {after.health:.2f} details {after.details}")

        file_count = len([entry_path for entry_path in root.rglob("*") if entry_path.is_file()])
        summary = f"PASS {scenario_id} files {file_count}"
        if scenario_id == hpc_munge.TASK_ID:
            # For the munge scenario, also report the key's permission bits.
            key_stat = (root / hpc_munge.MUNGE_KEY_PATH).stat()
            summary += f" mode_key {oct(stat.S_IMODE(key_stat.st_mode))}"
        print(summary)
        return True
def main() -> int:
    """Command-line entry point: verify the selected scenarios.

    Parses ``--only`` (one or more scenario ids; defaults to every key of
    ``SCENARIOS``) and ``-v/--verbose``, runs ``verify_scenario`` for each
    target and prints a summary line.

    Returns:
        ``0`` when every target scenario verifies, ``1`` when at least one
        fails.

    Raises:
        SystemExit: Raised by argparse (status 2) on invalid arguments,
            including ``--only`` ids that are not registered.
    """
    parser = argparse.ArgumentParser(
        # The module may have no docstring, in which case __doc__ is None.
        description=__doc__ or "verify that each scenario starts broken and is solved by its gold fix",
    )
    parser.add_argument("--only", nargs="+", default=None, help="limit to specific scenario ids")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    targets = args.only or list(SCENARIOS)

    # Reject unknown ids up front with a usage error rather than letting
    # verify_scenario raise an uncaught KeyError part-way through the run.
    unknown = [sid for sid in targets if sid not in SCENARIOS]
    if unknown:
        parser.error(f"unknown scenario ids {unknown}; choose from {sorted(SCENARIOS)}")

    failures: list[str] = []
    for sid in targets:
        if not verify_scenario(sid, verbose=args.verbose):
            failures.append(sid)

    if failures:
        print(f"\nFAIL {len(failures)} scenarios unsolved {failures}")
        return 1
    print(f"\nok all {len(targets)} scenarios solvable")
    return 0
# Run the verifier when executed as a script, using main()'s return value as
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|