Spaces:
Paused
Paused
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import stat | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Callable | |
| from sysadmin_env.tasks import hpc_gpu_ecc | |
| from sysadmin_env.tasks import hpc_munge | |
| from sysadmin_env.tasks import hpc_nfs_stale | |
| from sysadmin_env.tasks import hpc_ood_apache | |
| from sysadmin_env.tasks import hpc_outage | |
| from sysadmin_env.tasks import hpc_pid_stale | |
def fix_hpc_outage(root: Path) -> None:
    """Apply the reference repair for the ``hpc_outage`` scenario under *root*.

    Writes ``hpc_outage.FIXED_ROUTE`` to the compute route file (creating its
    parent directories when missing), then rewrites the shared state JSON so
    that node ``compute-01`` is ``idle`` with an empty reason and the
    ``slurmd@compute-01`` service is ``active``.

    Args:
        root: Directory that holds the scenario's filesystem tree.
    """
    route_file = root / hpc_outage.COMPUTE_ROUTE_PATH
    route_file.parent.mkdir(parents=True, exist_ok=True)
    route_file.write_text(hpc_outage.FIXED_ROUTE)

    state_file = root / hpc_outage.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    node = state["nodes"]["compute-01"]
    node["state"] = "idle"
    node["reason"] = ""
    state["services"]["slurmd@compute-01"] = "active"

    # sort_keys keeps the serialized document independent of insertion order.
    serialized = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(f"{serialized}\n")
def fix_hpc_munge(root: Path) -> None:
    """Apply the reference repair for the ``hpc_munge`` scenario under *root*.

    Steps, in order:

    1. Run :func:`fix_hpc_outage` on the same tree.
    2. Write ``hpc_munge.EXPECTED_KEY_BYTES`` to the munge key file and set
       its permission bits to ``hpc_munge.EXPECTED_KEY_MODE``.
    3. Mark the ``munge@compute-01`` service ``active`` in the shared state
       JSON and rewrite that file.

    Args:
        root: Directory that holds the scenario's filesystem tree.
    """
    fix_hpc_outage(root)

    key_path = root / hpc_munge.MUNGE_KEY_PATH
    # Ensure the key's directory exists before writing, as the other fixers
    # in this file do for the files they create; a no-op when it is present.
    key_path.parent.mkdir(parents=True, exist_ok=True)
    key_path.write_bytes(hpc_munge.EXPECTED_KEY_BYTES)
    key_path.chmod(hpc_munge.EXPECTED_KEY_MODE)

    state_path = root / hpc_munge.SHARED_STATE_PATH
    doc = json.loads(state_path.read_text())
    doc["services"]["munge@compute-01"] = "active"
    state_path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")
def fix_hpc_pid_stale(root: Path) -> None:
    """Apply the reference repair for the ``hpc_pid_stale`` scenario.

    Removes the stale PID file if it exists, then rewrites the shared state
    JSON with ``slurmd@compute-01`` set to ``active`` and node ``compute-01``
    set to ``idle`` with an empty reason.

    Args:
        root: Directory that holds the scenario's filesystem tree.
    """
    stale_pid_file = root / hpc_pid_stale.STALE_PID_PATH
    if stale_pid_file.exists():
        stale_pid_file.unlink()

    state_file = root / hpc_pid_stale.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())
    state["services"]["slurmd@compute-01"] = "active"
    state["nodes"]["compute-01"].update(state="idle", reason="")

    payload = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(payload + "\n")
def fix_hpc_gpu_ecc(root: Path) -> None:
    """Apply the reference repair for the ``hpc_gpu_ecc`` scenario.

    Writes the ECC reset sentinel file (``"reset ok\\n"``), creating its
    parent directories when missing, then rewrites the shared state JSON:
    GPU ``compute-01:gpu-0`` becomes ``healthy`` with ``ecc_vol_total`` 0,
    ``slurmd@compute-01`` becomes ``active`` and node ``compute-01`` becomes
    ``idle`` with an empty reason.

    Args:
        root: Directory that holds the scenario's filesystem tree.
    """
    reset_marker = root / hpc_gpu_ecc.ECC_RESET_PATH
    reset_marker.parent.mkdir(parents=True, exist_ok=True)
    reset_marker.write_text("reset ok\n")

    state_file = root / hpc_gpu_ecc.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    # The "gpus" table and the GPU record are created on demand if absent.
    gpu = state.setdefault("gpus", {}).setdefault("compute-01:gpu-0", {})
    gpu["state"] = "healthy"
    gpu["ecc_vol_total"] = 0

    state["services"]["slurmd@compute-01"] = "active"
    node = state["nodes"]["compute-01"]
    node["state"] = "idle"
    node["reason"] = ""

    payload = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(payload + "\n")
def fix_hpc_nfs_stale(root: Path) -> None:
    """Apply the reference repair for the ``hpc_nfs_stale`` scenario.

    Removes the stale mount marker if it exists, writes the valid mount
    marker (``"fresh mount handle\\n"``), creating its parent directories
    when missing, then rewrites the shared state JSON with
    ``slurmd@compute-01`` set to ``active`` and node ``compute-01`` set to
    ``idle`` with an empty reason.

    Args:
        root: Directory that holds the scenario's filesystem tree.
    """
    stale_marker = root / hpc_nfs_stale.MOUNT_STALE_RELATIVE
    if stale_marker.exists():
        stale_marker.unlink()

    valid_marker = root / hpc_nfs_stale.MOUNT_VALID_RELATIVE
    valid_marker.parent.mkdir(parents=True, exist_ok=True)
    valid_marker.write_text("fresh mount handle\n")

    state_file = root / hpc_nfs_stale.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())
    state["services"]["slurmd@compute-01"] = "active"
    node = state["nodes"]["compute-01"]
    node["state"] = "idle"
    node["reason"] = ""

    payload = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(f"{payload}\n")
def fix_hpc_ood_apache(root: Path) -> None:
    """Apply the reference repair for the ``hpc_ood_apache`` scenario.

    Writes ``hpc_ood_apache.FIXED_HTTPD_CONF`` to the httpd config file
    (creating its parent directories when missing), then rewrites the shared
    state JSON with ``httpd@login`` set to ``active`` and the ``apache_ood``
    portal set to ``healthy`` with an empty ``last_error``.

    Args:
        root: Directory that holds the scenario's filesystem tree.
    """
    httpd_conf = root / hpc_ood_apache.HTTPD_CONF_PATH
    httpd_conf.parent.mkdir(parents=True, exist_ok=True)
    httpd_conf.write_text(hpc_ood_apache.FIXED_HTTPD_CONF)

    state_file = root / hpc_ood_apache.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())
    state["services"]["httpd@login"] = "active"

    # The "portals" table and the portal record are created on demand.
    portal = state.setdefault("portals", {}).setdefault("apache_ood", {})
    portal["state"] = "healthy"
    portal["last_error"] = ""

    payload = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(payload + "\n")
# Registry keyed by task id. Each value pairs the scenario module with the
# fixer function that verify_scenario applies to that scenario's tree.
SCENARIOS: dict[str, tuple[object, Callable[[Path], None]]] = {
    module.TASK_ID: (module, fixer)
    for module, fixer in (
        (hpc_outage, fix_hpc_outage),
        (hpc_munge, fix_hpc_munge),
        (hpc_pid_stale, fix_hpc_pid_stale),
        (hpc_gpu_ecc, fix_hpc_gpu_ecc),
        (hpc_nfs_stale, fix_hpc_nfs_stale),
        (hpc_ood_apache, fix_hpc_ood_apache),
    )
}
def verify_scenario(scenario_id: str, *, verbose: bool) -> bool:
    """Check that a scenario starts unsolved and is solved by its fixer.

    The scenario is prepared in a fresh temporary directory and graded, the
    registered fixer is applied, and the tree is graded again. Each grade
    result is read through its ``done``, ``health`` and ``details``
    attributes.

    Args:
        scenario_id: Key into ``SCENARIOS``.
        verbose: When true, also print the health and details of both grades.

    Returns:
        ``True`` when the initial grade is unsolved and the grade after the
        fix is ``done`` with ``health`` of at least 1.0; ``False`` otherwise
        (a ``FAIL`` line is printed in that case).

    Raises:
        KeyError: If ``scenario_id`` is not registered in ``SCENARIOS``.
    """
    if scenario_id not in SCENARIOS:
        raise KeyError(f"unknown scenario {scenario_id}")
    scenario, apply_fix = SCENARIOS[scenario_id]

    with tempfile.TemporaryDirectory(prefix=f"verify_{scenario_id}_") as workdir:
        sandbox = Path(workdir)
        scenario.prepare_filesystem(sandbox)

        # A scenario that already grades as solved gives the fixer nothing
        # to prove, so it is reported as a failure.
        before = scenario.grade(sandbox)
        if before.done or before.health >= 1.0:
            print(f"FAIL {scenario_id} initial state already solved health {before.health}")
            return False
        if verbose:
            print(f" {scenario_id} broken health {before.health:.2f} details {before.details}")

        apply_fix(sandbox)
        after = scenario.grade(sandbox)
        if not after.done or after.health < 1.0:
            print(f"FAIL {scenario_id} gold fix did not reach done health {after.health} details {after.details}")
            return False
        if verbose:
            print(f" {scenario_id} fixed health {after.health:.2f} details {after.details}")

        # Summary line: number of regular files in the tree and, for the
        # munge scenario only, the permission bits of the key file.
        file_count = sum(entry.is_file() for entry in sandbox.rglob("*"))
        summary = f"PASS {scenario_id} files {file_count} "
        if scenario_id == hpc_munge.TASK_ID:
            key_file = sandbox / hpc_munge.MUNGE_KEY_PATH
            summary += "mode_key " + oct(stat.S_IMODE(key_file.stat().st_mode))
        print(summary.strip())
        return True
def main(argv: list[str] | None = None) -> int:
    """Verify the selected scenarios and return a process exit status.

    Command-line options:
        ``--only ID [ID ...]``  limit the run to the given scenario ids
        ``-v`` / ``--verbose``  print per-scenario grading details

    Args:
        argv: Arguments to parse, without the program name. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` when every selected scenario verifies, ``1`` when at least one
        does not.

    Raises:
        SystemExit: Raised by argparse on invalid usage, including ids
            passed to ``--only`` that are not registered in ``SCENARIOS``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--only", nargs="+", default=None, help="limit to specific scenario ids")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args(argv)

    targets = args.only or list(SCENARIOS)

    # Reject unknown ids before running anything: otherwise verify_scenario
    # raises KeyError mid-run and the script dies with a traceback.
    unknown = [sid for sid in targets if sid not in SCENARIOS]
    if unknown:
        parser.error(f"unknown scenario ids {unknown}; known ids {list(SCENARIOS)}")

    failures: list[str] = []
    for sid in targets:
        if not verify_scenario(sid, verbose=args.verbose):
            failures.append(sid)

    if failures:
        print(f"\nFAIL {len(failures)} scenarios unsolved {failures}")
        return 1
    print(f"\nok all {len(targets)} scenarios solvable")
    return 0
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    raise SystemExit(main())