File size: 6,299 Bytes
bc35a94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from __future__ import annotations

import argparse
import json
import stat
import sys
import tempfile
from pathlib import Path
from typing import Callable

from sysadmin_env.tasks import hpc_gpu_ecc
from sysadmin_env.tasks import hpc_munge
from sysadmin_env.tasks import hpc_nfs_stale
from sysadmin_env.tasks import hpc_ood_apache
from sysadmin_env.tasks import hpc_outage
from sysadmin_env.tasks import hpc_pid_stale


def fix_hpc_outage(root: Path) -> None:
    """Apply the gold fix for the ``hpc_outage`` scenario under *root*.

    Writes ``hpc_outage.FIXED_ROUTE`` to the compute route file (creating its
    parent directories when missing), then rewrites the shared state JSON so
    that ``compute-01`` is ``idle`` with an empty reason and the
    ``slurmd@compute-01`` service is ``active``.
    """
    route_file = root / hpc_outage.COMPUTE_ROUTE_PATH
    route_file.parent.mkdir(parents=True, exist_ok=True)
    route_file.write_text(hpc_outage.FIXED_ROUTE)

    state_file = root / hpc_outage.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    node = state["nodes"]["compute-01"]
    node["state"] = "idle"
    node["reason"] = ""
    state["services"]["slurmd@compute-01"] = "active"

    # Sorted keys and a trailing newline keep the rewritten file deterministic.
    serialized = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(f"{serialized}\n")


def fix_hpc_munge(root: Path) -> None:
    """Apply the gold fix for the ``hpc_munge`` scenario under *root*.

    Steps, in order:

    1. Run :func:`fix_hpc_outage` on the same tree (presumably the munge
       scenario includes the outage fault as well -- confirm against
       ``hpc_munge.prepare_filesystem``).
    2. Write ``hpc_munge.EXPECTED_KEY_BYTES`` to the munge key file and set
       its permissions to ``hpc_munge.EXPECTED_KEY_MODE``.
    3. Mark ``munge@compute-01`` as ``active`` in the shared state JSON.
    """
    fix_hpc_outage(root)

    key_path = root / hpc_munge.MUNGE_KEY_PATH
    # Create the key directory first, as every other fixer in this file does
    # before writing, so a missing directory does not abort the fix.
    key_path.parent.mkdir(parents=True, exist_ok=True)
    key_path.write_bytes(hpc_munge.EXPECTED_KEY_BYTES)
    # chmod after writing so the mode applies to the final file contents.
    key_path.chmod(hpc_munge.EXPECTED_KEY_MODE)

    state_path = root / hpc_munge.SHARED_STATE_PATH
    doc = json.loads(state_path.read_text())
    doc["services"]["munge@compute-01"] = "active"
    state_path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")


def fix_hpc_pid_stale(root: Path) -> None:
    """Apply the gold fix for the ``hpc_pid_stale`` scenario under *root*.

    Removes the stale PID file if present, then rewrites the shared state
    JSON so that ``slurmd@compute-01`` is ``active`` and ``compute-01`` is
    ``idle`` with an empty reason.
    """
    pid_path = root / hpc_pid_stale.STALE_PID_PATH
    # Unlink directly instead of testing ``exists()`` first: ``exists()``
    # follows symlinks, so a dangling symlink at this path would be skipped
    # and left behind. A path that is already gone is the desired end state.
    try:
        pid_path.unlink()
    except FileNotFoundError:
        pass

    state_path = root / hpc_pid_stale.SHARED_STATE_PATH
    doc = json.loads(state_path.read_text())
    doc["services"]["slurmd@compute-01"] = "active"
    doc["nodes"]["compute-01"]["state"] = "idle"
    doc["nodes"]["compute-01"]["reason"] = ""
    state_path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")


def fix_hpc_gpu_ecc(root: Path) -> None:
    """Apply the gold fix for the ``hpc_gpu_ecc`` scenario under *root*.

    Writes the ECC reset sentinel file (creating parent directories when
    missing), then rewrites the shared state JSON so that GPU
    ``compute-01:gpu-0`` is ``healthy`` with ``ecc_vol_total`` of 0,
    ``slurmd@compute-01`` is ``active``, and ``compute-01`` is ``idle`` with
    an empty reason.
    """
    reset_marker = root / hpc_gpu_ecc.ECC_RESET_PATH
    reset_marker.parent.mkdir(parents=True, exist_ok=True)
    reset_marker.write_text("reset ok\n")

    state_file = root / hpc_gpu_ecc.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    # Create the "gpus" table and the GPU entry on demand if they are absent.
    gpu = state.setdefault("gpus", {}).setdefault("compute-01:gpu-0", {})
    gpu["state"] = "healthy"
    gpu["ecc_vol_total"] = 0

    state["services"]["slurmd@compute-01"] = "active"
    node = state["nodes"]["compute-01"]
    node["state"] = "idle"
    node["reason"] = ""

    serialized = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(f"{serialized}\n")


def fix_hpc_nfs_stale(root: Path) -> None:
    """Apply the gold fix for the ``hpc_nfs_stale`` scenario under *root*.

    Removes the stale mount marker if present, writes a fresh mount handle
    file (creating parent directories when missing), then rewrites the shared
    state JSON so that ``slurmd@compute-01`` is ``active`` and ``compute-01``
    is ``idle`` with an empty reason.
    """
    stale = root / hpc_nfs_stale.MOUNT_STALE_RELATIVE
    # Unlink directly instead of testing ``exists()`` first: ``exists()``
    # follows symlinks, so a dangling symlink at this path would be skipped
    # and left behind. A path that is already gone is the desired end state.
    try:
        stale.unlink()
    except FileNotFoundError:
        pass

    valid = root / hpc_nfs_stale.MOUNT_VALID_RELATIVE
    valid.parent.mkdir(parents=True, exist_ok=True)
    valid.write_text("fresh mount handle\n")

    state_path = root / hpc_nfs_stale.SHARED_STATE_PATH
    doc = json.loads(state_path.read_text())
    doc["services"]["slurmd@compute-01"] = "active"
    doc["nodes"]["compute-01"]["state"] = "idle"
    doc["nodes"]["compute-01"]["reason"] = ""
    state_path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")


def fix_hpc_ood_apache(root: Path) -> None:
    """Apply the gold fix for the ``hpc_ood_apache`` scenario under *root*.

    Writes ``hpc_ood_apache.FIXED_HTTPD_CONF`` to the httpd config file
    (creating parent directories when missing), then rewrites the shared
    state JSON so that ``httpd@login`` is ``active`` and the ``apache_ood``
    portal is ``healthy`` with an empty ``last_error``.
    """
    httpd_conf = root / hpc_ood_apache.HTTPD_CONF_PATH
    httpd_conf.parent.mkdir(parents=True, exist_ok=True)
    httpd_conf.write_text(hpc_ood_apache.FIXED_HTTPD_CONF)

    state_file = root / hpc_ood_apache.SHARED_STATE_PATH
    state = json.loads(state_file.read_text())

    state["services"]["httpd@login"] = "active"
    # Create the "portals" table and the portal entry on demand if absent.
    portal = state.setdefault("portals", {}).setdefault("apache_ood", {})
    portal["state"] = "healthy"
    portal["last_error"] = ""

    serialized = json.dumps(state, indent=2, sort_keys=True)
    state_file.write_text(f"{serialized}\n")


# Registry of verifiable scenarios: task id -> (task module, gold-fix function).
# Insertion order is the order in which main() runs them by default.
SCENARIOS: dict[str, tuple[object, Callable[[Path], None]]] = {
    task_module.TASK_ID: (task_module, gold_fix)
    for task_module, gold_fix in (
        (hpc_outage, fix_hpc_outage),
        (hpc_munge, fix_hpc_munge),
        (hpc_pid_stale, fix_hpc_pid_stale),
        (hpc_gpu_ecc, fix_hpc_gpu_ecc),
        (hpc_nfs_stale, fix_hpc_nfs_stale),
        (hpc_ood_apache, fix_hpc_ood_apache),
    )
}


def verify_scenario(scenario_id: str, *, verbose: bool) -> bool:
    """Check that a scenario starts broken and that its gold fix solves it.

    The scenario is materialised in a fresh temporary directory, graded,
    fixed with its registered gold-fix function, and graded again.

    Args:
        scenario_id: Key into ``SCENARIOS``.
        verbose: When true, also print the health and details of the broken
            and fixed gradings.

    Returns:
        ``True`` when the initial state is unsolved and the fixed state is
        graded as done with health of at least 1.0; ``False`` otherwise (a
        ``FAIL`` line is printed describing which check failed).

    Raises:
        KeyError: If *scenario_id* is not registered in ``SCENARIOS``.
    """
    if scenario_id not in SCENARIOS:
        raise KeyError(f"unknown scenario {scenario_id}")
    task, apply_gold_fix = SCENARIOS[scenario_id]

    with tempfile.TemporaryDirectory(prefix=f"verify_{scenario_id}_") as workdir:
        root = Path(workdir)
        task.prepare_filesystem(root)

        # A scenario that is already solved before any fix proves nothing.
        before = task.grade(root)
        if before.done or before.health >= 1.0:
            print(f"FAIL {scenario_id} initial state already solved health {before.health}")
            return False
        if verbose:
            print(f"  {scenario_id} broken  health {before.health:.2f} details {before.details}")

        apply_gold_fix(root)

        after = task.grade(root)
        if not after.done or after.health < 1.0:
            print(f"FAIL {scenario_id} gold fix did not reach done health {after.health} details {after.details}")
            return False
        if verbose:
            print(f"  {scenario_id} fixed   health {after.health:.2f} details {after.details}")

        file_count = len([entry for entry in root.rglob("*") if entry.is_file()])
        summary = f"PASS {scenario_id} files {file_count}"
        # The munge scenario additionally reports the key's permission bits.
        if scenario_id == hpc_munge.TASK_ID:
            key_stat = (root / hpc_munge.MUNGE_KEY_PATH).stat()
            summary += f" mode_key {oct(stat.S_IMODE(key_stat.st_mode))}"
        print(summary)
        return True


def main() -> int:
    """Command-line entry point: verify the gold fix of each selected scenario.

    Options:
        --only ID [ID ...]  Restrict the run to the given scenario ids
                            (default: every scenario in ``SCENARIOS``).
        -v, --verbose       Print broken/fixed grading details per scenario.

    Returns:
        0 when every selected scenario verifies, 1 when at least one fails.
        Unknown scenario ids are reported as a usage error by argparse
        (which exits with status 2) before any scenario is run.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--only", nargs="+", default=None, help="limit to specific scenario ids")
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    targets = args.only or list(SCENARIOS)

    # Reject unknown ids up front with a usage message, rather than letting
    # verify_scenario raise KeyError part-way through the run.
    unknown = [sid for sid in targets if sid not in SCENARIOS]
    if unknown:
        parser.error(f"unknown scenario ids {unknown}; choose from {sorted(SCENARIOS)}")

    failures: list[str] = []
    for sid in targets:
        ok = verify_scenario(sid, verbose=args.verbose)
        if not ok:
            failures.append(sid)

    if failures:
        print(f"\nFAIL {len(failures)} scenarios unsolved {failures}")
        return 1
    print(f"\nok all {len(targets)} scenarios solvable")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())