from __future__ import annotations import json import re from pathlib import Path from sysadmin_env.models import DiagnosticTrigger from sysadmin_env.models import DifficultyTier from sysadmin_env.models import TaskMetadata from sysadmin_env.models import TaskScenarioDefinition from sysadmin_env.models import TaskScenarioState from sysadmin_env.tasks import hpc_outage TASK_ID = "hpc_ood_apache" COMPLETION_HEALTH = 1.0 SHARED_STATE_PATH = hpc_outage.SHARED_STATE_PATH LOGIN_ROOT = hpc_outage.LOGIN_ROOT HTTPD_CONF_RELATIVE = Path("etc/httpd/conf/httpd.conf") HTTPD_CONF_PATH = LOGIN_ROOT / HTTPD_CONF_RELATIVE APACHECTL_RELATIVE = Path("usr/local/bin/apachectl") FIXED_HTTPD_CONF = ( "ServerName hpc-login\n" "Listen 8081\n" "DocumentRoot /var/www/ood\n" "Include conf.modules.d/00-mpm.conf\n" "ErrorLog /var/log/httpd/error_log\n" ) BROKEN_HTTPD_CONF = ( "ServerName hpc-login\n" "Listn 8081\n" "DocumentRoot /var/www/ood\n" "Include conf.modules.d/00-mpm.conf\n" "ErrorLog /var/log/httpd/error_log\n" ) INITIAL_STATE: dict = { "cluster": "rocky-hpc", "cores_total": hpc_outage.CLUSTER_CORES_TOTAL, "cores_per_node": hpc_outage.CLUSTER_CORES_PER_NODE, "partitions": { "compute": {"nodes": ["compute-01"], "default": True}, }, "nodes": { "login": { "state": "up", "reason": "", "cores": hpc_outage.CLUSTER_CORES_PER_NODE, }, "compute-01": { "state": "idle", "reason": "", "cores": hpc_outage.CLUSTER_CORES_PER_NODE, }, }, "services": { "slurmd@login": "active", "slurmd@compute-01": "active", "slurmctld@login": "active", "httpd@login": "failed", }, "portals": { "apache_ood": { "port": 8081, "state": "degraded", "last_error": "AH00526: Syntax error in file /etc/httpd/conf/httpd.conf: Listn", }, }, "jobs": [ { "id": 12044, "name": "weather_ensemble", "user": "meteo", "state": "R", "partition": "compute", "nodes": "compute-01", "time": "1:04:21", }, ], } def build_definition(base_filesystem_path: str) -> TaskScenarioDefinition: metadata = TaskMetadata( task_id=TASK_ID, difficulty=DifficultyTier.medium, description="open ondemand apache portal on :8081 returns 500 due to a one character typo in httpd.conf", max_steps=80, time_limit=540.0, base_filesystem_path=base_filesystem_path, ) return TaskScenarioDefinition( metadata=metadata, requires_network_isolation=False, allows_nested_sandbox=True, diagnostic_triggers=diagnostic_triggers(), ) def diagnostic_triggers() -> list[DiagnosticTrigger]: return [ DiagnosticTrigger( fact_id="cluster_queue_inspected", command_patterns=[r"\bsinfo\b", r"\bsqueue\b"], reward=0.04, ), DiagnosticTrigger( fact_id="httpd_service_checked", command_patterns=[r"systemctl\s+status\s+httpd", r"systemctl\s+is-failed\s+httpd"], reward=0.06, ), DiagnosticTrigger( fact_id="httpd_conf_inspected", command_patterns=[r"cat\s+.+httpd\.conf", r"grep\s+.+httpd\.conf"], reward=0.06, ), DiagnosticTrigger( fact_id="apachectl_configtest_run", command_patterns=[r"\bapachectl\s+configtest\b", r"\bhttpd\s+-t\b"], reward=0.08, ), DiagnosticTrigger( fact_id="apache_portal_probed", command_patterns=[r"curl\s+.+localhost:8081", r"curl\s+.+127\.0\.0\.1:8081"], reward=0.05, ), ] def prepare_filesystem(root: str | Path) -> None: root_path = Path(root) hpc_outage.prepare_filesystem(root_path) route_path = root_path / hpc_outage.COMPUTE_ROUTE_PATH route_path.parent.mkdir(parents=True, exist_ok=True) route_path.write_text(hpc_outage.FIXED_ROUTE) conf_path = root_path / HTTPD_CONF_PATH conf_path.parent.mkdir(parents=True, exist_ok=True) conf_path.write_text(BROKEN_HTTPD_CONF) _write_executable(root_path / APACHECTL_RELATIVE, _apachectl_stub()) login_bin = root_path / LOGIN_ROOT / "usr/local/bin" login_bin.mkdir(parents=True, exist_ok=True) _write_executable(login_bin / "apachectl", _apachectl_stub()) _write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE) def inject_fault(root: str | Path) -> None: prepare_filesystem(root) def observe_command(root: str | Path, command: str, _result) -> None: _ = Path(root) _ = command def synchronize(root: str | Path) -> None: root_path = Path(root) if not (root_path / SHARED_STATE_PATH).exists(): _write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE) def grade(root: str | Path) -> TaskScenarioState: root_path = Path(root) state_doc = _read_state(root_path / SHARED_STATE_PATH) conf_text = _safe_read(root_path / HTTPD_CONF_PATH) conf_fixed = conf_text == FIXED_HTTPD_CONF httpd_service = state_doc.get("services", {}).get("httpd@login", "") httpd_active = httpd_service == "active" portal_state = state_doc.get("portals", {}).get("apache_ood", {}).get("state", "") portal_healthy = portal_state == "healthy" # service state only counts if the conf is fixed so naive systemctl restart does not win effective_httpd = httpd_active and conf_fixed health = 0.0 if conf_fixed: health += 0.35 if effective_httpd: health += 0.35 if conf_fixed and effective_httpd and portal_healthy: health = COMPLETION_HEALTH done = conf_fixed and effective_httpd and portal_healthy return TaskScenarioState( health=health, done=done, details={ "httpd_conf_fixed": conf_fixed, "httpd_service_active": httpd_active, "apache_portal_healthy": portal_healthy, "expected_conf_first_lines": "ServerName hpc-login / Listen 8081", }, ) def command_reveals_fact(command: str, trigger: DiagnosticTrigger) -> bool: return any(re.search(pattern, command, flags=re.IGNORECASE) for pattern in trigger.command_patterns) def _safe_read(path: Path) -> str: if not path.exists(): return "" return path.read_text() def _write_executable(path: Path, content: str) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(content) path.chmod(0o755) def _write_state(path: Path, doc: dict) -> None: path.parent.mkdir(parents=True, exist_ok=True) path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n") def _read_state(path: Path) -> dict: if not path.exists(): return {} try: return json.loads(path.read_text() or "{}") except json.JSONDecodeError: return {} def _apachectl_stub() -> str: return """#!/usr/bin/env python3 import fcntl import json import os import sys STATE_PATH = "/mnt/shared/slurm_state.json" CONF_PATH = "/etc/httpd/conf/httpd.conf" FIXED_CONF = ( "ServerName hpc-login\\n" "Listen 8081\\n" "DocumentRoot /var/www/ood\\n" "Include conf.modules.d/00-mpm.conf\\n" "ErrorLog /var/log/httpd/error_log\\n" ) VALID_DIRECTIVES = { "ServerName", "Listen", "DocumentRoot", "Include", "ErrorLog", "ServerAdmin", "LoadModule", "User", "Group", } def read_conf(): try: with open(CONF_PATH, "r", encoding="utf-8") as fh: return fh.read() except FileNotFoundError: return "" def configtest(): text = read_conf() if not text: print("AH00014: Configuration check failed: httpd.conf missing") return 1 for idx, line in enumerate(text.splitlines(), start=1): stripped = line.strip() if not stripped or stripped.startswith("#"): continue directive = stripped.split()[0] if directive not in VALID_DIRECTIVES: print( f"AH00526: Syntax error on line {idx} of {CONF_PATH}: " f"Invalid directive '{directive}'" ) return 1 if text == FIXED_CONF: print("Syntax OK") return 0 print("Syntax OK") return 0 def mutate_state(mutator): with open(STATE_PATH, "r+", encoding="utf-8") as fh: fcntl.flock(fh.fileno(), fcntl.LOCK_EX) try: raw = fh.read() doc = json.loads(raw or "{}") mutator(doc) fh.seek(0) fh.truncate() fh.write(json.dumps(doc, indent=2, sort_keys=True) + "\\n") fh.flush() os.fsync(fh.fileno()) finally: fcntl.flock(fh.fileno(), fcntl.LOCK_UN) def graceful(): rc = configtest() if rc != 0: sys.stderr.write("apachectl graceful: refusing to reload, config invalid\\n") return rc def apply(doc): services = doc.setdefault("services", {}) services["httpd@login"] = "active" portals = doc.setdefault("portals", {}) ood = portals.setdefault("apache_ood", {}) ood["state"] = "healthy" ood["last_error"] = "" mutate_state(apply) print("apachectl graceful: httpd reloaded, apache_ood portal marked healthy") return 0 def status(): try: with open(STATE_PATH, "r", encoding="utf-8") as fh: fcntl.flock(fh.fileno(), fcntl.LOCK_SH) try: doc = json.loads(fh.read() or "{}") finally: fcntl.flock(fh.fileno(), fcntl.LOCK_UN) except FileNotFoundError: doc = {} svc = doc.get("services", {}).get("httpd@login", "inactive") portal = doc.get("portals", {}).get("apache_ood", {}) print(f"httpd service: {svc}") print(f"apache portal: state={portal.get('state','unknown')} port={portal.get('port','8081')}") return 0 def main(argv): if len(argv) < 2: print("usage: apachectl {configtest|graceful|status|restart}") return 1 cmd = argv[1] if cmd == "configtest" or cmd == "-t": return configtest() if cmd in {"graceful", "restart", "reload", "start"}: return graceful() if cmd == "status": return status() if cmd == "stop": def apply(doc): services = doc.setdefault("services", {}) services["httpd@login"] = "inactive" portals = doc.setdefault("portals", {}) ood = portals.setdefault("apache_ood", {}) ood["state"] = "down" mutate_state(apply) print("apachectl stop: httpd stopped") return 0 print(f"apachectl: unknown subcommand {cmd}") return 1 if __name__ == "__main__": sys.exit(main(sys.argv)) """