# HPCOpenenv / sysadmin_env / tasks / hpc_ood_apache.py
# huggingmenfordays's picture
# deploy: ccyloopss/HPCOpenenv — with OPENENV_API_KEY auth guard
# bc35a94
from __future__ import annotations
import json
import re
from pathlib import Path
from sysadmin_env.models import DiagnosticTrigger
from sysadmin_env.models import DifficultyTier
from sysadmin_env.models import TaskMetadata
from sysadmin_env.models import TaskScenarioDefinition
from sysadmin_env.models import TaskScenarioState
from sysadmin_env.tasks import hpc_outage
# Task identifier handed to TaskMetadata in build_definition().
TASK_ID = "hpc_ood_apache"
# Health value grade() reports once every completion condition holds.
COMPLETION_HEALTH = 1.0

# Locations reused from the hpc_outage scenario this task is layered on.
SHARED_STATE_PATH = hpc_outage.SHARED_STATE_PATH
LOGIN_ROOT = hpc_outage.LOGIN_ROOT

# Relative location of the Apache config, and the same path under LOGIN_ROOT.
HTTPD_CONF_RELATIVE = Path("etc", "httpd", "conf", "httpd.conf")
HTTPD_CONF_PATH = LOGIN_ROOT / HTTPD_CONF_RELATIVE
# Relative location at which the apachectl stub script is installed.
APACHECTL_RELATIVE = Path("usr", "local", "bin", "apachectl")
# Reference httpd.conf: grade() only accepts a file whose text equals this
# string exactly, trailing newline included.
FIXED_HTTPD_CONF = "\n".join(
    (
        "ServerName hpc-login",
        "Listen 8081",
        "DocumentRoot /var/www/ood",
        "Include conf.modules.d/00-mpm.conf",
        "ErrorLog /var/log/httpd/error_log",
        "",
    )
)

# The faulty httpd.conf written by prepare_filesystem(); it differs from the
# reference only in the misspelled "Listn" directive on the second line.
BROKEN_HTTPD_CONF = "\n".join(
    (
        "ServerName hpc-login",
        "Listn 8081",
        "DocumentRoot /var/www/ood",
        "Include conf.modules.d/00-mpm.conf",
        "ErrorLog /var/log/httpd/error_log",
        "",
    )
)
# Document written to SHARED_STATE_PATH by prepare_filesystem() and restored by
# synchronize() when the file is missing. It describes the cluster at the
# start of the task: the slurm services are active, while httpd on the login
# node is "failed" and the apache_ood portal is "degraded".
INITIAL_STATE: dict = {
    "cluster": "rocky-hpc",
    "cores_total": hpc_outage.CLUSTER_CORES_TOTAL,
    "cores_per_node": hpc_outage.CLUSTER_CORES_PER_NODE,
    "partitions": {"compute": {"nodes": ["compute-01"], "default": True}},
    "nodes": {
        "login": dict(state="up", reason="", cores=hpc_outage.CLUSTER_CORES_PER_NODE),
        "compute-01": dict(state="idle", reason="", cores=hpc_outage.CLUSTER_CORES_PER_NODE),
    },
    "services": {
        "slurmd@login": "active",
        "slurmd@compute-01": "active",
        "slurmctld@login": "active",
        # The service grade() checks; the apachectl stub flips it to "active".
        "httpd@login": "failed",
    },
    "portals": {
        "apache_ood": dict(
            port=8081,
            state="degraded",
            last_error="AH00526: Syntax error in file /etc/httpd/conf/httpd.conf: Listn",
        ),
    },
    "jobs": [
        dict(
            id=12044,
            name="weather_ensemble",
            user="meteo",
            state="R",
            partition="compute",
            nodes="compute-01",
            time="1:04:21",
        ),
    ],
}
def build_definition(base_filesystem_path: str) -> TaskScenarioDefinition:
    """Assemble the scenario definition for the Open OnDemand Apache task.

    Args:
        base_filesystem_path: Stored unchanged in the task metadata.

    Returns:
        A definition carrying this task's metadata (medium difficulty,
        80 steps, 540 s limit), the sandbox flags, and the diagnostic
        triggers produced by ``diagnostic_triggers()``.
    """
    return TaskScenarioDefinition(
        metadata=TaskMetadata(
            task_id=TASK_ID,
            difficulty=DifficultyTier.medium,
            description=(
                "open ondemand apache portal on :8081 returns 500 due to a one character typo in httpd.conf"
            ),
            max_steps=80,
            time_limit=540.0,
            base_filesystem_path=base_filesystem_path,
        ),
        requires_network_isolation=False,
        allows_nested_sandbox=True,
        diagnostic_triggers=diagnostic_triggers(),
    )
def diagnostic_triggers() -> list[DiagnosticTrigger]:
    """Return the diagnostic facts an agent can earn partial reward for.

    Each trigger pairs a fact id with regular expressions; the patterns are
    applied to the command line with ``re.search`` (see
    ``command_reveals_fact``), so they may match anywhere in the command.

    Returns:
        A fresh list of five triggers covering queue inspection, httpd
        service status, httpd.conf inspection, a config test, and probing
        the portal on port 8081.
    """
    return [
        DiagnosticTrigger(
            fact_id="cluster_queue_inspected",
            command_patterns=[r"\bsinfo\b", r"\bsqueue\b"],
            reward=0.04,
        ),
        DiagnosticTrigger(
            fact_id="httpd_service_checked",
            command_patterns=[r"systemctl\s+status\s+httpd", r"systemctl\s+is-failed\s+httpd"],
            reward=0.06,
        ),
        DiagnosticTrigger(
            fact_id="httpd_conf_inspected",
            # ``.*`` (not ``.+``) after ``cat`` so that a bare
            # ``cat httpd.conf`` run from the conf directory also counts.
            # grep keeps ``.+`` because it always takes a pattern argument
            # before the file name.
            command_patterns=[r"cat\s+.*httpd\.conf", r"grep\s+.+httpd\.conf"],
            reward=0.06,
        ),
        DiagnosticTrigger(
            fact_id="apachectl_configtest_run",
            command_patterns=[r"\bapachectl\s+configtest\b", r"\bhttpd\s+-t\b"],
            reward=0.08,
        ),
        DiagnosticTrigger(
            fact_id="apache_portal_probed",
            # ``.*`` lets the URL follow ``curl`` directly
            # (``curl localhost:8081``) as well as after options or a scheme.
            command_patterns=[r"curl\s+.*localhost:8081", r"curl\s+.*127\.0\.0\.1:8081"],
            reward=0.05,
        ),
    ]
def prepare_filesystem(root: str | Path) -> None:
    """Populate the sandbox at *root* with this task's starting state.

    Builds on ``hpc_outage.prepare_filesystem`` and then, in order: writes
    ``hpc_outage.FIXED_ROUTE`` to the compute route file, writes the broken
    httpd.conf, installs the apachectl stub both at the sandbox root and
    under LOGIN_ROOT, and seeds the shared state file with INITIAL_STATE.

    Args:
        root: Sandbox root directory.
    """
    sandbox = Path(root)
    hpc_outage.prepare_filesystem(sandbox)

    # NOTE(review): presumably this neutralises the routing fault of the base
    # scenario so that Apache is the only thing broken -- confirm in hpc_outage.
    route_file = sandbox / hpc_outage.COMPUTE_ROUTE_PATH
    route_file.parent.mkdir(parents=True, exist_ok=True)
    route_file.write_text(hpc_outage.FIXED_ROUTE)

    # The fault itself: httpd.conf with the misspelled directive.
    httpd_conf = sandbox / HTTPD_CONF_PATH
    httpd_conf.parent.mkdir(parents=True, exist_ok=True)
    httpd_conf.write_text(BROKEN_HTTPD_CONF)

    # _write_executable creates missing parent directories itself.
    stub_targets = (
        sandbox / APACHECTL_RELATIVE,
        sandbox / LOGIN_ROOT / "usr/local/bin" / "apachectl",
    )
    for stub_target in stub_targets:
        _write_executable(stub_target, _apachectl_stub())

    _write_state(sandbox / SHARED_STATE_PATH, INITIAL_STATE)
def inject_fault(root: str | Path) -> None:
    """Put the sandbox at *root* into the faulty state.

    The broken httpd.conf is already part of what ``prepare_filesystem``
    writes, so injecting the fault is the same as preparing the filesystem.

    Args:
        root: Sandbox root directory.
    """
    prepare_filesystem(root)
def observe_command(root: str | Path, command: str, _result) -> None:
    """Per-command observation hook; this task records nothing.

    NOTE(review): presumably called by the environment for every executed
    command -- confirm against the caller. The arguments are only touched so
    the signature matches the other task modules.

    Args:
        root: Sandbox root directory (converted to a ``Path`` and discarded).
        command: The command line that was run (ignored).
        _result: Result object of the command (ignored).
    """
    _unused = (Path(root), command)
def synchronize(root: str | Path) -> None:
    """Recreate the shared state file from INITIAL_STATE when it is missing.

    An existing state file is left untouched, whatever it contains.

    Args:
        root: Sandbox root directory.
    """
    state_file = Path(root) / SHARED_STATE_PATH
    if state_file.exists():
        return
    _write_state(state_file, INITIAL_STATE)
def grade(root: str | Path) -> TaskScenarioState:
    """Score the sandbox rooted at *root* for the Apache portal task.

    Three conditions are evaluated:

    * the login node's httpd.conf equals FIXED_HTTPD_CONF exactly;
    * the shared state lists ``httpd@login`` as ``"active"``;
    * the shared state lists the ``apache_ood`` portal as ``"healthy"``.

    Health is 0.35 for a fixed config, 0.70 when the service is also active,
    and COMPLETION_HEALTH once all three hold, which is also when ``done``
    becomes true.

    The state file lives inside the sandbox and is rewritten by commands run
    there, so any level of the document may have an unexpected JSON type.
    Such values are treated as empty mappings instead of crashing the grader.

    Args:
        root: Sandbox root directory.

    Returns:
        The task state with ``health``, ``done`` and a ``details`` mapping of
        the individual checks.
    """
    root_path = Path(root)
    state_doc = _as_mapping(_read_state(root_path / SHARED_STATE_PATH))
    services = _as_mapping(state_doc.get("services"))
    portal = _as_mapping(_as_mapping(state_doc.get("portals")).get("apache_ood"))

    conf_fixed = _safe_read(root_path / HTTPD_CONF_PATH) == FIXED_HTTPD_CONF
    httpd_active = services.get("httpd@login", "") == "active"
    portal_healthy = portal.get("state", "") == "healthy"

    # service state only counts if the conf is fixed so naive systemctl restart does not win
    effective_httpd = httpd_active and conf_fixed
    # effective_httpd already implies conf_fixed.
    done = effective_httpd and portal_healthy

    if done:
        health = COMPLETION_HEALTH
    else:
        health = 0.0
        if conf_fixed:
            health += 0.35
        if effective_httpd:
            health += 0.35

    return TaskScenarioState(
        health=health,
        done=done,
        details={
            "httpd_conf_fixed": conf_fixed,
            "httpd_service_active": httpd_active,
            "apache_portal_healthy": portal_healthy,
            "expected_conf_first_lines": "ServerName hpc-login / Listen 8081",
        },
    )


def _as_mapping(value: object) -> dict:
    """Return *value* if it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}
def command_reveals_fact(command: str, trigger: DiagnosticTrigger) -> bool:
    """Tell whether *command* matches any pattern of *trigger*.

    Patterns are searched case-insensitively anywhere in the command line.

    Args:
        command: The command line to test.
        trigger: Trigger whose ``command_patterns`` are tried in order.

    Returns:
        True as soon as one pattern matches, False when none does
        (including when the trigger has no patterns).
    """
    for pattern in trigger.command_patterns:
        if re.search(pattern, command, flags=re.IGNORECASE) is not None:
            return True
    return False
def _safe_read(path: Path) -> str:
    """Return the text of *path*, or ``""`` when it cannot be read as text.

    The file is read directly instead of being checked with ``exists()``
    first, so there is no window in which it can disappear between the check
    and the read. A path that is missing, is a directory, is unreadable, or
    does not hold valid UTF-8 yields the empty string, so the grader treats
    it as "not fixed" instead of raising.

    Args:
        path: File to read.

    Returns:
        The decoded file contents, or ``""`` on any read or decode failure.
    """
    try:
        return path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return ""
def _write_executable(path: Path, content: str) -> None:
    """Write *content* to *path* and mark the file executable (mode 0o755).

    Missing parent directories are created first; an existing file is
    overwritten.

    Args:
        path: Destination file.
        content: Text to write.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(content)
    path.chmod(0o755)
def _write_state(path: Path, doc: dict) -> None:
    """Serialise *doc* to *path* as indented, key-sorted JSON.

    Missing parent directories are created first. The output uses a
    two-space indent and ends with a single newline.

    Args:
        path: Destination file.
        doc: JSON-serialisable document.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(doc, indent=2, sort_keys=True)
    path.write_text(f"{payload}\n")
def _read_state(path: Path) -> dict:
    """Load the JSON state document at *path*, falling back to ``{}``.

    The state file is rewritten by commands running inside the sandbox, so it
    may be missing, unreadable, empty, malformed, or hold valid JSON that is
    not an object (for example ``[]`` or ``null``). All of these yield an
    empty dict, so callers can always use ``.get`` on the result.

    Args:
        path: Location of the state file.

    Returns:
        The parsed document when it is a JSON object, otherwise ``{}``.
    """
    try:
        raw = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Missing file, directory in its place, permission problem, bad bytes.
        return {}
    try:
        doc = json.loads(raw or "{}")
    except json.JSONDecodeError:
        return {}
    # json.loads happily returns lists, strings, numbers or None.
    return doc if isinstance(doc, dict) else {}
def _apachectl_stub() -> str:
    """Return the source of the fake ``apachectl`` script installed in the sandbox.

    The returned text is a standalone Python 3 script (it starts with a
    shebang line) that emulates a small part of ``apachectl``:

    * ``configtest`` / ``-t`` checks that every non-comment line of
      ``/etc/httpd/conf/httpd.conf`` starts with a known directive;
    * ``graceful`` / ``restart`` / ``reload`` / ``start`` run the config test
      and, on success, mark ``httpd@login`` active and the ``apache_ood``
      portal healthy in the shared state file;
    * ``stop`` marks the service inactive and the portal down;
    * ``status`` prints the service and portal state.

    State updates are done under an exclusive ``flock`` on the state file.
    If the state file is missing, unreadable, not valid JSON or not a JSON
    object, ``graceful`` and ``stop`` report the problem on stderr and exit
    with status 1 instead of dying with a traceback, and ``status`` reports
    defaults.

    Returns:
        The script source as a string.
    """
    return """#!/usr/bin/env python3
import fcntl
import json
import os
import sys


STATE_PATH = "/mnt/shared/slurm_state.json"
CONF_PATH = "/etc/httpd/conf/httpd.conf"


VALID_DIRECTIVES = {
    "ServerName", "Listen", "DocumentRoot", "Include", "ErrorLog",
    "ServerAdmin", "LoadModule", "User", "Group",
}


def read_conf():
    try:
        with open(CONF_PATH, "r", encoding="utf-8") as fh:
            return fh.read()
    except FileNotFoundError:
        return ""


def configtest():
    text = read_conf()
    if not text:
        print("AH00014: Configuration check failed: httpd.conf missing")
        return 1
    for idx, line in enumerate(text.splitlines(), start=1):
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        directive = stripped.split()[0]
        if directive not in VALID_DIRECTIVES:
            print(
                f"AH00526: Syntax error on line {idx} of {CONF_PATH}: "
                f"Invalid directive '{directive}'"
            )
            return 1
    print("Syntax OK")
    return 0


def mutate_state(mutator):
    with open(STATE_PATH, "r+", encoding="utf-8") as fh:
        fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
        try:
            doc = json.loads(fh.read() or "{}")
            if not isinstance(doc, dict):
                raise ValueError("state document is not a JSON object")
            mutator(doc)
            fh.seek(0)
            fh.truncate()
            fh.write(json.dumps(doc, indent=2, sort_keys=True) + "\\n")
            fh.flush()
            os.fsync(fh.fileno())
        finally:
            fcntl.flock(fh.fileno(), fcntl.LOCK_UN)


def update_state(mutator, action):
    # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
    try:
        mutate_state(mutator)
    except (OSError, ValueError) as exc:
        sys.stderr.write(f"apachectl {action}: cannot update cluster state: {exc}\\n")
        return 1
    return 0


def graceful():
    rc = configtest()
    if rc != 0:
        sys.stderr.write("apachectl graceful: refusing to reload, config invalid\\n")
        return rc

    def apply(doc):
        services = doc.setdefault("services", {})
        services["httpd@login"] = "active"
        portals = doc.setdefault("portals", {})
        ood = portals.setdefault("apache_ood", {})
        ood["state"] = "healthy"
        ood["last_error"] = ""

    rc = update_state(apply, "graceful")
    if rc != 0:
        return rc
    print("apachectl graceful: httpd reloaded, apache_ood portal marked healthy")
    return 0


def stop():
    def apply(doc):
        services = doc.setdefault("services", {})
        services["httpd@login"] = "inactive"
        portals = doc.setdefault("portals", {})
        ood = portals.setdefault("apache_ood", {})
        ood["state"] = "down"

    rc = update_state(apply, "stop")
    if rc != 0:
        return rc
    print("apachectl stop: httpd stopped")
    return 0


def status():
    try:
        with open(STATE_PATH, "r", encoding="utf-8") as fh:
            fcntl.flock(fh.fileno(), fcntl.LOCK_SH)
            try:
                doc = json.loads(fh.read() or "{}")
            finally:
                fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
    except (OSError, ValueError):
        doc = {}
    if not isinstance(doc, dict):
        doc = {}
    svc = doc.get("services", {}).get("httpd@login", "inactive")
    portal = doc.get("portals", {}).get("apache_ood", {})
    print(f"httpd service: {svc}")
    print(f"apache portal: state={portal.get('state','unknown')} port={portal.get('port','8081')}")
    return 0


def main(argv):
    if len(argv) < 2:
        print("usage: apachectl {configtest|graceful|status|restart}")
        return 1
    cmd = argv[1]
    if cmd in {"configtest", "-t"}:
        return configtest()
    if cmd in {"graceful", "restart", "reload", "start"}:
        return graceful()
    if cmd == "status":
        return status()
    if cmd == "stop":
        return stop()
    print(f"apachectl: unknown subcommand {cmd}")
    return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))
"""