from __future__ import annotations

import json
import re
from pathlib import Path

from sysadmin_env.models import DiagnosticTrigger
from sysadmin_env.models import DifficultyTier
from sysadmin_env.models import TaskMetadata
from sysadmin_env.models import TaskScenarioDefinition
from sysadmin_env.models import TaskScenarioState
from sysadmin_env.tasks import hpc_outage


TASK_ID = "hpc_gpu_ecc"
COMPLETION_HEALTH = 1.0

SHARED_STATE_PATH = hpc_outage.SHARED_STATE_PATH
NODES_ROOT = hpc_outage.NODES_ROOT
COMPUTE_ROOT = hpc_outage.COMPUTE_ROOT
ECC_RESET_RELATIVE = Path("var/lib/nvidia/ecc_reset.flag")
ECC_RESET_PATH = COMPUTE_ROOT / ECC_RESET_RELATIVE
NVIDIA_SMI_RELATIVE = Path("usr/local/bin/nvidia-smi")

INITIAL_STATE: dict = {
    "cluster": "rocky-hpc",
    "cores_total": hpc_outage.CLUSTER_CORES_TOTAL,
    "cores_per_node": hpc_outage.CLUSTER_CORES_PER_NODE,
    "partitions": {
        "compute": {"nodes": ["compute-01"], "default": True},
    },
    "nodes": {
        "login": {
            "state": "up",
            "reason": "",
            "cores": hpc_outage.CLUSTER_CORES_PER_NODE,
        },
        "compute-01": {
            "state": "drain",
            "reason": "gpu-0 uncorrectable ecc errors",
            "cores": hpc_outage.CLUSTER_CORES_PER_NODE,
        },
    },
    "services": {
        "slurmd@login": "active",
        "slurmd@compute-01": "failed",
        "slurmctld@login": "active",
        "nvidia-persistenced@compute-01": "active",
    },
    "gpus": {
        "compute-01:gpu-0": {
            "model": "NVIDIA H100 80GB HBM3",
            "state": "ecc_error",
            "ecc_vol_total": 47,
            "ecc_agg_total": 213,
        },
    },
    "jobs": [
        {
            "id": 11301,
            "name": "protein_fold",
            "user": "biogrid",
            "state": "PD",
            "partition": "compute",
            "nodes": "(NodeDown)",
            "time": "0:00",
        },
    ],
}


def build_definition(base_filesystem_path: str) -> TaskScenarioDefinition:
    metadata = TaskMetadata(
        task_id=TASK_ID,
        difficulty=DifficultyTier.hard,
        description="compute node drained because nvidia-smi reports gpu-0 uncorrectable ecc errors",
        max_steps=90,
        time_limit=600.0,
        base_filesystem_path=base_filesystem_path,
    )
    return TaskScenarioDefinition(
        metadata=metadata,
        requires_network_isolation=False,
        allows_nested_sandbox=True,
        diagnostic_triggers=diagnostic_triggers(),
    )


def diagnostic_triggers() -> list[DiagnosticTrigger]:
    return [
        DiagnosticTrigger(
            fact_id="cluster_queue_inspected",
            command_patterns=[r"\bsinfo\b", r"\bsqueue\b"],
            reward=0.06,
        ),
        DiagnosticTrigger(
            fact_id="compute_node_entered",
            command_patterns=[r"\bssh\s+compute-01\b"],
            reward=0.07,
        ),
        DiagnosticTrigger(
            fact_id="gpu_status_inspected",
            command_patterns=[r"\bnvidia-smi\b(?!\s+-r)"],
            reward=0.06,
        ),
        DiagnosticTrigger(
            fact_id="ecc_counters_queried",
            command_patterns=[r"nvidia-smi\s+(-q|--query).*ecc", r"nvidia-smi\s+.*ecc"],
            reward=0.05,
        ),
        DiagnosticTrigger(
            fact_id="slurmd_service_checked",
            command_patterns=[r"systemctl\s+status\s+slurmd", r"systemctl\s+is-failed\s+slurmd"],
            reward=0.05,
        ),
    ]


def prepare_filesystem(root: str | Path) -> None:
    root_path = Path(root)
    hpc_outage.prepare_filesystem(root_path)

    route_path = root_path / hpc_outage.COMPUTE_ROUTE_PATH
    route_path.parent.mkdir(parents=True, exist_ok=True)
    route_path.write_text(hpc_outage.FIXED_ROUTE)

    ecc_path = root_path / ECC_RESET_PATH
    ecc_path.parent.mkdir(parents=True, exist_ok=True)
    if ecc_path.exists():
        ecc_path.unlink()

    _write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)

    _write_executable(root_path / NVIDIA_SMI_RELATIVE, _login_nvidia_smi_stub())
    compute_bin = root_path / COMPUTE_ROOT / "usr/local/bin"
    compute_bin.mkdir(parents=True, exist_ok=True)
    _write_executable(compute_bin / "nvidia-smi", _compute_nvidia_smi_stub())


def inject_fault(root: str | Path) -> None:
    prepare_filesystem(root)


def observe_command(root: str | Path, command: str, _result) -> None:
    _ = Path(root)
    _ = command


def synchronize(root: str | Path) -> None:
    root_path = Path(root)
    if not (root_path / SHARED_STATE_PATH).exists():
        _write_state(root_path / SHARED_STATE_PATH, INITIAL_STATE)


def grade(root: str | Path) -> TaskScenarioState:
    root_path = Path(root)
    state_doc = _read_state(root_path / SHARED_STATE_PATH)

    ecc_reset = (root_path / ECC_RESET_PATH).exists()
    gpu_state = (
        state_doc.get("gpus", {})
        .get("compute-01:gpu-0", {})
        .get("state", "")
    )
    gpu_healthy = gpu_state == "healthy"

    slurmd_service = state_doc.get("services", {}).get("slurmd@compute-01", "")
    slurmd_active = slurmd_service == "active"
    node_state = state_doc.get("nodes", {}).get("compute-01", {}).get("state", "")
    node_idle = node_state == "idle"

    health = 0.0
    if ecc_reset:
        health += 0.25
    if gpu_healthy:
        health += 0.25
    if slurmd_active:
        health += 0.2
    if ecc_reset and gpu_healthy and slurmd_active and node_idle:
        health = COMPLETION_HEALTH

    done = ecc_reset and gpu_healthy and slurmd_active and node_idle

    return TaskScenarioState(
        health=health,
        done=done,
        details={
            "ecc_reset_sentinel_present": ecc_reset,
            "gpu_healthy": gpu_healthy,
            "slurmd_service_active": slurmd_active,
            "compute_node_idle": node_idle,
            "gpu_state": gpu_state or "unknown",
            "expected_sentinel_path": str(ECC_RESET_RELATIVE),
        },
    )


def command_reveals_fact(command: str, trigger: DiagnosticTrigger) -> bool:
    return any(re.search(pattern, command, flags=re.IGNORECASE) for pattern in trigger.command_patterns)


def _write_executable(path: Path, content: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content)
    path.chmod(0o755)


def _write_state(path: Path, doc: dict) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(doc, indent=2, sort_keys=True) + "\n")


def _read_state(path: Path) -> dict:
    if not path.exists():
        return {}
    try:
        return json.loads(path.read_text() or "{}")
    except json.JSONDecodeError:
        return {}


def _login_nvidia_smi_stub() -> str:
    # on the login node there is no gpu the agent must ssh into compute-01
    return """#!/bin/sh
echo "nvidia-smi: no devices were found" >&2
exit 9
"""


def _compute_nvidia_smi_stub() -> str:
    return """#!/usr/bin/env python3
import argparse
import fcntl
import json
import os
import sys

STATE_PATH = "/mnt/shared/slurm_state.json"
ECC_SENTINEL = "/var/lib/nvidia/ecc_reset.flag"
GPU_KEY = "compute-01:gpu-0"

def read_state():
    try:
        with open(STATE_PATH, "r", encoding="utf-8") as fh:
            fcntl.flock(fh.fileno(), fcntl.LOCK_SH)
            try:
                raw = fh.read()
            finally:
                fcntl.flock(fh.fileno(), fcntl.LOCK_UN)
        return json.loads(raw or "{}")
    except FileNotFoundError:
        return {}

def mutate_state(mutator):
    with open(STATE_PATH, "r+", encoding="utf-8") as fh:
        fcntl.flock(fh.fileno(), fcntl.LOCK_EX)
        try:
            raw = fh.read()
            doc = json.loads(raw or "{}")
            mutator(doc)
            fh.seek(0)
            fh.truncate()
            fh.write(json.dumps(doc, indent=2, sort_keys=True) + "\\n")
            fh.flush()
            os.fsync(fh.fileno())
        finally:
            fcntl.flock(fh.fileno(), fcntl.LOCK_UN)

def render_query(doc):
    gpu = doc.get("gpus", {}).get(GPU_KEY, {})
    model = gpu.get("model", "unknown")
    state = gpu.get("state", "unknown")
    vol = gpu.get("ecc_vol_total", 0)
    agg = gpu.get("ecc_agg_total", 0)
    print(f"==============NVSMI LOG==============")
    print(f"GPU 00000000:17:00.0  {model}")
    print(f"    Product State         : {state}")
    print(f"    ECC Errors")
    print(f"        Volatile")
    print(f"            Total         : {vol}")
    print(f"        Aggregate")
    print(f"            Total         : {agg}")

def render_summary(doc):
    gpu = doc.get("gpus", {}).get(GPU_KEY, {})
    state = gpu.get("state", "unknown")
    note = "ECC" if state != "healthy" else "OK"
    print(f"+-----------------------------------------------------------------------------+")
    print(f"| NVIDIA-SMI 555.42.02   Driver Version: 555.42.02   CUDA Version: 12.5       |")
    print(f"|-----------------------------------------------------------------------------|")
    print(f"| GPU  Name                    Bus-Id          Pwr:Usage/Cap |  Memory  {note:<4} |")
    print(f"|  0   {gpu.get('model','unknown'):<24} 0000:17:00.0     78W / 700W |    0MiB {note:<5} |")
    print(f"+-----------------------------------------------------------------------------+")

def handle_reset(gpu_id):
    open(ECC_SENTINEL, "w").close()
    def apply(doc):
        gpus = doc.setdefault("gpus", {})
        entry = gpus.setdefault(GPU_KEY, {})
        entry["state"] = "healthy"
        entry["ecc_vol_total"] = 0
        services = doc.setdefault("services", {})
        services["slurmd@compute-01"] = "active"
        nodes = doc.setdefault("nodes", {})
        compute = nodes.setdefault("compute-01", {})
        compute["state"] = "idle"
        compute["reason"] = ""
    mutate_state(apply)
    print(f"GPU {gpu_id}: ECC error counters reset. Node returned to idle.")
    return 0

def main(argv):
    parser = argparse.ArgumentParser(add_help=False)
    parser.add_argument("-r", "--reset", action="store_true")
    parser.add_argument("-i", "--id", default="0")
    parser.add_argument("-q", "--query", action="store_true")
    parser.add_argument("-d", "--display", default="")
    parser.add_argument("--help", action="store_true")
    try:
        args, extra = parser.parse_known_args(argv[1:])
    except SystemExit:
        return 2
    if args.help:
        print("nvidia-smi [-q] [-d ECC] [-r -i <gpu>]")
        return 0
    os.makedirs(os.path.dirname(ECC_SENTINEL), exist_ok=True)
    doc = read_state()
    if args.reset:
        return handle_reset(args.id)
    if args.query:
        render_query(doc)
        return 0
    render_summary(doc)
    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv))
"""