# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
CyberSOCEnv — Enterprise Cybersecurity Operations Center Environment.

Implements the OpenEnv Environment interface for a deterministic SOC
incident response simulation on a 500-node enterprise network.

The agent receives SIEM/EDR alerts, queries hosts, runs forensics,
isolates segments, blocks IOCs, kills processes, and submits a
containment plan — all while minimizing business downtime.
"""

from __future__ import annotations

import copy
import random
import uuid
from typing import Any, Callable, Dict, List, Optional
from uuid import uuid4

from openenv.core.env_server.interfaces import Environment
from openenv.core.env_server.types import State

try:
    from ..models import (
        SOCObservation,
        SOCActionWrapper,
        SOCState,
        Alert,
        NetworkTopology,
        ForensicsResult,
        TimelineEntry,
        QueryHost,
        IsolateSegment,
        BlockIOC,
        RunForensics,
        KillProcess,
        SubmitContainmentPlan,
        CorrelateAlerts,
        EnrichIOC,
        ScanHostVulnerabilities,
        TerminatePID,
        CreateFirewallRule,
        QuarantineFile,
        RedActionWrapper,
        LateralPivot,
        DeployPayload,
        EvadeDetection,
        PassTurn,
        RED_ACTION_TYPES,
    )
except ImportError:
    from models import (
        SOCObservation,
        SOCActionWrapper,
        SOCState,
        Alert,
        NetworkTopology,
        ForensicsResult,
        TimelineEntry,
        QueryHost,
        IsolateSegment,
        BlockIOC,
        RunForensics,
        KillProcess,
        SubmitContainmentPlan,
        CorrelateAlerts,
        EnrichIOC,
        ScanHostVulnerabilities,
        TerminatePID,
        CreateFirewallRule,
        QuarantineFile,
        RedActionWrapper,
        LateralPivot,
        DeployPayload,
        EvadeDetection,
        PassTurn,
        RED_ACTION_TYPES,
    )

from .tasks import get_task, build_network
from .graders import grade_episode
from .threat_graph import (
    ThreatGraph,
    HostNode,
    ProcessNode,
    IOCNode,
    VulnerabilityNode,
    AlertNode,
    Edge,
)
class ActionMiddleware:
    """Pre-flight validation for SOC actions.

    Detects phase violations (action out of order) and graph-ungrounded actions
    (action references an entity not yet discovered in the ThreatGraph).
    Returns None if the action is valid, or an error dict otherwise.
    """

    def validate(
        self,
        current_phase: str,
        action_type: str,
        args: Dict[str, Any],
        graph,
    ) -> Optional[Dict[str, str]]:
        # Phase violation: plan submission before any investigation
        if action_type == "submit_containment_plan" and current_phase == "triage":
            return {
                "error_type": "PHASE_VIOLATION",
                "message": "submit_containment_plan requires investigation phase first",
            }

        # Graph-groundedness: IOC must be discovered before enrichment
        if action_type == "enrich_ioc":
            ioc_val = args.get("ioc_value", "")
            if ioc_val and graph is not None and ioc_val not in graph.iocs:
                return {
                    "error_type": "GRAPH_FAILURE",
                    "message": f"IOC '{ioc_val}' not in threat graph; receive an alert or run forensics first",
                }

        # Graph-groundedness: host must be known before vulnerability scan
        if action_type == "scan_host_vulnerabilities":
            hostname = args.get("hostname", "")
            if hostname and graph is not None and hostname not in graph.hosts:
                return {
                    "error_type": "GRAPH_FAILURE",
                    "message": f"Host '{hostname}' not in threat graph; run query_host first",
                }

        # Emergency isolation gate: allow early isolate_segment only when a critical
        # alert proves an active threat on the targeted subnet/host; otherwise penalise
        # the panic as UNJUSTIFIED_EMERGENCY.
        if action_type == "isolate_segment" and current_phase == "triage":
            subnet = args.get("subnet", "")
            target_host = args.get("target_host", "")
            has_critical = False
            if graph is not None:
                for alert in graph.alerts.values():
                    if alert.severity != "critical":
                        continue
                    src = alert.source_host
                    if target_host and src == target_host:
                        has_critical = True
                        break
                    if subnet and src in graph.hosts:
                        host_node = graph.hosts.get(src)
                        if host_node and getattr(host_node, "subnet", "") == subnet:
                            has_critical = True
                            break
            if not has_critical:
                return {
                    "error_type": "UNJUSTIFIED_EMERGENCY",
                    "message": (
                        "isolate_segment during triage requires a critical-severity alert "
                        "on the targeted subnet/host to justify emergency response"
                    ),
                }

        return None


class CyberSOCEnvironment(Environment):
    """
    Deterministic SOC incident response environment.

    Simulates a 500-node enterprise network under attack. The agent must
    investigate alerts, contain threats, and submit a containment plan
    while minimizing business downtime.

    Supports concurrent WebSocket sessions (each gets own instance).

    Example:
        >>> env = CyberSOCEnvironment()
        >>> obs = env.reset(task_id="easy")
        >>> print(len(obs.alert_queue))  # Initial alerts
        >>> obs = env.step(SOCActionWrapper(type="query_host", hostname="WS-042"))
    """

    SUPPORTS_CONCURRENT_SESSIONS: bool = True

    def __init__(
        self,
        adaptive: bool = False,
        neural_red_policy: Optional[Any] = None,
        red_team_logger: Optional[Callable[[Dict[str, Any]], None]] = None,
        fsp_mode: bool = False,
    ):
        """Initialize the environment (actual state set in reset).

        Args:
            adaptive: Legacy adaptive-adversary flag (kept for backward compat).
            neural_red_policy: Optional callable for neural Red policy (legacy hook).
            red_team_logger: Optional callback for recording Red decisions.
            fsp_mode: When True, step() uses strict alternating turns and
                step_count only increments after BOTH Blue and Red have acted.
                When False (default), step(SOCActionWrapper) behaves exactly as
                before — Red's PassTurn is applied automatically so existing code
                and tests remain unaffected.
        """
        super().__init__()
        self._adaptive = adaptive
        self._neural_red_policy = neural_red_policy
        self._red_team_logger = red_team_logger
        self._fsp_mode = fsp_mode
        self._red_team_decisions: List[Dict[str, Any]] = []
        self._live_requirements: Dict[str, Any] = {}
        self._threat_graph = None  # will be initialized on reset()
        self._state = SOCState(episode_id=str(uuid4()), step_count=0)
        self._network: Dict[str, List[Dict[str, Any]]] = {}
        self._task_def: Dict[str, Any] = {}
        self._alert_queue: List[Dict[str, Any]] = []
        self._host_index: Dict[str, Dict[str, Any]] = {}  # hostname -> host dict
        self._plan_entries: List[Dict[str, Any]] = []
        self._last_forensics: Optional[ForensicsResult] = None
        self._middleware = ActionMiddleware()
        self._rng = random.Random(0)  # overwritten in reset()
        self._pending_followup: Dict[str, bool] = {}  # hostname -> responded_to
        self._disruption_cost: float = 0.0  # accumulates per clean host/subnet isolated
        self._discovered_iocs: set = set()  # IOCs revealed via run_forensics or enrich_ioc
        self._quarantined_files: set[tuple[str, str]] = set()
        self._step_reward_total: float = 0.0

    def _reset_rubric(self):
        """Initialize live containment requirements for dynamic grading in adaptive mode."""
        import copy
        self._live_requirements = copy.deepcopy(
            self._task_def.get("containment_requirements", {})
        )

    # ===========================================================================
    # reset()
    # ===========================================================================

    def reset(
        self,
        seed: Optional[int] = None,
        episode_id: Optional[str] = None,
        **kwargs: Any,
    ) -> SOCObservation:
        """Reset the environment for a specific task.

        Args:
            seed: Ignored (environment is fully deterministic).
            episode_id: Optional custom episode ID.
            **kwargs: Must include task_id ('easy', 'medium', or 'hard').

        Returns:
            Initial SOCObservation with alert queue and network state.
        """
        task_id = kwargs.get("task_id", "easy")
        self._rng = random.Random(hash(task_id))
        self._task_def = get_task(task_id)
        self._recent_actions = []  # reset stall detector

        # Build deterministic network (cached per task for GRPO throughput)
        if not hasattr(CyberSOCEnvironment, "_network_cache"):
            CyberSOCEnvironment._network_cache = {}
        cache_key = task_id
        if cache_key in CyberSOCEnvironment._network_cache:
            self._network = copy.deepcopy(CyberSOCEnvironment._network_cache[cache_key])
        else:
            self._network = build_network()
            CyberSOCEnvironment._network_cache[cache_key] = copy.deepcopy(self._network)

        # Build hostname index for O(1) lookups
        self._host_index = {}
        for subnet_name, hosts in self._network.items():
            for host in hosts:
                self._host_index[host["hostname"]] = host

        # Inject attack chain: mark compromised hosts, add malicious processes
        for threat in self._task_def["attack_chain"]:
            for hostname in threat["compromised_hosts"]:
                if hostname in self._host_index:
                    host = self._host_index[hostname]
                    host["status"] = "compromised"
                    for proc in threat["malicious_processes"]:
                        if proc not in host["running_processes"]:
                            host["running_processes"].append(proc)

        # Initialize alert queue (deep copy so mutations don't affect task def)
        self._alert_queue = copy.deepcopy(self._task_def["initial_alerts"])

        # Reset state
        eid = episode_id or str(uuid4())
        self._state = SOCState(
            episode_id=eid,
            step_count=0,
            task_id=task_id,
            max_steps=self._task_def["max_steps"],
            total_reward=0.0,
            business_impact=self._task_def["initial_business_impact"],
            contained_threats=[],
            active_threats=[t["threat_id"] for t in self._task_def["attack_chain"]],
            blocked_iocs=[],
            isolated_subnets=[],
            forensics_run=[],
            killed_processes=[],
            queried_hosts=[],
            timeline=[],
            is_done=False,
            submitted_plan=False,
            active_turn="blue",
        )

        self._plan_entries = []
        self._last_forensics = None
        self._reset_rubric()
        self._fired_step_rewards: set = set()
        self._step_reward_total: float = 0.0
        self._pending_followup: Dict[str, bool] = {}
        self._disruption_cost = 0.0
        self._discovered_iocs: set = set()
        self._quarantined_files: set[tuple[str, str]] = set()
        self._red_team_decisions = []

        # Initialize threat graph from task definition
        self._threat_graph = ThreatGraph()
        self._populate_threat_graph()

        # Inject external threat-intel feed IOCs so Blue can immediately enrich/block them
        # without hitting GRAPH_FAILURE (simulates acting on CISA or partner feed data).
        for ioc_entry in self._task_def.get("external_intel_feed", []) or []:
            if isinstance(ioc_entry, str):
                ioc_value = ioc_entry
                parts = ioc_entry.split(".")
                if len(parts) == 4 and all(p.isdigit() for p in parts):
                    ioc_type = "ip"
                elif len(ioc_entry) >= 32 and "." not in ioc_entry:
                    ioc_type = "hash"
                else:
                    ioc_type = "domain"
            elif isinstance(ioc_entry, dict):
                ioc_value = ioc_entry.get("value", "")
                ioc_type = ioc_entry.get("type", "ip")
            else:
                continue
            if not ioc_value:
                continue
            if ioc_value not in self._threat_graph.iocs:
                self._threat_graph.add_ioc(
                    IOCNode(ioc_value=ioc_value, ioc_type=ioc_type, confidence=0.70)
                )
            self._discovered_iocs.add(ioc_value)

        self._last_obs_extras: Dict[str, Any] = {}

        return self._build_observation(reward=0.0, done=False)

    def _populate_threat_graph(self) -> None:
        """Seed the threat graph with hosts, processes, IOCs, and alerts from task_def."""
        graph = self._threat_graph

        # Hosts: include compromised hosts from attack chain + every host they live on
        compromised_set: set[str] = set()
        for threat in self._task_def.get("attack_chain", []):
            for hn in threat.get("compromised_hosts", []):
                compromised_set.add(hn)

        for hostname in compromised_set:
            host_dict = self._host_index.get(hostname)
            if host_dict is None:
                continue
            graph.add_host(HostNode(
                hostname=hostname,
                subnet=host_dict.get("subnet", "corporate"),
                business_criticality="high" if host_dict.get("criticality", 0.5) >= 0.7 else "medium",
                status="compromised",
            ))

        # Processes: malicious processes per compromised host
        for threat in self._task_def.get("attack_chain", []):
            tid = threat.get("threat_id", "T?")
            for hostname in threat.get("compromised_hosts", []):
                if hostname not in graph.hosts:
                    continue
                for proc in threat.get("malicious_processes", []):
                    pid = f"{hostname}:{proc}"
                    if pid not in graph.processes:
                        graph.add_process(ProcessNode(
                            process_id=pid,
                            hostname=hostname,
                            process_name=proc,
                        ))
                # Add part_of_chain edge
                graph.add_edge(Edge(
                    edge_type="part_of_chain",
                    source_id=tid,
                    target_id=hostname,
                ))

        # IOCs from attack chain
        for threat in self._task_def.get("attack_chain", []):
            iocs = threat.get("iocs", {}) or {}
            for ioc_value in iocs.get("hashes", []):
                if ioc_value not in graph.iocs:
                    graph.add_ioc(IOCNode(ioc_value=ioc_value, ioc_type="hash", confidence=0.85))
            for ioc_value in iocs.get("ips", []):
                if ioc_value not in graph.iocs:
                    graph.add_ioc(IOCNode(ioc_value=ioc_value, ioc_type="ip", confidence=0.85))
            for ioc_value in iocs.get("domains", []):
                if ioc_value not in graph.iocs:
                    graph.add_ioc(IOCNode(ioc_value=ioc_value, ioc_type="domain", confidence=0.85))
            for c2 in threat.get("c2_servers", []):
                if c2 not in graph.iocs:
                    graph.add_ioc(IOCNode(ioc_value=c2, ioc_type="ip", confidence=0.95))

        # Alerts
        for a in self._task_def.get("initial_alerts", []):
            aid = a.get("alert_id")
            if aid and aid not in graph.alerts:
                graph.add_alert(AlertNode(
                    alert_id=aid,
                    severity=a.get("severity", "medium"),
                    priority_score=1.0,
                    source_host=a.get("source_host", ""),
                ))

    # ===========================================================================
    # step()
    # ===========================================================================

    def step(
        self,
        action,  # SOCActionWrapper | RedActionWrapper
        timeout_s: Optional[float] = None,
        **kwargs: Any,
    ) -> SOCObservation:
        """Process one agent action — Blue (SOCActionWrapper) or Red (RedActionWrapper).

        Turn semantics (fsp_mode=True):
          • Blue step: execute, flip active_turn → 'red', do NOT increment step_count.
          • Red step:  execute, flip active_turn → 'blue', increment step_count.

        When fsp_mode=False (default / backward-compat):
          • Blue step auto-applies a Red PassTurn so step_count always increments,
            preserving all existing test and dashboard behaviour.

        Returns:
            SOCObservation; includes active_turn and red_observation fields.
        """
        if self._state.is_done:
            return self._build_observation(reward=0.0, done=True)

        if isinstance(action, RedActionWrapper):
            return self._step_red(action)
        return self._step_blue(action)

    # ------------------------------------------------------------------
    # _step_blue — execute a Blue (SOC analyst) action
    # ------------------------------------------------------------------

    def _step_blue(
        self,
        action: SOCActionWrapper,
    ) -> SOCObservation:
        """Execute one Blue turn."""
        # Convert wrapper to typed action — gracefully handle hallucinated
        # action types or wrong parameters from the LLM instead of crashing.
        try:
            typed_action = action.to_typed_action()
        except Exception as exc:
            # Return a negative reward signal so GRPO can learn from the mistake
            penalty = -0.2
            self._state.total_reward += penalty
            self._state.timeline.append({
                "step": self._state.step_count + 1,
                "action_type": getattr(action, "type", "unknown"),
                "target": "N/A",
                "result": f"INVALID_ACTION: {exc}",
                "reward": penalty,
            })
            self._state.step_count += 1
            return self._build_observation(reward=penalty, done=False)

        args = typed_action.model_dump(exclude={"metadata", "type"})

        # Pre-flight validation — penalise without consuming a step
        current_phase = self._get_current_phase()
        validation_error = self._middleware.validate(
            current_phase, typed_action.type, args, self._threat_graph
        )
        if validation_error:
            error_type = validation_error.get("error_type", "")
            if error_type == "PHASE_VIOLATION":
                penalty = -0.10
            elif error_type == "UNJUSTIFIED_EMERGENCY":
                penalty = -0.15
            else:
                penalty = -0.05
            self._state.total_reward += penalty
            return self._build_observation(reward=penalty, done=False)

        # Reset per-step extras
        self._last_obs_extras = {}

        # Dispatch to Blue handler
        reward = 0.0
        result_description = "unknown action"

        if isinstance(typed_action, QueryHost):
            reward, result_description = self._handle_query_host(typed_action)
        elif isinstance(typed_action, IsolateSegment):
            reward, result_description = self._handle_isolate_segment(typed_action)
        elif isinstance(typed_action, BlockIOC):
            reward, result_description = self._handle_block_ioc(typed_action)
        elif isinstance(typed_action, RunForensics):
            reward, result_description = self._handle_run_forensics(typed_action)
        elif isinstance(typed_action, KillProcess):
            reward, result_description = self._handle_kill_process(typed_action)
        elif isinstance(typed_action, SubmitContainmentPlan):
            reward, result_description = self._handle_submit_plan(typed_action)
        elif isinstance(typed_action, CorrelateAlerts):
            result = self._handle_correlate_alerts(typed_action)
            self._last_obs_extras.update(result)
            reward = 0.05 if "error" not in result else -0.05
            result_description = result.get("description", "correlate_alerts")
        elif isinstance(typed_action, EnrichIOC):
            result = self._handle_enrich_ioc(typed_action)
            self._last_obs_extras.update(result)
            reward = 0.05 if "error" not in result else -0.05
            result_description = result.get("description", "enrich_ioc")
        elif isinstance(typed_action, ScanHostVulnerabilities):
            result = self._handle_scan_vulnerabilities(typed_action)
            self._last_obs_extras.update(result)
            reward = 0.05 if "error" not in result else -0.05
            result_description = result.get("description", "scan_host_vulnerabilities")
        elif isinstance(typed_action, TerminatePID):
            reward, result_description = self._handle_terminate_pid(typed_action)
        elif isinstance(typed_action, CreateFirewallRule):
            reward, result_description = self._handle_create_firewall_rule(typed_action)
        elif isinstance(typed_action, QuarantineFile):
            reward, result_description = self._handle_quarantine_file(typed_action)

        # Idempotent step reward
        target = self._get_action_target(typed_action)
        step_r = self._get_step_reward(
            phase="investigation", action_type=typed_action.type, target=target
        )
        reward += step_r
        self._step_reward_total += step_r

        # Stall detection: penalise 3+ consecutive identical actions
        stall_key = (typed_action.type, target)
        if not hasattr(self, "_recent_actions"):
            self._recent_actions = []
        self._recent_actions.append(stall_key)
        if len(self._recent_actions) >= 3:
            last_three = self._recent_actions[-3:]
            if last_three[0] == last_three[1] == last_three[2]:
                reward -= 0.05

        # Business impact grows each step (attacker progresses)
        if not self._state.is_done:
            impact_rate = self._task_def.get("impact_per_step", 0.02)
            active_ratio = len(self._state.active_threats) / max(
                1, len(self._task_def.get("attack_chain", []))
            )
            self._state.business_impact = min(
                1.0, self._state.business_impact + impact_rate * active_ratio
            )

        # Round label: step_count+1 = current round being played (not yet closed)
        round_label = self._state.step_count + 1

        # Record timeline
        self._state.timeline.append({
            "step": round_label,
            "action_type": typed_action.type,
            "target": target,
            "result": result_description,
            "reward": reward,
        })

        # Accumulate reward
        self._state.total_reward += reward

        # Check if episode ends due to Blue action (plan submission)
        done = False
        if self._state.submitted_plan:
            done = True
            self._state.is_done = True
            self._state.active_turn = "blue"  # episode over — keep at blue
            # In non-FSP mode, still increment step_count for consistency
            if not self._fsp_mode:
                self._state.step_count += 1
            return self._build_observation(reward=reward, done=done)

        # Flip turn to Red
        self._state.active_turn = "red"

        # fsp_mode=False (backward compat): auto-apply Red PassTurn so
        # callers that only drive Blue see step_count increment as before.
        if not self._fsp_mode:
            # Embedded Red dynamics: execute neural or deterministic policy.
            # Only fires when a policy is wired (training) or adaptive=True (SFT).
            if self._neural_red_policy is not None or self._adaptive:
                self._apply_red_team_dynamics(typed_action.type, target)
            self._state.step_count += 1
            self._state.active_turn = "blue"
            # Timeout check (done after Red's "auto turn")
            if self._state.step_count >= self._state.max_steps:
                reward -= 0.20
                self._state.total_reward -= 0.20
                self._state.is_done = True
                done = True

        return self._build_observation(reward=reward, done=done)

    # ------------------------------------------------------------------
    # _step_red — execute a Red Team action
    # ------------------------------------------------------------------

    def _step_red(self, action: RedActionWrapper) -> SOCObservation:
        """Execute one Red turn. Only valid when active_turn == 'red'."""
        if self._state.active_turn != "red":
            # Wrong turn — return current obs with 0 reward (no state change)
            return self._build_observation(reward=0.0, done=False)

        typed_action = action.to_typed_action()
        self._last_obs_extras = {}

        reward = 0.0
        result_description = "red: noop"

        if isinstance(typed_action, LateralPivot):
            reward, result_description = self._handle_lateral_pivot(typed_action)
        elif isinstance(typed_action, DeployPayload):
            reward, result_description = self._handle_deploy_payload(typed_action)
        elif isinstance(typed_action, EvadeDetection):
            reward, result_description = self._handle_evade_detection(typed_action)
        elif isinstance(typed_action, PassTurn):
            reward, result_description = self._handle_pass_turn(typed_action)

        # Close the round: increment step_count, flip turn back to Blue
        self._state.step_count += 1
        self._state.active_turn = "blue"

        # Record Red's action in timeline (prefixed with "red:" to distinguish)
        self._state.timeline.append({
            "step": self._state.step_count,
            "action_type": f"red:{typed_action.type}",
            "target": self._get_red_action_target(typed_action),
            "result": result_description,
            "reward": 0.0,  # Red actions don't add to Blue's reward total
        })

        # Timeout check after the full round
        done = False
        if self._state.step_count >= self._state.max_steps:
            done = True
            self._state.is_done = True

        return self._build_observation(reward=reward, done=done)

    # ===========================================================================
    # Action Handlers (return (reward, description))
    # ===========================================================================

    def _handle_query_host(self, action: QueryHost) -> tuple[float, str]:
        """Query a host for status info."""
        hostname = action.hostname
        self._last_forensics = None  # Clear forensics from previous step

        if hostname not in self._host_index:
            return -0.05, f"Host '{hostname}' not found in network"

        host = self._host_index[hostname]

        # Reward for querying compromised hosts (useful investigation)
        reward = 0.0
        if host["status"] == "compromised" and hostname not in self._state.queried_hosts:
            reward = 0.05  # Good: investigating a compromised host
        elif hostname in self._state.queried_hosts:
            reward = -0.02  # Penalty: re-querying same host wastes time

        self._state.queried_hosts.append(hostname)

        # Enhanced observation extras: process_tree + network_connections from graph
        process_tree = []
        if self._threat_graph is not None:
            for p in self._threat_graph.processes.values():
                if p.hostname == hostname:
                    process_tree.append({
                        "process_id": p.process_id,
                        "process_name": p.process_name,
                        "killed": p.killed,
                    })
        network_connections = []
        if self._threat_graph is not None:
            for e in self._threat_graph.edges:
                if e.edge_type == "communicates_with" and (
                    e.source_id == hostname or e.target_id == hostname
                ):
                    other = e.target_id if e.source_id == hostname else e.source_id
                    if other in self._threat_graph.iocs:
                        network_connections.append(other)
        self._last_obs_extras["process_tree"] = process_tree
        self._last_obs_extras["network_connections"] = network_connections

        return reward, f"Queried {hostname}: status={host['status']}, procs={len(host['running_processes'])}"

    def _handle_isolate_segment(self, action: IsolateSegment) -> tuple[float, str]:
        """Isolate a network segment, or a single host if target_host is set."""
        self._last_forensics = None

        # Single-host isolation path
        target_host = getattr(action, "target_host", None)
        if target_host:
            if target_host not in self._host_index:
                return -0.05, f"Host '{target_host}' not found"
            self._host_index[target_host]["status"] = "isolated"
            if self._threat_graph is not None and target_host in self._threat_graph.hosts:
                self._threat_graph.hosts[target_host].status = "isolated"
            if target_host in self._pending_followup:
                self._pending_followup[target_host] = True
            # Penalise isolating a clean (non-compromised) host — business disruption
            compromised_host_set = {
                h for threat in self._task_def["attack_chain"]
                for h in threat.get("compromised_hosts", [])
            }
            if target_host not in compromised_host_set:
                self._disruption_cost += 0.35
                self._state.business_impact = min(1.0, self._state.business_impact + 0.10)
                return -0.35, (
                    f"Isolated clean host '{target_host}' — unjustified business disruption "
                    f"(cumulative cost={self._disruption_cost:.2f})"
                )
            return 0.10, f"Isolated single host '{target_host}'"

        subnet = action.subnet

        if subnet not in self._network:
            return -0.05, f"Subnet '{subnet}' does not exist"

        if subnet in self._state.isolated_subnets:
            return -0.02, f"Subnet '{subnet}' is already isolated"

        # Build compromised host set for disruption tracking
        compromised_host_set = {
            h for threat in self._task_def["attack_chain"]
            for h in threat.get("compromised_hosts", [])
        }

        # Isolate all hosts in the subnet; count clean hosts for disruption cost
        clean_isolated_count = 0
        for host in self._network[subnet]:
            host["status"] = "isolated"
            if self._threat_graph is not None and host["hostname"] in self._threat_graph.hosts:
                self._threat_graph.hosts[host["hostname"]].status = "isolated"
            if host["hostname"] in self._pending_followup:
                self._pending_followup[host["hostname"]] = True
            if host["hostname"] not in compromised_host_set:
                clean_isolated_count += 1

        self._state.isolated_subnets.append(subnet)

        # Accumulate disruption cost for each clean host swept up in the isolation
        if clean_isolated_count > 0:
            self._disruption_cost += 0.25 * clean_isolated_count
            self._state.business_impact = min(
                1.0, self._state.business_impact + 0.05 * clean_isolated_count
            )

        # Check if this contains any active threats
        reward = 0.0
        threats_contained = []
        for threat in self._task_def["attack_chain"]:
            if threat["threat_id"] in self._state.active_threats:
                # Check if any compromised hosts are in this subnet
                for ch in threat["compromised_hosts"]:
                    if ch in self._host_index and self._host_index[ch]["subnet"] == subnet:
                        threats_contained.append(threat["threat_id"])
                        break

        if threats_contained:
            # Reduced reward — isolation is a blunt instrument; prefer kill_process / block_ioc
            reward = 0.07 * len(threats_contained)
            for tid in threats_contained:
                if tid not in self._state.contained_threats:
                    self._state.contained_threats.append(tid)
                if tid in self._state.active_threats:
                    self._state.active_threats.remove(tid)

        # Heavy per-clean-host penalty to deter blunt-force isolation spam
        if clean_isolated_count > 0:
            reward -= 0.25 * clean_isolated_count

        # Additional penalty for explicitly prohibited isolation
        must_not_isolate = self._task_def["containment_requirements"].get("must_not_isolate", [])
        if subnet in must_not_isolate:
            reward -= 0.10
            self._state.business_impact = min(1.0, self._state.business_impact + 0.08)

        return reward, (
            f"Isolated subnet '{subnet}'. Threats contained: {threats_contained}. "
            f"Clean hosts disrupted: {clean_isolated_count} "
            f"(cumulative cost={self._disruption_cost:.2f})"
        )

    def _handle_block_ioc(self, action: BlockIOC) -> tuple[float, str]:
        """Block an IOC at the perimeter.

        Requires prior discovery via run_forensics or enrich_ioc; blind blocks
        are recorded but yield 0 reward to prevent reward hacking.
        """
        ioc = action.ioc_value
        self._last_forensics = None

        if ioc in self._state.blocked_iocs:
            return -0.02, f"IOC '{ioc}' is already blocked"

        # Prerequisite gate: IOC must have been discovered via run_forensics or enrich_ioc
        if ioc not in self._discovered_iocs:
            self._state.blocked_iocs.append(ioc)  # record the block, but no reward
            return 0.0, (
                f"IOC '{ioc}' blocked without prior investigation — 0 reward "
                "(run_forensics or enrich_ioc required to unlock reward)"
            )

        self._state.blocked_iocs.append(ioc)

        # Mark forensics-confirmed hosts as responded-to — only valid for discovered IOCs,
        # ensuring _pending_followup accurately reflects investigated-then-actioned flow
        for hostname, responded in list(self._pending_followup.items()):
            if responded:
                continue
            for threat in self._task_def["attack_chain"]:
                if hostname in threat["compromised_hosts"]:
                    all_threat_iocs = (
                        threat["iocs"].get("hashes", [])
                        + threat["iocs"].get("ips", [])
                        + threat["iocs"].get("domains", [])
                        + threat.get("c2_servers", [])
                    )
                    if ioc in all_threat_iocs:
                        self._pending_followup[hostname] = True
                        break

        # Boosted rewards: surgical strikes are heavily preferred over blunt isolation
        reward = 0.0
        relevant = False
        for threat in self._task_def["attack_chain"]:
            all_iocs = (
                threat["iocs"].get("hashes", [])
                + threat["iocs"].get("ips", [])
                + threat["iocs"].get("domains", [])
            )
            if ioc in all_iocs:
                relevant = True
                if ioc in threat.get("c2_servers", []):
                    reward += 0.30  # High value: severing C2 command channel
                else:
                    reward += 0.20  # Good: blocking an investigated IOC
                break

        if not relevant:
            reward = -0.03  # Noise: blocking irrelevant IOC

        return reward, f"Blocked IOC '{ioc}' (type={action.ioc_type}). Relevant: {relevant}"

    def _handle_run_forensics(self, action: RunForensics) -> tuple[float, str]:
        """Run forensic analysis on a host."""
        hostname = action.hostname

        if hostname not in self._host_index:
            self._last_forensics = None
            return -0.05, f"Host '{hostname}' not found"

        host = self._host_index[hostname]

        # Build forensics result based on actual host state
        is_compromised = host["status"] == "compromised"
        malicious_procs = []
        suspicious_files = []
        network_conns = []
        registry_mods = []
        memory_artifacts = []

        if is_compromised:
            # Find which threat(s) affect this host
            for threat in self._task_def["attack_chain"]:
                if hostname in threat["compromised_hosts"]:
                    malicious_procs.extend(threat["malicious_processes"])
                    # Generate deterministic forensic artifacts
                    for proc in threat["malicious_processes"]:
                        suspicious_files.append(f"C:\\Windows\\Temp\\{proc}.dat")
                        registry_mods.append(f"HKLM\\Software\\Microsoft\\Windows\\CurrentVersion\\Run\\{proc}")
                    for c2 in threat.get("c2_servers", []):
                        network_conns.append(f"{c2}:443")
                    for ioc_hash in threat["iocs"].get("hashes", []):
                        memory_artifacts.append(f"memory_inject_{ioc_hash[:8]}")

        self._last_forensics = ForensicsResult(
            hostname=hostname,
            malicious_processes=malicious_procs,
            suspicious_files=suspicious_files,
            network_connections=network_conns,
            registry_modifications=registry_mods,
            memory_artifacts=memory_artifacts,
            is_compromised=is_compromised,
        )

        # Reward
        reward = 0.0
        if hostname not in self._state.forensics_run:
            if is_compromised:
                reward = 0.10  # Good: found evidence
                self._pending_followup.setdefault(hostname, False)  # needs response action
                # Reveal all IOCs tied to this host's threat chain so block_ioc can earn reward
                for threat in self._task_def["attack_chain"]:
                    if hostname in threat.get("compromised_hosts", []):
                        for ioc in (
                            threat["iocs"].get("hashes", [])
                            + threat["iocs"].get("ips", [])
                            + threat["iocs"].get("domains", [])
                            + threat.get("c2_servers", [])
                        ):
                            self._discovered_iocs.add(ioc)
            else:
                reward = 0.02  # Cleared a host (some value)
            self._state.forensics_run.append(hostname)
        else:
            reward = -0.02  # Re-running forensics wastes time

        # Enhanced: behavioral_chain and network_flows from graph
        behavioral_chain = []
        network_flows = []
        if self._threat_graph is not None:
            for e in self._threat_graph.edges:
                if e.source_id == hostname or e.target_id == hostname:
                    behavioral_chain.append({
                        "edge_type": e.edge_type,
                        "source_id": e.source_id,
                        "target_id": e.target_id,
                    })
            for e in self._threat_graph.edges:
                if e.edge_type == "communicates_with":
                    if e.source_id == hostname or e.target_id == hostname:
                        other = e.target_id if e.source_id == hostname else e.source_id
                        if other in self._threat_graph.iocs:
                            network_flows.append(other)
        self._last_obs_extras["behavioral_chain"] = behavioral_chain
        self._last_obs_extras["network_flows"] = network_flows

        return reward, f"Forensics on {hostname}: compromised={is_compromised}, procs={malicious_procs}"

    def _handle_kill_process(self, action: KillProcess) -> tuple[float, str]:
        """Kill a process on a host."""
        hostname = action.hostname
        process = action.process_name
        self._last_forensics = None

        if hostname not in self._host_index:
            return -0.05, f"Host '{hostname}' not found"

        host = self._host_index[hostname]

        if host["status"] == "isolated":
            return -0.02, f"Host '{hostname}' is isolated — cannot interact"

        if process not in host["running_processes"]:
            return -0.03, f"Process '{process}' not running on {hostname}"

        # Kill the process
        host["running_processes"].remove(process)
        self._state.killed_processes.append({"hostname": hostname, "process": process})
        if hostname in self._pending_followup:
            self._pending_followup[hostname] = True

        # Check if this was a malicious process
        reward = 0.0
        was_malicious = False
        for threat in self._task_def["attack_chain"]:
            if hostname in threat["compromised_hosts"] and process in threat["malicious_processes"]:
                was_malicious = True
                reward = 0.25  # Surgical strike: high reward for targeted process kill

                # Check if all processes for this threat are killed
                all_killed = True
                for th_host in threat["compromised_hosts"]:
                    for th_proc in threat["malicious_processes"]:
                        still_running = (
                            th_host in self._host_index
                            and th_proc in self._host_index[th_host]["running_processes"]
                        )
                        if still_running:
                            all_killed = False
                            break

                if all_killed and threat["threat_id"] in self._state.active_threats:
                    self._state.active_threats.remove(threat["threat_id"])
                    if threat["threat_id"] not in self._state.contained_threats:
                        self._state.contained_threats.append(threat["threat_id"])
                    reward += 0.15  # Bonus: fully contained a threat via surgical action
                break

        if not was_malicious:
            reward = -0.08  # Penalty: killing legitimate process = downtime
            self._state.business_impact = min(1.0, self._state.business_impact + 0.03)

        return reward, f"Killed '{process}' on {hostname}. Malicious: {was_malicious}"

    def _handle_terminate_pid(self, action: TerminatePID) -> tuple[float, str]:
        """Terminate a process by PID. PID is mapped to process name in this simulation."""
        hostname = action.hostname
        pid = action.pid
        self._last_forensics = None

        if hostname not in self._host_index:
            return -0.05, f"Host '{hostname}' not found"

        host = self._host_index[hostname]
        if host["status"] == "isolated":
            return -0.02, f"Host '{hostname}' is isolated - cannot interact"

        process_name = pid
        if ":" in pid:
            pid_host, _, pid_proc = pid.partition(":")
            if pid_host == hostname and pid_proc:
                process_name = pid_proc

        if process_name not in host["running_processes"]:
            return -0.03, f"PID '{pid}' is not running on {hostname}"

        host["running_processes"].remove(process_name)
        self._state.killed_processes.append({"hostname": hostname, "process": process_name, "pid": pid})
        if hostname in self._pending_followup:
            self._pending_followup[hostname] = True

        was_malicious = False
        reward = 0.0
        for threat in self._task_def["attack_chain"]:
            if hostname in threat["compromised_hosts"] and process_name in threat["malicious_processes"]:
                was_malicious = True
                reward = 0.24
                all_killed = True
                for th_host in threat["compromised_hosts"]:
                    for th_proc in threat["malicious_processes"]:
                        if th_host in self._host_index and th_proc in self._host_index[th_host]["running_processes"]:
                            all_killed = False
                            break
                if all_killed and threat["threat_id"] in self._state.active_threats:
                    self._state.active_threats.remove(threat["threat_id"])
                    if threat["threat_id"] not in self._state.contained_threats:
                        self._state.contained_threats.append(threat["threat_id"])
                    reward += 0.12
                break

        if not was_malicious:
            reward = -0.10
            self._state.business_impact = min(1.0, self._state.business_impact + 0.04)
            return reward, f"Terminated benign PID '{pid}' on {hostname} - business disruption"

        return reward, f"Terminated PID '{pid}' on {hostname}. Malicious: True"

    def _handle_create_firewall_rule(self, action: CreateFirewallRule) -> tuple[float, str]:
        """Create firewall rule; drop blocks target IP as IOC, allow is neutral."""
        hostname = action.hostname
        target_ip = action.target_ip

        if hostname not in self._host_index:
            return -0.05, f"Host '{hostname}' not found"

        if action.action == "drop":
            if target_ip in self._state.blocked_iocs:
                return -0.01, f"Firewall drop rule already exists for {target_ip}"
            self._state.blocked_iocs.append(target_ip)
            return 0.08, f"Created firewall DROP rule on {hostname} for {target_ip}"

        return 0.0, f"Created firewall ALLOW rule on {hostname} for {target_ip}"

    def _handle_quarantine_file(self, action: QuarantineFile) -> tuple[float, str]:
        """Quarantine suspicious files; requires terminating associated malicious PID first."""
        hostname = action.hostname
        file_path = action.file_path

        if hostname not in self._host_index:
            return -0.05, f"Host '{hostname}' not found"

        file_key = (hostname, file_path)
        if file_key in self._quarantined_files:
            return -0.01, f"File '{file_path}' already quarantined on {hostname}"

        associated_processes: List[str] = []
        lowered = file_path.lower()
        for threat in self._task_def.get("attack_chain", []):
            if hostname not in threat.get("compromised_hosts", []):
                continue
            for proc in threat.get("malicious_processes", []):
                expected_suffix = f"\\{proc}.dat".lower()
                if lowered.endswith(expected_suffix):
                    associated_processes.append(proc)

        if not associated_processes:
            self._quarantined_files.add(file_key)
            return -0.02, f"Quarantined untracked file '{file_path}' on {hostname}"

        host = self._host_index[hostname]
        locked = any(proc in host["running_processes"] for proc in associated_processes)
        if locked:
            self._state.business_impact = min(1.0, self._state.business_impact + 0.01)
            return -0.04, (
                f"Quarantine failed: file '{file_path}' is locked. "
                "Terminate associated PID first."
            )

        self._quarantined_files.add(file_key)
        return 0.10, f"Quarantined file '{file_path}' on {hostname}"

    def _handle_submit_plan(self, action: SubmitContainmentPlan) -> tuple[float, str]:
        """Submit the final containment plan."""
        self._last_forensics = None
        self._state.submitted_plan = True
        self._plan_entries = [entry.model_dump() for entry in action.plan]

        # Grade the episode using new 10-dim grader
        final_plan_dict = {
            "entries": self._plan_entries,
            "primary_threat_id": (self._plan_entries[0]["threat_id"]
                                  if self._plan_entries else ""),
        }
        grade_result = grade_episode(
            episode_actions=list(self._state.timeline),
            final_plan=final_plan_dict,
            graph=self._threat_graph,
            task_def=self._task_def,
            state=self._state,
            disruption_cost=self._disruption_cost,
        )
        final_score = grade_result["final_score"]

        # Reward proportional to final grade
        reward = final_score * 1.0  # Scale: perfect score = 1.0 reward
        description = (
            f"Containment plan submitted. "
            f"Grade: {final_score:.3f}. "
            f"Threats contained: {len(self._state.contained_threats)}/{len(self._task_def['attack_chain'])}. "
            f"Business impact: {self._state.business_impact:.2f}"
        )

        return reward, description

    # ===========================================================================
    # New Action Handlers (return observation-update dict)
    # ===========================================================================

    def _handle_correlate_alerts(self, action: CorrelateAlerts) -> dict:
        """Correlate alerts to find shared hosts/IOCs."""
        if len(action.alert_ids) < 2:
            return {"error": "correlate_alerts requires at least 2 alert IDs",
                    "description": "correlate_alerts error"}

        graph = self._threat_graph
        known_alerts = {aid: graph.alerts[aid] for aid in action.alert_ids if aid in graph.alerts}
        if len(known_alerts) < 2:
            return {"error": "fewer than 2 alert IDs found in graph",
                    "description": "correlate_alerts error"}

        # Find shared source hosts
        source_hosts: dict[str, list[str]] = {}
        for aid, alert in known_alerts.items():
            source_hosts.setdefault(alert.source_host, []).append(aid)
        shared_hosts = [h for h, aids in source_hosts.items() if len(aids) >= 2]

        # Find shared IOCs via "involves" edges
        shared_iocs: set[str] = set()
        for e in graph.edges:
            if e.edge_type == "involves" and e.source_id in known_alerts:
                if any(
                    e2.edge_type == "involves" and e2.target_id == e.target_id
                    and e2.source_id in known_alerts and e2.source_id != e.source_id
                    for e2 in graph.edges
                ):
                    shared_iocs.add(e.target_id)

        # Update correlated_with on each alert
        all_ids = list(known_alerts.keys())
        for aid, alert in known_alerts.items():
            for other_id in all_ids:
                if other_id != aid and other_id not in alert.correlated_with:
                    alert.correlated_with.append(other_id)

        self._state.correlated_alert_pairs.append(tuple(all_ids))

        shared_count = len(shared_hosts) + len(shared_iocs)
        correlation_score = min(1.0, shared_count / len(all_ids))

        result = {
            "correlation_results": {
                "shared_hosts": shared_hosts,
                "shared_iocs": list(shared_iocs),
                "correlation_score": correlation_score,
            },
            "description": f"Correlated {len(all_ids)} alerts: {len(shared_hosts)} shared hosts",
        }
        return result

    def _handle_enrich_ioc(self, action: EnrichIOC) -> dict:
        """Enrich an IOC with threat-intel data."""
        graph = self._threat_graph

        if action.ioc_value not in graph.iocs:
            return {"error": "IOC not yet discovered",
                    "description": "enrich_ioc error"}

        intel = self._task_def.get("threat_intel_data", {}) or {}
        data = intel.get(action.ioc_value, {
            "reputation": 0.5,
            "threat_actor": "unknown",
            "mitre_ttps": [],
        })

        # Update IOC node in graph
        ioc_node = graph.iocs[action.ioc_value]
        ioc_node.enriched = True
        ioc_node.threat_actor = data.get("threat_actor")
        ioc_node.mitre_ttps = data.get("mitre_ttps", [])

        if action.ioc_value not in self._state.enriched_iocs:
            self._state.enriched_iocs.append(action.ioc_value)

        # Mark IOC as discovered — future block_ioc on it will receive full reward
        self._discovered_iocs.add(action.ioc_value)

        return {
            "ioc_enrichment": data,
            "description": f"Enriched IOC {action.ioc_value}: actor={data.get('threat_actor')}",
        }

    def _handle_scan_vulnerabilities(self, action: ScanHostVulnerabilities) -> dict:
        """Scan a host for CVE vulnerabilities."""
        graph = self._threat_graph
        hostname = action.hostname

        if hostname not in graph.hosts:
            return {"error": f"Host '{hostname}' not in Threat Graph",
                    "description": "scan_host_vulnerabilities error"}

        vuln_chain = self._task_def.get("vulnerability_chain", []) or []
        vuln_results: list[dict] = []
        for entry in vuln_chain:
            if not isinstance(entry, dict):
                continue
            if entry.get("hostname") == hostname or entry.get("affected_hosts") and hostname in entry["affected_hosts"]:
                cve_id = entry.get("cve_id", "CVE-UNKNOWN")
                vuln_node = VulnerabilityNode(
                    cve_id=cve_id,
                    hostname=hostname,
                    cvss_score=entry.get("cvss_score", 5.0),
                    exploitability=entry.get("exploitability", "theoretical"),
                    patch_available=entry.get("patch_available", False),
                    exploited_by_threat=entry.get("threat_id"),
                )
                graph.add_vulnerability(vuln_node)
                graph.add_edge(Edge(
                    edge_type="exploits",
                    source_id=cve_id,
                    target_id=hostname,
                ))
                vuln_results.append(entry)

        # Mark host as scanned
        graph.hosts[hostname].scanned = True
        if hostname not in self._state.scanned_hosts:
            self._state.scanned_hosts.append(hostname)

        return {
            "vulnerability_results": vuln_results,
            "description": f"Scanned {hostname}: found {len(vuln_results)} CVEs",
        }

    # ===========================================================================
    # Red Team Action Handlers
    # ===========================================================================

    def _handle_lateral_pivot(self, action: LateralPivot) -> tuple[float, str]:
        """Red: spread from a compromised host to a new target."""
        src = action.source_host
        dst = action.target_host

        if src not in self._host_index:
            return 0.0, f"red: lateral_pivot — source '{src}' not in network"
        if self._host_index[src].get("status") != "compromised":
            return 0.0, f"red: lateral_pivot — '{src}' not under Red control"
        if dst not in self._host_index:
            return 0.0, f"red: lateral_pivot — target '{dst}' not in network"

        dst_status = self._host_index[dst].get("status", "online")
        if dst_status == "isolated":
            return 0.0, f"red: lateral_pivot — '{dst}' is isolated, pivot blocked by Blue"
        if dst_status == "compromised":
            return 0.0, f"red: lateral_pivot — '{dst}' already compromised"

        # Compromise target and copy a process from source
        self._host_index[dst]["status"] = "compromised"
        src_procs = (
            [p for p in self._threat_graph.processes.values() if p.hostname == src]
            if self._threat_graph else []
        )
        proc_name = src_procs[0].process_name if src_procs else "cmd.exe"
        self._host_index[dst].setdefault("running_processes", [])
        if proc_name not in self._host_index[dst]["running_processes"]:
            self._host_index[dst]["running_processes"].append(proc_name)

        # Update threat graph
        if self._threat_graph is not None:
            if dst not in self._threat_graph.hosts:
                hd = self._host_index[dst]
                self._threat_graph.add_host(HostNode(
                    hostname=dst,
                    subnet=hd.get("subnet", "corporate"),
                    business_criticality="medium",
                    status="compromised",
                ))
            else:
                self._threat_graph.hosts[dst].status = "compromised"

            pid = f"{dst}:{proc_name}"
            if pid not in self._threat_graph.processes:
                self._threat_graph.add_process(ProcessNode(
                    process_id=pid, hostname=dst, process_name=proc_name
                ))
            self._threat_graph.add_edge(Edge(
                edge_type="pivoted_from", source_id=dst, target_id=src
            ))

        # Generate SIEM alert for Blue
        alert_id = f"PIVOT-{uuid.uuid4().hex[:6].upper()}"
        subnet = self._host_index.get(dst, {}).get("subnet", "unknown")
        self._alert_queue.append({
            "alert_id": alert_id,
            "timestamp": "2024-01-01T00:00:00Z",
            "source_host": dst,
            "severity": "critical",
            "threat_type": "lateral_movement",
            "description": (
                f"Lateral movement detected: {proc_name} spawned on {dst} "
                f"(pivot from {src})"
            ),
            "ioc_indicators": [],
            "subnet": subnet,
            "is_acknowledged": False,
        })
        if self._threat_graph is not None:
            self._threat_graph.add_alert(AlertNode(
                alert_id=alert_id, severity="critical",
                priority_score=15.0, source_host=dst,
            ))

        # Update live rubric
        if self._live_requirements is not None:
            self._live_requirements.setdefault("must_kill", []).append({
                "hostname": dst, "process": proc_name, "threat_id": "FSP_PIVOT",
            })

        return 0.0, f"red: lateral_pivot {src} → {dst} (proc={proc_name})"

    def _handle_deploy_payload(self, action: DeployPayload) -> tuple[float, str]:
        """Red: deploy a malicious payload on a host Red controls."""
        hostname = action.hostname
        payload_type = action.payload_type

        if hostname not in self._host_index:
            return 0.0, f"red: deploy_payload — '{hostname}' not in network"
        if self._host_index[hostname].get("status") != "compromised":
            return 0.0, f"red: deploy_payload — no shell on '{hostname}'"

        proc_name = {
            "ransomware": "ransomware.exe",
            "exfiltration": "exfil_agent.exe",
            "c2": "c2_beacon.exe",
        }[payload_type]

        host = self._host_index[hostname]
        if proc_name not in host.get("running_processes", []):
            host.setdefault("running_processes", []).append(proc_name)

        if self._threat_graph is not None:
            pid = f"{hostname}:{proc_name}"
            if pid not in self._threat_graph.processes:
                self._threat_graph.add_process(ProcessNode(
                    process_id=pid, hostname=hostname, process_name=proc_name
                ))

        impact_delta = {"ransomware": 0.15, "exfiltration": 0.08, "c2": 0.05}[payload_type]
        self._state.business_impact = min(1.0, self._state.business_impact + impact_delta)

        severity = {"ransomware": "critical", "exfiltration": "high", "c2": "high"}[payload_type]
        alert_id = f"PAYLOAD-{uuid.uuid4().hex[:6].upper()}"
        self._alert_queue.append({
            "alert_id": alert_id,
            "timestamp": "2024-01-01T00:00:00Z",
            "source_host": hostname,
            "severity": severity,
            "threat_type": payload_type,
            "description": (
                f"{payload_type.capitalize()} payload deployed on {hostname}: {proc_name}"
            ),
            "ioc_indicators": [],
            "subnet": host.get("subnet", "unknown"),
            "is_acknowledged": False,
        })
        if self._threat_graph is not None:
            self._threat_graph.add_alert(AlertNode(
                alert_id=alert_id, severity=severity,
                priority_score=18.0, source_host=hostname,
            ))

        return 0.0, f"red: deployed {payload_type} payload on {hostname}"

    def _handle_evade_detection(self, action: EvadeDetection) -> tuple[float, str]:
        """Red: apply a detection-evasion technique on a controlled host."""
        hostname = action.hostname
        technique = action.technique

        if hostname not in self._host_index:
            return 0.0, f"red: evade_detection — '{hostname}' not in network"
        if self._host_index[hostname].get("status") != "compromised":
            return 0.0, f"red: evade_detection — no shell on '{hostname}'"

        if technique == "migrate_pid":
            host = self._host_index[hostname]
            malicious_procs = {
                proc
                for threat in self._task_def.get("attack_chain", [])
                if hostname in threat.get("compromised_hosts", [])
                for proc in threat.get("malicious_processes", [])
            }
            for i, proc in enumerate(list(host.get("running_processes", []))):
                if proc in malicious_procs:
                    new_name = f"svchost_{i}.exe"
                    host["running_processes"][i] = new_name
                    if self._threat_graph:
                        old_pid = f"{hostname}:{proc}"
                        if old_pid in self._threat_graph.processes:
                            self._threat_graph.processes.pop(old_pid)
                            new_pid = f"{hostname}:{new_name}"
                            self._threat_graph.add_process(ProcessNode(
                                process_id=new_pid, hostname=hostname,
                                process_name=new_name,
                            ))
            return 0.0, f"red: migrated PIDs on {hostname} to blend with system processes"

        if technique == "clear_logs":
            before = len(self._alert_queue)
            self._alert_queue = [
                a for a in self._alert_queue
                if a.get("source_host") != hostname
            ]
            removed = before - len(self._alert_queue)
            return 0.0, f"red: cleared {removed} SIEM alert(s) from {hostname}"

        return 0.0, f"red: evasion '{technique}' applied on {hostname}"

    def _handle_pass_turn(self, action: PassTurn) -> tuple[float, str]:  # noqa: ARG002
        """Red: remain stealthy, take no action."""
        return 0.0, "red: pass_turn (stealth)"

    def _get_red_action_target(self, action: Any) -> str:
        """Extract a compact target string from a Red action for timeline logging."""
        if isinstance(action, LateralPivot):
            return f"{action.source_host}→{action.target_host}"
        if isinstance(action, DeployPayload):
            return f"{action.hostname}/{action.payload_type}"
        if isinstance(action, EvadeDetection):
            return f"{action.hostname}/{action.technique}"
        return "—"

    # ===========================================================================
    # Helpers
    # ===========================================================================

    def _compute_reward_dimensions(self) -> Dict[str, float]:
        """Per-step heuristic partial scores for all 10 grading dimensions.

        Evidence-gated: actions only score if prior evidence justified them.
        Result-usage: forensics-confirmed hosts with no followup are penalized.
        Scores in [0, 1]; terminal grade_breakdown supersedes these on plan submission.
        """
        state = self._state
        task_chain = self._task_def.get("attack_chain", [])
        total_threats = max(1, len(task_chain))

        total_compromised = max(1, sum(len(t.get("compromised_hosts", [])) for t in task_chain))
        total_iocs = max(1, sum(
            len(t.get("iocs", {}).get("hashes", []))
            + len(t.get("iocs", {}).get("ips", []))
            + len(t.get("iocs", {}).get("domains", []))
            for t in task_chain
        ))

        # --- Build evidence pools: what the agent could have observed ---
        # Hosts mentioned as alert source (visible from turn 0)
        alert_source_hosts: set = set()
        for a in self._task_def.get("initial_alerts", []):
            alert_source_hosts.add(a.get("source_host", ""))
        for a in self._alert_queue:
            alert_source_hosts.add(a.get("source_host", ""))
        alert_source_hosts.discard("")

        # IOCs visible from alert ioc_indicators
        alert_iocs: set = set()
        for a_list in (self._task_def.get("initial_alerts", []), self._alert_queue):
            for a in a_list:
                for ioc in a.get("ioc_indicators", []):
                    alert_iocs.add(ioc)

        # IOCs revealed by running forensics on a host
        forensics_revealed_iocs: set = set()
        for hostname in state.forensics_run:
            for threat in task_chain:
                if hostname in threat.get("compromised_hosts", []):
                    forensics_revealed_iocs.update(threat.get("c2_servers", []))
                    forensics_revealed_iocs.update(threat["iocs"].get("hashes", []))
                    forensics_revealed_iocs.update(threat["iocs"].get("ips", []))
                    forensics_revealed_iocs.update(threat["iocs"].get("domains", []))

        discovered_iocs = alert_iocs | forensics_revealed_iocs

        # 1. threat_containment — fraction of threats neutralised (no evidence gate; outcome IS evidence)
        threat_containment = min(1.0, len(state.contained_threats) / total_threats)

        # 2. ioc_blocking — only blocks of IOCs the agent actually discovered count
        justified_blocks = [ioc for ioc in state.blocked_iocs if ioc in discovered_iocs]
        ioc_blocking = min(1.0, len(justified_blocks) / total_iocs)

        # 3. forensic_investigation — only counts forensics on alert-mentioned or previously queried
        #    hosts; penalizes confirmed compromises left with no response action
        justified_forensics = [
            h for h in state.forensics_run
            if h in alert_source_hosts or h in state.queried_hosts
        ]
        pending = self._pending_followup
        unresponded = sum(1 for v in pending.values() if not v)
        followup_penalty = min(0.30, unresponded * 0.10)
        forensic_investigation = max(0.0,
            min(1.0, len(justified_forensics) / total_compromised) - followup_penalty
        )

        # 4. siem_correlation — scored by semantic quality (shared source hosts or IOCs)
        if not state.correlated_alert_pairs:
            siem_correlation = 0.0
        else:
            alert_map: Dict[str, Any] = {}
            for a in self._task_def.get("initial_alerts", []):
                alert_map[a.get("alert_id", "")] = a
            for a in self._alert_queue:
                alert_map[a.get("alert_id", "")] = a
            quality_scores = []
            for pair in state.correlated_alert_pairs:
                pair_alerts = [alert_map[aid] for aid in pair if aid in alert_map]
                if len(pair_alerts) < 2:
                    quality_scores.append(0.3)
                    continue
                sources = [a.get("source_host") for a in pair_alerts]
                ioc_sets = [set(a.get("ioc_indicators", [])) for a in pair_alerts]
                shared_hosts = len(sources) != len({s for s in sources if s})
                shared_iocs = bool(ioc_sets[0] & ioc_sets[1]) if len(ioc_sets) >= 2 else False
                quality_scores.append(1.0 if (shared_hosts or shared_iocs) else 0.2)
            siem_correlation = sum(quality_scores) / max(1, len(quality_scores))

        # 5. threat_intel_usage — only enrichments of discovered IOCs count
        justified_enrichments = [ioc for ioc in state.enriched_iocs if ioc in discovered_iocs]
        threat_intel_usage = min(1.0, len(justified_enrichments) / total_iocs)

        # 6. vuln_root_cause — fraction of threats with a scanned host
        vuln_root_cause = min(1.0, len(state.scanned_hosts) / total_threats)

        # 7. business_impact — proportionate isolation + low overall impact
        #    Reward: isolating confirmed-compromised hosts  Penalize: isolating clean hosts
        isolated_host_set = {
            h for h, hd in self._host_index.items() if hd.get("status") == "isolated"
        } if self._host_index else set()
        compromised_host_set = {
            h for threat in task_chain for h in threat.get("compromised_hosts", [])
        }
        if isolated_host_set:
            over_isolated = isolated_host_set - compromised_host_set
            isolation_proportion = (
                len(isolated_host_set - over_isolated) / len(isolated_host_set)
            )
            over_iso_penalty = min(0.40, len(over_isolated) * 0.15)
        else:
            isolation_proportion = 1.0
            over_iso_penalty = 0.0
        raw_impact_score = max(0.0, 1.0 - state.business_impact)
        business_impact = max(0.0, min(1.0,
            0.6 * raw_impact_score + 0.4 * isolation_proportion - over_iso_penalty
        ))

        # 8. step_efficiency — reward early resolution
        ratio = state.step_count / max(1, state.max_steps)
        step_efficiency = max(0.0, 1.0 - max(0.0, ratio - 0.5) * 1.5)

        # 9. plan_coverage — partial credit scales with threats addressed
        if state.submitted_plan:
            plan_coverage = min(1.0, len(self._plan_entries) / total_threats)
        else:
            plan_coverage = min(0.5, len(state.contained_threats) / total_threats * 0.5)

        # 10. plan_evidence_quality — confidence of submitted plan; else evidence depth proxy
        if state.submitted_plan and self._plan_entries:
            avg_conf = sum(e.get("confidence", 0.0) for e in self._plan_entries) / len(self._plan_entries)
            plan_evidence_quality = float(avg_conf)
        else:
            evidence_count = len(justified_forensics) + len(justified_enrichments) + len(state.scanned_hosts)
            plan_evidence_quality = min(0.5, evidence_count / (total_compromised * 3) * 0.5)

        return {
            "threat_containment":     round(threat_containment,     4),
            "ioc_blocking":           round(ioc_blocking,           4),
            "forensic_investigation": round(forensic_investigation, 4),
            "siem_correlation":       round(siem_correlation,       4),
            "threat_intel_usage":     round(threat_intel_usage,     4),
            "vuln_root_cause":        round(vuln_root_cause,        4),
            "business_impact":        round(business_impact,        4),
            "step_efficiency":        round(step_efficiency,        4),
            "plan_coverage":          round(plan_coverage,          4),
            "plan_evidence_quality":  round(plan_evidence_quality,  4),
        }

    def _get_current_phase(self) -> str:
        """Derive episode phase from the action history in the timeline."""
        action_types = {t["action_type"] for t in self._state.timeline}
        if any(t in action_types for t in ["kill_process", "block_ioc", "isolate_segment", "terminate_pid", "create_firewall_rule", "quarantine_file"]):
            return "remediation"
        if any(t in action_types for t in ["run_forensics", "enrich_ioc", "scan_host_vulnerabilities", "query_host"]):
            return "investigation"
        return "triage"

    def _build_observation(self, reward: float, done: bool) -> SOCObservation:
        """Build the observation from current state."""
        # Compute network topology summary
        subnet_counts = {name: len(hosts) for name, hosts in self._network.items()}
        compromised = sum(
            1 for hosts in self._network.values()
            for h in hosts if h["status"] == "compromised"
        )
        isolated = sum(
            1 for hosts in self._network.values()
            for h in hosts if h["status"] == "isolated"
        )
        total = sum(len(hosts) for hosts in self._network.values())

        topology = NetworkTopology(
            total_hosts=total,
            subnets=subnet_counts,
            compromised_count=compromised,
            isolated_count=isolated,
            online_count=total - compromised - isolated,
        )

        # Build alert list
        alerts = [Alert(**a) for a in self._alert_queue]

        # Build timeline
        timeline = [
            TimelineEntry(
                step=t["step"],
                action_type=t["action_type"],
                target=t["target"],
                result=t["result"],
                reward=t["reward"],
            )
            for t in self._state.timeline
        ]

        # Compute final grade if done
        final_score_val = None
        grade_breakdown_val = None

        if done and self._state.submitted_plan:
            final_plan_dict = {
                "entries": self._plan_entries,
                "primary_threat_id": (self._plan_entries[0]["threat_id"]
                                      if self._plan_entries else ""),
            }
            computed = grade_episode(
                episode_actions=list(self._state.timeline),
                final_plan=final_plan_dict,
                graph=self._threat_graph,
                task_def=self._task_def,
                state=self._state,
                disruption_cost=self._disruption_cost,
            )
            final_score_val = round(computed["final_score"], 4)
            grade_breakdown_val = computed["breakdown"]

        # Merge per-step observation extras (process_tree, correlation_results, etc.)
        extras = getattr(self, "_last_obs_extras", {}) or {}
        threat_graph_summary = None
        if self._threat_graph is not None:
            threat_graph_summary = self._threat_graph.get_context_summary()

        # Per-step partial reward dimensions for GRPO credit assignment
        reward_dimensions = self._compute_reward_dimensions()

        # Red observation — only populated when it is Red's turn next
        red_obs = (
            self._generate_red_observation()
            if self._state.active_turn == "red"
            else None
        )

        return SOCObservation(
            episode_id=self._state.episode_id or "",
            alert_queue=alerts,
            network_topology=topology,
            host_forensics=self._last_forensics,
            timeline=timeline,
            business_impact_score=round(self._state.business_impact, 4),
            step_count=self._state.step_count,
            active_threats=list(self._state.active_threats),
            max_steps=self._state.max_steps,
            task_id=self._state.task_id,
            total_reward=round(self._state.total_reward, 4),
            final_score=final_score_val,
            grade_breakdown=grade_breakdown_val,
            done=done,
            reward=round(reward, 4),
            correlation_results=extras.get("correlation_results"),
            ioc_enrichment=extras.get("ioc_enrichment"),
            vulnerability_results=extras.get("vulnerability_results"),
            playbook_result=None,
            threat_graph_summary=threat_graph_summary,
            available_playbooks=[],
            reward_dimensions=reward_dimensions,
            active_turn=self._state.active_turn,
            red_observation=red_obs,
        )

    def _get_action_target(self, action: Any) -> str:
        """Extract the target string from a typed action for timeline logging."""
        if isinstance(action, QueryHost):
            return action.hostname
        elif isinstance(action, IsolateSegment):
            return getattr(action, "target_host", None) or action.subnet
        elif isinstance(action, BlockIOC):
            return f"{action.ioc_type}:{action.ioc_value}"
        elif isinstance(action, RunForensics):
            return action.hostname
        elif isinstance(action, KillProcess):
            return f"{action.hostname}/{action.process_name}"
        elif isinstance(action, SubmitContainmentPlan):
            return f"{len(action.plan)} entries"
        elif isinstance(action, CorrelateAlerts):
            return ",".join(action.alert_ids)
        elif isinstance(action, EnrichIOC):
            return action.ioc_value
        elif isinstance(action, ScanHostVulnerabilities):
            return action.hostname
        elif isinstance(action, TerminatePID):
            return f"{action.hostname}/{action.pid}"
        elif isinstance(action, CreateFirewallRule):
            return f"{action.hostname}:{action.action}:{action.target_ip}"
        elif isinstance(action, QuarantineFile):
            return f"{action.hostname}:{action.file_path}"
        return "unknown"

    # ===========================================================================
    # Adaptive Red Team + Step Rewards (Task 10)
    # ===========================================================================

    def _generate_red_observation(self) -> Dict[str, Any]:
        """What the Red Team LLM sees: footholds it controls + Blue's last action.

        Returned as the ``red_observation`` field in SOCObservation whenever
        ``active_turn == 'red'``, so inference.py can feed it straight to the
        Red LLM without a separate API call.
        """
        compromised_hosts = [
            h for h, hd in self._host_index.items()
            if hd.get("status") == "compromised"
        ]

        # Most recent Blue action from the timeline (exclude Red's own entries)
        blue_actions_detected: List[Dict[str, Any]] = []
        for entry in reversed(self._state.timeline):
            action_type = entry.get("action_type", "")
            if not action_type.startswith("red:"):
                blue_actions_detected.append({
                    "step": entry["step"],
                    "action": action_type,
                    "target": entry["target"],
                    "result": entry["result"],
                })
                break  # Only the single most recent Blue action

        return {
            "episode_id": self._state.episode_id,
            "round": self._state.step_count + 1,
            "compromised_hosts": compromised_hosts,
            "blue_actions_detected": blue_actions_detected,
            "active_threats": list(self._state.active_threats),
            "business_impact": round(self._state.business_impact, 4),
        }

    def _log_red_decision(self, observation: Dict[str, Any], action: Dict[str, Any]) -> None:
        """Record (observation -> action) tuples for red-team imitation warm-start."""
        record = {"observation": observation, "action": action}
        self._red_team_decisions.append(record)
        if self._red_team_logger is not None:
            try:
                self._red_team_logger(record)
            except Exception:
                # Logging is best effort and should never affect environment execution.
                pass

    def _apply_red_team_dynamics(self, action_type: str, target: str) -> None:
        """Execute embedded Red dynamics in non-FSP mode.

        When neural_red_policy is callable: invoke it with the current red
        observation, route the returned action through the Red handlers, and
        log the (obs → action) pair for offline SFT.

        When neural_red_policy is None (adaptive=True path): apply the
        deterministic fallback policy and log the pair.
        """
        red_obs = self._generate_red_observation()

        if callable(self._neural_red_policy):
            try:
                action_dict = self._neural_red_policy(red_obs)
                if not isinstance(action_dict, dict):
                    action_dict = {"type": "pass_turn"}
            except Exception:
                action_dict = {"type": "pass_turn"}

            atype = action_dict.get("type", "pass_turn")
            if atype == "lateral_pivot":
                src = action_dict.get("source_host", "")
                dst = action_dict.get("target_host", "")
                if src and dst:
                    self._handle_lateral_pivot(
                        LateralPivot(type="lateral_pivot", source_host=src, target_host=dst)
                    )
            elif atype == "deploy_payload":
                h = action_dict.get("hostname", "")
                pl = action_dict.get("payload_type", "ransomware")
                if h:
                    self._handle_deploy_payload(
                        DeployPayload(type="deploy_payload", hostname=h, payload_type=pl)
                    )
            elif atype == "evade_detection":
                h = action_dict.get("hostname", "")
                tech = action_dict.get("technique", "migrate_pid")
                if h:
                    self._handle_evade_detection(
                        EvadeDetection(type="evade_detection", hostname=h, technique=tech)
                    )
            # pass_turn → no graph mutation needed

            self._log_red_decision(red_obs, action_dict)
        else:
            # Deterministic fallback for imitation warm-start (adaptive=True path)
            det_action = self._deterministic_red_policy(action_type, target, red_obs)
            atype = det_action.get("type", "pass_turn")
            if atype == "lateral_pivot":
                self._handle_lateral_pivot(
                    LateralPivot(
                        type="lateral_pivot",
                        source_host=det_action["source_host"],
                        target_host=det_action["target_host"],
                    )
                )
            elif atype == "deploy_payload":
                dp_host = det_action.get("hostname", "")
                dp_payload = det_action.get("payload_type", "ransomware")
                if dp_host:
                    self._handle_deploy_payload(
                        DeployPayload(
                            type="deploy_payload",
                            hostname=dp_host,
                            payload_type=dp_payload,
                        )
                    )
            self._log_red_decision(red_obs, det_action)

    def _deterministic_red_policy(
        self, blue_action: str, blue_target: str, red_obs: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Rule-based Red policy for SFT imitation warm-start data collection.

        Priority order:
          1. Stall punishment — >= 3 consecutive passive Blue actions deploy ransomware.
          2. Reactive pivot   — Blue containment action triggers lateral movement.
          3. Autonomous pivot — 15% chance to spread even on passive Blue actions.
        """
        _passive = frozenset({"query_host", "pass_turn"})
        _containment = frozenset({"kill_process", "isolate_segment", "block_ioc"})

        compromised = red_obs.get("compromised_hosts", [])

        # 1. Stall punishment: >= 3 consecutive passive steps without containment
        if blue_action in _passive and compromised:
            streak = 0
            for entry in reversed(getattr(self, "_recent_actions", [])):
                if isinstance(entry, tuple) and entry[0] in _passive:
                    streak += 1
                else:
                    break
            if streak >= 3:
                return {
                    "type": "deploy_payload",
                    "hostname": compromised[0],
                    "payload_type": "ransomware",
                }

        # 2. Reactive pivot on Blue containment actions
        if blue_action in _containment:
            src = compromised[0] if compromised else (blue_target or None)
            if src is not None and src in self._host_index:
                dst = next(
                    (h for h, hd in self._host_index.items()
                     if hd.get("status") not in ("compromised", "isolated") and h != src),
                    None,
                )
                if dst:
                    return {"type": "lateral_pivot", "source_host": src, "target_host": dst}

        # 3. Autonomous pivot: 15% chance even when Blue is passive
        if blue_action in _passive and compromised and self._rng.random() < 0.15:
            src = compromised[0]
            dst = next(
                (h for h, hd in self._host_index.items()
                 if hd.get("status") not in ("compromised", "isolated") and h != src),
                None,
            )
            if dst:
                return {"type": "lateral_pivot", "source_host": src, "target_host": dst}

        return {"type": "pass_turn"}

    def export_red_team_decisions(self) -> List[Dict[str, Any]]:
        """Return a copy of recorded red-team decisions for offline SFT."""
        return list(self._red_team_decisions)

    STEP_REWARDS: Dict[Any, float] = {
        ("investigation", "run_forensics"):              +0.10,
        ("investigation", "enrich_ioc"):                 +0.05,
        ("investigation", "scan_host_vulnerabilities"):  +0.05,
        ("triage",        "correlate_alerts"):            +0.05,
        "phase_violation_attempt":                       -0.20,
        "ungrounded_action_attempt":                     -0.10,
    }

    def _get_step_reward(self, phase: str, action_type: str, target: str) -> float:
        """Idempotent step reward — fires only once per (phase, action_type, target) triple.

        Hard cap: total step rewards per episode never exceed 0.40.
        """
        if not hasattr(self, "_fired_step_rewards"):
            self._fired_step_rewards = set()
        # Hard cap: once we've reached 0.40 in step rewards, return 0 for all subsequent
        if getattr(self, "_step_reward_total", 0.0) >= 0.40:
            return 0.0
        key = (phase, action_type, target)
        if key in self._fired_step_rewards:
            return 0.0
        reward = self.STEP_REWARDS.get((phase, action_type), 0.0)
        if reward != 0.0:
            self._fired_step_rewards.add(key)
        return reward

    def _maybe_reinfect(self, hostname: str, process_name: str) -> None:
        """30 % chance to reinfect with a _v2 variant when unblocked IOCs exist in the threat chain."""
        if not self._adaptive:
            return
        graph = self._threat_graph
        if graph is None:
            return

        # Check whether any IOC in the host's threat chain is still unblocked
        unblocked_chain_iocs = False
        for ioc_node in graph.iocs.values():
            if not ioc_node.blocked:
                # Is this IOC linked (via any edge) to the same host's chain?
                for e in graph.edges:
                    if e.target_id == hostname or e.source_id == hostname:
                        unblocked_chain_iocs = True
                        break
            if unblocked_chain_iocs:
                break

        if not unblocked_chain_iocs:
            return

        if self._rng.random() >= 0.3:
            return

        # Reinfect: spawn a _v2 variant process on the host
        variant_name = f"{process_name}_v2"
        if hostname in self._host_index:
            host = self._host_index[hostname]
            if variant_name not in host["running_processes"]:
                host["running_processes"].append(variant_name)
                host["status"] = "compromised"

        # Add the variant to the threat graph
        pid = f"{hostname}:{variant_name}"
        if pid not in graph.processes:
            graph.add_process(ProcessNode(
                process_id=pid,
                hostname=hostname,
                process_name=variant_name,
                killed=False,
            ))

        # Emit a CRITICAL alert to signal the reinfection
        alert_id = f"REINFECT-{uuid.uuid4().hex[:6].upper()}"
        graph.add_alert(AlertNode(
            alert_id=alert_id,
            severity="critical",
            priority_score=18.0,
            source_host=hostname,
        ))
        self._alert_queue.append({
            "alert_id": alert_id,
            "timestamp": "2024-01-01T00:00:00Z",
            "source_host": hostname,
            "severity": "critical",
            "threat_type": "malware",
            "description": f"Reinfection detected: {variant_name} spawned on {hostname} (IOC-assisted persistence)",
            "ioc_indicators": [],
            "subnet": self._host_index.get(hostname, {}).get("subnet", "unknown"),
            "is_acknowledged": False,
        })

    def _adversary_react(self, action_type: str, target: str) -> Optional[Dict[str, Any]]:
        """Legacy hook — disabled; Red Team now acts via explicit RedActionWrapper steps."""
        return None

    @property
    def state(self) -> SOCState:
        """Get the current internal environment state."""
        return self._state