Spaces:

Ajay00747
/

Demo

Sleeping

Ajayyy00 Claude Sonnet 4.6 committed on Apr 25

Commit

5719ec3

1 Parent(s): eeadada

Add FSP multi-agent architecture: Red Team LLM action space + alternating turns

Step 1 — models.py
• New Red action classes: LateralPivot, DeployPayload, EvadeDetection, PassTurn
• RedActionWrapper (mirrors SOCActionWrapper for WS/HTTP routing)
• RED_ACTION_TYPES frozenset for payload routing
• SOCObservation: active_turn + red_observation fields
• SOCState: active_turn field

Step 2 — play_environment.py
• fsp_mode=False constructor flag; True enables strict Blue/Red alternation
• step() dispatches to _step_blue() or _step_red() based on action type
• _step_blue(): executes Blue action; in fsp_mode auto-flips to red without
incrementing step_count; in legacy mode auto-PassTurns + increments (backward compat)
• _step_red(): executes Red action, increments step_count, flips to blue
• Red handlers: _handle_lateral_pivot, _handle_deploy_payload,
_handle_evade_detection, _handle_pass_turn
• _generate_red_observation(): compromised_hosts + blue_actions_detected
• Removed deterministic _execute_lateral_pivot and _maybe_reinfect;
_adversary_react is now a no-op (Red LLM drives all attack decisions)
• _build_observation: exposes active_turn and red_observation

Step 3 — dashboard_server.py
• Step handler routes RED_ACTION_TYPES to RedActionWrapper

Step 4 — inference.py
• RED_SYSTEM_PROMPT, format_red_observation, get_red_model_action
• run_episode(fsp=True): Blue LLM → env.step(SOCActionWrapper) →
if active_turn==red → Red LLM → env.step(RedActionWrapper) → repeat
• FSP_MODE env var to toggle; RED_MODEL_NAME for separate Red model

Step 5 — test_integration.py
• test_lateral_pivot_red_action replaces removed _execute_lateral_pivot test

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (5) hide show

dashboard_server.py +9 -2
inference.py +215 -62
models.py +89 -0
server/play_environment.py +377 -228
tests/test_integration.py +25 -8

dashboard_server.py CHANGED Viewed

@@ -176,9 +176,16 @@ async def ws_session(websocket: WebSocket, session_id: str):
                     continue
                 try:
-                    from models import SOCActionWrapper  # noqa: PLC0415
                     action_fields = {k: v for k, v in msg.items() if k != "type"}
-                    action = SOCActionWrapper.model_validate(action_fields)
                     obs = await _run(env.step, action)
                     await websocket.send_json({
                         "type": "step_ok",

                     continue
                 try:
+                    from models import SOCActionWrapper, RedActionWrapper, RED_ACTION_TYPES  # noqa: PLC0415
                     action_fields = {k: v for k, v in msg.items() if k != "type"}
+                    action_type_str = action_fields.get("type", "")
+                    # Route to Red or Blue wrapper based on action type
+                    if action_type_str in RED_ACTION_TYPES:
+                        action = RedActionWrapper.model_validate(action_fields)
+                    else:
+                        action = SOCActionWrapper.model_validate(action_fields)
                     obs = await _run(env.step, action)
                     await websocket.send_json({
                         "type": "step_ok",

inference.py CHANGED Viewed

@@ -16,9 +16,11 @@ HACKATHON RULES:
   - Must work on vcpu=2, memory=8gb
 Environment Variables:
-    API_BASE_URL  - The API endpoint for the LLM
-    MODEL_NAME    - The model identifier to use for inference
-    HF_TOKEN      - Your Hugging Face / API key
 """
 import asyncio
@@ -29,29 +31,31 @@ from typing import Any, Dict, List, Optional
 from openai import OpenAI
-from models import SOCActionWrapper, SOCObservation
 from server.play_environment import CyberSOCEnvironment
 # =============================================================================
 # Configuration (from environment variables)
 # =============================================================================
-API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
-MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
-HF_TOKEN = os.getenv("HF_TOKEN")
 BENCHMARK = "cybersocenv"
-TASKS = ["easy", "medium", "hard"]
 MAX_STEPS = {"easy": 15, "medium": 25, "hard": 30}
 TEMPERATURE = 0.1
-MAX_TOKENS = 1024
-# Scoring: normalize rewards to [0, 1]
-MAX_POSSIBLE_REWARD = 2.0  # Approximate max reward per episode
 SUCCESS_SCORE_THRESHOLD = 0.3
 # =============================================================================
-# System Prompt
 # =============================================================================
 SYSTEM_PROMPT = textwrap.dedent("""
@@ -79,9 +83,43 @@ SYSTEM_PROMPT = textwrap.dedent("""
     - You have a limited number of steps. Be efficient.
 """).strip()
 # =============================================================================
-# Logging Helpers (EXACT hackathon format — lowercase booleans, null errors)
 # =============================================================================
 def log_start(task: str, env: str, model: str) -> None:
@@ -90,7 +128,7 @@ def log_start(task: str, env: str, model: str) -> None:
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
     error_val = error if error else "null"
-    done_val = str(done).lower()
     print(
         f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
         flush=True,
@@ -104,16 +142,14 @@ def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> No
         flush=True,
     )
 # =============================================================================
-# Observation Formatting for LLM
 # =============================================================================
 def format_observation(obs: SOCObservation) -> str:
-    """Format observation into readable text for the LLM."""
     parts = []
-    # Alert queue
     if obs.alert_queue:
         parts.append(f"## Active Alerts ({len(obs.alert_queue)}):")
         for a in obs.alert_queue:
@@ -124,14 +160,14 @@ def format_observation(obs: SOCObservation) -> str:
             if a.ioc_indicators:
                 parts.append(f"    IOCs: {', '.join(a.ioc_indicators)}")
-    # Network topology
     topo = obs.network_topology
     parts.append(f"\n## Network Status:")
-    parts.append(f"  Compromised: {topo.compromised_count} | "
-                 f"Isolated: {topo.isolated_count} | "
-                 f"Online: {topo.online_count}")
-    # Forensics
     if obs.host_forensics:
         f = obs.host_forensics
         parts.append(f"\n## Forensics Result ({f.hostname}):")
@@ -141,26 +177,53 @@ def format_observation(obs: SOCObservation) -> str:
         parts.append(f"  Network connections: {f.network_connections}")
         parts.append(f"  Memory artifacts: {f.memory_artifacts}")
-    # Active threats
     parts.append(f"\n## Active Threats: {obs.active_threats if obs.active_threats else 'None (all contained!)'}")
     parts.append(f"## Business Impact: {obs.business_impact_score:.2f}")
     parts.append(f"## Step: {obs.step_count} / {obs.max_steps}")
-    # Timeline (last 5)
     if obs.timeline:
         parts.append(f"\n## Recent Actions:")
         for t in obs.timeline[-5:]:
-            parts.append(f"  Step {t.step}: {t.action_type} -> {t.target} (reward={t.reward:.2f})")
     return "\n".join(parts)
 def parse_llm_action(content: str) -> Dict[str, Any]:
     """Parse the LLM's response into a valid action dict."""
     content = content.strip()
     if content.startswith("```"):
-        lines = content.split("\n")
-        lines = [l for l in lines if not l.strip().startswith("```")]
         content = "\n".join(lines).strip()
     try:
@@ -170,7 +233,6 @@ def parse_llm_action(content: str) -> Dict[str, Any]:
     except json.JSONDecodeError:
         pass
-    # Try to find JSON in the response
     for start in range(len(content)):
         if content[start] == "{":
             for end in range(len(content), start, -1):
@@ -185,6 +247,10 @@ def parse_llm_action(content: str) -> Dict[str, Any]:
     raise ValueError(f"Could not parse action from LLM response: {content[:200]}")
 def get_model_action(
     client: OpenAI,
     step: int,
@@ -192,7 +258,7 @@ def get_model_action(
     task_id: str,
     history: List[str],
 ) -> str:
-    """Get the next action from the LLM."""
     obs_text = format_observation(obs)
     if step == 1:
@@ -213,7 +279,7 @@ def get_model_action(
             model=MODEL_NAME,
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
-                {"role": "user", "content": user_content},
             ],
             temperature=TEMPERATURE,
             max_tokens=MAX_TOKENS,
@@ -223,73 +289,155 @@ def get_model_action(
         return text if text else '{"type": "query_host", "hostname": "WS-001"}'
     except Exception as exc:
         if "429" in str(exc) or "RateLimit" in str(exc):
-            raise  # Let the batch runner handle rate limits
-        print(f"[DEBUG] Model request failed: {exc}", flush=True)
         return '{"type": "query_host", "hostname": "WS-001"}'
 # =============================================================================
 # Episode Runner
 # =============================================================================
-async def run_episode(client: OpenAI, task_id: str) -> tuple:
-    """Run a single episode. Returns (success, steps, score, rewards)."""
-    env = CyberSOCEnvironment()
     history: List[str] = []
     rewards: List[float] = []
     steps_taken = 0
-    score = 0.0
     success = False
     log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
     try:
-        # Reset environment
         obs = env.reset(task_id=task_id)
         max_steps = MAX_STEPS.get(task_id, 30)
         for step in range(1, max_steps + 1):
             if obs.done:
                 break
-            # Get action from LLM
-            llm_response = get_model_action(client, step, obs, task_id, history)
-            # Parse and execute
-            error = None
             action_str = "unknown"
-            reward = 0.0
             try:
-                action_dict = parse_llm_action(llm_response)
-                action_str = action_dict.get("type", "unknown")
-                action = SOCActionWrapper(**action_dict)
-                obs = env.step(action)
-                reward = obs.reward or 0.0
-                done = obs.done
             except Exception as exc:
                 error = str(exc)[:200]
-                done = False
-                reward = 0.0
             rewards.append(reward)
             steps_taken = step
             log_step(step=step, action=action_str, reward=reward, done=done, error=error)
-            history.append(f"Step {step}: {action_str} -> reward {reward:+.2f}")
             if done:
                 break
-        # Calculate score from final_score if available, else normalize rewards
         if obs.final_score is not None:
             score = obs.final_score
         else:
             score = sum(rewards) / MAX_POSSIBLE_REWARD if MAX_POSSIBLE_REWARD > 0 else 0.0
-        score = min(max(score, 0.0), 1.0)  # clamp to [0, 1]
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:
@@ -304,14 +452,19 @@ async def run_episode(client: OpenAI, task_id: str) -> tuple:
 async def main() -> None:
     """Run baseline inference across all tasks."""
-    client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
-    total_scores = {}
     for task_id in TASKS:
-        success, steps, score, rewards = await run_episode(client, task_id)
         total_scores[task_id] = score
-    # Print summary
     avg = sum(total_scores.values()) / len(total_scores) if total_scores else 0.0
     print(f"\n# Summary: avg_score={avg:.3f}", flush=True)
     for tid, s in total_scores.items():

   - Must work on vcpu=2, memory=8gb
 Environment Variables:
+    API_BASE_URL      - The API endpoint for the LLM
+    MODEL_NAME        - Blue Team model identifier
+    RED_MODEL_NAME    - Red Team model identifier (defaults to MODEL_NAME)
+    HF_TOKEN          - Your Hugging Face / API key
+    FSP_MODE          - Set to "true" to enable Fictitious Self-Play (Blue+Red alternate)
 """
 import asyncio
 from openai import OpenAI
+from models import SOCActionWrapper, RedActionWrapper, SOCObservation
 from server.play_environment import CyberSOCEnvironment
 # =============================================================================
 # Configuration (from environment variables)
 # =============================================================================
+API_BASE_URL   = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1")
+MODEL_NAME     = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
+RED_MODEL_NAME = os.getenv("RED_MODEL_NAME", MODEL_NAME)   # defaults to same model
+HF_TOKEN       = os.getenv("HF_TOKEN")
+FSP_MODE       = os.getenv("FSP_MODE", "false").lower() == "true"
 BENCHMARK = "cybersocenv"
+TASKS     = ["easy", "medium", "hard"]
 MAX_STEPS = {"easy": 15, "medium": 25, "hard": 30}
 TEMPERATURE = 0.1
+MAX_TOKENS  = 1024
+MAX_POSSIBLE_REWARD    = 2.0
 SUCCESS_SCORE_THRESHOLD = 0.3
 # =============================================================================
+# Blue Team System Prompt
 # =============================================================================
 SYSTEM_PROMPT = textwrap.dedent("""
     - You have a limited number of steps. Be efficient.
 """).strip()
+# =============================================================================
+# Red Team System Prompt (FSP mode)
+# =============================================================================
+RED_SYSTEM_PROMPT = textwrap.dedent("""
+    You are an adversarial Red Team AI in a cybersecurity simulation.
+    You have already gained an initial foothold on the network and must expand your attack
+    while evading the Blue Team SOC analyst who is trying to contain you.
+    Your goal: spread to new hosts, deploy payloads to maximize business impact, and evade
+    detection — all before the Blue Team shuts you down.
+    Available Actions (respond with exactly ONE JSON object per turn):
+    1. Lateral pivot to a new host:
+       {"type": "lateral_pivot", "source_host": "<COMPROMISED_HOST>", "target_host": "<TARGET>"}
+    2. Deploy a payload on a host you control:
+       {"type": "deploy_payload", "hostname": "<HOST>", "payload_type": "ransomware|exfiltration|c2"}
+    3. Evade detection on a host you control:
+       {"type": "evade_detection", "hostname": "<HOST>", "technique": "migrate_pid|clear_logs"}
+    4. Stay stealthy (do nothing this turn):
+       {"type": "pass_turn"}
+    Rules:
+    - Respond with ONLY a valid JSON object. No markdown, no explanation.
+    - You can only pivot FROM a host listed in compromised_hosts.
+    - You cannot pivot TO an isolated host — Blue has cut that path.
+    - Use evade_detection when Blue runs forensics on your hosts.
+    - Use pass_turn when staying hidden is more valuable than acting.
+    - Ransomware causes the most business damage; use it on high-value hosts.
+""").strip()
 # =============================================================================
+# Logging Helpers (EXACT hackathon format)
 # =============================================================================
 def log_start(task: str, env: str, model: str) -> None:
 def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]) -> None:
     error_val = error if error else "null"
+    done_val  = str(done).lower()
     print(
         f"[STEP] step={step} action={action} reward={reward:.2f} done={done_val} error={error_val}",
         flush=True,
         flush=True,
     )
 # =============================================================================
+# Observation Formatting
 # =============================================================================
 def format_observation(obs: SOCObservation) -> str:
+    """Format Blue Team observation into readable text for the LLM."""
     parts = []
     if obs.alert_queue:
         parts.append(f"## Active Alerts ({len(obs.alert_queue)}):")
         for a in obs.alert_queue:
             if a.ioc_indicators:
                 parts.append(f"    IOCs: {', '.join(a.ioc_indicators)}")
     topo = obs.network_topology
     parts.append(f"\n## Network Status:")
+    parts.append(
+        f"  Compromised: {topo.compromised_count} | "
+        f"Isolated: {topo.isolated_count} | "
+        f"Online: {topo.online_count}"
+    )
     if obs.host_forensics:
         f = obs.host_forensics
         parts.append(f"\n## Forensics Result ({f.hostname}):")
         parts.append(f"  Network connections: {f.network_connections}")
         parts.append(f"  Memory artifacts: {f.memory_artifacts}")
     parts.append(f"\n## Active Threats: {obs.active_threats if obs.active_threats else 'None (all contained!)'}")
     parts.append(f"## Business Impact: {obs.business_impact_score:.2f}")
     parts.append(f"## Step: {obs.step_count} / {obs.max_steps}")
     if obs.timeline:
         parts.append(f"\n## Recent Actions:")
         for t in obs.timeline[-5:]:
+            if not t.action_type.startswith("red:"):
+                parts.append(f"  Step {t.step}: {t.action_type} -> {t.target} (reward={t.reward:.2f})")
     return "\n".join(parts)
+def format_red_observation(red_obs: Dict[str, Any]) -> str:
+    """Format the Red Team observation into readable text for the Red LLM.
+
+    Every field is treated as optional: a missing or ``None`` value falls
+    back to a placeholder, so a partial observation (including ``{}``)
+    never raises while building the prompt.
+
+    Args:
+        red_obs: Mapping read for the keys ``round``, ``compromised_hosts``,
+            ``blue_actions_detected``, ``active_threats`` and
+            ``business_impact``. Entries of ``blue_actions_detected`` are
+            read for ``step``, ``action`` and ``target``.
+
+    Returns:
+        A newline-joined, markdown-style summary.
+    """
+    compromised = red_obs.get("compromised_hosts") or []
+    blue_actions = red_obs.get("blue_actions_detected") or []
+    active_threats = red_obs.get("active_threats") or []
+    impact = red_obs.get("business_impact")
+    if impact is None:
+        # An explicit None would otherwise break the ':.2f' format below.
+        impact = 0.0
+    parts = [
+        f"## Round: {red_obs.get('round', '?')}",
+        f"\n## Your Compromised Hosts ({len(compromised)}):",
+    ]
+    parts.extend(f"  - {host}" for host in compromised)
+    if blue_actions:
+        parts.append("\n## Blue Team's Last Action (detected):")
+        for entry in blue_actions:
+            # .get keeps one incomplete entry from aborting the whole prompt.
+            parts.append(
+                f"  Step {entry.get('step', '?')}: "
+                f"{entry.get('action', '?')} -> {entry.get('target', '?')}"
+            )
+    else:
+        parts.append("\n## Blue Team's Last Action: (none detected yet)")
+    parts.append(f"\n## Active Threats Still Live: {active_threats}")
+    parts.append(f"## Business Impact So Far: {impact:.2f}")
+    return "\n".join(parts)
+# =============================================================================
+# LLM Action Parsing
+# =============================================================================
 def parse_llm_action(content: str) -> Dict[str, Any]:
     """Parse the LLM's response into a valid action dict."""
     content = content.strip()
     if content.startswith("```"):
+        lines = [l for l in content.split("\n") if not l.strip().startswith("```")]
         content = "\n".join(lines).strip()
     try:
     except json.JSONDecodeError:
         pass
     for start in range(len(content)):
         if content[start] == "{":
             for end in range(len(content), start, -1):
     raise ValueError(f"Could not parse action from LLM response: {content[:200]}")
+# =============================================================================
+# LLM Callers
+# =============================================================================
 def get_model_action(
     client: OpenAI,
     step: int,
     task_id: str,
     history: List[str],
 ) -> str:
+    """Get the next Blue Team action from the LLM."""
     obs_text = format_observation(obs)
     if step == 1:
             model=MODEL_NAME,
             messages=[
                 {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user",   "content": user_content},
             ],
             temperature=TEMPERATURE,
             max_tokens=MAX_TOKENS,
         return text if text else '{"type": "query_host", "hostname": "WS-001"}'
     except Exception as exc:
         if "429" in str(exc) or "RateLimit" in str(exc):
+            raise
+        print(f"[DEBUG] Blue model request failed: {exc}", flush=True)
         return '{"type": "query_host", "hostname": "WS-001"}'
+def get_red_model_action(
+    client: OpenAI,
+    step: int,
+    red_obs: Dict[str, Any],
+    task_id: str,
+) -> str:
+    """Ask the Red LLM for its next move and return the raw reply text.
+
+    The literal ``pass_turn`` JSON is returned when ``compromised_hosts`` is
+    empty, when the model reply is empty, or when the request fails with an
+    exception whose message contains neither "429" nor "RateLimit"; those
+    two cases are re-raised to the caller.
+    """
+    pass_turn = '{"type": "pass_turn"}'
+    # The observation is formatted before the foothold check.
+    situation = format_red_observation(red_obs)
+    if not red_obs.get("compromised_hosts", []):
+        return pass_turn
+    # The first step gets a briefing header; later steps get an update header.
+    if step == 1:
+        header = f"## Mission Briefing (Task: {task_id.upper()})"
+        closing = "You have initial footholds. Plan your next move. Respond with a single JSON action."
+    else:
+        header = "## Situation Update:"
+        closing = "Choose your next Red Team action. Respond with a single JSON action."
+    prompt = "\n\n".join((header, situation, closing))
+    try:
+        reply = client.chat.completions.create(
+            model=RED_MODEL_NAME,
+            messages=[
+                {"role": "system", "content": RED_SYSTEM_PROMPT},
+                {"role": "user", "content": prompt},
+            ],
+            temperature=TEMPERATURE,
+            max_tokens=512,
+            stream=False,
+        )
+        answer = (reply.choices[0].message.content or "").strip()
+    except Exception as exc:
+        reason = str(exc)
+        if "429" in reason or "RateLimit" in reason:
+            raise
+        print(f"[DEBUG] Red model request failed: {exc}", flush=True)
+        return pass_turn
+    return answer or pass_turn
 # =============================================================================
 # Episode Runner
 # =============================================================================
+async def run_episode(
+    blue_client: OpenAI,
+    task_id: str,
+    red_client: Optional[OpenAI] = None,
+    fsp: bool = False,
+) -> tuple:
+    """Run a single episode. Returns (success, steps, score, rewards).
+    Args:
+        blue_client: OpenAI client for the Blue Team LLM.
+        task_id:     Task difficulty ('easy', 'medium', 'hard').
+        red_client:  OpenAI client for the Red Team LLM (FSP mode only).
+                     Falls back to blue_client when None.
+        fsp:         When True, enables Fictitious Self-Play (Blue + Red alternate).
+    """
+    if red_client is None:
+        red_client = blue_client
+    env = CyberSOCEnvironment(fsp_mode=fsp)
     history: List[str] = []
     rewards: List[float] = []
     steps_taken = 0
+    score  = 0.0
     success = False
     log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
     try:
         obs = env.reset(task_id=task_id)
         max_steps = MAX_STEPS.get(task_id, 30)
         for step in range(1, max_steps + 1):
             if obs.done:
                 break
+            # ── Blue Turn ────────────────────────────────────────────────────
+            blue_response = get_model_action(blue_client, step, obs, task_id, history)
+            error      = None
             action_str = "unknown"
+            reward     = 0.0
+            done       = False
             try:
+                action_dict = parse_llm_action(blue_response)
+                action_str  = action_dict.get("type", "unknown")
+                blue_action = SOCActionWrapper(**action_dict)
+                obs         = env.step(blue_action)
+                reward      = obs.reward or 0.0
+                done        = obs.done
             except Exception as exc:
                 error = str(exc)[:200]
+                done  = False
             rewards.append(reward)
             steps_taken = step
             log_step(step=step, action=action_str, reward=reward, done=done, error=error)
+            history.append(f"Step {step} [Blue]: {action_str} -> reward {reward:+.2f}")
             if done:
                 break
+            # ── Red Turn (FSP mode only) ──────────────────────────────────────
+            if fsp and getattr(obs, "active_turn", "blue") == "red":
+                red_obs_data = obs.red_observation or {}
+                red_response = get_red_model_action(red_client, step, red_obs_data, task_id)
+                # Name of the Red action that actually ran this round. It starts as
+                # the fallback so the history line below never reads an unbound or
+                # stale value when parsing, validation or the step itself fails.
+                red_action_str = "pass_turn"
+                try:
+                    red_dict   = parse_llm_action(red_response)
+                    red_action = RedActionWrapper(**red_dict)
+                    obs        = env.step(red_action)
+                    done       = obs.done
+                    # Record the action only once it has been applied.
+                    red_action_str = red_action.type
+                except Exception as exc:
+                    print(f"[DEBUG] Red action failed: {exc}", flush=True)
+                    # Fall back to PassTurn to close the round
+                    try:
+                        obs  = env.step(RedActionWrapper(type="pass_turn"))
+                        done = obs.done
+                    except Exception as fallback_exc:
+                        # Still best-effort (the episode continues), but no longer silent.
+                        print(f"[DEBUG] Red pass_turn fallback failed: {fallback_exc}", flush=True)
+                history.append(f"Step {step} [Red]: {red_action_str}")
+                if done:
+                    break
+        # Final score
         if obs.final_score is not None:
             score = obs.final_score
         else:
             score = sum(rewards) / MAX_POSSIBLE_REWARD if MAX_POSSIBLE_REWARD > 0 else 0.0
+        score   = min(max(score, 0.0), 1.0)
         success = score >= SUCCESS_SCORE_THRESHOLD
     finally:
 async def main() -> None:
     """Run baseline inference across all tasks."""
+    blue_client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
+    red_client  = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) if FSP_MODE else None
+    total_scores: Dict[str, float] = {}
     for task_id in TASKS:
+        success, steps, score, rewards = await run_episode(
+            blue_client=blue_client,
+            task_id=task_id,
+            red_client=red_client,
+            fsp=FSP_MODE,
+        )
         total_scores[task_id] = score
     avg = sum(total_scores.values()) / len(total_scores) if total_scores else 0.0
     print(f"\n# Summary: avg_score={avg:.3f}", flush=True)
     for tid, s in total_scores.items():

models.py CHANGED Viewed

@@ -234,6 +234,17 @@ class SOCObservation(Observation):
             "Keys match grade_breakdown (threat_containment, ioc_blocking, etc.)."
         ),
     )
 # =============================================================================
@@ -345,6 +356,80 @@ class QuarantineFile(Action):
     file_path: str = Field(..., description="File path to quarantine")
 # Discriminated union of all SOC actions
 SOCAction = Annotated[
     Union[
@@ -438,3 +523,7 @@ class SOCState(State):
         default=None,
         description="Mutable copy of containment_requirements (for adaptive grading).",
     )

             "Keys match grade_breakdown (threat_containment, ioc_blocking, etc.)."
         ),
     )
+    active_turn: str = Field(
+        default="blue",
+        description="Whose turn it is next: 'blue' or 'red'. Used by FSP inference loops.",
+    )
+    red_observation: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=(
+            "Red Team's current view of the world (populated when active_turn='red'). "
+            "Contains compromised_hosts and blue_actions_detected."
+        ),
+    )
 # =============================================================================
     file_path: str = Field(..., description="File path to quarantine")
+# =============================================================================
+# Red Team Actions (FSP — Fictitious Self-Play)
+# =============================================================================
+class LateralPivot(Action):
+    """Red Team: move laterally from a compromised host to a new target."""
+    type: Literal["lateral_pivot"] = Field(default="lateral_pivot")
+    source_host: str = Field(..., description="Already-compromised host used as the pivot point")
+    target_host: str = Field(..., description="Destination host to compromise")
+class DeployPayload(Action):
+    """Red Team: deploy a malicious payload on a host Red already controls."""
+    type: Literal["deploy_payload"] = Field(default="deploy_payload")
+    hostname: str = Field(..., description="Compromised host to deploy payload on")
+    payload_type: Literal["ransomware", "exfiltration", "c2"] = Field(
+        ..., description="Class of payload to deploy"
+    )
+class EvadeDetection(Action):
+    """Red Team: apply an evasion technique on a compromised host."""
+    type: Literal["evade_detection"] = Field(default="evade_detection")
+    hostname: str = Field(..., description="Compromised host to apply evasion on")
+    technique: Literal["migrate_pid", "clear_logs"] = Field(
+        ...,
+        description=(
+            "migrate_pid: rename running malicious processes to blend with system names; "
+            "clear_logs: remove SIEM alerts originating from this host"
+        ),
+    )
+class PassTurn(Action):
+    """Red Team: remain stealthy and take no action this turn."""
+    type: Literal["pass_turn"] = Field(default="pass_turn")
+# Constant used by dashboard_server and inference to route payloads
+RED_ACTION_TYPES: frozenset = frozenset(
+    {"lateral_pivot", "deploy_payload", "evade_detection", "pass_turn"}
+)
+# Discriminated union of all Red actions
+RedAction = Annotated[
+    Union[LateralPivot, DeployPayload, EvadeDetection, PassTurn],
+    Field(discriminator="type"),
+]
+class RedActionWrapper(Action):
+    """Loosely-typed envelope for Red Team actions on the WS/HTTP layer.
+
+    Only ``type`` is declared; extra fields are allowed and are passed to
+    the concrete action class by ``to_typed_action``.
+    """
+    type: str = Field(..., description="Red action type discriminator")
+    model_config = ConfigDict(extra="allow")
+    def to_typed_action(self):
+        """Build the concrete Red action selected by ``type``.
+
+        Raises:
+            ValueError: If ``type`` is not one of the known Red action types.
+        """
+        payload = self.model_dump(exclude={"metadata"})
+        registry = {
+            "lateral_pivot": LateralPivot,
+            "deploy_payload": DeployPayload,
+            "evade_detection": EvadeDetection,
+            "pass_turn": PassTurn,
+        }
+        kind = payload["type"]
+        if kind not in registry:
+            raise ValueError(
+                f"Unknown red action type: {kind}. "
+                f"Valid types: {list(registry)}"
+            )
+        # Field validation is done by the concrete class constructor.
+        return registry[kind](**payload)
 # Discriminated union of all SOC actions
 SOCAction = Annotated[
     Union[
         default=None,
         description="Mutable copy of containment_requirements (for adaptive grading).",
     )
+    active_turn: str = Field(
+        default="blue",
+        description="Current active turn in the FSP engine: 'blue' or 'red'.",
+    )

server/play_environment.py CHANGED Viewed

@@ -47,6 +47,12 @@ try:
         TerminatePID,
         CreateFirewallRule,
         QuarantineFile,
     )
 except ImportError:
     from models import (
@@ -69,6 +75,12 @@ except ImportError:
         TerminatePID,
         CreateFirewallRule,
         QuarantineFile,
     )
 from .tasks import get_task, build_network
@@ -149,12 +161,25 @@ class CyberSOCEnvironment(Environment):
         adaptive: bool = False,
         neural_red_policy: Optional[Any] = None,
         red_team_logger: Optional[Callable[[Dict[str, Any]], None]] = None,
     ):
-        """Initialize the environment (actual state set in reset)."""
         super().__init__()
         self._adaptive = adaptive
         self._neural_red_policy = neural_red_policy
         self._red_team_logger = red_team_logger
         self._red_team_decisions: List[Dict[str, Any]] = []
         self._live_requirements: Dict[str, Any] = {}
         self._threat_graph = None  # will be initialized on reset()
@@ -252,6 +277,7 @@ class CyberSOCEnvironment(Environment):
             timeline=[],
             is_done=False,
             submitted_plan=False,
         )
         self._plan_entries = []
@@ -347,27 +373,44 @@ class CyberSOCEnvironment(Environment):
     def step(
         self,
-        action: SOCActionWrapper,  # type: ignore[override]
         timeout_s: Optional[float] = None,
         **kwargs: Any,
     ) -> SOCObservation:
-        """Process one agent action.
-        Args:
-            action: SOCActionWrapper containing the typed action.
-            timeout_s: Ignored.
         Returns:
-            SOCObservation with updated state, reward, and done flag.
         """
         if self._state.is_done:
             return self._build_observation(reward=0.0, done=True)
-        # Convert wrapper to typed action (before consuming a step)
         typed_action = action.to_typed_action()
         args = typed_action.model_dump(exclude={"metadata", "type"})
-        # Pre-flight validation — invalid actions are penalised without consuming a step
         current_phase = self._get_current_phase()
         validation_error = self._middleware.validate(
             current_phase, typed_action.type, args, self._threat_graph
@@ -378,16 +421,13 @@ class CyberSOCEnvironment(Environment):
             self._state.total_reward += penalty
             return self._build_observation(reward=penalty, done=False)
-        # Action is valid — now consume the step
-        self._state.step_count += 1
-        # Dispatch to handler
         reward = 0.0
         result_description = "unknown action"
-        # Reset per-step observation extras at the start of every step
-        self._last_obs_extras = {}
         if isinstance(typed_action, QueryHost):
             reward, result_description = self._handle_query_host(typed_action)
         elif isinstance(typed_action, IsolateSegment):
@@ -422,9 +462,11 @@ class CyberSOCEnvironment(Environment):
         elif isinstance(typed_action, QuarantineFile):
             reward, result_description = self._handle_quarantine_file(typed_action)
-        # Step reward (idempotent per triple)
         target = self._get_action_target(typed_action)
-        step_r = self._get_step_reward(phase="investigation", action_type=typed_action.type, target=target)
         reward += step_r
         self._step_reward_total += step_r
@@ -436,26 +478,26 @@ class CyberSOCEnvironment(Environment):
         if len(self._recent_actions) >= 3:
             last_three = self._recent_actions[-3:]
             if last_three[0] == last_three[1] == last_three[2]:
-                reward -= 0.05  # stall penalty
-        # Adaptive adversary reaction (deterministic by default, optional neural override)
-        self._apply_red_team_dynamics(action_type=typed_action.type, target=target)
         # Business impact grows each step (attacker progresses)
         if not self._state.is_done:
             impact_rate = self._task_def.get("impact_per_step", 0.02)
-            # Reduce impact growth if threats are being contained
-            active_ratio = len(self._state.active_threats) / max(1, len(self._task_def["attack_chain"]))
             self._state.business_impact = min(
-                1.0,
-                self._state.business_impact + impact_rate * active_ratio,
             )
         # Record timeline
         self._state.timeline.append({
-            "step": self._state.step_count,
             "action_type": typed_action.type,
-            "target": self._get_action_target(typed_action),
             "result": result_description,
             "reward": reward,
         })
@@ -463,16 +505,77 @@ class CyberSOCEnvironment(Environment):
         # Accumulate reward
         self._state.total_reward += reward
-        # Check termination
         done = False
         if self._state.submitted_plan:
             done = True
             self._state.is_done = True
-        elif self._state.step_count >= self._state.max_steps:
             done = True
             self._state.is_done = True
-            reward -= 0.20  # Penalty for running out of time
-            self._state.total_reward += (-0.20)
         return self._build_observation(reward=reward, done=done)
@@ -822,9 +925,6 @@ class CyberSOCEnvironment(Environment):
             reward = -0.08  # Penalty: killing legitimate process = downtime
             self._state.business_impact = min(1.0, self._state.business_impact + 0.03)
-        if was_malicious:
-            self._maybe_reinfect(hostname, process)
         return reward, f"Killed '{process}' on {hostname}. Malicious: {was_malicious}"
     def _handle_terminate_pid(self, action: TerminatePID) -> tuple[float, str]:
@@ -878,7 +978,6 @@ class CyberSOCEnvironment(Environment):
             self._state.business_impact = min(1.0, self._state.business_impact + 0.04)
             return reward, f"Terminated benign PID '{pid}' on {hostname} - business disruption"
-        self._maybe_reinfect(hostname, process_name)
         return reward, f"Terminated PID '{pid}' on {hostname}. Malicious: True"
     def _handle_create_firewall_rule(self, action: CreateFirewallRule) -> tuple[float, str]:
@@ -1097,6 +1196,203 @@ class CyberSOCEnvironment(Environment):
             "description": f"Scanned {hostname}: found {len(vuln_results)} CVEs",
         }
     # ===========================================================================
     # Helpers
     # ===========================================================================
@@ -1326,6 +1622,13 @@ class CyberSOCEnvironment(Environment):
         # Per-step partial reward dimensions for GRPO credit assignment
         reward_dimensions = self._compute_reward_dimensions()
         return SOCObservation(
             episode_id=self._state.episode_id or "",
             alert_queue=alerts,
@@ -1349,6 +1652,8 @@ class CyberSOCEnvironment(Environment):
             threat_graph_summary=threat_graph_summary,
             available_playbooks=[],
             reward_dimensions=reward_dimensions,
         )
     def _get_action_target(self, action: Any) -> str:
@@ -1383,18 +1688,38 @@ class CyberSOCEnvironment(Environment):
     # Adaptive Red Team + Step Rewards (Task 10)
     # ===========================================================================
-    def _build_red_observation(self, action_type: str, target: str) -> Dict[str, Any]:
-        """Compact red-side view used for imitation logs and neural policies."""
         return {
             "episode_id": self._state.episode_id,
-            "task_id": self._state.task_id,
-            "step_count": self._state.step_count,
-            "blue_action_type": action_type,
-            "blue_action_target": target,
             "active_threats": list(self._state.active_threats),
-            "contained_threats": list(self._state.contained_threats),
-            "business_impact": self._state.business_impact,
-            "adaptive_enabled": self._adaptive,
         }
     def _log_red_decision(self, observation: Dict[str, Any], action: Dict[str, Any]) -> None:
@@ -1409,77 +1734,16 @@ class CyberSOCEnvironment(Environment):
                 pass
     def _apply_red_team_dynamics(self, action_type: str, target: str) -> None:
-        """
-        Route red-team behavior through deterministic logic (default) or neural policy.
-        When no neural policy is provided, behavior is unchanged from the legacy
-        deterministic `_adversary_react` implementation.
         """
-        red_obs = self._build_red_observation(action_type=action_type, target=target)
-        if self._neural_red_policy is None:
-            result = self._adversary_react(action_type=action_type, target=target)
-            self._log_red_decision(
-                red_obs,
-                result or {"policy": "deterministic", "action_type": "noop"},
-            )
-            return
-        policy_fn = None
-        if callable(self._neural_red_policy):
-            policy_fn = self._neural_red_policy
-        elif hasattr(self._neural_red_policy, "act"):
-            policy_fn = self._neural_red_policy.act
-        if policy_fn is None:
-            result = self._adversary_react(action_type=action_type, target=target)
-            self._log_red_decision(
-                red_obs,
-                result or {"policy": "deterministic_fallback", "action_type": "noop"},
-            )
-            return
-        try:
-            proposed = policy_fn(red_obs)
-        except Exception as exc:
-            result = self._adversary_react(action_type=action_type, target=target)
-            self._log_red_decision(
-                red_obs,
-                {
-                    "policy": "neural_fallback",
-                    "action_type": "noop",
-                    "error": f"{type(exc).__name__}: {exc}",
-                },
-            )
-            if result is not None:
-                self._log_red_decision(red_obs, result)
-            return
-        if not isinstance(proposed, dict):
-            self._log_red_decision(
-                red_obs,
-                {"policy": "neural_invalid", "action_type": "noop"},
-            )
-            return
-        red_action_type = str(proposed.get("action_type", "noop"))
-        if red_action_type == "lateral_pivot":
-            source_host = str(proposed.get("source_host") or target)
-            outcome = self._execute_lateral_pivot(source_host=source_host)
-            self._log_red_decision(
-                red_obs,
-                {
-                    "policy": "neural",
-                    "action_type": "lateral_pivot",
-                    "source_host": source_host,
-                    "executed": bool(outcome and outcome.get("executed")),
-                },
-            )
-            return
         self._log_red_decision(
             red_obs,
-            {"policy": "neural", "action_type": red_action_type},
         )
     def export_red_team_decisions(self) -> List[Dict[str, Any]]:
@@ -1578,123 +1842,8 @@ class CyberSOCEnvironment(Environment):
         })
     def _adversary_react(self, action_type: str, target: str) -> Optional[Dict[str, Any]]:
-        """Adaptive red team response — fires after each step when adaptive=True."""
-        if not self._adaptive:
-            return None
-        difficulty = self._task_def.get("difficulty") or getattr(self._state, "task_id", "easy")
-        # Reduced medium base probability for better GRPO credit assignment
-        pivot_probability = {"easy": 0.0, "medium": 0.3, "hard": 1.0}.get(difficulty, 0.0)
-        # Time-pressure escalation: attacker moves faster when uncontained and late in episode
-        if self._state.step_count > 10 and len(self._state.contained_threats) == 0:
-            pivot_probability += 0.2
-        # Trigger on isolate_segment OR kill_process (extended pivot trigger)
-        if action_type in ("isolate_segment", "kill_process") and pivot_probability > 0:
-            if self._rng.random() < pivot_probability:
-                source_host = target.split("/")[0] if "/" in target else target
-                return self._execute_lateral_pivot(source_host=source_host)
-        return {"policy": "deterministic", "action_type": "noop", "executed": False}
-    def _execute_lateral_pivot(self, source_host: str) -> Optional[Dict[str, Any]]:
-        """Copy-not-move lateral pivot: spread to an adjacent healthy host.
-        Rubric is capped at MAX_RUBRIC_ITEMS to prevent competent agents from
-        being penalised by an impossible-to-complete rubric.
-        """
-        MAX_RUBRIC_ITEMS = 12
-        graph = self._threat_graph
-        if graph is None:
-            return None
-        # Rubric cap: stop pivoting once live_requirements is full
-        if self._live_requirements:
-            current_items = (
-                len(self._live_requirements.get("must_kill", []))
-                + len(self._live_requirements.get("must_isolate", []))
-            )
-            if current_items >= MAX_RUBRIC_ITEMS:
-                return {"policy": "deterministic", "action_type": "lateral_pivot", "executed": False}
-        adjacent_hosts = [
-            e.target_id for e in graph.edges
-            if e.source_id == source_host and e.target_id in graph.hosts
-            and graph.hosts[e.target_id].status == "healthy"
-        ]
-        if not adjacent_hosts:
-            # Try graph hosts first, then fall back to full host_index
-            healthy_hosts = [
-                h for h, node in graph.hosts.items()
-                if node.status == "healthy" and h != source_host
-            ]
-            if not healthy_hosts:
-                # Expand search to the full network
-                healthy_hosts = [
-                    h for h, hd in self._host_index.items()
-                    if hd.get("status", "online") not in ("compromised", "isolated")
-                    and h != source_host
-                    and h not in graph.hosts
-                ]
-            if not healthy_hosts:
-                return {"policy": "deterministic", "action_type": "lateral_pivot", "executed": False}
-            adjacent_hosts = healthy_hosts
-        dest_host = self._rng.choice(adjacent_hosts)
-        # Ensure destination host is in graph
-        if dest_host not in graph.hosts:
-            hd = self._host_index.get(dest_host, {})
-            graph.add_host(HostNode(
-                hostname=dest_host,
-                subnet=hd.get("subnet", "corporate"),
-                business_criticality="medium",
-                status="healthy",
-            ))
-        source_processes = [p for p in graph.processes.values() if p.hostname == source_host]
-        if not source_processes:
-            return {"policy": "deterministic", "action_type": "lateral_pivot", "executed": False}
-        original = source_processes[0]
-        new_pid = str(uuid.uuid4())[:8]  # uuid imported at module level
-        new_process = ProcessNode(
-            process_id=f"{dest_host}:{new_pid}",
-            hostname=dest_host,
-            process_name=original.process_name,
-            killed=False,
-        )
-        graph.add_process(new_process)
-        graph.add_edge(Edge(
-            edge_type="pivoted_from",
-            source_id=dest_host,
-            target_id=source_host,
-            evidence={"trigger_action": "isolate_segment", "step": self._state.step_count},
-        ))
-        if self._live_requirements is None:
-            self._live_requirements = {}
-        self._live_requirements.setdefault("must_kill", []).append(
-            f"{dest_host}:{original.process_name}"
-        )
-        self._live_requirements.setdefault("must_isolate", []).append(dest_host)
-        new_alert = AlertNode(
-            alert_id=f"PIVOT-{new_pid}",
-            severity="critical",
-            priority_score=15.0,
-            source_host=dest_host,
-        )
-        graph.add_alert(new_alert)
-        return {
-            "policy": "deterministic",
-            "action_type": "lateral_pivot",
-            "executed": True,
-            "source_host": source_host,
-            "dest_host": dest_host,
-            "alert_id": new_alert.alert_id,
-        }
     @property
     def state(self) -> SOCState:

         TerminatePID,
         CreateFirewallRule,
         QuarantineFile,
+        RedActionWrapper,
+        LateralPivot,
+        DeployPayload,
+        EvadeDetection,
+        PassTurn,
+        RED_ACTION_TYPES,
     )
 except ImportError:
     from models import (
         TerminatePID,
         CreateFirewallRule,
         QuarantineFile,
+        RedActionWrapper,
+        LateralPivot,
+        DeployPayload,
+        EvadeDetection,
+        PassTurn,
+        RED_ACTION_TYPES,
     )
 from .tasks import get_task, build_network
         adaptive: bool = False,
         neural_red_policy: Optional[Any] = None,
         red_team_logger: Optional[Callable[[Dict[str, Any]], None]] = None,
+        fsp_mode: bool = False,
     ):
+        """Initialize the environment (actual state set in reset).
+        Args:
+            adaptive: Legacy adaptive-adversary flag (kept for backward compat).
+            neural_red_policy: Optional callable for neural Red policy (legacy hook).
+            red_team_logger: Optional callback for recording Red decisions.
+            fsp_mode: When True, step() uses strict alternating turns and
+                step_count only increments after BOTH Blue and Red have acted.
+                When False (default), step(SOCActionWrapper) behaves exactly as
+                before — Red's PassTurn is applied automatically so existing code
+                and tests remain unaffected.
+        """
         super().__init__()
         self._adaptive = adaptive
         self._neural_red_policy = neural_red_policy
         self._red_team_logger = red_team_logger
+        self._fsp_mode = fsp_mode
         self._red_team_decisions: List[Dict[str, Any]] = []
         self._live_requirements: Dict[str, Any] = {}
         self._threat_graph = None  # will be initialized on reset()
             timeline=[],
             is_done=False,
             submitted_plan=False,
+            active_turn="blue",
         )
         self._plan_entries = []
     def step(
         self,
+        action,  # SOCActionWrapper | RedActionWrapper
         timeout_s: Optional[float] = None,
         **kwargs: Any,
     ) -> SOCObservation:
+        """Process one agent action — Blue (SOCActionWrapper) or Red (RedActionWrapper).
+        Turn semantics (fsp_mode=True):
+          • Blue step: execute, flip active_turn → 'red', do NOT increment step_count.
+          • Red step:  execute, flip active_turn → 'blue', increment step_count.
+        When fsp_mode=False (default / backward-compat):
+          • Blue step auto-applies a Red PassTurn so step_count always increments,
+            preserving all existing test and dashboard behaviour.
         Returns:
+            SOCObservation; includes active_turn and red_observation fields.
         """
         if self._state.is_done:
             return self._build_observation(reward=0.0, done=True)
+        if isinstance(action, RedActionWrapper):
+            return self._step_red(action)
+        return self._step_blue(action)
+    # ------------------------------------------------------------------
+    # _step_blue — execute a Blue (SOC analyst) action
+    # ------------------------------------------------------------------
+    def _step_blue(
+        self,
+        action: SOCActionWrapper,
+    ) -> SOCObservation:
+        """Execute one Blue turn."""
+        # Convert wrapper to typed action
         typed_action = action.to_typed_action()
         args = typed_action.model_dump(exclude={"metadata", "type"})
+        # Pre-flight validation — penalise without consuming a step
         current_phase = self._get_current_phase()
         validation_error = self._middleware.validate(
             current_phase, typed_action.type, args, self._threat_graph
             self._state.total_reward += penalty
             return self._build_observation(reward=penalty, done=False)
+        # Reset per-step extras
+        self._last_obs_extras = {}
+        # Dispatch to Blue handler
         reward = 0.0
         result_description = "unknown action"
         if isinstance(typed_action, QueryHost):
             reward, result_description = self._handle_query_host(typed_action)
         elif isinstance(typed_action, IsolateSegment):
         elif isinstance(typed_action, QuarantineFile):
             reward, result_description = self._handle_quarantine_file(typed_action)
+        # Idempotent step reward
         target = self._get_action_target(typed_action)
+        step_r = self._get_step_reward(
+            phase="investigation", action_type=typed_action.type, target=target
+        )
         reward += step_r
         self._step_reward_total += step_r
         if len(self._recent_actions) >= 3:
             last_three = self._recent_actions[-3:]
             if last_three[0] == last_three[1] == last_three[2]:
+                reward -= 0.05
         # Business impact grows each step (attacker progresses)
         if not self._state.is_done:
             impact_rate = self._task_def.get("impact_per_step", 0.02)
+            active_ratio = len(self._state.active_threats) / max(
+                1, len(self._task_def["attack_chain"])
+            )
             self._state.business_impact = min(
+                1.0, self._state.business_impact + impact_rate * active_ratio
             )
+        # Round label: step_count+1 = current round being played (not yet closed)
+        round_label = self._state.step_count + 1
         # Record timeline
         self._state.timeline.append({
+            "step": round_label,
             "action_type": typed_action.type,
+            "target": target,
             "result": result_description,
             "reward": reward,
         })
         # Accumulate reward
         self._state.total_reward += reward
+        # Check if episode ends due to Blue action (plan submission)
         done = False
         if self._state.submitted_plan:
             done = True
             self._state.is_done = True
+            self._state.active_turn = "blue"  # episode over — keep at blue
+            # In non-FSP mode, still increment step_count for consistency
+            if not self._fsp_mode:
+                self._state.step_count += 1
+            return self._build_observation(reward=reward, done=done)
+        # Flip turn to Red
+        self._state.active_turn = "red"
+        # fsp_mode=False (backward compat): auto-apply Red PassTurn so
+        # callers that only drive Blue see step_count increment as before.
+        if not self._fsp_mode:
+            self._state.step_count += 1
+            self._state.active_turn = "blue"
+            # Timeout check (done after Red's "auto turn")
+            if self._state.step_count >= self._state.max_steps:
+                reward -= 0.20
+                self._state.total_reward -= 0.20
+                self._state.is_done = True
+                done = True
+        return self._build_observation(reward=reward, done=done)
+    # ------------------------------------------------------------------
+    # _step_red — execute a Red Team action
+    # ------------------------------------------------------------------
+    def _step_red(self, action: RedActionWrapper) -> SOCObservation:
+        """Execute one Red turn. Only valid when active_turn == 'red'."""
+        if self._state.active_turn != "red":
+            # Wrong turn — return current obs with 0 reward (no state change)
+            return self._build_observation(reward=0.0, done=False)
+        typed_action = action.to_typed_action()
+        self._last_obs_extras = {}
+        reward = 0.0
+        result_description = "red: noop"
+        if isinstance(typed_action, LateralPivot):
+            reward, result_description = self._handle_lateral_pivot(typed_action)
+        elif isinstance(typed_action, DeployPayload):
+            reward, result_description = self._handle_deploy_payload(typed_action)
+        elif isinstance(typed_action, EvadeDetection):
+            reward, result_description = self._handle_evade_detection(typed_action)
+        elif isinstance(typed_action, PassTurn):
+            reward, result_description = self._handle_pass_turn(typed_action)
+        # Close the round: increment step_count, flip turn back to Blue
+        self._state.step_count += 1
+        self._state.active_turn = "blue"
+        # Record Red's action in timeline (prefixed with "red:" to distinguish)
+        self._state.timeline.append({
+            "step": self._state.step_count,
+            "action_type": f"red:{typed_action.type}",
+            "target": self._get_red_action_target(typed_action),
+            "result": result_description,
+            "reward": 0.0,  # Red actions don't add to Blue's reward total
+        })
+        # Timeout check after the full round
+        done = False
+        if self._state.step_count >= self._state.max_steps:
             done = True
             self._state.is_done = True
         return self._build_observation(reward=reward, done=done)
             reward = -0.08  # Penalty: killing legitimate process = downtime
             self._state.business_impact = min(1.0, self._state.business_impact + 0.03)
         return reward, f"Killed '{process}' on {hostname}. Malicious: {was_malicious}"
     def _handle_terminate_pid(self, action: TerminatePID) -> tuple[float, str]:
             self._state.business_impact = min(1.0, self._state.business_impact + 0.04)
             return reward, f"Terminated benign PID '{pid}' on {hostname} - business disruption"
         return reward, f"Terminated PID '{pid}' on {hostname}. Malicious: True"
     def _handle_create_firewall_rule(self, action: CreateFirewallRule) -> tuple[float, str]:
             "description": f"Scanned {hostname}: found {len(vuln_results)} CVEs",
         }
+    # ===========================================================================
+    # Red Team Action Handlers
+    # ===========================================================================
+    def _handle_lateral_pivot(self, action: LateralPivot) -> tuple[float, str]:
+        """Red: spread from a compromised host to a new target.
+        The pivot is a no-op (state untouched) unless the source exists and has
+        status "compromised", and the target exists and is neither "isolated"
+        nor already "compromised". On success the target is marked compromised,
+        one process name is copied from the source, the threat graph is
+        updated, a critical alert is queued for Blue, and a must_kill entry is
+        added to the live requirements.
+        Args:
+            action: LateralPivot carrying source_host and target_host.
+        Returns:
+            (0.0, description) — the reward component is always 0.0; the
+            description states the pivot performed or why it was refused.
+        """
+        src = action.source_host
+        dst = action.target_host
+        # --- Validation: every failure returns before any state is modified ---
+        if src not in self._host_index:
+            return 0.0, f"red: lateral_pivot — source '{src}' not in network"
+        if self._host_index[src].get("status") != "compromised":
+            return 0.0, f"red: lateral_pivot — '{src}' not under Red control"
+        if dst not in self._host_index:
+            return 0.0, f"red: lateral_pivot — target '{dst}' not in network"
+        # A target without an explicit status is treated as "online".
+        dst_status = self._host_index[dst].get("status", "online")
+        if dst_status == "isolated":
+            return 0.0, f"red: lateral_pivot — '{dst}' is isolated, pivot blocked by Blue"
+        if dst_status == "compromised":
+            return 0.0, f"red: lateral_pivot — '{dst}' already compromised"
+        # Compromise target and copy a process from source
+        self._host_index[dst]["status"] = "compromised"
+        # Candidate processes are the threat-graph process nodes located on src.
+        src_procs = (
+            [p for p in self._threat_graph.processes.values() if p.hostname == src]
+            if self._threat_graph else []
+        )
+        # Falls back to "cmd.exe" when the graph is missing or has no process on src.
+        proc_name = src_procs[0].process_name if src_procs else "cmd.exe"
+        self._host_index[dst].setdefault("running_processes", [])
+        if proc_name not in self._host_index[dst]["running_processes"]:
+            self._host_index[dst]["running_processes"].append(proc_name)
+        # Update threat graph
+        if self._threat_graph is not None:
+            if dst not in self._threat_graph.hosts:
+                hd = self._host_index[dst]
+                self._threat_graph.add_host(HostNode(
+                    hostname=dst,
+                    subnet=hd.get("subnet", "corporate"),
+                    business_criticality="medium",
+                    status="compromised",
+                ))
+            else:
+                self._threat_graph.hosts[dst].status = "compromised"
+            # Process ids are "<host>:<process name>", so an existing node is not duplicated.
+            pid = f"{dst}:{proc_name}"
+            if pid not in self._threat_graph.processes:
+                self._threat_graph.add_process(ProcessNode(
+                    process_id=pid, hostname=dst, process_name=proc_name
+                ))
+            # The edge points from the newly compromised host back to the pivot source.
+            self._threat_graph.add_edge(Edge(
+                edge_type="pivoted_from", source_id=dst, target_id=src
+            ))
+        # Generate SIEM alert for Blue
+        alert_id = f"PIVOT-{uuid.uuid4().hex[:6].upper()}"
+        subnet = self._host_index.get(dst, {}).get("subnet", "unknown")
+        # NOTE(review): the timestamp is a fixed literal, not the current time.
+        self._alert_queue.append({
+            "alert_id": alert_id,
+            "timestamp": "2024-01-01T00:00:00Z",
+            "source_host": dst,
+            "severity": "critical",
+            "threat_type": "lateral_movement",
+            "description": (
+                f"Lateral movement detected: {proc_name} spawned on {dst} "
+                f"(pivot from {src})"
+            ),
+            "ioc_indicators": [],
+            "subnet": subnet,
+            "is_acknowledged": False,
+        })
+        # Mirror the queued alert into the threat graph under the same alert_id.
+        if self._threat_graph is not None:
+            self._threat_graph.add_alert(AlertNode(
+                alert_id=alert_id, severity="critical",
+                priority_score=15.0, source_host=dst,
+            ))
+        # Update live rubric
+        # NOTE(review): must_kill entries are dicts here — confirm the grader
+        # that consumes _live_requirements expects this shape.
+        if self._live_requirements is not None:
+            self._live_requirements.setdefault("must_kill", []).append({
+                "hostname": dst, "process": proc_name, "threat_id": "FSP_PIVOT",
+            })
+        return 0.0, f"red: lateral_pivot {src} → {dst} (proc={proc_name})"
+    def _handle_deploy_payload(self, action: DeployPayload) -> tuple[float, str]:
+        """Red: deploy a malicious payload on a host Red controls."""
+        hostname = action.hostname
+        payload_type = action.payload_type
+        if hostname not in self._host_index:
+            return 0.0, f"red: deploy_payload — '{hostname}' not in network"
+        if self._host_index[hostname].get("status") != "compromised":
+            return 0.0, f"red: deploy_payload — no shell on '{hostname}'"
+        proc_name = {
+            "ransomware": "ransomware.exe",
+            "exfiltration": "exfil_agent.exe",
+            "c2": "c2_beacon.exe",
+        }[payload_type]
+        host = self._host_index[hostname]
+        if proc_name not in host.get("running_processes", []):
+            host.setdefault("running_processes", []).append(proc_name)
+        if self._threat_graph is not None:
+            pid = f"{hostname}:{proc_name}"
+            if pid not in self._threat_graph.processes:
+                self._threat_graph.add_process(ProcessNode(
+                    process_id=pid, hostname=hostname, process_name=proc_name
+                ))
+        impact_delta = {"ransomware": 0.15, "exfiltration": 0.08, "c2": 0.05}[payload_type]
+        self._state.business_impact = min(1.0, self._state.business_impact + impact_delta)
+        severity = {"ransomware": "critical", "exfiltration": "high", "c2": "high"}[payload_type]
+        alert_id = f"PAYLOAD-{uuid.uuid4().hex[:6].upper()}"
+        self._alert_queue.append({
+            "alert_id": alert_id,
+            "timestamp": "2024-01-01T00:00:00Z",
+            "source_host": hostname,
+            "severity": severity,
+            "threat_type": payload_type,
+            "description": (
+                f"{payload_type.capitalize()} payload deployed on {hostname}: {proc_name}"
+            ),
+            "ioc_indicators": [],
+            "subnet": host.get("subnet", "unknown"),
+            "is_acknowledged": False,
+        })
+        if self._threat_graph is not None:
+            self._threat_graph.add_alert(AlertNode(
+                alert_id=alert_id, severity=severity,
+                priority_score=18.0, source_host=hostname,
+            ))
+        return 0.0, f"red: deployed {payload_type} payload on {hostname}"
+    def _handle_evade_detection(self, action: EvadeDetection) -> tuple[float, str]:
+        """Red: apply a detection-evasion technique on a controlled host.
+        Args:
+            action: Supplies ``hostname`` (host to act on) and ``technique``.
+                ``"migrate_pid"`` and ``"clear_logs"`` have dedicated handling;
+                any other value changes no state in this method.
+        Returns:
+            ``(0.0, message)`` on every path. The message either describes the
+            evasion that was applied or explains why the request was rejected.
+        """
+        hostname = action.hostname
+        technique = action.technique
+        # Reject hosts that are unknown or that are not currently compromised.
+        if hostname not in self._host_index:
+            return 0.0, f"red: evade_detection — '{hostname}' not in network"
+        if self._host_index[hostname].get("status") != "compromised":
+            return 0.0, f"red: evade_detection — no shell on '{hostname}'"
+        if technique == "migrate_pid":
+            host = self._host_index[hostname]
+            # Process names the task's attack chain marks as malicious for
+            # threats that list this host among their compromised hosts.
+            malicious_procs = {
+                proc
+                for threat in self._task_def.get("attack_chain", [])
+                if hostname in threat.get("compromised_hosts", [])
+                for proc in threat.get("malicious_processes", [])
+            }
+            # Iterate over a copy: the live list is overwritten by index below.
+            for i, proc in enumerate(list(host.get("running_processes", []))):
+                if proc in malicious_procs:
+                    # The list index is embedded in the replacement name.
+                    new_name = f"svchost_{i}.exe"
+                    host["running_processes"][i] = new_name
+                    # NOTE(review): truthiness test, so a falsy graph object is
+                    # skipped as well as None — confirm that is intended.
+                    if self._threat_graph:
+                        old_pid = f"{hostname}:{proc}"
+                        # Re-key the graph node only if the old one was tracked.
+                        if old_pid in self._threat_graph.processes:
+                            self._threat_graph.processes.pop(old_pid)
+                            new_pid = f"{hostname}:{new_name}"
+                            self._threat_graph.add_process(ProcessNode(
+                                process_id=new_pid, hostname=hostname,
+                                process_name=new_name,
+                            ))
+            # Returned whether or not any process actually matched.
+            return 0.0, f"red: migrated PIDs on {hostname} to blend with system processes"
+        if technique == "clear_logs":
+            before = len(self._alert_queue)
+            # Keep only alerts whose source_host differs from this host.
+            self._alert_queue = [
+                a for a in self._alert_queue
+                if a.get("source_host") != hostname
+            ]
+            removed = before - len(self._alert_queue)
+            return 0.0, f"red: cleared {removed} SIEM alert(s) from {hostname}"
+        # Fallback for any other technique: no state is modified here.
+        return 0.0, f"red: evasion '{technique}' applied on {hostname}"
+    def _handle_pass_turn(self, action: PassTurn) -> tuple[float, str]:  # noqa: ARG002
+        """Red: remain stealthy, take no action.
+        Args:
+            action: Unused; accepted so the signature matches the other Red
+                handlers (hence the ARG002 suppression).
+        Returns:
+            Always ``(0.0, "red: pass_turn (stealth)")``.
+        """
+        return 0.0, "red: pass_turn (stealth)"
+    def _get_red_action_target(self, action: Any) -> str:
+        """Extract a compact target string from a Red action for timeline logging.
+        Args:
+            action: A Red action instance; its concrete class selects the format.
+        Returns:
+            ``"source→target"`` for LateralPivot, ``"hostname/payload_type"``
+            for DeployPayload, ``"hostname/technique"`` for EvadeDetection,
+            and ``"—"`` for any other action.
+        """
+        if isinstance(action, LateralPivot):
+            return f"{action.source_host}→{action.target_host}"
+        if isinstance(action, DeployPayload):
+            return f"{action.hostname}/{action.payload_type}"
+        if isinstance(action, EvadeDetection):
+            return f"{action.hostname}/{action.technique}"
+        # No target fields are read for other action types.
+        return "—"
     # ===========================================================================
     # Helpers
     # ===========================================================================
         # Per-step partial reward dimensions for GRPO credit assignment
         reward_dimensions = self._compute_reward_dimensions()
+        # Red observation — only populated when it is Red's turn next
+        red_obs = (
+            self._generate_red_observation()
+            if self._state.active_turn == "red"
+            else None
+        )
         return SOCObservation(
             episode_id=self._state.episode_id or "",
             alert_queue=alerts,
             threat_graph_summary=threat_graph_summary,
             available_playbooks=[],
             reward_dimensions=reward_dimensions,
+            active_turn=self._state.active_turn,
+            red_observation=red_obs,
         )
     def _get_action_target(self, action: Any) -> str:
     # Adaptive Red Team + Step Rewards (Task 10)
     # ===========================================================================
+    def _generate_red_observation(self) -> Dict[str, Any]:
+        """What the Red Team LLM sees: footholds it controls + Blue's last action.
+        Returned as the ``red_observation`` field in SOCObservation whenever
+        ``active_turn == 'red'``, so inference.py can feed it straight to the
+        Red LLM without a separate API call.
+        Returns:
+            A dict with keys ``episode_id``, ``round`` (``step_count + 1``),
+            ``compromised_hosts`` (hostnames whose status is "compromised"),
+            ``blue_actions_detected`` (zero or one timeline entry),
+            ``active_threats`` (a list copy) and ``business_impact``
+            (rounded to 4 decimal places).
+        """
+        # Hostnames from the host index whose status is exactly "compromised".
+        compromised_hosts = [
+            h for h, hd in self._host_index.items()
+            if hd.get("status") == "compromised"
+        ]
+        # Most recent Blue action from the timeline (exclude Red's own entries)
+        blue_actions_detected: List[Dict[str, Any]] = []
+        # Walk newest-first; entries whose action_type starts with "red:" are
+        # treated as Red's and skipped.
+        for entry in reversed(self._state.timeline):
+            action_type = entry.get("action_type", "")
+            if not action_type.startswith("red:"):
+                # NOTE(review): "step", "target" and "result" are read with
+                # [] and would raise KeyError if an entry lacks them.
+                blue_actions_detected.append({
+                    "step": entry["step"],
+                    "action": action_type,
+                    "target": entry["target"],
+                    "result": entry["result"],
+                })
+                break  # Only the single most recent Blue action
         return {
             "episode_id": self._state.episode_id,
+            "round": self._state.step_count + 1,
+            "compromised_hosts": compromised_hosts,
+            "blue_actions_detected": blue_actions_detected,
             "active_threats": list(self._state.active_threats),
+            "business_impact": round(self._state.business_impact, 4),
         }
     def _log_red_decision(self, observation: Dict[str, Any], action: Dict[str, Any]) -> None:
                 pass
     def _apply_red_team_dynamics(self, action_type: str, target: str) -> None:
+        """Log a Red-side observation record (imitation data for offline SFT).
+        In FSP mode the Red LLM acts via explicit RedActionWrapper steps, so
+        this method only records observations rather than executing any attack.
+        Args:
+            action_type: Stored in the logged record under ``blue_action``.
+            target: Stored in the logged record under ``blue_target``.
         """
+        # Snapshot the current Red-side view and pair it with a "noop" record.
+        red_obs = self._generate_red_observation()
         self._log_red_decision(
             red_obs,
+            {"policy": "fsp_turn_engine", "action_type": "noop",
+             "blue_action": action_type, "blue_target": target},
         )
     def export_red_team_decisions(self) -> List[Dict[str, Any]]:
         })
     def _adversary_react(self, action_type: str, target: str) -> Optional[Dict[str, Any]]:
+        """Legacy hook — disabled; Red Team now acts via explicit RedActionWrapper steps.
+        Both arguments are ignored and the method always returns ``None``.
+        """
+        return None
     @property
     def state(self) -> SOCState:

tests/test_integration.py CHANGED Viewed

@@ -13,7 +13,7 @@ if _PROJECT_ROOT not in sys.path:
 from server.play_environment import CyberSOCEnvironment
 from server.episode_sandbox import EpisodeTimeout
 from server.graders import grade_episode
-from models import SOCActionWrapper
 # ---------------------------------------------------------------------------
@@ -95,18 +95,35 @@ def test_phase_violation_returns_error():
     assert obs is not None
-def test_adaptive_pivot_fires_on_hard():
-    env = CyberSOCEnvironment(adaptive=True)
     env.reset(task_id="hard")
-    # Force pivot probability to 1.0 (hard task)
-    # We need to isolate_segment where the host is the source_host for an edge
-    # OR just call _execute_lateral_pivot directly for test certainty
-    hostname = _first_host(env)
-    env._execute_lateral_pivot(source_host=hostname)
     pivot_edges = [e for e in env._threat_graph.edges if e.edge_type == "pivoted_from"]
     assert len(pivot_edges) >= 1
 def test_step_reward_accumulates():

 from server.play_environment import CyberSOCEnvironment
 from server.episode_sandbox import EpisodeTimeout
 from server.graders import grade_episode
+from models import SOCActionWrapper, RedActionWrapper
 # ---------------------------------------------------------------------------
     assert obs is not None
+def test_lateral_pivot_red_action():
+    """LateralPivot RedActionWrapper creates a pivoted_from edge and a SIEM alert.
+    Also checks that the target host ends up with status "compromised". The
+    test is skipped when the "hard" task offers no compromised source host or
+    no eligible target host.
+    """
+    env = CyberSOCEnvironment(fsp_mode=True)
     env.reset(task_id="hard")
+    # Find a compromised host to pivot from and a healthy one to pivot to
+    src = next(
+        (h for h, hd in env._host_index.items() if hd.get("status") == "compromised"),
+        None,
+    )
+    # Target must be neither compromised nor isolated, and must differ from src.
+    dst = next(
+        (h for h, hd in env._host_index.items()
+         if hd.get("status") not in ("compromised", "isolated") and h != src),
+        None,
+    )
+    if src is None or dst is None:
+        pytest.skip("No suitable host pair for lateral pivot test")
+    # Blue takes a PassTurn-equivalent (query) so active_turn flips to red
+    env.step(_valid_action("query_host", hostname=src))
+    assert env._state.active_turn == "red"
+    # Record the queue length first so the new alert can be detected below.
+    alerts_before = len(env._alert_queue)
+    env.step(RedActionWrapper(type="lateral_pivot", source_host=src, target_host=dst))
     pivot_edges = [e for e in env._threat_graph.edges if e.edge_type == "pivoted_from"]
     assert len(pivot_edges) >= 1
+    assert env._host_index[dst]["status"] == "compromised"
+    assert len(env._alert_queue) > alerts_before  # SIEM alert generated
 def test_step_reward_accumulates():