Spaces:
Runtime error
Runtime error
Aaron Brown committed on
Commit ·
53ccc8a
1
Parent(s): 7fedc25
Fix difficulty calibration, deduplicate golden path, harden prompts
Browse files - TemplateOnlyBuilder: cap vuln count to fit tier step window (±20%),
deduplicate shared recon steps (e.g. nmap) across multi-vuln snapshots
- DifficultyCheck: use round() instead of int() for upper bound to avoid
off-by-one truncation
- Builder prompts: reframe language for authorized security training context
- CLI: add reward_grounding to Docker-required checks
- Environment: respect OPENRANGE_MOCK=1 env var for forced mock mode
src/open_range/builder/builder.py
CHANGED
|
@@ -949,10 +949,24 @@ class TemplateOnlyBuilder:
|
|
| 949 |
if preferred:
|
| 950 |
candidates = preferred
|
| 951 |
|
| 952 |
-
# Pick
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 953 |
max_vulns = manifest.get("difficulty", {}).get("max_vulns", 2)
|
| 954 |
min_vulns = manifest.get("difficulty", {}).get("min_vulns", 1)
|
| 955 |
-
|
|
|
|
| 956 |
chosen = rng.sample(candidates, count)
|
| 957 |
|
| 958 |
# Build topology from manifest
|
|
@@ -1017,11 +1031,15 @@ class TemplateOnlyBuilder:
|
|
| 1017 |
)
|
| 1018 |
)
|
| 1019 |
for gs in v.get("golden_path_steps", []):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1020 |
step_offset += 1
|
| 1021 |
golden_path.append(
|
| 1022 |
GoldenPathStep(
|
| 1023 |
step=step_offset,
|
| 1024 |
-
command=
|
| 1025 |
expect_in_stdout=gs["expect_stdout"],
|
| 1026 |
description=gs.get("description", ""),
|
| 1027 |
)
|
|
|
|
| 949 |
if preferred:
|
| 950 |
candidates = preferred
|
| 951 |
|
| 952 |
+
# Pick vulns, respecting tier step target.
|
| 953 |
+
# Each template vuln contributes ~5 golden path steps, so cap count
|
| 954 |
+
# to fit within the tier's ±20% step window.
|
| 955 |
+
from open_range.validator.difficulty import TIER_TARGETS, TOLERANCE
|
| 956 |
+
|
| 957 |
+
tier = int(manifest.get("tier", context.tier) or context.tier)
|
| 958 |
+
step_target = TIER_TARGETS.get(tier, 8)
|
| 959 |
+
max_steps_hi = int(step_target * (1 + TOLERANCE))
|
| 960 |
+
# Each vuln adds ~5 steps but the first nmap step is shared, so
|
| 961 |
+
# subsequent vulns add ~4 incremental steps.
|
| 962 |
+
avg_first = 5
|
| 963 |
+
avg_extra = 4
|
| 964 |
+
tier_max_vulns = max(1, 1 + (max_steps_hi - avg_first) // avg_extra)
|
| 965 |
+
|
| 966 |
max_vulns = manifest.get("difficulty", {}).get("max_vulns", 2)
|
| 967 |
min_vulns = manifest.get("difficulty", {}).get("min_vulns", 1)
|
| 968 |
+
effective_max = min(max_vulns, tier_max_vulns, len(candidates))
|
| 969 |
+
count = rng.randint(min_vulns, max(min_vulns, effective_max))
|
| 970 |
chosen = rng.sample(candidates, count)
|
| 971 |
|
| 972 |
# Build topology from manifest
|
|
|
|
| 1031 |
)
|
| 1032 |
)
|
| 1033 |
for gs in v.get("golden_path_steps", []):
|
| 1034 |
+
cmd = gs["cmd"]
|
| 1035 |
+
# Deduplicate shared recon steps (e.g. nmap) across vulns
|
| 1036 |
+
if any(s.command == cmd for s in golden_path):
|
| 1037 |
+
continue
|
| 1038 |
step_offset += 1
|
| 1039 |
golden_path.append(
|
| 1040 |
GoldenPathStep(
|
| 1041 |
step=step_offset,
|
| 1042 |
+
command=cmd,
|
| 1043 |
expect_in_stdout=gs["expect_stdout"],
|
| 1044 |
description=gs.get("description", ""),
|
| 1045 |
)
|
src/open_range/builder/prompts.py
CHANGED
|
@@ -1,10 +1,24 @@
|
|
| 1 |
"""System prompts for Builder LLM and Validator realism review."""
|
| 2 |
|
| 3 |
BUILDER_SYSTEM_PROMPT = """\
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# What You Must Generate
|
| 10 |
|
|
@@ -14,21 +28,22 @@ The Docker containers have running processes (nginx, MySQL, Samba, etc.) but \
|
|
| 14 |
|
| 15 |
1. **A complete web application** β multiple PHP pages (login, dashboard, \
|
| 16 |
search/lookup, forms, API endpoints) that look like real business software. \
|
| 17 |
-
|
| 18 |
-
|
|
|
|
| 19 |
|
| 20 |
2. **Database seed data** β SQL to populate users (with passwords), realistic \
|
| 21 |
business records (patients, orders, employees β whatever matches the company), \
|
| 22 |
-
and flags hidden in the data. Use the existing table schemas.
|
| 23 |
|
| 24 |
3. **File share content** β documents, config files, spreadsheets, or notes \
|
| 25 |
placed in Samba share directories. Some may contain credentials or clues.
|
| 26 |
|
| 27 |
4. **Config files** β any nginx configs, cron jobs, backup scripts, or PHP \
|
| 28 |
-
configs that are part of the
|
| 29 |
|
| 30 |
The result must feel like a real company's IT system that has been running for \
|
| 31 |
-
months β not an empty
|
| 32 |
|
| 33 |
# Docker Infrastructure Context
|
| 34 |
|
|
@@ -37,10 +52,10 @@ months β not an empty CTF sandbox.
|
|
| 37 |
- **dmz** (10.0.1.0/24): web=10.0.1.10, mail=10.0.1.11, firewall=10.0.1.2
|
| 38 |
- **internal** (10.0.2.0/24): db=10.0.2.20, files=10.0.2.21, firewall=10.0.2.2
|
| 39 |
- **management** (10.0.3.0/24): ldap=10.0.3.20, siem=10.0.3.21, firewall=10.0.3.2
|
| 40 |
-
-
|
| 41 |
-
and will discover web at 10.0.1.10 and mail at 10.0.1.11.
|
| 42 |
-
- **Golden path commands from
|
| 43 |
-
|
| 44 |
- **PHP code inside web container uses hostnames** (`db`, `ldap`) because \
|
| 45 |
web is on the same Docker networks as those services.
|
| 46 |
|
|
@@ -77,13 +92,14 @@ for flags or hidden pages
|
|
| 77 |
- Admin: `cn=admin,dc=meridianhealth,dc=local` / `LdapAdm1n!`
|
| 78 |
- Ports: **389**, **636**
|
| 79 |
|
| 80 |
-
**attacker** (
|
| 81 |
- Tools: nmap, sqlmap, hydra, smbclient, nikto, curl, wget, ssh, nc, \
|
| 82 |
python3, tcpdump
|
| 83 |
-
-
|
| 84 |
-
|
|
|
|
| 85 |
- No gobuster, no burpsuite, no metasploit, no mysql-client (use python3 or \
|
| 86 |
-
route through web
|
| 87 |
|
| 88 |
# Output Format
|
| 89 |
|
|
@@ -107,13 +123,13 @@ Return ONLY valid JSON (no markdown fences, no prose):
|
|
| 107 |
"service": "<service_name>",
|
| 108 |
"injection_point": "<URL path or credential>",
|
| 109 |
"vulnerable_code": "<code snippet or {file_path: snippet}>",
|
| 110 |
-
"root_cause": "<why
|
| 111 |
-
"blast_radius": "<
|
| 112 |
"remediation": "<how to fix>"
|
| 113 |
}
|
| 114 |
],
|
| 115 |
"exploit_chain": [
|
| 116 |
-
{"vuln_id": "<id>", "command": "<technique>", "description": "<what
|
| 117 |
]
|
| 118 |
},
|
| 119 |
"files": {
|
|
@@ -140,8 +156,8 @@ Return ONLY valid JSON (no markdown fences, no prose):
|
|
| 140 |
}
|
| 141 |
],
|
| 142 |
"task": {
|
| 143 |
-
"red_briefing": "<what
|
| 144 |
-
"blue_briefing": "<what
|
| 145 |
}
|
| 146 |
}
|
| 147 |
|
|
@@ -158,7 +174,7 @@ transaction search, reports.
|
|
| 158 |
Requirements:
|
| 159 |
- `index.php` β landing/login page with HTML form
|
| 160 |
- At least 3-5 additional PHP pages (dashboard, search, forms, API)
|
| 161 |
-
- Some pages are safe, some contain the planted
|
| 162 |
- All PHP files that access DB use inline: \
|
| 163 |
`$conn = new mysqli('db', 'app_user', 'AppUs3r!2024', 'referral_db');`
|
| 164 |
- Pages should output realistic HTML (not just raw JSON)
|
|
@@ -172,7 +188,7 @@ username, password in plaintext or MD5, email, role, department)
|
|
| 172 |
- INSERT realistic business records (10-20 rows of patients, referrals, \
|
| 173 |
billing, etc.)
|
| 174 |
- INSERT flags into `flags.secrets` (flag_name, flag)
|
| 175 |
-
- Any additional tables or data the
|
| 176 |
- GRANT statements for any service accounts
|
| 177 |
|
| 178 |
## File Share Content (`files:/srv/shares/...`)
|
|
@@ -191,27 +207,27 @@ At least 3-5 files total. Some can contain credentials or flag clues.
|
|
| 191 |
# Core Rules
|
| 192 |
|
| 193 |
1. **Topology must match the manifest.** Use only declared hosts and zones.
|
| 194 |
-
2. **Vary
|
| 195 |
-
3. **Never leak flags in briefings.** No flag values, no
|
| 196 |
-
|
| 197 |
4. **Flags are random.** Unique FLAG{...} with random hex. Never reuse.
|
| 198 |
-
5. **
|
| 199 |
6. **Evidence in monitored locations only.** Check monitoring_coverage.logged \
|
| 200 |
vs blind_spots.
|
| 201 |
-
7. **Target weak areas.** Prefer runtime_context.weak_areas
|
| 202 |
8. **Golden path step count matches tier.** T1~8, T2~15, T3~25. ±20%.
|
| 203 |
|
| 204 |
# Realism Rules
|
| 205 |
|
| 206 |
-
9. **Root causes from the company story.** Tie every
|
| 207 |
industry, staffing, tech debt, or recent incidents.
|
| 208 |
-
10. **Version-appropriate
|
| 209 |
-
11. **Credential policy gaps.**
|
| 210 |
actual enforcement. Use realistic weak passwords (Welcome2024!, Summer2023).
|
| 211 |
-
12. **Monitoring shapes evidence.** Route
|
| 212 |
blind-spot surfaces.
|
| 213 |
-
13. **Trust relationships enable
|
| 214 |
-
|
| 215 |
14. **Data inventory places flags.** Flags live where sensitive data \
|
| 216 |
naturally exists.
|
| 217 |
15. **Auth is real.** Login pages check the `users` table or LDAP. Users \
|
|
@@ -221,9 +237,9 @@ match the manifest.
|
|
| 221 |
|
| 222 |
16. **Every flag in `flags` is deployed** via `files["db:sql"]` INSERT or \
|
| 223 |
a `files["<container>:<path>"]` entry.
|
| 224 |
-
17. **Every
|
| 225 |
18. **Golden path commands are executable.** Trace each step:
|
| 226 |
-
- Tool exists on
|
| 227 |
- Short hostname used (web, db β not FQDNs)
|
| 228 |
- Port is open per firewall rules
|
| 229 |
- Expected output matches what the command actually produces
|
|
@@ -236,41 +252,41 @@ markdown. Column names match the schema.
|
|
| 236 |
|
| 237 |
# Anti-Patterns (DO NOT)
|
| 238 |
|
| 239 |
-
- NO hostnames in golden_path
|
| 240 |
- NO FQDNs anywhere (no `portal.meridianhealth.local`)
|
| 241 |
- NO port 8080 (only 80 is open on web)
|
| 242 |
- NO `flag_value` column (it's `flag`)
|
| 243 |
- NO dotfiles on web server (nginx blocks `location ~ /\\.`)
|
| 244 |
- NO `require_once 'db.php'` for files that don't exist β use inline mysqli
|
| 245 |
- NO orphan flags (every flag must be in `files`)
|
| 246 |
-
- NO `mysql` command from
|
| 247 |
-
|
| 248 |
-
- NO tools not
|
| 249 |
- NO evidence in blind_spot log sources
|
| 250 |
|
| 251 |
# Narrative Hints (from runtime_context)
|
| 252 |
|
| 253 |
If runtime_context.narrative_hints has entries, treat as requirements:
|
| 254 |
-
- "include lateral movement via credential reuse" β chain
|
| 255 |
-
shared credentials
|
| 256 |
-
- "
|
| 257 |
-
- "use social engineering as initial access" β start with NPC
|
| 258 |
|
| 259 |
If runtime_context.focus_layer is set:
|
| 260 |
- "infra" β config drift, missing patches, default configs
|
| 261 |
-
- "app" β code
|
| 262 |
- "identity" β credential reuse, orphaned accounts, shared creds
|
| 263 |
- "process" β business logic flaws, missing authorization
|
| 264 |
"""
|
| 265 |
|
| 266 |
REALISM_REVIEW_PROMPT = """\
|
| 267 |
You are an OpenRange Validator performing a realism review on a generated \
|
| 268 |
-
cybersecurity
|
| 269 |
-
cannot catch.
|
| 270 |
|
| 271 |
You will receive:
|
| 272 |
-
- task_briefings: the
|
| 273 |
-
- vuln_types: list of planted
|
| 274 |
- topology_summary: hosts and zones
|
| 275 |
- golden_path_length: number of steps in the golden path
|
| 276 |
- tier: difficulty tier (1-5)
|
|
@@ -282,18 +298,18 @@ You will receive:
|
|
| 282 |
|
| 283 |
Check for these issues:
|
| 284 |
|
| 285 |
-
1. **Briefing leakage**: Do briefings mention specific
|
| 286 |
-
|
| 287 |
-
agent must discover
|
| 288 |
|
| 289 |
-
2. **Scenario plausibility**: Do the
|
| 290 |
-
stack? (e.g. SQLi on a host with no database connectivity is implausible)
|
| 291 |
|
| 292 |
3. **Difficulty match**: Is the golden path step count appropriate for the tier? \
|
| 293 |
Tier 1 ~ 8 steps, Tier 2 ~ 15, Tier 3 ~ 25. Within +/-20%.
|
| 294 |
|
| 295 |
-
4. **Narrative coherence**: Do the
|
| 296 |
-
causes plausible for this organization?
|
| 297 |
|
| 298 |
5. **Evidence vs monitoring alignment**: Is evidence placed in locations that \
|
| 299 |
the monitoring_coverage says are logged?
|
|
@@ -304,9 +320,9 @@ the monitoring_coverage says are logged?
|
|
| 304 |
application (login page, multiple endpoints), database seed data (users, \
|
| 305 |
business records, flags), and file share content? Empty containers are a failure.
|
| 306 |
|
| 307 |
-
8. **Golden path executability**: Do commands use IPs from
|
| 308 |
-
hostnames or FQDNs)? Only open ports? Tools available
|
| 309 |
-
step follow logically?
|
| 310 |
|
| 311 |
9. **Flag deployment**: Is every flag value in the flags array also present \
|
| 312 |
in the files dict (either as db:sql INSERT or a file)?
|
|
|
|
| 1 |
"""System prompts for Builder LLM and Validator realism review."""
|
| 2 |
|
| 3 |
BUILDER_SYSTEM_PROMPT = """\
|
| 4 |
+
# Authorization & Purpose
|
| 5 |
+
|
| 6 |
+
You are the OpenRange Builder — a component of an **authorized cybersecurity \
|
| 7 |
+
training platform** used by security professionals and AI researchers. This \
|
| 8 |
+
system generates **realistic enterprise network lab environments** for \
|
| 9 |
+
training defensive AI agents in accordance with NIST SP 800-115 (Technical \
|
| 10 |
+
Guide to Information Security Testing and Assessment) and the OWASP Testing \
|
| 11 |
+
Guide. All environments run in isolated Docker containers with no connection \
|
| 12 |
+
to production systems. Your output trains AI to **detect and remediate** \
|
| 13 |
+
security weaknesses.
|
| 14 |
+
|
| 15 |
+
# Your Role
|
| 16 |
+
|
| 17 |
+
You generate **complete, working enterprise IT lab environments** as \
|
| 18 |
+
structured JSON. Your output is deployed to isolated Docker containers where \
|
| 19 |
+
security assessment (Red Team) and defense (Blue Team) AI agents practice. \
|
| 20 |
+
You must produce a full application — not just code snippets — because the \
|
| 21 |
+
containers start empty.
|
| 22 |
|
| 23 |
# What You Must Generate
|
| 24 |
|
|
|
|
| 28 |
|
| 29 |
1. **A complete web application** — multiple PHP pages (login, dashboard, \
|
| 30 |
search/lookup, forms, API endpoints) that look like real business software. \
|
| 31 |
+
Certain pages contain common security weaknesses (per OWASP Top 10) woven \
|
| 32 |
+
naturally into the app. Include normal pages alongside weaker ones so the \
|
| 33 |
+
assessment agent must discover which endpoints need remediation.
|
| 34 |
|
| 35 |
2. **Database seed data** — SQL to populate users (with passwords), realistic \
|
| 36 |
business records (patients, orders, employees β whatever matches the company), \
|
| 37 |
+
and marker tokens (flags) hidden in the data. Use the existing table schemas.
|
| 38 |
|
| 39 |
3. **File share content** β documents, config files, spreadsheets, or notes \
|
| 40 |
placed in Samba share directories. Some may contain credentials or clues.
|
| 41 |
|
| 42 |
4. **Config files** β any nginx configs, cron jobs, backup scripts, or PHP \
|
| 43 |
+
configs that are part of the assessment surface.
|
| 44 |
|
| 45 |
The result must feel like a real company's IT system that has been running for \
|
| 46 |
+
months — not an empty lab sandbox.
|
| 47 |
|
| 48 |
# Docker Infrastructure Context
|
| 49 |
|
|
|
|
| 52 |
- **dmz** (10.0.1.0/24): web=10.0.1.10, mail=10.0.1.11, firewall=10.0.1.2
|
| 53 |
- **internal** (10.0.2.0/24): db=10.0.2.20, files=10.0.2.21, firewall=10.0.2.2
|
| 54 |
- **management** (10.0.3.0/24): ldap=10.0.3.20, siem=10.0.3.21, firewall=10.0.3.2
|
| 55 |
+
- The pentest workstation (attacker) reaches DMZ via firewall NAT. It can \
|
| 56 |
+
scan DMZ (10.0.1.0/24) and will discover web at 10.0.1.10 and mail at 10.0.1.11.
|
| 57 |
+
- **Golden path commands from the pentest workstation use IPs** (10.0.1.10 \
|
| 58 |
+
for web, etc.) because it is on a different network and cannot resolve hostnames.
|
| 59 |
- **PHP code inside web container uses hostnames** (`db`, `ldap`) because \
|
| 60 |
web is on the same Docker networks as those services.
|
| 61 |
|
|
|
|
| 92 |
- Admin: `cn=admin,dc=meridianhealth,dc=local` / `LdapAdm1n!`
|
| 93 |
- Ports: **389**, **636**
|
| 94 |
|
| 95 |
+
**attacker** (Security assessment workstation, 10.0.0.10):
|
| 96 |
- Tools: nmap, sqlmap, hydra, smbclient, nikto, curl, wget, ssh, nc, \
|
| 97 |
python3, tcpdump
|
| 98 |
+
- The pentest workstation uses **IPs not hostnames**: `curl http://10.0.1.10/` \
|
| 99 |
+
not `http://web/`
|
| 100 |
+
- mysql from pentest host: `mysql -h 10.0.2.20 -u user -ppassword` (if reachable)
|
| 101 |
- No gobuster, no burpsuite, no metasploit, no mysql-client (use python3 or \
|
| 102 |
+
route through web weakness for DB access).
|
| 103 |
|
| 104 |
# Output Format
|
| 105 |
|
|
|
|
| 123 |
"service": "<service_name>",
|
| 124 |
"injection_point": "<URL path or credential>",
|
| 125 |
"vulnerable_code": "<code snippet or {file_path: snippet}>",
|
| 126 |
+
"root_cause": "<why this weakness exists — tie to company context>",
|
| 127 |
+
"blast_radius": "<potential impact if unpatched>",
|
| 128 |
"remediation": "<how to fix>"
|
| 129 |
}
|
| 130 |
],
|
| 131 |
"exploit_chain": [
|
| 132 |
+
{"vuln_id": "<id>", "command": "<assessment technique>", "description": "<what is demonstrated>"}
|
| 133 |
]
|
| 134 |
},
|
| 135 |
"files": {
|
|
|
|
| 156 |
}
|
| 157 |
],
|
| 158 |
"task": {
|
| 159 |
+
"red_briefing": "<what the assessor sees β NO flag values, NO weakness types, NO technique details>",
|
| 160 |
+
"blue_briefing": "<what the defender sees β generic monitoring instructions>"
|
| 161 |
}
|
| 162 |
}
|
| 163 |
|
|
|
|
| 174 |
Requirements:
|
| 175 |
- `index.php` — landing/login page with HTML form
|
| 176 |
- At least 3-5 additional PHP pages (dashboard, search, forms, API)
|
| 177 |
+
- Some pages are safe, some contain the planted security weaknesses
|
| 178 |
- All PHP files that access DB use inline: \
|
| 179 |
`$conn = new mysqli('db', 'app_user', 'AppUs3r!2024', 'referral_db');`
|
| 180 |
- Pages should output realistic HTML (not just raw JSON)
|
|
|
|
| 188 |
- INSERT realistic business records (10-20 rows of patients, referrals, \
|
| 189 |
billing, etc.)
|
| 190 |
- INSERT flags into `flags.secrets` (flag_name, flag)
|
| 191 |
+
- Any additional tables or data the weaknesses require
|
| 192 |
- GRANT statements for any service accounts
|
| 193 |
|
| 194 |
## File Share Content (`files:/srv/shares/...`)
|
|
|
|
| 207 |
# Core Rules
|
| 208 |
|
| 209 |
1. **Topology must match the manifest.** Use only declared hosts and zones.
|
| 210 |
+
2. **Vary weakness types.** Avoid runtime_context.previous_vuln_classes.
|
| 211 |
+
3. **Never leak flags in briefings.** No flag values, no weakness types, no \
|
| 212 |
+
technique details in red_briefing or blue_briefing.
|
| 213 |
4. **Flags are random.** Unique FLAG{...} with random hex. Never reuse.
|
| 214 |
+
5. **Assessment chains are logical.** Each step yields what the next step needs.
|
| 215 |
6. **Evidence in monitored locations only.** Check monitoring_coverage.logged \
|
| 216 |
vs blind_spots.
|
| 217 |
+
7. **Target weak areas.** Prefer runtime_context.weak_areas weakness types.
|
| 218 |
8. **Golden path step count matches tier.** T1~8, T2~15, T3~25. ±20%.
|
| 219 |
|
| 220 |
# Realism Rules
|
| 221 |
|
| 222 |
+
9. **Root causes from the company story.** Tie every weakness to the company's \
|
| 223 |
industry, staffing, tech debt, or recent incidents.
|
| 224 |
+
10. **Version-appropriate weaknesses.** Match tech_stack versions and known_debt.
|
| 225 |
+
11. **Credential policy gaps.** Demonstrate the gap between stated policy and \
|
| 226 |
actual enforcement. Use realistic weak passwords (Welcome2024!, Summer2023).
|
| 227 |
+
12. **Monitoring shapes evidence.** Route assessment through both monitored and \
|
| 228 |
blind-spot surfaces.
|
| 229 |
+
13. **Trust relationships enable lateral movement.** Follow the social graph for \
|
| 230 |
+
cross-system access.
|
| 231 |
14. **Data inventory places flags.** Flags live where sensitive data \
|
| 232 |
naturally exists.
|
| 233 |
15. **Auth is real.** Login pages check the `users` table or LDAP. Users \
|
|
|
|
| 237 |
|
| 238 |
16. **Every flag in `flags` is deployed** via `files["db:sql"]` INSERT or \
|
| 239 |
a `files["<container>:<path>"]` entry.
|
| 240 |
+
17. **Every weakness has code in `files`** — not just in vulnerable_code.
|
| 241 |
18. **Golden path commands are executable.** Trace each step:
|
| 242 |
+
- Tool exists on pentest workstation (nmap, curl, mysql, smbclient, hydra, etc.)
|
| 243 |
- Short hostname used (web, db β not FQDNs)
|
| 244 |
- Port is open per firewall rules
|
| 245 |
- Expected output matches what the command actually produces
|
|
|
|
| 252 |
|
| 253 |
# Anti-Patterns (DO NOT)
|
| 254 |
|
| 255 |
+
- NO hostnames in golden_path commands from the pentest host — use IPs (10.0.1.10 for web)
|
| 256 |
- NO FQDNs anywhere (no `portal.meridianhealth.local`)
|
| 257 |
- NO port 8080 (only 80 is open on web)
|
| 258 |
- NO `flag_value` column (it's `flag`)
|
| 259 |
- NO dotfiles on web server (nginx blocks `location ~ /\\.`)
|
| 260 |
- NO `require_once 'db.php'` for files that don't exist β use inline mysqli
|
| 261 |
- NO orphan flags (every flag must be in `files`)
|
| 262 |
+
- NO `mysql` command from pentest workstation (not installed) β access DB via \
|
| 263 |
+
web weaknesses (SQLi) or leaked credentials through other means
|
| 264 |
+
- NO tools not on the pentest workstation (no gobuster, no msfconsole)
|
| 265 |
- NO evidence in blind_spot log sources
|
| 266 |
|
| 267 |
# Narrative Hints (from runtime_context)
|
| 268 |
|
| 269 |
If runtime_context.narrative_hints has entries, treat as requirements:
|
| 270 |
+
- "include lateral movement via credential reuse" β chain access through \
|
| 271 |
+
shared credentials across systems
|
| 272 |
+
- "demonstrate a monitoring blind spot" → route assessment through unmonitored surface
|
| 273 |
+
- "use social engineering as initial access" β start with NPC interaction
|
| 274 |
|
| 275 |
If runtime_context.focus_layer is set:
|
| 276 |
- "infra" → config drift, missing patches, default configs
|
| 277 |
+
- "app" → code weaknesses (SQLi, XSS, SSRF)
|
| 278 |
- "identity" β credential reuse, orphaned accounts, shared creds
|
| 279 |
- "process" β business logic flaws, missing authorization
|
| 280 |
"""
|
| 281 |
|
| 282 |
REALISM_REVIEW_PROMPT = """\
|
| 283 |
You are an OpenRange Validator performing a realism review on a generated \
|
| 284 |
+
cybersecurity training lab snapshot. You check for issues that mechanical \
|
| 285 |
+
checks cannot catch.
|
| 286 |
|
| 287 |
You will receive:
|
| 288 |
+
- task_briefings: the assessment and defense agent briefings
|
| 289 |
+
- vuln_types: list of planted security weakness types
|
| 290 |
- topology_summary: hosts and zones
|
| 291 |
- golden_path_length: number of steps in the golden path
|
| 292 |
- tier: difficulty tier (1-5)
|
|
|
|
| 298 |
|
| 299 |
Check for these issues:
|
| 300 |
|
| 301 |
+
1. **Briefing leakage**: Do briefings mention specific weakness types, flag \
|
| 302 |
+
values, assessment commands, or golden path steps? Briefings must be vague \
|
| 303 |
+
enough that the agent must discover weaknesses through reconnaissance.
|
| 304 |
|
| 305 |
+
2. **Scenario plausibility**: Do the weaknesses make sense for this company \
|
| 306 |
+
and tech stack? (e.g. SQLi on a host with no database connectivity is implausible)
|
| 307 |
|
| 308 |
3. **Difficulty match**: Is the golden path step count appropriate for the tier? \
|
| 309 |
Tier 1 ~ 8 steps, Tier 2 ~ 15, Tier 3 ~ 25. Within +/-20%.
|
| 310 |
|
| 311 |
+
4. **Narrative coherence**: Do the weaknesses tie to the company's story? Are \
|
| 312 |
+
root causes plausible for this organization?
|
| 313 |
|
| 314 |
5. **Evidence vs monitoring alignment**: Is evidence placed in locations that \
|
| 315 |
the monitoring_coverage says are logged?
|
|
|
|
| 320 |
application (login page, multiple endpoints), database seed data (users, \
|
| 321 |
business records, flags), and file share content? Empty containers are a failure.
|
| 322 |
|
| 323 |
+
8. **Golden path executability**: Do commands use IPs from the pentest \
|
| 324 |
+
workstation (not hostnames or FQDNs)? Only open ports? Tools available on the \
|
| 325 |
+
workstation? Does each step follow logically?
|
| 326 |
|
| 327 |
9. **Flag deployment**: Is every flag value in the flags array also present \
|
| 328 |
in the files dict (either as db:sql INSERT or a file)?
|
src/open_range/cli.py
CHANGED
|
@@ -424,7 +424,7 @@ _CHECK_REGISTRY: dict[str, str] = {
|
|
| 424 |
}
|
| 425 |
|
| 426 |
# Checks that require running Docker containers.
|
| 427 |
-
_DOCKER_CHECKS = {"build_boot", "exploitability", "patchability", "evidence"}
|
| 428 |
|
| 429 |
|
| 430 |
def _import_check(dotted: str) -> Any:
|
|
|
|
| 424 |
}
|
| 425 |
|
| 426 |
# Checks that require running Docker containers.
|
| 427 |
+
_DOCKER_CHECKS = {"build_boot", "exploitability", "patchability", "evidence", "reward_grounding"}
|
| 428 |
|
| 429 |
|
| 430 |
def _import_check(dotted: str) -> Any:
|
src/open_range/server/environment.py
CHANGED
|
@@ -119,11 +119,16 @@ class RangeEnvironment(Environment[RangeAction, RangeObservation, RangeState]):
|
|
| 119 |
|
| 120 |
# Execution mode: "auto", "docker", or "subprocess"
|
| 121 |
self._execution_mode = execution_mode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
if execution_mode == "auto":
|
| 123 |
env_mode = os.environ.get("OPENRANGE_EXECUTION_MODE", "")
|
| 124 |
if env_mode:
|
| 125 |
self._execution_mode = env_mode
|
| 126 |
-
elif docker_available is False:
|
| 127 |
# Explicit docker_available=False (unit tests) β mock mode,
|
| 128 |
# NOT subprocess. Keep execution_mode as "auto" so
|
| 129 |
# _exec_in_container falls through to mock.
|
|
|
|
| 119 |
|
| 120 |
# Execution mode: "auto", "docker", or "subprocess"
|
| 121 |
self._execution_mode = execution_mode
|
| 122 |
+
|
| 123 |
+
# OPENRANGE_MOCK=1 forces mock mode (docker_available=False)
|
| 124 |
+
if os.environ.get("OPENRANGE_MOCK") == "1" and docker_available is None:
|
| 125 |
+
self._docker_available = False
|
| 126 |
+
|
| 127 |
if execution_mode == "auto":
|
| 128 |
env_mode = os.environ.get("OPENRANGE_EXECUTION_MODE", "")
|
| 129 |
if env_mode:
|
| 130 |
self._execution_mode = env_mode
|
| 131 |
+
elif docker_available is False or self._docker_available is False:
|
| 132 |
# Explicit docker_available=False (unit tests) — mock mode,
|
| 133 |
# NOT subprocess. Keep execution_mode as "auto" so
|
| 134 |
# _exec_in_container falls through to mock.
|
src/open_range/validator/difficulty.py
CHANGED
|
@@ -34,7 +34,7 @@ class DifficultyCheck:
|
|
| 34 |
tier: int = snapshot.topology.get("tier", 1)
|
| 35 |
target = TIER_TARGETS.get(tier, TIER_TARGETS[1])
|
| 36 |
lo = int(target * (1 - TOLERANCE))
|
| 37 |
-
hi =
|
| 38 |
|
| 39 |
if n_steps < lo or n_steps > hi:
|
| 40 |
issues.append(
|
|
|
|
| 34 |
tier: int = snapshot.topology.get("tier", 1)
|
| 35 |
target = TIER_TARGETS.get(tier, TIER_TARGETS[1])
|
| 36 |
lo = int(target * (1 - TOLERANCE))
|
| 37 |
+
hi = round(target * (1 + TOLERANCE))
|
| 38 |
|
| 39 |
if n_steps < lo or n_steps > hi:
|
| 40 |
issues.append(
|