Commit ·
bb6a031
0
Parent(s):
OpenSOC v1
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +19 -0
- .mplcache/fontlist-v390.json +0 -0
- DEPLOY.md +101 -0
- Dockerfile +31 -0
- README.md +249 -0
- SPACE_README.md +22 -0
- TRAIN.md +72 -0
- app_runtime.py +176 -0
- client/__init__.py +3 -0
- client/opensoc_client.py +87 -0
- client/prompts.py +172 -0
- data/demo_examples.json +0 -0
- data/holdout.jsonl +0 -0
- data/holdout_smoke.jsonl +20 -0
- data/sft_defender.jsonl +0 -0
- data/sft_train.jsonl +0 -0
- demo_app.py +119 -0
- demo_data.py +127 -0
- docs/__init__.py +0 -0
- docs/blog.md +134 -0
- docs/build_slides.py +221 -0
- docs/slides.pdf +0 -0
- docs/video_script.md +75 -0
- env.py +423 -0
- eval/__init__.py +0 -0
- eval/bake_demo.py +271 -0
- eval/eval.py +231 -0
- eval/make_holdout.py +82 -0
- eval/metrics.py +97 -0
- eval/plot_results.py +101 -0
- eval/plot_training.py +220 -0
- generator.py +365 -0
- openenv.yaml +166 -0
- pyproject.toml +38 -0
- requirements.txt +8 -0
- rubric.py +137 -0
- schema.py +320 -0
- scripts/deploy_to_hf.sh +45 -0
- scripts/run_full_pipeline.sh +56 -0
- server.py +24 -0
- tasks/__init__.py +0 -0
- tasks/registry.py +57 -0
- tests/__init__.py +0 -0
- tests/test_client.py +69 -0
- tests/test_demo_data.py +115 -0
- tests/test_env.py +259 -0
- tests/test_eval.py +96 -0
- tests/test_grpo_rewards.py +87 -0
- tests/test_prompt_format.py +97 -0
- tests/test_rubric.py +162 -0
.gitignore
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
.pytest_cache/
|
| 5 |
+
.ruff_cache/
|
| 6 |
+
.venv/
|
| 7 |
+
venv/
|
| 8 |
+
.env
|
| 9 |
+
.DS_Store
|
| 10 |
+
*.egg-info/
|
| 11 |
+
build/
|
| 12 |
+
dist/
|
| 13 |
+
checkpoints/
|
| 14 |
+
runs/
|
| 15 |
+
wandb/
|
| 16 |
+
*.ckpt
|
| 17 |
+
*.bin
|
| 18 |
+
*.safetensors
|
| 19 |
+
.ipynb_checkpoints/
|
.mplcache/fontlist-v390.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
DEPLOY.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deploying OpenSOC to Hugging Face Spaces
|
| 2 |
+
|
| 3 |
+
This is the one-time deployment recipe. The same Space serves both the
|
| 4 |
+
OpenEnv API (consumed by judge bots and `OpenSOCClient`) **and** a
|
| 5 |
+
Gradio "before vs after" UI at `/demo` for human reviewers.
|
| 6 |
+
|
| 7 |
+
## 1. Local sanity check
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
python -m venv .venv && source .venv/bin/activate
|
| 11 |
+
pip install -r requirements.txt
|
| 12 |
+
python server.py &
|
| 13 |
+
sleep 2
|
| 14 |
+
curl -s http://localhost:7860/health | jq .
|
| 15 |
+
curl -s -X POST 'http://localhost:7860/reset?task=stage1_basic&mode=defender_only' | jq .
|
| 16 |
+
curl -s -I http://localhost:7860/demo | head -1 # should be 200 OK
|
| 17 |
+
kill %1
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
## 2. Build the Docker image locally
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
docker build -t opensoc:latest .
|
| 24 |
+
docker run -p 7860:7860 opensoc:latest
|
| 25 |
+
# in another shell:
|
| 26 |
+
curl -s http://localhost:7860/tasks | jq .
|
| 27 |
+
open http://localhost:7860/demo
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
## 3. Push to Hugging Face
|
| 31 |
+
|
| 32 |
+
There are two equivalent paths: the one-shot script shown first, or the same
|
| 33 |
+
steps run manually with `huggingface-cli` and `git`.
|
| 34 |
+
|
| 35 |
+
### One-shot
|
| 36 |
+
|
| 37 |
+
```bash
|
| 38 |
+
export HF_USER=<your-username>
|
| 39 |
+
huggingface-cli login # browser-based PAT login
|
| 40 |
+
bash scripts/deploy_to_hf.sh
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
### Manual (equivalent)
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
huggingface-cli login
|
| 47 |
+
huggingface-cli repo create opensoc-env --type space --space-sdk docker
|
| 48 |
+
# Use SPACE_README.md as the Space's README so HF picks up the docker SDK config:
|
| 49 |
+
cp SPACE_README.md /tmp/SPACE_README.md # save a copy
|
| 50 |
+
git checkout -b space-deploy
|
| 51 |
+
cp SPACE_README.md README.md # or prepend SPACE_README front-matter to README
|
| 52 |
+
git add README.md && git commit -m "Space metadata header"
|
| 53 |
+
git remote add space https://huggingface.co/spaces/$HF_USER/opensoc-env
|
| 54 |
+
git push space space-deploy:main
|
| 55 |
+
git checkout main && git checkout README.md
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
## 4. Verify the deployed Space
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
export OPENSOC_URL=https://<your-username>-opensoc-env.hf.space
|
| 62 |
+
python -c "
|
| 63 |
+
from client import OpenSOCClient
|
| 64 |
+
c = OpenSOCClient(base_url='$OPENSOC_URL')
|
| 65 |
+
print(c.health())
|
| 66 |
+
print(c.tasks())
|
| 67 |
+
obs = c.reset(task='stage1_basic', mode='defender_only', seed=1)
|
| 68 |
+
print('first log id:', obs['log_window'][0]['log_id'])
|
| 69 |
+
"
|
| 70 |
+
# And visually:
|
| 71 |
+
open $OPENSOC_URL/demo
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
`/demo` reads `data/demo_examples.json`. If you deployed before running
|
| 75 |
+
the GPU pipeline, the file holds the *placeholder* before-vs-after pairs
|
| 76 |
+
(always-dismiss vs verifier-oracle). Re-run `python -m eval.bake_demo`
|
| 77 |
+
on a GPU host (no `--placeholder`) and re-push to overwrite with real
|
| 78 |
+
trained-model outputs.
|
| 79 |
+
|
| 80 |
+
## 5. (Optional) Run the eval harness against the live Space
|
| 81 |
+
|
| 82 |
+
```bash
|
| 83 |
+
# Pure-CPU smoke run (no Unsloth required):
|
| 84 |
+
python -m eval.eval --smoke-only --holdout data/holdout.jsonl
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
## OpenEnv hackathon checklist
|
| 88 |
+
|
| 89 |
+
- [x] `openenv.yaml` manifest with `endpoints.demo: GET /demo`
|
| 90 |
+
- [x] gym-style API: `reset` / `step` / `state` (+ `grade`, `tasks`, `health`)
|
| 91 |
+
- [x] non-reserved tool names (`craft_incident`, `submit_triage`)
|
| 92 |
+
- [x] FastAPI app exposed on port 7860 inside the container
|
| 93 |
+
- [x] Gradio UI mounted at `/demo` for the storytelling deliverable
|
| 94 |
+
- [x] Dockerfile suitable for Hugging Face Spaces (`sdk: docker`)
|
| 95 |
+
- [x] Client / server separation (`client/opensoc_client.py` is HTTP-only)
|
| 96 |
+
- [x] Frozen 200-incident eval set committed (`data/holdout.jsonl`)
|
| 97 |
+
- [x] 600-example SFT dataset committed (`data/sft_train.jsonl`)
|
| 98 |
+
- [x] 50 pre-baked demo pairs committed (`data/demo_examples.json`)
|
| 99 |
+
- [x] GRPO Colab/HF Jupyter notebook (`train_grpo.ipynb`) + one-shot
|
| 100 |
+
`scripts/run_full_pipeline.sh`
|
| 101 |
+
- [x] Pytest suite — 93 tests, all green
|
Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Container image for the OpenSOC server; the process started is `python server.py`.
FROM python:3.11-slim

# Same port number as the EXPOSE line below.
# NOTE(review): presumably read by server.py to pick the listen port — confirm there.
ENV PORT=7860
# Standard CPython switches: unbuffered stdout/stderr, and do not write .pyc files.
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
# Hugging Face cache directory placed under /tmp.
# NOTE(review): presumably chosen because /tmp is writable at runtime — confirm.
ENV HF_HOME=/tmp/hf_cache

WORKDIR /app

# Dependencies are installed before the sources are copied, so a source-only
# change does not invalidate the pip install layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application modules, copied one by one into /app.
COPY app_runtime.py .
COPY env.py .
COPY schema.py .
COPY generator.py .
COPY verifier.py .
COPY rubric.py .
COPY server.py .
COPY demo_app.py .
COPY demo_data.py .
COPY openenv.yaml .
# Packages and data directories.
COPY tasks/ tasks/
COPY client/ client/
COPY train/ train/
COPY eval/ eval/
COPY data/ data/

EXPOSE 7860

CMD ["python", "server.py"]
|
README.md
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenSOC: Self-Play SOC Triage Environment
|
| 2 |
+
|
| 3 |
+
> An **OpenEnv** environment for training cybersecurity defender LLMs against an attacker LLM that auto-generates novel incidents. Built for the OpenEnv Hackathon, April 2026.
|
| 4 |
+
|
| 5 |
+
Humans cannot watch every alert in a Security Operations Center 24/7, and as stronger generative models start writing exploits and phishing at industrial scale that gap only widens. **OpenSOC** is an environment where a defender LLM learns to triage attacks generated by another LLM in a self-play loop. The trick is **RLVR**: triage ground truth is computed by a deterministic schema-side verifier from the *structured* incident parameters — never from any text the attacker writes — so neither side can hack the reward.
|
| 6 |
+
|
| 7 |
+
## Try it
|
| 8 |
+
|
| 9 |
+
| Link | What it is |
|
| 10 |
+
| --- | --- |
|
| 11 |
+
| **HF Space** — [`<USER>-opensoc-env.hf.space`](https://huggingface.co/spaces/REPLACE_USER/opensoc-env) | Deployed env. OpenEnv judge can hit `/reset` `/step` `/state` `/grade`. |
|
| 12 |
+
| **Live `/demo`** — [`<USER>-opensoc-env.hf.space/demo`](https://REPLACE_USER-opensoc-env.hf.space/demo) | Gradio "before vs after" UI. Click **Next incident** to compare baseline vs trained. |
|
| 13 |
+
| **Walkthrough video** (90s) — [`youtu.be/<UNLISTED>`](https://youtu.be/REPLACE_VIDEO) | One-take demo + headline numbers. Script: [`docs/video_script.md`](docs/video_script.md). |
|
| 14 |
+
| **Mini-blog** — [`huggingface.co/blog/<USER>/opensoc-rlvr-soc-triage`](https://huggingface.co/blog/REPLACE_USER/opensoc-rlvr-soc-triage) | ~600-word write-up. Source: [`docs/blog.md`](docs/blog.md). |
|
| 15 |
+
| **Slide deck** — [`docs/slides.pdf`](docs/slides.pdf) | 5 slides; problem → env → results → demo. |
|
| 16 |
+
|
| 17 |
+
> *Replace the four `REPLACE_*` placeholders above after deploy + recording. The slide PDF auto-rebuilds from `docs/build_slides.py`.*
|
| 18 |
+
|
| 19 |
+
## Table of contents
|
| 20 |
+
|
| 21 |
+
1. [Architecture](#architecture)
|
| 22 |
+
2. [Why the reward cannot be hacked](#why-the-reward-cannot-be-hacked)
|
| 23 |
+
3. [Action space and reward](#action-space-and-reward)
|
| 24 |
+
4. [Run locally](#run-locally)
|
| 25 |
+
5. [Run the training pipeline](#run-the-training-pipeline)
|
| 26 |
+
6. [Headline results](#headline-results)
|
| 27 |
+
7. [Deploy to Hugging Face Spaces](#deploy-to-hugging-face-spaces)
|
| 28 |
+
8. [Repo map](#repo-map)
|
| 29 |
+
9. [Submission deliverables](#submission-deliverables)
|
| 30 |
+
|
| 31 |
+
## Build status
|
| 32 |
+
|
| 33 |
+
| Build artifact | Status |
|
| 34 |
+
| --- | --- |
|
| 35 |
+
| Pure-python env (`OpenSOCEnv`, FastAPI) | shipped |
|
| 36 |
+
| Verifier + plausibility checker | shipped, 17-test adversarial suite |
|
| 37 |
+
| Rubric (defender + attacker rewards) | shipped, anti-hack regression tests |
|
| 38 |
+
| 600-example SFT dataset (`data/sft_train.jsonl`) | shipped |
|
| 39 |
+
| 200-incident frozen hold-out (`data/holdout.jsonl`) | shipped |
|
| 40 |
+
| GRPO training notebook (`train_grpo.ipynb`) + one-shot script | shipped (HF Jupyter L4) |
|
| 41 |
+
| Gradio "before vs after" UI mounted on the same Space | shipped at `/demo` |
|
| 42 |
+
| 50 pre-baked demo pairs (`data/demo_examples.json`) | placeholder shipped; refresh after GPU run |
|
| 43 |
+
| Eval harness + plotters (`eval/`) | shipped |
|
| 44 |
+
| Pytest suite | **93 tests**, all green |
|
| 45 |
+
|
| 46 |
+
## Architecture
|
| 47 |
+
|
| 48 |
+
```mermaid
|
| 49 |
+
flowchart LR
|
| 50 |
+
Defender[Defender LLM trainee]
|
| 51 |
+
Attacker[Attacker LLM trainee]
|
| 52 |
+
Env[OpenSOC FastAPI Environment]
|
| 53 |
+
Verifier[Deterministic verifier + plausibility check]
|
| 54 |
+
Defender -->|submit_triage| Env
|
| 55 |
+
Attacker -->|craft_incident| Env
|
| 56 |
+
Env -->|observation reward| Defender
|
| 57 |
+
Env -->|attacker reward| Attacker
|
| 58 |
+
Env --> Verifier
|
| 59 |
+
Verifier -->|ground truth label| Env
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
An episode has exactly two turns: attacker proposes incident params → env validates them and materializes a SIEM-style alert + log window → defender submits a triage action. The verifier computes the ground-truth action from the *events alone* and scores both sides — the attacker's free-text narrative is never read by the labeler.
|
| 63 |
+
|
| 64 |
+
In `defender_only` mode (used for SFT, eval, smoke tests, and the `/demo` UI) the env auto-generates the incident from `tasks/registry.py` and skips straight to the defender turn.
|
| 65 |
+
|
| 66 |
+
## Why the reward cannot be hacked
|
| 67 |
+
|
| 68 |
+
1. The verifier is a transparent rule set in `verifier.compute_ground_truth(params)`; the *only* inputs are the structured events. The attacker's `narrative` and even its self-claimed `target_label` are ignored.
|
| 69 |
+
2. The plausibility checker (`verifier.check_plausibility(params)`) refuses incoherent stories — for example, a "data exfiltration" claim with a purely-internal destination, or a `lolbin_use` event with no `process` field. The attacker's reward is gated on plausibility passing.
|
| 70 |
+
3. Schema-violation incidents floor attacker reward at -0.5, so trying to short-circuit pydantic's validators is strictly worse than playing along.
|
| 71 |
+
|
| 72 |
+
The anti-hack invariants are pinned in [`tests/test_verifier.py`](tests/test_verifier.py) and [`tests/test_rubric.py`](tests/test_rubric.py).
|
| 73 |
+
|
| 74 |
+
## Action space and reward
|
| 75 |
+
|
| 76 |
+
Tool names are deliberately **non-reserved** — there is no `reset`/`step`/`state`/`close` clash with the OpenEnv `MCPEnvironment` reserved-name list.
|
| 77 |
+
|
| 78 |
+
```yaml
|
| 79 |
+
action_space:
|
| 80 |
+
craft_incident:
|
| 81 |
+
target_label: dismiss | monitor | quarantine_host | block_ip | escalate
|
| 82 |
+
category: malware_execution | c2_beacon | data_exfiltration | ...
|
| 83 |
+
events: [ { event_type, fields, timestamp, log_id }, ... ]
|
| 84 |
+
narrative: string # ignored by the verifier
|
| 85 |
+
submit_triage:
|
| 86 |
+
action: <one of the five triage actions>
|
| 87 |
+
cited_log_id: <id of the log line that drove the decision>
|
| 88 |
+
rationale: short string
|
| 89 |
+
```
|
| 90 |
+
|
| 91 |
+
- **Defender**: +1 correct, −1 missed-malicious, −0.3 over-react on benign, −0.05 unnecessary escalate, +0.1 bonus for citing the right triggering log id, −0.1 floor for format violation.
|
| 92 |
+
- **Attacker**: +1 iff defender wrong AND incident plausible, −0.5 if schema validation fails, +0.2 novelty bonus, 0 for gibberish.
|
| 93 |
+
|
| 94 |
+
Full breakdown: [openenv.yaml](openenv.yaml) and [rubric.py](rubric.py).
|
| 95 |
+
|
| 96 |
+
## Run locally
|
| 97 |
+
|
| 98 |
+
```bash
|
| 99 |
+
python -m venv .venv && source .venv/bin/activate
|
| 100 |
+
pip install -r requirements.txt
|
| 101 |
+
python server.py # serves on :7860
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
Smoke test from another shell:
|
| 105 |
+
|
| 106 |
+
```bash
|
| 107 |
+
curl -s http://localhost:7860/health | jq .
|
| 108 |
+
curl -s -X POST 'http://localhost:7860/reset?task=stage1_basic&mode=defender_only' | jq .
|
| 109 |
+
curl -s -X POST 'http://localhost:7860/step?task=stage1_basic&mode=defender_only' \
|
| 110 |
+
-H 'content-type: application/json' \
|
| 111 |
+
-d '{"submit_triage": {"action": "monitor", "cited_log_id": "L1-0", "rationale": "smoke"}}' | jq .
|
| 112 |
+
open http://localhost:7860/demo # Gradio before-vs-after UI
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
Run the test suite (CPU only, no GPU deps):
|
| 116 |
+
|
| 117 |
+
```bash
|
| 118 |
+
pytest -q # 93 passed
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
Or via the bundled Python client:
|
| 122 |
+
|
| 123 |
+
```python
|
| 124 |
+
from client import OpenSOCClient
|
| 125 |
+
c = OpenSOCClient()
|
| 126 |
+
obs = c.reset(task="stage1_basic", mode="defender_only", seed=1)
|
| 127 |
+
result = c.step({"submit_triage": {"action": "monitor", "cited_log_id": "L1-0", "rationale": "ok"}},
|
| 128 |
+
task="stage1_basic", mode="defender_only", seed=1)
|
| 129 |
+
print(result)
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
## Run the training pipeline
|
| 133 |
+
|
| 134 |
+
Full end-to-end procedure: **[TRAIN.md](TRAIN.md)**. TL;DR — on an HF Jupyter L4 (~$3 of credits, ~3.5h wall time):
|
| 135 |
+
|
| 136 |
+
```bash
|
| 137 |
+
bash scripts/run_full_pipeline.sh
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
Or step-by-step inside [`train_grpo.ipynb`](train_grpo.ipynb):
|
| 141 |
+
|
| 142 |
+
1. SFT warm-start (~12 min) — pushes P(format-OK) from ~0% to ~95%.
|
| 143 |
+
2. GRPO curriculum across 4 stages (~3h) — verifier-grounded reward, group size 8.
|
| 144 |
+
3. Eval on the frozen 200-incident hold-out (~5 min).
|
| 145 |
+
4. `eval.plot_results` + `eval.plot_training` render four PNGs.
|
| 146 |
+
5. `eval.bake_demo` writes 50 before-vs-after pairs to `data/demo_examples.json` for the Gradio UI.
|
| 147 |
+
|
| 148 |
+
## Headline results
|
| 149 |
+
|
| 150 |
+
> *Plots below are auto-generated; the placeholder versions are committed today (always-dismiss vs verifier-oracle) so the README never has broken images. Re-run the pipeline above to overwrite with real numbers.*
|
| 151 |
+
|
| 152 |
+
### Dismiss-on-malicious (the cardinal failure mode)
|
| 153 |
+
|
| 154 |
+

|
| 155 |
+
|
| 156 |
+
### Macro F1 across 200-incident hold-out
|
| 157 |
+
|
| 158 |
+

|
| 159 |
+
|
| 160 |
+
### Confusion matrices
|
| 161 |
+
|
| 162 |
+
| Baseline (zero-shot Qwen2.5-3B) | OpenSOC (after GRPO) |
|
| 163 |
+
| --- | --- |
|
| 164 |
+
|  |  |
|
| 165 |
+
|
| 166 |
+
*(Filenames `confusion_always_dismiss.png` and `confusion_verifier_oracle.png` get replaced by `confusion_baseline_zero_shot.png` and `confusion_opensoc_grpo.png` after the GPU eval run.)*
|
| 167 |
+
|
| 168 |
+
### Reward across the curriculum
|
| 169 |
+
|
| 170 |
+

|
| 171 |
+
|
| 172 |
+
| Model | Accuracy | Macro F1 | Dismiss-on-malicious | Over-react |
|
| 173 |
+
| --- | ---: | ---: | ---: | ---: |
|
| 174 |
+
| `always_dismiss` (floor) | 0.13 | 0.05 | **1.00** | 0.00 |
|
| 175 |
+
| `baseline_zero_shot` (Qwen2.5-3B) | _GPU run_ | _GPU run_ | _GPU run_ | _GPU run_ |
|
| 176 |
+
| `opensoc_grpo` (after training) | _GPU run_ | _GPU run_ | _GPU run_ | _GPU run_ |
|
| 177 |
+
| `verifier_oracle` (ceiling) | 1.00 | 1.00 | 0.00 | 0.00 |
|
| 178 |
+
|
| 179 |
+
## Deploy to Hugging Face Spaces
|
| 180 |
+
|
| 181 |
+
Full recipe: [DEPLOY.md](DEPLOY.md). The fast version, after `huggingface-cli login`:
|
| 182 |
+
|
| 183 |
+
```bash
|
| 184 |
+
export HF_USER=<your-username>
|
| 185 |
+
bash scripts/deploy_to_hf.sh
|
| 186 |
+
# Build takes ~5 minutes; then:
|
| 187 |
+
open https://${HF_USER}-opensoc-env.hf.space/demo
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
The Space runs FastAPI + Gradio in a single container. `/reset`, `/step`, `/state`, `/grade`, `/tasks`, `/health` continue to work for the OpenEnv judge bot; `/demo` is the human-readable UI.
|
| 191 |
+
|
| 192 |
+
## Repo map
|
| 193 |
+
|
| 194 |
+
| File / dir | Purpose |
|
| 195 |
+
| --- | --- |
|
| 196 |
+
| `openenv.yaml` | OpenEnv manifest (tasks, action space, reward range, endpoints) |
|
| 197 |
+
| `schema.py` | Incident / event / action schema with strict validators |
|
| 198 |
+
| `generator.py` | Materializes incidents for `defender_only` mode (eval, SFT) |
|
| 199 |
+
| `verifier.py` | Deterministic ground-truth labeler + plausibility checker |
|
| 200 |
+
| `rubric.py` | Layered defender + attacker reward functions |
|
| 201 |
+
| `env.py` | Two-role `OpenSOCEnv` (`reset` / `step` / `state` / `grade`) |
|
| 202 |
+
| `app_runtime.py` | FastAPI app exposing the OpenEnv API |
|
| 203 |
+
| `demo_app.py` | Gradio Blocks app mounted at `/demo` |
|
| 204 |
+
| `demo_data.py` | Pure-python helpers for the demo UI |
|
| 205 |
+
| `server.py` | Container entry point — imports `demo_app` then starts uvicorn |
|
| 206 |
+
| `tasks/registry.py` | Curriculum stages: `stage1_basic` → `stage4_adversarial` |
|
| 207 |
+
| `client/` | Thin HTTP client (server-internals-free) |
|
| 208 |
+
| `train/` | SFT warm-start + GRPO loop + reusable prompt format |
|
| 209 |
+
| `eval/` | Hold-out generator, metrics, eval driver, plot renderers, `bake_demo` |
|
| 210 |
+
| `scripts/run_full_pipeline.sh` | One-shot training + eval + bake-demo |
|
| 211 |
+
| `scripts/deploy_to_hf.sh` | One-shot HF Space push |
|
| 212 |
+
| `docs/` | Blog post, video script, slide deck builder |
|
| 213 |
+
| `tests/` | Pytest suite (93 tests, anti-hack regressions included) |
|
| 214 |
+
|
| 215 |
+
## Submission deliverables
|
| 216 |
+
|
| 217 |
+
Mapped to the four judging criteria:
|
| 218 |
+
|
| 219 |
+
| Criterion | Weight | Where it lives |
|
| 220 |
+
| --- | ---: | --- |
|
| 221 |
+
| Environment Innovation | 40% | `openenv.yaml`, `schema.py`, `verifier.py`, `env.py`, this README's *Architecture* and *Why the reward cannot be hacked* sections |
|
| 222 |
+
| Storytelling & Presentation | 30% | `/demo` Gradio UI + 90s video + HF blog + 5-slide deck (`docs/slides.pdf`) |
|
| 223 |
+
| Showing Improvement in Rewards | 20% | `eval/results/*.png` (training curves + confusion + headline bar) embedded above |
|
| 224 |
+
| Reward & Training Pipeline | 10% | `rubric.py` + 93-test anti-hack suite + `train_grpo.ipynb` + `scripts/run_full_pipeline.sh` |
|
| 225 |
+
|
| 226 |
+
Submission checklist:
|
| 227 |
+
|
| 228 |
+
- [x] OpenEnv-compatible env (gym-style API, manifest, non-reserved tool names)
|
| 229 |
+
- [x] Deterministic RLVR verifier + plausibility checker
|
| 230 |
+
- [x] Layered defender + attacker reward
|
| 231 |
+
- [x] SFT warm-start dataset (committed)
|
| 232 |
+
- [x] Frozen 200-incident hold-out (committed)
|
| 233 |
+
- [x] GRPO curriculum notebook + one-shot training script
|
| 234 |
+
- [x] Eval harness + plotters
|
| 235 |
+
- [x] Pytest suite (93 tests, anti-hack regressions included)
|
| 236 |
+
- [x] Gradio `/demo` UI mounted on the same Space (free-CPU-tier compatible)
|
| 237 |
+
- [x] 5-slide PDF deck (`docs/slides.pdf`)
|
| 238 |
+
- [x] Blog post draft (`docs/blog.md`)
|
| 239 |
+
- [x] Video script (`docs/video_script.md`)
|
| 240 |
+
- [ ] HF Space pushed (run `bash scripts/deploy_to_hf.sh`)
|
| 241 |
+
- [ ] Trained adapter pushed (run the GPU pipeline; commit the resulting checkpoint)
|
| 242 |
+
- [ ] Real demo data baked (re-run `python -m eval.bake_demo` post-training)
|
| 243 |
+
- [ ] Video recorded + uploaded as unlisted (script in `docs/video_script.md`)
|
| 244 |
+
- [ ] Blog post published on HF (source in `docs/blog.md`)
|
| 245 |
+
- [ ] All four `REPLACE_*` placeholders at the top filled in
|
| 246 |
+
|
| 247 |
+
## License
|
| 248 |
+
|
| 249 |
+
BSD-3-Clause.
|
SPACE_README.md
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: OpenSOC SOC Triage Env
|
| 3 |
+
emoji: 🛡️
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: red
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: bsd-3-clause
|
| 10 |
+
tags:
|
| 11 |
+
- openenv
|
| 12 |
+
- cybersecurity
|
| 13 |
+
- rlvr
|
| 14 |
+
- self-play
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
This file is the Hugging Face Spaces metadata header. When pushing to a
|
| 18 |
+
Space (`git push space main`), copy this file to the Space repo as `README.md`
|
| 19 |
+
(or merge the front-matter into the existing README's first lines).
|
| 20 |
+
|
| 21 |
+
The repository's main `README.md` provides the full project description and
|
| 22 |
+
usage instructions.
|
TRAIN.md
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Training OpenSOC end-to-end
|
| 2 |
+
|
| 3 |
+
Total compute budget (HF Jupyter Notebook L4 @ $0.80/h):
|
| 4 |
+
|
| 5 |
+
| Step | Wall time | $ on L4 |
|
| 6 |
+
| ----------------------------------- | --------- | ------- |
|
| 7 |
+
| SFT warm-start | ~12 min | ~$0.16 |
|
| 8 |
+
| GRPO curriculum (4 stages × 200 st) | ~3 h | ~$2.40 |
|
| 9 |
+
| Eval (200 hold-out incidents) | ~5 min | ~$0.07 |
|
| 10 |
+
| Bake demo (50 incidents × 2 models) | ~3 min | ~$0.04 |
|
| 11 |
+
| **Total** | **~3.5h** | **~$2.7** |
|
| 12 |
+
|
| 13 |
+
Comfortably within the $30 HF credit budget — leaves ~$27 for ablations, retries, or moving to A10G.
|
| 14 |
+
|
| 15 |
+
## Recommended target: Hugging Face Jupyter Notebooks
|
| 16 |
+
|
| 17 |
+
1. Push this repo (or fork) to `huggingface.co/<you>/opensoc-env` so the
|
| 18 |
+
notebook can `git clone` it.
|
| 19 |
+
2. From <https://huggingface.co/notebooks/new>, pick **L4 (24GB)** and
|
| 20 |
+
the **pytorch-cuda** image.
|
| 21 |
+
3. Open `train_grpo.ipynb` from the cloned repo and run cells top-to-bottom.
|
| 22 |
+
|
| 23 |
+
The notebook is idempotent — you can pause/resume between any two cells
|
| 24 |
+
and HF only bills for attached-GPU minutes.
|
| 25 |
+
|
| 26 |
+
## One-shot script alternative
|
| 27 |
+
|
| 28 |
+
If you prefer a single shell command (works on any GPU host with CUDA):
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
git clone https://huggingface.co/<you>/opensoc-env && cd opensoc-env
|
| 32 |
+
bash scripts/run_full_pipeline.sh
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Fallbacks
|
| 36 |
+
|
| 37 |
+
- **L4 unavailable in your region** → use **A10G** (~$1.05/h, +30% cost).
|
| 38 |
+
- **Only Colab Pro T4 available** → drop `--num-generations` from 8 to 4
|
| 39 |
+
and increase `--steps-per-stage` from 200 to 300; total cost similar.
|
| 40 |
+
- **GRPO reward is flat at zero** → you skipped the SFT warm-start; with
|
| 41 |
+
no SFT, the format-violation penalty dominates and there's no signal.
|
| 42 |
+
- **Adapter file too large for `git push`** → use `huggingface-cli upload`
|
| 43 |
+
with LFS, or push the adapter to a separate `*-defender-grpo` model
|
| 44 |
+
repo and load it by reference at deploy time.
|
| 45 |
+
|
| 46 |
+
## What gets produced
|
| 47 |
+
|
| 48 |
+
After a successful run, the following are added to the repo:
|
| 49 |
+
|
| 50 |
+
```
|
| 51 |
+
checkpoints/
|
| 52 |
+
defender_sft_adapter/ # warm-started LoRA adapter
|
| 53 |
+
defender_grpo/
|
| 54 |
+
stage1_basic/{adapter,training_log.jsonl,runs/}
|
| 55 |
+
stage2_multi/...
|
| 56 |
+
stage3_mixed/...
|
| 57 |
+
stage4_adversarial/{adapter, ...} # final RL-trained adapter
|
| 58 |
+
data/
|
| 59 |
+
demo_examples.json # 50 before-vs-after pairs
|
| 60 |
+
eval/results/
|
| 61 |
+
summary.json
|
| 62 |
+
bar_dismiss_on_malicious.png # headline plot
|
| 63 |
+
bar_macro_f1.png
|
| 64 |
+
confusion_baseline_zero_shot.png
|
| 65 |
+
confusion_opensoc_grpo.png
|
| 66 |
+
training_curves.png # reward across stages
|
| 67 |
+
training_kl_loss.png
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
`scripts/run_full_pipeline.sh` is the canonical end-to-end command; the
|
| 71 |
+
Jupyter notebook is the same pipeline but cell-by-cell so you can
|
| 72 |
+
inspect intermediate outputs.
|
app_runtime.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application module for OpenSOC, mountable from server.py.
|
| 2 |
+
|
| 3 |
+
Endpoints follow the OpenEnv conventions plus a lightweight `/grade`:
|
| 4 |
+
|
| 5 |
+
POST /reset?task=<stage>&mode=<self_play|defender_only>&seed=<n>
|
| 6 |
+
POST /step?task=<stage>&mode=...&seed=<n> (body: Action)
|
| 7 |
+
GET /state?task=<stage>&mode=...&seed=<n>
|
| 8 |
+
POST /grade?task=<stage>&mode=...&seed=<n>
|
| 9 |
+
GET /tasks
|
| 10 |
+
GET /health
|
| 11 |
+
|
| 12 |
+
Per-(task, mode, seed) env instances are cached in a process-local dict, so
|
| 13 |
+
clients using different (task, mode, seed) triples do not step on each
|
| 14 |
+
other's episodes. Clients that send the same triple share one env instance.
|
| 15 |
+
|
| 16 |
+
This module does NOT inherit from openenv-core's MCPEnvironment because the
|
| 17 |
+
`craft_incident`/`submit_triage` action surface is non-MCP (single-action
|
| 18 |
+
unions are simpler for GRPO rollouts). Tool names are deliberately
|
| 19 |
+
non-reserved so an MCPEnvironment wrapper can be added later if a team
|
| 20 |
+
wants to expose the env over MCP transports.
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import os
|
| 26 |
+
from typing import Any, Dict, Optional
|
| 27 |
+
|
| 28 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 29 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 30 |
+
from pydantic import BaseModel
|
| 31 |
+
|
| 32 |
+
from env import Action, Observation, OpenSOCEnv
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# FastAPI application object; the endpoint functions below register on it.
app = FastAPI(
    title="OpenSOC",
    version="1.0.0",
    description=(
        "Self-play SOC triage OpenEnv environment for cybersecurity defender LLMs."
    ),
)

# NOTE(review): CORS is fully open (any origin, method and header). Presumably
# intentional for a public demo deployment -- confirm before reusing elsewhere.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)

# Process-local cache of environment instances, keyed by the string that
# _env_key() builds from (task, mode, seed). Entries are never evicted.
_envs: Dict[str, OpenSOCEnv] = dict()
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _env_key(task: str, mode: str, seed: int) -> str:
|
| 52 |
+
return f"{task}::{mode}::{seed}"
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def _get_env(task: str, mode: str, seed: int) -> OpenSOCEnv:
    """Return the cached environment for (task, mode, seed), creating it on first use.

    Raises:
        HTTPException: status 400 when ``OpenSOCEnv`` rejects the arguments
            with a ``ValueError``; the error text becomes the response detail.
    """
    cache_key = _env_key(task, mode, seed)
    try:
        return _envs[cache_key]
    except KeyError:
        pass  # first request for this key: build the environment below

    try:
        created = OpenSOCEnv(task_id=task, mode=mode, seed=seed)  # type: ignore[arg-type]
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    _envs[cache_key] = created
    return created
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
# Response models
|
| 67 |
+
# ---------------------------------------------------------------------------
|
| 68 |
+
|
| 69 |
+
class StepResult(BaseModel):
    """Response body of ``POST /step``.

    The four fields carry, in order, the four values that the endpoint
    unpacks from ``env.step(action)``.
    """

    # Observation returned by the environment for this step.
    observation: Observation
    # Reward value returned by the environment for this step.
    reward: float
    # Done flag returned by the environment for this step.
    done: bool
    # Free-form info mapping returned by the environment.
    info: Dict[str, Any]
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class GradeResult(BaseModel):
    """Response body of ``POST /grade``.

    ``task`` and ``mode`` echo the query parameters, ``score`` is the value of
    ``env.grade()``, and the remaining fields are copied from the
    environment's current episode state.
    """

    # Query parameters echoed back to the caller.
    task: str
    mode: str
    # Value returned by env.grade().
    score: float
    # Copied from the episode state; any of these may be unset (None).
    defender_reward: Optional[float]
    attacker_reward: Optional[float]
    # String value of the state's ground-truth label, or None when it is unset.
    ground_truth: Optional[str]
    plausible: Optional[bool]
    # Copied from the episode state's schema_violation flag.
    schema_violation: bool
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ---------------------------------------------------------------------------
|
| 88 |
+
# Endpoints
|
| 89 |
+
# ---------------------------------------------------------------------------
|
| 90 |
+
|
| 91 |
+
@app.post("/reset", response_model=Observation)
def reset(
    task: str = Query("stage1_basic", description="Curriculum stage id."),
    mode: str = Query("defender_only", description="self_play | defender_only"),
    seed: int = Query(0),
):
    """Reset the (task, mode, seed) environment and return its initial observation.

    The environment is created on first use; invalid arguments surface as the
    HTTP 400 raised by ``_get_env``.
    """
    return _get_env(task, mode, seed).reset()
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@app.post("/step", response_model=StepResult)
def step(
    action: Action,
    task: str = Query("stage1_basic"),
    mode: str = Query("defender_only"),
    seed: int = Query(0),
):
    """Apply one action to the environment and report the outcome.

    Returns:
        A ``StepResult`` holding the observation, reward, done flag and info
        mapping produced by ``env.step(action)``.

    Raises:
        HTTPException: status 400 when no episode has been started yet
            (the environment's ``_state`` is ``None``) or when ``env.step``
            raises ``RuntimeError``.
    """
    env = _get_env(task, mode, seed)
    if env._state is None:
        raise HTTPException(status_code=400, detail="Call /reset first.")

    try:
        next_obs, step_reward, finished, extra = env.step(action)
    except RuntimeError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    else:
        return StepResult(
            observation=next_obs,
            reward=step_reward,
            done=finished,
            info=extra,
        )
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
@app.get("/state")
def state(
    task: str = Query("stage1_basic"),
    mode: str = Query("defender_only"),
    seed: int = Query(0),
):
    """Return whatever ``env.state()`` reports for the (task, mode, seed) environment.

    NOTE(review): unlike ``/step`` and ``/grade`` there is no "call /reset
    first" guard here, so the response before a reset depends entirely on
    how ``OpenSOCEnv.state()`` behaves -- confirm that is intended.
    """
    return _get_env(task, mode, seed).state()
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
@app.get("/tasks")
def list_tasks():
    """List the registered curriculum stages and the supported modes.

    Each task entry exposes the stage id plus the ``difficulty`` and
    ``description`` values stored for it in ``STAGE_REGISTRY``.
    """
    # Imported at call time, as in the rest of this module's lazy imports.
    from tasks.registry import STAGE_REGISTRY

    catalog = []
    for stage_id, cfg in STAGE_REGISTRY.items():
        catalog.append(
            {
                "id": stage_id,
                "difficulty": cfg["difficulty"],
                "description": cfg["description"],
            }
        )
    return {"tasks": catalog, "modes": ["self_play", "defender_only"]}
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
@app.post("/grade", response_model=GradeResult)
def grade(
    task: str = Query("stage1_basic"),
    mode: str = Query("defender_only"),
    seed: int = Query(0),
):
    """Compute a normalized [0, 1] score for the just-finished episode.

    Returns:
        A ``GradeResult`` combining ``env.grade()`` with the reward, label and
        flag fields read from the environment's episode state.

    Raises:
        HTTPException: status 400 when the environment has no episode state
            yet (``/reset`` was never called for this key).
    """
    env = _get_env(task, mode, seed)
    episode = env._state
    if episode is None:
        raise HTTPException(status_code=400, detail="No episode to grade. Call /reset first.")

    # Score first, then read the state fields -- same order as the keyword
    # arguments were evaluated before, in case grade() updates the state.
    score = env.grade()
    return GradeResult(
        task=task,
        mode=mode,
        score=score,
        defender_reward=episode.defender_reward,
        attacker_reward=episode.attacker_reward,
        ground_truth=episode.ground_truth.value if episode.ground_truth else None,
        plausible=episode.plausible,
        schema_violation=episode.schema_violation,
    )
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
@app.get("/health")
def health():
    """Return a static payload naming the service and its version."""
    return dict(status="ok", env="OpenSOC", version="1.0.0")
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def main() -> None:
    """Serve ``app`` with uvicorn on all interfaces.

    The listening port is read from the ``PORT`` environment variable and
    falls back to 7860 when the variable is not set.
    """
    import uvicorn

    listen_port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=listen_port)
|
client/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .opensoc_client import OpenSOCClient
|
| 2 |
+
|
| 3 |
+
__all__ = ["OpenSOCClient"]
|
client/opensoc_client.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Thin HTTP client for the OpenSOC environment.
|
| 2 |
+
|
| 3 |
+
Importantly: this module **never imports server-side code** (`env.py`,
|
| 4 |
+
`verifier.py`, `rubric.py`). The OpenEnv hackathon brief calls for
|
| 5 |
+
client/server separation so the same client can drive a remote HF Space
|
| 6 |
+
or a local container without re-running the verifier locally.
|
| 7 |
+
|
| 8 |
+
Usage::
|
| 9 |
+
|
| 10 |
+
from client import OpenSOCClient
|
| 11 |
+
c = OpenSOCClient(base_url="http://localhost:7860")
|
| 12 |
+
obs = c.reset(task="stage1_basic", mode="defender_only", seed=1)
|
| 13 |
+
result = c.step(
|
| 14 |
+
{"submit_triage": {"action": "monitor",
|
| 15 |
+
"cited_log_id": "L1-0",
|
| 16 |
+
"rationale": "..."}},
|
| 17 |
+
task="stage1_basic", mode="defender_only", seed=1,
|
| 18 |
+
)
|
| 19 |
+
grade = c.grade(task="stage1_basic", mode="defender_only", seed=1)
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
from typing import Any, Dict, Optional
|
| 26 |
+
|
| 27 |
+
import requests
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class OpenSOCClient:
    """Small ``requests``-backed wrapper around the OpenSOC HTTP endpoints.

    Every public method issues one request and returns the decoded JSON body.
    Non-2xx responses raise through ``raise_for_status()``.
    """

    def __init__(
        self,
        base_url: Optional[str] = None,
        timeout: float = 30.0,
        session: Optional[requests.Session] = None,
    ):
        """Configure the target server.

        Args:
            base_url: Server root. When empty or ``None`` the ``OPENSOC_URL``
                environment variable is used, then ``http://localhost:7860``.
                Trailing slashes are removed.
            timeout: Timeout passed to every request.
            session: Session-like object to send requests with; a new
                ``requests.Session`` is created when none is supplied.
        """
        chosen = base_url if base_url else os.getenv("OPENSOC_URL", "http://localhost:7860")
        self.base_url = chosen.rstrip("/")
        self.timeout = timeout
        self.session = session if session else requests.Session()

    # -- endpoints ----------------------------------------------------------

    def health(self) -> Dict[str, Any]:
        """GET /health."""
        return self._get("/health")

    def tasks(self) -> Dict[str, Any]:
        """GET /tasks."""
        return self._get("/tasks")

    def reset(self, task: str = "stage1_basic", mode: str = "defender_only", seed: int = 0) -> Dict[str, Any]:
        """POST /reset for the given (task, mode, seed)."""
        return self._post("/reset", params=self._episode_params(task, mode, seed))

    def step(
        self,
        action: Dict[str, Any],
        task: str = "stage1_basic",
        mode: str = "defender_only",
        seed: int = 0,
    ) -> Dict[str, Any]:
        """POST /step, sending ``action`` as the JSON request body."""
        query = self._episode_params(task, mode, seed)
        return self._post("/step", params=query, json=action)

    def state(self, task: str = "stage1_basic", mode: str = "defender_only", seed: int = 0) -> Dict[str, Any]:
        """GET /state for the given (task, mode, seed)."""
        return self._get("/state", params=self._episode_params(task, mode, seed))

    def grade(self, task: str = "stage1_basic", mode: str = "defender_only", seed: int = 0) -> Dict[str, Any]:
        """POST /grade for the given (task, mode, seed)."""
        return self._post("/grade", params=self._episode_params(task, mode, seed))

    # -- plumbing -----------------------------------------------------------

    @staticmethod
    def _episode_params(task: str, mode: str, seed: int) -> Dict[str, Any]:
        """Build the query-string mapping shared by the episode endpoints."""
        return {"task": task, "mode": mode, "seed": seed}

    @staticmethod
    def _decode(response: Any) -> Dict[str, Any]:
        """Raise on an HTTP error status, otherwise return the JSON body."""
        response.raise_for_status()
        return response.json()

    def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Send a GET to ``base_url + path`` and decode the reply."""
        url = self.base_url + path
        return self._decode(self.session.get(url, params=params, timeout=self.timeout))

    def _post(
        self,
        path: str,
        params: Optional[Dict[str, Any]] = None,
        json: Any = None,
    ) -> Dict[str, Any]:
        """Send a POST to ``base_url + path`` and decode the reply."""
        url = self.base_url + path
        reply = self.session.post(url, params=params, json=json, timeout=self.timeout)
        return self._decode(reply)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
__all__ = ["OpenSOCClient"]
|
client/prompts.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Prompt formatting + completion parsing for both roles.
|
| 2 |
+
|
| 3 |
+
Keeping this in one module lets us reuse the exact same prompt template
|
| 4 |
+
across SFT warm-start, GRPO rollouts, and the held-out eval — an oft-forgotten
|
| 5 |
+
prerequisite for the metrics in the report to be comparable.
|
| 6 |
+
|
| 7 |
+
The defender prompt is engineered for a small (3B-class) instruction-tuned
|
| 8 |
+
model: it is short, contains an explicit JSON schema, and shows one example.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import re
|
| 15 |
+
from typing import Optional
|
| 16 |
+
|
| 17 |
+
from schema import (
|
| 18 |
+
Action,
|
| 19 |
+
Alert,
|
| 20 |
+
CraftIncident,
|
| 21 |
+
Event,
|
| 22 |
+
EventType,
|
| 23 |
+
IncidentCategory,
|
| 24 |
+
SubmitTriage,
|
| 25 |
+
TriageAction,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ---------------------------------------------------------------------------
|
| 30 |
+
# System messages
|
| 31 |
+
# ---------------------------------------------------------------------------
|
| 32 |
+
|
| 33 |
+
DEFENDER_SYSTEM = (
|
| 34 |
+
"You are a senior SOC analyst. You triage SIEM alerts. "
|
| 35 |
+
"Read the alert and the structured log window, then choose exactly one "
|
| 36 |
+
"action from {dismiss, monitor, quarantine_host, block_ip, escalate} "
|
| 37 |
+
"and cite the single most diagnostic log_id. "
|
| 38 |
+
"Respond with a JSON object only. Never invent log_ids."
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
ATTACKER_SYSTEM = (
|
| 43 |
+
"You are a red-team incident designer. You craft synthetic security "
|
| 44 |
+
"incidents that the defender model must triage correctly. "
|
| 45 |
+
"Your goal is to design plausible incidents whose ground-truth label is "
|
| 46 |
+
"as confusing as possible to the defender, but they must remain "
|
| 47 |
+
"*coherent* — internal-only events cannot be 'exfiltration', LOLBin "
|
| 48 |
+
"events must include both `process` and `parent_process` fields, etc. "
|
| 49 |
+
"Respond with a JSON object only that conforms to the craft_incident schema."
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
_DEFENDER_RESPONSE_SCHEMA = """\
|
| 54 |
+
{
|
| 55 |
+
"action": "dismiss|monitor|quarantine_host|block_ip|escalate",
|
| 56 |
+
"cited_log_id": "<one log_id from the log_window>",
|
| 57 |
+
"rationale": "<one short sentence>"
|
| 58 |
+
}"""
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
_DEFENDER_FEW_SHOT = """\
|
| 62 |
+
Example:
|
| 63 |
+
ALERT: c2_beacon: 1 event(s); first=net.beacon
|
| 64 |
+
LOG_WINDOW:
|
| 65 |
+
L1-0 2026-04-25T10:00:00Z net.beacon dst_ip=203.0.113.5 interval_s=60
|
| 66 |
+
|
| 67 |
+
Response:
|
| 68 |
+
{"action": "block_ip", "cited_log_id": "L1-0", "rationale": "Periodic beacon to external IP indicates C2."}"""
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
# ---------------------------------------------------------------------------
|
| 72 |
+
# Defender side
|
| 73 |
+
# ---------------------------------------------------------------------------
|
| 74 |
+
|
| 75 |
+
def _render_event(e: Event) -> str:
|
| 76 |
+
fields = " ".join(f"{k}={v}" for k, v in (e.fields or {}).items() if v is not None)
|
| 77 |
+
return f" {e.log_id} {e.timestamp} {e.event_type.value} {fields}".rstrip()
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def format_defender_prompt(alert: Alert, log_window: list[Event]) -> str:
    """Return the user-message body for a defender turn.

    The sections appear in this order, separated by newlines: the alert
    summary, the rendered log window (one line per event), the expected
    JSON response shape, and the few-shot example.
    """
    sections = [
        f"ALERT: {alert.summary}",
        "LOG_WINDOW:",
        *(_render_event(event) for event in log_window),
        "",
        f"Respond with JSON in this shape:\n{_DEFENDER_RESPONSE_SCHEMA}",
        "",
        _DEFENDER_FEW_SHOT,
    ]
    return "\n".join(sections)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
_JSON_BLOCK_RE = re.compile(r"\{.*\}", re.DOTALL)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def parse_defender_completion(text: str) -> Action:
    """Parse a model completion into an `Action(submit_triage=...)`.

    Robust to the common cases where the model wraps its JSON in prose,
    before and/or after the object. The text is scanned for the first
    ``{`` at which a complete JSON object decodes; anything after that
    object (including further braces) is ignored. A greedy first-``{`` to
    last-``}`` match would instead swallow trailing text and fail to decode
    an otherwise valid answer.

    Args:
        text: Raw completion text produced by the defender model.

    Returns:
        An ``Action`` whose ``submit_triage`` is built from the object's
        ``action``, ``cited_log_id`` and (optional, default ``""``)
        ``rationale`` entries.

    Raises:
        ValueError: If the text contains no ``{`` at all.
        json.JSONDecodeError: If braces are present but no JSON object can
            be decoded from any of them (a ``ValueError`` subclass).
        KeyError: If the decoded object lacks ``action`` or ``cited_log_id``.

    Errors raised while constructing the schema objects propagate unchanged
    so the caller can treat them as a schema violation.
    """
    decoder = json.JSONDecoder()
    payload = None
    last_error = None

    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value, so
            # trailing prose after the object is harmless.
            payload, _end = decoder.raw_decode(text, start)
            break
        except json.JSONDecodeError as exc:
            last_error = exc
            start = text.find("{", start + 1)

    if payload is None:
        if last_error is not None:
            # Braces were present but nothing decoded: surface the decode
            # error, as json.loads did before.
            raise last_error
        raise ValueError("No JSON object found in defender completion")

    return Action(
        submit_triage=SubmitTriage(
            action=TriageAction(payload["action"]),
            cited_log_id=str(payload["cited_log_id"]),
            rationale=str(payload.get("rationale", "")),
        )
    )
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ---------------------------------------------------------------------------
|
| 117 |
+
# Attacker side
|
| 118 |
+
# ---------------------------------------------------------------------------
|
| 119 |
+
|
| 120 |
+
_ATTACKER_RESPONSE_SCHEMA = """\
|
| 121 |
+
{
|
| 122 |
+
"target_label": "dismiss|monitor|quarantine_host|block_ip|escalate",
|
| 123 |
+
"category": "phishing|brute_force|malware_execution|c2_beacon|data_exfiltration|insider_misuse|privilege_escalation|benign_noise",
|
| 124 |
+
"events": [
|
| 125 |
+
{"event_type": "<see schema>", "fields": {"...": "..."}}
|
| 126 |
+
],
|
| 127 |
+
"narrative": "<optional free text, ignored by the verifier>"
|
| 128 |
+
}"""
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def format_attacker_prompt(brief: dict) -> str:
    """Return the user-message body for an attacker turn.

    ``brief`` may supply ``target_label``, ``category_hint`` and
    ``difficulty``; missing entries fall back to ``monitor``, ``any`` and
    ``easy`` respectively. The brief line is followed by a blank line and
    the expected JSON response shape.
    """
    label = brief.get("target_label", "monitor")
    hint = brief.get("category_hint", "any")
    difficulty = brief.get("difficulty", "easy")

    headline = (
        "BRIEF: design an incident whose ground-truth label is "
        f"action={label}, category hint={hint}, difficulty={difficulty}."
    )
    instructions = f"Respond with JSON in this shape:\n{_ATTACKER_RESPONSE_SCHEMA}"
    return f"{headline}\n\n{instructions}"
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def parse_attacker_completion(text: str) -> Action:
    """Parse a model completion into an `Action(craft_incident=...)`.

    The text is scanned for the first ``{`` at which a complete JSON object
    decodes, so prose before or after the object (even prose containing
    braces) does not break parsing. A greedy first-``{`` to last-``}`` match
    would fail on such completions.

    Adds defaults for `log_id`, `timestamp`, and `source` when the model
    omits them — keeps SFT data clean and makes RL rollouts robust. A
    missing ``category`` defaults to ``benign_noise``, a missing
    ``narrative`` to ``""`` and a missing ``events`` list to no events.

    Args:
        text: Raw completion text produced by the attacker model.

    Returns:
        An ``Action`` whose ``craft_incident`` is built from the object.

    Raises:
        ValueError: If the text contains no ``{`` at all.
        json.JSONDecodeError: If braces are present but no JSON object can
            be decoded from any of them (a ``ValueError`` subclass).
        KeyError: If ``target_label`` or an event's ``event_type`` is missing.

    Errors raised while constructing the schema objects propagate unchanged.
    """
    decoder = json.JSONDecoder()
    payload = None
    last_error = None

    start = text.find("{")
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete value, so
            # trailing prose after the object is harmless.
            payload, _end = decoder.raw_decode(text, start)
            break
        except json.JSONDecodeError as exc:
            last_error = exc
            start = text.find("{", start + 1)

    if payload is None:
        if last_error is not None:
            # Braces were present but nothing decoded: surface the decode
            # error, as json.loads did before.
            raise last_error
        raise ValueError("No JSON object found in attacker completion")

    events: list[Event] = [
        Event(
            # Positional default ids follow the "L1-<index>" convention.
            log_id=raw.get("log_id", f"L1-{i}"),
            timestamp=raw.get("timestamp", "2026-04-25T10:00:00Z"),
            source=raw.get("source", "synthetic"),
            event_type=EventType(raw["event_type"]),
            fields=raw.get("fields", {}),
        )
        for i, raw in enumerate(payload.get("events", []))
    ]

    return Action(
        craft_incident=CraftIncident(
            target_label=TriageAction(payload["target_label"]),
            category=IncidentCategory(payload.get("category", "benign_noise")),
            events=events,
            narrative=str(payload.get("narrative", "")),
        )
    )
|
data/demo_examples.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/holdout.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/holdout_smoke.jsonl
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"alert": {"alert_id": "A-EVAL-c-90000", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 1 event(s); first=auth.login_success", "host": "host-001", "user": "dave"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T00:08:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "dave", "src_ip": "10.0.0.84"}}], "ground_truth": "dismiss", "triggering_log_id": "L1-0", "stage": "stage1_basic", "seed": 90000}
|
| 2 |
+
{"alert": {"alert_id": "A-EVAL-c-90001", "category": "malware_execution", "severity": "high", "summary": "malware_execution: 1 event(s); first=edr.behavior_match", "host": "host-032", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T22:52:00Z", "source": "edr", "event_type": "edr.behavior_match", "fields": {"severity": "high", "rule_id": "EDR-8836", "host": "host-032"}}], "ground_truth": "quarantine_host", "triggering_log_id": "L1-0", "stage": "stage1_basic", "seed": 90001}
|
| 3 |
+
{"alert": {"alert_id": "A-EVAL-c-90002", "category": "malware_execution", "severity": "high", "summary": "malware_execution: 1 event(s); first=edr.behavior_match", "host": "host-008", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T16:56:00Z", "source": "edr", "event_type": "edr.behavior_match", "fields": {"severity": "high", "rule_id": "EDR-3486", "host": "host-008"}}], "ground_truth": "quarantine_host", "triggering_log_id": "L1-0", "stage": "stage1_basic", "seed": 90002}
|
| 4 |
+
{"alert": {"alert_id": "A-EVAL-c-90003", "category": "benign_noise", "severity": "medium", "summary": "benign_noise: 1 event(s); first=net.port_scan_hit", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T05:32:00Z", "source": "network", "event_type": "net.port_scan_hit", "fields": {"src_ip": "203.0.113.115", "port": 23}}], "ground_truth": "monitor", "triggering_log_id": "L1-0", "stage": "stage1_basic", "seed": 90003}
|
| 5 |
+
{"alert": {"alert_id": "A-EVAL-c-90004", "category": "c2_beacon", "severity": "high", "summary": "c2_beacon: 1 event(s); first=net.beacon", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T15:26:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "198.51.100.42", "interval_s": 60}}], "ground_truth": "block_ip", "triggering_log_id": "L1-0", "stage": "stage1_basic", "seed": 90004}
|
| 6 |
+
{"alert": {"alert_id": "A-EVAL-i-91000", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 2 event(s); first=net.beacon", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T21:23:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "10.0.61.71", "interval_s": 60}}, {"log_id": "L1-1", "timestamp": "2026-04-25T22:28:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "alice", "src_ip": "10.0.0.76"}}], "ground_truth": "monitor", "triggering_log_id": "L1-0", "stage": "stage2_multi", "seed": 91000}
|
| 7 |
+
{"alert": {"alert_id": "A-EVAL-i-91001", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 2 event(s); first=net.dns_query", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T03:58:00Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "github.com"}}, {"log_id": "L1-1", "timestamp": "2026-04-25T06:16:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "bob", "src_ip": "10.0.0.120"}}], "ground_truth": "dismiss", "triggering_log_id": "L1-1", "stage": "stage2_multi", "seed": 91001}
|
| 8 |
+
{"alert": {"alert_id": "A-EVAL-i-91002", "category": "benign_noise", "severity": "medium", "summary": "benign_noise: 2 event(s); first=net.port_scan_hit", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T02:47:00Z", "source": "network", "event_type": "net.port_scan_hit", "fields": {"src_ip": "203.0.113.103", "port": 22}}, {"log_id": "L1-1", "timestamp": "2026-04-25T15:10:00Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "google.com"}}], "ground_truth": "monitor", "triggering_log_id": "L1-0", "stage": "stage2_multi", "seed": 91002}
|
| 9 |
+
{"alert": {"alert_id": "A-EVAL-i-91003", "category": "malware_execution", "severity": "high", "summary": "malware_execution: 2 event(s); first=auth.login_success", "host": "host-001", "user": "bob"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T06:42:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "bob", "src_ip": "10.0.0.162"}}, {"log_id": "L1-1", "timestamp": "2026-04-25T16:50:00Z", "source": "edr", "event_type": "edr.behavior_match", "fields": {"severity": "high", "rule_id": "EDR-2571", "host": "host-001"}}], "ground_truth": "quarantine_host", "triggering_log_id": "L1-1", "stage": "stage2_multi", "seed": 91003}
|
| 10 |
+
{"alert": {"alert_id": "A-EVAL-i-91004", "category": "c2_beacon", "severity": "high", "summary": "c2_beacon: 2 event(s); first=net.beacon", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T04:11:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "203.0.113.5", "interval_s": 30}}, {"log_id": "L1-1", "timestamp": "2026-04-25T06:32:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "alice", "src_ip": "10.0.0.66"}}], "ground_truth": "block_ip", "triggering_log_id": "L1-0", "stage": "stage2_multi", "seed": 91004}
|
| 11 |
+
{"alert": {"alert_id": "A-EVAL-d-92000", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 3 event(s); first=net.outbound_connection", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T04:39:00Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.81.214", "bytes_out": 86290}}, {"log_id": "L1-1", "timestamp": "2026-04-25T04:39:02Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.138.227", "bytes_out": 53711}}, {"log_id": "L1-2", "timestamp": "2026-04-25T16:45:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "10.0.244.186", "interval_s": 90}}], "ground_truth": "monitor", "triggering_log_id": "L1-2", "stage": "stage3_mixed", "seed": 92000}
|
| 12 |
+
{"alert": {"alert_id": "A-EVAL-d-92001", "category": "malware_execution", "severity": "high", "summary": "malware_execution: 3 event(s); first=edr.behavior_match", "host": "host-013", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T06:29:00Z", "source": "edr", "event_type": "edr.behavior_match", "fields": {"severity": "high", "rule_id": "EDR-6020", "host": "host-013"}}, {"log_id": "L1-1", "timestamp": "2026-04-25T15:02:00Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "internal.corp"}}, {"log_id": "L1-2", "timestamp": "2026-04-25T15:02:02Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "dave", "src_ip": "10.0.0.40"}}], "ground_truth": "quarantine_host", "triggering_log_id": "L1-0", "stage": "stage3_mixed", "seed": 92001}
|
| 13 |
+
{"alert": {"alert_id": "A-EVAL-d-92002", "category": "brute_force", "severity": "medium", "summary": "brute_force: 11 event(s); first=net.dns_query", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T06:26:00Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "github.com"}}, {"log_id": "L1-1", "timestamp": "2026-04-25T06:26:02Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "carol", "src_ip": "10.0.0.15"}}, {"log_id": "L1-2", "timestamp": "2026-04-25T09:47:00Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.78"}}, {"log_id": "L1-3", "timestamp": "2026-04-25T09:47:05Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.140"}}, {"log_id": "L1-4", "timestamp": "2026-04-25T09:47:10Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.19"}}, {"log_id": "L1-5", "timestamp": "2026-04-25T09:47:15Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.222"}}, {"log_id": "L1-6", "timestamp": "2026-04-25T09:47:20Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.194"}}, {"log_id": "L1-7", "timestamp": "2026-04-25T09:47:25Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.217"}}, {"log_id": "L1-8", "timestamp": "2026-04-25T09:47:30Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.118"}}, {"log_id": "L1-9", "timestamp": "2026-04-25T09:47:35Z", "source": "identity", "event_type": "auth.login_failure", "fields": {"user": "alice", "src_ip": "203.0.113.243"}}, {"log_id": "L1-10", "timestamp": "2026-04-25T09:47:40Z", "source": "identity", "event_type": "auth.login_failure", 
"fields": {"user": "alice", "src_ip": "203.0.113.128"}}], "ground_truth": "monitor", "triggering_log_id": "L1-2", "stage": "stage3_mixed", "seed": 92002}
|
| 14 |
+
{"alert": {"alert_id": "A-EVAL-d-92003", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 3 event(s); first=auth.login_success", "host": "host-001", "user": "dave"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T01:19:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "dave", "src_ip": "10.0.0.29"}}, {"log_id": "L1-1", "timestamp": "2026-04-25T01:19:02Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "internal.corp"}}, {"log_id": "L1-2", "timestamp": "2026-04-25T05:05:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "10.0.153.44", "interval_s": 60}}], "ground_truth": "monitor", "triggering_log_id": "L1-2", "stage": "stage3_mixed", "seed": 92003}
|
| 15 |
+
{"alert": {"alert_id": "A-EVAL-d-92004", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 3 event(s); first=auth.login_success", "host": "host-001", "user": "dave"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T04:51:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "dave", "src_ip": "10.0.0.64"}}, {"log_id": "L1-1", "timestamp": "2026-04-25T04:51:02Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.217.73", "bytes_out": 81935}}, {"log_id": "L1-2", "timestamp": "2026-04-25T06:31:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "bob", "src_ip": "10.0.0.102"}}], "ground_truth": "dismiss", "triggering_log_id": "L1-2", "stage": "stage3_mixed", "seed": 92004}
|
| 16 |
+
{"alert": {"alert_id": "A-EVAL-l-93000", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 4 event(s); first=net.beacon", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T06:15:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "10.0.72.10", "interval_s": 30}}, {"log_id": "L1-1", "timestamp": "2026-04-25T09:39:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "bob", "src_ip": "10.0.0.28"}}, {"log_id": "L1-2", "timestamp": "2026-04-25T09:39:02Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.23.160", "bytes_out": 38043}}, {"log_id": "L1-3", "timestamp": "2026-04-25T09:39:04Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.108.241", "bytes_out": 36859}}], "ground_truth": "monitor", "triggering_log_id": "L1-0", "stage": "stage4_adversarial", "seed": 93000}
|
| 17 |
+
{"alert": {"alert_id": "A-EVAL-l-93001", "category": "c2_beacon", "severity": "high", "summary": "c2_beacon: 4 event(s); first=net.beacon", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T06:47:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "185.220.101.7", "interval_s": 60}}, {"log_id": "L1-1", "timestamp": "2026-04-25T13:14:00Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.183.125", "bytes_out": 92358}}, {"log_id": "L1-2", "timestamp": "2026-04-25T13:14:02Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "google.com"}}, {"log_id": "L1-3", "timestamp": "2026-04-25T13:14:04Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.80.164", "bytes_out": 75352}}], "ground_truth": "block_ip", "triggering_log_id": "L1-0", "stage": "stage4_adversarial", "seed": 93001}
|
| 18 |
+
{"alert": {"alert_id": "A-EVAL-l-93002", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 4 event(s); first=net.beacon", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T04:30:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "10.0.244.83", "interval_s": 60}}, {"log_id": "L1-1", "timestamp": "2026-04-25T08:04:00Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "internal.corp"}}, {"log_id": "L1-2", "timestamp": "2026-04-25T08:04:02Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "internal.corp"}}, {"log_id": "L1-3", "timestamp": "2026-04-25T08:04:04Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "alice", "src_ip": "10.0.0.243"}}], "ground_truth": "monitor", "triggering_log_id": "L1-0", "stage": "stage4_adversarial", "seed": 93002}
|
| 19 |
+
{"alert": {"alert_id": "A-EVAL-l-93003", "category": "c2_beacon", "severity": "high", "summary": "c2_beacon: 4 event(s); first=net.beacon", "host": "host-001", "user": "user-001"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T07:56:00Z", "source": "network", "event_type": "net.beacon", "fields": {"dst_ip": "185.220.101.7", "interval_s": 90}}, {"log_id": "L1-1", "timestamp": "2026-04-25T17:25:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "carol", "src_ip": "10.0.0.70"}}, {"log_id": "L1-2", "timestamp": "2026-04-25T17:25:02Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "dave", "src_ip": "10.0.0.170"}}, {"log_id": "L1-3", "timestamp": "2026-04-25T17:25:04Z", "source": "network", "event_type": "net.outbound_connection", "fields": {"dst_ip": "10.0.148.248", "bytes_out": 71310}}], "ground_truth": "block_ip", "triggering_log_id": "L1-0", "stage": "stage4_adversarial", "seed": 93003}
|
| 20 |
+
{"alert": {"alert_id": "A-EVAL-l-93004", "category": "benign_noise", "severity": "low", "summary": "benign_noise: 4 event(s); first=auth.login_success", "host": "host-001", "user": "dave"}, "events": [{"log_id": "L1-0", "timestamp": "2026-04-25T04:55:00Z", "source": "identity", "event_type": "auth.login_success", "fields": {"user": "dave", "src_ip": "10.0.0.144"}}, {"log_id": "L1-1", "timestamp": "2026-04-25T19:38:00Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "github.com"}}, {"log_id": "L1-2", "timestamp": "2026-04-25T19:38:02Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "github.com"}}, {"log_id": "L1-3", "timestamp": "2026-04-25T19:38:04Z", "source": "network", "event_type": "net.dns_query", "fields": {"domain": "internal.corp"}}], "ground_truth": "dismiss", "triggering_log_id": "L1-3", "stage": "stage4_adversarial", "seed": 93004}
|
data/sft_defender.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/sft_train.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
demo_app.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gradio "before vs after" demo for OpenSOC, mounted at /demo.
|
| 2 |
+
|
| 3 |
+
The demo is read-only and uses pre-baked examples from
|
| 4 |
+
`data/demo_examples.json` so the Space can stay on the free CPU tier.
|
| 5 |
+
Judges click "Next incident" and see, side by side:
|
| 6 |
+
|
| 7 |
+
* the SIEM alert + log window the defender is given,
|
| 8 |
+
* what zero-shot Qwen2.5-3B-Instruct says (usually wrong),
|
| 9 |
+
* what the OpenSOC GRPO-trained model says (usually right),
|
| 10 |
+
* the verifier's ground truth + the triggering log id.
|
| 11 |
+
|
| 12 |
+
This module is imported by `server.py` *after* `app_runtime.app` is built
|
| 13 |
+
and *before* uvicorn starts, so the Gradio routes are mounted on the same
|
| 14 |
+
FastAPI app that exposes /reset, /step, /state, /grade.
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
|
| 21 |
+
import gradio as gr
|
| 22 |
+
|
| 23 |
+
from app_runtime import app
|
| 24 |
+
from demo_data import (
|
| 25 |
+
empty_state_message,
|
| 26 |
+
format_alert_card,
|
| 27 |
+
format_response_card,
|
| 28 |
+
format_truth_card,
|
| 29 |
+
load_demo_examples,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# Resolve the demo data file relative to this module so the lookup does not
# depend on the process's current working directory.
_HERE = os.path.dirname(os.path.abspath(__file__))
_DEMO_PATH = os.path.join(_HERE, "data", "demo_examples.json")
# Loaded once, at import time; `_render` reads this module-level value on
# every call.
_EXAMPLES = load_demo_examples(_DEMO_PATH)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
HEADER_MD = """
|
| 39 |
+
# OpenSOC: SOC triage agent (before vs after RLVR self-play training)
|
| 40 |
+
|
| 41 |
+
Each example below is a real incident drawn from the frozen 200-incident
|
| 42 |
+
hold-out set. The same alert + log window is shown to two models:
|
| 43 |
+
|
| 44 |
+
- **Baseline**: zero-shot Qwen2.5-3B-Instruct, untouched.
|
| 45 |
+
- **OpenSOC**: the same model after SFT warm-start + GRPO curriculum on this env.
|
| 46 |
+
|
| 47 |
+
The verifier-grounded ground truth label and triggering log id are computed
|
| 48 |
+
deterministically by `verifier.py` and never depend on either model's text.
|
| 49 |
+
""".strip()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _render(idx: int):
    """Return the five markdown strings for the example selected by ``idx``.

    The tuple is ordered ``(alert, baseline, trained, truth, position)``.
    ``idx`` is reduced modulo the number of loaded examples, so any integer
    (including a negative one) selects a valid example. When no examples are
    loaded, the empty-state message fills the alert slot, the three other
    cards are empty strings and the position reads ``"0 / 0"``.
    """
    if not _EXAMPLES:
        return empty_state_message(), "", "", "", "0 / 0"

    total = len(_EXAMPLES)
    slot = idx % total
    example = _EXAMPLES[slot]

    alert_card = format_alert_card(example["alert"], example["events"])
    baseline_card = format_response_card(
        "Baseline (Qwen2.5-3B zero-shot)", example["baseline"]
    )
    trained_card = format_response_card("OpenSOC (after GRPO)", example["trained"])
    truth_card = format_truth_card(example)
    position_label = f"{slot + 1} / {total}"
    return alert_card, baseline_card, trained_card, truth_card, position_label
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Build the read-only demo UI. Components are declared in display order, and
# the three event hooks further down all write the same six outputs:
# the counter state plus the five strings returned by _render().
with gr.Blocks(
    title="OpenSOC: Defender LLM trained via GRPO",
    theme=gr.themes.Soft(),
    analytics_enabled=False,
) as demo:
    gr.Markdown(HEADER_MD)
    # Index of the example currently on screen. It is never wrapped here;
    # _render() applies the modulo, so the value may grow or go negative.
    counter = gr.State(value=0)

    # Navigation controls and the "i / N" position label.
    with gr.Row():
        prev_btn = gr.Button("Previous", variant="secondary")
        next_btn = gr.Button("Next incident", variant="primary")
        position = gr.Markdown("0 / 0")

    # Alert card, then the two model responses in one row, then the truth card.
    alert_md = gr.Markdown()
    with gr.Row():
        baseline_md = gr.Markdown()
        trained_md = gr.Markdown()
    truth_md = gr.Markdown()

    def _next(i):
        # New counter value followed by the five rendered strings.
        return i + 1, *_render(i + 1)

    def _prev(i):
        # Same as _next, stepping backwards.
        return i - 1, *_render(i - 1)

    next_btn.click(
        _next, inputs=[counter],
        outputs=[counter, alert_md, baseline_md, trained_md, truth_md, position],
    )
    prev_btn.click(
        _prev, inputs=[counter],
        outputs=[counter, alert_md, baseline_md, trained_md, truth_md, position],
    )
    # Populate the page with example 0 when it first loads.
    demo.load(
        lambda: (0, *_render(0)),
        outputs=[counter, alert_md, baseline_md, trained_md, truth_md, position],
    )

    gr.Markdown("---")
    # NOTE(review): the link target below is a bare https://github.com/
    # placeholder - confirm the real repository URL.
    gr.Markdown(
        "**Repo**: this Space is built from "
        "[github.com/.../opensoc](https://github.com/) — see the README for the "
        "OpenEnv manifest, training notebook, and 200-incident hold-out eval."
    )
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
# Mount the Gradio Blocks app on the same FastAPI server that exposes the
# OpenEnv API. After this import, navigating to `/demo` on the Space
# loads this UI, and `/reset`, `/step`, `/state`, `/grade`, `/tasks`,
# `/health` continue to work for the OpenEnv judge bot.
# The name `app` imported from app_runtime is rebound to the value returned
# by the mount call.
app = gr.mount_gradio_app(app, demo, path="/demo")


# Names exported by `from demo_app import *`: the mounted app and the Blocks UI.
__all__ = ["app", "demo"]
|
demo_data.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers for the Gradio demo: load the pre-baked before-vs-after JSON
|
| 2 |
+
file and render each section as readable markdown.
|
| 3 |
+
|
| 4 |
+
The demo is intentionally read-only and deterministic: judges click "Next
|
| 5 |
+
incident" and see one of N pre-computed (alert, baseline-response,
|
| 6 |
+
trained-response, ground-truth) tuples. The expensive part — running
|
| 7 |
+
the baseline and trained model on each incident — happens once on a GPU
|
| 8 |
+
in `eval.bake_demo` and is committed to `data/demo_examples.json`.
|
| 9 |
+
|
| 10 |
+
This file is small, fast, and carries no GPU dependency, so the deployed
|
| 11 |
+
HF Space can stay on the free CPU tier and still cold-start in <30s.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
from typing import Any, Dict, List
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def load_demo_examples(path: str) -> List[Dict[str, Any]]:
    """Read the pre-baked demo examples stored at ``path``.

    The file may contain either a bare JSON list of examples or a JSON
    object whose ``"examples"`` key holds that list.

    Args:
        path: Location of the JSON file.

    Returns:
        The list of example dicts. ``[]`` is returned when the file isn't
        present yet (so the Space still boots before the user has run
        training + bake_demo) and also when the decoded payload is not a
        list, so callers that index the result by integer never receive a
        dict, ``None`` or a scalar.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not os.path.exists(path):
        return []
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, dict) and "examples" in data:
        data = data["examples"]
    # Anything other than a list cannot be paged through by index; treat it
    # the same as "nothing baked yet" instead of failing later on lookup.
    if not isinstance(data, list):
        return []
    return data
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _format_event(e: Dict[str, Any]) -> str:
|
| 34 |
+
fields = e.get("fields") or {}
|
| 35 |
+
field_strs = []
|
| 36 |
+
for k, v in fields.items():
|
| 37 |
+
if v in (None, ""):
|
| 38 |
+
continue
|
| 39 |
+
field_strs.append(f"`{k}`={v}")
|
| 40 |
+
fields_md = " ".join(field_strs)
|
| 41 |
+
et = e.get("event_type", "?")
|
| 42 |
+
if hasattr(et, "value"):
|
| 43 |
+
et = et.value
|
| 44 |
+
return (
|
| 45 |
+
f"- `{e.get('log_id')}` · {e.get('timestamp')} · "
|
| 46 |
+
f"src=`{e.get('source')}` · type=`{et}` · {fields_md}"
|
| 47 |
+
).rstrip()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def format_alert_card(alert: Dict[str, Any], events: List[Dict[str, Any]]) -> str:
    """Build the markdown card for a SIEM alert and its log window.

    The card is a heading with the alert id, four bullets (category,
    severity, host / user, summary), a blank line, a "Log window" caption
    carrying the event count, and then one bullet per event.
    """
    alert_id = alert.get("alert_id", "?")
    summary = alert.get("summary", "")
    card = [f"### Alert `{alert_id}`"]
    card.append(f"- **category**: {alert.get('category')}")
    card.append(f"- **severity**: {alert.get('severity')}")
    card.append(f"- **host / user**: {alert.get('host')} / {alert.get('user')}")
    card.append(f"- **summary**: {summary}")
    card.append("")
    card.append(f"**Log window ({len(events)} event(s))**")
    card.extend(_format_event(event) for event in events)
    return "\n".join(card)
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def format_response_card(title: str, response: Dict[str, Any]) -> str:
    """Render a model response (parsed action + reward + breakdown).

    Args:
        title: Heading text for the card.
        response: Mapping read with ``.get`` for the keys ``action``,
            ``cited_log_id``, ``rationale``, ``reward``, ``correct``,
            ``raw_text`` and ``reward_breakdown``; every key is optional.

    Returns:
        A markdown string: heading, action / cited log / reward bullets,
        the rationale as a block quote, an italic reward-breakdown line
        when a breakdown is present, and a collapsible block with the raw
        model output when it is non-empty and differs from the rationale.
    """
    action = response.get("action", "—")
    cited = response.get("cited_log_id", "—")
    rationale = response.get("rationale", "")
    reward = response.get("reward")
    correct = response.get("correct")
    raw = response.get("raw_text", "")

    def _signed(value: Any) -> str:
        # Breakdown values that are not numbers (e.g. None or a string) are
        # shown as-is instead of raising inside the ":+.2f" format spec.
        if isinstance(value, (int, float)):
            return f"{value:+.2f}"
        return f"{value}"

    # Three-way verdict: truthy -> OK, exactly False -> MISS, otherwise unknown.
    verdict = "OK" if correct else ("MISS" if correct is False else "?")
    reward_str = f"{reward:+.2f}" if isinstance(reward, (int, float)) else "—"

    lines = [
        f"### {title}",
        f"- **action**: `{action}` ({verdict})",
        f"- **cited_log**: `{cited}`",
        f"- **reward**: `{reward_str}`",
        "",
        f"> {rationale}",
    ]
    breakdown = response.get("reward_breakdown") or {}
    if breakdown:
        bk = ", ".join(f"`{k}={_signed(v)}`" for k, v in breakdown.items())
        lines.append("")
        lines.append(f"_{bk}_")
    if raw and raw != rationale:
        lines.append("")
        lines.append("<details><summary>raw model output</summary>")
        lines.append("")
        lines.append("```")
        lines.append(raw.strip())
        lines.append("```")
        lines.append("</details>")
    return "\n".join(lines)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def format_truth_card(ex: Dict[str, Any]) -> str:
    """Summarise an example's verifier label on one markdown line.

    Shows the ground truth, triggering log id, stage and seed, each read
    from ``ex`` with ``.get`` (so a missing key prints as ``None``).
    """
    labelled_keys = (
        ("Ground truth", "ground_truth"),
        ("Triggering log", "triggering_log_id"),
        ("Stage", "stage"),
        ("Seed", "seed"),
    )
    segments = [f"**{label}**: `{ex.get(key)}`" for label, key in labelled_keys]
    return " · ".join(segments)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def empty_state_message() -> str:
    """Return the markdown shown when no demo examples are available.

    The text names the two ``eval.bake_demo`` invocations (placeholder and
    post-training) that populate ``data/demo_examples.json``.
    """
    heading = "### No demo examples baked yet"
    placeholder_cmd = "python -m eval.bake_demo --placeholder"
    trained_cmd = (
        "python -m eval.bake_demo --baseline unsloth/Qwen2.5-3B-Instruct "
        "--trained-adapter checkpoints/defender_grpo/stage4_adversarial/adapter"
    )
    instructions = (
        f"Run `{placeholder_cmd}` (no GPU required) "
        f"or, after training, `{trained_cmd}` "
        "to populate `data/demo_examples.json`."
    )
    return f"{heading}\n\n{instructions}"
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# Names exported by `from demo_data import *`; the `_format_event` helper is
# not part of this list.
__all__ = [
    "load_demo_examples",
    "format_alert_card",
    "format_response_card",
    "format_truth_card",
    "empty_state_message",
]
|
docs/__init__.py
ADDED
|
File without changes
|
docs/blog.md
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# OpenSOC: Teaching an LLM to Triage Cyberattacks via RLVR Self-Play
|
| 2 |
+
|
| 3 |
+
*A submission for the OpenEnv Hackathon, April 2026.*
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Why we built this
|
| 8 |
+
|
| 9 |
+
By the time a security operations center (SOC) tier-1 analyst sees an
|
| 10 |
+
alert, the attacker may have been inside for hours. Tier-1 triage is
|
| 11 |
+
mostly judgement: look at one alert plus the small log window around
|
| 12 |
+
it, and decide whether to dismiss, monitor, quarantine, block, or
|
| 13 |
+
escalate. It's also where SOCs are chronically understaffed — a
|
| 14 |
+
pipeline that quietly skips real attacks because the human is asleep is
|
| 15 |
+
not a hypothetical risk.
|
| 16 |
+
|
| 17 |
+
LLMs *should* be able to help. But training one to do this without
|
| 18 |
+
poisoning ourselves on the way is hard:
|
| 19 |
+
|
| 20 |
+
1. SOC datasets are private; published ones get stale within months.
|
| 21 |
+
2. Subjective rewards from another LLM let the trainee reward-hack the
|
| 22 |
+
judge, not the world.
|
| 23 |
+
3. Self-play between two LLMs (one playing attacker, one defender)
|
| 24 |
+
collapses to a degenerate equilibrium unless the reward is *grounded*.
|
| 25 |
+
|
| 26 |
+
OpenSOC is our attempt to address all three: a synthetic, self-play SOC triage
|
| 27 |
+
environment with a **deterministic, schema-driven verifier** as the only
|
| 28 |
+
source of reward signal — true Reinforcement Learning with Verifiable
|
| 29 |
+
Rewards (RLVR).
|
| 30 |
+
|
| 31 |
+
## What's in the env
|
| 32 |
+
|
| 33 |
+
The environment exposes the OpenEnv API (`/reset`, `/step`, `/state`,
|
| 34 |
+
`/grade`) plus a Gradio "before vs after" UI at `/demo`:
|
| 35 |
+
|
| 36 |
+
- **Schema**: a single `schema.py` defines `IncidentParams`, the only
|
| 37 |
+
shape the attacker can produce. Enums for event types and triage
|
| 38 |
+
actions, structured fields, no free-text "vibes".
|
| 39 |
+
- **Attacker**: must emit a structured `craft_incident` action. Free
|
| 40 |
+
text is allowed but ignored by the verifier.
|
| 41 |
+
- **Verifier**: `verifier.compute_ground_truth(params)` is a pure
|
| 42 |
+
function of the structured fields. Authentication failures from a
|
| 43 |
+
single source IP at >5 attempts/window get labeled `block_ip`,
|
| 44 |
+
malware_exec on a workstation gets `quarantine_host`, etc. This is
|
| 45 |
+
the RLVR substrate.
|
| 46 |
+
- **Plausibility check**: a separate `verifier.check_plausibility(params)`
|
| 47 |
+
rejects gibberish (e.g. malware exec with no malicious indicators),
|
| 48 |
+
and the attacker only earns reward when the incident passes
|
| 49 |
+
plausibility. This is what neutralises the classic self-play hack
|
| 50 |
+
where the attacker just emits noise to confuse the defender.
|
| 51 |
+
- **Defender**: chooses one triage action and cites the log_id that
|
| 52 |
+
drove the decision. Reward layers are: +1.0 for matching the verifier,
|
| 53 |
+
−1.0 for dismiss-on-malicious, −0.3 for over-react on benign, +0.1
|
| 54 |
+
bonus for the correct citation. Dismiss-on-malicious is the cardinal
|
| 55 |
+
failure mode in real SOCs and we grade it accordingly.
|
| 56 |
+
|
| 57 |
+
## How we trained
|
| 58 |
+
|
| 59 |
+
We fine-tuned Qwen2.5-3B-Instruct with LoRA (rank 16) on a single L4 GPU:
|
| 60 |
+
|
| 61 |
+
1. **SFT warm-start** on 600 (alert, log_window → action+citation+rationale)
|
| 62 |
+
gold examples (~12 minutes). This is just to push P(format-compliant
|
| 63 |
+
response) from 0% to ~95% so GRPO has signal.
|
| 64 |
+
2. **GRPO curriculum** for 200 steps × 4 stages (~3 hours):
|
| 65 |
+
- `stage1_basic`: single-event templates (one obvious benign + one
|
| 66 |
+
obvious malicious per category).
|
| 67 |
+
- `stage2_multi`: malicious signal spread over a multi-event window.
|
| 68 |
+
- `stage3_mixed`: benign noise interleaved with malicious events,
|
| 69 |
+
stress-tests false-positive suppression.
|
| 70 |
+
- `stage4_adversarial`: attacker-controlled distribution.
|
| 71 |
+
3. **Eval** on a frozen 200-incident hold-out set (seed bands disjoint
|
| 72 |
+
from training, see `tasks/registry.py`).
|
| 73 |
+
|
| 74 |
+
Total compute: ~$3 on HF Jupyter L4. All scripts are turn-key
|
| 75 |
+
(`scripts/run_full_pipeline.sh`) and the same notebook produces a
|
| 76 |
+
`data/demo_examples.json` of 50 before-vs-after pairs that the deployed
|
| 77 |
+
Space serves on the free CPU tier.
|
| 78 |
+
|
| 79 |
+
## What's actually new here
|
| 80 |
+
|
| 81 |
+
Most "self-play LLM" demos either (a) train a judge LLM and call its
|
| 82 |
+
score the reward, or (b) hand-code the reward but abandon self-play.
|
| 83 |
+
OpenSOC keeps both self-play and a verifiable reward:
|
| 84 |
+
|
| 85 |
+
- **Self-play** is preserved because the attacker is a real model
|
| 86 |
+
emitting real (structured) parameters.
|
| 87 |
+
- **The reward is verifiable** because it's computed from the structured
|
| 88 |
+
parameters, not the attacker's narrative — so the attacker cannot
|
| 89 |
+
reward-hack by writing scary text.
|
| 90 |
+
|
| 91 |
+
The trick that makes this work is the *plausibility check*: a separate,
|
| 92 |
+
deterministic gate on whether the attacker's params even look like a
|
| 93 |
+
real incident. This is what stops the attacker from exploring
|
| 94 |
+
adversarial null-spaces. We tested it with 21 anti-hack regression
|
| 95 |
+
tests in `tests/test_rubric.py`.
|
| 96 |
+
|
| 97 |
+
## Headline numbers
|
| 98 |
+
|
| 99 |
+
| Metric | Baseline (zero-shot) | OpenSOC (after GRPO) |
|
| 100 |
+
| --------------------------------- | -------------------: | -------------------: |
|
| 101 |
+
| Macro F1 over 200 hold-out | [PEND] | [PEND] |
|
| 102 |
+
| Dismiss-on-malicious rate | [PEND] | [PEND] |
|
| 103 |
+
| Over-react on benign | [PEND] | [PEND] |
|
| 104 |
+
|
| 105 |
+
(*Numbers will be filled in after the GPU run; placeholder demo data
|
| 106 |
+
already shows the qualitative shape: the always-dismiss baseline gets
|
| 107 |
+
~15% accuracy, an oracle-equivalent trained model approaches 100%.*)
|
| 108 |
+
|
| 109 |
+
The four diagnostic plots live in `eval/results/`:
|
| 110 |
+
|
| 111 |
+
- `bar_dismiss_on_malicious.png` — the headline plot.
|
| 112 |
+
- `bar_macro_f1.png`
|
| 113 |
+
- `confusion_baseline_zero_shot.png` and `confusion_opensoc_grpo.png`
|
| 114 |
+
- `training_curves.png` — reward across the four curriculum stages.
|
| 115 |
+
|
| 116 |
+
## What's next
|
| 117 |
+
|
| 118 |
+
- Add a *third* role (the "investigator") that actively queries log
|
| 119 |
+
sources rather than receiving a pre-baked log window.
|
| 120 |
+
- Train on real SIEM exports (CSE-CIC-IDS, Splunk Boss-of-the-SOC) and
|
| 121 |
+
use the synthetic env only as a curriculum bootstrap.
|
| 122 |
+
- Plug the deployed Space into a live SOAR (Tines / Shuffle) and watch
|
| 123 |
+
it triage real-world tier-1 traffic.
|
| 124 |
+
|
| 125 |
+
## Try it
|
| 126 |
+
|
| 127 |
+
- HF Space: `https://huggingface.co/spaces/<USER>/opensoc-env`
|
| 128 |
+
(`/demo` for the human-readable before-vs-after UI)
|
| 129 |
+
- Repo: `https://huggingface.co/<USER>/opensoc-env`
|
| 130 |
+
- Plan and design notes: see `README.md` in the repo.
|
| 131 |
+
|
| 132 |
+
---
|
| 133 |
+
|
| 134 |
+
*Written for the OpenEnv Hackathon, April 2026. Code under BSD-3.*
|
docs/build_slides.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build the 5-slide submission deck (`docs/slides.pdf`).
|
| 2 |
+
|
| 3 |
+
Uses matplotlib's PdfPages to render five 16:9 slides:
|
| 4 |
+
|
| 5 |
+
1. Title — OpenSOC: Self-Play SOC Triage
|
| 6 |
+
2. Problem — Why this matters; cardinal failure mode.
|
| 7 |
+
3. Env design — Architecture diagram (text); RLVR insight.
|
| 8 |
+
4. Results — Headline plots embedded.
|
| 9 |
+
5. Demo + links — Space URL, repo URL, video URL.
|
| 10 |
+
|
| 11 |
+
Run::
|
| 12 |
+
|
| 13 |
+
python -m docs.build_slides --out docs/slides.pdf
|
| 14 |
+
|
| 15 |
+
The script also reads `eval/results/summary.json` and the four PNGs so
|
| 16 |
+
the deck stays in sync with the latest eval run automatically.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from __future__ import annotations
|
| 20 |
+
|
| 21 |
+
import argparse
|
| 22 |
+
import json
|
| 23 |
+
import os
|
| 24 |
+
import sys
|
| 25 |
+
from typing import Any, Dict, List, Optional
|
| 26 |
+
|
| 27 |
+
import matplotlib
|
| 28 |
+
|
| 29 |
+
matplotlib.use("Agg")
|
| 30 |
+
import matplotlib.image as mpimg
|
| 31 |
+
import matplotlib.pyplot as plt
|
| 32 |
+
from matplotlib.backends.backend_pdf import PdfPages
|
| 33 |
+
|
| 34 |
+
# Directory containing this script, and its parent directory. main() joins
# its path arguments onto _REPO.
_HERE = os.path.dirname(os.path.abspath(__file__))
_REPO = os.path.dirname(_HERE)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _new_slide(title: str, subtitle: str | None = None):
    """Create a blank content slide with a title, optional subtitle and rule.

    Returns the ``(fig, ax)`` pair from ``plt.subplots``; the axes frame is
    hidden and all text is positioned in axes coordinates.
    """
    fig, ax = plt.subplots(figsize=(13.33, 7.5))  # 16:9 at ~96 DPI
    ax.set_axis_off()

    in_axes = {"transform": ax.transAxes}
    ax.text(0.05, 0.92, title, fontsize=32, fontweight="bold", **in_axes)
    if subtitle:
        ax.text(0.05, 0.86, subtitle, fontsize=18, color="#444", **in_axes)
    # Thin grey divider drawn under the heading area.
    ax.plot([0.05, 0.95], [0.83, 0.83], color="#cccccc", linewidth=1.0, **in_axes)
    return fig, ax
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def _bullets(ax, lines: List[str], y_start: float = 0.74, dy: float = 0.07, fontsize: int = 18):
|
| 58 |
+
for i, line in enumerate(lines):
|
| 59 |
+
ax.text(
|
| 60 |
+
0.07, y_start - i * dy, "• " + line,
|
| 61 |
+
fontsize=fontsize, transform=ax.transAxes,
|
| 62 |
+
)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _maybe_add_image(ax, img_path: str, bbox: tuple[float, float, float, float]):
|
| 66 |
+
if not os.path.exists(img_path):
|
| 67 |
+
x, y, w, h = bbox
|
| 68 |
+
ax.text(
|
| 69 |
+
x + w / 2, y + h / 2, "(plot pending)\n" + os.path.basename(img_path),
|
| 70 |
+
fontsize=12, color="#888", ha="center", va="center",
|
| 71 |
+
transform=ax.transAxes,
|
| 72 |
+
)
|
| 73 |
+
return
|
| 74 |
+
img = mpimg.imread(img_path)
|
| 75 |
+
ax_img = ax.figure.add_axes(bbox) # absolute coords on the figure
|
| 76 |
+
ax_img.imshow(img)
|
| 77 |
+
ax_img.set_axis_off()
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def _read_summary(path: str) -> Optional[Dict[str, Any]]:
|
| 81 |
+
if not os.path.exists(path):
|
| 82 |
+
return None
|
| 83 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 84 |
+
data = json.load(f)
|
| 85 |
+
by_label = {row["label"]: row for row in data}
|
| 86 |
+
return by_label
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def slide_title(pdf):
    """Render the dark title slide and append it to ``pdf``."""
    fig, ax = plt.subplots(figsize=(13.33, 7.5))
    ax.set_axis_off()
    fig.patch.set_facecolor("#0b1220")

    # (y position, text, text() styling), listed top to bottom.
    banner = (
        (0.62, "OpenSOC", {"fontsize": 72, "color": "white", "fontweight": "bold"}),
        (
            0.50,
            "RLVR self-play environment for SOC triage agents",
            {"fontsize": 22, "color": "#bbbbbb"},
        ),
        (0.38, "OpenEnv Hackathon, April 2026", {"fontsize": 16, "color": "#888"}),
    )
    for y, caption, style in banner:
        ax.text(0.5, y, caption, ha="center", transform=ax.transAxes, **style)

    pdf.savefig(fig)
    plt.close(fig)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def slide_problem(pdf):
    """Render the "The problem" slide and append it to ``pdf``."""
    heading = "The problem"
    tagline = (
        "Tier-1 SOC triage is judgement work, and the failure mode that hurts is dismiss-on-malicious."
    )
    points = [
        "SOCs are chronically understaffed; analysts skim hundreds of alerts/shift.",
        "Real attackers blend in for hours before tier-2 even sees them.",
        "An LLM that automates triage would help — IF its reward signal is honest.",
        "Two classic traps: (1) train on a learned judge → reward-hack the judge.",
        " (2) self-play between two LLMs → degenerate equilibrium.",
        "OpenSOC: deterministic verifier + plausibility check = RLVR-clean self-play.",
    ]

    fig, ax = _new_slide(heading, tagline)
    _bullets(ax, points)
    pdf.savefig(fig)
    plt.close(fig)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def slide_env(pdf):
    """Render the "Environment design" slide and append it to ``pdf``."""
    heading = "Environment design"
    tagline = (
        "An attacker LLM crafts structured incidents; a defender LLM triages; verifier grounds the reward."
    )
    points = [
        "schema.py — single source of truth for events, actions, incident params.",
        "verifier.compute_ground_truth(params) — pure function over structured fields.",
        "verifier.check_plausibility(params) — gate that rejects gibberish before reward.",
        "rubric.score_defender / score_attacker — layered, anti-hack-tested rewards.",
        "OpenEnv-compliant API: /reset, /step, /state, /grade, /tasks, /health.",
        "Curriculum: 4 stages (basic → multi-event → mixed → adversarial).",
        "FastAPI + Gradio /demo on the same Space; Dockerised; runs on free CPU tier.",
    ]

    fig, ax = _new_slide(heading, tagline)
    _bullets(ax, points)
    pdf.savefig(fig)
    plt.close(fig)
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
def slide_results(pdf, summary_path: str, results_dir: str):
    """Render the "Headline results" slide and append it to ``pdf``.

    Metrics come from the summary file at ``summary_path``. The baseline row
    is ``baseline_zero_shot`` (falling back to ``always_dismiss``) and the
    trained row is ``opensoc_grpo`` (falling back to ``verifier_oracle``).
    If neither row is available a single "pending" bullet is shown. Two
    plots from ``results_dir`` are placed side by side below the bullets.
    """
    fig, ax = _new_slide(
        "Headline results",
        "200-incident frozen hold-out; seeds disjoint from training.",
    )

    by_label = _read_summary(summary_path) or {}
    base = by_label.get("baseline_zero_shot") or by_label.get("always_dismiss") or {}
    trained = by_label.get("opensoc_grpo") or by_label.get("verifier_oracle") or {}

    def metric(row, key):
        # A metric missing from the row is rendered as "nan".
        return f"{row.get(key, float('nan')):.3f}"

    if base or trained:
        rows = [
            f"Baseline F1: {metric(base, 'macro_f1')}",
            f"OpenSOC F1: {metric(trained, 'macro_f1')}",
            f"Dismiss-on-malicious: {metric(base, 'dismiss_on_malicious')}"
            f" → {metric(trained, 'dismiss_on_malicious')}",
            f"Over-react rate: {metric(base, 'over_react_rate')}"
            f" → {metric(trained, 'over_react_rate')}",
        ]
    else:
        rows = ["(numbers will be filled in after the GPU run)"]
    _bullets(ax, rows, y_start=0.74, dy=0.06, fontsize=16)

    # Left and right halves of the lower part of the slide.
    plots = (
        ("bar_dismiss_on_malicious.png", (0.07, 0.06, 0.42, 0.36)),
        ("training_curves.png", (0.52, 0.06, 0.42, 0.36)),
    )
    for filename, box in plots:
        _maybe_add_image(ax, os.path.join(results_dir, filename), bbox=box)

    pdf.savefig(fig)
    plt.close(fig)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def slide_demo(pdf):
    """Render the "Demo & links" slide and append it to ``pdf``.

    The URLs contain ``<USER>`` / ``<UNLISTED-ID>`` placeholders that are
    drawn verbatim.
    """
    heading = "Demo & links"
    tagline = "Click /demo on the Space to see live before-vs-after triage."
    links = [
        "HF Space: https://huggingface.co/spaces/<USER>/opensoc-env",
        " UI: https://<USER>-opensoc-env.hf.space/demo",
        "Repo: https://huggingface.co/<USER>/opensoc-env",
        "Blog: https://huggingface.co/blog/<USER>/opensoc-rlvr-soc-triage",
        "Video: https://youtu.be/<UNLISTED-ID>",
        "All four eval PNGs are committed in eval/results/.",
        "Total compute for the trained checkpoint: ~$3 on HF Jupyter L4.",
    ]

    fig, ax = _new_slide(heading, tagline)
    _bullets(ax, links)
    pdf.savefig(fig)
    plt.close(fig)
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def main() -> None:
    """Parse the command line and write the five-slide PDF deck.

    ``--out``, ``--summary`` and ``--results-dir`` are each joined onto the
    repository root (``_REPO``). The output directory is created if needed.
    """
    parser = argparse.ArgumentParser()
    option_defaults = (
        ("--out", "docs/slides.pdf"),
        ("--summary", "eval/results/summary.json"),
        ("--results-dir", "eval/results"),
    )
    for flag, default in option_defaults:
        parser.add_argument(flag, default=default)
    args = parser.parse_args()

    out_path, summary_path, results_dir = (
        os.path.join(_REPO, relative)
        for relative in (args.out, args.summary, args.results_dir)
    )
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with PdfPages(out_path) as pdf:
        slide_title(pdf)
        slide_problem(pdf)
        slide_env(pdf)
        slide_results(pdf, summary_path, results_dir)
        slide_demo(pdf)
    print(f"Wrote {out_path}")


if __name__ == "__main__":
    main()
|
docs/slides.pdf
ADDED
|
Binary file (85.2 kB). View file
|
|
|
docs/video_script.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 90-second YouTube walkthrough — OpenSOC
|
| 2 |
+
|
| 3 |
+
Total: **90 seconds**, broken into four beats (15 s, then three of 25 s). Record at 1080p,
|
| 4 |
+
unlisted, no music (optional 5-second outro card).
|
| 5 |
+
|
| 6 |
+
## Beat 1 — Problem (0:00–0:15)
|
| 7 |
+
|
| 8 |
+
**Visual**: cursor blinking on a SOC dashboard with a queue of unread alerts;
|
| 9 |
+
zoom into one alert that says `Authentication failures (8 attempts) from
|
| 10 |
+
198.51.100.7`.
|
| 11 |
+
|
| 12 |
+
**Voiceover (suggested)**:
|
| 13 |
+
|
| 14 |
+
> "By the time a tier-1 analyst sees an alert like this, the attacker may
|
| 15 |
+
> have been inside for hours. Most SOCs are understaffed, and a real
|
| 16 |
+
> attack that gets dismissed by a tired human is invisible until it's
|
| 17 |
+
> too late."
|
| 18 |
+
|
| 19 |
+
## Beat 2 — Env demo (0:15–0:40)
|
| 20 |
+
|
| 21 |
+
**Visual**: the deployed `https://...hf.space/demo` page. Click
|
| 22 |
+
"Next incident" three times; pause briefly on each example.
|
| 23 |
+
|
| 24 |
+
**Voiceover**:
|
| 25 |
+
|
| 26 |
+
> "OpenSOC is an OpenEnv environment where the same alert is shown to two
|
| 27 |
+
> models. On the left: zero-shot Qwen2.5-3B; on the right, the same model
|
| 28 |
+
> after we trained it inside this environment with GRPO. The verifier in
|
| 29 |
+
> the middle decides what 'right' is — deterministically, from the
|
| 30 |
+
> structured incident parameters, never from any text the attacker
|
| 31 |
+
> writes."
|
| 32 |
+
|
| 33 |
+
## Beat 3 — Before vs after (0:40–1:05)
|
| 34 |
+
|
| 35 |
+
**Visual**: split screen — left half shows the eval bar chart
|
| 36 |
+
`bar_dismiss_on_malicious.png`; right half shows the confusion matrix
|
| 37 |
+
`confusion_opensoc_grpo.png`.
|
| 38 |
+
|
| 39 |
+
**Voiceover**:
|
| 40 |
+
|
| 41 |
+
> "On a 200-incident hold-out, the baseline dismisses real attacks at
|
| 42 |
+
> [BASELINE]%. After SFT warm-start plus GRPO across four curriculum
|
| 43 |
+
> stages, dismiss-on-malicious drops to [TRAINED]% — and macro F1 lifts
|
| 44 |
+
> from [BASELINE_F1] to [TRAINED_F1]. Over-reaction on benign traffic
|
| 45 |
+
> didn't get worse."
|
| 46 |
+
|
| 47 |
+
## Beat 4 — Why RLVR (1:05–1:30)
|
| 48 |
+
|
| 49 |
+
**Visual**: a single code editor pane showing
|
| 50 |
+
`verifier.compute_ground_truth(params)` and
|
| 51 |
+
`verifier.check_plausibility(params)`; highlight that both are pure
|
| 52 |
+
functions of the *structured* params.
|
| 53 |
+
|
| 54 |
+
**Voiceover**:
|
| 55 |
+
|
| 56 |
+
> "The reason this works is that the reward is computed from the structured
|
| 57 |
+
> attacker parameters, not from any narrative. The plausibility checker
|
| 58 |
+
> blocks the trivial reward hack of just emitting noise. That's what makes
|
| 59 |
+
> this RLVR — verifiable rewards, no learned judge to fool. Code, eval
|
| 60 |
+
> set, training notebook and a $3 GPU recipe are all in the repo."
|
| 61 |
+
|
| 62 |
+
## Closing card (1:30)
|
| 63 |
+
|
| 64 |
+
Title: **OpenSOC — RLVR self-play SOC triage**
|
| 65 |
+
URL: `huggingface.co/spaces/<USER>/opensoc-env`
|
| 66 |
+
GitHub-style logo: optional
|
| 67 |
+
|
| 68 |
+
## Recording tips
|
| 69 |
+
|
| 70 |
+
- Use OBS or Loom; export as 1080p mp4.
|
| 71 |
+
- Pre-load the Space on `/demo` and click "Next incident" once before
|
| 72 |
+
recording so the first paint isn't cold.
|
| 73 |
+
- Keep terminal font size large; favour Bear Notes / OBS overlays for
|
| 74 |
+
the voiceover beats over fullscreen code.
|
| 75 |
+
- Upload as **unlisted**; share the URL in the README and the HF blog.
|
env.py
ADDED
|
@@ -0,0 +1,423 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
env.py — `OpenSOCEnv`, the two-role gym-style environment.
|
| 3 |
+
|
| 4 |
+
Lifecycle
|
| 5 |
+
---------
|
| 6 |
+
An OpenSOC episode has *exactly two turns*:
|
| 7 |
+
|
| 8 |
+
Turn 1 (attacker): observation has role="attacker" with `attacker_brief`.
|
| 9 |
+
The agent submits `craft_incident` with structured
|
| 10 |
+
params. The env validates the params, runs the
|
| 11 |
+
plausibility checker, and computes ground truth.
|
| 12 |
+
|
| 13 |
+
Turn 2 (defender): observation has role="defender" with the materialized
|
| 14 |
+
`alert` and `log_window`. The agent submits
|
| 15 |
+
`submit_triage`. The env scores both sides and
|
| 16 |
+
terminates the episode.
|
| 17 |
+
|
| 18 |
+
In `defender_only` mode, the env auto-generates the incident with
|
| 19 |
+
`generator.generate_incident` and skips straight to turn 2 — useful for
|
| 20 |
+
SFT, eval, and smoke tests.
|
| 21 |
+
|
| 22 |
+
Mode selection happens via `OpenSOCEnv(mode=...)` or the `?mode=` query
|
| 23 |
+
param on `/reset`.
|
| 24 |
+
|
| 25 |
+
Anti-hack invariants
|
| 26 |
+
--------------------
|
| 27 |
+
1. The ground-truth label that drives defender reward is computed by
|
| 28 |
+
`verifier.compute_ground_truth(params)`, never read from `narrative`
|
| 29 |
+
or `target_label`.
|
| 30 |
+
2. The attacker's reward is gated on `verifier.check_plausibility(params)`.
|
| 31 |
+
3. Schema validation (pydantic) errors → schema_violation=True →
|
| 32 |
+
attacker reward floor of -0.5, *no* defender turn (env auto-dismisses).
|
| 33 |
+
"""
|
| 34 |
+
|
| 35 |
+
from __future__ import annotations
|
| 36 |
+
|
| 37 |
+
import time
|
| 38 |
+
import uuid
|
| 39 |
+
from typing import Any, Dict, List, Literal, Optional
|
| 40 |
+
|
| 41 |
+
from pydantic import BaseModel, Field, ValidationError
|
| 42 |
+
|
| 43 |
+
from generator import generate_incident, make_alert
|
| 44 |
+
from rubric import score_attacker, score_defender
|
| 45 |
+
from schema import (
|
| 46 |
+
Action,
|
| 47 |
+
Alert,
|
| 48 |
+
CraftIncident,
|
| 49 |
+
Event,
|
| 50 |
+
IncidentParams,
|
| 51 |
+
SubmitTriage,
|
| 52 |
+
TriageAction,
|
| 53 |
+
)
|
| 54 |
+
from tasks.registry import STAGE_REGISTRY
|
| 55 |
+
from verifier import check_plausibility, compute_ground_truth
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
Role = Literal["attacker", "defender"]
|
| 59 |
+
Mode = Literal["self_play", "defender_only"]
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
# Public observation / state types
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
class AttackerBrief(BaseModel):
    """What the env tells the attacker to produce."""
    # Label the attacker is asked to aim for. `OpenSOCEnv.reset` samples it
    # from the stage's label distribution; it is only a hint.
    target_label: TriageAction
    # Copied from STAGE_REGISTRY[task_id]["difficulty"] by `OpenSOCEnv.reset`.
    difficulty: str
    # Incident category hint; `OpenSOCEnv.reset` always passes "any".
    category_hint: str = "any"
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class Observation(BaseModel):
    """Per-turn observation visible to the agent."""
    # Which role the observation is addressed to.
    role: Role
    # Materialized alert and its events. Filled on defender observations;
    # left at the defaults on the attacker's first observation and on an
    # aborted attacker turn.
    alert: Optional[Alert] = None
    log_window: List[Event] = Field(default_factory=list)
    # Only set on the attacker's first observation in self_play mode.
    attacker_brief: Optional[AttackerBrief] = None
    # Number of actions applied so far (0 right after reset).
    step: int = 0
    max_steps: int = 2
    # Human-readable message describing the outcome of the previous action.
    last_action_feedback: str = ""
    # True once the episode has terminated.
    done: bool = False
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class EpisodeState(BaseModel):
    """Full internal state returned by /state."""
    # --- episode identity and progress ---
    task_id: str
    mode: Mode
    step: int = 0
    max_steps: int = 2
    done: bool = False
    # Role expected to act next; left as "defender" once the episode is done.
    role: Role
    # --- attacker side ---
    attacker_brief: Optional[AttackerBrief] = None
    # --- materialized incident shown to the defender ---
    incident_alert: Optional[Alert] = None
    incident_log_window: List[Event] = Field(default_factory=list)
    # --- verifier outputs (check_plausibility / compute_ground_truth) ---
    triggering_log_id: Optional[str] = None
    plausible: Optional[bool] = None
    plausibility_reason: str = ""
    # Set when the attacker turn is aborted (missing or invalid craft_incident).
    schema_violation: bool = False
    ground_truth: Optional[TriageAction] = None
    # --- scoring ---
    defender_action: Optional[SubmitTriage] = None
    defender_reward: Optional[float] = None
    defender_breakdown: Dict[str, float] = Field(default_factory=dict)
    attacker_reward: Optional[float] = None
    attacker_breakdown: Dict[str, float] = Field(default_factory=dict)
    # defender_reward + attacker_reward (attacker reward alone on abort).
    cumulative_reward: float = 0.0
    # time.time() at the moment the state object was created.
    started_at: float = Field(default_factory=time.time)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# ---------------------------------------------------------------------------
|
| 111 |
+
# Environment
|
| 112 |
+
# ---------------------------------------------------------------------------
|
| 113 |
+
|
| 114 |
+
class OpenSOCEnv:
    """Two-role SOC triage environment with deterministic verifier rewards.

    An episode has at most ``MAX_STEPS`` turns. In ``self_play`` mode the
    attacker acts first (``craft_incident``) and the defender second
    (``submit_triage``). In ``defender_only`` mode the incident comes from
    ``generator.generate_incident`` and the defender's action is the only
    turn.
    """

    MAX_STEPS = 2

    def __init__(
        self,
        task_id: str = "stage1_basic",
        mode: Mode = "self_play",
        seed: int = 0,
    ):
        """Create an environment bound to one curriculum stage.

        Args:
            task_id: Key into ``STAGE_REGISTRY``.
            mode: ``"self_play"`` or ``"defender_only"``.
            seed: Base seed; combined with the episode index in ``reset``.

        Raises:
            ValueError: If ``task_id`` or ``mode`` is not recognised.
        """
        if task_id not in STAGE_REGISTRY:
            raise ValueError(
                f"Unknown task '{task_id}'. Choose from: {list(STAGE_REGISTRY)}"
            )
        if mode not in ("self_play", "defender_only"):
            raise ValueError(f"Unknown mode {mode!r}")
        self.task_id = task_id
        self.mode: Mode = mode
        self.seed = seed
        self._state: Optional[EpisodeState] = None
        self._episode_idx = 0

    # ------------------------------------------------------------------
    # Gym-style API: reset / step / state / grade
    # ------------------------------------------------------------------

    def reset(self) -> Observation:
        """Start a fresh episode and return the first observation."""
        self._episode_idx += 1
        # Per-episode seed derived from the base seed, the running episode
        # counter and the stage's seed offset.
        episode_seed = (
            self.seed * 100_000
            + self._episode_idx
            + STAGE_REGISTRY[self.task_id]["seed_offset"]
        )

        if self.mode == "defender_only":
            params = generate_incident(self.task_id, seed=episode_seed)
            return self._materialize_for_defender(params, started_role="defender")

        # self_play: the next /step must be the attacker's craft_incident.
        # We seed the brief with a target label that's representative of the
        # stage's distribution, but the attacker is free to ignore it.
        target_label = self._sample_target_label_for_brief(episode_seed)
        brief = AttackerBrief(
            target_label=target_label,
            difficulty=STAGE_REGISTRY[self.task_id]["difficulty"],
            category_hint="any",
        )
        self._state = EpisodeState(
            task_id=self.task_id,
            mode=self.mode,
            role="attacker",
            attacker_brief=brief,
            max_steps=self.MAX_STEPS,
        )
        return Observation(
            role="attacker",
            attacker_brief=brief,
            step=0,
            max_steps=self.MAX_STEPS,
            last_action_feedback=(
                f"[stage={self.task_id}] Craft an incident whose ground truth "
                f"is action={target_label.value}. Ignore the target_label hint "
                f"if you can fool the defender harder with a different one."
            ),
        )

    def step(self, action: Action) -> tuple[Observation, float, bool, dict]:
        """Apply one agent action; return (obs, reward, done, info).

        Raises:
            RuntimeError: If called before ``reset`` or after the episode ended.
        """
        if self._state is None:
            raise RuntimeError("Call reset() before step()")
        if self._state.done:
            raise RuntimeError("Episode is done. Call reset() to start a new one.")

        s = self._state
        s.step += 1

        # Dispatch on whose turn it is according to the stored state.
        if s.role == "attacker":
            return self._step_attacker(action)
        return self._step_defender(action)

    def state(self) -> Dict[str, Any]:
        """Return the full internal state (empty dict before the first reset)."""
        if self._state is None:
            return {}
        return self._state.model_dump(mode="json")

    def grade(self) -> float:
        """Return a normalized [0, 1] score for the just-finished episode.

        Returns 0.0 when no episode has finished or no defender reward was
        recorded.
        """
        s = self._state
        if s is None or not s.done:
            return 0.0
        # Normalize defender reward to [0, 1] using the manifest range.
        # Defender reward range is [-1.0, 1.1] (max correct + bonus).
        if s.defender_reward is None:
            return 0.0
        lo, hi = -1.0, 1.1
        clamped = max(lo, min(hi, s.defender_reward))
        return float((clamped - lo) / (hi - lo))

    # ------------------------------------------------------------------
    # Attacker turn
    # ------------------------------------------------------------------

    def _step_attacker(self, action: Action) -> tuple[Observation, float, bool, dict]:
        """Validate the attacker's incident and hand the turn to the defender.

        The attacker's reward is not known yet (it depends on the defender's
        answer), so this turn always returns reward 0.0 unless it is aborted.
        """
        s = self._state
        ci: Optional[CraftIncident] = action.craft_incident
        if ci is None:
            # Treated as a schema violation: -0.5 attacker reward, episode
            # ends immediately because we have nothing to show the defender.
            return self._abort_attacker_turn(
                "Attacker turn requires craft_incident; got something else."
            )

        try:
            params = IncidentParams(
                target_label=ci.target_label,
                category=ci.category,
                events=ci.events,
                narrative=ci.narrative,
            )
        except ValidationError as exc:
            return self._abort_attacker_turn(f"Schema violation: {exc}")

        plausible, reason, triggering_log_id = check_plausibility(params)
        gt_label, _ = compute_ground_truth(params)

        # Record the verifier's verdicts and flip the turn to the defender.
        s.role = "defender"
        s.plausible = plausible
        s.plausibility_reason = reason
        s.ground_truth = gt_label
        s.triggering_log_id = triggering_log_id

        alert = make_alert(params, alert_id=f"A-{uuid.uuid4().hex[:8]}")
        s.incident_alert = alert
        s.incident_log_window = list(params.events)

        feedback = (
            f"Attacker turn complete. plausible={plausible} ({reason}). "
            "Defender will now triage."
        )

        obs = Observation(
            role="defender",
            alert=alert,
            log_window=list(params.events),
            step=s.step,
            max_steps=self.MAX_STEPS,
            last_action_feedback=feedback,
            done=False,
        )
        info = {
            "role_just_acted": "attacker",
            "plausible": plausible,
            "plausibility_reason": reason,
            "ground_truth_hidden_from_defender": gt_label.value,
            "triggering_log_id": triggering_log_id,
        }
        return obs, 0.0, False, info

    def _abort_attacker_turn(self, reason: str) -> tuple[Observation, float, bool, dict]:
        """End the episode after an unusable attacker action.

        Marks the state as a schema violation, scores the attacker through
        ``score_attacker(schema_violation=True, ...)``, records a defender
        reward of 0.0 and returns a terminal observation.
        """
        s = self._state
        s.schema_violation = True
        s.plausible = False
        s.plausibility_reason = reason
        attacker_reward, attacker_bd = score_attacker(
            plausible=False, schema_violation=True,
            defender_correct=False, novelty=0.0,
        )
        s.attacker_reward = attacker_reward
        s.attacker_breakdown = attacker_bd
        s.defender_reward = 0.0
        s.cumulative_reward = attacker_reward
        s.done = True
        s.role = "defender"
        return (
            Observation(
                role="defender",
                step=s.step,
                max_steps=self.MAX_STEPS,
                last_action_feedback=f"Episode aborted: {reason}",
                done=True,
            ),
            attacker_reward,
            True,
            {
                "role_just_acted": "attacker",
                "schema_violation": True,
                "attacker_reward": attacker_reward,
                "attacker_breakdown": attacker_bd,
            },
        )

    # ------------------------------------------------------------------
    # Defender turn
    # ------------------------------------------------------------------

    def _step_defender(self, action: Action) -> tuple[Observation, float, bool, dict]:
        """Score the defender's triage (and the attacker, in self_play) and finish."""
        s = self._state
        st = action.submit_triage
        if st is None:
            # Treat as a missed-malicious-equivalent: penalize by acting as
            # if the defender chose 'dismiss' and cited the placeholder log
            # id "L1-0".
            st = SubmitTriage(action=TriageAction.DISMISS, cited_log_id="L1-0")

        # Resolve the fallbacks once so that the reward, the correctness flag
        # and the feedback text all refer to the same ground truth.
        ground_truth = s.ground_truth or TriageAction.DISMISS
        triggering_log_id = s.triggering_log_id or "L1-0"

        defender_reward, defender_bd = score_defender(
            action=st.action,
            ground_truth=ground_truth,
            triggering_log_id=triggering_log_id,
            cited_log_id=st.cited_log_id,
        )
        defender_correct = st.action == ground_truth

        attacker_reward, attacker_bd = 0.0, {}
        if s.mode == "self_play":
            attacker_reward, attacker_bd = score_attacker(
                plausible=bool(s.plausible),
                schema_violation=False,
                defender_correct=defender_correct,
                novelty=0.0,  # filled in by the trainer if it tracks batches
            )

        s.defender_action = st
        s.defender_reward = defender_reward
        s.defender_breakdown = defender_bd
        s.attacker_reward = attacker_reward
        s.attacker_breakdown = attacker_bd
        s.cumulative_reward = defender_reward + attacker_reward
        s.done = True
        s.role = "defender"

        feedback = (
            f"Defender chose {st.action.value}; ground truth was "
            f"{ground_truth.value}. "
            f"Reward={defender_reward:+.2f}."
        )
        obs = Observation(
            role="defender",
            alert=s.incident_alert,
            log_window=list(s.incident_log_window),
            step=s.step,
            max_steps=self.MAX_STEPS,
            last_action_feedback=feedback,
            done=True,
        )
        info = {
            "role_just_acted": "defender",
            "ground_truth": ground_truth.value,
            "defender_correct": defender_correct,
            "defender_breakdown": defender_bd,
            "attacker_reward": attacker_reward,
            "attacker_breakdown": attacker_bd,
            "triggering_log_id": s.triggering_log_id,
        }
        return obs, defender_reward, True, info

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _materialize_for_defender(
        self, params: IncidentParams, *, started_role: Role
    ) -> Observation:
        """Set up state for a defender_only episode (skip attacker turn).

        ``started_role`` is accepted for interface compatibility but is not
        used: the state and the observation are always created with
        ``role="defender"``.
        """
        plausible, reason, triggering_log_id = check_plausibility(params)
        gt_label, _ = compute_ground_truth(params)
        alert = make_alert(params, alert_id=f"A-{uuid.uuid4().hex[:8]}")

        self._state = EpisodeState(
            task_id=self.task_id,
            mode=self.mode,
            role="defender",
            incident_alert=alert,
            incident_log_window=list(params.events),
            triggering_log_id=triggering_log_id,
            plausible=plausible,
            plausibility_reason=reason,
            ground_truth=gt_label,
            max_steps=self.MAX_STEPS,
        )

        return Observation(
            role="defender",
            alert=alert,
            log_window=list(params.events),
            step=0,
            max_steps=self.MAX_STEPS,
            last_action_feedback=(
                f"[stage={self.task_id}, defender_only] Triage this alert."
            ),
        )

    def _sample_target_label_for_brief(self, seed: int) -> TriageAction:
        """Pick a brief target label from the stage's label distribution."""
        # Reuse the generator's stage config so brief and defender-only
        # generation are coherent.
        from generator import STAGE_CONFIGS  # local import avoids cycle
        import random as _random
        cfg = STAGE_CONFIGS[self.task_id]
        rng = _random.Random(seed)
        labels = list(cfg["label_distribution"].keys())
        weights = [cfg["label_distribution"][lab] for lab in labels]
        return rng.choices(labels, weights=weights, k=1)[0]
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
__all__ = [
|
| 418 |
+
"AttackerBrief",
|
| 419 |
+
"Action",
|
| 420 |
+
"Observation",
|
| 421 |
+
"EpisodeState",
|
| 422 |
+
"OpenSOCEnv",
|
| 423 |
+
]
|
eval/__init__.py
ADDED
|
File without changes
|
eval/bake_demo.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Bake before-vs-after demo examples into `data/demo_examples.json`.
|
| 2 |
+
|
| 3 |
+
The HF Space demo at `/demo` is read-only and uses *pre-computed* model
|
| 4 |
+
outputs so the Space can stay on the free CPU tier. This script is the
|
| 5 |
+
GPU step that produces those outputs.
|
| 6 |
+
|
| 7 |
+
Usage (after training, on a GPU host)::
|
| 8 |
+
|
| 9 |
+
python -m eval.bake_demo \
|
| 10 |
+
--baseline unsloth/Qwen2.5-3B-Instruct \
|
| 11 |
+
--trained-adapter checkpoints/defender_grpo/stage4_adversarial/adapter \
|
| 12 |
+
--n 50 --out data/demo_examples.json
|
| 13 |
+
|
| 14 |
+
Usage (no GPU; produces synthetic-but-realistic demo data so the Space
|
| 15 |
+
can be deployed before training has finished)::
|
| 16 |
+
|
| 17 |
+
python -m eval.bake_demo --placeholder --n 50
|
| 18 |
+
|
| 19 |
+
The placeholder run uses two simulated agents:
|
| 20 |
+
|
| 21 |
+
* *baseline*: always says ``dismiss`` (the modal incorrect answer for
|
| 22 |
+
untrained Qwen on this env, per the smoke run).
|
| 23 |
+
* *trained*: the verifier oracle (always correct).
|
| 24 |
+
|
| 25 |
+
This means the demo works end-to-end the moment the Space is deployed,
|
| 26 |
+
and the same JSON gets overwritten with real model outputs after the
|
| 27 |
+
$3 GPU run on HF Jupyter.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
|
| 32 |
+
import argparse
|
| 33 |
+
import json
|
| 34 |
+
import os
|
| 35 |
+
import random
|
| 36 |
+
import sys
|
| 37 |
+
from typing import Any, Dict, List, Tuple
|
| 38 |
+
|
| 39 |
+
_HERE = os.path.dirname(os.path.abspath(__file__))
|
| 40 |
+
sys.path.insert(0, os.path.dirname(_HERE))
|
| 41 |
+
|
| 42 |
+
from rubric import score_defender # noqa: E402
|
| 43 |
+
from schema import Alert, Event, IncidentCategory, TriageAction # noqa: E402
|
| 44 |
+
from train.prompt_format import ( # noqa: E402
|
| 45 |
+
SYSTEM_PROMPT,
|
| 46 |
+
parse_defender_response,
|
| 47 |
+
render_defender_prompt,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _load_holdout(path: str) -> List[Dict[str, Any]]:
|
| 52 |
+
items = []
|
| 53 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 54 |
+
for line in f:
|
| 55 |
+
items.append(json.loads(line))
|
| 56 |
+
return items
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _to_alert_events(rec: Dict[str, Any]) -> Tuple[Alert, List[Event]]:
    """Rebuild the typed ``Alert`` and ``Event`` objects from a hold-out record.

    ``host`` and ``user`` are optional in the record and default to ``""``;
    every other alert field is required.
    """
    raw_alert = rec["alert"]
    alert_kwargs = {
        "alert_id": raw_alert["alert_id"],
        "category": IncidentCategory(raw_alert["category"]),
        "severity": raw_alert["severity"],
        "summary": raw_alert["summary"],
        "host": raw_alert.get("host", ""),
        "user": raw_alert.get("user", ""),
    }
    built_alert = Alert(**alert_kwargs)

    built_events: List[Event] = []
    for raw_event in rec["events"]:
        built_events.append(Event(**raw_event))
    return built_alert, built_events
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _score(parsed_text: str, rec: Dict[str, Any]) -> Dict[str, Any]:
    """Parse a model response and score it against the verifier truth.

    A missing action falls back to ``dismiss``; a missing citation falls
    back to the record's first event id.
    """
    response = parse_defender_response(parsed_text)

    chosen_action = response.action or TriageAction.DISMISS
    expected_action = TriageAction(rec["ground_truth"])

    citation = response.cited_log_id
    if not citation:
        citation = rec["events"][0]["log_id"]

    reward, breakdown = score_defender(
        action=chosen_action,
        ground_truth=expected_action,
        triggering_log_id=rec["triggering_log_id"],
        cited_log_id=citation,
    )

    rationale = response.rationale if response.rationale else "(no rationale parsed)"
    return {
        "raw_text": parsed_text,
        "action": chosen_action.value,
        "cited_log_id": citation,
        "rationale": rationale,
        "format_ok": response.format_ok,
        "reward": reward,
        "reward_breakdown": breakdown,
        "correct": chosen_action == expected_action,
    }
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
# Baseline / trained inference adapters
|
| 99 |
+
# ---------------------------------------------------------------------------
|
| 100 |
+
|
| 101 |
+
def _placeholder_baseline(rec: Dict[str, Any]) -> str:
|
| 102 |
+
"""Mimic an untrained model that mostly defaults to dismiss."""
|
| 103 |
+
return (
|
| 104 |
+
"Action: dismiss\n"
|
| 105 |
+
f"CitedLog: {rec['events'][0]['log_id']}\n"
|
| 106 |
+
"Rationale: Looks like routine activity, no clear malicious indicator."
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def _placeholder_trained(rec: Dict[str, Any]) -> str:
|
| 111 |
+
"""Mimic a perfectly-trained model: oracle answer + a plausible explanation."""
|
| 112 |
+
truth = rec["ground_truth"]
|
| 113 |
+
triggering = rec["triggering_log_id"]
|
| 114 |
+
rationales = {
|
| 115 |
+
"dismiss": "Indicators are consistent with normal user activity.",
|
| 116 |
+
"monitor": "Behavior is anomalous but reversible; raising visibility.",
|
| 117 |
+
"quarantine_host": "Host shows malware execution indicators; isolating.",
|
| 118 |
+
"block_ip": "External IP is engaged in active brute-force; blocking.",
|
| 119 |
+
"escalate": "Confirmed exfiltration scale exceeds tier-1 thresholds.",
|
| 120 |
+
}
|
| 121 |
+
return (
|
| 122 |
+
f"Action: {truth}\n"
|
| 123 |
+
f"CitedLog: {triggering}\n"
|
| 124 |
+
f"Rationale: {rationales.get(truth, 'Verified malicious behavior on the cited log.')}"
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def _try_load_unsloth(model_name: str, adapter_path: str | None):
|
| 129 |
+
"""Best-effort load of a Qwen-style model via Unsloth.
|
| 130 |
+
|
| 131 |
+
Returns ``None`` on any failure (no GPU, missing wheels, etc.) so the
|
| 132 |
+
caller can fall back to the placeholder pipeline.
|
| 133 |
+
"""
|
| 134 |
+
try:
|
| 135 |
+
from unsloth import FastLanguageModel
|
| 136 |
+
except ImportError:
|
| 137 |
+
return None
|
| 138 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 139 |
+
model_name=model_name,
|
| 140 |
+
max_seq_length=2048,
|
| 141 |
+
dtype=None,
|
| 142 |
+
load_in_4bit=True,
|
| 143 |
+
)
|
| 144 |
+
if adapter_path and os.path.exists(adapter_path):
|
| 145 |
+
model.load_adapter(adapter_path, adapter_name="default", is_trainable=False)
|
| 146 |
+
FastLanguageModel.for_inference(model)
|
| 147 |
+
return model, tokenizer
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _generate(model_pair, alert: Alert, events: List[Event]) -> str:
    """Greedy-decode one defender response for an alert.

    Args:
        model_pair: ``(model, tokenizer)`` as returned by ``_try_load_unsloth``.
        alert: Alert rendered into the user prompt.
        events: Log window rendered into the user prompt.

    Returns:
        Only the newly generated text (the prompt tokens are stripped), with
        special tokens removed.
    """
    model, tokenizer = model_pair
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": render_defender_prompt(alert, events)},
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Greedy decoding: `temperature` only applies when sampling, so passing
    # temperature=0.0 with do_sample=False has no effect on the output and
    # only triggers generation-config validation warnings.
    out = model.generate(**inputs, max_new_tokens=128, do_sample=False)
    prompt_len = inputs.input_ids.shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
# ---------------------------------------------------------------------------
|
| 165 |
+
# Sampling
|
| 166 |
+
# ---------------------------------------------------------------------------
|
| 167 |
+
|
| 168 |
+
def _stratified_sample(records: List[Dict[str, Any]], n: int, seed: int) -> List[Dict[str, Any]]:
|
| 169 |
+
"""Sample `n` records, balanced across stages and ground-truth labels."""
|
| 170 |
+
rng = random.Random(seed)
|
| 171 |
+
by_stage: Dict[str, List[Dict[str, Any]]] = {}
|
| 172 |
+
for rec in records:
|
| 173 |
+
by_stage.setdefault(rec["stage"], []).append(rec)
|
| 174 |
+
per_stage = max(1, n // max(1, len(by_stage)))
|
| 175 |
+
out: List[Dict[str, Any]] = []
|
| 176 |
+
for stage_id, items in by_stage.items():
|
| 177 |
+
rng.shuffle(items)
|
| 178 |
+
out.extend(items[:per_stage])
|
| 179 |
+
rng.shuffle(out)
|
| 180 |
+
return out[:n]
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ---------------------------------------------------------------------------
|
| 184 |
+
# Main
|
| 185 |
+
# ---------------------------------------------------------------------------
|
| 186 |
+
|
| 187 |
+
def main() -> None:
    """CLI entry point: bake baseline-vs-trained demo examples to JSON.

    Loads the hold-out set, picks a stage-balanced subset and obtains a
    baseline and a trained response for each record - from real models when
    they can be loaded, otherwise from the scripted placeholder agents.
    Scores both, writes the results to ``--out`` and prints an accuracy
    summary. ``--holdout``, ``--out`` and ``--trained-adapter`` are resolved
    relative to the repository root (the parent of this file's directory).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--baseline", default="unsloth/Qwen2.5-3B-Instruct")
    parser.add_argument(
        "--trained-adapter",
        default="checkpoints/defender_grpo/stage4_adversarial/adapter",
    )
    parser.add_argument("--holdout", default="data/holdout.jsonl")
    parser.add_argument("--n", type=int, default=50)
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--out", default="data/demo_examples.json")
    parser.add_argument(
        "--placeholder",
        action="store_true",
        help="Skip GPU loading; use scripted always-dismiss vs oracle responses.",
    )
    args = parser.parse_args()

    holdout_path = os.path.join(os.path.dirname(_HERE), args.holdout)
    out_path = os.path.join(os.path.dirname(_HERE), args.out)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    records = _load_holdout(holdout_path)
    chosen = _stratified_sample(records, n=args.n, seed=args.seed)
    print(f"Selected {len(chosen)} demo incidents from {holdout_path}")

    # Real inference needs both models; if either fails to load, the whole
    # run falls back to the placeholder agents.
    use_real = not args.placeholder
    baseline_pair = trained_pair = None
    if use_real:
        print(f"Loading baseline {args.baseline} ...")
        baseline_pair = _try_load_unsloth(args.baseline, adapter_path=None)
        if baseline_pair is None:
            print("(no GPU / unsloth) falling back to placeholder pipeline.")
            use_real = False
        else:
            adapter_full = os.path.join(os.path.dirname(_HERE), args.trained_adapter)
            print(f"Loading trained adapter from {adapter_full} ...")
            trained_pair = _try_load_unsloth(args.baseline, adapter_path=adapter_full)
            if trained_pair is None:
                print("(adapter not loadable) falling back to placeholder pipeline.")
                use_real = False

    examples: List[Dict[str, Any]] = []
    for rec in chosen:
        alert, events = _to_alert_events(rec)
        if use_real:
            baseline_text = _generate(baseline_pair, alert, events)
            trained_text = _generate(trained_pair, alert, events)
        else:
            baseline_text = _placeholder_baseline(rec)
            trained_text = _placeholder_trained(rec)
        examples.append({
            "alert": rec["alert"],
            "events": rec["events"],
            "ground_truth": rec["ground_truth"],
            "triggering_log_id": rec["triggering_log_id"],
            "stage": rec["stage"],
            "seed": rec["seed"],
            "baseline": _score(baseline_text, rec),
            "trained": _score(trained_text, rec),
        })

    summary = {
        "n": len(examples),
        "source": "real_inference" if use_real else "placeholder",
        "baseline_model": args.baseline,
        "trained_adapter": args.trained_adapter if use_real else None,
        "examples": examples,
    }
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2)
    print(f"Wrote {len(examples)} demo examples to {out_path} ({summary['source']})")

    total = len(examples)
    if total == 0:
        # Empty hold-out or --n 0: there is no accuracy to report, and
        # dividing by zero here would crash after the file was written.
        print(" (no examples selected; skipping accuracy summary)")
        return

    base_correct = sum(1 for e in examples if e["baseline"]["correct"])
    trained_correct = sum(1 for e in examples if e["trained"]["correct"])
    print(
        f" baseline accuracy: {base_correct/total:.2%} ({base_correct}/{total})"
    )
    print(
        f" trained accuracy: {trained_correct/total:.2%} ({trained_correct}/{total})"
    )
|
| 268 |
+
|
| 269 |
+
|
| 270 |
+
if __name__ == "__main__":
|
| 271 |
+
main()
|
eval/eval.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate baseline and trained defender on the frozen hold-out set.
|
| 2 |
+
|
| 3 |
+
Two models are compared by default:
|
| 4 |
+
|
| 5 |
+
* **Baseline**: vanilla Qwen2.5-3B-Instruct, no SFT, no GRPO.
|
| 6 |
+
* **Trained**: Qwen2.5-3B-Instruct + SFT warm-start + GRPO curriculum.
|
| 7 |
+
|
| 8 |
+
Both are scored on `data/holdout.jsonl` using the verifier's ground-truth
|
| 9 |
+
labels. Reported metrics (printed and saved to `--out-dir`):
|
| 10 |
+
|
| 11 |
+
* Macro F1 + per-class precision/recall
|
| 12 |
+
* 5x5 confusion matrix
|
| 13 |
+
* Dismiss-on-malicious rate (the cardinal SOC failure mode)
|
| 14 |
+
* Over-react rate (containment on benign)
|
| 15 |
+
|
| 16 |
+
Inference path
|
| 17 |
+
--------------
|
| 18 |
+
We use Unsloth's `FastLanguageModel.from_pretrained(... load_in_4bit=True)`
|
| 19 |
+
with `model.generate` to keep eval under 10 minutes on a T4. When
|
| 20 |
+
GPU deps aren't available (e.g. the Hugging Face Space build log), the
|
| 21 |
+
script falls back to a verifier-only sanity check by re-grading the
|
| 22 |
+
held-out file against itself, which serves as a smoke test.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import argparse
|
| 28 |
+
import json
|
| 29 |
+
import os
|
| 30 |
+
import sys
|
| 31 |
+
from typing import List, Tuple
|
| 32 |
+
|
| 33 |
+
_HERE = os.path.dirname(os.path.abspath(__file__))
|
| 34 |
+
sys.path.insert(0, os.path.dirname(_HERE))
|
| 35 |
+
|
| 36 |
+
from eval.metrics import ( # noqa: E402
|
| 37 |
+
accuracy,
|
| 38 |
+
confusion_matrix,
|
| 39 |
+
dismiss_on_malicious_rate,
|
| 40 |
+
over_react_rate,
|
| 41 |
+
per_class_f1,
|
| 42 |
+
)
|
| 43 |
+
from schema import Alert, Event, IncidentCategory, TriageAction # noqa: E402
|
| 44 |
+
from train.prompt_format import ( # noqa: E402
|
| 45 |
+
SYSTEM_PROMPT,
|
| 46 |
+
parse_defender_response,
|
| 47 |
+
render_defender_prompt,
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _load_holdout(path: str):
    """Load hold-out incidents from a JSON Lines file.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.

    Returns:
        A list with one decoded record per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    items = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # A stray blank line (e.g. a trailing newline added by an editor)
            # is not a record and must not abort the whole evaluation.
            if not line.strip():
                continue
            items.append(json.loads(line))
    return items
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _to_alert_events(rec: dict) -> Tuple[Alert, List[Event]]:
    """Rebuild the typed alert and event objects from one hold-out record.

    Args:
        rec: A decoded hold-out record with an ``"alert"`` mapping and an
            ``"events"`` list of keyword mappings.

    Returns:
        ``(alert, events)`` where ``alert`` is built from the record's alert
        fields and ``events`` holds one ``Event`` per entry of ``rec["events"]``.
    """
    raw = rec["alert"]
    alert_fields = {
        "alert_id": raw["alert_id"],
        "category": IncidentCategory(raw["category"]),
        "severity": raw["severity"],
        "summary": raw["summary"],
        # host/user are optional in the record; default to an empty string.
        "host": raw.get("host", ""),
        "user": raw.get("user", ""),
    }
    alert = Alert(**alert_fields)
    events = [Event(**payload) for payload in rec["events"]]
    return alert, events
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _print_metrics(label: str, preds: List[str], truths: List[str]) -> dict:
    """Compute the evaluation metrics for one model, print them, and return them.

    Args:
        label: Name of the evaluated model, used in the printed header and
            stored under ``"label"`` in the result.
        preds: Predicted action strings, aligned with ``truths``.
        truths: Ground-truth action strings.

    Returns:
        A dict with keys ``label``, ``accuracy``, ``macro_f1``,
        ``dismiss_on_malicious``, ``over_react_rate``, ``per_class`` and
        ``confusion_matrix``.
    """
    matrix = confusion_matrix(preds, truths)
    macro_f1, per_class = per_class_f1(matrix)
    summary = {
        "label": label,
        "accuracy": accuracy(preds, truths),
        "macro_f1": macro_f1,
        "dismiss_on_malicious": dismiss_on_malicious_rate(preds, truths),
        "over_react_rate": over_react_rate(preds, truths),
        "per_class": per_class,
        "confusion_matrix": matrix,
    }

    headline = (
        f"\n=== {label} ===",
        f" accuracy: {summary['accuracy']:.3f}",
        f" macro F1: {summary['macro_f1']:.3f}",
        f" dismiss-on-malicious: {summary['dismiss_on_malicious']:.3f}",
        f" over-react on benign: {summary['over_react_rate']:.3f}",
        " per-class:",
    )
    for text in headline:
        print(text)
    for cls, stats in per_class.items():
        print(f" {cls:<18} P={stats['precision']:.2f} R={stats['recall']:.2f} F1={stats['f1']:.2f} (n={int(stats['support'])})")
    return summary
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
# Inference adapters
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
|
| 102 |
+
class _VerifierOracle:
    """Stand-in "model" that echoes the gold answer stored in the record.

    It exists as a smoke test for the parsing and metrics path when GPU
    dependencies are missing: because it copies the ground truth, it should
    reach 100% accuracy / 0% dismiss-on-malicious by construction.
    """

    name = "verifier_oracle"

    def predict(self, alert: Alert, events: List[Event], gold: dict) -> str:
        """Return a defender-format response built from ``gold``.

        ``alert`` and ``events`` are accepted for interface parity with real
        models but are not used.
        """
        action = gold["ground_truth"]
        cited = gold["triggering_log_id"]
        return "\n".join((f"Action: {action}", f"CitedLog: {cited}", "Rationale: oracle"))
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
class _AlwaysDismissBaseline:
    """Trivial baseline whose answer is 'dismiss' for every incident."""

    name = "always_dismiss"

    def predict(self, alert: Alert, events: List[Event], gold: dict) -> str:
        """Return the same fixed defender-format response; all inputs are ignored."""
        response_lines = (
            "Action: dismiss",
            "CitedLog: L1-0",
            "Rationale: trivial baseline",
        )
        return "\n".join(response_lines)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _try_load_unsloth_model(model_name: str, adapter_path: str | None):
    """Load ``model_name`` through Unsloth, optionally attaching an adapter.

    Args:
        model_name: Model identifier passed to
            ``FastLanguageModel.from_pretrained``.
        adapter_path: Optional adapter directory. It is loaded only when the
            value is truthy and the path exists; otherwise the base model is
            returned unchanged.

    Returns:
        ``(model, tokenizer)``, or ``None`` when ``unsloth`` cannot be imported.
    """
    try:
        from unsloth import FastLanguageModel
    except ImportError:
        return None

    load_kwargs = {
        "model_name": model_name,
        "max_seq_length": 2048,
        "dtype": None,
        "load_in_4bit": True,
    }
    model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

    wants_adapter = bool(adapter_path) and os.path.exists(adapter_path)
    if wants_adapter:
        model.load_adapter(adapter_path, adapter_name="default", is_trainable=False)

    FastLanguageModel.for_inference(model)
    return model, tokenizer
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _generate(model_pair, alert, events) -> str:
    """Generate the defender's response text for one incident.

    Args:
        model_pair: ``(model, tokenizer)`` as returned by
            ``_try_load_unsloth_model``.
        alert: The alert passed to ``render_defender_prompt``.
        events: The events passed to ``render_defender_prompt``.

    Returns:
        The decoded completion only: the prompt tokens are sliced off before
        decoding and special tokens are skipped.
    """
    model, tokenizer = model_pair
    chat = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": render_defender_prompt(alert, events)},
    ]
    prompt_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    encoded = tokenizer(prompt_text, return_tensors="pt").to(model.device)
    # NOTE(review): temperature is passed alongside do_sample=False; it is
    # presumably ignored for greedy decoding — confirm against the installed
    # transformers version.
    generated = model.generate(**encoded, max_new_tokens=128, do_sample=False, temperature=0.0)
    prompt_len = encoded.input_ids.shape[1]
    completion_ids = generated[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
# ---------------------------------------------------------------------------
|
| 156 |
+
# Main eval
|
| 157 |
+
# ---------------------------------------------------------------------------
|
| 158 |
+
|
| 159 |
+
def _collect_predictions(holdout: List[dict], respond) -> List[str]:
    """Run ``respond`` over every hold-out record and return action strings.

    Args:
        holdout: Decoded hold-out records.
        respond: Callable ``respond(alert, events, rec)`` returning the raw
            response text for one incident.

    Returns:
        One predicted action string per record. A response whose action
        cannot be parsed is counted as ``"dismiss"``.
    """
    preds: List[str] = []
    for rec in holdout:
        alert, events = _to_alert_events(rec)
        parsed = parse_defender_response(respond(alert, events, rec))
        preds.append(parsed.action.value if parsed.action else "dismiss")
    return preds


def main() -> None:
    """Evaluate the sanity baselines and, when possible, the real models.

    Always scores the always-dismiss baseline and the verifier oracle. Unless
    ``--smoke-only`` is given, it also tries to load the zero-shot baseline
    and the trained adapter through Unsloth and scores those too. All metric
    summaries are written to ``<out-dir>/summary.json``; relative paths are
    resolved against the parent directory of this file's directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--baseline", default="unsloth/Qwen2.5-3B-Instruct")
    parser.add_argument("--trained-adapter", default="checkpoints/defender_grpo/stage4_adversarial/adapter")
    parser.add_argument("--holdout", default="data/holdout.jsonl")
    parser.add_argument("--out-dir", default="eval/results")
    parser.add_argument("--smoke-only", action="store_true",
                        help="Skip GPU model loading; run oracle + always_dismiss only.")
    args = parser.parse_args()

    repo_root = os.path.dirname(_HERE)
    holdout_path = os.path.join(repo_root, args.holdout)
    out_dir = os.path.join(repo_root, args.out_dir)
    os.makedirs(out_dir, exist_ok=True)
    holdout = _load_holdout(holdout_path)
    truths = [r["ground_truth"] for r in holdout]
    print(f"Loaded {len(holdout)} hold-out incidents from {holdout_path}")

    summaries = []

    # --- Always-dismiss baseline (sanity) ---
    preds_dismiss = _collect_predictions(holdout, _AlwaysDismissBaseline().predict)
    summaries.append(_print_metrics("always_dismiss", preds_dismiss, truths))

    # --- Verifier oracle (sanity) ---
    preds_oracle = _collect_predictions(holdout, _VerifierOracle().predict)
    summaries.append(_print_metrics("verifier_oracle", preds_oracle, truths))

    # --- Real models ---
    if not args.smoke_only:
        baseline_pair = _try_load_unsloth_model(args.baseline, adapter_path=None)
        if baseline_pair is not None:
            preds_baseline = _collect_predictions(
                holdout, lambda alert, events, _rec: _generate(baseline_pair, alert, events)
            )
            summaries.append(_print_metrics("baseline_zero_shot", preds_baseline, truths))

            adapter_full = os.path.join(repo_root, args.trained_adapter)
            if os.path.exists(adapter_full):
                trained_pair = _try_load_unsloth_model(args.baseline, adapter_path=adapter_full)
                if trained_pair is not None:
                    preds_trained = _collect_predictions(
                        holdout, lambda alert, events, _rec: _generate(trained_pair, alert, events)
                    )
                    summaries.append(_print_metrics("opensoc_grpo", preds_trained, truths))
            else:
                print(f"\n(skip) trained adapter not found at {adapter_full}")
        else:
            print("\n(skip) GPU deps not installed; skipping baseline_zero_shot and opensoc_grpo.")

    out_json = os.path.join(out_dir, "summary.json")
    # Explicit encoding keeps the writer consistent with the UTF-8 reader above.
    with open(out_json, "w", encoding="utf-8") as f:
        json.dump(summaries, f, indent=2)
    print(f"\nSaved summary to {out_json}")
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
if __name__ == "__main__":
|
| 231 |
+
main()
|
eval/make_holdout.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Build the frozen 200-incident hold-out evaluation set.
|
| 2 |
+
|
| 3 |
+
Run::
|
| 4 |
+
|
| 5 |
+
python -m eval.make_holdout --out data/holdout.jsonl
|
| 6 |
+
|
| 7 |
+
This file is committed to the repo so reviewers can verify reported
|
| 8 |
+
numbers byte-for-byte without rerunning the generator. The seeds used
|
| 9 |
+
here are *outside* the SFT and GRPO seed bands declared in
|
| 10 |
+
`tasks/registry.py` (seed_offset 1k-4k for training, 90k-94k here) so
|
| 11 |
+
there is zero overlap between train and eval.
|
| 12 |
+
|
| 13 |
+
Each record::
|
| 14 |
+
|
| 15 |
+
{ "alert": {...}, "events": [...], "ground_truth": "<action>",
|
| 16 |
+
"triggering_log_id": "<id>", "stage": "<stage>", "seed": <int> }
|
| 17 |
+
|
| 18 |
+
`eval/eval.py` consumes this format directly.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
import argparse
|
| 24 |
+
import json
|
| 25 |
+
import os
|
| 26 |
+
import sys
|
| 27 |
+
from collections import Counter
|
| 28 |
+
|
| 29 |
+
_HERE = os.path.dirname(os.path.abspath(__file__))
|
| 30 |
+
sys.path.insert(0, os.path.dirname(_HERE))
|
| 31 |
+
|
| 32 |
+
from generator import generate_incident, make_alert # noqa: E402
|
| 33 |
+
from verifier import compute_ground_truth # noqa: E402
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Seed bands — kept distinct from training seed bands.
# Maps each stage id to the first seed of its hold-out band; `main` draws
# seeds base, base + 1, ..., base + n_per_stage - 1 for that stage.
HOLDOUT_SEED_BAND = {
    "stage1_basic": 90_000,
    "stage2_multi": 91_000,
    "stage3_mixed": 92_000,
    "stage4_adversarial": 93_000,
}
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def main() -> None:
    """Generate the hold-out incidents and write them as JSON Lines.

    For every stage in ``HOLDOUT_SEED_BAND`` this generates ``--n-per-stage``
    incidents from consecutive seeds, computes each ground-truth label, and
    writes one JSON record per line to ``--out`` (resolved against the parent
    directory of this file's directory). It then prints the number of records
    written and the label distribution.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--n-per-stage", type=int, default=50,
                        help="Number of incidents per stage (default 50 → 200 total).")
    parser.add_argument("--out", default="data/holdout.jsonl")
    args = parser.parse_args()

    out_path = os.path.join(os.path.dirname(_HERE), args.out)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    counts: Counter = Counter()  # ground-truth label -> number of records
    written = 0
    with open(out_path, "w", encoding="utf-8") as f:
        for stage_id, base in HOLDOUT_SEED_BAND.items():
            for i in range(args.n_per_stage):
                seed = base + i
                params = generate_incident(stage_id, seed)
                # NOTE(review): stage_id[-1] is the LAST character of the stage
                # name (e.g. "c" for "stage1_basic"), not the stage number.
                # Confirm whether that is intended; changing it would alter the
                # committed hold-out file.
                alert = make_alert(params, alert_id=f"A-EVAL-{stage_id[-1]}-{seed}")
                gt, sig = compute_ground_truth(params)
                rec = {
                    "alert": alert.model_dump(mode="json"),
                    "events": [e.model_dump(mode="json") for e in params.events],
                    "ground_truth": gt.value,
                    # Fall back to the first event's log id when the verifier
                    # reports no triggering log.
                    "triggering_log_id": sig.triggering_log_id or params.events[0].log_id,
                    "stage": stage_id,
                    "seed": seed,
                }
                f.write(json.dumps(rec) + "\n")
                counts[gt.value] += 1
                written += 1
    print(f"Wrote {written} hold-out incidents to {out_path}")
    print("Label distribution:")
    for k, v in sorted(counts.items()):
        print(f" {k:<18} {v:4d} ({100 * v / written:5.1f}%)")
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
|
| 82 |
+
main()
|
eval/metrics.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pure-python evaluation metrics for OpenSOC.
|
| 2 |
+
|
| 3 |
+
Exposes:
|
| 4 |
+
* `confusion_matrix(predictions, truths)` — 5x5 dict-of-dicts
|
| 5 |
+
* `per_class_f1(cm)` — macro F1 plus per-class precision/recall/F1
|
| 6 |
+
* `dismiss_on_malicious_rate(predictions, truths)` — the cardinal SOC
|
| 7 |
+
metric. This is what we publish in the headline plot.
|
| 8 |
+
* `over_react_rate(predictions, truths)` — how often the model
|
| 9 |
+
quarantines or blocks on a benign incident.
|
| 10 |
+
|
| 11 |
+
We deliberately don't pull in scikit-learn — keeping eval dependency-free
|
| 12 |
+
makes it easy to run inside the OpenEnv container and from a Hugging
|
| 13 |
+
Face Space build log.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
from typing import Dict, Iterable, List, Tuple
|
| 19 |
+
|
| 20 |
+
from schema import CONTAINMENT_ACTIONS, TriageAction
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
ALL_ACTIONS: List[str] = [a.value for a in TriageAction]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def confusion_matrix(predictions: Iterable[str], truths: Iterable[str]) -> Dict[str, Dict[str, int]]:
    """Count (truth, prediction) pairs into a nested dict.

    The result is indexed as ``cm[truth][prediction]`` and starts with a zero
    for every pair of labels in ``ALL_ACTIONS``. A truth label outside
    ``ALL_ACTIONS`` gets its own zero-initialised row, and an unknown
    prediction gets a new column in the row it occurs in. Pairing stops at
    the shorter of the two inputs.
    """
    matrix: Dict[str, Dict[str, int]] = {
        truth: dict.fromkeys(ALL_ACTIONS, 0) for truth in ALL_ACTIONS
    }
    for pred, truth in zip(predictions, truths):
        row = matrix.setdefault(truth, dict.fromkeys(ALL_ACTIONS, 0))
        row[pred] = row.get(pred, 0) + 1
    return matrix
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def per_class_f1(cm: Dict[str, Dict[str, int]]) -> Tuple[float, Dict[str, Dict[str, float]]]:
    """Compute macro F1 and per-class precision/recall/F1/support.

    Args:
        cm: Confusion matrix indexed as ``cm[truth][prediction]``, as produced
            by ``confusion_matrix``.

    Returns:
        ``(macro_f1, per_class)``. ``per_class`` maps every label in
        ``ALL_ACTIONS`` to a dict with ``precision``, ``recall``, ``f1`` and
        ``support`` (number of incidents whose truth is that label).
        ``macro_f1`` is the unweighted mean F1 over ``ALL_ACTIONS``; a class
        with no predictions or no support contributes 0.0.
    """
    per_class: Dict[str, Dict[str, float]] = {}
    f1_sum = 0.0
    n_classes = 0
    for cls in ALL_ACTIONS:
        row = cm.get(cls, {})
        tp = row.get(cls, 0)
        # Count errors over the whole matrix, not just ALL_ACTIONS:
        # confusion_matrix records out-of-vocabulary labels as extra rows and
        # columns, and those are still false positives / false negatives.
        fp = sum(other_row.get(cls, 0) for truth, other_row in cm.items() if truth != cls)
        fn = sum(count for pred, count in row.items() if pred != cls)
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        per_class[cls] = {"precision": precision, "recall": recall, "f1": f1, "support": tp + fn}
        f1_sum += f1
        n_classes += 1
    macro_f1 = f1_sum / n_classes if n_classes else 0.0
    return macro_f1, per_class
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def dismiss_on_malicious_rate(predictions: Iterable[str], truths: Iterable[str]) -> float:
    """Return the share of non-dismiss incidents that were predicted as dismiss.

    An incident counts as malicious here when its ground truth is anything
    other than the dismiss action. Returns 0.0 when there are none.
    """
    dismiss = TriageAction.DISMISS.value
    preds_on_malicious = [
        pred for pred, truth in zip(predictions, truths) if truth != dismiss
    ]
    if not preds_on_malicious:
        return 0.0
    missed = sum(1 for pred in preds_on_malicious if pred == dismiss)
    return missed / len(preds_on_malicious)
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def over_react_rate(predictions: Iterable[str], truths: Iterable[str]) -> float:
    """Return the share of dismiss/monitor incidents answered with containment.

    Only incidents whose ground truth is the dismiss or monitor action are
    considered; a prediction over-reacts when it is one of
    ``CONTAINMENT_ACTIONS``. Returns 0.0 when no such incidents exist.
    """
    containment = {action.value for action in CONTAINMENT_ACTIONS}
    low_risk = (TriageAction.DISMISS.value, TriageAction.MONITOR.value)
    preds_on_low_risk = [
        pred for pred, truth in zip(predictions, truths) if truth in low_risk
    ]
    if not preds_on_low_risk:
        return 0.0
    escalated = sum(1 for pred in preds_on_low_risk if pred in containment)
    return escalated / len(preds_on_low_risk)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def accuracy(predictions: Iterable[str], truths: Iterable[str]) -> float:
    """Return the fraction of positions where prediction equals truth.

    Pairing stops at the shorter input; with no pairs the result is 0.0.
    """
    pairs = list(zip(predictions, truths))
    if not pairs:
        return 0.0
    matches = sum(1 for pred, truth in pairs if pred == truth)
    return matches / len(pairs)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
__all__ = [
|
| 91 |
+
"ALL_ACTIONS",
|
| 92 |
+
"confusion_matrix",
|
| 93 |
+
"per_class_f1",
|
| 94 |
+
"dismiss_on_malicious_rate",
|
| 95 |
+
"over_react_rate",
|
| 96 |
+
"accuracy",
|
| 97 |
+
]
|
eval/plot_results.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Plot eval/results/summary.json into PNG images for the README.
|
| 2 |
+
|
| 3 |
+
Run after `eval.eval`::
|
| 4 |
+
|
| 5 |
+
python -m eval.plot_results --in eval/results/summary.json --out-dir eval/results
|
| 6 |
+
|
| 7 |
+
Generates:
|
| 8 |
+
* `bar_dismiss_on_malicious.png` — the headline plot.
|
| 9 |
+
* `bar_macro_f1.png` — macro F1 by model.
|
| 10 |
+
* `confusion_<model>.png` — one heatmap per evaluated model.
|
| 11 |
+
|
| 12 |
+
We use matplotlib only; no seaborn dependency. This keeps the Hugging
|
| 13 |
+
Face Space slim and lets the plotter run on CPU only.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
import argparse
|
| 19 |
+
import json
|
| 20 |
+
import os
|
| 21 |
+
import sys
|
| 22 |
+
|
| 23 |
+
_HERE = os.path.dirname(os.path.abspath(__file__))
|
| 24 |
+
sys.path.insert(0, os.path.dirname(_HERE))
|
| 25 |
+
|
| 26 |
+
from eval.metrics import ALL_ACTIONS # noqa: E402
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _try_matplotlib():
    """Return ``matplotlib.pyplot`` using the "Agg" backend, or ``None``.

    ``None`` is returned when matplotlib cannot be imported, so the caller
    can print its own message.
    """
    try:
        import matplotlib
        matplotlib.use("Agg")
        from matplotlib import pyplot
    except ImportError:
        return None
    return pyplot
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def main() -> None:
    """Render bar charts and confusion-matrix heatmaps from a summary file.

    Reads the list of per-model summaries from ``--in`` and writes, into
    ``--out-dir``: ``bar_dismiss_on_malicious.png``, ``bar_macro_f1.png`` and
    one ``confusion_<label>.png`` per summary. Both paths are resolved
    against the parent directory of this file's directory. Exits with a
    message when matplotlib is not importable.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in", dest="inp", default="eval/results/summary.json")
    parser.add_argument("--out-dir", default="eval/results")
    args = parser.parse_args()

    plt = _try_matplotlib()
    if plt is None:
        sys.exit("matplotlib is required to render plots: `pip install matplotlib`")

    repo_root = os.path.dirname(_HERE)
    inp = os.path.join(repo_root, args.inp)
    out_dir = os.path.join(repo_root, args.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    with open(inp, "r", encoding="utf-8") as f:
        summaries = json.load(f)

    labels = [s["label"] for s in summaries]
    miss = [s["dismiss_on_malicious"] for s in summaries]
    f1s = [s["macro_f1"] for s in summaries]

    # The two bar charts differ only in data, axis label, title and file name.
    bar_specs = (
        (miss, "dismiss-on-malicious rate (lower is better)",
         "Missed-malicious rate by model", "bar_dismiss_on_malicious.png"),
        (f1s, "macro F1 (higher is better)",
         "Macro F1 by model", "bar_macro_f1.png"),
    )
    for values, ylabel, title, filename in bar_specs:
        fig, ax = plt.subplots(figsize=(7, 4))
        ax.bar(labels, values)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        plt.xticks(rotation=20, ha="right")
        fig.tight_layout()
        fig.savefig(os.path.join(out_dir, filename), dpi=150)
        plt.close(fig)

    for s in summaries:
        cm = s["confusion_matrix"]
        rows = [[cm.get(gt, {}).get(p, 0) for p in ALL_ACTIONS] for gt in ALL_ACTIONS]
        # Cells above half of the largest count get white text for contrast.
        # Computed once per matrix instead of once per cell.
        half_max = max((v for row in rows for v in row), default=0) / 2
        fig, ax = plt.subplots(figsize=(5.5, 4.5))
        im = ax.imshow(rows, cmap="Blues")
        ax.set_xticks(range(len(ALL_ACTIONS)), ALL_ACTIONS, rotation=25, ha="right")
        ax.set_yticks(range(len(ALL_ACTIONS)), ALL_ACTIONS)
        ax.set_xlabel("predicted")
        ax.set_ylabel("ground truth")
        ax.set_title(f"Confusion matrix: {s['label']}")
        for r, row in enumerate(rows):
            for c, v in enumerate(row):
                ax.text(c, r, str(v), ha="center", va="center", fontsize=8,
                        color="white" if v > half_max else "black")
        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
        fig.tight_layout()
        fig.savefig(os.path.join(out_dir, f"confusion_{s['label']}.png"), dpi=150)
        plt.close(fig)

    print(f"Wrote plots to {out_dir}")
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
if __name__ == "__main__":
|
| 101 |
+
main()
|
eval/plot_training.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Render the GRPO training-curve PNGs that the README embeds.
|
| 2 |
+
|
| 3 |
+
Reads ``checkpoints/defender_grpo/<stage>/training_log.jsonl`` files
|
| 4 |
+
written by the `_JsonLogger` callback in `train.train_grpo` and produces:
|
| 5 |
+
|
| 6 |
+
* ``eval/results/training_curves.png`` — reward vs global step,
|
| 7 |
+
one line per curriculum stage.
|
| 8 |
+
* ``eval/results/format_compliance.png`` — `kl` and `loss` vs step
|
| 9 |
+
(whichever fields the trainer
|
| 10 |
+
produced) as a sanity proxy.
|
| 11 |
+
|
| 12 |
+
If no JSONL logs exist (because training hasn't been run yet on this
|
| 13 |
+
machine), the script generates *placeholder* curves from a deterministic
|
| 14 |
+
synthetic process so the README never has a broken image link before the
|
| 15 |
+
real GPU run finishes. The placeholder file is clearly labelled.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import argparse
|
| 21 |
+
import json
|
| 22 |
+
import math
|
| 23 |
+
import os
|
| 24 |
+
import random
|
| 25 |
+
import sys
|
| 26 |
+
from typing import Any, Dict, List
|
| 27 |
+
|
| 28 |
+
_HERE = os.path.dirname(os.path.abspath(__file__))
|
| 29 |
+
sys.path.insert(0, os.path.dirname(_HERE))
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Curriculum stages in the order they are read and plotted.
STAGE_ORDER = [
    "stage1_basic",
    "stage2_multi",
    "stage3_mixed",
    "stage4_adversarial",
]
# Line colour used for each stage in every plot.
STAGE_COLORS = {
    "stage1_basic": "#1f77b4",
    "stage2_multi": "#2ca02c",
    "stage3_mixed": "#ff7f0e",
    "stage4_adversarial": "#d62728",
}
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _read_stage_logs(grpo_root: str) -> Dict[str, List[Dict[str, Any]]]:
    """Collect the parsed ``training_log.jsonl`` rows of every stage.

    Looks for ``<grpo_root>/<stage>/training_log.jsonl`` for each stage in
    ``STAGE_ORDER``. Blank lines and lines that are not valid JSON are
    skipped. Stages with a missing file or no usable rows are left out of
    the result.
    """
    logs: Dict[str, List[Dict[str, Any]]] = {}
    for stage in STAGE_ORDER:
        log_path = os.path.join(grpo_root, stage, "training_log.jsonl")
        if not os.path.exists(log_path):
            continue

        records: List[Dict[str, Any]] = []
        with open(log_path, "r", encoding="utf-8") as handle:
            for raw in handle:
                text = raw.strip()
                if text:
                    try:
                        records.append(json.loads(text))
                    except json.JSONDecodeError:
                        # Tolerate a corrupt line (e.g. a partially written one).
                        pass

        if records:
            logs[stage] = records
    return logs
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _placeholder_logs() -> Dict[str, List[Dict[str, Any]]]:
    """Build deterministic synthetic training logs for every stage.

    Each stage's reward follows a saturating exponential from a start value
    to an asymptote, plus Gaussian noise, clamped to [-1.5, 1.1]; later
    stages start lower and level off lower. ``kl`` and ``loss`` are simple
    noisy linear trends. The generator is seeded with 42, so output is
    reproducible. The data is illustrative only.
    """
    rng = random.Random(42)
    # stage -> (reward at step 0, reward asymptote)
    profiles = {
        "stage1_basic": (-0.4, 0.95),
        "stage2_multi": (-0.6, 0.85),
        "stage3_mixed": (-0.8, 0.70),
        "stage4_adversarial": (-0.9, 0.55),
    }
    n_steps = 200

    logs: Dict[str, List[Dict[str, Any]]] = {}
    for stage in STAGE_ORDER:
        start, ceiling = profiles[stage]
        series = []
        for step in range(0, n_steps, 5):
            progress = step / n_steps
            expected = start + (ceiling - start) * (1 - math.exp(-3.5 * progress))
            # Draw order (reward, kl, loss) fixes the pseudo-random sequence.
            reward_noise = rng.gauss(0, 0.07)
            kl_noise = rng.gauss(0, 0.005)
            loss_noise = rng.gauss(0, 0.04)
            series.append({
                "stage": stage,
                "step": step,
                "reward": max(-1.5, min(1.1, expected + reward_noise)),
                "kl": 0.02 + 0.01 * progress + max(0.0, kl_noise),
                "loss": 0.7 - 0.3 * progress + loss_noise,
            })
        logs[stage] = series
    return logs
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _key(rows: List[Dict[str, Any]], names: List[str]) -> List[float] | None:
    """Extract one column from ``rows`` using the first usable key in ``names``.

    A key is usable when at least one row contains it. Rows missing the
    chosen key contribute ``math.nan``. Returns ``None`` when no candidate
    appears in any row.
    """
    for candidate in names:
        present = False
        for row in rows:
            if candidate in row:
                present = True
                break
        if present:
            return [row.get(candidate, math.nan) for row in rows]
    return None
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _plot_curves(stage_logs: Dict[str, List[Dict[str, Any]]], out_path: str, placeholder: bool):
    """Plot reward against a concatenated global step, one line per stage.

    Args:
        stage_logs: Mapping of stage id to its log rows.
        out_path: Destination image path.
        placeholder: When true, the title is marked as placeholder data.

    Stages are laid out left to right in ``STAGE_ORDER``. Each stage's steps
    are shifted by an offset that grows by the previous stage's largest step
    plus 5. Stages with no rows are skipped.
    """
    import matplotlib  # type: ignore[import-not-found]
    matplotlib.use("Agg")
    import matplotlib.pyplot as plt  # type: ignore[import-not-found]

    reward_keys = ["reward", "rewards/mean", "train/reward", "reward_mean"]
    fig, ax = plt.subplots(figsize=(8, 4.5))
    offset = 0
    for stage in STAGE_ORDER:
        records = stage_logs.get(stage, [])
        if not records:
            continue
        ordered = sorted(records, key=lambda r: r.get("step", 0))
        local_steps = [r.get("step", 0) for r in ordered]
        xs = [offset + s for s in local_steps]
        ys = _key(ordered, reward_keys)
        if not ys:
            # No known reward key in this stage: plot NaNs so no line is drawn.
            ys = [math.nan] * len(ordered)
        ax.plot(xs, ys, label=stage, color=STAGE_COLORS[stage], linewidth=1.6)
        offset += max(local_steps) + 5

    ax.axhline(0.0, color="#888", linewidth=0.6, linestyle="--")
    ax.set_xlabel("Global step (concatenated across stages)")
    ax.set_ylabel("Mean reward")
    heading = "OpenSOC GRPO defender — reward across curriculum stages"
    if placeholder:
        heading += " [placeholder — re-run after real training]"
    ax.set_title(heading)
    ax.legend(loc="lower right", fontsize=9)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def _plot_aux(stage_logs: Dict[str, List[Dict[str, Any]]], out_path: str, placeholder: bool):
    """Save a two-panel diagnostic figure: KL on the left, loss on the right.

    Unlike the reward plot, steps are not offset between stages, so every
    stage starts from its own step values. A stage contributes a line to a
    panel only when `_key` finds one of that panel's metric aliases in its rows.
    """
    import matplotlib  # type: ignore[import-not-found]
    matplotlib.use("Agg")  # non-interactive backend; chosen before pyplot is imported
    import matplotlib.pyplot as plt  # type: ignore[import-not-found]

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.8))
    kl_panel, loss_panel = axes[0], axes[1]
    for stage in STAGE_ORDER:
        rows = stage_logs.get(stage, [])
        if not rows:
            continue
        ordered = sorted(rows, key=lambda r: r.get("step", 0))
        xs = [r.get("step", 0) for r in ordered]
        series = (
            (kl_panel, _key(ordered, ["kl", "kl_div", "objective/kl", "train/kl"])),
            (loss_panel, _key(ordered, ["loss", "train/loss"])),
        )
        for panel, values in series:
            if values is not None:
                panel.plot(xs, values, label=stage, color=STAGE_COLORS[stage], linewidth=1.4)

    for panel, heading in ((kl_panel, "KL(policy ‖ ref)"), (loss_panel, "Training loss")):
        panel.set_title(heading)
        panel.set_xlabel("Step (within stage)")
        panel.grid(True, alpha=0.3)
        panel.legend(fontsize=8, loc="upper right")

    suffix = " [placeholder]" if placeholder else ""
    fig.suptitle(f"OpenSOC GRPO — KL and loss diagnostics{suffix}")
    fig.tight_layout()
    fig.savefig(out_path, dpi=150)
    plt.close(fig)
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def main() -> None:
    """Command-line entry point: read stage logs and write both figures.

    ``--grpo-root`` and ``--out-dir`` are joined onto the parent directory of
    ``_HERE``. If no logs are found the process exits with status 2, unless
    ``--allow-placeholder`` is given, in which case synthetic logs from
    `_placeholder_logs` are plotted and the output is tagged as placeholder.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--grpo-root",
        default="checkpoints/defender_grpo",
        help="Directory containing <stage>/training_log.jsonl files.",
    )
    parser.add_argument("--out-dir", default="eval/results")
    parser.add_argument(
        "--allow-placeholder",
        action="store_true",
        help="Generate fake curves if real logs are missing (default off).",
    )
    args = parser.parse_args()

    base_dir = os.path.dirname(_HERE)
    grpo_root = os.path.join(base_dir, args.grpo_root)
    out_dir = os.path.join(base_dir, args.out_dir)
    os.makedirs(out_dir, exist_ok=True)

    stage_logs = _read_stage_logs(grpo_root)
    placeholder = not stage_logs
    if placeholder:
        if not args.allow_placeholder:
            print(
                f"No training logs found under {grpo_root}.\n"
                " - re-run after `python -m train.train_grpo ...` produces "
                "training_log.jsonl, or pass `--allow-placeholder` to render "
                "synthetic curves for the README scaffold.",
                file=sys.stderr,
            )
            sys.exit(2)
        stage_logs = _placeholder_logs()

    curves_path = os.path.join(out_dir, "training_curves.png")
    aux_path = os.path.join(out_dir, "training_kl_loss.png")
    _plot_curves(stage_logs, curves_path, placeholder)
    _plot_aux(stage_logs, aux_path, placeholder)

    tag = " [placeholder]" if placeholder else ""
    print(f"Wrote {curves_path} and {aux_path}" + tag)
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
if __name__ == "__main__":
|
| 220 |
+
main()
|
generator.py
ADDED
|
@@ -0,0 +1,365 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
generator.py — Deterministic, seeded incident generator.
|
| 3 |
+
|
| 4 |
+
Used by:
|
| 5 |
+
|
| 6 |
+
* `OpenSOCEnv` in `defender_only` mode, when the env needs to materialize a
|
| 7 |
+
self-contained incident for the defender (SFT warm-start, eval, smoke
|
| 8 |
+
tests, curriculum starter prompts).
|
| 9 |
+
* `train/sft_warmstart.py` to produce ~600 (incident, triage) pairs for
|
| 10 |
+
bootstrapping defender format learning.
|
| 11 |
+
* `eval/make_holdout.py` to build the frozen 200-incident eval set.
|
| 12 |
+
|
| 13 |
+
The generator emits `IncidentParams` instances; the env then materializes
|
| 14 |
+
them into `Incident` objects with a SIEM-style `Alert` summary. The
|
| 15 |
+
attacker is *not* required to use this generator — its only job is to give
|
| 16 |
+
the env a deterministic starting distribution for stages 1-4.
|
| 17 |
+
|
| 18 |
+
Seeding contract
|
| 19 |
+
----------------
|
| 20 |
+
``generate_incident(stage_id, seed=N)`` is referentially transparent:
|
| 21 |
+
calling it with the same arguments anywhere in the codebase returns the
|
| 22 |
+
exact same incident. This is what makes the held-out eval set
|
| 23 |
+
reproducible across machines.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import random
|
| 29 |
+
from datetime import datetime, timedelta, timezone
|
| 30 |
+
from typing import Callable, Dict, List, Tuple
|
| 31 |
+
|
| 32 |
+
from schema import (
|
| 33 |
+
Alert,
|
| 34 |
+
EventType,
|
| 35 |
+
IncidentCategory,
|
| 36 |
+
IncidentParams,
|
| 37 |
+
TriageAction,
|
| 38 |
+
make_event,
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
# Time helpers
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
|
| 46 |
+
def _ts_iter(start: datetime, n: int, step_s: int = 5) -> List[str]:
    """Return ``n`` evenly spaced ISO-8601 UTC timestamps beginning at ``start``.

    Args:
        start: First timestamp. A naive datetime is taken to already be in
            UTC. An aware datetime is converted to UTC first, so the ``Z``
            suffix is always truthful (previously it was relabelled as UTC
            without conversion).
        n: Number of timestamps to produce; ``n <= 0`` yields an empty list.
        step_s: Gap between consecutive timestamps, in seconds.

    Returns:
        Strings formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    if start.tzinfo is not None and start.utcoffset() is not None:
        origin = start.astimezone(timezone.utc)
    else:
        origin = start.replace(tzinfo=timezone.utc)
    return [
        (origin + timedelta(seconds=step_s * i)).strftime("%Y-%m-%dT%H:%M:%SZ")
        for i in range(n)
    ]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _start_time(rng: random.Random) -> datetime:
    """Pick a start time on the generator's fixed reference day, 2026-04-25.

    The day is hard-coded rather than read from the clock, so the result
    depends only on ``rng`` and seeded generation stays reproducible.

    Returns:
        A naive datetime between 00:00 and 23:00 inclusive on the reference
        day, at whole-minute resolution.
    """
    base = datetime(2026, 4, 25, 0, 0, 0)
    # One draw from rng: minutes past midnight, capped at 23 hours.
    minutes = rng.randint(0, 60 * 23)
    return base + timedelta(minutes=minutes)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
# Single-event template builders, keyed by *true* ground-truth label
|
| 65 |
+
# ---------------------------------------------------------------------------
|
| 66 |
+
|
| 67 |
+
def _benign_login(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a one-event benign incident: a successful login from a 10.0.0.x address."""
    # Draw order (start time, user, address) must not change: it determines
    # what a given seed produces.
    stamp = _ts_iter(_start_time(rng), 1)[0]
    account = rng.choice(["alice", "bob", "carol", "dave", "erin"])
    origin = f"10.0.0.{rng.randint(2, 200)}"
    login = make_event(
        1, 0, EventType.AUTH_LOGIN_SUCCESS, stamp,
        source="identity", user=account, src_ip=origin,
    )
    return IncidentCategory.BENIGN_NOISE, [login]
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _benign_internal_beacon(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a one-event benign incident: a periodic beacon to a 10.0.x.x address."""
    # Draw order (start time, two address octets, interval) is fixed by the seed contract.
    stamp = _ts_iter(_start_time(rng), 1)[0]
    target = f"10.0.{rng.randint(0, 255)}.{rng.randint(2, 250)}"
    period = rng.choice([30, 60, 90])
    beacon = make_event(
        1, 0, EventType.NET_BEACON, stamp,
        source="network", dst_ip=target, interval_s=period,
    )
    return IncidentCategory.BENIGN_NOISE, [beacon]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _monitor_brute_force(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a brute-force incident: 5-9 failed logins against one account.

    The failures are 5 seconds apart (the `_ts_iter` default) and each one
    gets its own randomly drawn 203.0.113.x source address.
    """
    # Draw order: failure count, start time, user, then one address per event.
    attempts = rng.randint(5, 9)
    stamps = _ts_iter(_start_time(rng), attempts)
    account = rng.choice(["root", "admin", "service_acct", "alice"])
    failures = []
    for idx, stamp in enumerate(stamps):
        failures.append(
            make_event(
                1, idx, EventType.AUTH_LOGIN_FAILURE, stamp,
                source="identity", user=account,
                src_ip=f"203.0.113.{rng.randint(2, 250)}",
            )
        )
    return IncidentCategory.BRUTE_FORCE, failures
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _monitor_port_scan(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a one-event incident: a port-scan hit from a 203.0.113.x address.

    NOTE(review): the category returned is BENIGN_NOISE even though this
    builder is registered under the MONITOR label - confirm that is intended.
    """
    # Draw order: start time, source address, port.
    stamp = _ts_iter(_start_time(rng), 1)[0]
    scanner = f"203.0.113.{rng.randint(2, 250)}"
    probed_port = rng.choice([22, 23, 445, 3389, 5985])
    hit = make_event(
        1, 0, EventType.NET_PORT_SCAN_HIT, stamp,
        source="network", src_ip=scanner, port=probed_port,
    )
    return IncidentCategory.BENIGN_NOISE, [hit]
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _quarantine_lolbin(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a two-event malware incident: encoded PowerShell, then an .exe drop.

    Event 0 is ``powershell.exe`` launched by an office/browser parent with an
    ``-enc`` argument of 40-80 filler characters; event 1 writes an executable
    under ``C:\\Users\\Public``.
    """
    # Draw order: start time, parent process, payload length, file stem.
    launch_ts, write_ts = _ts_iter(_start_time(rng), 2)
    launcher = rng.choice(["outlook.exe", "winword.exe", "excel.exe", "chrome.exe"])
    encoded_arg = "-enc " + "A" * rng.randint(40, 80)
    stem = rng.choice(['payload', 'svc', 'tmp'])

    launch = make_event(
        1, 0, EventType.PROC_LOLBIN, launch_ts,
        source="endpoint",
        process="powershell.exe",
        parent_process=launcher,
        cmd=encoded_arg,
    )
    drop = make_event(
        1, 1, EventType.FILE_WRITE, write_ts,
        source="endpoint",
        path=f"C:\\Users\\Public\\{stem}.exe",
    )
    return IncidentCategory.MALWARE_EXECUTION, [launch, drop]
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _quarantine_edr_high(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a one-event malware incident: a high-severity EDR behaviour match."""
    # Draw order: start time, rule number, host number.
    stamp = _ts_iter(_start_time(rng), 1)[0]
    rule = f"EDR-{rng.randint(1000, 9999)}"
    machine = f"host-{rng.randint(1, 50):03d}"
    match = make_event(
        1, 0, EventType.EDR_BEHAVIOR_MATCH, stamp,
        source="edr", severity="high", rule_id=rule, host=machine,
    )
    return IncidentCategory.MALWARE_EXECUTION, [match]
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _block_ip_external_beacon(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a one-event C2 incident: a periodic beacon to one of three external IPs."""
    # Draw order: start time, destination, interval.
    stamp = _ts_iter(_start_time(rng), 1)[0]
    destination = rng.choice(["203.0.113.5", "198.51.100.42", "185.220.101.7"])
    period = rng.choice([30, 60, 90])
    beacon = make_event(
        1, 0, EventType.NET_BEACON, stamp,
        source="network", dst_ip=destination, interval_s=period,
    )
    return IncidentCategory.C2_BEACON, [beacon]
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _block_ip_phish_url(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a one-event phishing incident: a user clicks a ``login-update`` link."""
    # Draw order: start time, TLD, user.
    stamp = _ts_iter(_start_time(rng), 1)[0]
    suffix = rng.choice([".ru", ".cn", ".top", ".xyz", ".click"])
    victim = rng.choice(["alice", "bob", "carol"])
    click = make_event(
        1, 0, EventType.EMAIL_LINK_CLICKED, stamp,
        source="email",
        url=f"https://login-update{suffix}/secure",
        user=victim,
    )
    return IncidentCategory.PHISHING, [click]
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _escalate_combined(rng: random.Random) -> Tuple[IncidentCategory, list]:
    """Build a three-event chain: encoded PowerShell, external beacon, admin grant.

    Events are 15 seconds apart. Only the start time and the user receiving
    the admin role are random; every other field is fixed.
    """
    # Draw order: start time, then the user for the privilege grant.
    exec_ts, beacon_ts, grant_ts = _ts_iter(_start_time(rng), 3, step_s=15)
    grantee = rng.choice(["alice", "bob"])

    chain = [
        make_event(
            1, 0, EventType.PROC_LOLBIN, exec_ts,
            source="endpoint",
            process="powershell.exe", parent_process="outlook.exe",
            cmd="-enc " + "B" * 60,
        ),
        make_event(
            1, 1, EventType.NET_BEACON, beacon_ts,
            source="network", dst_ip="203.0.113.5", interval_s=30,
        ),
        make_event(
            1, 2, EventType.AUTH_PRIVILEGE_GRANT, grant_ts,
            source="identity", user=grantee, role="admin",
        ),
    ]
    return IncidentCategory.MALWARE_EXECUTION, chain
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# Mapping: ground-truth label → list of template builders
|
| 191 |
+
TEMPLATES: Dict[TriageAction, List[Callable[[random.Random], Tuple[IncidentCategory, list]]]] = {
|
| 192 |
+
TriageAction.DISMISS: [_benign_login, _benign_internal_beacon],
|
| 193 |
+
TriageAction.MONITOR: [_monitor_brute_force, _monitor_port_scan],
|
| 194 |
+
TriageAction.QUARANTINE_HOST: [_quarantine_lolbin, _quarantine_edr_high],
|
| 195 |
+
TriageAction.BLOCK_IP: [_block_ip_external_beacon, _block_ip_phish_url],
|
| 196 |
+
TriageAction.ESCALATE: [_escalate_combined],
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# ---------------------------------------------------------------------------
|
| 201 |
+
# Stage configs
|
| 202 |
+
# ---------------------------------------------------------------------------
|
| 203 |
+
|
| 204 |
+
# Each stage has:
|
| 205 |
+
# - label_distribution: probability mass over ground-truth labels (must sum to 1)
|
| 206 |
+
# - decoys: number of *additional* benign-looking events to splice in
|
| 207 |
+
# - jitter: how much we perturb fields (0.0 = none, 1.0 = max)
|
| 208 |
+
STAGE_CONFIGS: Dict[str, dict] = {
|
| 209 |
+
"stage1_basic": {
|
| 210 |
+
"label_distribution": {
|
| 211 |
+
TriageAction.DISMISS: 0.30,
|
| 212 |
+
TriageAction.MONITOR: 0.20,
|
| 213 |
+
TriageAction.QUARANTINE_HOST: 0.20,
|
| 214 |
+
TriageAction.BLOCK_IP: 0.20,
|
| 215 |
+
TriageAction.ESCALATE: 0.10,
|
| 216 |
+
},
|
| 217 |
+
"decoys": 0,
|
| 218 |
+
"jitter": 0.0,
|
| 219 |
+
},
|
| 220 |
+
"stage2_multi": {
|
| 221 |
+
"label_distribution": {
|
| 222 |
+
TriageAction.DISMISS: 0.20,
|
| 223 |
+
TriageAction.MONITOR: 0.20,
|
| 224 |
+
TriageAction.QUARANTINE_HOST: 0.25,
|
| 225 |
+
TriageAction.BLOCK_IP: 0.20,
|
| 226 |
+
TriageAction.ESCALATE: 0.15,
|
| 227 |
+
},
|
| 228 |
+
"decoys": 1,
|
| 229 |
+
"jitter": 0.2,
|
| 230 |
+
},
|
| 231 |
+
"stage3_mixed": {
|
| 232 |
+
"label_distribution": {
|
| 233 |
+
TriageAction.DISMISS: 0.25,
|
| 234 |
+
TriageAction.MONITOR: 0.25,
|
| 235 |
+
TriageAction.QUARANTINE_HOST: 0.20,
|
| 236 |
+
TriageAction.BLOCK_IP: 0.15,
|
| 237 |
+
TriageAction.ESCALATE: 0.15,
|
| 238 |
+
},
|
| 239 |
+
"decoys": 2,
|
| 240 |
+
"jitter": 0.4,
|
| 241 |
+
},
|
| 242 |
+
"stage4_adversarial": {
|
| 243 |
+
"label_distribution": {
|
| 244 |
+
TriageAction.DISMISS: 0.30,
|
| 245 |
+
TriageAction.MONITOR: 0.25,
|
| 246 |
+
TriageAction.QUARANTINE_HOST: 0.15,
|
| 247 |
+
TriageAction.BLOCK_IP: 0.15,
|
| 248 |
+
TriageAction.ESCALATE: 0.15,
|
| 249 |
+
},
|
| 250 |
+
"decoys": 3,
|
| 251 |
+
"jitter": 0.7,
|
| 252 |
+
},
|
| 253 |
+
}
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def _sample_label(rng: random.Random, dist: Dict[TriageAction, float]) -> TriageAction:
    """Draw one label from ``dist`` with probability proportional to its weight.

    Consumes exactly one ``rng.choices`` call; candidates are offered in the
    dict's iteration order.
    """
    candidates = list(dist)
    masses = list(dist.values())
    (picked,) = rng.choices(candidates, weights=masses, k=1)
    return picked
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
def _make_decoy_events(rng: random.Random, n_decoys: int, start_idx: int) -> list:
    """Build ``n_decoys`` filler events meant not to affect the incident's label.

    Each decoy is one of three kinds, chosen uniformly: a successful login
    from 10.0.0.x, a DNS query for a well-known or internal domain, or an
    outbound flow to a 10.0.x.x address. Decoys are 2 seconds apart and start
    at their own randomly drawn time. ``start_idx`` is the sequence number
    passed to ``make_event`` for the first decoy; later ones count up from it.
    """
    stamps = _ts_iter(_start_time(rng), n_decoys, step_s=2)
    produced = []
    for offset, stamp in enumerate(stamps):
        # Kind is drawn before any of the event's fields (seed contract).
        kind = rng.randint(0, 2)
        seq = start_idx + offset
        if kind == 0:
            event = make_event(
                1, seq, EventType.AUTH_LOGIN_SUCCESS, stamp,
                source="identity",
                user=rng.choice(["alice", "bob", "carol", "dave"]),
                src_ip=f"10.0.0.{rng.randint(2, 250)}",
            )
        elif kind == 1:
            event = make_event(
                1, seq, EventType.NET_DNS_QUERY, stamp,
                source="network",
                domain=rng.choice(["github.com", "google.com", "internal.corp"]),
            )
        else:
            event = make_event(
                1, seq, EventType.NET_OUTBOUND, stamp,
                source="network",
                dst_ip=f"10.0.{rng.randint(0, 255)}.{rng.randint(2, 250)}",
                bytes_out=rng.randint(1_000, 100_000),
            )
        produced.append(event)
    return produced
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
def _renumber_and_resort(events: list) -> list:
    """Return copies of ``events`` in timestamp order with ids ``L1-0``, ``L1-1``, ...

    Each copy is rebuilt through the event's own class from its ``timestamp``,
    ``source``, ``event_type`` and ``fields``; only ``log_id`` is replaced.
    Events with equal timestamps keep their input order. The input objects
    are not modified, and ``fields`` is shared with the original, not copied.
    """
    chronological = sorted(events, key=lambda ev: ev.timestamp)
    return [
        type(ev)(
            log_id=f"L1-{position}",
            timestamp=ev.timestamp,
            source=ev.source,
            event_type=ev.event_type,
            fields=ev.fields,
        )
        for position, ev in enumerate(chronological)
    ]
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
# ---------------------------------------------------------------------------
|
| 314 |
+
# Public API
|
| 315 |
+
# ---------------------------------------------------------------------------
|
| 316 |
+
|
| 317 |
+
def generate_incident(stage_id: str, seed: int) -> IncidentParams:
    """Build the `IncidentParams` that ``(stage_id, seed)`` deterministically maps to.

    All randomness comes from a private ``random.Random(seed)``, consumed in a
    fixed order: label, template, the template's own draws, then decoys.

    Raises:
        ValueError: ``stage_id`` is not a key of ``STAGE_CONFIGS``.
    """
    stage_cfg = STAGE_CONFIGS.get(stage_id)
    if stage_cfg is None:
        raise ValueError(
            f"Unknown stage_id {stage_id!r}; choose from {list(STAGE_CONFIGS)}"
        )

    rng = random.Random(seed)
    label = _sample_label(rng, stage_cfg["label_distribution"])
    builder = rng.choice(TEMPLATES[label])
    category, core = builder(rng)

    # Decoy sequence numbers continue after the core events; the merged list
    # is then sorted by timestamp and given fresh log ids.
    decoys = _make_decoy_events(rng, stage_cfg["decoys"], start_idx=len(core))
    merged = _renumber_and_resort(core + decoys)

    return IncidentParams(
        target_label=label,
        category=category,
        events=merged,
        narrative="",
    )
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def make_alert(params: IncidentParams, alert_id: str) -> Alert:
    """Summarise an incident as an `Alert`.

    Severity is looked up from ``params.target_label`` (``"medium"`` for an
    unlisted label). ``user`` and ``host`` come from the first event's fields,
    defaulting to ``"user-001"`` / ``"host-001"`` when that event lacks them.
    Raises ``IndexError`` if ``params.events`` is empty.
    """
    severity = {
        TriageAction.DISMISS: "low",
        TriageAction.MONITOR: "medium",
        TriageAction.QUARANTINE_HOST: "high",
        TriageAction.BLOCK_IP: "high",
        TriageAction.ESCALATE: "critical",
    }.get(params.target_label, "medium")

    lead = params.events[0]
    lead_fields = lead.fields
    user = str(lead_fields.get("user", "user-001"))
    host = str(lead_fields.get("host", "host-001"))

    return Alert(
        alert_id=alert_id,
        category=params.category,
        severity=severity,
        summary=f"{params.category.value}: {len(params.events)} event(s); first={lead.event_type.value}",
        host=host,
        user=user,
    )
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
__all__ = ["generate_incident", "make_alert", "STAGE_CONFIGS", "TEMPLATES"]
|
openenv.yaml
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: opensoc
|
| 2 |
+
version: "1.0.0"
|
| 3 |
+
description: >
|
| 4 |
+
OpenSOC is a self-play OpenEnv environment for training Security Operations Center
|
| 5 |
+
(SOC) triage agents. An attacker LLM crafts synthetic security incidents from a
|
| 6 |
+
constrained schema; a defender LLM (the trainee) reads the resulting alert and log
|
| 7 |
+
window and decides how to triage it. Ground-truth triage labels are computed by a
|
| 8 |
+
deterministic schema-side verifier — never read from attacker text — so the
|
| 9 |
+
environment is fully RLVR-compatible and resistant to self-play reward hacking.
|
| 10 |
+
|
| 11 |
+
author: opensoc
|
| 12 |
+
tags:
|
| 13 |
+
- openenv
|
| 14 |
+
- cybersecurity
|
| 15 |
+
- soc
|
| 16 |
+
- self-play
|
| 17 |
+
- multi-agent
|
| 18 |
+
- rlvr
|
| 19 |
+
- threat-detection
|
| 20 |
+
|
| 21 |
+
# Curriculum stages: an LLM trainee can move through these in order.
|
| 22 |
+
# task_id is used by /reset?task=<id>. Difficulty is purely the parameter
|
| 23 |
+
# distribution; the action space and rewards are identical across stages.
|
| 24 |
+
tasks:
|
| 25 |
+
- id: stage1_basic
|
| 26 |
+
difficulty: easy
|
| 27 |
+
description: >
|
| 28 |
+
Single-event incidents drawn from a small set of unambiguous templates
|
| 29 |
+
(one obviously benign, one obviously malicious per category). Used to
|
| 30 |
+
bootstrap defender format learning.
|
| 31 |
+
max_steps: 2
|
| 32 |
+
reward_range: [-1.5, 1.1]
|
| 33 |
+
|
| 34 |
+
- id: stage2_multi
|
| 35 |
+
difficulty: medium
|
| 36 |
+
description: >
|
| 37 |
+
Multi-event incidents where the malicious signal is spread across a
|
| 38 |
+
short log window. Tests temporal reasoning and rationale citation.
|
| 39 |
+
max_steps: 2
|
| 40 |
+
reward_range: [-1.5, 1.1]
|
| 41 |
+
|
| 42 |
+
- id: stage3_mixed
|
| 43 |
+
difficulty: hard
|
| 44 |
+
description: >
|
| 45 |
+
Incidents where benign events are interleaved with malicious ones, and
|
| 46 |
+
some benign templates closely mimic malicious patterns. Tests
|
| 47 |
+
false-positive suppression.
|
| 48 |
+
max_steps: 2
|
| 49 |
+
reward_range: [-1.5, 1.1]
|
| 50 |
+
|
| 51 |
+
- id: stage4_adversarial
|
| 52 |
+
difficulty: adversarial
|
| 53 |
+
description: >
|
| 54 |
+
Attacker-controlled distribution (when run in self-play) or
|
| 55 |
+
held-out adversarial set (when run with a fixed dataset). Used as the
|
| 56 |
+
eval benchmark for trained agents.
|
| 57 |
+
max_steps: 2
|
| 58 |
+
reward_range: [-1.5, 1.1]
|
| 59 |
+
|
| 60 |
+
observation_space:
|
| 61 |
+
type: object
|
| 62 |
+
fields:
|
| 63 |
+
role:
|
| 64 |
+
type: string
|
| 65 |
+
enum: [attacker, defender]
|
| 66 |
+
description: Which side is expected to act on this turn.
|
| 67 |
+
alert:
|
| 68 |
+
type: object
|
| 69 |
+
description: SIEM-style alert summary visible to the defender.
|
| 70 |
+
fields:
|
| 71 |
+
alert_id: { type: string }
|
| 72 |
+
category: { type: string }
|
| 73 |
+
severity: { type: string, enum: [info, low, medium, high, critical] }
|
| 74 |
+
summary: { type: string }
|
| 75 |
+
host: { type: string }
|
| 76 |
+
user: { type: string }
|
| 77 |
+
log_window:
|
| 78 |
+
type: array
|
| 79 |
+
description: >
|
| 80 |
+
Ordered list of log events surrounding the alert. Each event is a dict
|
| 81 |
+
with log_id, timestamp, source, event_type, and a fields object.
|
| 82 |
+
attacker_brief:
|
| 83 |
+
type: object
|
| 84 |
+
description: >
|
| 85 |
+
Only populated on the attacker turn; tells the attacker the target
|
| 86 |
+
ground-truth label slot it should produce an incident for.
|
| 87 |
+
fields:
|
| 88 |
+
target_label: { type: string, enum: [dismiss, monitor, quarantine_host, block_ip, escalate] }
|
| 89 |
+
difficulty: { type: string, enum: [easy, medium, hard, adversarial] }
|
| 90 |
+
category_hint: { type: string }
|
| 91 |
+
step: { type: integer }
|
| 92 |
+
max_steps: { type: integer }
|
| 93 |
+
last_action_feedback: { type: string }
|
| 94 |
+
done: { type: boolean }
|
| 95 |
+
|
| 96 |
+
action_space:
|
| 97 |
+
type: object
|
| 98 |
+
description: >
|
| 99 |
+
Exactly one of `craft_incident` (attacker turn) or `submit_triage`
|
| 100 |
+
(defender turn) should be non-null per /step call. Tool names are
|
| 101 |
+
deliberately non-reserved (no reset/step/state/close).
|
| 102 |
+
fields:
|
| 103 |
+
craft_incident:
|
| 104 |
+
type: object
|
| 105 |
+
description: >
|
| 106 |
+
Attacker action. The attacker proposes incident parameters; the env
|
| 107 |
+
validates them and computes the ground-truth label deterministically
|
| 108 |
+
from the params (NOT from any text the attacker writes).
|
| 109 |
+
fields:
|
| 110 |
+
target_label: { type: string, enum: [dismiss, monitor, quarantine_host, block_ip, escalate] }
|
| 111 |
+
category: { type: string }
|
| 112 |
+
events:
|
| 113 |
+
type: array
|
| 114 |
+
description: List of structured events to materialize.
|
| 115 |
+
items:
|
| 116 |
+
type: object
|
| 117 |
+
fields:
|
| 118 |
+
event_type: { type: string }
|
| 119 |
+
fields: { type: object }
|
| 120 |
+
narrative:
|
| 121 |
+
type: string
|
| 122 |
+
description: Free-text scratchpad; ignored by the verifier.
|
| 123 |
+
submit_triage:
|
| 124 |
+
type: object
|
| 125 |
+
description: Defender action; choose one triage action and cite a log id.
|
| 126 |
+
fields:
|
| 127 |
+
action: { type: string, enum: [dismiss, monitor, quarantine_host, block_ip, escalate] }
|
| 128 |
+
cited_log_id: { type: string, description: "ID of the log event that drove the decision." }
|
| 129 |
+
rationale: { type: string }
|
| 130 |
+
|
| 131 |
+
reward:
|
| 132 |
+
type: float
|
| 133 |
+
range: [-1.5, 1.1]
|
| 134 |
+
description: >
|
| 135 |
+
Per-turn reward.
|
| 136 |
+
|
| 137 |
+
Defender (per submit_triage):
|
| 138 |
+
+1.0 if action matches the env-computed ground-truth label
|
| 139 |
+
-1.0 if action == 'dismiss' but ground truth is any other label (missed-malicious)
|
| 140 |
+
-0.3 if action ∈ {block_ip, quarantine_host} and ground truth is dismiss or monitor (over-react)
|
| 141 |
+
-0.05 if action == 'escalate' and ground truth would have been a cheaper action
|
| 142 |
+
+0.1 bonus if cited_log_id matches the schema-flagged triggering event id
|
| 143 |
+
|
| 144 |
+
Attacker (per craft_incident, scored after the defender turn):
|
| 145 |
+
+1.0 iff defender misclassified AND incident passed the plausibility check
|
| 146 |
+
-0.5 if the schema validator rejected the params
|
| 147 |
+
+0.2 novelty bonus on rare feature combos within the rolling batch
|
| 148 |
+
0.0 if implausible (gibberish penalty)
|
| 149 |
+
|
| 150 |
+
endpoints:
|
| 151 |
+
reset: POST /reset
|
| 152 |
+
step: POST /step
|
| 153 |
+
state: GET /state
|
| 154 |
+
grade: POST /grade
|
| 155 |
+
tasks: GET /tasks
|
| 156 |
+
health: GET /health
|
| 157 |
+
demo: GET /demo # Gradio "before vs after" UI for human reviewers
|
| 158 |
+
|
| 159 |
+
docker:
|
| 160 |
+
port: 7860
|
| 161 |
+
|
| 162 |
+
baseline_scores:
|
| 163 |
+
stage1_basic: 0.65
|
| 164 |
+
stage2_multi: 0.45
|
| 165 |
+
stage3_mixed: 0.30
|
| 166 |
+
stage4_adversarial: 0.15
|
pyproject.toml
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "opensoc"
|
| 3 |
+
version = "1.0.0"
|
| 4 |
+
description = "Self-play SOC triage OpenEnv environment for training cybersecurity defender LLMs."
|
| 5 |
+
requires-python = ">=3.10"
|
| 6 |
+
authors = [{ name = "OpenSOC team" }]
|
| 7 |
+
license = { text = "BSD-3-Clause" }
|
| 8 |
+
readme = "README.md"
|
| 9 |
+
dependencies = [
|
| 10 |
+
"fastapi==0.115.5",
|
| 11 |
+
"uvicorn[standard]==0.32.1",
|
| 12 |
+
"pydantic==2.10.3",
|
| 13 |
+
"requests==2.32.3",
|
| 14 |
+
"httpx==0.27.2",
|
| 15 |
+
"openenv-core>=0.2.0",
|
| 16 |
+
"pyyaml==6.0.2",
|
| 17 |
+
"gradio>=4.40,<5",
|
| 18 |
+
]
|
| 19 |
+
|
| 20 |
+
[project.optional-dependencies]
|
| 21 |
+
dev = [
|
| 22 |
+
"pytest>=8.0",
|
| 23 |
+
"ruff>=0.6",
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
[build-system]
|
| 27 |
+
requires = ["setuptools>=68"]
|
| 28 |
+
build-backend = "setuptools.build_meta"
|
| 29 |
+
|
| 30 |
+
[tool.setuptools]
|
| 31 |
+
py-modules = ["env", "app_runtime", "server", "schema", "generator", "verifier", "rubric", "demo_app", "demo_data"]
|
| 32 |
+
|
| 33 |
+
[tool.setuptools.packages.find]
|
| 34 |
+
include = ["tasks*", "client*", "train*", "eval*"]
|
| 35 |
+
|
| 36 |
+
[tool.pytest.ini_options]
|
| 37 |
+
testpaths = ["tests"]
|
| 38 |
+
addopts = "-q"
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.5
|
| 2 |
+
uvicorn[standard]==0.32.1
|
| 3 |
+
pydantic==2.10.3
|
| 4 |
+
requests==2.32.3
|
| 5 |
+
httpx==0.27.2
|
| 6 |
+
openenv-core>=0.2.0
|
| 7 |
+
pyyaml==6.0.2
|
| 8 |
+
gradio>=4.40,<5
|
rubric.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
rubric.py — Layered, composable reward for OpenSOC.
|
| 3 |
+
|
| 4 |
+
The reward is the task definition. This module exposes two pure functions
|
| 5 |
+
the env calls at episode end:
|
| 6 |
+
|
| 7 |
+
* `score_defender(action, ground_truth, triggering_log_id, cited_log_id)`
|
| 8 |
+
* `score_attacker(plausible, defender_correct, novelty)`
|
| 9 |
+
|
| 10 |
+
Each returns ``(reward, breakdown)``. Both functions are pure and easy to
|
| 11 |
+
unit-test (see `tests/test_rubric.py`). The numerical ranges deliberately
|
| 12 |
+
match `openenv.yaml`'s declared `[-1.5, 1.1]` reward range so the manifest
|
| 13 |
+
stays a faithful description of behaviour.
|
| 14 |
+
|
| 15 |
+
Reward design choices
|
| 16 |
+
---------------------
|
| 17 |
+
1. We grade `dismiss-on-malicious` (missed malicious) much more harshly than
|
| 18 |
+
over-reaction; in real SOC workflows missing a real attacker has a much
|
| 19 |
+
larger blast radius than briefly quarantining a benign host.
|
| 20 |
+
2. Containment over-reaction (`block_ip` / `quarantine_host` on benign) is
|
| 21 |
+
penalized but not catastrophically — these are reversible operationally.
|
| 22 |
+
3. The `+0.1` cited-log-id bonus is a structural verifier: a defender that
|
| 23 |
+
guesses the right action without citing the right event gets less than a
|
| 24 |
+
defender that cites the right event, which discourages "always pick the
|
| 25 |
+
most aggressive action" shortcuts.
|
| 26 |
+
4. Attacker reward is gated on plausibility, not on how angry the narrative
|
| 27 |
+
sounds. This is what neutralizes the classic self-play reward hack.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
|
| 32 |
+
from typing import Dict, Tuple
|
| 33 |
+
|
| 34 |
+
from schema import (
|
| 35 |
+
ACTION_COST,
|
| 36 |
+
CONTAINMENT_ACTIONS,
|
| 37 |
+
TriageAction,
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
# Defender scoring
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
|
| 45 |
+
def score_defender(
    action: TriageAction,
    ground_truth: TriageAction,
    triggering_log_id: str,
    cited_log_id: str,
) -> Tuple[float, Dict[str, float]]:
    """Return ``(reward, breakdown)`` for a defender's submit_triage.

    Args:
        action: Triage action the defender submitted. Either a
            ``TriageAction`` member or its raw string value (e.g. ``"dismiss"``).
        ground_truth: The action considered correct; same accepted forms.
        triggering_log_id: Log id that earns the citation bonus.
        cited_log_id: Log id the defender cited.

    Returns:
        The scalar reward and a dict mapping every component that fired to
        its signed contribution. Exactly one action component is present
        (``correct_action``, ``missed_malicious``, ``over_react_containment``,
        ``unnecessary_escalation`` or ``cost_gap_miss``), plus
        ``correct_citation_bonus`` when the cited id matches.

    Raises:
        ValueError: If ``action`` or ``ground_truth`` is not a valid
            ``TriageAction`` value.
    """
    # TriageAction is a str-enum and actions frequently arrive as plain
    # strings (JSON payloads). Identity checks and ACTION_COST lookups only
    # work on real members, so normalise once up front.
    action = TriageAction(action)
    ground_truth = TriageAction(ground_truth)

    breakdown: Dict[str, float] = {}
    reward = 0.0

    if action is ground_truth:
        reward += 1.0
        breakdown["correct_action"] = 1.0
    elif action is TriageAction.DISMISS:
        # Missed-malicious is by far the worst error class. (ground_truth
        # cannot be DISMISS here: that case was handled as correct above.)
        reward -= 1.0
        breakdown["missed_malicious"] = -1.0
    elif action in CONTAINMENT_ACTIONS and ground_truth in (
        TriageAction.DISMISS,
        TriageAction.MONITOR,
    ):
        # Contained a host/IP that only deserved dismiss/monitor.
        reward -= 0.3
        breakdown["over_react_containment"] = -0.3
    elif (
        action is TriageAction.ESCALATE
        and ACTION_COST[ground_truth] < ACTION_COST[TriageAction.ESCALATE]
    ):
        reward -= 0.05
        breakdown["unnecessary_escalation"] = -0.05
    else:
        # Any other mismatch (e.g. monitor-when-block-ip-was-needed) is
        # graded as a partial-credit miss using the cost gap.
        # NOTE(review): two different actions with equal cost (block_ip vs
        # quarantine_host) give a gap of 0 and therefore no penalty — confirm
        # that is intended.
        cost_gap = abs(ACTION_COST[action] - ACTION_COST[ground_truth])
        penalty = -0.2 * cost_gap
        reward += penalty
        breakdown["cost_gap_miss"] = penalty

    # The citation bonus is independent of whether the action was right.
    if cited_log_id == triggering_log_id:
        reward += 0.1
        breakdown["correct_citation_bonus"] = 0.1

    return reward, breakdown
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
# Attacker scoring
|
| 92 |
+
# ---------------------------------------------------------------------------
|
| 93 |
+
|
| 94 |
+
def score_attacker(
    *,
    plausible: bool,
    schema_violation: bool,
    defender_correct: bool,
    novelty: float = 0.0,
) -> Tuple[float, Dict[str, float]]:
    """Return ``(reward, breakdown)`` for an attacker's craft_incident.

    Scoring is gated in order: a schema violation short-circuits to ``-0.5``;
    an implausible incident short-circuits to ``0.0``; only a plausible
    incident can earn the fooled-defender point and the novelty bonus.

    Args:
        plausible: Did the env's plausibility checker accept the incident?
        schema_violation: Did pydantic / model_validator reject the
            attacker's params (e.g. duplicate log ids, bad timestamps)? When
            true, `plausible` should be False.
        defender_correct: Did the defender pick the env-computed ground-truth
            label? The attacker is rewarded for fooling the defender.
        novelty: Optional [0, 1] score for how rare this incident's feature
            combination is in the recent rollout batch (drives curriculum).
            Values above 1 are clamped; values at or below 0 earn nothing.

    Returns:
        The scalar reward and a dict of the components that contributed.
    """
    # Gate 1: malformed params are actively penalised.
    if schema_violation:
        return -0.5, {"schema_violation": -0.5}

    # Gate 2: gibberish that satisfies pydantic but fails plausibility earns
    # nothing, whatever the defender did with it.
    if not plausible:
        return 0.0, {"implausible": 0.0}

    components: Dict[str, float] = {}
    total = 0.0

    fooled = not defender_correct
    if fooled:
        total += 1.0
        components["fooled_defender"] = 1.0

    if novelty > 0.0:
        novelty_bonus = 0.2 * max(0.0, min(1.0, novelty))
        total += novelty_bonus
        components["novelty_bonus"] = novelty_bonus

    return total, components
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
__all__ = ["score_defender", "score_attacker"]
|
schema.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
schema.py — OpenSOC incident & action schema.
|
| 3 |
+
|
| 4 |
+
This module is the single source of truth for what the attacker is allowed to
|
| 5 |
+
emit and what the defender is allowed to respond with. The verifier and
|
| 6 |
+
rubric both depend on the constraints here; they should never be relaxed
|
| 7 |
+
without updating the corresponding tests in `tests/test_schema.py` and
|
| 8 |
+
`tests/test_verifier.py`.
|
| 9 |
+
|
| 10 |
+
Design principles
|
| 11 |
+
-----------------
|
| 12 |
+
1. The *attacker* controls structured parameters (event types, field values),
|
| 13 |
+
never the ground-truth label directly. The label is derived deterministically
|
| 14 |
+
from the params by `verifier.compute_ground_truth` so that the reward can
|
| 15 |
+
never be hacked by attacker text.
|
| 16 |
+
2. Every event has a stable `log_id` of the form `L<turn>-<n>` so that the
|
| 17 |
+
defender can cite a triggering event and earn a small bonus. This is
|
| 18 |
+
regex-verifiable.
|
| 19 |
+
3. The defender's action set is a fixed enum of five SOC responses ranked by
|
| 20 |
+
"cost" (dismiss = cheapest, escalate = most expensive). This lets the
|
| 21 |
+
rubric grade over- vs under-reaction.
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import enum
|
| 27 |
+
import ipaddress
|
| 28 |
+
import re
|
| 29 |
+
from typing import Any, Dict, List, Optional
|
| 30 |
+
|
| 31 |
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ---------------------------------------------------------------------------
|
| 35 |
+
# Defender action set (fixed enum, ordered by escalation cost)
|
| 36 |
+
# ---------------------------------------------------------------------------
|
| 37 |
+
|
| 38 |
+
class TriageAction(str, enum.Enum):
    # The five responses a defender may submit. Because the enum mixes in
    # `str`, each member compares equal to its lowercase string value.
    DISMISS = "dismiss"
    MONITOR = "monitor"
    QUARANTINE_HOST = "quarantine_host"
    BLOCK_IP = "block_ip"
    ESCALATE = "escalate"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Cost ordering — used by rubric to compute over-/under-reaction penalties.
|
| 47 |
+
# Higher number = more disruptive / expensive action.
|
| 48 |
+
ACTION_COST: Dict[TriageAction, int] = {
|
| 49 |
+
TriageAction.DISMISS: 0,
|
| 50 |
+
TriageAction.MONITOR: 1,
|
| 51 |
+
TriageAction.QUARANTINE_HOST: 2,
|
| 52 |
+
TriageAction.BLOCK_IP: 2,
|
| 53 |
+
TriageAction.ESCALATE: 3,
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# Actions that are considered "containment" — used by rubric to detect
|
| 58 |
+
# over-reaction on benign incidents.
|
| 59 |
+
CONTAINMENT_ACTIONS = {TriageAction.QUARANTINE_HOST, TriageAction.BLOCK_IP}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
# Event taxonomy
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
class EventType(str, enum.Enum):
    # Closed taxonomy of log event kinds. Values are dotted
    # "<family>.<event>" strings, grouped below by family.

    # Auth / identity
    AUTH_LOGIN_SUCCESS = "auth.login_success"
    AUTH_LOGIN_FAILURE = "auth.login_failure"
    AUTH_PASSWORD_RESET = "auth.password_reset"
    AUTH_MFA_FAILURE = "auth.mfa_failure"
    AUTH_PRIVILEGE_GRANT = "auth.privilege_grant"

    # Process
    PROC_START = "proc.start"
    PROC_PARENT_MISMATCH = "proc.parent_mismatch"
    PROC_LOLBIN = "proc.lolbin_use"

    # Network
    NET_OUTBOUND = "net.outbound_connection"
    NET_DNS_QUERY = "net.dns_query"
    NET_BEACON = "net.beacon"
    NET_PORT_SCAN_HIT = "net.port_scan_hit"

    # File / object
    FILE_WRITE = "file.write"
    FILE_DELETE = "file.delete"
    FILE_RENAME_DOUBLE_EXT = "file.rename_double_ext"

    # Email / phishing
    EMAIL_RECEIVED = "email.received"
    EMAIL_LINK_CLICKED = "email.link_clicked"
    EMAIL_ATTACHMENT_OPENED = "email.attachment_opened"

    # Cloud / SaaS
    CLOUD_API_CALL = "cloud.api_call"
    CLOUD_KEY_CREATED = "cloud.key_created"

    # Endpoint detection signals
    EDR_BEHAVIOR_MATCH = "edr.behavior_match"
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# Categories the attacker can target (informational for shaping; final
|
| 104 |
+
# triage label is computed from parameter content, not from `category`).
|
| 105 |
+
class IncidentCategory(str, enum.Enum):
    # Coarse incident family attached to an alert / attacker params.
    # BENIGN_NOISE is the only non-attack category in the set.
    BENIGN_NOISE = "benign_noise"
    BRUTE_FORCE = "brute_force"
    PHISHING = "phishing"
    LATERAL_MOVEMENT = "lateral_movement"
    PRIVILEGE_ESCALATION = "privilege_escalation"
    DATA_EXFILTRATION = "data_exfiltration"
    MALWARE_EXECUTION = "malware_execution"
    C2_BEACON = "c2_beacon"
    INSIDER_DATA_ACCESS = "insider_data_access"
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# ---------------------------------------------------------------------------
|
| 118 |
+
# Constants used for plausibility checks
|
| 119 |
+
# ---------------------------------------------------------------------------
|
| 120 |
+
|
| 121 |
+
SEVERITIES = ("info", "low", "medium", "high", "critical")
|
| 122 |
+
LOG_ID_PATTERN = re.compile(r"^L\d+-\d+$")
|
| 123 |
+
ISO_TS_PATTERN = re.compile(
|
| 124 |
+
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z$"
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# Internal RFC1918 ranges — a connection from internal->internal does NOT
|
| 128 |
+
# look like exfil even if bytes are large. The verifier uses this.
|
| 129 |
+
INTERNAL_NETS = [
|
| 130 |
+
ipaddress.ip_network("10.0.0.0/8"),
|
| 131 |
+
ipaddress.ip_network("172.16.0.0/12"),
|
| 132 |
+
ipaddress.ip_network("192.168.0.0/16"),
|
| 133 |
+
]
|
| 134 |
+
|
| 135 |
+
# Living-off-the-land binaries the attacker can mark as `lolbin_use` events.
|
| 136 |
+
KNOWN_LOLBINS = {
|
| 137 |
+
"powershell.exe", "pwsh.exe", "cmd.exe", "wmic.exe", "rundll32.exe",
|
| 138 |
+
"regsvr32.exe", "mshta.exe", "certutil.exe", "bitsadmin.exe",
|
| 139 |
+
"schtasks.exe", "wscript.exe", "cscript.exe",
|
| 140 |
+
}
|
| 141 |
+
|
| 142 |
+
# Suspicious LOLBin parents — a lolbin spawned by Office or a browser is
|
| 143 |
+
# strongly indicative of malicious code execution.
|
| 144 |
+
SUSPICIOUS_LOLBIN_PARENTS = {
|
| 145 |
+
"winword.exe", "excel.exe", "powerpnt.exe", "outlook.exe",
|
| 146 |
+
"chrome.exe", "msedge.exe", "firefox.exe",
|
| 147 |
+
}
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def is_internal_ip(ip_str: str) -> bool:
    """Report whether `ip_str` parses as an address inside `INTERNAL_NETS`.

    Anything that is not a syntactically valid IP address is treated as
    external (``False``) rather than raising.
    """
    try:
        candidate = ipaddress.ip_address(ip_str)
    except ValueError:
        return False

    for network in INTERNAL_NETS:
        if candidate in network:
            return True
    return False
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ---------------------------------------------------------------------------
|
| 160 |
+
# Pydantic models
|
| 161 |
+
# ---------------------------------------------------------------------------
|
| 162 |
+
|
| 163 |
+
class Event(BaseModel):
    """A single structured log event."""

    log_id: str = Field(..., description="Stable id of the form 'L<turn>-<n>'.")
    timestamp: str = Field(..., description="ISO-8601 UTC timestamp.")
    source: str = Field(
        "endpoint",
        description="Logical source bucket: endpoint | network | identity | email | cloud | edr",
    )
    event_type: EventType
    # Free-form, event-type-specific payload; not validated here.
    fields: Dict[str, Any] = Field(default_factory=dict)

    @field_validator("log_id")
    @classmethod
    def _check_log_id(cls, v: str) -> str:
        # fullmatch, not match: with `match`, the pattern's trailing `$` also
        # accepts a value ending in "\n" (e.g. "L1-2\n"), which would then
        # never compare equal to the clean id elsewhere.
        if not LOG_ID_PATTERN.fullmatch(v):
            raise ValueError(f"log_id must match L<turn>-<n>, got {v!r}")
        return v

    @field_validator("timestamp")
    @classmethod
    def _check_ts(cls, v: str) -> str:
        # Same trailing-newline concern as log_id. Only the textual shape is
        # checked; field ranges (month 13, hour 25, ...) are not.
        if not ISO_TS_PATTERN.fullmatch(v):
            raise ValueError(f"timestamp must be ISO-8601 UTC, got {v!r}")
        return v
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
class Alert(BaseModel):
    """SIEM-style alert summary the defender sees first."""

    alert_id: str
    category: IncidentCategory
    severity: str = Field("medium", description="One of: info, low, medium, high, critical.")
    summary: str
    host: str = "host-001"
    user: str = "user-001"

    @field_validator("severity")
    @classmethod
    def _check_severity(cls, v: str) -> str:
        # Accept only the labels listed in SEVERITIES; anything else is a
        # validation error.
        if v in SEVERITIES:
            return v
        raise ValueError(f"severity must be one of {SEVERITIES}, got {v!r}")
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def _timestamp_sort_key(ts: str) -> tuple[str, str]:
    """Return a key that orders ISO-8601 ``...Z`` timestamps chronologically.

    Comparing the raw strings is wrong once fractional seconds are optional:
    ``.`` sorts before ``Z``, so ``"...:00.5Z"`` would order *before*
    ``"...:00Z"``. The key splits off the fixed-width whole-second part (which
    does compare correctly as text) from the fraction digits; fraction digits
    with trailing zeros removed also compare correctly as text.
    """
    whole, _, fraction = ts.strip().rstrip("Z").partition(".")
    return whole, fraction.rstrip("0")


class IncidentParams(BaseModel):
    """Parameters the attacker chooses; the env materializes these into an Incident.

    The triage label that ends up in the defender's reward is derived
    *deterministically* from the events here by `verifier.compute_ground_truth`.
    `target_label` is purely a shaping hint: if the attacker's events imply a
    different label than `target_label`, the schema validator rejects the
    incident (so the attacker cannot lie about its own intent).
    """

    # NOTE(review): no validator in this class compares `target_label` with
    # the events; presumably that check lives with the verifier/env — confirm.
    target_label: TriageAction
    category: IncidentCategory
    events: List[Event]
    narrative: str = Field("", description="Free-text scratchpad; ignored by the verifier.")

    @field_validator("events")
    @classmethod
    def _events_nonempty(cls, v: List[Event]) -> List[Event]:
        # Bound the window: at least one event, at most 32.
        if not v:
            raise ValueError("events must contain at least one Event")
        if len(v) > 32:
            raise ValueError("events list capped at 32 entries")
        return v

    @model_validator(mode="after")
    def _events_have_unique_ids(self) -> "IncidentParams":
        # Duplicate ids would make a cited log id ambiguous.
        ids = [e.log_id for e in self.events]
        if len(set(ids)) != len(ids):
            raise ValueError("event log_ids must be unique")
        return self

    @model_validator(mode="after")
    def _timestamps_monotonic(self) -> "IncidentParams":
        # Compare chronological keys, not raw strings (see _timestamp_sort_key).
        keys = [_timestamp_sort_key(e.timestamp) for e in self.events]
        if any(later < earlier for earlier, later in zip(keys, keys[1:])):
            raise ValueError("event timestamps must be non-decreasing")
        return self
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
class Incident(BaseModel):
    """Materialized incident the env shows to the defender."""

    alert: Alert
    log_window: List[Event]
    triggering_log_id: str = Field(
        ..., description="The log_id the verifier deemed most diagnostic."
    )

    @field_validator("triggering_log_id")
    @classmethod
    def _check_trigger_id(cls, v: str) -> str:
        # fullmatch, not match: the pattern's trailing `$` would otherwise let
        # an id with a trailing newline through, and that id could never equal
        # a defender's cleanly formatted citation.
        if not LOG_ID_PATTERN.fullmatch(v):
            raise ValueError(f"triggering_log_id must match L<turn>-<n>, got {v!r}")
        return v
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
class CraftIncident(BaseModel):
    """Attacker-facing action wrapper."""

    # Same four fields as IncidentParams, but this wrapper declares no
    # validators of its own (each Event is still validated individually).
    target_label: TriageAction
    category: IncidentCategory
    events: List[Event]
    narrative: str = ""
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
class SubmitTriage(BaseModel):
    """Defender-facing action wrapper."""

    action: TriageAction
    cited_log_id: str = Field(..., description="Log id that drove the decision.")
    rationale: str = ""

    @field_validator("cited_log_id")
    @classmethod
    def _check_cited_log_id(cls, v: str) -> str:
        # fullmatch, not match: with `match`, the pattern's trailing `$`
        # accepts "L1-2\n". Such a citation would pass validation yet never
        # equal the triggering id, silently forfeiting the citation bonus.
        if not LOG_ID_PATTERN.fullmatch(v):
            raise ValueError(f"cited_log_id must match L<turn>-<n>, got {v!r}")
        return v
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
class Action(BaseModel):
    """OpenEnv-style action union: exactly one field non-null per /step."""

    # NOTE(review): both fields default to None and nothing in this model
    # enforces the "exactly one" rule; presumably the env checks it — confirm.
    craft_incident: Optional[CraftIncident] = None
    submit_triage: Optional[SubmitTriage] = None
|
| 293 |
+
|
| 294 |
+
|
| 295 |
+
# ---------------------------------------------------------------------------
|
| 296 |
+
# Convenience builders for tests / generators
|
| 297 |
+
# ---------------------------------------------------------------------------
|
| 298 |
+
|
| 299 |
+
def make_log_id(turn: int, n: int) -> str:
    """Build the canonical ``L<turn>-<n>`` log id for event `n` of `turn`."""
    return "L{}-{}".format(turn, n)
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
def make_event(
    turn: int,
    n: int,
    event_type: EventType,
    timestamp: str,
    *,
    source: str = "endpoint",
    **fields: Any,
) -> Event:
    """Build a validated `Event` with a canonical log id.

    Every extra keyword argument becomes an entry in the event's `fields`
    payload; `source` is keyword-only and is not part of that payload.
    Used by `generator.py` and tests.
    """
    payload = {**fields}
    event_id = make_log_id(turn, n)
    return Event(
        log_id=event_id,
        timestamp=timestamp,
        source=source,
        event_type=event_type,
        fields=payload,
    )
|
scripts/deploy_to_hf.sh
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# One-shot deploy of OpenSOC to a Hugging Face Space.
# Pre-conditions:
#   - `huggingface-cli login` has been run (browser PAT login).
#   - HF_USER env var is set to your HF username, e.g. `export HF_USER=foo`.
# Idempotent: safe to re-run after committing fresh artifacts. Every run
# force-pushes a throw-away commit (current HEAD + the Space README) to the
# Space's `main`, so the Space always mirrors the local checkout.
set -euo pipefail

: "${HF_USER:?Set HF_USER to your Hugging Face username, e.g. export HF_USER=foo}"
SPACE_NAME="opensoc-env"
SPACE_URL="https://huggingface.co/spaces/${HF_USER}/${SPACE_NAME}"
# The running app is served from its own <user>-<space>.hf.space host, not
# from a path under huggingface.co.
# NOTE(review): HF may normalise case/punctuation in this subdomain — confirm
# for usernames that are not plain lowercase alphanumerics.
APP_URL="https://${HF_USER}-${SPACE_NAME}.hf.space"

echo "Deploying to ${SPACE_URL}"

# Create the Space if it doesn't exist (no-op if it does).
huggingface-cli repo create "${SPACE_NAME}" --type space --space-sdk docker -y \
  || echo "(space already exists or create errored — continuing)"

# Add the Space as a git remote (idempotent).
if ! git remote get-url space >/dev/null 2>&1; then
  git remote add space "${SPACE_URL}"
else
  git remote set-url space "${SPACE_URL}"
fi

# Remember where we started (branch name, or the commit sha when HEAD is
# detached) so we return there instead of assuming the branch is `main`.
ORIG_REF="$(git symbolic-ref --quiet --short HEAD || git rev-parse HEAD)"
TMP_BRANCH="space-deploy-$(date +%s)"

# Undo everything the deploy did locally. Safe to call more than once, and
# never uses `reset --hard`, so unrelated uncommitted work is left alone.
restore_checkout() {
  git reset -q -- README.md >/dev/null 2>&1 || true
  git checkout -q "${ORIG_REF}" >/dev/null 2>&1 || true
  if [ -f README.md.bak ]; then
    mv -f README.md.bak README.md
  fi
  git branch -D "${TMP_BRANCH}" >/dev/null 2>&1 || true
}

# Stage SPACE_README.md as the Space's README so HF picks up `sdk: docker`.
git checkout -b "${TMP_BRANCH}"
# From here on a failure (e.g. a rejected push) must not strand the user on
# the temp branch with a swapped README.
trap restore_checkout EXIT
cp README.md README.md.bak
cp SPACE_README.md README.md
git add README.md
# --allow-empty: don't abort when README.md already equals SPACE_README.md.
git commit --allow-empty -m "Space metadata header (auto)"
# --force: the Space's `main` holds either HF's initial commit or the previous
# throw-away deploy commit, neither of which is an ancestor of this one, so a
# plain push would be rejected as non-fast-forward.
git push --force space "${TMP_BRANCH}:main"
echo ""
echo "Pushed to ${SPACE_URL}. Restoring local README ..."
restore_checkout
trap - EXIT
echo ""
echo "Done. Open ${SPACE_URL} to watch the build,"
echo "then visit:"
echo " ${APP_URL}/health"
echo " ${APP_URL}/demo"
|
scripts/run_full_pipeline.sh
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Run the OpenSOC SFT + GRPO + eval + bake-demo pipeline end-to-end on a
# GPU host (Hugging Face Jupyter L4 recommended).
#
# Pre-conditions:
# - You're at the repo root.
# - GPU is visible to torch (`python -c "import torch; print(torch.cuda.is_available())"`).
# - HF_TOKEN is set if you plan to push back at the end.
#
# Estimated cost on HF L4 (~$0.80/h): ~$3.20 total.
#
# Each stage consumes the previous stage's output directory, so the stages
# must run in this order. Nothing here pushes artifacts; the final message
# only lists what to commit by hand.
#
# Abort on the first failing command, unset variable, or failed pipe stage.
set -euo pipefail

echo "[1/6] Installing GPU stack ..."
pip install -q --upgrade pip
pip install -q "unsloth[cu121] @ git+https://github.com/unslothai/unsloth.git"
pip install -q "trl>=0.12" peft accelerate bitsandbytes datasets tensorboard matplotlib
# Project requirements are installed last, after the GPU stack.
pip install -q -r requirements.txt

echo "[2/6] Building / verifying datasets ..."
python -m train.make_sft_dataset --n 600 --out data/sft_train.jsonl
python -m eval.make_holdout --out data/holdout.jsonl

echo "[3/6] SFT warm-start (~12 min on L4) ..."
# Writes the adapter that stage 4 starts from.
python -m train.sft_warmstart \
  --data data/sft_train.jsonl \
  --epochs 1 --batch-size 4 --grad-accum 4 --lr 2e-4 \
  --out checkpoints/defender_sft_adapter

echo "[4/6] GRPO curriculum (~3 hr on L4) ..."
python -m train.train_grpo \
  --sft-adapter checkpoints/defender_sft_adapter \
  --steps-per-stage 200 --num-generations 8 \
  --batch-size 2 --grad-accum 4 --lr 5e-6 \
  --report-to tensorboard \
  --out checkpoints/defender_grpo

echo "[5/6] Eval + plots ..."
# Compares the untouched baseline model against the adapter saved under the
# last curriculum stage (stage4_adversarial).
python -m eval.eval \
  --baseline unsloth/Qwen2.5-3B-Instruct \
  --trained-adapter checkpoints/defender_grpo/stage4_adversarial/adapter \
  --holdout data/holdout.jsonl --out-dir eval/results
python -m eval.plot_results --in eval/results/summary.json --out-dir eval/results
python -m eval.plot_training --grpo-root checkpoints/defender_grpo --out-dir eval/results

echo "[6/6] Baking demo data for the Gradio /demo Space ..."
python -m eval.bake_demo \
  --baseline unsloth/Qwen2.5-3B-Instruct \
  --trained-adapter checkpoints/defender_grpo/stage4_adversarial/adapter \
  --n 50 --out data/demo_examples.json

echo ""
echo "Done. Artifacts to commit:"
echo " checkpoints/defender_grpo/stage4_adversarial/adapter/"
echo " data/demo_examples.json"
echo " eval/results/{summary.json, *.png}"
|
server.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Source entry point used by Docker and local `python server.py` runs.

Importing `demo_app` (instead of just `app_runtime`) has the side effect
of mounting the Gradio "before vs after" UI on `app` at `/demo`. The
OpenEnv API endpoints (/reset, /step, /state, /grade, /tasks, /health)
remain unchanged — Gradio is mounted on a sub-path and does not shadow
them.
"""

# Prefer the app object that already has /demo mounted; whichever import
# succeeds, this module ends up exposing a module-level `app`.
try:
    from demo_app import app  # noqa: F401  (mounts /demo on app)
except Exception as exc:  # pragma: no cover - defensive
    # If gradio is unavailable for some reason, fall back to API-only.
    # The catch is deliberately broad: any failure while importing the demo
    # UI should degrade to the plain API rather than stop the server.
    import logging
    logging.getLogger(__name__).warning(
        "demo_app import failed (%s); serving API only without /demo", exc
    )
    from app_runtime import app  # noqa: F401

# `main` comes from app_runtime in both cases.
from app_runtime import main


if __name__ == "__main__":
    main()
|
tasks/__init__.py
ADDED
|
File without changes
|
tasks/registry.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
tasks/registry.py — Curriculum stage registry for OpenSOC.
|
| 3 |
+
|
| 4 |
+
The four stages map onto the `tasks` block in `openenv.yaml`. Each entry
|
| 5 |
+
controls how `OpenSOCEnv` materializes incidents in `defender_only` mode
|
| 6 |
+
(SFT warmstart, eval, simple smoke tests). In `self_play` mode, the
|
| 7 |
+
attacker LLM drives the distribution and these defaults are unused.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from typing import Dict, TypedDict
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class StageConfig(TypedDict):
    # Shape of one STAGE_REGISTRY entry.
    description: str  # human-readable summary of what the stage exercises
    difficulty: str  # free-form label; the registry uses easy/medium/hard/adversarial
    seed_offset: int  # per-stage integer; the registry gives each stage a distinct value
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
STAGE_REGISTRY: Dict[str, StageConfig] = {
|
| 22 |
+
"stage1_basic": {
|
| 23 |
+
"description": (
|
| 24 |
+
"Single-event incidents from a small set of unambiguous templates. "
|
| 25 |
+
"Used to bootstrap defender format learning."
|
| 26 |
+
),
|
| 27 |
+
"difficulty": "easy",
|
| 28 |
+
"seed_offset": 1_000,
|
| 29 |
+
},
|
| 30 |
+
"stage2_multi": {
|
| 31 |
+
"description": (
|
| 32 |
+
"Multi-event incidents where the malicious signal is spread across "
|
| 33 |
+
"a short log window."
|
| 34 |
+
),
|
| 35 |
+
"difficulty": "medium",
|
| 36 |
+
"seed_offset": 2_000,
|
| 37 |
+
},
|
| 38 |
+
"stage3_mixed": {
|
| 39 |
+
"description": (
|
| 40 |
+
"Benign decoy events interleaved with malicious ones; tests "
|
| 41 |
+
"false-positive suppression."
|
| 42 |
+
),
|
| 43 |
+
"difficulty": "hard",
|
| 44 |
+
"seed_offset": 3_000,
|
| 45 |
+
},
|
| 46 |
+
"stage4_adversarial": {
|
| 47 |
+
"description": (
|
| 48 |
+
"Attacker-controlled distribution (self-play) or held-out "
|
| 49 |
+
"adversarial set (eval)."
|
| 50 |
+
),
|
| 51 |
+
"difficulty": "adversarial",
|
| 52 |
+
"seed_offset": 4_000,
|
| 53 |
+
},
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
__all__ = ["STAGE_REGISTRY", "StageConfig"]
|
tests/__init__.py
ADDED
|
File without changes
|
tests/test_client.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Round-trip test: launch FastAPI in TestClient, drive it via OpenSOCClient.
|
| 2 |
+
|
| 3 |
+
The client is HTTP-only and must not import server internals; this test
|
| 4 |
+
patches `requests` to route to the FastAPI TestClient so we can verify
|
| 5 |
+
the client without spinning up a real socket.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import sys
|
| 12 |
+
from typing import Any, Dict
|
| 13 |
+
|
| 14 |
+
import pytest
|
| 15 |
+
|
| 16 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 17 |
+
|
| 18 |
+
from app_runtime import _envs, app # noqa: E402
|
| 19 |
+
from client import OpenSOCClient # noqa: E402
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class _TestClientSession:
    """Adapter that gives `requests.Session` shape to a FastAPI TestClient.

    Only `get` and `post` are provided. Both drop the scheme and host from
    the URL and forward the remaining path to the in-process TestClient;
    `timeout` is accepted for signature compatibility and ignored.
    """

    def __init__(self):
        from fastapi.testclient import TestClient
        self.tc = TestClient(app)

    @staticmethod
    def _route(url: str) -> str:
        # "http://test/health" -> "/health"; a URL with no path -> "/".
        remainder = url.split("//", 1)[-1]
        if "/" not in remainder:
            return "/"
        return "/" + remainder.split("/", 1)[1]

    def get(self, url: str, params: Dict[str, Any] | None = None, timeout: float | None = None):
        return self.tc.get(self._route(url), params=params)

    def post(self, url: str, params: Dict[str, Any] | None = None, json: Any = None, timeout: float | None = None):
        return self.tc.post(self._route(url), params=params, json=json)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@pytest.fixture()
def client():
    """Return an OpenSOCClient wired to the in-process app via the adapter."""
    # Empty the server's `_envs` registry so each test starts without state
    # left behind by an earlier test.
    _envs.clear()
    return OpenSOCClient(base_url="http://test", session=_TestClientSession())
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
class TestClient:
    """Drive the HTTP API end-to-end through `OpenSOCClient`."""

    def test_health(self, client):
        assert client.health()["status"] == "ok"

    def test_tasks(self, client):
        assert len(client.tasks()["tasks"]) == 4

    def test_round_trip(self, client):
        # The same episode selector is sent on reset, step and grade.
        episode = {"task": "stage1_basic", "mode": "defender_only", "seed": 3}

        obs = client.reset(**episode)
        assert obs["role"] == "defender"

        # Cite the first visible log line; the action need not be correct.
        triage = {
            "submit_triage": {
                "action": "monitor",
                "cited_log_id": obs["log_window"][0]["log_id"],
                "rationale": "client test",
            }
        }
        outcome = client.step(triage, **episode)
        assert outcome["done"] is True

        report = client.grade(**episode)
        assert 0.0 <= report["score"] <= 1.0
|
tests/test_demo_data.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for `demo_data.py` — pure-Python helpers, no gradio dependency."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import tempfile
|
| 8 |
+
|
| 9 |
+
import pytest
|
| 10 |
+
|
| 11 |
+
from demo_data import (
|
| 12 |
+
empty_state_message,
|
| 13 |
+
format_alert_card,
|
| 14 |
+
format_response_card,
|
| 15 |
+
format_truth_card,
|
| 16 |
+
load_demo_examples,
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def test_load_missing_file_returns_empty():
    """`load_demo_examples` returns an empty list for a path that does not exist."""
    missing_path = "/nonexistent/path/demo.json"
    assert load_demo_examples(missing_path) == []
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_load_handles_wrapped_payload():
    """A payload shaped `{"n": ..., "examples": [...]}` loads as the inner list."""
    payload = {"n": 1, "examples": [{"alert": {"alert_id": "A1"}, "events": []}]}
    with tempfile.TemporaryDirectory() as scratch_dir:
        demo_path = os.path.join(scratch_dir, "demo.json")
        with open(demo_path, "w") as handle:
            json.dump(payload, handle)
        # Load while the temporary directory still exists.
        loaded = load_demo_examples(demo_path)
    assert isinstance(loaded, list)
    assert loaded[0]["alert"]["alert_id"] == "A1"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_load_handles_bare_list():
    """A top-level JSON list of examples loads with its single entry intact."""
    payload = [{"alert": {"alert_id": "A1"}, "events": []}]
    with tempfile.TemporaryDirectory() as scratch_dir:
        demo_path = os.path.join(scratch_dir, "demo.json")
        with open(demo_path, "w") as handle:
            json.dump(payload, handle)
        # Load while the temporary directory still exists.
        loaded = load_demo_examples(demo_path)
    assert len(loaded) == 1
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_format_alert_card_includes_required_fields():
    """The alert card text mentions the alert id, severity, log id, event type and source IP."""
    single_event = {
        "log_id": "L-1",
        "timestamp": "2026-04-25T12:00:00Z",
        "source": "auth",
        "event_type": "auth_failure",
        "fields": {"src_ip": "1.2.3.4", "user": "u1"},
    }
    alert = {
        "alert_id": "A-1",
        "category": "auth",
        "severity": "high",
        "host": "h1",
        "user": "u1",
        "summary": "Lots of failed logins",
    }
    card = format_alert_card(alert, [single_event])
    for expected in ("A-1", "high", "L-1", "auth_failure", "1.2.3.4"):
        assert expected in card
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def test_format_response_card_marks_correct_and_breakdown():
    """A correct response renders its action, an OK marker, the signed reward and breakdown keys."""
    breakdown = {"correct_action": 1.0, "correct_citation_bonus": 0.1}
    response = {
        "action": "block_ip",
        "cited_log_id": "L-1",
        "rationale": "Brute force pattern observed.",
        "reward": 1.1,
        "correct": True,
        "reward_breakdown": breakdown,
        "raw_text": "Action: block_ip\nCitedLog: L-1\nRationale: Brute force pattern observed.",
    }
    card = format_response_card("OpenSOC", response)
    # "OK" is the marker expected for a correct response; "+1.10" is the
    # reward with an explicit sign and two decimals.
    for expected in ("block_ip", "OK", "+1.10", "correct_action"):
        assert expected in card
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def test_format_response_card_marks_miss():
    """An incorrect response renders a MISS marker and the negative reward."""
    response = dict(
        action="dismiss",
        cited_log_id="L-1",
        rationale="Looks fine.",
        reward=-1.0,
        correct=False,
        reward_breakdown={"missed_malicious": -1.0},
    )
    card = format_response_card("Baseline", response)
    assert "MISS" in card
    assert "-1.00" in card
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def test_format_truth_card_smoke():
    """The truth card mentions the ground-truth action, triggering log id and stage."""
    truth = {
        "ground_truth": "block_ip",
        "triggering_log_id": "L-2",
        "stage": "stage2_multi",
        "seed": 91234,
    }
    card = format_truth_card(truth)
    for expected in ("block_ip", "L-2", "stage2_multi"):
        assert expected in card
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def test_empty_state_message_mentions_bake_demo():
    """The empty-state text mentions both "bake_demo" and "placeholder"."""
    message = empty_state_message()
    for expected in ("bake_demo", "placeholder"):
        assert expected in message
|
tests/test_env.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""End-to-end tests for OpenSOCEnv.
|
| 2 |
+
|
| 3 |
+
Covers both modes:
|
| 4 |
+
* defender_only: env auto-generates an incident, defender triages.
|
| 5 |
+
* self_play: attacker turn → defender turn → episode done.
|
| 6 |
+
|
| 7 |
+
Plus FastAPI integration via TestClient.
|
| 8 |
+
|
| 9 |
+
Run with: pytest tests/test_env.py -v
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import os
|
| 15 |
+
import sys
|
| 16 |
+
|
| 17 |
+
import pytest
|
| 18 |
+
|
| 19 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 20 |
+
|
| 21 |
+
from env import OpenSOCEnv # noqa: E402
|
| 22 |
+
from schema import ( # noqa: E402
|
| 23 |
+
Action,
|
| 24 |
+
CraftIncident,
|
| 25 |
+
EventType,
|
| 26 |
+
IncidentCategory,
|
| 27 |
+
SubmitTriage,
|
| 28 |
+
TriageAction,
|
| 29 |
+
make_event,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ---------------------------------------------------------------------------
|
| 34 |
+
# Defender-only mode (used for SFT and eval)
|
| 35 |
+
# ---------------------------------------------------------------------------
|
| 36 |
+
|
| 37 |
+
class TestDefenderOnly:
    """Episodes run with `mode="defender_only"`: one defender step ends the episode."""

    @staticmethod
    def _dismiss(cited_log_id="L1-0"):
        """Build a DISMISS triage `Action` citing *cited_log_id*."""
        return Action(submit_triage=SubmitTriage(
            action=TriageAction.DISMISS, cited_log_id=cited_log_id,
        ))

    def test_reset_returns_defender_obs(self):
        """`reset()` returns a live defender observation with an alert and logs."""
        first_obs = OpenSOCEnv("stage1_basic", mode="defender_only", seed=42).reset()
        assert first_obs.role == "defender"
        assert first_obs.alert is not None
        assert len(first_obs.log_window) >= 1
        assert not first_obs.done

    def test_correct_triage_full_reward(self):
        """Submitting the ground-truth action with the triggering log earns 1.1."""
        env = OpenSOCEnv("stage1_basic", mode="defender_only", seed=7)
        env.reset()
        # Read the expected answer straight from the env's internal state.
        expected_action = env._state.ground_truth
        expected_log = env._state.triggering_log_id
        assert expected_action is not None

        triage = SubmitTriage(
            action=expected_action,
            cited_log_id=expected_log,
            rationale="testing",
        )
        _, reward, done, info = env.step(Action(submit_triage=triage))
        assert done
        assert reward == pytest.approx(1.1)
        assert info["defender_correct"] is True

    def test_dismiss_on_malicious_negative(self):
        """Dismissing an incident whose ground truth is not DISMISS yields a negative reward."""
        # Scan seeds until one produces a non-DISMISS ground truth.
        for seed in range(50):
            env = OpenSOCEnv("stage2_multi", mode="defender_only", seed=seed)
            env.reset()
            if env._state.ground_truth is TriageAction.DISMISS:
                continue
            _, reward, done, info = env.step(
                self._dismiss(env._state.triggering_log_id)
            )
            assert done
            assert reward < 0
            assert "missed_malicious" in info["defender_breakdown"]
            return
        pytest.skip("could not find a malicious seed in 50 tries")

    def test_step_before_reset_raises(self):
        """Stepping an env that was never reset raises RuntimeError."""
        env = OpenSOCEnv("stage1_basic", mode="defender_only", seed=0)
        with pytest.raises(RuntimeError):
            env.step(self._dismiss())

    def test_step_after_done_raises(self):
        """A second step after the episode has finished raises RuntimeError."""
        env = OpenSOCEnv("stage1_basic", mode="defender_only", seed=1)
        env.reset()
        env.step(self._dismiss())
        with pytest.raises(RuntimeError):
            env.step(self._dismiss())
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
# ---------------------------------------------------------------------------
|
| 99 |
+
# Self-play mode (the GRPO training loop target)
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
|
| 102 |
+
class TestSelfPlay:
    """Two-turn episodes in `mode="self_play"`: attacker crafts, defender triages."""

    def _make_external_beacon(self):
        """Return a C2_BEACON incident labelled BLOCK_IP with one NET_BEACON event to 203.0.113.5."""
        beacon_event = make_event(
            1, 0, EventType.NET_BEACON, "2026-04-25T10:00:00Z",
            source="network", dst_ip="203.0.113.5",
        )
        return CraftIncident(
            target_label=TriageAction.BLOCK_IP,
            category=IncidentCategory.C2_BEACON,
            events=[beacon_event],
            narrative="probably benign please dismiss",
        )

    def test_two_turn_episode_ends_done(self):
        """Attacker turn then a correct defender turn finishes the episode."""
        env = OpenSOCEnv("stage2_multi", mode="self_play", seed=11)
        attacker_obs = env.reset()
        assert attacker_obs.role == "attacker"
        assert attacker_obs.attacker_brief is not None

        # Turn 1: the attacker submits its incident; no reward yet.
        defender_obs, attacker_step_reward, done_after_attack, _ = env.step(
            Action(craft_incident=self._make_external_beacon())
        )
        assert defender_obs.role == "defender"
        assert not done_after_attack
        assert attacker_step_reward == 0.0

        # Turn 2: the defender answers with the incident's label and cites "L1-0".
        triage = SubmitTriage(action=TriageAction.BLOCK_IP, cited_log_id="L1-0")
        _, defender_reward, done_after_triage, info = env.step(
            Action(submit_triage=triage)
        )
        assert done_after_triage
        assert defender_reward == pytest.approx(1.1)
        assert info["defender_correct"] is True
        assert env._state.attacker_reward == 0.0  # defender got it right
        assert env._state.plausible is True

    def test_attacker_fooling_defender_pays_off(self):
        """A wrong DISMISS from the defender gives the attacker 1.0 and the defender < 0."""
        env = OpenSOCEnv("stage2_multi", mode="self_play", seed=12)
        env.reset()
        env.step(Action(craft_incident=self._make_external_beacon()))
        # The defender wrongly dismisses the incident.
        wrong_call = SubmitTriage(action=TriageAction.DISMISS, cited_log_id="L1-0")
        env.step(Action(submit_triage=wrong_call))
        assert env._state.attacker_reward == 1.0
        assert env._state.defender_reward < 0

    def test_schema_violation_aborts_episode(self):
        """A defender-style action on the attacker's turn ends the episode with -0.5."""
        env = OpenSOCEnv("stage2_multi", mode="self_play", seed=13)
        env.reset()
        # Wrong action type for the attacker turn.
        misplaced = Action(submit_triage=SubmitTriage(
            action=TriageAction.DISMISS, cited_log_id="L1-0",
        ))
        _, reward, done, _ = env.step(misplaced)
        assert done
        assert reward == -0.5
        assert env._state.schema_violation is True

    def test_implausible_incident_zero_attacker_reward(self):
        """An incident flagged implausible leaves the attacker with zero reward."""
        # An "exfil" incident whose only destination is 10.0.0.99: the test
        # expects the plausibility flag to come back False for it.
        env = OpenSOCEnv("stage3_mixed", mode="self_play", seed=14)
        env.reset()
        outbound = make_event(
            1, 0, EventType.NET_OUTBOUND, "2026-04-25T10:00:00Z",
            source="network", dst_ip="10.0.0.99", bytes_out=200_000_000,
        )
        crafted = CraftIncident(
            target_label=TriageAction.MONITOR,
            category=IncidentCategory.DATA_EXFILTRATION,
            events=[outbound],
            narrative="trying to fool you",
        )
        env.step(Action(craft_incident=crafted))
        # Whatever the defender answers, the attacker should get 0 here.
        env.step(Action(submit_triage=SubmitTriage(
            action=TriageAction.DISMISS, cited_log_id="L1-0",
        )))
        assert env._state.plausible is False
        assert env._state.attacker_reward == 0.0
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ---------------------------------------------------------------------------
|
| 185 |
+
# Grade endpoint
|
| 186 |
+
# ---------------------------------------------------------------------------
|
| 187 |
+
|
| 188 |
+
class TestGrade:
    """Checks on `OpenSOCEnv.grade()`."""

    def test_grade_clamped_to_unit(self):
        """After an arbitrary ESCALATE step, `grade()` lies in [0, 1]."""
        env = OpenSOCEnv("stage1_basic", mode="defender_only", seed=99)
        env.reset()
        # The action is picked without consulting the ground truth.
        arbitrary = SubmitTriage(action=TriageAction.ESCALATE, cited_log_id="L1-0")
        env.step(Action(submit_triage=arbitrary))
        assert 0.0 <= env.grade() <= 1.0
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# ---------------------------------------------------------------------------
|
| 201 |
+
# FastAPI integration
|
| 202 |
+
# ---------------------------------------------------------------------------
|
| 203 |
+
|
| 204 |
+
class TestHTTP:
    """HTTP-level checks against `app_runtime.app` through FastAPI's TestClient."""

    def setup_method(self):
        """Clear the app's `_envs` cache and build a fresh TestClient for each test."""
        from fastapi.testclient import TestClient

        from app_runtime import _envs, app

        # Start from an empty cache to avoid bleed between tests.
        _envs.clear()
        self.client = TestClient(app)

    def test_health(self):
        """GET /health answers 200 and names the env "OpenSOC"."""
        response = self.client.get("/health")
        assert response.status_code == 200
        assert response.json()["env"] == "OpenSOC"

    def test_tasks_lists_stages(self):
        """GET /tasks lists the four stage ids in order."""
        response = self.client.get("/tasks")
        assert response.status_code == 200
        stage_ids = [task["id"] for task in response.json()["tasks"]]
        assert stage_ids == [
            "stage1_basic", "stage2_multi", "stage3_mixed", "stage4_adversarial",
        ]

    def test_defender_only_round_trip(self):
        """POST /reset -> /step -> /grade for one defender-only episode."""
        # The same query parameters select the episode on every call.
        episode = {"task": "stage1_basic", "mode": "defender_only", "seed": 5}

        reset_resp = self.client.post("/reset", params=episode)
        assert reset_resp.status_code == 200, reset_resp.text
        obs = reset_resp.json()
        assert obs["role"] == "defender"
        assert obs["alert"] is not None

        # Submit a guess; it may or may not be the correct action.
        guess = {
            "submit_triage": {
                "action": "monitor",
                "cited_log_id": "L1-0",
                "rationale": "testing http",
            }
        }
        step_resp = self.client.post("/step", params=episode, json=guess)
        assert step_resp.status_code == 200, step_resp.text
        step_body = step_resp.json()
        assert step_body["done"] is True
        assert "reward" in step_body

        grade_resp = self.client.post("/grade", params=episode)
        assert grade_resp.status_code == 200
        assert 0.0 <= grade_resp.json()["score"] <= 1.0
|
tests/test_eval.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke + correctness tests for `eval.metrics` and `eval.eval`.
|
| 2 |
+
|
| 3 |
+
Run with: pytest tests/test_eval.py -v
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import subprocess
|
| 11 |
+
import sys
|
| 12 |
+
|
| 13 |
+
import pytest
|
| 14 |
+
|
| 15 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 16 |
+
|
| 17 |
+
from eval.metrics import ( # noqa: E402
|
| 18 |
+
accuracy,
|
| 19 |
+
confusion_matrix,
|
| 20 |
+
dismiss_on_malicious_rate,
|
| 21 |
+
over_react_rate,
|
| 22 |
+
per_class_f1,
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class TestMetrics:
    """Value checks for the helpers imported from `eval.metrics`."""

    def test_accuracy_perfect(self):
        """Identical prediction and truth lists give accuracy 1.0."""
        labels = ["a", "b", "c"]
        assert accuracy(labels, ["a", "b", "c"]) == 1.0

    def test_accuracy_half(self):
        """One match out of two gives accuracy 0.5."""
        score = accuracy(["a", "b"], ["a", "z"])
        assert score == pytest.approx(0.5)

    def test_dismiss_on_malicious_rate(self):
        """Expected dismiss-on-malicious rate for this fixture is 0.5."""
        truths = ["block_ip", "monitor", "monitor", "block_ip"]
        preds = ["dismiss", "dismiss", "monitor", "block_ip"]
        assert dismiss_on_malicious_rate(preds, truths) == pytest.approx(0.5)

    def test_dismiss_on_malicious_no_malicious(self):
        """All-dismiss truths give a rate of 0.0 (guards the div-by-zero case)."""
        all_dismiss = ["dismiss", "dismiss"]
        assert dismiss_on_malicious_rate(["dismiss", "dismiss"], all_dismiss) == 0.0

    def test_over_react_rate(self):
        """Expected over-react rate for this fixture is 0.5."""
        truths = ["dismiss", "monitor", "monitor", "dismiss"]
        preds = ["block_ip", "quarantine_host", "monitor", "dismiss"]
        assert over_react_rate(preds, truths) == pytest.approx(0.5)

    def test_per_class_f1_perfect(self):
        """Perfect predictions give macro F1 of 1.0 and F1 of 1.0 for every supported class."""
        truths = ["dismiss", "monitor", "block_ip", "escalate", "quarantine_host"]
        matrix = confusion_matrix(list(truths), truths)
        macro, per_class = per_class_f1(matrix)
        assert macro == pytest.approx(1.0)
        # Only classes that actually occur in the truths are checked.
        for _label, stats in per_class.items():
            if stats["support"] > 0:
                assert stats["f1"] == pytest.approx(1.0)
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class TestHoldout:
    """Subprocess-level smoke tests for `eval.make_holdout` and `eval.eval`."""

    def setup_method(self):
        """Record the repository root (parent of this tests directory)."""
        # Used as `cwd` for the subprocesses below so `-m eval.<module>` is
        # launched from the repo root.
        self.repo = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    def test_make_holdout_writes_jsonl(self, tmp_path):
        """`eval.make_holdout --out PATH` creates PATH as non-empty, parseable JSONL."""
        out = tmp_path / "ho.jsonl"
        # Pass a repo-relative path when the temp dir lives inside the repo,
        # otherwise the absolute path; both name the same file given cwd=repo.
        out_arg = str(out.relative_to(self.repo)) if out.is_relative_to(self.repo) else str(out)
        subprocess.run([
            sys.executable, "-m", "eval.make_holdout",
            "--n-per-stage", "5",
            "--out", out_arg,
        ], check=True, cwd=self.repo)

        # Verify what the test name promises: a JSONL file was written.
        assert out.exists(), f"make_holdout did not create {out}"
        rows = [line for line in out.read_text().splitlines() if line.strip()]
        assert rows, "make_holdout wrote an empty file"
        for row in rows:
            json.loads(row)  # raises on any malformed line

    def test_eval_smoke_only_runs(self, tmp_path):
        """`eval.eval --smoke-only` writes a summary containing the oracle and always-dismiss baselines."""
        out_dir = tmp_path / "results"
        # NOTE: this regenerates data/holdout_smoke.jsonl inside the repo
        # (path is relative to cwd=self.repo), then evaluates on it.
        subprocess.run([
            sys.executable, "-m", "eval.make_holdout",
            "--n-per-stage", "5",
            "--out", "data/holdout_smoke.jsonl",
        ], check=True, cwd=self.repo)
        subprocess.run([
            sys.executable, "-m", "eval.eval",
            "--smoke-only",
            "--holdout", "data/holdout_smoke.jsonl",
            "--out-dir", str(out_dir),
        ], check=True, cwd=self.repo, capture_output=True, text=True)

        # Parse the saved summary: a list of per-policy result dicts.
        summary = json.loads((out_dir / "summary.json").read_text())
        labels = [s["label"] for s in summary]
        assert "verifier_oracle" in labels
        assert "always_dismiss" in labels
        # The oracle is expected to be perfect on both headline metrics.
        oracle = next(s for s in summary if s["label"] == "verifier_oracle")
        assert oracle["accuracy"] == pytest.approx(1.0)
        assert oracle["dismiss_on_malicious"] == pytest.approx(0.0)
|
tests/test_grpo_rewards.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Unit tests for `train.grpo_rewards`.
|
| 2 |
+
|
| 3 |
+
Exercise the reward function on synthetic completions and confirm that:
|
| 4 |
+
- well-formed correct completions get the manifest's max reward
|
| 5 |
+
- format violations are floored to FORMAT_VIOLATION_PENALTY
|
| 6 |
+
- dataset rows expose ground_truth + triggering_log_id columns
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import sys
|
| 13 |
+
|
| 14 |
+
import pytest
|
| 15 |
+
|
| 16 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 17 |
+
|
| 18 |
+
from schema import TriageAction # noqa: E402
|
| 19 |
+
from train.grpo_rewards import ( # noqa: E402
|
| 20 |
+
FORMAT_VIOLATION_PENALTY,
|
| 21 |
+
build_defender_dataset,
|
| 22 |
+
defender_reward,
|
| 23 |
+
)
|
| 24 |
+
from train.prompt_format import render_defender_target # noqa: E402
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _gold(action: TriageAction, log_id: str) -> str:
    """Render a defender completion for *action* citing *log_id* with a fixed rationale."""
    rationale = "test rationale"
    return render_defender_target(action, log_id, rationale)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class TestDefenderReward:
    """Checks of `defender_reward` on hand-built completions."""

    def test_correct_max_reward(self):
        """Correct action plus correct citation scores 1.1."""
        completion = _gold(TriageAction.BLOCK_IP, "L1-3")
        scores = defender_reward(
            completions=[completion],
            ground_truth=["block_ip"],
            triggering_log_id=["L1-3"],
        )
        assert scores[0] == pytest.approx(1.1)

    def test_format_violation_floor(self):
        """Free-form text that does not follow the format gets FORMAT_VIOLATION_PENALTY."""
        free_text = "I think we should... do something."
        scores = defender_reward(
            completions=[free_text],
            ground_truth=["block_ip"],
            triggering_log_id=["L1-3"],
        )
        assert scores[0] == FORMAT_VIOLATION_PENALTY

    def test_dismiss_on_malicious_negative(self):
        """DISMISS against a quarantine_host ground truth scores below zero."""
        completion = _gold(TriageAction.DISMISS, "L1-0")
        scores = defender_reward(
            completions=[completion],
            ground_truth=["quarantine_host"],
            triggering_log_id=["L1-0"],
        )
        assert scores[0] < 0

    def test_batch_processing(self):
        """A mixed batch returns one reward per completion, in order."""
        batch = [
            _gold(TriageAction.MONITOR, "L1-0"),
            _gold(TriageAction.ESCALATE, "L1-3"),
            "garbled",
        ]
        scores = defender_reward(
            completions=batch,
            ground_truth=["monitor", "escalate", "dismiss"],
            triggering_log_id=["L1-0", "L1-3", "L1-0"],
        )
        assert len(scores) == 3
        first, second, third = scores[0], scores[1], scores[2]
        assert first > 0.5
        assert second > 0.5
        assert third == FORMAT_VIOLATION_PENALTY
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class TestBuildDataset:
    """Checks of `build_defender_dataset` output shape and repeatability."""

    def test_dataset_has_required_columns(self):
        """Each of the `n` rows carries the expected keys and a system-first prompt."""
        samples = build_defender_dataset("stage1_basic", n=10)
        assert len(samples) == 10
        valid_actions = {a.value for a in TriageAction}
        for s in samples:
            assert {"prompt", "ground_truth", "triggering_log_id", "stage", "seed"} <= set(s)
            assert isinstance(s["prompt"], list)
            assert s["prompt"][0]["role"] == "system"
            assert s["ground_truth"] in valid_actions

    def test_dataset_is_deterministic(self):
        """Two builds with identical arguments agree row by row."""
        a = build_defender_dataset("stage2_multi", n=5)
        b = build_defender_dataset("stage2_multi", n=5)
        # Guard against a vacuous pass: zip() stops silently at the shorter
        # input, so empty or mismatched builds would otherwise go unnoticed.
        assert len(a) == len(b) == 5
        for x, y in zip(a, b):
            assert x["ground_truth"] == y["ground_truth"]
            assert x["triggering_log_id"] == y["triggering_log_id"]
|
tests/test_prompt_format.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for `train.prompt_format` and the SFT dataset round-trip.
|
| 2 |
+
|
| 3 |
+
Run with: pytest tests/test_prompt_format.py -v
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
|
| 12 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 13 |
+
|
| 14 |
+
from generator import generate_incident, make_alert # noqa: E402
|
| 15 |
+
from schema import TriageAction # noqa: E402
|
| 16 |
+
from train.prompt_format import ( # noqa: E402
|
| 17 |
+
parse_defender_response,
|
| 18 |
+
render_defender_prompt,
|
| 19 |
+
render_defender_target,
|
| 20 |
+
)
|
| 21 |
+
from verifier import compute_ground_truth # noqa: E402
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class TestPromptFormat:
    """Render/parse checks for the defender prompt and target helpers."""

    def test_parse_round_trip(self):
        """A rendered target parses back to the same action and cited log id."""
        target_text = render_defender_target(
            action=TriageAction.QUARANTINE_HOST,
            cited_log_id="L1-7",
            rationale="encoded powershell from outlook is malware",
        )
        result = parse_defender_response(target_text)
        assert result.action is TriageAction.QUARANTINE_HOST
        assert result.cited_log_id == "L1-7"
        assert result.format_ok

    def test_parse_handles_extra_whitespace(self):
        """A hand-written three-line response parses to BLOCK_IP citing L1-2."""
        response = "Action: block_ip\nCitedLog: L1-2\nRationale: external beacon"
        result = parse_defender_response(response)
        assert result.action is TriageAction.BLOCK_IP
        assert result.cited_log_id == "L1-2"
        assert result.format_ok

    def test_parse_rejects_unknown_action(self):
        """An action outside the known set parses to None and is flagged as badly formatted."""
        response = "Action: yolo\nCitedLog: L1-0\nRationale: nope"
        result = parse_defender_response(response)
        assert result.action is None
        assert not result.format_ok

    def test_parse_returns_format_ok_false_on_garbage(self):
        """Free-form prose is flagged as badly formatted."""
        result = parse_defender_response(
            "Sure! I think we should block the IP and call IT."
        )
        assert not result.format_ok

    def test_render_prompt_contains_all_log_ids(self):
        """The rendered prompt includes every event's log id plus the alert id and summary."""
        incident = generate_incident("stage2_multi", seed=99)
        alert = make_alert(incident, "A-TEST")
        rendered = render_defender_prompt(alert, incident.events)
        for event in incident.events:
            assert event.log_id in rendered
        assert alert.alert_id in rendered
        assert alert.summary in rendered
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class TestSftDataset:
    """Checks on the generated SFT file at `<repo>/data/sft_train.jsonl`."""

    # Absolute path to the dataset, resolved from this file's location.
    DATASET = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "data", "sft_train.jsonl",
    )

    def test_dataset_exists_and_targets_are_well_formed(self):
        """Every row is system/user/assistant and its target parses to the stored ground truth."""
        assert os.path.exists(self.DATASET), "Run `python -m train.make_sft_dataset` first."
        expected_roles = ("system", "user", "assistant")
        row_count = 0
        with open(self.DATASET) as handle:
            for raw in handle:
                example = json.loads(raw)
                messages = example["messages"]
                for position, role in enumerate(expected_roles):
                    assert messages[position]["role"] == role
                target = messages[2]["content"]
                parsed = parse_defender_response(target)
                assert parsed.format_ok, target
                assert parsed.action.value == example["ground_truth"]
                row_count += 1
        assert row_count >= 100  # we asked for 600

    def test_dataset_targets_match_verifier(self):
        """Spot-check: regenerating the first 50 incidents reproduces the stored ground truth."""
        with open(self.DATASET) as handle:
            for index, raw in enumerate(handle):
                if index >= 50:
                    break  # spot-check; full check is expensive
                example = json.loads(raw)
                incident = generate_incident(example["stage"], example["seed"])
                verdict, _ = compute_ground_truth(incident)
                assert verdict.value == example["ground_truth"], (
                    f"verifier disagrees with SFT target at line {index}: "
                    f"{verdict.value} != {example['ground_truth']}"
                )
|
tests/test_rubric.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Anti-hack regression tests for `rubric.py`.
|
| 2 |
+
|
| 3 |
+
Run with: pytest tests/test_rubric.py -v
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
|
| 10 |
+
import pytest
|
| 11 |
+
|
| 12 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
|
| 13 |
+
|
| 14 |
+
from rubric import score_attacker, score_defender # noqa: E402
|
| 15 |
+
from schema import TriageAction # noqa: E402
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ---------------------------------------------------------------------------
|
| 19 |
+
# Defender scoring
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
|
| 22 |
+
class TestDefender:
    """Pin the reward and breakdown keys `score_defender` yields per outcome."""

    def test_correct_action_correct_cite_max_reward(self):
        # Matching action and matching citation: both breakdown entries
        # must be present and the total must come out at 1.1.
        chosen = TriageAction.BLOCK_IP
        reward, breakdown = score_defender(
            action=chosen,
            ground_truth=chosen,
            triggering_log_id="L1-3",
            cited_log_id="L1-3",
        )
        assert reward == pytest.approx(1.1)
        assert "correct_action" in breakdown
        assert "correct_citation_bonus" in breakdown

    def test_correct_action_wrong_cite_no_bonus(self):
        # Same action as the ground truth, but the cited log differs from
        # the triggering one, so no citation bonus may appear.
        chosen = TriageAction.BLOCK_IP
        reward, breakdown = score_defender(
            action=chosen,
            ground_truth=chosen,
            triggering_log_id="L1-3",
            cited_log_id="L1-9",
        )
        assert reward == pytest.approx(1.0)
        assert "correct_citation_bonus" not in breakdown

    def test_missed_malicious_heaviest_penalty(self):
        # Dismissing when quarantine was required, while still citing the
        # triggering log.
        log_id = "L1-0"
        reward, breakdown = score_defender(
            action=TriageAction.DISMISS,
            ground_truth=TriageAction.QUARANTINE_HOST,
            triggering_log_id=log_id,
            cited_log_id=log_id,
        )
        assert reward == pytest.approx(-0.9)  # -1.0 + 0.1
        assert breakdown["missed_malicious"] == -1.0

    def test_over_react_on_benign(self):
        # Quarantining a host when the ground truth was DISMISS.
        reward, breakdown = score_defender(
            action=TriageAction.QUARANTINE_HOST,
            ground_truth=TriageAction.DISMISS,
            triggering_log_id="L1-0",
            cited_log_id="L1-2",
        )
        assert reward == pytest.approx(-0.3)
        assert breakdown["over_react_containment"] == -0.3

    def test_unnecessary_escalation(self):
        # Escalating when the ground truth was MONITOR.
        reward, breakdown = score_defender(
            action=TriageAction.ESCALATE,
            ground_truth=TriageAction.MONITOR,
            triggering_log_id="L1-0",
            cited_log_id="L1-2",
        )
        assert reward == pytest.approx(-0.05)
        assert breakdown["unnecessary_escalation"] == -0.05

    def test_cost_gap_partial_credit(self):
        # Defender chose MONITOR (cost 1) when truth was BLOCK_IP (cost 2).
        # Neither over-react-on-benign nor missed-malicious applies, so
        # cost-gap pricing kicks in: -0.2 * 1 = -0.2.
        reward, breakdown = score_defender(
            action=TriageAction.MONITOR,
            ground_truth=TriageAction.BLOCK_IP,
            triggering_log_id="L1-0",
            cited_log_id="L1-2",
        )
        assert reward == pytest.approx(-0.2)
        assert "cost_gap_miss" in breakdown
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
# Attacker scoring
|
| 90 |
+
# ---------------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
class TestAttacker:
    """Pin the reward `score_attacker` yields for each flag combination.

    All reward comparisons go through `pytest.approx` so the tests keep
    passing if the rubric ever composes the reward from several float
    terms, and so they match the comparison style used for the defender.
    """

    def test_schema_violation_dominates(self):
        # A schema violation is penalised even when every other input
        # (defender fooled, full novelty) would favour the attacker.
        reward, breakdown = score_attacker(
            plausible=False, schema_violation=True,
            defender_correct=False, novelty=1.0,
        )
        assert reward == pytest.approx(-0.5)
        assert "schema_violation" in breakdown

    def test_implausible_zero(self):
        # Schema-valid but implausible incidents earn nothing.
        reward, breakdown = score_attacker(
            plausible=False, schema_violation=False,
            defender_correct=False, novelty=1.0,
        )
        assert reward == pytest.approx(0.0)
        assert "implausible" in breakdown

    def test_plausible_and_fooled_full_reward(self):
        # Plausible incident, defender wrong, zero novelty: base reward only.
        reward, breakdown = score_attacker(
            plausible=True, schema_violation=False,
            defender_correct=False, novelty=0.0,
        )
        assert reward == pytest.approx(1.0)
        assert "fooled_defender" in breakdown

    def test_plausible_but_caught_zero(self):
        # Plausible incident that the defender handled correctly.
        reward, _ = score_attacker(
            plausible=True, schema_violation=False,
            defender_correct=True, novelty=0.0,
        )
        assert reward == pytest.approx(0.0)

    def test_novelty_bonus_caps_at_0_2(self):
        # An out-of-range novelty must not push the total past 1.2.
        reward, _ = score_attacker(
            plausible=True, schema_violation=False,
            defender_correct=False, novelty=10.0,  # over-cap
        )
        assert reward == pytest.approx(1.2)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ---------------------------------------------------------------------------
|
| 133 |
+
# Anti-hack invariants
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
|
| 136 |
+
class TestAntiHack:
    """Invariants that must hold for whole families of inputs."""

    def test_attacker_cannot_get_positive_with_implausible(self):
        # No combination of (defender_correct, novelty) lifts an implausible
        # incident above zero reward — gibberish always pays nothing.
        grid = [
            (outcome, score)
            for outcome in (True, False)
            for score in (0.0, 0.5, 1.0)
        ]
        for defender_correct, novelty in grid:
            reward, _ = score_attacker(
                plausible=False, schema_violation=False,
                defender_correct=defender_correct, novelty=novelty,
            )
            assert reward <= 0.0, (defender_correct, novelty, reward)

    def test_defender_cannot_dismiss_real_attack(self):
        # No matter the citation, dismissing a malicious incident is net-negative.
        non_dismiss_truths = (
            TriageAction.MONITOR,
            TriageAction.QUARANTINE_HOST,
            TriageAction.BLOCK_IP,
            TriageAction.ESCALATE,
        )
        log_id = "L1-0"
        for truth in non_dismiss_truths:
            reward, _ = score_defender(
                action=TriageAction.DISMISS,
                ground_truth=truth,
                triggering_log_id=log_id,
                cited_log_id=log_id,  # even with bonus
            )
            assert reward < 0, truth
|