HagalazAI committed
Commit a2a956b · verified · 1 Parent(s): d8c0d8c

Update README.md

Files changed (1):
  1. README.md +161 -53
README.md CHANGED
@@ -72,57 +72,165 @@ Detects **technical red-team / offensive security** text (English).
  ## Quick start

  ```python
- import torch
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
-
- def classify_texts(model_name, phrases, threshold=0.515):
-     """
-     Returns a list of (probability_offensive, label) tuples for each phrase,
-     given a model_name and threshold.
-     """
-     tokenizer = AutoTokenizer.from_pretrained(model_name)
-     model = AutoModelForSequenceClassification.from_pretrained(model_name)
-     model.eval()
-
-     inputs = tokenizer(phrases, padding=True, truncation=True, return_tensors="pt")
-
-     with torch.no_grad():
-         logits = model(**inputs).logits  # shape: (batch_size, 2)
-         probs_offensive = torch.softmax(logits, dim=1)[:, 1]  # probability of the "Offensive" class
-
-     results = []
-     for p_val in probs_offensive:
-         p_val = p_val.item()
-         label = "Offensive (red-team)" if p_val >= threshold else "Not Offensive"
-         results.append((p_val, label))
-     return results
-
- def main():
-     # Example phrases: offensive (red-team), defensive (blue-team), marketing, non-technical
-     phrases = [
-         # 1) Cybersecurity: offensive / red-team
-         "To exfiltrate sensitive data, launch a phishing campaign that tricks employees into revealing their VPN credentials.",
-         # 2) Cybersecurity: defensive / blue-team
-         "We should deploy an EDR solution, monitor all endpoints for intrusion attempts, and enforce strict password policies.",
-         # 3) Cybersecurity marketing
-         "Our marketing team will unveil the new cybersecurity branding materials at next Tuesday's antivirus product launch.",
-         # 4) Not cybersecurity-related
-         "I'm excited about the company picnic. There's no cybersecurity topic, just burgers and games."
-     ]
-
-     # Classify with both models
-     threshold = 0.515
-     blue_results = classify_texts("HagalazAI/BlueSecureBERT", phrases, threshold)
-     red_results = classify_texts("HagalazAI/RedSecureBERT", phrases, threshold)
-
-     # Print a Markdown table
-     print("| # | Phrase | Blue Score | Blue Label | Red Score | Red Label |")
-     print("|---|--------|-----------|-----------|----------|----------|")
-     for i, text in enumerate(phrases, start=1):
-         blue_score, blue_label = blue_results[i - 1]
-         red_score, red_label = red_results[i - 1]
-         print(f"| {i} | {text} | {blue_score:.3f} | {blue_label} | {red_score:.3f} | {red_label} |")
-
- if __name__ == "__main__":
-     main()
+ #!/usr/bin/env python
+ """
+ 06_split_binary.py
+ ~~~~~~~~~~~~~~~~~~
+
+ Stream-splits a JSONL cybersecurity corpus into *offensive*, *defensive*, and *other* shards
+ using **two** fine-tuned SecureBERT heads.
+
+ How the two heads work together
+ -------------------------------
+ We load two independent checkpoints:
+
+ * `offensive_vs_rest` → gives **P(offensive | text)**
+ * `defensive_vs_rest` → gives **P(defensive | text)**
+
+ For every line we:
+
+ 1. run both heads in the same GPU batch;
+ 2. take the positive-class probability from each softmax;
+ 3. compare against per-head thresholds (from `thresholds.json`, default 0.5);
+ 4. route the text with this truth table:
+
+        off?  def?  →  shard
+        yes   no    →  offensive.jsonl
+        no    yes   →  defensive.jsonl
+        yes   yes   →  whichever probability is higher
+        no    no    →  other.jsonl
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ from itertools import islice
+ from pathlib import Path
+
+ import torch
+ from torch.nn.functional import softmax
+ from tqdm.auto import tqdm
+ from transformers import (
+     AutoModelForSequenceClassification as HFModel,
+     AutoTokenizer,
+ )
+
+ from config import RAW_JSONL, MODEL_DIR  # MODEL_DIR == securebert_finetuned
+
+ # ───────────────────────────── GPU SETTINGS ──────────────────────────
+ # Use TensorFloat-32 matmuls (a big speed boost on Ampere/Ada GPUs).
+ torch.backends.cuda.matmul.allow_tf32 = True
+ torch.set_float32_matmul_precision("medium")
+
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # ──────────────────────────────── CLI ────────────────────────────────
+ cli = argparse.ArgumentParser(description="Split JSONL into offence/defence/other")
+ cli.add_argument("--batch_size", type=int, help="override auto batch sizing")
+ args = cli.parse_args()
+
+ # ───────────────────── BATCH-SIZE HEURISTIC ──────────────────────────
+ if args.batch_size:  # user override wins
+     BATCH = args.batch_size
+ else:
+     try:
+         import pynvml
+
+         pynvml.nvmlInit()
+         free = (
+             pynvml.nvmlDeviceGetMemoryInfo(pynvml.nvmlDeviceGetHandleByIndex(0)).free
+             / 1024**3
+         )
+         pynvml.nvmlShutdown()
+         # ~30 MB per 512-token sequence (bfloat16, two heads) – clamp sensibly
+         BATCH = max(64, min(int(free // 0.03), 1024))
+     except Exception:  # any issue → decent default
+         BATCH = 256
+ print(f"[split-binary] batch size = {BATCH}")
+
+ # ───────────────────────── THRESHOLDS ────────────────────────────────
+ thr_path = Path(MODEL_DIR) / "thresholds.json"
+ if thr_path.exists():
+     THR = json.loads(thr_path.read_text())
+     print("Loaded thresholds:", THR)
+ else:
+     THR = {"off": 0.5, "def": 0.5}
+     print("No thresholds.json → default 0.5 each")
+
+ # ─────────────────── MODEL & TOKENISER LOADING ───────────────────────
+ def load_model(path: Path):
+     """Load classification head in BF16 (no flash-attention)."""
+     return HFModel.from_pretrained(path, torch_dtype=torch.bfloat16)
+
+
+ paths = {
+     "off": Path(MODEL_DIR) / "offensive_vs_rest",
+     "def": Path(MODEL_DIR) / "defensive_vs_rest",
+ }
+ print("Loading models …")
+ m_off = load_model(paths["off"]).to(DEVICE).eval()
+ m_def = load_model(paths["def"]).to(DEVICE).eval()
+
+ # Optional: compile graphs for a little extra throughput
+ try:
+     m_off = torch.compile(m_off, dynamic=True, mode="reduce-overhead")
+     m_def = torch.compile(m_def, dynamic=True, mode="reduce-overhead")
+     print("torch.compile: dynamic=True, reduce-overhead ✓")
+ except Exception:
+     pass
+
+ tok = AutoTokenizer.from_pretrained(paths["off"])
+ ENC = dict(
+     truncation=True,
+     padding="longest",
+     max_length=512,
+     return_tensors="pt",
+ )
+
+ # ─────────────────────── OUTPUT HANDLES ──────────────────────────────
+ outs = {
+     "off": open("offensive.jsonl", "w", encoding="utf-8"),
+     "def": open("defensive.jsonl", "w", encoding="utf-8"),
+     "oth": open("other.jsonl", "w", encoding="utf-8"),
+ }
+
+ # ───────────────────────── HELPERS ───────────────────────────────────
+ def batched(it, n):
+     """Yield `n`-sized chunks from iterator `it`."""
+     while True:
+         chunk = list(islice(it, n))
+         if not chunk:
+             break
+         yield chunk
+
+
+ # ───────────────────── MAIN SPLITTING LOOP ───────────────────────────
+ with open(RAW_JSONL, "r", encoding="utf-8") as fin, torch.inference_mode():
+     for lines in tqdm(batched(fin, BATCH), desc="Splitting", ncols=110):
+         recs = [json.loads(l) for l in lines]
+         texts = [r.get("content", "") for r in recs]
+
+         # Tokenise → pin CPU mem → async copy to GPU
+         batch = tok(texts, **ENC)
+         batch = {
+             k: v.pin_memory().to(DEVICE, non_blocking=True) for k, v in batch.items()
+         }
+
+         # Positive-class probabilities
+         p_off = softmax(m_off(**batch).logits, dim=-1)[:, 1].cpu()
+         p_def = softmax(m_def(**batch).logits, dim=-1)[:, 1].cpu()
+
+         for r, po, pd in zip(recs, p_off, p_def):
+             txt = r.get("content", "")
+             off, dfn = po >= THR["off"], pd >= THR["def"]
+
+             if off and not dfn:
+                 outs["off"].write(json.dumps({"content": txt}) + "\n")
+             elif dfn and not off:
+                 outs["def"].write(json.dumps({"content": txt}) + "\n")
+             elif off and dfn:  # tie → higher prob wins
+                 (outs["off"] if po >= pd else outs["def"]).write(
+                     json.dumps({"content": txt}) + "\n"
+                 )
+             else:
+                 outs["oth"].write(json.dumps({"content": txt}) + "\n")
+
+ # ───────────────────────── CLEAN-UP ──────────────────────────────────
+ for f in outs.values():
+     f.close()
+ print("✅ Done! → offensive.jsonl defensive.jsonl other.jsonl")