jflynt committed on
Commit
f17ff41
·
verified ·
1 Parent(s): 4e87a21

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,14 +1,11 @@
1
  ---
2
- title: Orgforge It
3
- emoji: 🏢
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 6.9.0
8
  app_file: app.py
9
- pinned: false
10
  license: mit
11
- short_description: LLM detection leaderboard for OrgForge insider threat sim
12
  ---
13
-
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: OrgForge Insider Threat Benchmark
3
+ emoji: 🛡
4
+ colorFrom: red
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 6.9.0
8
  app_file: app.py
9
+ pinned: true
10
  license: mit
 
11
  ---
 
 
app.py ADDED
@@ -0,0 +1,556 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+
4
+ DATA_URL = "./insider_threat_leaderboard.csv"
5
+
6
+ # ─── Column definitions ───────────────────────────────────────────────────────
7
+
8
# Columns shown in every leaderboard view, in display order.
CORE_COLS = (
    "model tier triage_f1 verdict_f1 baseline_fp_rate onset_sensitivity "
    "vishing_detected host_trail_reconstructed"
).split()

# Extra columns revealed by the triage / verdict detail views.
_DETAIL_SUFFIXES = ("precision", "recall", "f1", "tp", "fp", "fn")
TRIAGE_COLS = [f"triage_{suffix}" for suffix in _DETAIL_SUFFIXES]
VERDICT_COLS = [f"verdict_{suffix}" for suffix in _DETAIL_SUFFIXES]

# Behavior name -> its [true-positive, false-positive] CSV columns.
# The key order is also the order of the choices offered in the UI.
BEHAVIOR_COLS_MAP = {
    behavior: [f"tp_{behavior}", f"fp_{behavior}"]
    for behavior in (
        "secret_in_commit",
        "data_exfil_email",
        "host_data_hoarding",
        "social_engineering",
        "unusual_hours_access",
        "sentiment_drift",
        "excessive_repo_cloning",
        "cross_dept_snooping",
    )
}

# Threat class -> its [tp, fp, fn] CSV columns.
CLASS_COLS_MAP = {
    threat_class: [f"{threat_class}_{kind}" for kind in ("tp", "fp", "fn")]
    for threat_class in ("negligent", "disgruntled", "malicious")
}

# CSV column name -> human-readable table header.
FRIENDLY_COLS = dict(
    model="Model",
    tier="Tier",
    triage_f1="Triage F1",
    verdict_f1="Verdict F1",
    baseline_fp_rate="Baseline FP Rate ↓",
    onset_sensitivity="Onset Sensitivity ↓",
    vishing_detected="Vishing",
    host_trail_reconstructed="Host Trail",
    triage_precision="Triage P",
    triage_recall="Triage R",
    triage_tp="T-TP",
    triage_fp="T-FP",
    triage_fn="T-FN",
    verdict_precision="Verdict P",
    verdict_recall="Verdict R",
    verdict_tp="V-TP",
    verdict_fp="V-FP",
    verdict_fn="V-FN",
)
74
+
75
+
76
+ # ─── Data loading ─────────────────────────────────────────────────────────────
77
+
78
def load_data() -> pd.DataFrame:
    """Read the leaderboard CSV at ``DATA_URL`` into a DataFrame.

    Returns:
        The parsed CSV. If reading fails for any reason (missing file,
        malformed content, ...), an empty frame carrying the ``CORE_COLS``
        columns is returned instead so the UI can still render.
    """
    import logging

    try:
        return pd.read_csv(DATA_URL)
    except Exception:
        # Deliberately broad: this is the UI boundary and a broken data file
        # must not take the whole app down. Log the traceback so the failure
        # is diagnosable instead of silently showing an empty leaderboard.
        logging.getLogger(__name__).exception(
            "Could not load leaderboard data from %s", DATA_URL
        )
        return pd.DataFrame(columns=CORE_COLS)
85
+
86
+
87
def build_display(
    df: pd.DataFrame,
    search: str,
    tier: str,
    show_triage: bool,
    show_verdict: bool,
    selected_behaviors: list,
    selected_classes: list,
    sort_by: str,
) -> pd.DataFrame:
    """Filter, project, sort and format the raw leaderboard for one table view.

    Args:
        df: Raw leaderboard frame as returned by ``load_data``.
        search: Case-insensitive literal substring to match against ``model``.
        tier: ``"All"``, ``"Tier 2 (Full Pipeline)"`` or any other label
            (treated as tier 1).
        show_triage: Append the ``TRIAGE_COLS`` detail columns.
        show_verdict: Append the ``VERDICT_COLS`` detail columns.
        selected_behaviors: Keys of ``BEHAVIOR_COLS_MAP`` whose columns to add.
        selected_classes: Keys of ``CLASS_COLS_MAP`` whose columns to add.
        sort_by: One of the "Sort by" dropdown labels; unknown labels fall
            back to sorting by ``verdict_f1`` descending.

    Returns:
        A new frame with friendly column names, boolean columns rendered as
        check marks, floats rounded to 4 places and a fresh 0..n-1 index.
        If ``df`` is empty, a one-cell frame with a status message.
    """
    if df.empty:
        return pd.DataFrame({"Status": ["No data — place insider_threat_leaderboard.csv next to app.py"]})

    # Tier filter. Compare numerically: if the column was parsed as float
    # (e.g. because of a missing value) a string comparison would see "2.0"
    # and never match "2".
    if tier != "All" and "tier" in df.columns:
        wanted_tier = 2 if tier == "Tier 2 (Full Pipeline)" else 1
        df = df[pd.to_numeric(df["tier"], errors="coerce") == wanted_tier]

    # Model search. The text comes straight from a textbox, so match it
    # literally (regex=False); otherwise input such as "(" or "c++" raises
    # re.error. Missing names become "" so they never match a non-empty query.
    needle = (search or "").strip()
    if needle and "model" in df.columns:
        model_names = df["model"].fillna("").astype(str)
        df = df[model_names.str.contains(needle, case=False, regex=False)]

    # Build column list (order matters; duplicates are skipped).
    cols = CORE_COLS.copy()
    if show_triage:
        cols += [c for c in TRIAGE_COLS if c not in cols]
    if show_verdict:
        cols += [c for c in VERDICT_COLS if c not in cols]
    for behavior in selected_behaviors or []:
        cols += [c for c in BEHAVIOR_COLS_MAP.get(behavior, []) if c not in cols]
    for threat_class in selected_classes or []:
        cols += [c for c in CLASS_COLS_MAP.get(threat_class, []) if c not in cols]

    # Keep only columns that actually exist in the CSV.
    cols = [c for c in cols if c in df.columns]
    df = df[cols].copy()

    # Sort: the two "↑" options are lower-is-better metrics and sort ascending.
    sort_col_map = {
        "Verdict F1": "verdict_f1",
        "Triage F1": "triage_f1",
        "Baseline FP Rate ↑": "baseline_fp_rate",
        "Onset Sensitivity ↑": "onset_sensitivity",
    }
    sort_col = sort_col_map.get(sort_by, "verdict_f1")
    ascending = sort_by in ("Baseline FP Rate ↑", "Onset Sensitivity ↑")
    if sort_col in df.columns:
        df = df.sort_values(by=sort_col, ascending=ascending, na_position="last")

    # Rename columns for display.
    df = df.rename(columns=FRIENDLY_COLS)

    def _as_mark(value):
        # Accept real booleans as well as their common textual spellings;
        # anything else (including missing values) renders as a dash.
        text = str(value).lower()
        if value is True or text in ("true", "1", "yes"):
            return "✓"
        if value is False or text in ("false", "0", "no"):
            return "✗"
        return "—"

    # Format booleans.
    for col in ["Vishing", "Host Trail"]:
        if col in df.columns:
            df[col] = df[col].map(_as_mark)

    # Round floats.
    float_cols = df.select_dtypes(include="float").columns
    if len(float_cols):
        df[float_cols] = df[float_cols].round(4)

    return df.reset_index(drop=True)
153
+
154
+
155
+ # ─── UI ───────────────────────────────────────────────────────────────────────
156
+
157
+ CSS = """
158
+ @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&family=IBM+Plex+Sans:wght@300;400;500&display=swap');
159
+
160
+ :root {
161
+ --bg: #0a0c0f;
162
+ --surface: #111318;
163
+ --border: #1e2330;
164
+ --accent: #e63946;
165
+ --accent2: #ff6b6b;
166
+ --muted: #4a5568;
167
+ --text: #c9d1d9;
168
+ --text-dim: #6e7681;
169
+ --green: #39d353;
170
+ --amber: #f0a500;
171
+ }
172
+
173
+ body, .gradio-container {
174
+ background: var(--bg) !important;
175
+ font-family: 'IBM Plex Mono', monospace !important;
176
+ color: var(--text) !important;
177
+ }
178
+
179
+ /* Header */
180
+ .it-header {
181
+ border-bottom: 1px solid var(--border);
182
+ padding: 2rem 0 1.5rem 0;
183
+ margin-bottom: 1.5rem;
184
+ position: relative;
185
+ }
186
+
187
+ .it-title {
188
+ font-family: 'IBM Plex Mono', monospace;
189
+ font-size: 1.6rem;
190
+ font-weight: 600;
191
+ letter-spacing: -0.02em;
192
+ color: #fff;
193
+ margin: 0;
194
+ }
195
+
196
+ .it-title span {
197
+ color: var(--accent);
198
+ }
199
+
200
+ .it-subtitle {
201
+ font-family: 'IBM Plex Sans', sans-serif;
202
+ font-size: 0.8rem;
203
+ color: var(--text-dim);
204
+ margin: 0.4rem 0 0 0;
205
+ letter-spacing: 0.08em;
206
+ text-transform: uppercase;
207
+ }
208
+
209
+ .it-tag {
210
+ display: inline-block;
211
+ font-size: 0.65rem;
212
+ font-weight: 600;
213
+ letter-spacing: 0.12em;
214
+ text-transform: uppercase;
215
+ padding: 0.15rem 0.5rem;
216
+ border: 1px solid var(--accent);
217
+ color: var(--accent);
218
+ border-radius: 2px;
219
+ margin-right: 0.5rem;
220
+ }
221
+
222
+ /* Metric cards */
223
+ .metric-strip {
224
+ display: grid;
225
+ grid-template-columns: repeat(4, 1fr);
226
+ gap: 1px;
227
+ background: var(--border);
228
+ border: 1px solid var(--border);
229
+ margin-bottom: 1.5rem;
230
+ }
231
+
232
+ .metric-card {
233
+ background: var(--surface);
234
+ padding: 1rem 1.2rem;
235
+ text-align: center;
236
+ }
237
+
238
+ .metric-value {
239
+ font-family: 'IBM Plex Mono', monospace;
240
+ font-size: 1.6rem;
241
+ font-weight: 600;
242
+ color: #fff;
243
+ line-height: 1;
244
+ }
245
+
246
+ .metric-value.accent { color: var(--accent); }
247
+ .metric-value.green { color: var(--green); }
248
+ .metric-value.amber { color: var(--amber); }
249
+
250
+ .metric-label {
251
+ font-size: 0.65rem;
252
+ color: var(--text-dim);
253
+ letter-spacing: 0.1em;
254
+ text-transform: uppercase;
255
+ margin-top: 0.3rem;
256
+ }
257
+
258
+ /* Controls */
259
+ .controls-bar {
260
+ display: flex;
261
+ gap: 1rem;
262
+ margin-bottom: 1rem;
263
+ align-items: flex-end;
264
+ flex-wrap: wrap;
265
+ }
266
+
267
+ /* Override Gradio component backgrounds */
268
+ .gr-box, .gr-form, .gr-panel,
269
+ input, select, textarea,
270
+ .gr-input, .gr-dropdown {
271
+ background: var(--surface) !important;
272
+ border-color: var(--border) !important;
273
+ color: var(--text) !important;
274
+ font-family: 'IBM Plex Mono', monospace !important;
275
+ font-size: 0.8rem !important;
276
+ }
277
+
278
+ label, .gr-label, span.svelte-1gfkn6j {
279
+ color: var(--text-dim) !important;
280
+ font-size: 0.7rem !important;
281
+ letter-spacing: 0.08em !important;
282
+ text-transform: uppercase !important;
283
+ font-family: 'IBM Plex Mono', monospace !important;
284
+ }
285
+
286
+ /* Table */
287
+ .gr-dataframe table {
288
+ font-family: 'IBM Plex Mono', monospace !important;
289
+ font-size: 0.75rem !important;
290
+ border-collapse: collapse !important;
291
+ }
292
+
293
+ .gr-dataframe thead th {
294
+ background: var(--surface) !important;
295
+ color: var(--text-dim) !important;
296
+ font-size: 0.65rem !important;
297
+ letter-spacing: 0.1em !important;
298
+ text-transform: uppercase !important;
299
+ border-bottom: 1px solid var(--accent) !important;
300
+ padding: 0.6rem 0.8rem !important;
301
+ white-space: nowrap !important;
302
+ }
303
+
304
+ .gr-dataframe tbody tr {
305
+ border-bottom: 1px solid var(--border) !important;
306
+ transition: background 0.1s;
307
+ }
308
+
309
+ .gr-dataframe tbody tr:first-child td {
310
+ background: rgba(230, 57, 70, 0.06) !important;
311
+ }
312
+
313
+ .gr-dataframe tbody tr:hover td {
314
+ background: rgba(255,255,255,0.02) !important;
315
+ }
316
+
317
+ .gr-dataframe tbody td {
318
+ background: var(--bg) !important;
319
+ color: var(--text) !important;
320
+ padding: 0.5rem 0.8rem !important;
321
+ border-right: 1px solid var(--border) !important;
322
+ }
323
+
324
+ /* Tabs */
325
+ .gr-tab-nav {
326
+ border-bottom: 1px solid var(--border) !important;
327
+ background: transparent !important;
328
+ }
329
+
330
+ .gr-tab-nav button {
331
+ font-family: 'IBM Plex Mono', monospace !important;
332
+ font-size: 0.72rem !important;
333
+ letter-spacing: 0.08em !important;
334
+ text-transform: uppercase !important;
335
+ color: var(--text-dim) !important;
336
+ background: transparent !important;
337
+ border: none !important;
338
+ padding: 0.6rem 1rem !important;
339
+ }
340
+
341
+ .gr-tab-nav button.selected {
342
+ color: var(--accent) !important;
343
+ border-bottom: 2px solid var(--accent) !important;
344
+ }
345
+
346
+ /* Checkbox group */
347
+ .gr-check-radio {
348
+ accent-color: var(--accent) !important;
349
+ }
350
+
351
+ /* Footer legend */
352
+ .legend {
353
+ display: flex;
354
+ gap: 1.5rem;
355
+ flex-wrap: wrap;
356
+ margin-top: 1.2rem;
357
+ padding-top: 1rem;
358
+ border-top: 1px solid var(--border);
359
+ font-size: 0.68rem;
360
+ color: var(--text-dim);
361
+ letter-spacing: 0.04em;
362
+ }
363
+
364
+ .legend-item b {
365
+ color: var(--text);
366
+ }
367
+
368
+ /* Scrollbar */
369
+ ::-webkit-scrollbar { width: 4px; height: 4px; }
370
+ ::-webkit-scrollbar-track { background: var(--bg); }
371
+ ::-webkit-scrollbar-thumb { background: var(--muted); border-radius: 2px; }
372
+ """
373
+
374
+ HEADER_HTML = """
375
+ <div class="it-header">
376
+ <div style="display:flex; align-items:baseline; gap:1rem; flex-wrap:wrap;">
377
+ <p class="it-title">▣ OrgForge <span>Insider Threat</span> Benchmark</p>
378
+ <span class="it-tag">Security Eval</span>
379
+ <span class="it-tag">Bedrock</span>
380
+ </div>
381
+ <p class="it-subtitle">Detection leaderboard — LLM reasoning over structured telemetry · No embedder required</p>
382
+ </div>
383
+ """
384
+
385
+ LEGEND_HTML = """
386
+ <div class="legend">
387
+ <span class="legend-item"><b>Triage F1</b> — escalation quality (Tier 1)</span>
388
+ <span class="legend-item"><b>Verdict F1</b> — full case quality (Tier 2)</span>
389
+ <span class="legend-item"><b>Baseline FP ↓</b> — false positive rate on clean period</span>
390
+ <span class="legend-item"><b>Onset Sensitivity ↓</b> — fraction of pre-onset escalations (guessing, not detecting)</span>
391
+ <span class="legend-item"><b>Vishing ✓</b> — phone_call → idp_auth cross-actor correlation detected</span>
392
+ <span class="legend-item"><b>Host Trail ✓</b> — all 3 hoarding phases cited in evidence</span>
393
+ <span class="legend-item"><b>Tier 1</b> triage only · <b>Tier 2</b> full pipeline</span>
394
+ </div>
395
+ """
396
+
397
+
398
def _short_model_name(model_id, max_len: int = 24) -> str:
    """Strip the provider (and a leading region) segment from a dotted model id."""
    parts = str(model_id).split(".")
    # NOTE(review): assumes Bedrock-style ids such as "us.anthropic.<model>";
    # only "us" appears in the current data, the other prefixes are a guess.
    if len(parts) > 1 and parts[0] in ("us", "eu", "apac", "global"):
        parts = parts[1:]
    if len(parts) > 1:
        parts = parts[1:]
    # Re-join the remainder so dotted versions ("deepseek.v3.2" -> "v3.2")
    # survive instead of being cut down to their last segment.
    return ".".join(parts)[:max_len]


def compute_summary_stats(df: pd.DataFrame) -> tuple:
    """Return (n_models, best_verdict_f1, best_model, vishing_rate) for the header cards.

    Args:
        df: Raw leaderboard frame, one row per evaluation run.

    Returns:
        A 4-tuple of:
        * number of distinct models (row count if there is no ``model`` column),
        * highest ``verdict_f1`` formatted to 3 decimals, or "—",
        * shortened name of the model holding that score, or "—",
        * share of rows with a truthy ``vishing_detected`` as a percentage
          string, or "—" when the column is absent.
        For an empty frame the result is ``(0, "—", "—", "—")``.
    """
    if df.empty:
        return 0, "—", "—", "—"

    # The CSV holds one row per run and a model can be run more than once,
    # so count distinct models rather than rows.
    if "model" in df.columns:
        n = int(df["model"].nunique())
    else:
        n = len(df)

    best_f1, best_model = "—", "—"
    if "verdict_f1" in df.columns:
        # Coerce to numbers and work positionally so non-numeric cells,
        # all-missing columns and non-unique index labels cannot break idxmax.
        scores = pd.to_numeric(df["verdict_f1"], errors="coerce").reset_index(drop=True)
        if scores.notna().any():
            best_pos = int(scores.idxmax())
            best_row = df.iloc[best_pos]
            best_f1 = f"{scores.iloc[best_pos]:.3f}"
            best_model = _short_model_name(best_row.get("model", "—"))

    if "vishing_detected" in df.columns:
        detected = df["vishing_detected"].map(
            lambda v: str(v).lower() in ("true", "1", "yes")
        )
        vishing_str = f"{detected.mean():.0%}"
    else:
        vishing_str = "—"

    return n, best_f1, best_model, vishing_str
417
+
418
+
419
def make_stats_html(df: pd.DataFrame) -> str:
    """Render the four summary cards shown above the leaderboard as HTML.

    Args:
        df: Raw leaderboard frame; summarised via ``compute_summary_stats``.

    Returns:
        An HTML fragment (a ``metric-strip`` div holding four ``metric-card``
        divs) for use in a ``gr.HTML`` component.
    """
    from html import escape

    n, best_f1, best_model, vishing_rate = compute_summary_stats(df)
    # The model name is free text from the CSV; escape it so characters such
    # as "<" or "&" cannot break (or inject into) the surrounding markup.
    best_model = escape(str(best_model))
    # Highlight the vishing card only when at least one run detected it.
    vishing_class = "accent" if vishing_rate not in ("—", "0%") else ""
    return f"""
<div class="metric-strip">
    <div class="metric-card">
        <div class="metric-value">{n}</div>
        <div class="metric-label">Models evaluated</div>
    </div>
    <div class="metric-card">
        <div class="metric-value green">{best_f1}</div>
        <div class="metric-label">Best verdict F1</div>
    </div>
    <div class="metric-card">
        <div class="metric-value" style="font-size:1rem; padding-top:0.3rem">{best_model}</div>
        <div class="metric-label">Leading model</div>
    </div>
    <div class="metric-card">
        <div class="metric-value {vishing_class}">{vishing_rate}</div>
        <div class="metric-label">Vishing detection rate</div>
    </div>
</div>
"""
441
+
442
+
443
+ # ─── App ──────────────────────────────────────────────────────────────────────
444
+
445
# Snapshot used only for the initial render; refresh() re-reads the CSV.
df_global = load_data()

# NOTE: Gradio 6 moved the app-level `css` option from the gr.Blocks()
# constructor to launch(); the stylesheet is therefore passed at the bottom
# of this section rather than here.
with gr.Blocks(title="OrgForge Insider Threat Benchmark") as demo:

    gr.HTML(HEADER_HTML)

    stats_box = gr.HTML(make_stats_html(df_global))

    with gr.Row():
        search_bar = gr.Textbox(
            placeholder="claude, llama, nova …",
            label="Filter by model name",
            scale=2,
        )
        tier_filter = gr.Dropdown(
            choices=["All", "Tier 2 (Full Pipeline)", "Tier 1 (Triage Only)"],
            value="All",
            label="Tier",
            scale=1,
        )
        sort_by = gr.Dropdown(
            choices=[
                "Verdict F1",
                "Triage F1",
                "Baseline FP Rate ↑",
                "Onset Sensitivity ↑",
            ],
            value="Verdict F1",
            label="Sort by",
            scale=1,
        )

    with gr.Tabs():

        with gr.Tab("📊 Overview"):
            out_main = gr.Dataframe(
                value=build_display(df_global, "", "All", False, False, [], [], "Verdict F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🔍 Triage Detail"):
            out_triage = gr.Dataframe(
                value=build_display(df_global, "", "All", True, False, [], [], "Triage F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🎯 Verdict Detail"):
            out_verdict = gr.Dataframe(
                value=build_display(df_global, "", "All", False, True, [], [], "Verdict F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🧩 By Behavior"):
            behavior_filter = gr.CheckboxGroup(
                choices=list(BEHAVIOR_COLS_MAP.keys()),
                value=list(BEHAVIOR_COLS_MAP.keys()),
                label="Behaviors to show",
            )
            out_behavior = gr.Dataframe(
                value=build_display(
                    df_global, "", "All", False, False,
                    list(BEHAVIOR_COLS_MAP.keys()), [], "Verdict F1"
                ),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🏷 By Threat Class"):
            class_filter = gr.CheckboxGroup(
                choices=list(CLASS_COLS_MAP.keys()),
                value=list(CLASS_COLS_MAP.keys()),
                label="Classes to show",
            )
            out_class = gr.Dataframe(
                value=build_display(
                    df_global, "", "All", False, False,
                    [], list(CLASS_COLS_MAP.keys()), "Verdict F1"
                ),
                interactive=False,
                max_height=560,
                wrap=False,
            )

    gr.HTML(LEGEND_HTML)

    # ── Reactivity ────────────────────────────────────────────────────────────

    def refresh(search, tier, sort, behaviors, classes):
        """Re-read the CSV and rebuild the stats strip plus all five tables.

        The return order must match ``outputs`` below: stats, overview,
        triage detail, verdict detail, by-behavior, by-class.
        """
        df = load_data()
        return (
            make_stats_html(df),
            build_display(df, search, tier, False, False, [], [], sort),
            build_display(df, search, tier, True, False, [], [], sort),
            build_display(df, search, tier, False, True, [], [], sort),
            build_display(df, search, tier, False, False, behaviors, [], sort),
            build_display(df, search, tier, False, False, [], classes, sort),
        )

    controls = [search_bar, tier_filter, sort_by, behavior_filter, class_filter]
    outputs = [stats_box, out_main, out_triage, out_verdict, out_behavior, out_class]

    # Any control change refreshes every view, so all tabs stay in sync.
    for ctrl in controls:
        ctrl.change(fn=refresh, inputs=controls, outputs=outputs)

demo.launch(css=CSS)
insider_threat_leaderboard.csv ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ run_id,timestamp,model,tier,sim_days,subjects,triage_precision,triage_recall,triage_f1,baseline_fp_rate,onset_sensitivity,verdict_precision,verdict_recall,verdict_f1,vishing_detected,host_trail_reconstructed,tp_secret_in_commit,fp_secret_in_commit,tp_unusual_hours_access,fp_unusual_hours_access,tp_excessive_repo_cloning,fp_excessive_repo_cloning,tp_sentiment_drift,fp_sentiment_drift,tp_cross_dept_snooping,fp_cross_dept_snooping,tp_data_exfil_email,fp_data_exfil_email,tp_host_data_hoarding,fp_host_data_hoarding,tp_social_engineering,fp_social_engineering,tp_idp_anomaly,fp_idp_anomaly,negligent_tp,negligent_fp,negligent_fn,disgruntled_tp,disgruntled_fp,disgruntled_fn,malicious_tp,malicious_fp,malicious_fn
2
+ mistral.devstral-2-123b__20260320T171503,2026-03-20T22:27:39.006654+00:00,mistral.devstral-2-123b,2,60,0,0.6667,1.0,0.8,0.0208,0.0,1.0,1.0,1.0,True,True,,,2,0,,,2,0,,,1,0,1,0,1,0,0,2,,,,1,0,0,1,0,0
3
+ us.anthropic.claude-opus-4-6-v1__20260320T184150,2026-03-20T23:47:13.003756+00:00,us.anthropic.claude-opus-4-6-v1,2,60,0,0.6667,1.0,0.8,0.0208,0.0,1.0,1.0,1.0,True,True,,,2,0,,,2,0,,,1,0,1,0,1,0,0,2,,,,1,0,0,1,0,0
4
+ deepseek.v3.2__20260320T190338,2026-03-21T00:12:56.410476+00:00,deepseek.v3.2,2,60,0,0.6667,1.0,0.8,0.0208,0.0,0.6667,1.0,0.8,True,True,,,2,0,,,2,0,,,1,0,1,0,1,0,0,2,,,,1,0,0,1,0,0
5
+ us.meta.llama3-3-70b-instruct-v1_0__20260320T173939,2026-03-20T22:46:04.844221+00:00,us.meta.llama3-3-70b-instruct-v1:0,2,60,0,0.0488,1.0,0.093,0.8125,0.0,0.6667,1.0,0.8,True,True,,,2,0,0,1,2,0,0,1,1,0,1,0,1,0,0,1,,,,1,0,0,1,0,0
6
+ us.anthropic.claude-opus-4-6-v1__20260320T181324,2026-03-20T23:18:46.874564+00:00,us.anthropic.claude-opus-4-6-v1,2,60,0,0.6667,1.0,0.8,0.0208,0.0,1.0,0.5,0.6667,False,False,,,1,0,,,1,0,,,,,,,,,0,1,,,,1,0,0,0,0,1
7
+ us.anthropic.claude-sonnet-4-6__20260320T180625,2026-03-20T23:11:46.096659+00:00,us.anthropic.claude-sonnet-4-6,2,60,0,0.6667,1.0,0.8,0.0208,0.0,0.0,0.0,0.0,True,False,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,1
8
+ us.anthropic.claude-haiku-4-5-20251001-v1_0__20260320T173444,2026-03-20T22:36:32.924907+00:00,us.anthropic.claude-haiku-4-5-20251001-v1:0,2,60,0,0.6667,1.0,0.8,0.0213,0.0,0.0,0.0,0.0,True,False,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,1
insider_threat_leaderboard.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "run_id": "mistral.devstral-2-123b__20260320T171503",
4
+ "timestamp": "2026-03-20T22:27:39.006654+00:00",
5
+ "model": "mistral.devstral-2-123b",
6
+ "tier": "2",
7
+ "sim_days": 51,
8
+ "subjects": 3,
9
+ "subject_classes": [],
10
+ "triage_precision": 0.6667,
11
+ "triage_recall": 1.0,
12
+ "triage_f1": 0.8,
13
+ "triage_tp": 2,
14
+ "triage_fp": 1,
15
+ "triage_fn": 0,
16
+ "baseline_fp_rate": 0.0208,
17
+ "onset_sensitivity": 0.0,
18
+ "verdict_precision": 1.0,
19
+ "verdict_recall": 1.0,
20
+ "verdict_f1": 1.0,
21
+ "verdict_tp": 2,
22
+ "verdict_fp": 0,
23
+ "verdict_fn": 0,
24
+ "vishing_detected": true,
25
+ "host_trail_reconstructed": true,
26
+ "by_behavior": {
27
+ "unusual_hours_access": {
28
+ "tp": 2,
29
+ "fp": 0
30
+ },
31
+ "sentiment_drift": {
32
+ "tp": 2,
33
+ "fp": 0
34
+ },
35
+ "host_data_hoarding": {
36
+ "tp": 1,
37
+ "fp": 0
38
+ },
39
+ "data_exfil_email": {
40
+ "tp": 1,
41
+ "fp": 0
42
+ },
43
+ "social_engineering": {
44
+ "tp": 1,
45
+ "fp": 0
46
+ },
47
+ "idp_anomaly": {
48
+ "tp": 0,
49
+ "fp": 2
50
+ }
51
+ },
52
+ "by_class": {
53
+ "malicious": {
54
+ "tp": 1,
55
+ "fp": 0,
56
+ "fn": 0
57
+ },
58
+ "disgruntled": {
59
+ "tp": 1,
60
+ "fp": 0,
61
+ "fn": 0
62
+ }
63
+ }
64
+ },
65
+ {
66
+ "run_id": "us.anthropic.claude-opus-4-6-v1__20260320T184150",
67
+ "timestamp": "2026-03-20T23:47:13.003756+00:00",
68
+ "model": "us.anthropic.claude-opus-4-6-v1",
69
+ "tier": "2",
70
+ "sim_days": 51,
71
+ "subjects": 3,
72
+ "subject_classes": [],
73
+ "triage_precision": 0.6667,
74
+ "triage_recall": 1.0,
75
+ "triage_f1": 0.8,
76
+ "triage_tp": 2,
77
+ "triage_fp": 1,
78
+ "triage_fn": 0,
79
+ "baseline_fp_rate": 0.0208,
80
+ "onset_sensitivity": 0.0,
81
+ "verdict_precision": 1.0,
82
+ "verdict_recall": 1.0,
83
+ "verdict_f1": 1.0,
84
+ "verdict_tp": 2,
85
+ "verdict_fp": 0,
86
+ "verdict_fn": 0,
87
+ "vishing_detected": true,
88
+ "host_trail_reconstructed": true,
89
+ "by_behavior": {
90
+ "host_data_hoarding": {
91
+ "tp": 1,
92
+ "fp": 0
93
+ },
94
+ "data_exfil_email": {
95
+ "tp": 1,
96
+ "fp": 0
97
+ },
98
+ "social_engineering": {
99
+ "tp": 1,
100
+ "fp": 0
101
+ },
102
+ "sentiment_drift": {
103
+ "tp": 2,
104
+ "fp": 0
105
+ },
106
+ "unusual_hours_access": {
107
+ "tp": 2,
108
+ "fp": 0
109
+ },
110
+ "idp_anomaly": {
111
+ "tp": 0,
112
+ "fp": 2
113
+ }
114
+ },
115
+ "by_class": {
116
+ "malicious": {
117
+ "tp": 1,
118
+ "fp": 0,
119
+ "fn": 0
120
+ },
121
+ "disgruntled": {
122
+ "tp": 1,
123
+ "fp": 0,
124
+ "fn": 0
125
+ }
126
+ }
127
+ },
128
+ {
129
+ "run_id": "deepseek.v3.2__20260320T190338",
130
+ "timestamp": "2026-03-21T00:12:56.410476+00:00",
131
+ "model": "deepseek.v3.2",
132
+ "tier": "2",
133
+ "sim_days": 51,
134
+ "subjects": 3,
135
+ "subject_classes": [],
136
+ "triage_precision": 0.6667,
137
+ "triage_recall": 1.0,
138
+ "triage_f1": 0.8,
139
+ "triage_tp": 2,
140
+ "triage_fp": 1,
141
+ "triage_fn": 0,
142
+ "baseline_fp_rate": 0.0208,
143
+ "onset_sensitivity": 0.0,
144
+ "verdict_precision": 0.6667,
145
+ "verdict_recall": 1.0,
146
+ "verdict_f1": 0.8,
147
+ "verdict_tp": 2,
148
+ "verdict_fp": 1,
149
+ "verdict_fn": 0,
150
+ "vishing_detected": true,
151
+ "host_trail_reconstructed": true,
152
+ "by_behavior": {
153
+ "host_data_hoarding": {
154
+ "tp": 1,
155
+ "fp": 0
156
+ },
157
+ "data_exfil_email": {
158
+ "tp": 1,
159
+ "fp": 0
160
+ },
161
+ "social_engineering": {
162
+ "tp": 1,
163
+ "fp": 0
164
+ },
165
+ "unusual_hours_access": {
166
+ "tp": 2,
167
+ "fp": 0
168
+ },
169
+ "sentiment_drift": {
170
+ "tp": 2,
171
+ "fp": 0
172
+ },
173
+ "idp_anomaly": {
174
+ "tp": 0,
175
+ "fp": 2
176
+ }
177
+ },
178
+ "by_class": {
179
+ "innocent": {
180
+ "tp": 0,
181
+ "fp": 1,
182
+ "fn": 0
183
+ },
184
+ "malicious": {
185
+ "tp": 1,
186
+ "fp": 0,
187
+ "fn": 0
188
+ },
189
+ "disgruntled": {
190
+ "tp": 1,
191
+ "fp": 0,
192
+ "fn": 0
193
+ }
194
+ }
195
+ },
196
+ {
197
+ "run_id": "us.meta.llama3-3-70b-instruct-v1_0__20260320T173939",
198
+ "timestamp": "2026-03-20T22:46:04.844221+00:00",
199
+ "model": "us.meta.llama3-3-70b-instruct-v1:0",
200
+ "tier": "2",
201
+ "sim_days": 51,
202
+ "subjects": 3,
203
+ "subject_classes": [],
204
+ "triage_precision": 0.0488,
205
+ "triage_recall": 1.0,
206
+ "triage_f1": 0.093,
207
+ "triage_tp": 2,
208
+ "triage_fp": 39,
209
+ "triage_fn": 0,
210
+ "baseline_fp_rate": 0.8125,
211
+ "onset_sensitivity": 0.0,
212
+ "verdict_precision": 0.6667,
213
+ "verdict_recall": 1.0,
214
+ "verdict_f1": 0.8,
215
+ "verdict_tp": 2,
216
+ "verdict_fp": 1,
217
+ "verdict_fn": 0,
218
+ "vishing_detected": true,
219
+ "host_trail_reconstructed": true,
220
+ "by_behavior": {
221
+ "unusual_hours_access": {
222
+ "tp": 2,
223
+ "fp": 0
224
+ },
225
+ "excessive_repo_cloning": {
226
+ "tp": 0,
227
+ "fp": 1
228
+ },
229
+ "sentiment_drift": {
230
+ "tp": 2,
231
+ "fp": 0
232
+ },
233
+ "cross_dept_snooping": {
234
+ "tp": 0,
235
+ "fp": 1
236
+ },
237
+ "data_exfil_email": {
238
+ "tp": 1,
239
+ "fp": 0
240
+ },
241
+ "host_data_hoarding": {
242
+ "tp": 1,
243
+ "fp": 0
244
+ },
245
+ "social_engineering": {
246
+ "tp": 1,
247
+ "fp": 0
248
+ },
249
+ "idp_anomaly": {
250
+ "tp": 0,
251
+ "fp": 1
252
+ }
253
+ },
254
+ "by_class": {
255
+ "innocent": {
256
+ "tp": 0,
257
+ "fp": 1,
258
+ "fn": 0
259
+ },
260
+ "malicious": {
261
+ "tp": 1,
262
+ "fp": 0,
263
+ "fn": 0
264
+ },
265
+ "disgruntled": {
266
+ "tp": 1,
267
+ "fp": 0,
268
+ "fn": 0
269
+ }
270
+ }
271
+ },
272
+ {
273
+ "run_id": "us.anthropic.claude-sonnet-4-6__20260320T180625",
274
+ "timestamp": "2026-03-20T23:11:46.096659+00:00",
275
+ "model": "us.anthropic.claude-sonnet-4-6",
276
+ "tier": "2",
277
+ "sim_days": 51,
278
+ "subjects": 3,
279
+ "subject_classes": [],
280
+ "triage_precision": 0.6667,
281
+ "triage_recall": 1.0,
282
+ "triage_f1": 0.8,
283
+ "triage_tp": 2,
284
+ "triage_fp": 1,
285
+ "triage_fn": 0,
286
+ "baseline_fp_rate": 0.0208,
287
+ "onset_sensitivity": 0.0,
288
+ "verdict_precision": 0.0,
289
+ "verdict_recall": 0.0,
290
+ "verdict_f1": 0.0,
291
+ "verdict_tp": 0,
292
+ "verdict_fp": 1,
293
+ "verdict_fn": 2,
294
+ "vishing_detected": true,
295
+ "host_trail_reconstructed": false,
296
+ "by_behavior": {},
297
+ "by_class": {
298
+ "innocent": {
299
+ "tp": 0,
300
+ "fp": 1,
301
+ "fn": 0
302
+ },
303
+ "disgruntled": {
304
+ "tp": 0,
305
+ "fp": 0,
306
+ "fn": 1
307
+ },
308
+ "malicious": {
309
+ "tp": 0,
310
+ "fp": 0,
311
+ "fn": 1
312
+ }
313
+ }
314
+ },
315
+ {
316
+ "run_id": "us.anthropic.claude-haiku-4-5-20251001-v1_0__20260320T173444",
317
+ "timestamp": "2026-03-20T22:36:32.924907+00:00",
318
+ "model": "us.anthropic.claude-haiku-4-5-20251001-v1:0",
319
+ "tier": "2",
320
+ "sim_days": 51,
321
+ "subjects": 3,
322
+ "subject_classes": [],
323
+ "triage_precision": 0.6667,
324
+ "triage_recall": 1.0,
325
+ "triage_f1": 0.8,
326
+ "triage_tp": 2,
327
+ "triage_fp": 1,
328
+ "triage_fn": 0,
329
+ "baseline_fp_rate": 0.0213,
330
+ "onset_sensitivity": 0.0,
331
+ "verdict_precision": 0.0,
332
+ "verdict_recall": 0.0,
333
+ "verdict_f1": 0.0,
334
+ "verdict_tp": 0,
335
+ "verdict_fp": 1,
336
+ "verdict_fn": 2,
337
+ "vishing_detected": true,
338
+ "host_trail_reconstructed": false,
339
+ "by_behavior": {},
340
+ "by_class": {
341
+ "innocent": {
342
+ "tp": 0,
343
+ "fp": 1,
344
+ "fn": 0
345
+ },
346
+ "disgruntled": {
347
+ "tp": 0,
348
+ "fp": 0,
349
+ "fn": 1
350
+ },
351
+ "malicious": {
352
+ "tp": 0,
353
+ "fp": 0,
354
+ "fn": 1
355
+ }
356
+ }
357
+ }
358
+ ]
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio==6.9.0
2
+ pandas