orgforge-it / app.py
jflynt's picture
Upload folder using huggingface_hub
8581d75 verified
import html
import logging
import time

import gradio as gr
import pandas as pd
# Leaderboard CSV that load_data() reads with pandas.
DATA_URL = (
    "https://huggingface.co/datasets/aeriesec/orgforge-insider-threat"
    "/resolve/main/leaderboard/insider_threat_leaderboard.csv"
)

# ─── Column definitions ───────────────────────────────────────────────────────

# Columns shown on every tab, in display order.
CORE_COLS = [
    "model",
    "tier",
    "triage_f1",
    "verdict_f1",
    "baseline_fp_rate",
    "onset_sensitivity",
    "vishing_detected",
    "host_trail_reconstructed",
]

# Suffixes shared by the triage and verdict detail column groups.
_PRF_SUFFIXES = ("precision", "recall", "f1", "tp", "fp", "fn")

# Extra columns for the triage / verdict detail views: "<stage>_<suffix>".
TRIAGE_COLS = [f"triage_{suffix}" for suffix in _PRF_SUFFIXES]
VERDICT_COLS = [f"verdict_{suffix}" for suffix in _PRF_SUFFIXES]

# Behavior names offered in the "By Behavior" checkbox group, in display order.
_BEHAVIOR_NAMES = (
    "secret_in_commit",
    "data_exfil_email",
    "host_data_hoarding",
    "social_engineering",
    "unusual_hours_access",
    "sentiment_drift",
    "excessive_repo_cloning",
    "cross_dept_snooping",
)

# behavior name -> its "tp_<name>" / "fp_<name>" columns.
BEHAVIOR_COLS_MAP = {
    behavior: [f"tp_{behavior}", f"fp_{behavior}"]
    for behavior in _BEHAVIOR_NAMES
}

# threat class -> its "<class>_tp" / "<class>_fp" / "<class>_fn" columns.
CLASS_COLS_MAP = {
    threat_class: [f"{threat_class}_{kind}" for kind in ("tp", "fp", "fn")]
    for threat_class in ("negligent", "disgruntled", "malicious")
}
# CSV column name -> header text shown in the tables.
# Columns without an entry keep their raw CSV name when displayed.
FRIENDLY_COLS = {
    # Core columns
    "model": "Model",
    "tier": "Tier",
    "triage_f1": "Triage F1",
    "verdict_f1": "Verdict F1",
    "baseline_fp_rate": "Baseline FP Rate ↓",
    "onset_sensitivity": "Onset Sensitivity ↓",
    "vishing_detected": "Vishing",
    "host_trail_reconstructed": "Host Trail",
    # Triage detail
    "triage_precision": "Triage P",
    "triage_recall": "Triage R",
    "triage_tp": "T-TP",
    "triage_fp": "T-FP",
    "triage_fn": "T-FN",
    # Verdict detail
    "verdict_precision": "Verdict P",
    "verdict_recall": "Verdict R",
    "verdict_tp": "V-TP",
    "verdict_fp": "V-FP",
    "verdict_fn": "V-FN",
}
# ─── Data loading ─────────────────────────────────────────────────────────────
def load_data() -> pd.DataFrame:
    """Read the leaderboard CSV from ``DATA_URL``.

    Returns:
        The parsed frame, or an empty frame carrying the ``CORE_COLS``
        columns when the CSV cannot be fetched or parsed.
    """
    try:
        return pd.read_csv(DATA_URL)
    except Exception:
        # Deliberately best-effort: the UI must still render when the fetch
        # fails, but record the cause so the empty table can be diagnosed.
        logging.getLogger(__name__).exception(
            "Failed to load leaderboard CSV from %s", DATA_URL
        )
        # Return an empty frame with expected columns so the UI doesn't crash
        return pd.DataFrame(columns=CORE_COLS)
def _format_flag(value) -> str:
    """Render a boolean-like cell as a check mark, a cross, or a dash if unknown."""
    token = str(value).strip().lower()
    # "1.0"/"0.0" appear when pandas parses a 0/1 column containing blanks as float.
    if token in ("true", "1", "1.0", "yes"):
        return "✓"
    if token in ("false", "0", "0.0", "no"):
        return "✗"
    return "—"


def build_display(
    df: pd.DataFrame,
    search: str,
    tier: str,
    show_triage: bool,
    show_verdict: bool,
    selected_behaviors: list,
    selected_classes: list,
    sort_by: str,
) -> pd.DataFrame:
    """Build the table shown in one leaderboard tab.

    Args:
        df: Raw leaderboard frame; it is not modified.
        search: Case-insensitive literal substring matched against ``model``.
        tier: ``"All"`` or one of the tier dropdown labels.
        show_triage: Append the ``TRIAGE_COLS`` columns.
        show_verdict: Append the ``VERDICT_COLS`` columns.
        selected_behaviors: Keys of ``BEHAVIOR_COLS_MAP`` whose columns to append.
        selected_classes: Keys of ``CLASS_COLS_MAP`` whose columns to append.
        sort_by: Label from the sort dropdown; unknown labels sort by verdict F1.

    Returns:
        A new frame with friendly column headers, formatted flag columns and
        floats rounded to 4 decimals, or a one-cell ``Status`` frame when
        ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame(
            {"Status": ["No data — the leaderboard CSV could not be loaded or contains no rows"]}
        )

    # Tier filter. Compare numerically so 2, "2" and 2.0 all match.
    if tier != "All" and "tier" in df.columns:
        tier_val = 2 if tier == "Tier 2 (Full Pipeline)" else 1
        df = df[pd.to_numeric(df["tier"], errors="coerce") == tier_val]

    # Model search. The text comes straight from the UI, so match it literally
    # (regex=False); otherwise input such as "(" or "c++" raises re.error.
    needle = (search or "").strip()
    if needle and "model" in df.columns:
        names = df["model"].fillna("").astype(str)
        df = df[names.str.contains(needle, case=False, regex=False)]

    # Build column list: core columns first, then the requested groups.
    wanted = list(CORE_COLS)
    if show_triage:
        wanted += TRIAGE_COLS
    if show_verdict:
        wanted += VERDICT_COLS
    for behavior in selected_behaviors or []:
        wanted += BEHAVIOR_COLS_MAP.get(behavior, [])
    for threat_class in selected_classes or []:
        wanted += CLASS_COLS_MAP.get(threat_class, [])

    # De-duplicate (first occurrence wins) and keep only columns that
    # actually exist in the CSV.
    cols = [c for c in dict.fromkeys(wanted) if c in df.columns]
    df = df[cols].copy()

    # Sort. The two "↑" options are lower-is-better, hence ascending.
    sort_col_map = {
        "Verdict F1": "verdict_f1",
        "Triage F1": "triage_f1",
        "Baseline FP Rate ↑": "baseline_fp_rate",
        "Onset Sensitivity ↑": "onset_sensitivity",
    }
    sort_col = sort_col_map.get(sort_by, "verdict_f1")
    ascending = sort_by in ("Baseline FP Rate ↑", "Onset Sensitivity ↑")
    if sort_col in df.columns:
        df = df.sort_values(by=sort_col, ascending=ascending, na_position="last")

    # Rename columns for display
    df = df.rename(columns=FRIENDLY_COLS)

    # Format booleans
    for col in ("Vishing", "Host Trail"):
        if col in df.columns:
            df[col] = df[col].map(_format_flag)

    # Round floats
    float_cols = df.select_dtypes(include="float").columns
    if len(float_cols):
        df[float_cols] = df[float_cols].round(4)

    return df.reset_index(drop=True)
# ─── UI ───────────────────────────────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks(css=...).
# It declares the color variables under :root, then styles the header
# (.it-*), the metric cards (.metric-*), the form controls, the dataframe
# tables, the tab bar, the legend footer and the scrollbars.
# NOTE(review): selectors such as `.gr-dataframe`, `.gr-tab-nav` and
# `span.svelte-1gfkn6j` look like Gradio-internal class names and are
# presumably version-specific — confirm they still match the installed Gradio.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&family=IBM+Plex+Sans:wght@300;400;500&display=swap');
:root {
--bg: #0a0c0f;
--surface: #111318;
--border: #1e2330;
--accent: #e63946;
--accent2: #ff6b6b;
--muted: #4a5568;
--text: #c9d1d9;
--text-dim: #6e7681;
--green: #39d353;
--amber: #f0a500;
}
body, .gradio-container {
background: var(--bg) !important;
font-family: 'IBM Plex Mono', monospace !important;
color: var(--text) !important;
}
/* Header */
.it-header {
border-bottom: 1px solid var(--border);
padding: 2rem 0 1.5rem 0;
margin-bottom: 1.5rem;
position: relative;
}
.it-title {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.6rem;
font-weight: 600;
letter-spacing: -0.02em;
color: #fff;
margin: 0;
}
.it-title span {
color: var(--accent);
}
.it-subtitle {
font-family: 'IBM Plex Sans', sans-serif;
font-size: 0.8rem;
color: var(--text-dim);
margin: 0.4rem 0 0 0;
letter-spacing: 0.08em;
text-transform: uppercase;
}
.it-tag {
display: inline-block;
font-size: 0.65rem;
font-weight: 600;
letter-spacing: 0.12em;
text-transform: uppercase;
padding: 0.15rem 0.5rem;
border: 1px solid var(--accent);
color: var(--accent);
border-radius: 2px;
margin-right: 0.5rem;
}
/* Metric cards */
.metric-strip {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 1px;
background: var(--border);
border: 1px solid var(--border);
margin-bottom: 1.5rem;
}
.metric-card {
background: var(--surface);
padding: 1rem 1.2rem;
text-align: center;
}
.metric-value {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.6rem;
font-weight: 600;
color: #fff;
line-height: 1;
}
.metric-value.accent { color: var(--accent); }
.metric-value.green { color: var(--green); }
.metric-value.amber { color: var(--amber); }
.metric-label {
font-size: 0.65rem;
color: var(--text-dim);
letter-spacing: 0.1em;
text-transform: uppercase;
margin-top: 0.3rem;
}
/* Controls */
.controls-bar {
display: flex;
gap: 1rem;
margin-bottom: 1rem;
align-items: flex-end;
flex-wrap: wrap;
}
/* Override Gradio component backgrounds */
.gr-box, .gr-form, .gr-panel,
input, select, textarea,
.gr-input, .gr-dropdown {
background: var(--surface) !important;
border-color: var(--border) !important;
color: var(--text) !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.8rem !important;
}
label, .gr-label, span.svelte-1gfkn6j {
color: var(--text-dim) !important;
font-size: 0.7rem !important;
letter-spacing: 0.08em !important;
text-transform: uppercase !important;
font-family: 'IBM Plex Mono', monospace !important;
}
/* Table */
.gr-dataframe table {
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.75rem !important;
border-collapse: collapse !important;
}
.gr-dataframe thead th {
background: var(--surface) !important;
color: var(--text-dim) !important;
font-size: 0.65rem !important;
letter-spacing: 0.1em !important;
text-transform: uppercase !important;
border-bottom: 1px solid var(--accent) !important;
padding: 0.6rem 0.8rem !important;
white-space: nowrap !important;
}
.gr-dataframe tbody tr {
border-bottom: 1px solid var(--border) !important;
transition: background 0.1s;
}
.gr-dataframe tbody tr:first-child td {
background: rgba(230, 57, 70, 0.06) !important;
}
.gr-dataframe tbody tr:hover td {
background: rgba(255,255,255,0.02) !important;
}
.gr-dataframe tbody td {
background: var(--bg) !important;
color: var(--text) !important;
padding: 0.5rem 0.8rem !important;
border-right: 1px solid var(--border) !important;
}
/* Tabs */
.gr-tab-nav {
border-bottom: 1px solid var(--border) !important;
background: transparent !important;
}
.gr-tab-nav button {
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.72rem !important;
letter-spacing: 0.08em !important;
text-transform: uppercase !important;
color: var(--text-dim) !important;
background: transparent !important;
border: none !important;
padding: 0.6rem 1rem !important;
}
.gr-tab-nav button.selected {
color: var(--accent) !important;
border-bottom: 2px solid var(--accent) !important;
}
/* Checkbox group */
.gr-check-radio {
accent-color: var(--accent) !important;
}
/* Footer legend */
.legend {
display: flex;
gap: 1.5rem;
flex-wrap: wrap;
margin-top: 1.2rem;
padding-top: 1rem;
border-top: 1px solid var(--border);
font-size: 0.68rem;
color: var(--text-dim);
letter-spacing: 0.04em;
}
.legend-item b {
color: var(--text);
}
/* Scrollbar */
::-webkit-scrollbar { width: 4px; height: 4px; }
::-webkit-scrollbar-track { background: var(--bg); }
::-webkit-scrollbar-thumb { background: var(--muted); border-radius: 2px; }
"""
# Static banner rendered at the top of the page (styled by the .it-* rules).
# The glyphs below were previously mis-encoded ("β–£", "β€”", "Β·").
HEADER_HTML = """
<div class="it-header">
<div style="display:flex; align-items:baseline; gap:1rem; flex-wrap:wrap;">
<p class="it-title">▣ OrgForge <span>Insider Threat</span> Benchmark</p>
<span class="it-tag">Security Eval</span>
<span class="it-tag">Bedrock</span>
</div>
<p class="it-subtitle">Detection leaderboard — LLM reasoning over structured telemetry · No embedder required</p>
</div>
"""

# Static footer explaining each leaderboard column (styled by the .legend rules).
# The dashes, check marks, arrow and middle dots were previously mis-encoded.
LEGEND_HTML = """
<div class="legend">
<span class="legend-item"><b>Triage F1</b> — escalation quality (Tier 1)</span>
<span class="legend-item"><b>Verdict F1</b> — full case quality (Tier 2)</span>
<span class="legend-item"><b>Baseline FP ↓</b> — false positive rate on clean period</span>
<span class="legend-item"><b>Onset Sensitivity ↓</b> — fraction of pre-onset escalations (guessing, not detecting)</span>
<span class="legend-item"><b>Vishing ✓</b> — phone_call → idp_auth cross-actor correlation detected</span>
<span class="legend-item"><b>Host Trail ✓</b> — all 3 hoarding phases cited in evidence</span>
<span class="legend-item"><b>Tier 1</b> triage only · <b>Tier 2</b> full pipeline</span>
</div>
"""
def _is_detected(value) -> bool:
    """Return True when a ``vishing_detected`` cell reads as a positive flag."""
    # "1.0" appears when pandas parses a 0/1 column containing blanks as float.
    return str(value).strip().lower() in ("true", "1", "1.0", "yes")


def compute_summary_stats(df: pd.DataFrame) -> tuple:
    """Return (n_models, best_verdict_f1, best_model, vishing_rate) for the header cards.

    ``n_models`` is the row count. The other three are display strings, with
    "—" standing in for any value that cannot be computed: an empty frame, a
    missing column, or a ``verdict_f1`` column with no numeric value.
    """
    if df.empty:
        return 0, "—", "—", "—"

    n = len(df)

    best_f1, best_model = "—", "—"
    if "verdict_f1" in df.columns:
        # Coerce and work positionally: idxmax() on an all-NaN column yields
        # no usable label, and a non-default index would break label lookup.
        scores = pd.to_numeric(df["verdict_f1"], errors="coerce").reset_index(drop=True)
        if scores.notna().any():
            pos = int(scores.idxmax())
            best_f1 = f"{scores.iloc[pos]:.3f}"
            if "model" in df.columns and pd.notna(df["model"].iloc[pos]):
                # Keep the part after the last "." and cap it at 24 characters
                # so it fits the card.
                best_model = str(df["model"].iloc[pos]).split(".")[-1][:24]

    if "vishing_detected" in df.columns:
        vishing_rate = df["vishing_detected"].map(_is_detected).mean()
        vishing_str = f"{vishing_rate:.0%}"
    else:
        vishing_str = "—"

    return n, best_f1, best_model, vishing_str


def make_stats_html(df: pd.DataFrame) -> str:
    """Render the four summary cards shown under the header as an HTML string."""
    n, best_f1, best_model, vishing_rate = compute_summary_stats(df)
    # Highlight the vishing card only when there is a non-zero rate to show.
    vishing_class = "accent" if vishing_rate not in ("—", "0%") else ""
    # The model name comes from the CSV, so escape it before embedding in HTML.
    best_model = html.escape(str(best_model))
    return f"""
<div class="metric-strip">
<div class="metric-card">
<div class="metric-value">{n}</div>
<div class="metric-label">Models evaluated</div>
</div>
<div class="metric-card">
<div class="metric-value green">{best_f1}</div>
<div class="metric-label">Best verdict F1</div>
</div>
<div class="metric-card">
<div class="metric-value" style="font-size:1rem; padding-top:0.3rem">{best_model}</div>
<div class="metric-label">Leading model</div>
</div>
<div class="metric-card">
<div class="metric-value {vishing_class}">{vishing_rate}</div>
<div class="metric-label">Vishing detection rate</div>
</div>
</div>
"""
# ─── App ──────────────────────────────────────────────────────────────────────
# Snapshot used for the initial render; it also seeds the refresh cache below.
df_global = load_data()

# refresh() fires on every control change, including each keystroke in the
# search box, so reuse the last download instead of re-fetching every time.
_DATA_TTL_SECONDS = 300      # re-fetch a good snapshot at most this often
_EMPTY_RETRY_SECONDS = 15    # retry sooner while there is nothing to show
_data_cache = {"df": df_global, "loaded_at": time.monotonic()}


def _current_data() -> pd.DataFrame:
    """Return the cached leaderboard, re-fetching it once it has gone stale."""
    age = time.monotonic() - _data_cache["loaded_at"]
    max_age = _EMPTY_RETRY_SECONDS if _data_cache["df"].empty else _DATA_TTL_SECONDS
    if age >= max_age:
        fresh = load_data()
        _data_cache["loaded_at"] = time.monotonic()
        # load_data() returns an empty frame on failure; keep serving the
        # last good snapshot in that case.
        if not fresh.empty:
            _data_cache["df"] = fresh
    return _data_cache["df"]


with gr.Blocks(css=CSS, title="OrgForge Insider Threat Benchmark") as demo:
    gr.HTML(HEADER_HTML)
    stats_box = gr.HTML(make_stats_html(df_global))

    # Shared controls: they drive every tab at once.
    with gr.Row():
        search_bar = gr.Textbox(
            placeholder="claude, llama, nova …",
            label="Filter by model name",
            scale=2,
        )
        tier_filter = gr.Dropdown(
            choices=["All", "Tier 2 (Full Pipeline)", "Tier 1 (Triage Only)"],
            value="All",
            label="Tier",
            scale=1,
        )
        sort_by = gr.Dropdown(
            choices=[
                "Verdict F1",
                "Triage F1",
                "Baseline FP Rate ↑",
                "Onset Sensitivity ↑",
            ],
            value="Verdict F1",
            label="Sort by",
            scale=1,
        )

    with gr.Tabs():
        with gr.Tab("📊 Overview"):
            out_main = gr.Dataframe(
                value=build_display(df_global, "", "All", False, False, [], [], "Verdict F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )
        with gr.Tab("🔍 Triage Detail"):
            out_triage = gr.Dataframe(
                value=build_display(df_global, "", "All", True, False, [], [], "Triage F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )
        with gr.Tab("🎯 Verdict Detail"):
            out_verdict = gr.Dataframe(
                value=build_display(df_global, "", "All", False, True, [], [], "Verdict F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )
        with gr.Tab("🧩 By Behavior"):
            behavior_filter = gr.CheckboxGroup(
                choices=list(BEHAVIOR_COLS_MAP.keys()),
                value=list(BEHAVIOR_COLS_MAP.keys()),
                label="Behaviors to show",
            )
            out_behavior = gr.Dataframe(
                value=build_display(
                    df_global, "", "All", False, False,
                    list(BEHAVIOR_COLS_MAP.keys()), [], "Verdict F1"
                ),
                interactive=False,
                max_height=560,
                wrap=False,
            )
        with gr.Tab("🏷 By Threat Class"):
            class_filter = gr.CheckboxGroup(
                choices=list(CLASS_COLS_MAP.keys()),
                value=list(CLASS_COLS_MAP.keys()),
                label="Classes to show",
            )
            out_class = gr.Dataframe(
                value=build_display(
                    df_global, "", "All", False, False,
                    [], list(CLASS_COLS_MAP.keys()), "Verdict F1"
                ),
                interactive=False,
                max_height=560,
                wrap=False,
            )

    gr.HTML(LEGEND_HTML)

    # ── Reactivity ────────────────────────────────────────────────────────────
    def refresh(search, tier, sort, behaviors, classes):
        """Recompute the stats strip and all five tables from the current controls."""
        df = _current_data()
        return (
            make_stats_html(df),
            build_display(df, search, tier, False, False, [], [], sort),
            build_display(df, search, tier, True, False, [], [], sort),
            build_display(df, search, tier, False, True, [], [], sort),
            build_display(df, search, tier, False, False, behaviors, [], sort),
            build_display(df, search, tier, False, False, [], classes, sort),
        )

    controls = [search_bar, tier_filter, sort_by, behavior_filter, class_filter]
    outputs = [stats_box, out_main, out_triage, out_verdict, out_behavior, out_class]

    # Any control change re-renders every output so the tabs stay in sync.
    for ctrl in controls:
        ctrl.change(fn=refresh, inputs=controls, outputs=outputs)

demo.launch()