Spaces:

aeriesec
/

orgforge-it

Running

App Files Files Community

orgforge-it / app.py

jflynt

Upload folder using huggingface_hub

8581d75 verified 4 days ago

raw

history blame contribute delete

17.6 kB

	import html
	import logging

	import gradio as gr
	import pandas as pd

	# Location of the published leaderboard CSV (read with pandas at load time).
	DATA_URL = (
	    "https://huggingface.co/datasets/aeriesec/orgforge-insider-threat"
	    "/resolve/main/leaderboard/insider_threat_leaderboard.csv"
	)

	# ─── Column definitions ───────────────────────────────────────────────────────

	# Columns that every tab shows, in display order.
	CORE_COLS = [
	    "model", "tier",
	    "triage_f1", "verdict_f1",
	    "baseline_fp_rate", "onset_sensitivity",
	    "vishing_detected", "host_trail_reconstructed",
	]

	# The triage and verdict detail groups share the same six metric suffixes.
	_METRIC_SUFFIXES = ("precision", "recall", "f1", "tp", "fp", "fn")

	# Extra columns for the triage detail view.
	TRIAGE_COLS = [f"triage_{suffix}" for suffix in _METRIC_SUFFIXES]

	# Extra columns for the verdict detail view.
	VERDICT_COLS = [f"verdict_{suffix}" for suffix in _METRIC_SUFFIXES]

	# Behavior name -> its [true-positive, false-positive] count columns.
	BEHAVIOR_COLS_MAP = {
	    behavior: [f"tp_{behavior}", f"fp_{behavior}"]
	    for behavior in (
	        "secret_in_commit",
	        "data_exfil_email",
	        "host_data_hoarding",
	        "social_engineering",
	        "unusual_hours_access",
	        "sentiment_drift",
	        "excessive_repo_cloning",
	        "cross_dept_snooping",
	    )
	}

	# Threat class name -> its [tp, fp, fn] count columns.
	CLASS_COLS_MAP = {
	    threat_class: [f"{threat_class}_{outcome}" for outcome in ("tp", "fp", "fn")]
	    for threat_class in ("negligent", "disgruntled", "malicious")
	}

	# CSV column name -> header text shown in the tables. Columns without an
	# entry here keep their raw CSV name when the frame is renamed for display.
	FRIENDLY_COLS = dict(
	    # Core columns
	    model="Model",
	    tier="Tier",
	    triage_f1="Triage F1",
	    verdict_f1="Verdict F1",
	    baseline_fp_rate="Baseline FP Rate ↓",
	    onset_sensitivity="Onset Sensitivity ↓",
	    vishing_detected="Vishing",
	    host_trail_reconstructed="Host Trail",
	    # Triage detail
	    triage_precision="Triage P",
	    triage_recall="Triage R",
	    triage_tp="T-TP",
	    triage_fp="T-FP",
	    triage_fn="T-FN",
	    # Verdict detail
	    verdict_precision="Verdict P",
	    verdict_recall="Verdict R",
	    verdict_tp="V-TP",
	    verdict_fp="V-FP",
	    verdict_fn="V-FN",
	)


	# ─── Data loading ─────────────────────────────────────────────────────────────

	def load_data() -> pd.DataFrame:
	    """Fetch the leaderboard CSV from ``DATA_URL``.

	    Returns:
	        The parsed CSV as a DataFrame. If reading fails for any reason, the
	        failure is logged and an empty frame with the ``CORE_COLS`` columns is
	        returned instead, so callers can render a "no data" state.
	    """
	    try:
	        return pd.read_csv(DATA_URL)
	    except Exception:
	        # Deliberately best-effort: the UI must keep working without data.
	        # Log the traceback so a broken URL or malformed CSV is diagnosable
	        # instead of silently showing an empty leaderboard.
	        logging.getLogger(__name__).exception(
	            "Failed to load leaderboard CSV from %s", DATA_URL
	        )
	        return pd.DataFrame(columns=CORE_COLS)


	def build_display(
	    df: pd.DataFrame,
	    search: str,
	    tier: str,
	    show_triage: bool,
	    show_verdict: bool,
	    selected_behaviors: list,
	    selected_classes: list,
	    sort_by: str,
	) -> pd.DataFrame:
	    """Build the filtered, sorted and display-formatted leaderboard table.

	    Args:
	        df: Raw leaderboard frame. It is not modified.
	        search: Text matched against the ``model`` column as a
	            case-insensitive literal substring. Empty or None disables the
	            filter.
	        tier: ``"All"`` keeps every row; ``"Tier 2 (Full Pipeline)"`` keeps
	            tier 2; any other value keeps tier 1.
	        show_triage: Append the ``TRIAGE_COLS`` columns.
	        show_verdict: Append the ``VERDICT_COLS`` columns.
	        selected_behaviors: Keys of ``BEHAVIOR_COLS_MAP`` whose columns are
	            appended. Unknown keys are ignored; None is treated as empty.
	        selected_classes: Keys of ``CLASS_COLS_MAP`` whose columns are
	            appended. Unknown keys are ignored; None is treated as empty.
	        sort_by: Sort label. The two ``↑`` labels sort ascending; anything
	            else sorts descending, and unknown labels fall back to
	            ``verdict_f1``.

	    Returns:
	        A new frame with friendly column names, flag columns rendered as
	        ✓ / ✗ / —, floats rounded to 4 decimals and a fresh 0..n-1 index.
	        If ``df`` is empty, a one-cell frame carrying a status message.
	    """
	    if df.empty:
	        # The data is fetched from a URL, so the message must not tell the
	        # user to place a local file.
	        return pd.DataFrame({"Status": ["No data — leaderboard CSV could not be loaded or is empty"]})

	    # Tier filter. Compare numerically so int (2), float (2.0, which pandas
	    # produces when the column has gaps) and string ("2") encodings all match.
	    if tier != "All" and "tier" in df.columns:
	        wanted_tier = 2 if tier == "Tier 2 (Full Pipeline)" else 1
	        df = df[pd.to_numeric(df["tier"], errors="coerce") == wanted_tier]

	    # Model search. regex=False treats the user's text literally, so input
	    # such as "(" or "c++" cannot raise a regex compilation error.
	    needle = (search or "").strip()
	    if needle and "model" in df.columns:
	        names = df["model"]
	        hits = names.astype(str).str.contains(needle, case=False, na=False, regex=False)
	        # Rows with a missing model never match (astype(str) may yield "nan").
	        df = df[hits & names.notna()]

	    # Build column list
	    wanted = list(CORE_COLS)
	    if show_triage:
	        wanted += TRIAGE_COLS
	    if show_verdict:
	        wanted += VERDICT_COLS
	    for behavior in selected_behaviors or []:
	        wanted += BEHAVIOR_COLS_MAP.get(behavior, [])
	    for threat_class in selected_classes or []:
	        wanted += CLASS_COLS_MAP.get(threat_class, [])

	    # De-duplicate preserving first-seen order, and keep only columns that
	    # actually exist in the CSV.
	    cols = [c for c in dict.fromkeys(wanted) if c in df.columns]
	    df = df[cols].copy()

	    # Sort
	    sort_col_map = {
	        "Verdict F1": "verdict_f1",
	        "Triage F1": "triage_f1",
	        "Baseline FP Rate ↑": "baseline_fp_rate",
	        "Onset Sensitivity ↑": "onset_sensitivity",
	    }
	    sort_col = sort_col_map.get(sort_by, "verdict_f1")
	    ascending = sort_by in ("Baseline FP Rate ↑", "Onset Sensitivity ↑")
	    if sort_col in df.columns:
	        # A stable sort keeps tied rows in their CSV order between refreshes.
	        df = df.sort_values(
	            by=sort_col, ascending=ascending, na_position="last", kind="stable"
	        )

	    # Rename columns for display
	    df = df.rename(columns=FRIENDLY_COLS)

	    # Format booleans (the columns carry their friendly names by now)
	    for col in ("Vishing", "Host Trail"):
	        if col in df.columns:
	            df[col] = df[col].map(_flag_symbol)

	    # Round floats
	    float_cols = df.select_dtypes(include="float").columns
	    df[float_cols] = df[float_cols].round(4)

	    return df.reset_index(drop=True)


	def _flag_symbol(value) -> str:
	    """Render a boolean-like cell as ✓ / ✗, or — when it is neither."""
	    text = str(value).lower()
	    if value is True or text in ("true", "1", "yes"):
	        return "✓"
	    if value is False or text in ("false", "0", "no"):
	        return "✗"
	    return "—"


	# ─── UI ───────────────────────────────────────────────────────────────────────

	# Custom stylesheet passed to gr.Blocks(css=...). It defines the dark colour
	# palette as CSS variables, styles the custom HTML fragments (header, metric
	# strip, legend) and overrides the look of Gradio inputs, tables and tabs.
	# NOTE(review): the `.gr-*` selectors and `span.svelte-1gfkn6j` target
	# Gradio-generated markup — confirm they still match the installed Gradio
	# version. The @import pulls the IBM Plex fonts from Google Fonts.
	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&family=IBM+Plex+Sans:wght@300;400;500&display=swap');

	:root {
	--bg: #0a0c0f;
	--surface: #111318;
	--border: #1e2330;
	--accent: #e63946;
	--accent2: #ff6b6b;
	--muted: #4a5568;
	--text: #c9d1d9;
	--text-dim: #6e7681;
	--green: #39d353;
	--amber: #f0a500;
	}

	body, .gradio-container {
	background: var(--bg) !important;
	font-family: 'IBM Plex Mono', monospace !important;
	color: var(--text) !important;
	}

	/* Header */
	.it-header {
	border-bottom: 1px solid var(--border);
	padding: 2rem 0 1.5rem 0;
	margin-bottom: 1.5rem;
	position: relative;
	}

	.it-title {
	font-family: 'IBM Plex Mono', monospace;
	font-size: 1.6rem;
	font-weight: 600;
	letter-spacing: -0.02em;
	color: #fff;
	margin: 0;
	}

	.it-title span {
	color: var(--accent);
	}

	.it-subtitle {
	font-family: 'IBM Plex Sans', sans-serif;
	font-size: 0.8rem;
	color: var(--text-dim);
	margin: 0.4rem 0 0 0;
	letter-spacing: 0.08em;
	text-transform: uppercase;
	}

	.it-tag {
	display: inline-block;
	font-size: 0.65rem;
	font-weight: 600;
	letter-spacing: 0.12em;
	text-transform: uppercase;
	padding: 0.15rem 0.5rem;
	border: 1px solid var(--accent);
	color: var(--accent);
	border-radius: 2px;
	margin-right: 0.5rem;
	}

	/* Metric cards */
	.metric-strip {
	display: grid;
	grid-template-columns: repeat(4, 1fr);
	gap: 1px;
	background: var(--border);
	border: 1px solid var(--border);
	margin-bottom: 1.5rem;
	}

	.metric-card {
	background: var(--surface);
	padding: 1rem 1.2rem;
	text-align: center;
	}

	.metric-value {
	font-family: 'IBM Plex Mono', monospace;
	font-size: 1.6rem;
	font-weight: 600;
	color: #fff;
	line-height: 1;
	}

	.metric-value.accent { color: var(--accent); }
	.metric-value.green { color: var(--green); }
	.metric-value.amber { color: var(--amber); }

	.metric-label {
	font-size: 0.65rem;
	color: var(--text-dim);
	letter-spacing: 0.1em;
	text-transform: uppercase;
	margin-top: 0.3rem;
	}

	/* Controls */
	.controls-bar {
	display: flex;
	gap: 1rem;
	margin-bottom: 1rem;
	align-items: flex-end;
	flex-wrap: wrap;
	}

	/* Override Gradio component backgrounds */
	.gr-box, .gr-form, .gr-panel,
	input, select, textarea,
	.gr-input, .gr-dropdown {
	background: var(--surface) !important;
	border-color: var(--border) !important;
	color: var(--text) !important;
	font-family: 'IBM Plex Mono', monospace !important;
	font-size: 0.8rem !important;
	}

	label, .gr-label, span.svelte-1gfkn6j {
	color: var(--text-dim) !important;
	font-size: 0.7rem !important;
	letter-spacing: 0.08em !important;
	text-transform: uppercase !important;
	font-family: 'IBM Plex Mono', monospace !important;
	}

	/* Table */
	.gr-dataframe table {
	font-family: 'IBM Plex Mono', monospace !important;
	font-size: 0.75rem !important;
	border-collapse: collapse !important;
	}

	.gr-dataframe thead th {
	background: var(--surface) !important;
	color: var(--text-dim) !important;
	font-size: 0.65rem !important;
	letter-spacing: 0.1em !important;
	text-transform: uppercase !important;
	border-bottom: 1px solid var(--accent) !important;
	padding: 0.6rem 0.8rem !important;
	white-space: nowrap !important;
	}

	.gr-dataframe tbody tr {
	border-bottom: 1px solid var(--border) !important;
	transition: background 0.1s;
	}

	.gr-dataframe tbody tr:first-child td {
	background: rgba(230, 57, 70, 0.06) !important;
	}

	.gr-dataframe tbody tr:hover td {
	background: rgba(255,255,255,0.02) !important;
	}

	.gr-dataframe tbody td {
	background: var(--bg) !important;
	color: var(--text) !important;
	padding: 0.5rem 0.8rem !important;
	border-right: 1px solid var(--border) !important;
	}

	/* Tabs */
	.gr-tab-nav {
	border-bottom: 1px solid var(--border) !important;
	background: transparent !important;
	}

	.gr-tab-nav button {
	font-family: 'IBM Plex Mono', monospace !important;
	font-size: 0.72rem !important;
	letter-spacing: 0.08em !important;
	text-transform: uppercase !important;
	color: var(--text-dim) !important;
	background: transparent !important;
	border: none !important;
	padding: 0.6rem 1rem !important;
	}

	.gr-tab-nav button.selected {
	color: var(--accent) !important;
	border-bottom: 2px solid var(--accent) !important;
	}

	/* Checkbox group */
	.gr-check-radio {
	accent-color: var(--accent) !important;
	}

	/* Footer legend */
	.legend {
	display: flex;
	gap: 1.5rem;
	flex-wrap: wrap;
	margin-top: 1.2rem;
	padding-top: 1rem;
	border-top: 1px solid var(--border);
	font-size: 0.68rem;
	color: var(--text-dim);
	letter-spacing: 0.04em;
	}

	.legend-item b {
	color: var(--text);
	}

	/* Scrollbar */
	::-webkit-scrollbar { width: 4px; height: 4px; }
	::-webkit-scrollbar-track { background: var(--bg); }
	::-webkit-scrollbar-thumb { background: var(--muted); border-radius: 2px; }
	"""

	# Static page header rendered through gr.HTML: the benchmark title, two tag
	# badges and a subtitle. The it-* classes are styled by the CSS constant.
	HEADER_HTML = """
	<div class="it-header">
	<div style="display:flex; align-items:baseline; gap:1rem; flex-wrap:wrap;">
	<p class="it-title">▣ OrgForge <span>Insider Threat</span> Benchmark</p>
	<span class="it-tag">Security Eval</span>
	<span class="it-tag">Bedrock</span>
	</div>
	<p class="it-subtitle">Detection leaderboard — LLM reasoning over structured telemetry · No embedder required</p>
	</div>
	"""

	# Static footer legend rendered through gr.HTML below the tabs: one short
	# explanation per headline metric and for the two tiers. The legend classes
	# are styled by the CSS constant.
	LEGEND_HTML = """
	<div class="legend">
	<span class="legend-item"><b>Triage F1</b> — escalation quality (Tier 1)</span>
	<span class="legend-item"><b>Verdict F1</b> — full case quality (Tier 2)</span>
	<span class="legend-item"><b>Baseline FP ↓</b> — false positive rate on clean period</span>
	<span class="legend-item"><b>Onset Sensitivity ↓</b> — fraction of pre-onset escalations (guessing, not detecting)</span>
	<span class="legend-item"><b>Vishing ✓</b> — phone_call → idp_auth cross-actor correlation detected</span>
	<span class="legend-item"><b>Host Trail ✓</b> — all 3 hoarding phases cited in evidence</span>
	<span class="legend-item"><b>Tier 1</b> triage only · <b>Tier 2</b> full pipeline</span>
	</div>
	"""


	def compute_summary_stats(df: pd.DataFrame) -> tuple:
	    """Return ``(n_models, best_verdict_f1, best_model, vishing_rate)`` for the header cards.

	    Args:
	        df: Raw leaderboard frame; it is not modified.

	    Returns:
	        A 4-tuple of:
	        * the number of rows (int);
	        * the highest ``verdict_f1`` formatted to 3 decimals, or "—" when the
	          column is missing or holds no numeric value;
	        * the model of that row, reduced to the text after its last "." and
	          cut to 24 characters, or "—";
	        * the share of rows whose ``vishing_detected`` reads as
	          true / 1 / yes, formatted as a whole percentage, or "—" when the
	          column is missing.
	        An empty frame yields ``(0, "—", "—", "—")``.
	    """
	    if df.empty:
	        return 0, "—", "—", "—"

	    n = len(df)

	    best_f1, best_model = "—", "—"
	    if "verdict_f1" in df.columns:
	        # Work positionally on a coerced copy: idxmax() on an all-NaN column
	        # (e.g. a leaderboard with triage-only rows) does not yield a usable
	        # row label, and label lookups break on non-unique indexes.
	        scores = pd.to_numeric(df["verdict_f1"], errors="coerce").reset_index(drop=True)
	        if scores.notna().any():
	            best_pos = int(scores.idxmax())
	            best_row = df.iloc[best_pos]
	            best_f1 = f"{scores.iloc[best_pos]:.3f}"
	            # Keep only the segment after the last "." and cap the length so
	            # the name fits its metric card.
	            best_model = str(best_row.get("model", "—")).split(".")[-1][:24]

	    if "vishing_detected" in df.columns:
	        detected = df["vishing_detected"].map(
	            lambda v: str(v).lower() in ("true", "1", "yes")
	        )
	        vishing_str = f"{detected.mean():.0%}"
	    else:
	        vishing_str = "—"

	    return n, best_f1, best_model, vishing_str


	def make_stats_html(df: pd.DataFrame) -> str:
	    """Render the four summary metric cards as an HTML fragment.

	    The values come from ``compute_summary_stats(df)``: number of models,
	    best verdict F1, leading model and vishing detection rate. The vishing
	    card gets the ``accent`` class unless its value is "—" or "0%".

	    Args:
	        df: Raw leaderboard frame.

	    Returns:
	        The ``metric-strip`` markup as a string.
	    """
	    n, best_f1, best_model, vishing_rate = compute_summary_stats(df)
	    # The model name comes from the downloaded CSV, so escape it before
	    # interpolating: a name containing <, > or & must not alter the markup.
	    safe_model = html.escape(str(best_model))
	    vishing_class = "" if vishing_rate in ("—", "0%") else "accent"
	    return f"""
	<div class="metric-strip">
	<div class="metric-card">
	<div class="metric-value">{n}</div>
	<div class="metric-label">Models evaluated</div>
	</div>
	<div class="metric-card">
	<div class="metric-value green">{best_f1}</div>
	<div class="metric-label">Best verdict F1</div>
	</div>
	<div class="metric-card">
	<div class="metric-value" style="font-size:1rem; padding-top:0.3rem">{safe_model}</div>
	<div class="metric-label">Leading model</div>
	</div>
	<div class="metric-card">
	<div class="metric-value {vishing_class}">{vishing_rate}</div>
	<div class="metric-label">Vishing detection rate</div>
	</div>
	</div>
	"""


	# ─── App ──────────────────────────────────────────────────────────────────────

	# Snapshot loaded once at import time; it only feeds the first render.
	# `refresh` below loads the data again on every control change.
	df_global = load_data()


	def _leaderboard_table(frame):
	    """Create a read-only Dataframe component with the shared table options."""
	    return gr.Dataframe(value=frame, interactive=False, max_height=560, wrap=False)


	with gr.Blocks(css=CSS, title="OrgForge Insider Threat Benchmark") as demo:
	    gr.HTML(HEADER_HTML)
	    stats_box = gr.HTML(make_stats_html(df_global))

	    # Global controls shared by all tabs.
	    with gr.Row():
	        search_bar = gr.Textbox(
	            placeholder="claude, llama, nova …",
	            label="Filter by model name",
	            scale=2,
	        )
	        tier_filter = gr.Dropdown(
	            choices=["All", "Tier 2 (Full Pipeline)", "Tier 1 (Triage Only)"],
	            value="All",
	            label="Tier",
	            scale=1,
	        )
	        sort_by = gr.Dropdown(
	            choices=["Verdict F1", "Triage F1", "Baseline FP Rate ↑", "Onset Sensitivity ↑"],
	            value="Verdict F1",
	            label="Sort by",
	            scale=1,
	        )

	    # One tab per view; each table starts from the import-time snapshot.
	    with gr.Tabs():
	        with gr.Tab("📊 Overview"):
	            out_main = _leaderboard_table(
	                build_display(df_global, "", "All", False, False, [], [], "Verdict F1")
	            )

	        with gr.Tab("🔍 Triage Detail"):
	            out_triage = _leaderboard_table(
	                build_display(df_global, "", "All", True, False, [], [], "Triage F1")
	            )

	        with gr.Tab("🎯 Verdict Detail"):
	            out_verdict = _leaderboard_table(
	                build_display(df_global, "", "All", False, True, [], [], "Verdict F1")
	            )

	        with gr.Tab("🧩 By Behavior"):
	            behavior_names = list(BEHAVIOR_COLS_MAP)
	            behavior_filter = gr.CheckboxGroup(
	                choices=behavior_names,
	                value=list(behavior_names),
	                label="Behaviors to show",
	            )
	            out_behavior = _leaderboard_table(
	                build_display(
	                    df_global, "", "All", False, False,
	                    list(behavior_names), [], "Verdict F1",
	                )
	            )

	        with gr.Tab("🏷 By Threat Class"):
	            class_names = list(CLASS_COLS_MAP)
	            class_filter = gr.CheckboxGroup(
	                choices=class_names,
	                value=list(class_names),
	                label="Classes to show",
	            )
	            out_class = _leaderboard_table(
	                build_display(
	                    df_global, "", "All", False, False,
	                    [], list(class_names), "Verdict F1",
	                )
	            )

	    gr.HTML(LEGEND_HTML)

	    # ── Reactivity ────────────────────────────────────────────────────────────

	    def refresh(search, tier, sort, behaviors, classes):
	        """Reload the data and rebuild the stats strip plus all five tables.

	        The returned tuple follows the order of ``outputs``: stats HTML, then
	        the overview, triage, verdict, behavior and threat-class tables.
	        """
	        fresh = load_data()
	        stats_html = make_stats_html(fresh)
	        # (show_triage, show_verdict, behaviors, classes) for each tab, in
	        # output order.
	        view_specs = (
	            (False, False, [], []),
	            (True, False, [], []),
	            (False, True, [], []),
	            (False, False, behaviors, []),
	            (False, False, [], classes),
	        )
	        tables = [
	            build_display(fresh, search, tier, with_triage, with_verdict, chosen_behaviors, chosen_classes, sort)
	            for with_triage, with_verdict, chosen_behaviors, chosen_classes in view_specs
	        ]
	        return (stats_html, *tables)

	    controls = [search_bar, tier_filter, sort_by, behavior_filter, class_filter]
	    outputs = [stats_box, out_main, out_triage, out_verdict, out_behavior, out_class]

	    # Any control change re-renders every output.
	    for widget in controls:
	        widget.change(fn=refresh, inputs=controls, outputs=outputs)

	# Start the web server only when the file is executed as a script, so that
	# importing the module (for example to reuse or test its helpers) does not
	# launch a blocking server as a side effect.
	if __name__ == "__main__":
	    demo.launch()