Spaces:
Running
Running
| import gradio as gr | |
| import pandas as pd | |
# Location of the published leaderboard CSV that load_data() downloads.
DATA_URL = (
    "https://huggingface.co/datasets/aeriesec/orgforge-insider-threat"
    "/resolve/main/leaderboard/insider_threat_leaderboard.csv"
)

# --- Column definitions ---

# Columns every view starts from, in display order.
CORE_COLS = [
    "model",
    "tier",
    "triage_f1",
    "verdict_f1",
    "baseline_fp_rate",
    "onset_sensitivity",
    "vishing_detected",
    "host_trail_reconstructed",
]

# The triage and verdict stages expose the same six metrics under their own prefix.
_STAGE_METRICS = ("precision", "recall", "f1", "tp", "fp", "fn")
TRIAGE_COLS = [f"triage_{metric}" for metric in _STAGE_METRICS]
VERDICT_COLS = [f"verdict_{metric}" for metric in _STAGE_METRICS]

# Behavior name -> its "tp_<name>" / "fp_<name>" CSV columns.
# Key order is significant: it is also the order of the behavior checkboxes.
_BEHAVIORS = (
    "secret_in_commit",
    "data_exfil_email",
    "host_data_hoarding",
    "social_engineering",
    "unusual_hours_access",
    "sentiment_drift",
    "excessive_repo_cloning",
    "cross_dept_snooping",
)
BEHAVIOR_COLS_MAP = {name: [f"tp_{name}", f"fp_{name}"] for name in _BEHAVIORS}

# Threat class -> its "<class>_tp" / "<class>_fp" / "<class>_fn" CSV columns.
_THREAT_CLASSES = ("negligent", "disgruntled", "malicious")
CLASS_COLS_MAP = {
    name: [f"{name}_{count}" for count in ("tp", "fp", "fn")]
    for name in _THREAT_CLASSES
}

# CSV column name -> header text shown in the tables.
FRIENDLY_COLS = {
    # core columns
    "model": "Model",
    "tier": "Tier",
    "triage_f1": "Triage F1",
    "verdict_f1": "Verdict F1",
    "baseline_fp_rate": "Baseline FP Rate β",
    "onset_sensitivity": "Onset Sensitivity β",
    "vishing_detected": "Vishing",
    "host_trail_reconstructed": "Host Trail",
    # triage detail
    "triage_precision": "Triage P",
    "triage_recall": "Triage R",
    "triage_tp": "T-TP",
    "triage_fp": "T-FP",
    "triage_fn": "T-FN",
    # verdict detail
    "verdict_precision": "Verdict P",
    "verdict_recall": "Verdict R",
    "verdict_tp": "V-TP",
    "verdict_fp": "V-FP",
    "verdict_fn": "V-FN",
}
# --- Data loading ---
def load_data() -> pd.DataFrame:
    """Download the leaderboard CSV from ``DATA_URL``.

    Returns:
        The parsed CSV. If the download or parse fails for any reason, an
        empty frame whose columns are ``CORE_COLS`` is returned instead so the
        UI can still render.
    """
    import logging  # local import keeps this function self-contained

    try:
        return pd.read_csv(DATA_URL)
    except Exception:
        # Deliberate best-effort boundary: the app must start even when the
        # dataset is unreachable. Record the cause instead of hiding it, so
        # an empty leaderboard can be diagnosed from the logs.
        logging.getLogger(__name__).exception(
            "Could not load leaderboard CSV from %s", DATA_URL
        )
        return pd.DataFrame(columns=CORE_COLS)
def build_display(
    df: pd.DataFrame,
    search: str,
    tier: str,
    show_triage: bool,
    show_verdict: bool,
    selected_behaviors: list,
    selected_classes: list,
    sort_by: str,
) -> pd.DataFrame:
    """Build the table shown in one leaderboard tab.

    Args:
        df: Raw leaderboard rows as loaded from the CSV.
        search: Text that must occur in the ``model`` column
            (case-insensitive, matched literally). Empty means no filter.
        tier: ``"All"``, ``"Tier 2 (Full Pipeline)"`` or any other label,
            which is treated as tier 1.
        show_triage: Append the ``TRIAGE_COLS`` columns.
        show_verdict: Append the ``VERDICT_COLS`` columns.
        selected_behaviors: Keys of ``BEHAVIOR_COLS_MAP`` whose columns to append.
        selected_classes: Keys of ``CLASS_COLS_MAP`` whose columns to append.
        sort_by: Display label of the sort column; unknown labels sort by
            ``verdict_f1``.

    Returns:
        A new frame with the selected columns that exist in ``df``, filtered,
        sorted, renamed through ``FRIENDLY_COLS``, with flag columns rendered
        as glyphs and floats rounded to 4 decimals. When ``df`` is empty, a
        one-cell frame carrying a status message is returned instead.
    """
    if df.empty:
        # NOTE(review): the message refers to a local CSV although load_data()
        # reads DATA_URL; text kept unchanged here.
        return pd.DataFrame({"Status": ["No data β place insider_threat_leaderboard.csv next to app.py"]})

    # Tier filter. Compare numerically so that 2, "2" and 2.0 (a float column
    # caused by missing values) are all recognised.
    if tier != "All" and "tier" in df.columns:
        wanted_tier = 2 if tier == "Tier 2 (Full Pipeline)" else 1
        df = df[pd.to_numeric(df["tier"], errors="coerce") == wanted_tier]

    # Model search. regex=False: the box holds user-typed text, and regex
    # metacharacters in it ("(", "+", ".") would otherwise raise or mis-match.
    if search and "model" in df.columns:
        df = df[df["model"].str.contains(search, case=False, na=False, regex=False)]

    # Build the column list; dict.fromkeys drops repeats, keeping first position.
    wanted = list(CORE_COLS)
    if show_triage:
        wanted += TRIAGE_COLS
    if show_verdict:
        wanted += VERDICT_COLS
    for behavior in selected_behaviors or []:
        wanted += BEHAVIOR_COLS_MAP.get(behavior, [])
    for threat_class in selected_classes or []:
        wanted += CLASS_COLS_MAP.get(threat_class, [])

    # Keep only columns that actually exist in the CSV
    cols = [c for c in dict.fromkeys(wanted) if c in df.columns]
    df = df[cols].copy()

    # Sort. The two rate metrics are sorted ascending, the F1 scores descending.
    sort_col_map = {
        "Verdict F1": "verdict_f1",
        "Triage F1": "triage_f1",
        "Baseline FP Rate β": "baseline_fp_rate",
        "Onset Sensitivity β": "onset_sensitivity",
    }
    sort_col = sort_col_map.get(sort_by, "verdict_f1")
    ascending = sort_by in ("Baseline FP Rate β", "Onset Sensitivity β")
    if sort_col in df.columns:
        df = df.sort_values(by=sort_col, ascending=ascending, na_position="last")

    # Rename columns for display
    df = df.rename(columns=FRIENDLY_COLS)

    # Format booleans
    for col in ("Vishing", "Host Trail"):
        if col in df.columns:
            df[col] = df[col].map(_format_flag)

    # Round floats
    float_cols = df.select_dtypes(include="float").columns
    if len(float_cols):
        df[float_cols] = df[float_cols].round(4)

    return df.reset_index(drop=True)


def _format_flag(value) -> str:
    """Render a boolean-like CSV cell as a check, a cross, or a dash if unknown."""
    token = str(value).lower()
    if value is True or token in ("true", "1", "yes"):
        return "✓"
    if value is False or token in ("false", "0", "no"):
        return "✗"
    return "—"
# --- UI ---
# Custom stylesheet handed to gr.Blocks(css=CSS).
# Sections, in order: web-font import, colour variables (:root), page
# background, header (.it-*), metric cards (.metric-*), controls bar,
# Gradio component overrides, table, tabs, checkbox group, footer legend,
# and scrollbar.
# NOTE(review): the .gr-* and span.svelte-* selectors target Gradio-internal
# class names -- confirm they still match the installed Gradio version.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&family=IBM+Plex+Sans:wght@300;400;500&display=swap');
:root {
--bg: #0a0c0f;
--surface: #111318;
--border: #1e2330;
--accent: #e63946;
--accent2: #ff6b6b;
--muted: #4a5568;
--text: #c9d1d9;
--text-dim: #6e7681;
--green: #39d353;
--amber: #f0a500;
}
body, .gradio-container {
background: var(--bg) !important;
font-family: 'IBM Plex Mono', monospace !important;
color: var(--text) !important;
}
/* Header */
.it-header {
border-bottom: 1px solid var(--border);
padding: 2rem 0 1.5rem 0;
margin-bottom: 1.5rem;
position: relative;
}
.it-title {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.6rem;
font-weight: 600;
letter-spacing: -0.02em;
color: #fff;
margin: 0;
}
.it-title span {
color: var(--accent);
}
.it-subtitle {
font-family: 'IBM Plex Sans', sans-serif;
font-size: 0.8rem;
color: var(--text-dim);
margin: 0.4rem 0 0 0;
letter-spacing: 0.08em;
text-transform: uppercase;
}
.it-tag {
display: inline-block;
font-size: 0.65rem;
font-weight: 600;
letter-spacing: 0.12em;
text-transform: uppercase;
padding: 0.15rem 0.5rem;
border: 1px solid var(--accent);
color: var(--accent);
border-radius: 2px;
margin-right: 0.5rem;
}
/* Metric cards */
.metric-strip {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 1px;
background: var(--border);
border: 1px solid var(--border);
margin-bottom: 1.5rem;
}
.metric-card {
background: var(--surface);
padding: 1rem 1.2rem;
text-align: center;
}
.metric-value {
font-family: 'IBM Plex Mono', monospace;
font-size: 1.6rem;
font-weight: 600;
color: #fff;
line-height: 1;
}
.metric-value.accent { color: var(--accent); }
.metric-value.green { color: var(--green); }
.metric-value.amber { color: var(--amber); }
.metric-label {
font-size: 0.65rem;
color: var(--text-dim);
letter-spacing: 0.1em;
text-transform: uppercase;
margin-top: 0.3rem;
}
/* Controls */
.controls-bar {
display: flex;
gap: 1rem;
margin-bottom: 1rem;
align-items: flex-end;
flex-wrap: wrap;
}
/* Override Gradio component backgrounds */
.gr-box, .gr-form, .gr-panel,
input, select, textarea,
.gr-input, .gr-dropdown {
background: var(--surface) !important;
border-color: var(--border) !important;
color: var(--text) !important;
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.8rem !important;
}
label, .gr-label, span.svelte-1gfkn6j {
color: var(--text-dim) !important;
font-size: 0.7rem !important;
letter-spacing: 0.08em !important;
text-transform: uppercase !important;
font-family: 'IBM Plex Mono', monospace !important;
}
/* Table */
.gr-dataframe table {
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.75rem !important;
border-collapse: collapse !important;
}
.gr-dataframe thead th {
background: var(--surface) !important;
color: var(--text-dim) !important;
font-size: 0.65rem !important;
letter-spacing: 0.1em !important;
text-transform: uppercase !important;
border-bottom: 1px solid var(--accent) !important;
padding: 0.6rem 0.8rem !important;
white-space: nowrap !important;
}
.gr-dataframe tbody tr {
border-bottom: 1px solid var(--border) !important;
transition: background 0.1s;
}
.gr-dataframe tbody tr:first-child td {
background: rgba(230, 57, 70, 0.06) !important;
}
.gr-dataframe tbody tr:hover td {
background: rgba(255,255,255,0.02) !important;
}
.gr-dataframe tbody td {
background: var(--bg) !important;
color: var(--text) !important;
padding: 0.5rem 0.8rem !important;
border-right: 1px solid var(--border) !important;
}
/* Tabs */
.gr-tab-nav {
border-bottom: 1px solid var(--border) !important;
background: transparent !important;
}
.gr-tab-nav button {
font-family: 'IBM Plex Mono', monospace !important;
font-size: 0.72rem !important;
letter-spacing: 0.08em !important;
text-transform: uppercase !important;
color: var(--text-dim) !important;
background: transparent !important;
border: none !important;
padding: 0.6rem 1rem !important;
}
.gr-tab-nav button.selected {
color: var(--accent) !important;
border-bottom: 2px solid var(--accent) !important;
}
/* Checkbox group */
.gr-check-radio {
accent-color: var(--accent) !important;
}
/* Footer legend */
.legend {
display: flex;
gap: 1.5rem;
flex-wrap: wrap;
margin-top: 1.2rem;
padding-top: 1rem;
border-top: 1px solid var(--border);
font-size: 0.68rem;
color: var(--text-dim);
letter-spacing: 0.04em;
}
.legend-item b {
color: var(--text);
}
/* Scrollbar */
::-webkit-scrollbar { width: 4px; height: 4px; }
::-webkit-scrollbar-track { background: var(--bg); }
::-webkit-scrollbar-thumb { background: var(--muted); border-radius: 2px; }
"""
# Static page header (title, two tags, subtitle) rendered with
# gr.HTML(HEADER_HTML); styled by the .it-header / .it-title / .it-tag /
# .it-subtitle rules in CSS.
# NOTE(review): some glyphs in this markup look mis-encoded (e.g. the symbol
# before "OrgForge" and the separators in the subtitle) -- confirm the file
# encoding before editing the text.
HEADER_HTML = """
<div class="it-header">
<div style="display:flex; align-items:baseline; gap:1rem; flex-wrap:wrap;">
<p class="it-title">β£ OrgForge <span>Insider Threat</span> Benchmark</p>
<span class="it-tag">Security Eval</span>
<span class="it-tag">Bedrock</span>
</div>
<p class="it-subtitle">Detection leaderboard β LLM reasoning over structured telemetry Β· No embedder required</p>
</div>
"""
# Footer legend explaining each leaderboard metric and the two tiers;
# rendered with gr.HTML(LEGEND_HTML) and styled by the .legend /
# .legend-item rules in CSS.
# NOTE(review): the arrow/check/dash glyphs in this text look mis-encoded
# (they all appear as the same character) -- confirm the file encoding.
LEGEND_HTML = """
<div class="legend">
<span class="legend-item"><b>Triage F1</b> β escalation quality (Tier 1)</span>
<span class="legend-item"><b>Verdict F1</b> β full case quality (Tier 2)</span>
<span class="legend-item"><b>Baseline FP β</b> β false positive rate on clean period</span>
<span class="legend-item"><b>Onset Sensitivity β</b> β fraction of pre-onset escalations (guessing, not detecting)</span>
<span class="legend-item"><b>Vishing β</b> β phone_call β idp_auth cross-actor correlation detected</span>
<span class="legend-item"><b>Host Trail β</b> β all 3 hoarding phases cited in evidence</span>
<span class="legend-item"><b>Tier 1</b> triage only Β· <b>Tier 2</b> full pipeline</span>
</div>
"""
def compute_summary_stats(df: pd.DataFrame) -> tuple:
    """Return (n_models, best_verdict_f1, best_model, vishing_rate) for the header cards.

    Args:
        df: Raw leaderboard rows.

    Returns:
        A 4-tuple:
        * number of rows in ``df`` (``0`` when it is empty);
        * highest ``verdict_f1`` formatted with three decimals;
        * ``model`` of that row, reduced to the text after its last ``"."``
          and cut to 24 characters;
        * share of rows whose ``vishing_detected`` reads as true, as a whole
          percentage.
        Any value that cannot be computed (missing column, no numeric score)
        is the placeholder string ``"β"``.
    """
    if df.empty:
        return 0, "β", "β", "β"

    n = len(df)

    best_f1, best_model = "β", "β"
    if "verdict_f1" in df.columns:
        # Coerce to numbers and work by position: idxmax() on an all-NaN
        # column fails, and a label lookup is ambiguous with duplicate indexes.
        scores = pd.to_numeric(df["verdict_f1"], errors="coerce").reset_index(drop=True)
        if scores.notna().any():
            best_pos = int(scores.idxmax())
            best_f1 = f"{scores.iloc[best_pos]:.3f}"
            if "model" in df.columns:
                # Keep only the part after the last "." (drops dotted prefixes).
                best_model = str(df["model"].iloc[best_pos]).split(".")[-1][:24]

    if "vishing_detected" in df.columns:
        detected = df["vishing_detected"].map(
            lambda v: str(v).lower() in ("true", "1", "yes")
        )
        vishing_str = f"{detected.mean():.0%}"
    else:
        vishing_str = "β"

    return n, best_f1, best_model, vishing_str
def make_stats_html(df: pd.DataFrame) -> str:
    """Render the four summary cards shown above the tables as an HTML string.

    Args:
        df: Raw leaderboard rows; summarised with ``compute_summary_stats``.

    Returns:
        A ``<div class="metric-strip">`` fragment with the model count, the
        best verdict F1, the leading model and the vishing detection rate.
    """
    from html import escape  # local import keeps this function self-contained

    n, best_f1, best_model, vishing_rate = compute_summary_stats(df)

    # The model name comes straight from the CSV; escape it so characters such
    # as "<" or "&" cannot break (or inject into) the markup.
    model_label = escape(str(best_model))
    # Highlight the vishing card only when there is a non-zero rate to show.
    vishing_class = "accent" if vishing_rate not in ("β", "0%") else ""

    return f"""
<div class="metric-strip">
<div class="metric-card">
<div class="metric-value">{n}</div>
<div class="metric-label">Models evaluated</div>
</div>
<div class="metric-card">
<div class="metric-value green">{best_f1}</div>
<div class="metric-label">Best verdict F1</div>
</div>
<div class="metric-card">
<div class="metric-value" style="font-size:1rem; padding-top:0.3rem">{model_label}</div>
<div class="metric-label">Leading model</div>
</div>
<div class="metric-card">
<div class="metric-value {vishing_class}">{vishing_rate}</div>
<div class="metric-label">Vishing detection rate</div>
</div>
</div>
"""
# --- App ---
# Loaded once at start-up to populate the initial view of every tab.
df_global = load_data()


def _readonly_table(frame):
    """Create a non-editable results table with the layout shared by all tabs."""
    return gr.Dataframe(value=frame, interactive=False, max_height=560, wrap=False)


with gr.Blocks(css=CSS, title="OrgForge Insider Threat Benchmark") as demo:
    gr.HTML(HEADER_HTML)
    stats_box = gr.HTML(make_stats_html(df_global))

    # Global controls: apply to every tab.
    with gr.Row():
        search_bar = gr.Textbox(
            label="Filter by model name",
            placeholder="claude, llama, nova β¦",
            scale=2,
        )
        tier_filter = gr.Dropdown(
            label="Tier",
            choices=["All", "Tier 2 (Full Pipeline)", "Tier 1 (Triage Only)"],
            value="All",
            scale=1,
        )
        sort_by = gr.Dropdown(
            label="Sort by",
            choices=[
                "Verdict F1",
                "Triage F1",
                "Baseline FP Rate β",
                "Onset Sensitivity β",
            ],
            value="Verdict F1",
            scale=1,
        )

    with gr.Tabs():
        with gr.Tab("π Overview"):
            out_main = _readonly_table(
                build_display(df_global, "", "All", False, False, [], [], "Verdict F1")
            )

        with gr.Tab("π Triage Detail"):
            out_triage = _readonly_table(
                build_display(df_global, "", "All", True, False, [], [], "Triage F1")
            )

        with gr.Tab("π― Verdict Detail"):
            out_verdict = _readonly_table(
                build_display(df_global, "", "All", False, True, [], [], "Verdict F1")
            )

        with gr.Tab("π§© By Behavior"):
            behavior_filter = gr.CheckboxGroup(
                label="Behaviors to show",
                choices=list(BEHAVIOR_COLS_MAP),
                value=list(BEHAVIOR_COLS_MAP),
            )
            out_behavior = _readonly_table(
                build_display(
                    df_global, "", "All", False, False,
                    list(BEHAVIOR_COLS_MAP), [], "Verdict F1",
                )
            )

        with gr.Tab("π· By Threat Class"):
            class_filter = gr.CheckboxGroup(
                label="Classes to show",
                choices=list(CLASS_COLS_MAP),
                value=list(CLASS_COLS_MAP),
            )
            out_class = _readonly_table(
                build_display(
                    df_global, "", "All", False, False,
                    [], list(CLASS_COLS_MAP), "Verdict F1",
                )
            )

    gr.HTML(LEGEND_HTML)

    # --- Reactivity ---
    def refresh(search, tier, sort, behaviors, classes):
        """Reload the CSV and rebuild the stats strip plus all five tables.

        Returns one value per entry of ``outputs``, in the same order.
        """
        # NOTE(review): this re-downloads the CSV on every control change.
        latest = load_data()
        # (show_triage, show_verdict, behaviors, classes) for each tab, in
        # the order overview, triage, verdict, by-behavior, by-class.
        tab_options = [
            (False, False, [], []),
            (True, False, [], []),
            (False, True, [], []),
            (False, False, behaviors, []),
            (False, False, [], classes),
        ]
        stats_html = make_stats_html(latest)
        tables = [
            build_display(latest, search, tier, triage, verdict, behs, clss, sort)
            for triage, verdict, behs, clss in tab_options
        ]
        return (stats_html, *tables)

    controls = [search_bar, tier_filter, sort_by, behavior_filter, class_filter]
    outputs = [stats_box, out_main, out_triage, out_verdict, out_behavior, out_class]

    # Any control change refreshes everything, using the state of all controls.
    for control in controls:
        control.change(fn=refresh, inputs=controls, outputs=outputs)

demo.launch()