# Spaces: aeriesec/orgforge-it (Running)
# File size: 17,642 Bytes
# f17ff41 8581d75 f17ff41

import html
import logging
import time

import gradio as gr
import pandas as pd

DATA_URL = "https://huggingface.co/datasets/aeriesec/orgforge-insider-threat/resolve/main/leaderboard/insider_threat_leaderboard.csv"

# ─── Column definitions ───────────────────────────────────────────────────────

# Columns included in every table, in display order.
CORE_COLS = [
    "model", "tier",
    "triage_f1", "verdict_f1",
    "baseline_fp_rate", "onset_sensitivity",
    "vishing_detected", "host_trail_reconstructed",
]

# Extra columns for the triage / verdict detail views: "<stage>_<metric>".
TRIAGE_COLS = [
    f"triage_{metric}" for metric in ("precision", "recall", "f1", "tp", "fp", "fn")
]

VERDICT_COLS = [
    f"verdict_{metric}" for metric in ("precision", "recall", "f1", "tp", "fp", "fn")
]

# behavior name -> its ["tp_<behavior>", "fp_<behavior>"] CSV columns.
# Key order is the order the checkbox choices are listed in.
BEHAVIOR_COLS_MAP = {
    behavior: [f"tp_{behavior}", f"fp_{behavior}"]
    for behavior in (
        "secret_in_commit",
        "data_exfil_email",
        "host_data_hoarding",
        "social_engineering",
        "unusual_hours_access",
        "sentiment_drift",
        "excessive_repo_cloning",
        "cross_dept_snooping",
    )
}

# threat class -> its ["<class>_tp", "<class>_fp", "<class>_fn"] CSV columns.
CLASS_COLS_MAP = {
    threat_class: [f"{threat_class}_{count}" for count in ("tp", "fp", "fn")]
    for threat_class in ("negligent", "disgruntled", "malicious")
}

# CSV column name -> header text shown in the tables.
FRIENDLY_COLS = dict(
    model="Model",
    tier="Tier",
    triage_f1="Triage F1",
    verdict_f1="Verdict F1",
    baseline_fp_rate="Baseline FP Rate ↓",
    onset_sensitivity="Onset Sensitivity ↓",
    vishing_detected="Vishing",
    host_trail_reconstructed="Host Trail",
    triage_precision="Triage P",
    triage_recall="Triage R",
    triage_tp="T-TP",
    triage_fp="T-FP",
    triage_fn="T-FN",
    verdict_precision="Verdict P",
    verdict_recall="Verdict R",
    verdict_tp="V-TP",
    verdict_fp="V-FP",
    verdict_fn="V-FN",
)


# ─── Data loading ─────────────────────────────────────────────────────────────

def load_data() -> pd.DataFrame:
    """Download the leaderboard CSV from ``DATA_URL``.

    Returns:
        The parsed CSV as a DataFrame. If the download or parse fails for any
        reason, an empty DataFrame with the ``CORE_COLS`` columns is returned
        instead so the UI can still render.
    """
    try:
        return pd.read_csv(DATA_URL)
    except Exception:
        # Best-effort boundary: the app must keep running without data, but the
        # failure is logged (with traceback) instead of being silently dropped.
        logging.getLogger(__name__).exception(
            "Could not load leaderboard CSV from %s", DATA_URL
        )
        return pd.DataFrame(columns=CORE_COLS)


def build_display(
    df: pd.DataFrame,
    search: str,
    tier: str,
    show_triage: bool,
    show_verdict: bool,
    selected_behaviors: list,
    selected_classes: list,
    sort_by: str,
) -> pd.DataFrame:
    """Filter, select, sort and format leaderboard rows for one table.

    Args:
        df: Raw leaderboard frame (not modified).
        search: Text matched as a case-insensitive literal substring of the
            ``model`` column; empty means no filtering.
        tier: ``"All"``, ``"Tier 2 (Full Pipeline)"`` or any other label, which
            is treated as tier 1.
        show_triage: Append the ``TRIAGE_COLS`` columns.
        show_verdict: Append the ``VERDICT_COLS`` columns.
        selected_behaviors: Keys of ``BEHAVIOR_COLS_MAP`` whose columns to append.
        selected_classes: Keys of ``CLASS_COLS_MAP`` whose columns to append.
        sort_by: Sort label; unknown labels fall back to ``verdict_f1``.

    Returns:
        A new frame with display headers, ✓/✗/— flags, floats rounded to four
        decimals and a fresh 0..n-1 index. If ``df`` is empty, a one-row
        ``Status`` frame is returned instead.
    """
    if df.empty:
        # Data is fetched from DATA_URL, so point at the load rather than a local file.
        return pd.DataFrame(
            {"Status": ["No data — the leaderboard CSV could not be loaded or has no rows"]}
        )

    # Tier filter
    if tier != "All" and "tier" in df.columns:
        tier_val = "2" if tier == "Tier 2 (Full Pipeline)" else "1"
        df = df[df["tier"].astype(str) == tier_val]

    # Model search: literal substring, so user input such as "c++" or "(" cannot
    # raise a regex error inside the UI callback.
    if search and "model" in df.columns:
        names = df["model"]
        matches = names.notna() & names.astype(str).str.contains(
            search, case=False, regex=False
        )
        df = df[matches]

    # Build column list
    cols = CORE_COLS.copy()
    if show_triage:
        cols += [c for c in TRIAGE_COLS if c not in cols]
    if show_verdict:
        cols += [c for c in VERDICT_COLS if c not in cols]
    for b in selected_behaviors:
        cols += [c for c in BEHAVIOR_COLS_MAP.get(b, []) if c not in cols]
    for c in selected_classes:
        cols += [cl for cl in CLASS_COLS_MAP.get(c, []) if cl not in cols]

    # Keep only columns that actually exist in the CSV
    cols = [c for c in cols if c in df.columns]
    df = df[cols].copy()

    # Sort: the two rate options sort ascending, the F1 options descending.
    sort_col_map = {
        "Verdict F1":          "verdict_f1",
        "Triage F1":           "triage_f1",
        "Baseline FP Rate ↑":  "baseline_fp_rate",
        "Onset Sensitivity ↑": "onset_sensitivity",
    }
    sort_col = sort_col_map.get(sort_by, "verdict_f1")
    ascending = sort_by in ("Baseline FP Rate ↑", "Onset Sensitivity ↑")
    if sort_col in df.columns:
        df = df.sort_values(by=sort_col, ascending=ascending, na_position="last")

    # Rename columns for display
    df = df.rename(columns=FRIENDLY_COLS)

    # Format booleans: truthy spellings -> ✓, falsy spellings -> ✗, anything else -> —
    for col in ["Vishing", "Host Trail"]:
        if col in df.columns:
            df[col] = df[col].map(
                lambda v: "✓" if v is True or str(v).lower() in ("true", "1", "yes")
                else ("✗" if v is False or str(v).lower() in ("false", "0", "no") else "—")
            )

    # Round floats
    float_cols = df.select_dtypes(include="float").columns
    df[float_cols] = df[float_cols].round(4)

    return df.reset_index(drop=True)


# ─── UI ───────────────────────────────────────────────────────────────────────

# Custom stylesheet handed to gr.Blocks(css=...). It defines a dark palette as
# CSS variables, imports the IBM Plex fonts from Google Fonts, styles the
# header / metric-card / legend markup emitted by this file, and overrides the
# look of Gradio's own inputs, tables and tabs.
# NOTE(review): the .gr-* and span.svelte-* selectors rely on class names
# generated by Gradio — confirm they match the installed Gradio version.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&family=IBM+Plex+Sans:wght@300;400;500&display=swap');

:root {
    --bg:        #0a0c0f;
    --surface:   #111318;
    --border:    #1e2330;
    --accent:    #e63946;
    --accent2:   #ff6b6b;
    --muted:     #4a5568;
    --text:      #c9d1d9;
    --text-dim:  #6e7681;
    --green:     #39d353;
    --amber:     #f0a500;
}

body, .gradio-container {
    background: var(--bg) !important;
    font-family: 'IBM Plex Mono', monospace !important;
    color: var(--text) !important;
}

/* Header */
.it-header {
    border-bottom: 1px solid var(--border);
    padding: 2rem 0 1.5rem 0;
    margin-bottom: 1.5rem;
    position: relative;
}

.it-title {
    font-family: 'IBM Plex Mono', monospace;
    font-size: 1.6rem;
    font-weight: 600;
    letter-spacing: -0.02em;
    color: #fff;
    margin: 0;
}

.it-title span {
    color: var(--accent);
}

.it-subtitle {
    font-family: 'IBM Plex Sans', sans-serif;
    font-size: 0.8rem;
    color: var(--text-dim);
    margin: 0.4rem 0 0 0;
    letter-spacing: 0.08em;
    text-transform: uppercase;
}

.it-tag {
    display: inline-block;
    font-size: 0.65rem;
    font-weight: 600;
    letter-spacing: 0.12em;
    text-transform: uppercase;
    padding: 0.15rem 0.5rem;
    border: 1px solid var(--accent);
    color: var(--accent);
    border-radius: 2px;
    margin-right: 0.5rem;
}

/* Metric cards */
.metric-strip {
    display: grid;
    grid-template-columns: repeat(4, 1fr);
    gap: 1px;
    background: var(--border);
    border: 1px solid var(--border);
    margin-bottom: 1.5rem;
}

.metric-card {
    background: var(--surface);
    padding: 1rem 1.2rem;
    text-align: center;
}

.metric-value {
    font-family: 'IBM Plex Mono', monospace;
    font-size: 1.6rem;
    font-weight: 600;
    color: #fff;
    line-height: 1;
}

.metric-value.accent { color: var(--accent); }
.metric-value.green  { color: var(--green); }
.metric-value.amber  { color: var(--amber); }

.metric-label {
    font-size: 0.65rem;
    color: var(--text-dim);
    letter-spacing: 0.1em;
    text-transform: uppercase;
    margin-top: 0.3rem;
}

/* Controls */
.controls-bar {
    display: flex;
    gap: 1rem;
    margin-bottom: 1rem;
    align-items: flex-end;
    flex-wrap: wrap;
}

/* Override Gradio component backgrounds */
.gr-box, .gr-form, .gr-panel,
input, select, textarea,
.gr-input, .gr-dropdown {
    background: var(--surface) !important;
    border-color: var(--border) !important;
    color: var(--text) !important;
    font-family: 'IBM Plex Mono', monospace !important;
    font-size: 0.8rem !important;
}

label, .gr-label, span.svelte-1gfkn6j {
    color: var(--text-dim) !important;
    font-size: 0.7rem !important;
    letter-spacing: 0.08em !important;
    text-transform: uppercase !important;
    font-family: 'IBM Plex Mono', monospace !important;
}

/* Table */
.gr-dataframe table {
    font-family: 'IBM Plex Mono', monospace !important;
    font-size: 0.75rem !important;
    border-collapse: collapse !important;
}

.gr-dataframe thead th {
    background: var(--surface) !important;
    color: var(--text-dim) !important;
    font-size: 0.65rem !important;
    letter-spacing: 0.1em !important;
    text-transform: uppercase !important;
    border-bottom: 1px solid var(--accent) !important;
    padding: 0.6rem 0.8rem !important;
    white-space: nowrap !important;
}

.gr-dataframe tbody tr {
    border-bottom: 1px solid var(--border) !important;
    transition: background 0.1s;
}

.gr-dataframe tbody tr:first-child td {
    background: rgba(230, 57, 70, 0.06) !important;
}

.gr-dataframe tbody tr:hover td {
    background: rgba(255,255,255,0.02) !important;
}

.gr-dataframe tbody td {
    background: var(--bg) !important;
    color: var(--text) !important;
    padding: 0.5rem 0.8rem !important;
    border-right: 1px solid var(--border) !important;
}

/* Tabs */
.gr-tab-nav {
    border-bottom: 1px solid var(--border) !important;
    background: transparent !important;
}

.gr-tab-nav button {
    font-family: 'IBM Plex Mono', monospace !important;
    font-size: 0.72rem !important;
    letter-spacing: 0.08em !important;
    text-transform: uppercase !important;
    color: var(--text-dim) !important;
    background: transparent !important;
    border: none !important;
    padding: 0.6rem 1rem !important;
}

.gr-tab-nav button.selected {
    color: var(--accent) !important;
    border-bottom: 2px solid var(--accent) !important;
}

/* Checkbox group */
.gr-check-radio {
    accent-color: var(--accent) !important;
}

/* Footer legend */
.legend {
    display: flex;
    gap: 1.5rem;
    flex-wrap: wrap;
    margin-top: 1.2rem;
    padding-top: 1rem;
    border-top: 1px solid var(--border);
    font-size: 0.68rem;
    color: var(--text-dim);
    letter-spacing: 0.04em;
}

.legend-item b {
    color: var(--text);
}

/* Scrollbar */
::-webkit-scrollbar { width: 4px; height: 4px; }
::-webkit-scrollbar-track { background: var(--bg); }
::-webkit-scrollbar-thumb { background: var(--muted); border-radius: 2px; }
"""

# Static page header rendered through gr.HTML at the top of the app: the
# benchmark title, two tag badges and a one-line subtitle. Uses the .it-*
# classes from the stylesheet.
HEADER_HTML = """
<div class="it-header">
    <div style="display:flex; align-items:baseline; gap:1rem; flex-wrap:wrap;">
        <p class="it-title">▣ OrgForge <span>Insider Threat</span> Benchmark</p>
        <span class="it-tag">Security Eval</span>
        <span class="it-tag">Bedrock</span>
    </div>
    <p class="it-subtitle">Detection leaderboard — LLM reasoning over structured telemetry · No embedder required</p>
</div>
"""

# Static footer legend rendered through gr.HTML below the tabs: one short
# explanation per headline metric plus the meaning of the two tiers.
LEGEND_HTML = """
<div class="legend">
    <span class="legend-item"><b>Triage F1</b> — escalation quality (Tier 1)</span>
    <span class="legend-item"><b>Verdict F1</b> — full case quality (Tier 2)</span>
    <span class="legend-item"><b>Baseline FP ↓</b> — false positive rate on clean period</span>
    <span class="legend-item"><b>Onset Sensitivity ↓</b> — fraction of pre-onset escalations (guessing, not detecting)</span>
    <span class="legend-item"><b>Vishing ✓</b> — phone_call → idp_auth cross-actor correlation detected</span>
    <span class="legend-item"><b>Host Trail ✓</b> — all 3 hoarding phases cited in evidence</span>
    <span class="legend-item"><b>Tier 1</b> triage only · <b>Tier 2</b> full pipeline</span>
</div>
"""


def compute_summary_stats(df: pd.DataFrame) -> tuple:
    """Return (n_models, best_verdict_f1, best_model, vishing_rate) for the header cards.

    Args:
        df: Raw leaderboard frame.

    Returns:
        A 4-tuple of:
        * the number of rows in ``df`` (``0`` when empty),
        * the highest ``verdict_f1`` formatted to three decimals,
        * the ``model`` value of that row, reduced to the text after the last
          ``"."`` and truncated to 24 characters,
        * the share of rows whose ``vishing_detected`` reads as true, as a
          whole-number percentage.
        Any value that cannot be computed (empty frame, missing column, or no
        numeric ``verdict_f1`` at all) is the placeholder ``"—"``.
    """
    if df.empty:
        return 0, "—", "—", "—"
    n = len(df)

    best_f1, best_model = "—", "—"
    if "verdict_f1" in df.columns:
        # Coerce so text/blank cells become NaN instead of breaking the comparison.
        scores = pd.to_numeric(df["verdict_f1"], errors="coerce").reset_index(drop=True)
        # idxmax() fails on an all-NaN column, so only look for a best row when
        # at least one real score is present.
        if scores.notna().any():
            # Index was reset above, so this label is also the row position;
            # positional lookup stays correct even if df has duplicate labels.
            best_pos = scores.idxmax()
            best_row = df.iloc[best_pos]
            best_f1 = f"{scores.iloc[best_pos]:.3f}"
            best_model = str(best_row.get("model", "—")).split(".")[-1][:24]

    if "vishing_detected" in df.columns:
        vishing_rate = df["vishing_detected"].map(
            lambda v: str(v).lower() in ("true", "1", "yes")
        ).mean()
        vishing_str = f"{vishing_rate:.0%}"
    else:
        vishing_str = "—"
    return n, best_f1, best_model, vishing_str


def make_stats_html(df: pd.DataFrame) -> str:
    """Render the four summary cards shown above the tables.

    Args:
        df: Raw leaderboard frame, summarised via ``compute_summary_stats``.

    Returns:
        An HTML fragment (a ``metric-strip`` div with four ``metric-card``
        children) for the model count, best verdict F1, leading model and
        vishing detection rate.
    """
    n, best_f1, best_model, vishing_rate = compute_summary_stats(df)
    # The model name is free text from the CSV; escape it so characters such as
    # "<" or "&" are displayed literally instead of being parsed as markup.
    best_model = html.escape(str(best_model))
    # Highlight the vishing card only when there is a non-zero rate to show.
    vishing_class = "accent" if vishing_rate not in ("—", "0%") else ""
    return f"""
<div class="metric-strip">
    <div class="metric-card">
        <div class="metric-value">{n}</div>
        <div class="metric-label">Models evaluated</div>
    </div>
    <div class="metric-card">
        <div class="metric-value green">{best_f1}</div>
        <div class="metric-label">Best verdict F1</div>
    </div>
    <div class="metric-card">
        <div class="metric-value" style="font-size:1rem; padding-top:0.3rem">{best_model}</div>
        <div class="metric-label">Leading model</div>
    </div>
    <div class="metric-card">
        <div class="metric-value {vishing_class}">{vishing_rate}</div>
        <div class="metric-label">Vishing detection rate</div>
    </div>
</div>
"""


# ─── App ──────────────────────────────────────────────────────────────────────

# Initial snapshot used to populate every component at build time.
df_global = load_data()

# Minimum number of seconds between re-downloads of the leaderboard CSV.
DATA_TTL_SECONDS = 300

# Last non-empty frame seen, plus the monotonic time of the last load attempt.
_data_cache = {"df": df_global, "checked_at": time.monotonic()}


def _current_data() -> pd.DataFrame:
    """Return the leaderboard frame, re-downloading at most once per TTL.

    Every control change (including each edit of the search box) triggers a
    refresh, so downloading the CSV each time would mean one network request
    per keystroke. The cached frame is reused until ``DATA_TTL_SECONDS`` have
    passed. While the cache is still empty every call retries the download,
    and a failed reload never replaces data that was already loaded.
    """
    cached = _data_cache["df"]
    now = time.monotonic()
    if cached.empty or now - _data_cache["checked_at"] >= DATA_TTL_SECONDS:
        fresh = load_data()
        _data_cache["checked_at"] = now
        if not fresh.empty:
            _data_cache["df"] = cached = fresh
    return cached


with gr.Blocks(css=CSS, title="OrgForge Insider Threat Benchmark") as demo:

    gr.HTML(HEADER_HTML)

    stats_box = gr.HTML(make_stats_html(df_global))

    # Shared controls: they apply to every tab below.
    with gr.Row():
        search_bar = gr.Textbox(
            placeholder="claude, llama, nova …",
            label="Filter by model name",
            scale=2,
        )
        tier_filter = gr.Dropdown(
            choices=["All", "Tier 2 (Full Pipeline)", "Tier 1 (Triage Only)"],
            value="All",
            label="Tier",
            scale=1,
        )
        sort_by = gr.Dropdown(
            choices=[
                "Verdict F1",
                "Triage F1",
                "Baseline FP Rate ↑",
                "Onset Sensitivity ↑",
            ],
            value="Verdict F1",
            label="Sort by",
            scale=1,
        )

    # One read-only table per tab; each differs only in which extra columns
    # build_display is asked to append.
    with gr.Tabs():

        with gr.Tab("📊 Overview"):
            out_main = gr.Dataframe(
                value=build_display(df_global, "", "All", False, False, [], [], "Verdict F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🔍 Triage Detail"):
            out_triage = gr.Dataframe(
                value=build_display(df_global, "", "All", True, False, [], [], "Triage F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🎯 Verdict Detail"):
            out_verdict = gr.Dataframe(
                value=build_display(df_global, "", "All", False, True, [], [], "Verdict F1"),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🧩 By Behavior"):
            behavior_filter = gr.CheckboxGroup(
                choices=list(BEHAVIOR_COLS_MAP.keys()),
                value=list(BEHAVIOR_COLS_MAP.keys()),
                label="Behaviors to show",
            )
            out_behavior = gr.Dataframe(
                value=build_display(
                    df_global, "", "All", False, False,
                    list(BEHAVIOR_COLS_MAP.keys()), [], "Verdict F1"
                ),
                interactive=False,
                max_height=560,
                wrap=False,
            )

        with gr.Tab("🏷 By Threat Class"):
            class_filter = gr.CheckboxGroup(
                choices=list(CLASS_COLS_MAP.keys()),
                value=list(CLASS_COLS_MAP.keys()),
                label="Classes to show",
            )
            out_class = gr.Dataframe(
                value=build_display(
                    df_global, "", "All", False, False,
                    [], list(CLASS_COLS_MAP.keys()), "Verdict F1"
                ),
                interactive=False,
                max_height=560,
                wrap=False,
            )

    gr.HTML(LEGEND_HTML)

    # ── Reactivity ────────────────────────────────────────────────────────────

    def refresh(search, tier, sort, behaviors, classes):
        """Rebuild the stats cards and all five tables from the current controls.

        The returned tuple is ordered to match ``outputs`` below.
        """
        df = _current_data()
        return (
            make_stats_html(df),
            build_display(df, search, tier, False, False, [], [], sort),
            build_display(df, search, tier, True,  False, [], [], sort),
            build_display(df, search, tier, False, True,  [], [], sort),
            build_display(df, search, tier, False, False, behaviors, [], sort),
            build_display(df, search, tier, False, False, [], classes, sort),
        )

    controls = [search_bar, tier_filter, sort_by, behavior_filter, class_filter]
    outputs  = [stats_box, out_main, out_triage, out_verdict, out_behavior, out_class]

    # Any control change re-renders every output with the full control state.
    for ctrl in controls:
        ctrl.change(fn=refresh, inputs=controls, outputs=outputs)

demo.launch()