Spaces:

aeriesec
/

orgforge-it

Sleeping

App Files Files Community

jflynt committed on Mar 21

Commit

f17ff41

verified ·

1 Parent(s): 4e87a21

Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

README.md +5 -8
app.py +556 -0
insider_threat_leaderboard.csv +8 -0
insider_threat_leaderboard.json +358 -0
requirements.txt +2 -0

README.md CHANGED Viewed

@@ -1,14 +1,11 @@
 ---
-title: Orgforge It
-emoji: 🏢
-colorFrom: pink
-colorTo: red
 sdk: gradio
 sdk_version: 6.9.0
 app_file: app.py
-pinned: false
 license: mit
-short_description: LLM detection leaderboard for OrgForge insider threat sim
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: OrgForge Insider Threat Benchmark
+emoji: 🛡
+colorFrom: red
+colorTo: gray
 sdk: gradio
 sdk_version: 6.9.0
 app_file: app.py
+pinned: true
 license: mit
 ---

app.py ADDED Viewed

	@@ -0,0 +1,556 @@

+import gradio as gr
+import pandas as pd
+# Path of the leaderboard CSV that load_data() reads.
+DATA_URL = "./insider_threat_leaderboard.csv"
+# ─── Column definitions ───────────────────────────────────────────────────────
+# Columns every table starts from, in display order (see build_display).
+CORE_COLS = [
+    "model",
+    "tier",
+    "triage_f1",
+    "verdict_f1",
+    "baseline_fp_rate",
+    "onset_sensitivity",
+    "vishing_detected",
+    "host_trail_reconstructed",
+]
+# Extra columns appended when build_display is called with show_triage=True.
+TRIAGE_COLS = [
+    "triage_precision",
+    "triage_recall",
+    "triage_f1",
+    "triage_tp",
+    "triage_fp",
+    "triage_fn",
+]
+# Extra columns appended when build_display is called with show_verdict=True.
+VERDICT_COLS = [
+    "verdict_precision",
+    "verdict_recall",
+    "verdict_f1",
+    "verdict_tp",
+    "verdict_fp",
+    "verdict_fn",
+]
+# Behavior name -> its [tp, fp] CSV columns. The keys are also the choices of
+# the "Behaviors to show" checkbox group.
+BEHAVIOR_COLS_MAP = {
+    "secret_in_commit":      ["tp_secret_in_commit",      "fp_secret_in_commit"],
+    "data_exfil_email":      ["tp_data_exfil_email",      "fp_data_exfil_email"],
+    "host_data_hoarding":    ["tp_host_data_hoarding",    "fp_host_data_hoarding"],
+    "social_engineering":    ["tp_social_engineering",    "fp_social_engineering"],
+    "unusual_hours_access":  ["tp_unusual_hours_access",  "fp_unusual_hours_access"],
+    "sentiment_drift":       ["tp_sentiment_drift",       "fp_sentiment_drift"],
+    "excessive_repo_cloning":["tp_excessive_repo_cloning","fp_excessive_repo_cloning"],
+    "cross_dept_snooping":   ["tp_cross_dept_snooping",   "fp_cross_dept_snooping"],
+    # The leaderboard CSV also carries tp_/fp_idp_anomaly; without this entry
+    # that behavior could never be selected in the UI.
+    "idp_anomaly":           ["tp_idp_anomaly",           "fp_idp_anomaly"],
+}
+# Threat class -> its [tp, fp, fn] CSV columns. The keys are also the choices
+# of the "Classes to show" checkbox group.
+CLASS_COLS_MAP = {
+    "negligent":   ["negligent_tp",   "negligent_fp",   "negligent_fn"],
+    "disgruntled": ["disgruntled_tp", "disgruntled_fp", "disgruntled_fn"],
+    "malicious":   ["malicious_tp",   "malicious_fp",   "malicious_fn"],
+}
+# CSV column name -> table header shown to the user. Columns without an entry
+# (behavior and class counters) keep their raw CSV name.
+FRIENDLY_COLS = {
+    "model":                   "Model",
+    "tier":                    "Tier",
+    "triage_f1":               "Triage F1",
+    "verdict_f1":              "Verdict F1",
+    "baseline_fp_rate":        "Baseline FP Rate ↓",
+    "onset_sensitivity":       "Onset Sensitivity ↓",
+    "vishing_detected":        "Vishing",
+    "host_trail_reconstructed":"Host Trail",
+    "triage_precision":        "Triage P",
+    "triage_recall":           "Triage R",
+    "triage_tp":               "T-TP",
+    "triage_fp":               "T-FP",
+    "triage_fn":               "T-FN",
+    "verdict_precision":       "Verdict P",
+    "verdict_recall":          "Verdict R",
+    "verdict_tp":              "V-TP",
+    "verdict_fp":              "V-FP",
+    "verdict_fn":              "V-FN",
+}
+# ─── Data loading ─────────────────────────────────────────────────────────────
+def load_data() -> pd.DataFrame:
+    """Read the leaderboard CSV at ``DATA_URL`` into a DataFrame.
+
+    Returns:
+        The parsed CSV, or an empty frame whose columns are ``CORE_COLS`` when
+        the file cannot be read, so the UI shows a placeholder instead of
+        crashing.
+    """
+    import logging  # local import: the module header does not import logging
+
+    try:
+        return pd.read_csv(DATA_URL)
+    except Exception:
+        # Deliberate best-effort fallback, but record why loading failed
+        # instead of swallowing the error silently.
+        logging.getLogger(__name__).warning(
+            "Could not read %s; showing an empty leaderboard", DATA_URL, exc_info=True
+        )
+        # Return an empty frame with expected columns so the UI doesn't crash
+        return pd.DataFrame(columns=CORE_COLS)
+def build_display(
+    df: pd.DataFrame,
+    search: str,
+    tier: str,
+    show_triage: bool,
+    show_verdict: bool,
+    selected_behaviors: list,
+    selected_classes: list,
+    sort_by: str,
+) -> pd.DataFrame:
+    """Filter, project, sort and format the leaderboard for one UI table.
+
+    Args:
+        df: Raw leaderboard frame as returned by ``load_data``.
+        search: Case-insensitive literal substring to match against ``model``;
+            empty means no filtering.
+        tier: ``"All"``, ``"Tier 2 (Full Pipeline)"`` or any other label, which
+            selects tier 1.
+        show_triage: Append ``TRIAGE_COLS`` to the core columns.
+        show_verdict: Append ``VERDICT_COLS`` to the core columns.
+        selected_behaviors: Keys of ``BEHAVIOR_COLS_MAP`` whose columns to add.
+        selected_classes: Keys of ``CLASS_COLS_MAP`` whose columns to add.
+        sort_by: Sort label from the UI; unknown labels fall back to
+            ``verdict_f1`` descending.
+
+    Returns:
+        A new frame with friendly headers, ✓/✗/— flags, floats rounded to four
+        decimals and a fresh 0..n-1 index, or a one-cell "Status" frame when
+        ``df`` is empty.
+    """
+    if df.empty:
+        return pd.DataFrame({"Status": ["No data — place insider_threat_leaderboard.csv next to app.py"]})
+    # Tier filter
+    if tier != "All" and "tier" in df.columns:
+        tier_val = 2 if tier == "Tier 2 (Full Pipeline)" else 1
+        # Compare numerically: a single missing tier makes pandas read the
+        # column as float, and "2.0" would never equal the string "2".
+        df = df[pd.to_numeric(df["tier"], errors="coerce") == tier_val]
+    # Model search
+    if search and "model" in df.columns:
+        # regex=False: the text comes straight from a textbox, so characters
+        # such as "(" or "[" must match literally instead of raising re.error.
+        df = df[df["model"].str.contains(search, case=False, na=False, regex=False)]
+    # Build column list
+    cols = CORE_COLS.copy()
+    if show_triage:
+        cols += [c for c in TRIAGE_COLS if c not in cols]
+    if show_verdict:
+        cols += [c for c in VERDICT_COLS if c not in cols]
+    for behavior in selected_behaviors or []:
+        cols += [c for c in BEHAVIOR_COLS_MAP.get(behavior, []) if c not in cols]
+    for threat_class in selected_classes or []:
+        cols += [c for c in CLASS_COLS_MAP.get(threat_class, []) if c not in cols]
+    # Keep only columns that actually exist in the CSV
+    cols = [c for c in cols if c in df.columns]
+    df = df[cols].copy()
+    # Sort
+    sort_col_map = {
+        "Verdict F1":          "verdict_f1",
+        "Triage F1":           "triage_f1",
+        "Baseline FP Rate ↑":  "baseline_fp_rate",
+        "Onset Sensitivity ↑": "onset_sensitivity",
+    }
+    sort_col = sort_col_map.get(sort_by, "verdict_f1")
+    # The two rate metrics are sorted ascending; the F1 scores descending.
+    ascending = sort_by in ("Baseline FP Rate ↑", "Onset Sensitivity ↑")
+    if sort_col in df.columns:
+        df = df.sort_values(by=sort_col, ascending=ascending, na_position="last")
+    # Rename columns for display
+    df = df.rename(columns=FRIENDLY_COLS)
+    # Format booleans
+    for col in ["Vishing", "Host Trail"]:
+        if col in df.columns:
+            df[col] = df[col].map(_flag_symbol)
+    # Round floats
+    float_cols = df.select_dtypes(include="float").columns
+    df[float_cols] = df[float_cols].round(4)
+    return df.reset_index(drop=True)
+
+
+def _flag_symbol(value) -> str:
+    """Render a boolean-like cell as "✓", "✗" or "—" (unknown or missing)."""
+    text = str(value).lower()
+    # "1.0"/"0.0" cover 0/1 flags that pandas read as floats because of gaps.
+    if value is True or text in ("true", "1", "1.0", "yes"):
+        return "✓"
+    if value is False or text in ("false", "0", "0.0", "no"):
+        return "✗"
+    return "—"
+# ─── UI ───────────────────────────────────────────────────────────────────────
+# Custom stylesheet handed to Gradio through its `css` argument: defines the
+# colour variables, loads the IBM Plex fonts, and styles the header, metric
+# cards, controls, tables, tabs, legend and scrollbars.
+CSS = """
+@import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Mono:wght@400;500;600&family=IBM+Plex+Sans:wght@300;400;500&display=swap');
+:root {
+    --bg:        #0a0c0f;
+    --surface:   #111318;
+    --border:    #1e2330;
+    --accent:    #e63946;
+    --accent2:   #ff6b6b;
+    --muted:     #4a5568;
+    --text:      #c9d1d9;
+    --text-dim:  #6e7681;
+    --green:     #39d353;
+    --amber:     #f0a500;
+}
+body, .gradio-container {
+    background: var(--bg) !important;
+    font-family: 'IBM Plex Mono', monospace !important;
+    color: var(--text) !important;
+}
+/* Header */
+.it-header {
+    border-bottom: 1px solid var(--border);
+    padding: 2rem 0 1.5rem 0;
+    margin-bottom: 1.5rem;
+    position: relative;
+}
+.it-title {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 1.6rem;
+    font-weight: 600;
+    letter-spacing: -0.02em;
+    color: #fff;
+    margin: 0;
+}
+.it-title span {
+    color: var(--accent);
+}
+.it-subtitle {
+    font-family: 'IBM Plex Sans', sans-serif;
+    font-size: 0.8rem;
+    color: var(--text-dim);
+    margin: 0.4rem 0 0 0;
+    letter-spacing: 0.08em;
+    text-transform: uppercase;
+}
+.it-tag {
+    display: inline-block;
+    font-size: 0.65rem;
+    font-weight: 600;
+    letter-spacing: 0.12em;
+    text-transform: uppercase;
+    padding: 0.15rem 0.5rem;
+    border: 1px solid var(--accent);
+    color: var(--accent);
+    border-radius: 2px;
+    margin-right: 0.5rem;
+}
+/* Metric cards */
+.metric-strip {
+    display: grid;
+    grid-template-columns: repeat(4, 1fr);
+    gap: 1px;
+    background: var(--border);
+    border: 1px solid var(--border);
+    margin-bottom: 1.5rem;
+}
+.metric-card {
+    background: var(--surface);
+    padding: 1rem 1.2rem;
+    text-align: center;
+}
+.metric-value {
+    font-family: 'IBM Plex Mono', monospace;
+    font-size: 1.6rem;
+    font-weight: 600;
+    color: #fff;
+    line-height: 1;
+}
+.metric-value.accent { color: var(--accent); }
+.metric-value.green  { color: var(--green); }
+.metric-value.amber  { color: var(--amber); }
+.metric-label {
+    font-size: 0.65rem;
+    color: var(--text-dim);
+    letter-spacing: 0.1em;
+    text-transform: uppercase;
+    margin-top: 0.3rem;
+}
+/* Controls */
+.controls-bar {
+    display: flex;
+    gap: 1rem;
+    margin-bottom: 1rem;
+    align-items: flex-end;
+    flex-wrap: wrap;
+}
+/* Override Gradio component backgrounds */
+.gr-box, .gr-form, .gr-panel,
+input, select, textarea,
+.gr-input, .gr-dropdown {
+    background: var(--surface) !important;
+    border-color: var(--border) !important;
+    color: var(--text) !important;
+    font-family: 'IBM Plex Mono', monospace !important;
+    font-size: 0.8rem !important;
+}
+label, .gr-label, span.svelte-1gfkn6j {
+    color: var(--text-dim) !important;
+    font-size: 0.7rem !important;
+    letter-spacing: 0.08em !important;
+    text-transform: uppercase !important;
+    font-family: 'IBM Plex Mono', monospace !important;
+}
+/* Table */
+.gr-dataframe table {
+    font-family: 'IBM Plex Mono', monospace !important;
+    font-size: 0.75rem !important;
+    border-collapse: collapse !important;
+}
+.gr-dataframe thead th {
+    background: var(--surface) !important;
+    color: var(--text-dim) !important;
+    font-size: 0.65rem !important;
+    letter-spacing: 0.1em !important;
+    text-transform: uppercase !important;
+    border-bottom: 1px solid var(--accent) !important;
+    padding: 0.6rem 0.8rem !important;
+    white-space: nowrap !important;
+}
+.gr-dataframe tbody tr {
+    border-bottom: 1px solid var(--border) !important;
+    transition: background 0.1s;
+}
+.gr-dataframe tbody tr:first-child td {
+    background: rgba(230, 57, 70, 0.06) !important;
+}
+.gr-dataframe tbody tr:hover td {
+    background: rgba(255,255,255,0.02) !important;
+}
+.gr-dataframe tbody td {
+    background: var(--bg) !important;
+    color: var(--text) !important;
+    padding: 0.5rem 0.8rem !important;
+    border-right: 1px solid var(--border) !important;
+}
+/* Tabs */
+.gr-tab-nav {
+    border-bottom: 1px solid var(--border) !important;
+    background: transparent !important;
+}
+.gr-tab-nav button {
+    font-family: 'IBM Plex Mono', monospace !important;
+    font-size: 0.72rem !important;
+    letter-spacing: 0.08em !important;
+    text-transform: uppercase !important;
+    color: var(--text-dim) !important;
+    background: transparent !important;
+    border: none !important;
+    padding: 0.6rem 1rem !important;
+}
+.gr-tab-nav button.selected {
+    color: var(--accent) !important;
+    border-bottom: 2px solid var(--accent) !important;
+}
+/* Checkbox group */
+.gr-check-radio {
+    accent-color: var(--accent) !important;
+}
+/* Footer legend */
+.legend {
+    display: flex;
+    gap: 1.5rem;
+    flex-wrap: wrap;
+    margin-top: 1.2rem;
+    padding-top: 1rem;
+    border-top: 1px solid var(--border);
+    font-size: 0.68rem;
+    color: var(--text-dim);
+    letter-spacing: 0.04em;
+}
+.legend-item b {
+    color: var(--text);
+}
+/* Scrollbar */
+::-webkit-scrollbar { width: 4px; height: 4px; }
+::-webkit-scrollbar-track { background: var(--bg); }
+::-webkit-scrollbar-thumb { background: var(--muted); border-radius: 2px; }
+"""
+# Static page banner (title, two tags, subtitle); rendered with gr.HTML and
+# styled by the .it-* rules in CSS.
+HEADER_HTML = """
+<div class="it-header">
+    <div style="display:flex; align-items:baseline; gap:1rem; flex-wrap:wrap;">
+        <p class="it-title">▣ OrgForge <span>Insider Threat</span> Benchmark</p>
+        <span class="it-tag">Security Eval</span>
+        <span class="it-tag">Bedrock</span>
+    </div>
+    <p class="it-subtitle">Detection leaderboard — LLM reasoning over structured telemetry · No embedder required</p>
+</div>
+"""
+# Static footer explaining each leaderboard metric; rendered with gr.HTML
+# below the tabs and styled by the .legend rules in CSS.
+LEGEND_HTML = """
+<div class="legend">
+    <span class="legend-item"><b>Triage F1</b> — escalation quality (Tier 1)</span>
+    <span class="legend-item"><b>Verdict F1</b> — full case quality (Tier 2)</span>
+    <span class="legend-item"><b>Baseline FP ↓</b> — false positive rate on clean period</span>
+    <span class="legend-item"><b>Onset Sensitivity ↓</b> — fraction of pre-onset escalations (guessing, not detecting)</span>
+    <span class="legend-item"><b>Vishing ✓</b> — phone_call → idp_auth cross-actor correlation detected</span>
+    <span class="legend-item"><b>Host Trail ✓</b> — all 3 hoarding phases cited in evidence</span>
+    <span class="legend-item"><b>Tier 1</b> triage only · <b>Tier 2</b> full pipeline</span>
+</div>
+"""
+def compute_summary_stats(df: pd.DataFrame) -> tuple:
+    """Return (n_models, best_verdict_f1, best_model, vishing_rate) for the header cards.
+
+    Args:
+        df: Raw leaderboard frame, one row per run.
+
+    Returns:
+        A 4-tuple: the number of distinct models (row count when there is no
+        ``model`` column), the highest ``verdict_f1`` formatted to three
+        decimals, the shortened name of the model that achieved it, and the
+        share of runs with a truthy ``vishing_detected`` as a whole percent.
+        Values that cannot be computed are returned as "—"; an empty frame
+        yields ``(0, "—", "—", "—")``.
+    """
+    if df.empty:
+        return 0, "—", "—", "—"
+    # Several runs of one model may be present, so count distinct models
+    # rather than rows for the "Models evaluated" card.
+    n = int(df["model"].nunique()) if "model" in df.columns else len(df)
+    best_f1, best_model = "—", "—"
+    if "verdict_f1" in df.columns:
+        scores = pd.to_numeric(df["verdict_f1"], errors="coerce").reset_index(drop=True)
+        # idxmax cannot pick a winner from an all-NaN column (e.g. runs that
+        # never produced a verdict); keep the placeholders in that case.
+        if scores.notna().any():
+            # Positional lookup stays correct even if df has a non-unique index.
+            best_pos = int(scores.idxmax())
+            best_f1 = f"{scores.iloc[best_pos]:.3f}"
+            best_model = _short_model_name(df.iloc[best_pos].get("model", "—"))
+    if "vishing_detected" in df.columns:
+        vishing_rate = df["vishing_detected"].map(
+            lambda v: str(v).lower() in ("true", "1", "yes")
+        ).mean()
+        vishing_str = f"{vishing_rate:.0%}"
+    else:
+        vishing_str = "—"
+    return n, best_f1, best_model, vishing_str
+
+
+def _short_model_name(model_id) -> str:
+    """Drop leading all-letter dot segments from a model id and cap it at 24 chars.
+
+    This removes prefixes such as ``us.anthropic.`` while keeping dotted
+    version numbers intact (``deepseek.v3.2`` becomes ``v3.2``, not ``2``).
+    """
+    parts = str(model_id).split(".")
+    while len(parts) > 1 and parts[0].isalpha():
+        parts.pop(0)
+    return ".".join(parts)[:24]
+def make_stats_html(df: pd.DataFrame) -> str:
+    """Render the four summary cards above the tables as an HTML string.
+
+    The figures come from ``compute_summary_stats(df)``. The vishing card gets
+    the ``accent`` CSS class unless its value is "—" or "0%".
+    """
+    model_count, top_f1, top_model, vishing_pct = compute_summary_stats(df)
+    vishing_css = "" if vishing_pct in ("—", "0%") else "accent"
+    return f"""
+<div class="metric-strip">
+    <div class="metric-card">
+        <div class="metric-value">{model_count}</div>
+        <div class="metric-label">Models evaluated</div>
+    </div>
+    <div class="metric-card">
+        <div class="metric-value green">{top_f1}</div>
+        <div class="metric-label">Best verdict F1</div>
+    </div>
+    <div class="metric-card">
+        <div class="metric-value" style="font-size:1rem; padding-top:0.3rem">{top_model}</div>
+        <div class="metric-label">Leading model</div>
+    </div>
+    <div class="metric-card">
+        <div class="metric-value {vishing_css}">{vishing_pct}</div>
+        <div class="metric-label">Vishing detection rate</div>
+    </div>
+</div>
+"""
+# ─── App ──────────────────────────────────────────────────────────────────────
+# Snapshot of the leaderboard used for the initial render; refresh() re-reads
+# the CSV on every control change.
+df_global = load_data()
+# NOTE: the stylesheet is passed to launch() at the bottom, not to gr.Blocks().
+# The Space pins Gradio 6.x, where app-level options such as `css` belong to
+# launch().
+with gr.Blocks(title="OrgForge Insider Threat Benchmark") as demo:
+    gr.HTML(HEADER_HTML)
+    stats_box = gr.HTML(make_stats_html(df_global))
+    # Controls shared by every tab.
+    with gr.Row():
+        search_bar = gr.Textbox(
+            placeholder="claude, llama, nova …",
+            label="Filter by model name",
+            scale=2,
+        )
+        tier_filter = gr.Dropdown(
+            choices=["All", "Tier 2 (Full Pipeline)", "Tier 1 (Triage Only)"],
+            value="All",
+            label="Tier",
+            scale=1,
+        )
+        sort_by = gr.Dropdown(
+            choices=[
+                "Verdict F1",
+                "Triage F1",
+                "Baseline FP Rate ↑",
+                "Onset Sensitivity ↑",
+            ],
+            value="Verdict F1",
+            label="Sort by",
+            scale=1,
+        )
+    # One read-only table per tab; each is a different column projection of
+    # the same data produced by build_display().
+    with gr.Tabs():
+        with gr.Tab("📊 Overview"):
+            out_main = gr.Dataframe(
+                value=build_display(df_global, "", "All", False, False, [], [], "Verdict F1"),
+                interactive=False,
+                max_height=560,
+                wrap=False,
+            )
+        with gr.Tab("🔍 Triage Detail"):
+            out_triage = gr.Dataframe(
+                value=build_display(df_global, "", "All", True, False, [], [], "Triage F1"),
+                interactive=False,
+                max_height=560,
+                wrap=False,
+            )
+        with gr.Tab("🎯 Verdict Detail"):
+            out_verdict = gr.Dataframe(
+                value=build_display(df_global, "", "All", False, True, [], [], "Verdict F1"),
+                interactive=False,
+                max_height=560,
+                wrap=False,
+            )
+        with gr.Tab("🧩 By Behavior"):
+            behavior_filter = gr.CheckboxGroup(
+                choices=list(BEHAVIOR_COLS_MAP.keys()),
+                value=list(BEHAVIOR_COLS_MAP.keys()),
+                label="Behaviors to show",
+            )
+            out_behavior = gr.Dataframe(
+                value=build_display(
+                    df_global, "", "All", False, False,
+                    list(BEHAVIOR_COLS_MAP.keys()), [], "Verdict F1"
+                ),
+                interactive=False,
+                max_height=560,
+                wrap=False,
+            )
+        with gr.Tab("🏷 By Threat Class"):
+            class_filter = gr.CheckboxGroup(
+                choices=list(CLASS_COLS_MAP.keys()),
+                value=list(CLASS_COLS_MAP.keys()),
+                label="Classes to show",
+            )
+            out_class = gr.Dataframe(
+                value=build_display(
+                    df_global, "", "All", False, False,
+                    [], list(CLASS_COLS_MAP.keys()), "Verdict F1"
+                ),
+                interactive=False,
+                max_height=560,
+                wrap=False,
+            )
+    gr.HTML(LEGEND_HTML)
+    # ── Reactivity ────────────────────────────────────────────────────────────
+    def refresh(search, tier, sort, behaviors, classes):
+        """Re-read the CSV and rebuild the stats strip plus all five tables.
+
+        The returned tuple is ordered to match ``outputs`` below.
+        """
+        df = load_data()
+        return (
+            make_stats_html(df),
+            build_display(df, search, tier, False, False, [], [], sort),
+            build_display(df, search, tier, True,  False, [], [], sort),
+            build_display(df, search, tier, False, True,  [], [], sort),
+            build_display(df, search, tier, False, False, behaviors, [], sort),
+            build_display(df, search, tier, False, False, [], classes, sort),
+        )
+    controls = [search_bar, tier_filter, sort_by, behavior_filter, class_filter]
+    outputs  = [stats_box, out_main, out_triage, out_verdict, out_behavior, out_class]
+    # Any control change refreshes every output, so all tabs stay in sync.
+    for ctrl in controls:
+        ctrl.change(fn=refresh, inputs=controls, outputs=outputs)
+demo.launch(css=CSS)

insider_threat_leaderboard.csv ADDED Viewed

	@@ -0,0 +1,8 @@

+run_id,timestamp,model,tier,sim_days,subjects,triage_precision,triage_recall,triage_f1,baseline_fp_rate,onset_sensitivity,verdict_precision,verdict_recall,verdict_f1,vishing_detected,host_trail_reconstructed,tp_secret_in_commit,fp_secret_in_commit,tp_unusual_hours_access,fp_unusual_hours_access,tp_excessive_repo_cloning,fp_excessive_repo_cloning,tp_sentiment_drift,fp_sentiment_drift,tp_cross_dept_snooping,fp_cross_dept_snooping,tp_data_exfil_email,fp_data_exfil_email,tp_host_data_hoarding,fp_host_data_hoarding,tp_social_engineering,fp_social_engineering,tp_idp_anomaly,fp_idp_anomaly,negligent_tp,negligent_fp,negligent_fn,disgruntled_tp,disgruntled_fp,disgruntled_fn,malicious_tp,malicious_fp,malicious_fn
+mistral.devstral-2-123b__20260320T171503,2026-03-20T22:27:39.006654+00:00,mistral.devstral-2-123b,2,60,0,0.6667,1.0,0.8,0.0208,0.0,1.0,1.0,1.0,True,True,,,2,0,,,2,0,,,1,0,1,0,1,0,0,2,,,,1,0,0,1,0,0
+us.anthropic.claude-opus-4-6-v1__20260320T184150,2026-03-20T23:47:13.003756+00:00,us.anthropic.claude-opus-4-6-v1,2,60,0,0.6667,1.0,0.8,0.0208,0.0,1.0,1.0,1.0,True,True,,,2,0,,,2,0,,,1,0,1,0,1,0,0,2,,,,1,0,0,1,0,0
+deepseek.v3.2__20260320T190338,2026-03-21T00:12:56.410476+00:00,deepseek.v3.2,2,60,0,0.6667,1.0,0.8,0.0208,0.0,0.6667,1.0,0.8,True,True,,,2,0,,,2,0,,,1,0,1,0,1,0,0,2,,,,1,0,0,1,0,0
+us.meta.llama3-3-70b-instruct-v1_0__20260320T173939,2026-03-20T22:46:04.844221+00:00,us.meta.llama3-3-70b-instruct-v1:0,2,60,0,0.0488,1.0,0.093,0.8125,0.0,0.6667,1.0,0.8,True,True,,,2,0,0,1,2,0,0,1,1,0,1,0,1,0,0,1,,,,1,0,0,1,0,0
+us.anthropic.claude-opus-4-6-v1__20260320T181324,2026-03-20T23:18:46.874564+00:00,us.anthropic.claude-opus-4-6-v1,2,60,0,0.6667,1.0,0.8,0.0208,0.0,1.0,0.5,0.6667,False,False,,,1,0,,,1,0,,,,,,,,,0,1,,,,1,0,0,0,0,1
+us.anthropic.claude-sonnet-4-6__20260320T180625,2026-03-20T23:11:46.096659+00:00,us.anthropic.claude-sonnet-4-6,2,60,0,0.6667,1.0,0.8,0.0208,0.0,0.0,0.0,0.0,True,False,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,1
+us.anthropic.claude-haiku-4-5-20251001-v1_0__20260320T173444,2026-03-20T22:36:32.924907+00:00,us.anthropic.claude-haiku-4-5-20251001-v1:0,2,60,0,0.6667,1.0,0.8,0.0213,0.0,0.0,0.0,0.0,True,False,,,,,,,,,,,,,,,,,,,,,,0,0,1,0,0,1

insider_threat_leaderboard.json ADDED Viewed

	@@ -0,0 +1,358 @@

+[
+  {
+    "run_id": "mistral.devstral-2-123b__20260320T171503",
+    "timestamp": "2026-03-20T22:27:39.006654+00:00",
+    "model": "mistral.devstral-2-123b",
+    "tier": "2",
+    "sim_days": 51,
+    "subjects": 3,
+    "subject_classes": [],
+    "triage_precision": 0.6667,
+    "triage_recall": 1.0,
+    "triage_f1": 0.8,
+    "triage_tp": 2,
+    "triage_fp": 1,
+    "triage_fn": 0,
+    "baseline_fp_rate": 0.0208,
+    "onset_sensitivity": 0.0,
+    "verdict_precision": 1.0,
+    "verdict_recall": 1.0,
+    "verdict_f1": 1.0,
+    "verdict_tp": 2,
+    "verdict_fp": 0,
+    "verdict_fn": 0,
+    "vishing_detected": true,
+    "host_trail_reconstructed": true,
+    "by_behavior": {
+      "unusual_hours_access": {
+        "tp": 2,
+        "fp": 0
+      },
+      "sentiment_drift": {
+        "tp": 2,
+        "fp": 0
+      },
+      "host_data_hoarding": {
+        "tp": 1,
+        "fp": 0
+      },
+      "data_exfil_email": {
+        "tp": 1,
+        "fp": 0
+      },
+      "social_engineering": {
+        "tp": 1,
+        "fp": 0
+      },
+      "idp_anomaly": {
+        "tp": 0,
+        "fp": 2
+      }
+    },
+    "by_class": {
+      "malicious": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      },
+      "disgruntled": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      }
+    }
+  },
+  {
+    "run_id": "us.anthropic.claude-opus-4-6-v1__20260320T184150",
+    "timestamp": "2026-03-20T23:47:13.003756+00:00",
+    "model": "us.anthropic.claude-opus-4-6-v1",
+    "tier": "2",
+    "sim_days": 51,
+    "subjects": 3,
+    "subject_classes": [],
+    "triage_precision": 0.6667,
+    "triage_recall": 1.0,
+    "triage_f1": 0.8,
+    "triage_tp": 2,
+    "triage_fp": 1,
+    "triage_fn": 0,
+    "baseline_fp_rate": 0.0208,
+    "onset_sensitivity": 0.0,
+    "verdict_precision": 1.0,
+    "verdict_recall": 1.0,
+    "verdict_f1": 1.0,
+    "verdict_tp": 2,
+    "verdict_fp": 0,
+    "verdict_fn": 0,
+    "vishing_detected": true,
+    "host_trail_reconstructed": true,
+    "by_behavior": {
+      "host_data_hoarding": {
+        "tp": 1,
+        "fp": 0
+      },
+      "data_exfil_email": {
+        "tp": 1,
+        "fp": 0
+      },
+      "social_engineering": {
+        "tp": 1,
+        "fp": 0
+      },
+      "sentiment_drift": {
+        "tp": 2,
+        "fp": 0
+      },
+      "unusual_hours_access": {
+        "tp": 2,
+        "fp": 0
+      },
+      "idp_anomaly": {
+        "tp": 0,
+        "fp": 2
+      }
+    },
+    "by_class": {
+      "malicious": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      },
+      "disgruntled": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      }
+    }
+  },
+  {
+    "run_id": "deepseek.v3.2__20260320T190338",
+    "timestamp": "2026-03-21T00:12:56.410476+00:00",
+    "model": "deepseek.v3.2",
+    "tier": "2",
+    "sim_days": 51,
+    "subjects": 3,
+    "subject_classes": [],
+    "triage_precision": 0.6667,
+    "triage_recall": 1.0,
+    "triage_f1": 0.8,
+    "triage_tp": 2,
+    "triage_fp": 1,
+    "triage_fn": 0,
+    "baseline_fp_rate": 0.0208,
+    "onset_sensitivity": 0.0,
+    "verdict_precision": 0.6667,
+    "verdict_recall": 1.0,
+    "verdict_f1": 0.8,
+    "verdict_tp": 2,
+    "verdict_fp": 1,
+    "verdict_fn": 0,
+    "vishing_detected": true,
+    "host_trail_reconstructed": true,
+    "by_behavior": {
+      "host_data_hoarding": {
+        "tp": 1,
+        "fp": 0
+      },
+      "data_exfil_email": {
+        "tp": 1,
+        "fp": 0
+      },
+      "social_engineering": {
+        "tp": 1,
+        "fp": 0
+      },
+      "unusual_hours_access": {
+        "tp": 2,
+        "fp": 0
+      },
+      "sentiment_drift": {
+        "tp": 2,
+        "fp": 0
+      },
+      "idp_anomaly": {
+        "tp": 0,
+        "fp": 2
+      }
+    },
+    "by_class": {
+      "innocent": {
+        "tp": 0,
+        "fp": 1,
+        "fn": 0
+      },
+      "malicious": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      },
+      "disgruntled": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      }
+    }
+  },
+  {
+    "run_id": "us.meta.llama3-3-70b-instruct-v1_0__20260320T173939",
+    "timestamp": "2026-03-20T22:46:04.844221+00:00",
+    "model": "us.meta.llama3-3-70b-instruct-v1:0",
+    "tier": "2",
+    "sim_days": 51,
+    "subjects": 3,
+    "subject_classes": [],
+    "triage_precision": 0.0488,
+    "triage_recall": 1.0,
+    "triage_f1": 0.093,
+    "triage_tp": 2,
+    "triage_fp": 39,
+    "triage_fn": 0,
+    "baseline_fp_rate": 0.8125,
+    "onset_sensitivity": 0.0,
+    "verdict_precision": 0.6667,
+    "verdict_recall": 1.0,
+    "verdict_f1": 0.8,
+    "verdict_tp": 2,
+    "verdict_fp": 1,
+    "verdict_fn": 0,
+    "vishing_detected": true,
+    "host_trail_reconstructed": true,
+    "by_behavior": {
+      "unusual_hours_access": {
+        "tp": 2,
+        "fp": 0
+      },
+      "excessive_repo_cloning": {
+        "tp": 0,
+        "fp": 1
+      },
+      "sentiment_drift": {
+        "tp": 2,
+        "fp": 0
+      },
+      "cross_dept_snooping": {
+        "tp": 0,
+        "fp": 1
+      },
+      "data_exfil_email": {
+        "tp": 1,
+        "fp": 0
+      },
+      "host_data_hoarding": {
+        "tp": 1,
+        "fp": 0
+      },
+      "social_engineering": {
+        "tp": 1,
+        "fp": 0
+      },
+      "idp_anomaly": {
+        "tp": 0,
+        "fp": 1
+      }
+    },
+    "by_class": {
+      "innocent": {
+        "tp": 0,
+        "fp": 1,
+        "fn": 0
+      },
+      "malicious": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      },
+      "disgruntled": {
+        "tp": 1,
+        "fp": 0,
+        "fn": 0
+      }
+    }
+  },
+  {
+    "run_id": "us.anthropic.claude-sonnet-4-6__20260320T180625",
+    "timestamp": "2026-03-20T23:11:46.096659+00:00",
+    "model": "us.anthropic.claude-sonnet-4-6",
+    "tier": "2",
+    "sim_days": 51,
+    "subjects": 3,
+    "subject_classes": [],
+    "triage_precision": 0.6667,
+    "triage_recall": 1.0,
+    "triage_f1": 0.8,
+    "triage_tp": 2,
+    "triage_fp": 1,
+    "triage_fn": 0,
+    "baseline_fp_rate": 0.0208,
+    "onset_sensitivity": 0.0,
+    "verdict_precision": 0.0,
+    "verdict_recall": 0.0,
+    "verdict_f1": 0.0,
+    "verdict_tp": 0,
+    "verdict_fp": 1,
+    "verdict_fn": 2,
+    "vishing_detected": true,
+    "host_trail_reconstructed": false,
+    "by_behavior": {},
+    "by_class": {
+      "innocent": {
+        "tp": 0,
+        "fp": 1,
+        "fn": 0
+      },
+      "disgruntled": {
+        "tp": 0,
+        "fp": 0,
+        "fn": 1
+      },
+      "malicious": {
+        "tp": 0,
+        "fp": 0,
+        "fn": 1
+      }
+    }
+  },
+  {
+    "run_id": "us.anthropic.claude-haiku-4-5-20251001-v1_0__20260320T173444",
+    "timestamp": "2026-03-20T22:36:32.924907+00:00",
+    "model": "us.anthropic.claude-haiku-4-5-20251001-v1:0",
+    "tier": "2",
+    "sim_days": 51,
+    "subjects": 3,
+    "subject_classes": [],
+    "triage_precision": 0.6667,
+    "triage_recall": 1.0,
+    "triage_f1": 0.8,
+    "triage_tp": 2,
+    "triage_fp": 1,
+    "triage_fn": 0,
+    "baseline_fp_rate": 0.0213,
+    "onset_sensitivity": 0.0,
+    "verdict_precision": 0.0,
+    "verdict_recall": 0.0,
+    "verdict_f1": 0.0,
+    "verdict_tp": 0,
+    "verdict_fp": 1,
+    "verdict_fn": 2,
+    "vishing_detected": true,
+    "host_trail_reconstructed": false,
+    "by_behavior": {},
+    "by_class": {
+      "innocent": {
+        "tp": 0,
+        "fp": 1,
+        "fn": 0
+      },
+      "disgruntled": {
+        "tp": 0,
+        "fp": 0,
+        "fn": 1
+      },
+      "malicious": {
+        "tp": 0,
+        "fp": 0,
+        "fn": 1
+      }
+    }
+  }
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ gradio==6.9.0
2	+ pandas