"""SAEGuardBench: HuggingFace Spaces Gradio app. Interactive benchmark explorer for "Do SAE Features Actually Help Detect Jailbreaks?" — a systematic comparison of 8 detection methods across 4 paradigms, 6 datasets, and 4 models (2B--70B parameters). Key finding: SAE features consistently hurt jailbreak detection compared to simple linear probes on raw activations (the "Detection Gap"). """ from __future__ import annotations from pathlib import Path from typing import Any import gradio as gr import numpy as np import pandas as pd import plotly.graph_objects as go # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- APP_TITLE = "SAEGuardBench: Do SAE Features Actually Help Detect Jailbreaks?" PROJECT_ROOT = Path(__file__).resolve().parent RESULTS_DIR = PROJECT_ROOT / "results" FIGURES_DIR = PROJECT_ROOT / "figures" PAPER_FIGURES_DIR = PROJECT_ROOT / "paper" / "figures" LEADERBOARD_CSV = RESULTS_DIR / "leaderboard.csv" # Color palette (matches paper) COLOR_PRIMARY = "#0072B2" COLOR_SAE = "#E69F00" COLOR_RAW = "#009E73" COLOR_GAP_NEG = "#D55E00" COLOR_GAP_POS = "#009E73" # Detection Gap data (from paper Table 2 / Section 4) DETECTION_GAP_DATA: list[dict[str, Any]] = [ {"model": "Gemma-2-2B", "raw_auroc": 0.949, "sae_auroc": 0.712, "gap": -0.237}, {"model": "Llama-3.1-8B", "raw_auroc": 0.867, "sae_auroc": 0.477, "gap": -0.391}, {"model": "Gemma-3-4B", "raw_auroc": 0.922, "sae_auroc": 0.709, "gap": -0.213}, {"model": "Llama-3.3-70B", "raw_auroc": 1.000, "sae_auroc": 0.949, "gap": -0.051}, ] # Method comparison data — Gemma-2-2B, Layer 12 # SAE methods use SAE features; probes use raw activations (matches paper Table 1) # Values from results/sae_features_*.json and results/train_*.json DATASETS = ["JailbreakBench", "HarmBench", "AdvBench", "SORRY-Bench", "WildJailbreak"] METHOD_RESULTS: dict[str, list[float]] = { "Linear Probe": [0.949, 1.000, 1.000, 1.000, 1.000], "SAE-Classifier": [0.704, 0.984, 1.000, 1.000, 1.000], "CC-Delta": [0.712, 0.972, 1.000, 0.997, 0.999], "GSAE": [0.707, 0.966, 1.000, 1.000, 0.999], "Random SAE": [0.571, 0.626, 1.000, 0.996, 1.000], "MLP Probe": [0.942, 1.000, 1.000, 1.000, 0.999], "FJD": [0.472, 0.500, 0.500, 0.490, 0.680], "LlamaGuard-3": [0.885, 0.940, 0.990, 0.940, 0.850], } # Paradigm labels for leaderboard METHOD_PARADIGMS: dict[str, str] = { "Linear Probe": "Activation Probe", "MLP Probe": "Activation Probe", "SAE-Classifier": "SAE Feature", "GSAE": "SAE Feature", "Random SAE": "SAE Feature", "CC-Delta": "SAE Feature", "FJD": "Logit-Based", "LlamaGuard-3": "External Model", } # Figure metadata: (filename, caption) FIGURE_ENTRIES: list[tuple[str, str]] = [ ( "fig1_raw_vs_sae.png", "Figure 1: Raw activation probes consistently outperform SAE-based " "methods across all models and datasets.", ), ( "fig_roc_curves.png", "ROC curves comparing detection methods on JailbreakBench. The raw " "linear probe dominates SAE-based approaches.", ), ( "fig_hybrid_recovery.png", "Hybrid recovery rates: combining raw detection with SAE explanation " "recovers 88-106% of raw-only performance.", ), ( "fig_safety_subspace.png", "Safety subspace PCA: PC2 carries 13% variance in reconstruction but " "58% in residual, explaining the Detection Gap.", ), ( "fig_pareto_tradeoff.png", "Pareto tradeoff between detection accuracy and interpretability " "across all methods.", ), ( "fig_crossmodel_hybrid.png", "Cross-model hybrid results showing InterpGuard generalises across " "model families and scales.", ), ] PAPER_ABSTRACT = ( "Sparse Autoencoders (SAEs) are increasingly proposed as interpretable " "safety monitors for large language models. But do their features actually " "help detect jailbreaks? We introduce SAEGuardBench, a benchmark comparing " "8 detection methods across 4 paradigms on 6 datasets and 4 models " "(2B\u201370B parameters). The answer is no. SAE features consistently hurt " "detection compared to simple linear probes on raw activations, a gap we " "call the Detection Gap, which is negative on every model we test. The gap " "persists across layers, transfer settings, wider SAEs, and nonlinear " "classifiers. We trace the cause to the reconstruction objective, which " "discards low-variance directions carrying safety signal. Yet SAE features " "still capture interpretable concept structure that raw activations lack. " "To exploit both strengths, we describe InterpGuard, a practical two-stage " "recipe that detects with raw activations and explains with SAE features. " "An LLM-as-judge evaluation across three frontier models reveals a " "bottleneck: current SAE labels identify that a prompt is harmful but not " "what kind of harm. We also show the gap is not fundamental: fine-tuning " "the SAE encoder with a classification-aware objective nearly closes it, " "confirming the problem lies in the training objective, not the architecture." ) BIBTEX_CITATION = r"""@article{rahman2026saeguardbench, title = {Do SAE Features Actually Help Detect Jailbreaks? A Systematic Benchmark of Interpretability-Based Safety Methods}, author = {Rahman, Md A}, year = {2026}, url = {https://github.com/ronyrahmaan/saeguardbench} }""" # --------------------------------------------------------------------------- # Data helpers # --------------------------------------------------------------------------- def _build_hardcoded_leaderboard() -> pd.DataFrame: """Build the leaderboard from hardcoded paper results. Returns a DataFrame with columns: Method, Paradigm, Model, Dataset, AUROC, F1, FPR@95TPR """ rows: list[dict[str, Any]] = [] model = "Gemma-2-2B" for method, scores in METHOD_RESULTS.items(): paradigm = METHOD_PARADIGMS[method] for dataset, auroc in zip(DATASETS, scores): rows.append( { "Method": method, "Paradigm": paradigm, "Model": model, "Dataset": dataset, "AUROC": round(auroc, 3), } ) return pd.DataFrame(rows) def load_leaderboard() -> pd.DataFrame: """Load the leaderboard CSV, falling back to hardcoded data.""" if LEADERBOARD_CSV.exists(): try: df = pd.read_csv(LEADERBOARD_CSV) required = {"Method", "Paradigm", "Model", "Dataset", "AUROC"} if required.issubset(set(df.columns)): return df except Exception: pass return _build_hardcoded_leaderboard() def _color_auroc(val: float) -> str: """Return an HTML-styled AUROC value with color coding.""" if val >= 0.9: color = "#15803d" # green-700 elif val >= 0.7: color = "#a16207" # yellow-700 else: color = "#b91c1c" # red-700 return f"{val:.3f}" # --------------------------------------------------------------------------- # Tab builders # --------------------------------------------------------------------------- def build_leaderboard_tab() -> None: """Tab 1: Sortable leaderboard with filters.""" full_df = load_leaderboard() # Detection Gap banner avg_gap = np.mean([d["gap"] for d in DETECTION_GAP_DATA]) gr.Markdown( f"### Detection Gap (average across models): " f"**{avg_gap:+.3f}** AUROC\n" "*SAE features hurt detection on every model tested.*" ) model_choices = ["All", *sorted(full_df["Model"].unique().tolist())] dataset_choices = ["All", *sorted(full_df["Dataset"].unique().tolist())] paradigm_choices = ["All", *sorted(full_df["Paradigm"].unique().tolist())] with gr.Row(): model_filter = gr.Dropdown( choices=model_choices, value="All", label="Model" ) dataset_filter = gr.Dropdown( choices=dataset_choices, value="All", label="Dataset" ) paradigm_filter = gr.Dropdown( choices=paradigm_choices, value="All", label="Paradigm" ) table = gr.Dataframe( value=full_df, label="Benchmark Results", interactive=False, ) def _filter_table( model: str, dataset: str, paradigm: str ) -> pd.DataFrame: """Filter the leaderboard by the selected criteria.""" df = full_df.copy() if model != "All": df = df[df["Model"] == model] if dataset != "All": df = df[df["Dataset"] == dataset] if paradigm != "All": df = df[df["Paradigm"] == paradigm] return df.sort_values("AUROC", ascending=False).reset_index(drop=True) for filt in [model_filter, dataset_filter, paradigm_filter]: filt.change( fn=_filter_table, inputs=[model_filter, dataset_filter, paradigm_filter], outputs=table, ) def build_detection_gap_tab() -> None: """Tab 2: Detection Gap bar charts.""" df = pd.DataFrame(DETECTION_GAP_DATA) # --- Gap bar chart --- gap_colors = [ COLOR_GAP_POS if g >= 0 else COLOR_GAP_NEG for g in df["gap"] ] fig_gap = go.Figure( go.Bar( x=df["model"], y=df["gap"], marker_color=gap_colors, text=[f"{g:+.3f}" for g in df["gap"]], textposition="outside", ) ) fig_gap.update_layout( title="Detection Gap per Model (SAE AUROC minus Raw AUROC)", xaxis_title="Model", yaxis_title="Detection Gap (AUROC)", yaxis_range=[ min(df["gap"]) - 0.08, max(0.05, max(df["gap"]) + 0.08), ], template="plotly_white", height=450, font={"family": "Inter, system-ui, sans-serif"}, ) fig_gap.add_hline(y=0, line_dash="dash", line_color="gray") gr.Markdown( "### Detection Gap\n" "Negative values mean SAE features *hurt* detection vs. raw probes." ) gr.Plot(fig_gap) # --- Grouped bar chart: raw vs SAE --- fig_grouped = go.Figure() fig_grouped.add_trace( go.Bar( name="Raw Probe AUROC", x=df["model"], y=df["raw_auroc"], marker_color=COLOR_RAW, text=[f"{v:.3f}" for v in df["raw_auroc"]], textposition="outside", ) ) fig_grouped.add_trace( go.Bar( name="SAE AUROC", x=df["model"], y=df["sae_auroc"], marker_color=COLOR_SAE, text=[f"{v:.3f}" for v in df["sae_auroc"]], textposition="outside", ) ) fig_grouped.update_layout( title="Raw Probe vs SAE Detection AUROC", xaxis_title="Model", yaxis_title="AUROC", yaxis_range=[0, 1.12], barmode="group", template="plotly_white", height=450, legend={"orientation": "h", "yanchor": "bottom", "y": 1.02}, font={"family": "Inter, system-ui, sans-serif"}, ) gr.Plot(fig_grouped) def build_method_comparison_tab() -> None: """Tab 3: Radar/spider chart comparing methods across datasets.""" method_names = list(METHOD_RESULTS.keys()) default_methods = ["Linear Probe", "SAE-Classifier", "CC-Delta"] method_selector = gr.CheckboxGroup( choices=method_names, value=default_methods, label="Select methods to compare", ) plot_output = gr.Plot() def _make_radar(selected: list[str]) -> go.Figure: """Build a radar chart for the selected methods.""" if not selected: selected = default_methods fig = go.Figure() # Plotly radar needs the first category repeated to close the polygon categories = [*DATASETS, DATASETS[0]] # Use a qualitative color scale colors = [ "#0072B2", "#E69F00", "#009E73", "#CC79A7", "#D55E00", "#56B4E9", "#F0E442", "#999999", ] for i, method in enumerate(selected): if method not in METHOD_RESULTS: continue values = METHOD_RESULTS[method] + [METHOD_RESULTS[method][0]] color = colors[i % len(colors)] fig.add_trace( go.Scatterpolar( r=values, theta=categories, fill="toself", name=method, line_color=color, opacity=0.7, ) ) fig.update_layout( polar={ "radialaxis": { "visible": True, "range": [0.3, 1.05], "tickvals": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], } }, title="Method Comparison (Gemma-2-2B, Layer 12, AUROC)", template="plotly_white", height=550, font={"family": "Inter, system-ui, sans-serif"}, legend={"orientation": "h", "yanchor": "bottom", "y": -0.15}, ) return fig method_selector.change( fn=_make_radar, inputs=method_selector, outputs=plot_output ) # Trigger initial render via app.load plot_output.value = _make_radar(default_methods) # noqa: set at build time def build_figures_tab() -> None: """Tab 4: Gallery of paper figures with captions.""" found_any = False for filename, caption in FIGURE_ENTRIES: # Check both figures/ and paper/figures/ path = FIGURES_DIR / filename if not path.exists(): path = PAPER_FIGURES_DIR / filename if path.exists(): found_any = True gr.Markdown(f"**{caption}**") gr.Image(str(path), label=filename, show_label=False) gr.Markdown("---") if not found_any: gr.Markdown( "*No figures found. Place PNG files in `figures/` or " "`paper/figures/` to display them here.*" ) def build_about_tab() -> None: """Tab 5: Paper abstract, links, and citation.""" gr.Markdown("## Abstract") gr.Markdown(PAPER_ABSTRACT) gr.Markdown("---") gr.Markdown("## Links") gr.Markdown( "- **Code**: [github.com/ronyrahmaan/saeguardbench]" "(https://github.com/ronyrahmaan/saeguardbench)\n" "- **Dataset**: [HuggingFace](https://huggingface.co/datasets/mdarahmanxAI/SAEGuardBench)\n" "- **Paper**: [PDF](https://github.com/ronyrahmaan/saeguardbench/blob/main/paper.pdf)" ) gr.Markdown("---") gr.Markdown("## Citation") gr.Code(BIBTEX_CITATION, language=None, label="BibTeX") gr.Markdown("---") gr.Markdown( "## Authors\n\n" "**Md A Rahman** \n" "Department of Computer Science, Texas Tech University \n" "[ara02434@ttu.edu](mailto:ara02434@ttu.edu)" ) # --------------------------------------------------------------------------- # App assembly # --------------------------------------------------------------------------- def create_app() -> gr.Blocks: """Create and return the Gradio Blocks app.""" with gr.Blocks(title=APP_TITLE) as app: gr.Markdown(f"# {APP_TITLE}") gr.Markdown( "A systematic benchmark of interpretability-based safety methods " "for LLM jailbreak detection. **Key finding:** SAE features " "consistently *hurt* detection compared to raw activation probes." ) with gr.Tabs(): with gr.TabItem("Leaderboard"): build_leaderboard_tab() with gr.TabItem("Detection Gap"): build_detection_gap_tab() with gr.TabItem("Method Comparison"): build_method_comparison_tab() with gr.TabItem("Figures"): build_figures_tab() with gr.TabItem("About"): build_about_tab() return app if __name__ == "__main__": demo = create_app() demo.launch()