Spaces:
Sleeping
Sleeping
"""SAEGuardBench: HuggingFace Spaces Gradio app.

Interactive benchmark explorer for "Do SAE Features Actually Help Detect
Jailbreaks?" — a systematic comparison of 8 detection methods across 4
paradigms, 6 datasets, and 4 models (2B–70B parameters).

Key finding: SAE features consistently hurt jailbreak detection compared to
simple linear probes on raw activations (the "Detection Gap").
"""
from __future__ import annotations

import logging
from pathlib import Path
from typing import Any

import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Passed to gr.Blocks(title=...) and rendered as the page's top-level heading
# in create_app.
APP_TITLE = "SAEGuardBench: Do SAE Features Actually Help Detect Jailbreaks?"
# Every data/figure location below is resolved relative to the directory that
# contains this file rather than the current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent
RESULTS_DIR = PROJECT_ROOT / "results"
# build_figures_tab looks in FIGURES_DIR first and PAPER_FIGURES_DIR second.
FIGURES_DIR = PROJECT_ROOT / "figures"
PAPER_FIGURES_DIR = PROJECT_ROOT / "paper" / "figures"
# Optional file: load_leaderboard falls back to the hardcoded results when it
# is missing or unusable.
LEADERBOARD_CSV = RESULTS_DIR / "leaderboard.csv"
# Color palette (matches paper)
COLOR_PRIMARY = "#0072B2"  # not referenced by any function visible in this file
COLOR_SAE = "#E69F00"  # SAE bars in the raw-vs-SAE grouped chart
COLOR_RAW = "#009E73"  # raw-probe bars in the raw-vs-SAE grouped chart
COLOR_GAP_NEG = "#D55E00"  # gap bars below zero
COLOR_GAP_POS = "#009E73"  # gap bars at or above zero (same hex as COLOR_RAW)
# Detection Gap data (from paper Table 2 / Section 4)
# One record per model. "gap" is plotted as "SAE AUROC minus Raw AUROC" in
# build_detection_gap_tab and averaged for the banner in build_leaderboard_tab.
# NOTE(review): for Llama-3.1-8B the rounded columns give 0.477 - 0.867 =
# -0.390 while the stored gap is -0.391; presumably the gap was computed from
# unrounded values — confirm against the paper before "correcting" it.
DETECTION_GAP_DATA: list[dict[str, Any]] = [
    {"model": "Gemma-2-2B", "raw_auroc": 0.949, "sae_auroc": 0.712, "gap": -0.237},
    {"model": "Llama-3.1-8B", "raw_auroc": 0.867, "sae_auroc": 0.477, "gap": -0.391},
    {"model": "Gemma-3-4B", "raw_auroc": 0.922, "sae_auroc": 0.709, "gap": -0.213},
    {"model": "Llama-3.3-70B", "raw_auroc": 1.000, "sae_auroc": 0.949, "gap": -0.051},
]
# Method comparison data — Gemma-2-2B, Layer 12
# SAE methods use SAE features; probes use raw activations (matches paper Table 1)
# Values from results/sae_features_*.json and results/train_*.json
# DATASETS fixes the column order: the i-th score in every METHOD_RESULTS list
# belongs to DATASETS[i] (the two are paired with zip() and used as radar axes).
DATASETS = ["JailbreakBench", "HarmBench", "AdvBench", "SORRY-Bench", "WildJailbreak"]
# AUROC per method, one value per dataset in DATASETS order.
METHOD_RESULTS: dict[str, list[float]] = {
    "Linear Probe": [0.949, 1.000, 1.000, 1.000, 1.000],
    "SAE-Classifier": [0.704, 0.984, 1.000, 1.000, 1.000],
    "CC-Delta": [0.712, 0.972, 1.000, 0.997, 0.999],
    "GSAE": [0.707, 0.966, 1.000, 1.000, 0.999],
    "Random SAE": [0.571, 0.626, 1.000, 0.996, 1.000],
    "MLP Probe": [0.942, 1.000, 1.000, 1.000, 0.999],
    "FJD": [0.472, 0.500, 0.500, 0.490, 0.680],
    "LlamaGuard-3": [0.885, 0.940, 0.990, 0.940, 0.850],
}
# Paradigm labels for leaderboard
# Must contain every key of METHOD_RESULTS: _build_hardcoded_leaderboard
# indexes this dict directly and would raise KeyError for a missing method.
METHOD_PARADIGMS: dict[str, str] = {
    "Linear Probe": "Activation Probe",
    "MLP Probe": "Activation Probe",
    "SAE-Classifier": "SAE Feature",
    "GSAE": "SAE Feature",
    "Random SAE": "SAE Feature",
    "CC-Delta": "SAE Feature",
    "FJD": "Logit-Based",
    "LlamaGuard-3": "External Model",
}
# Figure metadata: (filename, caption)
# build_figures_tab renders the entries in this order and silently skips any
# file that exists in neither FIGURES_DIR nor PAPER_FIGURES_DIR.
FIGURE_ENTRIES: list[tuple[str, str]] = [
    (
        "fig1_raw_vs_sae.png",
        "Figure 1: Raw activation probes consistently outperform SAE-based "
        "methods across all models and datasets.",
    ),
    (
        "fig_roc_curves.png",
        "ROC curves comparing detection methods on JailbreakBench. The raw "
        "linear probe dominates SAE-based approaches.",
    ),
    (
        "fig_hybrid_recovery.png",
        "Hybrid recovery rates: combining raw detection with SAE explanation "
        "recovers 88-106% of raw-only performance.",
    ),
    (
        "fig_safety_subspace.png",
        "Safety subspace PCA: PC2 carries 13% variance in reconstruction but "
        "58% in residual, explaining the Detection Gap.",
    ),
    (
        "fig_pareto_tradeoff.png",
        "Pareto tradeoff between detection accuracy and interpretability "
        "across all methods.",
    ),
    (
        "fig_crossmodel_hybrid.png",
        "Cross-model hybrid results showing InterpGuard generalises across "
        "model families and scales.",
    ),
]
# Abstract text shown as-is under the "Abstract" heading in build_about_tab.
# Built from adjacent string literals, so each fragment must end with a space
# to keep words from running together.
PAPER_ABSTRACT = (
    "Sparse Autoencoders (SAEs) are increasingly proposed as interpretable "
    "safety monitors for large language models. But do their features actually "
    "help detect jailbreaks? We introduce SAEGuardBench, a benchmark comparing "
    "8 detection methods across 4 paradigms on 6 datasets and 4 models "
    "(2B\u201370B parameters). The answer is no. SAE features consistently hurt "
    "detection compared to simple linear probes on raw activations, a gap we "
    "call the Detection Gap, which is negative on every model we test. The gap "
    "persists across layers, transfer settings, wider SAEs, and nonlinear "
    "classifiers. We trace the cause to the reconstruction objective, which "
    "discards low-variance directions carrying safety signal. Yet SAE features "
    "still capture interpretable concept structure that raw activations lack. "
    "To exploit both strengths, we describe InterpGuard, a practical two-stage "
    "recipe that detects with raw activations and explains with SAE features. "
    "An LLM-as-judge evaluation across three frontier models reveals a "
    "bottleneck: current SAE labels identify that a prompt is harmful but not "
    "what kind of harm. We also show the gap is not fundamental: fine-tuning "
    "the SAE encoder with a classification-aware objective nearly closes it, "
    "confirming the problem lies in the training objective, not the architecture."
)
# BibTeX entry displayed in a code box by build_about_tab. Raw string so the
# braces and any backslashes reach the page exactly as written.
BIBTEX_CITATION = r"""@article{rahman2026saeguardbench,
title = {Do SAE Features Actually Help Detect Jailbreaks?
A Systematic Benchmark of Interpretability-Based Safety Methods},
author = {Rahman, Md A},
year = {2026},
url = {https://github.com/ronyrahmaan/saeguardbench}
}"""
# ---------------------------------------------------------------------------
# Data helpers
# ---------------------------------------------------------------------------
def _build_hardcoded_leaderboard() -> pd.DataFrame:
    """Build the fallback leaderboard from the hardcoded paper results.

    Every method in ``METHOD_RESULTS`` contributes one row per dataset; its
    scores are paired positionally with ``DATASETS``.  All rows are attributed
    to Gemma-2-2B, the only model the hardcoded table covers.

    Returns:
        A DataFrame with exactly the columns ``Method``, ``Paradigm``,
        ``Model``, ``Dataset`` and ``AUROC`` (rounded to 3 decimals).  The
        hardcoded data carries AUROC only, so no F1 or FPR@95TPR columns are
        produced.

    Raises:
        KeyError: If a method in ``METHOD_RESULTS`` has no entry in
            ``METHOD_PARADIGMS``.
    """
    columns = ["Method", "Paradigm", "Model", "Dataset", "AUROC"]
    model = "Gemma-2-2B"
    rows: list[dict[str, Any]] = []
    for method, scores in METHOD_RESULTS.items():
        paradigm = METHOD_PARADIGMS[method]
        rows.extend(
            {
                "Method": method,
                "Paradigm": paradigm,
                "Model": model,
                "Dataset": dataset,
                "AUROC": round(auroc, 3),
            }
            for dataset, auroc in zip(DATASETS, scores)
        )
    # Naming the columns keeps the schema stable even if the table is empty.
    return pd.DataFrame(rows, columns=columns)
def load_leaderboard() -> pd.DataFrame:
    """Load the leaderboard CSV, falling back to the hardcoded data.

    The CSV at ``LEADERBOARD_CSV`` is used only when it exists, can be parsed,
    and contains the columns ``Method``, ``Paradigm``, ``Model``, ``Dataset``
    and ``AUROC``.  In every other case the hardcoded leaderboard is returned;
    a warning is logged whenever an existing file had to be ignored, so the
    fallback is never silent.

    Returns:
        The parsed CSV (extra columns are kept) or the hardcoded leaderboard.
    """
    required = {"Method", "Paradigm", "Model", "Dataset", "AUROC"}
    log = logging.getLogger(__name__)

    if LEADERBOARD_CSV.exists():
        try:
            df = pd.read_csv(LEADERBOARD_CSV)
        except Exception:
            # Deliberately broad: a bad results file must not take the app
            # down, but the reason for ignoring it is recorded.
            log.warning(
                "Could not read %s; using hardcoded leaderboard.",
                LEADERBOARD_CSV,
                exc_info=True,
            )
        else:
            missing = required - set(df.columns)
            if not missing:
                return df
            log.warning(
                "%s is missing required columns %s; using hardcoded leaderboard.",
                LEADERBOARD_CSV,
                sorted(missing),
            )
    return _build_hardcoded_leaderboard()
| def _color_auroc(val: float) -> str: | |
| """Return an HTML-styled AUROC value with color coding.""" | |
| if val >= 0.9: | |
| color = "#15803d" # green-700 | |
| elif val >= 0.7: | |
| color = "#a16207" # yellow-700 | |
| else: | |
| color = "#b91c1c" # red-700 | |
| return f"<span style='color:{color};font-weight:600'>{val:.3f}</span>" | |
# ---------------------------------------------------------------------------
# Tab builders
# ---------------------------------------------------------------------------
def build_leaderboard_tab() -> None:
    """Tab 1: leaderboard table with model / dataset / paradigm filters.

    Creates, in order, a Detection Gap banner, a row of three dropdowns and
    the results table, then wires every dropdown to re-filter the table.
    Components are created at call time, so this is meant to be called from
    inside the tab's ``with`` block in ``create_app``.
    """
    full_df = load_leaderboard()

    def _filter_table(model: str, dataset: str, paradigm: str) -> pd.DataFrame:
        """Return the rows matching the selection, highest AUROC first.

        Passing ``"All"`` for an argument disables that filter.
        """
        df = full_df
        selections = (
            ("Model", model),
            ("Dataset", dataset),
            ("Paradigm", paradigm),
        )
        for column, wanted in selections:
            if wanted != "All":
                df = df[df[column] == wanted]
        return df.sort_values("AUROC", ascending=False).reset_index(drop=True)

    def _choices(column: str) -> list[str]:
        """Return ``"All"`` followed by the sorted distinct values of *column*."""
        # Blank cells in a user-supplied CSV load as NaN; dropping them keeps
        # sorted() from having to compare floats with strings.
        return ["All", *sorted(full_df[column].dropna().unique().tolist())]

    # Detection Gap banner
    avg_gap = np.mean([d["gap"] for d in DETECTION_GAP_DATA])
    gr.Markdown(
        "### Detection Gap (average across models): "
        f"**{avg_gap:+.3f}** AUROC\n"
        "*SAE features hurt detection on every model tested.*"
    )

    with gr.Row():
        model_filter = gr.Dropdown(
            choices=_choices("Model"), value="All", label="Model"
        )
        dataset_filter = gr.Dropdown(
            choices=_choices("Dataset"), value="All", label="Dataset"
        )
        paradigm_filter = gr.Dropdown(
            choices=_choices("Paradigm"), value="All", label="Paradigm"
        )

    table = gr.Dataframe(
        # Start with the same ordering the filter callback produces, so the
        # rows do not reshuffle the first time a dropdown is touched.
        value=_filter_table("All", "All", "All"),
        label="Benchmark Results",
        interactive=False,
    )

    filters = [model_filter, dataset_filter, paradigm_filter]
    for filt in filters:
        filt.change(fn=_filter_table, inputs=filters, outputs=table)
def build_detection_gap_tab() -> None:
    """Tab 2: per-model Detection Gap chart plus a raw-vs-SAE AUROC chart.

    Adds three components in order: an explanatory Markdown block, the gap
    bar chart, and the grouped raw/SAE bar chart.  Both charts are built from
    ``DETECTION_GAP_DATA``.
    """
    gap_table = pd.DataFrame(DETECTION_GAP_DATA)
    models = gap_table["model"]
    gaps = gap_table["gap"]
    font_spec = {"family": "Inter, system-ui, sans-serif"}

    # --- Chart 1: signed gap per model, colored by sign ---
    bar_colors = []
    for value in gaps:
        bar_colors.append(COLOR_GAP_POS if value >= 0 else COLOR_GAP_NEG)
    gap_bars = go.Bar(
        x=models,
        y=gaps,
        marker_color=bar_colors,
        text=[f"{value:+.3f}" for value in gaps],
        textposition="outside",
    )
    # Pad the axis so the outside labels fit, and always keep zero visible.
    lower_bound = min(gaps) - 0.08
    upper_bound = max(0.05, max(gaps) + 0.08)
    gap_figure = go.Figure(gap_bars)
    gap_figure.update_layout(
        title="Detection Gap per Model (SAE AUROC minus Raw AUROC)",
        xaxis_title="Model",
        yaxis_title="Detection Gap (AUROC)",
        yaxis_range=[lower_bound, upper_bound],
        template="plotly_white",
        height=450,
        font=font_spec,
    )
    gap_figure.add_hline(y=0, line_dash="dash", line_color="gray")

    gr.Markdown(
        "### Detection Gap\n"
        "Negative values mean SAE features *hurt* detection vs. raw probes."
    )
    gr.Plot(gap_figure)

    # --- Chart 2: raw probe and SAE AUROC side by side ---
    series_specs = (
        ("Raw Probe AUROC", "raw_auroc", COLOR_RAW),
        ("SAE AUROC", "sae_auroc", COLOR_SAE),
    )
    comparison_figure = go.Figure()
    for trace_name, column, bar_color in series_specs:
        scores = gap_table[column]
        comparison_figure.add_trace(
            go.Bar(
                name=trace_name,
                x=models,
                y=scores,
                marker_color=bar_color,
                text=[f"{score:.3f}" for score in scores],
                textposition="outside",
            )
        )
    comparison_figure.update_layout(
        title="Raw Probe vs SAE Detection AUROC",
        xaxis_title="Model",
        yaxis_title="AUROC",
        yaxis_range=[0, 1.12],
        barmode="group",
        template="plotly_white",
        height=450,
        legend={"orientation": "h", "yanchor": "bottom", "y": 1.02},
        font=font_spec,
    )
    gr.Plot(comparison_figure)
def build_method_comparison_tab() -> None:
    """Tab 3: radar chart comparing selected methods across datasets.

    Adds a checkbox group listing every method in ``METHOD_RESULTS`` and a
    plot that is redrawn whenever the selection changes.  The plot starts out
    showing the default selection.
    """
    method_names = list(METHOD_RESULTS)
    default_methods = ["Linear Probe", "SAE-Classifier", "CC-Delta"]
    # Plotly radar needs the first category repeated to close the polygon
    categories = [*DATASETS, DATASETS[0]]
    # Qualitative palette; colors are assigned by position in the selection.
    colors = [
        "#0072B2", "#E69F00", "#009E73", "#CC79A7",
        "#D55E00", "#56B4E9", "#F0E442", "#999999",
    ]

    def _make_radar(selected: list[str]) -> go.Figure:
        """Build the radar chart for *selected* methods.

        An empty selection falls back to the default methods so the chart is
        never blank; names missing from ``METHOD_RESULTS`` are skipped.
        """
        if not selected:
            selected = default_methods
        fig = go.Figure()
        for i, method in enumerate(selected):
            if method not in METHOD_RESULTS:
                continue
            scores = METHOD_RESULTS[method]
            fig.add_trace(
                go.Scatterpolar(
                    # Repeat the first score to match the closed category ring.
                    r=[*scores, scores[0]],
                    theta=categories,
                    fill="toself",
                    name=method,
                    line_color=colors[i % len(colors)],
                    opacity=0.7,
                )
            )
        fig.update_layout(
            polar={
                "radialaxis": {
                    "visible": True,
                    "range": [0.3, 1.05],
                    "tickvals": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                }
            },
            title="Method Comparison (Gemma-2-2B, Layer 12, AUROC)",
            template="plotly_white",
            height=550,
            font={"family": "Inter, system-ui, sans-serif"},
            legend={"orientation": "h", "yanchor": "bottom", "y": -0.15},
        )
        return fig

    method_selector = gr.CheckboxGroup(
        choices=method_names,
        value=default_methods,
        label="Select methods to compare",
    )
    # Hand the initial figure to the constructor (as the Detection Gap tab
    # does) instead of assigning ``.value`` on an already-built component.
    plot_output = gr.Plot(value=_make_radar(default_methods))
    method_selector.change(
        fn=_make_radar, inputs=method_selector, outputs=plot_output
    )
def build_figures_tab() -> None:
    """Tab 4: gallery of paper figures, each preceded by its caption.

    For every entry in ``FIGURE_ENTRIES`` the file is looked up in
    ``FIGURES_DIR`` and then ``PAPER_FIGURES_DIR``; entries found in neither
    are skipped.  If nothing at all is found, a hint is shown instead.
    """
    search_dirs = (FIGURES_DIR, PAPER_FIGURES_DIR)
    shown = 0
    for filename, caption in FIGURE_ENTRIES:
        # First directory that actually contains the file wins.
        located = next(
            (
                folder / filename
                for folder in search_dirs
                if (folder / filename).exists()
            ),
            None,
        )
        if located is None:
            continue
        shown += 1
        gr.Markdown(f"**{caption}**")
        gr.Image(str(located), label=filename, show_label=False)
        gr.Markdown("---")

    if shown == 0:
        gr.Markdown(
            "*No figures found. Place PNG files in `figures/` or "
            "`paper/figures/` to display them here.*"
        )
def build_about_tab() -> None:
    """Tab 5: paper abstract, project links, BibTeX citation and author info.

    Adds the sections in that order, separated by horizontal rules.
    """
    gr.Markdown("## Abstract")
    gr.Markdown(PAPER_ABSTRACT)
    gr.Markdown("---")
    gr.Markdown("## Links")
    gr.Markdown(
        "- **Code**: [github.com/ronyrahmaan/saeguardbench]"
        "(https://github.com/ronyrahmaan/saeguardbench)\n"
        "- **Dataset**: [HuggingFace](https://huggingface.co/datasets/mdarahmanxAI/SAEGuardBench)\n"
        "- **Paper**: [PDF](https://github.com/ronyrahmaan/saeguardbench/blob/main/paper.pdf)"
    )
    gr.Markdown("---")
    gr.Markdown("## Citation")
    gr.Code(BIBTEX_CITATION, language=None, label="BibTeX")
    gr.Markdown("---")
    gr.Markdown(
        "## Authors\n\n"
        # Two trailing spaces before each newline form a Markdown hard line
        # break; with a single space the name, affiliation and email would be
        # joined into one line.
        "**Md A Rahman**  \n"
        "Department of Computer Science, Texas Tech University  \n"
        "[ara02434@ttu.edu](mailto:ara02434@ttu.edu)"
    )
# ---------------------------------------------------------------------------
# App assembly
# ---------------------------------------------------------------------------
def create_app() -> gr.Blocks:
    """Assemble the full Gradio Blocks app and return it without launching.

    The page consists of the title, a one-paragraph summary, and five tabs
    whose contents are produced by the ``build_*_tab`` functions.
    """
    # (tab label, function that fills the tab), in display order.
    tab_builders = (
        ("Leaderboard", build_leaderboard_tab),
        ("Detection Gap", build_detection_gap_tab),
        ("Method Comparison", build_method_comparison_tab),
        ("Figures", build_figures_tab),
        ("About", build_about_tab),
    )
    summary = (
        "A systematic benchmark of interpretability-based safety methods "
        "for LLM jailbreak detection. **Key finding:** SAE features "
        "consistently *hurt* detection compared to raw activation probes."
    )

    with gr.Blocks(title=APP_TITLE) as app:
        gr.Markdown(f"# {APP_TITLE}")
        gr.Markdown(summary)
        with gr.Tabs():
            for tab_label, fill_tab in tab_builders:
                with gr.TabItem(tab_label):
                    fill_tab()
    return app
# Script entry point: build the app and launch it only when this file is run
# directly, so importing the module has no side effects beyond definitions.
if __name__ == "__main__":
    demo = create_app()
    demo.launch()