"""Gradio app that renders the Eval-Unlearn leaderboard page.

The module builds a static pandas DataFrame of benchmark results, decorates
the technique names with links to their papers, and displays everything in a
``gradio_leaderboard.Leaderboard`` inside a ``gr.Blocks`` layout.
"""

import html

import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard, SelectColumns

# -------------------------
# Title & Intro
# -------------------------

# The id matches the ``#space-title`` selector in ``custom_css`` below.
TITLE = """
<h1 id="space-title">🏆 Eval-Unlearn Leaderboard</h1>
"""

# NOTE(review): the Docs link here points at eval-learn.readthedocs.io while
# CONTRIBUTE_TEXT links to eval-unlearn.readthedocs.io -- confirm which host
# is the live one.
INTRO_TEXT = """
## About eval-learn

**eval-learn** is an open-source Python library providing a unified, reproducible benchmarking framework for concept unlearning in Stable Diffusion models. The proliferation of concept unlearning techniques has produced a fragmented evaluation landscape: methods are assessed under heterogeneous experimental conditions with different datasets, metrics, and hyperparameters, making principled cross-method comparison difficult. eval-learn is designed to fill this gap.

📦 **Install:** `pip install eval-learn` | 💻 **[GitHub](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)** | 📖 **[Docs](https://eval-learn.readthedocs.io)**

---

### Key Features

- **Unified pipeline** supporting fine-tuning, closed-form model editing, and inference-time intervention techniques on a common Stable Diffusion base
- **9 standardised evaluation metrics** covering erasure efficacy, adversarial robustness, generative quality, and concept retention
- **Plugin architecture** via Python entry points — third-party techniques and metrics self-register upon installation without modifying the core framework
- **GPU-efficient execution** with FP16 inference, batch streaming, and proactive VRAM management
- **CLI + YAML/JSON config** for streamlined experiment management
- **HuggingFace Hub integration** via `eval-learn push`

---

### BenchScore

Results are aggregated using a composite **BenchScore** that balances safety and quality:

> BenchScore(α) = α · S + (1 − α) · Q

- **BenchScore-S** (α = 0.6): safety-prioritised
- **BenchScore-Q** (α = 0.4): quality-prioritised

---

### Quick Start

```bash
pip install eval-learn
eval-learn run --config config.yaml
```

Example `config.yaml`:

```yaml
output_dir: ./results/nudity/esd
seed: 42
technique:
  name: esd
  config: {erase_concept: nudity, train_method: noxattn}
metrics:
  - name: asr_i2p
    config: {concept: nudity, detector: nudenet}
  - name: clip_score
    config: {device: cuda}
```

---
"""

# (technique, category) rows of the "Implemented Techniques" table.
_TECHNIQUE_ROWS = [
    ("ESD", "Fine-Tuning"),
    ("CA", "Fine-Tuning"),
    ("CoGFD", "Fine-Tuning"),
    ("AdvUnlearn", "Fine-Tuning"),
    ("SSD", "Fine-Tuning / Closed-Form"),
    ("UCE", "Closed-Form"),
    ("MACE", "Closed-Form"),
    ("SLD", "Inference-Time"),
    ("SAFREE", "Inference-Time"),
    ("TraSCE", "Inference-Time"),
    ("Concept Steerers", "Inference-Time"),
    ("SAeUron", "Inference-Time"),
]

# (metric, type) rows of the "Evaluation Metrics" table.
_METRIC_ROWS = [
    ("ASR (I2P) ↓", "Erasure efficacy"),
    ("ASR (Ring-A-Bell) ↓", "Adversarial robustness"),
    ("ASR (MMA-Diffusion) ↓", "Adversarial robustness"),
    ("ASR (P4D) ↓", "Adversarial robustness"),
    ("FID ↓", "Image quality"),
    ("CLIP Score ↑", "Text-image fidelity"),
    ("TIFA ↑", "Compositional fidelity"),
    ("ERR", "Erasure-retention"),
    ("UA-IRA ↑", "Unlearning / retention"),
]


def _html_table(heading, headers, rows):
    """Return an HTML fragment: an ``<h3>`` heading followed by a ``<table>``.

    Args:
        heading: Text shown above the table.
        headers: Iterable of column header strings.
        rows: Iterable of rows, each an iterable of cell strings.

    Every piece of text is HTML-escaped before being inserted.
    """
    head_cells = "".join(f"<th>{html.escape(h)}</th>" for h in headers)
    body_rows = "\n".join(
        "<tr>" + "".join(f"<td>{html.escape(c)}</td>" for c in row) + "</tr>"
        for row in rows
    )
    return (
        f"<h3>{html.escape(heading)}</h3>\n"
        "<table>\n"
        f"<thead><tr>{head_cells}</tr></thead>\n"
        f"<tbody>\n{body_rows}\n</tbody>\n"
        "</table>\n"
    )


# Rendered through gr.HTML, so the two reference tables need real markup.
TABLES_HTML = (
    "\n"
    + _html_table(
        "Implemented Techniques (v0.1.6)",
        ("Technique", "Category"),
        _TECHNIQUE_ROWS,
    )
    + _html_table("Evaluation Metrics", ("Metric", "Type"), _METRIC_ROWS)
)

LEADERBOARD_TEXT = """
### Results

Comparison of unlearning techniques across all benchmarks.
**↓** = lower is better, **↑** = higher is better. Sorted by average BenchScore.
"""

# -------------------------
# Paper links + citation labels
# -------------------------

# Maps the technique name used in ``raw_data`` to (citation label, paper URL).
# NOTE(review): "SAEuron", "CogFD" and "Concept-Steerers" are spelled
# "SAeUron", "CoGFD" and "Concept Steerers" in the techniques table above --
# confirm the intended spelling; keys here must keep matching ``raw_data``.
PAPER_INFO = {
    "SSD": ("Foster et al., 2024", "https://doi.org/10.1609/aaai.v38i11.29092"),
    "MACE": ("Lu et al., 2024", "https://doi.org/10.1109/CVPR52733.2024.00615"),
    "SLD": ("Schramowski et al., 2023", "https://doi.org/10.1109/CVPR52729.2023.02157"),
    "UCE": ("Gandikota et al., 2024", "https://doi.org/10.1109/WACV57701.2024.00503"),
    "AdvUnlearn": ("Zhang et al., 2024", "https://openreview.net/forum?id=dkpmfIydrF"),
    "SAFREE": ("Yoon et al., 2025", "https://openreview.net/forum?id=hgTFotBRKl"),
    "SAEuron": ("Cywinski et al., 2025", "https://openreview.net/forum?id=HFCaWGWEzi"),
    "CogFD": ("Nie et al., 2025", "https://openreview.net/forum?id=OBjF5I4PWg"),
    "ESD": ("Gandikota et al., 2023", "https://doi.org/10.1109/ICCV51070.2023.00230"),
    "CA": ("Kumari et al., 2023", "https://doi.org/10.1109/ICCV51070.2023.02074"),
    "TraSCE": ("Jain et al., 2024", "https://arxiv.org/abs/2412.07658"),
    "Concept-Steerers": ("Kim et al., 2025", "https://doi.org/10.48550/arXiv.2501.19066"),
}

# -------------------------
# Data
# -------------------------

# One row per technique; values follow the order of ``columns`` below.
raw_data = [
    ["SSD", 0.00, 0.00, 0.08, 157.90, 25.48, 0.60, 0.13, 0.8235, 0.7918],
    ["MACE", 0.04, 0.00, 0.08, 133.17, 26.35, 0.70, 0.20, 0.7962, 0.8116],
    ["SLD", 0.02, 0.00, 0.00, 133.37, 25.80, 0.75, 0.07, 0.7765, 0.7666],
    ["UCE", 0.06, 0.10, 0.08, 129.66, 27.16, 0.65, 0.20, 0.7377, 0.7809],
    ["AdvUnlearn", 0.06, 0.00, 0.25, 126.15, 25.83, 0.60, 0.07, 0.7423, 0.7492],
    ["SAFREE", 0.16, 0.10, 0.33, 133.61, 26.27, 0.55, 0.40, 0.6694, 0.7678],
    ["SAEuron", 0.10, 0.00, 0.17, 196.11, 24.80, 0.50, 0.07, 0.7108, 0.6704],
    ["CogFD", 0.16, 0.10, 0.50, 129.93, 25.48, 0.50, 0.13, 0.5705, 0.6432],
    ["ESD", 0.00, 0.00, 0.00, 242.05, 21.36, 0.55, 0.13, 0.6474, 0.5425],
    ["CA", 0.20, 0.20, 0.50, 126.83, 26.13, 0.55, 0.07, 0.4358, 0.5465],
    ["TraSCE", 0.00, 0.00, 0.00, 230.96, 16.69, 0.30, 0.00, 0.5759, 0.3996],
    ["Concept-Steerers", 0.00, 0.00, 0.00, 236.58, 15.62, 0.60, 0.07, 0.4880, 0.3571],
]

columns = [
    "Technique",
    "ASR I2P ↓",
    "ASR RingABell ↓",
    "ASR MMA ↓",
    "FID ↓",
    "CLIP Score ↑",
    "UA-IRA ↑",
    "TIFA ↑",
    "BenchScore-S ↑",
    "BenchScore-Q ↑",
]

df = pd.DataFrame(raw_data, columns=columns)
# Ranking key: mean of the two BenchScore variants, rounded to 4 decimals.
df["Avg BenchScore"] = ((df["BenchScore-S ↑"] + df["BenchScore-Q ↑"]) / 2).round(4)


def make_technique_html(t):
    """Return the HTML shown in the "Technique" cell for technique name *t*.

    When *t* has an entry in ``PAPER_INFO`` the name becomes a link to the
    paper, followed by the citation label in parentheses. Names without an
    entry are returned as escaped plain text (no dangling "()" and no dead
    link).
    """
    name = html.escape(t)
    paper = PAPER_INFO.get(t)
    if paper is None:
        return name
    citation, url = paper
    return (
        f'<a href="{html.escape(url, quote=True)}" target="_blank" '
        f'rel="noopener noreferrer">{name}</a> '
        f"({html.escape(citation)})"
    )


df["Technique"] = df["Technique"].map(make_technique_html)
df = df.sort_values("Avg BenchScore", ascending=False).reset_index(drop=True)
# 1-based rank column, inserted after sorting so it reflects the final order.
df.insert(0, "#", range(1, len(df) + 1))

# -------------------------
# Contribute Note
# -------------------------

CONTRIBUTE_TEXT = """
---

## 🚀 Add Your Technique to the Leaderboard

Want to see your unlearning method on this leaderboard? Here's how:

1. **Follow the contribution guidelines** at [eval-unlearn.readthedocs.io/en/latest/contributing](https://eval-unlearn.readthedocs.io/en/latest/contributing/) to integrate your technique into the eval-learn framework
2. **Submit a Pull Request** to the [Eval-Unlearn GitHub repository](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)
3. Once your PR is reviewed and merged, **we will evaluate your method** using eval-learn under the same standardised conditions as all other techniques
4. **Your results will be published** on this leaderboard

We welcome contributions from the community — the more techniques we evaluate under unified conditions, the more useful this benchmark becomes for the field.
"""

# -------------------------
# CSS
# -------------------------

custom_css = """
#space-title {
    font-size: 2em;
    margin-bottom: 0.2em;
}
.markdown-text {
    font-size: 1em;
}
"""

# -------------------------
# App
# -------------------------

demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.HTML(TABLES_HTML)
    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    Leaderboard(
        value=df,
        # "#" is numeric, "Technique" holds HTML, every remaining column is
        # numeric; derived from the frame so the list cannot drift out of sync.
        datatype=["number", "html"] + ["number"] * (len(df.columns) - 2),
        select_columns=SelectColumns(
            default_selection=["#"] + columns + ["Avg BenchScore"],
            cant_deselect=["#", "Technique"],
            label="Select Columns to Display:",
        ),
        search_columns=["Technique"],
        filter_columns=[],
        interactive=False,
    )

    gr.Markdown(CONTRIBUTE_TEXT, elem_classes="markdown-text")

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse ``demo`` or ``df``) has no side effects.
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=40).launch()