import gradio as gr
from gradio_leaderboard import Leaderboard, SelectColumns
import pandas as pd
# -------------------------
# Title & Intro
# -------------------------
# Page heading rendered through gr.HTML. The id matches the `#space-title`
# rule in the custom stylesheet so the heading picks up its font size.
# NOTE(review): the leading emoji was restored from a mis-decoded byte
# sequence; a trophy is assumed -- confirm against the intended design.
TITLE = """
<h1 align="center" id="space-title">🏆 Eval-Unlearn Leaderboard</h1>
"""
# Markdown shown below the title: project description, feature list, the
# BenchScore definition and a quick-start example.
# Blank lines separate every block so that text preceding a `---` rule is not
# parsed as a setext heading, and the YAML example keeps its nesting.
# NOTE(review): the emoji in the install/links line were restored from
# mis-decoded bytes; the "Docs" icon could not be recovered exactly -- confirm.
INTRO_TEXT = """
## About eval-learn

**eval-learn** is an open-source Python library providing a unified, reproducible benchmarking framework for concept unlearning in Stable Diffusion models.

The proliferation of concept unlearning techniques has produced a fragmented evaluation landscape: methods are assessed under heterogeneous experimental conditions with different datasets, metrics, and hyperparameters, making principled cross-method comparison difficult. eval-learn is designed to fill this gap.

📦 **Install:** `pip install eval-learn` | 💻 **[GitHub](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)** | 📖 **[Docs](https://eval-learn.readthedocs.io)**

---

### Key Features

- **Unified pipeline** supporting fine-tuning, closed-form model editing, and inference-time intervention techniques on a common Stable Diffusion base
- **9 standardised evaluation metrics** covering erasure efficacy, adversarial robustness, generative quality, and concept retention
- **Plugin architecture** via Python entry points — third-party techniques and metrics self-register upon installation without modifying the core framework
- **GPU-efficient execution** with FP16 inference, batch streaming, and proactive VRAM management
- **CLI + YAML/JSON config** for streamlined experiment management
- **HuggingFace Hub integration** via `eval-learn push`

---

### BenchScore

Results are aggregated using a composite **BenchScore** that balances safety and quality:

> BenchScore(α) = α · S + (1 − α) · Q

- **BenchScore-S** (α = 0.6): safety-prioritised
- **BenchScore-Q** (α = 0.4): quality-prioritised

---

### Quick Start

```bash
pip install eval-learn
eval-learn run --config config.yaml
```

Example `config.yaml`:

```yaml
output_dir: ./results/nudity/esd
seed: 42
technique:
  name: esd
  config: {erase_concept: nudity, train_method: noxattn}
metrics:
  - name: asr_i2p
    config: {concept: nudity, detector: nudenet}
  - name: clip_score
    config: {device: cuda}
```

---
"""
# Two reference tables rendered through gr.HTML: the implemented techniques
# with their category, and the evaluation metrics with their type.
# gr.HTML does not interpret pipe-separated text, so the rows are written as
# real <table> markup. Arrows mark the preferred direction of each metric
# (↓ lower is better, ↑ higher is better).
# NOTE(review): arrow directions were restored from mis-decoded characters
# using the usual convention for each metric -- confirm.
TABLES_HTML = """
<h3>Implemented Techniques (v0.1.6)</h3>
<table>
  <thead>
    <tr><th>Technique</th><th>Category</th></tr>
  </thead>
  <tbody>
    <tr><td>ESD</td><td>Fine-Tuning</td></tr>
    <tr><td>CA</td><td>Fine-Tuning</td></tr>
    <tr><td>CoGFD</td><td>Fine-Tuning</td></tr>
    <tr><td>AdvUnlearn</td><td>Fine-Tuning</td></tr>
    <tr><td>SSD</td><td>Fine-Tuning / Closed-Form</td></tr>
    <tr><td>UCE</td><td>Closed-Form</td></tr>
    <tr><td>MACE</td><td>Closed-Form</td></tr>
    <tr><td>SLD</td><td>Inference-Time</td></tr>
    <tr><td>SAFREE</td><td>Inference-Time</td></tr>
    <tr><td>TraSCE</td><td>Inference-Time</td></tr>
    <tr><td>Concept Steerers</td><td>Inference-Time</td></tr>
    <tr><td>SAeUron</td><td>Inference-Time</td></tr>
  </tbody>
</table>
<h3>Evaluation Metrics</h3>
<table>
  <thead>
    <tr><th>Metric</th><th>Type</th></tr>
  </thead>
  <tbody>
    <tr><td>ASR (I2P) ↓</td><td>Erasure efficacy</td></tr>
    <tr><td>ASR (Ring-A-Bell) ↓</td><td>Adversarial robustness</td></tr>
    <tr><td>ASR (MMA-Diffusion) ↓</td><td>Adversarial robustness</td></tr>
    <tr><td>ASR (P4D) ↓</td><td>Adversarial robustness</td></tr>
    <tr><td>FID ↓</td><td>Image quality</td></tr>
    <tr><td>CLIP Score ↑</td><td>Text-image fidelity</td></tr>
    <tr><td>TIFA ↑</td><td>Compositional fidelity</td></tr>
    <tr><td>ERR</td><td>Erasure-retention</td></tr>
    <tr><td>UA-IRA ↑</td><td>Unlearning / retention</td></tr>
  </tbody>
</table>
"""
# Markdown caption placed directly above the leaderboard; it explains the
# arrow suffixes used in the metric column headers.
LEADERBOARD_TEXT = """
### Results

Comparison of unlearning techniques across all benchmarks. **↓** = lower is better, **↑** = higher is better. Sorted by average BenchScore.
"""
# -------------------------
# Paper links + citation labels
# -------------------------
# Maps each technique name (as it appears in the first field of the result
# rows) to a pair: (short citation label, URL of the paper).
PAPER_INFO = {
    "SSD": (
        "Foster et al., 2024",
        "https://doi.org/10.1609/aaai.v38i11.29092",
    ),
    "MACE": (
        "Lu et al., 2024",
        "https://doi.org/10.1109/CVPR52733.2024.00615",
    ),
    "SLD": (
        "Schramowski et al., 2023",
        "https://doi.org/10.1109/CVPR52729.2023.02157",
    ),
    "UCE": (
        "Gandikota et al., 2024",
        "https://doi.org/10.1109/WACV57701.2024.00503",
    ),
    "AdvUnlearn": (
        "Zhang et al., 2024",
        "https://openreview.net/forum?id=dkpmfIydrF",
    ),
    "SAFREE": (
        "Yoon et al., 2025",
        "https://openreview.net/forum?id=hgTFotBRKl",
    ),
    "SAEuron": (
        "Cywinski et al., 2025",
        "https://openreview.net/forum?id=HFCaWGWEzi",
    ),
    "CogFD": (
        "Nie et al., 2025",
        "https://openreview.net/forum?id=OBjF5I4PWg",
    ),
    "ESD": (
        "Gandikota et al., 2023",
        "https://doi.org/10.1109/ICCV51070.2023.00230",
    ),
    "CA": (
        "Kumari et al., 2023",
        "https://doi.org/10.1109/ICCV51070.2023.02074",
    ),
    "TraSCE": (
        "Jain et al., 2024",
        "https://arxiv.org/abs/2412.07658",
    ),
    "Concept-Steerers": (
        "Kim et al., 2025",
        "https://doi.org/10.48550/arXiv.2501.19066",
    ),
}
# -------------------------
# Data
# -------------------------
# One row per technique: the technique name followed by nine metric values,
# in the same order as the labels in `columns` below.
raw_data = [
    ["SSD", 0.00, 0.00, 0.08, 157.90, 25.48, 0.60, 0.13, 0.8235, 0.7918],
    ["MACE", 0.04, 0.00, 0.08, 133.17, 26.35, 0.70, 0.20, 0.7962, 0.8116],
    ["SLD", 0.02, 0.00, 0.00, 133.37, 25.80, 0.75, 0.07, 0.7765, 0.7666],
    ["UCE", 0.06, 0.10, 0.08, 129.66, 27.16, 0.65, 0.20, 0.7377, 0.7809],
    ["AdvUnlearn", 0.06, 0.00, 0.25, 126.15, 25.83, 0.60, 0.07, 0.7423, 0.7492],
    ["SAFREE", 0.16, 0.10, 0.33, 133.61, 26.27, 0.55, 0.40, 0.6694, 0.7678],
    ["SAEuron", 0.10, 0.00, 0.17, 196.11, 24.80, 0.50, 0.07, 0.7108, 0.6704],
    ["CogFD", 0.16, 0.10, 0.50, 129.93, 25.48, 0.50, 0.13, 0.5705, 0.6432],
    ["ESD", 0.00, 0.00, 0.00, 242.05, 21.36, 0.55, 0.13, 0.6474, 0.5425],
    ["CA", 0.20, 0.20, 0.50, 126.83, 26.13, 0.55, 0.07, 0.4358, 0.5465],
    ["TraSCE", 0.00, 0.00, 0.00, 230.96, 16.69, 0.30, 0.00, 0.5759, 0.3996],
    ["Concept-Steerers", 0.00, 0.00, 0.00, 236.58, 15.62, 0.60, 0.07, 0.4880, 0.3571],
]

# Column labels for `raw_data`. The arrow suffix tells the reader which
# direction is better (↓ lower, ↑ higher).
# NOTE(review): arrows were restored from mis-decoded characters using the
# usual convention for each metric -- confirm.
columns = [
    "Technique",
    "ASR I2P ↓", "ASR RingABell ↓", "ASR MMA ↓",
    "FID ↓", "CLIP Score ↑", "UA-IRA ↑", "TIFA ↑",
    "BenchScore-S ↑", "BenchScore-Q ↑",
]

df = pd.DataFrame(raw_data, columns=columns)

# Ranking key: mean of the two BenchScore variants, rounded to 4 decimals.
df["Avg BenchScore"] = ((df["BenchScore-S ↑"] + df["BenchScore-Q ↑"]) / 2).round(4)
def make_technique_html(t):
    """Return the HTML shown in the "Technique" cell for technique name *t*.

    The name is rendered as a link to the technique's paper, followed by the
    short citation label in parentheses. Both values are looked up in
    ``PAPER_INFO``. A technique without an entry there is rendered as its
    plain name, with no link and no empty parentheses.
    """
    citation, url = PAPER_INFO.get(t, ("", "#"))
    if not citation:
        # No paper metadata available: avoid a dead "#" link and a bare "()".
        return f"{t}"
    return (
        f'<a href="{url}" target="_blank" rel="noopener noreferrer">'
        f'{t}</a> '
        f'({citation})'
    )
# Replace every technique name with its rendered HTML cell.
df["Technique"] = df["Technique"].apply(make_technique_html)

# Order rows from best to worst average BenchScore and renumber the index.
df = (
    df.sort_values(by="Avg BenchScore", ascending=False)
    .reset_index(drop=True)
)

# Prepend a 1-based rank column.
df.insert(loc=0, column="#", value=range(1, len(df) + 1))
# -------------------------
# Contribute Note
# -------------------------
# Markdown shown below the leaderboard describing how to submit a technique.
# The blank line after the numbered list keeps the closing paragraph from
# being absorbed into list item 4.
# NOTE(review): the heading emoji was restored from a mis-decoded byte
# sequence; a rocket is assumed -- confirm.
CONTRIBUTE_TEXT = """
---

## 🚀 Add Your Technique to the Leaderboard

Want to see your unlearning method on this leaderboard? Here's how:

1. **Follow the contribution guidelines** at [eval-unlearn.readthedocs.io/en/latest/contributing](https://eval-unlearn.readthedocs.io/en/latest/contributing/) to integrate your technique into the eval-learn framework
2. **Submit a Pull Request** to the [Eval-Unlearn GitHub repository](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)
3. Once your PR is reviewed and merged, **we will evaluate your method** using eval-learn under the same standardised conditions as all other techniques
4. **Your results will be published** on this leaderboard

We welcome contributions from the community — the more techniques we evaluate under unified conditions, the more useful this benchmark becomes for the field.
"""
# -------------------------
# CSS
# -------------------------
# Stylesheet handed to gr.Blocks: one rule for the element with id
# "space-title" and one for elements carrying the "markdown-text" class.
custom_css = (
    "\n"
    "#space-title { font-size: 2em; margin-bottom: 0.2em; }\n"
    ".markdown-text { font-size: 1em; }\n"
)
# -------------------------
# App
# -------------------------
# Build the page top to bottom: title, introduction, reference tables,
# results caption, the leaderboard itself, then the contribution note.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.HTML(TABLES_HTML)
    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    Leaderboard(
        value=df,
        # One entry per DataFrame column: the rank, the technique cell
        # (rendered as HTML), then the ten numeric columns.
        datatype=["number", "html"] + ["number"] * 10,
        select_columns=SelectColumns(
            default_selection=["#", *columns, "Avg BenchScore"],
            cant_deselect=["#", "Technique"],
            label="Select Columns to Display:",
        ),
        search_columns=["Technique"],
        filter_columns=[],
        interactive=False,
    )

    gr.Markdown(CONTRIBUTE_TEXT, elem_classes="markdown-text")

# Enable the request queue, then start the server.
demo.queue(default_concurrency_limit=40).launch()