# eval-unlearn / app.py
# Source: Hugging Face Space file view (avatar alt text: "MansiJerry's picture").
# Last commit: "Improving the layout a bit" (0c48551, verified).
import gradio as gr
from gradio_leaderboard import Leaderboard, SelectColumns
import pandas as pd
# -------------------------
# Title & Intro
# -------------------------
# Page heading, rendered through gr.HTML; the id is targeted by custom_css.
TITLE = """
<h1 align="center" id="space-title">🏆 Eval-Unlearn Leaderboard</h1>
"""
# Markdown shown under the title. Blank lines between blocks are significant:
# without them a `---` directly below a text line is parsed as a setext
# heading underline instead of a horizontal rule.
INTRO_TEXT = """
## About eval-learn

**eval-learn** is an open-source Python library providing a unified, reproducible benchmarking framework for concept unlearning in Stable Diffusion models.

The proliferation of concept unlearning techniques has produced a fragmented evaluation landscape: methods are assessed under heterogeneous experimental conditions with different datasets, metrics, and hyperparameters, making principled cross-method comparison difficult. eval-learn is designed to fill this gap.

📦 **Install:** `pip install eval-learn` &nbsp;|&nbsp; 💻 **[GitHub](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)** &nbsp;|&nbsp; 📖 **[Docs](https://eval-learn.readthedocs.io)**

---

### Key Features

- **Unified pipeline** supporting fine-tuning, closed-form model editing, and inference-time intervention techniques on a common Stable Diffusion base
- **9 standardised evaluation metrics** covering erasure efficacy, adversarial robustness, generative quality, and concept retention
- **Plugin architecture** via Python entry points — third-party techniques and metrics self-register upon installation without modifying the core framework
- **GPU-efficient execution** with FP16 inference, batch streaming, and proactive VRAM management
- **CLI + YAML/JSON config** for streamlined experiment management
- **HuggingFace Hub integration** via `eval-learn push`

---

### BenchScore

Results are aggregated using a composite **BenchScore** that balances safety and quality:

> BenchScore(α) = α · S + (1 − α) · Q

- **BenchScore-S** (α = 0.6): safety-prioritised
- **BenchScore-Q** (α = 0.4): quality-prioritised

---

### Quick Start

```bash
pip install eval-learn
eval-learn run --config config.yaml
```

Example `config.yaml`:

```yaml
output_dir: ./results/nudity/esd
seed: 42
technique:
  name: esd
  config: {erase_concept: nudity, train_method: noxattn}
metrics:
  - name: asr_i2p
    config: {concept: nudity, detector: nudenet}
  - name: clip_score
    config: {device: cuda}
```

---
"""
# Two side-by-side reference tables (implemented techniques, evaluation
# metrics), rendered through gr.HTML. Arrows mark the preferred direction of
# a metric: ↓ lower is better, ↑ higher is better.
TABLES_HTML = """
<div style="display:flex; gap:2em; flex-wrap:wrap; margin: 1em 0;">
<div style="flex:1; min-width:260px;">
<h4 style="margin-bottom:0.5em;">Implemented Techniques (v0.1.6)</h4>
<table style="width:100%; border-collapse:collapse;">
<thead><tr>
<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Technique</th>
<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Category</th>
</tr></thead>
<tbody>
<tr><td style="padding:5px 10px;">ESD</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
<tr><td style="padding:5px 10px;">CA</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
<tr><td style="padding:5px 10px;">CoGFD</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
<tr><td style="padding:5px 10px;">AdvUnlearn</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
<tr><td style="padding:5px 10px;">SSD</td><td style="padding:5px 10px;">Fine-Tuning / Closed-Form</td></tr>
<tr><td style="padding:5px 10px;">UCE</td><td style="padding:5px 10px;">Closed-Form</td></tr>
<tr><td style="padding:5px 10px;">MACE</td><td style="padding:5px 10px;">Closed-Form</td></tr>
<tr><td style="padding:5px 10px;">SLD</td><td style="padding:5px 10px;">Inference-Time</td></tr>
<tr><td style="padding:5px 10px;">SAFREE</td><td style="padding:5px 10px;">Inference-Time</td></tr>
<tr><td style="padding:5px 10px;">TraSCE</td><td style="padding:5px 10px;">Inference-Time</td></tr>
<tr><td style="padding:5px 10px;">Concept Steerers</td><td style="padding:5px 10px;">Inference-Time</td></tr>
<tr><td style="padding:5px 10px;">SAeUron</td><td style="padding:5px 10px;">Inference-Time</td></tr>
</tbody>
</table>
</div>
<div style="flex:1; min-width:260px;">
<h4 style="margin-bottom:0.5em;">Evaluation Metrics</h4>
<table style="width:100%; border-collapse:collapse;">
<thead><tr>
<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Metric</th>
<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Type</th>
</tr></thead>
<tbody>
<tr><td style="padding:5px 10px;">ASR (I2P) ↓</td><td style="padding:5px 10px;">Erasure efficacy</td></tr>
<tr><td style="padding:5px 10px;">ASR (Ring-A-Bell) ↓</td><td style="padding:5px 10px;">Adversarial robustness</td></tr>
<tr><td style="padding:5px 10px;">ASR (MMA-Diffusion) ↓</td><td style="padding:5px 10px;">Adversarial robustness</td></tr>
<tr><td style="padding:5px 10px;">ASR (P4D) ↓</td><td style="padding:5px 10px;">Adversarial robustness</td></tr>
<tr><td style="padding:5px 10px;">FID ↓</td><td style="padding:5px 10px;">Image quality</td></tr>
<tr><td style="padding:5px 10px;">CLIP Score ↑</td><td style="padding:5px 10px;">Text-image fidelity</td></tr>
<tr><td style="padding:5px 10px;">TIFA ↑</td><td style="padding:5px 10px;">Compositional fidelity</td></tr>
<tr><td style="padding:5px 10px;">ERR</td><td style="padding:5px 10px;">Erasure-retention</td></tr>
<tr><td style="padding:5px 10px;">UA-IRA ↑</td><td style="padding:5px 10px;">Unlearning / retention</td></tr>
</tbody>
</table>
</div>
</div>
<hr style="border:none; border-top:1px solid #444; margin:1.5em 0;">
"""
# Markdown caption shown directly above the leaderboard table; the arrows are
# the legend for the arrow suffixes used in the metric column headers.
LEADERBOARD_TEXT = """
### Results

Comparison of unlearning techniques across all benchmarks. **↓** = lower is better, **↑** = higher is better. Sorted by average BenchScore.
"""
# -------------------------
# Paper links + citation labels
# -------------------------
# Technique name -> (short citation label, paper URL). Written as flat
# (name, citation, url) triples and folded into the mapping in one pass.
PAPER_INFO = {
    name: (citation, url)
    for name, citation, url in (
        ("SSD", "Foster et al., 2024", "https://doi.org/10.1609/aaai.v38i11.29092"),
        ("MACE", "Lu et al., 2024", "https://doi.org/10.1109/CVPR52733.2024.00615"),
        ("SLD", "Schramowski et al., 2023", "https://doi.org/10.1109/CVPR52729.2023.02157"),
        ("UCE", "Gandikota et al., 2024", "https://doi.org/10.1109/WACV57701.2024.00503"),
        ("AdvUnlearn", "Zhang et al., 2024", "https://openreview.net/forum?id=dkpmfIydrF"),
        ("SAFREE", "Yoon et al., 2025", "https://openreview.net/forum?id=hgTFotBRKl"),
        ("SAEuron", "Cywinski et al., 2025", "https://openreview.net/forum?id=HFCaWGWEzi"),
        ("CogFD", "Nie et al., 2025", "https://openreview.net/forum?id=OBjF5I4PWg"),
        ("ESD", "Gandikota et al., 2023", "https://doi.org/10.1109/ICCV51070.2023.00230"),
        ("CA", "Kumari et al., 2023", "https://doi.org/10.1109/ICCV51070.2023.02074"),
        ("TraSCE", "Jain et al., 2024", "https://arxiv.org/abs/2412.07658"),
        ("Concept-Steerers", "Kim et al., 2025", "https://doi.org/10.48550/arXiv.2501.19066"),
    )
}
# -------------------------
# Data
# -------------------------
# One row per technique: the technique name followed by nine scores, in the
# same order as the metric names listed in `columns` below.
raw_data = [
    ["SSD", 0.00, 0.00, 0.08, 157.90, 25.48, 0.60, 0.13, 0.8235, 0.7918],
    ["MACE", 0.04, 0.00, 0.08, 133.17, 26.35, 0.70, 0.20, 0.7962, 0.8116],
    ["SLD", 0.02, 0.00, 0.00, 133.37, 25.80, 0.75, 0.07, 0.7765, 0.7666],
    ["UCE", 0.06, 0.10, 0.08, 129.66, 27.16, 0.65, 0.20, 0.7377, 0.7809],
    ["AdvUnlearn", 0.06, 0.00, 0.25, 126.15, 25.83, 0.60, 0.07, 0.7423, 0.7492],
    ["SAFREE", 0.16, 0.10, 0.33, 133.61, 26.27, 0.55, 0.40, 0.6694, 0.7678],
    ["SAEuron", 0.10, 0.00, 0.17, 196.11, 24.80, 0.50, 0.07, 0.7108, 0.6704],
    ["CogFD", 0.16, 0.10, 0.50, 129.93, 25.48, 0.50, 0.13, 0.5705, 0.6432],
    ["ESD", 0.00, 0.00, 0.00, 242.05, 21.36, 0.55, 0.13, 0.6474, 0.5425],
    ["CA", 0.20, 0.20, 0.50, 126.83, 26.13, 0.55, 0.07, 0.4358, 0.5465],
    ["TraSCE", 0.00, 0.00, 0.00, 230.96, 16.69, 0.30, 0.00, 0.5759, 0.3996],
    ["Concept-Steerers", 0.00, 0.00, 0.00, 236.58, 15.62, 0.60, 0.07, 0.4880, 0.3571],
]

# Column headers as displayed; the arrow suffix marks the preferred direction
# (↓ lower is better, ↑ higher is better).
columns = [
    "Technique",
    "ASR I2P ↓", "ASR RingABell ↓", "ASR MMA ↓",
    "FID ↓", "CLIP Score ↑", "UA-IRA ↑", "TIFA ↑",
    "BenchScore-S ↑", "BenchScore-Q ↑",
]

df = pd.DataFrame(raw_data, columns=columns)
# Ranking key: mean of the two BenchScore variants, rounded to 4 decimals.
df["Avg BenchScore"] = ((df["BenchScore-S ↑"] + df["BenchScore-Q ↑"]) / 2).round(4)
def make_technique_html(t, paper_info=None):
    """Render a technique name as the HTML shown in the "Technique" column.

    Args:
        t: Technique name; also used as the lookup key into ``paper_info``.
        paper_info: Optional mapping of technique name to a
            ``(citation, url)`` pair. When omitted, the module-level
            ``PAPER_INFO`` table is used.

    Returns:
        An HTML fragment. For a technique with a paper entry: a link to the
        paper followed by the citation in parentheses. For a technique with
        no entry: just the bold name, so the cell never shows a dead ``#``
        link or an empty ``()``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if paper_info is None:
        paper_info = PAPER_INFO

    # Escape everything interpolated into markup so a name, citation or URL
    # containing <, >, & or quotes cannot break the cell's HTML.
    name = escape(str(t))
    entry = paper_info.get(t)
    if entry is None:
        return f'<span style="font-weight:600;">{name}</span>'

    citation, url = entry
    return (
        f'<a href="{escape(url)}" target="_blank" '
        f'style="color:#7eb8f7; text-decoration:underline; font-weight:600;">'
        f'{name}</a> '
        f'<span style="color:#aaa; font-size:0.85em;">({escape(citation)})</span>'
    )
# Swap each plain technique name for its HTML cell markup.
df["Technique"] = df["Technique"].map(make_technique_html)
# Order rows by the averaged BenchScore, highest first, then renumber the index.
df = df.sort_values(by="Avg BenchScore", ascending=False)
df = df.reset_index(drop=True)
# Prepend a 1-based rank column.
df.insert(loc=0, column="#", value=range(1, len(df) + 1))
# -------------------------
# Contribute Note
# -------------------------
# Markdown call-to-action shown below the leaderboard. The blank line after
# the numbered list keeps the closing paragraph from being absorbed into
# list item 4.
CONTRIBUTE_TEXT = """
---

## 🚀 Add Your Technique to the Leaderboard

Want to see your unlearning method on this leaderboard? Here's how:

1. **Follow the contribution guidelines** at [eval-unlearn.readthedocs.io/en/latest/contributing](https://eval-unlearn.readthedocs.io/en/latest/contributing/) to integrate your technique into the eval-learn framework
2. **Submit a Pull Request** to the [Eval-Unlearn GitHub repository](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)
3. Once your PR is reviewed and merged, **we will evaluate your method** using eval-learn under the same standardised conditions as all other techniques
4. **Your results will be published** on this leaderboard

We welcome contributions from the community — the more techniques we evaluate under unified conditions, the more useful this benchmark becomes for the field.
"""
# -------------------------
# CSS
# -------------------------
# Stylesheet handed to gr.Blocks: one rule for the title element (by id) and
# one for the Markdown blocks tagged with the "markdown-text" class.
custom_css = (
    "\n"
    "#space-title { font-size: 2em; margin-bottom: 0.2em; }\n"
    ".markdown-text { font-size: 1em; }\n"
)
# -------------------------
# App
# -------------------------
# Page layout, top to bottom: title, intro, reference tables, leaderboard
# caption, the leaderboard itself, then the contribution note.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    gr.HTML(TABLES_HTML)
    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    # Twelve displayed columns: rank, technique (HTML link markup), then the
    # nine metric columns and the averaged BenchScore, all numeric.
    column_types = ["number", "html"] + ["number"] * 10
    column_picker = SelectColumns(
        default_selection=["#", *columns, "Avg BenchScore"],
        cant_deselect=["#", "Technique"],
        label="Select Columns to Display:",
    )
    Leaderboard(
        value=df,
        datatype=column_types,
        select_columns=column_picker,
        search_columns=["Technique"],
        filter_columns=[],
        interactive=False,
    )

    gr.Markdown(CONTRIBUTE_TEXT, elem_classes="markdown-text")

# Enable request queuing, then start the server.
demo.queue(default_concurrency_limit=40).launch()