Spaces:

REAL-Lab-Imperial
/

eval-unlearn

Running

App Files Files Community

eval-unlearn / app.py

MansiJerry

Improving the layout a bit

0c48551 verified 2 days ago

raw

history blame contribute delete

11.1 kB

	import gradio as gr
	from gradio_leaderboard import Leaderboard, SelectColumns
	import pandas as pd

	# -------------------------
	# Title & Intro
	# -------------------------
	# Page heading, rendered through gr.HTML. The id="space-title" attribute is
	# the hook for the "#space-title" rule in custom_css.
	TITLE = """
	<h1 align="center" id="space-title">🏆 Eval-Unlearn Leaderboard</h1>
	"""

	# Markdown shown directly under the title: project summary, feature list,
	# BenchScore definition and a quick-start snippet.
	# The link separators are written as bare pipes: outside a Markdown table a
	# pipe needs no escaping, and a backslash-pipe pair is not a valid escape
	# sequence in a regular (non-raw) Python string literal.
	INTRO_TEXT = """
	## About eval-learn

	eval-learn is an open-source Python library providing a unified, reproducible benchmarking framework for concept unlearning in Stable Diffusion models.

	The proliferation of concept unlearning techniques has produced a fragmented evaluation landscape: methods are assessed under heterogeneous experimental conditions with different datasets, metrics, and hyperparameters, making principled cross-method comparison difficult. eval-learn is designed to fill this gap.

	📦 Install: `pip install eval-learn`  |  💻 [GitHub](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)  |  📖 [Docs](https://eval-learn.readthedocs.io)

	---

	### Key Features

	- Unified pipeline supporting fine-tuning, closed-form model editing, and inference-time intervention techniques on a common Stable Diffusion base
	- 9 standardised evaluation metrics covering erasure efficacy, adversarial robustness, generative quality, and concept retention
	- Plugin architecture via Python entry points — third-party techniques and metrics self-register upon installation without modifying the core framework
	- GPU-efficient execution with FP16 inference, batch streaming, and proactive VRAM management
	- CLI + YAML/JSON config for streamlined experiment management
	- HuggingFace Hub integration via `eval-learn push`

	---


	### BenchScore

	Results are aggregated using a composite BenchScore that balances safety and quality:

	> BenchScore(α) = α · S + (1 − α) · Q

	- BenchScore-S (α = 0.6): safety-prioritised
	- BenchScore-Q (α = 0.4): quality-prioritised

	---

	### Quick Start

	```bash
	pip install eval-learn
	eval-learn run --config config.yaml
	```

	Example `config.yaml`:

	```yaml
	output_dir: ./results/nudity/esd
	seed: 42
	technique:
	  name: esd
	  config: {erase_concept: nudity, train_method: noxattn}
	metrics:
	  - name: asr_i2p
	    config: {concept: nudity, detector: nudenet}
	  - name: clip_score
	    config: {device: cuda}
	```

	---
	"""

	# Two side-by-side reference tables (flex row that wraps on narrow screens):
	# implemented techniques with their category, and evaluation metrics with
	# their type. Rendered through gr.HTML, followed by a horizontal rule.
	# NOTE(review): "ASR (P4D)" and "ERR" are listed as metrics here but have no
	# matching entry in the leaderboard `columns` list — confirm whether results
	# for them are still to be added.
	TABLES_HTML = """
	<div style="display:flex; gap:2em; flex-wrap:wrap; margin: 1em 0;">
	<div style="flex:1; min-width:260px;">
	<h4 style="margin-bottom:0.5em;">Implemented Techniques (v0.1.6)</h4>
	<table style="width:100%; border-collapse:collapse;">
	<thead><tr>
	<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Technique</th>
	<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Category</th>
	</tr></thead>
	<tbody>
	<tr><td style="padding:5px 10px;">ESD</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
	<tr><td style="padding:5px 10px;">CA</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
	<tr><td style="padding:5px 10px;">CoGFD</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
	<tr><td style="padding:5px 10px;">AdvUnlearn</td><td style="padding:5px 10px;">Fine-Tuning</td></tr>
	<tr><td style="padding:5px 10px;">SSD</td><td style="padding:5px 10px;">Fine-Tuning / Closed-Form</td></tr>
	<tr><td style="padding:5px 10px;">UCE</td><td style="padding:5px 10px;">Closed-Form</td></tr>
	<tr><td style="padding:5px 10px;">MACE</td><td style="padding:5px 10px;">Closed-Form</td></tr>
	<tr><td style="padding:5px 10px;">SLD</td><td style="padding:5px 10px;">Inference-Time</td></tr>
	<tr><td style="padding:5px 10px;">SAFREE</td><td style="padding:5px 10px;">Inference-Time</td></tr>
	<tr><td style="padding:5px 10px;">TraSCE</td><td style="padding:5px 10px;">Inference-Time</td></tr>
	<tr><td style="padding:5px 10px;">Concept Steerers</td><td style="padding:5px 10px;">Inference-Time</td></tr>
	<tr><td style="padding:5px 10px;">SAeUron</td><td style="padding:5px 10px;">Inference-Time</td></tr>
	</tbody>
	</table>
	</div>
	<div style="flex:1; min-width:260px;">
	<h4 style="margin-bottom:0.5em;">Evaluation Metrics</h4>
	<table style="width:100%; border-collapse:collapse;">
	<thead><tr>
	<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Metric</th>
	<th style="text-align:left; padding:6px 10px; border-bottom:1px solid #555;">Type</th>
	</tr></thead>
	<tbody>
	<tr><td style="padding:5px 10px;">ASR (I2P) ↓</td><td style="padding:5px 10px;">Erasure efficacy</td></tr>
	<tr><td style="padding:5px 10px;">ASR (Ring-A-Bell) ↓</td><td style="padding:5px 10px;">Adversarial robustness</td></tr>
	<tr><td style="padding:5px 10px;">ASR (MMA-Diffusion) ↓</td><td style="padding:5px 10px;">Adversarial robustness</td></tr>
	<tr><td style="padding:5px 10px;">ASR (P4D) ↓</td><td style="padding:5px 10px;">Adversarial robustness</td></tr>
	<tr><td style="padding:5px 10px;">FID ↓</td><td style="padding:5px 10px;">Image quality</td></tr>
	<tr><td style="padding:5px 10px;">CLIP Score ↑</td><td style="padding:5px 10px;">Text-image fidelity</td></tr>
	<tr><td style="padding:5px 10px;">TIFA ↑</td><td style="padding:5px 10px;">Compositional fidelity</td></tr>
	<tr><td style="padding:5px 10px;">ERR</td><td style="padding:5px 10px;">Erasure-retention</td></tr>
	<tr><td style="padding:5px 10px;">UA-IRA ↑</td><td style="padding:5px 10px;">Unlearning / retention</td></tr>
	</tbody>
	</table>
	</div>
	</div>
	<hr style="border:none; border-top:1px solid #444; margin:1.5em 0;">
	"""

	# Caption rendered just above the results table. The "sorted by average
	# BenchScore" statement corresponds to the descending sort on the
	# "Avg BenchScore" column applied when the DataFrame is built.
	LEADERBOARD_TEXT = """
	### Results

	Comparison of unlearning techniques across all benchmarks. ↓ = lower is better, ↑ = higher is better. Sorted by average BenchScore.
	"""

	# -------------------------
	# Paper links + citation labels
	# -------------------------
	# Display name -> (citation label, paper URL).
	# Keys must match the first element of each raw_data row exactly, because
	# make_technique_html looks rows up by that name. Spellings follow the
	# "Implemented Techniques" table shown on the same page.
	PAPER_INFO = {
	    "SSD": ("Foster et al., 2024", "https://doi.org/10.1609/aaai.v38i11.29092"),
	    "MACE": ("Lu et al., 2024", "https://doi.org/10.1109/CVPR52733.2024.00615"),
	    "SLD": ("Schramowski et al., 2023", "https://doi.org/10.1109/CVPR52729.2023.02157"),
	    "UCE": ("Gandikota et al., 2024", "https://doi.org/10.1109/WACV57701.2024.00503"),
	    "AdvUnlearn": ("Zhang et al., 2024", "https://openreview.net/forum?id=dkpmfIydrF"),
	    "SAFREE": ("Yoon et al., 2025", "https://openreview.net/forum?id=hgTFotBRKl"),
	    "SAeUron": ("Cywinski et al., 2025", "https://openreview.net/forum?id=HFCaWGWEzi"),
	    "CoGFD": ("Nie et al., 2025", "https://openreview.net/forum?id=OBjF5I4PWg"),
	    "ESD": ("Gandikota et al., 2023", "https://doi.org/10.1109/ICCV51070.2023.00230"),
	    "CA": ("Kumari et al., 2023", "https://doi.org/10.1109/ICCV51070.2023.02074"),
	    "TraSCE": ("Jain et al., 2024", "https://arxiv.org/abs/2412.07658"),
	    "Concept Steerers": ("Kim et al., 2025", "https://doi.org/10.48550/arXiv.2501.19066"),
	}

	# -------------------------
	# Data
	# -------------------------
	# One row per technique; the value order follows `columns` below.
	raw_data = [
	    ["SSD", 0.00, 0.00, 0.08, 157.90, 25.48, 0.60, 0.13, 0.8235, 0.7918],
	    ["MACE", 0.04, 0.00, 0.08, 133.17, 26.35, 0.70, 0.20, 0.7962, 0.8116],
	    ["SLD", 0.02, 0.00, 0.00, 133.37, 25.80, 0.75, 0.07, 0.7765, 0.7666],
	    ["UCE", 0.06, 0.10, 0.08, 129.66, 27.16, 0.65, 0.20, 0.7377, 0.7809],
	    ["AdvUnlearn", 0.06, 0.00, 0.25, 126.15, 25.83, 0.60, 0.07, 0.7423, 0.7492],
	    ["SAFREE", 0.16, 0.10, 0.33, 133.61, 26.27, 0.55, 0.40, 0.6694, 0.7678],
	    ["SAeUron", 0.10, 0.00, 0.17, 196.11, 24.80, 0.50, 0.07, 0.7108, 0.6704],
	    ["CoGFD", 0.16, 0.10, 0.50, 129.93, 25.48, 0.50, 0.13, 0.5705, 0.6432],
	    ["ESD", 0.00, 0.00, 0.00, 242.05, 21.36, 0.55, 0.13, 0.6474, 0.5425],
	    ["CA", 0.20, 0.20, 0.50, 126.83, 26.13, 0.55, 0.07, 0.4358, 0.5465],
	    ["TraSCE", 0.00, 0.00, 0.00, 230.96, 16.69, 0.30, 0.00, 0.5759, 0.3996],
	    ["Concept Steerers", 0.00, 0.00, 0.00, 236.58, 15.62, 0.60, 0.07, 0.4880, 0.3571],
	]

	columns = [
	    "Technique",
	    "ASR I2P ↓", "ASR RingABell ↓", "ASR MMA ↓",
	    "FID ↓", "CLIP Score ↑", "UA-IRA ↑", "TIFA ↑",
	    "BenchScore-S ↑", "BenchScore-Q ↑",
	]

	df = pd.DataFrame(raw_data, columns=columns)
	# Ranking key: plain mean of the two BenchScore variants, rounded to 4 places.
	df["Avg BenchScore"] = ((df["BenchScore-S ↑"] + df["BenchScore-Q ↑"]) / 2).round(4)


	def make_technique_html(t):
	    """Return the HTML shown in the "Technique" cell for technique name ``t``.

	    When ``t`` has an entry in ``PAPER_INFO`` the name becomes a link to the
	    paper (opened in a new tab) followed by the citation label in
	    parentheses. When it has no entry the name is rendered as plain bold
	    text, so no dead ``#`` link or empty ``()`` is emitted. All interpolated
	    values are HTML-escaped.
	    """
	    # Function-scope import keeps this block self-contained; html is stdlib.
	    import html

	    label = html.escape(str(t))
	    entry = PAPER_INFO.get(t)
	    if entry is None:
	        return f'<span style="font-weight:600;">{label}</span>'

	    citation, url = entry
	    link = (
	        f'<a href="{html.escape(url, quote=True)}" target="_blank" '
	        f'style="color:#7eb8f7; text-decoration:underline; font-weight:600;">'
	        f'{label}</a>'
	    )
	    if not citation:
	        # A paper URL without a citation label: show the link alone.
	        return link
	    return (
	        f'{link} '
	        f'<span style="color:#aaa; font-size:0.85em;">({html.escape(citation)})</span>'
	    )


	# Swap plain names for link markup, rank by the average score (best first),
	# then prepend a 1-based rank column.
	df["Technique"] = df["Technique"].map(make_technique_html)
	df = df.sort_values("Avg BenchScore", ascending=False).reset_index(drop=True)
	df.insert(0, "#", range(1, len(df) + 1))

	# -------------------------
	# Contribute Note
	# -------------------------
	# Call-to-action Markdown rendered below the leaderboard table.
	# NOTE(review): the contributing link below uses the host
	# eval-unlearn.readthedocs.io, while the Docs link in INTRO_TEXT uses
	# eval-learn.readthedocs.io — confirm which host is the live one.
	CONTRIBUTE_TEXT = """
	---

	## 🚀 Add Your Technique to the Leaderboard

	Want to see your unlearning method on this leaderboard? Here\'s how:

	1. Follow the contribution guidelines at [eval-unlearn.readthedocs.io/en/latest/contributing](https://eval-unlearn.readthedocs.io/en/latest/contributing/) to integrate your technique into the eval-learn framework
	2. Submit a Pull Request to the [Eval-Unlearn GitHub repository](https://github.com/REAL-Lab-Imperial/Eval-Unlearn)
	3. Once your PR is reviewed and merged, we will evaluate your method using eval-learn under the same standardised conditions as all other techniques
	4. Your results will be published on this leaderboard

	We welcome contributions from the community — the more techniques we evaluate under unified conditions, the more useful this benchmark becomes for the field.
	"""

	# -------------------------
	# CSS
	# -------------------------
	# Stylesheet passed to gr.Blocks(css=...).
	#   #space-title   -> the <h1 id="space-title"> element in TITLE
	#   .markdown-text -> the elem_classes value given to the gr.Markdown blocks
	custom_css = """
	#space-title { font-size: 2em; margin-bottom: 0.2em; }
	.markdown-text { font-size: 1em; }
	"""

	# -------------------------
	# App
	# -------------------------
	# Assemble the page top to bottom: heading, intro, reference tables,
	# results table, then the contribution call-to-action.
	with gr.Blocks(css=custom_css) as demo:
	    gr.HTML(TITLE)
	    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
	    gr.HTML(TABLES_HTML)
	    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

	    # One datatype per DataFrame column: the rank "#", the "Technique"
	    # cell (anchor markup, hence "html"), then ten numeric columns.
	    column_types = ["number", "html"] + ["number"] * 10
	    shown_by_default = ["#", *columns, "Avg BenchScore"]
	    column_picker = SelectColumns(
	        default_selection=shown_by_default,
	        cant_deselect=["#", "Technique"],
	        label="Select Columns to Display:",
	    )
	    Leaderboard(
	        value=df,
	        datatype=column_types,
	        select_columns=column_picker,
	        search_columns=["Technique"],
	        filter_columns=[],
	        interactive=False,
	    )

	    gr.Markdown(CONTRIBUTE_TEXT, elem_classes="markdown-text")

	# Enable the request queue (40 concurrent workers by default) and start serving.
	demo.queue(default_concurrency_limit=40).launch()