Spaces:

Nanboy
/

RVCBench

Running

App Files Files Community

RVCBench / app.py

Nanboy

Fix RVCBench title color: add color: white !important to .hero h1

665fda4 verified 6 days ago

raw

history blame contribute delete

39.9 kB

	"""RVCBench — Interactive HuggingFace Space demo (v2).

	Tabs
	────
	1. Voice Cloning Gallery – hear pre-computed clean vs. protected clones
	+ protection-effectiveness bar chart for all 5 methods
	2. Protect Your Voice – upload audio, apply protection, see waveform comparison
	3. Results Explorer – interactive bar chart + protection robustness heatmap
	4. About – paper, citation, resources
	"""

	from __future__ import annotations

	import io
	import os
	import time

	import gradio as gr
	import numpy as np
	import plotly.graph_objects as go
	import soundfile as sf

	# Best-effort patch of two helpers in plotly's private
	# ``_plotly_utils.basevalidators`` module. The replacements below only ever
	# consult numpy (and narwhals, when plotly exposes it); they never look up
	# pandas.
	# NOTE(review): presumably this works around a pandas-related problem on the
	# Space — confirm against the deployed plotly version.
	try:
	    import _plotly_utils.basevalidators as _plotly_basevalidators

	    def _plotly_to_scalar_or_list_without_pandas(value):
	        """Convert *value* to a plain scalar or (nested) list without pandas."""
	        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
	        # numpy scalar types (e.g. np.float32) become native Python scalars.
	        if np_mod and np_mod.isscalar(value) and hasattr(value, "item"):
	            return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
	        if isinstance(value, (list, tuple)):
	            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
	        if np_mod and isinstance(value, np_mod.ndarray):
	            if value.ndim == 0:
	                return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
	            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
	        if _plotly_basevalidators.is_numpy_convertable(value):
	            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
	            if np_mod:
	                return _plotly_to_scalar_or_list_without_pandas(np_mod.array(value))
	        return value

	    def _plotly_is_homogeneous_array_without_pandas(value):
	        """Return True when *value* is array-like (ndarray, narwhals Series, ...)."""
	        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
	        if np_mod and isinstance(value, np_mod.ndarray):
	            return True
	        # ``nw`` is looked up defensively: this function runs long after the
	        # surrounding ``try`` has finished, so a plotly build without that
	        # attribute would otherwise raise AttributeError while rendering.
	        nw_mod = getattr(_plotly_basevalidators, "nw", None)
	        if nw_mod is not None and isinstance(value, nw_mod.Series):
	            return True
	        if _plotly_basevalidators.is_numpy_convertable(value):
	            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
	            if np_mod:
	                # A 0-d result means the value was really a scalar.
	                return np_mod.array(value).shape != ()
	        return False

	    _plotly_basevalidators.to_scalar_or_list = _plotly_to_scalar_or_list_without_pandas
	    _plotly_basevalidators.is_homogeneous_array = _plotly_is_homogeneous_array_without_pandas
	except Exception as _patch_error:
	    # Still best-effort (the app starts with an unpatched plotly), but leave a
	    # trace in the logs instead of swallowing the failure silently.
	    import warnings

	    warnings.warn(
	        f"plotly validator patch was not applied: {_patch_error!r}",
	        RuntimeWarning,
	    )

	# ── paths ────────────────────────────────────────────────────────────────────

	# Directory holding the pre-computed gallery audio, located next to this file.
	# NOTE(review): "1089" looks like a speaker ID — confirm.
	SAMPLES = os.path.join(os.path.dirname(__file__), "samples", "1089")
	# Audio files returned by load_gallery() for the reference and target players.
	REF_WAV = os.path.join(SAMPLES, "reference.wav")
	TARGET_WAV = os.path.join(SAMPLES, "target.wav")
	# Texts displayed next to the reference / target players in the gallery tab
	# (presumably the transcripts of the two files above — confirm).
	REF_TEXT = ("But her long fair hair was girlish: and girlish, and touched "
	"with the wonder of mortal beauty, her face.")
	TARGET_TEXT = "A great fisher of souls!"

	# ── gallery models (audio samples available for SafeSpeech protection) ────────

	# Per-model gallery entry, keyed by the model name shown in the dropdown:
	#   clean / prot – file names joined onto SAMPLES by load_gallery()
	#   sims         – method label → SIM value plotted by make_sim_bar();
	#                  the "Clean" entry is used there as the baseline.
	GALLERY_MODELS = {
	"ZipVoice": dict(
	clean="zipvoice_clean.wav",
	prot="zipvoice_safespeech.wav",
	sims={"Clean": 0.579, "SafeSpeech": 0.287, "Enkidu": 0.435,
	"Spectral": 0.262, "GR-Noise": 0.258, "AntiFake": 0.543},
	),
	"MOSS-TTSD": dict(
	clean="moss_ttsd_clean.wav",
	prot="moss_ttsd_safespeech.wav",
	sims={"Clean": 0.492, "SafeSpeech": 0.242, "Enkidu": 0.335,
	"Spectral": 0.216, "GR-Noise": 0.247, "AntiFake": 0.453},
	),
	"MGM-Omni": dict(
	clean="mgm_omni_clean.wav",
	prot="mgm_omni_safespeech.wav",
	sims={"Clean": 0.539, "SafeSpeech": 0.184, "Enkidu": 0.316,
	"Spectral": 0.166, "GR-Noise": 0.229, "AntiFake": 0.491},
	),
	"OZSpeech": dict(
	clean="ozspeech_clean.wav",
	prot="ozspeech_safespeech.wav",
	sims={"Clean": 0.388, "SafeSpeech": 0.156, "Enkidu": 0.187,
	"Spectral": 0.147, "GR-Noise": 0.148, "AntiFake": 0.337},
	),
	"StyleTTS 2": dict(
	clean="styletts2_clean.wav",
	prot="styletts2_safespeech.wav",
	sims={"Clean": 0.228, "SafeSpeech": 0.089, "Enkidu": 0.125,
	"Spectral": 0.081, "GR-Noise": 0.030, "AntiFake": 0.207},
	),
	}

	# ── benchmark data (LibriTTS, clean prompts) ─────────────────────────────────

	# fmt: off
	# One row per model; the metric keys match the keys of METRIC_META.
	# None marks a missing value: make_results_bar() drops rows whose selected
	# metric is None.
	LEADERBOARD_ROWS = [
	dict(model="Qwen3-TTS", SIM=0.614, WER=0.052, MOS=4.39, MCD=5.79, RTF=2.02, SVA=0.974, Emo=0.731),
	dict(model="IndexTTS", SIM=0.606, WER=0.052, MOS=4.06, MCD=6.61, RTF=2.23, SVA=0.972, Emo=0.693),
	dict(model="CosyVoice 2", SIM=0.602, WER=0.175, MOS=4.39, MCD=6.17, RTF=4.58, SVA=0.974, Emo=0.729),
	dict(model="ZipVoice", SIM=0.579, WER=0.053, MOS=4.13, MCD=7.09, RTF=1.46, SVA=0.952, Emo=0.675),
	dict(model="MaskGCT", SIM=0.570, WER=0.088, MOS=3.93, MCD=6.91, RTF=1.36, SVA=0.939, Emo=0.682),
	dict(model="GLM-TTS", SIM=0.570, WER=0.087, MOS=4.08, MCD=6.41, RTF=1.74, SVA=0.951, Emo=0.678),
	dict(model="F5-TTS", SIM=0.559, WER=0.116, MOS=3.99, MCD=6.96, RTF=0.61, SVA=0.937, Emo=0.676),
	dict(model="Higgs Audio", SIM=0.559, WER=0.250, MOS=4.30, MCD=6.06, RTF=1.42, SVA=0.941, Emo=0.717),
	dict(model="MGM-Omni", SIM=0.539, WER=0.095, MOS=4.28, MCD=5.82, RTF=0.84, SVA=0.933, Emo=0.676),
	dict(model="PlayDiffusion",SIM=0.506, WER=0.055, MOS=4.15, MCD=8.06, RTF=0.73, SVA=0.936, Emo=0.681),
	dict(model="MOSS-TTSD", SIM=0.492, WER=0.383, MOS=4.10, MCD=7.09, RTF=None, SVA=0.876, Emo=0.667),
	dict(model="VibeVoice", SIM=0.480, WER=0.228, MOS=3.83, MCD=6.76, RTF=1.86, SVA=0.852, Emo=0.624),
	dict(model="FishSpeech", SIM=0.472, WER=0.166, MOS=4.37, MCD=6.47, RTF=3.61, SVA=0.907, Emo=0.682),
	dict(model="XTTS-v2", SIM=0.454, WER=0.073, MOS=3.81, MCD=8.62, RTF=0.62, SVA=0.908, Emo=0.639),
	dict(model="SparkTTS", SIM=0.408, WER=0.326, MOS=4.06, MCD=5.83, RTF=1.56, SVA=0.764, Emo=0.672),
	dict(model="OZSpeech", SIM=0.388, WER=0.060, MOS=3.21, MCD=6.87, RTF=8.75, SVA=0.840, Emo=0.636),
	dict(model="OpenVoice V2", SIM=0.244, WER=0.075, MOS=4.30, MCD=7.06, RTF=0.08, SVA=0.474, Emo=0.601),
	dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589),
	]

	# Cross-dataset generalisation — SIM on clean prompts across all 10 datasets
	# One row per model; the dataset keys match the first element of each pair in
	# CROSS_DATASET_COLS. None is rendered as "—" by make_cross_dataset_heatmap().
	CROSS_DATASET_ROWS = [
	dict(model="Qwen3-TTS", LibriTTS=0.614, VCTK=0.618, MultiSpk=0.495, Long=0.561, AISHELL=0.721, French=0.536, Bilingual=0.673, BGclean=0.689, BGnoise=0.572, Hallucin=0.515),
	dict(model="IndexTTS", LibriTTS=0.606, VCTK=0.567, MultiSpk=0.473, Long=0.775, AISHELL=0.721, French=0.397, Bilingual=0.673, BGclean=0.589, BGnoise=0.528, Hallucin=0.529),
	dict(model="CosyVoice 2", LibriTTS=0.602, VCTK=0.582, MultiSpk=0.448, Long=0.530, AISHELL=0.717, French=0.378, Bilingual=0.653, BGclean=0.626, BGnoise=0.515, Hallucin=0.518),
	dict(model="ZipVoice", LibriTTS=0.579, VCTK=0.554, MultiSpk=0.531, Long=0.729, AISHELL=0.712, French=0.363, Bilingual=0.322, BGclean=0.625, BGnoise=0.462, Hallucin=0.509),
	dict(model="MaskGCT", LibriTTS=0.570, VCTK=0.555, MultiSpk=0.431, Long=0.194, AISHELL=0.674, French=0.494, Bilingual=None, BGclean=0.610, BGnoise=0.487, Hallucin=0.499),
	dict(model="GLM-TTS", LibriTTS=0.570, VCTK=0.573, MultiSpk=0.445, Long=0.757, AISHELL=0.690, French=0.398, Bilingual=0.657, BGclean=0.622, BGnoise=0.528, Hallucin=0.533),
	dict(model="F5-TTS", LibriTTS=0.559, VCTK=0.537, MultiSpk=0.507, Long=0.607, AISHELL=0.696, French=0.304, Bilingual=0.653, BGclean=0.582, BGnoise=0.414, Hallucin=0.455),
	dict(model="Higgs Audio", LibriTTS=0.559, VCTK=0.516, MultiSpk=0.418, Long=0.520, AISHELL=0.581, French=0.349, Bilingual=0.543, BGclean=0.592, BGnoise=0.421, Hallucin=0.425),
	dict(model="MGM-Omni", LibriTTS=0.539, VCTK=0.447, MultiSpk=0.370, Long=0.442, AISHELL=0.713, French=0.227, Bilingual=0.630, BGclean=0.523, BGnoise=0.332, Hallucin=0.396),
	dict(model="PlayDiffusion",LibriTTS=0.506, VCTK=0.426, MultiSpk=0.360, Long=0.637, AISHELL=0.441, French=0.283, Bilingual=0.465, BGclean=0.433, BGnoise=0.305, Hallucin=0.408),
	dict(model="MOSS-TTSD", LibriTTS=0.492, VCTK=0.440, MultiSpk=0.379, Long=0.644, AISHELL=0.437, French=0.327, Bilingual=0.471, BGclean=0.494, BGnoise=0.488, Hallucin=0.416),
	dict(model="VibeVoice", LibriTTS=0.480, VCTK=0.436, MultiSpk=0.348, Long=0.625, AISHELL=0.564, French=0.343, Bilingual=0.531, BGclean=0.513, BGnoise=0.364, Hallucin=0.408),
	dict(model="FishSpeech", LibriTTS=0.472, VCTK=0.430, MultiSpk=0.383, Long=0.572, AISHELL=0.611, French=0.374, Bilingual=0.566, BGclean=0.495, BGnoise=0.387, Hallucin=0.351),
	dict(model="XTTS-v2", LibriTTS=0.454, VCTK=0.454, MultiSpk=0.328, Long=0.613, AISHELL=0.569, French=0.445, Bilingual=0.506, BGclean=0.546, BGnoise=0.394, Hallucin=0.488),
	dict(model="SparkTTS", LibriTTS=0.408, VCTK=0.532, MultiSpk=0.228, Long=0.345, AISHELL=0.569, French=0.164, Bilingual=0.480, BGclean=0.588, BGnoise=0.332, Hallucin=0.336),
	dict(model="OZSpeech", LibriTTS=0.388, VCTK=0.253, MultiSpk=0.271, Long=None, AISHELL=None, French=0.109, Bilingual=None, BGclean=0.272, BGnoise=0.164, Hallucin=0.281),
	dict(model="OpenVoice V2", LibriTTS=0.244, VCTK=0.392, MultiSpk=0.192, Long=0.278, AISHELL=0.431, French=0.271, Bilingual=0.298, BGclean=0.484, BGnoise=0.358, Hallucin=0.365),
	dict(model="StyleTTS 2", LibriTTS=0.228, VCTK=0.236, MultiSpk=0.162, Long=None, AISHELL=None, French=None, Bilingual=0.213, BGclean=0.196, BGnoise=0.166, Hallucin=0.184),
	]

	# (row key, display label) pairs, in the column order used by
	# make_cross_dataset_heatmap(): the key indexes CROSS_DATASET_ROWS, the label
	# is shown on the x axis.
	CROSS_DATASET_COLS = [
	("LibriTTS", "LibriTTS"),
	("VCTK", "VCTK"),
	("MultiSpk", "Multi-spk"),
	("Long", "Long"),
	("AISHELL", "AISHELL"),
	("French", "French"),
	("Bilingual", "Bilingual"),
	("BGclean", "BG-clean"),
	("BGnoise", "BG-noise"),
	("Hallucin", "Hallucin."),
	]

	# Protection robustness — SIM under each method (LibriTTS, all 18 models)
	# One row per model; "Clean" is the unprotected baseline and the remaining
	# keys are the protection methods read by make_prot_heatmap() (note the key
	# is "GRNoise" here, while the displayed label is "GR-Noise").
	# None is rendered as "—" in the heatmap.
	PROT_ROWS = [
	dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
	dict(model="IndexTTS", Clean=0.606, SafeSpeech=0.346, Enkidu=0.475, Spectral=0.318, GRNoise=0.392, AntiFake=0.572),
	dict(model="CosyVoice 2", Clean=0.602, SafeSpeech=0.321, Enkidu=0.447, Spectral=0.301, GRNoise=0.384, AntiFake=0.549),
	dict(model="ZipVoice", Clean=0.579, SafeSpeech=0.287, Enkidu=0.435, Spectral=0.262, GRNoise=0.258, AntiFake=0.543),
	dict(model="MaskGCT", Clean=0.570, SafeSpeech=0.303, Enkidu=0.407, Spectral=0.281, GRNoise=0.312, AntiFake=0.530),
	dict(model="GLM-TTS", Clean=0.570, SafeSpeech=0.330, Enkidu=0.445, Spectral=0.311, GRNoise=0.388, AntiFake=0.532),
	dict(model="F5-TTS", Clean=0.559, SafeSpeech=0.207, Enkidu=0.431, Spectral=0.176, GRNoise=0.137, AntiFake=0.520),
	dict(model="Higgs Audio", Clean=0.559, SafeSpeech=0.264, Enkidu=0.435, Spectral=0.236, GRNoise=0.272, AntiFake=0.521),
	dict(model="MGM-Omni", Clean=0.539, SafeSpeech=0.184, Enkidu=0.316, Spectral=0.166, GRNoise=0.229, AntiFake=0.491),
	dict(model="PlayDiffusion",Clean=0.506, SafeSpeech=0.173, Enkidu=None, Spectral=0.149, GRNoise=0.162, AntiFake=0.466),
	dict(model="MOSS-TTSD", Clean=0.492, SafeSpeech=0.242, Enkidu=0.335, Spectral=0.216, GRNoise=0.247, AntiFake=0.453),
	dict(model="VibeVoice", Clean=0.480, SafeSpeech=0.272, Enkidu=0.367, Spectral=0.253, GRNoise=0.280, AntiFake=0.442),
	dict(model="FishSpeech", Clean=0.472, SafeSpeech=0.238, Enkidu=0.334, Spectral=0.212, GRNoise=0.235, AntiFake=0.439),
	dict(model="XTTS-v2", Clean=0.454, SafeSpeech=0.260, Enkidu=0.308, Spectral=0.241, GRNoise=0.237, AntiFake=0.414),
	dict(model="SparkTTS", Clean=0.408, SafeSpeech=0.129, Enkidu=0.137, Spectral=0.108, GRNoise=0.062, AntiFake=0.359),
	dict(model="OZSpeech", Clean=0.388, SafeSpeech=0.156, Enkidu=0.187, Spectral=0.147, GRNoise=0.148, AntiFake=0.337),
	dict(model="OpenVoice V2", Clean=0.244, SafeSpeech=0.185, Enkidu=0.188, Spectral=0.180, GRNoise=0.175, AntiFake=0.236),
	dict(model="StyleTTS 2", Clean=0.228, SafeSpeech=0.089, Enkidu=0.125, Spectral=0.081, GRNoise=0.030, AntiFake=0.207),
	]
	# fmt: on

	# metric key → (axis/title label, higher_is_better flag).
	# make_results_bar() uses the flag for both the sort direction and the
	# colour ramp; the keys also populate the "Sort by metric" dropdown.
	METRIC_META = {
	"SIM": ("Speaker Similarity ↑", True),
	"WER": ("Word Error Rate ↓", False),
	"MOS": ("MOS Score ↑", True),
	"MCD": ("Mel Cepstral Dist. ↓", False),
	"RTF": ("Real-Time Factor ↓", False),
	"SVA": ("Speaker Verif. Acc. ↑",True),
	"Emo": ("Emotion Match Rate ↑", True),
	}

	# ── colour helpers ────────────────────────────────────────────────────────────

	# Anchor colours of the red → yellow → green ramp, as (r, g, b) triples.
	_GOOD = (200, 230, 201)  # #c8e6c9 light green
	_MID = (255, 249, 196)  # #fff9c4 light yellow
	_BAD = (255, 205, 210)  # #ffcdd2 light red


	def _interp_color(t: float) -> str:
	    """Map *t* onto the ramp: 0 → bad (red), 0.5 → yellow, 1 → good (green).

	    The ramp is piecewise linear with the midpoint at ``t == 0.5``; each
	    channel is truncated to an int and the result is an ``rgb(r,g,b)`` string.
	    """
	    if t <= 0.5:
	        start, end, frac = _BAD, _MID, t / 0.5
	    else:
	        start, end, frac = _MID, _GOOD, (t - 0.5) / 0.5
	    red, green, blue = (
	        int(lo + frac * (hi - lo)) for lo, hi in zip(start, end)
	    )
	    return f"rgb({red},{green},{blue})"


	def _col_colors(values: list, higher_is_better: bool) -> list[str]:
	    """Return one ``rgb(...)`` colour per entry of *values*.

	    Values are min-max normalised over the non-None entries and passed to
	    ``_interp_color`` (inverted when lower is better). None entries, and
	    every entry when there is no spread to normalise over, get neutral grey.
	    """
	    neutral = "rgb(245,245,245)"
	    present = [v for v in values if v is not None]
	    if not present:
	        return [neutral] * len(values)

	    lowest, highest = min(present), max(present)
	    if highest == lowest:
	        return [neutral] * len(values)
	    spread = highest - lowest

	    def shade(v):
	        if v is None:
	            return neutral
	        position = (v - lowest) / spread
	        return _interp_color(position if higher_is_better else 1 - position)

	    return [shade(v) for v in values]


	# ── audio helpers ─────────────────────────────────────────────────────────────

	def _load(path: str) -> tuple[np.ndarray, int]:
	    """Read the audio file at *path* as float32 and return ``(samples, rate)``.

	    Input with more than one dimension is averaged over axis 1, so the
	    returned samples are always one-dimensional.
	    """
	    samples, rate = sf.read(path, dtype="float32")
	    mono = samples if samples.ndim <= 1 else samples.mean(axis=1)
	    return mono, rate


	def _snr(original: np.ndarray, protected: np.ndarray) -> float:
	    """Return the signal-to-noise ratio, in dB, of *protected* vs. *original*.

	    The noise is the element-wise difference ``protected - original``. When
	    its mean power is below 1e-12 the two signals are treated as identical
	    and ``inf`` is returned.
	    """
	    residual = protected - original
	    signal_power = np.mean(original ** 2)
	    noise_power = np.mean(residual ** 2)
	    if noise_power < 1e-12:
	        return float("inf")
	    return float(10 * np.log10(signal_power / noise_power))


	# ── protection functions ──────────────────────────────────────────────────────

	def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
	    """Add white Gaussian noise to *audio* at a target signal-to-noise ratio.

	    Args:
	        audio: Samples to protect; the result is clipped to [-1, 1].
	        sr: Sample rate. Unused; present so every entry of ``PROTECT_FN``
	            takes the same leading arguments.
	        snr_db: Target SNR in dB (lower means louder noise).

	    Returns:
	        ``audio`` plus noise whose mean power is the signal's mean power
	        divided by ``10 ** (snr_db / 10)``.
	    """
	    if audio.size == 0:
	        # Nothing to perturb; also avoids a mean-of-empty warning / NaN.
	        return audio.copy()
	    sig_pow = np.mean(audio ** 2)
	    noise_pow = sig_pow / (10 ** (snr_db / 10))
	    # randn() takes the dimensions as separate ints, hence the unpacking.
	    noise = np.random.randn(*audio.shape).astype(np.float32) * np.sqrt(noise_pow)
	    return np.clip(audio + noise, -1.0, 1.0)


	def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
	    """Add a random perturbation shaped by the signal's short-time spectrum.

	    Each Hann-windowed frame is transformed with an FFT; every bin receives a
	    random-phase perturbation whose magnitude is ``strength`` times that
	    bin's magnitude (scaled by a standard-normal draw). The perturbations are
	    overlap-added and mixed into the *unmodified* input, so ``strength == 0``
	    returns the input and the signal level is preserved.

	    Args:
	        audio: Samples to protect. Assumed one-dimensional (``run_protection``
	            down-mixes to mono first). The result is clipped to [-1, 1].
	        sr: Sample rate. Unused; present so every entry of ``PROTECT_FN``
	            takes the same leading arguments.
	        strength: Relative perturbation magnitude per frequency bin.

	    Returns:
	        An array with the same length as ``audio``.
	    """
	    from numpy.fft import irfft, rfft

	    n_fft, hop = 1024, 256
	    n = len(audio)
	    if n == 0:
	        return audio.copy()

	    # Hoisted out of the loop: the window is identical for every frame.
	    window = np.hanning(n_fft).astype(np.float32)
	    # Steady-state sum of the overlapping analysis windows (≈ 2 here). The
	    # summed perturbation is relative to the signal scaled by this gain.
	    gain = float(window.sum()) / hop

	    # Zero-pad so that every input sample, including the head, the tail and
	    # clips shorter than one frame, is covered by the same number of frames.
	    pad = n_fft - hop
	    n_frames = -(-(n + pad) // hop)  # ceil division
	    total = (n_frames - 1) * hop + n_fft
	    padded = np.zeros(total, dtype=np.float32)
	    padded[pad:pad + n] = audio
	    delta = np.zeros(total, dtype=np.float32)

	    for index in range(n_frames):
	        start = index * hop
	        spec = rfft(padded[start:start + n_fft] * window)
	        mag = np.abs(spec)
	        perturb = np.random.randn(*mag.shape).astype(np.float32) * strength * mag
	        phase = np.random.uniform(0, 2 * np.pi, mag.shape)
	        # Only the perturbation goes through the inverse FFT; the signal
	        # itself is never re-synthesised, so it cannot be attenuated.
	        frame_delta = irfft(perturb * np.exp(1j * phase), n_fft)
	        delta[start:start + n_fft] += frame_delta.astype(np.float32)

	    return np.clip(audio + delta[pad:pad + n] / gain, -1.0, 1.0)


	# Dispatch table: protection-method label → implementation. The keys populate
	# the "Protection Method" dropdown and are looked up in run_protection().
	PROTECT_FN = {"GR-Noise": apply_grnoise, "Spectral": apply_spectral}


	# ── plotly figures ────────────────────────────────────────────────────────────

	def make_sim_bar(model_name: str) -> go.Figure:
	    """Build the per-model bar chart of SIM under each protection method.

	    The scores come from ``GALLERY_MODELS[model_name]["sims"]``. Every bar
	    other than "Clean" is annotated with its drop relative to the clean
	    score, and a dotted horizontal line marks the clean baseline.
	    """
	    scores = GALLERY_MODELS[model_name]["sims"]
	    labels = list(scores)
	    values = [scores[label] for label in labels]
	    clean_sim = scores["Clean"]

	    palette = [
	        "#2563eb",  # Clean
	        "#7c3aed",  # SafeSpeech
	        "#059669",  # Enkidu
	        "#ea580c",  # Spectral
	        "#475569",  # GR-Noise
	        "#be123c",  # AntiFake
	    ]

	    # Text drawn above each bar: the value, plus the drop for protected bars.
	    bar_text = []
	    for label, value in scores.items():
	        if label == "Clean":
	            bar_text.append(f"{value:.3f}")
	        else:
	            bar_text.append(f"{value:.3f}<br>↓{clean_sim - value:.3f}")

	    hover_text = []
	    for label, value in zip(labels, values):
	        hover_text.append(
	            f"{label}<br>SIM: {value:.3f}<br>Drop from clean: {clean_sim - value:.3f}"
	        )

	    bars = go.Bar(
	        x=labels,
	        y=values,
	        marker_color=palette,
	        marker_line_color="rgba(15, 23, 42, 0.25)",
	        marker_line_width=1,
	        text=bar_text,
	        textposition="outside",
	        hovertext=hover_text,
	        hoverinfo="text",
	        cliponaxis=False,
	    )
	    # Dotted reference line at the clean score, labelled at its right end.
	    baseline = go.Scatter(
	        x=labels,
	        y=[clean_sim] * len(labels),
	        mode="lines+text",
	        line={"color": "#2563eb", "dash": "dot", "width": 1.5},
	        text=[""] * (len(labels) - 1) + ["Clean baseline"],
	        textposition="top right",
	        textfont={"size": 10, "color": "#2563eb"},
	        hoverinfo="skip",
	        showlegend=False,
	    )

	    y_top = min(0.75, max(values) * 1.28)
	    fig = go.Figure(data=[bars])
	    fig.update_layout(
	        title={
	            "text": f"<b>{model_name}</b> speaker similarity after protection",
	            "font": {"size": 16, "color": "#0f172a"},
	            "x": 0.02,
	        },
	        yaxis={
	            "title": "SIM",
	            "range": [0, y_top],
	            "gridcolor": "#e2e8f0",
	            "zeroline": False,
	        },
	        xaxis={"title": "", "tickfont": {"size": 12}},
	        paper_bgcolor="white",
	        plot_bgcolor="#f8fafc",
	        margin={"t": 62, "b": 42, "l": 48, "r": 24},
	        height=350,
	        showlegend=False,
	        bargap=0.28,
	        font={"color": "#334155"},
	    )
	    fig.add_trace(baseline)
	    return fig


	def make_results_bar(metric: str = "SIM", ascending: bool = False) -> go.Figure:
	    """Build the horizontal ranking chart for one leaderboard metric.

	    Rows of ``LEADERBOARD_ROWS`` whose *metric* is None are left out. With
	    ``ascending=False`` the best value (per ``METRIC_META``) is listed
	    first; ``ascending=True`` flips the order. Bars are coloured on the
	    red → green ramp from ``_col_colors``.
	    """
	    metric_label, higher_is_better = METRIC_META[metric]

	    ranked = sorted(
	        (row for row in LEADERBOARD_ROWS if row.get(metric) is not None),
	        key=lambda row: row[metric],
	        reverse=(higher_is_better ^ ascending),
	    )
	    models, values = [], []
	    for row in ranked:
	        models.append(row["model"])
	        values.append(row[metric])

	    bar_labels = ["—" if v is None else f"{v:.3f}" for v in values]
	    bars = go.Bar(
	        x=values,
	        y=models,
	        orientation="h",
	        marker_color=_col_colors(values, higher_is_better),
	        marker_line_color="#999",
	        marker_line_width=0.5,
	        text=bar_labels,
	        textposition="outside",
	        cliponaxis=False,
	    )

	    fig = go.Figure(data=[bars])
	    fig.update_layout(
	        title={
	            "text": f"<b>Model Ranking by {metric_label}</b>",
	            "font": {"size": 14},
	        },
	        xaxis={"title": metric_label},
	        # Reversed so that the first (top-ranked) row is drawn at the top.
	        yaxis={"autorange": "reversed"},
	        paper_bgcolor="white",
	        plot_bgcolor="#f8f9fa",
	        margin={"t": 50, "b": 40, "l": 120, "r": 80},
	        height=520,
	        showlegend=False,
	    )
	    return fig


	def make_prot_heatmap() -> go.Figure:
	    """Build the heatmap of SIM per model (rows) and protection method (columns).

	    Rows come from ``PROT_ROWS`` sorted by clean SIM, highest first. Missing
	    values are shown as "—". A dotted vertical line separates the "Clean"
	    column from the protected ones.
	    """
	    # (row key, displayed label) for each column, left to right.
	    columns = [
	        ("Clean", "Clean"),
	        ("SafeSpeech", "SafeSpeech"),
	        ("Enkidu", "Enkidu"),
	        ("Spectral", "Spectral"),
	        ("GRNoise", "GR-Noise"),
	        ("AntiFake", "AntiFake"),
	    ]
	    col_keys = [key for key, _ in columns]
	    col_labels = [label for _, label in columns]

	    ordered = sorted(PROT_ROWS, key=lambda row: row["Clean"], reverse=True)
	    model_names = [row["model"] for row in ordered]

	    z = [[row.get(key) for key in col_keys] for row in ordered]
	    text_vals = [
	        ["—" if cell is None else f"{cell:.3f}" for cell in cells]
	        for cells in z
	    ]

	    heatmap = go.Heatmap(
	        z=z,
	        x=col_labels,
	        y=model_names,
	        text=text_vals,
	        texttemplate="%{text}",
	        textfont={"size": 10},
	        colorscale=[
	            [0.0, "#b71c1c"],
	            [0.25, "#ef9a9a"],
	            [0.5, "#fff9c4"],
	            [0.75, "#a5d6a7"],
	            [1.0, "#1b5e20"],
	        ],
	        zmin=0.0,
	        zmax=0.75,
	        colorbar={"title": "SIM", "tickformat": ".2f", "len": 0.8},
	        hoverongaps=False,
	    )
	    fig = go.Figure(data=[heatmap])

	    # Separator line after the Clean column.
	    fig.add_shape(
	        type="line",
	        x0=0.5,
	        x1=0.5,
	        y0=-0.5,
	        y1=len(model_names) - 0.5,
	        line={"color": "#555", "width": 2, "dash": "dot"},
	        xref="x",
	        yref="y",
	    )

	    title_text = (
	        "<b>Protection Robustness — Speaker Similarity (SIM) on LibriTTS</b><br>"
	        "<sup>Green = high SIM (clone faithful). Red = low SIM (protection effective). "
	        "Drop from Clean → protected shows protection strength.</sup>"
	    )
	    fig.update_layout(
	        title={"text": title_text, "font": {"size": 13}},
	        yaxis={"autorange": "reversed"},
	        xaxis={"side": "top"},
	        paper_bgcolor="white",
	        plot_bgcolor="white",
	        margin={"t": 120, "b": 40, "l": 120, "r": 80},
	        height=600,
	    )
	    return fig


	def make_cross_dataset_heatmap() -> go.Figure:
	    """Build the heatmap of clean-prompt SIM per model (rows) and dataset (columns).

	    Columns follow ``CROSS_DATASET_COLS``; rows come from
	    ``CROSS_DATASET_ROWS`` sorted by LibriTTS SIM, highest first. Missing
	    values are shown as "—".
	    """
	    col_keys, col_labels = [], []
	    for key, label in CROSS_DATASET_COLS:
	        col_keys.append(key)
	        col_labels.append(label)

	    ordered = sorted(
	        CROSS_DATASET_ROWS, key=lambda row: row["LibriTTS"], reverse=True
	    )
	    model_names = [row["model"] for row in ordered]

	    z = [[row.get(key) for key in col_keys] for row in ordered]
	    text_vals = [
	        ["—" if cell is None else f"{cell:.3f}" for cell in cells]
	        for cells in z
	    ]

	    heatmap = go.Heatmap(
	        z=z,
	        x=col_labels,
	        y=model_names,
	        text=text_vals,
	        texttemplate="%{text}",
	        textfont={"size": 10},
	        colorscale=[
	            [0.0, "#b71c1c"],
	            [0.25, "#ef9a9a"],
	            [0.5, "#fff9c4"],
	            [0.75, "#a5d6a7"],
	            [1.0, "#1b5e20"],
	        ],
	        zmin=0.0,
	        zmax=0.75,
	        colorbar={"title": "SIM", "tickformat": ".2f", "len": 0.8},
	        hoverongaps=False,
	    )
	    fig = go.Figure(data=[heatmap])

	    title_text = (
	        "<b>Cross-Dataset Generalisation — Speaker Similarity (SIM) on Clean Prompts</b><br>"
	        "<sup>Models sorted by LibriTTS SIM. — = not evaluated. "
	        "Green = high SIM (faithful clone), red = low SIM.</sup>"
	    )
	    fig.update_layout(
	        title={"text": title_text, "font": {"size": 13}},
	        yaxis={"autorange": "reversed"},
	        xaxis={"side": "top"},
	        paper_bgcolor="white",
	        plot_bgcolor="white",
	        margin={"t": 120, "b": 40, "l": 120, "r": 80},
	        height=600,
	    )
	    return fig


	def make_waveform_figure(
	    original: np.ndarray, protected: np.ndarray, sr: int
	) -> go.Figure:
	    """Overlay the original and protected waveforms on one time axis.

	    Only the first ``min(len(original), len(protected), 5 * sr)`` samples
	    are drawn, i.e. at most five seconds of audio.
	    """
	    sample_count = min(len(original), len(protected), sr * 5)  # cap at 5 s
	    seconds = (np.arange(sample_count) / sr).tolist()

	    # (legend name, samples, line colour) for each trace, in drawing order.
	    traces = [
	        ("Original", original[:sample_count].tolist(), "#1565c0"),
	        ("Protected", protected[:sample_count].tolist(), "#c62828"),
	    ]

	    fig = go.Figure()
	    for trace_name, samples, colour in traces:
	        fig.add_trace(go.Scatter(
	            x=seconds,
	            y=samples,
	            name=trace_name,
	            line={"color": colour, "width": 1},
	            opacity=0.85,
	        ))

	    fig.update_layout(
	        title={
	            "text": "<b>Waveform Comparison</b> (first 5 s)",
	            "font": {"size": 13},
	        },
	        xaxis={"title": "Time (s)"},
	        yaxis={"title": "Amplitude", "range": [-1.05, 1.05]},
	        paper_bgcolor="white",
	        plot_bgcolor="#f8f9fa",
	        legend={"orientation": "h", "y": 1.08, "x": 0.5, "xanchor": "center"},
	        margin={"t": 60, "b": 40, "l": 55, "r": 20},
	        height=220,
	    )
	    return fig


	# ── gallery callback ──────────────────────────────────────────────────────────

	def load_gallery(model_name: str):
	    """Gallery callback: return everything the gallery tab shows for one model.

	    Returns a 7-tuple: reference wav path, target wav path, clean-clone path,
	    protected-reference path, protected-clone path, a one-line SIM summary
	    and the per-method SIM bar chart.
	    """
	    entry = GALLERY_MODELS[model_name]
	    scores = entry["sims"]
	    clean_sim = scores["Clean"]
	    prot_sim = scores["SafeSpeech"]
	    drop = clean_sim - prot_sim

	    note_md = "".join([
	        f"Clean SIM: {clean_sim:.3f}  →  ",
	        f"Protected SIM (SafeSpeech): {prot_sim:.3f}  ",
	        f"(drop: {drop:.3f})",
	    ])

	    clean_clone_path = os.path.join(SAMPLES, entry["clean"])
	    # The protected reference clip is the same file for every model.
	    protected_ref_path = os.path.join(SAMPLES, "protected_safespeech.wav")
	    protected_clone_path = os.path.join(SAMPLES, entry["prot"])

	    return (
	        REF_WAV,
	        TARGET_WAV,
	        clean_clone_path,
	        protected_ref_path,
	        protected_clone_path,
	        note_md,
	        make_sim_bar(model_name),
	    )


	# ── live protection callback ──────────────────────────────────────────────────

	def run_protection(audio_input, method: str, strength: float):
	    """Protection callback: apply *method* to the uploaded audio.

	    Args:
	        audio_input: ``(sample_rate, samples)`` from the audio widget, or
	            None when nothing has been uploaded.
	        method: A key of ``PROTECT_FN``.
	        strength: Slider value: the target SNR in dB for "GR-Noise", or a
	            percentage (divided by 100 before use) for any other method.

	    Returns:
	        ``(original_audio, protected_audio, metrics_markdown, waveform_figure)``.
	        When there is nothing to process, the audio and figure slots are None
	        and the markdown slot carries a short message.
	    """
	    if audio_input is None:
	        return None, None, "Upload an audio file first.", None

	    sr_in, data = audio_input
	    data = np.asarray(data)
	    if data.size == 0:
	        # An empty clip would otherwise crash on ``.max()`` below.
	        return None, None, "The uploaded audio is empty.", None

	    audio = data.astype(np.float32)
	    if np.issubdtype(data.dtype, np.integer):
	        # Integer PCM: scale by the dtype's full-scale value (32768 for int16)
	        # rather than guessing from the peak, which fails on very quiet clips.
	        audio /= float(np.iinfo(data.dtype).max) + 1.0
	    elif np.abs(audio).max() > 1.0:
	        # Float samples outside [-1, 1]: keep the original int16-range assumption.
	        audio /= 32768.0
	    if audio.ndim > 1:
	        audio = audio.mean(axis=1)

	    t0 = time.time()
	    fn = PROTECT_FN[method]
	    if method == "GR-Noise":
	        protected = fn(audio, sr_in, snr_db=strength)
	    else:
	        protected = fn(audio, sr_in, strength=strength / 100.0)
	    elapsed = time.time() - t0

	    snr = _snr(audio, protected)
	    prot_int = (protected * 32767).astype(np.int16)

	    # Plain pipes: "\|" is not a valid Python escape and puts literal
	    # backslashes into the text, which stops the Markdown table from rendering.
	    metrics_md = (
	        "| Metric | Value |\n|--------|-------|\n"
	        f"| SNR (dB) | {snr:.1f} |\n"
	        f"| Processing time | {elapsed * 1000:.0f} ms |\n"
	        f"| Method | {method} |\n"
	    )

	    waveform_fig = make_waveform_figure(audio, protected, sr_in)
	    return (sr_in, audio.copy()), (sr_in, prot_int), metrics_md, waveform_fig


	def update_strength_label(method: str) -> dict:
	    """Dropdown callback: reconfigure the strength slider for *method*.

	    "GR-Noise" gets an SNR slider (10–40 dB, default 25); any other method
	    gets a percentage slider (1–30 %, default 5).
	    """
	    if method == "GR-Noise":
	        slider_config = {
	            "label": "Target SNR (dB) — lower = stronger, more audible",
	            "info": "25 dB: nearly imperceptible. 10 dB: noticeable noise.",
	            "minimum": 10,
	            "maximum": 40,
	            "value": 25,
	            "step": 1,
	        }
	    else:
	        slider_config = {
	            "label": "Spectral Strength (%) — higher = stronger perturbation",
	            "info": "5% is nearly inaudible. 20%+ may cause artifacts.",
	            "minimum": 1,
	            "maximum": 30,
	            "value": 5,
	            "step": 1,
	        }
	    return gr.update(**slider_config)


	# ── results callbacks ─────────────────────────────────────────────────────────

	def update_results_bar(metric: str) -> go.Figure:
	    """Dropdown callback: rebuild the ranking chart for the chosen *metric*."""
	    figure = make_results_bar(metric)
	    return figure


	# ── UI constants ──────────────────────────────────────────────────────────────

	# Custom stylesheet handed to gr.Blocks(css=CSS) in build_demo(). The class
	# names defined here (hero, stat-strip, stat-card, section-head, note-box,
	# audio-panel, workflow-copy) are referenced from the markdown constants and
	# from elem_classes in build_demo().
	CSS = """
	footer { display: none !important; }
	.gradio-container {
	max-width: 1180px !important;
	margin: 0 auto !important;
	}
	.hero {
	padding: 28px 28px 22px;
	border-radius: 12px;
	background: linear-gradient(135deg, #0f172a 0%, #164e63 54%, #065f46 100%);
	color: white;
	margin-bottom: 18px;
	}
	.hero h1 {
	margin: 0 0 8px;
	font-size: 2.35rem;
	line-height: 1.08;
	letter-spacing: 0;
	color: white !important;
	}
	.hero p {
	max-width: 760px;
	margin: 0;
	color: #dbeafe;
	font-size: 1.05rem;
	}
	.hero a {
	color: white !important;
	}
	.hero-links {
	display: flex;
	flex-wrap: wrap;
	gap: 8px;
	margin-top: 16px;
	}
	.hero-links a {
	text-decoration: none;
	}
	.stat-strip {
	display: grid;
	grid-template-columns: repeat(4, minmax(0, 1fr));
	gap: 10px;
	margin: 14px 0 18px;
	}
	.stat-card {
	border: 1px solid #d8dee9;
	border-radius: 8px;
	padding: 12px 14px;
	background: #ffffff;
	}
	.stat-card b {
	display: block;
	font-size: 1.35rem;
	color: #0f172a;
	line-height: 1.1;
	}
	.stat-card span {
	color: #475569;
	font-size: 0.9rem;
	}
	.section-head {
	margin: 18px 0 8px;
	color: #0f172a;
	}
	.note-box {
	font-size: 1.02em;
	background: #eef6ff;
	border: 1px solid #bfdbfe;
	border-left: 4px solid #2563eb;
	border-radius: 8px;
	padding: 10px 12px;
	}
	.audio-panel {
	border: 1px solid #e2e8f0;
	border-radius: 8px;
	padding: 12px;
	background: #ffffff;
	}
	.audio-panel h3,
	.audio-panel h4 {
	margin-top: 0;
	}
	.workflow-copy {
	color: #475569;
	margin-bottom: 12px;
	}
	@media (max-width: 760px) {
	.hero {
	padding: 22px 18px 18px;
	}
	.hero h1 {
	font-size: 1.75rem;
	}
	.stat-strip {
	grid-template-columns: repeat(2, minmax(0, 1fr));
	}
	}
	"""

	# Page header rendered at the top of the app by gr.Markdown(INTRO_MD):
	# a hero banner with badge links, followed by a strip of four stat cards.
	INTRO_MD = """
	<div class="hero">
	<h1>RVCBench</h1>
	<p>Voice cloning attacks and audio protection methods, compared through paired listening examples and speaker-similarity results.</p>
	<div class="hero-links">
	<a href="https://arxiv.org/abs/2602.00443"><img alt="Paper" src="https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg"></a>
	<a href="https://huggingface.co/datasets/Nanboy/RVCBench"><img alt="Dataset" src="https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg"></a>
	<a href="https://github.com/Nanboy-Ronan/RVCBench"><img alt="GitHub" src="https://img.shields.io/badge/GitHub-RVCBench-181717.svg"></a>
	</div>
	</div>

	<div class="stat-strip">
	<div class="stat-card"><b>26</b><span>voice cloning models</span></div>
	<div class="stat-card"><b>5</b><span>protection methods</span></div>
	<div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
	<div class="stat-card"><b>10</b><span>speech datasets</span></div>
	</div>
	"""

	# Introductory text shown at the top of the "Voice Cloning Gallery" tab.
	GALLERY_INTRO_MD = """
	<div class="workflow-copy">
	Select a cloning model, compare clean and protected audio, then inspect how much each protection method lowers speaker similarity.
	</div>
	"""

	# Introductory text shown at the top of the "Protect Your Voice" tab; it
	# describes the two methods available in PROTECT_FN.
	PROT_INTRO_MD = """
	Upload your own audio clip and apply a protection method. The protected audio sounds nearly
	identical to humans, but disrupts automatic voice cloning models.

	- GR-Noise — Gaussian random noise at a chosen SNR level. No surrogate model required.
	- Spectral — Structured perturbation in the STFT frequency domain.
	"""

	# Introductory text shown at the top of the "Results Explorer" tab: a short
	# guide to the metrics whose keys appear in METRIC_META.
	RESULTS_INTRO_MD = """
	Metric guide — SIM: speaker cosine similarity ↑  ·
	WER: word error rate ↓  ·  MOS: perceptual quality ↑  ·
	MCD: mel cepstral distortion ↓  ·  RTF: real-time factor ↓  ·
	SVA: speaker verification accuracy ↑  ·  Emo: emotion match rate ↑

	Select a metric to re-rank the 18 models. The heatmap below shows protection robustness
	(SIM under each of 5 protection methods).
	"""


	# ── build demo ────────────────────────────────────────────────────────────────

	def build_demo():
	    """Assemble the four-tab Gradio app and return the ``gr.Blocks`` object.

	    Tabs: Voice Cloning Gallery, Protect Your Voice, Results Explorer, About.
	    Every callback is wired here; nothing is launched.

	    NOTE(review): the nesting of widgets inside rows/columns was
	    reconstructed from a copy of the file that had lost its indentation —
	    confirm the layout against the original.
	    """
	    with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
	        gr.Markdown(INTRO_MD)

	        with gr.Tabs():

	            # ── Tab 1: Voice Cloning Gallery ──────────────────────────────────
	            with gr.Tab("🎧 Voice Cloning Gallery"):
	                gr.Markdown(GALLERY_INTRO_MD)

	                with gr.Row():
	                    model_dd = gr.Dropdown(
	                        choices=list(GALLERY_MODELS.keys()),
	                        value="ZipVoice",
	                        label="Voice Cloning Model",
	                        scale=3,
	                    )
	                    load_btn = gr.Button("Load Example", variant="primary", scale=1)

	                sim_note = gr.Markdown("", elem_classes="note-box")

	                with gr.Row():
	                    with gr.Column(elem_classes="audio-panel"):
	                        gr.Markdown('<h3 class="section-head">1. Reference Voice</h3>')
	                        gr.Markdown(f"\"{REF_TEXT}\"")
	                        ref_out = gr.Audio(label="Reference (original)", interactive=False)
	                    with gr.Column(elem_classes="audio-panel"):
	                        gr.Markdown('<h3 class="section-head">2. Target Speech</h3>')
	                        gr.Markdown(f"\"{TARGET_TEXT}\"")
	                        target_out = gr.Audio(label="Target utterance", interactive=False)

	                gr.Markdown('<h3 class="section-head">3. Cloning Results</h3>')

	                with gr.Row():
	                    with gr.Column(elem_classes="audio-panel"):
	                        gr.Markdown("#### Clean Reference")
	                        clean_out = gr.Audio(label="Clean clone", interactive=False)
	                    with gr.Column(elem_classes="audio-panel"):
	                        gr.Markdown("#### SafeSpeech-Protected Reference")
	                        prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
	                        prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)

	                gr.Markdown('<h3 class="section-head">4. Protection Effectiveness Across Methods</h3>')
	                sim_chart = gr.Plot(label="", show_label=False)

	                # Same order as the tuple returned by load_gallery().
	                gallery_outputs = [ref_out, target_out, clean_out, prot_ref_out,
	                                   prot_clone_out, sim_note, sim_chart]
	                load_btn.click(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
	                # Also populate the gallery once on page load and on every
	                # dropdown change, not only when the button is pressed.
	                demo.load(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
	                model_dd.change(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)

	            # ── Tab 2: Protect Your Voice ─────────────────────────────────────
	            with gr.Tab("🔒 Protect Your Voice"):
	                gr.Markdown(PROT_INTRO_MD)

	                with gr.Row():
	                    audio_in = gr.Audio(
	                        label="Upload your audio (wav / mp3, ≤ 30 s)",
	                        type="numpy", scale=3,
	                    )
	                    with gr.Column(scale=1):
	                        method_dd = gr.Dropdown(
	                            choices=list(PROTECT_FN.keys()),
	                            value="GR-Noise",
	                            label="Protection Method",
	                        )
	                        # Initial slider settings mirror the "GR-Noise" branch
	                        # of update_strength_label().
	                        strength_sl = gr.Slider(
	                            minimum=10, maximum=40, value=25, step=1,
	                            label="Target SNR (dB) — lower = stronger, more audible",
	                            info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
	                        )
	                        protect_btn = gr.Button("Apply Protection", variant="primary")

	                with gr.Row():
	                    orig_out = gr.Audio(label="Original", interactive=False)
	                    prot_live = gr.Audio(label="Protected", interactive=False)

	                metrics_out = gr.Markdown("")
	                waveform_plot = gr.Plot(label="Waveform Comparison", show_label=False)

	                method_dd.change(fn=update_strength_label, inputs=[method_dd],
	                                 outputs=[strength_sl])
	                protect_btn.click(
	                    fn=run_protection,
	                    inputs=[audio_in, method_dd, strength_sl],
	                    outputs=[orig_out, prot_live, metrics_out, waveform_plot],
	                )

	                gr.Markdown(
	                    "> Note: Full voice cloning inference (SafeSpeech, Enkidu, AntiFake) "
	                    "requires surrogate models and is not included in this Space due to compute "
	                    "constraints. See the "
	                    "[GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) for the full pipeline."
	                )

	            # ── Tab 3: Results Explorer ───────────────────────────────────────
	            with gr.Tab("📊 Results Explorer"):
	                gr.Markdown(RESULTS_INTRO_MD)

	                metric_dd = gr.Dropdown(
	                    choices=list(METRIC_META.keys()),
	                    value="SIM",
	                    label="Sort by metric",
	                )
	                bar_chart = gr.Plot(label="", show_label=False)
	                metric_dd.change(fn=update_results_bar, inputs=[metric_dd],
	                                 outputs=[bar_chart])
	                demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])

	                gr.Markdown("---")
	                gr.Markdown(
	                    "### Cross-Dataset Generalisation\n"
	                    "SIM on clean prompts across all 10 benchmark datasets. "
	                    "Models sorted by LibriTTS SIM. — = not evaluated."
	                )
	                cross_heatmap = gr.Plot(label="", show_label=False)
	                demo.load(fn=make_cross_dataset_heatmap, outputs=[cross_heatmap])

	                gr.Markdown("---")
	                gr.Markdown(
	                    "### Protection Robustness Heatmap\n"
	                    "SIM under each of 5 protection methods — drop from Clean indicates "
	                    "more effective protection."
	                )
	                prot_heatmap = gr.Plot(label="", show_label=False)
	                demo.load(fn=make_prot_heatmap, outputs=[prot_heatmap])

	            # ── Tab 4: About ──────────────────────────────────────────────────
	            with gr.Tab("ℹ️ About"):
	                # The table rows use plain pipes: "\|" is not a valid Python
	                # escape and leaves literal backslashes in the text, which
	                # stops the Markdown table from rendering.
	                gr.Markdown("""
	## About RVCBench

	RVCBench is an open-source benchmark for evaluating the robustness of voice cloning
	against audio protection methods.

	### What it measures
	- How well 18+ modern zero-shot TTS/VC models can clone a speaker's voice
	- How effectively 5 audio protection methods (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
	prevent cloning across 10 datasets and 7 evaluation metrics

	### Resources

	| Resource | Link |
	|----------|------|
	| Paper (arXiv) | [arXiv:2602.00443](https://arxiv.org/abs/2602.00443) |
	| Code & full pipeline | [GitHub: Nanboy-Ronan/RVCBench](https://github.com/Nanboy-Ronan/RVCBench) |
	| Dataset | [HuggingFace: Nanboy/RVCBench](https://huggingface.co/datasets/Nanboy/RVCBench) |
	| Contact | ruinanjin@alumni.ubc.ca |

	### Citation

	```bibtex
	@article{liao2026rvcbench,
	title = {RVCBench: Benchmarking the Robustness of Voice Cloning Across Modern Audio Generation Models},
	author = {Liao, Xinting and Jin, Ruinan and Yu, Hanlin and Pandya, Deval and Li, Xiaoxiao},
	journal = {arXiv preprint arXiv:2602.00443},
	year = {2026}
	}
	```
	""")

	    return demo


	# Script entry point: build the app and serve it on all interfaces, port 7860.
	if __name__ == "__main__":
	    app = build_demo()
	    app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)