RVCBench / app.py
Nanboy's picture
Fix RVCBench title color: add color: white !important to .hero h1
665fda4 verified
"""RVCBench — Interactive HuggingFace Space demo (v2).
Tabs
────
1. Voice Cloning Gallery – hear pre-computed clean vs. protected clones
+ protection-effectiveness bar chart for all 5 methods
2. Protect Your Voice – upload audio, apply protection, see waveform comparison
3. Results Explorer – interactive bar chart + protection robustness heatmap
4. About – paper, citation, resources
"""
from __future__ import annotations
import io
import os
import time
import gradio as gr
import numpy as np
import plotly.graph_objects as go
import soundfile as sf
# Best-effort monkeypatch of two helpers in plotly's private validator module.
# The replacements handle numpy scalars/arrays, lists/tuples and narwhals
# Series, and (as their names say) contain no pandas-specific branch.
# NOTE(review): presumably this works around a pandas-related failure in the
# Space environment — confirm against the plotly version pinned for the app.
try:
    import _plotly_utils.basevalidators as _plotly_basevalidators

    def _plotly_to_scalar_or_list_without_pandas(value):
        """Recursively convert *value* to plain scalars / nested lists."""
        # should_load=False: only use numpy if plotly can already see it.
        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
        if np_mod and np_mod.isscalar(value) and hasattr(value, "item"):
            return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
        if isinstance(value, (list, tuple)):
            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
        if np_mod and isinstance(value, np_mod.ndarray):
            if value.ndim == 0:
                # 0-d arrays behave like scalars.
                return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
        if _plotly_basevalidators.is_numpy_convertable(value):
            # Array-like object: materialise it through numpy, then recurse.
            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
            if np_mod:
                return _plotly_to_scalar_or_list_without_pandas(np_mod.array(value))
        return value

    def _plotly_is_homogeneous_array_without_pandas(value):
        """Return True for numpy arrays, narwhals Series and non-scalar array-likes."""
        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
        if np_mod and isinstance(value, np_mod.ndarray):
            return True
        if isinstance(value, _plotly_basevalidators.nw.Series):
            return True
        if _plotly_basevalidators.is_numpy_convertable(value):
            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
            if np_mod:
                # A shape of () means the object converted to a scalar.
                return np_mod.array(value).shape != ()
        return False

    # Install the replacements on the plotly module itself.
    _plotly_basevalidators.to_scalar_or_list = _plotly_to_scalar_or_list_without_pandas
    _plotly_basevalidators.is_homogeneous_array = _plotly_is_homogeneous_array_without_pandas
except Exception:
    # Deliberate best-effort: if the private module is missing or laid out
    # differently, the stock plotly behaviour is left in place.
    pass
# ── paths ────────────────────────────────────────────────────────────────────
# Directory holding the pre-computed gallery audio, resolved relative to this file.
# NOTE(review): "1089" looks like a speaker/sample id — confirm.
SAMPLES = os.path.join(os.path.dirname(__file__), "samples", "1089")
# Reference (voice prompt) and target utterance shown in the gallery tab.
REF_WAV = os.path.join(SAMPLES, "reference.wav")
TARGET_WAV = os.path.join(SAMPLES, "target.wav")
# Transcripts displayed next to the reference and target audio players.
REF_TEXT = ("But her long fair hair was girlish: and girlish, and touched "
            "with the wonder of mortal beauty, her face.")
TARGET_TEXT = "A great fisher of souls!"
# ── gallery models (audio samples available for SafeSpeech protection) ────────
# Per-model gallery entry, keyed by display name:
#   clean - file name (inside SAMPLES) of the clone made from the clean reference
#   prot  - file name of the clone made from the SafeSpeech-protected reference
#   sims  - speaker similarity (SIM) per condition; insertion order is the
#           bar order used by make_sim_bar
GALLERY_MODELS = {
    "ZipVoice": dict(
        clean="zipvoice_clean.wav",
        prot="zipvoice_safespeech.wav",
        sims={"Clean": 0.579, "SafeSpeech": 0.287, "Enkidu": 0.435,
              "Spectral": 0.262, "GR-Noise": 0.258, "AntiFake": 0.543},
    ),
    "MOSS-TTSD": dict(
        clean="moss_ttsd_clean.wav",
        prot="moss_ttsd_safespeech.wav",
        sims={"Clean": 0.492, "SafeSpeech": 0.242, "Enkidu": 0.335,
              "Spectral": 0.216, "GR-Noise": 0.247, "AntiFake": 0.453},
    ),
    "MGM-Omni": dict(
        clean="mgm_omni_clean.wav",
        prot="mgm_omni_safespeech.wav",
        sims={"Clean": 0.539, "SafeSpeech": 0.184, "Enkidu": 0.316,
              "Spectral": 0.166, "GR-Noise": 0.229, "AntiFake": 0.491},
    ),
    "OZSpeech": dict(
        clean="ozspeech_clean.wav",
        prot="ozspeech_safespeech.wav",
        sims={"Clean": 0.388, "SafeSpeech": 0.156, "Enkidu": 0.187,
              "Spectral": 0.147, "GR-Noise": 0.148, "AntiFake": 0.337},
    ),
    "StyleTTS 2": dict(
        clean="styletts2_clean.wav",
        prot="styletts2_safespeech.wav",
        sims={"Clean": 0.228, "SafeSpeech": 0.089, "Enkidu": 0.125,
              "Spectral": 0.081, "GR-Noise": 0.030, "AntiFake": 0.207},
    ),
}
# ── benchmark data (LibriTTS, clean prompts) ─────────────────────────────────
# fmt: off
# One row per cloning model; keys other than "model" match METRIC_META.
# None marks a metric with no value for that model (make_results_bar skips it).
LEADERBOARD_ROWS = [
    dict(model="Qwen3-TTS", SIM=0.614, WER=0.052, MOS=4.39, MCD=5.79, RTF=2.02, SVA=0.974, Emo=0.731),
    dict(model="IndexTTS", SIM=0.606, WER=0.052, MOS=4.06, MCD=6.61, RTF=2.23, SVA=0.972, Emo=0.693),
    dict(model="CosyVoice 2", SIM=0.602, WER=0.175, MOS=4.39, MCD=6.17, RTF=4.58, SVA=0.974, Emo=0.729),
    dict(model="ZipVoice", SIM=0.579, WER=0.053, MOS=4.13, MCD=7.09, RTF=1.46, SVA=0.952, Emo=0.675),
    dict(model="MaskGCT", SIM=0.570, WER=0.088, MOS=3.93, MCD=6.91, RTF=1.36, SVA=0.939, Emo=0.682),
    dict(model="GLM-TTS", SIM=0.570, WER=0.087, MOS=4.08, MCD=6.41, RTF=1.74, SVA=0.951, Emo=0.678),
    dict(model="F5-TTS", SIM=0.559, WER=0.116, MOS=3.99, MCD=6.96, RTF=0.61, SVA=0.937, Emo=0.676),
    dict(model="Higgs Audio", SIM=0.559, WER=0.250, MOS=4.30, MCD=6.06, RTF=1.42, SVA=0.941, Emo=0.717),
    dict(model="MGM-Omni", SIM=0.539, WER=0.095, MOS=4.28, MCD=5.82, RTF=0.84, SVA=0.933, Emo=0.676),
    dict(model="PlayDiffusion",SIM=0.506, WER=0.055, MOS=4.15, MCD=8.06, RTF=0.73, SVA=0.936, Emo=0.681),
    dict(model="MOSS-TTSD", SIM=0.492, WER=0.383, MOS=4.10, MCD=7.09, RTF=None, SVA=0.876, Emo=0.667),
    dict(model="VibeVoice", SIM=0.480, WER=0.228, MOS=3.83, MCD=6.76, RTF=1.86, SVA=0.852, Emo=0.624),
    dict(model="FishSpeech", SIM=0.472, WER=0.166, MOS=4.37, MCD=6.47, RTF=3.61, SVA=0.907, Emo=0.682),
    dict(model="XTTS-v2", SIM=0.454, WER=0.073, MOS=3.81, MCD=8.62, RTF=0.62, SVA=0.908, Emo=0.639),
    dict(model="SparkTTS", SIM=0.408, WER=0.326, MOS=4.06, MCD=5.83, RTF=1.56, SVA=0.764, Emo=0.672),
    dict(model="OZSpeech", SIM=0.388, WER=0.060, MOS=3.21, MCD=6.87, RTF=8.75, SVA=0.840, Emo=0.636),
    dict(model="OpenVoice V2", SIM=0.244, WER=0.075, MOS=4.30, MCD=7.06, RTF=0.08, SVA=0.474, Emo=0.601),
    dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589),
]
# Cross-dataset generalisation — SIM on clean prompts across all 10 datasets
# One row per model; every other key is a dataset key listed in
# CROSS_DATASET_COLS. None is rendered as "—" (not evaluated) in the heatmap.
CROSS_DATASET_ROWS = [
    dict(model="Qwen3-TTS", LibriTTS=0.614, VCTK=0.618, MultiSpk=0.495, Long=0.561, AISHELL=0.721, French=0.536, Bilingual=0.673, BGclean=0.689, BGnoise=0.572, Hallucin=0.515),
    dict(model="IndexTTS", LibriTTS=0.606, VCTK=0.567, MultiSpk=0.473, Long=0.775, AISHELL=0.721, French=0.397, Bilingual=0.673, BGclean=0.589, BGnoise=0.528, Hallucin=0.529),
    dict(model="CosyVoice 2", LibriTTS=0.602, VCTK=0.582, MultiSpk=0.448, Long=0.530, AISHELL=0.717, French=0.378, Bilingual=0.653, BGclean=0.626, BGnoise=0.515, Hallucin=0.518),
    dict(model="ZipVoice", LibriTTS=0.579, VCTK=0.554, MultiSpk=0.531, Long=0.729, AISHELL=0.712, French=0.363, Bilingual=0.322, BGclean=0.625, BGnoise=0.462, Hallucin=0.509),
    dict(model="MaskGCT", LibriTTS=0.570, VCTK=0.555, MultiSpk=0.431, Long=0.194, AISHELL=0.674, French=0.494, Bilingual=None, BGclean=0.610, BGnoise=0.487, Hallucin=0.499),
    dict(model="GLM-TTS", LibriTTS=0.570, VCTK=0.573, MultiSpk=0.445, Long=0.757, AISHELL=0.690, French=0.398, Bilingual=0.657, BGclean=0.622, BGnoise=0.528, Hallucin=0.533),
    dict(model="F5-TTS", LibriTTS=0.559, VCTK=0.537, MultiSpk=0.507, Long=0.607, AISHELL=0.696, French=0.304, Bilingual=0.653, BGclean=0.582, BGnoise=0.414, Hallucin=0.455),
    dict(model="Higgs Audio", LibriTTS=0.559, VCTK=0.516, MultiSpk=0.418, Long=0.520, AISHELL=0.581, French=0.349, Bilingual=0.543, BGclean=0.592, BGnoise=0.421, Hallucin=0.425),
    dict(model="MGM-Omni", LibriTTS=0.539, VCTK=0.447, MultiSpk=0.370, Long=0.442, AISHELL=0.713, French=0.227, Bilingual=0.630, BGclean=0.523, BGnoise=0.332, Hallucin=0.396),
    dict(model="PlayDiffusion",LibriTTS=0.506, VCTK=0.426, MultiSpk=0.360, Long=0.637, AISHELL=0.441, French=0.283, Bilingual=0.465, BGclean=0.433, BGnoise=0.305, Hallucin=0.408),
    dict(model="MOSS-TTSD", LibriTTS=0.492, VCTK=0.440, MultiSpk=0.379, Long=0.644, AISHELL=0.437, French=0.327, Bilingual=0.471, BGclean=0.494, BGnoise=0.488, Hallucin=0.416),
    dict(model="VibeVoice", LibriTTS=0.480, VCTK=0.436, MultiSpk=0.348, Long=0.625, AISHELL=0.564, French=0.343, Bilingual=0.531, BGclean=0.513, BGnoise=0.364, Hallucin=0.408),
    dict(model="FishSpeech", LibriTTS=0.472, VCTK=0.430, MultiSpk=0.383, Long=0.572, AISHELL=0.611, French=0.374, Bilingual=0.566, BGclean=0.495, BGnoise=0.387, Hallucin=0.351),
    dict(model="XTTS-v2", LibriTTS=0.454, VCTK=0.454, MultiSpk=0.328, Long=0.613, AISHELL=0.569, French=0.445, Bilingual=0.506, BGclean=0.546, BGnoise=0.394, Hallucin=0.488),
    dict(model="SparkTTS", LibriTTS=0.408, VCTK=0.532, MultiSpk=0.228, Long=0.345, AISHELL=0.569, French=0.164, Bilingual=0.480, BGclean=0.588, BGnoise=0.332, Hallucin=0.336),
    dict(model="OZSpeech", LibriTTS=0.388, VCTK=0.253, MultiSpk=0.271, Long=None, AISHELL=None, French=0.109, Bilingual=None, BGclean=0.272, BGnoise=0.164, Hallucin=0.281),
    dict(model="OpenVoice V2", LibriTTS=0.244, VCTK=0.392, MultiSpk=0.192, Long=0.278, AISHELL=0.431, French=0.271, Bilingual=0.298, BGclean=0.484, BGnoise=0.358, Hallucin=0.365),
    dict(model="StyleTTS 2", LibriTTS=0.228, VCTK=0.236, MultiSpk=0.162, Long=None, AISHELL=None, French=None, Bilingual=0.213, BGclean=0.196, BGnoise=0.166, Hallucin=0.184),
]
# (row key in CROSS_DATASET_ROWS, column label shown on the heatmap), in
# left-to-right display order.
CROSS_DATASET_COLS = [
    ("LibriTTS", "LibriTTS"),
    ("VCTK", "VCTK"),
    ("MultiSpk", "Multi-spk"),
    ("Long", "Long"),
    ("AISHELL", "AISHELL"),
    ("French", "French"),
    ("Bilingual", "Bilingual"),
    ("BGclean", "BG-clean"),
    ("BGnoise", "BG-noise"),
    ("Hallucin", "Hallucin."),
]
# Protection robustness — SIM under each method (LibriTTS, all 18 models)
# One row per model: SIM for the clean prompt and under each protection method.
# The key "GRNoise" is displayed as "GR-Noise"; None is rendered as "—".
PROT_ROWS = [
    dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
    dict(model="IndexTTS", Clean=0.606, SafeSpeech=0.346, Enkidu=0.475, Spectral=0.318, GRNoise=0.392, AntiFake=0.572),
    dict(model="CosyVoice 2", Clean=0.602, SafeSpeech=0.321, Enkidu=0.447, Spectral=0.301, GRNoise=0.384, AntiFake=0.549),
    dict(model="ZipVoice", Clean=0.579, SafeSpeech=0.287, Enkidu=0.435, Spectral=0.262, GRNoise=0.258, AntiFake=0.543),
    dict(model="MaskGCT", Clean=0.570, SafeSpeech=0.303, Enkidu=0.407, Spectral=0.281, GRNoise=0.312, AntiFake=0.530),
    dict(model="GLM-TTS", Clean=0.570, SafeSpeech=0.330, Enkidu=0.445, Spectral=0.311, GRNoise=0.388, AntiFake=0.532),
    dict(model="F5-TTS", Clean=0.559, SafeSpeech=0.207, Enkidu=0.431, Spectral=0.176, GRNoise=0.137, AntiFake=0.520),
    dict(model="Higgs Audio", Clean=0.559, SafeSpeech=0.264, Enkidu=0.435, Spectral=0.236, GRNoise=0.272, AntiFake=0.521),
    dict(model="MGM-Omni", Clean=0.539, SafeSpeech=0.184, Enkidu=0.316, Spectral=0.166, GRNoise=0.229, AntiFake=0.491),
    dict(model="PlayDiffusion",Clean=0.506, SafeSpeech=0.173, Enkidu=None, Spectral=0.149, GRNoise=0.162, AntiFake=0.466),
    dict(model="MOSS-TTSD", Clean=0.492, SafeSpeech=0.242, Enkidu=0.335, Spectral=0.216, GRNoise=0.247, AntiFake=0.453),
    dict(model="VibeVoice", Clean=0.480, SafeSpeech=0.272, Enkidu=0.367, Spectral=0.253, GRNoise=0.280, AntiFake=0.442),
    dict(model="FishSpeech", Clean=0.472, SafeSpeech=0.238, Enkidu=0.334, Spectral=0.212, GRNoise=0.235, AntiFake=0.439),
    dict(model="XTTS-v2", Clean=0.454, SafeSpeech=0.260, Enkidu=0.308, Spectral=0.241, GRNoise=0.237, AntiFake=0.414),
    dict(model="SparkTTS", Clean=0.408, SafeSpeech=0.129, Enkidu=0.137, Spectral=0.108, GRNoise=0.062, AntiFake=0.359),
    dict(model="OZSpeech", Clean=0.388, SafeSpeech=0.156, Enkidu=0.187, Spectral=0.147, GRNoise=0.148, AntiFake=0.337),
    dict(model="OpenVoice V2", Clean=0.244, SafeSpeech=0.185, Enkidu=0.188, Spectral=0.180, GRNoise=0.175, AntiFake=0.236),
    dict(model="StyleTTS 2", Clean=0.228, SafeSpeech=0.089, Enkidu=0.125, Spectral=0.081, GRNoise=0.030, AntiFake=0.207),
]
# fmt: on
# Metric key -> (axis/title label, higher_is_better flag). The flag drives both
# the sort direction and the colour ramp in make_results_bar.
METRIC_META = {
    "SIM": ("Speaker Similarity ↑", True),
    "WER": ("Word Error Rate ↓", False),
    "MOS": ("MOS Score ↑", True),
    "MCD": ("Mel Cepstral Dist. ↓", False),
    "RTF": ("Real-Time Factor ↓", False),
    "SVA": ("Speaker Verif. Acc. ↑",True),
    "Emo": ("Emotion Match Rate ↑", True),
}
# ── colour helpers ────────────────────────────────────────────────────────────
# Anchor colours of the three-stop ramp used to shade ranked bars.
_GOOD = (200, 230, 201)  # #c8e6c9 light green
_MID = (255, 249, 196)  # #fff9c4 light yellow
_BAD = (255, 205, 210)  # #ffcdd2 light red


def _interp_color(t: float) -> str:
    """Map a score in [0, 1] onto the red → yellow → green ramp.

    ``t=0`` yields the "bad" colour, ``t=0.5`` the middle colour and ``t=1``
    the "good" colour; values in between are linearly blended per channel
    and truncated to integers.
    """
    if t <= 0.5:
        start, end, frac = _BAD, _MID, t / 0.5
    else:
        start, end, frac = _MID, _GOOD, (t - 0.5) / 0.5
    r, g, b = (int(lo + frac * (hi - lo)) for lo, hi in zip(start, end))
    return f"rgb({r},{g},{b})"


def _col_colors(values: list, higher_is_better: bool) -> list[str]:
    """Return one CSS colour per entry of *values*, min-max scaled.

    ``None`` entries, and every entry when there is no spread to scale
    against (no valid values, or all valid values equal), get a neutral grey.
    When *higher_is_better* is false the ramp is flipped so that the smallest
    value is shown as "good".
    """
    neutral = "rgb(245,245,245)"
    present = [v for v in values if v is not None]
    if not present:
        return [neutral] * len(values)
    low, high = min(present), max(present)
    if high == low:
        return [neutral] * len(values)

    def shade(value):
        if value is None:
            return neutral
        frac = (value - low) / (high - low)
        if not higher_is_better:
            frac = 1 - frac
        return _interp_color(frac)

    return [shade(v) for v in values]
# ── audio helpers ─────────────────────────────────────────────────────────────
def _load(path: str) -> tuple[np.ndarray, int]:
    """Read an audio file as float32 and return ``(mono_samples, sample_rate)``.

    Multi-channel files are down-mixed by averaging over axis 1.
    """
    samples, rate = sf.read(path, dtype="float32")
    mono = samples.mean(axis=1) if samples.ndim > 1 else samples
    return mono, rate
def _snr(original: np.ndarray, protected: np.ndarray) -> float:
    """Return the signal-to-noise ratio in dB of *protected* against *original*.

    The noise is the sample-wise difference between the two signals. When its
    mean power is below 1e-12 the signals are treated as identical and the
    result is ``inf``.
    """
    signal_power = np.mean(original ** 2)
    noise_power = np.mean((protected - original) ** 2)
    if noise_power < 1e-12:
        return float("inf")
    return float(10 * np.log10(signal_power / noise_power))
# ── protection functions ──────────────────────────────────────────────────────
def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
    """Add white Gaussian noise to *audio* at the requested SNR.

    Args:
        audio: Waveform samples.
        sr: Sample rate; accepted for a uniform protection-function signature
            but not used here.
        snr_db: Target signal-to-noise ratio in dB (lower means more noise).

    Returns:
        The noisy waveform, clipped to [-1, 1].
    """
    signal_power = np.mean(audio ** 2)
    # Noise power that yields the requested ratio: P_noise = P_signal / 10^(dB/10).
    noise_power = signal_power / (10 ** (snr_db / 10))
    gaussian = np.random.randn(*audio.shape).astype(np.float32)
    noisy = audio + gaussian * np.sqrt(noise_power)
    return np.clip(noisy, -1.0, 1.0)
def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
    """Perturb *audio* in the short-time Fourier domain.

    Each Hann-windowed frame gets a random perturbation whose magnitude is
    proportional to ``strength`` times the frame's own spectrum magnitude and
    whose phase is uniform in [0, 2π). Frames are overlap-added and divided by
    the summed analysis window, so ``strength=0`` reproduces the input.

    Args:
        audio: 1-D waveform samples.
        sr: Sample rate; accepted for a uniform protection-function signature
            but not used here.
        strength: Relative perturbation size (0.05 means 5 % of the magnitude).

    Returns:
        The perturbed waveform with the same length as *audio*, clipped to
        [-1, 1]. Floating-point inputs keep their dtype; anything else is
        returned as float32.
    """
    from numpy.fft import rfft, irfft

    n_fft, hop = 1024, 256
    n = len(audio)
    out_dtype = audio.dtype if np.issubdtype(audio.dtype, np.floating) else np.float32
    if n == 0:
        return audio.astype(out_dtype)

    # Built once; the window is identical for every frame.
    window = np.hanning(n_fft)

    # Zero-pad both ends so that every real sample is covered by the full set
    # of overlapping frames. Without this, clips shorter than one frame and
    # the tail of longer clips would receive no frame at all and come back
    # as silence.
    pad = n_fft - hop
    n_frames = -(-(n + 2 * pad - n_fft) // hop) + 1  # ceil division
    total = (n_frames - 1) * hop + n_fft
    padded = np.zeros(total, dtype=np.float64)
    padded[pad:pad + n] = audio

    out = np.zeros(total, dtype=np.float64)
    window_sum = np.zeros(total, dtype=np.float64)
    for start in range(0, total - n_fft + 1, hop):
        stop = start + n_fft
        spec = rfft(padded[start:stop] * window)
        mag = np.abs(spec)
        perturb = np.random.randn(*mag.shape) * strength * mag
        phase = np.exp(1j * np.random.uniform(0, 2 * np.pi, mag.shape))
        out[start:stop] += irfft(spec + perturb * phase, n=n_fft)
        window_sum[start:stop] += window

    # Normalise by the accumulated window, not by the number of frames:
    # dividing by the frame count would leave the Hann weighting in the
    # signal and attenuate the output to roughly half amplitude.
    body = slice(pad, pad + n)
    restored = out[body] / window_sum[body]
    return np.clip(restored, -1.0, 1.0).astype(out_dtype)
# Protection methods that can run live in the demo, keyed by the name shown
# in the "Protection Method" dropdown.
PROTECT_FN = {"GR-Noise": apply_grnoise, "Spectral": apply_spectral}
# ── plotly figures ────────────────────────────────────────────────────────────
def make_sim_bar(model_name: str) -> go.Figure:
    """Build the per-model bar chart of SIM under each protection method.

    Bars follow the order of the model's ``sims`` mapping in GALLERY_MODELS.
    Every bar except "Clean" is annotated with its drop from the clean score,
    and a dotted horizontal line marks the clean baseline.
    """
    sims = GALLERY_MODELS[model_name]["sims"]
    labels = list(sims)
    values = [sims[label] for label in labels]
    baseline = sims["Clean"]

    # One colour per bar, in the same order as the sims mapping.
    palette = [
        "#2563eb",  # Clean
        "#7c3aed",  # SafeSpeech
        "#059669",  # Enkidu
        "#ea580c",  # Spectral
        "#475569",  # GR-Noise
        "#be123c",  # AntiFake
    ]

    bar_text = []
    hover = []
    for label, value in zip(labels, values):
        drop = baseline - value
        if label == "Clean":
            bar_text.append(f"{value:.3f}")
        else:
            bar_text.append(f"{value:.3f}<br>↓{drop:.3f}")
        hover.append(f"{label}<br>SIM: {value:.3f}<br>Drop from clean: {drop:.3f}")

    bars = go.Bar(
        x=labels,
        y=values,
        marker_color=palette,
        marker_line_color="rgba(15, 23, 42, 0.25)",
        marker_line_width=1,
        text=bar_text,
        textposition="outside",
        hovertext=hover,
        hoverinfo="text",
        cliponaxis=False,
    )
    # Dotted reference line at the clean score, labelled on its last point only.
    baseline_line = go.Scatter(
        x=labels,
        y=[baseline] * len(labels),
        mode="lines+text",
        line=dict(color="#2563eb", dash="dot", width=1.5),
        text=[""] * (len(labels) - 1) + ["Clean baseline"],
        textposition="top right",
        textfont=dict(size=10, color="#2563eb"),
        hoverinfo="skip",
        showlegend=False,
    )

    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(
            text=f"<b>{model_name}</b> speaker similarity after protection",
            font=dict(size=16, color="#0f172a"),
            x=0.02,
        ),
        yaxis=dict(
            title="SIM",
            # Headroom above the tallest bar for the outside labels, capped at 0.75.
            range=[0, min(0.75, max(values) * 1.28)],
            gridcolor="#e2e8f0",
            zeroline=False,
        ),
        xaxis=dict(title="", tickfont=dict(size=12)),
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        margin=dict(t=62, b=42, l=48, r=24),
        height=350,
        showlegend=False,
        bargap=0.28,
        font=dict(color="#334155"),
    )
    fig.add_trace(baseline_line)
    return fig
def make_results_bar(metric: str = "SIM", ascending: bool = False) -> go.Figure:
    """Build the horizontal leaderboard bar chart for one metric.

    Models without a value for *metric* are left out. By default the best
    model is listed first (largest value for "higher is better" metrics,
    smallest otherwise); ``ascending=True`` flips that order.
    """
    metric_label, higher_is_better = METRIC_META[metric]

    ranked = [row for row in LEADERBOARD_ROWS if row.get(metric) is not None]
    ranked.sort(key=lambda row: row[metric], reverse=(higher_is_better ^ ascending))

    names = []
    scores = []
    for row in ranked:
        names.append(row["model"])
        scores.append(row[metric])

    bars = go.Bar(
        x=scores,
        y=names,
        orientation="h",
        marker_color=_col_colors(scores, higher_is_better),
        marker_line_color="#999",
        marker_line_width=0.5,
        text=["—" if score is None else f"{score:.3f}" for score in scores],
        textposition="outside",
        cliponaxis=False,
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(text=f"<b>Model Ranking by {metric_label}</b>",
                   font=dict(size=14)),
        xaxis=dict(title=metric_label),
        # Reversed so the first (best-ranked) entry is drawn at the top.
        yaxis=dict(autorange="reversed"),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        margin=dict(t=50, b=40, l=120, r=80),
        height=520,
        showlegend=False,
    )
    return fig
def make_prot_heatmap() -> go.Figure:
    """Build the model × protection-method heatmap of SIM from PROT_ROWS.

    Rows are ordered by clean SIM, highest first. Missing cells are left
    empty in the colour grid and labelled "—". A dotted vertical line
    separates the "Clean" column from the protected columns.
    """
    # (row key, displayed column label)
    columns = [
        ("Clean", "Clean"),
        ("SafeSpeech", "SafeSpeech"),
        ("Enkidu", "Enkidu"),
        ("Spectral", "Spectral"),
        ("GRNoise", "GR-Noise"),
        ("AntiFake", "AntiFake"),
    ]
    ranked = sorted(PROT_ROWS, key=lambda row: row["Clean"], reverse=True)
    names = [row["model"] for row in ranked]

    grid = [[row.get(key) for key, _ in columns] for row in ranked]
    cell_text = [
        ["—" if cell is None else f"{cell:.3f}" for cell in line]
        for line in grid
    ]

    heatmap = go.Heatmap(
        z=grid,
        x=[label for _, label in columns],
        y=names,
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=[
            [0.0, "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5, "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0, "#1b5e20"],
        ],
        zmin=0.0,
        zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heatmap)

    # Separator between the first column (Clean) and the protection methods.
    fig.add_shape(
        type="line",
        x0=0.5,
        x1=0.5,
        y0=-0.5,
        y1=len(names) - 0.5,
        line=dict(color="#555", width=2, dash="dot"),
        xref="x",
        yref="y",
    )
    fig.update_layout(
        title=dict(
            text="<b>Protection Robustness — Speaker Similarity (SIM) on LibriTTS</b><br>"
                 "<sup>Green = high SIM (clone faithful). Red = low SIM (protection effective). "
                 "Drop from Clean → protected shows protection strength.</sup>",
            font=dict(size=13),
        ),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white",
        plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
def make_cross_dataset_heatmap() -> go.Figure:
    """Build the model × dataset heatmap of clean-prompt SIM.

    Columns follow CROSS_DATASET_COLS and rows are ordered by LibriTTS SIM,
    highest first. Cells without a value are left empty in the colour grid
    and labelled "—".
    """
    ranked = sorted(CROSS_DATASET_ROWS, key=lambda row: row["LibriTTS"], reverse=True)
    names = [row["model"] for row in ranked]

    grid = [[row.get(key) for key, _ in CROSS_DATASET_COLS] for row in ranked]
    cell_text = [
        ["—" if cell is None else f"{cell:.3f}" for cell in line]
        for line in grid
    ]

    heatmap = go.Heatmap(
        z=grid,
        x=[label for _, label in CROSS_DATASET_COLS],
        y=names,
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=[
            [0.0, "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5, "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0, "#1b5e20"],
        ],
        zmin=0.0,
        zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heatmap)
    fig.update_layout(
        title=dict(
            text="<b>Cross-Dataset Generalisation — Speaker Similarity (SIM) on Clean Prompts</b><br>"
                 "<sup>Models sorted by LibriTTS SIM. — = not evaluated. "
                 "Green = high SIM (faithful clone), red = low SIM.</sup>",
            font=dict(size=13),
        ),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white",
        plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
def make_waveform_figure(
    original: np.ndarray, protected: np.ndarray, sr: int
) -> go.Figure:
    """Plot the original and protected waveforms on a shared time axis.

    Only the samples both signals have in common are drawn, and at most the
    first five seconds (``sr * 5`` samples).
    """
    limit = min(len(original), len(protected), sr * 5)  # cap at 5 s
    seconds = (np.arange(limit) / sr).tolist()

    # (legend name, samples, line colour), drawn in this order.
    layers = [
        ("Original", original, "#1565c0"),
        ("Protected", protected, "#c62828"),
    ]
    fig = go.Figure()
    for name, samples, colour in layers:
        fig.add_trace(go.Scatter(
            x=seconds,
            y=samples[:limit].tolist(),
            name=name,
            line=dict(color=colour, width=1),
            opacity=0.85,
        ))

    fig.update_layout(
        title=dict(text="<b>Waveform Comparison</b> (first 5 s)",
                   font=dict(size=13)),
        xaxis=dict(title="Time (s)"),
        yaxis=dict(title="Amplitude", range=[-1.05, 1.05]),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        legend=dict(orientation="h", y=1.08, x=0.5, xanchor="center"),
        margin=dict(t=60, b=40, l=55, r=20),
        height=220,
    )
    return fig
# ── gallery callback ──────────────────────────────────────────────────────────
def load_gallery(model_name: str):
    """Return everything the gallery tab shows for *model_name*.

    The tuple is ordered as: reference audio path, target audio path, clean
    clone path, SafeSpeech-protected reference path, clone-from-protected
    path, a Markdown note comparing clean and SafeSpeech SIM, and the
    per-method SIM bar chart.
    """
    entry = GALLERY_MODELS[model_name]
    sims = entry["sims"]
    clean_sim, prot_sim = sims["Clean"], sims["SafeSpeech"]

    note_parts = [
        f"**Clean SIM:** {clean_sim:.3f} &nbsp;→&nbsp; ",
        f"**Protected SIM (SafeSpeech):** {prot_sim:.3f} &nbsp;",
        f"*(drop: {clean_sim - prot_sim:.3f})*",
    ]

    def sample(file_name):
        return os.path.join(SAMPLES, file_name)

    return (
        REF_WAV,
        TARGET_WAV,
        sample(entry["clean"]),
        sample("protected_safespeech.wav"),
        sample(entry["prot"]),
        "".join(note_parts),
        make_sim_bar(model_name),
    )
# ── live protection callback ──────────────────────────────────────────────────
def _to_float_mono(data: np.ndarray) -> np.ndarray:
    """Convert uploaded samples to mono float32, scaling integer PCM to [-1, 1)."""
    if np.issubdtype(data.dtype, np.signedinteger):
        # Scale by the dtype's full range so int16 and int32 PCM both land in
        # [-1, 1), instead of guessing the format from the peak value.
        audio = data.astype(np.float32) / float(np.iinfo(data.dtype).max + 1)
    else:
        audio = data.astype(np.float32)
        # Legacy heuristic kept for non-integer input that still carries
        # 16-bit PCM magnitudes; uses the absolute peak so that negative-only
        # peaks are detected too.
        if np.abs(audio).max() > 1.0:
            audio /= 32768.0
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    return audio


def run_protection(audio_input, method: str, strength: float):
    """Apply the selected protection to an uploaded clip (Gradio callback).

    Args:
        audio_input: ``(sample_rate, samples)`` as delivered by a
            ``gr.Audio(type="numpy")`` component, or ``None`` if nothing was
            uploaded.
        method: Key of ``PROTECT_FN`` ("GR-Noise" or "Spectral").
        strength: Slider value. For "GR-Noise" it is the target SNR in dB;
            for any other method it is a percentage and is divided by 100.

    Returns:
        ``(original_audio, protected_audio, metrics_markdown, waveform_figure)``.
        Both audio values are ``(sample_rate, int16 samples)``. When there is
        nothing to process, the audio and figure slots are ``None`` and the
        Markdown slot holds a short message.
    """
    if audio_input is None:
        return None, None, "Upload an audio file first.", None

    sr_in, data = audio_input
    data = np.asarray(data)
    if data.size == 0:
        # Nothing to analyse; the peak/SNR computations below need samples.
        return None, None, "The uploaded audio is empty.", None

    audio = _to_float_mono(data)

    t0 = time.perf_counter()
    fn = PROTECT_FN[method]
    if method == "GR-Noise":
        protected = fn(audio, sr_in, snr_db=strength)
    else:
        protected = fn(audio, sr_in, strength=strength / 100.0)
    elapsed = time.perf_counter() - t0

    snr = _snr(audio, protected)

    # Return both clips as int16 with the same scaling. Gradio rescales
    # floating-point audio to full scale on output, which would make the
    # original play louder than the protected clip and skew the comparison.
    orig_int = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
    prot_int = (protected * 32767).astype(np.int16)

    metrics_md = (
        f"| Metric | Value |\n|--------|-------|\n"
        f"| SNR (dB) | {snr:.1f} |\n"
        f"| Processing time | {elapsed * 1000:.0f} ms |\n"
        f"| Method | {method} |\n"
    )
    waveform_fig = make_waveform_figure(audio, protected, sr_in)
    return (sr_in, orig_int), (sr_in, prot_int), metrics_md, waveform_fig
def update_strength_label(method: str) -> dict:
    """Return the slider update matching the selected protection method.

    "GR-Noise" configures the slider as a target SNR in dB; any other method
    configures it as a spectral strength percentage.
    """
    if method == "GR-Noise":
        slider = dict(
            label="Target SNR (dB) — lower = stronger, more audible",
            info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
            minimum=10,
            maximum=40,
            value=25,
            step=1,
        )
    else:
        slider = dict(
            label="Spectral Strength (%) — higher = stronger perturbation",
            info="5% is nearly inaudible. 20%+ may cause artifacts.",
            minimum=1,
            maximum=30,
            value=5,
            step=1,
        )
    return gr.update(**slider)
# ── results callbacks ─────────────────────────────────────────────────────────
def update_results_bar(metric: str) -> go.Figure:
    """Dropdown callback: redraw the leaderboard chart for *metric*."""
    figure = make_results_bar(metric)
    return figure
# ── UI constants ──────────────────────────────────────────────────────────────
# Stylesheet passed to gr.Blocks(css=...). It hides the Gradio footer, limits
# the page width, and styles the classes used by INTRO_MD (.hero, .hero-links,
# .stat-strip, .stat-card) and by components created in build_demo
# (.section-head, .note-box, .audio-panel, .workflow-copy).
CSS = """
footer { display: none !important; }
.gradio-container {
max-width: 1180px !important;
margin: 0 auto !important;
}
.hero {
padding: 28px 28px 22px;
border-radius: 12px;
background: linear-gradient(135deg, #0f172a 0%, #164e63 54%, #065f46 100%);
color: white;
margin-bottom: 18px;
}
.hero h1 {
margin: 0 0 8px;
font-size: 2.35rem;
line-height: 1.08;
letter-spacing: 0;
color: white !important;
}
.hero p {
max-width: 760px;
margin: 0;
color: #dbeafe;
font-size: 1.05rem;
}
.hero a {
color: white !important;
}
.hero-links {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 16px;
}
.hero-links a {
text-decoration: none;
}
.stat-strip {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 10px;
margin: 14px 0 18px;
}
.stat-card {
border: 1px solid #d8dee9;
border-radius: 8px;
padding: 12px 14px;
background: #ffffff;
}
.stat-card b {
display: block;
font-size: 1.35rem;
color: #0f172a;
line-height: 1.1;
}
.stat-card span {
color: #475569;
font-size: 0.9rem;
}
.section-head {
margin: 18px 0 8px;
color: #0f172a;
}
.note-box {
font-size: 1.02em;
background: #eef6ff;
border: 1px solid #bfdbfe;
border-left: 4px solid #2563eb;
border-radius: 8px;
padding: 10px 12px;
}
.audio-panel {
border: 1px solid #e2e8f0;
border-radius: 8px;
padding: 12px;
background: #ffffff;
}
.audio-panel h3,
.audio-panel h4 {
margin-top: 0;
}
.workflow-copy {
color: #475569;
margin-bottom: 12px;
}
@media (max-width: 760px) {
.hero {
padding: 22px 18px 18px;
}
.hero h1 {
font-size: 1.75rem;
}
.stat-strip {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
}
"""
# Page header: hero banner with badge links, followed by the headline stat cards.
INTRO_MD = """
<div class="hero">
<h1>RVCBench</h1>
<p>Voice cloning attacks and audio protection methods, compared through paired listening examples and speaker-similarity results.</p>
<div class="hero-links">
<a href="https://arxiv.org/abs/2602.00443"><img alt="Paper" src="https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg"></a>
<a href="https://huggingface.co/datasets/Nanboy/RVCBench"><img alt="Dataset" src="https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg"></a>
<a href="https://github.com/Nanboy-Ronan/RVCBench"><img alt="GitHub" src="https://img.shields.io/badge/GitHub-RVCBench-181717.svg"></a>
</div>
</div>
<div class="stat-strip">
<div class="stat-card"><b>26</b><span>voice cloning models</span></div>
<div class="stat-card"><b>5</b><span>protection methods</span></div>
<div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
<div class="stat-card"><b>10</b><span>speech datasets</span></div>
</div>
"""
# Short instruction shown at the top of the gallery tab.
GALLERY_INTRO_MD = """
<div class="workflow-copy">
Select a cloning model, compare clean and protected audio, then inspect how much each protection method lowers speaker similarity.
</div>
"""
# Introduction for the live-protection tab, describing the two available methods.
PROT_INTRO_MD = """
Upload your own audio clip and apply a protection method. The protected audio sounds nearly
identical to humans, but disrupts automatic voice cloning models.
- **GR-Noise** — Gaussian random noise at a chosen SNR level. No surrogate model required.
- **Spectral** — Structured perturbation in the STFT frequency domain.
"""
# Metric legend and usage hint for the results tab.
RESULTS_INTRO_MD = """
**Metric guide** — SIM: speaker cosine similarity ↑ &nbsp;·&nbsp;
WER: word error rate ↓ &nbsp;·&nbsp; MOS: perceptual quality ↑ &nbsp;·&nbsp;
MCD: mel cepstral distortion ↓ &nbsp;·&nbsp; RTF: real-time factor ↓ &nbsp;·&nbsp;
SVA: speaker verification accuracy ↑ &nbsp;·&nbsp; Emo: emotion match rate ↑
Select a metric to re-rank the 18 models. The heatmap below shows protection robustness
(SIM under each of 5 protection methods).
"""
# ── build demo ────────────────────────────────────────────────────────────────
def build_demo():
    """Assemble the four-tab Gradio interface and return the ``gr.Blocks`` app.

    Components are created in display order inside nested layout contexts,
    and each tab wires its own callbacks right after its components exist.
    The app is returned without being launched.
    """
    with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
        gr.Markdown(INTRO_MD)
        with gr.Tabs():
            # ── Tab 1: Voice Cloning Gallery ──────────────────────────────────
            with gr.Tab("🎧 Voice Cloning Gallery"):
                gr.Markdown(GALLERY_INTRO_MD)
                with gr.Row():
                    model_dd = gr.Dropdown(
                        choices=list(GALLERY_MODELS.keys()),
                        value="ZipVoice",
                        label="Voice Cloning Model",
                        scale=3,
                    )
                    load_btn = gr.Button("Load Example", variant="primary", scale=1)
                sim_note = gr.Markdown("", elem_classes="note-box")
                with gr.Row():
                    with gr.Column(elem_classes="audio-panel"):
                        gr.Markdown('<h3 class="section-head">1. Reference Voice</h3>')
                        gr.Markdown(f"*\"{REF_TEXT}\"*")
                        ref_out = gr.Audio(label="Reference (original)", interactive=False)
                    with gr.Column(elem_classes="audio-panel"):
                        gr.Markdown('<h3 class="section-head">2. Target Speech</h3>')
                        gr.Markdown(f"*\"{TARGET_TEXT}\"*")
                        target_out = gr.Audio(label="Target utterance", interactive=False)
                gr.Markdown('<h3 class="section-head">3. Cloning Results</h3>')
                with gr.Row():
                    with gr.Column(elem_classes="audio-panel"):
                        gr.Markdown("#### Clean Reference")
                        clean_out = gr.Audio(label="Clean clone", interactive=False)
                    with gr.Column(elem_classes="audio-panel"):
                        gr.Markdown("#### SafeSpeech-Protected Reference")
                        prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
                        prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)
                gr.Markdown('<h3 class="section-head">4. Protection Effectiveness Across Methods</h3>')
                sim_chart = gr.Plot(label="", show_label=False)
                # Must stay in the same order as the tuple returned by load_gallery.
                gallery_outputs = [ref_out, target_out, clean_out, prot_ref_out,
                                   prot_clone_out, sim_note, sim_chart]
                # Refresh on button click, on page load, and on model change.
                load_btn.click(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
                demo.load(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
                model_dd.change(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
            # ── Tab 2: Protect Your Voice ─────────────────────────────────────
            with gr.Tab("🔒 Protect Your Voice"):
                gr.Markdown(PROT_INTRO_MD)
                with gr.Row():
                    audio_in = gr.Audio(
                        label="Upload your audio (wav / mp3, ≤ 30 s)",
                        type="numpy", scale=3,
                    )
                    with gr.Column(scale=1):
                        method_dd = gr.Dropdown(
                            choices=list(PROTECT_FN.keys()),
                            value="GR-Noise",
                            label="Protection Method",
                        )
                        # Initial slider settings match the GR-Noise branch of
                        # update_strength_label.
                        strength_sl = gr.Slider(
                            minimum=10, maximum=40, value=25, step=1,
                            label="Target SNR (dB) — lower = stronger, more audible",
                            info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
                        )
                        protect_btn = gr.Button("Apply Protection", variant="primary")
                with gr.Row():
                    orig_out = gr.Audio(label="Original", interactive=False)
                    prot_live = gr.Audio(label="Protected", interactive=False)
                metrics_out = gr.Markdown("")
                waveform_plot = gr.Plot(label="Waveform Comparison", show_label=False)
                # Re-label and re-range the slider whenever the method changes.
                method_dd.change(fn=update_strength_label, inputs=[method_dd],
                                 outputs=[strength_sl])
                protect_btn.click(
                    fn=run_protection,
                    inputs=[audio_in, method_dd, strength_sl],
                    outputs=[orig_out, prot_live, metrics_out, waveform_plot],
                )
                gr.Markdown(
                    "> **Note:** Full voice cloning inference (SafeSpeech, Enkidu, AntiFake) "
                    "requires surrogate models and is not included in this Space due to compute "
                    "constraints. See the "
                    "[GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) for the full pipeline."
                )
            # ── Tab 3: Results Explorer ───────────────────────────────────────
            with gr.Tab("📊 Results Explorer"):
                gr.Markdown(RESULTS_INTRO_MD)
                metric_dd = gr.Dropdown(
                    choices=list(METRIC_META.keys()),
                    value="SIM",
                    label="Sort by metric",
                )
                bar_chart = gr.Plot(label="", show_label=False)
                metric_dd.change(fn=update_results_bar, inputs=[metric_dd],
                                 outputs=[bar_chart])
                # Draw the default (SIM) ranking when the page loads.
                demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])
                gr.Markdown("---")
                gr.Markdown(
                    "### Cross-Dataset Generalisation\n"
                    "SIM on clean prompts across all 10 benchmark datasets. "
                    "Models sorted by LibriTTS SIM. — = not evaluated."
                )
                cross_heatmap = gr.Plot(label="", show_label=False)
                demo.load(fn=make_cross_dataset_heatmap, outputs=[cross_heatmap])
                gr.Markdown("---")
                gr.Markdown(
                    "### Protection Robustness Heatmap\n"
                    "SIM under each of 5 protection methods — drop from **Clean** indicates "
                    "more effective protection."
                )
                prot_heatmap = gr.Plot(label="", show_label=False)
                demo.load(fn=make_prot_heatmap, outputs=[prot_heatmap])
            # ── Tab 4: About ──────────────────────────────────────────────────
            with gr.Tab("ℹ️ About"):
                gr.Markdown("""
## About RVCBench
**RVCBench** is an open-source benchmark for evaluating the robustness of voice cloning
against audio protection methods.
### What it measures
- How well **18+ modern zero-shot TTS/VC models** can clone a speaker's voice
- How effectively **5 audio protection methods** (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
prevent cloning across **10 datasets** and **7 evaluation metrics**
### Resources
| Resource | Link |
|----------|------|
| Paper (arXiv) | [arXiv:2602.00443](https://arxiv.org/abs/2602.00443) |
| Code & full pipeline | [GitHub: Nanboy-Ronan/RVCBench](https://github.com/Nanboy-Ronan/RVCBench) |
| Dataset | [HuggingFace: Nanboy/RVCBench](https://huggingface.co/datasets/Nanboy/RVCBench) |
| Contact | ruinanjin@alumni.ubc.ca |
### Citation
```bibtex
@article{liao2026rvcbench,
title = {RVCBench: Benchmarking the Robustness of Voice Cloning Across Modern Audio Generation Models},
author = {Liao, Xinting and Jin, Ruinan and Yu, Hanlin and Pandya, Deval and Li, Xiaoxiao},
journal = {arXiv preprint arXiv:2602.00443},
year = {2026}
}
```
""")
    return demo
# Script entry point: serve on all interfaces at port 7860 with the API page hidden.
if __name__ == "__main__":
    build_demo().launch(server_name="0.0.0.0", server_port=7860, show_api=False)