"""RVCBench — Interactive HuggingFace Space demo (v2). Tabs ──── 1. Voice Cloning Gallery – hear pre-computed clean vs. protected clones + protection-effectiveness bar chart for all 5 methods 2. Protect Your Voice – upload audio, apply protection, see waveform comparison 3. Results Explorer – interactive bar chart + protection robustness heatmap 4. About – paper, citation, resources """ from __future__ import annotations import io import os import time import gradio as gr import numpy as np import plotly.graph_objects as go import soundfile as sf try: import _plotly_utils.basevalidators as _plotly_basevalidators def _plotly_to_scalar_or_list_without_pandas(value): np_mod = _plotly_basevalidators.get_module("numpy", should_load=False) if np_mod and np_mod.isscalar(value) and hasattr(value, "item"): return _plotly_basevalidators.to_non_numpy_type(np_mod, value) if isinstance(value, (list, tuple)): return [_plotly_to_scalar_or_list_without_pandas(item) for item in value] if np_mod and isinstance(value, np_mod.ndarray): if value.ndim == 0: return _plotly_basevalidators.to_non_numpy_type(np_mod, value) return [_plotly_to_scalar_or_list_without_pandas(item) for item in value] if _plotly_basevalidators.is_numpy_convertable(value): np_mod = _plotly_basevalidators.get_module("numpy", should_load=True) if np_mod: return _plotly_to_scalar_or_list_without_pandas(np_mod.array(value)) return value def _plotly_is_homogeneous_array_without_pandas(value): np_mod = _plotly_basevalidators.get_module("numpy", should_load=False) if np_mod and isinstance(value, np_mod.ndarray): return True if isinstance(value, _plotly_basevalidators.nw.Series): return True if _plotly_basevalidators.is_numpy_convertable(value): np_mod = _plotly_basevalidators.get_module("numpy", should_load=True) if np_mod: return np_mod.array(value).shape != () return False _plotly_basevalidators.to_scalar_or_list = _plotly_to_scalar_or_list_without_pandas _plotly_basevalidators.is_homogeneous_array = _plotly_is_homogeneous_array_without_pandas except Exception: pass # ── paths ──────────────────────────────────────────────────────────────────── SAMPLES = os.path.join(os.path.dirname(__file__), "samples", "1089") REF_WAV = os.path.join(SAMPLES, "reference.wav") TARGET_WAV = os.path.join(SAMPLES, "target.wav") REF_TEXT = ("But her long fair hair was girlish: and girlish, and touched " "with the wonder of mortal beauty, her face.") TARGET_TEXT = "A great fisher of souls!" # ── gallery models (audio samples available for SafeSpeech protection) ──────── GALLERY_MODELS = { "ZipVoice": dict( clean="zipvoice_clean.wav", prot="zipvoice_safespeech.wav", sims={"Clean": 0.579, "SafeSpeech": 0.287, "Enkidu": 0.435, "Spectral": 0.262, "GR-Noise": 0.258, "AntiFake": 0.543}, ), "MOSS-TTSD": dict( clean="moss_ttsd_clean.wav", prot="moss_ttsd_safespeech.wav", sims={"Clean": 0.492, "SafeSpeech": 0.242, "Enkidu": 0.335, "Spectral": 0.216, "GR-Noise": 0.247, "AntiFake": 0.453}, ), "MGM-Omni": dict( clean="mgm_omni_clean.wav", prot="mgm_omni_safespeech.wav", sims={"Clean": 0.539, "SafeSpeech": 0.184, "Enkidu": 0.316, "Spectral": 0.166, "GR-Noise": 0.229, "AntiFake": 0.491}, ), "OZSpeech": dict( clean="ozspeech_clean.wav", prot="ozspeech_safespeech.wav", sims={"Clean": 0.388, "SafeSpeech": 0.156, "Enkidu": 0.187, "Spectral": 0.147, "GR-Noise": 0.148, "AntiFake": 0.337}, ), "StyleTTS 2": dict( clean="styletts2_clean.wav", prot="styletts2_safespeech.wav", sims={"Clean": 0.228, "SafeSpeech": 0.089, "Enkidu": 0.125, "Spectral": 0.081, "GR-Noise": 0.030, "AntiFake": 0.207}, ), } # ── benchmark data (LibriTTS, clean prompts) ───────────────────────────────── # fmt: off LEADERBOARD_ROWS = [ dict(model="Qwen3-TTS", SIM=0.614, WER=0.052, MOS=4.39, MCD=5.79, RTF=2.02, SVA=0.974, Emo=0.731), dict(model="IndexTTS", SIM=0.606, WER=0.052, MOS=4.06, MCD=6.61, RTF=2.23, SVA=0.972, Emo=0.693), dict(model="CosyVoice 2", SIM=0.602, WER=0.175, MOS=4.39, MCD=6.17, RTF=4.58, SVA=0.974, Emo=0.729), dict(model="ZipVoice", SIM=0.579, WER=0.053, MOS=4.13, MCD=7.09, RTF=1.46, SVA=0.952, Emo=0.675), dict(model="MaskGCT", SIM=0.570, WER=0.088, MOS=3.93, MCD=6.91, RTF=1.36, SVA=0.939, Emo=0.682), dict(model="GLM-TTS", SIM=0.570, WER=0.087, MOS=4.08, MCD=6.41, RTF=1.74, SVA=0.951, Emo=0.678), dict(model="F5-TTS", SIM=0.559, WER=0.116, MOS=3.99, MCD=6.96, RTF=0.61, SVA=0.937, Emo=0.676), dict(model="Higgs Audio", SIM=0.559, WER=0.250, MOS=4.30, MCD=6.06, RTF=1.42, SVA=0.941, Emo=0.717), dict(model="MGM-Omni", SIM=0.539, WER=0.095, MOS=4.28, MCD=5.82, RTF=0.84, SVA=0.933, Emo=0.676), dict(model="PlayDiffusion",SIM=0.506, WER=0.055, MOS=4.15, MCD=8.06, RTF=0.73, SVA=0.936, Emo=0.681), dict(model="MOSS-TTSD", SIM=0.492, WER=0.383, MOS=4.10, MCD=7.09, RTF=None, SVA=0.876, Emo=0.667), dict(model="VibeVoice", SIM=0.480, WER=0.228, MOS=3.83, MCD=6.76, RTF=1.86, SVA=0.852, Emo=0.624), dict(model="FishSpeech", SIM=0.472, WER=0.166, MOS=4.37, MCD=6.47, RTF=3.61, SVA=0.907, Emo=0.682), dict(model="XTTS-v2", SIM=0.454, WER=0.073, MOS=3.81, MCD=8.62, RTF=0.62, SVA=0.908, Emo=0.639), dict(model="SparkTTS", SIM=0.408, WER=0.326, MOS=4.06, MCD=5.83, RTF=1.56, SVA=0.764, Emo=0.672), dict(model="OZSpeech", SIM=0.388, WER=0.060, MOS=3.21, MCD=6.87, RTF=8.75, SVA=0.840, Emo=0.636), dict(model="OpenVoice V2", SIM=0.244, WER=0.075, MOS=4.30, MCD=7.06, RTF=0.08, SVA=0.474, Emo=0.601), dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589), ] # Cross-dataset generalisation — SIM on clean prompts across all 10 datasets CROSS_DATASET_ROWS = [ dict(model="Qwen3-TTS", LibriTTS=0.614, VCTK=0.618, MultiSpk=0.495, Long=0.561, AISHELL=0.721, French=0.536, Bilingual=0.673, BGclean=0.689, BGnoise=0.572, Hallucin=0.515), dict(model="IndexTTS", LibriTTS=0.606, VCTK=0.567, MultiSpk=0.473, Long=0.775, AISHELL=0.721, French=0.397, Bilingual=0.673, BGclean=0.589, BGnoise=0.528, Hallucin=0.529), dict(model="CosyVoice 2", LibriTTS=0.602, VCTK=0.582, MultiSpk=0.448, Long=0.530, AISHELL=0.717, French=0.378, Bilingual=0.653, BGclean=0.626, BGnoise=0.515, Hallucin=0.518), dict(model="ZipVoice", LibriTTS=0.579, VCTK=0.554, MultiSpk=0.531, Long=0.729, AISHELL=0.712, French=0.363, Bilingual=0.322, BGclean=0.625, BGnoise=0.462, Hallucin=0.509), dict(model="MaskGCT", LibriTTS=0.570, VCTK=0.555, MultiSpk=0.431, Long=0.194, AISHELL=0.674, French=0.494, Bilingual=None, BGclean=0.610, BGnoise=0.487, Hallucin=0.499), dict(model="GLM-TTS", LibriTTS=0.570, VCTK=0.573, MultiSpk=0.445, Long=0.757, AISHELL=0.690, French=0.398, Bilingual=0.657, BGclean=0.622, BGnoise=0.528, Hallucin=0.533), dict(model="F5-TTS", LibriTTS=0.559, VCTK=0.537, MultiSpk=0.507, Long=0.607, AISHELL=0.696, French=0.304, Bilingual=0.653, BGclean=0.582, BGnoise=0.414, Hallucin=0.455), dict(model="Higgs Audio", LibriTTS=0.559, VCTK=0.516, MultiSpk=0.418, Long=0.520, AISHELL=0.581, French=0.349, Bilingual=0.543, BGclean=0.592, BGnoise=0.421, Hallucin=0.425), dict(model="MGM-Omni", LibriTTS=0.539, VCTK=0.447, MultiSpk=0.370, Long=0.442, AISHELL=0.713, French=0.227, Bilingual=0.630, BGclean=0.523, BGnoise=0.332, Hallucin=0.396), dict(model="PlayDiffusion",LibriTTS=0.506, VCTK=0.426, MultiSpk=0.360, Long=0.637, AISHELL=0.441, French=0.283, Bilingual=0.465, BGclean=0.433, BGnoise=0.305, Hallucin=0.408), dict(model="MOSS-TTSD", LibriTTS=0.492, VCTK=0.440, MultiSpk=0.379, Long=0.644, AISHELL=0.437, French=0.327, Bilingual=0.471, BGclean=0.494, BGnoise=0.488, Hallucin=0.416), dict(model="VibeVoice", LibriTTS=0.480, VCTK=0.436, MultiSpk=0.348, Long=0.625, AISHELL=0.564, French=0.343, Bilingual=0.531, BGclean=0.513, BGnoise=0.364, Hallucin=0.408), dict(model="FishSpeech", LibriTTS=0.472, VCTK=0.430, MultiSpk=0.383, Long=0.572, AISHELL=0.611, French=0.374, Bilingual=0.566, BGclean=0.495, BGnoise=0.387, Hallucin=0.351), dict(model="XTTS-v2", LibriTTS=0.454, VCTK=0.454, MultiSpk=0.328, Long=0.613, AISHELL=0.569, French=0.445, Bilingual=0.506, BGclean=0.546, BGnoise=0.394, Hallucin=0.488), dict(model="SparkTTS", LibriTTS=0.408, VCTK=0.532, MultiSpk=0.228, Long=0.345, AISHELL=0.569, French=0.164, Bilingual=0.480, BGclean=0.588, BGnoise=0.332, Hallucin=0.336), dict(model="OZSpeech", LibriTTS=0.388, VCTK=0.253, MultiSpk=0.271, Long=None, AISHELL=None, French=0.109, Bilingual=None, BGclean=0.272, BGnoise=0.164, Hallucin=0.281), dict(model="OpenVoice V2", LibriTTS=0.244, VCTK=0.392, MultiSpk=0.192, Long=0.278, AISHELL=0.431, French=0.271, Bilingual=0.298, BGclean=0.484, BGnoise=0.358, Hallucin=0.365), dict(model="StyleTTS 2", LibriTTS=0.228, VCTK=0.236, MultiSpk=0.162, Long=None, AISHELL=None, French=None, Bilingual=0.213, BGclean=0.196, BGnoise=0.166, Hallucin=0.184), ] CROSS_DATASET_COLS = [ ("LibriTTS", "LibriTTS"), ("VCTK", "VCTK"), ("MultiSpk", "Multi-spk"), ("Long", "Long"), ("AISHELL", "AISHELL"), ("French", "French"), ("Bilingual", "Bilingual"), ("BGclean", "BG-clean"), ("BGnoise", "BG-noise"), ("Hallucin", "Hallucin."), ] # Protection robustness — SIM under each method (LibriTTS, all 18 models) PROT_ROWS = [ dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582), dict(model="IndexTTS", Clean=0.606, SafeSpeech=0.346, Enkidu=0.475, Spectral=0.318, GRNoise=0.392, AntiFake=0.572), dict(model="CosyVoice 2", Clean=0.602, SafeSpeech=0.321, Enkidu=0.447, Spectral=0.301, GRNoise=0.384, AntiFake=0.549), dict(model="ZipVoice", Clean=0.579, SafeSpeech=0.287, Enkidu=0.435, Spectral=0.262, GRNoise=0.258, AntiFake=0.543), dict(model="MaskGCT", Clean=0.570, SafeSpeech=0.303, Enkidu=0.407, Spectral=0.281, GRNoise=0.312, AntiFake=0.530), dict(model="GLM-TTS", Clean=0.570, SafeSpeech=0.330, Enkidu=0.445, Spectral=0.311, GRNoise=0.388, AntiFake=0.532), dict(model="F5-TTS", Clean=0.559, SafeSpeech=0.207, Enkidu=0.431, Spectral=0.176, GRNoise=0.137, AntiFake=0.520), dict(model="Higgs Audio", Clean=0.559, SafeSpeech=0.264, Enkidu=0.435, Spectral=0.236, GRNoise=0.272, AntiFake=0.521), dict(model="MGM-Omni", Clean=0.539, SafeSpeech=0.184, Enkidu=0.316, Spectral=0.166, GRNoise=0.229, AntiFake=0.491), dict(model="PlayDiffusion",Clean=0.506, SafeSpeech=0.173, Enkidu=None, Spectral=0.149, GRNoise=0.162, AntiFake=0.466), dict(model="MOSS-TTSD", Clean=0.492, SafeSpeech=0.242, Enkidu=0.335, Spectral=0.216, GRNoise=0.247, AntiFake=0.453), dict(model="VibeVoice", Clean=0.480, SafeSpeech=0.272, Enkidu=0.367, Spectral=0.253, GRNoise=0.280, AntiFake=0.442), dict(model="FishSpeech", Clean=0.472, SafeSpeech=0.238, Enkidu=0.334, Spectral=0.212, GRNoise=0.235, AntiFake=0.439), dict(model="XTTS-v2", Clean=0.454, SafeSpeech=0.260, Enkidu=0.308, Spectral=0.241, GRNoise=0.237, AntiFake=0.414), dict(model="SparkTTS", Clean=0.408, SafeSpeech=0.129, Enkidu=0.137, Spectral=0.108, GRNoise=0.062, AntiFake=0.359), dict(model="OZSpeech", Clean=0.388, SafeSpeech=0.156, Enkidu=0.187, Spectral=0.147, GRNoise=0.148, AntiFake=0.337), dict(model="OpenVoice V2", Clean=0.244, SafeSpeech=0.185, Enkidu=0.188, Spectral=0.180, GRNoise=0.175, AntiFake=0.236), dict(model="StyleTTS 2", Clean=0.228, SafeSpeech=0.089, Enkidu=0.125, Spectral=0.081, GRNoise=0.030, AntiFake=0.207), ] # fmt: on METRIC_META = { "SIM": ("Speaker Similarity ↑", True), "WER": ("Word Error Rate ↓", False), "MOS": ("MOS Score ↑", True), "MCD": ("Mel Cepstral Dist. ↓", False), "RTF": ("Real-Time Factor ↓", False), "SVA": ("Speaker Verif. Acc. ↑",True), "Emo": ("Emotion Match Rate ↑", True), } # ── colour helpers ──────────────────────────────────────────────────────────── _GOOD = (200, 230, 201) # #c8e6c9 light green _MID = (255, 249, 196) # #fff9c4 light yellow _BAD = (255, 205, 210) # #ffcdd2 light red def _interp_color(t: float) -> str: """t=0 → bad (red), t=1 → good (green), t=0.5 → yellow.""" if t <= 0.5: s = t / 0.5 r = int(_BAD[0] + s * (_MID[0] - _BAD[0])) g = int(_BAD[1] + s * (_MID[1] - _BAD[1])) b = int(_BAD[2] + s * (_MID[2] - _BAD[2])) else: s = (t - 0.5) / 0.5 r = int(_MID[0] + s * (_GOOD[0] - _MID[0])) g = int(_MID[1] + s * (_GOOD[1] - _MID[1])) b = int(_MID[2] + s * (_GOOD[2] - _MID[2])) return f"rgb({r},{g},{b})" def _col_colors(values: list, higher_is_better: bool) -> list[str]: valid = [v for v in values if v is not None] if not valid or max(valid) == min(valid): return ["rgb(245,245,245)"] * len(values) vmin, vmax = min(valid), max(valid) colors = [] for v in values: if v is None: colors.append("rgb(245,245,245)") else: t = (v - vmin) / (vmax - vmin) if not higher_is_better: t = 1 - t colors.append(_interp_color(t)) return colors # ── audio helpers ───────────────────────────────────────────────────────────── def _load(path: str) -> tuple[np.ndarray, int]: audio, sr = sf.read(path, dtype="float32") if audio.ndim > 1: audio = audio.mean(axis=1) return audio, sr def _snr(original: np.ndarray, protected: np.ndarray) -> float: noise = protected - original sp = np.mean(original ** 2) np_ = np.mean(noise ** 2) return float("inf") if np_ < 1e-12 else float(10 * np.log10(sp / np_)) # ── protection functions ────────────────────────────────────────────────────── def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray: sig_pow = np.mean(audio ** 2) noise_pow = sig_pow / (10 ** (snr_db / 10)) noise = np.random.randn(*audio.shape).astype(np.float32) * np.sqrt(noise_pow) return np.clip(audio + noise, -1.0, 1.0) def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray: from numpy.fft import rfft, irfft n_fft, hop = 1024, 256 out = np.zeros_like(audio) cnt = np.zeros_like(audio) for start in range(0, len(audio) - n_fft, hop): frame = audio[start:start + n_fft] * np.hanning(n_fft).astype(np.float32) spec = rfft(frame) mag = np.abs(spec) perturb = np.random.randn(*mag.shape).astype(np.float32) * strength * mag spec_p = spec + perturb * np.exp(1j * np.random.uniform(0, 2 * np.pi, mag.shape)) f = irfft(spec_p)[:n_fft].astype(np.float32) out[start:start + n_fft] += f cnt[start:start + n_fft] += 1 cnt = np.maximum(cnt, 1) return np.clip(out / cnt, -1.0, 1.0) PROTECT_FN = {"GR-Noise": apply_grnoise, "Spectral": apply_spectral} # ── plotly figures ──────────────────────────────────────────────────────────── def make_sim_bar(model_name: str) -> go.Figure: """Bar chart: SIM under each protection method for one gallery model.""" info = GALLERY_MODELS[model_name] sims = info["sims"] labels = list(sims.keys()) values = list(sims.values()) bar_colors = [ "#2563eb", # Clean "#7c3aed", # SafeSpeech "#059669", # Enkidu "#ea580c", # Spectral "#475569", # GR-Noise "#be123c", # AntiFake ] # annotate drop vs clean clean_sim = sims["Clean"] text = [f"{v:.3f}" if k == "Clean" else f"{v:.3f}
↓{clean_sim - v:.3f}" for k, v in sims.items()] hover_text = [ f"{label}
SIM: {value:.3f}
Drop from clean: {clean_sim - value:.3f}" for label, value in zip(labels, values) ] fig = go.Figure(go.Bar( x=labels, y=values, marker_color=bar_colors, marker_line_color="rgba(15, 23, 42, 0.25)", marker_line_width=1, text=text, textposition="outside", hovertext=hover_text, hoverinfo="text", cliponaxis=False, )) fig.update_layout( title=dict( text=f"{model_name} speaker similarity after protection", font=dict(size=16, color="#0f172a"), x=0.02, ), yaxis=dict( title="SIM", range=[0, min(0.75, max(values) * 1.28)], gridcolor="#e2e8f0", zeroline=False, ), xaxis=dict(title="", tickfont=dict(size=12)), paper_bgcolor="white", plot_bgcolor="#f8fafc", margin=dict(t=62, b=42, l=48, r=24), height=350, showlegend=False, bargap=0.28, font=dict(color="#334155"), ) fig.add_trace(go.Scatter( x=labels, y=[clean_sim] * len(labels), mode="lines+text", line=dict(color="#2563eb", dash="dot", width=1.5), text=[""] * (len(labels) - 1) + ["Clean baseline"], textposition="top right", textfont=dict(size=10, color="#2563eb"), hoverinfo="skip", showlegend=False, )) return fig def make_results_bar(metric: str = "SIM", ascending: bool = False) -> go.Figure: """Horizontal bar chart of all 18 models sorted by the chosen metric.""" higher_is_better = METRIC_META[metric][1] metric_label = METRIC_META[metric][0] rows = [r for r in LEADERBOARD_ROWS if r.get(metric) is not None] rows = sorted(rows, key=lambda r: r[metric], reverse=(higher_is_better ^ ascending)) models = [r["model"] for r in rows] values = [r[metric] for r in rows] colors = _col_colors(values, higher_is_better) text = [f"{v:.3f}" if v is not None else "—" for v in values] fig = go.Figure(go.Bar( x=values, y=models, orientation="h", marker_color=colors, marker_line_color="#999", marker_line_width=0.5, text=text, textposition="outside", cliponaxis=False, )) fig.update_layout( title=dict(text=f"Model Ranking by {metric_label}", font=dict(size=14)), xaxis=dict(title=metric_label), yaxis=dict(autorange="reversed"), paper_bgcolor="white", plot_bgcolor="#f8f9fa", margin=dict(t=50, b=40, l=120, r=80), height=520, showlegend=False, ) return fig def make_prot_heatmap() -> go.Figure: """Heatmap: SIM under each protection method for all 18 models.""" col_order = ["Clean", "SafeSpeech", "Enkidu", "Spectral", "GRNoise", "AntiFake"] col_labels = ["Clean", "SafeSpeech", "Enkidu", "Spectral", "GR-Noise", "AntiFake"] # sort models by Clean SIM descending rows = sorted(PROT_ROWS, key=lambda r: r["Clean"], reverse=True) model_names = [r["model"] for r in rows] z: list[list] = [] text_vals: list[list[str]] = [] for r in rows: row_z, row_t = [], [] for col in col_order: v = r.get(col) row_z.append(v) row_t.append(f"{v:.3f}" if v is not None else "—") z.append(row_z) text_vals.append(row_t) fig = go.Figure(go.Heatmap( z=z, x=col_labels, y=model_names, text=text_vals, texttemplate="%{text}", textfont=dict(size=10), colorscale=[ [0.0, "#b71c1c"], [0.25, "#ef9a9a"], [0.5, "#fff9c4"], [0.75, "#a5d6a7"], [1.0, "#1b5e20"], ], zmin=0.0, zmax=0.75, colorbar=dict(title="SIM", tickformat=".2f", len=0.8), hoverongaps=False, )) # separator line after Clean column fig.add_shape(type="line", x0=0.5, x1=0.5, y0=-0.5, y1=len(model_names) - 0.5, line=dict(color="#555", width=2, dash="dot"), xref="x", yref="y") fig.update_layout( title=dict( text="Protection Robustness — Speaker Similarity (SIM) on LibriTTS
" "Green = high SIM (clone faithful). Red = low SIM (protection effective). " "Drop from Clean → protected shows protection strength.", font=dict(size=13), ), yaxis=dict(autorange="reversed"), xaxis=dict(side="top"), paper_bgcolor="white", plot_bgcolor="white", margin=dict(t=120, b=40, l=120, r=80), height=600, ) return fig def make_cross_dataset_heatmap() -> go.Figure: """Heatmap: SIM on clean prompts across all 10 datasets for all 18 models.""" col_keys = [k for k, _ in CROSS_DATASET_COLS] col_labels = [label for _, label in CROSS_DATASET_COLS] rows = sorted(CROSS_DATASET_ROWS, key=lambda r: r["LibriTTS"], reverse=True) model_names = [r["model"] for r in rows] z: list[list] = [] text_vals: list[list[str]] = [] for r in rows: row_z, row_t = [], [] for key in col_keys: v = r.get(key) row_z.append(v) row_t.append(f"{v:.3f}" if v is not None else "—") z.append(row_z) text_vals.append(row_t) fig = go.Figure(go.Heatmap( z=z, x=col_labels, y=model_names, text=text_vals, texttemplate="%{text}", textfont=dict(size=10), colorscale=[ [0.0, "#b71c1c"], [0.25, "#ef9a9a"], [0.5, "#fff9c4"], [0.75, "#a5d6a7"], [1.0, "#1b5e20"], ], zmin=0.0, zmax=0.75, colorbar=dict(title="SIM", tickformat=".2f", len=0.8), hoverongaps=False, )) fig.update_layout( title=dict( text="Cross-Dataset Generalisation — Speaker Similarity (SIM) on Clean Prompts
" "Models sorted by LibriTTS SIM. — = not evaluated. " "Green = high SIM (faithful clone), red = low SIM.", font=dict(size=13), ), yaxis=dict(autorange="reversed"), xaxis=dict(side="top"), paper_bgcolor="white", plot_bgcolor="white", margin=dict(t=120, b=40, l=120, r=80), height=600, ) return fig def make_waveform_figure( original: np.ndarray, protected: np.ndarray, sr: int ) -> go.Figure: """Overlay waveform plot: original vs. protected audio.""" n = min(len(original), len(protected), sr * 5) # cap at 5 s t = (np.arange(n) / sr).tolist() original_wave = original[:n].tolist() protected_wave = protected[:n].tolist() fig = go.Figure() fig.add_trace(go.Scatter( x=t, y=original_wave, name="Original", line=dict(color="#1565c0", width=1), opacity=0.85, )) fig.add_trace(go.Scatter( x=t, y=protected_wave, name="Protected", line=dict(color="#c62828", width=1), opacity=0.85, )) fig.update_layout( title=dict(text="Waveform Comparison (first 5 s)", font=dict(size=13)), xaxis=dict(title="Time (s)"), yaxis=dict(title="Amplitude", range=[-1.05, 1.05]), paper_bgcolor="white", plot_bgcolor="#f8f9fa", legend=dict(orientation="h", y=1.08, x=0.5, xanchor="center"), margin=dict(t=60, b=40, l=55, r=20), height=220, ) return fig # ── gallery callback ────────────────────────────────────────────────────────── def load_gallery(model_name: str): info = GALLERY_MODELS[model_name] clean_sim = info["sims"]["Clean"] prot_sim = info["sims"]["SafeSpeech"] drop = clean_sim - prot_sim note_md = ( f"**Clean SIM:** {clean_sim:.3f}  →  " f"**Protected SIM (SafeSpeech):** {prot_sim:.3f}  " f"*(drop: {drop:.3f})*" ) return ( REF_WAV, TARGET_WAV, os.path.join(SAMPLES, info["clean"]), os.path.join(SAMPLES, "protected_safespeech.wav"), os.path.join(SAMPLES, info["prot"]), note_md, make_sim_bar(model_name), ) # ── live protection callback ────────────────────────────────────────────────── def run_protection(audio_input, method: str, strength: float): if audio_input is None: return None, None, "Upload an audio file first.", None sr_in, data = audio_input audio = data.astype(np.float32) if audio.max() > 1.0: audio /= 32768.0 if audio.ndim > 1: audio = audio.mean(axis=1) t0 = time.time() fn = PROTECT_FN[method] if method == "GR-Noise": protected = fn(audio, sr_in, snr_db=strength) else: protected = fn(audio, sr_in, strength=strength / 100.0) elapsed = time.time() - t0 snr = _snr(audio, protected) prot_int = (protected * 32767).astype(np.int16) metrics_md = ( f"| Metric | Value |\n|--------|-------|\n" f"| SNR (dB) | {snr:.1f} |\n" f"| Processing time | {elapsed * 1000:.0f} ms |\n" f"| Method | {method} |\n" ) waveform_fig = make_waveform_figure(audio, protected, sr_in) return (sr_in, audio.copy()), (sr_in, prot_int), metrics_md, waveform_fig def update_strength_label(method: str) -> dict: if method == "GR-Noise": return gr.update( label="Target SNR (dB) — lower = stronger, more audible", info="25 dB: nearly imperceptible. 10 dB: noticeable noise.", minimum=10, maximum=40, value=25, step=1, ) else: return gr.update( label="Spectral Strength (%) — higher = stronger perturbation", info="5% is nearly inaudible. 20%+ may cause artifacts.", minimum=1, maximum=30, value=5, step=1, ) # ── results callbacks ───────────────────────────────────────────────────────── def update_results_bar(metric: str) -> go.Figure: return make_results_bar(metric) # ── UI constants ────────────────────────────────────────────────────────────── CSS = """ footer { display: none !important; } .gradio-container { max-width: 1180px !important; margin: 0 auto !important; } .hero { padding: 28px 28px 22px; border-radius: 12px; background: linear-gradient(135deg, #0f172a 0%, #164e63 54%, #065f46 100%); color: white; margin-bottom: 18px; } .hero h1 { margin: 0 0 8px; font-size: 2.35rem; line-height: 1.08; letter-spacing: 0; color: white !important; } .hero p { max-width: 760px; margin: 0; color: #dbeafe; font-size: 1.05rem; } .hero a { color: white !important; } .hero-links { display: flex; flex-wrap: wrap; gap: 8px; margin-top: 16px; } .hero-links a { text-decoration: none; } .stat-strip { display: grid; grid-template-columns: repeat(4, minmax(0, 1fr)); gap: 10px; margin: 14px 0 18px; } .stat-card { border: 1px solid #d8dee9; border-radius: 8px; padding: 12px 14px; background: #ffffff; } .stat-card b { display: block; font-size: 1.35rem; color: #0f172a; line-height: 1.1; } .stat-card span { color: #475569; font-size: 0.9rem; } .section-head { margin: 18px 0 8px; color: #0f172a; } .note-box { font-size: 1.02em; background: #eef6ff; border: 1px solid #bfdbfe; border-left: 4px solid #2563eb; border-radius: 8px; padding: 10px 12px; } .audio-panel { border: 1px solid #e2e8f0; border-radius: 8px; padding: 12px; background: #ffffff; } .audio-panel h3, .audio-panel h4 { margin-top: 0; } .workflow-copy { color: #475569; margin-bottom: 12px; } @media (max-width: 760px) { .hero { padding: 22px 18px 18px; } .hero h1 { font-size: 1.75rem; } .stat-strip { grid-template-columns: repeat(2, minmax(0, 1fr)); } } """ INTRO_MD = """

RVCBench

Voice cloning attacks and audio protection methods, compared through paired listening examples and speaker-similarity results.

26voice cloning models
5protection methods
7evaluation metrics
10speech datasets
""" GALLERY_INTRO_MD = """
Select a cloning model, compare clean and protected audio, then inspect how much each protection method lowers speaker similarity.
""" PROT_INTRO_MD = """ Upload your own audio clip and apply a protection method. The protected audio sounds nearly identical to humans, but disrupts automatic voice cloning models. - **GR-Noise** — Gaussian random noise at a chosen SNR level. No surrogate model required. - **Spectral** — Structured perturbation in the STFT frequency domain. """ RESULTS_INTRO_MD = """ **Metric guide** — SIM: speaker cosine similarity ↑  ·  WER: word error rate ↓  ·  MOS: perceptual quality ↑  ·  MCD: mel cepstral distortion ↓  ·  RTF: real-time factor ↓  ·  SVA: speaker verification accuracy ↑  ·  Emo: emotion match rate ↑ Select a metric to re-rank the 18 models. The heatmap below shows protection robustness (SIM under each of 5 protection methods). """ # ── build demo ──────────────────────────────────────────────────────────────── def build_demo(): with gr.Blocks(css=CSS, title="RVCBench Demo") as demo: gr.Markdown(INTRO_MD) with gr.Tabs(): # ── Tab 1: Voice Cloning Gallery ────────────────────────────────── with gr.Tab("🎧 Voice Cloning Gallery"): gr.Markdown(GALLERY_INTRO_MD) with gr.Row(): model_dd = gr.Dropdown( choices=list(GALLERY_MODELS.keys()), value="ZipVoice", label="Voice Cloning Model", scale=3, ) load_btn = gr.Button("Load Example", variant="primary", scale=1) sim_note = gr.Markdown("", elem_classes="note-box") with gr.Row(): with gr.Column(elem_classes="audio-panel"): gr.Markdown('

1. Reference Voice

') gr.Markdown(f"*\"{REF_TEXT}\"*") ref_out = gr.Audio(label="Reference (original)", interactive=False) with gr.Column(elem_classes="audio-panel"): gr.Markdown('

2. Target Speech

') gr.Markdown(f"*\"{TARGET_TEXT}\"*") target_out = gr.Audio(label="Target utterance", interactive=False) gr.Markdown('

3. Cloning Results

') with gr.Row(): with gr.Column(elem_classes="audio-panel"): gr.Markdown("#### Clean Reference") clean_out = gr.Audio(label="Clean clone", interactive=False) with gr.Column(elem_classes="audio-panel"): gr.Markdown("#### SafeSpeech-Protected Reference") prot_ref_out = gr.Audio(label="Protected reference", interactive=False) prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False) gr.Markdown('

4. Protection Effectiveness Across Methods

') sim_chart = gr.Plot(label="", show_label=False) gallery_outputs = [ref_out, target_out, clean_out, prot_ref_out, prot_clone_out, sim_note, sim_chart] load_btn.click(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs) demo.load(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs) model_dd.change(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs) # ── Tab 2: Protect Your Voice ───────────────────────────────────── with gr.Tab("🔒 Protect Your Voice"): gr.Markdown(PROT_INTRO_MD) with gr.Row(): audio_in = gr.Audio( label="Upload your audio (wav / mp3, ≤ 30 s)", type="numpy", scale=3, ) with gr.Column(scale=1): method_dd = gr.Dropdown( choices=list(PROTECT_FN.keys()), value="GR-Noise", label="Protection Method", ) strength_sl = gr.Slider( minimum=10, maximum=40, value=25, step=1, label="Target SNR (dB) — lower = stronger, more audible", info="25 dB: nearly imperceptible. 10 dB: noticeable noise.", ) protect_btn = gr.Button("Apply Protection", variant="primary") with gr.Row(): orig_out = gr.Audio(label="Original", interactive=False) prot_live = gr.Audio(label="Protected", interactive=False) metrics_out = gr.Markdown("") waveform_plot = gr.Plot(label="Waveform Comparison", show_label=False) method_dd.change(fn=update_strength_label, inputs=[method_dd], outputs=[strength_sl]) protect_btn.click( fn=run_protection, inputs=[audio_in, method_dd, strength_sl], outputs=[orig_out, prot_live, metrics_out, waveform_plot], ) gr.Markdown( "> **Note:** Full voice cloning inference (SafeSpeech, Enkidu, AntiFake) " "requires surrogate models and is not included in this Space due to compute " "constraints. See the " "[GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) for the full pipeline." ) # ── Tab 3: Results Explorer ─────────────────────────────────────── with gr.Tab("📊 Results Explorer"): gr.Markdown(RESULTS_INTRO_MD) metric_dd = gr.Dropdown( choices=list(METRIC_META.keys()), value="SIM", label="Sort by metric", ) bar_chart = gr.Plot(label="", show_label=False) metric_dd.change(fn=update_results_bar, inputs=[metric_dd], outputs=[bar_chart]) demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart]) gr.Markdown("---") gr.Markdown( "### Cross-Dataset Generalisation\n" "SIM on clean prompts across all 10 benchmark datasets. " "Models sorted by LibriTTS SIM. — = not evaluated." ) cross_heatmap = gr.Plot(label="", show_label=False) demo.load(fn=make_cross_dataset_heatmap, outputs=[cross_heatmap]) gr.Markdown("---") gr.Markdown( "### Protection Robustness Heatmap\n" "SIM under each of 5 protection methods — drop from **Clean** indicates " "more effective protection." ) prot_heatmap = gr.Plot(label="", show_label=False) demo.load(fn=make_prot_heatmap, outputs=[prot_heatmap]) # ── Tab 4: About ────────────────────────────────────────────────── with gr.Tab("ℹ️ About"): gr.Markdown(""" ## About RVCBench **RVCBench** is an open-source benchmark for evaluating the robustness of voice cloning against audio protection methods. ### What it measures - How well **18+ modern zero-shot TTS/VC models** can clone a speaker's voice - How effectively **5 audio protection methods** (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake) prevent cloning across **10 datasets** and **7 evaluation metrics** ### Resources | Resource | Link | |----------|------| | Paper (arXiv) | [arXiv:2602.00443](https://arxiv.org/abs/2602.00443) | | Code & full pipeline | [GitHub: Nanboy-Ronan/RVCBench](https://github.com/Nanboy-Ronan/RVCBench) | | Dataset | [HuggingFace: Nanboy/RVCBench](https://huggingface.co/datasets/Nanboy/RVCBench) | | Contact | ruinanjin@alumni.ubc.ca | ### Citation ```bibtex @article{liao2026rvcbench, title = {RVCBench: Benchmarking the Robustness of Voice Cloning Across Modern Audio Generation Models}, author = {Liao, Xinting and Jin, Ruinan and Yu, Hanlin and Pandya, Deval and Li, Xiaoxiao}, journal = {arXiv preprint arXiv:2602.00443}, year = {2026} } ``` """) return demo if __name__ == "__main__": build_demo().launch(server_name="0.0.0.0", server_port=7860, show_api=False)