| """RVCBench — Interactive HuggingFace Space demo (v2). |
| |
| Tabs |
| ──── |
| 1. Voice Cloning Gallery – hear pre-computed clean vs. protected clones |
| + protection-effectiveness bar chart for all 5 methods |
| 2. Protect Your Voice – upload audio, apply protection, see waveform comparison |
| 3. Results Explorer – interactive bar chart + protection robustness heatmap |
| 4. About – paper, citation, resources |
| """ |
|
|
| from __future__ import annotations |
|
|
| import io |
| import os |
| import time |
|
|
| import gradio as gr |
| import numpy as np |
| import plotly.graph_objects as go |
| import soundfile as sf |
|
|
# Best-effort monkeypatch of two plotly validator helpers with variants that
# never consult pandas (presumably so figures can be built when pandas is not
# installed in the Space — TODO confirm). The replacements only rely on numpy
# and on the narwhals ``nw`` alias exposed by plotly's validator module.
try:
    import _plotly_utils.basevalidators as _plotly_basevalidators

    def _plotly_to_scalar_or_list_without_pandas(value):
        """Convert numpy scalars/arrays (and nested sequences) to plain Python values."""
        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
        if np_mod and np_mod.isscalar(value) and hasattr(value, "item"):
            return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
        if isinstance(value, (list, tuple)):
            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
        if np_mod and isinstance(value, np_mod.ndarray):
            if value.ndim == 0:
                return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
        if _plotly_basevalidators.is_numpy_convertable(value):
            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
            if np_mod:
                return _plotly_to_scalar_or_list_without_pandas(np_mod.array(value))
        return value

    def _plotly_is_homogeneous_array_without_pandas(value):
        """Return True for ndarray / narwhals Series / array-convertible non-scalars."""
        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
        if np_mod and isinstance(value, np_mod.ndarray):
            return True
        if isinstance(value, _plotly_basevalidators.nw.Series):
            return True
        if _plotly_basevalidators.is_numpy_convertable(value):
            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
            if np_mod:
                # A 0-d array is effectively a scalar, not an array.
                return np_mod.array(value).shape != ()
        return False

    _plotly_basevalidators.to_scalar_or_list = _plotly_to_scalar_or_list_without_pandas
    _plotly_basevalidators.is_homogeneous_array = _plotly_is_homogeneous_array_without_pandas
except Exception as _patch_error:  # best-effort: plotly internals are private API
    # Keep running with stock plotly behavior, but leave a trace so a plotly
    # upgrade that silently breaks this patch can be diagnosed.
    import logging

    logging.getLogger(__name__).warning(
        "Plotly validator patch not applied: %r", _patch_error
    )
|
|
| |
|
|
# Bundled demo assets live in ``samples/1089`` next to this file.
SAMPLES = os.path.join(os.path.dirname(__file__), "samples", "1089")
REF_WAV, TARGET_WAV = (
    os.path.join(SAMPLES, filename) for filename in ("reference.wav", "target.wav")
)

# Transcripts shown alongside the reference and target audio players.
REF_TEXT = "But her long fair hair was girlish: and girlish, and touched with the wonder of mortal beauty, her face."
TARGET_TEXT = "A great fisher of souls!"
|
|
| |
|
|
# Gallery entries keyed by display name. Each entry holds the clean-clone and
# SafeSpeech-protected-clone file names plus the speaker-similarity (SIM)
# score under every protection method ("Clean" is the unprotected baseline).
GALLERY_MODELS = {
    "ZipVoice": {
        "clean": "zipvoice_clean.wav",
        "prot": "zipvoice_safespeech.wav",
        "sims": {
            "Clean": 0.579,
            "SafeSpeech": 0.287,
            "Enkidu": 0.435,
            "Spectral": 0.262,
            "GR-Noise": 0.258,
            "AntiFake": 0.543,
        },
    },
    "MOSS-TTSD": {
        "clean": "moss_ttsd_clean.wav",
        "prot": "moss_ttsd_safespeech.wav",
        "sims": {
            "Clean": 0.492,
            "SafeSpeech": 0.242,
            "Enkidu": 0.335,
            "Spectral": 0.216,
            "GR-Noise": 0.247,
            "AntiFake": 0.453,
        },
    },
    "MGM-Omni": {
        "clean": "mgm_omni_clean.wav",
        "prot": "mgm_omni_safespeech.wav",
        "sims": {
            "Clean": 0.539,
            "SafeSpeech": 0.184,
            "Enkidu": 0.316,
            "Spectral": 0.166,
            "GR-Noise": 0.229,
            "AntiFake": 0.491,
        },
    },
    "OZSpeech": {
        "clean": "ozspeech_clean.wav",
        "prot": "ozspeech_safespeech.wav",
        "sims": {
            "Clean": 0.388,
            "SafeSpeech": 0.156,
            "Enkidu": 0.187,
            "Spectral": 0.147,
            "GR-Noise": 0.148,
            "AntiFake": 0.337,
        },
    },
    "StyleTTS 2": {
        "clean": "styletts2_clean.wav",
        "prot": "styletts2_safespeech.wav",
        "sims": {
            "Clean": 0.228,
            "SafeSpeech": 0.089,
            "Enkidu": 0.125,
            "Spectral": 0.081,
            "GR-Noise": 0.030,
            "AntiFake": 0.207,
        },
    },
}
|
|
| |
|
|
| |
# Leaderboard: one dict per model, built from positional rows to keep the
# table readable. ``None`` marks a metric that has no value for that model.
_LEADERBOARD_COLUMNS = ("model", "SIM", "WER", "MOS", "MCD", "RTF", "SVA", "Emo")
LEADERBOARD_ROWS = [
    dict(zip(_LEADERBOARD_COLUMNS, record))
    for record in (
        ("Qwen3-TTS", 0.614, 0.052, 4.39, 5.79, 2.02, 0.974, 0.731),
        ("IndexTTS", 0.606, 0.052, 4.06, 6.61, 2.23, 0.972, 0.693),
        ("CosyVoice 2", 0.602, 0.175, 4.39, 6.17, 4.58, 0.974, 0.729),
        ("ZipVoice", 0.579, 0.053, 4.13, 7.09, 1.46, 0.952, 0.675),
        ("MaskGCT", 0.570, 0.088, 3.93, 6.91, 1.36, 0.939, 0.682),
        ("GLM-TTS", 0.570, 0.087, 4.08, 6.41, 1.74, 0.951, 0.678),
        ("F5-TTS", 0.559, 0.116, 3.99, 6.96, 0.61, 0.937, 0.676),
        ("Higgs Audio", 0.559, 0.250, 4.30, 6.06, 1.42, 0.941, 0.717),
        ("MGM-Omni", 0.539, 0.095, 4.28, 5.82, 0.84, 0.933, 0.676),
        ("PlayDiffusion", 0.506, 0.055, 4.15, 8.06, 0.73, 0.936, 0.681),
        ("MOSS-TTSD", 0.492, 0.383, 4.10, 7.09, None, 0.876, 0.667),
        ("VibeVoice", 0.480, 0.228, 3.83, 6.76, 1.86, 0.852, 0.624),
        ("FishSpeech", 0.472, 0.166, 4.37, 6.47, 3.61, 0.907, 0.682),
        ("XTTS-v2", 0.454, 0.073, 3.81, 8.62, 0.62, 0.908, 0.639),
        ("SparkTTS", 0.408, 0.326, 4.06, 5.83, 1.56, 0.764, 0.672),
        ("OZSpeech", 0.388, 0.060, 3.21, 6.87, 8.75, 0.840, 0.636),
        ("OpenVoice V2", 0.244, 0.075, 4.30, 7.06, 0.08, 0.474, 0.601),
        ("StyleTTS 2", 0.228, 0.049, 4.30, 6.81, 0.11, 0.388, 0.589),
    )
]
|
|
| |
# Per-dataset speaker similarity (SIM) for each model, built from positional
# rows. ``None`` marks a model/dataset pair with no value.
_CROSS_DATASET_FIELDS = (
    "model", "LibriTTS", "VCTK", "MultiSpk", "Long", "AISHELL",
    "French", "Bilingual", "BGclean", "BGnoise", "Hallucin",
)
CROSS_DATASET_ROWS = [
    dict(zip(_CROSS_DATASET_FIELDS, record))
    for record in (
        ("Qwen3-TTS", 0.614, 0.618, 0.495, 0.561, 0.721, 0.536, 0.673, 0.689, 0.572, 0.515),
        ("IndexTTS", 0.606, 0.567, 0.473, 0.775, 0.721, 0.397, 0.673, 0.589, 0.528, 0.529),
        ("CosyVoice 2", 0.602, 0.582, 0.448, 0.530, 0.717, 0.378, 0.653, 0.626, 0.515, 0.518),
        ("ZipVoice", 0.579, 0.554, 0.531, 0.729, 0.712, 0.363, 0.322, 0.625, 0.462, 0.509),
        ("MaskGCT", 0.570, 0.555, 0.431, 0.194, 0.674, 0.494, None, 0.610, 0.487, 0.499),
        ("GLM-TTS", 0.570, 0.573, 0.445, 0.757, 0.690, 0.398, 0.657, 0.622, 0.528, 0.533),
        ("F5-TTS", 0.559, 0.537, 0.507, 0.607, 0.696, 0.304, 0.653, 0.582, 0.414, 0.455),
        ("Higgs Audio", 0.559, 0.516, 0.418, 0.520, 0.581, 0.349, 0.543, 0.592, 0.421, 0.425),
        ("MGM-Omni", 0.539, 0.447, 0.370, 0.442, 0.713, 0.227, 0.630, 0.523, 0.332, 0.396),
        ("PlayDiffusion", 0.506, 0.426, 0.360, 0.637, 0.441, 0.283, 0.465, 0.433, 0.305, 0.408),
        ("MOSS-TTSD", 0.492, 0.440, 0.379, 0.644, 0.437, 0.327, 0.471, 0.494, 0.488, 0.416),
        ("VibeVoice", 0.480, 0.436, 0.348, 0.625, 0.564, 0.343, 0.531, 0.513, 0.364, 0.408),
        ("FishSpeech", 0.472, 0.430, 0.383, 0.572, 0.611, 0.374, 0.566, 0.495, 0.387, 0.351),
        ("XTTS-v2", 0.454, 0.454, 0.328, 0.613, 0.569, 0.445, 0.506, 0.546, 0.394, 0.488),
        ("SparkTTS", 0.408, 0.532, 0.228, 0.345, 0.569, 0.164, 0.480, 0.588, 0.332, 0.336),
        ("OZSpeech", 0.388, 0.253, 0.271, None, None, 0.109, None, 0.272, 0.164, 0.281),
        ("OpenVoice V2", 0.244, 0.392, 0.192, 0.278, 0.431, 0.271, 0.298, 0.484, 0.358, 0.365),
        ("StyleTTS 2", 0.228, 0.236, 0.162, None, None, None, 0.213, 0.196, 0.166, 0.184),
    )
]
|
|
# (row key, axis label) pairs, in the left-to-right order used by the
# cross-dataset heatmap.
CROSS_DATASET_COLS = list(
    zip(
        ("LibriTTS", "VCTK", "MultiSpk", "Long", "AISHELL",
         "French", "Bilingual", "BGclean", "BGnoise", "Hallucin"),
        ("LibriTTS", "VCTK", "Multi-spk", "Long", "AISHELL",
         "French", "Bilingual", "BG-clean", "BG-noise", "Hallucin."),
    )
)
|
|
| |
# Speaker similarity (SIM) of each model's clone under every protection
# method, built from positional rows. ``None`` marks a missing value.
_PROT_FIELDS = ("model", "Clean", "SafeSpeech", "Enkidu", "Spectral", "GRNoise", "AntiFake")
PROT_ROWS = [
    dict(zip(_PROT_FIELDS, record))
    for record in (
        ("Qwen3-TTS", 0.614, 0.384, 0.502, 0.363, 0.408, 0.582),
        ("IndexTTS", 0.606, 0.346, 0.475, 0.318, 0.392, 0.572),
        ("CosyVoice 2", 0.602, 0.321, 0.447, 0.301, 0.384, 0.549),
        ("ZipVoice", 0.579, 0.287, 0.435, 0.262, 0.258, 0.543),
        ("MaskGCT", 0.570, 0.303, 0.407, 0.281, 0.312, 0.530),
        ("GLM-TTS", 0.570, 0.330, 0.445, 0.311, 0.388, 0.532),
        ("F5-TTS", 0.559, 0.207, 0.431, 0.176, 0.137, 0.520),
        ("Higgs Audio", 0.559, 0.264, 0.435, 0.236, 0.272, 0.521),
        ("MGM-Omni", 0.539, 0.184, 0.316, 0.166, 0.229, 0.491),
        ("PlayDiffusion", 0.506, 0.173, None, 0.149, 0.162, 0.466),
        ("MOSS-TTSD", 0.492, 0.242, 0.335, 0.216, 0.247, 0.453),
        ("VibeVoice", 0.480, 0.272, 0.367, 0.253, 0.280, 0.442),
        ("FishSpeech", 0.472, 0.238, 0.334, 0.212, 0.235, 0.439),
        ("XTTS-v2", 0.454, 0.260, 0.308, 0.241, 0.237, 0.414),
        ("SparkTTS", 0.408, 0.129, 0.137, 0.108, 0.062, 0.359),
        ("OZSpeech", 0.388, 0.156, 0.187, 0.147, 0.148, 0.337),
        ("OpenVoice V2", 0.244, 0.185, 0.188, 0.180, 0.175, 0.236),
        ("StyleTTS 2", 0.228, 0.089, 0.125, 0.081, 0.030, 0.207),
    )
]
| |
|
|
# Metric key -> (axis/title label, higher_is_better flag).
METRIC_META = dict(
    SIM=("Speaker Similarity ↑", True),
    WER=("Word Error Rate ↓", False),
    MOS=("MOS Score ↑", True),
    MCD=("Mel Cepstral Dist. ↓", False),
    RTF=("Real-Time Factor ↓", False),
    SVA=("Speaker Verif. Acc. ↑", True),
    Emo=("Emotion Match Rate ↑", True),
)
|
|
| |
|
|
| _GOOD = (200, 230, 201) |
| _MID = (255, 249, 196) |
| _BAD = (255, 205, 210) |
|
|
|
|
| def _interp_color(t: float) -> str: |
| """t=0 → bad (red), t=1 → good (green), t=0.5 → yellow.""" |
| if t <= 0.5: |
| s = t / 0.5 |
| r = int(_BAD[0] + s * (_MID[0] - _BAD[0])) |
| g = int(_BAD[1] + s * (_MID[1] - _BAD[1])) |
| b = int(_BAD[2] + s * (_MID[2] - _BAD[2])) |
| else: |
| s = (t - 0.5) / 0.5 |
| r = int(_MID[0] + s * (_GOOD[0] - _MID[0])) |
| g = int(_MID[1] + s * (_GOOD[1] - _MID[1])) |
| b = int(_MID[2] + s * (_GOOD[2] - _MID[2])) |
| return f"rgb({r},{g},{b})" |
|
|
|
|
| def _col_colors(values: list, higher_is_better: bool) -> list[str]: |
| valid = [v for v in values if v is not None] |
| if not valid or max(valid) == min(valid): |
| return ["rgb(245,245,245)"] * len(values) |
| vmin, vmax = min(valid), max(valid) |
| colors = [] |
| for v in values: |
| if v is None: |
| colors.append("rgb(245,245,245)") |
| else: |
| t = (v - vmin) / (vmax - vmin) |
| if not higher_is_better: |
| t = 1 - t |
| colors.append(_interp_color(t)) |
| return colors |
|
|
|
|
| |
|
|
def _load(path: str) -> tuple[np.ndarray, int]:
    """Read ``path`` as float32 audio and return ``(samples, sample_rate)``.

    Input with more than one dimension is averaged over axis 1.
    """
    samples, rate = sf.read(path, dtype="float32")
    mono = samples.mean(axis=1) if samples.ndim > 1 else samples
    return mono, rate
|
|
|
|
| def _snr(original: np.ndarray, protected: np.ndarray) -> float: |
| noise = protected - original |
| sp = np.mean(original ** 2) |
| np_ = np.mean(noise ** 2) |
| return float("inf") if np_ < 1e-12 else float(10 * np.log10(sp / np_)) |
|
|
|
|
| |
|
|
def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
    """Add white Gaussian noise to ``audio`` at the requested SNR.

    Args:
        audio: Samples to perturb.
        sr: Sample rate; unused here, kept so all protection functions share
            one call shape.
        snr_db: Target signal-to-noise ratio in dB (lower means more noise).

    Returns:
        The noisy signal clipped to [-1, 1].
    """
    signal_power = np.mean(audio ** 2)
    target_noise_power = signal_power / (10 ** (snr_db / 10))
    unit_noise = np.random.randn(*audio.shape).astype(np.float32)
    noisy = audio + unit_noise * np.sqrt(target_noise_power)
    return np.clip(noisy, -1.0, 1.0)
|
|
|
|
def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
    """Perturb ``audio`` frame by frame in the STFT domain.

    Each Hann-windowed frame gets a random complex perturbation whose size is
    ``strength`` times the bin magnitude; frames are overlap-added back.

    Fixes relative to the previous implementation:
    * Overlap-add is normalised by the summed window weights rather than the
      frame count, so the output keeps the input level instead of being
      attenuated to roughly half amplitude.
    * The signal is zero-padded so every sample is covered by a full set of
      frames; the tail and clips shorter than one frame are no longer zeroed.
    * The window is built once instead of once per frame.

    Args:
        audio: 1-D array of samples (expected in [-1, 1]).
        sr: Sample rate; unused here, kept so all protection functions share
            one call shape.
        strength: Perturbation size relative to each bin's magnitude.

    Returns:
        Perturbed audio with the same length as ``audio``, clipped to
        [-1, 1]; float32 unless the input has a wider float dtype.
    """
    n_fft, hop = 1024, 256
    n_samples = len(audio)
    out_dtype = np.result_type(audio.dtype, np.float32)
    if n_samples == 0:
        return audio.astype(out_dtype)

    window = np.hanning(n_fft)
    # Pad one frame on each side, plus enough at the end for the frames to
    # tile the padded signal exactly.
    tail = (-(n_samples + n_fft)) % hop
    padded = np.pad(audio.astype(np.float64), (n_fft, n_fft + tail))

    acc = np.zeros_like(padded)
    weight = np.zeros_like(padded)
    for start in range(0, len(padded) - n_fft + 1, hop):
        stop = start + n_fft
        spec = np.fft.rfft(padded[start:stop] * window)
        mag = np.abs(spec)
        perturb = np.random.randn(*mag.shape) * strength * mag
        phase = np.random.uniform(0, 2 * np.pi, mag.shape)
        acc[start:stop] += np.fft.irfft(spec + perturb * np.exp(1j * phase), n=n_fft)
        weight[start:stop] += window

    # Inside the un-padded region every sample is covered by n_fft / hop
    # overlapping frames, so the summed window weight is strictly positive.
    core = slice(n_fft, n_fft + n_samples)
    restored = acc[core] / weight[core]
    return np.clip(restored, -1.0, 1.0).astype(out_dtype)
|
|
|
|
# Protection methods that can run live in the demo, keyed by display name.
PROTECT_FN = {
    "GR-Noise": apply_grnoise,
    "Spectral": apply_spectral,
}
|
|
|
|
| |
|
|
def make_sim_bar(model_name: str) -> go.Figure:
    """Build the per-model bar chart of SIM under each protection method.

    Bars are annotated with the score and, for protected variants, the drop
    from the clean baseline. A dotted line marks the clean score.
    """
    sims = GALLERY_MODELS[model_name]["sims"]
    methods = list(sims)
    scores = list(sims.values())
    baseline = sims["Clean"]

    palette = ["#2563eb", "#7c3aed", "#059669", "#ea580c", "#475569", "#be123c"]

    bar_text = []
    hover = []
    for method, score in zip(methods, scores):
        drop = baseline - score
        if method == "Clean":
            bar_text.append(f"{score:.3f}")
        else:
            bar_text.append(f"{score:.3f}<br>↓{drop:.3f}")
        hover.append(f"{method}<br>SIM: {score:.3f}<br>Drop from clean: {drop:.3f}")

    bars = go.Bar(
        x=methods,
        y=scores,
        marker_color=palette,
        marker_line_color="rgba(15, 23, 42, 0.25)",
        marker_line_width=1,
        text=bar_text,
        textposition="outside",
        hovertext=hover,
        hoverinfo="text",
        cliponaxis=False,
    )
    fig = go.Figure(bars)

    y_axis = dict(
        title="SIM",
        range=[0, min(0.75, max(scores) * 1.28)],
        gridcolor="#e2e8f0",
        zeroline=False,
    )
    fig.update_layout(
        title=dict(
            text=f"<b>{model_name}</b> speaker similarity after protection",
            font=dict(size=16, color="#0f172a"),
            x=0.02,
        ),
        yaxis=y_axis,
        xaxis=dict(title="", tickfont=dict(size=12)),
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        margin=dict(t=62, b=42, l=48, r=24),
        height=350,
        showlegend=False,
        bargap=0.28,
        font=dict(color="#334155"),
    )

    # Dotted reference line at the clean score, labelled at its last point.
    line_labels = [""] * (len(methods) - 1) + ["Clean baseline"]
    baseline_trace = go.Scatter(
        x=methods,
        y=[baseline] * len(methods),
        mode="lines+text",
        line=dict(color="#2563eb", dash="dot", width=1.5),
        text=line_labels,
        textposition="top right",
        textfont=dict(size=10, color="#2563eb"),
        hoverinfo="skip",
        showlegend=False,
    )
    fig.add_trace(baseline_trace)
    return fig
|
|
|
|
def make_results_bar(metric: str = "SIM", ascending: bool = False) -> go.Figure:
    """Build a horizontal bar chart ranking models by ``metric``.

    Models without a value for the metric are left out. By default the best
    model comes first (direction taken from ``METRIC_META``); ``ascending``
    flips the order.
    """
    metric_label, higher_is_better = METRIC_META[metric]

    scored = [row for row in LEADERBOARD_ROWS if row.get(metric) is not None]
    scored.sort(key=lambda row: row[metric], reverse=(higher_is_better ^ ascending))

    names = []
    scores = []
    for row in scored:
        names.append(row["model"])
        scores.append(row[metric])

    # Rows without a score were dropped above, so every label is numeric.
    labels = [f"{score:.3f}" for score in scores]

    bars = go.Bar(
        x=scores,
        y=names,
        orientation="h",
        marker_color=_col_colors(scores, higher_is_better),
        marker_line_color="#999",
        marker_line_width=0.5,
        text=labels,
        textposition="outside",
        cliponaxis=False,
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(text=f"<b>Model Ranking by {metric_label}</b>", font=dict(size=14)),
        xaxis=dict(title=metric_label),
        yaxis=dict(autorange="reversed"),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        margin=dict(t=50, b=40, l=120, r=80),
        height=520,
        showlegend=False,
    )
    return fig
|
|
|
|
def make_prot_heatmap() -> go.Figure:
    """Build the heatmap of SIM per model (rows) and protection method (columns).

    Rows are ordered by clean SIM, highest first. Missing values show "—".
    """
    keys = ["Clean", "SafeSpeech", "Enkidu", "Spectral", "GRNoise", "AntiFake"]
    labels = ["Clean", "SafeSpeech", "Enkidu", "Spectral", "GR-Noise", "AntiFake"]

    ranked = sorted(PROT_ROWS, key=lambda row: row["Clean"], reverse=True)
    names = [row["model"] for row in ranked]

    grid = [[row.get(key) for key in keys] for row in ranked]
    cell_text = [
        ["—" if cell is None else f"{cell:.3f}" for cell in line] for line in grid
    ]

    red_to_green = [
        [0.0, "#b71c1c"],
        [0.25, "#ef9a9a"],
        [0.5, "#fff9c4"],
        [0.75, "#a5d6a7"],
        [1.0, "#1b5e20"],
    ]
    heat = go.Heatmap(
        z=grid,
        x=labels,
        y=names,
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=red_to_green,
        zmin=0.0,
        zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heat)

    # Dotted vertical rule between the "Clean" column and the protected ones.
    fig.add_shape(
        type="line",
        x0=0.5,
        x1=0.5,
        y0=-0.5,
        y1=len(names) - 0.5,
        line=dict(color="#555", width=2, dash="dot"),
        xref="x",
        yref="y",
    )

    heading = (
        "<b>Protection Robustness — Speaker Similarity (SIM) on LibriTTS</b><br>"
        "<sup>Green = high SIM (clone faithful). Red = low SIM (protection effective). "
        "Drop from Clean → protected shows protection strength.</sup>"
    )
    fig.update_layout(
        title=dict(text=heading, font=dict(size=13)),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white",
        plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
|
|
|
|
def make_cross_dataset_heatmap() -> go.Figure:
    """Build the heatmap of clean-prompt SIM per model (rows) and dataset (columns).

    Rows are ordered by LibriTTS SIM, highest first. Missing values show "—".
    """
    keys, labels = [], []
    for key, label in CROSS_DATASET_COLS:
        keys.append(key)
        labels.append(label)

    ranked = sorted(CROSS_DATASET_ROWS, key=lambda row: row["LibriTTS"], reverse=True)
    names = [row["model"] for row in ranked]

    grid = [[row.get(key) for key in keys] for row in ranked]
    cell_text = [
        ["—" if cell is None else f"{cell:.3f}" for cell in line] for line in grid
    ]

    red_to_green = [
        [0.0, "#b71c1c"],
        [0.25, "#ef9a9a"],
        [0.5, "#fff9c4"],
        [0.75, "#a5d6a7"],
        [1.0, "#1b5e20"],
    ]
    heat = go.Heatmap(
        z=grid,
        x=labels,
        y=names,
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=red_to_green,
        zmin=0.0,
        zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heat)

    heading = (
        "<b>Cross-Dataset Generalisation — Speaker Similarity (SIM) on Clean Prompts</b><br>"
        "<sup>Models sorted by LibriTTS SIM. — = not evaluated. "
        "Green = high SIM (faithful clone), red = low SIM.</sup>"
    )
    fig.update_layout(
        title=dict(text=heading, font=dict(size=13)),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white",
        plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
|
|
|
|
def make_waveform_figure(
    original: np.ndarray, protected: np.ndarray, sr: int
) -> go.Figure:
    """Plot the original and protected waveforms on one time axis.

    Only the first five seconds (or the shorter of the two signals) are drawn.
    """
    n_points = min(len(original), len(protected), sr * 5)
    seconds = (np.arange(n_points) / sr).tolist()

    fig = go.Figure()
    traces = (
        ("Original", original, "#1565c0"),
        ("Protected", protected, "#c62828"),
    )
    for label, samples, colour in traces:
        fig.add_trace(
            go.Scatter(
                x=seconds,
                y=samples[:n_points].tolist(),
                name=label,
                line=dict(color=colour, width=1),
                opacity=0.85,
            )
        )

    fig.update_layout(
        title=dict(text="<b>Waveform Comparison</b> (first 5 s)", font=dict(size=13)),
        xaxis=dict(title="Time (s)"),
        yaxis=dict(title="Amplitude", range=[-1.05, 1.05]),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        legend=dict(orientation="h", y=1.08, x=0.5, xanchor="center"),
        margin=dict(t=60, b=40, l=55, r=20),
        height=220,
    )
    return fig
|
|
|
|
| |
|
|
def load_gallery(model_name: str):
    """Return everything the gallery tab shows for ``model_name``.

    The tuple holds, in order: reference wav path, target wav path, clean
    clone path, SafeSpeech-protected reference path, protected clone path,
    a markdown summary of the SIM drop, and the per-method SIM bar chart.
    """
    entry = GALLERY_MODELS[model_name]
    scores = entry["sims"]
    baseline = scores["Clean"]
    protected = scores["SafeSpeech"]

    summary = (
        f"**Clean SIM:** {baseline:.3f} → "
        f"**Protected SIM (SafeSpeech):** {protected:.3f} "
        f"*(drop: {baseline - protected:.3f})*"
    )

    clean_clone = os.path.join(SAMPLES, entry["clean"])
    protected_reference = os.path.join(SAMPLES, "protected_safespeech.wav")
    protected_clone = os.path.join(SAMPLES, entry["prot"])
    chart = make_sim_bar(model_name)

    return (
        REF_WAV,
        TARGET_WAV,
        clean_clone,
        protected_reference,
        protected_clone,
        summary,
        chart,
    )
|
|
|
|
| |
|
|
def run_protection(audio_input, method: str, strength: float):
    """Apply the selected protection to uploaded audio and summarise the result.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple from the audio upload
            component, or ``None`` when nothing was uploaded.
        method: Key into ``PROTECT_FN``.
        strength: Slider value. It is the target SNR in dB for "GR-Noise" and
            a percentage (divided by 100) for any other method.

    Returns:
        ``(original_audio, protected_audio, metrics_markdown, waveform_figure)``.
        Both audio items are ``(sample_rate, int16 samples)``. When there is
        nothing to process the audio items and the figure are ``None`` and the
        markdown carries a message.
    """
    if audio_input is None:
        return None, None, "Upload an audio file first.", None

    sr_in, data = audio_input
    data = np.asarray(data)
    if data.size == 0:
        # ``max()`` of an empty array raises; report instead of crashing.
        return None, None, "The uploaded audio is empty.", None

    audio = data.astype(np.float32)
    if np.issubdtype(data.dtype, np.integer):
        # Scale integer PCM by its own full-scale value. Guessing from the
        # peak mis-scales quiet clips and non-16-bit integer input.
        audio /= float(np.iinfo(data.dtype).max) + 1.0
    elif np.abs(audio).max() > 1.0:
        # Legacy fallback: float data that looks like 16-bit PCM values.
        audio /= 32768.0
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    t0 = time.perf_counter()
    fn = PROTECT_FN[method]
    if method == "GR-Noise":
        protected = fn(audio, sr_in, snr_db=strength)
    else:
        protected = fn(audio, sr_in, strength=strength / 100.0)
    elapsed = time.perf_counter() - t0

    snr = _snr(audio, protected)

    def _to_int16(signal: np.ndarray) -> np.ndarray:
        return (np.clip(signal, -1.0, 1.0) * 32767).astype(np.int16)

    # Encode both clips identically so neither player is rescaled differently
    # and the two can be compared at the same loudness.
    orig_int = _to_int16(audio)
    prot_int = _to_int16(protected)

    metrics_md = (
        f"| Metric | Value |\n|--------|-------|\n"
        f"| SNR (dB) | {snr:.1f} |\n"
        f"| Processing time | {elapsed * 1000:.0f} ms |\n"
        f"| Method | {method} |\n"
    )

    waveform_fig = make_waveform_figure(audio, protected, sr_in)
    return (sr_in, orig_int), (sr_in, prot_int), metrics_md, waveform_fig
|
|
|
|
def update_strength_label(method: str) -> dict:
    """Return the slider update (label, hint, range, default) for ``method``.

    "GR-Noise" gets an SNR slider in dB; any other method gets the spectral
    strength slider in percent.
    """
    if method == "GR-Noise":
        settings = dict(
            label="Target SNR (dB) — lower = stronger, more audible",
            info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
            minimum=10,
            maximum=40,
            value=25,
            step=1,
        )
    else:
        settings = dict(
            label="Spectral Strength (%) — higher = stronger perturbation",
            info="5% is nearly inaudible. 20%+ may cause artifacts.",
            minimum=1,
            maximum=30,
            value=5,
            step=1,
        )
    return gr.update(**settings)
|
|
|
|
| |
|
|
def update_results_bar(metric: str) -> go.Figure:
    """Dropdown callback: redraw the ranking chart for ``metric``."""
    figure = make_results_bar(metric)
    return figure
|
|
|
|
| |
|
|
# Custom stylesheet passed to ``gr.Blocks``: hides the footer, caps the page
# width and styles the hero banner, stat cards, note box and audio panels.
CSS = """
footer { display: none !important; }
.gradio-container {
max-width: 1180px !important;
margin: 0 auto !important;
}
.hero {
padding: 28px 28px 22px;
border-radius: 12px;
background: linear-gradient(135deg, #0f172a 0%, #164e63 54%, #065f46 100%);
color: white;
margin-bottom: 18px;
}
.hero h1 {
margin: 0 0 8px;
font-size: 2.35rem;
line-height: 1.08;
letter-spacing: 0;
color: white !important;
}
.hero p {
max-width: 760px;
margin: 0;
color: #dbeafe;
font-size: 1.05rem;
}
.hero a {
color: white !important;
}
.hero-links {
display: flex;
flex-wrap: wrap;
gap: 8px;
margin-top: 16px;
}
.hero-links a {
text-decoration: none;
}
.stat-strip {
display: grid;
grid-template-columns: repeat(4, minmax(0, 1fr));
gap: 10px;
margin: 14px 0 18px;
}
.stat-card {
border: 1px solid #d8dee9;
border-radius: 8px;
padding: 12px 14px;
background: #ffffff;
}
.stat-card b {
display: block;
font-size: 1.35rem;
color: #0f172a;
line-height: 1.1;
}
.stat-card span {
color: #475569;
font-size: 0.9rem;
}
.section-head {
margin: 18px 0 8px;
color: #0f172a;
}
.note-box {
font-size: 1.02em;
background: #eef6ff;
border: 1px solid #bfdbfe;
border-left: 4px solid #2563eb;
border-radius: 8px;
padding: 10px 12px;
}
.audio-panel {
border: 1px solid #e2e8f0;
border-radius: 8px;
padding: 12px;
background: #ffffff;
}
.audio-panel h3,
.audio-panel h4 {
margin-top: 0;
}
.workflow-copy {
color: #475569;
margin-bottom: 12px;
}
@media (max-width: 760px) {
.hero {
padding: 22px 18px 18px;
}
.hero h1 {
font-size: 1.75rem;
}
.stat-strip {
grid-template-columns: repeat(2, minmax(0, 1fr));
}
}
"""
|
|
# Page header: hero banner with paper / dataset / code badges, followed by a
# strip of headline benchmark numbers.
INTRO_MD = """
<div class="hero">
<h1>RVCBench</h1>
<p>Voice cloning attacks and audio protection methods, compared through paired listening examples and speaker-similarity results.</p>
<div class="hero-links">
<a href="https://arxiv.org/abs/2602.00443"><img alt="Paper" src="https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg"></a>
<a href="https://huggingface.co/datasets/Nanboy/RVCBench"><img alt="Dataset" src="https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg"></a>
<a href="https://github.com/Nanboy-Ronan/RVCBench"><img alt="GitHub" src="https://img.shields.io/badge/GitHub-RVCBench-181717.svg"></a>
</div>
</div>

<div class="stat-strip">
<div class="stat-card"><b>26</b><span>voice cloning models</span></div>
<div class="stat-card"><b>5</b><span>protection methods</span></div>
<div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
<div class="stat-card"><b>10</b><span>speech datasets</span></div>
</div>
"""


# Short instruction shown at the top of the gallery tab.
GALLERY_INTRO_MD = """
<div class="workflow-copy">
Select a cloning model, compare clean and protected audio, then inspect how much each protection method lowers speaker similarity.
</div>
"""


# Intro copy for the "Protect Your Voice" tab, describing the two methods
# that can run live.
PROT_INTRO_MD = """
Upload your own audio clip and apply a protection method. The protected audio sounds nearly
identical to humans, but disrupts automatic voice cloning models.

- **GR-Noise** — Gaussian random noise at a chosen SNR level. No surrogate model required.
- **Spectral** — Structured perturbation in the STFT frequency domain.
"""


# Metric legend and usage hint for the results tab.
RESULTS_INTRO_MD = """
**Metric guide** — SIM: speaker cosine similarity ↑ ·
WER: word error rate ↓ · MOS: perceptual quality ↑ ·
MCD: mel cepstral distortion ↓ · RTF: real-time factor ↓ ·
SVA: speaker verification accuracy ↑ · Emo: emotion match rate ↑

Select a metric to re-rank the 18 models. The heatmap below shows protection robustness
(SIM under each of 5 protection methods).
"""
|
|
|
|
| |
|
|
def _build_gallery_tab(demo) -> None:
    """Create the gallery tab: model picker, audio players and SIM chart."""
    with gr.Tab("🎧 Voice Cloning Gallery"):
        gr.Markdown(GALLERY_INTRO_MD)

        with gr.Row():
            model_dd = gr.Dropdown(
                choices=list(GALLERY_MODELS.keys()),
                value="ZipVoice",
                label="Voice Cloning Model",
                scale=3,
            )
            load_btn = gr.Button("Load Example", variant="primary", scale=1)

        sim_note = gr.Markdown("", elem_classes="note-box")

        with gr.Row():
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown('<h3 class="section-head">1. Reference Voice</h3>')
                gr.Markdown(f"*\"{REF_TEXT}\"*")
                ref_out = gr.Audio(label="Reference (original)", interactive=False)
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown('<h3 class="section-head">2. Target Speech</h3>')
                gr.Markdown(f"*\"{TARGET_TEXT}\"*")
                target_out = gr.Audio(label="Target utterance", interactive=False)

        gr.Markdown('<h3 class="section-head">3. Cloning Results</h3>')

        with gr.Row():
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown("#### Clean Reference")
                clean_out = gr.Audio(label="Clean clone", interactive=False)
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown("#### SafeSpeech-Protected Reference")
                prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
                prot_clone_out = gr.Audio(
                    label="Clone from protected (degraded)", interactive=False
                )

        gr.Markdown(
            '<h3 class="section-head">4. Protection Effectiveness Across Methods</h3>'
        )
        sim_chart = gr.Plot(label="", show_label=False)

        # Output order must match the tuple returned by ``load_gallery``.
        gallery_outputs = [
            ref_out,
            target_out,
            clean_out,
            prot_ref_out,
            prot_clone_out,
            sim_note,
            sim_chart,
        ]
        # Refresh on button click, on page load and on dropdown change.
        load_btn.click(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
        demo.load(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
        model_dd.change(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)


def _build_protect_tab() -> None:
    """Create the tab where an uploaded clip is protected and compared."""
    with gr.Tab("🔒 Protect Your Voice"):
        gr.Markdown(PROT_INTRO_MD)

        with gr.Row():
            audio_in = gr.Audio(
                label="Upload your audio (wav / mp3, ≤ 30 s)",
                type="numpy",
                scale=3,
            )
            with gr.Column(scale=1):
                method_dd = gr.Dropdown(
                    choices=list(PROTECT_FN.keys()),
                    value="GR-Noise",
                    label="Protection Method",
                )
                strength_sl = gr.Slider(
                    minimum=10,
                    maximum=40,
                    value=25,
                    step=1,
                    label="Target SNR (dB) — lower = stronger, more audible",
                    info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
                )
                protect_btn = gr.Button("Apply Protection", variant="primary")

        with gr.Row():
            orig_out = gr.Audio(label="Original", interactive=False)
            prot_live = gr.Audio(label="Protected", interactive=False)

        metrics_out = gr.Markdown("")
        waveform_plot = gr.Plot(label="Waveform Comparison", show_label=False)

        # The slider's meaning depends on the method, so relabel it on change.
        method_dd.change(
            fn=update_strength_label, inputs=[method_dd], outputs=[strength_sl]
        )
        protect_btn.click(
            fn=run_protection,
            inputs=[audio_in, method_dd, strength_sl],
            outputs=[orig_out, prot_live, metrics_out, waveform_plot],
        )

        gr.Markdown(
            "> **Note:** Full voice cloning inference (SafeSpeech, Enkidu, AntiFake) "
            "requires surrogate models and is not included in this Space due to compute "
            "constraints. See the "
            "[GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) for the full pipeline."
        )


def _build_results_tab(demo) -> None:
    """Create the results tab: metric ranking chart plus two heatmaps."""
    with gr.Tab("📊 Results Explorer"):
        gr.Markdown(RESULTS_INTRO_MD)

        metric_dd = gr.Dropdown(
            choices=list(METRIC_META.keys()),
            value="SIM",
            label="Sort by metric",
        )
        bar_chart = gr.Plot(label="", show_label=False)
        metric_dd.change(fn=update_results_bar, inputs=[metric_dd], outputs=[bar_chart])
        demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])

        gr.Markdown("---")
        gr.Markdown(
            "### Cross-Dataset Generalisation\n"
            "SIM on clean prompts across all 10 benchmark datasets. "
            "Models sorted by LibriTTS SIM. — = not evaluated."
        )
        cross_heatmap = gr.Plot(label="", show_label=False)
        demo.load(fn=make_cross_dataset_heatmap, outputs=[cross_heatmap])

        gr.Markdown("---")
        gr.Markdown(
            "### Protection Robustness Heatmap\n"
            "SIM under each of 5 protection methods — drop from **Clean** indicates "
            "more effective protection."
        )
        prot_heatmap = gr.Plot(label="", show_label=False)
        demo.load(fn=make_prot_heatmap, outputs=[prot_heatmap])


def _build_about_tab() -> None:
    """Create the static About tab (summary, resource links, citation)."""
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
## About RVCBench

**RVCBench** is an open-source benchmark for evaluating the robustness of voice cloning
against audio protection methods.

### What it measures
- How well **18+ modern zero-shot TTS/VC models** can clone a speaker's voice
- How effectively **5 audio protection methods** (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
prevent cloning across **10 datasets** and **7 evaluation metrics**

### Resources

| Resource | Link |
|----------|------|
| Paper (arXiv) | [arXiv:2602.00443](https://arxiv.org/abs/2602.00443) |
| Code & full pipeline | [GitHub: Nanboy-Ronan/RVCBench](https://github.com/Nanboy-Ronan/RVCBench) |
| Dataset | [HuggingFace: Nanboy/RVCBench](https://huggingface.co/datasets/Nanboy/RVCBench) |
| Contact | ruinanjin@alumni.ubc.ca |

### Citation

```bibtex
@article{liao2026rvcbench,
title = {RVCBench: Benchmarking the Robustness of Voice Cloning Across Modern Audio Generation Models},
author = {Liao, Xinting and Jin, Ruinan and Yu, Hanlin and Pandya, Deval and Li, Xiaoxiao},
journal = {arXiv preprint arXiv:2602.00443},
year = {2026}
}
```
""")


def build_demo():
    """Assemble the Gradio app (header plus four tabs) and return the Blocks object."""
    with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
        gr.Markdown(INTRO_MD)

        # Tabs are created in display order; each helper opens its own gr.Tab.
        with gr.Tabs():
            _build_gallery_tab(demo)
            _build_protect_tab()
            _build_results_tab(demo)
            _build_about_tab()

    return demo
|
|
|
|
if __name__ == "__main__":
    # Listen on all interfaces at port 7860, without the API docs page.
    app = build_demo()
    app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)
|
|