Nanboy committed on
Commit
fc16bfb
·
verified ·
1 Parent(s): 41c0fa6

Optimize Space: interactive Plotly charts, waveform viz, protection heatmap, fix gallery

Browse files
Files changed (2) hide show
  1. app.py +457 -189
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,10 +1,12 @@
1
- """RVCBench — Interactive HuggingFace Space demo.
2
 
3
  Tabs
4
  ────
5
  1. Voice Cloning Gallery – hear pre-computed clean vs. protected clones
6
- 2. Protect Your Voice – upload audio, apply a protection method live, compare
7
- 3. Leaderboard sortable benchmark results table
 
 
8
  """
9
 
10
  from __future__ import annotations
@@ -15,56 +17,150 @@ import time
15
 
16
  import gradio as gr
17
  import numpy as np
 
18
  import soundfile as sf
19
 
20
  # ── paths ────────────────────────────────────────────────────────────────────
21
 
22
- SAMPLES = os.path.join(os.path.dirname(__file__), "samples", "1089")
23
-
24
- REF_WAV = os.path.join(SAMPLES, "reference.wav")
25
- TARGET_WAV = os.path.join(SAMPLES, "target.wav")
26
- REF_TEXT = "But her long fair hair was girlish: and girlish, and touched with the wonder of mortal beauty, her face."
27
  TARGET_TEXT = "A great fisher of souls!"
28
 
29
- MODELS = {
30
- "ZipVoice (SIM 0.579)": ("zipvoice_clean.wav", "zipvoice_safespeech.wav"),
31
- "MOSS-TTSD (SIM 0.492)": ("moss_ttsd_clean.wav", "moss_ttsd_safespeech.wav"),
32
- "MGM-Omni (SIM 0.539)": ("mgm_omni_clean.wav", "mgm_omni_safespeech.wav"),
33
- "OZSpeech (SIM 0.388)": ("ozspeech_clean.wav", "ozspeech_safespeech.wav"),
34
- "StyleTTS 2 (SIM 0.228)": ("styletts2_clean.wav", "styletts2_safespeech.wav"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  }
36
 
37
- PROTECTION_SAMPLES = {
38
- "SafeSpeech": "protected_safespeech.wav",
39
- "GR-Noise": "protected_grnoise.wav",
40
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # ── leaderboard data ──────────────────────────────────────────────────────────
43
-
44
- LEADERBOARD = [
45
- ["1", "Qwen3-TTS", "0.614", "0.052", "4.39", "5.79", "2.02", "0.974", "0.731"],
46
- ["2", "IndexTTS", "0.606", "0.052", "4.06", "6.61", "2.23", "0.972", "0.693"],
47
- ["3", "CosyVoice 2", "0.602", "0.175", "4.39", "6.17", "4.58", "0.974", "0.729"],
48
- ["4", "ZipVoice", "0.579", "0.053", "4.13", "7.09", "1.46", "0.952", "0.675"],
49
- ["5", "MaskGCT", "0.570", "0.088", "3.93", "6.91", "1.36", "0.939", "0.682"],
50
- ["6", "GLM-TTS", "0.570", "0.087", "4.08", "6.41", "1.74", "0.951", "0.678"],
51
- ["7", "F5-TTS", "0.559", "0.116", "3.99", "6.96", "0.61", "0.937", "0.676"],
52
- ["8", "Higgs Audio", "0.559", "0.250", "4.30", "6.06", "1.42", "0.941", "0.717"],
53
- ["9", "MGM-Omni", "0.539", "0.095", "4.28", "5.82", "0.84", "0.933", "0.676"],
54
- ["10","PlayDiffusion","0.506", "0.055", "4.15", "8.06", "0.73", "0.936", "0.681"],
55
- ["11","MOSS-TTSD", "0.492", "0.383", "4.10", "7.09", "—", "0.876", "0.667"],
56
- ["12","VibeVoice", "0.480", "0.228", "3.83", "6.76", "1.86", "0.852", "0.624"],
57
- ["13","FishSpeech", "0.472", "0.166", "4.37", "6.47", "3.61", "0.907", "0.682"],
58
- ["14","XTTS-v2", "0.454", "0.073", "3.81", "8.62", "0.62", "0.908", "0.639"],
59
- ["15","SparkTTS", "0.408", "0.326", "4.06", "5.83", "1.56", "0.764", "0.672"],
60
- ["16","OZSpeech", "0.388", "0.060", "3.21", "6.87", "8.75", "0.840", "0.636"],
61
- ["17","OpenVoice V2", "0.244", "0.075", "4.30", "7.06", "0.08", "0.474", "0.601"],
62
- ["18","StyleTTS 2", "0.228", "0.049", "4.30", "6.81", "0.11", "0.388", "0.589"],
63
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- HEADERS = ["#", "Model", "SIM ↑", "WER ↓", "MOS ↑", "MCD ↓", "RTF ↓", "SVA ↑", "Emo ↑"]
66
 
67
- # ── protection helpers ────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def _load(path: str) -> tuple[np.ndarray, int]:
70
  audio, sr = sf.read(path, dtype="float32")
@@ -73,107 +169,248 @@ def _load(path: str) -> tuple[np.ndarray, int]:
73
  return audio, sr
74
 
75
 
76
- def _to_bytes(audio: np.ndarray, sr: int) -> bytes:
77
- buf = io.BytesIO()
78
- sf.write(buf, audio, sr, format="WAV", subtype="PCM_16")
79
- buf.seek(0)
80
- return buf.read()
81
-
82
-
83
  def _snr(original: np.ndarray, protected: np.ndarray) -> float:
84
  noise = protected - original
85
- signal_power = np.mean(original ** 2)
86
- noise_power = np.mean(noise ** 2)
87
- if noise_power < 1e-12:
88
- return float("inf")
89
- return float(10 * np.log10(signal_power / noise_power))
90
 
91
 
 
 
92
  def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
93
- signal_power = np.mean(audio ** 2)
94
- noise_power = signal_power / (10 ** (snr_db / 10))
95
- noise = np.random.randn(*audio.shape).astype(np.float32) * np.sqrt(noise_power)
96
  return np.clip(audio + noise, -1.0, 1.0)
97
 
98
 
99
  def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
100
- """Frequency-domain perturbation: add structured noise in the STFT domain."""
101
  from numpy.fft import rfft, irfft
102
- n_fft = 1024
103
- hop = n_fft // 4
104
- frames = []
105
  for start in range(0, len(audio) - n_fft, hop):
106
  frame = audio[start:start + n_fft] * np.hanning(n_fft).astype(np.float32)
107
- spec = rfft(frame)
108
- mag = np.abs(spec)
109
  perturb = np.random.randn(*mag.shape).astype(np.float32) * strength * mag
110
- spec_p = spec + perturb * np.exp(1j * np.random.uniform(0, 2 * np.pi, mag.shape))
111
- frames.append((start, irfft(spec_p)))
112
- out = np.zeros_like(audio)
113
- cnt = np.zeros_like(audio)
114
- for start, f in frames:
115
- end = start + n_fft
116
- out[start:end] += f[:n_fft].astype(np.float32)
117
- cnt[start:end] += 1
118
  cnt = np.maximum(cnt, 1)
119
  return np.clip(out / cnt, -1.0, 1.0)
120
 
121
 
122
- PROTECT_FN = {
123
- "GR-Noise": apply_grnoise,
124
- "Spectral": apply_spectral,
125
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- # ── tab 1: gallery ────────────────────────────────────────────────────────────
128
-
129
- def load_gallery(model_label: str, protection: str):
130
- clean_file, safe_file = MODELS[model_label]
131
- prot_audio_file = PROTECTION_SAMPLES.get(protection)
132
-
133
- ref_audio = REF_WAV
134
- target_audio = TARGET_WAV
135
- clean_clone = os.path.join(SAMPLES, clean_file)
136
- prot_ref = os.path.join(SAMPLES, prot_audio_file) if prot_audio_file else None
137
- prot_clone = os.path.join(SAMPLES, safe_file)
138
-
139
- # Compute SIM drop note
140
- clean_sim = float(model_label.split("SIM ")[-1].rstrip(")"))
141
- sim_lookup = {
142
- "ZipVoice (SIM 0.579)": {"SafeSpeech": 0.287, "GR-Noise": 0.258},
143
- "MOSS-TTSD (SIM 0.492)": {"SafeSpeech": 0.242, "GR-Noise": 0.247},
144
- "MGM-Omni (SIM 0.539)": {"SafeSpeech": 0.184, "GR-Noise": 0.229},
145
- "OZSpeech (SIM 0.388)": {"SafeSpeech": 0.156, "GR-Noise": 0.148},
146
- "StyleTTS 2 (SIM 0.228)": {"SafeSpeech": 0.089, "GR-Noise": 0.030},
147
- }
148
- prot_sim = sim_lookup.get(model_label, {}).get(protection, None)
149
- drop = clean_sim - prot_sim if prot_sim else None
150
 
 
 
 
 
 
151
  note_md = (
152
  f"**Clean SIM:** {clean_sim:.3f} &nbsp;→&nbsp; "
153
- f"**Protected SIM ({protection}):** {prot_sim:.3f} &nbsp;"
154
  f"*(drop: {drop:.3f})*"
155
- if drop is not None else ""
156
  )
157
-
158
  return (
159
- ref_audio,
160
- target_audio,
161
- clean_clone,
162
- prot_ref,
163
- prot_clone,
164
  note_md,
 
165
  )
166
 
167
- # ── tab 2: live protection ────────────────────────────────────────────────────
 
168
 
169
  def run_protection(audio_input, method: str, strength: float):
170
  if audio_input is None:
171
- return None, None, "Upload an audio file first."
172
 
173
  sr_in, data = audio_input
174
  audio = data.astype(np.float32)
175
  if audio.max() > 1.0:
176
- audio = audio / 32768.0
177
  if audio.ndim > 1:
178
  audio = audio.mean(axis=1)
179
 
@@ -186,34 +423,54 @@ def run_protection(audio_input, method: str, strength: float):
186
  elapsed = time.time() - t0
187
 
188
  snr = _snr(audio, protected)
189
- protected_int = (protected * 32767).astype(np.int16)
190
 
191
  metrics_md = (
192
  f"| Metric | Value |\n|--------|-------|\n"
193
  f"| SNR (dB) | {snr:.1f} |\n"
194
- f"| Processing time | {elapsed*1000:.0f} ms |\n"
195
  f"| Method | {method} |\n"
196
  )
197
 
198
- return (sr_in, audio.copy()), (sr_in, protected_int), metrics_md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
 
200
 
201
- # ── build UI ──────────────────────────────────────────────────────────────────
 
 
 
 
202
 
203
  CSS = """
204
  #title { text-align: center; }
205
- .metric-box { font-size: 1.1em; }
206
- .tab-header { font-weight: bold; }
207
  footer { display: none !important; }
 
208
  """
209
 
210
  INTRO_MD = """
211
  <div id="title">
212
 
213
- # RVCBench — Voice Cloning & Protection Demo
214
 
215
  **Can audio protection prevent your voice from being cloned?**
216
- This demo lets you hear the answer.
217
 
218
  [![Paper](https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg)](https://arxiv.org/abs/2602.00443)
219
  [![Dataset](https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg)](https://huggingface.co/datasets/Nanboy/RVCBench)
@@ -222,91 +479,94 @@ This demo lets you hear the answer.
222
  </div>
223
  """
224
 
225
- GALLERY_MD = """
226
- **How it works:** A voice cloning model uses the *Reference Voice* to clone the *Target Speech*
227
- (what it wants the speaker to say). When protection is applied to the reference first,
228
- the clone degrades — the speaker sounds wrong or the speech becomes unintelligible.
 
 
 
 
 
 
 
 
 
 
229
  """
230
 
231
- PROTECTION_MD = """
232
- Upload your own audio clip and apply a protection method in real-time.
233
- The protected audio sounds nearly identical to humans but disrupts voice cloning models.
 
 
234
 
235
- - **GR-Noise** Gaussian random noise at a target SNR level. No surrogate model needed.
236
- - **Spectral** Structured perturbation in the frequency domain.
237
  """
238
 
239
 
 
 
240
  def build_demo():
241
  with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
242
  gr.Markdown(INTRO_MD)
243
 
244
  with gr.Tabs():
245
 
246
- # ── Tab 1: Gallery ──────────────────────────────────────────────
247
  with gr.Tab("🎧 Voice Cloning Gallery"):
248
- gr.Markdown(GALLERY_MD)
249
 
250
- with gr.Row():
251
- model_dd = gr.Dropdown(
252
- choices=list(MODELS.keys()),
253
- value=list(MODELS.keys())[0],
254
- label="Voice Cloning Model",
255
- scale=2,
256
- )
257
- prot_dd = gr.Dropdown(
258
- choices=["SafeSpeech", "GR-Noise"],
259
- value="SafeSpeech",
260
- label="Protection Method",
261
- scale=1,
262
- )
263
 
264
- sim_note = gr.Markdown("", elem_classes="metric-box")
265
 
266
  with gr.Row():
267
  with gr.Column():
268
  gr.Markdown("### 1 · Reference Voice")
269
  gr.Markdown(f"*\"{REF_TEXT}\"*")
270
- ref_out = gr.Audio(label="Reference (original)", interactive=False)
271
  with gr.Column():
272
  gr.Markdown("### 2 · Target Speech")
273
  gr.Markdown(f"*\"{TARGET_TEXT}\"*")
274
  target_out = gr.Audio(label="Target utterance", interactive=False)
275
 
276
  gr.Markdown("---")
277
- gr.Markdown("### Cloning Results")
278
 
279
  with gr.Row():
280
  with gr.Column():
281
  gr.Markdown("#### Without Protection")
282
- clean_out = gr.Audio(label="Clean clone (threat)", interactive=False)
283
  with gr.Column():
284
- gr.Markdown("#### With Protection")
285
  prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
286
  prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)
287
 
288
- load_btn = gr.Button("Load Example", variant="primary")
 
 
289
 
290
- load_btn.click(
291
- fn=load_gallery,
292
- inputs=[model_dd, prot_dd],
293
- outputs=[ref_out, target_out, clean_out, prot_ref_out, prot_clone_out, sim_note],
294
- )
295
- demo.load(
296
- fn=load_gallery,
297
- inputs=[model_dd, prot_dd],
298
- outputs=[ref_out, target_out, clean_out, prot_ref_out, prot_clone_out, sim_note],
299
- )
300
 
301
- # ── Tab 2: Live Protection ─────────────────────────────────────
302
  with gr.Tab("🔒 Protect Your Voice"):
303
- gr.Markdown(PROTECTION_MD)
304
 
305
  with gr.Row():
306
  audio_in = gr.Audio(
307
  label="Upload your audio (wav / mp3, ≤ 30 s)",
308
- type="numpy",
309
- scale=3,
310
  )
311
  with gr.Column(scale=1):
312
  method_dd = gr.Dropdown(
@@ -316,49 +576,57 @@ def build_demo():
316
  )
317
  strength_sl = gr.Slider(
318
  minimum=10, maximum=40, value=25, step=1,
319
- label="Strength (SNR dB for GR-Noise; intensity × 100 for Spectral)",
320
- info="Lower = stronger protection, more audible artifacts.",
321
  )
322
  protect_btn = gr.Button("Apply Protection", variant="primary")
323
 
324
  with gr.Row():
325
- orig_out = gr.Audio(label="Original", interactive=False)
326
- prot_live = gr.Audio(label="Protected", interactive=False)
327
 
328
- metrics_out = gr.Markdown("", elem_classes="metric-box")
 
329
 
 
 
330
  protect_btn.click(
331
  fn=run_protection,
332
  inputs=[audio_in, method_dd, strength_sl],
333
- outputs=[orig_out, prot_live, metrics_out],
334
  )
335
 
336
  gr.Markdown(
337
- "> **Note:** Live voice cloning inference is not included in this Space due to "
338
- "model size constraints. See the [GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) "
339
- "for the full pipeline with 18+ VC models."
 
340
  )
341
 
342
- # ── Tab 3: Leaderboard ──────────────────────────────────────────
343
- with gr.Tab("📊 Leaderboard"):
344
- gr.Markdown(
345
- "### Benchmark Results — LibriTTS (clean prompts)\n"
346
- "Sorted by Speaker Similarity (SIM ↑). "
347
- "Full results including protection robustness and cross-dataset generalisation: "
348
- "[GitHub README](https://github.com/Nanboy-Ronan/RVCBench#benchmark-results).\n\n"
349
- "> **Metric guide** · SIM: speaker similarity ↑ · WER: word error rate ↓ · "
350
- "MOS: perceptual score ↑ · MCD: mel cepstral distortion ↓ · "
351
- "RTF: real-time factor ↓ · SVA: speaker verification accuracy ↑ · Emo: emotion match ↑"
352
  )
353
- gr.DataFrame(
354
- value=LEADERBOARD,
355
- headers=HEADERS,
356
- datatype=["number", "str"] + ["number"] * 7,
357
- interactive=False,
358
- wrap=False,
 
 
 
 
359
  )
 
 
360
 
361
- # ── Tab 4: About ────────────────────────────────────────────────
362
  with gr.Tab("ℹ️ About"):
363
  gr.Markdown("""
364
  ## About RVCBench
@@ -367,9 +635,9 @@ def build_demo():
367
  against audio protection methods.
368
 
369
  ### What it measures
370
- - How well 18+ modern zero-shot TTS/VC models can clone a speaker's voice
371
- - How effectively 5 audio protection methods (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
372
- prevent cloning across 10 datasets and 7 evaluation metrics
373
 
374
  ### Resources
375
 
 
1
+ """RVCBench — Interactive HuggingFace Space demo (v2).
2
 
3
  Tabs
4
  ────
5
  1. Voice Cloning Gallery – hear pre-computed clean vs. protected clones
6
+ + protection-effectiveness bar chart for all 5 methods
7
+ 2. Protect Your Voice – upload audio, apply protection, see waveform comparison
8
+ 3. Results Explorer – interactive bar chart + protection robustness heatmap
9
+ 4. About – paper, citation, resources
10
  """
11
 
12
  from __future__ import annotations
 
17
 
18
  import gradio as gr
19
  import numpy as np
20
+ import plotly.graph_objects as go
21
  import soundfile as sf
22
 
23
  # ── paths ────────────────────────────────────────────────────────────────────
24
 
25
+ SAMPLES = os.path.join(os.path.dirname(__file__), "samples", "1089")
26
+ REF_WAV = os.path.join(SAMPLES, "reference.wav")
27
+ TARGET_WAV = os.path.join(SAMPLES, "target.wav")
28
+ REF_TEXT = ("But her long fair hair was girlish: and girlish, and touched "
29
+ "with the wonder of mortal beauty, her face.")
30
  TARGET_TEXT = "A great fisher of souls!"
31
 
32
+ # ── gallery models (audio samples available for SafeSpeech protection) ────────
33
+
34
+ GALLERY_MODELS = {
35
+ "ZipVoice": dict(
36
+ clean="zipvoice_clean.wav",
37
+ prot="zipvoice_safespeech.wav",
38
+ sims={"Clean": 0.579, "SafeSpeech": 0.287, "Enkidu": 0.435,
39
+ "Spectral": 0.262, "GR-Noise": 0.258, "AntiFake": 0.543},
40
+ ),
41
+ "MOSS-TTSD": dict(
42
+ clean="moss_ttsd_clean.wav",
43
+ prot="moss_ttsd_safespeech.wav",
44
+ sims={"Clean": 0.492, "SafeSpeech": 0.242, "Enkidu": 0.335,
45
+ "Spectral": 0.216, "GR-Noise": 0.247, "AntiFake": 0.453},
46
+ ),
47
+ "MGM-Omni": dict(
48
+ clean="mgm_omni_clean.wav",
49
+ prot="mgm_omni_safespeech.wav",
50
+ sims={"Clean": 0.539, "SafeSpeech": 0.184, "Enkidu": 0.316,
51
+ "Spectral": 0.166, "GR-Noise": 0.229, "AntiFake": 0.491},
52
+ ),
53
+ "OZSpeech": dict(
54
+ clean="ozspeech_clean.wav",
55
+ prot="ozspeech_safespeech.wav",
56
+ sims={"Clean": 0.388, "SafeSpeech": 0.156, "Enkidu": 0.187,
57
+ "Spectral": 0.147, "GR-Noise": 0.148, "AntiFake": 0.337},
58
+ ),
59
+ "StyleTTS 2": dict(
60
+ clean="styletts2_clean.wav",
61
+ prot="styletts2_safespeech.wav",
62
+ sims={"Clean": 0.228, "SafeSpeech": 0.089, "Enkidu": 0.125,
63
+ "Spectral": 0.081, "GR-Noise": 0.030, "AntiFake": 0.207},
64
+ ),
65
  }
66
 
67
+ # ── benchmark data (LibriTTS, clean prompts) ─────────────────────────────────
68
+
69
+ # fmt: off
70
+ LEADERBOARD_ROWS = [
71
+ dict(model="Qwen3-TTS", SIM=0.614, WER=0.052, MOS=4.39, MCD=5.79, RTF=2.02, SVA=0.974, Emo=0.731),
72
+ dict(model="IndexTTS", SIM=0.606, WER=0.052, MOS=4.06, MCD=6.61, RTF=2.23, SVA=0.972, Emo=0.693),
73
+ dict(model="CosyVoice 2", SIM=0.602, WER=0.175, MOS=4.39, MCD=6.17, RTF=4.58, SVA=0.974, Emo=0.729),
74
+ dict(model="ZipVoice", SIM=0.579, WER=0.053, MOS=4.13, MCD=7.09, RTF=1.46, SVA=0.952, Emo=0.675),
75
+ dict(model="MaskGCT", SIM=0.570, WER=0.088, MOS=3.93, MCD=6.91, RTF=1.36, SVA=0.939, Emo=0.682),
76
+ dict(model="GLM-TTS", SIM=0.570, WER=0.087, MOS=4.08, MCD=6.41, RTF=1.74, SVA=0.951, Emo=0.678),
77
+ dict(model="F5-TTS", SIM=0.559, WER=0.116, MOS=3.99, MCD=6.96, RTF=0.61, SVA=0.937, Emo=0.676),
78
+ dict(model="Higgs Audio", SIM=0.559, WER=0.250, MOS=4.30, MCD=6.06, RTF=1.42, SVA=0.941, Emo=0.717),
79
+ dict(model="MGM-Omni", SIM=0.539, WER=0.095, MOS=4.28, MCD=5.82, RTF=0.84, SVA=0.933, Emo=0.676),
80
+ dict(model="PlayDiffusion",SIM=0.506, WER=0.055, MOS=4.15, MCD=8.06, RTF=0.73, SVA=0.936, Emo=0.681),
81
+ dict(model="MOSS-TTSD", SIM=0.492, WER=0.383, MOS=4.10, MCD=7.09, RTF=None, SVA=0.876, Emo=0.667),
82
+ dict(model="VibeVoice", SIM=0.480, WER=0.228, MOS=3.83, MCD=6.76, RTF=1.86, SVA=0.852, Emo=0.624),
83
+ dict(model="FishSpeech", SIM=0.472, WER=0.166, MOS=4.37, MCD=6.47, RTF=3.61, SVA=0.907, Emo=0.682),
84
+ dict(model="XTTS-v2", SIM=0.454, WER=0.073, MOS=3.81, MCD=8.62, RTF=0.62, SVA=0.908, Emo=0.639),
85
+ dict(model="SparkTTS", SIM=0.408, WER=0.326, MOS=4.06, MCD=5.83, RTF=1.56, SVA=0.764, Emo=0.672),
86
+ dict(model="OZSpeech", SIM=0.388, WER=0.060, MOS=3.21, MCD=6.87, RTF=8.75, SVA=0.840, Emo=0.636),
87
+ dict(model="OpenVoice V2", SIM=0.244, WER=0.075, MOS=4.30, MCD=7.06, RTF=0.08, SVA=0.474, Emo=0.601),
88
+ dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589),
89
+ ]
90
 
91
+ # Protection robustness SIM under each method (LibriTTS, all 18 models)
92
+ PROT_ROWS = [
93
+ dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
94
+ dict(model="IndexTTS", Clean=0.606, SafeSpeech=0.346, Enkidu=0.475, Spectral=0.318, GRNoise=0.392, AntiFake=0.572),
95
+ dict(model="CosyVoice 2", Clean=0.602, SafeSpeech=0.321, Enkidu=0.447, Spectral=0.301, GRNoise=0.384, AntiFake=0.549),
96
+ dict(model="ZipVoice", Clean=0.579, SafeSpeech=0.287, Enkidu=0.435, Spectral=0.262, GRNoise=0.258, AntiFake=0.543),
97
+ dict(model="MaskGCT", Clean=0.570, SafeSpeech=0.303, Enkidu=0.407, Spectral=0.281, GRNoise=0.312, AntiFake=0.530),
98
+ dict(model="GLM-TTS", Clean=0.570, SafeSpeech=0.330, Enkidu=0.445, Spectral=0.311, GRNoise=0.388, AntiFake=0.532),
99
+ dict(model="F5-TTS", Clean=0.559, SafeSpeech=0.207, Enkidu=0.431, Spectral=0.176, GRNoise=0.137, AntiFake=0.520),
100
+ dict(model="Higgs Audio", Clean=0.559, SafeSpeech=0.264, Enkidu=0.435, Spectral=0.236, GRNoise=0.272, AntiFake=0.521),
101
+ dict(model="MGM-Omni", Clean=0.539, SafeSpeech=0.184, Enkidu=0.316, Spectral=0.166, GRNoise=0.229, AntiFake=0.491),
102
+ dict(model="PlayDiffusion",Clean=0.506, SafeSpeech=0.173, Enkidu=None, Spectral=0.149, GRNoise=0.162, AntiFake=0.466),
103
+ dict(model="MOSS-TTSD", Clean=0.492, SafeSpeech=0.242, Enkidu=0.335, Spectral=0.216, GRNoise=0.247, AntiFake=0.453),
104
+ dict(model="VibeVoice", Clean=0.480, SafeSpeech=0.272, Enkidu=0.367, Spectral=0.253, GRNoise=0.280, AntiFake=0.442),
105
+ dict(model="FishSpeech", Clean=0.472, SafeSpeech=0.238, Enkidu=0.334, Spectral=0.212, GRNoise=0.235, AntiFake=0.439),
106
+ dict(model="XTTS-v2", Clean=0.454, SafeSpeech=0.260, Enkidu=0.308, Spectral=0.241, GRNoise=0.237, AntiFake=0.414),
107
+ dict(model="SparkTTS", Clean=0.408, SafeSpeech=0.129, Enkidu=0.137, Spectral=0.108, GRNoise=0.062, AntiFake=0.359),
108
+ dict(model="OZSpeech", Clean=0.388, SafeSpeech=0.156, Enkidu=0.187, Spectral=0.147, GRNoise=0.148, AntiFake=0.337),
109
+ dict(model="OpenVoice V2", Clean=0.244, SafeSpeech=0.185, Enkidu=0.188, Spectral=0.180, GRNoise=0.175, AntiFake=0.236),
110
+ dict(model="StyleTTS 2", Clean=0.228, SafeSpeech=0.089, Enkidu=0.125, Spectral=0.081, GRNoise=0.030, AntiFake=0.207),
 
111
  ]
112
+ # fmt: on
113
+
114
# Display metadata for each leaderboard metric, keyed by the metric's field
# name in LEADERBOARD_ROWS. Each value is a 2-tuple:
#   [0] human-readable axis/title label (the arrow shows the "better" direction)
#   [1] higher_is_better flag, used to pick sort order and colour direction
METRIC_META = {
    "SIM": ("Speaker Similarity ↑", True),
    "WER": ("Word Error Rate ↓", False),
    "MOS": ("MOS Score ↑", True),
    "MCD": ("Mel Cepstral Dist. ↓", False),
    "RTF": ("Real-Time Factor ↓", False),
    "SVA": ("Speaker Verif. Acc. ↑",True),
    "Emo": ("Emotion Match Rate ↑", True),
}
123
+
124
+ # ── colour helpers ────────────────────────────────────────────────────────────
125
+
126
+ _GOOD = (200, 230, 201) # #c8e6c9 light green
127
+ _MID = (255, 249, 196) # #fff9c4 light yellow
128
+ _BAD = (255, 205, 210) # #ffcdd2 light red
129
 
 
130
 
131
+ def _interp_color(t: float) -> str:
132
+ """t=0 → bad (red), t=1 → good (green), t=0.5 → yellow."""
133
+ if t <= 0.5:
134
+ s = t / 0.5
135
+ r = int(_BAD[0] + s * (_MID[0] - _BAD[0]))
136
+ g = int(_BAD[1] + s * (_MID[1] - _BAD[1]))
137
+ b = int(_BAD[2] + s * (_MID[2] - _BAD[2]))
138
+ else:
139
+ s = (t - 0.5) / 0.5
140
+ r = int(_MID[0] + s * (_GOOD[0] - _MID[0]))
141
+ g = int(_MID[1] + s * (_GOOD[1] - _MID[1]))
142
+ b = int(_MID[2] + s * (_GOOD[2] - _MID[2]))
143
+ return f"rgb({r},{g},{b})"
144
+
145
+
146
def _col_colors(values: list, higher_is_better: bool) -> list[str]:
    """Return one gradient colour per entry of ``values``.

    Non-``None`` values are min-max normalised across the column and passed
    to ``_interp_color``; the scale is flipped when lower values are better.
    ``None`` entries, and every entry when the column is empty or constant,
    get a neutral light grey.
    """
    neutral = "rgb(245,245,245)"

    present = [v for v in values if v is not None]
    lowest = min(present, default=None)
    highest = max(present, default=None)
    # Equal bounds means either nothing to rank (both None) or a flat column.
    if lowest == highest:
        return [neutral for _ in values]

    def shade(value):
        if value is None:
            return neutral
        t = (value - lowest) / (highest - lowest)
        if not higher_is_better:
            t = 1 - t
        return _interp_color(t)

    return [shade(v) for v in values]
161
+
162
+
163
+ # ── audio helpers ─────────────────────────────────────────────────────────────
164
 
165
  def _load(path: str) -> tuple[np.ndarray, int]:
166
  audio, sr = sf.read(path, dtype="float32")
 
169
  return audio, sr
170
 
171
 
 
 
 
 
 
 
 
172
  def _snr(original: np.ndarray, protected: np.ndarray) -> float:
173
  noise = protected - original
174
+ sp = np.mean(original ** 2)
175
+ np_ = np.mean(noise ** 2)
176
+ return float("inf") if np_ < 1e-12 else float(10 * np.log10(sp / np_))
 
 
177
 
178
 
179
+ # ── protection functions ──────────────────────────────────────────────────────
180
+
181
def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
    """Add white Gaussian noise to ``audio`` at a target SNR.

    The noise variance is the mean signal power divided by ``10**(snr_db/10)``.
    ``sr`` is accepted for a uniform protection-function signature but is not
    used. The result is clipped to ``[-1, 1]``.
    """
    power_ratio = 10 ** (snr_db / 10)
    noise_std = np.sqrt(np.mean(audio ** 2) / power_ratio)
    white = np.random.randn(*audio.shape).astype(np.float32)
    return np.clip(audio + white * noise_std, -1.0, 1.0)
186
 
187
 
188
def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
    """Add a random, magnitude-proportional perturbation in the STFT domain.

    The signal is analysed in Hann-windowed frames (1024 samples, hop 256).
    For every frame each frequency bin receives a perturbation whose size is
    ``strength * |bin| * N(0, 1)`` with a uniformly random phase. Only the
    perturbation is transformed back and overlap-added; it is normalised by
    the summed analysis window and added to the untouched input.

    Compared with the earlier implementation this fixes three defects:

    * the windowed frames were averaged by frame count, which attenuated the
      whole signal to roughly half amplitude;
    * samples not covered by any frame (the tail, or any clip shorter than one
      frame) came back as silence - they are now passed through unchanged;
    * the last frame was skipped when it fit the signal exactly.

    Args:
        audio: 1-D mono samples, expected in ``[-1, 1]``.
        sr: Sample rate. Unused; kept so all protection functions share one
            signature.
        strength: Relative perturbation size per frequency bin. ``0`` returns
            the (clipped) input.

    Returns:
        The perturbed signal, clipped to ``[-1, 1]``, same length as ``audio``.
    """
    from numpy.fft import rfft, irfft

    n_fft, hop = 1024, 256
    n_samples = len(audio)
    window = np.hanning(n_fft)  # loop-invariant, build once

    delta = np.zeros(n_samples, dtype=np.float64)    # overlap-added perturbation
    win_sum = np.zeros(n_samples, dtype=np.float64)  # overlap-added window

    # "+ 1" so a frame that ends exactly on the last sample is included.
    for start in range(0, n_samples - n_fft + 1, hop):
        stop = start + n_fft
        spec = rfft(audio[start:stop] * window)
        mag = np.abs(spec)
        perturb = np.random.randn(*mag.shape) * strength * mag
        phase = np.exp(1j * np.random.uniform(0, 2 * np.pi, mag.shape))
        delta[start:stop] += irfft(perturb * phase, n=n_fft)
        win_sum[start:stop] += window

    # Flooring the divisor at 1 keeps the perturbation from being amplified at
    # the file edges where the window sum is tiny, and leaves uncovered samples
    # (delta == 0 there) identical to the input.
    protected = audio + delta / np.maximum(win_sum, 1.0)

    out_dtype = audio.dtype if np.issubdtype(audio.dtype, np.floating) else np.float32
    return np.clip(protected, -1.0, 1.0).astype(out_dtype)
204
 
205
 
206
# Registry of live protection methods: UI method name -> function implementing it.
# Both functions take (audio, sr, <strength-like value>) and return the protected audio.
PROTECT_FN = {"GR-Noise": apply_grnoise, "Spectral": apply_spectral}
207
+
208
+
209
+ # ── plotly figures ────────────────────────────────────────────────────────────
210
+
211
def make_sim_bar(model_name: str) -> go.Figure:
    """Build the per-model bar chart of SIM under every protection condition.

    Reads the ``sims`` mapping of ``GALLERY_MODELS[model_name]``; each bar is
    labelled with its SIM value and, for every condition except "Clean", the
    drop relative to the clean score. A dotted line marks the clean baseline.
    """
    sims = GALLERY_MODELS[model_name]["sims"]
    baseline = sims["Clean"]
    conditions = list(sims)
    scores = [sims[name] for name in conditions]

    # One colour per condition, in the order the conditions are declared.
    palette = [
        "#1565c0",  # Clean
        "#6a1b9a",  # SafeSpeech
        "#1b5e20",  # Enkidu
        "#e65100",  # Spectral
        "#37474f",  # GR-Noise
        "#880e4f",  # AntiFake
    ]

    # Bar annotations: value, plus the drop versus the clean baseline.
    annotations = []
    for name, score in sims.items():
        label = f"{score:.3f}"
        if name != "Clean":
            label = f"{score:.3f}<br>↓{baseline - score:.3f}"
        annotations.append(label)

    bars = go.Bar(
        x=conditions,
        y=scores,
        marker_color=palette,
        text=annotations,
        textposition="outside",
        cliponaxis=False,
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(
            text=f"<b>{model_name}</b> — Speaker Similarity Under Each Protection",
            font=dict(size=14),
        ),
        # 20 % headroom so the outside text labels are not cut off.
        yaxis=dict(title="SIM (Speaker Similarity)", range=[0, max(scores) * 1.2]),
        xaxis=dict(title="Condition"),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        margin=dict(t=60, b=40, l=50, r=20),
        height=320,
        showlegend=False,
    )
    fig.add_hline(
        y=baseline,
        line_dash="dot",
        line_color="#1565c0",
        annotation_text="Clean baseline",
        annotation_position="top right",
        annotation_font_size=10,
    )
    return fig
251
+
252
+
253
def make_results_bar(metric: str = "SIM", ascending: bool = False) -> go.Figure:
    """Build a horizontal bar chart ranking the leaderboard models by ``metric``.

    Models without a value for the metric are left out. With the default
    ``ascending=False`` the best model (highest or lowest value, depending on
    the metric's direction in ``METRIC_META``) is drawn at the top; passing
    ``ascending=True`` flips the order. Bars are coloured by ``_col_colors``.
    """
    metric_label, higher_is_better = METRIC_META[metric]

    scored = (row for row in LEADERBOARD_ROWS if row.get(metric) is not None)
    # Sort descending exactly when "better direction" and "ascending" disagree.
    ranked = sorted(
        scored,
        key=lambda row: row[metric],
        reverse=(higher_is_better != ascending),
    )

    names = []
    scores = []
    for row in ranked:
        names.append(row["model"])
        scores.append(row[metric])

    # Rows with a missing value were filtered out above, so every score formats.
    labels = [f"{score:.3f}" for score in scores]

    bars = go.Bar(
        x=scores,
        y=names,
        orientation="h",
        marker_color=_col_colors(scores, higher_is_better),
        marker_line_color="#999",
        marker_line_width=0.5,
        text=labels,
        textposition="outside",
        cliponaxis=False,
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(text=f"<b>Model Ranking by {metric_label}</b>", font=dict(size=14)),
        xaxis=dict(title=metric_label),
        # Reversed so the first ranked model appears at the top of the chart.
        yaxis=dict(autorange="reversed"),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        margin=dict(t=50, b=40, l=120, r=80),
        height=520,
        showlegend=False,
    )
    return fig
286
+
287
+
288
def make_prot_heatmap() -> go.Figure:
    """Build the model × protection-method heatmap of SIM values.

    Rows are the entries of ``PROT_ROWS`` sorted by clean SIM (highest first);
    columns are the clean condition followed by the five protection methods.
    Missing values stay ``None`` in the data and are shown as "—" in the cell
    text. A dotted vertical line separates the "Clean" column from the rest.
    """
    # (key in PROT_ROWS, label shown on the x axis)
    columns = [
        ("Clean", "Clean"),
        ("SafeSpeech", "SafeSpeech"),
        ("Enkidu", "Enkidu"),
        ("Spectral", "Spectral"),
        ("GRNoise", "GR-Noise"),
        ("AntiFake", "AntiFake"),
    ]
    keys = [key for key, _ in columns]
    axis_labels = [label for _, label in columns]

    ordered = sorted(PROT_ROWS, key=lambda row: row["Clean"], reverse=True)
    model_names = [row["model"] for row in ordered]

    z: list[list] = [[row.get(key) for key in keys] for row in ordered]
    text_vals: list[list[str]] = [
        [f"{cell:.3f}" if cell is not None else "—" for cell in cells]
        for cells in z
    ]

    heatmap = go.Heatmap(
        z=z,
        x=axis_labels,
        y=model_names,
        text=text_vals,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=[
            [0.0, "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5, "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0, "#1b5e20"],
        ],
        zmin=0.0,
        zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heatmap)

    # Dotted separator between the "Clean" column and the protected columns.
    fig.add_shape(
        type="line",
        x0=0.5,
        x1=0.5,
        y0=-0.5,
        y1=len(model_names) - 0.5,
        line=dict(color="#555", width=2, dash="dot"),
        xref="x",
        yref="y",
    )

    fig.update_layout(
        title=dict(
            text="<b>Protection Robustness — Speaker Similarity (SIM) on LibriTTS</b><br>"
                 "<sup>Green = high SIM (clone faithful). Red = low SIM (protection effective). "
                 "Drop from Clean → protected shows protection strength.</sup>",
            font=dict(size=13),
        ),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white",
        plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
346
+
347
+
348
def make_waveform_figure(
    original: np.ndarray, protected: np.ndarray, sr: int
) -> go.Figure:
    """Build an overlay plot of the original and protected waveforms.

    Only the first five seconds (or the shorter of the two signals, if that
    is less) are drawn, against a time axis in seconds.
    """
    n_points = min(len(original), len(protected), sr * 5)  # cap at 5 s
    seconds = np.arange(n_points) / sr

    traces = (
        ("Original", original, "#1565c0"),
        ("Protected", protected, "#c62828"),
    )

    fig = go.Figure()
    for label, samples, colour in traces:
        fig.add_trace(
            go.Scatter(
                x=seconds,
                y=samples[:n_points],
                name=label,
                line=dict(color=colour, width=1),
                opacity=0.85,
            )
        )

    fig.update_layout(
        title=dict(text="<b>Waveform Comparison</b> (first 5 s)", font=dict(size=13)),
        xaxis=dict(title="Time (s)"),
        yaxis=dict(title="Amplitude", range=[-1.05, 1.05]),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        legend=dict(orientation="h", y=1.08, x=0.5, xanchor="center"),
        margin=dict(t=60, b=40, l=55, r=20),
        height=220,
    )
    return fig
379
+
380
 
381
+ # ── gallery callback ──────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
382
 
383
def load_gallery(model_name: str):
    """Collect everything the gallery tab shows for one model.

    Returns a 7-tuple: reference wav path, target wav path, clean-clone path,
    SafeSpeech-protected reference path, protected-clone path, a Markdown note
    with the clean vs. SafeSpeech SIM (and the drop), and the per-model SIM
    bar chart.
    """
    entry = GALLERY_MODELS[model_name]
    scores = entry["sims"]
    clean_sim, prot_sim = scores["Clean"], scores["SafeSpeech"]
    drop = clean_sim - prot_sim

    note_md = "".join([
        f"**Clean SIM:** {clean_sim:.3f} &nbsp;→&nbsp; ",
        f"**Protected SIM (SafeSpeech):** {prot_sim:.3f} &nbsp;",
        f"*(drop: {drop:.3f})*",
    ])

    clean_clone = os.path.join(SAMPLES, entry["clean"])
    protected_ref = os.path.join(SAMPLES, "protected_safespeech.wav")
    protected_clone = os.path.join(SAMPLES, entry["prot"])

    return (
        REF_WAV,
        TARGET_WAV,
        clean_clone,
        protected_ref,
        protected_clone,
        note_md,
        make_sim_bar(model_name),
    )
402
 
403
+
404
+ # ── live protection callback ──────────────────────────────────────────────────
405
 
406
  def run_protection(audio_input, method: str, strength: float):
407
  if audio_input is None:
408
+ return None, None, "Upload an audio file first.", None
409
 
410
  sr_in, data = audio_input
411
  audio = data.astype(np.float32)
412
  if audio.max() > 1.0:
413
+ audio /= 32768.0
414
  if audio.ndim > 1:
415
  audio = audio.mean(axis=1)
416
 
 
423
  elapsed = time.time() - t0
424
 
425
  snr = _snr(audio, protected)
426
+ prot_int = (protected * 32767).astype(np.int16)
427
 
428
  metrics_md = (
429
  f"| Metric | Value |\n|--------|-------|\n"
430
  f"| SNR (dB) | {snr:.1f} |\n"
431
+ f"| Processing time | {elapsed * 1000:.0f} ms |\n"
432
  f"| Method | {method} |\n"
433
  )
434
 
435
+ waveform_fig = make_waveform_figure(audio, protected, sr_in)
436
+ return (sr_in, audio.copy()), (sr_in, prot_int), metrics_md, waveform_fig
437
+
438
+
439
def update_strength_label(method: str) -> dict:
    """Reconfigure the strength slider to match the chosen protection method.

    ``"GR-Noise"`` turns the slider into a target-SNR control (10-40 dB,
    default 25), the same settings the slider is created with in
    ``build_demo``. Any other method turns it into a spectral-strength
    percentage (1-30, default 5).

    Returns:
        The ``gr.update(...)`` payload to apply to the slider component.
    """
    if method != "GR-Noise":
        spectral_cfg = dict(
            label="Spectral Strength (%) — higher = stronger perturbation",
            info="5% is nearly inaudible. 20%+ may cause artifacts.",
            minimum=1,
            maximum=30,
            value=5,
            step=1,
        )
        return gr.update(**spectral_cfg)

    snr_cfg = dict(
        label="Target SNR (dB) — lower = stronger, more audible",
        info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
        minimum=10,
        maximum=40,
        value=25,
        step=1,
    )
    return gr.update(**snr_cfg)
452
+
453
 
454
+ # ── results callbacks ─────────────────────────────────────────────────────────
455
 
456
def update_results_bar(metric: str) -> go.Figure:
    """Metric-dropdown callback: rebuild the results bar chart for ``metric``.

    Delegates to ``make_results_bar``; ``build_demo`` binds this function to
    the ``metric_dd.change`` event.
    """
    figure = make_results_bar(metric)
    return figure
458
+
459
+
460
+ # ── UI constants ──────────────────────────────────────────────────────────────
461
 
462
# Custom stylesheet passed to gr.Blocks(css=CSS) in build_demo: centres the
# #title block, hides the page footer, and styles elements tagged with the
# "note-box" class (used by the gallery's SIM summary Markdown).
CSS = """
#title { text-align: center; }
footer { display: none !important; }
.note-box { font-size: 1.05em; background: #f0f4ff; border-radius: 8px; padding: 8px 12px; }
"""
467
 
468
  INTRO_MD = """
469
  <div id="title">
470
 
471
+ # RVCBench — Voice Cloning & Protection Benchmark
472
 
473
  **Can audio protection prevent your voice from being cloned?**
 
474
 
475
  [![Paper](https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg)](https://arxiv.org/abs/2602.00443)
476
  [![Dataset](https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg)](https://huggingface.co/datasets/Nanboy/RVCBench)
 
479
  </div>
480
  """
481
 
482
# Markdown shown at the top of the "Voice Cloning Gallery" tab; explains the
# reference/target setup and what the per-model SIM bar chart represents.
GALLERY_INTRO_MD = """
A voice cloning model uses the **Reference Voice** to clone the **Target Speech**.
When protection (SafeSpeech adversarial perturbation) is applied to the reference first,
the clone degrades — lower speaker similarity means protection is working.

The bar chart below shows the SIM drop under **all 5 protection methods** for the selected model.
"""
489
+
490
# Markdown shown at the top of the "Protect Your Voice" tab; describes the
# protection methods named in the text below (GR-Noise and Spectral).
PROT_INTRO_MD = """
Upload your own audio clip and apply a protection method. The protected audio sounds nearly
identical to humans, but disrupts automatic voice cloning models.

- **GR-Noise** — Gaussian random noise at a chosen SNR level. No surrogate model required.
- **Spectral** — Structured perturbation in the STFT frequency domain.
"""
497
 
498
# Markdown shown at the top of the "Results Explorer" tab.
# Every metric in the guide carries a direction arrow (↑ higher is better,
# ↓ lower is better) so the legend is uniform; SIM, WER and MOS previously had
# none while MCD/RTF/SVA/Emo did. Directions are from the cloning model's
# point of view, matching how the results table ranks models.
RESULTS_INTRO_MD = """
**Metric guide** — SIM: speaker cosine similarity ↑ &nbsp;·&nbsp;
WER: word error rate ↓ &nbsp;·&nbsp; MOS: perceptual quality ↑ &nbsp;·&nbsp;
MCD: mel cepstral distortion ↓ &nbsp;·&nbsp; RTF: real-time factor ↓ &nbsp;·&nbsp;
SVA: speaker verification accuracy ↑ &nbsp;·&nbsp; Emo: emotion match rate ↑

Select a metric to re-rank the 18 models. The heatmap below shows protection robustness
(SIM under each of 5 protection methods).
"""
507
 
508
 
509
+ # ── build demo ────────────────────────────────────────────────────────────────
510
+
511
  def build_demo():
512
  with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
513
  gr.Markdown(INTRO_MD)
514
 
515
  with gr.Tabs():
516
 
517
+ # ── Tab 1: Voice Cloning Gallery ──────────────────────────────────
518
  with gr.Tab("🎧 Voice Cloning Gallery"):
519
+ gr.Markdown(GALLERY_INTRO_MD)
520
 
521
+ model_dd = gr.Dropdown(
522
+ choices=list(GALLERY_MODELS.keys()),
523
+ value="ZipVoice",
524
+ label="Voice Cloning Model",
525
+ )
526
+ load_btn = gr.Button("Load Example", variant="primary")
 
 
 
 
 
 
 
527
 
528
+ sim_note = gr.Markdown("", elem_classes="note-box")
529
 
530
  with gr.Row():
531
  with gr.Column():
532
  gr.Markdown("### 1 · Reference Voice")
533
  gr.Markdown(f"*\"{REF_TEXT}\"*")
534
+ ref_out = gr.Audio(label="Reference (original)", interactive=False)
535
  with gr.Column():
536
  gr.Markdown("### 2 · Target Speech")
537
  gr.Markdown(f"*\"{TARGET_TEXT}\"*")
538
  target_out = gr.Audio(label="Target utterance", interactive=False)
539
 
540
  gr.Markdown("---")
541
+ gr.Markdown("### 3 · Cloning Results — Clean vs. SafeSpeech-Protected")
542
 
543
  with gr.Row():
544
  with gr.Column():
545
  gr.Markdown("#### Without Protection")
546
+ clean_out = gr.Audio(label="Clean clone", interactive=False)
547
  with gr.Column():
548
+ gr.Markdown("#### With SafeSpeech Protection")
549
  prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
550
  prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)
551
 
552
+ gr.Markdown("---")
553
+ gr.Markdown("### 4 · Protection Effectiveness Across All Methods")
554
+ sim_chart = gr.Plot(label="", show_label=False)
555
 
556
+ gallery_outputs = [ref_out, target_out, clean_out, prot_ref_out,
557
+ prot_clone_out, sim_note, sim_chart]
558
+ load_btn.click(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
559
+ demo.load(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
560
+ model_dd.change(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
 
 
 
 
 
561
 
562
+ # ── Tab 2: Protect Your Voice ─────────────────────────────────────
563
  with gr.Tab("🔒 Protect Your Voice"):
564
+ gr.Markdown(PROT_INTRO_MD)
565
 
566
  with gr.Row():
567
  audio_in = gr.Audio(
568
  label="Upload your audio (wav / mp3, ≤ 30 s)",
569
+ type="numpy", scale=3,
 
570
  )
571
  with gr.Column(scale=1):
572
  method_dd = gr.Dropdown(
 
576
  )
577
  strength_sl = gr.Slider(
578
  minimum=10, maximum=40, value=25, step=1,
579
+ label="Target SNR (dB) lower = stronger, more audible",
580
+ info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
581
  )
582
  protect_btn = gr.Button("Apply Protection", variant="primary")
583
 
584
  with gr.Row():
585
+ orig_out = gr.Audio(label="Original", interactive=False)
586
+ prot_live = gr.Audio(label="Protected", interactive=False)
587
 
588
+ metrics_out = gr.Markdown("")
589
+ waveform_plot = gr.Plot(label="Waveform Comparison", show_label=False)
590
 
591
+ method_dd.change(fn=update_strength_label, inputs=[method_dd],
592
+ outputs=[strength_sl])
593
  protect_btn.click(
594
  fn=run_protection,
595
  inputs=[audio_in, method_dd, strength_sl],
596
+ outputs=[orig_out, prot_live, metrics_out, waveform_plot],
597
  )
598
 
599
  gr.Markdown(
600
+ "> **Note:** Full voice cloning inference (SafeSpeech, Enkidu, AntiFake) "
601
+ "requires surrogate models and is not included in this Space due to compute "
602
+ "constraints. See the "
603
+ "[GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) for the full pipeline."
604
  )
605
 
606
+ # ── Tab 3: Results Explorer ───────────────────────────────────────
607
+ with gr.Tab("📊 Results Explorer"):
608
+ gr.Markdown(RESULTS_INTRO_MD)
609
+
610
+ metric_dd = gr.Dropdown(
611
+ choices=list(METRIC_META.keys()),
612
+ value="SIM",
613
+ label="Sort by metric",
 
 
614
  )
615
+ bar_chart = gr.Plot(label="", show_label=False)
616
+ metric_dd.change(fn=update_results_bar, inputs=[metric_dd],
617
+ outputs=[bar_chart])
618
+ demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])
619
+
620
+ gr.Markdown("---")
621
+ gr.Markdown(
622
+ "### Protection Robustness Heatmap\n"
623
+ "SIM under each of 5 protection methods — drop from **Clean** indicates "
624
+ "more effective protection."
625
  )
626
+ prot_heatmap = gr.Plot(label="", show_label=False)
627
+ demo.load(fn=make_prot_heatmap, outputs=[prot_heatmap])
628
 
629
+ # ── Tab 4: About ──────────────────────────────────────────────────
630
  with gr.Tab("ℹ️ About"):
631
  gr.Markdown("""
632
  ## About RVCBench
 
635
  against audio protection methods.
636
 
637
  ### What it measures
638
+ - How well **18+ modern zero-shot TTS/VC models** can clone a speaker's voice
639
+ - How effectively **5 audio protection methods** (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
640
+ prevent cloning across **10 datasets** and **7 evaluation metrics**
641
 
642
  ### Resources
643
 
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  gradio>=5.0,<6
2
  numpy>=1.24
3
  soundfile>=0.12
 
 
1
  gradio>=5.0,<6
2
  numpy>=1.24
3
  soundfile>=0.12
4
+ plotly>=5.0