Spaces:

Nanboy
/

RVCBench

Running

App Files Files Community

Nanboy committed 7 days ago

Commit

fc16bfb

verified ·

1 Parent(s): 41c0fa6

Optimize Space: interactive Plotly charts, waveform viz, protection heatmap, fix gallery

Browse files

Files changed (2) hide show

app.py +457 -189
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -1,10 +1,12 @@
-"""RVCBench — Interactive HuggingFace Space demo.
 Tabs
 ────
 1. Voice Cloning Gallery   – hear pre-computed clean vs. protected clones
-2. Protect Your Voice      – upload audio, apply a protection method live, compare
-3. Leaderboard             – sortable benchmark results table
 """
 from __future__ import annotations
@@ -15,56 +17,150 @@ import time
 import gradio as gr
 import numpy as np
 import soundfile as sf
 # ── paths ────────────────────────────────────────────────────────────────────
-SAMPLES = os.path.join(os.path.dirname(__file__), "samples", "1089")
-REF_WAV    = os.path.join(SAMPLES, "reference.wav")
-TARGET_WAV = os.path.join(SAMPLES, "target.wav")
-REF_TEXT   = "But her long fair hair was girlish: and girlish, and touched with the wonder of mortal beauty, her face."
 TARGET_TEXT = "A great fisher of souls!"
-MODELS = {
-    "ZipVoice (SIM 0.579)":   ("zipvoice_clean.wav",   "zipvoice_safespeech.wav"),
-    "MOSS-TTSD (SIM 0.492)":  ("moss_ttsd_clean.wav",  "moss_ttsd_safespeech.wav"),
-    "MGM-Omni (SIM 0.539)":   ("mgm_omni_clean.wav",   "mgm_omni_safespeech.wav"),
-    "OZSpeech (SIM 0.388)":   ("ozspeech_clean.wav",   "ozspeech_safespeech.wav"),
-    "StyleTTS 2 (SIM 0.228)": ("styletts2_clean.wav",  "styletts2_safespeech.wav"),
 }
-PROTECTION_SAMPLES = {
-    "SafeSpeech": "protected_safespeech.wav",
-    "GR-Noise":   "protected_grnoise.wav",
-}
-# ── leaderboard data ──────────────────────────────────────────────────────────
-LEADERBOARD = [
-    ["1", "Qwen3-TTS",    "0.614", "0.052", "4.39", "5.79", "2.02",  "0.974", "0.731"],
-    ["2", "IndexTTS",     "0.606", "0.052", "4.06", "6.61", "2.23",  "0.972", "0.693"],
-    ["3", "CosyVoice 2",  "0.602", "0.175", "4.39", "6.17", "4.58",  "0.974", "0.729"],
-    ["4", "ZipVoice",     "0.579", "0.053", "4.13", "7.09", "1.46",  "0.952", "0.675"],
-    ["5", "MaskGCT",      "0.570", "0.088", "3.93", "6.91", "1.36",  "0.939", "0.682"],
-    ["6", "GLM-TTS",      "0.570", "0.087", "4.08", "6.41", "1.74",  "0.951", "0.678"],
-    ["7", "F5-TTS",       "0.559", "0.116", "3.99", "6.96", "0.61",  "0.937", "0.676"],
-    ["8", "Higgs Audio",  "0.559", "0.250", "4.30", "6.06", "1.42",  "0.941", "0.717"],
-    ["9", "MGM-Omni",     "0.539", "0.095", "4.28", "5.82", "0.84",  "0.933", "0.676"],
-    ["10","PlayDiffusion","0.506", "0.055", "4.15", "8.06", "0.73",  "0.936", "0.681"],
-    ["11","MOSS-TTSD",    "0.492", "0.383", "4.10", "7.09", "—",     "0.876", "0.667"],
-    ["12","VibeVoice",    "0.480", "0.228", "3.83", "6.76", "1.86",  "0.852", "0.624"],
-    ["13","FishSpeech",   "0.472", "0.166", "4.37", "6.47", "3.61",  "0.907", "0.682"],
-    ["14","XTTS-v2",      "0.454", "0.073", "3.81", "8.62", "0.62",  "0.908", "0.639"],
-    ["15","SparkTTS",     "0.408", "0.326", "4.06", "5.83", "1.56",  "0.764", "0.672"],
-    ["16","OZSpeech",     "0.388", "0.060", "3.21", "6.87", "8.75",  "0.840", "0.636"],
-    ["17","OpenVoice V2", "0.244", "0.075", "4.30", "7.06", "0.08",  "0.474", "0.601"],
-    ["18","StyleTTS 2",   "0.228", "0.049", "4.30", "6.81", "0.11",  "0.388", "0.589"],
 ]
-HEADERS = ["#", "Model", "SIM ↑", "WER ↓", "MOS ↑", "MCD ↓", "RTF ↓", "SVA ↑", "Emo ↑"]
-# ── protection helpers ────────────────────────────────────────────────────────
 def _load(path: str) -> tuple[np.ndarray, int]:
     audio, sr = sf.read(path, dtype="float32")
@@ -73,107 +169,248 @@ def _load(path: str) -> tuple[np.ndarray, int]:
     return audio, sr
-def _to_bytes(audio: np.ndarray, sr: int) -> bytes:
-    buf = io.BytesIO()
-    sf.write(buf, audio, sr, format="WAV", subtype="PCM_16")
-    buf.seek(0)
-    return buf.read()
 def _snr(original: np.ndarray, protected: np.ndarray) -> float:
     noise = protected - original
-    signal_power = np.mean(original ** 2)
-    noise_power  = np.mean(noise ** 2)
-    if noise_power < 1e-12:
-        return float("inf")
-    return float(10 * np.log10(signal_power / noise_power))
 def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
-    signal_power = np.mean(audio ** 2)
-    noise_power  = signal_power / (10 ** (snr_db / 10))
-    noise = np.random.randn(*audio.shape).astype(np.float32) * np.sqrt(noise_power)
     return np.clip(audio + noise, -1.0, 1.0)
 def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
-    """Frequency-domain perturbation: add structured noise in the STFT domain."""
     from numpy.fft import rfft, irfft
-    n_fft = 1024
-    hop   = n_fft // 4
-    frames = []
     for start in range(0, len(audio) - n_fft, hop):
         frame = audio[start:start + n_fft] * np.hanning(n_fft).astype(np.float32)
-        spec  = rfft(frame)
-        mag   = np.abs(spec)
         perturb = np.random.randn(*mag.shape).astype(np.float32) * strength * mag
-        spec_p  = spec + perturb * np.exp(1j * np.random.uniform(0, 2 * np.pi, mag.shape))
-        frames.append((start, irfft(spec_p)))
-    out = np.zeros_like(audio)
-    cnt = np.zeros_like(audio)
-    for start, f in frames:
-        end = start + n_fft
-        out[start:end] += f[:n_fft].astype(np.float32)
-        cnt[start:end] += 1
     cnt = np.maximum(cnt, 1)
     return np.clip(out / cnt, -1.0, 1.0)
-PROTECT_FN = {
-    "GR-Noise":       apply_grnoise,
-    "Spectral":       apply_spectral,
-}
-# ── tab 1: gallery ────────────────────────────────────────────────────────────
-def load_gallery(model_label: str, protection: str):
-    clean_file, safe_file = MODELS[model_label]
-    prot_audio_file = PROTECTION_SAMPLES.get(protection)
-    ref_audio   = REF_WAV
-    target_audio = TARGET_WAV
-    clean_clone  = os.path.join(SAMPLES, clean_file)
-    prot_ref     = os.path.join(SAMPLES, prot_audio_file) if prot_audio_file else None
-    prot_clone   = os.path.join(SAMPLES, safe_file)
-    # Compute SIM drop note
-    clean_sim = float(model_label.split("SIM ")[-1].rstrip(")"))
-    sim_lookup = {
-        "ZipVoice (SIM 0.579)": {"SafeSpeech": 0.287, "GR-Noise": 0.258},
-        "MOSS-TTSD (SIM 0.492)": {"SafeSpeech": 0.242, "GR-Noise": 0.247},
-        "MGM-Omni (SIM 0.539)":  {"SafeSpeech": 0.184, "GR-Noise": 0.229},
-        "OZSpeech (SIM 0.388)":  {"SafeSpeech": 0.156, "GR-Noise": 0.148},
-        "StyleTTS 2 (SIM 0.228)": {"SafeSpeech": 0.089, "GR-Noise": 0.030},
-    }
-    prot_sim = sim_lookup.get(model_label, {}).get(protection, None)
-    drop = clean_sim - prot_sim if prot_sim else None
     note_md = (
         f"**Clean SIM:** {clean_sim:.3f} &nbsp;→&nbsp; "
-        f"**Protected SIM ({protection}):** {prot_sim:.3f} &nbsp;"
         f"*(drop: {drop:.3f})*"
-        if drop is not None else ""
     )
     return (
-        ref_audio,
-        target_audio,
-        clean_clone,
-        prot_ref,
-        prot_clone,
         note_md,
     )
-# ── tab 2: live protection ────────────────────────────────────────────────────
 def run_protection(audio_input, method: str, strength: float):
     if audio_input is None:
-        return None, None, "Upload an audio file first."
     sr_in, data = audio_input
     audio = data.astype(np.float32)
     if audio.max() > 1.0:
-        audio = audio / 32768.0
     if audio.ndim > 1:
         audio = audio.mean(axis=1)
@@ -186,34 +423,54 @@ def run_protection(audio_input, method: str, strength: float):
     elapsed = time.time() - t0
     snr = _snr(audio, protected)
-    protected_int = (protected * 32767).astype(np.int16)
     metrics_md = (
         f"| Metric | Value |\n|--------|-------|\n"
         f"| SNR (dB) | {snr:.1f} |\n"
-        f"| Processing time | {elapsed*1000:.0f} ms |\n"
         f"| Method | {method} |\n"
     )
-    return (sr_in, audio.copy()), (sr_in, protected_int), metrics_md
-# ── build UI ──────────────────────────────────────────────────────────────────
 CSS = """
 #title { text-align: center; }
-.metric-box { font-size: 1.1em; }
-.tab-header { font-weight: bold; }
 footer { display: none !important; }
 """
 INTRO_MD = """
 <div id="title">
-# RVCBench — Voice Cloning & Protection Demo
 **Can audio protection prevent your voice from being cloned?**
-This demo lets you hear the answer.
 [![Paper](https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg)](https://arxiv.org/abs/2602.00443)
 [![Dataset](https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg)](https://huggingface.co/datasets/Nanboy/RVCBench)
@@ -222,91 +479,94 @@ This demo lets you hear the answer.
 </div>
 """
-GALLERY_MD = """
-**How it works:** A voice cloning model uses the *Reference Voice* to clone the *Target Speech*
-(what it wants the speaker to say). When protection is applied to the reference first,
-the clone degrades — the speaker sounds wrong or the speech becomes unintelligible.
 """
-PROTECTION_MD = """
-Upload your own audio clip and apply a protection method in real-time.
-The protected audio sounds nearly identical to humans but disrupts voice cloning models.
-- **GR-Noise** — Gaussian random noise at a target SNR level. No surrogate model needed.
-- **Spectral** — Structured perturbation in the frequency domain.
 """
 def build_demo():
     with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
         gr.Markdown(INTRO_MD)
         with gr.Tabs():
-            # ── Tab 1: Gallery ──────────────────────────────────────────────
             with gr.Tab("🎧  Voice Cloning Gallery"):
-                gr.Markdown(GALLERY_MD)
-                with gr.Row():
-                    model_dd = gr.Dropdown(
-                        choices=list(MODELS.keys()),
-                        value=list(MODELS.keys())[0],
-                        label="Voice Cloning Model",
-                        scale=2,
-                    )
-                    prot_dd = gr.Dropdown(
-                        choices=["SafeSpeech", "GR-Noise"],
-                        value="SafeSpeech",
-                        label="Protection Method",
-                        scale=1,
-                    )
-                sim_note = gr.Markdown("", elem_classes="metric-box")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("### 1 · Reference Voice")
                         gr.Markdown(f"*\"{REF_TEXT}\"*")
-                        ref_out    = gr.Audio(label="Reference (original)", interactive=False)
                     with gr.Column():
                         gr.Markdown("### 2 · Target Speech")
                         gr.Markdown(f"*\"{TARGET_TEXT}\"*")
                         target_out = gr.Audio(label="Target utterance", interactive=False)
                 gr.Markdown("---")
-                gr.Markdown("### Cloning Results")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("#### Without Protection")
-                        clean_out = gr.Audio(label="Clean clone (threat)", interactive=False)
                     with gr.Column():
-                        gr.Markdown("#### With Protection")
                         prot_ref_out   = gr.Audio(label="Protected reference", interactive=False)
                         prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)
-                load_btn = gr.Button("Load Example", variant="primary")
-                load_btn.click(
-                    fn=load_gallery,
-                    inputs=[model_dd, prot_dd],
-                    outputs=[ref_out, target_out, clean_out, prot_ref_out, prot_clone_out, sim_note],
-                )
-                demo.load(
-                    fn=load_gallery,
-                    inputs=[model_dd, prot_dd],
-                    outputs=[ref_out, target_out, clean_out, prot_ref_out, prot_clone_out, sim_note],
-                )
-            # ── Tab 2: Live Protection ──────────────────────────────────────
             with gr.Tab("🔒  Protect Your Voice"):
-                gr.Markdown(PROTECTION_MD)
                 with gr.Row():
                     audio_in = gr.Audio(
                         label="Upload your audio (wav / mp3, ≤ 30 s)",
-                        type="numpy",
-                        scale=3,
                     )
                     with gr.Column(scale=1):
                         method_dd = gr.Dropdown(
@@ -316,49 +576,57 @@ def build_demo():
                         )
                         strength_sl = gr.Slider(
                             minimum=10, maximum=40, value=25, step=1,
-                            label="Strength (SNR dB for GR-Noise; intensity × 100 for Spectral)",
-                            info="Lower = stronger protection, more audible artifacts.",
                         )
                         protect_btn = gr.Button("Apply Protection", variant="primary")
                 with gr.Row():
-                    orig_out   = gr.Audio(label="Original", interactive=False)
-                    prot_live  = gr.Audio(label="Protected", interactive=False)
-                metrics_out = gr.Markdown("", elem_classes="metric-box")
                 protect_btn.click(
                     fn=run_protection,
                     inputs=[audio_in, method_dd, strength_sl],
-                    outputs=[orig_out, prot_live, metrics_out],
                 )
                 gr.Markdown(
-                    "> **Note:** Live voice cloning inference is not included in this Space due to "
-                    "model size constraints. See the [GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) "
-                    "for the full pipeline with 18+ VC models."
                 )
-            # ── Tab 3: Leaderboard ──────────────────────────────────────────
-            with gr.Tab("📊  Leaderboard"):
-                gr.Markdown(
-                    "### Benchmark Results — LibriTTS (clean prompts)\n"
-                    "Sorted by Speaker Similarity (SIM ↑). "
-                    "Full results including protection robustness and cross-dataset generalisation: "
-                    "[GitHub README](https://github.com/Nanboy-Ronan/RVCBench#benchmark-results).\n\n"
-                    "> **Metric guide** · SIM: speaker similarity ↑ · WER: word error rate ↓ · "
-                    "MOS: perceptual score ↑ · MCD: mel cepstral distortion ↓ · "
-                    "RTF: real-time factor ↓ · SVA: speaker verification accuracy ↑ · Emo: emotion match ↑"
                 )
-                gr.DataFrame(
-                    value=LEADERBOARD,
-                    headers=HEADERS,
-                    datatype=["number", "str"] + ["number"] * 7,
-                    interactive=False,
-                    wrap=False,
                 )
-            # ── Tab 4: About ────────────────────────────────────────────────
             with gr.Tab("ℹ️  About"):
                 gr.Markdown("""
 ## About RVCBench
@@ -367,9 +635,9 @@ def build_demo():
 against audio protection methods.
 ### What it measures
-- How well 18+ modern zero-shot TTS/VC models can clone a speaker's voice
-- How effectively 5 audio protection methods (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
-  prevent cloning across 10 datasets and 7 evaluation metrics
 ### Resources

+"""RVCBench — Interactive HuggingFace Space demo (v2).
 Tabs
 ────
 1. Voice Cloning Gallery   – hear pre-computed clean vs. protected clones
+                             + protection-effectiveness bar chart for all 5 methods
+2. Protect Your Voice      – upload audio, apply protection, see waveform comparison
+3. Results Explorer        – interactive bar chart + protection robustness heatmap
+4. About                   – paper, citation, resources
 """
 from __future__ import annotations
 import gradio as gr
 import numpy as np
+import plotly.graph_objects as go
 import soundfile as sf
 # ── paths ────────────────────────────────────────────────────────────────────
+SAMPLES     = os.path.join(os.path.dirname(__file__), "samples", "1089")
+REF_WAV     = os.path.join(SAMPLES, "reference.wav")
+TARGET_WAV  = os.path.join(SAMPLES, "target.wav")
+REF_TEXT    = ("But her long fair hair was girlish: and girlish, and touched "
+               "with the wonder of mortal beauty, her face.")
 TARGET_TEXT = "A great fisher of souls!"
+# ── gallery models (audio samples available for SafeSpeech protection) ────────
+GALLERY_MODELS = {
+    "ZipVoice": dict(
+        clean="zipvoice_clean.wav",
+        prot="zipvoice_safespeech.wav",
+        sims={"Clean": 0.579, "SafeSpeech": 0.287, "Enkidu": 0.435,
+              "Spectral": 0.262, "GR-Noise": 0.258, "AntiFake": 0.543},
+    ),
+    "MOSS-TTSD": dict(
+        clean="moss_ttsd_clean.wav",
+        prot="moss_ttsd_safespeech.wav",
+        sims={"Clean": 0.492, "SafeSpeech": 0.242, "Enkidu": 0.335,
+              "Spectral": 0.216, "GR-Noise": 0.247, "AntiFake": 0.453},
+    ),
+    "MGM-Omni": dict(
+        clean="mgm_omni_clean.wav",
+        prot="mgm_omni_safespeech.wav",
+        sims={"Clean": 0.539, "SafeSpeech": 0.184, "Enkidu": 0.316,
+              "Spectral": 0.166, "GR-Noise": 0.229, "AntiFake": 0.491},
+    ),
+    "OZSpeech": dict(
+        clean="ozspeech_clean.wav",
+        prot="ozspeech_safespeech.wav",
+        sims={"Clean": 0.388, "SafeSpeech": 0.156, "Enkidu": 0.187,
+              "Spectral": 0.147, "GR-Noise": 0.148, "AntiFake": 0.337},
+    ),
+    "StyleTTS 2": dict(
+        clean="styletts2_clean.wav",
+        prot="styletts2_safespeech.wav",
+        sims={"Clean": 0.228, "SafeSpeech": 0.089, "Enkidu": 0.125,
+              "Spectral": 0.081, "GR-Noise": 0.030, "AntiFake": 0.207},
+    ),
 }
+# ── benchmark data (LibriTTS, clean prompts) ─────────────────────────────────
+# fmt: off
+LEADERBOARD_ROWS = [
+    dict(model="Qwen3-TTS",    SIM=0.614, WER=0.052, MOS=4.39, MCD=5.79, RTF=2.02,  SVA=0.974, Emo=0.731),
+    dict(model="IndexTTS",     SIM=0.606, WER=0.052, MOS=4.06, MCD=6.61, RTF=2.23,  SVA=0.972, Emo=0.693),
+    dict(model="CosyVoice 2",  SIM=0.602, WER=0.175, MOS=4.39, MCD=6.17, RTF=4.58,  SVA=0.974, Emo=0.729),
+    dict(model="ZipVoice",     SIM=0.579, WER=0.053, MOS=4.13, MCD=7.09, RTF=1.46,  SVA=0.952, Emo=0.675),
+    dict(model="MaskGCT",      SIM=0.570, WER=0.088, MOS=3.93, MCD=6.91, RTF=1.36,  SVA=0.939, Emo=0.682),
+    dict(model="GLM-TTS",      SIM=0.570, WER=0.087, MOS=4.08, MCD=6.41, RTF=1.74,  SVA=0.951, Emo=0.678),
+    dict(model="F5-TTS",       SIM=0.559, WER=0.116, MOS=3.99, MCD=6.96, RTF=0.61,  SVA=0.937, Emo=0.676),
+    dict(model="Higgs Audio",  SIM=0.559, WER=0.250, MOS=4.30, MCD=6.06, RTF=1.42,  SVA=0.941, Emo=0.717),
+    dict(model="MGM-Omni",     SIM=0.539, WER=0.095, MOS=4.28, MCD=5.82, RTF=0.84,  SVA=0.933, Emo=0.676),
+    dict(model="PlayDiffusion",SIM=0.506, WER=0.055, MOS=4.15, MCD=8.06, RTF=0.73,  SVA=0.936, Emo=0.681),
+    dict(model="MOSS-TTSD",    SIM=0.492, WER=0.383, MOS=4.10, MCD=7.09, RTF=None,  SVA=0.876, Emo=0.667),
+    dict(model="VibeVoice",    SIM=0.480, WER=0.228, MOS=3.83, MCD=6.76, RTF=1.86,  SVA=0.852, Emo=0.624),
+    dict(model="FishSpeech",   SIM=0.472, WER=0.166, MOS=4.37, MCD=6.47, RTF=3.61,  SVA=0.907, Emo=0.682),
+    dict(model="XTTS-v2",      SIM=0.454, WER=0.073, MOS=3.81, MCD=8.62, RTF=0.62,  SVA=0.908, Emo=0.639),
+    dict(model="SparkTTS",     SIM=0.408, WER=0.326, MOS=4.06, MCD=5.83, RTF=1.56,  SVA=0.764, Emo=0.672),
+    dict(model="OZSpeech",     SIM=0.388, WER=0.060, MOS=3.21, MCD=6.87, RTF=8.75,  SVA=0.840, Emo=0.636),
+    dict(model="OpenVoice V2", SIM=0.244, WER=0.075, MOS=4.30, MCD=7.06, RTF=0.08,  SVA=0.474, Emo=0.601),
+    dict(model="StyleTTS 2",   SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11,  SVA=0.388, Emo=0.589),
+]
+# Protection robustness — SIM under each method (LibriTTS, all 18 models)
+PROT_ROWS = [
+    dict(model="Qwen3-TTS",    Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
+    dict(model="IndexTTS",     Clean=0.606, SafeSpeech=0.346, Enkidu=0.475, Spectral=0.318, GRNoise=0.392, AntiFake=0.572),
+    dict(model="CosyVoice 2",  Clean=0.602, SafeSpeech=0.321, Enkidu=0.447, Spectral=0.301, GRNoise=0.384, AntiFake=0.549),
+    dict(model="ZipVoice",     Clean=0.579, SafeSpeech=0.287, Enkidu=0.435, Spectral=0.262, GRNoise=0.258, AntiFake=0.543),
+    dict(model="MaskGCT",      Clean=0.570, SafeSpeech=0.303, Enkidu=0.407, Spectral=0.281, GRNoise=0.312, AntiFake=0.530),
+    dict(model="GLM-TTS",      Clean=0.570, SafeSpeech=0.330, Enkidu=0.445, Spectral=0.311, GRNoise=0.388, AntiFake=0.532),
+    dict(model="F5-TTS",       Clean=0.559, SafeSpeech=0.207, Enkidu=0.431, Spectral=0.176, GRNoise=0.137, AntiFake=0.520),
+    dict(model="Higgs Audio",  Clean=0.559, SafeSpeech=0.264, Enkidu=0.435, Spectral=0.236, GRNoise=0.272, AntiFake=0.521),
+    dict(model="MGM-Omni",     Clean=0.539, SafeSpeech=0.184, Enkidu=0.316, Spectral=0.166, GRNoise=0.229, AntiFake=0.491),
+    dict(model="PlayDiffusion",Clean=0.506, SafeSpeech=0.173, Enkidu=None,  Spectral=0.149, GRNoise=0.162, AntiFake=0.466),
+    dict(model="MOSS-TTSD",    Clean=0.492, SafeSpeech=0.242, Enkidu=0.335, Spectral=0.216, GRNoise=0.247, AntiFake=0.453),
+    dict(model="VibeVoice",    Clean=0.480, SafeSpeech=0.272, Enkidu=0.367, Spectral=0.253, GRNoise=0.280, AntiFake=0.442),
+    dict(model="FishSpeech",   Clean=0.472, SafeSpeech=0.238, Enkidu=0.334, Spectral=0.212, GRNoise=0.235, AntiFake=0.439),
+    dict(model="XTTS-v2",      Clean=0.454, SafeSpeech=0.260, Enkidu=0.308, Spectral=0.241, GRNoise=0.237, AntiFake=0.414),
+    dict(model="SparkTTS",     Clean=0.408, SafeSpeech=0.129, Enkidu=0.137, Spectral=0.108, GRNoise=0.062, AntiFake=0.359),
+    dict(model="OZSpeech",     Clean=0.388, SafeSpeech=0.156, Enkidu=0.187, Spectral=0.147, GRNoise=0.148, AntiFake=0.337),
+    dict(model="OpenVoice V2", Clean=0.244, SafeSpeech=0.185, Enkidu=0.188, Spectral=0.180, GRNoise=0.175, AntiFake=0.236),
+    dict(model="StyleTTS 2",   Clean=0.228, SafeSpeech=0.089, Enkidu=0.125, Spectral=0.081, GRNoise=0.030, AntiFake=0.207),
 ]
+# fmt: on
+METRIC_META = {
+    "SIM": ("Speaker Similarity ↑", True),
+    "WER": ("Word Error Rate ↓",    False),
+    "MOS": ("MOS Score ↑",          True),
+    "MCD": ("Mel Cepstral Dist. ↓", False),
+    "RTF": ("Real-Time Factor ↓",   False),
+    "SVA": ("Speaker Verif. Acc. ↑",True),
+    "Emo": ("Emotion Match Rate ↑", True),
+}
+# ── colour helpers ────────────────────────────────────────────────────────────
# Anchor colours (RGB triples) for the bad → mid → good ramp.
_GOOD = (200, 230, 201)  # #c8e6c9 light green
_MID = (255, 249, 196)   # #fff9c4 light yellow
_BAD = (255, 205, 210)   # #ffcdd2 light red


def _interp_color(t: float) -> str:
    """Map a score ``t`` onto the red → yellow → green ramp.

    ``t = 0`` yields the "bad" red, ``t = 0.5`` the yellow midpoint and
    ``t = 1`` the "good" green; values in between are linearly interpolated
    per channel and truncated to integers.

    Returns:
        A CSS-style ``"rgb(r,g,b)"`` string.
    """
    # Pick the half of the ramp that contains t and rescale t to [0, 1] in it.
    if t <= 0.5:
        lo, hi, frac = _BAD, _MID, t / 0.5
    else:
        lo, hi, frac = _MID, _GOOD, (t - 0.5) / 0.5
    r, g, b = (int(a + frac * (c - a)) for a, c in zip(lo, hi))
    return f"rgb({r},{g},{b})"
def _col_colors(values: list, higher_is_better: bool) -> list[str]:
    """Colour each value by its min-max rank within ``values``.

    ``None`` entries, and every entry when there is no spread to normalise
    over (no non-None values, or all of them equal), get a neutral grey.
    Otherwise each value is min-max normalised, flipped when lower is better,
    and passed to ``_interp_color``.

    Returns:
        One ``"rgb(r,g,b)"`` string per input value, in input order.
    """
    neutral = "rgb(245,245,245)"
    present = [v for v in values if v is not None]
    if not present:
        return [neutral] * len(values)

    lo, hi = min(present), max(present)
    if hi == lo:
        return [neutral] * len(values)

    def shade(v) -> str:
        if v is None:
            return neutral
        t = (v - lo) / (hi - lo)
        return _interp_color(t if higher_is_better else 1 - t)

    return [shade(v) for v in values]
+# ── audio helpers ─────────────────────────────────────────────────────────────
 def _load(path: str) -> tuple[np.ndarray, int]:
     audio, sr = sf.read(path, dtype="float32")
     return audio, sr
 def _snr(original: np.ndarray, protected: np.ndarray) -> float:
     noise = protected - original
+    sp = np.mean(original ** 2)
+    np_ = np.mean(noise ** 2)
+    return float("inf") if np_ < 1e-12 else float(10 * np.log10(sp / np_))
+# ── protection functions ──────────────────────────────────────────────────────
 def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
+    sig_pow = np.mean(audio ** 2)
+    noise_pow = sig_pow / (10 ** (snr_db / 10))
+    noise = np.random.randn(*audio.shape).astype(np.float32) * np.sqrt(noise_pow)
     return np.clip(audio + noise, -1.0, 1.0)
 def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
     from numpy.fft import rfft, irfft
+    n_fft, hop = 1024, 256
+    out = np.zeros_like(audio)
+    cnt = np.zeros_like(audio)
     for start in range(0, len(audio) - n_fft, hop):
         frame = audio[start:start + n_fft] * np.hanning(n_fft).astype(np.float32)
+        spec = rfft(frame)
+        mag = np.abs(spec)
         perturb = np.random.randn(*mag.shape).astype(np.float32) * strength * mag
+        spec_p = spec + perturb * np.exp(1j * np.random.uniform(0, 2 * np.pi, mag.shape))
+        f = irfft(spec_p)[:n_fft].astype(np.float32)
+        out[start:start + n_fft] += f
+        cnt[start:start + n_fft] += 1
     cnt = np.maximum(cnt, 1)
     return np.clip(out / cnt, -1.0, 1.0)
+PROTECT_FN = {"GR-Noise": apply_grnoise, "Spectral": apply_spectral}
+# ── plotly figures ────────────────────────────────────────────────────────────
def make_sim_bar(model_name: str) -> go.Figure:
    """Build the per-model bar chart of SIM under each protection method.

    Reads the ``sims`` mapping of ``GALLERY_MODELS[model_name]``; every bar is
    labelled with its SIM and, for all conditions except "Clean", the drop
    relative to the clean score.  A dotted horizontal line marks the clean
    baseline.
    """
    sims = GALLERY_MODELS[model_name]["sims"]
    baseline = sims["Clean"]

    conditions, scores, annotations = [], [], []
    for condition, score in sims.items():
        conditions.append(condition)
        scores.append(score)
        label = f"{score:.3f}"
        if condition != "Clean":
            label += f"<br>↓{baseline - score:.3f}"
        annotations.append(label)

    # One colour per condition, in the insertion order of the `sims` mapping.
    palette = [
        "#1565c0",  # Clean
        "#6a1b9a",  # SafeSpeech
        "#1b5e20",  # Enkidu
        "#e65100",  # Spectral
        "#37474f",  # GR-Noise
        "#880e4f",  # AntiFake
    ]

    bars = go.Bar(
        x=conditions,
        y=scores,
        marker_color=palette,
        text=annotations,
        textposition="outside",
        cliponaxis=False,
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(
            text=f"<b>{model_name}</b> — Speaker Similarity Under Each Protection",
            font=dict(size=14),
        ),
        # 20 % headroom so the outside text labels are not cut off.
        yaxis=dict(title="SIM (Speaker Similarity)", range=[0, max(scores) * 1.2]),
        xaxis=dict(title="Condition"),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        margin=dict(t=60, b=40, l=50, r=20),
        height=320,
        showlegend=False,
    )
    fig.add_hline(
        y=baseline,
        line_dash="dot",
        line_color="#1565c0",
        annotation_text="Clean baseline",
        annotation_position="top right",
        annotation_font_size=10,
    )
    return fig
def make_results_bar(metric: str = "SIM", ascending: bool = False) -> go.Figure:
    """Build a horizontal bar chart ranking models by ``metric``.

    Rows of ``LEADERBOARD_ROWS`` whose value for ``metric`` is ``None`` are
    left out.  By default the best model (per the direction recorded in
    ``METRIC_META``) is listed first; ``ascending=True`` flips the order.
    Bars are tinted by ``_col_colors`` according to how good the value is.
    """
    metric_label, higher_is_better = METRIC_META[metric]

    ranked = sorted(
        (row for row in LEADERBOARD_ROWS if row.get(metric) is not None),
        key=lambda row: row[metric],
        reverse=(higher_is_better ^ ascending),
    )
    names = [row["model"] for row in ranked]
    scores = [row[metric] for row in ranked]
    labels = [f"{score:.3f}" if score is not None else "—" for score in scores]

    bars = go.Bar(
        x=scores,
        y=names,
        orientation="h",
        marker_color=_col_colors(scores, higher_is_better),
        marker_line_color="#999",
        marker_line_width=0.5,
        text=labels,
        textposition="outside",
        cliponaxis=False,
    )
    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(text=f"<b>Model Ranking by {metric_label}</b>", font=dict(size=14)),
        xaxis=dict(title=metric_label),
        # Reversed so the first entry of the sorted list is drawn at the top.
        yaxis=dict(autorange="reversed"),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        margin=dict(t=50, b=40, l=120, r=80),
        height=520,
        showlegend=False,
    )
    return fig
def make_prot_heatmap() -> go.Figure:
    """Build the models × conditions heatmap of SIM from ``PROT_ROWS``.

    Rows are ordered by clean SIM (highest first).  Missing values stay
    ``None`` in the data matrix and are shown as "—" in the cell text.  A
    dotted vertical line separates the "Clean" column from the protected
    conditions.
    """
    # (key in PROT_ROWS, label shown on the x axis)
    columns = [
        ("Clean", "Clean"),
        ("SafeSpeech", "SafeSpeech"),
        ("Enkidu", "Enkidu"),
        ("Spectral", "Spectral"),
        ("GRNoise", "GR-Noise"),
        ("AntiFake", "AntiFake"),
    ]
    keys = [key for key, _ in columns]
    labels = [label for _, label in columns]

    ordered = sorted(PROT_ROWS, key=lambda row: row["Clean"], reverse=True)
    model_names = [row["model"] for row in ordered]
    z = [[row.get(key) for key in keys] for row in ordered]
    cell_text = [
        [f"{value:.3f}" if value is not None else "—" for value in row_values]
        for row_values in z
    ]

    heatmap = go.Heatmap(
        z=z,
        x=labels,
        y=model_names,
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=[
            [0.0, "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5, "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0, "#1b5e20"],
        ],
        zmin=0.0,
        zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heatmap)

    # Separator line after the Clean column.
    fig.add_shape(
        type="line",
        x0=0.5,
        x1=0.5,
        y0=-0.5,
        y1=len(model_names) - 0.5,
        line=dict(color="#555", width=2, dash="dot"),
        xref="x",
        yref="y",
    )
    fig.update_layout(
        title=dict(
            text="<b>Protection Robustness — Speaker Similarity (SIM) on LibriTTS</b><br>"
                 "<sup>Green = high SIM (clone faithful). Red = low SIM (protection effective). "
                 "Drop from Clean → protected shows protection strength.</sup>",
            font=dict(size=13),
        ),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white",
        plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
def make_waveform_figure(
    original: np.ndarray, protected: np.ndarray, sr: int
) -> go.Figure:
    """Overlay the original and protected waveforms in one line plot.

    Only the common prefix of the two signals is drawn, capped at five
    seconds (``sr * 5`` samples).
    """
    n_samples = min(len(original), len(protected), sr * 5)  # cap at 5 s
    seconds = np.arange(n_samples) / sr

    fig = go.Figure()
    traces = (
        ("Original", original, "#1565c0"),
        ("Protected", protected, "#c62828"),
    )
    for trace_name, samples, colour in traces:
        fig.add_trace(go.Scatter(
            x=seconds,
            y=samples[:n_samples],
            name=trace_name,
            line=dict(color=colour, width=1),
            opacity=0.85,
        ))

    fig.update_layout(
        title=dict(text="<b>Waveform Comparison</b> (first 5 s)", font=dict(size=13)),
        xaxis=dict(title="Time (s)"),
        yaxis=dict(title="Amplitude", range=[-1.05, 1.05]),
        paper_bgcolor="white",
        plot_bgcolor="#f8f9fa",
        legend=dict(orientation="h", y=1.08, x=0.5, xanchor="center"),
        margin=dict(t=60, b=40, l=55, r=20),
        height=220,
    )
    return fig
+# ── gallery callback ──────────────────────────────────────────────────────────
def load_gallery(model_name: str):
    """Gallery callback: collect the outputs for one gallery model.

    Returns a 7-tuple: reference wav path, target wav path, clean-clone path,
    protected-reference path, protected-clone path, a Markdown note with the
    clean vs. SafeSpeech SIM and their difference, and the per-model SIM bar
    chart from ``make_sim_bar``.
    """
    entry = GALLERY_MODELS[model_name]
    sims = entry["sims"]
    clean_sim, prot_sim = sims["Clean"], sims["SafeSpeech"]
    drop = clean_sim - prot_sim

    note_md = (
        f"**Clean SIM:** {clean_sim:.3f} &nbsp;→&nbsp; "
        f"**Protected SIM (SafeSpeech):** {prot_sim:.3f} &nbsp;"
        f"*(drop: {drop:.3f})*"
    )

    clean_clone = os.path.join(SAMPLES, entry["clean"])
    protected_ref = os.path.join(SAMPLES, "protected_safespeech.wav")
    protected_clone = os.path.join(SAMPLES, entry["prot"])
    return (
        REF_WAV,
        TARGET_WAV,
        clean_clone,
        protected_ref,
        protected_clone,
        note_md,
        make_sim_bar(model_name),
    )
+# ── live protection callback ──────────────────────────────────────────────────
 def run_protection(audio_input, method: str, strength: float):
     if audio_input is None:
+        return None, None, "Upload an audio file first.", None
     sr_in, data = audio_input
     audio = data.astype(np.float32)
     if audio.max() > 1.0:
+        audio /= 32768.0
     if audio.ndim > 1:
         audio = audio.mean(axis=1)
     elapsed = time.time() - t0
     snr = _snr(audio, protected)
+    prot_int = (protected * 32767).astype(np.int16)
     metrics_md = (
         f"| Metric | Value |\n|--------|-------|\n"
         f"| SNR (dB) | {snr:.1f} |\n"
+        f"| Processing time | {elapsed * 1000:.0f} ms |\n"
         f"| Method | {method} |\n"
     )
+    waveform_fig = make_waveform_figure(audio, protected, sr_in)
+    return (sr_in, audio.copy()), (sr_in, prot_int), metrics_md, waveform_fig
def update_strength_label(method: str) -> dict:
    """Return the slider update that matches the selected protection method.

    ``"GR-Noise"`` reconfigures the slider as a target SNR in dB (10-40,
    value 25). Any other method reconfigures it as a spectral strength
    percentage (1-30, value 5).
    """
    if method != "GR-Noise":
        spectral_cfg = dict(
            label="Spectral Strength (%) — higher = stronger perturbation",
            info="5% is nearly inaudible. 20%+ may cause artifacts.",
            minimum=1, maximum=30, value=5, step=1,
        )
        return gr.update(**spectral_cfg)

    snr_cfg = dict(
        label="Target SNR (dB) — lower = stronger, more audible",
        info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
        minimum=10, maximum=40, value=25, step=1,
    )
    return gr.update(**snr_cfg)
+# ── results callbacks ─────────────────────────────────────────────────────────
def update_results_bar(metric: str) -> go.Figure:
    """Callback for the metric dropdown: rebuild the results bar chart.

    Delegates to ``make_results_bar`` with the selected ``metric`` and
    returns its figure unchanged.
    """
    figure = make_results_bar(metric)
    return figure
+# ── UI constants ──────────────────────────────────────────────────────────────
 CSS = """
 #title { text-align: center; }
 footer { display: none !important; }
+.note-box { font-size: 1.05em; background: #f0f4ff; border-radius: 8px; padding: 8px 12px; }
 """
 INTRO_MD = """
 <div id="title">
+# RVCBench — Voice Cloning & Protection Benchmark
 **Can audio protection prevent your voice from being cloned?**
 [![Paper](https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg)](https://arxiv.org/abs/2602.00443)
 [![Dataset](https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg)](https://huggingface.co/datasets/Nanboy/RVCBench)
 </div>
 """
+GALLERY_INTRO_MD = """
+A voice cloning model uses the **Reference Voice** to clone the **Target Speech**.
+When protection (SafeSpeech adversarial perturbation) is applied to the reference first,
+the clone degrades — lower speaker similarity means protection is working.
+The bar chart below shows the SIM drop under **all 5 protection methods** for the selected model.
+"""
+PROT_INTRO_MD = """
+Upload your own audio clip and apply a protection method. The protected audio sounds nearly
+identical to humans, but disrupts automatic voice cloning models.
+- **GR-Noise** — Gaussian random noise at a chosen SNR level. No surrogate model required.
+- **Spectral** — Structured perturbation in the STFT frequency domain.
 """
+RESULTS_INTRO_MD = """
+**Metric guide** — SIM: speaker cosine similarity ↑ &nbsp;·&nbsp;
+WER: word error rate ↓ &nbsp;·&nbsp; MOS: perceptual quality ↑ &nbsp;·&nbsp;
+MCD: mel cepstral distortion ↓ &nbsp;·&nbsp; RTF: real-time factor ↓ &nbsp;·&nbsp;
+SVA: speaker verification accuracy ↑ &nbsp;·&nbsp; Emo: emotion match rate ↑
+Select a metric to re-rank the 18 models. The heatmap below shows protection robustness
+(SIM under each of 5 protection methods).
 """
+# ── build demo ────────────────────────────────────────────────────────────────
 def build_demo():
     with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
         gr.Markdown(INTRO_MD)
         with gr.Tabs():
+            # ── Tab 1: Voice Cloning Gallery ──────────────────────────────────
             with gr.Tab("🎧  Voice Cloning Gallery"):
+                gr.Markdown(GALLERY_INTRO_MD)
+                model_dd = gr.Dropdown(
+                    choices=list(GALLERY_MODELS.keys()),
+                    value="ZipVoice",
+                    label="Voice Cloning Model",
+                )
+                load_btn = gr.Button("Load Example", variant="primary")
+                sim_note = gr.Markdown("", elem_classes="note-box")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("### 1 · Reference Voice")
                         gr.Markdown(f"*\"{REF_TEXT}\"*")
+                        ref_out = gr.Audio(label="Reference (original)", interactive=False)
                     with gr.Column():
                         gr.Markdown("### 2 · Target Speech")
                         gr.Markdown(f"*\"{TARGET_TEXT}\"*")
                         target_out = gr.Audio(label="Target utterance", interactive=False)
                 gr.Markdown("---")
+                gr.Markdown("### 3 · Cloning Results — Clean vs. SafeSpeech-Protected")
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("#### Without Protection")
+                        clean_out = gr.Audio(label="Clean clone", interactive=False)
                     with gr.Column():
+                        gr.Markdown("#### With SafeSpeech Protection")
                         prot_ref_out   = gr.Audio(label="Protected reference", interactive=False)
                         prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)
+                gr.Markdown("---")
+                gr.Markdown("### 4 · Protection Effectiveness Across All Methods")
+                sim_chart = gr.Plot(label="", show_label=False)
+                gallery_outputs = [ref_out, target_out, clean_out, prot_ref_out,
+                                   prot_clone_out, sim_note, sim_chart]
+                load_btn.click(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
+                demo.load(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
+                model_dd.change(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
+            # ── Tab 2: Protect Your Voice ─────────────────────────────────────
             with gr.Tab("🔒  Protect Your Voice"):
+                gr.Markdown(PROT_INTRO_MD)
                 with gr.Row():
                     audio_in = gr.Audio(
                         label="Upload your audio (wav / mp3, ≤ 30 s)",
+                        type="numpy", scale=3,
                     )
                     with gr.Column(scale=1):
                         method_dd = gr.Dropdown(
                         )
                         strength_sl = gr.Slider(
                             minimum=10, maximum=40, value=25, step=1,
+                            label="Target SNR (dB) — lower = stronger, more audible",
+                            info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
                         )
                         protect_btn = gr.Button("Apply Protection", variant="primary")
                 with gr.Row():
+                    orig_out  = gr.Audio(label="Original", interactive=False)
+                    prot_live = gr.Audio(label="Protected", interactive=False)
+                metrics_out   = gr.Markdown("")
+                waveform_plot = gr.Plot(label="Waveform Comparison", show_label=False)
+                method_dd.change(fn=update_strength_label, inputs=[method_dd],
+                                 outputs=[strength_sl])
                 protect_btn.click(
                     fn=run_protection,
                     inputs=[audio_in, method_dd, strength_sl],
+                    outputs=[orig_out, prot_live, metrics_out, waveform_plot],
                 )
                 gr.Markdown(
+                    "> **Note:** Full voice cloning inference (SafeSpeech, Enkidu, AntiFake) "
+                    "requires surrogate models and is not included in this Space due to compute "
+                    "constraints. See the "
+                    "[GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) for the full pipeline."
                 )
+            # ── Tab 3: Results Explorer ───────────────────────────────────────
+            with gr.Tab("📊  Results Explorer"):
+                gr.Markdown(RESULTS_INTRO_MD)
+                metric_dd = gr.Dropdown(
+                    choices=list(METRIC_META.keys()),
+                    value="SIM",
+                    label="Sort by metric",
                 )
+                bar_chart = gr.Plot(label="", show_label=False)
+                metric_dd.change(fn=update_results_bar, inputs=[metric_dd],
+                                 outputs=[bar_chart])
+                demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])
+                gr.Markdown("---")
+                gr.Markdown(
+                    "### Protection Robustness Heatmap\n"
+                    "SIM under each of 5 protection methods — drop from **Clean** indicates "
+                    "more effective protection."
                 )
+                prot_heatmap = gr.Plot(label="", show_label=False)
+                demo.load(fn=make_prot_heatmap, outputs=[prot_heatmap])
+            # ── Tab 4: About ──────────────────────────────────────────────────
             with gr.Tab("ℹ️  About"):
                 gr.Markdown("""
 ## About RVCBench
 against audio protection methods.
 ### What it measures
+- How well **18+ modern zero-shot TTS/VC models** can clone a speaker's voice
+- How effectively **5 audio protection methods** (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
+  prevent cloning across **10 datasets** and **7 evaluation metrics**
 ### Resources

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 gradio>=5.0,<6
 numpy>=1.24
 soundfile>=0.12

 gradio>=5.0,<6
 numpy>=1.24
 soundfile>=0.12
+plotly>=5.0