Spaces:

Nanboy
/

RVCBench

Running

File size: 39,916 Bytes

"""RVCBench — Interactive HuggingFace Space demo (v2).

Tabs
────
1. Voice Cloning Gallery   – hear pre-computed clean vs. protected clones
                             + protection-effectiveness bar chart for all 5 methods
2. Protect Your Voice      – upload audio, apply protection, see waveform comparison
3. Results Explorer        – interactive bar chart + protection robustness heatmap
4. About                   – paper, citation, resources
"""

from __future__ import annotations

import io
import os
import time

import gradio as gr
import numpy as np
import plotly.graph_objects as go
import soundfile as sf

# Best-effort monkey patch of two helpers in plotly's internal
# `_plotly_utils.basevalidators` module.  The replacements below only consult
# numpy (through plotly's own `get_module`) and `nw.Series`; they never look
# up pandas.
# NOTE(review): presumably this exists to keep plotly from touching pandas
# inside the Space — confirm against the plotly version that is deployed.
try:
    import _plotly_utils.basevalidators as _plotly_basevalidators

    def _plotly_to_scalar_or_list_without_pandas(value):
        """Reduce *value* to a scalar or a (nested) list.

        numpy scalars and 0-d arrays go through plotly's
        ``to_non_numpy_type``; lists, tuples and n-d arrays are converted
        element by element; other numpy-convertable objects are first turned
        into an array.  Anything else is returned unchanged.
        """
        # should_load=False: only use numpy if it is already available to plotly.
        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
        if np_mod and np_mod.isscalar(value) and hasattr(value, "item"):
            return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
        if isinstance(value, (list, tuple)):
            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
        if np_mod and isinstance(value, np_mod.ndarray):
            if value.ndim == 0:
                return _plotly_basevalidators.to_non_numpy_type(np_mod, value)
            return [_plotly_to_scalar_or_list_without_pandas(item) for item in value]
        if _plotly_basevalidators.is_numpy_convertable(value):
            # Here numpy is requested explicitly so the object can be converted.
            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
            if np_mod:
                return _plotly_to_scalar_or_list_without_pandas(np_mod.array(value))
        return value

    def _plotly_is_homogeneous_array_without_pandas(value):
        """Return True for numpy arrays, ``nw.Series`` and array-like objects.

        A numpy-convertable object counts only when converting it yields an
        array whose shape is not ``()``.  Everything else returns False.
        """
        np_mod = _plotly_basevalidators.get_module("numpy", should_load=False)
        if np_mod and isinstance(value, np_mod.ndarray):
            return True
        if isinstance(value, _plotly_basevalidators.nw.Series):
            return True
        if _plotly_basevalidators.is_numpy_convertable(value):
            np_mod = _plotly_basevalidators.get_module("numpy", should_load=True)
            if np_mod:
                return np_mod.array(value).shape != ()
        return False

    # Swap the module-level helpers for the versions defined above.
    _plotly_basevalidators.to_scalar_or_list = _plotly_to_scalar_or_list_without_pandas
    _plotly_basevalidators.is_homogeneous_array = _plotly_is_homogeneous_array_without_pandas
except Exception:
    # Deliberately silent: if the import or the patch fails, plotly's stock
    # helpers simply stay in place.
    pass

# ── paths ────────────────────────────────────────────────────────────────────

# Directory of bundled audio samples, resolved relative to this file.
# NOTE(review): "1089" looks like a speaker/utterance id — confirm.
SAMPLES     = os.path.join(os.path.dirname(__file__), "samples", "1089")
REF_WAV     = os.path.join(SAMPLES, "reference.wav")
TARGET_WAV  = os.path.join(SAMPLES, "target.wav")
# Transcripts shown (in quotes) next to the reference and target players.
REF_TEXT    = ("But her long fair hair was girlish: and girlish, and touched "
               "with the wonder of mortal beauty, her face.")
TARGET_TEXT = "A great fisher of souls!"

# ── gallery models (audio samples available for SafeSpeech protection) ────────

# Per-model gallery entry:
#   clean / prot – file names that load_gallery joins onto SAMPLES
#   sims         – label -> SIM value; the insertion order is the bar order in
#                  make_sim_bar, whose colour list is matched by position
# The sims values repeat the corresponding PROT_ROWS entries; keep both in sync.
GALLERY_MODELS = {
    "ZipVoice": dict(
        clean="zipvoice_clean.wav",
        prot="zipvoice_safespeech.wav",
        sims={"Clean": 0.579, "SafeSpeech": 0.287, "Enkidu": 0.435,
              "Spectral": 0.262, "GR-Noise": 0.258, "AntiFake": 0.543},
    ),
    "MOSS-TTSD": dict(
        clean="moss_ttsd_clean.wav",
        prot="moss_ttsd_safespeech.wav",
        sims={"Clean": 0.492, "SafeSpeech": 0.242, "Enkidu": 0.335,
              "Spectral": 0.216, "GR-Noise": 0.247, "AntiFake": 0.453},
    ),
    "MGM-Omni": dict(
        clean="mgm_omni_clean.wav",
        prot="mgm_omni_safespeech.wav",
        sims={"Clean": 0.539, "SafeSpeech": 0.184, "Enkidu": 0.316,
              "Spectral": 0.166, "GR-Noise": 0.229, "AntiFake": 0.491},
    ),
    "OZSpeech": dict(
        clean="ozspeech_clean.wav",
        prot="ozspeech_safespeech.wav",
        sims={"Clean": 0.388, "SafeSpeech": 0.156, "Enkidu": 0.187,
              "Spectral": 0.147, "GR-Noise": 0.148, "AntiFake": 0.337},
    ),
    "StyleTTS 2": dict(
        clean="styletts2_clean.wav",
        prot="styletts2_safespeech.wav",
        sims={"Clean": 0.228, "SafeSpeech": 0.089, "Enkidu": 0.125,
              "Spectral": 0.081, "GR-Noise": 0.030, "AntiFake": 0.207},
    ),
}

# ── benchmark data (LibriTTS, clean prompts) ─────────────────────────────────

# fmt: off
# One dict per cloning model.  Every key except `model` is a metric that also
# appears in METRIC_META.  `None` marks a missing value; make_results_bar
# leaves such a row out when ranking by that metric.
LEADERBOARD_ROWS = [
    dict(model="Qwen3-TTS",    SIM=0.614, WER=0.052, MOS=4.39, MCD=5.79, RTF=2.02,  SVA=0.974, Emo=0.731),
    dict(model="IndexTTS",     SIM=0.606, WER=0.052, MOS=4.06, MCD=6.61, RTF=2.23,  SVA=0.972, Emo=0.693),
    dict(model="CosyVoice 2",  SIM=0.602, WER=0.175, MOS=4.39, MCD=6.17, RTF=4.58,  SVA=0.974, Emo=0.729),
    dict(model="ZipVoice",     SIM=0.579, WER=0.053, MOS=4.13, MCD=7.09, RTF=1.46,  SVA=0.952, Emo=0.675),
    dict(model="MaskGCT",      SIM=0.570, WER=0.088, MOS=3.93, MCD=6.91, RTF=1.36,  SVA=0.939, Emo=0.682),
    dict(model="GLM-TTS",      SIM=0.570, WER=0.087, MOS=4.08, MCD=6.41, RTF=1.74,  SVA=0.951, Emo=0.678),
    dict(model="F5-TTS",       SIM=0.559, WER=0.116, MOS=3.99, MCD=6.96, RTF=0.61,  SVA=0.937, Emo=0.676),
    dict(model="Higgs Audio",  SIM=0.559, WER=0.250, MOS=4.30, MCD=6.06, RTF=1.42,  SVA=0.941, Emo=0.717),
    dict(model="MGM-Omni",     SIM=0.539, WER=0.095, MOS=4.28, MCD=5.82, RTF=0.84,  SVA=0.933, Emo=0.676),
    dict(model="PlayDiffusion",SIM=0.506, WER=0.055, MOS=4.15, MCD=8.06, RTF=0.73,  SVA=0.936, Emo=0.681),
    dict(model="MOSS-TTSD",    SIM=0.492, WER=0.383, MOS=4.10, MCD=7.09, RTF=None,  SVA=0.876, Emo=0.667),
    dict(model="VibeVoice",    SIM=0.480, WER=0.228, MOS=3.83, MCD=6.76, RTF=1.86,  SVA=0.852, Emo=0.624),
    dict(model="FishSpeech",   SIM=0.472, WER=0.166, MOS=4.37, MCD=6.47, RTF=3.61,  SVA=0.907, Emo=0.682),
    dict(model="XTTS-v2",      SIM=0.454, WER=0.073, MOS=3.81, MCD=8.62, RTF=0.62,  SVA=0.908, Emo=0.639),
    dict(model="SparkTTS",     SIM=0.408, WER=0.326, MOS=4.06, MCD=5.83, RTF=1.56,  SVA=0.764, Emo=0.672),
    dict(model="OZSpeech",     SIM=0.388, WER=0.060, MOS=3.21, MCD=6.87, RTF=8.75,  SVA=0.840, Emo=0.636),
    dict(model="OpenVoice V2", SIM=0.244, WER=0.075, MOS=4.30, MCD=7.06, RTF=0.08,  SVA=0.474, Emo=0.601),
    dict(model="StyleTTS 2",   SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11,  SVA=0.388, Emo=0.589),
]

# Cross-dataset generalisation — SIM on clean prompts across all 10 datasets
# One dict per model; every key except `model` is a dataset key listed in
# CROSS_DATASET_COLS.  `None` cells are drawn as "—" by
# make_cross_dataset_heatmap, whose caption labels them "not evaluated".
CROSS_DATASET_ROWS = [
    dict(model="Qwen3-TTS",    LibriTTS=0.614, VCTK=0.618, MultiSpk=0.495, Long=0.561,  AISHELL=0.721, French=0.536, Bilingual=0.673, BGclean=0.689, BGnoise=0.572, Hallucin=0.515),
    dict(model="IndexTTS",     LibriTTS=0.606, VCTK=0.567, MultiSpk=0.473, Long=0.775,  AISHELL=0.721, French=0.397, Bilingual=0.673, BGclean=0.589, BGnoise=0.528, Hallucin=0.529),
    dict(model="CosyVoice 2",  LibriTTS=0.602, VCTK=0.582, MultiSpk=0.448, Long=0.530,  AISHELL=0.717, French=0.378, Bilingual=0.653, BGclean=0.626, BGnoise=0.515, Hallucin=0.518),
    dict(model="ZipVoice",     LibriTTS=0.579, VCTK=0.554, MultiSpk=0.531, Long=0.729,  AISHELL=0.712, French=0.363, Bilingual=0.322, BGclean=0.625, BGnoise=0.462, Hallucin=0.509),
    dict(model="MaskGCT",      LibriTTS=0.570, VCTK=0.555, MultiSpk=0.431, Long=0.194,  AISHELL=0.674, French=0.494, Bilingual=None,  BGclean=0.610, BGnoise=0.487, Hallucin=0.499),
    dict(model="GLM-TTS",      LibriTTS=0.570, VCTK=0.573, MultiSpk=0.445, Long=0.757,  AISHELL=0.690, French=0.398, Bilingual=0.657, BGclean=0.622, BGnoise=0.528, Hallucin=0.533),
    dict(model="F5-TTS",       LibriTTS=0.559, VCTK=0.537, MultiSpk=0.507, Long=0.607,  AISHELL=0.696, French=0.304, Bilingual=0.653, BGclean=0.582, BGnoise=0.414, Hallucin=0.455),
    dict(model="Higgs Audio",  LibriTTS=0.559, VCTK=0.516, MultiSpk=0.418, Long=0.520,  AISHELL=0.581, French=0.349, Bilingual=0.543, BGclean=0.592, BGnoise=0.421, Hallucin=0.425),
    dict(model="MGM-Omni",     LibriTTS=0.539, VCTK=0.447, MultiSpk=0.370, Long=0.442,  AISHELL=0.713, French=0.227, Bilingual=0.630, BGclean=0.523, BGnoise=0.332, Hallucin=0.396),
    dict(model="PlayDiffusion",LibriTTS=0.506, VCTK=0.426, MultiSpk=0.360, Long=0.637,  AISHELL=0.441, French=0.283, Bilingual=0.465, BGclean=0.433, BGnoise=0.305, Hallucin=0.408),
    dict(model="MOSS-TTSD",    LibriTTS=0.492, VCTK=0.440, MultiSpk=0.379, Long=0.644,  AISHELL=0.437, French=0.327, Bilingual=0.471, BGclean=0.494, BGnoise=0.488, Hallucin=0.416),
    dict(model="VibeVoice",    LibriTTS=0.480, VCTK=0.436, MultiSpk=0.348, Long=0.625,  AISHELL=0.564, French=0.343, Bilingual=0.531, BGclean=0.513, BGnoise=0.364, Hallucin=0.408),
    dict(model="FishSpeech",   LibriTTS=0.472, VCTK=0.430, MultiSpk=0.383, Long=0.572,  AISHELL=0.611, French=0.374, Bilingual=0.566, BGclean=0.495, BGnoise=0.387, Hallucin=0.351),
    dict(model="XTTS-v2",      LibriTTS=0.454, VCTK=0.454, MultiSpk=0.328, Long=0.613,  AISHELL=0.569, French=0.445, Bilingual=0.506, BGclean=0.546, BGnoise=0.394, Hallucin=0.488),
    dict(model="SparkTTS",     LibriTTS=0.408, VCTK=0.532, MultiSpk=0.228, Long=0.345,  AISHELL=0.569, French=0.164, Bilingual=0.480, BGclean=0.588, BGnoise=0.332, Hallucin=0.336),
    dict(model="OZSpeech",     LibriTTS=0.388, VCTK=0.253, MultiSpk=0.271, Long=None,   AISHELL=None,  French=0.109, Bilingual=None,  BGclean=0.272, BGnoise=0.164, Hallucin=0.281),
    dict(model="OpenVoice V2", LibriTTS=0.244, VCTK=0.392, MultiSpk=0.192, Long=0.278,  AISHELL=0.431, French=0.271, Bilingual=0.298, BGclean=0.484, BGnoise=0.358, Hallucin=0.365),
    dict(model="StyleTTS 2",   LibriTTS=0.228, VCTK=0.236, MultiSpk=0.162, Long=None,   AISHELL=None,  French=None,  Bilingual=0.213, BGclean=0.196, BGnoise=0.166, Hallucin=0.184),
]

# (row key, axis label) pairs.  The first item indexes CROSS_DATASET_ROWS, the
# second is the column label on the heatmap; list order is the column order.
CROSS_DATASET_COLS = [
    ("LibriTTS",  "LibriTTS"),
    ("VCTK",      "VCTK"),
    ("MultiSpk",  "Multi-spk"),
    ("Long",      "Long"),
    ("AISHELL",   "AISHELL"),
    ("French",    "French"),
    ("Bilingual", "Bilingual"),
    ("BGclean",   "BG-clean"),
    ("BGnoise",   "BG-noise"),
    ("Hallucin",  "Hallucin."),
]

# Protection robustness — SIM under each method (LibriTTS, all 18 models)
# One dict per model: SIM for the clean prompt and under each protection
# method.  make_prot_heatmap reads these keys (showing `GRNoise` as
# "GR-Noise") and draws `None` cells as "—".
PROT_ROWS = [
    dict(model="Qwen3-TTS",    Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
    dict(model="IndexTTS",     Clean=0.606, SafeSpeech=0.346, Enkidu=0.475, Spectral=0.318, GRNoise=0.392, AntiFake=0.572),
    dict(model="CosyVoice 2",  Clean=0.602, SafeSpeech=0.321, Enkidu=0.447, Spectral=0.301, GRNoise=0.384, AntiFake=0.549),
    dict(model="ZipVoice",     Clean=0.579, SafeSpeech=0.287, Enkidu=0.435, Spectral=0.262, GRNoise=0.258, AntiFake=0.543),
    dict(model="MaskGCT",      Clean=0.570, SafeSpeech=0.303, Enkidu=0.407, Spectral=0.281, GRNoise=0.312, AntiFake=0.530),
    dict(model="GLM-TTS",      Clean=0.570, SafeSpeech=0.330, Enkidu=0.445, Spectral=0.311, GRNoise=0.388, AntiFake=0.532),
    dict(model="F5-TTS",       Clean=0.559, SafeSpeech=0.207, Enkidu=0.431, Spectral=0.176, GRNoise=0.137, AntiFake=0.520),
    dict(model="Higgs Audio",  Clean=0.559, SafeSpeech=0.264, Enkidu=0.435, Spectral=0.236, GRNoise=0.272, AntiFake=0.521),
    dict(model="MGM-Omni",     Clean=0.539, SafeSpeech=0.184, Enkidu=0.316, Spectral=0.166, GRNoise=0.229, AntiFake=0.491),
    dict(model="PlayDiffusion",Clean=0.506, SafeSpeech=0.173, Enkidu=None,  Spectral=0.149, GRNoise=0.162, AntiFake=0.466),
    dict(model="MOSS-TTSD",    Clean=0.492, SafeSpeech=0.242, Enkidu=0.335, Spectral=0.216, GRNoise=0.247, AntiFake=0.453),
    dict(model="VibeVoice",    Clean=0.480, SafeSpeech=0.272, Enkidu=0.367, Spectral=0.253, GRNoise=0.280, AntiFake=0.442),
    dict(model="FishSpeech",   Clean=0.472, SafeSpeech=0.238, Enkidu=0.334, Spectral=0.212, GRNoise=0.235, AntiFake=0.439),
    dict(model="XTTS-v2",      Clean=0.454, SafeSpeech=0.260, Enkidu=0.308, Spectral=0.241, GRNoise=0.237, AntiFake=0.414),
    dict(model="SparkTTS",     Clean=0.408, SafeSpeech=0.129, Enkidu=0.137, Spectral=0.108, GRNoise=0.062, AntiFake=0.359),
    dict(model="OZSpeech",     Clean=0.388, SafeSpeech=0.156, Enkidu=0.187, Spectral=0.147, GRNoise=0.148, AntiFake=0.337),
    dict(model="OpenVoice V2", Clean=0.244, SafeSpeech=0.185, Enkidu=0.188, Spectral=0.180, GRNoise=0.175, AntiFake=0.236),
    dict(model="StyleTTS 2",   Clean=0.228, SafeSpeech=0.089, Enkidu=0.125, Spectral=0.081, GRNoise=0.030, AntiFake=0.207),
]
# fmt: on

# metric key -> (axis/title label, higher_is_better).
# make_results_bar uses the flag for both the sort direction and the colours.
METRIC_META = {
    "SIM": ("Speaker Similarity ↑", True),
    "WER": ("Word Error Rate ↓",    False),
    "MOS": ("MOS Score ↑",          True),
    "MCD": ("Mel Cepstral Dist. ↓", False),
    "RTF": ("Real-Time Factor ↓",   False),
    "SVA": ("Speaker Verif. Acc. ↑",True),
    "Emo": ("Emotion Match Rate ↑", True),
}

# ── colour helpers ────────────────────────────────────────────────────────────

_GOOD  = (200, 230, 201)   # #c8e6c9 light green
_MID   = (255, 249, 196)   # #fff9c4 light yellow
_BAD   = (255, 205, 210)   # #ffcdd2 light red


def _interp_color(t: float) -> str:
    """t=0 → bad (red), t=1 → good (green), t=0.5 → yellow."""
    if t <= 0.5:
        s = t / 0.5
        r = int(_BAD[0] + s * (_MID[0] - _BAD[0]))
        g = int(_BAD[1] + s * (_MID[1] - _BAD[1]))
        b = int(_BAD[2] + s * (_MID[2] - _BAD[2]))
    else:
        s = (t - 0.5) / 0.5
        r = int(_MID[0] + s * (_GOOD[0] - _MID[0]))
        g = int(_MID[1] + s * (_GOOD[1] - _MID[1]))
        b = int(_MID[2] + s * (_GOOD[2] - _MID[2]))
    return f"rgb({r},{g},{b})"


def _col_colors(values: list, higher_is_better: bool) -> list[str]:
    valid = [v for v in values if v is not None]
    if not valid or max(valid) == min(valid):
        return ["rgb(245,245,245)"] * len(values)
    vmin, vmax = min(valid), max(valid)
    colors = []
    for v in values:
        if v is None:
            colors.append("rgb(245,245,245)")
        else:
            t = (v - vmin) / (vmax - vmin)
            if not higher_is_better:
                t = 1 - t
            colors.append(_interp_color(t))
    return colors


# ── audio helpers ─────────────────────────────────────────────────────────────

def _load(path: str) -> tuple[np.ndarray, int]:
    """Read the audio file at *path* as float32 and return ``(samples, rate)``.

    Data with more than one dimension is averaged over axis 1, so the
    returned samples are one-dimensional.
    """
    samples, rate = sf.read(path, dtype="float32")
    mono = samples if samples.ndim <= 1 else samples.mean(axis=1)
    return mono, rate


def _snr(original: np.ndarray, protected: np.ndarray) -> float:
    noise = protected - original
    sp = np.mean(original ** 2)
    np_ = np.mean(noise ** 2)
    return float("inf") if np_ < 1e-12 else float(10 * np.log10(sp / np_))


# ── protection functions ──────────────────────────────────────────────────────

def apply_grnoise(audio: np.ndarray, sr: int, snr_db: float = 25.0) -> np.ndarray:
    """Add white Gaussian noise to *audio* at the requested SNR.

    The noise power is the mean signal power divided by ``10 ** (snr_db / 10)``.
    Noise is drawn from numpy's global random state, and the sum is clipped to
    ``[-1, 1]``.  *sr* is accepted for signature parity with the other
    protection functions and is not used.
    """
    signal_power = np.mean(audio ** 2)
    noise_power = signal_power / (10 ** (snr_db / 10))
    unit_noise = np.random.randn(*audio.shape).astype(np.float32)
    noisy = audio + unit_noise * np.sqrt(noise_power)
    return np.clip(noisy, -1.0, 1.0)


def apply_spectral(audio: np.ndarray, sr: int, strength: float = 0.05) -> np.ndarray:
    """Perturb *audio* with random noise shaped by its short-time spectrum.

    The signal is cut into Hann-windowed frames (1024 samples, hop 256).  In
    each frame every FFT bin receives a random complex offset whose size is
    ``strength`` times that bin's magnitude, and the frames are overlap-added
    back together.

    Args:
        audio: One-dimensional sample array (float data is expected).
        sr: Sample rate; accepted for signature parity with the other
            protection functions and not used.
        strength: Perturbation size relative to each bin's magnitude.
            ``0`` returns the input unchanged (up to FFT round-off).

    Returns:
        An array with the same length as *audio*, clipped to ``[-1, 1]``.
    """
    from numpy.fft import rfft, irfft

    n_fft, hop = 1024, 256
    n_samples = len(audio)
    window = np.hanning(n_fft).astype(np.float32)  # loop-invariant

    # Zero-pad one full frame on each side so that every input sample,
    # including the first and last ones and clips shorter than a frame, is
    # covered by a complete set of overlapping frames.  `tail` rounds the
    # padded length up so the frames tile it exactly.
    pad = n_fft
    tail = (-(n_samples + 2 * pad - n_fft)) % hop
    padded = np.pad(audio, (pad, pad + tail))

    out = np.zeros_like(padded)
    weight = np.zeros(len(padded), dtype=np.float32)
    for start in range(0, len(padded) - n_fft + 1, hop):
        stop = start + n_fft
        spec = rfft(padded[start:stop] * window)
        mag = np.abs(spec)
        perturb = np.random.randn(*mag.shape).astype(np.float32) * strength * mag
        phase = np.exp(1j * np.random.uniform(0, 2 * np.pi, mag.shape))
        frame = irfft(spec + perturb * phase, n=n_fft).astype(np.float32)
        out[start:stop] += frame
        # Accumulate the window itself: dividing by the summed window (not by
        # the number of frames) is what undoes the analysis windowing and
        # restores the original amplitude.
        weight[start:stop] += window

    body = slice(pad, pad + n_samples)
    restored = out[body] / np.maximum(weight[body], 1e-8)
    return np.clip(restored, -1.0, 1.0)


# Protection-method name -> implementation.  run_protection indexes this dict
# with its `method` argument.
PROTECT_FN = {"GR-Noise": apply_grnoise, "Spectral": apply_spectral}


# ── plotly figures ────────────────────────────────────────────────────────────

def make_sim_bar(model_name: str) -> go.Figure:
    """Build the per-model bar chart of SIM under each protection method.

    Bars follow the order of the model's ``sims`` mapping in GALLERY_MODELS.
    Every bar except "Clean" is annotated with its drop relative to the clean
    value, and a dotted horizontal line marks the clean baseline.
    """
    sims = GALLERY_MODELS[model_name]["sims"]
    labels = list(sims)
    values = list(sims.values())
    baseline = sims["Clean"]

    # One colour per bar, matched by position to the order of `sims`.
    palette = [
        "#2563eb",  # Clean
        "#7c3aed",  # SafeSpeech
        "#059669",  # Enkidu
        "#ea580c",  # Spectral
        "#475569",  # GR-Noise
        "#be123c",  # AntiFake
    ]

    bar_text = []
    hover_text = []
    for label, value in zip(labels, values):
        drop = baseline - value
        if label == "Clean":
            bar_text.append(f"{value:.3f}")
        else:
            bar_text.append(f"{value:.3f}<br>↓{drop:.3f}")
        hover_text.append(
            f"{label}<br>SIM: {value:.3f}<br>Drop from clean: {drop:.3f}"
        )

    bars = go.Bar(
        x=labels,
        y=values,
        marker_color=palette,
        marker_line_color="rgba(15, 23, 42, 0.25)",
        marker_line_width=1,
        text=bar_text,
        textposition="outside",
        hovertext=hover_text,
        hoverinfo="text",
        cliponaxis=False,
    )
    # Dotted reference line at the clean SIM; only its last point is labelled.
    baseline_line = go.Scatter(
        x=labels,
        y=[baseline] * len(labels),
        mode="lines+text",
        line=dict(color="#2563eb", dash="dot", width=1.5),
        text=[""] * (len(labels) - 1) + ["Clean baseline"],
        textposition="top right",
        textfont=dict(size=10, color="#2563eb"),
        hoverinfo="skip",
        showlegend=False,
    )

    # Leave headroom above the tallest bar for its label, capped at 0.75.
    y_top = min(0.75, max(values) * 1.28)

    fig = go.Figure(bars)
    fig.update_layout(
        title=dict(
            text=f"<b>{model_name}</b> speaker similarity after protection",
            font=dict(size=16, color="#0f172a"),
            x=0.02,
        ),
        yaxis=dict(
            title="SIM",
            range=[0, y_top],
            gridcolor="#e2e8f0",
            zeroline=False,
        ),
        xaxis=dict(title="", tickfont=dict(size=12)),
        paper_bgcolor="white",
        plot_bgcolor="#f8fafc",
        margin=dict(t=62, b=42, l=48, r=24),
        height=350,
        showlegend=False,
        bargap=0.28,
        font=dict(color="#334155"),
    )
    fig.add_trace(baseline_line)
    return fig


def make_results_bar(metric: str = "SIM", ascending: bool = False) -> go.Figure:
    """Horizontal bar chart ranking the leaderboard models by one metric.

    Rows whose value for *metric* is ``None`` are left out, so the chart can
    contain fewer bars than LEADERBOARD_ROWS has entries.

    Args:
        metric: A key of METRIC_META (a ``KeyError`` is raised otherwise).
        ascending: By default the best model for the metric is listed first
            (highest value for "higher is better" metrics, lowest otherwise);
            passing ``True`` inverts that order.

    Returns:
        The figure, with the first-ranked model drawn at the top.
    """
    metric_label, higher_is_better = METRIC_META[metric]

    ranked = sorted(
        (row for row in LEADERBOARD_ROWS if row.get(metric) is not None),
        key=lambda row: row[metric],
        reverse=(higher_is_better ^ ascending),
    )
    models = [row["model"] for row in ranked]
    values = [row[metric] for row in ranked]

    colors = _col_colors(values, higher_is_better)
    # Missing values were filtered out above, so every entry can be formatted.
    text = [f"{v:.3f}" for v in values]

    fig = go.Figure(go.Bar(
        x=values, y=models,
        orientation="h",
        marker_color=colors,
        marker_line_color="#999", marker_line_width=0.5,
        text=text, textposition="outside",
        cliponaxis=False,
    ))
    fig.update_layout(
        title=dict(text=f"<b>Model Ranking by {metric_label}</b>",
                   font=dict(size=14)),
        xaxis=dict(title=metric_label),
        # Reversed axis puts the first (top-ranked) row at the top.
        yaxis=dict(autorange="reversed"),
        paper_bgcolor="white", plot_bgcolor="#f8f9fa",
        margin=dict(t=50, b=40, l=120, r=80),
        height=520,
        showlegend=False,
    )
    return fig


def make_prot_heatmap() -> go.Figure:
    """Heatmap of SIM per model (rows) and protection method (columns).

    Models come from PROT_ROWS, ordered by their clean SIM from highest to
    lowest.  Missing values are shown as "—", and a dotted vertical line
    separates the "Clean" column from the protected ones.
    """
    # (row key, axis label); only GR-Noise is spelled differently on the axis.
    columns = [
        ("Clean", "Clean"),
        ("SafeSpeech", "SafeSpeech"),
        ("Enkidu", "Enkidu"),
        ("Spectral", "Spectral"),
        ("GRNoise", "GR-Noise"),
        ("AntiFake", "AntiFake"),
    ]

    ranked = sorted(PROT_ROWS, key=lambda row: row["Clean"], reverse=True)
    model_names = [row["model"] for row in ranked]

    grid = [[row.get(key) for key, _ in columns] for row in ranked]
    cell_text = [
        ["—" if value is None else f"{value:.3f}" for value in line]
        for line in grid
    ]

    heatmap = go.Heatmap(
        z=grid,
        x=[label for _, label in columns],
        y=model_names,
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=[
            [0.0,  "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5,  "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0,  "#1b5e20"],
        ],
        zmin=0.0, zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heatmap)

    # Dotted separator between the first (Clean) column and the rest.
    fig.add_shape(
        type="line",
        x0=0.5, x1=0.5, y0=-0.5, y1=len(model_names) - 0.5,
        line=dict(color="#555", width=2, dash="dot"),
        xref="x", yref="y",
    )

    caption = (
        "<b>Protection Robustness — Speaker Similarity (SIM) on LibriTTS</b><br>"
        "<sup>Green = high SIM (clone faithful). Red = low SIM (protection effective). "
        "Drop from Clean → protected shows protection strength.</sup>"
    )
    fig.update_layout(
        title=dict(text=caption, font=dict(size=13)),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white", plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig


def make_cross_dataset_heatmap() -> go.Figure:
    """Heatmap of clean-prompt SIM per model (rows) and dataset (columns).

    Columns follow CROSS_DATASET_COLS; models come from CROSS_DATASET_ROWS,
    ordered by their LibriTTS SIM from highest to lowest.  Missing values are
    shown as "—".
    """
    ranked = sorted(
        CROSS_DATASET_ROWS, key=lambda row: row["LibriTTS"], reverse=True
    )
    model_names = [row["model"] for row in ranked]

    grid = [[row.get(key) for key, _ in CROSS_DATASET_COLS] for row in ranked]
    cell_text = [
        ["—" if value is None else f"{value:.3f}" for value in line]
        for line in grid
    ]

    heatmap = go.Heatmap(
        z=grid,
        x=[label for _, label in CROSS_DATASET_COLS],
        y=model_names,
        text=cell_text,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=[
            [0.0,  "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5,  "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0,  "#1b5e20"],
        ],
        zmin=0.0, zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    )
    fig = go.Figure(heatmap)

    caption = (
        "<b>Cross-Dataset Generalisation — Speaker Similarity (SIM) on Clean Prompts</b><br>"
        "<sup>Models sorted by LibriTTS SIM. — = not evaluated. "
        "Green = high SIM (faithful clone), red = low SIM.</sup>"
    )
    fig.update_layout(
        title=dict(text=caption, font=dict(size=13)),
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white", plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig


def make_waveform_figure(
    original: np.ndarray, protected: np.ndarray, sr: int
) -> go.Figure:
    """Plot the original and protected waveforms on one time axis.

    Both signals are cut to the shorter of the two, and to at most five
    seconds of samples at rate *sr*.
    """
    limit = min(len(original), len(protected), sr * 5)  # cap at 5 s
    seconds = (np.arange(limit) / sr).tolist()

    # (legend name, line colour, samples), drawn in this order.
    series = (
        ("Original", "#1565c0", original),
        ("Protected", "#c62828", protected),
    )

    fig = go.Figure()
    for label, colour, samples in series:
        fig.add_trace(go.Scatter(
            x=seconds,
            y=samples[:limit].tolist(),
            name=label,
            line=dict(color=colour, width=1),
            opacity=0.85,
        ))

    fig.update_layout(
        title=dict(text="<b>Waveform Comparison</b> (first 5 s)",
                   font=dict(size=13)),
        xaxis=dict(title="Time (s)"),
        yaxis=dict(title="Amplitude", range=[-1.05, 1.05]),
        paper_bgcolor="white", plot_bgcolor="#f8f9fa",
        legend=dict(orientation="h", y=1.08, x=0.5, xanchor="center"),
        margin=dict(t=60, b=40, l=55, r=20),
        height=220,
    )
    return fig


# ── gallery callback ──────────────────────────────────────────────────────────

def load_gallery(model_name: str):
    """Collect everything the gallery shows for one cloning model.

    Returns a 7-tuple: the reference and target wav paths, the clean clone
    path, the SafeSpeech-protected reference path, the path of the clone made
    from that protected reference, a Markdown line summarising the SIM drop,
    and the per-method SIM bar chart.
    """
    entry = GALLERY_MODELS[model_name]
    sims = entry["sims"]
    clean_sim, prot_sim = sims["Clean"], sims["SafeSpeech"]

    summary = (
        f"**Clean SIM:** {clean_sim:.3f} &nbsp;→&nbsp; "
        f"**Protected SIM (SafeSpeech):** {prot_sim:.3f} &nbsp;"
        f"*(drop: {clean_sim - prot_sim:.3f})*"
    )

    def sample(filename: str) -> str:
        return os.path.join(SAMPLES, filename)

    return (
        REF_WAV,
        TARGET_WAV,
        sample(entry["clean"]),
        sample("protected_safespeech.wav"),
        sample(entry["prot"]),
        summary,
        make_sim_bar(model_name),
    )


# ── live protection callback ──────────────────────────────────────────────────

def _to_mono_float32(data: np.ndarray) -> np.ndarray:
    """Return *data* as one-dimensional float32 samples scaled to about [-1, 1]."""
    samples = np.asarray(data)
    audio = samples.astype(np.float32)  # astype copies, so the input is untouched
    if np.issubdtype(samples.dtype, np.signedinteger):
        # Integer PCM: scale by the full range of the actual sample width.
        # Deciding from the dtype (instead of "is the peak above 1?") also
        # handles quiet or negative-peaked clips and non-16-bit input.
        audio /= float(np.iinfo(samples.dtype).max) + 1.0
    elif audio.size and np.abs(audio).max() > 1.0:
        # Non-integer data outside [-1, 1] is assumed to carry int16-scaled
        # values, as the previous heuristic did.
        audio /= 32768.0
    if audio.ndim > 1:
        audio = audio.mean(axis=1)  # average the channels
    return audio


def run_protection(audio_input, method: str, strength: float):
    """Apply the selected protection to an uploaded clip.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple, or ``None`` when
            nothing has been uploaded.
        method: Key of PROTECT_FN ("GR-Noise" or "Spectral").
        strength: For "GR-Noise" the target SNR in dB; for any other method a
            percentage that is divided by 100 before being passed on.

    Returns:
        ``(original, protected, metrics_markdown, waveform_figure)``, where
        ``original`` is ``(sample_rate, float32 samples)`` and ``protected``
        is ``(sample_rate, int16 samples)``.  When there is no usable audio,
        the audio and figure slots are ``None`` and the Markdown slot holds
        a short message.
    """
    if audio_input is None:
        return None, None, "Upload an audio file first.", None

    sr_in, data = audio_input
    audio = _to_mono_float32(data)
    if audio.size == 0:
        # Nothing to process; the code below would fail on an empty array.
        return None, None, "The uploaded audio clip is empty.", None

    t0 = time.time()
    fn = PROTECT_FN[method]
    if method == "GR-Noise":
        protected = fn(audio, sr_in, snr_db=strength)
    else:
        protected = fn(audio, sr_in, strength=strength / 100.0)
    elapsed = time.time() - t0

    snr = _snr(audio, protected)
    # Scale the protected samples to int16 for playback/download.
    prot_int = (protected * 32767).astype(np.int16)

    metrics_md = (
        f"| Metric | Value |\n|--------|-------|\n"
        f"| SNR (dB) | {snr:.1f} |\n"
        f"| Processing time | {elapsed * 1000:.0f} ms |\n"
        f"| Method | {method} |\n"
    )

    waveform_fig = make_waveform_figure(audio, protected, sr_in)
    return (sr_in, audio.copy()), (sr_in, prot_int), metrics_md, waveform_fig


def update_strength_label(method: str) -> dict:
    """Return the ``gr.update`` that reconfigures the strength control.

    "GR-Noise" gets a target-SNR range in dB (10–40, default 25); any other
    method gets a spectral strength percentage (1–30, default 5).
    """
    if method == "GR-Noise":
        settings = dict(
            label="Target SNR (dB) — lower = stronger, more audible",
            info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
            minimum=10, maximum=40, value=25, step=1,
        )
    else:
        settings = dict(
            label="Spectral Strength (%) — higher = stronger perturbation",
            info="5% is nearly inaudible. 20%+ may cause artifacts.",
            minimum=1, maximum=30, value=5, step=1,
        )
    return gr.update(**settings)


# ── results callbacks ─────────────────────────────────────────────────────────

def update_results_bar(metric: str) -> go.Figure:
    """Return the leaderboard bar chart for *metric* in the default order.

    Thin single-argument wrapper around ``make_results_bar``.
    """
    return make_results_bar(metric)


# ── UI constants ──────────────────────────────────────────────────────────────

# Custom stylesheet handed to gr.Blocks(css=...) in build_demo.  The class
# names here (.hero, .stat-strip, .note-box, .audio-panel, .section-head,
# .workflow-copy, ...) are referenced by the HTML/Markdown snippets and
# elem_classes in this file.
CSS = """
footer { display: none !important; }
.gradio-container {
    max-width: 1180px !important;
    margin: 0 auto !important;
}
.hero {
    padding: 28px 28px 22px;
    border-radius: 12px;
    background: linear-gradient(135deg, #0f172a 0%, #164e63 54%, #065f46 100%);
    color: white;
    margin-bottom: 18px;
}
.hero h1 {
    margin: 0 0 8px;
    font-size: 2.35rem;
    line-height: 1.08;
    letter-spacing: 0;
    color: white !important;
}
.hero p {
    max-width: 760px;
    margin: 0;
    color: #dbeafe;
    font-size: 1.05rem;
}
.hero a {
    color: white !important;
}
.hero-links {
    display: flex;
    flex-wrap: wrap;
    gap: 8px;
    margin-top: 16px;
}
.hero-links a {
    text-decoration: none;
}
.stat-strip {
    display: grid;
    grid-template-columns: repeat(4, minmax(0, 1fr));
    gap: 10px;
    margin: 14px 0 18px;
}
.stat-card {
    border: 1px solid #d8dee9;
    border-radius: 8px;
    padding: 12px 14px;
    background: #ffffff;
}
.stat-card b {
    display: block;
    font-size: 1.35rem;
    color: #0f172a;
    line-height: 1.1;
}
.stat-card span {
    color: #475569;
    font-size: 0.9rem;
}
.section-head {
    margin: 18px 0 8px;
    color: #0f172a;
}
.note-box {
    font-size: 1.02em;
    background: #eef6ff;
    border: 1px solid #bfdbfe;
    border-left: 4px solid #2563eb;
    border-radius: 8px;
    padding: 10px 12px;
}
.audio-panel {
    border: 1px solid #e2e8f0;
    border-radius: 8px;
    padding: 12px;
    background: #ffffff;
}
.audio-panel h3,
.audio-panel h4 {
    margin-top: 0;
}
.workflow-copy {
    color: #475569;
    margin-bottom: 12px;
}
@media (max-width: 760px) {
    .hero {
        padding: 22px 18px 18px;
    }
    .hero h1 {
        font-size: 1.75rem;
    }
    .stat-strip {
        grid-template-columns: repeat(2, minmax(0, 1fr));
    }
}
"""

# Hero banner and headline statistics rendered at the very top of the page
# (first gr.Markdown in build_demo).
# NOTE(review): the stat strip advertises 26 voice cloning models, while the
# result tables in this file list 18 — confirm which figure is intended here.
INTRO_MD = """
<div class="hero">
  <h1>RVCBench</h1>
  <p>Voice cloning attacks and audio protection methods, compared through paired listening examples and speaker-similarity results.</p>
  <div class="hero-links">
    <a href="https://arxiv.org/abs/2602.00443"><img alt="Paper" src="https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg"></a>
    <a href="https://huggingface.co/datasets/Nanboy/RVCBench"><img alt="Dataset" src="https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg"></a>
    <a href="https://github.com/Nanboy-Ronan/RVCBench"><img alt="GitHub" src="https://img.shields.io/badge/GitHub-RVCBench-181717.svg"></a>
  </div>
</div>

<div class="stat-strip">
  <div class="stat-card"><b>26</b><span>voice cloning models</span></div>
  <div class="stat-card"><b>5</b><span>protection methods</span></div>
  <div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
  <div class="stat-card"><b>10</b><span>speech datasets</span></div>
</div>
"""

# Intro copy shown at the top of the "Voice Cloning Gallery" tab.
GALLERY_INTRO_MD = """
<div class="workflow-copy">
Select a cloning model, compare clean and protected audio, then inspect how much each protection method lowers speaker similarity.
</div>
"""

# Intro copy describing the two live protection methods (the keys of
# PROTECT_FN).  Presumably shown on the "Protect Your Voice" tab — its use is
# outside this part of the file.
PROT_INTRO_MD = """
Upload your own audio clip and apply a protection method. The protected audio sounds nearly
identical to humans, but disrupts automatic voice cloning models.

- **GR-Noise** — Gaussian random noise at a chosen SNR level. No surrogate model required.
- **Spectral** — Structured perturbation in the STFT frequency domain.
"""

# Metric legend; the abbreviations match the keys of METRIC_META.  Presumably
# shown on the "Results Explorer" tab — its use is outside this part of the file.
RESULTS_INTRO_MD = """
**Metric guide** — SIM: speaker cosine similarity ↑ &nbsp;·&nbsp;
WER: word error rate ↓ &nbsp;·&nbsp; MOS: perceptual quality ↑ &nbsp;·&nbsp;
MCD: mel cepstral distortion ↓ &nbsp;·&nbsp; RTF: real-time factor ↓ &nbsp;·&nbsp;
SVA: speaker verification accuracy ↑ &nbsp;·&nbsp; Emo: emotion match rate ↑

Select a metric to re-rank the 18 models. The heatmap below shows protection robustness
(SIM under each of 5 protection methods).
"""


# ── build demo ────────────────────────────────────────────────────────────────

# Static markdown for the About tab, kept at module level alongside the other
# *_MD constants instead of being inlined in the layout code.
ABOUT_MD = """
## About RVCBench

**RVCBench** is an open-source benchmark for evaluating the robustness of voice cloning
against audio protection methods.

### What it measures
- How well **18+ modern zero-shot TTS/VC models** can clone a speaker's voice
- How effectively **5 audio protection methods** (SafeSpeech, Enkidu, Spectral, GR-Noise, AntiFake)
  prevent cloning across **10 datasets** and **7 evaluation metrics**

### Resources

| Resource | Link |
|----------|------|
| Paper (arXiv) | [arXiv:2602.00443](https://arxiv.org/abs/2602.00443) |
| Code & full pipeline | [GitHub: Nanboy-Ronan/RVCBench](https://github.com/Nanboy-Ronan/RVCBench) |
| Dataset | [HuggingFace: Nanboy/RVCBench](https://huggingface.co/datasets/Nanboy/RVCBench) |
| Contact | ruinanjin@alumni.ubc.ca |

### Citation

```bibtex
@article{liao2026rvcbench,
  title   = {RVCBench: Benchmarking the Robustness of Voice Cloning Across Modern Audio Generation Models},
  author  = {Liao, Xinting and Jin, Ruinan and Yu, Hanlin and Pandya, Deval and Li, Xiaoxiao},
  journal = {arXiv preprint arXiv:2602.00443},
  year    = {2026}
}
```
"""


def _build_gallery_tab(demo: gr.Blocks) -> None:
    """Add the "Voice Cloning Gallery" tab to the currently open ``gr.Tabs``.

    Lays out the model selector, the reference/target/clone audio players and
    the per-method similarity chart, then wires ``load_gallery`` to three
    triggers: the "Load Example" button, the initial page load, and a change
    of the model dropdown.

    Args:
        demo: The enclosing Blocks app; needed to register the page-load event.
    """
    with gr.Tab("🎧  Voice Cloning Gallery"):
        gr.Markdown(GALLERY_INTRO_MD)

        with gr.Row():
            model_dd = gr.Dropdown(
                choices=list(GALLERY_MODELS.keys()),
                value="ZipVoice",
                label="Voice Cloning Model",
                scale=3,
            )
            load_btn = gr.Button("Load Example", variant="primary", scale=1)

        sim_note = gr.Markdown("", elem_classes="note-box")

        with gr.Row():
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown('<h3 class="section-head">1. Reference Voice</h3>')
                gr.Markdown(f"*\"{REF_TEXT}\"*")
                ref_out = gr.Audio(label="Reference (original)", interactive=False)
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown('<h3 class="section-head">2. Target Speech</h3>')
                gr.Markdown(f"*\"{TARGET_TEXT}\"*")
                target_out = gr.Audio(label="Target utterance", interactive=False)

        gr.Markdown('<h3 class="section-head">3. Cloning Results</h3>')

        with gr.Row():
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown("#### Clean Reference")
                clean_out = gr.Audio(label="Clean clone", interactive=False)
            with gr.Column(elem_classes="audio-panel"):
                gr.Markdown("#### SafeSpeech-Protected Reference")
                prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
                prot_clone_out = gr.Audio(
                    label="Clone from protected (degraded)", interactive=False
                )

        gr.Markdown(
            '<h3 class="section-head">4. Protection Effectiveness Across Methods</h3>'
        )
        sim_chart = gr.Plot(label="", show_label=False)

        # The order of this list is the order in which load_gallery's return
        # values are assigned to components.
        gallery_outputs = [
            ref_out, target_out, clean_out, prot_ref_out,
            prot_clone_out, sim_note, sim_chart,
        ]
        load_btn.click(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
        demo.load(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)
        model_dd.change(fn=load_gallery, inputs=[model_dd], outputs=gallery_outputs)


def _build_protect_tab() -> None:
    """Add the "Protect Your Voice" tab to the currently open ``gr.Tabs``.

    Provides an audio upload, a protection-method dropdown populated from
    ``PROTECT_FN`` and a strength slider. Changing the method calls
    ``update_strength_label`` to refresh the slider; the button calls
    ``run_protection`` and fills the original/protected players, the metrics
    text and the waveform plot.
    """
    with gr.Tab("🔒  Protect Your Voice"):
        gr.Markdown(PROT_INTRO_MD)

        with gr.Row():
            audio_in = gr.Audio(
                label="Upload your audio (wav / mp3, ≤ 30 s)",
                type="numpy",
                scale=3,
            )
            with gr.Column(scale=1):
                method_dd = gr.Dropdown(
                    choices=list(PROTECT_FN.keys()),
                    value="GR-Noise",
                    label="Protection Method",
                )
                strength_sl = gr.Slider(
                    minimum=10,
                    maximum=40,
                    value=25,
                    step=1,
                    label="Target SNR (dB) — lower = stronger, more audible",
                    info="25 dB: nearly imperceptible. 10 dB: noticeable noise.",
                )
                protect_btn = gr.Button("Apply Protection", variant="primary")

        with gr.Row():
            orig_out = gr.Audio(label="Original", interactive=False)
            prot_live = gr.Audio(label="Protected", interactive=False)

        metrics_out = gr.Markdown("")
        waveform_plot = gr.Plot(label="Waveform Comparison", show_label=False)

        method_dd.change(
            fn=update_strength_label, inputs=[method_dd], outputs=[strength_sl]
        )
        protect_btn.click(
            fn=run_protection,
            inputs=[audio_in, method_dd, strength_sl],
            outputs=[orig_out, prot_live, metrics_out, waveform_plot],
        )

        gr.Markdown(
            "> **Note:** Full voice cloning inference (SafeSpeech, Enkidu, AntiFake) "
            "requires surrogate models and is not included in this Space due to compute "
            "constraints. See the "
            "[GitHub repo](https://github.com/Nanboy-Ronan/RVCBench) for the full pipeline."
        )


def _build_results_tab(demo: gr.Blocks) -> None:
    """Add the "Results Explorer" tab to the currently open ``gr.Tabs``.

    Contains a metric-sortable bar chart followed by two heatmaps
    (cross-dataset generalisation, then protection robustness). All three
    plots are filled by page-load events; the bar chart is additionally
    redrawn through ``update_results_bar`` when the metric dropdown changes.

    Args:
        demo: The enclosing Blocks app; needed to register the page-load events.
    """
    with gr.Tab("📊  Results Explorer"):
        gr.Markdown(RESULTS_INTRO_MD)

        metric_dd = gr.Dropdown(
            choices=list(METRIC_META.keys()),
            value="SIM",
            label="Sort by metric",
        )
        bar_chart = gr.Plot(label="", show_label=False)
        metric_dd.change(
            fn=update_results_bar, inputs=[metric_dd], outputs=[bar_chart]
        )
        # Initial render uses the same metric as the dropdown's default value.
        demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])

        gr.Markdown("---")
        gr.Markdown(
            "### Cross-Dataset Generalisation\n"
            "SIM on clean prompts across all 10 benchmark datasets. "
            "Models sorted by LibriTTS SIM. — = not evaluated."
        )
        cross_heatmap = gr.Plot(label="", show_label=False)
        demo.load(fn=make_cross_dataset_heatmap, outputs=[cross_heatmap])

        gr.Markdown("---")
        gr.Markdown(
            "### Protection Robustness Heatmap\n"
            "SIM under each of 5 protection methods — drop from **Clean** indicates "
            "more effective protection."
        )
        prot_heatmap = gr.Plot(label="", show_label=False)
        demo.load(fn=make_prot_heatmap, outputs=[prot_heatmap])


def _build_about_tab() -> None:
    """Add the static "About" tab (project summary, links, citation)."""
    with gr.Tab("ℹ️  About"):
        gr.Markdown(ABOUT_MD)


def build_demo() -> gr.Blocks:
    """Assemble the RVCBench Gradio app and return it without launching.

    The page consists of the intro banner (``INTRO_MD``) followed by four
    tabs, each built by its own helper. The helpers must be called inside the
    ``gr.Tabs`` context and in this order, because Gradio attaches components
    to whichever layout context is open when they are created.

    Returns:
        The configured ``gr.Blocks`` instance; the caller is responsible for
        calling ``.launch()``.
    """
    with gr.Blocks(css=CSS, title="RVCBench Demo") as demo:
        gr.Markdown(INTRO_MD)

        with gr.Tabs():
            _build_gallery_tab(demo)
            _build_protect_tab()
            _build_results_tab(demo)
            _build_about_tab()

    return demo


if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 with the API page disabled.
    app = build_demo()
    app.launch(server_name="0.0.0.0", server_port=7860, show_api=False)