Fix RVCBench title color: add color: white !important to .hero h1
Browse files
app.py
CHANGED
|
@@ -124,6 +124,41 @@ LEADERBOARD_ROWS = [
|
|
| 124 |
dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589),
|
| 125 |
]
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
# Protection robustness — SIM under each method (LibriTTS, all 18 models)
|
| 128 |
PROT_ROWS = [
|
| 129 |
dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
|
|
@@ -409,6 +444,59 @@ def make_prot_heatmap() -> go.Figure:
|
|
| 409 |
return fig
|
| 410 |
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
def make_waveform_figure(
|
| 413 |
original: np.ndarray, protected: np.ndarray, sr: int
|
| 414 |
) -> go.Figure:
|
|
@@ -543,6 +631,7 @@ footer { display: none !important; }
|
|
| 543 |
font-size: 2.35rem;
|
| 544 |
line-height: 1.08;
|
| 545 |
letter-spacing: 0;
|
|
|
|
| 546 |
}
|
| 547 |
.hero p {
|
| 548 |
max-width: 760px;
|
|
@@ -635,7 +724,7 @@ INTRO_MD = """
|
|
| 635 |
</div>
|
| 636 |
|
| 637 |
<div class="stat-strip">
|
| 638 |
-
<div class="stat-card"><b>
|
| 639 |
<div class="stat-card"><b>5</b><span>protection methods</span></div>
|
| 640 |
<div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
|
| 641 |
<div class="stat-card"><b>10</b><span>speech datasets</span></div>
|
|
@@ -778,6 +867,15 @@ def build_demo():
|
|
| 778 |
outputs=[bar_chart])
|
| 779 |
demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])
|
| 780 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
gr.Markdown("---")
|
| 782 |
gr.Markdown(
|
| 783 |
"### Protection Robustness Heatmap\n"
|
|
|
|
| 124 |
dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589),
|
| 125 |
]
|
| 126 |
|
| 127 |
+
# Cross-dataset generalisation — SIM on clean prompts across all 10 datasets.
# Each record below is (model name, then one SIM score per dataset) in the
# order given by _CROSS_DATASET_FIELDS. None marks a (model, dataset) cell
# with no score; the heatmap renders those cells as "—".
_CROSS_DATASET_FIELDS = (
    "model",
    "LibriTTS", "VCTK", "MultiSpk", "Long", "AISHELL",
    "French", "Bilingual", "BGclean", "BGnoise", "Hallucin",
)

_CROSS_DATASET_TABLE = (
    ("Qwen3-TTS",     0.614, 0.618, 0.495, 0.561, 0.721, 0.536, 0.673, 0.689, 0.572, 0.515),
    ("IndexTTS",      0.606, 0.567, 0.473, 0.775, 0.721, 0.397, 0.673, 0.589, 0.528, 0.529),
    ("CosyVoice 2",   0.602, 0.582, 0.448, 0.530, 0.717, 0.378, 0.653, 0.626, 0.515, 0.518),
    ("ZipVoice",      0.579, 0.554, 0.531, 0.729, 0.712, 0.363, 0.322, 0.625, 0.462, 0.509),
    ("MaskGCT",       0.570, 0.555, 0.431, 0.194, 0.674, 0.494, None,  0.610, 0.487, 0.499),
    ("GLM-TTS",       0.570, 0.573, 0.445, 0.757, 0.690, 0.398, 0.657, 0.622, 0.528, 0.533),
    ("F5-TTS",        0.559, 0.537, 0.507, 0.607, 0.696, 0.304, 0.653, 0.582, 0.414, 0.455),
    ("Higgs Audio",   0.559, 0.516, 0.418, 0.520, 0.581, 0.349, 0.543, 0.592, 0.421, 0.425),
    ("MGM-Omni",      0.539, 0.447, 0.370, 0.442, 0.713, 0.227, 0.630, 0.523, 0.332, 0.396),
    ("PlayDiffusion", 0.506, 0.426, 0.360, 0.637, 0.441, 0.283, 0.465, 0.433, 0.305, 0.408),
    ("MOSS-TTSD",     0.492, 0.440, 0.379, 0.644, 0.437, 0.327, 0.471, 0.494, 0.488, 0.416),
    ("VibeVoice",     0.480, 0.436, 0.348, 0.625, 0.564, 0.343, 0.531, 0.513, 0.364, 0.408),
    ("FishSpeech",    0.472, 0.430, 0.383, 0.572, 0.611, 0.374, 0.566, 0.495, 0.387, 0.351),
    ("XTTS-v2",       0.454, 0.454, 0.328, 0.613, 0.569, 0.445, 0.506, 0.546, 0.394, 0.488),
    ("SparkTTS",      0.408, 0.532, 0.228, 0.345, 0.569, 0.164, 0.480, 0.588, 0.332, 0.336),
    ("OZSpeech",      0.388, 0.253, 0.271, None,  None,  0.109, None,  0.272, 0.164, 0.281),
    ("OpenVoice V2",  0.244, 0.392, 0.192, 0.278, 0.431, 0.271, 0.298, 0.484, 0.358, 0.365),
    ("StyleTTS 2",    0.228, 0.236, 0.162, None,  None,  None,  0.213, 0.196, 0.166, 0.184),
)

# One dict per model, keyed by "model" plus the ten dataset keys.
CROSS_DATASET_ROWS = [
    dict(zip(_CROSS_DATASET_FIELDS, record)) for record in _CROSS_DATASET_TABLE
]

# (row key, display label) pairs, in the left-to-right order of heatmap columns.
CROSS_DATASET_COLS = list(zip(
    _CROSS_DATASET_FIELDS[1:],
    (
        "LibriTTS", "VCTK", "Multi-spk", "Long", "AISHELL",
        "French", "Bilingual", "BG-clean", "BG-noise", "Hallucin.",
    ),
))
|
| 161 |
+
|
| 162 |
# Protection robustness — SIM under each method (LibriTTS, all 18 models)
|
| 163 |
PROT_ROWS = [
|
| 164 |
dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
|
|
|
|
| 444 |
return fig
|
| 445 |
|
| 446 |
|
| 447 |
+
def make_cross_dataset_heatmap() -> go.Figure:
    """Build the cross-dataset SIM heatmap (models x datasets) on clean prompts.

    Rows are taken from ``CROSS_DATASET_ROWS`` and ordered by descending
    LibriTTS SIM; columns follow the order of ``CROSS_DATASET_COLS``. A score
    of ``None`` is passed to Plotly unchanged and annotated with "—".

    Returns:
        A Plotly figure holding one annotated heatmap trace.
    """
    col_keys = [key for key, _ in CROSS_DATASET_COLS]
    col_labels = [label for _, label in CROSS_DATASET_COLS]

    def _libritts_rank(row):
        # None is the table's "not evaluated" marker; comparing it with a
        # float raises TypeError, so rank such rows after every scored row.
        score = row.get("LibriTTS")
        return (score is not None, 0.0 if score is None else score)

    # sorted() is stable, so models with equal LibriTTS SIM keep table order.
    rows = sorted(CROSS_DATASET_ROWS, key=_libritts_rank, reverse=True)
    model_names = [row["model"] for row in rows]

    # z holds the raw scores (None for missing cells); text_vals holds the
    # per-cell annotation drawn on top of each cell.
    z = [[row.get(key) for key in col_keys] for row in rows]
    text_vals = [
        ["—" if value is None else f"{value:.3f}" for value in scores]
        for scores in z
    ]

    fig = go.Figure(go.Heatmap(
        z=z,
        x=col_labels,
        y=model_names,
        text=text_vals,
        texttemplate="%{text}",
        textfont=dict(size=10),
        # Red (low SIM) -> pale yellow -> green (high SIM).
        colorscale=[
            [0.0, "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5, "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0, "#1b5e20"],
        ],
        # NOTE(review): the table contains scores above 0.75 (e.g. 0.775);
        # those cells are drawn with the top-of-scale colour. Confirm the
        # fixed range is intended.
        zmin=0.0, zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    ))
    fig.update_layout(
        title=dict(
            text="<b>Cross-Dataset Generalisation — Speaker Similarity (SIM) on Clean Prompts</b><br>"
                 "<sup>Models sorted by LibriTTS SIM. — = not evaluated. "
                 "Green = high SIM (faithful clone), red = low SIM.</sup>",
            font=dict(size=13),
        ),
        # Reversed y-axis puts the first (highest-ranked) model at the top.
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white", plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
|
| 498 |
+
|
| 499 |
+
|
| 500 |
def make_waveform_figure(
|
| 501 |
original: np.ndarray, protected: np.ndarray, sr: int
|
| 502 |
) -> go.Figure:
|
|
|
|
| 631 |
font-size: 2.35rem;
|
| 632 |
line-height: 1.08;
|
| 633 |
letter-spacing: 0;
|
| 634 |
+
color: white !important;
|
| 635 |
}
|
| 636 |
.hero p {
|
| 637 |
max-width: 760px;
|
|
|
|
| 724 |
</div>
|
| 725 |
|
| 726 |
<div class="stat-strip">
|
| 727 |
+
<div class="stat-card"><b>26</b><span>voice cloning models</span></div>
|
| 728 |
<div class="stat-card"><b>5</b><span>protection methods</span></div>
|
| 729 |
<div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
|
| 730 |
<div class="stat-card"><b>10</b><span>speech datasets</span></div>
|
|
|
|
| 867 |
outputs=[bar_chart])
|
| 868 |
demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])
|
| 869 |
|
| 870 |
+
gr.Markdown("---")
|
| 871 |
+
gr.Markdown(
|
| 872 |
+
"### Cross-Dataset Generalisation\n"
|
| 873 |
+
"SIM on clean prompts across all 10 benchmark datasets. "
|
| 874 |
+
"Models sorted by LibriTTS SIM. — = not evaluated."
|
| 875 |
+
)
|
| 876 |
+
cross_heatmap = gr.Plot(label="", show_label=False)
|
| 877 |
+
demo.load(fn=make_cross_dataset_heatmap, outputs=[cross_heatmap])
|
| 878 |
+
|
| 879 |
gr.Markdown("---")
|
| 880 |
gr.Markdown(
|
| 881 |
"### Protection Robustness Heatmap\n"
|