Nanboy committed on
Commit
665fda4
·
verified ·
1 Parent(s): 31d8814

Fix RVCBench title color: add color: white !important to .hero h1

Browse files
Files changed (1) hide show
  1. app.py +99 -1
app.py CHANGED
@@ -124,6 +124,41 @@ LEADERBOARD_ROWS = [
124
  dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589),
125
  ]
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  # Protection robustness — SIM under each method (LibriTTS, all 18 models)
128
  PROT_ROWS = [
129
  dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
@@ -409,6 +444,59 @@ def make_prot_heatmap() -> go.Figure:
409
  return fig
410
 
411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  def make_waveform_figure(
413
  original: np.ndarray, protected: np.ndarray, sr: int
414
  ) -> go.Figure:
@@ -543,6 +631,7 @@ footer { display: none !important; }
543
  font-size: 2.35rem;
544
  line-height: 1.08;
545
  letter-spacing: 0;
 
546
  }
547
  .hero p {
548
  max-width: 760px;
@@ -635,7 +724,7 @@ INTRO_MD = """
635
  </div>
636
 
637
  <div class="stat-strip">
638
- <div class="stat-card"><b>18</b><span>voice cloning models</span></div>
639
  <div class="stat-card"><b>5</b><span>protection methods</span></div>
640
  <div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
641
  <div class="stat-card"><b>10</b><span>speech datasets</span></div>
@@ -778,6 +867,15 @@ def build_demo():
778
  outputs=[bar_chart])
779
  demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])
780
 
 
 
 
 
 
 
 
 
 
781
  gr.Markdown("---")
782
  gr.Markdown(
783
  "### Protection Robustness Heatmap\n"
 
124
  dict(model="StyleTTS 2", SIM=0.228, WER=0.049, MOS=4.30, MCD=6.81, RTF=0.11, SVA=0.388, Emo=0.589),
125
  ]
126
 
127
# Cross-dataset generalisation — SIM on clean prompts across all 10 datasets.
# One dict per model: "model" is the display name, every other key is a
# dataset column key from CROSS_DATASET_COLS. None marks a model/dataset pair
# with no score; make_cross_dataset_heatmap labels those cells "—"
# (not evaluated).
CROSS_DATASET_ROWS = [
    dict(model="Qwen3-TTS", LibriTTS=0.614, VCTK=0.618, MultiSpk=0.495, Long=0.561, AISHELL=0.721, French=0.536, Bilingual=0.673, BGclean=0.689, BGnoise=0.572, Hallucin=0.515),
    dict(model="IndexTTS", LibriTTS=0.606, VCTK=0.567, MultiSpk=0.473, Long=0.775, AISHELL=0.721, French=0.397, Bilingual=0.673, BGclean=0.589, BGnoise=0.528, Hallucin=0.529),
    dict(model="CosyVoice 2", LibriTTS=0.602, VCTK=0.582, MultiSpk=0.448, Long=0.530, AISHELL=0.717, French=0.378, Bilingual=0.653, BGclean=0.626, BGnoise=0.515, Hallucin=0.518),
    dict(model="ZipVoice", LibriTTS=0.579, VCTK=0.554, MultiSpk=0.531, Long=0.729, AISHELL=0.712, French=0.363, Bilingual=0.322, BGclean=0.625, BGnoise=0.462, Hallucin=0.509),
    dict(model="MaskGCT", LibriTTS=0.570, VCTK=0.555, MultiSpk=0.431, Long=0.194, AISHELL=0.674, French=0.494, Bilingual=None, BGclean=0.610, BGnoise=0.487, Hallucin=0.499),
    dict(model="GLM-TTS", LibriTTS=0.570, VCTK=0.573, MultiSpk=0.445, Long=0.757, AISHELL=0.690, French=0.398, Bilingual=0.657, BGclean=0.622, BGnoise=0.528, Hallucin=0.533),
    dict(model="F5-TTS", LibriTTS=0.559, VCTK=0.537, MultiSpk=0.507, Long=0.607, AISHELL=0.696, French=0.304, Bilingual=0.653, BGclean=0.582, BGnoise=0.414, Hallucin=0.455),
    dict(model="Higgs Audio", LibriTTS=0.559, VCTK=0.516, MultiSpk=0.418, Long=0.520, AISHELL=0.581, French=0.349, Bilingual=0.543, BGclean=0.592, BGnoise=0.421, Hallucin=0.425),
    dict(model="MGM-Omni", LibriTTS=0.539, VCTK=0.447, MultiSpk=0.370, Long=0.442, AISHELL=0.713, French=0.227, Bilingual=0.630, BGclean=0.523, BGnoise=0.332, Hallucin=0.396),
    dict(model="PlayDiffusion", LibriTTS=0.506, VCTK=0.426, MultiSpk=0.360, Long=0.637, AISHELL=0.441, French=0.283, Bilingual=0.465, BGclean=0.433, BGnoise=0.305, Hallucin=0.408),
    dict(model="MOSS-TTSD", LibriTTS=0.492, VCTK=0.440, MultiSpk=0.379, Long=0.644, AISHELL=0.437, French=0.327, Bilingual=0.471, BGclean=0.494, BGnoise=0.488, Hallucin=0.416),
    dict(model="VibeVoice", LibriTTS=0.480, VCTK=0.436, MultiSpk=0.348, Long=0.625, AISHELL=0.564, French=0.343, Bilingual=0.531, BGclean=0.513, BGnoise=0.364, Hallucin=0.408),
    dict(model="FishSpeech", LibriTTS=0.472, VCTK=0.430, MultiSpk=0.383, Long=0.572, AISHELL=0.611, French=0.374, Bilingual=0.566, BGclean=0.495, BGnoise=0.387, Hallucin=0.351),
    dict(model="XTTS-v2", LibriTTS=0.454, VCTK=0.454, MultiSpk=0.328, Long=0.613, AISHELL=0.569, French=0.445, Bilingual=0.506, BGclean=0.546, BGnoise=0.394, Hallucin=0.488),
    dict(model="SparkTTS", LibriTTS=0.408, VCTK=0.532, MultiSpk=0.228, Long=0.345, AISHELL=0.569, French=0.164, Bilingual=0.480, BGclean=0.588, BGnoise=0.332, Hallucin=0.336),
    dict(model="OZSpeech", LibriTTS=0.388, VCTK=0.253, MultiSpk=0.271, Long=None, AISHELL=None, French=0.109, Bilingual=None, BGclean=0.272, BGnoise=0.164, Hallucin=0.281),
    dict(model="OpenVoice V2", LibriTTS=0.244, VCTK=0.392, MultiSpk=0.192, Long=0.278, AISHELL=0.431, French=0.271, Bilingual=0.298, BGclean=0.484, BGnoise=0.358, Hallucin=0.365),
    dict(model="StyleTTS 2", LibriTTS=0.228, VCTK=0.236, MultiSpk=0.162, Long=None, AISHELL=None, French=None, Bilingual=0.213, BGclean=0.196, BGnoise=0.166, Hallucin=0.184),
]
148
+
149
# Column order for the cross-dataset heatmap, as (row key, display label)
# pairs: the key indexes a CROSS_DATASET_ROWS dict, the label is the text
# shown on the heatmap's x axis.
CROSS_DATASET_COLS = [
    ("LibriTTS", "LibriTTS"),
    ("VCTK", "VCTK"),
    ("MultiSpk", "Multi-spk"),
    ("Long", "Long"),
    ("AISHELL", "AISHELL"),
    ("French", "French"),
    ("Bilingual", "Bilingual"),
    ("BGclean", "BG-clean"),
    ("BGnoise", "BG-noise"),
    ("Hallucin", "Hallucin."),
]
161
+
162
  # Protection robustness — SIM under each method (LibriTTS, all 18 models)
163
  PROT_ROWS = [
164
  dict(model="Qwen3-TTS", Clean=0.614, SafeSpeech=0.384, Enkidu=0.502, Spectral=0.363, GRNoise=0.408, AntiFake=0.582),
 
444
  return fig
445
 
446
 
447
def make_cross_dataset_heatmap() -> go.Figure:
    """Build the cross-dataset heatmap of SIM on clean prompts.

    Rows are the models in ``CROSS_DATASET_ROWS``, ordered by descending
    LibriTTS SIM; columns follow the order of ``CROSS_DATASET_COLS``. A
    missing or ``None`` score is passed to Plotly as ``None`` and labelled
    "—" (not evaluated).

    Returns:
        A ``go.Figure`` containing a single ``go.Heatmap`` trace.
    """
    col_keys = [key for key, _ in CROSS_DATASET_COLS]
    col_labels = [label for _, label in CROSS_DATASET_COLS]

    def _libritts_rank(row):
        # Rows without a LibriTTS score sort last (after reverse=True) instead
        # of raising TypeError when None is compared with a float.
        score = row.get("LibriTTS")
        return (score is not None, 0.0 if score is None else score)

    rows = sorted(CROSS_DATASET_ROWS, key=_libritts_rank, reverse=True)
    model_names = [row["model"] for row in rows]

    # z carries the raw scores (None where absent); text_vals carries the
    # per-cell label shown through texttemplate.
    z = [[row.get(key) for key in col_keys] for row in rows]
    text_vals = [
        ["—" if value is None else f"{value:.3f}" for value in row_z]
        for row_z in z
    ]

    fig = go.Figure(go.Heatmap(
        z=z,
        x=col_labels,
        y=model_names,
        text=text_vals,
        texttemplate="%{text}",
        textfont=dict(size=10),
        colorscale=[
            [0.0, "#b71c1c"],
            [0.25, "#ef9a9a"],
            [0.5, "#fff9c4"],
            [0.75, "#a5d6a7"],
            [1.0, "#1b5e20"],
        ],
        # NOTE(review): the table holds a few scores above zmax (e.g. 0.775);
        # presumably Plotly draws those with the end colour — confirm.
        zmin=0.0, zmax=0.75,
        colorbar=dict(title="SIM", tickformat=".2f", len=0.8),
        hoverongaps=False,
    ))
    fig.update_layout(
        title=dict(
            text="<b>Cross-Dataset Generalisation — Speaker Similarity (SIM) on Clean Prompts</b><br>"
                 "<sup>Models sorted by LibriTTS SIM. — = not evaluated. "
                 "Green = high SIM (faithful clone), red = low SIM.</sup>",
            font=dict(size=13),
        ),
        # Reversed y axis puts the first (highest-LibriTTS) row at the top.
        yaxis=dict(autorange="reversed"),
        xaxis=dict(side="top"),
        paper_bgcolor="white", plot_bgcolor="white",
        margin=dict(t=120, b=40, l=120, r=80),
        height=600,
    )
    return fig
498
+
499
+
500
  def make_waveform_figure(
501
  original: np.ndarray, protected: np.ndarray, sr: int
502
  ) -> go.Figure:
 
631
  font-size: 2.35rem;
632
  line-height: 1.08;
633
  letter-spacing: 0;
634
+ color: white !important;
635
  }
636
  .hero p {
637
  max-width: 760px;
 
724
  </div>
725
 
726
  <div class="stat-strip">
727
+ <div class="stat-card"><b>26</b><span>voice cloning models</span></div>
728
  <div class="stat-card"><b>5</b><span>protection methods</span></div>
729
  <div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
730
  <div class="stat-card"><b>10</b><span>speech datasets</span></div>
 
867
  outputs=[bar_chart])
868
  demo.load(fn=lambda: make_results_bar("SIM"), outputs=[bar_chart])
869
 
870
+ gr.Markdown("---")
871
+ gr.Markdown(
872
+ "### Cross-Dataset Generalisation\n"
873
+ "SIM on clean prompts across all 10 benchmark datasets. "
874
+ "Models sorted by LibriTTS SIM. — = not evaluated."
875
+ )
876
+ cross_heatmap = gr.Plot(label="", show_label=False)
877
+ demo.load(fn=make_cross_dataset_heatmap, outputs=[cross_heatmap])
878
+
879
  gr.Markdown("---")
880
  gr.Markdown(
881
  "### Protection Robustness Heatmap\n"