Nanboy committed on
Commit
31d8814
·
verified ·
1 Parent(s): 3b92d83

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +165 -50
app.py CHANGED
@@ -252,42 +252,62 @@ def make_sim_bar(model_name: str) -> go.Figure:
252
  values = list(sims.values())
253
 
254
  bar_colors = [
255
- "#1565c0", # Clean
256
- "#6a1b9a", # SafeSpeech
257
- "#1b5e20", # Enkidu
258
- "#e65100", # Spectral
259
- "#37474f", # GR-Noise
260
- "#880e4f", # AntiFake
261
  ]
262
  # annotate drop vs clean
263
  clean_sim = sims["Clean"]
264
  text = [f"{v:.3f}" if k == "Clean" else f"{v:.3f}<br>↓{clean_sim - v:.3f}"
265
  for k, v in sims.items()]
 
 
 
 
266
 
267
  fig = go.Figure(go.Bar(
268
  x=labels, y=values,
269
  marker_color=bar_colors,
270
- text=text, textposition="outside",
 
 
 
 
 
271
  cliponaxis=False,
272
  ))
273
  fig.update_layout(
274
- title=dict(text=f"<b>{model_name}</b> — Speaker Similarity Under Each Protection",
275
- font=dict(size=14)),
276
- yaxis=dict(title="SIM (Speaker Similarity)", range=[0, max(values) * 1.2]),
277
- xaxis=dict(title="Condition"),
278
- paper_bgcolor="white", plot_bgcolor="#f8f9fa",
279
- margin=dict(t=60, b=40, l=50, r=20),
280
- height=320,
 
 
 
 
 
 
 
 
 
281
  showlegend=False,
 
 
282
  )
283
  fig.add_trace(go.Scatter(
284
  x=labels,
285
  y=[clean_sim] * len(labels),
286
  mode="lines+text",
287
- line=dict(color="#1565c0", dash="dot", width=1.5),
288
  text=[""] * (len(labels) - 1) + ["Clean baseline"],
289
  textposition="top right",
290
- textfont=dict(size=10, color="#1565c0"),
291
  hoverinfo="skip",
292
  showlegend=False,
293
  ))
@@ -506,31 +526,126 @@ def update_results_bar(metric: str) -> go.Figure:
506
  # ── UI constants ──────────────────────────────────────────────────────────────
507
 
508
  CSS = """
509
- #title { text-align: center; }
510
  footer { display: none !important; }
511
- .note-box { font-size: 1.05em; background: #f0f4ff; border-radius: 8px; padding: 8px 12px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
512
  """
513
 
514
  INTRO_MD = """
515
- <div id="title">
516
-
517
- # RVCBench — Voice Cloning & Protection Benchmark
518
-
519
- **Can audio protection prevent your voice from being cloned?**
520
-
521
- [![Paper](https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg)](https://arxiv.org/abs/2602.00443)
522
- [![Dataset](https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg)](https://huggingface.co/datasets/Nanboy/RVCBench)
523
- [![GitHub](https://img.shields.io/badge/GitHub-RVCBench-181717.svg)](https://github.com/Nanboy-Ronan/RVCBench)
524
 
 
 
 
 
 
525
  </div>
526
  """
527
 
528
  GALLERY_INTRO_MD = """
529
- A voice cloning model uses the **Reference Voice** to clone the **Target Speech**.
530
- When protection (SafeSpeech adversarial perturbation) is applied to the reference first,
531
- the clone degrades — lower speaker similarity means protection is working.
532
-
533
- The bar chart below shows the SIM drop under **all 5 protection methods** for the selected model.
534
  """
535
 
536
  PROT_INTRO_MD = """
@@ -564,39 +679,39 @@ def build_demo():
564
  with gr.Tab("🎧 Voice Cloning Gallery"):
565
  gr.Markdown(GALLERY_INTRO_MD)
566
 
567
- model_dd = gr.Dropdown(
568
- choices=list(GALLERY_MODELS.keys()),
569
- value="ZipVoice",
570
- label="Voice Cloning Model",
571
- )
572
- load_btn = gr.Button("Load Example", variant="primary")
 
 
573
 
574
  sim_note = gr.Markdown("", elem_classes="note-box")
575
 
576
  with gr.Row():
577
- with gr.Column():
578
- gr.Markdown("### 1 · Reference Voice")
579
  gr.Markdown(f"*\"{REF_TEXT}\"*")
580
  ref_out = gr.Audio(label="Reference (original)", interactive=False)
581
- with gr.Column():
582
- gr.Markdown("### 2 · Target Speech")
583
  gr.Markdown(f"*\"{TARGET_TEXT}\"*")
584
  target_out = gr.Audio(label="Target utterance", interactive=False)
585
 
586
- gr.Markdown("---")
587
- gr.Markdown("### 3 · Cloning Results — Clean vs. SafeSpeech-Protected")
588
 
589
  with gr.Row():
590
- with gr.Column():
591
- gr.Markdown("#### Without Protection")
592
  clean_out = gr.Audio(label="Clean clone", interactive=False)
593
- with gr.Column():
594
- gr.Markdown("#### With SafeSpeech Protection")
595
  prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
596
  prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)
597
 
598
- gr.Markdown("---")
599
- gr.Markdown("### 4 · Protection Effectiveness Across All Methods")
600
  sim_chart = gr.Plot(label="", show_label=False)
601
 
602
  gallery_outputs = [ref_out, target_out, clean_out, prot_ref_out,
 
252
  values = list(sims.values())
253
 
254
  bar_colors = [
255
+ "#2563eb", # Clean
256
+ "#7c3aed", # SafeSpeech
257
+ "#059669", # Enkidu
258
+ "#ea580c", # Spectral
259
+ "#475569", # GR-Noise
260
+ "#be123c", # AntiFake
261
  ]
262
  # annotate drop vs clean
263
  clean_sim = sims["Clean"]
264
  text = [f"{v:.3f}" if k == "Clean" else f"{v:.3f}<br>↓{clean_sim - v:.3f}"
265
  for k, v in sims.items()]
266
+ hover_text = [
267
+ f"{label}<br>SIM: {value:.3f}<br>Drop from clean: {clean_sim - value:.3f}"
268
+ for label, value in zip(labels, values)
269
+ ]
270
 
271
  fig = go.Figure(go.Bar(
272
  x=labels, y=values,
273
  marker_color=bar_colors,
274
+ marker_line_color="rgba(15, 23, 42, 0.25)",
275
+ marker_line_width=1,
276
+ text=text,
277
+ textposition="outside",
278
+ hovertext=hover_text,
279
+ hoverinfo="text",
280
  cliponaxis=False,
281
  ))
282
  fig.update_layout(
283
+ title=dict(
284
+ text=f"<b>{model_name}</b> speaker similarity after protection",
285
+ font=dict(size=16, color="#0f172a"),
286
+ x=0.02,
287
+ ),
288
+ yaxis=dict(
289
+ title="SIM",
290
+ range=[0, min(0.75, max(values) * 1.28)],
291
+ gridcolor="#e2e8f0",
292
+ zeroline=False,
293
+ ),
294
+ xaxis=dict(title="", tickfont=dict(size=12)),
295
+ paper_bgcolor="white",
296
+ plot_bgcolor="#f8fafc",
297
+ margin=dict(t=62, b=42, l=48, r=24),
298
+ height=350,
299
  showlegend=False,
300
+ bargap=0.28,
301
+ font=dict(color="#334155"),
302
  )
303
  fig.add_trace(go.Scatter(
304
  x=labels,
305
  y=[clean_sim] * len(labels),
306
  mode="lines+text",
307
+ line=dict(color="#2563eb", dash="dot", width=1.5),
308
  text=[""] * (len(labels) - 1) + ["Clean baseline"],
309
  textposition="top right",
310
+ textfont=dict(size=10, color="#2563eb"),
311
  hoverinfo="skip",
312
  showlegend=False,
313
  ))
 
526
  # ── UI constants ──────────────────────────────────────────────────────────────
527
 
528
  CSS = """
 
529
  footer { display: none !important; }
530
+ .gradio-container {
531
+ max-width: 1180px !important;
532
+ margin: 0 auto !important;
533
+ }
534
+ .hero {
535
+ padding: 28px 28px 22px;
536
+ border-radius: 12px;
537
+ background: linear-gradient(135deg, #0f172a 0%, #164e63 54%, #065f46 100%);
538
+ color: white;
539
+ margin-bottom: 18px;
540
+ }
541
+ .hero h1 {
542
+ margin: 0 0 8px;
543
+ font-size: 2.35rem;
544
+ line-height: 1.08;
545
+ letter-spacing: 0;
546
+ }
547
+ .hero p {
548
+ max-width: 760px;
549
+ margin: 0;
550
+ color: #dbeafe;
551
+ font-size: 1.05rem;
552
+ }
553
+ .hero a {
554
+ color: white !important;
555
+ }
556
+ .hero-links {
557
+ display: flex;
558
+ flex-wrap: wrap;
559
+ gap: 8px;
560
+ margin-top: 16px;
561
+ }
562
+ .hero-links a {
563
+ text-decoration: none;
564
+ }
565
+ .stat-strip {
566
+ display: grid;
567
+ grid-template-columns: repeat(4, minmax(0, 1fr));
568
+ gap: 10px;
569
+ margin: 14px 0 18px;
570
+ }
571
+ .stat-card {
572
+ border: 1px solid #d8dee9;
573
+ border-radius: 8px;
574
+ padding: 12px 14px;
575
+ background: #ffffff;
576
+ }
577
+ .stat-card b {
578
+ display: block;
579
+ font-size: 1.35rem;
580
+ color: #0f172a;
581
+ line-height: 1.1;
582
+ }
583
+ .stat-card span {
584
+ color: #475569;
585
+ font-size: 0.9rem;
586
+ }
587
+ .section-head {
588
+ margin: 18px 0 8px;
589
+ color: #0f172a;
590
+ }
591
+ .note-box {
592
+ font-size: 1.02em;
593
+ background: #eef6ff;
594
+ border: 1px solid #bfdbfe;
595
+ border-left: 4px solid #2563eb;
596
+ border-radius: 8px;
597
+ padding: 10px 12px;
598
+ }
599
+ .audio-panel {
600
+ border: 1px solid #e2e8f0;
601
+ border-radius: 8px;
602
+ padding: 12px;
603
+ background: #ffffff;
604
+ }
605
+ .audio-panel h3,
606
+ .audio-panel h4 {
607
+ margin-top: 0;
608
+ }
609
+ .workflow-copy {
610
+ color: #475569;
611
+ margin-bottom: 12px;
612
+ }
613
+ @media (max-width: 760px) {
614
+ .hero {
615
+ padding: 22px 18px 18px;
616
+ }
617
+ .hero h1 {
618
+ font-size: 1.75rem;
619
+ }
620
+ .stat-strip {
621
+ grid-template-columns: repeat(2, minmax(0, 1fr));
622
+ }
623
+ }
624
  """
625
 
626
  INTRO_MD = """
627
+ <div class="hero">
628
+ <h1>RVCBench</h1>
629
+ <p>Voice cloning attacks and audio protection methods, compared through paired listening examples and speaker-similarity results.</p>
630
+ <div class="hero-links">
631
+ <a href="https://arxiv.org/abs/2602.00443"><img alt="Paper" src="https://img.shields.io/badge/arXiv-2602.00443-b31b1b.svg"></a>
632
+ <a href="https://huggingface.co/datasets/Nanboy/RVCBench"><img alt="Dataset" src="https://img.shields.io/badge/HuggingFace-Dataset-ffcc00.svg"></a>
633
+ <a href="https://github.com/Nanboy-Ronan/RVCBench"><img alt="GitHub" src="https://img.shields.io/badge/GitHub-RVCBench-181717.svg"></a>
634
+ </div>
635
+ </div>
636
 
637
+ <div class="stat-strip">
638
+ <div class="stat-card"><b>18</b><span>voice cloning models</span></div>
639
+ <div class="stat-card"><b>5</b><span>protection methods</span></div>
640
+ <div class="stat-card"><b>7</b><span>evaluation metrics</span></div>
641
+ <div class="stat-card"><b>10</b><span>speech datasets</span></div>
642
  </div>
643
  """
644
 
645
  GALLERY_INTRO_MD = """
646
+ <div class="workflow-copy">
647
+ Select a cloning model, compare clean and protected audio, then inspect how much each protection method lowers speaker similarity.
648
+ </div>
 
 
649
  """
650
 
651
  PROT_INTRO_MD = """
 
679
  with gr.Tab("🎧 Voice Cloning Gallery"):
680
  gr.Markdown(GALLERY_INTRO_MD)
681
 
682
+ with gr.Row():
683
+ model_dd = gr.Dropdown(
684
+ choices=list(GALLERY_MODELS.keys()),
685
+ value="ZipVoice",
686
+ label="Voice Cloning Model",
687
+ scale=3,
688
+ )
689
+ load_btn = gr.Button("Load Example", variant="primary", scale=1)
690
 
691
  sim_note = gr.Markdown("", elem_classes="note-box")
692
 
693
  with gr.Row():
694
+ with gr.Column(elem_classes="audio-panel"):
695
+ gr.Markdown('<h3 class="section-head">1. Reference Voice</h3>')
696
  gr.Markdown(f"*\"{REF_TEXT}\"*")
697
  ref_out = gr.Audio(label="Reference (original)", interactive=False)
698
+ with gr.Column(elem_classes="audio-panel"):
699
+ gr.Markdown('<h3 class="section-head">2. Target Speech</h3>')
700
  gr.Markdown(f"*\"{TARGET_TEXT}\"*")
701
  target_out = gr.Audio(label="Target utterance", interactive=False)
702
 
703
+ gr.Markdown('<h3 class="section-head">3. Cloning Results</h3>')
 
704
 
705
  with gr.Row():
706
+ with gr.Column(elem_classes="audio-panel"):
707
+ gr.Markdown("#### Clean Reference")
708
  clean_out = gr.Audio(label="Clean clone", interactive=False)
709
+ with gr.Column(elem_classes="audio-panel"):
710
+ gr.Markdown("#### SafeSpeech-Protected Reference")
711
  prot_ref_out = gr.Audio(label="Protected reference", interactive=False)
712
  prot_clone_out = gr.Audio(label="Clone from protected (degraded)", interactive=False)
713
 
714
+ gr.Markdown('<h3 class="section-head">4. Protection Effectiveness Across Methods</h3>')
 
715
  sim_chart = gr.Plot(label="", show_label=False)
716
 
717
  gallery_outputs = [ref_out, target_out, clean_out, prot_ref_out,