Hector Li committed on
Commit
610c983
·
1 Parent(s): 05660dd

Fix UI sync, explicitly add examples/obama.spk.npy to git so HF Space sees it, and explicitly fall back to None reference

Browse files
Files changed (3) hide show
  1. app.py +8 -2
  2. examples/obama.spk.npy +0 -0
  3. ui_f5svc.py +73 -37
app.py CHANGED
@@ -39,10 +39,16 @@ def _download_models():
39
  try:
40
  hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/model_1200000.safetensors", local_dir=PROJECT_ROOT, token=token)
41
  hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/stage1_epoch_50.pt", local_dir=PROJECT_ROOT, token=token)
 
42
  hf_hub_download(repo_id=repo_id, filename="whisper_pretrain/large-v2.pt", local_dir=PROJECT_ROOT, token=token)
43
  hf_hub_download(repo_id=repo_id, filename="hubert_pretrain/hubert-soft-0d54a1f4.pt", local_dir=PROJECT_ROOT, token=token)
44
  hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/best_model.pth.tar", local_dir=PROJECT_ROOT, token=token)
45
  hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/config.json", local_dir=PROJECT_ROOT, token=token)
 
 
 
 
 
46
  snapshot_download(repo_id=repo_id, allow_patterns="examples/*", local_dir=PROJECT_ROOT, token=token)
47
  print("All downloads complete!")
48
  except Exception as e:
@@ -618,8 +624,8 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
618
  gr.Markdown("### Target Speaker")
619
  speaker_dd = gr.Dropdown(
620
  choices=_speaker_choices(),
621
- value=_speaker_choices()[0],
622
- label="Speaker (from data_svc/singer/)",
623
  )
624
  custom_spk = gr.File(
625
  label="Upload custom .spk.npy (overrides dropdown)",
 
39
  try:
40
  hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/model_1200000.safetensors", local_dir=PROJECT_ROOT, token=token)
41
  hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/stage1_epoch_50.pt", local_dir=PROJECT_ROOT, token=token)
42
+ hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/stage2_obama.pt", local_dir=PROJECT_ROOT, token=token)
43
  hf_hub_download(repo_id=repo_id, filename="whisper_pretrain/large-v2.pt", local_dir=PROJECT_ROOT, token=token)
44
  hf_hub_download(repo_id=repo_id, filename="hubert_pretrain/hubert-soft-0d54a1f4.pt", local_dir=PROJECT_ROOT, token=token)
45
  hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/best_model.pth.tar", local_dir=PROJECT_ROOT, token=token)
46
  hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/config.json", local_dir=PROJECT_ROOT, token=token)
47
+ try:
48
+ hf_hub_download(repo_id=repo_id, filename="examples/obama.spk.npy", local_dir=PROJECT_ROOT, token=token)
49
+ hf_hub_download(repo_id=repo_id, filename="examples/obama_ref.wav", local_dir=PROJECT_ROOT, token=token)
50
+ except Exception:
51
+ pass
52
  snapshot_download(repo_id=repo_id, allow_patterns="examples/*", local_dir=PROJECT_ROOT, token=token)
53
  print("All downloads complete!")
54
  except Exception as e:
 
624
  gr.Markdown("### Target Speaker")
625
  speaker_dd = gr.Dropdown(
626
  choices=_speaker_choices(),
627
+ value=_speaker_choices()[0] if _speaker_choices() else None,
628
+ label="Target Speaker",
629
  )
630
  custom_spk = gr.File(
631
  label="Upload custom .spk.npy (overrides dropdown)",
examples/obama.spk.npy ADDED
Binary file (1.15 kB). View file
 
ui_f5svc.py CHANGED
@@ -22,6 +22,8 @@ import torch
22
  import torchaudio.functional as TAF
23
  import gradio as gr
24
 
 
 
25
  PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
26
  INFER_ROOT = os.path.join(PROJECT_ROOT, "data_svc_infer_f5")
27
  CHKPT_DIR = os.path.join(PROJECT_ROOT, "chkpt_f5svc")
@@ -29,6 +31,7 @@ SINGER_DIR = os.path.join(PROJECT_ROOT, "data_svc", "singer")
29
  SAMPLE_RATE = 24000
30
 
31
 
 
32
  # ──────────────────────────────────────────────────────────────────────────────
33
  # Helpers
34
  # ──────────────────────────────────────────────────────────────────────────────
@@ -62,10 +65,24 @@ def _stage2_checkpoints():
62
 
63
  def _speaker_choices():
64
  choices = []
 
65
  if os.path.isdir(SINGER_DIR):
66
  for f in sorted(glob.glob(os.path.join(SINGER_DIR, "*.spk.npy"))):
67
  choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
68
- return choices if choices else ["(no speakers found)"]
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  def _resolve_stage1_path(choice: str) -> str | None:
@@ -293,7 +310,11 @@ def run_inference(
293
  spk_arr = np.load(custom_spk_file)
294
  push(f" Using uploaded speaker embedding.")
295
  elif speaker_choice and not speaker_choice.startswith("(no speakers"):
296
- spk_path = os.path.join(SINGER_DIR, f"{speaker_choice}.spk.npy")
 
 
 
 
297
  spk_arr = np.load(spk_path)
298
  push(f" Using speaker: {speaker_choice}")
299
  else:
@@ -410,19 +431,30 @@ def run_inference(
410
  # 3. Preprocess new speaker (streaming subprocess)
411
  # ──────────────────────────────────────────────────────────────────────────────
412
 
413
- def preprocess_speaker(audio_dir, speaker_id):
414
- if not os.path.isdir(audio_dir):
415
- yield f"Error: directory '{audio_dir}' not found."
 
 
 
416
  return
417
- wav_count = sum(
418
- 1 for _, _, fs in os.walk(audio_dir)
419
- for f in fs if f.lower().endswith(".wav")
420
- )
421
- if wav_count == 0:
422
- yield f"Error: no .wav files found under {audio_dir}."
 
 
 
 
 
 
 
423
  return
 
 
424
 
425
- log_lines = []
426
 
427
  def push(line):
428
  log_lines.append(line)
@@ -431,13 +463,13 @@ def preprocess_speaker(audio_dir, speaker_id):
431
  return "\n".join(log_lines)
432
 
433
  spk_id = speaker_id.strip() or "speaker"
434
- yield push(f"Preprocessing speaker '{spk_id}' from {audio_dir} ({wav_count} wavs)")
435
 
436
  # Wrap files into data_svc/waves-32k/<spk_id>/ if needed
437
  abs_audio = os.path.abspath(audio_dir)
438
  target_waves = os.path.join(PROJECT_ROOT, "data_svc", "waves-32k", spk_id)
439
  if abs_audio != os.path.abspath(target_waves):
440
- yield push(f"Note: pass audio already inside data_svc/waves-32k/{spk_id}/ for correct path resolution.")
441
 
442
  steps = [
443
  ("Speaker embeddings", [
@@ -463,7 +495,7 @@ def preprocess_speaker(audio_dir, speaker_id):
463
  ]
464
 
465
  for step_name, cmd in steps:
466
- yield push(f"\n--- {step_name} ---")
467
  process = subprocess.Popen(
468
  cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
469
  text=True, bufsize=1, cwd=PROJECT_ROOT,
@@ -472,19 +504,24 @@ def preprocess_speaker(audio_dir, speaker_id):
472
  for raw_line in process.stdout:
473
  line = raw_line.rstrip()
474
  if line:
475
- yield push(line)
476
  rc = process.wait()
477
  if rc != 0:
478
- yield push(f" Warning: exited with code {rc}")
479
 
480
- yield push(f"\nPreprocessing complete. Speaker embedding: data_svc/singer/{spk_id}.spk.npy")
 
 
 
 
481
 
482
 
483
  # ──────────────────────────────────────────────────────────────────────────────
484
  # 4. Stage 2 Training (streaming subprocess)
485
  # ──────────────────────────────────────────────────────────────────────────────
486
 
487
- def start_stage2(stage1_choice, speaker_id, audio_dir, epochs, lr, stage2_rank):
 
488
  log_lines = []
489
 
490
  def push(line):
@@ -550,17 +587,17 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
550
  with gr.Row():
551
  with gr.Column():
552
  gr.Markdown("### Source")
553
- audio_in = gr.Audio(label="Source Singing", type="filepath")
554
  ref_audio_in = gr.Audio(
555
- label="Reference Audio (target speaker timbre)",
556
  type="filepath",
557
  )
558
 
559
  gr.Markdown("### Target Speaker")
560
  speaker_dd = gr.Dropdown(
561
  choices=_speaker_choices(),
562
- value=_speaker_choices()[0],
563
- label="Speaker (from data_svc/singer/)",
564
  )
565
  custom_spk = gr.File(
566
  label="Upload custom .spk.npy (overrides dropdown)",
@@ -639,24 +676,26 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
639
  with gr.TabItem("2. Preprocess Speaker"):
640
  gr.Markdown(
641
  "Extract PPG / HuBERT / F0 / speaker embeddings for a new speaker.\n\n"
642
- "**Place wav files at** `data_svc/waves-32k/<speaker_id>/` before running."
643
  )
644
  with gr.Row():
645
- audio_dir_tb = gr.Textbox(
646
- value="./data_svc/waves-32k/obama",
647
- label="Audio directory (data_svc/waves-32k/<speaker_id>/)",
648
- )
649
  speaker_id_tb = gr.Textbox(
650
  value="obama",
651
  label="Speaker ID",
652
  )
 
 
 
 
653
  prep_btn = gr.Button("Run Preprocessing", variant="primary")
654
- prep_log = gr.Textbox(label="Log", lines=18, interactive=False)
 
 
655
 
656
  prep_btn.click(
657
  fn=preprocess_speaker,
658
- inputs=[audio_dir_tb, speaker_id_tb],
659
- outputs=[prep_log],
660
  queue=True,
661
  )
662
 
@@ -674,10 +713,7 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
674
  label="Base Stage 1 Checkpoint",
675
  )
676
  s2_spk_tb = gr.Textbox(value="obama", label="Speaker ID")
677
- s2_audio_tb = gr.Textbox(
678
- value="./data_svc/waves-32k/obama",
679
- label="Speaker Audio Directory",
680
- )
681
  with gr.Row():
682
  s2_epoch_sl = gr.Slider(10, 200, step=5, value=50, label="Epochs")
683
  s2_lr_num = gr.Number(value=5e-5, label="Learning Rate")
@@ -688,7 +724,7 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
688
 
689
  s2_btn.click(
690
  fn=start_stage2,
691
- inputs=[s2_stage1_dd, s2_spk_tb, s2_audio_tb, s2_epoch_sl, s2_lr_num, s2_rank_sl],
692
  outputs=[s2_log],
693
  queue=True,
694
  )
@@ -696,4 +732,4 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
696
 
697
  if __name__ == "__main__":
698
  print("Starting F5-SVC Web Interface...")
699
- app.queue().launch(server_name="0.0.0.0", server_port=7861, share=False)
 
22
  import torchaudio.functional as TAF
23
  import gradio as gr
24
 
25
+ import os
26
+
27
  PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
28
  INFER_ROOT = os.path.join(PROJECT_ROOT, "data_svc_infer_f5")
29
  CHKPT_DIR = os.path.join(PROJECT_ROOT, "chkpt_f5svc")
 
31
  SAMPLE_RATE = 24000
32
 
33
 
34
+
35
  # ──────────────────────────────────────────────────────────────────────────────
36
  # Helpers
37
  # ──────────────────────────────────────────────────────────────────────────────
 
65
 
66
  def _speaker_choices():
67
  choices = []
68
+ # 1. Custom preprocessed speakers
69
  if os.path.isdir(SINGER_DIR):
70
  for f in sorted(glob.glob(os.path.join(SINGER_DIR, "*.spk.npy"))):
71
  choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
72
+
73
+ # 2. Example speakers
74
+ examples = os.path.join(PROJECT_ROOT, "examples")
75
+ if os.path.isdir(examples):
76
+ for f in sorted(glob.glob(os.path.join(examples, "*.spk.npy"))):
77
+ choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
78
+
79
+ # 3. OpenSinger Open Speakers
80
+ opensinger = os.path.join(examples, "opensinger")
81
+ if os.path.isdir(opensinger):
82
+ for f in sorted(glob.glob(os.path.join(opensinger, "*.spk.npy"))):
83
+ choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
84
+
85
+ return list(set(choices)) if choices else ["(no speakers found)"]
86
 
87
 
88
  def _resolve_stage1_path(choice: str) -> str | None:
 
310
  spk_arr = np.load(custom_spk_file)
311
  push(f" Using uploaded speaker embedding.")
312
  elif speaker_choice and not speaker_choice.startswith("(no speakers"):
313
+ p1 = os.path.join(SINGER_DIR, f"{speaker_choice}.spk.npy")
314
+ p2 = os.path.join(PROJECT_ROOT, "examples", f"{speaker_choice}.spk.npy")
315
+ p3 = os.path.join(PROJECT_ROOT, "examples", "opensinger", f"{speaker_choice}.spk.npy")
316
+
317
+ spk_path = p1 if os.path.exists(p1) else (p2 if os.path.exists(p2) else p3)
318
  spk_arr = np.load(spk_path)
319
  push(f" Using speaker: {speaker_choice}")
320
  else:
 
431
  # 3. Preprocess new speaker (streaming subprocess)
432
  # ──────────────────────────────────────────────────────────────────────────────
433
 
434
+ def preprocess_speaker(speaker_id, uploaded_files):
435
+ log_lines = []
436
+
437
+ spk_id = speaker_id.strip() or "speaker"
438
+ if not uploaded_files:
439
+ yield "Error: Please upload target audio files.", None
440
  return
441
+
442
+ import shutil
443
+ target_waves = os.path.join(PROJECT_ROOT, "data_svc", "waves-32k", spk_id)
444
+ os.makedirs(target_waves, exist_ok=True)
445
+
446
+ count = 0
447
+ for f in uploaded_files:
448
+ if f.endswith(".wav"):
449
+ shutil.copy(f, target_waves)
450
+ count += 1
451
+
452
+ if count == 0:
453
+ yield "Error: No .wav files found in upload.", None
454
  return
455
+
456
+ audio_dir = target_waves
457
 
 
458
 
459
  def push(line):
460
  log_lines.append(line)
 
463
  return "\n".join(log_lines)
464
 
465
  spk_id = speaker_id.strip() or "speaker"
466
+ yield push(f"Preprocessing speaker '{spk_id}' from {audio_dir} ({count} wavs)"), None
467
 
468
  # Wrap files into data_svc/waves-32k/<spk_id>/ if needed
469
  abs_audio = os.path.abspath(audio_dir)
470
  target_waves = os.path.join(PROJECT_ROOT, "data_svc", "waves-32k", spk_id)
471
  if abs_audio != os.path.abspath(target_waves):
472
+ yield push(f"Note: pass audio already inside data_svc/waves-32k/{spk_id}/ for correct path resolution."), None
473
 
474
  steps = [
475
  ("Speaker embeddings", [
 
495
  ]
496
 
497
  for step_name, cmd in steps:
498
+ yield push(f"\n--- {step_name} ---"), None
499
  process = subprocess.Popen(
500
  cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
501
  text=True, bufsize=1, cwd=PROJECT_ROOT,
 
504
  for raw_line in process.stdout:
505
  line = raw_line.rstrip()
506
  if line:
507
+ yield push(line), None
508
  rc = process.wait()
509
  if rc != 0:
510
+ yield push(f" Warning: exited with code {rc}"), None
511
 
512
+ final_spk = os.path.join(PROJECT_ROOT, "data_svc", "singer", f"{spk_id}.spk.npy")
513
+ if os.path.exists(final_spk):
514
+ yield push(f"\nPreprocessing complete. Speaker embedding: {final_spk}"), final_spk
515
+ else:
516
+ yield push(f"\nPreprocessing failed. Embedding not found."), None
517
 
518
 
519
  # ──────────────────────────────────────────────────────────────────────────────
520
  # 4. Stage 2 Training (streaming subprocess)
521
  # ──────────────────────────────────────────────────────────────────────────────
522
 
523
+ def start_stage2(stage1_choice, speaker_id, epochs, lr, stage2_rank):
524
+ audio_dir = os.path.join(PROJECT_ROOT, 'data_svc', 'waves-32k', speaker_id.strip())
525
  log_lines = []
526
 
527
  def push(line):
 
587
  with gr.Row():
588
  with gr.Column():
589
  gr.Markdown("### Source")
590
+ audio_in = gr.Audio(label="Source Audio (No accompaniment, ~5-15s)", type="filepath")
591
  ref_audio_in = gr.Audio(
592
+ label="Reference Audio (target speaker timbre, leave blank for zero-ref)",
593
  type="filepath",
594
  )
595
 
596
  gr.Markdown("### Target Speaker")
597
  speaker_dd = gr.Dropdown(
598
  choices=_speaker_choices(),
599
+ value=_speaker_choices()[0] if _speaker_choices() else None,
600
+ label="Target Speaker",
601
  )
602
  custom_spk = gr.File(
603
  label="Upload custom .spk.npy (overrides dropdown)",
 
676
  with gr.TabItem("2. Preprocess Speaker"):
677
  gr.Markdown(
678
  "Extract PPG / HuBERT / F0 / speaker embeddings for a new speaker.\n\n"
679
+ "**Upload your target speaker .wav files below.**"
680
  )
681
  with gr.Row():
 
 
 
 
682
  speaker_id_tb = gr.Textbox(
683
  value="obama",
684
  label="Speaker ID",
685
  )
686
+ upload_wavs = gr.File(
687
+ label="Upload Target Audio (.wav)",
688
+ file_count="multiple",
689
+ )
690
  prep_btn = gr.Button("Run Preprocessing", variant="primary")
691
+ with gr.Row():
692
+ prep_log = gr.Textbox(label="Log", lines=18, interactive=False)
693
+ prep_spk_out = gr.File(label="Generated Speaker Embedding (.spk.npy)")
694
 
695
  prep_btn.click(
696
  fn=preprocess_speaker,
697
+ inputs=[speaker_id_tb, upload_wavs],
698
+ outputs=[prep_log, prep_spk_out],
699
  queue=True,
700
  )
701
 
 
713
  label="Base Stage 1 Checkpoint",
714
  )
715
  s2_spk_tb = gr.Textbox(value="obama", label="Speaker ID")
716
+
 
 
 
717
  with gr.Row():
718
  s2_epoch_sl = gr.Slider(10, 200, step=5, value=50, label="Epochs")
719
  s2_lr_num = gr.Number(value=5e-5, label="Learning Rate")
 
724
 
725
  s2_btn.click(
726
  fn=start_stage2,
727
+ inputs=[s2_stage1_dd, s2_spk_tb, s2_epoch_sl, s2_lr_num, s2_rank_sl],
728
  outputs=[s2_log],
729
  queue=True,
730
  )
 
732
 
733
  if __name__ == "__main__":
734
  print("Starting F5-SVC Web Interface...")
735
+ app.queue().launch(server_name="0.0.0.0", server_port=7860, share=False)