Spaces:

mvp-lab
/

cfm_svc

Running on Zero

App Files Files Community

Hector Li committed on Mar 7

Commit

610c983

1 Parent(s): 05660dd

Fix UI sync, explicitly add examples/obama.spk.npy to git so the HF Space sees it, and explicitly fall back to a None reference

Browse files

Files changed (3) hide show

app.py +8 -2
examples/obama.spk.npy +0 -0
ui_f5svc.py +73 -37

app.py CHANGED Viewed

@@ -39,10 +39,16 @@ def _download_models():
     try:
         hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/model_1200000.safetensors", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/stage1_epoch_50.pt", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="whisper_pretrain/large-v2.pt", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="hubert_pretrain/hubert-soft-0d54a1f4.pt", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/best_model.pth.tar", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/config.json", local_dir=PROJECT_ROOT, token=token)
         snapshot_download(repo_id=repo_id, allow_patterns="examples/*", local_dir=PROJECT_ROOT, token=token)
         print("All downloads complete!")
     except Exception as e:
@@ -618,8 +624,8 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
                     gr.Markdown("### Target Speaker")
                     speaker_dd = gr.Dropdown(
                         choices=_speaker_choices(),
-                        value=_speaker_choices()[0],
-                        label="Speaker (from data_svc/singer/)",
                     )
                     custom_spk = gr.File(
                         label="Upload custom .spk.npy (overrides dropdown)",

     try:
         hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/model_1200000.safetensors", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/stage1_epoch_50.pt", local_dir=PROJECT_ROOT, token=token)
+        hf_hub_download(repo_id=repo_id, filename="chkpt_f5svc/stage2_obama.pt", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="whisper_pretrain/large-v2.pt", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="hubert_pretrain/hubert-soft-0d54a1f4.pt", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/best_model.pth.tar", local_dir=PROJECT_ROOT, token=token)
         hf_hub_download(repo_id=repo_id, filename="speaker_pretrain/config.json", local_dir=PROJECT_ROOT, token=token)
+        try:
+            hf_hub_download(repo_id=repo_id, filename="examples/obama.spk.npy", local_dir=PROJECT_ROOT, token=token)
+            hf_hub_download(repo_id=repo_id, filename="examples/obama_ref.wav", local_dir=PROJECT_ROOT, token=token)
+        except Exception:
+            pass
         snapshot_download(repo_id=repo_id, allow_patterns="examples/*", local_dir=PROJECT_ROOT, token=token)
         print("All downloads complete!")
     except Exception as e:
                     gr.Markdown("### Target Speaker")
                     speaker_dd = gr.Dropdown(
                         choices=_speaker_choices(),
+                        value=_speaker_choices()[0] if _speaker_choices() else None,
+                        label="Target Speaker",
                     )
                     custom_spk = gr.File(
                         label="Upload custom .spk.npy (overrides dropdown)",

examples/obama.spk.npy ADDED Viewed

Binary file (1.15 kB). View file

ui_f5svc.py CHANGED Viewed

@@ -22,6 +22,8 @@ import torch
 import torchaudio.functional as TAF
 import gradio as gr
 PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
 INFER_ROOT   = os.path.join(PROJECT_ROOT, "data_svc_infer_f5")
 CHKPT_DIR    = os.path.join(PROJECT_ROOT, "chkpt_f5svc")
@@ -29,6 +31,7 @@ SINGER_DIR   = os.path.join(PROJECT_ROOT, "data_svc", "singer")
 SAMPLE_RATE  = 24000
 # ──────────────────────────────────────────────────────────────────────────────
 # Helpers
 # ──────────────────────────────────────────────────────────────────────────────
@@ -62,10 +65,24 @@ def _stage2_checkpoints():
 def _speaker_choices():
     choices = []
     if os.path.isdir(SINGER_DIR):
         for f in sorted(glob.glob(os.path.join(SINGER_DIR, "*.spk.npy"))):
             choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
-    return choices if choices else ["(no speakers found)"]
 def _resolve_stage1_path(choice: str) -> str | None:
@@ -293,7 +310,11 @@ def run_inference(
             spk_arr = np.load(custom_spk_file)
             push(f"  Using uploaded speaker embedding.")
         elif speaker_choice and not speaker_choice.startswith("(no speakers"):
-            spk_path = os.path.join(SINGER_DIR, f"{speaker_choice}.spk.npy")
             spk_arr = np.load(spk_path)
             push(f"  Using speaker: {speaker_choice}")
         else:
@@ -410,19 +431,30 @@ def run_inference(
 # 3. Preprocess new speaker (streaming subprocess)
 # ──────────────────────────────────────────────────────────────────────────────
-def preprocess_speaker(audio_dir, speaker_id):
-    if not os.path.isdir(audio_dir):
-        yield f"Error: directory '{audio_dir}' not found."
         return
-    wav_count = sum(
-        1 for _, _, fs in os.walk(audio_dir)
-        for f in fs if f.lower().endswith(".wav")
-    )
-    if wav_count == 0:
-        yield f"Error: no .wav files found under {audio_dir}."
         return
-    log_lines = []
     def push(line):
         log_lines.append(line)
@@ -431,13 +463,13 @@ def preprocess_speaker(audio_dir, speaker_id):
         return "\n".join(log_lines)
     spk_id = speaker_id.strip() or "speaker"
-    yield push(f"Preprocessing speaker '{spk_id}' from {audio_dir} ({wav_count} wavs)")
     # Wrap files into data_svc/waves-32k/<spk_id>/ if needed
     abs_audio = os.path.abspath(audio_dir)
     target_waves = os.path.join(PROJECT_ROOT, "data_svc", "waves-32k", spk_id)
     if abs_audio != os.path.abspath(target_waves):
-        yield push(f"Note: pass audio already inside data_svc/waves-32k/{spk_id}/ for correct path resolution.")
     steps = [
         ("Speaker embeddings", [
@@ -463,7 +495,7 @@ def preprocess_speaker(audio_dir, speaker_id):
     ]
     for step_name, cmd in steps:
-        yield push(f"\n--- {step_name} ---")
         process = subprocess.Popen(
             cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
             text=True, bufsize=1, cwd=PROJECT_ROOT,
@@ -472,19 +504,24 @@ def preprocess_speaker(audio_dir, speaker_id):
         for raw_line in process.stdout:
             line = raw_line.rstrip()
             if line:
-                yield push(line)
         rc = process.wait()
         if rc != 0:
-            yield push(f"  Warning: exited with code {rc}")
-    yield push(f"\nPreprocessing complete. Speaker embedding: data_svc/singer/{spk_id}.spk.npy")
 # ──────────────────────────────────────────────────────────────────────────────
 # 4. Stage 2 Training (streaming subprocess)
 # ──────────────────────────────────────────────────────────────────────────────
-def start_stage2(stage1_choice, speaker_id, audio_dir, epochs, lr, stage2_rank):
     log_lines = []
     def push(line):
@@ -550,17 +587,17 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
             with gr.Row():
                 with gr.Column():
                     gr.Markdown("### Source")
-                    audio_in = gr.Audio(label="Source Singing", type="filepath")
                     ref_audio_in = gr.Audio(
-                        label="Reference Audio (target speaker timbre)",
                         type="filepath",
                     )
                     gr.Markdown("### Target Speaker")
                     speaker_dd = gr.Dropdown(
                         choices=_speaker_choices(),
-                        value=_speaker_choices()[0],
-                        label="Speaker (from data_svc/singer/)",
                     )
                     custom_spk = gr.File(
                         label="Upload custom .spk.npy (overrides dropdown)",
@@ -639,24 +676,26 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
         with gr.TabItem("2. Preprocess Speaker"):
             gr.Markdown(
                 "Extract PPG / HuBERT / F0 / speaker embeddings for a new speaker.\n\n"
-                "**Place wav files at** `data_svc/waves-32k/<speaker_id>/` before running."
             )
             with gr.Row():
-                audio_dir_tb = gr.Textbox(
-                    value="./data_svc/waves-32k/obama",
-                    label="Audio directory (data_svc/waves-32k/<speaker_id>/)",
-                )
                 speaker_id_tb = gr.Textbox(
                     value="obama",
                     label="Speaker ID",
                 )
             prep_btn = gr.Button("Run Preprocessing", variant="primary")
-            prep_log = gr.Textbox(label="Log", lines=18, interactive=False)
             prep_btn.click(
                 fn=preprocess_speaker,
-                inputs=[audio_dir_tb, speaker_id_tb],
-                outputs=[prep_log],
                 queue=True,
             )
@@ -674,10 +713,7 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
                     label="Base Stage 1 Checkpoint",
                 )
                 s2_spk_tb = gr.Textbox(value="obama", label="Speaker ID")
-            s2_audio_tb = gr.Textbox(
-                value="./data_svc/waves-32k/obama",
-                label="Speaker Audio Directory",
-            )
             with gr.Row():
                 s2_epoch_sl = gr.Slider(10, 200, step=5,  value=50,   label="Epochs")
                 s2_lr_num   = gr.Number(value=5e-5,  label="Learning Rate")
@@ -688,7 +724,7 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
             s2_btn.click(
                 fn=start_stage2,
-                inputs=[s2_stage1_dd, s2_spk_tb, s2_audio_tb, s2_epoch_sl, s2_lr_num, s2_rank_sl],
                 outputs=[s2_log],
                 queue=True,
             )
@@ -696,4 +732,4 @@ with gr.Blocks(title="F5-SVC", theme=gr.themes.Soft()) as app:
 if __name__ == "__main__":
     print("Starting F5-SVC Web Interface...")
-    app.queue().launch(server_name="0.0.0.0", server_port=7861, share=False)

 import torchaudio.functional as TAF
 import gradio as gr
+import os
 PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
 INFER_ROOT   = os.path.join(PROJECT_ROOT, "data_svc_infer_f5")
 CHKPT_DIR    = os.path.join(PROJECT_ROOT, "chkpt_f5svc")
 SAMPLE_RATE  = 24000
 # ──────────────────────────────────────────────────────────────────────────────
 # Helpers
 # ──────────────────────────────────────────────────────────────────────────────
 def _speaker_choices():
     choices = []
+    # 1. Custom preprocessed speakers
     if os.path.isdir(SINGER_DIR):
         for f in sorted(glob.glob(os.path.join(SINGER_DIR, "*.spk.npy"))):
             choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
+    # 2. Example speakers
+    examples = os.path.join(PROJECT_ROOT, "examples")
+    if os.path.isdir(examples):
+        for f in sorted(glob.glob(os.path.join(examples, "*.spk.npy"))):
+            choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
+    # 3. OpenSinger Open Speakers
+    opensinger = os.path.join(examples, "opensinger")
+    if os.path.isdir(opensinger):
+        for f in sorted(glob.glob(os.path.join(opensinger, "*.spk.npy"))):
+            choices.append(os.path.splitext(os.path.basename(f).replace(".spk", ""))[0])
+    return list(set(choices)) if choices else ["(no speakers found)"]
 def _resolve_stage1_path(choice: str) -> str | None:
             spk_arr = np.load(custom_spk_file)
             push(f"  Using uploaded speaker embedding.")
         elif speaker_choice and not speaker_choice.startswith("(no speakers"):
+            p1 = os.path.join(SINGER_DIR, f"{speaker_choice}.spk.npy")
+            p2 = os.path.join(PROJECT_ROOT, "examples", f"{speaker_choice}.spk.npy")
+            p3 = os.path.join(PROJECT_ROOT, "examples", "opensinger", f"{speaker_choice}.spk.npy")
+            spk_path = p1 if os.path.exists(p1) else (p2 if os.path.exists(p2) else p3)
             spk_arr = np.load(spk_path)
             push(f"  Using speaker: {speaker_choice}")
         else:
 # 3. Preprocess new speaker (streaming subprocess)
 # ──────────────────────────────────────────────────────────────────────────────
+def preprocess_speaker(speaker_id, uploaded_files):
+    log_lines = []
+    spk_id = speaker_id.strip() or "speaker"
+    if not uploaded_files:
+        yield "Error: Please upload target audio files.", None
         return
+    import shutil
+    target_waves = os.path.join(PROJECT_ROOT, "data_svc", "waves-32k", spk_id)
+    os.makedirs(target_waves, exist_ok=True)
+    count = 0
+    for f in uploaded_files:
+        if f.endswith(".wav"):
+            shutil.copy(f, target_waves)
+            count += 1
+    if count == 0:
+        yield "Error: No .wav files found in upload.", None
         return
+    audio_dir = target_waves
     def push(line):
         log_lines.append(line)
         return "\n".join(log_lines)
     spk_id = speaker_id.strip() or "speaker"
+    yield push(f"Preprocessing speaker '{spk_id}' from {audio_dir} ({count} wavs)"), None
     # Wrap files into data_svc/waves-32k/<spk_id>/ if needed
     abs_audio = os.path.abspath(audio_dir)
     target_waves = os.path.join(PROJECT_ROOT, "data_svc", "waves-32k", spk_id)
     if abs_audio != os.path.abspath(target_waves):
+        yield push(f"Note: pass audio already inside data_svc/waves-32k/{spk_id}/ for correct path resolution."), None
     steps = [
         ("Speaker embeddings", [
     ]
     for step_name, cmd in steps:
+        yield push(f"\n--- {step_name} ---"), None
         process = subprocess.Popen(
             cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
             text=True, bufsize=1, cwd=PROJECT_ROOT,
         for raw_line in process.stdout:
             line = raw_line.rstrip()
             if line:
+                yield push(line), None
         rc = process.wait()
         if rc != 0:
+            yield push(f"  Warning: exited with code {rc}"), None
+    final_spk = os.path.join(PROJECT_ROOT, "data_svc", "singer", f"{spk_id}.spk.npy")
+    if os.path.exists(final_spk):
+        yield push(f"\nPreprocessing complete. Speaker embedding: {final_spk}"), final_spk
+    else:
+        yield push(f"\nPreprocessing failed. Embedding not found."), None
 # ──────────────────────────────────────────────────────────────────────────────
 # 4. Stage 2 Training (streaming subprocess)
 # ──────────────────────────────────────────────────────────────────────────────
+def start_stage2(stage1_choice, speaker_id, epochs, lr, stage2_rank):
+    audio_dir = os.path.join(PROJECT_ROOT, 'data_svc', 'waves-32k', speaker_id.strip())
     log_lines = []
     def push(line):
             with gr.Row():
                 with gr.Column():
                     gr.Markdown("### Source")
+                    audio_in = gr.Audio(label="Source Audio (No accompaniment, ~5-15s)", type="filepath")
                     ref_audio_in = gr.Audio(
+                        label="Reference Audio (target speaker timbre, leave blank for zero-ref)",
                         type="filepath",
                     )
                     gr.Markdown("### Target Speaker")
                     speaker_dd = gr.Dropdown(
                         choices=_speaker_choices(),
+                        value=_speaker_choices()[0] if _speaker_choices() else None,
+                        label="Target Speaker",
                     )
                     custom_spk = gr.File(
                         label="Upload custom .spk.npy (overrides dropdown)",
         with gr.TabItem("2. Preprocess Speaker"):
             gr.Markdown(
                 "Extract PPG / HuBERT / F0 / speaker embeddings for a new speaker.\n\n"
+                "**Upload your target speaker .wav files below.**"
             )
             with gr.Row():
                 speaker_id_tb = gr.Textbox(
                     value="obama",
                     label="Speaker ID",
                 )
+                upload_wavs = gr.File(
+                    label="Upload Target Audio (.wav)",
+                    file_count="multiple",
+                )
             prep_btn = gr.Button("Run Preprocessing", variant="primary")
+            with gr.Row():
+                prep_log = gr.Textbox(label="Log", lines=18, interactive=False)
+                prep_spk_out = gr.File(label="Generated Speaker Embedding (.spk.npy)")
             prep_btn.click(
                 fn=preprocess_speaker,
+                inputs=[speaker_id_tb, upload_wavs],
+                outputs=[prep_log, prep_spk_out],
                 queue=True,
             )
                     label="Base Stage 1 Checkpoint",
                 )
                 s2_spk_tb = gr.Textbox(value="obama", label="Speaker ID")
             with gr.Row():
                 s2_epoch_sl = gr.Slider(10, 200, step=5,  value=50,   label="Epochs")
                 s2_lr_num   = gr.Number(value=5e-5,  label="Learning Rate")
             s2_btn.click(
                 fn=start_stage2,
+                inputs=[s2_stage1_dd, s2_spk_tb, s2_epoch_sl, s2_lr_num, s2_rank_sl],
                 outputs=[s2_log],
                 queue=True,
             )
 if __name__ == "__main__":
     print("Starting F5-SVC Web Interface...")
+    app.queue().launch(server_name="0.0.0.0", server_port=7860, share=False)