kokole committed on
Commit
2566adf
·
1 Parent(s): a4b297a

feat: add svc inference code and webui

Browse files
.gitattributes CHANGED
@@ -44,3 +44,5 @@ raven.wav filter=lfs diff=lfs merge=lfs -text
44
  anita.wav filter=lfs diff=lfs merge=lfs -text
45
  everybody_loves.wav filter=lfs diff=lfs merge=lfs -text
46
  obama.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
44
  anita.wav filter=lfs diff=lfs merge=lfs -text
45
  everybody_loves.wav filter=lfs diff=lfs merge=lfs -text
46
  obama.wav filter=lfs diff=lfs merge=lfs -text
47
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
48
+ *.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -18,9 +18,39 @@ if __name__ == "__main__":
18
  os.chdir(ROOT)
19
  ensure_pretrained_models()
20
 
21
- from webui import render_interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- page = render_interface()
24
  page.queue()
25
  page.launch(
26
  server_name="0.0.0.0",
 
18
  os.chdir(ROOT)
19
  ensure_pretrained_models()
20
 
21
+ import gradio as gr
22
+ from webui import render_tab_content as render_svs_tab
23
+ from webui_svc import render_tab_content as render_svc_tab
24
+
25
+ with gr.Blocks(title="SoulX-Singer", theme=gr.themes.Default()) as page:
26
+ gr.HTML(
27
+ '<div style="'
28
+ 'text-align: center; '
29
+ 'padding: 1.25rem 0 1.5rem; '
30
+ 'margin-bottom: 0.5rem;'
31
+ '">'
32
+ '<div style="'
33
+ 'display: inline-block; '
34
+ 'font-size: 1.75rem; '
35
+ 'font-weight: 700; '
36
+ 'letter-spacing: 0.02em; '
37
+ 'line-height: 1.3;'
38
+ '">SoulX-Singer</div>'
39
+ '<div style="'
40
+ 'width: 80px; '
41
+ 'height: 3px; '
42
+ 'margin: 1rem auto 0; '
43
+ 'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
44
+ 'border-radius: 2px;'
45
+ '"></div>'
46
+ '</div>'
47
+ )
48
+ with gr.Tabs():
49
+ with gr.Tab("Singing Voice Synthesis"):
50
+ render_svs_tab()
51
+ with gr.Tab("Singing Voice Conversion"):
52
+ render_svc_tab()
53
 
 
54
  page.queue()
55
  page.launch(
56
  server_name="0.0.0.0",
cli/inference_svc.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import json
4
+ import argparse
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from collections import OrderedDict
9
+ from omegaconf import DictConfig
10
+
11
+ from soulxsinger.utils.file_utils import load_config
12
+ from soulxsinger.models.soulxsinger_svc import SoulXSingerSVC
13
+ from soulxsinger.utils.audio_utils import load_wav
14
+
15
+
16
def build_model(
    model_path: str,
    config: "DictConfig",
    device: str = "cuda",
):
    """
    Build the SVC model from a pre-trained checkpoint and model configuration.

    Args:
        model_path (str): Path to the checkpoint file.
        config (DictConfig): Model configuration.
        device (str, optional): Device to use. Defaults to "cuda".

    Returns:
        torch.nn.Module: The initialized SoulXSingerSVC model, in eval mode
        on `device`.

    Raises:
        FileNotFoundError: If `model_path` does not exist.
        KeyError: If the checkpoint does not contain a 'state_dict' entry.
    """
    if not os.path.isfile(model_path):
        raise FileNotFoundError(
            f"Model checkpoint not found: {model_path}. "
            "Please download the pretrained model and place it at the path, or set --model_path."
        )

    model = SoulXSingerSVC(config).to(device)
    print("Model initialized.")
    print("Model parameters:", sum(p.numel() for p in model.parameters()) / 1e6, "M")

    # weights_only=False: checkpoint may contain non-tensor metadata.
    checkpoint = torch.load(model_path, weights_only=False, map_location=device)
    if "state_dict" not in checkpoint:
        raise KeyError(
            f"Checkpoint at {model_path} has no 'state_dict' key. "
            "Expected a checkpoint saved with model.state_dict()."
        )
    model.load_state_dict(checkpoint["state_dict"], strict=True)

    # Model was already moved to `device` above; only switch to eval mode here.
    model.eval()
    print("Model checkpoint loaded.")

    return model
55
+
56
+
57
def process(args, config, model: torch.nn.Module):
    """Run the full SVC inference pipeline for one prompt/target pair.

    Loads the prompt (reference timbre) and target (melody/lyrics) waveforms
    plus their pre-extracted F0 contours, runs model.infer, and writes the
    converted audio to `<save_dir>/generated.wav`.

    Args:
        args: Parsed CLI namespace (paths, device, pitch options).
        config: Model/audio configuration (sample rate, inference defaults).
        model: An initialized SoulXSingerSVC model in eval mode.
    """
    os.makedirs(args.save_dir, exist_ok=True)

    # Waveforms are resampled to the model's rate by load_wav; F0 contours are
    # pre-extracted .npy files, given a leading batch dimension here.
    pt_wav = load_wav(args.prompt_wav_path, config.audio.sample_rate).to(args.device)
    gt_wav = load_wav(args.target_wav_path, config.audio.sample_rate).to(args.device)
    pt_f0 = torch.from_numpy(np.load(args.prompt_f0_path)).unsqueeze(0).to(args.device)
    gt_f0 = torch.from_numpy(np.load(args.target_f0_path)).unsqueeze(0).to(args.device)

    # CLI values win; fall back to config defaults when the attribute is absent.
    n_step = getattr(args, "n_steps", config.infer.n_steps)
    cfg = getattr(args, "cfg", config.infer.cfg)

    generated_audio, generated_shift = model.infer(
        pt_wav,
        gt_wav,
        pt_f0,
        gt_f0,
        auto_shift=args.auto_shift,
        pitch_shift=args.pitch_shift,
        n_steps=n_step,
        cfg=cfg,
    )
    generated_audio = generated_audio.squeeze().cpu().numpy()

    # Record the shift the model actually applied (auto_shift may change it).
    if args.pitch_shift != generated_shift:
        args.pitch_shift = generated_shift

    out_path = os.path.join(args.save_dir, "generated.wav")
    sf.write(out_path, generated_audio, config.audio.sample_rate)
    print(f"Generated audio saved to {out_path}")
78
+
79
+
80
def main(args, config):
    """Entry point: construct the SVC model, then run the inference pipeline."""
    svc_model = build_model(
        model_path=args.model_path,
        config=config,
        device=args.device,
    )
    process(args, config, svc_model)
87
+
88
if __name__ == "__main__":
    # Command-line interface for one-shot singing voice conversion.
    cli = argparse.ArgumentParser()

    # Runtime / model locations.
    cli.add_argument("--device", type=str, default="cuda")
    cli.add_argument("--model_path", type=str, default='pretrained_models/soulx-singer/model.pt')
    cli.add_argument("--config", type=str, default='soulxsinger/config/soulxsinger.yaml')

    # Input audio and pre-extracted F0 contours.
    cli.add_argument("--prompt_wav_path", type=str, default='example/audio/zh_prompt.wav')
    cli.add_argument("--target_wav_path", type=str, default='example/audio/zh_target.wav')
    cli.add_argument("--prompt_f0_path", type=str, default='example/audio/zh_prompt_f0.npy')
    cli.add_argument("--target_f0_path", type=str, default='example/audio/zh_target_f0.npy')
    cli.add_argument("--save_dir", type=str, default='outputs')

    # Pitch handling and sampling controls.
    cli.add_argument("--auto_shift", action="store_true")
    cli.add_argument("--pitch_shift", type=int, default=0)
    cli.add_argument("--n_steps", type=int, default=32)
    cli.add_argument("--cfg", type=float, default=3.0)

    parsed_args = cli.parse_args()
    loaded_config = load_config(parsed_args.config)
    main(parsed_args, loaded_config)
ensure_models.py CHANGED
@@ -10,7 +10,7 @@ MODEL_DIR_PREPROCESS = PRETRAINED_DIR / "SoulX-Singer-Preprocess"
10
 
11
  def ensure_pretrained_models():
12
  """Download SoulX-Singer and Preprocess models from Hugging Face Hub if not present."""
13
- if (MODEL_DIR_SVS / "model.pt").exists() and MODEL_DIR_PREPROCESS.exists():
14
  print("Pretrained models already present, skipping download.", flush=True)
15
  return
16
 
@@ -26,7 +26,7 @@ def ensure_pretrained_models():
26
 
27
  PRETRAINED_DIR.mkdir(parents=True, exist_ok=True)
28
 
29
- if not (MODEL_DIR_SVS / "model.pt").exists():
30
  print("Downloading SoulX-Singer model...", flush=True)
31
  snapshot_download(
32
  repo_id="Soul-AILab/SoulX-Singer",
 
10
 
11
  def ensure_pretrained_models():
12
  """Download SoulX-Singer and Preprocess models from Hugging Face Hub if not present."""
13
+ if (MODEL_DIR_SVS / "model.pt").exists() and (MODEL_DIR_SVS / "model-svc.pt").exists() and MODEL_DIR_PREPROCESS.exists():
14
  print("Pretrained models already present, skipping download.", flush=True)
15
  return
16
 
 
26
 
27
  PRETRAINED_DIR.mkdir(parents=True, exist_ok=True)
28
 
29
+ if not (MODEL_DIR_SVS / "model.pt").exists() or not (MODEL_DIR_SVS / "model-svc.pt").exists():
30
  print("Downloading SoulX-Singer model...", flush=True)
31
  snapshot_download(
32
  repo_id="Soul-AILab/SoulX-Singer",
example/audio/en_prompt.mp3 CHANGED
Binary files a/example/audio/en_prompt.mp3 and b/example/audio/en_prompt.mp3 differ
 
example/audio/en_target.mp3 CHANGED
Binary files a/example/audio/en_target.mp3 and b/example/audio/en_target.mp3 differ
 
example/audio/music_f0.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a091dce0ab269093a455f8959222f8c7fb55e8d9c9477e8cd2cde8eb9279d9ef
3
+ size 20720
example/audio/svc_prompt_demo.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dde83f7ff5ef5ad52939db70bd1324b6247ea4f399e60e0393cc18725cf29c3
3
+ size 41187
example/audio/svc_target_demo.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65c2e5fec64a51c613badcce35145b6f8e2bb33907ee7428275bfb918876a2c
3
+ size 1944155
example/audio/svc_webui/I'm Yours.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65c2e5fec64a51c613badcce35145b6f8e2bb33907ee7428275bfb918876a2c
3
+ size 1944155
example/audio/svc_webui/Sun Yanzi.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dde83f7ff5ef5ad52939db70bd1324b6247ea4f399e60e0393cc18725cf29c3
3
+ size 41187
example/audio/svc_webui/传奇.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dfec7ebb41dd6c56877fdeddf7a5fdc106ea9c2fdb1c06f6adddc6f89e6285e
3
+ size 4738948
example/audio/svc_webui/君が好きだと叫びたい.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fe990727559bf1ffb548c562b6c3b19f16602e3c147da42bf56fc92129ae35e
3
+ size 3706589
example/audio/svc_webui/富士山下.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1cad7eaabe05f1c6ef1994bf4326cdafc79991e1c647a857fa2f64925e84aab
3
+ size 4147219
example/audio/zh_prompt.mp3 CHANGED
Binary files a/example/audio/zh_prompt.mp3 and b/example/audio/zh_prompt.mp3 differ
 
example/audio/zh_prompt_f0.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aecf5f40c16a6390e8bb8c19ce69120dcbedaea5e4051aba1bdde95a024f29d3
3
+ size 4408
example/audio/zh_target.mp3 CHANGED
Binary files a/example/audio/zh_target.mp3 and b/example/audio/zh_target.mp3 differ
 
example/infer_svc.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ script_dir=$(dirname "$(realpath "$0")")
4
+ root_dir=$(dirname "$script_dir")
5
+
6
+ cd $root_dir || exit
7
+ export PYTHONPATH=$root_dir:$PYTHONPATH
8
+
9
+ model_path=pretrained_models/SoulX-Singer/model-svc.pt
10
+ config=soulxsinger/config/soulxsinger.yaml
11
+ prompt_wav_path=example/audio/zh_prompt.mp3
12
+ target_wav_path=example/audio/music.mp3
13
+ prompt_f0_path=example/audio/zh_prompt_f0.npy
14
+ target_f0_path=example/audio/music_f0.npy
15
+ save_dir=example/generated/music_svc
16
+
17
+ python -m cli.inference_svc \
18
+ --device cuda \
19
+ --model_path $model_path \
20
+ --config $config \
21
+ --prompt_wav_path $prompt_wav_path \
22
+ --target_wav_path $target_wav_path \
23
+ --prompt_f0_path $prompt_f0_path \
24
+ --target_f0_path $target_f0_path \
25
+ --save_dir $save_dir \
26
+ --auto_shift \
27
+ --pitch_shift 0
example/preprocess.sh CHANGED
@@ -15,6 +15,7 @@ save_dir=example/transcriptions/zh_prompt
15
  language=Mandarin
16
  vocal_sep=False
17
  max_merge_duration=30000
 
18
 
19
  python -m preprocess.pipeline \
20
  --audio_path $audio_path \
@@ -22,7 +23,8 @@ python -m preprocess.pipeline \
22
  --language $language \
23
  --device $device \
24
  --vocal_sep $vocal_sep \
25
- --max_merge_duration $max_merge_duration
 
26
 
27
 
28
  ####### Run Target Annotation #######
@@ -31,6 +33,7 @@ save_dir=example/transcriptions/music
31
  language=Mandarin
32
  vocal_sep=True
33
  max_merge_duration=60000
 
34
 
35
  python -m preprocess.pipeline \
36
  --audio_path $audio_path \
@@ -38,4 +41,5 @@ python -m preprocess.pipeline \
38
  --language $language \
39
  --device $device \
40
  --vocal_sep $vocal_sep \
41
- --max_merge_duration $max_merge_duration
 
 
15
  language=Mandarin
16
  vocal_sep=False
17
  max_merge_duration=30000
18
+ midi_transcribe=True # Whether to transcribe vocal midi, set True for singing voice synthesis, False for singing voice conversion
19
 
20
  python -m preprocess.pipeline \
21
  --audio_path $audio_path \
 
23
  --language $language \
24
  --device $device \
25
  --vocal_sep $vocal_sep \
26
+ --max_merge_duration $max_merge_duration \
27
+ --midi_transcribe $midi_transcribe
28
 
29
 
30
  ####### Run Target Annotation #######
 
33
  language=Mandarin
34
  vocal_sep=True
35
  max_merge_duration=60000
36
+ midi_transcribe=True # Whether to transcribe vocal midi, set True for singing voice synthesis, False for singing voice conversion
37
 
38
  python -m preprocess.pipeline \
39
  --audio_path $audio_path \
 
41
  --language $language \
42
  --device $device \
43
  --vocal_sep $vocal_sep \
44
+ --max_merge_duration $max_merge_duration \
45
+ --midi_transcribe $midi_transcribe
preprocess/pipeline.py CHANGED
@@ -16,12 +16,13 @@ from preprocess.tools import (
16
 
17
 
18
  class PreprocessPipeline:
19
- def __init__(self, device: str, language: str, save_dir: str, vocal_sep: bool = True, max_merge_duration: int = 60000):
20
  self.device = device
21
  self.language = language
22
  self.save_dir = save_dir
23
  self.vocal_sep = vocal_sep
24
  self.max_merge_duration = max_merge_duration
 
25
 
26
  if vocal_sep:
27
  self.vocal_separator = VocalSeparator(
@@ -37,26 +38,31 @@ class PreprocessPipeline:
37
  model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
38
  device=device,
39
  )
40
- self.vocal_detector = VocalDetector(
41
- cut_wavs_output_dir= f"{save_dir}/cut_wavs",
42
- )
43
- self.lyric_transcriber = LyricTranscriber(
44
- zh_model_path="pretrained_models/SoulX-Singer-Preprocess/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
45
- en_model_path="pretrained_models/SoulX-Singer-Preprocess/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
46
- device=device
47
- )
48
- self.note_transcriber = NoteTranscriber(
49
- rosvot_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rosvot/model.pt",
50
- rwbd_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rwbd/model.pt",
51
- device=device
52
- )
 
 
 
 
 
53
 
54
  def run(
55
  self,
56
  audio_path: str,
57
- vocal_sep: bool = True,
58
- max_merge_duration: int = 60000,
59
- language: str = "Mandarin"
60
  ) -> None:
61
  vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
62
  max_merge_duration = self.max_merge_duration if max_merge_duration is None else max_merge_duration
@@ -81,7 +87,11 @@ class PreprocessPipeline:
81
  vocal_path = output_dir / "vocal.wav"
82
  sf.write(vocal_path, vocal, sample_rate)
83
 
84
- vocal_f0 = self.f0_extractor.process(str(vocal_path))
 
 
 
 
85
  segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)
86
 
87
  metadata = []
@@ -124,10 +134,11 @@ def main(args):
124
  save_dir=args.save_dir,
125
  vocal_sep=args.vocal_sep,
126
  max_merge_duration=args.max_merge_duration,
 
127
  )
128
  pipeline.run(
129
  audio_path=args.audio_path,
130
- language=args.language
131
  )
132
 
133
 
@@ -139,8 +150,12 @@ if __name__ == "__main__":
139
  parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
140
  parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
141
  parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
142
- parser.add_argument("--vocal_sep", type=bool, default=True, help="Whether to perform vocal separation")
143
  parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
 
144
  args = parser.parse_args()
145
 
 
 
 
146
  main(args)
 
16
 
17
 
18
  class PreprocessPipeline:
19
+ def __init__(self, device: str, language: str, save_dir: str, vocal_sep: bool = True, max_merge_duration: int = 60000, midi_transcribe: bool = True):
20
  self.device = device
21
  self.language = language
22
  self.save_dir = save_dir
23
  self.vocal_sep = vocal_sep
24
  self.max_merge_duration = max_merge_duration
25
+ self.midi_transcribe = midi_transcribe
26
 
27
  if vocal_sep:
28
  self.vocal_separator = VocalSeparator(
 
38
  model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
39
  device=device,
40
  )
41
+ if self.midi_transcribe:
42
+ self.vocal_detector = VocalDetector(
43
+ cut_wavs_output_dir= f"{save_dir}/cut_wavs",
44
+ )
45
+ self.lyric_transcriber = LyricTranscriber(
46
+ zh_model_path="pretrained_models/SoulX-Singer-Preprocess/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
47
+ en_model_path="pretrained_models/SoulX-Singer-Preprocess/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
48
+ device=device
49
+ )
50
+ self.note_transcriber = NoteTranscriber(
51
+ rosvot_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rosvot/model.pt",
52
+ rwbd_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rwbd/model.pt",
53
+ device=device
54
+ )
55
+ else:
56
+ self.vocal_detector = None
57
+ self.lyric_transcriber = None
58
+ self.note_transcriber = None
59
 
60
  def run(
61
  self,
62
  audio_path: str,
63
+ vocal_sep: bool = None,
64
+ max_merge_duration: int = None,
65
+ language: str = None,
66
  ) -> None:
67
  vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
68
  max_merge_duration = self.max_merge_duration if max_merge_duration is None else max_merge_duration
 
87
  vocal_path = output_dir / "vocal.wav"
88
  sf.write(vocal_path, vocal, sample_rate)
89
 
90
+ vocal_f0 = self.f0_extractor.process(str(vocal_path), f0_path=str(vocal_path).replace(".wav", "_f0.npy"))
91
+
92
+ if not self.midi_transcribe or self.vocal_detector is None or self.lyric_transcriber is None or self.note_transcriber is None:
93
+ return
94
+
95
  segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)
96
 
97
  metadata = []
 
134
  save_dir=args.save_dir,
135
  vocal_sep=args.vocal_sep,
136
  max_merge_duration=args.max_merge_duration,
137
+ midi_transcribe=args.midi_transcribe,
138
  )
139
  pipeline.run(
140
  audio_path=args.audio_path,
141
+ language=args.language,
142
  )
143
 
144
 
 
150
  parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
151
  parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
152
  parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
153
+ parser.add_argument("--vocal_sep", type=str, default="True", help="Whether to perform vocal separation")
154
  parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
155
+ parser.add_argument("--midi_transcribe", type=str, default="True", help="Whether to do MIDI transcription")
156
  args = parser.parse_args()
157
 
158
+ args.vocal_sep = args.vocal_sep.lower() == "true"
159
+ args.midi_transcribe = args.midi_transcribe.lower() == "true"
160
+
161
  main(args)
soulxsinger/models/modules/whisper_encoder.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Frozen Whisper encoder wrapper (wav -> encoder embeddings)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torchaudio
10
+ from transformers import WhisperFeatureExtractor, WhisperModel
11
+
12
+ WHISPER_MEL_FRAMES = 3000 # 3000 frames at 16000 Hz
13
+
14
+
15
class WhisperEncoder():
    """Frozen Whisper-base encoder: raw waveform in, encoder hidden states out."""

    def __init__(
        self,
        device: Optional[str] = None,
    ) -> None:
        # Feature extractor turns raw audio into log-mel features at 16 kHz.
        self.fe = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
        self.model = WhisperModel.from_pretrained("openai/whisper-base")
        target_device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(target_device)

    def encode(
        self,
        wav: torch.Tensor,
        sr: int,
    ) -> torch.Tensor:
        """Return Whisper encoder embeddings for `wav` sampled at `sr` Hz."""
        # Resample to the extractor's native rate only when needed.
        if sr != self.fe.sampling_rate:
            wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=self.fe.sampling_rate)
        wav_np = wav.cpu().detach().numpy().astype("float32", copy=False)

        inputs = self.fe(
            wav_np,
            sampling_rate=self.fe.sampling_rate,
            return_tensors="pt",
            padding=False,
            truncation=False,
            return_attention_mask=True,
        )

        # Whisper's encoder expects exactly WHISPER_MEL_FRAMES mel frames:
        # right-pad short inputs, clip long ones.
        feats = inputs.input_features
        frame_count = feats.shape[-1]
        if frame_count < WHISPER_MEL_FRAMES:
            feats = torch.nn.functional.pad(feats, (0, WHISPER_MEL_FRAMES - frame_count))
        else:
            feats = feats[..., :WHISPER_MEL_FRAMES]

        feats = feats.to(wav.device)
        # Lazily migrate the model if the caller's tensors live elsewhere.
        if self.model.device != wav.device:
            self.model = self.model.to(wav.device)
        mask = inputs.attention_mask.to(wav.device) if inputs.attention_mask is not None else None

        hidden = self.model.encoder(feats).last_hidden_state

        if mask is not None:
            # The encoder halves the time axis; keep only frames that map to
            # real (un-padded) audio and zero anything past the first sample's
            # valid length.
            valid_mel_frames = mask.sum(dim=1)
            valid_enc_frames = (valid_mel_frames + 1) // 2
            keep = min(int(valid_enc_frames.max().item()), hidden.shape[1])
            hidden = hidden[:, :keep, :]
            first_valid = min(int(valid_enc_frames[0].item()), keep)
            if first_valid < keep:
                hidden[0, first_valid:, :] = 0

        return hidden
67
+
68
+
69
if __name__ == "__main__":
    # Smoke test: encode 25 seconds of random 24 kHz audio on GPU and print
    # the resulting embedding shape.
    torch.manual_seed(0)
    dummy_audio = torch.randn(1, 24000 * 25).float().to("cuda")
    enc = WhisperEncoder()
    embeddings = enc.encode(dummy_audio, sr=24000)
    print(embeddings.shape)
soulxsinger/models/soulxsinger_svc.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from typing import Optional, Dict, Any, List, Tuple
7
+
8
+ from soulxsinger.models.modules.vocoder import Vocoder
9
+ from soulxsinger.models.modules.decoder import CFMDecoder
10
+ from soulxsinger.models.modules.mel_transform import MelSpectrogramEncoder
11
+ from soulxsinger.models.modules.whisper_encoder import WhisperEncoder
12
+
13
+
14
+ class SoulXSingerSVC(nn.Module):
15
+ """
16
+ SoulXSinger SVC model.
17
+ """
18
+ def __init__(self, config: Dict):
19
+ super(SoulXSingerSVC, self).__init__()
20
+ self.audio_cfg = config.audio
21
+ enc_cfg = config.model.encoder
22
+ cfm_cfg = config.model.flow_matching
23
+
24
+ self.whisper_encoder = WhisperEncoder()
25
+ self.f0_encoder = nn.Embedding(enc_cfg["f0_bin"], enc_cfg["f0_dim"])
26
+ self.cfm_decoder = CFMDecoder(cfm_cfg)
27
+
28
+ self.mel = MelSpectrogramEncoder(self.audio_cfg)
29
+ self.vocoder = Vocoder()
30
+
31
+ @staticmethod
32
+ def f0_to_coarse(f0, f0_bin=361, f0_min=32.7031956625, f0_shift=0):
33
+ """
34
+ Convert continuous F0 values to discrete F0 bins (SIL and C1 - B6, 361 bins).
35
+ args:
36
+ f0: continuous F0 values
37
+ f0_bin: number of F0 bins
38
+ f0_min: minimum F0 value
39
+ f0_shift: shift value for F0 bins
40
+ returns:
41
+ f0_coarse: discrete F0 bins
42
+ """
43
+ is_torch = isinstance(f0, torch.Tensor)
44
+ uv_mask = f0 <= 0
45
+
46
+ if is_torch:
47
+ f0_safe = torch.maximum(f0, torch.tensor(f0_min))
48
+ f0_cents = 1200 * torch.log2(f0_safe / f0_min)
49
+ else:
50
+ f0_safe = np.maximum(f0, f0_min)
51
+ f0_cents = 1200 * np.log2(f0_safe / f0_min)
52
+
53
+ f0_coarse = (f0_cents / 20) + 1
54
+
55
+ if is_torch:
56
+ f0_coarse = torch.round(f0_coarse).long()
57
+ f0_coarse = torch.clamp(f0_coarse, min=1, max=f0_bin - 1)
58
+ else:
59
+ f0_coarse = np.rint(f0_coarse).astype(int)
60
+ f0_coarse = np.clip(f0_coarse, 1, f0_bin - 1)
61
+
62
+ f0_coarse[uv_mask] = 0
63
+
64
+ if f0_shift != 0:
65
+ if is_torch:
66
+ voiced = f0_coarse > 0
67
+ if voiced.any():
68
+ shifted = f0_coarse[voiced] + f0_shift
69
+ f0_coarse[voiced] = torch.clamp(shifted, 1, f0_bin - 1)
70
+ else:
71
+ voiced = f0_coarse > 0
72
+ if np.any(voiced):
73
+ shifted = f0_coarse[voiced] + f0_shift
74
+ f0_coarse[voiced] = np.clip(shifted, 1, f0_bin - 1)
75
+
76
+ return f0_coarse
77
+
78
+ @staticmethod
79
+ def build_vocal_segments(
80
+ f0,
81
+ f0_rate: int = 50,
82
+ uv_frames_th: int = 5,
83
+ min_duration_sec: float = 5.0,
84
+ max_duration_sec: float = 30.0,
85
+ num_overlaps: int = 1,
86
+ ignore_silent_segments: bool = True,
87
+ ) -> Tuple[List[Tuple[float, float]], List[Tuple[float, float]]]:
88
+ """Build vocal segments based on F0 contour. First split by long silent runs, then merge into segments based on min and max duration constraints.
89
+ args:
90
+ f0: F0 contour of the audio, 1D array or tensor with shape (T,)
91
+ f0_rate: F0 sampling rate in Hz (e.g., 50 for 20ms hop size)
92
+ uv_frames_th: number of consecutive zero F0 frames to consider as a split point
93
+ min_duration_sec: minimum duration of each segment in seconds
94
+ max_duration_sec: maximum duration of each segment in seconds
95
+ num_overlaps: number of overlapping segments to create for each non-overlapping segment (for smooth inference)
96
+ ignore_silent_segments: whether to ignore segments that are mostly silent (e.g., > 95% zero F0)
97
+ returns:
98
+ overlap_segments: list of (overlap_start_sec, overlap_end_sec) for each segment, which may overlap with adjacent segments for smooth inference
99
+ segments: list of (seg_start_sec, seg_end_sec) for each segment, which are non-overlapping and used for final merging
100
+ """
101
+ if isinstance(f0, torch.Tensor):
102
+ f0_np = f0.detach().float().cpu().numpy()
103
+ else:
104
+ f0_np = np.asarray(f0, dtype=np.float32)
105
+ f0_np = np.squeeze(f0_np)
106
+
107
+ total_frames = int(f0_np.shape[0])
108
+ if total_frames == 0:
109
+ return [], []
110
+
111
+ min_frames = max(1, int(round(min_duration_sec * f0_rate)))
112
+ max_frames = max(1, int(round(max_duration_sec * f0_rate)))
113
+
114
+ split_points = [0] # silence split points in frame indices, starting with 0 and ending with total_frames
115
+
116
+ def append_split_point(point: int):
117
+ # Ensure split points are within valid range and respect max_frames constraint
118
+ point = int(max(0, min(point, total_frames)))
119
+ while point - split_points[-1] > max_frames:
120
+ split_points.append(split_points[-1] + max_frames)
121
+ if point > split_points[-1]:
122
+ split_points.append(point)
123
+
124
+ idx = 0
125
+ while idx < total_frames:
126
+ if f0_np[idx] == 0:
127
+ run_start = idx
128
+ while idx < total_frames and f0_np[idx] == 0:
129
+ idx += 1
130
+ run_end = idx
131
+ if (run_end - run_start) >= uv_frames_th:
132
+ split_point = max(run_end - 5, (run_start + run_end) // 2)
133
+ append_split_point(split_point)
134
+ else:
135
+ idx += 1
136
+ append_split_point(total_frames)
137
+ # print(f"Initial split points (in seconds): {[round(p / f0_rate, 2) for p in split_points]}")
138
+
139
+ segments: List[Tuple[int, int]] = []
140
+ overlap_segments: List[Tuple[int, int]] = []
141
+
142
+ def append_segment(start_idx: int, end_idx: int, num_overlaps: int = num_overlaps):
143
+ segments.append((split_points[start_idx] / f0_rate, split_points[end_idx] / f0_rate))
144
+ overlap_start_idx = start_idx
145
+ if start_idx > 0 and (split_points[end_idx] - split_points[start_idx - num_overlaps]) <= max_frames:
146
+ overlap_start_idx = start_idx - num_overlaps
147
+ overlap_segments.append((split_points[overlap_start_idx] / f0_rate, split_points[end_idx] / f0_rate))
148
+
149
+ segment_start, segment_end = 0, 1
150
+
151
+ while segment_start < len(split_points) - 1:
152
+ while segment_end < len(split_points) and (split_points[segment_end] - split_points[segment_start]) < min_frames:
153
+ segment_end += 1
154
+
155
+ if segment_end >= len(split_points):
156
+ append_segment(segment_start, len(split_points) - 1, num_overlaps=num_overlaps)
157
+ break
158
+ append_segment(segment_start, segment_end, num_overlaps=num_overlaps)
159
+ segment_start = segment_end
160
+ segment_end = segment_start + 1
161
+
162
+ # print(f"Final segments (overlap_start, overlap_end, seg_start_time, seg_end_time) in seconds: {overlap_segments}")
163
+ if ignore_silent_segments:
164
+ filtered_idx = []
165
+ for i, seg in enumerate(overlap_segments):
166
+ start_frame = int(seg[0] * f0_rate)
167
+ end_frame = int(seg[1] * f0_rate)
168
+ total_frames = end_frame - start_frame
169
+ voice_frames = np.sum(f0_np[start_frame:end_frame] > 0)
170
+ if voice_frames / total_frames > 0.05 and voice_frames >= 10: # at least 10 voiced frames and >5% voiced frames
171
+ filtered_idx.append(i)
172
+
173
+ overlap_segments = [overlap_segments[i] for i in filtered_idx]
174
+ segments = [segments[i] for i in filtered_idx]
175
+ # print(f"Filtered segments with mostly silence removed: {overlap_segments}")
176
+
177
+ return overlap_segments, segments
178
+
179
+ def infer(
180
+ self,
181
+ pt_wav: str|torch.Tensor,
182
+ gt_wav: str|torch.Tensor,
183
+ pt_f0: str|torch.Tensor,
184
+ gt_f0: str|torch.Tensor,
185
+ auto_shift=False,
186
+ pitch_shift=0,
187
+ n_steps=32,
188
+ cfg=3,
189
+ ):
190
+ """
191
+ SVC inference pipeline. First build vocal segments based on F0 contour, then run inference for each segment and merge results.
192
+ args:
193
+ pt_wav: prompt waveform path or tensor
194
+ gt_wav: target waveform path or tensor
195
+ pt_f0: prompt F0 path or tensor
196
+ gt_f0: target F0 path or tensor
197
+ auto_shift: whether to automatically calculate pitch shift based on median F0 of prompt and target
198
+ pitch_shift: manual pitch shift in semitones (overrides auto_shift if > 0)
199
+ n_steps: number of diffusion steps for inference
200
+ cfg: classifier-free guidance scale for inference
201
+ """
202
+
203
+ # calculate auto pitch shift
204
+ if auto_shift and pitch_shift == 0:
205
+ if gt_f0 is not None and pt_f0 is not None:
206
+ gt_f0_median = torch.median(gt_f0[gt_f0 > 0])
207
+ pt_f0_median = torch.median(pt_f0[pt_f0 > 0])
208
+ pitch_shift = torch.round(torch.log2(pt_f0_median / gt_f0_median) * 1200 / 100).int().item()
209
+ else:
210
+ print("Warning: pitch_shift is True but note_pitch or f0 is None. Set f0_shift to 0.")
211
+ pitch_shift = 0
212
+ else:
213
+ pitch_shift = pitch_shift
214
+
215
+ # if target audio is less than 30 seconds, infer the whole audio
216
+ if gt_wav.shape[-1] < 30 * self.audio_cfg.sample_rate:
217
+ generated_audio = self.infer_segment(
218
+ pt_wav=pt_wav,
219
+ gt_wav=gt_wav,
220
+ pt_f0=pt_f0,
221
+ gt_f0=gt_f0,
222
+ pitch_shift=pitch_shift,
223
+ n_steps=n_steps,
224
+ cfg=cfg,
225
+ )
226
+ return generated_audio, pitch_shift
227
+
228
+ # if target audio is longer than 30 seconds, build vocal segments and infer each segment
229
+ generated_audio = []
230
+
231
+ f0_rate = self.audio_cfg.sample_rate // self.audio_cfg.hop_size
232
+
233
+ overlap_segments, segments = self.build_vocal_segments(
234
+ gt_f0,
235
+ f0_rate=f0_rate,
236
+ uv_frames_th=10,
237
+ min_duration_sec=15.0,
238
+ max_duration_sec=30.0,
239
+ )
240
+ if len(segments) == 0:
241
+ segments = [(0.0, gt_wav.shape[-1] / self.audio_cfg.sample_rate)]
242
+ overlap_segments = [(0.0, gt_wav.shape[-1] / self.audio_cfg.sample_rate)]
243
+
244
+ generated_audio = torch.zeros_like(gt_wav)
245
+ for idx in tqdm(range(len(segments)), total=len(segments), desc="Inferring segments (SVC)", dynamic_ncols=True):
246
+ overlap_start_sec, overlap_end_sec = overlap_segments[idx]
247
+ seg_start_sec, seg_end_sec = segments[idx]
248
+
249
+ wav_start = int(round(overlap_start_sec * self.audio_cfg.sample_rate))
250
+ wav_end = int(round(overlap_end_sec * self.audio_cfg.sample_rate))
251
+ f0_start = int(round(overlap_start_sec * f0_rate))
252
+ f0_end = int(round(overlap_end_sec * f0_rate))
253
+
254
+ wav_start = max(0, min(wav_start, gt_wav.shape[-1]))
255
+ wav_end = max(wav_start, min(wav_end, gt_wav.shape[-1]))
256
+ f0_start = max(0, min(f0_start, gt_f0.shape[-1]))
257
+ f0_end = max(f0_start, min(f0_end, gt_f0.shape[-1]))
258
+
259
+ segment_gt_wav = gt_wav[:, wav_start:wav_end]
260
+ segment_gt_f0 = gt_f0[:, f0_start:f0_end]
261
+ segment_generated_audio = self.infer_segment(
262
+ pt_wav=pt_wav,
263
+ gt_wav=segment_gt_wav,
264
+ pt_f0=pt_f0,
265
+ gt_f0=segment_gt_f0,
266
+ pitch_shift=pitch_shift,
267
+ n_steps=n_steps,
268
+ cfg=cfg,
269
+ )
270
+
271
+ segment_start = int(round(seg_start_sec * self.audio_cfg.sample_rate))
272
+ segment_end = int(round(seg_end_sec * self.audio_cfg.sample_rate))
273
+ segment_generated_audio = segment_generated_audio[segment_start - wav_start: segment_end - wav_start]
274
+
275
+ generated_audio[:, segment_start:segment_end] = segment_generated_audio
276
+
277
+ return generated_audio, pitch_shift
278
+
279
+ def infer_segment(self, pt_wav, gt_wav, pt_f0, gt_f0, pitch_shift=0, n_steps=32, cfg=3):
280
+ pt_mel = self.mel(pt_wav)
281
+ len_prompt_mel = pt_mel.shape[1]
282
+ pt_f0 = F.pad(pt_f0, (0, 0, 0, max(0, len_prompt_mel - pt_f0.shape[1])))[:, :len_prompt_mel]
283
+
284
+ f0_course_pt = self.f0_to_coarse(pt_f0)
285
+ f0_course_gt = self.f0_to_coarse(gt_f0, f0_shift=pitch_shift * 5)
286
+ f0_course = torch.cat([f0_course_pt, f0_course_gt], 1)
287
+
288
+ pt_content_feat = self.whisper_encoder.encode(pt_wav, sr=self.audio_cfg.sample_rate)
289
+ gt_content_feat = self.whisper_encoder.encode(gt_wav, sr=self.audio_cfg.sample_rate)
290
+ t_pt, t_gt = f0_course_pt.shape[1], f0_course_gt.shape[1]
291
+ pt_content_feat = F.pad(pt_content_feat, (0, 0, 0, max(0, t_pt - pt_content_feat.shape[1])))[:, :t_pt, :]
292
+ gt_content_feat = F.pad(gt_content_feat, (0, 0, 0, max(0, t_gt - gt_content_feat.shape[1])))[:, :t_gt, :]
293
+
294
+ content_feat = torch.cat([pt_content_feat, gt_content_feat], 1)
295
+
296
+ f0_feat = self.f0_encoder(f0_course)
297
+ features = content_feat + f0_feat
298
+
299
+ gt_decoder_inp = features[:, len_prompt_mel:, :]
300
+ pt_decoder_inp = features[:, :len_prompt_mel, :]
301
+
302
+ generated_mel = self.cfm_decoder.reverse_diffusion(
303
+ pt_mel,
304
+ pt_decoder_inp,
305
+ gt_decoder_inp,
306
+ n_timesteps=n_steps,
307
+ cfg=cfg
308
+ )
309
+
310
+ generated_audio = self.vocoder(generated_mel.transpose(1, 2)[0:1, ...])
311
+ generated_audio = generated_audio.squeeze()
312
+
313
+ # cut or pad to match gt_wav length
314
+ if generated_audio.shape[-1] > gt_wav.shape[-1]:
315
+ generated_audio = generated_audio[:gt_wav.shape[-1]]
316
+ elif generated_audio.shape[-1] < gt_wav.shape[-1]:
317
+ generated_audio = F.pad(generated_audio, (0, gt_wav.shape[-1] - generated_audio.shape[-1]))
318
+
319
+ return generated_audio
webui.py CHANGED
@@ -4,6 +4,7 @@ import random
4
  import shutil
5
  import sys
6
  import traceback
 
7
  from pathlib import Path
8
  from typing import Tuple
9
  import spaces
@@ -269,6 +270,10 @@ def transcription_function(
269
  except Exception:
270
  print(traceback.format_exc(), file=sys.stderr, flush=True)
271
  return None, None
 
 
 
 
272
 
273
 
274
  @spaces.GPU
@@ -351,7 +356,187 @@ def synthesis_function(
351
  except Exception:
352
  print(traceback.format_exc(), file=sys.stderr, flush=True)
353
  return None, gr.update(), gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
 
357
  def render_interface() -> gr.Blocks:
@@ -378,180 +563,7 @@ def render_interface() -> gr.Blocks:
378
  '"></div>'
379
  '</div>'
380
  )
381
-
382
- with gr.Row(equal_height=False):
383
- # ── Left column: inputs & controls ──
384
- with gr.Column(scale=1):
385
- prompt_audio = gr.Audio(
386
- label="Prompt audio (reference voice), max 30s",
387
- type="filepath",
388
- interactive=True,
389
- )
390
- target_audio = gr.Audio(
391
- label="Target audio (melody / lyrics source), max 60s",
392
- type="filepath",
393
- interactive=True,
394
- )
395
-
396
- with gr.Row():
397
- control_radio = gr.Radio(
398
- choices=["melody", "score"],
399
- value="melody",
400
- label="Control type",
401
- scale=1,
402
- )
403
- auto_shift = gr.Checkbox(
404
- label="Auto pitch shift",
405
- value=True,
406
- interactive=True,
407
- scale=1,
408
- )
409
-
410
- synthesis_btn = gr.Button(
411
- value="🎤 Generate singing voice",
412
- variant="primary",
413
- size="lg",
414
- )
415
-
416
- # ── Advanced: transcription settings & metadata ──
417
- with gr.Accordion("Advanced: Transcription & Metadata", open=False):
418
- with gr.Row():
419
- pitch_shift = gr.Number(
420
- label="Pitch shift (semitones)",
421
- value=0,
422
- minimum=-36,
423
- maximum=36,
424
- step=1,
425
- interactive=True,
426
- scale=1,
427
- )
428
- seed_input = gr.Number(
429
- label="Seed",
430
- value=12306,
431
- step=1,
432
- interactive=True,
433
- scale=1,
434
- )
435
- gr.Markdown(
436
- "Upload your own metadata files to skip automatic transcription. "
437
- "You can use the [SoulX-Singer-Midi-Editor]"
438
- "(https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor) "
439
- "to edit metadata for better alignment."
440
- )
441
- with gr.Row():
442
- prompt_lyric_lang = gr.Dropdown(
443
- label="Prompt lyric language",
444
- choices=[
445
- ("Mandarin", "Mandarin"),
446
- ("Cantonese", "Cantonese"),
447
- ("English", "English"),
448
- ],
449
- value="English",
450
- interactive=True,
451
- scale=1,
452
- )
453
- target_lyric_lang = gr.Dropdown(
454
- label="Target lyric language",
455
- choices=[
456
- ("Mandarin", "Mandarin"),
457
- ("Cantonese", "Cantonese"),
458
- ("English", "English"),
459
- ],
460
- value="English",
461
- interactive=True,
462
- scale=1,
463
- )
464
- with gr.Row():
465
- prompt_vocal_sep = gr.Checkbox(
466
- label="Prompt vocal separation",
467
- value=False,
468
- interactive=True,
469
- scale=1,
470
- )
471
- target_vocal_sep = gr.Checkbox(
472
- label="Target vocal separation",
473
- value=True,
474
- interactive=True,
475
- scale=1,
476
- )
477
- transcription_btn = gr.Button(
478
- value="Run singing transcription",
479
- variant="secondary",
480
- size="lg",
481
- )
482
- with gr.Row():
483
- prompt_metadata = gr.File(
484
- label="Prompt metadata",
485
- type="filepath",
486
- file_types=[".json"],
487
- interactive=True,
488
- )
489
- target_metadata = gr.File(
490
- label="Target metadata",
491
- type="filepath",
492
- file_types=[".json"],
493
- interactive=True,
494
- )
495
-
496
- # ── Right column: output ──
497
- with gr.Column(scale=1):
498
- output_audio = gr.Audio(
499
- label="Generated audio",
500
- type="filepath",
501
- interactive=False,
502
- )
503
- gr.Examples(
504
- examples=[
505
- ["raven.wav", "happy_birthday.mp3"],
506
- ["anita.wav", "happy_birthday.mp3"],
507
- ["obama.wav", "happy_birthday.mp3"],
508
- ["raven.wav", "everybody_loves.wav"],
509
- ["anita.wav", "everybody_loves.wav"],
510
- ["obama.wav", "everybody_loves.wav"],
511
- ],
512
- inputs=[prompt_audio, target_audio],
513
- outputs=[output_audio, prompt_metadata, target_metadata],
514
- fn=synthesis_function,
515
- cache_examples=True,
516
- cache_mode="lazy"
517
- )
518
-
519
- # ── Event handlers ──
520
- prompt_audio.change(
521
- fn=lambda: None,
522
- inputs=[],
523
- outputs=[prompt_metadata],
524
- )
525
-
526
- target_audio.change(
527
- fn=lambda: None,
528
- inputs=[],
529
- outputs=[target_metadata],
530
- )
531
-
532
- transcription_btn.click(
533
- fn=transcription_function,
534
- inputs=[
535
- prompt_audio, target_audio,
536
- prompt_metadata, target_metadata,
537
- prompt_lyric_lang, target_lyric_lang,
538
- prompt_vocal_sep, target_vocal_sep,
539
- ],
540
- outputs=[prompt_metadata, target_metadata],
541
- )
542
-
543
- synthesis_btn.click(
544
- fn=synthesis_function,
545
- inputs=[
546
- prompt_audio, target_audio,
547
- prompt_metadata, target_metadata,
548
- control_radio, auto_shift, pitch_shift, seed_input,
549
- prompt_lyric_lang, target_lyric_lang,
550
- prompt_vocal_sep, target_vocal_sep,
551
- ],
552
- outputs=[output_audio, prompt_metadata, target_metadata],
553
- )
554
-
555
  return page
556
 
557
 
 
4
  import shutil
5
  import sys
6
  import traceback
7
+ import gc
8
  from pathlib import Path
9
  from typing import Tuple
10
  import spaces
 
270
  except Exception:
271
  print(traceback.format_exc(), file=sys.stderr, flush=True)
272
  return None, None
273
+ finally:
274
+ gc.collect()
275
+ if torch.cuda.is_available():
276
+ torch.cuda.empty_cache()
277
 
278
 
279
  @spaces.GPU
 
356
  except Exception:
357
  print(traceback.format_exc(), file=sys.stderr, flush=True)
358
  return None, gr.update(), gr.update()
359
+ finally:
360
+ gc.collect()
361
+ if torch.cuda.is_available():
362
+ torch.cuda.empty_cache()
363
+
364
+
365
+
366
+ def render_tab_content() -> None:
367
+ """Render the main content (for embedding in app.py tabs). No Blocks or title."""
368
+ with gr.Row(equal_height=False):
369
+ # ── Left column: inputs & controls ──
370
+ with gr.Column(scale=1):
371
+ prompt_audio = gr.Audio(
372
+ label="Prompt audio (reference voice), max 30s",
373
+ type="filepath",
374
+ interactive=True,
375
+ )
376
+ target_audio = gr.Audio(
377
+ label="Target audio (melody / lyrics source), max 60s",
378
+ type="filepath",
379
+ interactive=True,
380
+ )
381
+
382
+ with gr.Row():
383
+ control_radio = gr.Radio(
384
+ choices=["melody", "score"],
385
+ value="melody",
386
+ label="Control type",
387
+ scale=1,
388
+ )
389
+ auto_shift = gr.Checkbox(
390
+ label="Auto pitch shift",
391
+ value=True,
392
+ interactive=True,
393
+ scale=1,
394
+ )
395
+
396
+ synthesis_btn = gr.Button(
397
+ value="🎤 Generate singing voice",
398
+ variant="primary",
399
+ size="lg",
400
+ )
401
+
402
+ # ── Advanced: transcription settings & metadata ──
403
+ with gr.Accordion("Advanced: Transcription & Metadata", open=False):
404
+ with gr.Row():
405
+ pitch_shift = gr.Number(
406
+ label="Pitch shift (semitones)",
407
+ value=0,
408
+ minimum=-36,
409
+ maximum=36,
410
+ step=1,
411
+ interactive=True,
412
+ scale=1,
413
+ )
414
+ seed_input = gr.Number(
415
+ label="Seed",
416
+ value=12306,
417
+ step=1,
418
+ interactive=True,
419
+ scale=1,
420
+ )
421
+ gr.Markdown(
422
+ "Upload your own metadata files to skip automatic transcription. "
423
+ "You can use the [SoulX-Singer-Midi-Editor]"
424
+ "(https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor) "
425
+ "to edit metadata for better alignment."
426
+ )
427
+ with gr.Row():
428
+ prompt_lyric_lang = gr.Dropdown(
429
+ label="Prompt lyric language",
430
+ choices=[
431
+ ("Mandarin", "Mandarin"),
432
+ ("Cantonese", "Cantonese"),
433
+ ("English", "English"),
434
+ ],
435
+ value="English",
436
+ interactive=True,
437
+ scale=1,
438
+ )
439
+ target_lyric_lang = gr.Dropdown(
440
+ label="Target lyric language",
441
+ choices=[
442
+ ("Mandarin", "Mandarin"),
443
+ ("Cantonese", "Cantonese"),
444
+ ("English", "English"),
445
+ ],
446
+ value="English",
447
+ interactive=True,
448
+ scale=1,
449
+ )
450
+ with gr.Row():
451
+ prompt_vocal_sep = gr.Checkbox(
452
+ label="Prompt vocal separation",
453
+ value=False,
454
+ interactive=True,
455
+ scale=1,
456
+ )
457
+ target_vocal_sep = gr.Checkbox(
458
+ label="Target vocal separation",
459
+ value=True,
460
+ interactive=True,
461
+ scale=1,
462
+ )
463
+ transcription_btn = gr.Button(
464
+ value="Run singing transcription",
465
+ variant="secondary",
466
+ size="lg",
467
+ )
468
+ with gr.Row():
469
+ prompt_metadata = gr.File(
470
+ label="Prompt metadata",
471
+ type="filepath",
472
+ file_types=[".json"],
473
+ interactive=True,
474
+ )
475
+ target_metadata = gr.File(
476
+ label="Target metadata",
477
+ type="filepath",
478
+ file_types=[".json"],
479
+ interactive=True,
480
+ )
481
 
482
+ # ── Right column: output ──
483
+ with gr.Column(scale=1):
484
+ output_audio = gr.Audio(
485
+ label="Generated audio",
486
+ type="filepath",
487
+ interactive=False,
488
+ )
489
+ gr.Examples(
490
+ examples=[
491
+ ["raven.wav", "happy_birthday.mp3"],
492
+ ["anita.wav", "happy_birthday.mp3"],
493
+ ["obama.wav", "happy_birthday.mp3"],
494
+ ["raven.wav", "everybody_loves.wav"],
495
+ ["anita.wav", "everybody_loves.wav"],
496
+ ["obama.wav", "everybody_loves.wav"],
497
+ ],
498
+ inputs=[prompt_audio, target_audio],
499
+ outputs=[output_audio, prompt_metadata, target_metadata],
500
+ fn=synthesis_function,
501
+ cache_examples=True,
502
+ cache_mode="lazy"
503
+ )
504
+
505
+ # ── Event handlers ──
506
+ prompt_audio.change(
507
+ fn=lambda: None,
508
+ inputs=[],
509
+ outputs=[prompt_metadata],
510
+ )
511
+
512
+ target_audio.change(
513
+ fn=lambda: None,
514
+ inputs=[],
515
+ outputs=[target_metadata],
516
+ )
517
+
518
+ transcription_btn.click(
519
+ fn=transcription_function,
520
+ inputs=[
521
+ prompt_audio, target_audio,
522
+ prompt_metadata, target_metadata,
523
+ prompt_lyric_lang, target_lyric_lang,
524
+ prompt_vocal_sep, target_vocal_sep,
525
+ ],
526
+ outputs=[prompt_metadata, target_metadata],
527
+ )
528
+
529
+ synthesis_btn.click(
530
+ fn=synthesis_function,
531
+ inputs=[
532
+ prompt_audio, target_audio,
533
+ prompt_metadata, target_metadata,
534
+ control_radio, auto_shift, pitch_shift, seed_input,
535
+ prompt_lyric_lang, target_lyric_lang,
536
+ prompt_vocal_sep, target_vocal_sep,
537
+ ],
538
+ outputs=[output_audio, prompt_metadata, target_metadata],
539
+ )
540
 
541
 
542
  def render_interface() -> gr.Blocks:
 
563
  '"></div>'
564
  '</div>'
565
  )
566
+ render_tab_content()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  return page
568
 
569
 
webui_svc.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import sys
3
+ import traceback
4
+ import gc
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Literal
8
+
9
+ import gradio as gr
10
+ import librosa
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+
15
+ import spaces
16
+ from preprocess.pipeline import PreprocessPipeline
17
+ from soulxsinger.utils.file_utils import load_config
18
+ from cli.inference_svc import build_model as build_svc_model, process as svc_process
19
+
20
+
21
+ ROOT = Path(__file__).parent
22
+ SAMPLE_RATE = 44100
23
+ PROMPT_MAX_SEC_DEFAULT = 30
24
+ TARGET_MAX_SEC_DEFAULT = 600
25
+
26
+ # Example rows: only [prompt_audio, target_audio]; other params use UI defaults when running
27
+ EXAMPLE_LIST = [
28
+ [str(ROOT / "example/audio/zh_prompt.mp3"), str(ROOT / "example/audio/zh_target.mp3")],
29
+ [str(ROOT / "example/audio/en_prompt.mp3"), str(ROOT / "example/audio/en_target.mp3")],
30
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/I'm Yours.mp3")],
31
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/传奇.mp3")],
32
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/君が好きだと叫びたい.mp3")],
33
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/富士山下.mp3")],
34
+ ]
35
+
36
+ _I18N = dict(
37
+ display_lang_label=dict(en="Display Language", zh="显示语言"),
38
+ title=dict(en="## SoulX-Singer SVC", zh="## SoulX-Singer SVC"),
39
+ prompt_audio_label=dict(en=f"Prompt audio", zh=f"Prompt 音频"),
40
+ target_audio_label=dict(en=f"Target audio", zh=f"Target 音频"),
41
+ prompt_vocal_sep_label=dict(en="Prompt vocal separation", zh="Prompt 人声分离"),
42
+ target_vocal_sep_label=dict(en="Target vocal separation", zh="Target 人声分离"),
43
+ auto_shift_label=dict(en="Auto pitch shift", zh="自动变调"),
44
+ auto_mix_acc_label=dict(en="Auto mix accompaniment", zh="自动混合伴奏"),
45
+ pitch_shift_label=dict(en="Pitch shift (semitones)", zh="指定变调(半音)"),
46
+ n_step_label=dict(en="diffusion steps", zh="采样步数"),
47
+ cfg_label=dict(en="cfg scale", zh="cfg系数"),
48
+ seed_label=dict(en="Seed", zh="种子"),
49
+ examples_label=dict(en="Examples", zh="示例"),
50
+ run_btn=dict(en="🎤Singing Voice Conversion", zh="🎤歌声转换"),
51
+ output_audio_label=dict(en="Generated audio", zh="合成结果音频"),
52
+ warn_missing_audio=dict(en="Please provide both prompt audio and target audio.", zh="请同时上传 Prompt 与 Target 音频。"),
53
+ instruction_title=dict(en="Usage", zh="使用说明"),
54
+ instruction_p1=dict(
55
+ en="Upload the Prompt and Target audio, and configure the parameters",
56
+ zh="上传 Prompt 与 Target 音频,并配置相关参数",
57
+ ),
58
+ instruction_p2=dict(
59
+ en="Click「🎤Singing Voice Conversion」to start singing voice conversion.",
60
+ zh="点击「🎤歌声转换」开始最终生成。",
61
+ ),
62
+ tips_title=dict(en="Tips", zh="提示"),
63
+ tip_p1=dict(
64
+ en="Input: The Prompt audio is recommended to be a clean and clear singing voice, while the Target audio can be either a pure vocal or a mixture with accompaniment. If the audio contains accompaniment, please check the vocal separation option.",
65
+ zh="输入:Prompt 音频建议是干净清晰的歌声,Target 音频可以是纯歌声或伴奏,这两者若带伴奏需要勾选分离选项",
66
+ ),
67
+ tip_p2=dict(
68
+ en="Pitch shift: When there is a large pitch range difference between the Prompt and Target audio, you can try enabling auto pitch shift or manually adjusting the pitch shift in semitones. When a non-zero pitch shift is specified, auto pitch shift will not take effect. The accompaniment of auto mix will be pitch-shifted together with the vocal (keeping the same octave).",
69
+ zh="变调:Prompt 音频的音域和 Target 音频的音域差距较大的时候,可以尝试开启自动变调或手动调整变调半音数,指定非0的变调半音数时,自动变调不生效,自动混音的伴奏会配合歌声进行升降调(保持同一个八度)",
70
+ ),
71
+ tip_p3=dict(
72
+ en="Model parameters: Generally, a larger number of sampling steps will yield better generation quality but also longer generation time; a larger cfg scale will increase timbre similarity and melody fidelity, but may cause more distortion, it is recommended to take a value between 1 and 3.",
73
+ zh="模型参数:一般采样步数越大,生成质量越好,但生成时间也越长;一般cfg系数越大,音色相似度和旋律保真度越高,但是会造成更多的失真,建议取1~3之间的值",
74
+ ),
75
+ tip_p4=dict(
76
+ en="If you want to convert a long audio or a whole song with large pitch range, there may be instability in the generated voice. You can try converting in segments.",
77
+ zh="长音频或完整歌曲中,音域变化较大的情况有可能出现音色不稳定,可以尝试分段转换",
78
+ )
79
+ )
80
+
81
+ _GLOBAL_LANG: Literal["zh", "en"] = "zh"
82
+
83
+
84
+ def _i18n(key: str) -> str:
85
+ return _I18N[key][_GLOBAL_LANG]
86
+
87
+
88
+ def _print_exception(context: str) -> None:
89
+ print(f"[{context}]\n{traceback.format_exc()}", file=sys.stderr, flush=True)
90
+
91
+
92
+ def _get_device() -> str:
93
+ return "cuda:0" if torch.cuda.is_available() else "cpu"
94
+
95
+
96
+ def _session_dir() -> Path:
97
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
98
+ return ROOT / "outputs" / "gradio" / "svc" / timestamp
99
+
100
+
101
+ def _normalize_audio_input(audio):
102
+ return audio[0] if isinstance(audio, tuple) else audio
103
+
104
+
105
+ def _trim_and_save_audio(src_audio_path: str, dst_wav_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
106
+ audio_data, _ = librosa.load(src_audio_path, sr=sr, mono=True)
107
+ audio_data = audio_data[: max_sec * sr]
108
+ dst_wav_path.parent.mkdir(parents=True, exist_ok=True)
109
+ sf.write(dst_wav_path, audio_data, sr)
110
+
111
+
112
+ def _usage_md() -> str:
113
+ return "\n\n".join([
114
+ f"### {_i18n('instruction_title')}",
115
+ f"**1.** {_i18n('instruction_p1')}",
116
+ f"**2.** {_i18n('instruction_p2')}",
117
+ ])
118
+
119
+
120
+ def _tips_md() -> str:
121
+ return "\n\n".join([
122
+ f"### {_i18n('tips_title')}",
123
+ f"- {_i18n('tip_p1')}",
124
+ f"- {_i18n('tip_p2')}",
125
+ f"- {_i18n('tip_p3')}",
126
+ f"- {_i18n('tip_p4')}",
127
+ ])
128
+
129
+
130
+ class AppState:
131
+ def __init__(self) -> None:
132
+ self.device = _get_device()
133
+ self.preprocess_pipeline = PreprocessPipeline(
134
+ device=self.device,
135
+ language="Mandarin",
136
+ save_dir=str(ROOT / "outputs" / "gradio" / "_placeholder" / "svc"),
137
+ vocal_sep=True,
138
+ max_merge_duration=60000,
139
+ midi_transcribe=False,
140
+ )
141
+
142
+ self.svc_config = load_config("soulxsinger/config/soulxsinger.yaml")
143
+ self.svc_model = build_svc_model(
144
+ model_path="pretrained_models/SoulX-Singer/model-svc.pt",
145
+ config=self.svc_config,
146
+ device=self.device,
147
+ )
148
+
149
+ def run_preprocess(self, audio_path: Path, save_path: Path, vocal_sep: bool) -> tuple[bool, str, Path | None, Path | None]:
150
+ try:
151
+ self.preprocess_pipeline.save_dir = str(save_path)
152
+ self.preprocess_pipeline.run(
153
+ audio_path=str(audio_path),
154
+ vocal_sep=vocal_sep,
155
+ max_merge_duration=60000,
156
+ language="Mandarin",
157
+ )
158
+ vocal_wav = save_path / "vocal.wav"
159
+ vocal_f0 = save_path / "vocal_f0.npy"
160
+ if not vocal_wav.exists() or not vocal_f0.exists():
161
+ return False, f"preprocess output missing: {vocal_wav} or {vocal_f0}", None, None
162
+ return True, "ok", vocal_wav, vocal_f0
163
+ except Exception as e:
164
+ return False, f"preprocess failed: {e}", None, None
165
+
166
+ def run_svc(
167
+ self,
168
+ prompt_wav_path: Path,
169
+ target_wav_path: Path,
170
+ prompt_f0_path: Path,
171
+ target_f0_path: Path,
172
+ session_base: Path,
173
+ auto_shift: bool,
174
+ auto_mix_acc: bool,
175
+ pitch_shift: int,
176
+ n_step: int,
177
+ cfg: float,
178
+ seed: int,
179
+ ) -> tuple[bool, str, Path | None]:
180
+ try:
181
+ torch.manual_seed(seed)
182
+ np.random.seed(seed)
183
+ random.seed(seed)
184
+
185
+ save_dir = session_base / "generated"
186
+ save_dir.mkdir(parents=True, exist_ok=True)
187
+
188
+ class Args:
189
+ pass
190
+
191
+ args = Args()
192
+ args.device = self.device
193
+ args.prompt_wav_path = str(prompt_wav_path)
194
+ args.target_wav_path = str(target_wav_path)
195
+ args.prompt_f0_path = str(prompt_f0_path)
196
+ args.target_f0_path = str(target_f0_path)
197
+ args.save_dir = str(save_dir)
198
+ args.auto_shift = auto_shift
199
+ args.pitch_shift = int(pitch_shift)
200
+ args.n_steps = int(n_step)
201
+ args.cfg = float(cfg)
202
+
203
+ svc_process(args, self.svc_config, self.svc_model)
204
+
205
+ generated = save_dir / "generated.wav"
206
+ if not generated.exists():
207
+ return False, f"inference finished but output not found: {generated}", None
208
+
209
+ if auto_mix_acc:
210
+ acc_path = session_base / "transcriptions" / "target" / "acc.wav"
211
+ if acc_path.exists():
212
+ vocal_shift = args.pitch_shift
213
+ mul = -1 if vocal_shift < 0 else 1
214
+ acc_shift = abs(vocal_shift) % 12
215
+ acc_shift = mul * acc_shift
216
+ if acc_shift > 6:
217
+ acc_shift -= 12
218
+ if acc_shift < -6:
219
+ acc_shift += 12
220
+
221
+ mix_sr = self.svc_config.audio.sample_rate
222
+ vocal, _ = librosa.load(str(generated), sr=mix_sr, mono=True)
223
+ acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
224
+ if acc_shift != 0:
225
+ acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)
226
+ print(f"Applied pitch shift of {acc_shift} semitones to accompaniment to match vocal shift of {vocal_shift} semitones.")
227
+
228
+ mix_len = min(len(vocal), len(acc))
229
+ if mix_len > 0:
230
+ mixed = vocal[:mix_len] + acc[:mix_len]
231
+ peak = float(np.max(np.abs(mixed))) if mixed.size > 0 else 1.0
232
+ if peak > 1.0:
233
+ mixed = mixed / peak
234
+ mixed_path = save_dir / "generated_mixed.wav"
235
+ sf.write(str(mixed_path), mixed, mix_sr)
236
+ generated = mixed_path
237
+
238
+ return True, "svc inference done", generated
239
+ except Exception as e:
240
+ return False, f"svc inference failed: {e}", None
241
+
242
+
243
+ APP_STATE = AppState()
244
+
245
+
246
+ @spaces.GPU
247
+ def _start_svc(
248
+ prompt_audio,
249
+ target_audio,
250
+ prompt_vocal_sep=False,
251
+ target_vocal_sep=True,
252
+ auto_shift=True,
253
+ auto_mix_acc=True,
254
+ pitch_shift=0,
255
+ n_step=32,
256
+ cfg=1.0,
257
+ seed=42
258
+ ):
259
+ try:
260
+ prompt_audio = _normalize_audio_input(prompt_audio)
261
+ target_audio = _normalize_audio_input(target_audio)
262
+ if not prompt_audio or not target_audio:
263
+ gr.Warning(_i18n("warn_missing_audio"))
264
+ return None
265
+
266
+ session_base = _session_dir()
267
+ audio_dir = session_base / "audio"
268
+ prompt_raw = audio_dir / "prompt.wav"
269
+ target_raw = audio_dir / "target.wav"
270
+ _trim_and_save_audio(prompt_audio, prompt_raw, PROMPT_MAX_SEC_DEFAULT)
271
+ _trim_and_save_audio(target_audio, target_raw, TARGET_MAX_SEC_DEFAULT)
272
+
273
+ prompt_ok, prompt_msg, prompt_wav, prompt_f0 = APP_STATE.run_preprocess(
274
+ audio_path=prompt_raw,
275
+ save_path=session_base / "transcriptions" / "prompt",
276
+ vocal_sep=bool(prompt_vocal_sep),
277
+ )
278
+ if not prompt_ok or prompt_wav is None or prompt_f0 is None:
279
+ print(prompt_msg, file=sys.stderr, flush=True)
280
+ return None
281
+
282
+ target_ok, target_msg, target_wav, target_f0 = APP_STATE.run_preprocess(
283
+ audio_path=target_raw,
284
+ save_path=session_base / "transcriptions" / "target",
285
+ vocal_sep=bool(target_vocal_sep),
286
+ )
287
+ if not target_ok or target_wav is None or target_f0 is None:
288
+ print(target_msg, file=sys.stderr, flush=True)
289
+ return None
290
+
291
+ ok, msg, generated = APP_STATE.run_svc(
292
+ prompt_wav_path=prompt_wav,
293
+ target_wav_path=target_wav,
294
+ prompt_f0_path=prompt_f0,
295
+ target_f0_path=target_f0,
296
+ session_base=session_base,
297
+ auto_shift=bool(auto_shift),
298
+ auto_mix_acc=bool(auto_mix_acc),
299
+ pitch_shift=int(pitch_shift),
300
+ n_step=int(n_step),
301
+ cfg=float(cfg),
302
+ seed=int(seed),
303
+ )
304
+ if not ok or generated is None:
305
+ print(msg, file=sys.stderr, flush=True)
306
+ return None
307
+ return str(generated)
308
+ except Exception:
309
+ _print_exception("_start_svc")
310
+ return None
311
+ finally:
312
+ gc.collect()
313
+ if torch.cuda.is_available():
314
+ torch.cuda.empty_cache()
315
+
316
+
317
+ def render_tab_content() -> None:
318
+ """Render SVC tab content (for embedding in app.py). Same UI style as webui: two columns, no title."""
319
+ with gr.Row(equal_height=False):
320
+ # ── Left column: inputs & controls ──
321
+ with gr.Column(scale=1):
322
+ prompt_audio = gr.Audio(
323
+ label="Prompt audio (reference voice)",
324
+ type="filepath",
325
+ interactive=True,
326
+ )
327
+ target_audio = gr.Audio(
328
+ label="Target audio (to convert)",
329
+ type="filepath",
330
+ interactive=True,
331
+ )
332
+
333
+ run_btn = gr.Button(
334
+ value="🎤 Singing Voice Conversion",
335
+ variant="primary",
336
+ size="lg",
337
+ )
338
+
339
+ with gr.Accordion("Advanced settings", open=False):
340
+ with gr.Row():
341
+ prompt_vocal_sep = gr.Checkbox(label="Prompt vocal separation", value=False, scale=1)
342
+ target_vocal_sep = gr.Checkbox(label="Target vocal separation", value=True, scale=1)
343
+ with gr.Row():
344
+ auto_shift = gr.Checkbox(label="Auto pitch shift", value=True, scale=1)
345
+ auto_mix_acc = gr.Checkbox(label="Auto mix accompaniment", value=True, scale=1)
346
+ pitch_shift = gr.Slider(label="Pitch shift (semitones)", value=0, minimum=-36, maximum=36, step=1)
347
+ n_step = gr.Slider(label="n_step", value=32, minimum=1, maximum=200, step=1)
348
+ cfg = gr.Slider(label="cfg scale", value=1.0, minimum=0.0, maximum=10.0, step=0.1)
349
+ seed_input = gr.Slider(label="Seed", value=42, minimum=0, maximum=10000, step=1)
350
+
351
+ # ── Right column: output ──
352
+ with gr.Column(scale=1):
353
+ output_audio = gr.Audio(label="Generated audio", type="filepath", interactive=False)
354
+ gr.Examples(
355
+ examples=EXAMPLE_LIST,
356
+ inputs=[prompt_audio, target_audio],
357
+ outputs=[output_audio],
358
+ fn=_start_svc,
359
+ cache_examples=True,
360
+ cache_mode="lazy",
361
+ )
362
+
363
+ run_btn.click(
364
+ fn=_start_svc,
365
+ inputs=[
366
+ prompt_audio,
367
+ target_audio,
368
+ prompt_vocal_sep,
369
+ target_vocal_sep,
370
+ auto_shift,
371
+ auto_mix_acc,
372
+ pitch_shift,
373
+ n_step,
374
+ cfg,
375
+ seed_input,
376
+ ],
377
+ outputs=[output_audio],
378
+ )
379
+
380
+
381
+ def render_interface() -> gr.Blocks:
382
+ with gr.Blocks(title="SoulX-Singer", theme=gr.themes.Default()) as page:
383
+ gr.HTML(
384
+ '<div style="'
385
+ 'text-align: center; '
386
+ 'padding: 1.25rem 0 1.5rem; '
387
+ 'margin-bottom: 0.5rem;'
388
+ '">'
389
+ '<div style="'
390
+ 'display: inline-block; '
391
+ 'font-size: 1.75rem; '
392
+ 'font-weight: 700; '
393
+ 'letter-spacing: 0.02em; '
394
+ 'line-height: 1.3;'
395
+ '">SoulX-Singer</div>'
396
+ '<div style="'
397
+ 'width: 80px; '
398
+ 'height: 3px; '
399
+ 'margin: 1rem auto 0; '
400
+ 'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
401
+ 'border-radius: 2px;'
402
+ '"></div>'
403
+ '</div>'
404
+ )
405
+ render_tab_content()
406
+ return page
407
+
408
+
409
+ if __name__ == "__main__":
410
+ import argparse
411
+
412
+ parser = argparse.ArgumentParser()
413
+ parser.add_argument("--port", type=int, default=7861, help="Gradio server port")
414
+ parser.add_argument("--share", action="store_true", help="Create public link")
415
+ args = parser.parse_args()
416
+
417
+ page = render_interface()
418
+ page.queue()
419
+ page.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)