Spaces:
Paused
Paused
feat: add svc inference code and webui
Browse files- .gitattributes +2 -0
- app.py +32 -2
- cli/inference_svc.py +105 -0
- ensure_models.py +2 -2
- example/audio/en_prompt.mp3 +0 -0
- example/audio/en_target.mp3 +0 -0
- example/audio/music_f0.npy +3 -0
- example/audio/svc_prompt_demo.mp3 +3 -0
- example/audio/svc_target_demo.mp3 +3 -0
- example/audio/svc_webui/I'm Yours.mp3 +3 -0
- example/audio/svc_webui/Sun Yanzi.mp3 +3 -0
- example/audio/svc_webui/传奇.mp3 +3 -0
- example/audio/svc_webui/君が好きだと叫びたい.mp3 +3 -0
- example/audio/svc_webui/富士山下.mp3 +3 -0
- example/audio/zh_prompt.mp3 +0 -0
- example/audio/zh_prompt_f0.npy +3 -0
- example/audio/zh_target.mp3 +0 -0
- example/infer_svc.sh +27 -0
- example/preprocess.sh +6 -2
- preprocess/pipeline.py +35 -20
- soulxsinger/models/modules/whisper_encoder.py +74 -0
- soulxsinger/models/soulxsinger_svc.py +319 -0
- webui.py +186 -174
- webui_svc.py +419 -0
.gitattributes
CHANGED
|
@@ -44,3 +44,5 @@ raven.wav filter=lfs diff=lfs merge=lfs -text
|
|
| 44 |
anita.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
everybody_loves.wav filter=lfs diff=lfs merge=lfs -text
|
| 46 |
obama.wav filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 44 |
anita.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
everybody_loves.wav filter=lfs diff=lfs merge=lfs -text
|
| 46 |
obama.wav filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
*.mp3 filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -18,9 +18,39 @@ if __name__ == "__main__":
|
|
| 18 |
os.chdir(ROOT)
|
| 19 |
ensure_pretrained_models()
|
| 20 |
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
page = render_interface()
|
| 24 |
page.queue()
|
| 25 |
page.launch(
|
| 26 |
server_name="0.0.0.0",
|
|
|
|
| 18 |
os.chdir(ROOT)
|
| 19 |
ensure_pretrained_models()
|
| 20 |
|
| 21 |
+
import gradio as gr
|
| 22 |
+
from webui import render_tab_content as render_svs_tab
|
| 23 |
+
from webui_svc import render_tab_content as render_svc_tab
|
| 24 |
+
|
| 25 |
+
with gr.Blocks(title="SoulX-Singer", theme=gr.themes.Default()) as page:
|
| 26 |
+
gr.HTML(
|
| 27 |
+
'<div style="'
|
| 28 |
+
'text-align: center; '
|
| 29 |
+
'padding: 1.25rem 0 1.5rem; '
|
| 30 |
+
'margin-bottom: 0.5rem;'
|
| 31 |
+
'">'
|
| 32 |
+
'<div style="'
|
| 33 |
+
'display: inline-block; '
|
| 34 |
+
'font-size: 1.75rem; '
|
| 35 |
+
'font-weight: 700; '
|
| 36 |
+
'letter-spacing: 0.02em; '
|
| 37 |
+
'line-height: 1.3;'
|
| 38 |
+
'">SoulX-Singer</div>'
|
| 39 |
+
'<div style="'
|
| 40 |
+
'width: 80px; '
|
| 41 |
+
'height: 3px; '
|
| 42 |
+
'margin: 1rem auto 0; '
|
| 43 |
+
'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
|
| 44 |
+
'border-radius: 2px;'
|
| 45 |
+
'"></div>'
|
| 46 |
+
'</div>'
|
| 47 |
+
)
|
| 48 |
+
with gr.Tabs():
|
| 49 |
+
with gr.Tab("Singing Voice Synthesis"):
|
| 50 |
+
render_svs_tab()
|
| 51 |
+
with gr.Tab("Singing Voice Conversion"):
|
| 52 |
+
render_svc_tab()
|
| 53 |
|
|
|
|
| 54 |
page.queue()
|
| 55 |
page.launch(
|
| 56 |
server_name="0.0.0.0",
|
cli/inference_svc.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import json
|
| 4 |
+
import argparse
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
import numpy as np
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
from collections import OrderedDict
|
| 9 |
+
from omegaconf import DictConfig
|
| 10 |
+
|
| 11 |
+
from soulxsinger.utils.file_utils import load_config
|
| 12 |
+
from soulxsinger.models.soulxsinger_svc import SoulXSingerSVC
|
| 13 |
+
from soulxsinger.utils.audio_utils import load_wav
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def build_model(
|
| 17 |
+
model_path: str,
|
| 18 |
+
config: DictConfig,
|
| 19 |
+
device: str = "cuda",
|
| 20 |
+
):
|
| 21 |
+
"""
|
| 22 |
+
Build the model from the pre-trained model path and model configuration.
|
| 23 |
+
|
| 24 |
+
Args:
|
| 25 |
+
model_path (str): Path to the checkpoint file.
|
| 26 |
+
config (DictConfig): Model configuration.
|
| 27 |
+
device (str, optional): Device to use. Defaults to "cuda".
|
| 28 |
+
|
| 29 |
+
Returns:
|
| 30 |
+
Tuple[torch.nn.Module, torch.nn.Module]: The initialized model and vocoder.
|
| 31 |
+
"""
|
| 32 |
+
|
| 33 |
+
if not os.path.isfile(model_path):
|
| 34 |
+
raise FileNotFoundError(
|
| 35 |
+
f"Model checkpoint not found: {model_path}. "
|
| 36 |
+
"Please download the pretrained model and place it at the path, or set --model_path."
|
| 37 |
+
)
|
| 38 |
+
model = SoulXSingerSVC(config).to(device)
|
| 39 |
+
print("Model initialized.")
|
| 40 |
+
print("Model parameters:", sum(p.numel() for p in model.parameters()) / 1e6, "M")
|
| 41 |
+
|
| 42 |
+
checkpoint = torch.load(model_path, weights_only=False, map_location=device)
|
| 43 |
+
if "state_dict" not in checkpoint:
|
| 44 |
+
raise KeyError(
|
| 45 |
+
f"Checkpoint at {model_path} has no 'state_dict' key. "
|
| 46 |
+
"Expected a checkpoint saved with model.state_dict()."
|
| 47 |
+
)
|
| 48 |
+
model.load_state_dict(checkpoint["state_dict"], strict=True)
|
| 49 |
+
|
| 50 |
+
model.eval()
|
| 51 |
+
model.to(device)
|
| 52 |
+
print("Model checkpoint loaded.")
|
| 53 |
+
|
| 54 |
+
return model
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def process(args, config, model: torch.nn.Module):
|
| 58 |
+
"""Run the full inference pipeline given a data_processor and model.
|
| 59 |
+
"""
|
| 60 |
+
|
| 61 |
+
os.makedirs(args.save_dir, exist_ok=True)
|
| 62 |
+
pt_wav = load_wav(args.prompt_wav_path, config.audio.sample_rate).to(args.device)
|
| 63 |
+
gt_wav = load_wav(args.target_wav_path, config.audio.sample_rate).to(args.device)
|
| 64 |
+
pt_f0 = torch.from_numpy(np.load(args.prompt_f0_path)).unsqueeze(0).to(args.device)
|
| 65 |
+
gt_f0 = torch.from_numpy(np.load(args.target_f0_path)).unsqueeze(0).to(args.device)
|
| 66 |
+
|
| 67 |
+
n_step = args.n_steps if hasattr(args, "n_steps") else config.infer.n_steps
|
| 68 |
+
cfg = args.cfg if hasattr(args, "cfg") else config.infer.cfg
|
| 69 |
+
|
| 70 |
+
generated_audio, generated_shift = model.infer(pt_wav, gt_wav, pt_f0, gt_f0, auto_shift=args.auto_shift, pitch_shift=args.pitch_shift, n_steps=n_step, cfg=cfg)
|
| 71 |
+
generated_audio = generated_audio.squeeze().cpu().numpy()
|
| 72 |
+
if args.pitch_shift != generated_shift:
|
| 73 |
+
args.pitch_shift = generated_shift
|
| 74 |
+
# print(f"Applied pitch shift of {generated_shift} semitones to match GT F0 contour.")
|
| 75 |
+
|
| 76 |
+
sf.write(os.path.join(args.save_dir, "generated.wav"), generated_audio, config.audio.sample_rate)
|
| 77 |
+
print(f"Generated audio saved to {os.path.join(args.save_dir, 'generated.wav')}")
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def main(args, config):
|
| 81 |
+
model = build_model(
|
| 82 |
+
model_path=args.model_path,
|
| 83 |
+
config=config,
|
| 84 |
+
device=args.device,
|
| 85 |
+
)
|
| 86 |
+
process(args, config, model)
|
| 87 |
+
|
| 88 |
+
if __name__ == "__main__":
|
| 89 |
+
parser = argparse.ArgumentParser()
|
| 90 |
+
parser.add_argument("--device", type=str, default="cuda")
|
| 91 |
+
parser.add_argument("--model_path", type=str, default='pretrained_models/soulx-singer/model.pt')
|
| 92 |
+
parser.add_argument("--config", type=str, default='soulxsinger/config/soulxsinger.yaml')
|
| 93 |
+
parser.add_argument("--prompt_wav_path", type=str, default='example/audio/zh_prompt.wav')
|
| 94 |
+
parser.add_argument("--target_wav_path", type=str, default='example/audio/zh_target.wav')
|
| 95 |
+
parser.add_argument("--prompt_f0_path", type=str, default='example/audio/zh_prompt_f0.npy')
|
| 96 |
+
parser.add_argument("--target_f0_path", type=str, default='example/audio/zh_target_f0.npy')
|
| 97 |
+
parser.add_argument("--save_dir", type=str, default='outputs')
|
| 98 |
+
parser.add_argument("--auto_shift", action="store_true")
|
| 99 |
+
parser.add_argument("--pitch_shift", type=int, default=0)
|
| 100 |
+
parser.add_argument("--n_steps", type=int, default=32)
|
| 101 |
+
parser.add_argument("--cfg", type=float, default=3.0)
|
| 102 |
+
args = parser.parse_args()
|
| 103 |
+
|
| 104 |
+
config = load_config(args.config)
|
| 105 |
+
main(args, config)
|
ensure_models.py
CHANGED
|
@@ -10,7 +10,7 @@ MODEL_DIR_PREPROCESS = PRETRAINED_DIR / "SoulX-Singer-Preprocess"
|
|
| 10 |
|
| 11 |
def ensure_pretrained_models():
|
| 12 |
"""Download SoulX-Singer and Preprocess models from Hugging Face Hub if not present."""
|
| 13 |
-
if (MODEL_DIR_SVS / "model.pt").exists() and MODEL_DIR_PREPROCESS.exists():
|
| 14 |
print("Pretrained models already present, skipping download.", flush=True)
|
| 15 |
return
|
| 16 |
|
|
@@ -26,7 +26,7 @@ def ensure_pretrained_models():
|
|
| 26 |
|
| 27 |
PRETRAINED_DIR.mkdir(parents=True, exist_ok=True)
|
| 28 |
|
| 29 |
-
if not (MODEL_DIR_SVS / "model.pt").exists():
|
| 30 |
print("Downloading SoulX-Singer model...", flush=True)
|
| 31 |
snapshot_download(
|
| 32 |
repo_id="Soul-AILab/SoulX-Singer",
|
|
|
|
| 10 |
|
| 11 |
def ensure_pretrained_models():
|
| 12 |
"""Download SoulX-Singer and Preprocess models from Hugging Face Hub if not present."""
|
| 13 |
+
if (MODEL_DIR_SVS / "model.pt").exists() and (MODEL_DIR_SVS / "model-svc.pt").exists() and MODEL_DIR_PREPROCESS.exists():
|
| 14 |
print("Pretrained models already present, skipping download.", flush=True)
|
| 15 |
return
|
| 16 |
|
|
|
|
| 26 |
|
| 27 |
PRETRAINED_DIR.mkdir(parents=True, exist_ok=True)
|
| 28 |
|
| 29 |
+
if not (MODEL_DIR_SVS / "model.pt").exists() or not (MODEL_DIR_SVS / "model-svc.pt").exists():
|
| 30 |
print("Downloading SoulX-Singer model...", flush=True)
|
| 31 |
snapshot_download(
|
| 32 |
repo_id="Soul-AILab/SoulX-Singer",
|
example/audio/en_prompt.mp3
CHANGED
|
Binary files a/example/audio/en_prompt.mp3 and b/example/audio/en_prompt.mp3 differ
|
|
|
example/audio/en_target.mp3
CHANGED
|
Binary files a/example/audio/en_target.mp3 and b/example/audio/en_target.mp3 differ
|
|
|
example/audio/music_f0.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a091dce0ab269093a455f8959222f8c7fb55e8d9c9477e8cd2cde8eb9279d9ef
|
| 3 |
+
size 20720
|
example/audio/svc_prompt_demo.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0dde83f7ff5ef5ad52939db70bd1324b6247ea4f399e60e0393cc18725cf29c3
|
| 3 |
+
size 41187
|
example/audio/svc_target_demo.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c65c2e5fec64a51c613badcce35145b6f8e2bb33907ee7428275bfb918876a2c
|
| 3 |
+
size 1944155
|
example/audio/svc_webui/I'm Yours.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c65c2e5fec64a51c613badcce35145b6f8e2bb33907ee7428275bfb918876a2c
|
| 3 |
+
size 1944155
|
example/audio/svc_webui/Sun Yanzi.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0dde83f7ff5ef5ad52939db70bd1324b6247ea4f399e60e0393cc18725cf29c3
|
| 3 |
+
size 41187
|
example/audio/svc_webui/传奇.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2dfec7ebb41dd6c56877fdeddf7a5fdc106ea9c2fdb1c06f6adddc6f89e6285e
|
| 3 |
+
size 4738948
|
example/audio/svc_webui/君が好きだと叫びたい.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1fe990727559bf1ffb548c562b6c3b19f16602e3c147da42bf56fc92129ae35e
|
| 3 |
+
size 3706589
|
example/audio/svc_webui/富士山下.mp3
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1cad7eaabe05f1c6ef1994bf4326cdafc79991e1c647a857fa2f64925e84aab
|
| 3 |
+
size 4147219
|
example/audio/zh_prompt.mp3
CHANGED
|
Binary files a/example/audio/zh_prompt.mp3 and b/example/audio/zh_prompt.mp3 differ
|
|
|
example/audio/zh_prompt_f0.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aecf5f40c16a6390e8bb8c19ce69120dcbedaea5e4051aba1bdde95a024f29d3
|
| 3 |
+
size 4408
|
example/audio/zh_target.mp3
CHANGED
|
Binary files a/example/audio/zh_target.mp3 and b/example/audio/zh_target.mp3 differ
|
|
|
example/infer_svc.sh
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
script_dir=$(dirname "$(realpath "$0")")
|
| 4 |
+
root_dir=$(dirname "$script_dir")
|
| 5 |
+
|
| 6 |
+
cd $root_dir || exit
|
| 7 |
+
export PYTHONPATH=$root_dir:$PYTHONPATH
|
| 8 |
+
|
| 9 |
+
model_path=pretrained_models/SoulX-Singer/model-svc.pt
|
| 10 |
+
config=soulxsinger/config/soulxsinger.yaml
|
| 11 |
+
prompt_wav_path=example/audio/zh_prompt.mp3
|
| 12 |
+
target_wav_path=example/audio/music.mp3
|
| 13 |
+
prompt_f0_path=example/audio/zh_prompt_f0.npy
|
| 14 |
+
target_f0_path=example/audio/music_f0.npy
|
| 15 |
+
save_dir=example/generated/music_svc
|
| 16 |
+
|
| 17 |
+
python -m cli.inference_svc \
|
| 18 |
+
--device cuda \
|
| 19 |
+
--model_path $model_path \
|
| 20 |
+
--config $config \
|
| 21 |
+
--prompt_wav_path $prompt_wav_path \
|
| 22 |
+
--target_wav_path $target_wav_path \
|
| 23 |
+
--prompt_f0_path $prompt_f0_path \
|
| 24 |
+
--target_f0_path $target_f0_path \
|
| 25 |
+
--save_dir $save_dir \
|
| 26 |
+
--auto_shift \
|
| 27 |
+
--pitch_shift 0
|
example/preprocess.sh
CHANGED
|
@@ -15,6 +15,7 @@ save_dir=example/transcriptions/zh_prompt
|
|
| 15 |
language=Mandarin
|
| 16 |
vocal_sep=False
|
| 17 |
max_merge_duration=30000
|
|
|
|
| 18 |
|
| 19 |
python -m preprocess.pipeline \
|
| 20 |
--audio_path $audio_path \
|
|
@@ -22,7 +23,8 @@ python -m preprocess.pipeline \
|
|
| 22 |
--language $language \
|
| 23 |
--device $device \
|
| 24 |
--vocal_sep $vocal_sep \
|
| 25 |
-
--max_merge_duration $max_merge_duration
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
####### Run Target Annotation #######
|
|
@@ -31,6 +33,7 @@ save_dir=example/transcriptions/music
|
|
| 31 |
language=Mandarin
|
| 32 |
vocal_sep=True
|
| 33 |
max_merge_duration=60000
|
|
|
|
| 34 |
|
| 35 |
python -m preprocess.pipeline \
|
| 36 |
--audio_path $audio_path \
|
|
@@ -38,4 +41,5 @@ python -m preprocess.pipeline \
|
|
| 38 |
--language $language \
|
| 39 |
--device $device \
|
| 40 |
--vocal_sep $vocal_sep \
|
| 41 |
-
--max_merge_duration $max_merge_duration
|
|
|
|
|
|
| 15 |
language=Mandarin
|
| 16 |
vocal_sep=False
|
| 17 |
max_merge_duration=30000
|
| 18 |
+
midi_transcribe=True # Whether to transcribe vocal midi, set True for singing voice synthesis, False for singing voice conversion
|
| 19 |
|
| 20 |
python -m preprocess.pipeline \
|
| 21 |
--audio_path $audio_path \
|
|
|
|
| 23 |
--language $language \
|
| 24 |
--device $device \
|
| 25 |
--vocal_sep $vocal_sep \
|
| 26 |
+
--max_merge_duration $max_merge_duration \
|
| 27 |
+
--midi_transcribe $midi_transcribe
|
| 28 |
|
| 29 |
|
| 30 |
####### Run Target Annotation #######
|
|
|
|
| 33 |
language=Mandarin
|
| 34 |
vocal_sep=True
|
| 35 |
max_merge_duration=60000
|
| 36 |
+
midi_transcribe=True # Whether to transcribe vocal midi, set True for singing voice synthesis, False for singing voice conversion
|
| 37 |
|
| 38 |
python -m preprocess.pipeline \
|
| 39 |
--audio_path $audio_path \
|
|
|
|
| 41 |
--language $language \
|
| 42 |
--device $device \
|
| 43 |
--vocal_sep $vocal_sep \
|
| 44 |
+
--max_merge_duration $max_merge_duration \
|
| 45 |
+
--midi_transcribe $midi_transcribe
|
preprocess/pipeline.py
CHANGED
|
@@ -16,12 +16,13 @@ from preprocess.tools import (
|
|
| 16 |
|
| 17 |
|
| 18 |
class PreprocessPipeline:
|
| 19 |
-
def __init__(self, device: str, language: str, save_dir: str, vocal_sep: bool = True, max_merge_duration: int = 60000):
|
| 20 |
self.device = device
|
| 21 |
self.language = language
|
| 22 |
self.save_dir = save_dir
|
| 23 |
self.vocal_sep = vocal_sep
|
| 24 |
self.max_merge_duration = max_merge_duration
|
|
|
|
| 25 |
|
| 26 |
if vocal_sep:
|
| 27 |
self.vocal_separator = VocalSeparator(
|
|
@@ -37,26 +38,31 @@ class PreprocessPipeline:
|
|
| 37 |
model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
|
| 38 |
device=device,
|
| 39 |
)
|
| 40 |
-
self.
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
def run(
|
| 55 |
self,
|
| 56 |
audio_path: str,
|
| 57 |
-
vocal_sep: bool =
|
| 58 |
-
max_merge_duration: int =
|
| 59 |
-
language: str =
|
| 60 |
) -> None:
|
| 61 |
vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
|
| 62 |
max_merge_duration = self.max_merge_duration if max_merge_duration is None else max_merge_duration
|
|
@@ -81,7 +87,11 @@ class PreprocessPipeline:
|
|
| 81 |
vocal_path = output_dir / "vocal.wav"
|
| 82 |
sf.write(vocal_path, vocal, sample_rate)
|
| 83 |
|
| 84 |
-
vocal_f0 = self.f0_extractor.process(str(vocal_path))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)
|
| 86 |
|
| 87 |
metadata = []
|
|
@@ -124,10 +134,11 @@ def main(args):
|
|
| 124 |
save_dir=args.save_dir,
|
| 125 |
vocal_sep=args.vocal_sep,
|
| 126 |
max_merge_duration=args.max_merge_duration,
|
|
|
|
| 127 |
)
|
| 128 |
pipeline.run(
|
| 129 |
audio_path=args.audio_path,
|
| 130 |
-
language=args.language
|
| 131 |
)
|
| 132 |
|
| 133 |
|
|
@@ -139,8 +150,12 @@ if __name__ == "__main__":
|
|
| 139 |
parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
|
| 140 |
parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
|
| 141 |
parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
|
| 142 |
-
parser.add_argument("--vocal_sep", type=
|
| 143 |
parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
|
|
|
|
| 144 |
args = parser.parse_args()
|
| 145 |
|
|
|
|
|
|
|
|
|
|
| 146 |
main(args)
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class PreprocessPipeline:
|
| 19 |
+
def __init__(self, device: str, language: str, save_dir: str, vocal_sep: bool = True, max_merge_duration: int = 60000, midi_transcribe: bool = True):
|
| 20 |
self.device = device
|
| 21 |
self.language = language
|
| 22 |
self.save_dir = save_dir
|
| 23 |
self.vocal_sep = vocal_sep
|
| 24 |
self.max_merge_duration = max_merge_duration
|
| 25 |
+
self.midi_transcribe = midi_transcribe
|
| 26 |
|
| 27 |
if vocal_sep:
|
| 28 |
self.vocal_separator = VocalSeparator(
|
|
|
|
| 38 |
model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
|
| 39 |
device=device,
|
| 40 |
)
|
| 41 |
+
if self.midi_transcribe:
|
| 42 |
+
self.vocal_detector = VocalDetector(
|
| 43 |
+
cut_wavs_output_dir= f"{save_dir}/cut_wavs",
|
| 44 |
+
)
|
| 45 |
+
self.lyric_transcriber = LyricTranscriber(
|
| 46 |
+
zh_model_path="pretrained_models/SoulX-Singer-Preprocess/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
|
| 47 |
+
en_model_path="pretrained_models/SoulX-Singer-Preprocess/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
|
| 48 |
+
device=device
|
| 49 |
+
)
|
| 50 |
+
self.note_transcriber = NoteTranscriber(
|
| 51 |
+
rosvot_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rosvot/model.pt",
|
| 52 |
+
rwbd_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rwbd/model.pt",
|
| 53 |
+
device=device
|
| 54 |
+
)
|
| 55 |
+
else:
|
| 56 |
+
self.vocal_detector = None
|
| 57 |
+
self.lyric_transcriber = None
|
| 58 |
+
self.note_transcriber = None
|
| 59 |
|
| 60 |
def run(
|
| 61 |
self,
|
| 62 |
audio_path: str,
|
| 63 |
+
vocal_sep: bool = None,
|
| 64 |
+
max_merge_duration: int = None,
|
| 65 |
+
language: str = None,
|
| 66 |
) -> None:
|
| 67 |
vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
|
| 68 |
max_merge_duration = self.max_merge_duration if max_merge_duration is None else max_merge_duration
|
|
|
|
| 87 |
vocal_path = output_dir / "vocal.wav"
|
| 88 |
sf.write(vocal_path, vocal, sample_rate)
|
| 89 |
|
| 90 |
+
vocal_f0 = self.f0_extractor.process(str(vocal_path), f0_path=str(vocal_path).replace(".wav", "_f0.npy"))
|
| 91 |
+
|
| 92 |
+
if not self.midi_transcribe or self.vocal_detector is None or self.lyric_transcriber is None or self.note_transcriber is None:
|
| 93 |
+
return
|
| 94 |
+
|
| 95 |
segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)
|
| 96 |
|
| 97 |
metadata = []
|
|
|
|
| 134 |
save_dir=args.save_dir,
|
| 135 |
vocal_sep=args.vocal_sep,
|
| 136 |
max_merge_duration=args.max_merge_duration,
|
| 137 |
+
midi_transcribe=args.midi_transcribe,
|
| 138 |
)
|
| 139 |
pipeline.run(
|
| 140 |
audio_path=args.audio_path,
|
| 141 |
+
language=args.language,
|
| 142 |
)
|
| 143 |
|
| 144 |
|
|
|
|
| 150 |
parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
|
| 151 |
parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
|
| 152 |
parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
|
| 153 |
+
parser.add_argument("--vocal_sep", type=str, default="True", help="Whether to perform vocal separation")
|
| 154 |
parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
|
| 155 |
+
parser.add_argument("--midi_transcribe", type=str, default="True", help="Whether to do MIDI transcription")
|
| 156 |
args = parser.parse_args()
|
| 157 |
|
| 158 |
+
args.vocal_sep = args.vocal_sep.lower() == "true"
|
| 159 |
+
args.midi_transcribe = args.midi_transcribe.lower() == "true"
|
| 160 |
+
|
| 161 |
main(args)
|
soulxsinger/models/modules/whisper_encoder.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Frozen Whisper encoder wrapper (wav -> encoder embeddings)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
import torchaudio
|
| 10 |
+
from transformers import WhisperFeatureExtractor, WhisperModel
|
| 11 |
+
|
| 12 |
+
WHISPER_MEL_FRAMES = 3000 # 3000 frames at 16000 Hz
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class WhisperEncoder():
|
| 16 |
+
|
| 17 |
+
def __init__(
|
| 18 |
+
self,
|
| 19 |
+
device: Optional[str] = None,
|
| 20 |
+
) -> None:
|
| 21 |
+
self.fe = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
|
| 22 |
+
self.model = WhisperModel.from_pretrained("openai/whisper-base")
|
| 23 |
+
self.model = self.model.to(device or ("cuda" if torch.cuda.is_available() else "cpu"))
|
| 24 |
+
|
| 25 |
+
def encode(
|
| 26 |
+
self,
|
| 27 |
+
wav: torch.Tensor,
|
| 28 |
+
sr: int,
|
| 29 |
+
) -> torch.Tensor:
|
| 30 |
+
wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=self.fe.sampling_rate) if sr != self.fe.sampling_rate else wav
|
| 31 |
+
wav_np = wav.cpu().detach().numpy().astype("float32", copy=False)
|
| 32 |
+
|
| 33 |
+
inputs = self.fe(
|
| 34 |
+
wav_np,
|
| 35 |
+
sampling_rate=self.fe.sampling_rate,
|
| 36 |
+
return_tensors="pt",
|
| 37 |
+
padding=False,
|
| 38 |
+
truncation=False,
|
| 39 |
+
return_attention_mask=True,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
input_features = inputs.input_features
|
| 43 |
+
num_frames = input_features.shape[-1]
|
| 44 |
+
if num_frames < WHISPER_MEL_FRAMES:
|
| 45 |
+
pad = WHISPER_MEL_FRAMES - num_frames
|
| 46 |
+
input_features = torch.nn.functional.pad(input_features, (0, pad))
|
| 47 |
+
else:
|
| 48 |
+
input_features = input_features[..., :WHISPER_MEL_FRAMES]
|
| 49 |
+
|
| 50 |
+
input_features = input_features.to(wav.device)
|
| 51 |
+
if self.model.device != wav.device:
|
| 52 |
+
self.model = self.model.to(wav.device)
|
| 53 |
+
attention_mask = inputs.attention_mask.to(wav.device) if inputs.attention_mask is not None else None
|
| 54 |
+
|
| 55 |
+
encoder_out = self.model.encoder(input_features).last_hidden_state
|
| 56 |
+
|
| 57 |
+
if attention_mask is not None:
|
| 58 |
+
valid_mel_frames = attention_mask.sum(dim=1)
|
| 59 |
+
valid_enc_frames = (valid_mel_frames + 1) // 2
|
| 60 |
+
max_valid_enc_frames = min(int(valid_enc_frames.max().item()), encoder_out.shape[1])
|
| 61 |
+
encoder_out = encoder_out[:, :max_valid_enc_frames, :]
|
| 62 |
+
valid_len = min(int(valid_enc_frames[0].item()), max_valid_enc_frames)
|
| 63 |
+
if valid_len < max_valid_enc_frames:
|
| 64 |
+
encoder_out[0, valid_len:, :] = 0
|
| 65 |
+
|
| 66 |
+
return encoder_out
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
torch.manual_seed(0)
|
| 71 |
+
audio = torch.randn(1, 24000 * 25).float().to("cuda")
|
| 72 |
+
encoder = WhisperEncoder()
|
| 73 |
+
whisper_encoder_out = encoder.encode(audio, sr=24000)
|
| 74 |
+
print(whisper_encoder_out.shape)
|
soulxsinger/models/soulxsinger_svc.py
ADDED
|
@@ -0,0 +1,319 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import numpy as np
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
from typing import Optional, Dict, Any, List, Tuple
|
| 7 |
+
|
| 8 |
+
from soulxsinger.models.modules.vocoder import Vocoder
|
| 9 |
+
from soulxsinger.models.modules.decoder import CFMDecoder
|
| 10 |
+
from soulxsinger.models.modules.mel_transform import MelSpectrogramEncoder
|
| 11 |
+
from soulxsinger.models.modules.whisper_encoder import WhisperEncoder
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SoulXSingerSVC(nn.Module):
|
| 15 |
+
"""
|
| 16 |
+
SoulXSinger SVC model.
|
| 17 |
+
"""
|
| 18 |
+
def __init__(self, config: Dict):
|
| 19 |
+
super(SoulXSingerSVC, self).__init__()
|
| 20 |
+
self.audio_cfg = config.audio
|
| 21 |
+
enc_cfg = config.model.encoder
|
| 22 |
+
cfm_cfg = config.model.flow_matching
|
| 23 |
+
|
| 24 |
+
self.whisper_encoder = WhisperEncoder()
|
| 25 |
+
self.f0_encoder = nn.Embedding(enc_cfg["f0_bin"], enc_cfg["f0_dim"])
|
| 26 |
+
self.cfm_decoder = CFMDecoder(cfm_cfg)
|
| 27 |
+
|
| 28 |
+
self.mel = MelSpectrogramEncoder(self.audio_cfg)
|
| 29 |
+
self.vocoder = Vocoder()
|
| 30 |
+
|
| 31 |
+
@staticmethod
|
| 32 |
+
def f0_to_coarse(f0, f0_bin=361, f0_min=32.7031956625, f0_shift=0):
|
| 33 |
+
"""
|
| 34 |
+
Convert continuous F0 values to discrete F0 bins (SIL and C1 - B6, 361 bins).
|
| 35 |
+
args:
|
| 36 |
+
f0: continuous F0 values
|
| 37 |
+
f0_bin: number of F0 bins
|
| 38 |
+
f0_min: minimum F0 value
|
| 39 |
+
f0_shift: shift value for F0 bins
|
| 40 |
+
returns:
|
| 41 |
+
f0_coarse: discrete F0 bins
|
| 42 |
+
"""
|
| 43 |
+
is_torch = isinstance(f0, torch.Tensor)
|
| 44 |
+
uv_mask = f0 <= 0
|
| 45 |
+
|
| 46 |
+
if is_torch:
|
| 47 |
+
f0_safe = torch.maximum(f0, torch.tensor(f0_min))
|
| 48 |
+
f0_cents = 1200 * torch.log2(f0_safe / f0_min)
|
| 49 |
+
else:
|
| 50 |
+
f0_safe = np.maximum(f0, f0_min)
|
| 51 |
+
f0_cents = 1200 * np.log2(f0_safe / f0_min)
|
| 52 |
+
|
| 53 |
+
f0_coarse = (f0_cents / 20) + 1
|
| 54 |
+
|
| 55 |
+
if is_torch:
|
| 56 |
+
f0_coarse = torch.round(f0_coarse).long()
|
| 57 |
+
f0_coarse = torch.clamp(f0_coarse, min=1, max=f0_bin - 1)
|
| 58 |
+
else:
|
| 59 |
+
f0_coarse = np.rint(f0_coarse).astype(int)
|
| 60 |
+
f0_coarse = np.clip(f0_coarse, 1, f0_bin - 1)
|
| 61 |
+
|
| 62 |
+
f0_coarse[uv_mask] = 0
|
| 63 |
+
|
| 64 |
+
if f0_shift != 0:
|
| 65 |
+
if is_torch:
|
| 66 |
+
voiced = f0_coarse > 0
|
| 67 |
+
if voiced.any():
|
| 68 |
+
shifted = f0_coarse[voiced] + f0_shift
|
| 69 |
+
f0_coarse[voiced] = torch.clamp(shifted, 1, f0_bin - 1)
|
| 70 |
+
else:
|
| 71 |
+
voiced = f0_coarse > 0
|
| 72 |
+
if np.any(voiced):
|
| 73 |
+
shifted = f0_coarse[voiced] + f0_shift
|
| 74 |
+
f0_coarse[voiced] = np.clip(shifted, 1, f0_bin - 1)
|
| 75 |
+
|
| 76 |
+
return f0_coarse
|
| 77 |
+
|
| 78 |
+
@staticmethod
|
| 79 |
+
def build_vocal_segments(
|
| 80 |
+
f0,
|
| 81 |
+
f0_rate: int = 50,
|
| 82 |
+
uv_frames_th: int = 5,
|
| 83 |
+
min_duration_sec: float = 5.0,
|
| 84 |
+
max_duration_sec: float = 30.0,
|
| 85 |
+
num_overlaps: int = 1,
|
| 86 |
+
ignore_silent_segments: bool = True,
|
| 87 |
+
) -> Tuple[List[Tuple[float, float]], List[Tuple[float, float]]]:
|
| 88 |
+
"""Build vocal segments based on F0 contour. First split by long silent runs, then merge into segments based on min and max duration constraints.
|
| 89 |
+
args:
|
| 90 |
+
f0: F0 contour of the audio, 1D array or tensor with shape (T,)
|
| 91 |
+
f0_rate: F0 sampling rate in Hz (e.g., 50 for 20ms hop size)
|
| 92 |
+
uv_frames_th: number of consecutive zero F0 frames to consider as a split point
|
| 93 |
+
min_duration_sec: minimum duration of each segment in seconds
|
| 94 |
+
max_duration_sec: maximum duration of each segment in seconds
|
| 95 |
+
num_overlaps: number of overlapping segments to create for each non-overlapping segment (for smooth inference)
|
| 96 |
+
ignore_silent_segments: whether to ignore segments that are mostly silent (e.g., > 95% zero F0)
|
| 97 |
+
returns:
|
| 98 |
+
overlap_segments: list of (overlap_start_sec, overlap_end_sec) for each segment, which may overlap with adjacent segments for smooth inference
|
| 99 |
+
segments: list of (seg_start_sec, seg_end_sec) for each segment, which are non-overlapping and used for final merging
|
| 100 |
+
"""
|
| 101 |
+
if isinstance(f0, torch.Tensor):
|
| 102 |
+
f0_np = f0.detach().float().cpu().numpy()
|
| 103 |
+
else:
|
| 104 |
+
f0_np = np.asarray(f0, dtype=np.float32)
|
| 105 |
+
f0_np = np.squeeze(f0_np)
|
| 106 |
+
|
| 107 |
+
total_frames = int(f0_np.shape[0])
|
| 108 |
+
if total_frames == 0:
|
| 109 |
+
return [], []
|
| 110 |
+
|
| 111 |
+
min_frames = max(1, int(round(min_duration_sec * f0_rate)))
|
| 112 |
+
max_frames = max(1, int(round(max_duration_sec * f0_rate)))
|
| 113 |
+
|
| 114 |
+
split_points = [0] # silence split points in frame indices, starting with 0 and ending with total_frames
|
| 115 |
+
|
| 116 |
+
def append_split_point(point: int):
|
| 117 |
+
# Ensure split points are within valid range and respect max_frames constraint
|
| 118 |
+
point = int(max(0, min(point, total_frames)))
|
| 119 |
+
while point - split_points[-1] > max_frames:
|
| 120 |
+
split_points.append(split_points[-1] + max_frames)
|
| 121 |
+
if point > split_points[-1]:
|
| 122 |
+
split_points.append(point)
|
| 123 |
+
|
| 124 |
+
idx = 0
|
| 125 |
+
while idx < total_frames:
|
| 126 |
+
if f0_np[idx] == 0:
|
| 127 |
+
run_start = idx
|
| 128 |
+
while idx < total_frames and f0_np[idx] == 0:
|
| 129 |
+
idx += 1
|
| 130 |
+
run_end = idx
|
| 131 |
+
if (run_end - run_start) >= uv_frames_th:
|
| 132 |
+
split_point = max(run_end - 5, (run_start + run_end) // 2)
|
| 133 |
+
append_split_point(split_point)
|
| 134 |
+
else:
|
| 135 |
+
idx += 1
|
| 136 |
+
append_split_point(total_frames)
|
| 137 |
+
# print(f"Initial split points (in seconds): {[round(p / f0_rate, 2) for p in split_points]}")
|
| 138 |
+
|
| 139 |
+
segments: List[Tuple[int, int]] = []
|
| 140 |
+
overlap_segments: List[Tuple[int, int]] = []
|
| 141 |
+
|
| 142 |
+
def append_segment(start_idx: int, end_idx: int, num_overlaps: int = num_overlaps):
|
| 143 |
+
segments.append((split_points[start_idx] / f0_rate, split_points[end_idx] / f0_rate))
|
| 144 |
+
overlap_start_idx = start_idx
|
| 145 |
+
if start_idx > 0 and (split_points[end_idx] - split_points[start_idx - num_overlaps]) <= max_frames:
|
| 146 |
+
overlap_start_idx = start_idx - num_overlaps
|
| 147 |
+
overlap_segments.append((split_points[overlap_start_idx] / f0_rate, split_points[end_idx] / f0_rate))
|
| 148 |
+
|
| 149 |
+
segment_start, segment_end = 0, 1
|
| 150 |
+
|
| 151 |
+
while segment_start < len(split_points) - 1:
|
| 152 |
+
while segment_end < len(split_points) and (split_points[segment_end] - split_points[segment_start]) < min_frames:
|
| 153 |
+
segment_end += 1
|
| 154 |
+
|
| 155 |
+
if segment_end >= len(split_points):
|
| 156 |
+
append_segment(segment_start, len(split_points) - 1, num_overlaps=num_overlaps)
|
| 157 |
+
break
|
| 158 |
+
append_segment(segment_start, segment_end, num_overlaps=num_overlaps)
|
| 159 |
+
segment_start = segment_end
|
| 160 |
+
segment_end = segment_start + 1
|
| 161 |
+
|
| 162 |
+
# print(f"Final segments (overlap_start, overlap_end, seg_start_time, seg_end_time) in seconds: {overlap_segments}")
|
| 163 |
+
if ignore_silent_segments:
|
| 164 |
+
filtered_idx = []
|
| 165 |
+
for i, seg in enumerate(overlap_segments):
|
| 166 |
+
start_frame = int(seg[0] * f0_rate)
|
| 167 |
+
end_frame = int(seg[1] * f0_rate)
|
| 168 |
+
total_frames = end_frame - start_frame
|
| 169 |
+
voice_frames = np.sum(f0_np[start_frame:end_frame] > 0)
|
| 170 |
+
if voice_frames / total_frames > 0.05 and voice_frames >= 10: # at least 10 voiced frames and >5% voiced frames
|
| 171 |
+
filtered_idx.append(i)
|
| 172 |
+
|
| 173 |
+
overlap_segments = [overlap_segments[i] for i in filtered_idx]
|
| 174 |
+
segments = [segments[i] for i in filtered_idx]
|
| 175 |
+
# print(f"Filtered segments with mostly silence removed: {overlap_segments}")
|
| 176 |
+
|
| 177 |
+
return overlap_segments, segments
|
| 178 |
+
|
| 179 |
+
def infer(
    self,
    pt_wav: str|torch.Tensor,
    gt_wav: str|torch.Tensor,
    pt_f0: str|torch.Tensor,
    gt_f0: str|torch.Tensor,
    auto_shift=False,
    pitch_shift=0,
    n_steps=32,
    cfg=3,
):
    """
    SVC inference pipeline. First build vocal segments based on F0 contour, then run inference for each segment and merge results.
    args:
        pt_wav: prompt waveform path or tensor
        gt_wav: target waveform path or tensor
        pt_f0: prompt F0 path or tensor
        gt_f0: target F0 path or tensor
        auto_shift: whether to automatically calculate pitch shift based on median F0 of prompt and target
        pitch_shift: manual pitch shift in semitones (overrides auto_shift if > 0)
        n_steps: number of diffusion steps for inference
        cfg: classifier-free guidance scale for inference
    returns:
        (generated_audio, pitch_shift): the generated waveform and the pitch shift
        (in semitones) actually applied.

    NOTE(review): the annotations allow str paths, but the body indexes
    `gt_wav.shape` / `gt_f0[...]` directly — the caller appears to be expected
    to pass tensors here; confirm whether str inputs are handled upstream.
    Shapes implied by the indexing below: gt_wav is (channels, samples) and
    gt_f0 is (1, frames) — TODO confirm against the preprocessing pipeline.
    """

    # calculate auto pitch shift
    if auto_shift and pitch_shift == 0:
        if gt_f0 is not None and pt_f0 is not None:
            # Shift in semitones = round(1200 * log2(prompt_median / target_median) / 100),
            # medians taken over voiced frames only (f0 > 0).
            gt_f0_median = torch.median(gt_f0[gt_f0 > 0])
            pt_f0_median = torch.median(pt_f0[pt_f0 > 0])
            pitch_shift = torch.round(torch.log2(pt_f0_median / gt_f0_median) * 1200 / 100).int().item()
        else:
            print("Warning: pitch_shift is True but note_pitch or f0 is None. Set f0_shift to 0.")
            pitch_shift = 0
    else:
        pitch_shift = pitch_shift

    # if target audio is less than 30 seconds, infer the whole audio
    if gt_wav.shape[-1] < 30 * self.audio_cfg.sample_rate:
        generated_audio = self.infer_segment(
            pt_wav=pt_wav,
            gt_wav=gt_wav,
            pt_f0=pt_f0,
            gt_f0=gt_f0,
            pitch_shift=pitch_shift,
            n_steps=n_steps,
            cfg=cfg,
        )
        return generated_audio, pitch_shift

    # if target audio is longer than 30 seconds, build vocal segments and infer each segment
    generated_audio = []

    # F0 frame rate derived from the mel hop size (frames per second).
    f0_rate = self.audio_cfg.sample_rate // self.audio_cfg.hop_size

    overlap_segments, segments = self.build_vocal_segments(
        gt_f0,
        f0_rate=f0_rate,
        uv_frames_th=10,
        min_duration_sec=15.0,
        max_duration_sec=30.0,
    )
    # Fallback: if no usable vocal segment was found, treat the whole audio as one segment.
    if len(segments) == 0:
        segments = [(0.0, gt_wav.shape[-1] / self.audio_cfg.sample_rate)]
        overlap_segments = [(0.0, gt_wav.shape[-1] / self.audio_cfg.sample_rate)]

    # Output buffer: purely-silent regions that belong to no segment stay zero.
    generated_audio = torch.zeros_like(gt_wav)
    for idx in tqdm(range(len(segments)), total=len(segments), desc="Inferring segments (SVC)", dynamic_ncols=True):
        # The overlap span is what gets inferred; only the non-overlapping core
        # span is written back, so adjacent segments join smoothly.
        overlap_start_sec, overlap_end_sec = overlap_segments[idx]
        seg_start_sec, seg_end_sec = segments[idx]

        wav_start = int(round(overlap_start_sec * self.audio_cfg.sample_rate))
        wav_end = int(round(overlap_end_sec * self.audio_cfg.sample_rate))
        f0_start = int(round(overlap_start_sec * f0_rate))
        f0_end = int(round(overlap_end_sec * f0_rate))

        # Clamp all indices into the valid ranges of the wav / f0 tensors.
        wav_start = max(0, min(wav_start, gt_wav.shape[-1]))
        wav_end = max(wav_start, min(wav_end, gt_wav.shape[-1]))
        f0_start = max(0, min(f0_start, gt_f0.shape[-1]))
        f0_end = max(f0_start, min(f0_end, gt_f0.shape[-1]))

        segment_gt_wav = gt_wav[:, wav_start:wav_end]
        segment_gt_f0 = gt_f0[:, f0_start:f0_end]
        segment_generated_audio = self.infer_segment(
            pt_wav=pt_wav,
            gt_wav=segment_gt_wav,
            pt_f0=pt_f0,
            gt_f0=segment_gt_f0,
            pitch_shift=pitch_shift,
            n_steps=n_steps,
            cfg=cfg,
        )

        # Crop the inferred overlap span down to the non-overlapping core
        # (indices are relative to the start of the inferred span).
        segment_start = int(round(seg_start_sec * self.audio_cfg.sample_rate))
        segment_end = int(round(seg_end_sec * self.audio_cfg.sample_rate))
        segment_generated_audio = segment_generated_audio[segment_start - wav_start: segment_end - wav_start]

        generated_audio[:, segment_start:segment_end] = segment_generated_audio

    return generated_audio, pitch_shift
|
| 278 |
+
|
| 279 |
+
def infer_segment(self, pt_wav, gt_wav, pt_f0, gt_f0, pitch_shift=0, n_steps=32, cfg=3):
    """Run SVC inference on a single segment.

    Builds per-frame conditioning features (whisper content embedding + coarse
    F0 embedding) for the prompt and target, runs the CFM decoder conditioned
    on the prompt mel, then vocodes the generated mel to a waveform trimmed /
    padded to the target length.

    args:
        pt_wav: prompt waveform tensor
        gt_wav: target waveform tensor; only its last-dim length is used to
            size the output
        pt_f0: prompt F0 contour (2-D: batch x frames, per the indexing below)
        gt_f0: target F0 contour (same layout)
        pitch_shift: pitch shift in semitones applied to the target F0 bins
        n_steps: number of diffusion steps
        cfg: classifier-free guidance scale
    returns:
        generated_audio: 1-D waveform tensor of length gt_wav.shape[-1]
    """
    pt_mel = self.mel(pt_wav)
    len_prompt_mel = pt_mel.shape[1]
    # Pad/crop prompt F0 to the prompt mel length.
    # NOTE(review): for a 2-D f0 tensor, F.pad with (0, 0, 0, X) pads the first
    # (batch) dimension, not the frame dimension — confirm the intended axis.
    pt_f0 = F.pad(pt_f0, (0, 0, 0, max(0, len_prompt_mel - pt_f0.shape[1])))[:, :len_prompt_mel]

    # Coarse F0 bins; 1 semitone == 5 bins (20-cent grid), hence pitch_shift * 5.
    # Only the target is shifted — the prompt keeps its original pitch.
    f0_course_pt = self.f0_to_coarse(pt_f0)
    f0_course_gt = self.f0_to_coarse(gt_f0, f0_shift=pitch_shift * 5)
    f0_course = torch.cat([f0_course_pt, f0_course_gt], 1)

    # Speaker-independent content features from the whisper encoder, aligned
    # (pad/crop along the frame axis) to the coarse-F0 frame counts.
    pt_content_feat = self.whisper_encoder.encode(pt_wav, sr=self.audio_cfg.sample_rate)
    gt_content_feat = self.whisper_encoder.encode(gt_wav, sr=self.audio_cfg.sample_rate)
    t_pt, t_gt = f0_course_pt.shape[1], f0_course_gt.shape[1]
    pt_content_feat = F.pad(pt_content_feat, (0, 0, 0, max(0, t_pt - pt_content_feat.shape[1])))[:, :t_pt, :]
    gt_content_feat = F.pad(gt_content_feat, (0, 0, 0, max(0, t_gt - gt_content_feat.shape[1])))[:, :t_gt, :]

    content_feat = torch.cat([pt_content_feat, gt_content_feat], 1)

    # Conditioning = content embedding + F0 embedding, summed per frame.
    f0_feat = self.f0_encoder(f0_course)
    features = content_feat + f0_feat

    # Split the concatenated [prompt | target] features back apart for the decoder.
    gt_decoder_inp = features[:, len_prompt_mel:, :]
    pt_decoder_inp = features[:, :len_prompt_mel, :]

    generated_mel = self.cfm_decoder.reverse_diffusion(
        pt_mel,
        pt_decoder_inp,
        gt_decoder_inp,
        n_timesteps=n_steps,
        cfg=cfg
    )

    # Vocode the first batch item only; squeeze to a 1-D waveform.
    generated_audio = self.vocoder(generated_mel.transpose(1, 2)[0:1, ...])
    generated_audio = generated_audio.squeeze()

    # cut or pad to match gt_wav length
    if generated_audio.shape[-1] > gt_wav.shape[-1]:
        generated_audio = generated_audio[:gt_wav.shape[-1]]
    elif generated_audio.shape[-1] < gt_wav.shape[-1]:
        generated_audio = F.pad(generated_audio, (0, gt_wav.shape[-1] - generated_audio.shape[-1]))

    return generated_audio
|
webui.py
CHANGED
|
@@ -4,6 +4,7 @@ import random
|
|
| 4 |
import shutil
|
| 5 |
import sys
|
| 6 |
import traceback
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Tuple
|
| 9 |
import spaces
|
|
@@ -269,6 +270,10 @@ def transcription_function(
|
|
| 269 |
except Exception:
|
| 270 |
print(traceback.format_exc(), file=sys.stderr, flush=True)
|
| 271 |
return None, None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
|
| 274 |
@spaces.GPU
|
|
@@ -351,7 +356,187 @@ def synthesis_function(
|
|
| 351 |
except Exception:
|
| 352 |
print(traceback.format_exc(), file=sys.stderr, flush=True)
|
| 353 |
return None, gr.update(), gr.update()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
|
| 357 |
def render_interface() -> gr.Blocks:
|
|
@@ -378,180 +563,7 @@ def render_interface() -> gr.Blocks:
|
|
| 378 |
'"></div>'
|
| 379 |
'</div>'
|
| 380 |
)
|
| 381 |
-
|
| 382 |
-
with gr.Row(equal_height=False):
|
| 383 |
-
# ── Left column: inputs & controls ──
|
| 384 |
-
with gr.Column(scale=1):
|
| 385 |
-
prompt_audio = gr.Audio(
|
| 386 |
-
label="Prompt audio (reference voice), max 30s",
|
| 387 |
-
type="filepath",
|
| 388 |
-
interactive=True,
|
| 389 |
-
)
|
| 390 |
-
target_audio = gr.Audio(
|
| 391 |
-
label="Target audio (melody / lyrics source), max 60s",
|
| 392 |
-
type="filepath",
|
| 393 |
-
interactive=True,
|
| 394 |
-
)
|
| 395 |
-
|
| 396 |
-
with gr.Row():
|
| 397 |
-
control_radio = gr.Radio(
|
| 398 |
-
choices=["melody", "score"],
|
| 399 |
-
value="melody",
|
| 400 |
-
label="Control type",
|
| 401 |
-
scale=1,
|
| 402 |
-
)
|
| 403 |
-
auto_shift = gr.Checkbox(
|
| 404 |
-
label="Auto pitch shift",
|
| 405 |
-
value=True,
|
| 406 |
-
interactive=True,
|
| 407 |
-
scale=1,
|
| 408 |
-
)
|
| 409 |
-
|
| 410 |
-
synthesis_btn = gr.Button(
|
| 411 |
-
value="🎤 Generate singing voice",
|
| 412 |
-
variant="primary",
|
| 413 |
-
size="lg",
|
| 414 |
-
)
|
| 415 |
-
|
| 416 |
-
# ── Advanced: transcription settings & metadata ──
|
| 417 |
-
with gr.Accordion("Advanced: Transcription & Metadata", open=False):
|
| 418 |
-
with gr.Row():
|
| 419 |
-
pitch_shift = gr.Number(
|
| 420 |
-
label="Pitch shift (semitones)",
|
| 421 |
-
value=0,
|
| 422 |
-
minimum=-36,
|
| 423 |
-
maximum=36,
|
| 424 |
-
step=1,
|
| 425 |
-
interactive=True,
|
| 426 |
-
scale=1,
|
| 427 |
-
)
|
| 428 |
-
seed_input = gr.Number(
|
| 429 |
-
label="Seed",
|
| 430 |
-
value=12306,
|
| 431 |
-
step=1,
|
| 432 |
-
interactive=True,
|
| 433 |
-
scale=1,
|
| 434 |
-
)
|
| 435 |
-
gr.Markdown(
|
| 436 |
-
"Upload your own metadata files to skip automatic transcription. "
|
| 437 |
-
"You can use the [SoulX-Singer-Midi-Editor]"
|
| 438 |
-
"(https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor) "
|
| 439 |
-
"to edit metadata for better alignment."
|
| 440 |
-
)
|
| 441 |
-
with gr.Row():
|
| 442 |
-
prompt_lyric_lang = gr.Dropdown(
|
| 443 |
-
label="Prompt lyric language",
|
| 444 |
-
choices=[
|
| 445 |
-
("Mandarin", "Mandarin"),
|
| 446 |
-
("Cantonese", "Cantonese"),
|
| 447 |
-
("English", "English"),
|
| 448 |
-
],
|
| 449 |
-
value="English",
|
| 450 |
-
interactive=True,
|
| 451 |
-
scale=1,
|
| 452 |
-
)
|
| 453 |
-
target_lyric_lang = gr.Dropdown(
|
| 454 |
-
label="Target lyric language",
|
| 455 |
-
choices=[
|
| 456 |
-
("Mandarin", "Mandarin"),
|
| 457 |
-
("Cantonese", "Cantonese"),
|
| 458 |
-
("English", "English"),
|
| 459 |
-
],
|
| 460 |
-
value="English",
|
| 461 |
-
interactive=True,
|
| 462 |
-
scale=1,
|
| 463 |
-
)
|
| 464 |
-
with gr.Row():
|
| 465 |
-
prompt_vocal_sep = gr.Checkbox(
|
| 466 |
-
label="Prompt vocal separation",
|
| 467 |
-
value=False,
|
| 468 |
-
interactive=True,
|
| 469 |
-
scale=1,
|
| 470 |
-
)
|
| 471 |
-
target_vocal_sep = gr.Checkbox(
|
| 472 |
-
label="Target vocal separation",
|
| 473 |
-
value=True,
|
| 474 |
-
interactive=True,
|
| 475 |
-
scale=1,
|
| 476 |
-
)
|
| 477 |
-
transcription_btn = gr.Button(
|
| 478 |
-
value="Run singing transcription",
|
| 479 |
-
variant="secondary",
|
| 480 |
-
size="lg",
|
| 481 |
-
)
|
| 482 |
-
with gr.Row():
|
| 483 |
-
prompt_metadata = gr.File(
|
| 484 |
-
label="Prompt metadata",
|
| 485 |
-
type="filepath",
|
| 486 |
-
file_types=[".json"],
|
| 487 |
-
interactive=True,
|
| 488 |
-
)
|
| 489 |
-
target_metadata = gr.File(
|
| 490 |
-
label="Target metadata",
|
| 491 |
-
type="filepath",
|
| 492 |
-
file_types=[".json"],
|
| 493 |
-
interactive=True,
|
| 494 |
-
)
|
| 495 |
-
|
| 496 |
-
# ── Right column: output ──
|
| 497 |
-
with gr.Column(scale=1):
|
| 498 |
-
output_audio = gr.Audio(
|
| 499 |
-
label="Generated audio",
|
| 500 |
-
type="filepath",
|
| 501 |
-
interactive=False,
|
| 502 |
-
)
|
| 503 |
-
gr.Examples(
|
| 504 |
-
examples=[
|
| 505 |
-
["raven.wav", "happy_birthday.mp3"],
|
| 506 |
-
["anita.wav", "happy_birthday.mp3"],
|
| 507 |
-
["obama.wav", "happy_birthday.mp3"],
|
| 508 |
-
["raven.wav", "everybody_loves.wav"],
|
| 509 |
-
["anita.wav", "everybody_loves.wav"],
|
| 510 |
-
["obama.wav", "everybody_loves.wav"],
|
| 511 |
-
],
|
| 512 |
-
inputs=[prompt_audio, target_audio],
|
| 513 |
-
outputs=[output_audio, prompt_metadata, target_metadata],
|
| 514 |
-
fn=synthesis_function,
|
| 515 |
-
cache_examples=True,
|
| 516 |
-
cache_mode="lazy"
|
| 517 |
-
)
|
| 518 |
-
|
| 519 |
-
# ── Event handlers ──
|
| 520 |
-
prompt_audio.change(
|
| 521 |
-
fn=lambda: None,
|
| 522 |
-
inputs=[],
|
| 523 |
-
outputs=[prompt_metadata],
|
| 524 |
-
)
|
| 525 |
-
|
| 526 |
-
target_audio.change(
|
| 527 |
-
fn=lambda: None,
|
| 528 |
-
inputs=[],
|
| 529 |
-
outputs=[target_metadata],
|
| 530 |
-
)
|
| 531 |
-
|
| 532 |
-
transcription_btn.click(
|
| 533 |
-
fn=transcription_function,
|
| 534 |
-
inputs=[
|
| 535 |
-
prompt_audio, target_audio,
|
| 536 |
-
prompt_metadata, target_metadata,
|
| 537 |
-
prompt_lyric_lang, target_lyric_lang,
|
| 538 |
-
prompt_vocal_sep, target_vocal_sep,
|
| 539 |
-
],
|
| 540 |
-
outputs=[prompt_metadata, target_metadata],
|
| 541 |
-
)
|
| 542 |
-
|
| 543 |
-
synthesis_btn.click(
|
| 544 |
-
fn=synthesis_function,
|
| 545 |
-
inputs=[
|
| 546 |
-
prompt_audio, target_audio,
|
| 547 |
-
prompt_metadata, target_metadata,
|
| 548 |
-
control_radio, auto_shift, pitch_shift, seed_input,
|
| 549 |
-
prompt_lyric_lang, target_lyric_lang,
|
| 550 |
-
prompt_vocal_sep, target_vocal_sep,
|
| 551 |
-
],
|
| 552 |
-
outputs=[output_audio, prompt_metadata, target_metadata],
|
| 553 |
-
)
|
| 554 |
-
|
| 555 |
return page
|
| 556 |
|
| 557 |
|
|
|
|
| 4 |
import shutil
|
| 5 |
import sys
|
| 6 |
import traceback
|
| 7 |
+
import gc
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Tuple
|
| 10 |
import spaces
|
|
|
|
| 270 |
except Exception:
|
| 271 |
print(traceback.format_exc(), file=sys.stderr, flush=True)
|
| 272 |
return None, None
|
| 273 |
+
finally:
|
| 274 |
+
gc.collect()
|
| 275 |
+
if torch.cuda.is_available():
|
| 276 |
+
torch.cuda.empty_cache()
|
| 277 |
|
| 278 |
|
| 279 |
@spaces.GPU
|
|
|
|
| 356 |
except Exception:
|
| 357 |
print(traceback.format_exc(), file=sys.stderr, flush=True)
|
| 358 |
return None, gr.update(), gr.update()
|
| 359 |
+
finally:
|
| 360 |
+
gc.collect()
|
| 361 |
+
if torch.cuda.is_available():
|
| 362 |
+
torch.cuda.empty_cache()
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def render_tab_content() -> None:
    """Render the main content (for embedding in app.py tabs). No Blocks or title.

    Builds the two-column SoulX-Singer UI — inputs/controls on the left,
    generated output plus examples on the right — and wires the Gradio event
    handlers. Must be called inside an already-open ``gr.Blocks`` context
    (see ``render_interface``); relies on module-level ``transcription_function``
    and ``synthesis_function`` as callbacks.
    """
    with gr.Row(equal_height=False):
        # ── Left column: inputs & controls ──
        with gr.Column(scale=1):
            prompt_audio = gr.Audio(
                label="Prompt audio (reference voice), max 30s",
                type="filepath",
                interactive=True,
            )
            target_audio = gr.Audio(
                label="Target audio (melody / lyrics source), max 60s",
                type="filepath",
                interactive=True,
            )

            with gr.Row():
                control_radio = gr.Radio(
                    choices=["melody", "score"],
                    value="melody",
                    label="Control type",
                    scale=1,
                )
                auto_shift = gr.Checkbox(
                    label="Auto pitch shift",
                    value=True,
                    interactive=True,
                    scale=1,
                )

            synthesis_btn = gr.Button(
                value="🎤 Generate singing voice",
                variant="primary",
                size="lg",
            )

            # ── Advanced: transcription settings & metadata ──
            with gr.Accordion("Advanced: Transcription & Metadata", open=False):
                with gr.Row():
                    pitch_shift = gr.Number(
                        label="Pitch shift (semitones)",
                        value=0,
                        minimum=-36,
                        maximum=36,
                        step=1,
                        interactive=True,
                        scale=1,
                    )
                    seed_input = gr.Number(
                        label="Seed",
                        value=12306,
                        step=1,
                        interactive=True,
                        scale=1,
                    )
                gr.Markdown(
                    "Upload your own metadata files to skip automatic transcription. "
                    "You can use the [SoulX-Singer-Midi-Editor]"
                    "(https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor) "
                    "to edit metadata for better alignment."
                )
                with gr.Row():
                    prompt_lyric_lang = gr.Dropdown(
                        label="Prompt lyric language",
                        choices=[
                            ("Mandarin", "Mandarin"),
                            ("Cantonese", "Cantonese"),
                            ("English", "English"),
                        ],
                        value="English",
                        interactive=True,
                        scale=1,
                    )
                    target_lyric_lang = gr.Dropdown(
                        label="Target lyric language",
                        choices=[
                            ("Mandarin", "Mandarin"),
                            ("Cantonese", "Cantonese"),
                            ("English", "English"),
                        ],
                        value="English",
                        interactive=True,
                        scale=1,
                    )
                with gr.Row():
                    prompt_vocal_sep = gr.Checkbox(
                        label="Prompt vocal separation",
                        value=False,
                        interactive=True,
                        scale=1,
                    )
                    target_vocal_sep = gr.Checkbox(
                        label="Target vocal separation",
                        value=True,
                        interactive=True,
                        scale=1,
                    )
                transcription_btn = gr.Button(
                    value="Run singing transcription",
                    variant="secondary",
                    size="lg",
                )
                with gr.Row():
                    prompt_metadata = gr.File(
                        label="Prompt metadata",
                        type="filepath",
                        file_types=[".json"],
                        interactive=True,
                    )
                    target_metadata = gr.File(
                        label="Target metadata",
                        type="filepath",
                        file_types=[".json"],
                        interactive=True,
                    )

        # ── Right column: output ──
        with gr.Column(scale=1):
            output_audio = gr.Audio(
                label="Generated audio",
                type="filepath",
                interactive=False,
            )
            gr.Examples(
                examples=[
                    ["raven.wav", "happy_birthday.mp3"],
                    ["anita.wav", "happy_birthday.mp3"],
                    ["obama.wav", "happy_birthday.mp3"],
                    ["raven.wav", "everybody_loves.wav"],
                    ["anita.wav", "everybody_loves.wav"],
                    ["obama.wav", "everybody_loves.wav"],
                ],
                inputs=[prompt_audio, target_audio],
                outputs=[output_audio, prompt_metadata, target_metadata],
                fn=synthesis_function,
                cache_examples=True,
                cache_mode="lazy"
            )

    # ── Event handlers ──
    # Replacing an input audio invalidates its previously-generated metadata.
    prompt_audio.change(
        fn=lambda: None,
        inputs=[],
        outputs=[prompt_metadata],
    )

    target_audio.change(
        fn=lambda: None,
        inputs=[],
        outputs=[target_metadata],
    )

    transcription_btn.click(
        fn=transcription_function,
        inputs=[
            prompt_audio, target_audio,
            prompt_metadata, target_metadata,
            prompt_lyric_lang, target_lyric_lang,
            prompt_vocal_sep, target_vocal_sep,
        ],
        outputs=[prompt_metadata, target_metadata],
    )

    synthesis_btn.click(
        fn=synthesis_function,
        inputs=[
            prompt_audio, target_audio,
            prompt_metadata, target_metadata,
            control_radio, auto_shift, pitch_shift, seed_input,
            prompt_lyric_lang, target_lyric_lang,
            prompt_vocal_sep, target_vocal_sep,
        ],
        outputs=[output_audio, prompt_metadata, target_metadata],
    )
|
| 540 |
|
| 541 |
|
| 542 |
def render_interface() -> gr.Blocks:
|
|
|
|
| 563 |
'"></div>'
|
| 564 |
'</div>'
|
| 565 |
)
|
| 566 |
+
render_tab_content()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
return page
|
| 568 |
|
| 569 |
|
webui_svc.py
ADDED
|
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import sys
|
| 3 |
+
import traceback
|
| 4 |
+
import gc
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Literal
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
import librosa
|
| 11 |
+
import numpy as np
|
| 12 |
+
import soundfile as sf
|
| 13 |
+
import torch
|
| 14 |
+
|
| 15 |
+
import spaces
|
| 16 |
+
from preprocess.pipeline import PreprocessPipeline
|
| 17 |
+
from soulxsinger.utils.file_utils import load_config
|
| 18 |
+
from cli.inference_svc import build_model as build_svc_model, process as svc_process
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# Root of the repository; all example/output paths below are resolved against it.
ROOT = Path(__file__).parent
# Working sample rate for trimmed input audio (Hz).
SAMPLE_RATE = 44100
# Hard caps (seconds) applied to uploaded audio before preprocessing.
PROMPT_MAX_SEC_DEFAULT = 30
TARGET_MAX_SEC_DEFAULT = 600

# Example rows: only [prompt_audio, target_audio]; other params use UI defaults when running
EXAMPLE_LIST = [
    [str(ROOT / "example/audio/zh_prompt.mp3"), str(ROOT / "example/audio/zh_target.mp3")],
    [str(ROOT / "example/audio/en_prompt.mp3"), str(ROOT / "example/audio/en_target.mp3")],
    [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/I'm Yours.mp3")],
    [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/传奇.mp3")],
    [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/君が好きだと叫びたい.mp3")],
    [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/富士山下.mp3")],
]

# UI string table: key -> {"en": ..., "zh": ...}. Looked up via _i18n() using
# _GLOBAL_LANG. Values are runtime-visible text; do not edit casually.
_I18N = dict(
    display_lang_label=dict(en="Display Language", zh="显示语言"),
    title=dict(en="## SoulX-Singer SVC", zh="## SoulX-Singer SVC"),
    prompt_audio_label=dict(en=f"Prompt audio", zh=f"Prompt 音频"),
    target_audio_label=dict(en=f"Target audio", zh=f"Target 音频"),
    prompt_vocal_sep_label=dict(en="Prompt vocal separation", zh="Prompt 人声分离"),
    target_vocal_sep_label=dict(en="Target vocal separation", zh="Target 人声分离"),
    auto_shift_label=dict(en="Auto pitch shift", zh="自动变调"),
    auto_mix_acc_label=dict(en="Auto mix accompaniment", zh="自动混合伴奏"),
    pitch_shift_label=dict(en="Pitch shift (semitones)", zh="指定变调(半音)"),
    n_step_label=dict(en="diffusion steps", zh="采样步数"),
    cfg_label=dict(en="cfg scale", zh="cfg系数"),
    seed_label=dict(en="Seed", zh="种子"),
    examples_label=dict(en="Examples", zh="示例"),
    run_btn=dict(en="🎤Singing Voice Conversion", zh="🎤歌声转换"),
    output_audio_label=dict(en="Generated audio", zh="合成结果音频"),
    warn_missing_audio=dict(en="Please provide both prompt audio and target audio.", zh="请同时上传 Prompt 与 Target 音频。"),
    instruction_title=dict(en="Usage", zh="使用说明"),
    instruction_p1=dict(
        en="Upload the Prompt and Target audio, and configure the parameters",
        zh="上传 Prompt 与 Target 音频,并配置相关参数",
    ),
    instruction_p2=dict(
        en="Click「🎤Singing Voice Conversion」to start singing voice conversion.",
        zh="点击「🎤歌声转换」开始最终生成。",
    ),
    tips_title=dict(en="Tips", zh="提示"),
    tip_p1=dict(
        en="Input: The Prompt audio is recommended to be a clean and clear singing voice, while the Target audio can be either a pure vocal or a mixture with accompaniment. If the audio contains accompaniment, please check the vocal separation option.",
        zh="输入:Prompt 音频建议是干净清晰的歌声,Target 音频可以是纯歌声或伴奏,这两者若带伴奏需要勾选分离选项",
    ),
    tip_p2=dict(
        en="Pitch shift: When there is a large pitch range difference between the Prompt and Target audio, you can try enabling auto pitch shift or manually adjusting the pitch shift in semitones. When a non-zero pitch shift is specified, auto pitch shift will not take effect. The accompaniment of auto mix will be pitch-shifted together with the vocal (keeping the same octave).",
        zh="变调:Prompt 音频的音域和 Target 音频的音域差距较大的时候,可以尝试开启自动变调或手动调整变调半音数,指定非0的变调半音数时,自动变调不生效,自动混音的伴奏会配合歌声进行升降调(保持同一个八度)",
    ),
    tip_p3=dict(
        en="Model parameters: Generally, a larger number of sampling steps will yield better generation quality but also longer generation time; a larger cfg scale will increase timbre similarity and melody fidelity, but may cause more distortion, it is recommended to take a value between 1 and 3.",
        zh="模型参数:一般采样步数越大,生成质量越好,但生成时间也越长;一般cfg系数越大,音色相似度和旋律保真度越高,但是会造成更多的失真,建议取1~3之间的值",
    ),
    tip_p4=dict(
        en="If you want to convert a long audio or a whole song with large pitch range, there may be instability in the generated voice. You can try converting in segments.",
        zh="长音频或完整歌曲中,音域变化较大的情况有可能出现音色不稳定,可以尝试分段转换",
    )
)

# Active display language for _i18n lookups. Module-level mutable switch.
_GLOBAL_LANG: Literal["zh", "en"] = "zh"
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _i18n(key: str) -> str:
    """Look up *key* in the UI string table for the active display language.

    Raises KeyError if the key or language entry is missing.
    """
    translations = _I18N[key]
    return translations[_GLOBAL_LANG]
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _print_exception(context: str) -> None:
|
| 89 |
+
print(f"[{context}]\n{traceback.format_exc()}", file=sys.stderr, flush=True)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _get_device() -> str:
|
| 93 |
+
return "cuda:0" if torch.cuda.is_available() else "cpu"
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _session_dir() -> Path:
    """Build a unique, timestamped output directory path for one SVC request.

    Microsecond precision in the stamp keeps concurrent requests separate.
    The directory is not created here; callers create it on demand.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    return ROOT.joinpath("outputs", "gradio", "svc", stamp)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def _normalize_audio_input(audio):
|
| 102 |
+
return audio[0] if isinstance(audio, tuple) else audio
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def _trim_and_save_audio(src_audio_path: str, dst_wav_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
    """Load *src_audio_path* as mono at *sr* Hz, keep at most *max_sec* seconds,
    and write the result to *dst_wav_path* (parent directories created as needed)."""
    samples, _ = librosa.load(src_audio_path, sr=sr, mono=True)
    clipped = samples[: max_sec * sr]
    dst_wav_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(dst_wav_path, clipped, sr)
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def _usage_md() -> str:
    """Assemble the 'Usage' help panel as Markdown in the active display language."""
    sections = [
        f"### {_i18n('instruction_title')}",
        f"**1.** {_i18n('instruction_p1')}",
        f"**2.** {_i18n('instruction_p2')}",
    ]
    return "\n\n".join(sections)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def _tips_md() -> str:
    """Assemble the 'Tips' help panel as Markdown in the active display language."""
    sections = [
        f"### {_i18n('tips_title')}",
        f"- {_i18n('tip_p1')}",
        f"- {_i18n('tip_p2')}",
        f"- {_i18n('tip_p3')}",
        f"- {_i18n('tip_p4')}",
    ]
    return "\n\n".join(sections)
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
class AppState:
    """Holds the long-lived inference resources for the SVC webui.

    One instance is created at import time (see APP_STATE below) so the
    preprocessing pipeline and the SVC model are loaded once and reused
    across requests.
    """

    def __init__(self) -> None:
        # Device string, e.g. "cuda:0" or "cpu".
        self.device = _get_device()
        # Preprocessing pipeline (vocal separation + f0 extraction).
        # save_dir is a placeholder; run_preprocess() redirects it per session.
        self.preprocess_pipeline = PreprocessPipeline(
            device=self.device,
            language="Mandarin",
            save_dir=str(ROOT / "outputs" / "gradio" / "_placeholder" / "svc"),
            vocal_sep=True,
            max_merge_duration=60000,
            midi_transcribe=False,
        )

        # SVC model + config loaded once at startup.
        self.svc_config = load_config("soulxsinger/config/soulxsinger.yaml")
        self.svc_model = build_svc_model(
            model_path="pretrained_models/SoulX-Singer/model-svc.pt",
            config=self.svc_config,
            device=self.device,
        )

    def run_preprocess(self, audio_path: Path, save_path: Path, vocal_sep: bool) -> tuple[bool, str, Path | None, Path | None]:
        """Run preprocessing on one audio file, writing results under *save_path*.

        Returns (ok, message, vocal_wav_path, vocal_f0_path); paths are None on
        failure. Never raises — all errors are folded into the (False, msg) form.

        NOTE(review): mutating self.preprocess_pipeline.save_dir makes this
        method non-reentrant; concurrent requests would race on it — confirm
        Gradio queueing serializes calls.
        """
        try:
            self.preprocess_pipeline.save_dir = str(save_path)
            self.preprocess_pipeline.run(
                audio_path=str(audio_path),
                vocal_sep=vocal_sep,
                max_merge_duration=60000,
                language="Mandarin",
            )
            # The pipeline is expected to emit these two artifacts on success.
            vocal_wav = save_path / "vocal.wav"
            vocal_f0 = save_path / "vocal_f0.npy"
            if not vocal_wav.exists() or not vocal_f0.exists():
                return False, f"preprocess output missing: {vocal_wav} or {vocal_f0}", None, None
            return True, "ok", vocal_wav, vocal_f0
        except Exception as e:
            return False, f"preprocess failed: {e}", None, None

    def run_svc(
        self,
        prompt_wav_path: Path,
        target_wav_path: Path,
        prompt_f0_path: Path,
        target_f0_path: Path,
        session_base: Path,
        auto_shift: bool,
        auto_mix_acc: bool,
        pitch_shift: int,
        n_step: int,
        cfg: float,
        seed: int,
    ) -> tuple[bool, str, Path | None]:
        """Run SVC inference and optionally remix the converted vocal with the
        target's separated accompaniment.

        Returns (ok, message, generated_path); generated_path is None on
        failure and points at either generated.wav or generated_mixed.wav on
        success. Never raises.
        """
        try:
            # Seed all RNGs for reproducible sampling.
            torch.manual_seed(seed)
            np.random.seed(seed)
            random.seed(seed)

            save_dir = session_base / "generated"
            save_dir.mkdir(parents=True, exist_ok=True)

            # Ad-hoc namespace mimicking the argparse object that
            # cli.inference_svc.process expects.
            class Args:
                pass

            args = Args()
            args.device = self.device
            args.prompt_wav_path = str(prompt_wav_path)
            args.target_wav_path = str(target_wav_path)
            args.prompt_f0_path = str(prompt_f0_path)
            args.target_f0_path = str(target_f0_path)
            args.save_dir = str(save_dir)
            args.auto_shift = auto_shift
            args.pitch_shift = int(pitch_shift)
            args.n_steps = int(n_step)
            args.cfg = float(cfg)

            svc_process(args, self.svc_config, self.svc_model)

            generated = save_dir / "generated.wav"
            if not generated.exists():
                return False, f"inference finished but output not found: {generated}", None

            if auto_mix_acc:
                # acc.wav is produced by target-side vocal separation; absent
                # when the target had no accompaniment (or sep was disabled).
                acc_path = session_base / "transcriptions" / "target" / "acc.wav"
                if acc_path.exists():
                    # Shift the accompaniment by the vocal's shift folded into
                    # (-6, 6] semitones, i.e. same pitch class, nearest octave.
                    # NOTE(review): this uses the user-specified pitch_shift
                    # only; when auto_shift is on (pitch_shift == 0), any shift
                    # chosen inside svc_process is not reflected here — confirm
                    # this is intended.
                    vocal_shift = args.pitch_shift
                    mul = -1 if vocal_shift < 0 else 1
                    acc_shift = abs(vocal_shift) % 12
                    acc_shift = mul * acc_shift
                    if acc_shift > 6:
                        acc_shift -= 12
                    if acc_shift < -6:
                        acc_shift += 12

                    mix_sr = self.svc_config.audio.sample_rate
                    vocal, _ = librosa.load(str(generated), sr=mix_sr, mono=True)
                    acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
                    if acc_shift != 0:
                        acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)
                        print(f"Applied pitch shift of {acc_shift} semitones to accompaniment to match vocal shift of {vocal_shift} semitones.")

                    # Sum the two tracks over their common length and peak-
                    # normalize only if the mix clips.
                    mix_len = min(len(vocal), len(acc))
                    if mix_len > 0:
                        mixed = vocal[:mix_len] + acc[:mix_len]
                        peak = float(np.max(np.abs(mixed))) if mixed.size > 0 else 1.0
                        if peak > 1.0:
                            mixed = mixed / peak
                        mixed_path = save_dir / "generated_mixed.wav"
                        sf.write(str(mixed_path), mixed, mix_sr)
                        generated = mixed_path

            return True, "svc inference done", generated
        except Exception as e:
            return False, f"svc inference failed: {e}", None
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
# Module-level singleton: loads the preprocess pipeline and SVC model once at
# import time and is shared by all webui requests.
APP_STATE = AppState()
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
@spaces.GPU
def _start_svc(
    prompt_audio,
    target_audio,
    prompt_vocal_sep=False,
    target_vocal_sep=True,
    auto_shift=True,
    auto_mix_acc=True,
    pitch_shift=0,
    n_step=32,
    cfg=1.0,
    seed=42
):
    """End-to-end SVC handler for the Gradio UI.

    Trims both uploads, preprocesses prompt and target (optional vocal
    separation + f0 extraction), then runs SVC inference via APP_STATE.

    Returns the generated audio filepath as str, or None on any failure
    (errors are logged to stderr, and a Gradio warning is shown for missing
    inputs). The @spaces.GPU decorator requests a GPU for the duration of the
    call on Hugging Face Spaces.
    """
    try:
        # Gradio may deliver (filepath, sr) tuples; reduce to filepaths.
        prompt_audio = _normalize_audio_input(prompt_audio)
        target_audio = _normalize_audio_input(target_audio)
        if not prompt_audio or not target_audio:
            gr.Warning(_i18n("warn_missing_audio"))
            return None

        # Per-request working directory with trimmed copies of the inputs.
        session_base = _session_dir()
        audio_dir = session_base / "audio"
        prompt_raw = audio_dir / "prompt.wav"
        target_raw = audio_dir / "target.wav"
        _trim_and_save_audio(prompt_audio, prompt_raw, PROMPT_MAX_SEC_DEFAULT)
        _trim_and_save_audio(target_audio, target_raw, TARGET_MAX_SEC_DEFAULT)

        # Preprocess the prompt (reference voice).
        prompt_ok, prompt_msg, prompt_wav, prompt_f0 = APP_STATE.run_preprocess(
            audio_path=prompt_raw,
            save_path=session_base / "transcriptions" / "prompt",
            vocal_sep=bool(prompt_vocal_sep),
        )
        if not prompt_ok or prompt_wav is None or prompt_f0 is None:
            print(prompt_msg, file=sys.stderr, flush=True)
            return None

        # Preprocess the target (audio to convert).
        target_ok, target_msg, target_wav, target_f0 = APP_STATE.run_preprocess(
            audio_path=target_raw,
            save_path=session_base / "transcriptions" / "target",
            vocal_sep=bool(target_vocal_sep),
        )
        if not target_ok or target_wav is None or target_f0 is None:
            print(target_msg, file=sys.stderr, flush=True)
            return None

        # Run the conversion (and optional accompaniment remix).
        ok, msg, generated = APP_STATE.run_svc(
            prompt_wav_path=prompt_wav,
            target_wav_path=target_wav,
            prompt_f0_path=prompt_f0,
            target_f0_path=target_f0,
            session_base=session_base,
            auto_shift=bool(auto_shift),
            auto_mix_acc=bool(auto_mix_acc),
            pitch_shift=int(pitch_shift),
            n_step=int(n_step),
            cfg=float(cfg),
            seed=int(seed),
        )
        if not ok or generated is None:
            print(msg, file=sys.stderr, flush=True)
            return None
        return str(generated)
    except Exception:
        _print_exception("_start_svc")
        return None
    finally:
        # Release Python- and CUDA-side memory between requests.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def render_tab_content() -> None:
    """Render SVC tab content (for embedding in app.py). Same UI style as webui: two columns, no title."""
    # NOTE(review): labels here are hard-coded in English and do not use the
    # _I18N table / _usage_md / _tips_md helpers defined above — confirm
    # whether localization was intentionally dropped for the Spaces build.
    with gr.Row(equal_height=False):
        # ── Left column: inputs & controls ──
        with gr.Column(scale=1):
            prompt_audio = gr.Audio(
                label="Prompt audio (reference voice)",
                type="filepath",
                interactive=True,
            )
            target_audio = gr.Audio(
                label="Target audio (to convert)",
                type="filepath",
                interactive=True,
            )

            run_btn = gr.Button(
                value="🎤 Singing Voice Conversion",
                variant="primary",
                size="lg",
            )

            with gr.Accordion("Advanced settings", open=False):
                with gr.Row():
                    prompt_vocal_sep = gr.Checkbox(label="Prompt vocal separation", value=False, scale=1)
                    target_vocal_sep = gr.Checkbox(label="Target vocal separation", value=True, scale=1)
                with gr.Row():
                    auto_shift = gr.Checkbox(label="Auto pitch shift", value=True, scale=1)
                    auto_mix_acc = gr.Checkbox(label="Auto mix accompaniment", value=True, scale=1)
                pitch_shift = gr.Slider(label="Pitch shift (semitones)", value=0, minimum=-36, maximum=36, step=1)
                n_step = gr.Slider(label="n_step", value=32, minimum=1, maximum=200, step=1)
                cfg = gr.Slider(label="cfg scale", value=1.0, minimum=0.0, maximum=10.0, step=0.1)
                seed_input = gr.Slider(label="Seed", value=42, minimum=0, maximum=10000, step=1)

        # ── Right column: output ──
        with gr.Column(scale=1):
            output_audio = gr.Audio(label="Generated audio", type="filepath", interactive=False)
            # Examples supply only the two audio inputs; remaining parameters
            # fall back to _start_svc's defaults. Lazy caching runs inference
            # the first time each example is clicked.
            gr.Examples(
                examples=EXAMPLE_LIST,
                inputs=[prompt_audio, target_audio],
                outputs=[output_audio],
                fn=_start_svc,
                cache_examples=True,
                cache_mode="lazy",
            )

    # Wire the run button to the full parameter set.
    run_btn.click(
        fn=_start_svc,
        inputs=[
            prompt_audio,
            target_audio,
            prompt_vocal_sep,
            target_vocal_sep,
            auto_shift,
            auto_mix_acc,
            pitch_shift,
            n_step,
            cfg,
            seed_input,
        ],
        outputs=[output_audio],
    )
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def render_interface() -> gr.Blocks:
    """Build the full standalone page: centered HTML header plus the SVC tab.

    Returns the (un-launched) gr.Blocks app; callers queue and launch it.
    """
    with gr.Blocks(title="SoulX-Singer", theme=gr.themes.Default()) as page:
        # Static header banner (implicit string concatenation of style chunks).
        gr.HTML(
            '<div style="'
            'text-align: center; '
            'padding: 1.25rem 0 1.5rem; '
            'margin-bottom: 0.5rem;'
            '">'
            '<div style="'
            'display: inline-block; '
            'font-size: 1.75rem; '
            'font-weight: 700; '
            'letter-spacing: 0.02em; '
            'line-height: 1.3;'
            '">SoulX-Singer</div>'
            '<div style="'
            'width: 80px; '
            'height: 3px; '
            'margin: 1rem auto 0; '
            'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
            'border-radius: 2px;'
            '"></div>'
            '</div>'
        )
        render_tab_content()
    return page
|
| 407 |
+
|
| 408 |
+
|
| 409 |
+
if __name__ == "__main__":
    # Standalone entry point: parse CLI flags and serve the Gradio app.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--port", type=int, default=7861, help="Gradio server port")
    cli.add_argument("--share", action="store_true", help="Create public link")
    opts = cli.parse_args()

    demo = render_interface()
    demo.queue()
    demo.launch(share=opts.share, server_name="0.0.0.0", server_port=opts.port)
|