kokole committed on
Commit
2566adf
·
1 Parent(s): a4b297a

feat: add svc inference code and webui

Browse files
.gitattributes CHANGED
@@ -44,3 +44,5 @@ raven.wav filter=lfs diff=lfs merge=lfs -text
44
  anita.wav filter=lfs diff=lfs merge=lfs -text
45
  everybody_loves.wav filter=lfs diff=lfs merge=lfs -text
46
  obama.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
44
  anita.wav filter=lfs diff=lfs merge=lfs -text
45
  everybody_loves.wav filter=lfs diff=lfs merge=lfs -text
46
  obama.wav filter=lfs diff=lfs merge=lfs -text
47
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
48
+ *.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -18,9 +18,39 @@ if __name__ == "__main__":
18
  os.chdir(ROOT)
19
  ensure_pretrained_models()
20
 
21
- from webui import render_interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- page = render_interface()
24
  page.queue()
25
  page.launch(
26
  server_name="0.0.0.0",
 
18
  os.chdir(ROOT)
19
  ensure_pretrained_models()
20
 
21
+ import gradio as gr
22
+ from webui import render_tab_content as render_svs_tab
23
+ from webui_svc import render_tab_content as render_svc_tab
24
+
25
+ with gr.Blocks(title="SoulX-Singer", theme=gr.themes.Default()) as page:
26
+ gr.HTML(
27
+ '<div style="'
28
+ 'text-align: center; '
29
+ 'padding: 1.25rem 0 1.5rem; '
30
+ 'margin-bottom: 0.5rem;'
31
+ '">'
32
+ '<div style="'
33
+ 'display: inline-block; '
34
+ 'font-size: 1.75rem; '
35
+ 'font-weight: 700; '
36
+ 'letter-spacing: 0.02em; '
37
+ 'line-height: 1.3;'
38
+ '">SoulX-Singer</div>'
39
+ '<div style="'
40
+ 'width: 80px; '
41
+ 'height: 3px; '
42
+ 'margin: 1rem auto 0; '
43
+ 'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
44
+ 'border-radius: 2px;'
45
+ '"></div>'
46
+ '</div>'
47
+ )
48
+ with gr.Tabs():
49
+ with gr.Tab("Singing Voice Synthesis"):
50
+ render_svs_tab()
51
+ with gr.Tab("Singing Voice Conversion"):
52
+ render_svc_tab()
53
 
 
54
  page.queue()
55
  page.launch(
56
  server_name="0.0.0.0",
cli/inference_svc.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import json
4
+ import argparse
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ import soundfile as sf
8
+ from collections import OrderedDict
9
+ from omegaconf import DictConfig
10
+
11
+ from soulxsinger.utils.file_utils import load_config
12
+ from soulxsinger.models.soulxsinger_svc import SoulXSingerSVC
13
+ from soulxsinger.utils.audio_utils import load_wav
14
+
15
+
16
def build_model(
    model_path: str,
    config: "DictConfig",
    device: str = "cuda",
):
    """
    Build the SVC model from a pre-trained checkpoint and model configuration.

    Args:
        model_path (str): Path to the checkpoint file.
        config (DictConfig): Model configuration.
        device (str, optional): Device to use. Defaults to "cuda".

    Returns:
        torch.nn.Module: The initialized SoulXSingerSVC model, in eval mode
        on `device`.

    Raises:
        FileNotFoundError: If `model_path` does not exist.
        KeyError: If the checkpoint does not contain a 'state_dict' entry.
    """
    if not os.path.isfile(model_path):
        raise FileNotFoundError(
            f"Model checkpoint not found: {model_path}. "
            "Please download the pretrained model and place it at the path, or set --model_path."
        )

    model = SoulXSingerSVC(config).to(device)
    print("Model initialized.")
    print("Model parameters:", sum(p.numel() for p in model.parameters()) / 1e6, "M")

    # weights_only=False: checkpoint may contain non-tensor metadata.
    checkpoint = torch.load(model_path, weights_only=False, map_location=device)
    if "state_dict" not in checkpoint:
        raise KeyError(
            f"Checkpoint at {model_path} has no 'state_dict' key. "
            "Expected a checkpoint saved with model.state_dict()."
        )
    model.load_state_dict(checkpoint["state_dict"], strict=True)

    # Model was already moved to `device` above; only switch to eval mode here.
    model.eval()
    print("Model checkpoint loaded.")

    return model
55
+
56
+
57
def process(args, config, model: torch.nn.Module):
    """Run the full SVC inference pipeline for one prompt/target pair.

    Loads the prompt (reference timbre) and target (melody/lyrics) waveforms
    plus their pre-extracted F0 contours, runs model.infer, and writes the
    converted audio to `<save_dir>/generated.wav`.

    Args:
        args: Parsed CLI namespace (paths, device, pitch options).
        config: Model/audio configuration (sample rate, inference defaults).
        model: An initialized SoulXSingerSVC model in eval mode.
    """
    os.makedirs(args.save_dir, exist_ok=True)

    # Waveforms are resampled to the model's rate by load_wav; F0 contours are
    # pre-extracted .npy files, given a leading batch dimension here.
    pt_wav = load_wav(args.prompt_wav_path, config.audio.sample_rate).to(args.device)
    gt_wav = load_wav(args.target_wav_path, config.audio.sample_rate).to(args.device)
    pt_f0 = torch.from_numpy(np.load(args.prompt_f0_path)).unsqueeze(0).to(args.device)
    gt_f0 = torch.from_numpy(np.load(args.target_f0_path)).unsqueeze(0).to(args.device)

    # CLI values win; fall back to config defaults when the attribute is absent.
    n_step = getattr(args, "n_steps", config.infer.n_steps)
    cfg = getattr(args, "cfg", config.infer.cfg)

    generated_audio, generated_shift = model.infer(
        pt_wav,
        gt_wav,
        pt_f0,
        gt_f0,
        auto_shift=args.auto_shift,
        pitch_shift=args.pitch_shift,
        n_steps=n_step,
        cfg=cfg,
    )
    generated_audio = generated_audio.squeeze().cpu().numpy()

    # Record the shift the model actually applied (auto_shift may change it).
    if args.pitch_shift != generated_shift:
        args.pitch_shift = generated_shift

    out_path = os.path.join(args.save_dir, "generated.wav")
    sf.write(out_path, generated_audio, config.audio.sample_rate)
    print(f"Generated audio saved to {out_path}")
78
+
79
+
80
def main(args, config):
    """Entry point: construct the SVC model, then run the inference pipeline."""
    svc_model = build_model(
        model_path=args.model_path,
        config=config,
        device=args.device,
    )
    process(args, config, svc_model)
87
+
88
if __name__ == "__main__":
    # Command-line interface for one-shot singing voice conversion.
    cli = argparse.ArgumentParser()

    # Runtime / model locations.
    cli.add_argument("--device", type=str, default="cuda")
    cli.add_argument("--model_path", type=str, default='pretrained_models/soulx-singer/model.pt')
    cli.add_argument("--config", type=str, default='soulxsinger/config/soulxsinger.yaml')

    # Input audio and pre-extracted F0 contours.
    cli.add_argument("--prompt_wav_path", type=str, default='example/audio/zh_prompt.wav')
    cli.add_argument("--target_wav_path", type=str, default='example/audio/zh_target.wav')
    cli.add_argument("--prompt_f0_path", type=str, default='example/audio/zh_prompt_f0.npy')
    cli.add_argument("--target_f0_path", type=str, default='example/audio/zh_target_f0.npy')
    cli.add_argument("--save_dir", type=str, default='outputs')

    # Pitch handling and sampling controls.
    cli.add_argument("--auto_shift", action="store_true")
    cli.add_argument("--pitch_shift", type=int, default=0)
    cli.add_argument("--n_steps", type=int, default=32)
    cli.add_argument("--cfg", type=float, default=3.0)

    parsed_args = cli.parse_args()
    loaded_config = load_config(parsed_args.config)
    main(parsed_args, loaded_config)
ensure_models.py CHANGED
@@ -10,7 +10,7 @@ MODEL_DIR_PREPROCESS = PRETRAINED_DIR / "SoulX-Singer-Preprocess"
10
 
11
  def ensure_pretrained_models():
12
  """Download SoulX-Singer and Preprocess models from Hugging Face Hub if not present."""
13
- if (MODEL_DIR_SVS / "model.pt").exists() and MODEL_DIR_PREPROCESS.exists():
14
  print("Pretrained models already present, skipping download.", flush=True)
15
  return
16
 
@@ -26,7 +26,7 @@ def ensure_pretrained_models():
26
 
27
  PRETRAINED_DIR.mkdir(parents=True, exist_ok=True)
28
 
29
- if not (MODEL_DIR_SVS / "model.pt").exists():
30
  print("Downloading SoulX-Singer model...", flush=True)
31
  snapshot_download(
32
  repo_id="Soul-AILab/SoulX-Singer",
 
10
 
11
  def ensure_pretrained_models():
12
  """Download SoulX-Singer and Preprocess models from Hugging Face Hub if not present."""
13
+ if (MODEL_DIR_SVS / "model.pt").exists() and (MODEL_DIR_SVS / "model-svc.pt").exists() and MODEL_DIR_PREPROCESS.exists():
14
  print("Pretrained models already present, skipping download.", flush=True)
15
  return
16
 
 
26
 
27
  PRETRAINED_DIR.mkdir(parents=True, exist_ok=True)
28
 
29
+ if not (MODEL_DIR_SVS / "model.pt").exists() or not (MODEL_DIR_SVS / "model-svc.pt").exists():
30
  print("Downloading SoulX-Singer model...", flush=True)
31
  snapshot_download(
32
  repo_id="Soul-AILab/SoulX-Singer",
example/audio/en_prompt.mp3 CHANGED
Binary files a/example/audio/en_prompt.mp3 and b/example/audio/en_prompt.mp3 differ
 
example/audio/en_target.mp3 CHANGED
Binary files a/example/audio/en_target.mp3 and b/example/audio/en_target.mp3 differ
 
example/audio/music_f0.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a091dce0ab269093a455f8959222f8c7fb55e8d9c9477e8cd2cde8eb9279d9ef
3
+ size 20720
example/audio/svc_prompt_demo.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dde83f7ff5ef5ad52939db70bd1324b6247ea4f399e60e0393cc18725cf29c3
3
+ size 41187
example/audio/svc_target_demo.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65c2e5fec64a51c613badcce35145b6f8e2bb33907ee7428275bfb918876a2c
3
+ size 1944155
example/audio/svc_webui/I'm Yours.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c65c2e5fec64a51c613badcce35145b6f8e2bb33907ee7428275bfb918876a2c
3
+ size 1944155
example/audio/svc_webui/Sun Yanzi.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dde83f7ff5ef5ad52939db70bd1324b6247ea4f399e60e0393cc18725cf29c3
3
+ size 41187
example/audio/svc_webui/传奇.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dfec7ebb41dd6c56877fdeddf7a5fdc106ea9c2fdb1c06f6adddc6f89e6285e
3
+ size 4738948
example/audio/svc_webui/君が好きだと叫びたい.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fe990727559bf1ffb548c562b6c3b19f16602e3c147da42bf56fc92129ae35e
3
+ size 3706589
example/audio/svc_webui/富士山下.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1cad7eaabe05f1c6ef1994bf4326cdafc79991e1c647a857fa2f64925e84aab
3
+ size 4147219
example/audio/zh_prompt.mp3 CHANGED
Binary files a/example/audio/zh_prompt.mp3 and b/example/audio/zh_prompt.mp3 differ
 
example/audio/zh_prompt_f0.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aecf5f40c16a6390e8bb8c19ce69120dcbedaea5e4051aba1bdde95a024f29d3
3
+ size 4408
example/audio/zh_target.mp3 CHANGED
Binary files a/example/audio/zh_target.mp3 and b/example/audio/zh_target.mp3 differ
 
example/infer_svc.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ script_dir=$(dirname "$(realpath "$0")")
4
+ root_dir=$(dirname "$script_dir")
5
+
6
+ cd $root_dir || exit
7
+ export PYTHONPATH=$root_dir:$PYTHONPATH
8
+
9
+ model_path=pretrained_models/SoulX-Singer/model-svc.pt
10
+ config=soulxsinger/config/soulxsinger.yaml
11
+ prompt_wav_path=example/audio/zh_prompt.mp3
12
+ target_wav_path=example/audio/music.mp3
13
+ prompt_f0_path=example/audio/zh_prompt_f0.npy
14
+ target_f0_path=example/audio/music_f0.npy
15
+ save_dir=example/generated/music_svc
16
+
17
+ python -m cli.inference_svc \
18
+ --device cuda \
19
+ --model_path $model_path \
20
+ --config $config \
21
+ --prompt_wav_path $prompt_wav_path \
22
+ --target_wav_path $target_wav_path \
23
+ --prompt_f0_path $prompt_f0_path \
24
+ --target_f0_path $target_f0_path \
25
+ --save_dir $save_dir \
26
+ --auto_shift \
27
+ --pitch_shift 0
example/preprocess.sh CHANGED
@@ -15,6 +15,7 @@ save_dir=example/transcriptions/zh_prompt
15
  language=Mandarin
16
  vocal_sep=False
17
  max_merge_duration=30000
 
18
 
19
  python -m preprocess.pipeline \
20
  --audio_path $audio_path \
@@ -22,7 +23,8 @@ python -m preprocess.pipeline \
22
  --language $language \
23
  --device $device \
24
  --vocal_sep $vocal_sep \
25
- --max_merge_duration $max_merge_duration
 
26
 
27
 
28
  ####### Run Target Annotation #######
@@ -31,6 +33,7 @@ save_dir=example/transcriptions/music
31
  language=Mandarin
32
  vocal_sep=True
33
  max_merge_duration=60000
 
34
 
35
  python -m preprocess.pipeline \
36
  --audio_path $audio_path \
@@ -38,4 +41,5 @@ python -m preprocess.pipeline \
38
  --language $language \
39
  --device $device \
40
  --vocal_sep $vocal_sep \
41
- --max_merge_duration $max_merge_duration
 
 
15
  language=Mandarin
16
  vocal_sep=False
17
  max_merge_duration=30000
18
+ midi_transcribe=True # Whether to transcribe vocal midi, set True for singing voice synthesis, False for singing voice conversion
19
 
20
  python -m preprocess.pipeline \
21
  --audio_path $audio_path \
 
23
  --language $language \
24
  --device $device \
25
  --vocal_sep $vocal_sep \
26
+ --max_merge_duration $max_merge_duration \
27
+ --midi_transcribe $midi_transcribe
28
 
29
 
30
  ####### Run Target Annotation #######
 
33
  language=Mandarin
34
  vocal_sep=True
35
  max_merge_duration=60000
36
+ midi_transcribe=True # Whether to transcribe vocal midi, set True for singing voice synthesis, False for singing voice conversion
37
 
38
  python -m preprocess.pipeline \
39
  --audio_path $audio_path \
 
41
  --language $language \
42
  --device $device \
43
  --vocal_sep $vocal_sep \
44
+ --max_merge_duration $max_merge_duration \
45
+ --midi_transcribe $midi_transcribe
preprocess/pipeline.py CHANGED
@@ -16,12 +16,13 @@ from preprocess.tools import (
16
 
17
 
18
  class PreprocessPipeline:
19
- def __init__(self, device: str, language: str, save_dir: str, vocal_sep: bool = True, max_merge_duration: int = 60000):
20
  self.device = device
21
  self.language = language
22
  self.save_dir = save_dir
23
  self.vocal_sep = vocal_sep
24
  self.max_merge_duration = max_merge_duration
 
25
 
26
  if vocal_sep:
27
  self.vocal_separator = VocalSeparator(
@@ -37,26 +38,31 @@ class PreprocessPipeline:
37
  model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
38
  device=device,
39
  )
40
- self.vocal_detector = VocalDetector(
41
- cut_wavs_output_dir= f"{save_dir}/cut_wavs",
42
- )
43
- self.lyric_transcriber = LyricTranscriber(
44
- zh_model_path="pretrained_models/SoulX-Singer-Preprocess/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
45
- en_model_path="pretrained_models/SoulX-Singer-Preprocess/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
46
- device=device
47
- )
48
- self.note_transcriber = NoteTranscriber(
49
- rosvot_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rosvot/model.pt",
50
- rwbd_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rwbd/model.pt",
51
- device=device
52
- )
 
 
 
 
 
53
 
54
  def run(
55
  self,
56
  audio_path: str,
57
- vocal_sep: bool = True,
58
- max_merge_duration: int = 60000,
59
- language: str = "Mandarin"
60
  ) -> None:
61
  vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
62
  max_merge_duration = self.max_merge_duration if max_merge_duration is None else max_merge_duration
@@ -81,7 +87,11 @@ class PreprocessPipeline:
81
  vocal_path = output_dir / "vocal.wav"
82
  sf.write(vocal_path, vocal, sample_rate)
83
 
84
- vocal_f0 = self.f0_extractor.process(str(vocal_path))
 
 
 
 
85
  segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)
86
 
87
  metadata = []
@@ -124,10 +134,11 @@ def main(args):
124
  save_dir=args.save_dir,
125
  vocal_sep=args.vocal_sep,
126
  max_merge_duration=args.max_merge_duration,
 
127
  )
128
  pipeline.run(
129
  audio_path=args.audio_path,
130
- language=args.language
131
  )
132
 
133
 
@@ -139,8 +150,12 @@ if __name__ == "__main__":
139
  parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
140
  parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
141
  parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
142
- parser.add_argument("--vocal_sep", type=bool, default=True, help="Whether to perform vocal separation")
143
  parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
 
144
  args = parser.parse_args()
145
 
 
 
 
146
  main(args)
 
16
 
17
 
18
  class PreprocessPipeline:
19
+ def __init__(self, device: str, language: str, save_dir: str, vocal_sep: bool = True, max_merge_duration: int = 60000, midi_transcribe: bool = True):
20
  self.device = device
21
  self.language = language
22
  self.save_dir = save_dir
23
  self.vocal_sep = vocal_sep
24
  self.max_merge_duration = max_merge_duration
25
+ self.midi_transcribe = midi_transcribe
26
 
27
  if vocal_sep:
28
  self.vocal_separator = VocalSeparator(
 
38
  model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
39
  device=device,
40
  )
41
+ if self.midi_transcribe:
42
+ self.vocal_detector = VocalDetector(
43
+ cut_wavs_output_dir= f"{save_dir}/cut_wavs",
44
+ )
45
+ self.lyric_transcriber = LyricTranscriber(
46
+ zh_model_path="pretrained_models/SoulX-Singer-Preprocess/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
47
+ en_model_path="pretrained_models/SoulX-Singer-Preprocess/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
48
+ device=device
49
+ )
50
+ self.note_transcriber = NoteTranscriber(
51
+ rosvot_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rosvot/model.pt",
52
+ rwbd_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rwbd/model.pt",
53
+ device=device
54
+ )
55
+ else:
56
+ self.vocal_detector = None
57
+ self.lyric_transcriber = None
58
+ self.note_transcriber = None
59
 
60
  def run(
61
  self,
62
  audio_path: str,
63
+ vocal_sep: bool = None,
64
+ max_merge_duration: int = None,
65
+ language: str = None,
66
  ) -> None:
67
  vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
68
  max_merge_duration = self.max_merge_duration if max_merge_duration is None else max_merge_duration
 
87
  vocal_path = output_dir / "vocal.wav"
88
  sf.write(vocal_path, vocal, sample_rate)
89
 
90
+ vocal_f0 = self.f0_extractor.process(str(vocal_path), f0_path=str(vocal_path).replace(".wav", "_f0.npy"))
91
+
92
+ if not self.midi_transcribe or self.vocal_detector is None or self.lyric_transcriber is None or self.note_transcriber is None:
93
+ return
94
+
95
  segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)
96
 
97
  metadata = []
 
134
  save_dir=args.save_dir,
135
  vocal_sep=args.vocal_sep,
136
  max_merge_duration=args.max_merge_duration,
137
+ midi_transcribe=args.midi_transcribe,
138
  )
139
  pipeline.run(
140
  audio_path=args.audio_path,
141
+ language=args.language,
142
  )
143
 
144
 
 
150
  parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
151
  parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
152
  parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
153
+ parser.add_argument("--vocal_sep", type=str, default="True", help="Whether to perform vocal separation")
154
  parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
155
+ parser.add_argument("--midi_transcribe", type=str, default="True", help="Whether to do MIDI transcription")
156
  args = parser.parse_args()
157
 
158
+ args.vocal_sep = args.vocal_sep.lower() == "true"
159
+ args.midi_transcribe = args.midi_transcribe.lower() == "true"
160
+
161
  main(args)
soulxsinger/models/modules/whisper_encoder.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Frozen Whisper encoder wrapper (wav -> encoder embeddings)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torchaudio
10
+ from transformers import WhisperFeatureExtractor, WhisperModel
11
+
12
+ WHISPER_MEL_FRAMES = 3000 # 3000 frames at 16000 Hz
13
+
14
+
15
class WhisperEncoder():
    """Frozen Whisper-base encoder: raw waveform in, encoder hidden states out."""

    def __init__(
        self,
        device: Optional[str] = None,
    ) -> None:
        # Feature extractor turns raw audio into log-mel features at 16 kHz.
        self.fe = WhisperFeatureExtractor.from_pretrained("openai/whisper-base")
        self.model = WhisperModel.from_pretrained("openai/whisper-base")
        target_device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.model.to(target_device)

    def encode(
        self,
        wav: torch.Tensor,
        sr: int,
    ) -> torch.Tensor:
        """Return Whisper encoder embeddings for `wav` sampled at `sr` Hz."""
        # Resample to the extractor's native rate only when needed.
        if sr != self.fe.sampling_rate:
            wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=self.fe.sampling_rate)
        wav_np = wav.cpu().detach().numpy().astype("float32", copy=False)

        inputs = self.fe(
            wav_np,
            sampling_rate=self.fe.sampling_rate,
            return_tensors="pt",
            padding=False,
            truncation=False,
            return_attention_mask=True,
        )

        # Whisper's encoder expects exactly WHISPER_MEL_FRAMES mel frames:
        # right-pad short inputs, clip long ones.
        feats = inputs.input_features
        frame_count = feats.shape[-1]
        if frame_count < WHISPER_MEL_FRAMES:
            feats = torch.nn.functional.pad(feats, (0, WHISPER_MEL_FRAMES - frame_count))
        else:
            feats = feats[..., :WHISPER_MEL_FRAMES]

        feats = feats.to(wav.device)
        # Lazily migrate the model if the caller's tensors live elsewhere.
        if self.model.device != wav.device:
            self.model = self.model.to(wav.device)
        mask = inputs.attention_mask.to(wav.device) if inputs.attention_mask is not None else None

        hidden = self.model.encoder(feats).last_hidden_state

        if mask is not None:
            # The encoder halves the time axis; keep only frames that map to
            # real (un-padded) audio and zero anything past the first sample's
            # valid length.
            valid_mel_frames = mask.sum(dim=1)
            valid_enc_frames = (valid_mel_frames + 1) // 2
            keep = min(int(valid_enc_frames.max().item()), hidden.shape[1])
            hidden = hidden[:, :keep, :]
            first_valid = min(int(valid_enc_frames[0].item()), keep)
            if first_valid < keep:
                hidden[0, first_valid:, :] = 0

        return hidden
67
+
68
+
69
if __name__ == "__main__":
    # Smoke test: encode 25 seconds of random 24 kHz audio on GPU and print
    # the resulting embedding shape.
    torch.manual_seed(0)
    dummy_audio = torch.randn(1, 24000 * 25).float().to("cuda")
    enc = WhisperEncoder()
    embeddings = enc.encode(dummy_audio, sr=24000)
    print(embeddings.shape)
soulxsinger/models/soulxsinger_svc.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from tqdm import tqdm
6
+ from typing import Optional, Dict, Any, List, Tuple
7
+
8
+ from soulxsinger.models.modules.vocoder import Vocoder
9
+ from soulxsinger.models.modules.decoder import CFMDecoder
10
+ from soulxsinger.models.modules.mel_transform import MelSpectrogramEncoder
11
+ from soulxsinger.models.modules.whisper_encoder import WhisperEncoder
12
+
13
+
14
+ class SoulXSingerSVC(nn.Module):
15
+ """
16
+ SoulXSinger SVC model.
17
+ """
18
+ def __init__(self, config: Dict):
19
+ super(SoulXSingerSVC, self).__init__()
20
+ self.audio_cfg = config.audio
21
+ enc_cfg = config.model.encoder
22
+ cfm_cfg = config.model.flow_matching
23
+
24
+ self.whisper_encoder = WhisperEncoder()
25
+ self.f0_encoder = nn.Embedding(enc_cfg["f0_bin"], enc_cfg["f0_dim"])
26
+ self.cfm_decoder = CFMDecoder(cfm_cfg)
27
+
28
+ self.mel = MelSpectrogramEncoder(self.audio_cfg)
29
+ self.vocoder = Vocoder()
30
+
31
+ @staticmethod
32
+ def f0_to_coarse(f0, f0_bin=361, f0_min=32.7031956625, f0_shift=0):
33
+ """
34
+ Convert continuous F0 values to discrete F0 bins (SIL and C1 - B6, 361 bins).
35
+ args:
36
+ f0: continuous F0 values
37
+ f0_bin: number of F0 bins
38
+ f0_min: minimum F0 value
39
+ f0_shift: shift value for F0 bins
40
+ returns:
41
+ f0_coarse: discrete F0 bins
42
+ """
43
+ is_torch = isinstance(f0, torch.Tensor)
44
+ uv_mask = f0 <= 0
45
+
46
+ if is_torch:
47
+ f0_safe = torch.maximum(f0, torch.tensor(f0_min))
48
+ f0_cents = 1200 * torch.log2(f0_safe / f0_min)
49
+ else:
50
+ f0_safe = np.maximum(f0, f0_min)
51
+ f0_cents = 1200 * np.log2(f0_safe / f0_min)
52
+
53
+ f0_coarse = (f0_cents / 20) + 1
54
+
55
+ if is_torch:
56
+ f0_coarse = torch.round(f0_coarse).long()
57
+ f0_coarse = torch.clamp(f0_coarse, min=1, max=f0_bin - 1)
58
+ else:
59
+ f0_coarse = np.rint(f0_coarse).astype(int)
60
+ f0_coarse = np.clip(f0_coarse, 1, f0_bin - 1)
61
+
62
+ f0_coarse[uv_mask] = 0
63
+
64
+ if f0_shift != 0:
65
+ if is_torch:
66
+ voiced = f0_coarse > 0
67
+ if voiced.any():
68
+ shifted = f0_coarse[voiced] + f0_shift
69
+ f0_coarse[voiced] = torch.clamp(shifted, 1, f0_bin - 1)
70
+ else:
71
+ voiced = f0_coarse > 0
72
+ if np.any(voiced):
73
+ shifted = f0_coarse[voiced] + f0_shift
74
+ f0_coarse[voiced] = np.clip(shifted, 1, f0_bin - 1)
75
+
76
+ return f0_coarse
77
+
78
+ @staticmethod
79
+ def build_vocal_segments(
80
+ f0,
81
+ f0_rate: int = 50,
82
+ uv_frames_th: int = 5,
83
+ min_duration_sec: float = 5.0,
84
+ max_duration_sec: float = 30.0,
85
+ num_overlaps: int = 1,
86
+ ignore_silent_segments: bool = True,
87
+ ) -> Tuple[List[Tuple[float, float]], List[Tuple[float, float]]]:
88
+ """Build vocal segments based on F0 contour. First split by long silent runs, then merge into segments based on min and max duration constraints.
89
+ args:
90
+ f0: F0 contour of the audio, 1D array or tensor with shape (T,)
91
+ f0_rate: F0 sampling rate in Hz (e.g., 50 for 20ms hop size)
92
+ uv_frames_th: number of consecutive zero F0 frames to consider as a split point
93
+ min_duration_sec: minimum duration of each segment in seconds
94
+ max_duration_sec: maximum duration of each segment in seconds
95
+ num_overlaps: number of overlapping segments to create for each non-overlapping segment (for smooth inference)
96
+ ignore_silent_segments: whether to ignore segments that are mostly silent (e.g., > 95% zero F0)
97
+ returns:
98
+ overlap_segments: list of (overlap_start_sec, overlap_end_sec) for each segment, which may overlap with adjacent segments for smooth inference
99
+ segments: list of (seg_start_sec, seg_end_sec) for each segment, which are non-overlapping and used for final merging
100
+ """
101
+ if isinstance(f0, torch.Tensor):
102
+ f0_np = f0.detach().float().cpu().numpy()
103
+ else:
104
+ f0_np = np.asarray(f0, dtype=np.float32)
105
+ f0_np = np.squeeze(f0_np)
106
+
107
+ total_frames = int(f0_np.shape[0])
108
+ if total_frames == 0:
109
+ return [], []
110
+
111
+ min_frames = max(1, int(round(min_duration_sec * f0_rate)))
112
+ max_frames = max(1, int(round(max_duration_sec * f0_rate)))
113
+
114
+ split_points = [0] # silence split points in frame indices, starting with 0 and ending with total_frames
115
+
116
+ def append_split_point(point: int):
117
+ # Ensure split points are within valid range and respect max_frames constraint
118
+ point = int(max(0, min(point, total_frames)))
119
+ while point - split_points[-1] > max_frames:
120
+ split_points.append(split_points[-1] + max_frames)
121
+ if point > split_points[-1]:
122
+ split_points.append(point)
123
+
124
+ idx = 0
125
+ while idx < total_frames:
126
+ if f0_np[idx] == 0:
127
+ run_start = idx
128
+ while idx < total_frames and f0_np[idx] == 0:
129
+ idx += 1
130
+ run_end = idx
131
+ if (run_end - run_start) >= uv_frames_th:
132
+ split_point = max(run_end - 5, (run_start + run_end) // 2)
133
+ append_split_point(split_point)
134
+ else:
135
+ idx += 1
136
+ append_split_point(total_frames)
137
+ # print(f"Initial split points (in seconds): {[round(p / f0_rate, 2) for p in split_points]}")
138
+
139
+ segments: List[Tuple[int, int]] = []
140
+ overlap_segments: List[Tuple[int, int]] = []
141
+
142
+ def append_segment(start_idx: int, end_idx: int, num_overlaps: int = num_overlaps):
143
+ segments.append((split_points[start_idx] / f0_rate, split_points[end_idx] / f0_rate))
144
+ overlap_start_idx = start_idx
145
+ if start_idx > 0 and (split_points[end_idx] - split_points[start_idx - num_overlaps]) <= max_frames:
146
+ overlap_start_idx = start_idx - num_overlaps
147
+ overlap_segments.append((split_points[overlap_start_idx] / f0_rate, split_points[end_idx] / f0_rate))
148
+
149
+ segment_start, segment_end = 0, 1
150
+
151
+ while segment_start < len(split_points) - 1:
152
+ while segment_end < len(split_points) and (split_points[segment_end] - split_points[segment_start]) < min_frames:
153
+ segment_end += 1
154
+
155
+ if segment_end >= len(split_points):
156
+ append_segment(segment_start, len(split_points) - 1, num_overlaps=num_overlaps)
157
+ break
158
+ append_segment(segment_start, segment_end, num_overlaps=num_overlaps)
159
+ segment_start = segment_end
160
+ segment_end = segment_start + 1
161
+
162
+ # print(f"Final segments (overlap_start, overlap_end, seg_start_time, seg_end_time) in seconds: {overlap_segments}")
163
+ if ignore_silent_segments:
164
+ filtered_idx = []
165
+ for i, seg in enumerate(overlap_segments):
166
+ start_frame = int(seg[0] * f0_rate)
167
+ end_frame = int(seg[1] * f0_rate)
168
+ total_frames = end_frame - start_frame
169
+ voice_frames = np.sum(f0_np[start_frame:end_frame] > 0)
170
+ if voice_frames / total_frames > 0.05 and voice_frames >= 10: # at least 10 voiced frames and >5% voiced frames
171
+ filtered_idx.append(i)
172
+
173
+ overlap_segments = [overlap_segments[i] for i in filtered_idx]
174
+ segments = [segments[i] for i in filtered_idx]
175
+ # print(f"Filtered segments with mostly silence removed: {overlap_segments}")
176
+
177
+ return overlap_segments, segments
178
+
179
+ def infer(
180
+ self,
181
+ pt_wav: str|torch.Tensor,
182
+ gt_wav: str|torch.Tensor,
183
+ pt_f0: str|torch.Tensor,
184
+ gt_f0: str|torch.Tensor,
185
+ auto_shift=False,
186
+ pitch_shift=0,
187
+ n_steps=32,
188
+ cfg=3,
189
+ ):
190
+ """
191
+ SVC inference pipeline. First build vocal segments based on F0 contour, then run inference for each segment and merge results.
192
+ args:
193
+ pt_wav: prompt waveform path or tensor
194
+ gt_wav: target waveform path or tensor
195
+ pt_f0: prompt F0 path or tensor
196
+ gt_f0: target F0 path or tensor
197
+ auto_shift: whether to automatically calculate pitch shift based on median F0 of prompt and target
198
+ pitch_shift: manual pitch shift in semitones (overrides auto_shift if > 0)
199
+ n_steps: number of diffusion steps for inference
200
+ cfg: classifier-free guidance scale for inference
201
+ """
202
+
203
+ # calculate auto pitch shift
204
+ if auto_shift and pitch_shift == 0:
205
+ if gt_f0 is not None and pt_f0 is not None:
206
+ gt_f0_median = torch.median(gt_f0[gt_f0 > 0])
207
+ pt_f0_median = torch.median(pt_f0[pt_f0 > 0])
208
+ pitch_shift = torch.round(torch.log2(pt_f0_median / gt_f0_median) * 1200 / 100).int().item()
209
+ else:
210
+ print("Warning: pitch_shift is True but note_pitch or f0 is None. Set f0_shift to 0.")
211
+ pitch_shift = 0
212
+ else:
213
+ pitch_shift = pitch_shift
214
+
215
+ # if target audio is less than 30 seconds, infer the whole audio
216
+ if gt_wav.shape[-1] < 30 * self.audio_cfg.sample_rate:
217
+ generated_audio = self.infer_segment(
218
+ pt_wav=pt_wav,
219
+ gt_wav=gt_wav,
220
+ pt_f0=pt_f0,
221
+ gt_f0=gt_f0,
222
+ pitch_shift=pitch_shift,
223
+ n_steps=n_steps,
224
+ cfg=cfg,
225
+ )
226
+ return generated_audio, pitch_shift
227
+
228
+ # if target audio is longer than 30 seconds, build vocal segments and infer each segment
229
+ generated_audio = []
230
+
231
+ f0_rate = self.audio_cfg.sample_rate // self.audio_cfg.hop_size
232
+
233
+ overlap_segments, segments = self.build_vocal_segments(
234
+ gt_f0,
235
+ f0_rate=f0_rate,
236
+ uv_frames_th=10,
237
+ min_duration_sec=15.0,
238
+ max_duration_sec=30.0,
239
+ )
240
+ if len(segments) == 0:
241
+ segments = [(0.0, gt_wav.shape[-1] / self.audio_cfg.sample_rate)]
242
+ overlap_segments = [(0.0, gt_wav.shape[-1] / self.audio_cfg.sample_rate)]
243
+
244
+ generated_audio = torch.zeros_like(gt_wav)
245
+ for idx in tqdm(range(len(segments)), total=len(segments), desc="Inferring segments (SVC)", dynamic_ncols=True):
246
+ overlap_start_sec, overlap_end_sec = overlap_segments[idx]
247
+ seg_start_sec, seg_end_sec = segments[idx]
248
+
249
+ wav_start = int(round(overlap_start_sec * self.audio_cfg.sample_rate))
250
+ wav_end = int(round(overlap_end_sec * self.audio_cfg.sample_rate))
251
+ f0_start = int(round(overlap_start_sec * f0_rate))
252
+ f0_end = int(round(overlap_end_sec * f0_rate))
253
+
254
+ wav_start = max(0, min(wav_start, gt_wav.shape[-1]))
255
+ wav_end = max(wav_start, min(wav_end, gt_wav.shape[-1]))
256
+ f0_start = max(0, min(f0_start, gt_f0.shape[-1]))
257
+ f0_end = max(f0_start, min(f0_end, gt_f0.shape[-1]))
258
+
259
+ segment_gt_wav = gt_wav[:, wav_start:wav_end]
260
+ segment_gt_f0 = gt_f0[:, f0_start:f0_end]
261
+ segment_generated_audio = self.infer_segment(
262
+ pt_wav=pt_wav,
263
+ gt_wav=segment_gt_wav,
264
+ pt_f0=pt_f0,
265
+ gt_f0=segment_gt_f0,
266
+ pitch_shift=pitch_shift,
267
+ n_steps=n_steps,
268
+ cfg=cfg,
269
+ )
270
+
271
+ segment_start = int(round(seg_start_sec * self.audio_cfg.sample_rate))
272
+ segment_end = int(round(seg_end_sec * self.audio_cfg.sample_rate))
273
+ segment_generated_audio = segment_generated_audio[segment_start - wav_start: segment_end - wav_start]
274
+
275
+ generated_audio[:, segment_start:segment_end] = segment_generated_audio
276
+
277
+ return generated_audio, pitch_shift
278
+
279
+ def infer_segment(self, pt_wav, gt_wav, pt_f0, gt_f0, pitch_shift=0, n_steps=32, cfg=3):
280
+ pt_mel = self.mel(pt_wav)
281
+ len_prompt_mel = pt_mel.shape[1]
282
+ pt_f0 = F.pad(pt_f0, (0, 0, 0, max(0, len_prompt_mel - pt_f0.shape[1])))[:, :len_prompt_mel]
283
+
284
+ f0_course_pt = self.f0_to_coarse(pt_f0)
285
+ f0_course_gt = self.f0_to_coarse(gt_f0, f0_shift=pitch_shift * 5)
286
+ f0_course = torch.cat([f0_course_pt, f0_course_gt], 1)
287
+
288
+ pt_content_feat = self.whisper_encoder.encode(pt_wav, sr=self.audio_cfg.sample_rate)
289
+ gt_content_feat = self.whisper_encoder.encode(gt_wav, sr=self.audio_cfg.sample_rate)
290
+ t_pt, t_gt = f0_course_pt.shape[1], f0_course_gt.shape[1]
291
+ pt_content_feat = F.pad(pt_content_feat, (0, 0, 0, max(0, t_pt - pt_content_feat.shape[1])))[:, :t_pt, :]
292
+ gt_content_feat = F.pad(gt_content_feat, (0, 0, 0, max(0, t_gt - gt_content_feat.shape[1])))[:, :t_gt, :]
293
+
294
+ content_feat = torch.cat([pt_content_feat, gt_content_feat], 1)
295
+
296
+ f0_feat = self.f0_encoder(f0_course)
297
+ features = content_feat + f0_feat
298
+
299
+ gt_decoder_inp = features[:, len_prompt_mel:, :]
300
+ pt_decoder_inp = features[:, :len_prompt_mel, :]
301
+
302
+ generated_mel = self.cfm_decoder.reverse_diffusion(
303
+ pt_mel,
304
+ pt_decoder_inp,
305
+ gt_decoder_inp,
306
+ n_timesteps=n_steps,
307
+ cfg=cfg
308
+ )
309
+
310
+ generated_audio = self.vocoder(generated_mel.transpose(1, 2)[0:1, ...])
311
+ generated_audio = generated_audio.squeeze()
312
+
313
+ # cut or pad to match gt_wav length
314
+ if generated_audio.shape[-1] > gt_wav.shape[-1]:
315
+ generated_audio = generated_audio[:gt_wav.shape[-1]]
316
+ elif generated_audio.shape[-1] < gt_wav.shape[-1]:
317
+ generated_audio = F.pad(generated_audio, (0, gt_wav.shape[-1] - generated_audio.shape[-1]))
318
+
319
+ return generated_audio
webui.py CHANGED
@@ -4,6 +4,7 @@ import random
4
  import shutil
5
  import sys
6
  import traceback
 
7
  from pathlib import Path
8
  from typing import Tuple
9
  import spaces
@@ -269,6 +270,10 @@ def transcription_function(
269
  except Exception:
270
  print(traceback.format_exc(), file=sys.stderr, flush=True)
271
  return None, None
 
 
 
 
272
 
273
 
274
  @spaces.GPU
@@ -351,7 +356,187 @@ def synthesis_function(
351
  except Exception:
352
  print(traceback.format_exc(), file=sys.stderr, flush=True)
353
  return None, gr.update(), gr.update()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
355
 
356
 
357
  def render_interface() -> gr.Blocks:
@@ -378,180 +563,7 @@ def render_interface() -> gr.Blocks:
378
  '"></div>'
379
  '</div>'
380
  )
381
-
382
- with gr.Row(equal_height=False):
383
- # ── Left column: inputs & controls ──
384
- with gr.Column(scale=1):
385
- prompt_audio = gr.Audio(
386
- label="Prompt audio (reference voice), max 30s",
387
- type="filepath",
388
- interactive=True,
389
- )
390
- target_audio = gr.Audio(
391
- label="Target audio (melody / lyrics source), max 60s",
392
- type="filepath",
393
- interactive=True,
394
- )
395
-
396
- with gr.Row():
397
- control_radio = gr.Radio(
398
- choices=["melody", "score"],
399
- value="melody",
400
- label="Control type",
401
- scale=1,
402
- )
403
- auto_shift = gr.Checkbox(
404
- label="Auto pitch shift",
405
- value=True,
406
- interactive=True,
407
- scale=1,
408
- )
409
-
410
- synthesis_btn = gr.Button(
411
- value="🎤 Generate singing voice",
412
- variant="primary",
413
- size="lg",
414
- )
415
-
416
- # ── Advanced: transcription settings & metadata ──
417
- with gr.Accordion("Advanced: Transcription & Metadata", open=False):
418
- with gr.Row():
419
- pitch_shift = gr.Number(
420
- label="Pitch shift (semitones)",
421
- value=0,
422
- minimum=-36,
423
- maximum=36,
424
- step=1,
425
- interactive=True,
426
- scale=1,
427
- )
428
- seed_input = gr.Number(
429
- label="Seed",
430
- value=12306,
431
- step=1,
432
- interactive=True,
433
- scale=1,
434
- )
435
- gr.Markdown(
436
- "Upload your own metadata files to skip automatic transcription. "
437
- "You can use the [SoulX-Singer-Midi-Editor]"
438
- "(https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor) "
439
- "to edit metadata for better alignment."
440
- )
441
- with gr.Row():
442
- prompt_lyric_lang = gr.Dropdown(
443
- label="Prompt lyric language",
444
- choices=[
445
- ("Mandarin", "Mandarin"),
446
- ("Cantonese", "Cantonese"),
447
- ("English", "English"),
448
- ],
449
- value="English",
450
- interactive=True,
451
- scale=1,
452
- )
453
- target_lyric_lang = gr.Dropdown(
454
- label="Target lyric language",
455
- choices=[
456
- ("Mandarin", "Mandarin"),
457
- ("Cantonese", "Cantonese"),
458
- ("English", "English"),
459
- ],
460
- value="English",
461
- interactive=True,
462
- scale=1,
463
- )
464
- with gr.Row():
465
- prompt_vocal_sep = gr.Checkbox(
466
- label="Prompt vocal separation",
467
- value=False,
468
- interactive=True,
469
- scale=1,
470
- )
471
- target_vocal_sep = gr.Checkbox(
472
- label="Target vocal separation",
473
- value=True,
474
- interactive=True,
475
- scale=1,
476
- )
477
- transcription_btn = gr.Button(
478
- value="Run singing transcription",
479
- variant="secondary",
480
- size="lg",
481
- )
482
- with gr.Row():
483
- prompt_metadata = gr.File(
484
- label="Prompt metadata",
485
- type="filepath",
486
- file_types=[".json"],
487
- interactive=True,
488
- )
489
- target_metadata = gr.File(
490
- label="Target metadata",
491
- type="filepath",
492
- file_types=[".json"],
493
- interactive=True,
494
- )
495
-
496
- # ── Right column: output ──
497
- with gr.Column(scale=1):
498
- output_audio = gr.Audio(
499
- label="Generated audio",
500
- type="filepath",
501
- interactive=False,
502
- )
503
- gr.Examples(
504
- examples=[
505
- ["raven.wav", "happy_birthday.mp3"],
506
- ["anita.wav", "happy_birthday.mp3"],
507
- ["obama.wav", "happy_birthday.mp3"],
508
- ["raven.wav", "everybody_loves.wav"],
509
- ["anita.wav", "everybody_loves.wav"],
510
- ["obama.wav", "everybody_loves.wav"],
511
- ],
512
- inputs=[prompt_audio, target_audio],
513
- outputs=[output_audio, prompt_metadata, target_metadata],
514
- fn=synthesis_function,
515
- cache_examples=True,
516
- cache_mode="lazy"
517
- )
518
-
519
- # ── Event handlers ──
520
- prompt_audio.change(
521
- fn=lambda: None,
522
- inputs=[],
523
- outputs=[prompt_metadata],
524
- )
525
-
526
- target_audio.change(
527
- fn=lambda: None,
528
- inputs=[],
529
- outputs=[target_metadata],
530
- )
531
-
532
- transcription_btn.click(
533
- fn=transcription_function,
534
- inputs=[
535
- prompt_audio, target_audio,
536
- prompt_metadata, target_metadata,
537
- prompt_lyric_lang, target_lyric_lang,
538
- prompt_vocal_sep, target_vocal_sep,
539
- ],
540
- outputs=[prompt_metadata, target_metadata],
541
- )
542
-
543
- synthesis_btn.click(
544
- fn=synthesis_function,
545
- inputs=[
546
- prompt_audio, target_audio,
547
- prompt_metadata, target_metadata,
548
- control_radio, auto_shift, pitch_shift, seed_input,
549
- prompt_lyric_lang, target_lyric_lang,
550
- prompt_vocal_sep, target_vocal_sep,
551
- ],
552
- outputs=[output_audio, prompt_metadata, target_metadata],
553
- )
554
-
555
  return page
556
 
557
 
 
4
  import shutil
5
  import sys
6
  import traceback
7
+ import gc
8
  from pathlib import Path
9
  from typing import Tuple
10
  import spaces
 
270
  except Exception:
271
  print(traceback.format_exc(), file=sys.stderr, flush=True)
272
  return None, None
273
+ finally:
274
+ gc.collect()
275
+ if torch.cuda.is_available():
276
+ torch.cuda.empty_cache()
277
 
278
 
279
  @spaces.GPU
 
356
  except Exception:
357
  print(traceback.format_exc(), file=sys.stderr, flush=True)
358
  return None, gr.update(), gr.update()
359
+ finally:
360
+ gc.collect()
361
+ if torch.cuda.is_available():
362
+ torch.cuda.empty_cache()
363
+
364
+
365
+
366
+ def render_tab_content() -> None:
367
+ """Render the main content (for embedding in app.py tabs). No Blocks or title."""
368
+ with gr.Row(equal_height=False):
369
+ # ── Left column: inputs & controls ──
370
+ with gr.Column(scale=1):
371
+ prompt_audio = gr.Audio(
372
+ label="Prompt audio (reference voice), max 30s",
373
+ type="filepath",
374
+ interactive=True,
375
+ )
376
+ target_audio = gr.Audio(
377
+ label="Target audio (melody / lyrics source), max 60s",
378
+ type="filepath",
379
+ interactive=True,
380
+ )
381
+
382
+ with gr.Row():
383
+ control_radio = gr.Radio(
384
+ choices=["melody", "score"],
385
+ value="melody",
386
+ label="Control type",
387
+ scale=1,
388
+ )
389
+ auto_shift = gr.Checkbox(
390
+ label="Auto pitch shift",
391
+ value=True,
392
+ interactive=True,
393
+ scale=1,
394
+ )
395
+
396
+ synthesis_btn = gr.Button(
397
+ value="🎤 Generate singing voice",
398
+ variant="primary",
399
+ size="lg",
400
+ )
401
+
402
+ # ── Advanced: transcription settings & metadata ──
403
+ with gr.Accordion("Advanced: Transcription & Metadata", open=False):
404
+ with gr.Row():
405
+ pitch_shift = gr.Number(
406
+ label="Pitch shift (semitones)",
407
+ value=0,
408
+ minimum=-36,
409
+ maximum=36,
410
+ step=1,
411
+ interactive=True,
412
+ scale=1,
413
+ )
414
+ seed_input = gr.Number(
415
+ label="Seed",
416
+ value=12306,
417
+ step=1,
418
+ interactive=True,
419
+ scale=1,
420
+ )
421
+ gr.Markdown(
422
+ "Upload your own metadata files to skip automatic transcription. "
423
+ "You can use the [SoulX-Singer-Midi-Editor]"
424
+ "(https://huggingface.co/spaces/Soul-AILab/SoulX-Singer-Midi-Editor) "
425
+ "to edit metadata for better alignment."
426
+ )
427
+ with gr.Row():
428
+ prompt_lyric_lang = gr.Dropdown(
429
+ label="Prompt lyric language",
430
+ choices=[
431
+ ("Mandarin", "Mandarin"),
432
+ ("Cantonese", "Cantonese"),
433
+ ("English", "English"),
434
+ ],
435
+ value="English",
436
+ interactive=True,
437
+ scale=1,
438
+ )
439
+ target_lyric_lang = gr.Dropdown(
440
+ label="Target lyric language",
441
+ choices=[
442
+ ("Mandarin", "Mandarin"),
443
+ ("Cantonese", "Cantonese"),
444
+ ("English", "English"),
445
+ ],
446
+ value="English",
447
+ interactive=True,
448
+ scale=1,
449
+ )
450
+ with gr.Row():
451
+ prompt_vocal_sep = gr.Checkbox(
452
+ label="Prompt vocal separation",
453
+ value=False,
454
+ interactive=True,
455
+ scale=1,
456
+ )
457
+ target_vocal_sep = gr.Checkbox(
458
+ label="Target vocal separation",
459
+ value=True,
460
+ interactive=True,
461
+ scale=1,
462
+ )
463
+ transcription_btn = gr.Button(
464
+ value="Run singing transcription",
465
+ variant="secondary",
466
+ size="lg",
467
+ )
468
+ with gr.Row():
469
+ prompt_metadata = gr.File(
470
+ label="Prompt metadata",
471
+ type="filepath",
472
+ file_types=[".json"],
473
+ interactive=True,
474
+ )
475
+ target_metadata = gr.File(
476
+ label="Target metadata",
477
+ type="filepath",
478
+ file_types=[".json"],
479
+ interactive=True,
480
+ )
481
 
482
+ # ── Right column: output ──
483
+ with gr.Column(scale=1):
484
+ output_audio = gr.Audio(
485
+ label="Generated audio",
486
+ type="filepath",
487
+ interactive=False,
488
+ )
489
+ gr.Examples(
490
+ examples=[
491
+ ["raven.wav", "happy_birthday.mp3"],
492
+ ["anita.wav", "happy_birthday.mp3"],
493
+ ["obama.wav", "happy_birthday.mp3"],
494
+ ["raven.wav", "everybody_loves.wav"],
495
+ ["anita.wav", "everybody_loves.wav"],
496
+ ["obama.wav", "everybody_loves.wav"],
497
+ ],
498
+ inputs=[prompt_audio, target_audio],
499
+ outputs=[output_audio, prompt_metadata, target_metadata],
500
+ fn=synthesis_function,
501
+ cache_examples=True,
502
+ cache_mode="lazy"
503
+ )
504
+
505
+ # ── Event handlers ──
506
+ prompt_audio.change(
507
+ fn=lambda: None,
508
+ inputs=[],
509
+ outputs=[prompt_metadata],
510
+ )
511
+
512
+ target_audio.change(
513
+ fn=lambda: None,
514
+ inputs=[],
515
+ outputs=[target_metadata],
516
+ )
517
+
518
+ transcription_btn.click(
519
+ fn=transcription_function,
520
+ inputs=[
521
+ prompt_audio, target_audio,
522
+ prompt_metadata, target_metadata,
523
+ prompt_lyric_lang, target_lyric_lang,
524
+ prompt_vocal_sep, target_vocal_sep,
525
+ ],
526
+ outputs=[prompt_metadata, target_metadata],
527
+ )
528
+
529
+ synthesis_btn.click(
530
+ fn=synthesis_function,
531
+ inputs=[
532
+ prompt_audio, target_audio,
533
+ prompt_metadata, target_metadata,
534
+ control_radio, auto_shift, pitch_shift, seed_input,
535
+ prompt_lyric_lang, target_lyric_lang,
536
+ prompt_vocal_sep, target_vocal_sep,
537
+ ],
538
+ outputs=[output_audio, prompt_metadata, target_metadata],
539
+ )
540
 
541
 
542
  def render_interface() -> gr.Blocks:
 
563
  '"></div>'
564
  '</div>'
565
  )
566
+ render_tab_content()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
567
  return page
568
 
569
 
webui_svc.py ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import sys
3
+ import traceback
4
+ import gc
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Literal
8
+
9
+ import gradio as gr
10
+ import librosa
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+
15
+ import spaces
16
+ from preprocess.pipeline import PreprocessPipeline
17
+ from soulxsinger.utils.file_utils import load_config
18
+ from cli.inference_svc import build_model as build_svc_model, process as svc_process
19
+
20
+
21
+ ROOT = Path(__file__).parent
22
+ SAMPLE_RATE = 44100
23
+ PROMPT_MAX_SEC_DEFAULT = 30
24
+ TARGET_MAX_SEC_DEFAULT = 600
25
+
26
+ # Example rows: only [prompt_audio, target_audio]; other params use UI defaults when running
27
+ EXAMPLE_LIST = [
28
+ [str(ROOT / "example/audio/zh_prompt.mp3"), str(ROOT / "example/audio/zh_target.mp3")],
29
+ [str(ROOT / "example/audio/en_prompt.mp3"), str(ROOT / "example/audio/en_target.mp3")],
30
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/I'm Yours.mp3")],
31
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/传奇.mp3")],
32
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/君が好きだと叫びたい.mp3")],
33
+ [str(ROOT / "example/audio/svc_webui/Sun Yanzi.mp3"), str(ROOT / "example/audio/svc_webui/富士山下.mp3")],
34
+ ]
35
+
36
+ _I18N = dict(
37
+ display_lang_label=dict(en="Display Language", zh="显示语言"),
38
+ title=dict(en="## SoulX-Singer SVC", zh="## SoulX-Singer SVC"),
39
+ prompt_audio_label=dict(en=f"Prompt audio", zh=f"Prompt 音频"),
40
+ target_audio_label=dict(en=f"Target audio", zh=f"Target 音频"),
41
+ prompt_vocal_sep_label=dict(en="Prompt vocal separation", zh="Prompt 人声分离"),
42
+ target_vocal_sep_label=dict(en="Target vocal separation", zh="Target 人声分离"),
43
+ auto_shift_label=dict(en="Auto pitch shift", zh="自动变调"),
44
+ auto_mix_acc_label=dict(en="Auto mix accompaniment", zh="自动混合伴奏"),
45
+ pitch_shift_label=dict(en="Pitch shift (semitones)", zh="指定变调(半音)"),
46
+ n_step_label=dict(en="diffusion steps", zh="采样步数"),
47
+ cfg_label=dict(en="cfg scale", zh="cfg系数"),
48
+ seed_label=dict(en="Seed", zh="种子"),
49
+ examples_label=dict(en="Examples", zh="示例"),
50
+ run_btn=dict(en="🎤Singing Voice Conversion", zh="🎤歌声转换"),
51
+ output_audio_label=dict(en="Generated audio", zh="合成结果音频"),
52
+ warn_missing_audio=dict(en="Please provide both prompt audio and target audio.", zh="请同时上传 Prompt 与 Target 音频。"),
53
+ instruction_title=dict(en="Usage", zh="使用说明"),
54
+ instruction_p1=dict(
55
+ en="Upload the Prompt and Target audio, and configure the parameters",
56
+ zh="上传 Prompt 与 Target 音频,并配置相关参数",
57
+ ),
58
+ instruction_p2=dict(
59
+ en="Click「🎤Singing Voice Conversion」to start singing voice conversion.",
60
+ zh="点击「🎤歌声转换」开始最终生成。",
61
+ ),
62
+ tips_title=dict(en="Tips", zh="提示"),
63
+ tip_p1=dict(
64
+ en="Input: The Prompt audio is recommended to be a clean and clear singing voice, while the Target audio can be either a pure vocal or a mixture with accompaniment. If the audio contains accompaniment, please check the vocal separation option.",
65
+ zh="输入:Prompt 音频建议是干净清晰的歌声,Target 音频可以是纯歌声或伴奏,这两者若带伴奏需要勾选分离选项",
66
+ ),
67
+ tip_p2=dict(
68
+ en="Pitch shift: When there is a large pitch range difference between the Prompt and Target audio, you can try enabling auto pitch shift or manually adjusting the pitch shift in semitones. When a non-zero pitch shift is specified, auto pitch shift will not take effect. The accompaniment of auto mix will be pitch-shifted together with the vocal (keeping the same octave).",
69
+ zh="变调:Prompt 音频的音域和 Target 音频的音域差距较大的时候,可以尝试开启自动变调或手动调整变调半音数,指定非0的变调半音数时,自动变调不生效,自动混音的伴奏会配合歌声进行升降调(保持同一个八度)",
70
+ ),
71
+ tip_p3=dict(
72
+ en="Model parameters: Generally, a larger number of sampling steps will yield better generation quality but also longer generation time; a larger cfg scale will increase timbre similarity and melody fidelity, but may cause more distortion, it is recommended to take a value between 1 and 3.",
73
+ zh="模型参数:一般采样步数越大,生成质量越好,但生成时间也越长;一般cfg系数越大,音色相似度和旋律保真度越高,但是会造成更多的失真,建议取1~3之间的值",
74
+ ),
75
+ tip_p4=dict(
76
+ en="If you want to convert a long audio or a whole song with large pitch range, there may be instability in the generated voice. You can try converting in segments.",
77
+ zh="长音频或完整歌曲中,音域变化较大的情况有可能出现音色不稳定,可以尝试分段转换",
78
+ )
79
+ )
80
+
81
+ _GLOBAL_LANG: Literal["zh", "en"] = "zh"
82
+
83
+
84
+ def _i18n(key: str) -> str:
85
+ return _I18N[key][_GLOBAL_LANG]
86
+
87
+
88
+ def _print_exception(context: str) -> None:
89
+ print(f"[{context}]\n{traceback.format_exc()}", file=sys.stderr, flush=True)
90
+
91
+
92
+ def _get_device() -> str:
93
+ return "cuda:0" if torch.cuda.is_available() else "cpu"
94
+
95
+
96
+ def _session_dir() -> Path:
97
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
98
+ return ROOT / "outputs" / "gradio" / "svc" / timestamp
99
+
100
+
101
+ def _normalize_audio_input(audio):
102
+ return audio[0] if isinstance(audio, tuple) else audio
103
+
104
+
105
+ def _trim_and_save_audio(src_audio_path: str, dst_wav_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
106
+ audio_data, _ = librosa.load(src_audio_path, sr=sr, mono=True)
107
+ audio_data = audio_data[: max_sec * sr]
108
+ dst_wav_path.parent.mkdir(parents=True, exist_ok=True)
109
+ sf.write(dst_wav_path, audio_data, sr)
110
+
111
+
112
+ def _usage_md() -> str:
113
+ return "\n\n".join([
114
+ f"### {_i18n('instruction_title')}",
115
+ f"**1.** {_i18n('instruction_p1')}",
116
+ f"**2.** {_i18n('instruction_p2')}",
117
+ ])
118
+
119
+
120
+ def _tips_md() -> str:
121
+ return "\n\n".join([
122
+ f"### {_i18n('tips_title')}",
123
+ f"- {_i18n('tip_p1')}",
124
+ f"- {_i18n('tip_p2')}",
125
+ f"- {_i18n('tip_p3')}",
126
+ f"- {_i18n('tip_p4')}",
127
+ ])
128
+
129
+
130
+ class AppState:
131
+ def __init__(self) -> None:
132
+ self.device = _get_device()
133
+ self.preprocess_pipeline = PreprocessPipeline(
134
+ device=self.device,
135
+ language="Mandarin",
136
+ save_dir=str(ROOT / "outputs" / "gradio" / "_placeholder" / "svc"),
137
+ vocal_sep=True,
138
+ max_merge_duration=60000,
139
+ midi_transcribe=False,
140
+ )
141
+
142
+ self.svc_config = load_config("soulxsinger/config/soulxsinger.yaml")
143
+ self.svc_model = build_svc_model(
144
+ model_path="pretrained_models/SoulX-Singer/model-svc.pt",
145
+ config=self.svc_config,
146
+ device=self.device,
147
+ )
148
+
149
+ def run_preprocess(self, audio_path: Path, save_path: Path, vocal_sep: bool) -> tuple[bool, str, Path | None, Path | None]:
150
+ try:
151
+ self.preprocess_pipeline.save_dir = str(save_path)
152
+ self.preprocess_pipeline.run(
153
+ audio_path=str(audio_path),
154
+ vocal_sep=vocal_sep,
155
+ max_merge_duration=60000,
156
+ language="Mandarin",
157
+ )
158
+ vocal_wav = save_path / "vocal.wav"
159
+ vocal_f0 = save_path / "vocal_f0.npy"
160
+ if not vocal_wav.exists() or not vocal_f0.exists():
161
+ return False, f"preprocess output missing: {vocal_wav} or {vocal_f0}", None, None
162
+ return True, "ok", vocal_wav, vocal_f0
163
+ except Exception as e:
164
+ return False, f"preprocess failed: {e}", None, None
165
+
166
+ def run_svc(
167
+ self,
168
+ prompt_wav_path: Path,
169
+ target_wav_path: Path,
170
+ prompt_f0_path: Path,
171
+ target_f0_path: Path,
172
+ session_base: Path,
173
+ auto_shift: bool,
174
+ auto_mix_acc: bool,
175
+ pitch_shift: int,
176
+ n_step: int,
177
+ cfg: float,
178
+ seed: int,
179
+ ) -> tuple[bool, str, Path | None]:
180
+ try:
181
+ torch.manual_seed(seed)
182
+ np.random.seed(seed)
183
+ random.seed(seed)
184
+
185
+ save_dir = session_base / "generated"
186
+ save_dir.mkdir(parents=True, exist_ok=True)
187
+
188
+ class Args:
189
+ pass
190
+
191
+ args = Args()
192
+ args.device = self.device
193
+ args.prompt_wav_path = str(prompt_wav_path)
194
+ args.target_wav_path = str(target_wav_path)
195
+ args.prompt_f0_path = str(prompt_f0_path)
196
+ args.target_f0_path = str(target_f0_path)
197
+ args.save_dir = str(save_dir)
198
+ args.auto_shift = auto_shift
199
+ args.pitch_shift = int(pitch_shift)
200
+ args.n_steps = int(n_step)
201
+ args.cfg = float(cfg)
202
+
203
+ svc_process(args, self.svc_config, self.svc_model)
204
+
205
+ generated = save_dir / "generated.wav"
206
+ if not generated.exists():
207
+ return False, f"inference finished but output not found: {generated}", None
208
+
209
+ if auto_mix_acc:
210
+ acc_path = session_base / "transcriptions" / "target" / "acc.wav"
211
+ if acc_path.exists():
212
+ vocal_shift = args.pitch_shift
213
+ mul = -1 if vocal_shift < 0 else 1
214
+ acc_shift = abs(vocal_shift) % 12
215
+ acc_shift = mul * acc_shift
216
+ if acc_shift > 6:
217
+ acc_shift -= 12
218
+ if acc_shift < -6:
219
+ acc_shift += 12
220
+
221
+ mix_sr = self.svc_config.audio.sample_rate
222
+ vocal, _ = librosa.load(str(generated), sr=mix_sr, mono=True)
223
+ acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
224
+ if acc_shift != 0:
225
+ acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)
226
+ print(f"Applied pitch shift of {acc_shift} semitones to accompaniment to match vocal shift of {vocal_shift} semitones.")
227
+
228
+ mix_len = min(len(vocal), len(acc))
229
+ if mix_len > 0:
230
+ mixed = vocal[:mix_len] + acc[:mix_len]
231
+ peak = float(np.max(np.abs(mixed))) if mixed.size > 0 else 1.0
232
+ if peak > 1.0:
233
+ mixed = mixed / peak
234
+ mixed_path = save_dir / "generated_mixed.wav"
235
+ sf.write(str(mixed_path), mixed, mix_sr)
236
+ generated = mixed_path
237
+
238
+ return True, "svc inference done", generated
239
+ except Exception as e:
240
+ return False, f"svc inference failed: {e}", None
241
+
242
+
243
+ APP_STATE = AppState()
244
+
245
+
246
+ @spaces.GPU
247
+ def _start_svc(
248
+ prompt_audio,
249
+ target_audio,
250
+ prompt_vocal_sep=False,
251
+ target_vocal_sep=True,
252
+ auto_shift=True,
253
+ auto_mix_acc=True,
254
+ pitch_shift=0,
255
+ n_step=32,
256
+ cfg=1.0,
257
+ seed=42
258
+ ):
259
+ try:
260
+ prompt_audio = _normalize_audio_input(prompt_audio)
261
+ target_audio = _normalize_audio_input(target_audio)
262
+ if not prompt_audio or not target_audio:
263
+ gr.Warning(_i18n("warn_missing_audio"))
264
+ return None
265
+
266
+ session_base = _session_dir()
267
+ audio_dir = session_base / "audio"
268
+ prompt_raw = audio_dir / "prompt.wav"
269
+ target_raw = audio_dir / "target.wav"
270
+ _trim_and_save_audio(prompt_audio, prompt_raw, PROMPT_MAX_SEC_DEFAULT)
271
+ _trim_and_save_audio(target_audio, target_raw, TARGET_MAX_SEC_DEFAULT)
272
+
273
+ prompt_ok, prompt_msg, prompt_wav, prompt_f0 = APP_STATE.run_preprocess(
274
+ audio_path=prompt_raw,
275
+ save_path=session_base / "transcriptions" / "prompt",
276
+ vocal_sep=bool(prompt_vocal_sep),
277
+ )
278
+ if not prompt_ok or prompt_wav is None or prompt_f0 is None:
279
+ print(prompt_msg, file=sys.stderr, flush=True)
280
+ return None
281
+
282
+ target_ok, target_msg, target_wav, target_f0 = APP_STATE.run_preprocess(
283
+ audio_path=target_raw,
284
+ save_path=session_base / "transcriptions" / "target",
285
+ vocal_sep=bool(target_vocal_sep),
286
+ )
287
+ if not target_ok or target_wav is None or target_f0 is None:
288
+ print(target_msg, file=sys.stderr, flush=True)
289
+ return None
290
+
291
+ ok, msg, generated = APP_STATE.run_svc(
292
+ prompt_wav_path=prompt_wav,
293
+ target_wav_path=target_wav,
294
+ prompt_f0_path=prompt_f0,
295
+ target_f0_path=target_f0,
296
+ session_base=session_base,
297
+ auto_shift=bool(auto_shift),
298
+ auto_mix_acc=bool(auto_mix_acc),
299
+ pitch_shift=int(pitch_shift),
300
+ n_step=int(n_step),
301
+ cfg=float(cfg),
302
+ seed=int(seed),
303
+ )
304
+ if not ok or generated is None:
305
+ print(msg, file=sys.stderr, flush=True)
306
+ return None
307
+ return str(generated)
308
+ except Exception:
309
+ _print_exception("_start_svc")
310
+ return None
311
+ finally:
312
+ gc.collect()
313
+ if torch.cuda.is_available():
314
+ torch.cuda.empty_cache()
315
+
316
+
317
+ def render_tab_content() -> None:
318
+ """Render SVC tab content (for embedding in app.py). Same UI style as webui: two columns, no title."""
319
+ with gr.Row(equal_height=False):
320
+ # ── Left column: inputs & controls ──
321
+ with gr.Column(scale=1):
322
+ prompt_audio = gr.Audio(
323
+ label="Prompt audio (reference voice)",
324
+ type="filepath",
325
+ interactive=True,
326
+ )
327
+ target_audio = gr.Audio(
328
+ label="Target audio (to convert)",
329
+ type="filepath",
330
+ interactive=True,
331
+ )
332
+
333
+ run_btn = gr.Button(
334
+ value="🎤 Singing Voice Conversion",
335
+ variant="primary",
336
+ size="lg",
337
+ )
338
+
339
+ with gr.Accordion("Advanced settings", open=False):
340
+ with gr.Row():
341
+ prompt_vocal_sep = gr.Checkbox(label="Prompt vocal separation", value=False, scale=1)
342
+ target_vocal_sep = gr.Checkbox(label="Target vocal separation", value=True, scale=1)
343
+ with gr.Row():
344
+ auto_shift = gr.Checkbox(label="Auto pitch shift", value=True, scale=1)
345
+ auto_mix_acc = gr.Checkbox(label="Auto mix accompaniment", value=True, scale=1)
346
+ pitch_shift = gr.Slider(label="Pitch shift (semitones)", value=0, minimum=-36, maximum=36, step=1)
347
+ n_step = gr.Slider(label="n_step", value=32, minimum=1, maximum=200, step=1)
348
+ cfg = gr.Slider(label="cfg scale", value=1.0, minimum=0.0, maximum=10.0, step=0.1)
349
+ seed_input = gr.Slider(label="Seed", value=42, minimum=0, maximum=10000, step=1)
350
+
351
+ # ── Right column: output ──
352
+ with gr.Column(scale=1):
353
+ output_audio = gr.Audio(label="Generated audio", type="filepath", interactive=False)
354
+ gr.Examples(
355
+ examples=EXAMPLE_LIST,
356
+ inputs=[prompt_audio, target_audio],
357
+ outputs=[output_audio],
358
+ fn=_start_svc,
359
+ cache_examples=True,
360
+ cache_mode="lazy",
361
+ )
362
+
363
+ run_btn.click(
364
+ fn=_start_svc,
365
+ inputs=[
366
+ prompt_audio,
367
+ target_audio,
368
+ prompt_vocal_sep,
369
+ target_vocal_sep,
370
+ auto_shift,
371
+ auto_mix_acc,
372
+ pitch_shift,
373
+ n_step,
374
+ cfg,
375
+ seed_input,
376
+ ],
377
+ outputs=[output_audio],
378
+ )
379
+
380
+
381
+ def render_interface() -> gr.Blocks:
382
+ with gr.Blocks(title="SoulX-Singer", theme=gr.themes.Default()) as page:
383
+ gr.HTML(
384
+ '<div style="'
385
+ 'text-align: center; '
386
+ 'padding: 1.25rem 0 1.5rem; '
387
+ 'margin-bottom: 0.5rem;'
388
+ '">'
389
+ '<div style="'
390
+ 'display: inline-block; '
391
+ 'font-size: 1.75rem; '
392
+ 'font-weight: 700; '
393
+ 'letter-spacing: 0.02em; '
394
+ 'line-height: 1.3;'
395
+ '">SoulX-Singer</div>'
396
+ '<div style="'
397
+ 'width: 80px; '
398
+ 'height: 3px; '
399
+ 'margin: 1rem auto 0; '
400
+ 'background: linear-gradient(90deg, transparent, #6366f1, transparent); '
401
+ 'border-radius: 2px;'
402
+ '"></div>'
403
+ '</div>'
404
+ )
405
+ render_tab_content()
406
+ return page
407
+
408
+
409
+ if __name__ == "__main__":
410
+ import argparse
411
+
412
+ parser = argparse.ArgumentParser()
413
+ parser.add_argument("--port", type=int, default=7861, help="Gradio server port")
414
+ parser.add_argument("--share", action="store_true", help="Create public link")
415
+ args = parser.parse_args()
416
+
417
+ page = render_interface()
418
+ page.queue()
419
+ page.launch(share=args.share, server_name="0.0.0.0", server_port=args.port)