Spaces:
Paused
Paused
add fp16 support for svc
Browse files- cli/inference_svc.py +20 -3
- soulxsinger/models/soulxsinger_svc.py +41 -21
- webui_svc.py +10 -3
cli/inference_svc.py
CHANGED
|
@@ -67,8 +67,18 @@ def process(args, config, model: torch.nn.Module):
|
|
| 67 |
n_step = args.n_steps if hasattr(args, "n_steps") else config.infer.n_steps
|
| 68 |
cfg = args.cfg if hasattr(args, "cfg") else config.infer.cfg
|
| 69 |
|
| 70 |
-
generated_audio, generated_shift = model.infer(
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
if args.pitch_shift != generated_shift:
|
| 73 |
args.pitch_shift = generated_shift
|
| 74 |
# print(f"Applied pitch shift of {generated_shift} semitones to match GT F0 contour.")
|
|
@@ -99,7 +109,14 @@ if __name__ == "__main__":
|
|
| 99 |
parser.add_argument("--pitch_shift", type=int, default=0)
|
| 100 |
parser.add_argument("--n_steps", type=int, default=32)
|
| 101 |
parser.add_argument("--cfg", type=float, default=3.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
args = parser.parse_args()
|
| 103 |
-
|
| 104 |
config = load_config(args.config)
|
|
|
|
| 105 |
main(args, config)
|
|
|
|
| 67 |
n_step = args.n_steps if hasattr(args, "n_steps") else config.infer.n_steps
|
| 68 |
cfg = args.cfg if hasattr(args, "cfg") else config.infer.cfg
|
| 69 |
|
| 70 |
+
generated_audio, generated_shift = model.infer(
|
| 71 |
+
pt_wav=pt_wav,
|
| 72 |
+
gt_wav=gt_wav,
|
| 73 |
+
pt_f0=pt_f0,
|
| 74 |
+
gt_f0=gt_f0,
|
| 75 |
+
auto_shift=args.auto_shift,
|
| 76 |
+
pitch_shift=args.pitch_shift,
|
| 77 |
+
n_steps=n_step,
|
| 78 |
+
cfg=cfg,
|
| 79 |
+
use_fp16=args.use_fp16,
|
| 80 |
+
)
|
| 81 |
+
generated_audio = generated_audio.squeeze().float().cpu().numpy()
|
| 82 |
if args.pitch_shift != generated_shift:
|
| 83 |
args.pitch_shift = generated_shift
|
| 84 |
# print(f"Applied pitch shift of {generated_shift} semitones to match GT F0 contour.")
|
|
|
|
| 109 |
parser.add_argument("--pitch_shift", type=int, default=0)
|
| 110 |
parser.add_argument("--n_steps", type=int, default=32)
|
| 111 |
parser.add_argument("--cfg", type=float, default=3.0)
|
| 112 |
+
parser.add_argument(
|
| 113 |
+
"--fp16",
|
| 114 |
+
action="store_true",
|
| 115 |
+
default=False,
|
| 116 |
+
help="Use FP16 inference (faster on GPU)",
|
| 117 |
+
)
|
| 118 |
args = parser.parse_args()
|
| 119 |
+
|
| 120 |
config = load_config(args.config)
|
| 121 |
+
args.use_fp16 = args.fp16
|
| 122 |
main(args, config)
|
soulxsinger/models/soulxsinger_svc.py
CHANGED
|
@@ -4,6 +4,7 @@ import torch.nn.functional as F
|
|
| 4 |
import numpy as np
|
| 5 |
from tqdm import tqdm
|
| 6 |
from typing import Optional, Dict, Any, List, Tuple
|
|
|
|
| 7 |
|
| 8 |
from soulxsinger.models.modules.vocoder import Vocoder
|
| 9 |
from soulxsinger.models.modules.decoder import CFMDecoder
|
|
@@ -11,6 +12,10 @@ from soulxsinger.models.modules.mel_transform import MelSpectrogramEncoder
|
|
| 11 |
from soulxsinger.models.modules.whisper_encoder import WhisperEncoder
|
| 12 |
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
class SoulXSingerSVC(nn.Module):
|
| 15 |
"""
|
| 16 |
SoulXSinger SVC model.
|
|
@@ -186,6 +191,7 @@ class SoulXSingerSVC(nn.Module):
|
|
| 186 |
pitch_shift=0,
|
| 187 |
n_steps=32,
|
| 188 |
cfg=3,
|
|
|
|
| 189 |
):
|
| 190 |
"""
|
| 191 |
SVC inference pipeline. First build vocal segments based on F0 contour, then run inference for each segment and merge results.
|
|
@@ -198,6 +204,7 @@ class SoulXSingerSVC(nn.Module):
|
|
| 198 |
pitch_shift: manual pitch shift in semitones (overrides auto_shift if > 0)
|
| 199 |
n_steps: number of diffusion steps for inference
|
| 200 |
cfg: classifier-free guidance scale for inference
|
|
|
|
| 201 |
"""
|
| 202 |
|
| 203 |
# calculate auto pitch shift
|
|
@@ -212,17 +219,29 @@ class SoulXSingerSVC(nn.Module):
|
|
| 212 |
else:
|
| 213 |
pitch_shift = pitch_shift
|
| 214 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
# if target audio is less than 30 seconds, infer the whole audio
|
| 216 |
if gt_wav.shape[-1] < 30 * self.audio_cfg.sample_rate:
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
return generated_audio, pitch_shift
|
| 227 |
|
| 228 |
# if target audio is longer than 30 seconds, build vocal segments and infer each segment
|
|
@@ -258,15 +277,17 @@ class SoulXSingerSVC(nn.Module):
|
|
| 258 |
|
| 259 |
segment_gt_wav = gt_wav[:, wav_start:wav_end]
|
| 260 |
segment_gt_f0 = gt_f0[:, f0_start:f0_end]
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
|
|
|
|
|
|
| 270 |
|
| 271 |
segment_start = int(round(seg_start_sec * self.audio_cfg.sample_rate))
|
| 272 |
segment_end = int(round(seg_end_sec * self.audio_cfg.sample_rate))
|
|
@@ -276,8 +297,7 @@ class SoulXSingerSVC(nn.Module):
|
|
| 276 |
|
| 277 |
return generated_audio, pitch_shift
|
| 278 |
|
| 279 |
-
def infer_segment(self, pt_wav, gt_wav, pt_f0, gt_f0, pitch_shift=0, n_steps=32, cfg=3):
|
| 280 |
-
pt_mel = self.mel(pt_wav)
|
| 281 |
len_prompt_mel = pt_mel.shape[1]
|
| 282 |
pt_f0 = F.pad(pt_f0, (0, 0, 0, max(0, len_prompt_mel - pt_f0.shape[1])))[:, :len_prompt_mel]
|
| 283 |
|
|
@@ -308,7 +328,7 @@ class SoulXSingerSVC(nn.Module):
|
|
| 308 |
)
|
| 309 |
|
| 310 |
generated_audio = self.vocoder(generated_mel.transpose(1, 2)[0:1, ...])
|
| 311 |
-
generated_audio = generated_audio.squeeze()
|
| 312 |
|
| 313 |
# cut or pad to match gt_wav length
|
| 314 |
if generated_audio.shape[-1] > gt_wav.shape[-1]:
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
from tqdm import tqdm
|
| 6 |
from typing import Optional, Dict, Any, List, Tuple
|
| 7 |
+
from contextlib import nullcontext
|
| 8 |
|
| 9 |
from soulxsinger.models.modules.vocoder import Vocoder
|
| 10 |
from soulxsinger.models.modules.decoder import CFMDecoder
|
|
|
|
| 12 |
from soulxsinger.models.modules.whisper_encoder import WhisperEncoder
|
| 13 |
|
| 14 |
|
| 15 |
+
def _autocast_if(enabled: bool):
|
| 16 |
+
"""Return autocast(context) if enabled else no-op context. Use: with _autocast_if(use_amp): ..."""
|
| 17 |
+
return torch.amp.autocast(device_type="cuda", enabled=True) if enabled else nullcontext()
|
| 18 |
+
|
| 19 |
class SoulXSingerSVC(nn.Module):
|
| 20 |
"""
|
| 21 |
SoulXSinger SVC model.
|
|
|
|
| 191 |
pitch_shift=0,
|
| 192 |
n_steps=32,
|
| 193 |
cfg=3,
|
| 194 |
+
use_fp16=True,
|
| 195 |
):
|
| 196 |
"""
|
| 197 |
SVC inference pipeline. First build vocal segments based on F0 contour, then run inference for each segment and merge results.
|
|
|
|
| 204 |
pitch_shift: manual pitch shift in semitones (overrides auto_shift if > 0)
|
| 205 |
n_steps: number of diffusion steps for inference
|
| 206 |
cfg: classifier-free guidance scale for inference
|
| 207 |
+
use_fp16: if True, run in FP16 except mel extraction to save memory and speed.
|
| 208 |
"""
|
| 209 |
|
| 210 |
# calculate auto pitch shift
|
|
|
|
| 219 |
else:
|
| 220 |
pitch_shift = pitch_shift
|
| 221 |
|
| 222 |
+
use_fp16 = use_fp16 and pt_wav.is_cuda
|
| 223 |
+
with torch.amp.autocast(device_type="cuda", enabled=False):
|
| 224 |
+
pt_mel = self.mel(pt_wav.float() if pt_wav.dtype != torch.float32 else pt_wav)
|
| 225 |
+
if use_fp16:
|
| 226 |
+
pt_mel = pt_mel.half()
|
| 227 |
+
pt_wav = pt_wav.half()
|
| 228 |
+
gt_wav = gt_wav.half()
|
| 229 |
+
pt_f0 = pt_f0.half()
|
| 230 |
+
gt_f0 = gt_f0.half()
|
| 231 |
+
|
| 232 |
# if target audio is less than 30 seconds, infer the whole audio
|
| 233 |
if gt_wav.shape[-1] < 30 * self.audio_cfg.sample_rate:
|
| 234 |
+
with _autocast_if(use_fp16):
|
| 235 |
+
generated_audio = self.infer_segment(
|
| 236 |
+
pt_mel=pt_mel,
|
| 237 |
+
pt_wav=pt_wav,
|
| 238 |
+
gt_wav=gt_wav,
|
| 239 |
+
pt_f0=pt_f0,
|
| 240 |
+
gt_f0=gt_f0,
|
| 241 |
+
pitch_shift=pitch_shift,
|
| 242 |
+
n_steps=n_steps,
|
| 243 |
+
cfg=cfg,
|
| 244 |
+
)
|
| 245 |
return generated_audio, pitch_shift
|
| 246 |
|
| 247 |
# if target audio is longer than 30 seconds, build vocal segments and infer each segment
|
|
|
|
| 277 |
|
| 278 |
segment_gt_wav = gt_wav[:, wav_start:wav_end]
|
| 279 |
segment_gt_f0 = gt_f0[:, f0_start:f0_end]
|
| 280 |
+
with _autocast_if(use_fp16):
|
| 281 |
+
segment_generated_audio = self.infer_segment(
|
| 282 |
+
pt_mel=pt_mel,
|
| 283 |
+
pt_wav=pt_wav,
|
| 284 |
+
gt_wav=segment_gt_wav,
|
| 285 |
+
pt_f0=pt_f0,
|
| 286 |
+
gt_f0=segment_gt_f0,
|
| 287 |
+
pitch_shift=pitch_shift,
|
| 288 |
+
n_steps=n_steps,
|
| 289 |
+
cfg=cfg,
|
| 290 |
+
)
|
| 291 |
|
| 292 |
segment_start = int(round(seg_start_sec * self.audio_cfg.sample_rate))
|
| 293 |
segment_end = int(round(seg_end_sec * self.audio_cfg.sample_rate))
|
|
|
|
| 297 |
|
| 298 |
return generated_audio, pitch_shift
|
| 299 |
|
| 300 |
+
def infer_segment(self, pt_mel, pt_wav, gt_wav, pt_f0, gt_f0, pitch_shift=0, n_steps=32, cfg=3):
|
|
|
|
| 301 |
len_prompt_mel = pt_mel.shape[1]
|
| 302 |
pt_f0 = F.pad(pt_f0, (0, 0, 0, max(0, len_prompt_mel - pt_f0.shape[1])))[:, :len_prompt_mel]
|
| 303 |
|
|
|
|
| 328 |
)
|
| 329 |
|
| 330 |
generated_audio = self.vocoder(generated_mel.transpose(1, 2)[0:1, ...])
|
| 331 |
+
generated_audio = generated_audio.squeeze().float()
|
| 332 |
|
| 333 |
# cut or pad to match gt_wav length
|
| 334 |
if generated_audio.shape[-1] > gt_wav.shape[-1]:
|
webui_svc.py
CHANGED
|
@@ -175,6 +175,7 @@ class AppState:
|
|
| 175 |
pitch_shift: int,
|
| 176 |
n_step: int,
|
| 177 |
cfg: float,
|
|
|
|
| 178 |
seed: int,
|
| 179 |
) -> tuple[bool, str, Path | None]:
|
| 180 |
try:
|
|
@@ -199,6 +200,7 @@ class AppState:
|
|
| 199 |
args.pitch_shift = int(pitch_shift)
|
| 200 |
args.n_steps = int(n_step)
|
| 201 |
args.cfg = float(cfg)
|
|
|
|
| 202 |
|
| 203 |
svc_process(args, self.svc_config, self.svc_model)
|
| 204 |
|
|
@@ -306,6 +308,7 @@ def _run_svc_convert(
|
|
| 306 |
pitch_shift=0,
|
| 307 |
n_step=32,
|
| 308 |
cfg=1.0,
|
|
|
|
| 309 |
seed=42,
|
| 310 |
):
|
| 311 |
try:
|
|
@@ -325,6 +328,7 @@ def _run_svc_convert(
|
|
| 325 |
pitch_shift=int(pitch_shift),
|
| 326 |
n_step=int(n_step),
|
| 327 |
cfg=float(cfg),
|
|
|
|
| 328 |
seed=int(seed),
|
| 329 |
)
|
| 330 |
if not ok or generated is None:
|
|
@@ -351,12 +355,13 @@ def _start_svc(
|
|
| 351 |
pitch_shift=0,
|
| 352 |
n_step=32,
|
| 353 |
cfg=1.0,
|
|
|
|
| 354 |
seed=42,
|
| 355 |
):
|
| 356 |
state = _run_svc_preprocess(prompt_audio, target_audio, prompt_vocal_sep, target_vocal_sep)
|
| 357 |
if state is None:
|
| 358 |
return None
|
| 359 |
-
return _run_svc_convert(state, auto_shift, auto_mix_acc, pitch_shift, n_step, cfg, seed)
|
| 360 |
|
| 361 |
|
| 362 |
def render_tab_content() -> None:
|
|
@@ -387,8 +392,10 @@ def render_tab_content() -> None:
|
|
| 387 |
with gr.Row():
|
| 388 |
auto_shift = gr.Checkbox(label="Auto pitch shift", value=True, scale=1)
|
| 389 |
auto_mix_acc = gr.Checkbox(label="Auto mix accompaniment", value=True, scale=1)
|
|
|
|
|
|
|
| 390 |
pitch_shift = gr.Slider(label="Pitch shift (semitones)", value=0, minimum=-36, maximum=36, step=1)
|
| 391 |
-
n_step = gr.Slider(label="
|
| 392 |
cfg = gr.Slider(label="cfg scale", value=1.0, minimum=0.0, maximum=10.0, step=0.1)
|
| 393 |
seed_input = gr.Slider(label="Seed", value=42, minimum=0, maximum=10000, step=1)
|
| 394 |
|
|
@@ -411,7 +418,7 @@ def render_tab_content() -> None:
|
|
| 411 |
outputs=[svc_state],
|
| 412 |
).then(
|
| 413 |
fn=_run_svc_convert,
|
| 414 |
-
inputs=[svc_state, auto_shift, auto_mix_acc, pitch_shift, n_step, cfg, seed_input],
|
| 415 |
outputs=[output_audio],
|
| 416 |
)
|
| 417 |
|
|
|
|
| 175 |
pitch_shift: int,
|
| 176 |
n_step: int,
|
| 177 |
cfg: float,
|
| 178 |
+
use_fp16: bool,
|
| 179 |
seed: int,
|
| 180 |
) -> tuple[bool, str, Path | None]:
|
| 181 |
try:
|
|
|
|
| 200 |
args.pitch_shift = int(pitch_shift)
|
| 201 |
args.n_steps = int(n_step)
|
| 202 |
args.cfg = float(cfg)
|
| 203 |
+
args.use_fp16 = bool(use_fp16)
|
| 204 |
|
| 205 |
svc_process(args, self.svc_config, self.svc_model)
|
| 206 |
|
|
|
|
| 308 |
pitch_shift=0,
|
| 309 |
n_step=32,
|
| 310 |
cfg=1.0,
|
| 311 |
+
use_fp16=True,
|
| 312 |
seed=42,
|
| 313 |
):
|
| 314 |
try:
|
|
|
|
| 328 |
pitch_shift=int(pitch_shift),
|
| 329 |
n_step=int(n_step),
|
| 330 |
cfg=float(cfg),
|
| 331 |
+
use_fp16=bool(use_fp16),
|
| 332 |
seed=int(seed),
|
| 333 |
)
|
| 334 |
if not ok or generated is None:
|
|
|
|
| 355 |
pitch_shift=0,
|
| 356 |
n_step=32,
|
| 357 |
cfg=1.0,
|
| 358 |
+
use_fp16=True,
|
| 359 |
seed=42,
|
| 360 |
):
|
| 361 |
state = _run_svc_preprocess(prompt_audio, target_audio, prompt_vocal_sep, target_vocal_sep)
|
| 362 |
if state is None:
|
| 363 |
return None
|
| 364 |
+
return _run_svc_convert(state, auto_shift, auto_mix_acc, pitch_shift, n_step, cfg, use_fp16, seed)
|
| 365 |
|
| 366 |
|
| 367 |
def render_tab_content() -> None:
|
|
|
|
| 392 |
with gr.Row():
|
| 393 |
auto_shift = gr.Checkbox(label="Auto pitch shift", value=True, scale=1)
|
| 394 |
auto_mix_acc = gr.Checkbox(label="Auto mix accompaniment", value=True, scale=1)
|
| 395 |
+
with gr.Row():
|
| 396 |
+
use_fp16 = gr.Checkbox(label="Use FP16", value=True, scale=1)
|
| 397 |
pitch_shift = gr.Slider(label="Pitch shift (semitones)", value=0, minimum=-36, maximum=36, step=1)
|
| 398 |
+
n_step = gr.Slider(label="diffusion steps", value=32, minimum=1, maximum=200, step=1)
|
| 399 |
cfg = gr.Slider(label="cfg scale", value=1.0, minimum=0.0, maximum=10.0, step=0.1)
|
| 400 |
seed_input = gr.Slider(label="Seed", value=42, minimum=0, maximum=10000, step=1)
|
| 401 |
|
|
|
|
| 418 |
outputs=[svc_state],
|
| 419 |
).then(
|
| 420 |
fn=_run_svc_convert,
|
| 421 |
+
inputs=[svc_state, auto_shift, auto_mix_acc, pitch_shift, n_step, cfg, use_fp16, seed_input],
|
| 422 |
outputs=[output_audio],
|
| 423 |
)
|
| 424 |
|