#!/usr/bin/env python3
"""
Generate pre-built SVC example outputs for the Space.

Run from space/ directory:
    python scripts/generate_example_outputs.py

Uses CPU by default (set CUDA_VISIBLE_DEVICES or --device cuda for GPU).
Each example may take several minutes on CPU.

Prerequisites:
    pip install -r requirements.txt  # from space/ or project root
    # Ensure pretrained models exist (run Space once or:
    #   python -c "from ensure_models import ensure_pretrained_models; ensure_pretrained_models()")
"""
import argparse
import gc
import os
import random
import shutil
import sys
from datetime import datetime
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch

# Add parent (space/) to path when run as script
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from preprocess.pipeline import PreprocessPipeline
from soulxsinger.utils.file_utils import load_config
from cli.inference_svc import build_model as build_svc_model, process as svc_process

SAMPLE_RATE = 44100
PROMPT_MAX_SEC = 30
TARGET_MAX_SEC = 600

# Must match EXAMPLE_LIST order in webui_svc.py
EXAMPLE_PAIRS = [
    ("example/audio/zh_prompt.mp3", "example/audio/zh_target.mp3", "zh_prompt_zh_target.wav"),
    ("example/audio/en_prompt.mp3", "example/audio/en_target.mp3", "en_prompt_en_target.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/I'm Yours.mp3", "sunyanzi_im_yours.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/传奇.mp3", "sunyanzi_legend.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君が好きだと叫びたい.mp3", "sunyanzi_kowarekakeru.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/富士山下.mp3", "sunyanzi_fujisan.wav"),
]

# Fallback for decomposed Unicode filename (macOS may normalize)
EXAMPLE_PAIRS_ALT = [
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君が好きだと叫びたい.mp3", "sunyanzi_kowarekakeru.wav"),
]


def _trim_and_save_audio(src_path: Path, dst_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
    """Load ``src_path`` as mono at ``sr``, keep at most ``max_sec`` seconds, and write to ``dst_path``."""
    audio_data, _ = librosa.load(str(src_path), sr=sr, mono=True)
    audio_data = audio_data[: max_sec * sr]
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(dst_path), audio_data, sr)


def _resolve_example_paths(prompt_rel: str, target_rel: str, out_name: str) -> tuple:
    """Resolve prompt/target paths under ROOT, falling back to the NFD-normalized
    Japanese filename (macOS may decompose Unicode) for the kowarekakeru example."""
    prompt_path = ROOT / prompt_rel
    target_path = ROOT / target_rel
    if not prompt_path.exists() or not target_path.exists():
        if out_name == "sunyanzi_kowarekakeru.wav":
            for pa, ta, _ in EXAMPLE_PAIRS_ALT:
                if (ROOT / pa).exists() and (ROOT / ta).exists():
                    prompt_path = ROOT / pa
                    target_path = ROOT / ta
                    break
    return prompt_path, target_path


def _mix_with_accompaniment(generated: Path, acc_path: Path, pitch_shift: int, mix_sr: int) -> Path:
    """Mix the generated vocal with the separated accompaniment.

    The accompaniment is pitch-shifted to the vocal shift folded into [-6, 6]
    semitones (octave-equivalent, shortest direction). Writes
    ``generated_mixed.wav`` next to ``generated`` and returns its path, or
    returns ``generated`` unchanged when there is nothing to mix.
    """
    # Fold the vocal shift into the nearest-octave equivalent in [-6, 6].
    mul = -1 if pitch_shift < 0 else 1
    acc_shift = mul * (abs(pitch_shift) % 12)
    if acc_shift > 6:
        acc_shift -= 12
    if acc_shift < -6:
        acc_shift += 12

    vocal, _ = librosa.load(str(generated), sr=mix_sr, mono=True)
    acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
    if acc_shift != 0:
        acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)

    mix_len = min(len(vocal), len(acc))
    if mix_len <= 0:
        return generated
    mixed = vocal[:mix_len] + acc[:mix_len]
    # Normalize only if the mix clips.
    peak = float(np.max(np.abs(mixed))) if mixed.size > 0 else 1.0
    if peak > 1.0:
        mixed = mixed / peak
    mixed_path = generated.parent / "generated_mixed.wav"
    sf.write(str(mixed_path), mixed, mix_sr)
    return mixed_path


def main():
    """Build the preprocessing pipeline and SVC model, then generate each example output."""
    parser = argparse.ArgumentParser(description="Generate SVC example outputs for Space")
    parser.add_argument("--device", type=str, default=None, help="cuda or cpu (auto if not set)")
    parser.add_argument("--use-fp16", action="store_true", help="Use FP16 (GPU only)")
    parser.add_argument("--index", type=int, default=None, help="Only generate example at index (0-5)")
    args = parser.parse_args()

    if args.index is not None and not 0 <= args.index < len(EXAMPLE_PAIRS):
        parser.error(f"--index must be in range 0-{len(EXAMPLE_PAIRS) - 1}")

    device = args.device or ("cuda:0" if torch.cuda.is_available() else "cpu")
    use_fp16 = args.use_fp16 and "cuda" in device

    os.chdir(ROOT)
    output_dir = ROOT / "example" / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Ensure models (may download SoulX-Singer + SoulX-Singer-Preprocess; first run can take long)
    from ensure_models import ensure_pretrained_models
    print("Checking / downloading pretrained models (HF)...", flush=True)
    ensure_pretrained_models()
    print("Pretrained models ready.", flush=True)

    # Build pipeline and model
    print(f"Using device: {device}", flush=True)
    preprocess = PreprocessPipeline(
        device=device,
        language="Mandarin",
        save_dir=str(ROOT / "outputs" / "gradio" / "_gen" / "svc"),
        vocal_sep=True,
        max_merge_duration=60000,
        midi_transcribe=False,
    )
    config = load_config("soulxsinger/config/soulxsinger.yaml")
    model = build_svc_model(
        model_path="pretrained_models/SoulX-Singer/model-svc.pt",
        config=config,
        device=device,
        use_fp16=use_fp16,
    )

    pairs = EXAMPLE_PAIRS
    if args.index is not None:
        pairs = [pairs[args.index]]

    for i, (prompt_rel, target_rel, out_name) in enumerate(pairs):
        out_path = output_dir / out_name
        prompt_path, target_path = _resolve_example_paths(prompt_rel, target_rel, out_name)
        if not prompt_path.exists():
            print(f"[{i+1}] SKIP: {prompt_path} not found", flush=True)
            continue
        if not target_path.exists():
            print(f"[{i+1}] SKIP: {target_path} not found", flush=True)
            continue
        if out_path.exists():
            print(f"[{i+1}] SKIP (exists): {out_name}", flush=True)
            continue

        print(f"[{i+1}] Generating {out_name} ...", flush=True)
        session_base = (
            ROOT / "outputs" / "gradio" / "_gen" / "svc"
            / datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        )
        audio_dir = session_base / "audio"
        audio_dir.mkdir(parents=True, exist_ok=True)
        prompt_raw = audio_dir / "prompt.wav"
        target_raw = audio_dir / "target.wav"
        _trim_and_save_audio(prompt_path, prompt_raw, PROMPT_MAX_SEC)
        _trim_and_save_audio(target_path, target_raw, TARGET_MAX_SEC)

        # Preprocess prompt (no vocal separation: prompt is assumed clean vocals)
        prompt_save = session_base / "transcriptions" / "prompt"
        ok, msg, prompt_wav, prompt_f0 = _run_preprocess(preprocess, prompt_raw, prompt_save, vocal_sep=False)
        if not ok:
            print(f"  Preprocess prompt failed: {msg}", flush=True)
            continue

        # Preprocess target (separate vocals from accompaniment)
        target_save = session_base / "transcriptions" / "target"
        ok, msg, target_wav, target_f0 = _run_preprocess(preprocess, target_raw, target_save, vocal_sep=True)
        if not ok:
            print(f"  Preprocess target failed: {msg}", flush=True)
            continue

        # SVC inference — fixed seeds for reproducible example outputs
        random.seed(42)
        np.random.seed(42)
        torch.manual_seed(42)
        infer_args = argparse.Namespace(
            device=device,
            prompt_wav_path=str(prompt_wav),
            target_wav_path=str(target_wav),
            prompt_f0_path=str(prompt_f0),
            target_f0_path=str(target_f0),
            save_dir=str(session_base / "generated"),
            auto_shift=True,
            auto_mix_acc=True,
            pitch_shift=0,
            n_steps=32,
            cfg=1.0,
            use_fp16=use_fp16,
        )
        Path(infer_args.save_dir).mkdir(parents=True, exist_ok=True)
        try:
            svc_process(infer_args, config, model)
        except Exception as e:
            print(f"  SVC failed: {e}", flush=True)
            continue

        generated = Path(infer_args.save_dir) / "generated.wav"
        if not generated.exists():
            print(f"  Output not found: {generated}", flush=True)
            continue

        # Mix accompaniment if available
        acc_path = session_base / "transcriptions" / "target" / "acc.wav"
        if acc_path.exists():
            generated = _mix_with_accompaniment(
                generated, acc_path, infer_args.pitch_shift, config.audio.sample_rate
            )

        # Copy to final output
        shutil.copy(str(generated), str(out_path))
        print(f"  -> {out_path}", flush=True)
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("Done.", flush=True)


def _run_preprocess(pipeline, audio_path: Path, save_path: Path, vocal_sep: bool):
    """Run the preprocessing pipeline on one audio file.

    Returns ``(ok, msg, vocal_wav_path, vocal_f0_path)``; on failure the paths
    are ``None`` and ``msg`` carries the error. Broad except is deliberate:
    this is a best-effort boundary and the caller prints the message and skips.
    """
    try:
        # NOTE(review): pipeline.save_dir is mutated per call — the pipeline
        # appears to be reused across examples with a fresh output dir each time.
        pipeline.save_dir = str(save_path)
        pipeline.run(
            audio_path=str(audio_path),
            vocal_sep=vocal_sep,
            max_merge_duration=60000,
            language="Mandarin",
        )
        vocal_wav = save_path / "vocal.wav"
        vocal_f0 = save_path / "vocal_f0.npy"
        if not vocal_wav.exists() or not vocal_f0.exists():
            return False, f"missing {vocal_wav} or {vocal_f0}", None, None
        return True, "ok", vocal_wav, vocal_f0
    except Exception as e:
        return False, str(e), None, None


if __name__ == "__main__":
    main()