# (Removed extraction artifact: Hugging Face Spaces page header "Spaces: Running on Zero" — not part of this script.)
#!/usr/bin/env python3
"""
Generate pre-built SVC example outputs for the Space.

Run from space/ directory: python scripts/generate_example_outputs.py
Uses CPU by default (set CUDA_VISIBLE_DEVICES or --device cuda for GPU).
Each example may take several minutes on CPU.

Prerequisites:
    pip install -r requirements.txt  # from space/ or project root
    # Ensure pretrained models exist (run Space once or: python -c "from ensure_models import ensure_pretrained_models; ensure_pretrained_models()")
"""
import argparse
import gc
import os
import random
import shutil
import sys
from datetime import datetime
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch

# Add parent (space/) to path when run as script
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from preprocess.pipeline import PreprocessPipeline
from soulxsinger.utils.file_utils import load_config
from cli.inference_svc import build_model as build_svc_model, process as svc_process
# Working sample rate for trimmed inputs (Hz).
SAMPLE_RATE = 44100
# Duration caps applied before preprocessing (seconds).
PROMPT_MAX_SEC = 30
TARGET_MAX_SEC = 600

# (prompt_path, target_path, output_filename) triples, all relative to ROOT.
# Must match EXAMPLE_LIST order in webui_svc.py
EXAMPLE_PAIRS = [
    ("example/audio/zh_prompt.mp3", "example/audio/zh_target.mp3", "zh_prompt_zh_target.wav"),
    ("example/audio/en_prompt.mp3", "example/audio/en_target.mp3", "en_prompt_en_target.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/I'm Yours.mp3", "sunyanzi_im_yours.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/传奇.mp3", "sunyanzi_legend.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君が好きだと叫びたい.mp3", "sunyanzi_kowarekakeru.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/富士山下.mp3", "sunyanzi_fujisan.wav"),
]
# Fallback for decomposed Unicode filename (macOS may normalize)
EXAMPLE_PAIRS_ALT = [
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君が好きだと叫びたい.mp3", "sunyanzi_kowarekakeru.wav"),
]
def _trim_and_save_audio(src_path: Path, dst_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
    """Load *src_path* as mono audio at *sr*, keep at most *max_sec* seconds, and write it to *dst_path*.

    Parent directories of *dst_path* are created as needed.
    """
    samples, _ = librosa.load(str(src_path), sr=sr, mono=True)
    # Hard cut at the sample corresponding to max_sec; shorter clips pass through untouched.
    clipped = samples[: sr * max_sec]
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(dst_path), clipped, sr)
def _compute_acc_shift(vocal_shift: int) -> int:
    """Fold a vocal pitch shift (semitones) to its octave-equivalent in [-6, 6].

    Mirrors the original inline logic: take the shift modulo 12 (keeping the
    sign), then wrap anything beyond a tritone to the nearer octave so the
    accompaniment is shifted by the smallest equivalent interval.
    """
    sign = -1 if vocal_shift < 0 else 1
    shift = sign * (abs(vocal_shift) % 12)
    if shift > 6:
        shift -= 12
    if shift < -6:
        shift += 12
    return shift


def _resolve_example_paths(prompt_rel: str, target_rel: str, out_name: str):
    """Resolve example audio paths under ROOT.

    For the Japanese-titled example, fall back to EXAMPLE_PAIRS_ALT entries
    (decomposed-Unicode filenames; macOS may normalize to NFD) when the
    primary paths are missing. Returns (prompt_path, target_path).
    """
    prompt_path = ROOT / prompt_rel
    target_path = ROOT / target_rel
    if (not prompt_path.exists() or not target_path.exists()) and out_name == "sunyanzi_kowarekakeru.wav":
        for alt_prompt, alt_target, _ in EXAMPLE_PAIRS_ALT:
            if (ROOT / alt_prompt).exists() and (ROOT / alt_target).exists():
                return ROOT / alt_prompt, ROOT / alt_target
    return prompt_path, target_path


def _mix_with_accompaniment(generated: Path, acc_path: Path, vocal_shift: int, mix_sr: int, save_dir: Path) -> Path:
    """Mix the generated vocal with the separated accompaniment.

    The accompaniment is pitch-shifted to match the vocal shift (folded into
    [-6, 6] semitones), the two tracks are summed over their overlap, and the
    mix is peak-normalized only if it clips. Writes generated_mixed.wav into
    *save_dir* and returns its path; returns *generated* unchanged when the
    tracks do not overlap.
    """
    acc_shift = _compute_acc_shift(vocal_shift)
    vocal, _ = librosa.load(str(generated), sr=mix_sr, mono=True)
    acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
    if acc_shift != 0:
        acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)
    mix_len = min(len(vocal), len(acc))
    if mix_len <= 0:
        return generated
    mixed = vocal[:mix_len] + acc[:mix_len]
    peak = float(np.max(np.abs(mixed))) if mixed.size > 0 else 1.0
    if peak > 1.0:
        mixed = mixed / peak
    mixed_path = save_dir / "generated_mixed.wav"
    sf.write(str(mixed_path), mixed, mix_sr)
    return mixed_path


def main():
    """Generate the pre-built SVC example outputs listed in EXAMPLE_PAIRS.

    For each (prompt, target) pair: trim inputs, preprocess both through the
    project pipeline, run SVC inference, optionally mix the separated
    accompaniment back in, and copy the result to example/outputs/.
    Existing outputs and missing inputs are skipped.
    """
    parser = argparse.ArgumentParser(description="Generate SVC example outputs for Space")
    parser.add_argument("--device", type=str, default=None, help="cuda or cpu (auto if not set)")
    parser.add_argument("--use-fp16", action="store_true", help="Use FP16 (GPU only)")
    parser.add_argument("--index", type=int, default=None, help="Only generate example at index (0-5)")
    args = parser.parse_args()

    device = args.device or ("cuda:0" if torch.cuda.is_available() else "cpu")
    use_fp16 = args.use_fp16 and "cuda" in device  # FP16 only makes sense on GPU

    os.chdir(ROOT)
    output_dir = ROOT / "example" / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Ensure models (may download SoulX-Singer + SoulX-Singer-Preprocess; first run can take long)
    from ensure_models import ensure_pretrained_models
    print("Checking / downloading pretrained models (HF)...", flush=True)
    ensure_pretrained_models()
    print("Pretrained models ready.", flush=True)

    # Build pipeline and model
    print(f"Using device: {device}", flush=True)
    preprocess = PreprocessPipeline(
        device=device,
        language="Mandarin",
        save_dir=str(ROOT / "outputs" / "gradio" / "_gen" / "svc"),
        vocal_sep=True,
        max_merge_duration=60000,
        midi_transcribe=False,
    )
    config = load_config("soulxsinger/config/soulxsinger.yaml")
    model = build_svc_model(
        model_path="pretrained_models/SoulX-Singer/model-svc.pt",
        config=config,
        device=device,
        use_fp16=use_fp16,
    )

    pairs = EXAMPLE_PAIRS
    if args.index is not None:
        # Fail fast with a clear message instead of an IndexError after model setup.
        if not 0 <= args.index < len(EXAMPLE_PAIRS):
            parser.error(f"--index must be in 0..{len(EXAMPLE_PAIRS) - 1}")
        pairs = [pairs[args.index]]

    for i, (prompt_rel, target_rel, out_name) in enumerate(pairs):
        out_path = output_dir / out_name
        # Resolve Japanese filename (NFC vs NFD)
        prompt_path, target_path = _resolve_example_paths(prompt_rel, target_rel, out_name)
        if not prompt_path.exists():
            print(f"[{i+1}] SKIP: {prompt_path} not found", flush=True)
            continue
        if not target_path.exists():
            print(f"[{i+1}] SKIP: {target_path} not found", flush=True)
            continue
        if out_path.exists():
            print(f"[{i+1}] SKIP (exists): {out_name}", flush=True)
            continue

        print(f"[{i+1}] Generating {out_name} ...", flush=True)
        # Unique per-run workspace so reruns never collide.
        session_base = ROOT / "outputs" / "gradio" / "_gen" / "svc" / datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        audio_dir = session_base / "audio"
        audio_dir.mkdir(parents=True, exist_ok=True)
        prompt_raw = audio_dir / "prompt.wav"
        target_raw = audio_dir / "target.wav"
        _trim_and_save_audio(prompt_path, prompt_raw, PROMPT_MAX_SEC)
        _trim_and_save_audio(target_path, target_raw, TARGET_MAX_SEC)

        # Preprocess prompt (no vocal separation for the short reference clip)
        prompt_save = session_base / "transcriptions" / "prompt"
        ok, msg, prompt_wav, prompt_f0 = _run_preprocess(preprocess, prompt_raw, prompt_save, vocal_sep=False)
        if not ok:
            print(f" Preprocess prompt failed: {msg}", flush=True)
            continue

        # Preprocess target (with vocal separation; also yields acc.wav if present)
        target_save = session_base / "transcriptions" / "target"
        ok, msg, target_wav, target_f0 = _run_preprocess(preprocess, target_raw, target_save, vocal_sep=True)
        if not ok:
            print(f" Preprocess target failed: {msg}", flush=True)
            continue

        # SVC inference — seed everything for reproducible example outputs.
        random.seed(42)
        np.random.seed(42)
        torch.manual_seed(42)
        infer_args = argparse.Namespace(
            device=device,
            prompt_wav_path=str(prompt_wav),
            target_wav_path=str(target_wav),
            prompt_f0_path=str(prompt_f0),
            target_f0_path=str(target_f0),
            save_dir=str(session_base / "generated"),
            auto_shift=True,
            auto_mix_acc=True,
            pitch_shift=0,
            n_steps=32,
            cfg=1.0,
            use_fp16=use_fp16,
        )
        Path(infer_args.save_dir).mkdir(parents=True, exist_ok=True)
        try:
            svc_process(infer_args, config, model)
        except Exception as e:
            print(f" SVC failed: {e}", flush=True)
            continue

        generated = Path(infer_args.save_dir) / "generated.wav"
        if not generated.exists():
            print(f" Output not found: {generated}", flush=True)
            continue

        # Mix accompaniment if available
        acc_path = session_base / "transcriptions" / "target" / "acc.wav"
        if acc_path.exists():
            generated = _mix_with_accompaniment(
                generated,
                acc_path,
                infer_args.pitch_shift,
                config.audio.sample_rate,
                Path(infer_args.save_dir),
            )

        # Copy to final output
        shutil.copy(str(generated), str(out_path))
        print(f" -> {out_path}", flush=True)

        # Release per-example memory before the next iteration.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    print("Done.", flush=True)
| def _run_preprocess(pipeline, audio_path: Path, save_path: Path, vocal_sep: bool): | |
| try: | |
| pipeline.save_dir = str(save_path) | |
| pipeline.run( | |
| audio_path=str(audio_path), | |
| vocal_sep=vocal_sep, | |
| max_merge_duration=60000, | |
| language="Mandarin", | |
| ) | |
| vocal_wav = save_path / "vocal.wav" | |
| vocal_f0 = save_path / "vocal_f0.npy" | |
| if not vocal_wav.exists() or not vocal_f0.exists(): | |
| return False, f"missing {vocal_wav} or {vocal_f0}", None, None | |
| return True, "ok", vocal_wav, vocal_f0 | |
| except Exception as e: | |
| return False, str(e), None, None | |
# Script entry point: python scripts/generate_example_outputs.py
if __name__ == "__main__":
    main()