#!/usr/bin/env python3
"""
Generate pre-built SVC example outputs for the Space.
Run from space/ directory: python scripts/generate_example_outputs.py

Auto-selects CUDA when available; pass --device cpu (or hide GPUs via CUDA_VISIBLE_DEVICES) to force CPU.
Each example may take several minutes on CPU.

Prerequisites:
  pip install -r requirements.txt  # from space/ or project root
  # Ensure pretrained models exist (run Space once or: python -c "from ensure_models import ensure_pretrained_models; ensure_pretrained_models()")
"""

import argparse
import gc
import os
import random
import shutil
import sys
from datetime import datetime
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch

# Add parent (space/) to path when run as script
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from preprocess.pipeline import PreprocessPipeline
from soulxsinger.utils.file_utils import load_config
from cli.inference_svc import build_model as build_svc_model, process as svc_process

SAMPLE_RATE = 44100  # sample rate used when trimming and saving input clips
PROMPT_MAX_SEC = 30  # prompt clips are truncated to 30 seconds
TARGET_MAX_SEC = 600  # target songs are truncated to 10 minutes

# Must match EXAMPLE_LIST order in webui_svc.py
EXAMPLE_PAIRS = [
    ("example/audio/zh_prompt.mp3", "example/audio/zh_target.mp3", "zh_prompt_zh_target.wav"),
    ("example/audio/en_prompt.mp3", "example/audio/en_target.mp3", "en_prompt_en_target.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/I'm Yours.mp3", "sunyanzi_im_yours.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/δΌ ε₯‡.mp3", "sunyanzi_legend.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/ε›γŒε₯½γγ γ¨ε«γ³γŸγ„.mp3", "sunyanzi_kowarekakeru.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/ε―Œε£«ε±±δΈ‹.mp3", "sunyanzi_fujisan.wav"),
]

# Fallback for the NFD-decomposed form of the Japanese filename (macOS may normalize it)
EXAMPLE_PAIRS_ALT = [
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君がε₯½γγŸγ‚™γ¨ε«γ²γ‚™γŸγ„.mp3", "sunyanzi_kowarekakeru.wav"),
]


def _trim_and_save_audio(src_path: Path, dst_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
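    """Load audio as mono at `sr`, keep at most `max_sec` seconds, and write it to `dst_path`."""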
    audio_data, _ = librosa.load(str(src_path), sr=sr, mono=True)
    audio_data = audio_data[: max_sec * sr]
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(dst_path), audio_data, sr)


def main():
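    """Download models if needed, build the preprocessing pipeline and SVC model, then render each example pair."""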
    parser = argparse.ArgumentParser(description="Generate SVC example outputs for Space")
    parser.add_argument("--device", type=str, default=None, help="cuda or cpu (auto if not set)")
    parser.add_argument("--use-fp16", action="store_true", help="Use FP16 (GPU only)")
    parser.add_argument("--index", type=int, default=None, help="Only generate example at index (0-5)")
    args = parser.parse_args()

    device = args.device or ("cuda:0" if torch.cuda.is_available() else "cpu")
    use_fp16 = args.use_fp16 and "cuda" in device

    os.chdir(ROOT)

    output_dir = ROOT / "example" / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Ensure pretrained models (may download SoulX-Singer + SoulX-Singer-Preprocess; the first run can take a while)
    from ensure_models import ensure_pretrained_models
    print("Checking / downloading pretrained models (HF)...", flush=True)
    ensure_pretrained_models()
    print("Pretrained models ready.", flush=True)

    # Build pipeline and model
    print(f"Using device: {device}", flush=True)
    preprocess = PreprocessPipeline(
        device=device,
        language="Mandarin",
        save_dir=str(ROOT / "outputs" / "gradio" / "_gen" / "svc"),
        vocal_sep=True,
        max_merge_duration=60000,
        midi_transcribe=False,
    )
    config = load_config("soulxsinger/config/soulxsinger.yaml")
    model = build_svc_model(
        model_path="pretrained_models/SoulX-Singer/model-svc.pt",
        config=config,
        device=device,
        use_fp16=use_fp16,
    )

    pairs = EXAMPLE_PAIRS
    if args.index is not None:
        pairs = [pairs[args.index]]

    for i, (prompt_rel, target_rel, out_name) in enumerate(pairs):
        prompt_path = ROOT / prompt_rel
        target_path = ROOT / target_rel
        out_path = output_dir / out_name

        # Resolve Japanese filename (NFC vs NFD)
        if not prompt_path.exists() or not target_path.exists():
            if out_name == "sunyanzi_kowarekakeru.wav":
                for pa, ta, _ in EXAMPLE_PAIRS_ALT:
                    if (ROOT / pa).exists() and (ROOT / ta).exists():
                        prompt_path = ROOT / pa
                        target_path = ROOT / ta
                        break
        if not prompt_path.exists():
            print(f"[{i+1}] SKIP: {prompt_path} not found", flush=True)
            continue
        if not target_path.exists():
            print(f"[{i+1}] SKIP: {target_path} not found", flush=True)
            continue

        if out_path.exists():
            print(f"[{i+1}] SKIP (exists): {out_name}", flush=True)
            continue

        print(f"[{i+1}] Generating {out_name} ...", flush=True)
        session_base = ROOT / "outputs" / "gradio" / "_gen" / "svc" / datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        audio_dir = session_base / "audio"
        audio_dir.mkdir(parents=True, exist_ok=True)

        prompt_raw = audio_dir / "prompt.wav"
        target_raw = audio_dir / "target.wav"
        _trim_and_save_audio(prompt_path, prompt_raw, PROMPT_MAX_SEC)
        _trim_and_save_audio(target_path, target_raw, TARGET_MAX_SEC)

        # Preprocess prompt
        prompt_save = session_base / "transcriptions" / "prompt"
        ok, msg, prompt_wav, prompt_f0 = _run_preprocess(preprocess, prompt_raw, prompt_save, vocal_sep=False)
        if not ok:
            print(f"  Preprocess prompt failed: {msg}", flush=True)
            continue

        # Preprocess target
        target_save = session_base / "transcriptions" / "target"
        ok, msg, target_wav, target_f0 = _run_preprocess(preprocess, target_raw, target_save, vocal_sep=True)
        if not ok:
            print(f"  Preprocess target failed: {msg}", flush=True)
            continue

        # SVC inference
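        # Fix seeds for reproducible generation.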
        random.seed(42)
        np.random.seed(42)
        torch.manual_seed(42)

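        # svc_process takes an argparse-style namespace; build one with the fields it reads.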
        infer_args = argparse.Namespace(
            device=device,
            prompt_wav_path=str(prompt_wav),
            target_wav_path=str(target_wav),
            prompt_f0_path=str(prompt_f0),
            target_f0_path=str(target_f0),
            save_dir=str(session_base / "generated"),
            auto_shift=True,
            auto_mix_acc=True,
            pitch_shift=0,
            n_steps=32,
            cfg=1.0,
            use_fp16=use_fp16,
        )

        Path(infer_args.save_dir).mkdir(parents=True, exist_ok=True)
        try:
            svc_process(infer_args, config, model)
        except Exception as e:
            print(f"  SVC failed: {e}", flush=True)
            continue

        generated = Path(infer_args.save_dir) / "generated.wav"
        if not generated.exists():
            print(f"  Output not found: {generated}", flush=True)
            continue

        # Mix accompaniment if available
        acc_path = session_base / "transcriptions" / "target" / "acc.wav"
        if acc_path.exists():
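            # Fold the vocal pitch shift into [-6, 6] semitones for the accompaniment
            # (a no-op here since pitch_shift is fixed at 0).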
            vocal_shift = infer_args.pitch_shift
            mul = -1 if vocal_shift < 0 else 1
            acc_shift = abs(vocal_shift) % 12
            acc_shift = mul * acc_shift
            if acc_shift > 6:
                acc_shift -= 12
            if acc_shift < -6:
                acc_shift += 12
            mix_sr = config.audio.sample_rate
            vocal, _ = librosa.load(str(generated), sr=mix_sr, mono=True)
            acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
            if acc_shift != 0:
                acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)
            mix_len = min(len(vocal), len(acc))
            if mix_len > 0:
                mixed = vocal[:mix_len] + acc[:mix_len]
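                # Rescale only if the summed mix clips above full scale.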
                peak = float(np.max(np.abs(mixed))) if mixed.size > 0 else 1.0
                if peak > 1.0:
                    mixed = mixed / peak
                generated = Path(infer_args.save_dir) / "generated_mixed.wav"
                sf.write(str(generated), mixed, mix_sr)

        # Copy to final output
        shutil.copy(str(generated), str(out_path))
        print(f"  -> {out_path}", flush=True)

        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print("Done.", flush=True)


def _run_preprocess(pipeline, audio_path: Path, save_path: Path, vocal_sep: bool):
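    """Run the preprocessing pipeline on a single clip, redirecting its save_dir to `save_path`.

    Returns (ok, message, vocal_wav_path, vocal_f0_path); the paths are None on failure.
    """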
    try:
        pipeline.save_dir = str(save_path)
        pipeline.run(
            audio_path=str(audio_path),
            vocal_sep=vocal_sep,
            max_merge_duration=60000,
            language="Mandarin",
        )
        vocal_wav = save_path / "vocal.wav"
        vocal_f0 = save_path / "vocal_f0.npy"
        if not vocal_wav.exists() or not vocal_f0.exists():
            return False, f"missing {vocal_wav} or {vocal_f0}", None, None
        return True, "ok", vocal_wav, vocal_f0
    except Exception as e:
        return False, str(e), None, None


if __name__ == "__main__":
    main()