SoulX-Singer-with-background / scripts /generate_example_outputs.py
杨月政
fix: 修复 spaces 依赖导入问题
5f32b51
#!/usr/bin/env python3
"""
Generate pre-built SVC example outputs for the Space.
Run from space/ directory: python scripts/generate_example_outputs.py
Uses the GPU (cuda:0) automatically when CUDA is available, otherwise the CPU; pass --device cpu or --device cuda to override.
Each example may take several minutes on CPU.
Prerequisites:
pip install -r requirements.txt # from space/ or project root
# Ensure pretrained models exist (run Space once or: python -c "from ensure_models import ensure_pretrained_models; ensure_pretrained_models()")
"""
import argparse
import gc
import os
import random
import shutil
import sys
import unicodedata
from datetime import datetime
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
import torch
# Add parent (space/) to path when run as script
ROOT = Path(__file__).resolve().parent.parent
if str(ROOT) not in sys.path:
sys.path.insert(0, str(ROOT))
from preprocess.pipeline import PreprocessPipeline
from soulxsinger.utils.file_utils import load_config
from cli.inference_svc import build_model as build_svc_model, process as svc_process
# Sample rate (Hz) used when decoding and re-saving the example audio.
SAMPLE_RATE = 44100
# Upper bounds, in seconds, applied when trimming the prompt / target clips.
PROMPT_MAX_SEC = 30
TARGET_MAX_SEC = 600

# Must match EXAMPLE_LIST order in webui_svc.py
# Each entry is (prompt path, target path, output file name); the two paths
# are relative to ROOT.
EXAMPLE_PAIRS = [
    ("example/audio/zh_prompt.mp3", "example/audio/zh_target.mp3", "zh_prompt_zh_target.wav"),
    ("example/audio/en_prompt.mp3", "example/audio/en_target.mp3", "en_prompt_en_target.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/I'm Yours.mp3", "sunyanzi_im_yours.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/传奇.mp3", "sunyanzi_legend.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君が好きだと叫びたい.mp3", "sunyanzi_kowarekakeru.wav"),
    ("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/富士山下.mp3", "sunyanzi_fujisan.wav"),
]


def _normalization_variants(pairs):
    """Return the entries of *pairs* re-spelled in the other Unicode form.

    For every ``(prompt, target, out_name)`` entry, both paths are normalized
    to NFC and to NFD; a variant is kept only when it differs from the
    original spelling, so pure-ASCII entries produce nothing.
    """
    variants = []
    for prompt, target, out_name in pairs:
        for form in ("NFC", "NFD"):
            candidate = (
                unicodedata.normalize(form, prompt),
                unicodedata.normalize(form, target),
                out_name,
            )
            if candidate != (prompt, target, out_name) and candidate not in variants:
                variants.append(candidate)
    return variants


# Fallback for decomposed Unicode filename (macOS may normalize).
# Derived programmatically rather than written as a second literal: a composed
# and a decomposed spelling look identical in source, and any tool that
# normalizes text would silently turn a hand-written fallback into a duplicate.
EXAMPLE_PAIRS_ALT = _normalization_variants(EXAMPLE_PAIRS)
def _trim_and_save_audio(src_path: Path, dst_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
    """Load *src_path* as mono audio at *sr* Hz, cut it to *max_sec* seconds, and save it.

    Args:
        src_path: Audio file to read.
        dst_path: File to write; missing parent directories are created.
        max_sec: Maximum length to keep, counted from the start of the audio.
        sr: Sample rate used both for loading and for writing.
    """
    sample_limit = max_sec * sr
    waveform, _ = librosa.load(str(src_path), sr=sr, mono=True)
    clipped = waveform[:sample_limit]
    # The source is decoded first, so a bad input leaves no empty directory behind.
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(dst_path), clipped, sr)
def _resolve_example_paths(prompt_rel: str, target_rel: str, out_name: str):
    """Locate the prompt/target files of one example under ROOT.

    A file name may be stored on disk in composed (NFC) or decomposed (NFD)
    Unicode form (macOS may normalize file names), so the literal path does
    not always match. Lookup order: the literal paths, then any entry of
    EXAMPLE_PAIRS_ALT registered for *out_name*, then the NFC/NFD spellings of
    each relative path.

    Returns:
        ``(prompt_path, target_path)``. Either path may still not exist; the
        caller is expected to check.
    """
    prompt_path = ROOT / prompt_rel
    target_path = ROOT / target_rel
    if prompt_path.exists() and target_path.exists():
        return prompt_path, target_path

    # Hand-maintained alternates, matched by output name; used only when the
    # whole alternate pair is present.
    for alt_prompt_rel, alt_target_rel, alt_out_name in EXAMPLE_PAIRS_ALT:
        if alt_out_name != out_name:
            continue
        alt_prompt = ROOT / alt_prompt_rel
        alt_target = ROOT / alt_target_rel
        if alt_prompt.exists() and alt_target.exists():
            return alt_prompt, alt_target

    def _first_existing(rel: str, literal: Path) -> Path:
        if literal.exists():
            return literal
        # Only the relative part is normalized so ROOT itself is never altered.
        for form in ("NFC", "NFD"):
            candidate = ROOT / unicodedata.normalize(form, rel)
            if candidate.exists():
                return candidate
        return literal

    return _first_existing(prompt_rel, prompt_path), _first_existing(target_rel, target_path)


def _accompaniment_shift(vocal_shift):
    """Reduce a vocal pitch shift (semitones) to the equivalent shift within [-6, 6]."""
    sign = -1 if vocal_shift < 0 else 1
    shift = sign * (abs(vocal_shift) % 12)
    if shift > 6:
        shift -= 12
    if shift < -6:
        shift += 12
    return shift


def _mix_accompaniment(vocal_path: Path, acc_path: Path, vocal_shift, mix_sr) -> Path:
    """Mix the generated vocal with the accompaniment and return the file to publish.

    Both files are loaded as mono at *mix_sr*; the accompaniment is pitch-shifted
    by the folded vocal shift, the two signals are summed over their common
    length, and the sum is peak-normalized only when it would exceed 1.0.
    The mix is written to ``generated_mixed.wav`` next to *vocal_path*.
    When there is nothing to mix, *vocal_path* is returned unchanged.
    """
    vocal, _ = librosa.load(str(vocal_path), sr=mix_sr, mono=True)
    acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
    acc_shift = _accompaniment_shift(vocal_shift)
    if acc_shift != 0:
        acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)
    mix_len = min(len(vocal), len(acc))
    if mix_len <= 0:
        return vocal_path
    mixed = vocal[:mix_len] + acc[:mix_len]
    peak = float(np.max(np.abs(mixed)))
    if peak > 1.0:
        mixed = mixed / peak
    mixed_path = vocal_path.with_name("generated_mixed.wav")
    sf.write(str(mixed_path), mixed, mix_sr)
    return mixed_path


def _generate_example(label, prompt_rel, target_rel, out_name, *, preprocess, config, model,
                      device, use_fp16, output_dir) -> bool:
    """Generate one example output; return True only when a file was written.

    Args:
        label: Number printed in the progress prefix (``[label]``).
        prompt_rel, target_rel: Audio paths relative to ROOT.
        out_name: File name of the result inside *output_dir*.
        preprocess: Preprocessing pipeline passed on to ``_run_preprocess``.
        config, model: Passed unchanged to ``svc_process``.
        device, use_fp16: Inference settings forwarded to ``svc_process``.
        output_dir: Directory receiving the final ``out_name`` file.

    Missing inputs, an already existing output, and preprocessing or inference
    failures are reported on stdout and yield False.
    """
    prompt_path, target_path = _resolve_example_paths(prompt_rel, target_rel, out_name)
    out_path = output_dir / out_name
    if not prompt_path.exists():
        print(f"[{label}] SKIP: {prompt_path} not found", flush=True)
        return False
    if not target_path.exists():
        print(f"[{label}] SKIP: {target_path} not found", flush=True)
        return False
    if out_path.exists():
        print(f"[{label}] SKIP (exists): {out_name}", flush=True)
        return False
    print(f"[{label}] Generating {out_name} ...", flush=True)

    # Every run works in its own timestamped scratch directory.
    session_base = ROOT / "outputs" / "gradio" / "_gen" / "svc" / datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    audio_dir = session_base / "audio"
    audio_dir.mkdir(parents=True, exist_ok=True)
    prompt_raw = audio_dir / "prompt.wav"
    target_raw = audio_dir / "target.wav"
    _trim_and_save_audio(prompt_path, prompt_raw, PROMPT_MAX_SEC)
    _trim_and_save_audio(target_path, target_raw, TARGET_MAX_SEC)

    # Preprocess prompt
    prompt_save = session_base / "transcriptions" / "prompt"
    ok, msg, prompt_wav, prompt_f0 = _run_preprocess(preprocess, prompt_raw, prompt_save, vocal_sep=False)
    if not ok:
        print(f"  Preprocess prompt failed: {msg}", flush=True)
        return False

    # Preprocess target
    target_save = session_base / "transcriptions" / "target"
    ok, msg, target_wav, target_f0 = _run_preprocess(preprocess, target_raw, target_save, vocal_sep=True)
    if not ok:
        print(f"  Preprocess target failed: {msg}", flush=True)
        return False

    # SVC inference, seeded so repeated runs use the same random state.
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    save_dir = session_base / "generated"
    infer_args = argparse.Namespace(
        device=device,
        prompt_wav_path=str(prompt_wav),
        target_wav_path=str(target_wav),
        prompt_f0_path=str(prompt_f0),
        target_f0_path=str(target_f0),
        save_dir=str(save_dir),
        auto_shift=True,
        auto_mix_acc=True,
        pitch_shift=0,
        n_steps=32,
        cfg=1.0,
        use_fp16=use_fp16,
    )
    save_dir.mkdir(parents=True, exist_ok=True)
    try:
        svc_process(infer_args, config, model)
    except Exception as e:
        print(f"  SVC failed: {e}", flush=True)
        return False
    generated = save_dir / "generated.wav"
    if not generated.exists():
        print(f"  Output not found: {generated}", flush=True)
        return False

    # Mix accompaniment if available
    acc_path = target_save / "acc.wav"
    if acc_path.exists():
        # pitch_shift is read back after inference.
        # NOTE(review): presumably svc_process updates it when auto_shift is on — confirm.
        generated = _mix_accompaniment(generated, acc_path, infer_args.pitch_shift, config.audio.sample_rate)

    # Copy to final output
    shutil.copy(str(generated), str(out_path))
    print(f"  -> {out_path}", flush=True)
    return True


def main():
    """Command-line entry point: build the models once, then generate each example.

    Options:
        --device    Device string; defaults to ``cuda:0`` when CUDA is available, else ``cpu``.
        --use-fp16  Request FP16; honoured only when the device name contains ``cuda``.
        --index     Generate only the example at this position of EXAMPLE_PAIRS.

    An out-of-range ``--index`` is rejected with an argparse usage error
    (exit status 2) before any model is downloaded or loaded.
    """
    last_index = len(EXAMPLE_PAIRS) - 1
    parser = argparse.ArgumentParser(description="Generate SVC example outputs for Space")
    parser.add_argument("--device", type=str, default=None, help="cuda or cpu (auto if not set)")
    parser.add_argument("--use-fp16", action="store_true", help="Use FP16 (GPU only)")
    parser.add_argument("--index", type=int, default=None,
                        help=f"Only generate example at index (0-{last_index})")
    args = parser.parse_args()
    # Fail fast: model setup below can take minutes, and a negative index
    # would otherwise silently select an example from the end of the list.
    if args.index is not None and not 0 <= args.index <= last_index:
        parser.error(f"--index must be between 0 and {last_index} (got {args.index})")

    device = args.device or ("cuda:0" if torch.cuda.is_available() else "cpu")
    use_fp16 = args.use_fp16 and "cuda" in device

    # Relative config/model paths below are resolved against ROOT.
    os.chdir(ROOT)
    output_dir = ROOT / "example" / "outputs"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Ensure models (may download SoulX-Singer + SoulX-Singer-Preprocess; first run can take long)
    from ensure_models import ensure_pretrained_models
    print("Checking / downloading pretrained models (HF)...", flush=True)
    ensure_pretrained_models()
    print("Pretrained models ready.", flush=True)

    # Build pipeline and model
    print(f"Using device: {device}", flush=True)
    preprocess = PreprocessPipeline(
        device=device,
        language="Mandarin",
        save_dir=str(ROOT / "outputs" / "gradio" / "_gen" / "svc"),
        vocal_sep=True,
        max_merge_duration=60000,
        midi_transcribe=False,
    )
    config = load_config("soulxsinger/config/soulxsinger.yaml")
    model = build_svc_model(
        model_path="pretrained_models/SoulX-Singer/model-svc.pt",
        config=config,
        device=device,
        use_fp16=use_fp16,
    )

    # Keep each example's real position so progress labels stay meaningful
    # when a single example is selected with --index.
    selected = list(enumerate(EXAMPLE_PAIRS))
    if args.index is not None:
        selected = [selected[args.index]]

    for position, (prompt_rel, target_rel, out_name) in selected:
        try:
            _generate_example(
                position + 1,
                prompt_rel,
                target_rel,
                out_name,
                preprocess=preprocess,
                config=config,
                model=model,
                device=device,
                use_fp16=use_fp16,
                output_dir=output_dir,
            )
        finally:
            # Release memory after every example, including skipped/failed ones.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
    print("Done.", flush=True)
def _run_preprocess(pipeline, audio_path: Path, save_path: Path, vocal_sep: bool):
    """Run *pipeline* on one audio file and report where its outputs landed.

    ``pipeline.save_dir`` is pointed at *save_path* before ``pipeline.run`` is
    called; afterwards ``vocal.wav`` and ``vocal_f0.npy`` are expected inside
    *save_path*.

    Returns:
        ``(ok, message, vocal_wav, vocal_f0)``. On success *ok* is True,
        *message* is ``"ok"`` and the last two items are the output paths.
        When a file is missing or any exception is raised, *ok* is False,
        *message* describes the problem and both paths are None.
    """
    failure = None
    try:
        pipeline.save_dir = str(save_path)
        run_options = {
            "audio_path": str(audio_path),
            "vocal_sep": vocal_sep,
            "max_merge_duration": 60000,
            "language": "Mandarin",
        }
        pipeline.run(**run_options)
        wav_file = save_path / "vocal.wav"
        f0_file = save_path / "vocal_f0.npy"
        if wav_file.exists() and f0_file.exists():
            return True, "ok", wav_file, f0_file
        failure = f"missing {wav_file} or {f0_file}"
    except Exception as err:
        # Best-effort: the caller prints the message and moves on to the next example.
        failure = str(err)
    return False, failure, None, None
# Script entry point: run the generator only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()