Spaces:

vincenthugging
/

SoulX-Singer-with-background

Running on Zero

SoulX-Singer-with-background / scripts /generate_example_outputs.py

杨月政

fix: 修复 spaces 依赖导入问题

5f32b51 about 1 month ago

9.42 kB

	#!/usr/bin/env python3
	"""
	Generate pre-built SVC example outputs for the Space.
	Run from space/ directory: python scripts/generate_example_outputs.py

	Uses CPU by default (set CUDA_VISIBLE_DEVICES or --device cuda for GPU).
	Each example may take several minutes on CPU.

	Prerequisites:
	pip install -r requirements.txt # from space/ or project root
	# Ensure pretrained models exist (run Space once or: python -c "from ensure_models import ensure_pretrained_models; ensure_pretrained_models()")
	"""

	import argparse
	import os
	import gc
	import random
	import sys
	from datetime import datetime
	from pathlib import Path

	import librosa
	import numpy as np
	import soundfile as sf
	import torch

	# Add parent (space/) to path when run as script
	ROOT = Path(__file__).resolve().parent.parent
	if str(ROOT) not in sys.path:
	sys.path.insert(0, str(ROOT))

	from preprocess.pipeline import PreprocessPipeline
	from soulxsinger.utils.file_utils import load_config
	from cli.inference_svc import build_model as build_svc_model, process as svc_process

	SAMPLE_RATE = 44100
	PROMPT_MAX_SEC = 30
	TARGET_MAX_SEC = 600

	# Must match EXAMPLE_LIST order in webui_svc.py
	EXAMPLE_PAIRS = [
	("example/audio/zh_prompt.mp3", "example/audio/zh_target.mp3", "zh_prompt_zh_target.wav"),
	("example/audio/en_prompt.mp3", "example/audio/en_target.mp3", "en_prompt_en_target.wav"),
	("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/I'm Yours.mp3", "sunyanzi_im_yours.wav"),
	("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/传奇.mp3", "sunyanzi_legend.wav"),
	("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君が好きだと叫びたい.mp3", "sunyanzi_kowarekakeru.wav"),
	("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/富士山下.mp3", "sunyanzi_fujisan.wav"),
	]

	# Fallback for decomposed Unicode filename (macOS may normalize)
	EXAMPLE_PAIRS_ALT = [
	("example/audio/svc_webui/Sun Yanzi.mp3", "example/audio/svc_webui/君が好きだと叫びたい.mp3", "sunyanzi_kowarekakeru.wav"),
	]


	def _trim_and_save_audio(src_path: Path, dst_path: Path, max_sec: int, sr: int = SAMPLE_RATE) -> None:
	audio_data, _ = librosa.load(str(src_path), sr=sr, mono=True)
	audio_data = audio_data[: max_sec * sr]
	dst_path.parent.mkdir(parents=True, exist_ok=True)
	sf.write(str(dst_path), audio_data, sr)


	def main():
	parser = argparse.ArgumentParser(description="Generate SVC example outputs for Space")
	parser.add_argument("--device", type=str, default=None, help="cuda or cpu (auto if not set)")
	parser.add_argument("--use-fp16", action="store_true", help="Use FP16 (GPU only)")
	parser.add_argument("--index", type=int, default=None, help="Only generate example at index (0-5)")
	args = parser.parse_args()

	device = args.device or ("cuda:0" if torch.cuda.is_available() else "cpu")
	use_fp16 = args.use_fp16 and "cuda" in device

	os.chdir(ROOT)

	output_dir = ROOT / "example" / "outputs"
	output_dir.mkdir(parents=True, exist_ok=True)

	# Ensure models (may download SoulX-Singer + SoulX-Singer-Preprocess; first run can take long)
	from ensure_models import ensure_pretrained_models
	print("Checking / downloading pretrained models (HF)...", flush=True)
	ensure_pretrained_models()
	print("Pretrained models ready.", flush=True)

	# Build pipeline and model
	print(f"Using device: {device}", flush=True)
	preprocess = PreprocessPipeline(
	device=device,
	language="Mandarin",
	save_dir=str(ROOT / "outputs" / "gradio" / "_gen" / "svc"),
	vocal_sep=True,
	max_merge_duration=60000,
	midi_transcribe=False,
	)
	config = load_config("soulxsinger/config/soulxsinger.yaml")
	model = build_svc_model(
	model_path="pretrained_models/SoulX-Singer/model-svc.pt",
	config=config,
	device=device,
	use_fp16=use_fp16,
	)

	pairs = EXAMPLE_PAIRS
	if args.index is not None:
	pairs = [pairs[args.index]]

	for i, (prompt_rel, target_rel, out_name) in enumerate(pairs):
	prompt_path = ROOT / prompt_rel
	target_path = ROOT / target_rel
	out_path = output_dir / out_name

	# Resolve Japanese filename (NFC vs NFD)
	if not prompt_path.exists() or not target_path.exists():
	if out_name == "sunyanzi_kowarekakeru.wav":
	for pa, ta, _ in EXAMPLE_PAIRS_ALT:
	if (ROOT / pa).exists() and (ROOT / ta).exists():
	prompt_path = ROOT / pa
	target_path = ROOT / ta
	break
	if not prompt_path.exists():
	print(f"[{i+1}] SKIP: {prompt_path} not found", flush=True)
	continue
	if not target_path.exists():
	print(f"[{i+1}] SKIP: {target_path} not found", flush=True)
	continue

	if out_path.exists():
	print(f"[{i+1}] SKIP (exists): {out_name}", flush=True)
	continue

	print(f"[{i+1}] Generating {out_name} ...", flush=True)
	session_base = ROOT / "outputs" / "gradio" / "_gen" / "svc" / datetime.now().strftime("%Y%m%d_%H%M%S_%f")
	audio_dir = session_base / "audio"
	audio_dir.mkdir(parents=True, exist_ok=True)

	prompt_raw = audio_dir / "prompt.wav"
	target_raw = audio_dir / "target.wav"
	_trim_and_save_audio(prompt_path, prompt_raw, PROMPT_MAX_SEC)
	_trim_and_save_audio(target_path, target_raw, TARGET_MAX_SEC)

	# Preprocess prompt
	prompt_save = session_base / "transcriptions" / "prompt"
	ok, msg, prompt_wav, prompt_f0 = _run_preprocess(preprocess, prompt_raw, prompt_save, vocal_sep=False)
	if not ok:
	print(f" Preprocess prompt failed: {msg}", flush=True)
	continue

	# Preprocess target
	target_save = session_base / "transcriptions" / "target"
	ok, msg, target_wav, target_f0 = _run_preprocess(preprocess, target_raw, target_save, vocal_sep=True)
	if not ok:
	print(f" Preprocess target failed: {msg}", flush=True)
	continue

	# SVC inference
	random.seed(42)
	np.random.seed(42)
	torch.manual_seed(42)

	class Args:
	pass

	infer_args = Args()
	infer_args.device = device
	infer_args.prompt_wav_path = str(prompt_wav)
	infer_args.target_wav_path = str(target_wav)
	infer_args.prompt_f0_path = str(prompt_f0)
	infer_args.target_f0_path = str(target_f0)
	infer_args.save_dir = str(session_base / "generated")
	infer_args.auto_shift = True
	infer_args.auto_mix_acc = True
	infer_args.pitch_shift = 0
	infer_args.n_steps = 32
	infer_args.cfg = 1.0
	infer_args.use_fp16 = use_fp16

	Path(infer_args.save_dir).mkdir(parents=True, exist_ok=True)
	try:
	svc_process(infer_args, config, model)
	except Exception as e:
	print(f" SVC failed: {e}", flush=True)
	continue

	generated = Path(infer_args.save_dir) / "generated.wav"
	if not generated.exists():
	print(f" Output not found: {generated}", flush=True)
	continue

	# Mix accompaniment if available
	acc_path = session_base / "transcriptions" / "target" / "acc.wav"
	if acc_path.exists():
	vocal_shift = infer_args.pitch_shift
	mul = -1 if vocal_shift < 0 else 1
	acc_shift = abs(vocal_shift) % 12
	acc_shift = mul * acc_shift
	if acc_shift > 6:
	acc_shift -= 12
	if acc_shift < -6:
	acc_shift += 12
	mix_sr = config.audio.sample_rate
	vocal, _ = librosa.load(str(generated), sr=mix_sr, mono=True)
	acc, _ = librosa.load(str(acc_path), sr=mix_sr, mono=True)
	if acc_shift != 0:
	acc = librosa.effects.pitch_shift(acc, sr=mix_sr, n_steps=acc_shift)
	mix_len = min(len(vocal), len(acc))
	if mix_len > 0:
	mixed = vocal[:mix_len] + acc[:mix_len]
	peak = float(np.max(np.abs(mixed))) if mixed.size > 0 else 1.0
	if peak > 1.0:
	mixed = mixed / peak
	generated = Path(infer_args.save_dir) / "generated_mixed.wav"
	sf.write(str(generated), mixed, mix_sr)

	# Copy to final output
	import shutil
	shutil.copy(str(generated), str(out_path))
	print(f" -> {out_path}", flush=True)

	gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	print("Done.", flush=True)


	def _run_preprocess(pipeline, audio_path: Path, save_path: Path, vocal_sep: bool):
	try:
	pipeline.save_dir = str(save_path)
	pipeline.run(
	audio_path=str(audio_path),
	vocal_sep=vocal_sep,
	max_merge_duration=60000,
	language="Mandarin",
	)
	vocal_wav = save_path / "vocal.wav"
	vocal_f0 = save_path / "vocal_f0.npy"
	if not vocal_wav.exists() or not vocal_f0.exists():
	return False, f"missing {vocal_wav} or {vocal_f0}", None, None
	return True, "ok", vocal_wav, vocal_f0
	except Exception as e:
	return False, str(e), None, None


	if __name__ == "__main__":
	main()