"""
YingMusic Singer - Command Line Inference
=========================================

Single-sample inference script, replacing the Gradio Web UI.

Usage:
    python infer.py \
        --ref_audio path/to/ref.wav \
        --melody_audio path/to/melody.wav \
        --ref_text "该体谅的不执着|如果那天我" \
        --target_text "好多天|看不完你" \
        --output output.wav

    # Enable vocal separation + accompaniment mixing simultaneously
    python infer.py \
        --ref_audio ref.wav \
        --melody_audio melody.wav \
        --ref_text "..." \
        --target_text "..." \
        --separate_vocals \
        --mix_accompaniment \
        --output mixed_output.wav
"""
import argparse
import os
import random
import tempfile
import torch
import torchaudio
from initialization import download_files
# ---------------------------------------------------------------------------
# Model loading (lazy singleton)
# ---------------------------------------------------------------------------
_model = None
_separator = None
def get_device():
    return "cuda:0" if torch.cuda.is_available() else "cpu"

def get_model():
    """Lazily load and cache the YingMusicSinger model."""
    global _model
    if _model is None:
        download_files(task="infer")
        from src.YingMusicSinger.infer.YingMusicSinger import YingMusicSinger

        _model = YingMusicSinger.from_pretrained("ASLP-lab/YingMusic-Singer")
        _model = _model.to(get_device())
        _model.eval()
    return _model

def get_separator():
    """Lazily load and cache the Mel-Band RoFormer vocal separator."""
    global _separator
    if _separator is None:
        download_files(task="infer")
        from src.third_party.MusicSourceSeparationTraining.inference_api import Separator

        _separator = Separator(
            config_path="ckpts/config_vocals_mel_band_roformer_kj.yaml",
            checkpoint_path="ckpts/MelBandRoformer.ckpt",
        )
    return _separator
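
# Both loaders cache their instance in a module-level global, so repeated calls
# within one process reuse the already-downloaded checkpoints and loaded weights.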
# ---------------------------------------------------------------------------
# Vocal separation
# ---------------------------------------------------------------------------
def separate_vocals(audio_path: str) -> tuple:
    """
    Separate vocals and accompaniment, returns (vocals_path, accompaniment_path).
    """
    separator = get_separator()
    wav, sr = torchaudio.load(audio_path)
    vocal_wav, inst_wav, out_sr = separator.separate(wav, sr)

    # The separator returns numpy arrays; convert back to tensors for saving.
    tmp_dir = tempfile.mkdtemp()
    vocals_path = os.path.join(tmp_dir, "vocals.wav")
    accomp_path = os.path.join(tmp_dir, "accompaniment.wav")
    torchaudio.save(vocals_path, torch.from_numpy(vocal_wav), out_sr)
    torchaudio.save(accomp_path, torch.from_numpy(inst_wav), out_sr)
    return vocals_path, accomp_path
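
# Example (hypothetical path): both returned files live in a fresh temporary
# directory, so callers should clean them up if disk usage matters.
#
#     vocals_path, accomp_path = separate_vocals("song.wav")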
# ---------------------------------------------------------------------------
# Mix vocals + accompaniment
# ---------------------------------------------------------------------------
def mix_vocal_and_accompaniment(vocal_path: str, accomp_path: str, vocal_gain: float = 1.0) -> str:
    vocal_wav, vocal_sr = torchaudio.load(vocal_path)
    accomp_wav, accomp_sr = torchaudio.load(accomp_path)

    # Resample the accompaniment onto the vocal's sample rate if they differ.
    if accomp_sr != vocal_sr:
        accomp_wav = torchaudio.functional.resample(accomp_wav, accomp_sr, vocal_sr)

    # Match channel counts by broadcasting the mono side (assumes one side is mono).
    if vocal_wav.shape[0] != accomp_wav.shape[0]:
        if vocal_wav.shape[0] == 1:
            vocal_wav = vocal_wav.expand(accomp_wav.shape[0], -1)
        else:
            accomp_wav = accomp_wav.expand(vocal_wav.shape[0], -1)

    # Trim to the shorter signal, apply the vocal gain, and peak-normalize so
    # the mixed output cannot clip.
    min_len = min(vocal_wav.shape[1], accomp_wav.shape[1])
    mixed = vocal_wav[:, :min_len] * vocal_gain + accomp_wav[:, :min_len]
    peak = mixed.abs().max()
    if peak > 1.0:
        mixed = mixed / peak

    out_path = os.path.join(tempfile.mkdtemp(), "mixed_output.wav")
    torchaudio.save(out_path, mixed, sample_rate=vocal_sr)
    return out_path
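
# Example: raise the synthesized vocal 20% above the accompaniment ahead of the
# peak-normalization step (the gain value is illustrative, not a tuned default):
#
#     mixed_path = mix_vocal_and_accompaniment("vocal.wav", "accomp.wav", vocal_gain=1.2)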
# ---------------------------------------------------------------------------
# Main inference pipeline
# ---------------------------------------------------------------------------
def synthesize(args):
    actual_seed = args.seed if args.seed >= 0 else random.randint(0, 2**31 - 1)
    print(f"[INFO] Using seed: {actual_seed}")

    actual_ref_path = args.ref_audio
    actual_melody_path = args.melody_audio
    melody_accomp_path = None

    # Step 1: Vocal separation (optional)
    if args.separate_vocals:
        print("[INFO] Separating vocals from reference audio...")
        actual_ref_path, _ = separate_vocals(args.ref_audio)
        print("[INFO] Separating vocals from melody audio...")
        actual_melody_path, melody_accomp_path = separate_vocals(args.melody_audio)

    # Step 2: Model inference
    print("[INFO] Loading model...")
    model = get_model()
    print("[INFO] Running synthesis...")
    audio_tensor, sr = model(
        ref_audio_path=actual_ref_path,
        melody_audio_path=actual_melody_path,
        ref_text=args.ref_text.strip(),
        target_text=args.target_text.strip(),
        lrc_align_mode="sentence_level",
        sil_len_to_end=args.sil_len_to_end,
        t_shift=args.t_shift,
        nfe_step=args.nfe_step,
        cfg_strength=args.cfg_strength,
        seed=actual_seed,
    )
    vocal_out_path = os.path.join(tempfile.mkdtemp(), "vocal_output.wav")
    torchaudio.save(vocal_out_path, audio_tensor.to("cpu"), sample_rate=sr)

    # Step 3: Mix accompaniment (optional)
    if args.separate_vocals and args.mix_accompaniment and melody_accomp_path is not None:
        print("[INFO] Mixing vocals with accompaniment...")
        final_path = mix_vocal_and_accompaniment(vocal_out_path, melody_accomp_path)
    else:
        final_path = vocal_out_path

    # Write to the specified output path
    out_wav, out_sr = torchaudio.load(final_path)
    os.makedirs(os.path.dirname(os.path.abspath(args.output)), exist_ok=True)
    torchaudio.save(args.output, out_wav, sample_rate=out_sr)
    print(f"[INFO] Saved to: {args.output}")
# ---------------------------------------------------------------------------
# Argument parser
# ---------------------------------------------------------------------------
def parse_args():
    parser = argparse.ArgumentParser(
        description="YingMusic Singer - Single sample command line inference"
    )
    # Required
    parser.add_argument("--ref_audio", required=True,
                        help="Reference audio path")
    parser.add_argument("--melody_audio", required=True,
                        help="Melody audio path")
    parser.add_argument("--ref_text", required=True,
                        help="Reference lyrics, use | to separate phrases")
    parser.add_argument("--target_text", required=True,
                        help="Target lyrics, use | to separate phrases")
    # Output
    parser.add_argument("--output", default="output.wav",
                        help="Output wav path (default: output.wav)")
    # Optional flags
    parser.add_argument("--separate_vocals", action="store_true",
                        help="Separate vocals before synthesis")
    parser.add_argument("--mix_accompaniment", action="store_true",
                        help="Mix accompaniment into output (requires --separate_vocals)")
    # Advanced params
    parser.add_argument("--nfe_step", type=int, default=32,
                        help="NFE steps (default: 32)")
    parser.add_argument("--cfg_strength", type=float, default=3.0,
                        help="CFG strength (default: 3.0)")
    parser.add_argument("--t_shift", type=float, default=0.5,
                        help="t-shift (default: 0.5)")
    parser.add_argument("--sil_len_to_end", type=float, default=0.5,
                        help="Silence padding in seconds (default: 0.5)")
    parser.add_argument("--seed", type=int, default=-1,
                        help="Random seed, -1 for random (default: -1)")
    return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
synthesize(args) |