import json
import shutil
from pathlib import Path
from typing import Optional

import librosa
import soundfile as sf

from preprocess.tools import (
    F0Extractor,
    LyricTranscriber,
    NoteTranscriber,
    VocalDetector,
    VocalSeparator,
)
from preprocess.utils import convert_metadata, merge_short_segments
class PreprocessPipeline:
    """Singing-voice preprocessing pipeline.

    Runs, in order: optional vocal/accompaniment separation (with
    dereverberation), full-track F0 extraction, vocal activity detection
    (which cuts the track into segments), per-segment F0 extraction and
    lyric transcription, note transcription, merging of short segments,
    and finally writes ``metadata.json`` into ``save_dir`` (and a copy
    next to the input audio).
    """

    def __init__(
        self,
        device: str,
        language: str,
        save_dir: str,
        vocal_sep: bool = True,
        max_merge_duration: int = 60000,
    ):
        """Load all models used by the pipeline.

        Args:
            device: Torch device string the models run on (e.g. "cuda:0").
            language: Default transcription language, used when ``run`` is
                called without an explicit one.
            save_dir: Directory where all intermediate and final files go.
            vocal_sep: Default for whether vocals are separated from the
                accompaniment before processing.
            max_merge_duration: Default upper bound (milliseconds) on the
                duration of merged segments.
        """
        self.device = device
        self.language = language
        self.save_dir = save_dir
        self.vocal_sep = vocal_sep
        self.max_merge_duration = max_merge_duration

        # The separator is heavyweight, so only instantiate it on demand.
        if vocal_sep:
            self.vocal_separator = VocalSeparator(
                sep_model_path="pretrained_models/SoulX-Singer-Preprocess/mel-band-roformer-karaoke/mel_band_roformer_karaoke_becruily.ckpt",
                sep_config_path="pretrained_models/SoulX-Singer-Preprocess/mel-band-roformer-karaoke/config_karaoke_becruily.yaml",
                der_model_path="pretrained_models/SoulX-Singer-Preprocess/dereverb_mel_band_roformer/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt",
                der_config_path="pretrained_models/SoulX-Singer-Preprocess/dereverb_mel_band_roformer/dereverb_mel_band_roformer_anvuew.yaml",
                device=device,
            )
        else:
            self.vocal_separator = None
        self.f0_extractor = F0Extractor(
            model_path="pretrained_models/SoulX-Singer-Preprocess/rmvpe/rmvpe.pt",
            device=device,
        )
        self.vocal_detector = VocalDetector(
            cut_wavs_output_dir=f"{save_dir}/cut_wavs",
        )
        self.lyric_transcriber = LyricTranscriber(
            zh_model_path="pretrained_models/SoulX-Singer-Preprocess/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
            en_model_path="pretrained_models/SoulX-Singer-Preprocess/parakeet-tdt-0.6b-v2/parakeet-tdt-0.6b-v2.nemo",
            device=device,
        )
        self.note_transcriber = NoteTranscriber(
            rosvot_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rosvot/model.pt",
            rwbd_model_path="pretrained_models/SoulX-Singer-Preprocess/rosvot/rwbd/model.pt",
            device=device,
        )

    def run(
        self,
        audio_path: str,
        vocal_sep: Optional[bool] = None,
        max_merge_duration: Optional[int] = None,
        language: Optional[str] = None,
    ) -> None:
        """Process one audio file end to end.

        Args:
            audio_path: Path to the input audio file.
            vocal_sep: Per-call override of the instance-level ``vocal_sep``;
                None falls back to the value given to ``__init__``.
            max_merge_duration: Per-call override (milliseconds) of the merge
                limit; None falls back to the ``__init__`` value.
            language: Per-call override of the transcription language; None
                falls back to the ``__init__`` value.

        Raises:
            ValueError: If separation is requested but the pipeline was
                constructed with ``vocal_sep=False`` (no separator loaded).
        """
        # BUG FIX: the original signature used concrete defaults
        # (True / 60000 / "Mandarin"), so these ``is None`` fallbacks could
        # never fire and the values configured in __init__ were silently
        # ignored. Defaults are now None so the fallback actually works.
        vocal_sep = self.vocal_sep if vocal_sep is None else vocal_sep
        max_merge_duration = self.max_merge_duration if max_merge_duration is None else max_merge_duration
        language = self.language if language is None else language

        # BUG FIX: previously this path crashed with an opaque
        # AttributeError on ``None.process``; fail with a clear message.
        if vocal_sep and self.vocal_separator is None:
            raise ValueError(
                "vocal_sep=True was requested, but the pipeline was built "
                "with vocal_sep=False, so no separator model was loaded."
            )

        output_dir = Path(self.save_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        if vocal_sep:
            # Perform vocal/accompaniment separation and save both stems.
            sep = self.vocal_separator.process(audio_path)
            vocal = sep.vocals_dereverbed.T  # transpose for soundfile's (frames, channels) layout
            acc = sep.accompaniment.T
            sample_rate = sep.sample_rate
            vocal_path = output_dir / "vocal.wav"
            acc_path = output_dir / "acc.wav"
            sf.write(vocal_path, vocal, sample_rate)
            sf.write(acc_path, acc, sample_rate)
        else:
            # Use the original audio as vocal source (no separation).
            vocal, sample_rate = librosa.load(audio_path, sr=None, mono=True)
            vocal_path = output_dir / "vocal.wav"
            sf.write(vocal_path, vocal, sample_rate)

        # Full-track F0 guides the vocal detector's segmentation.
        vocal_f0 = self.f0_extractor.process(str(vocal_path))
        segments = self.vocal_detector.process(str(vocal_path), f0=vocal_f0)

        metadata = []
        for seg in segments:
            # Per-segment F0 is written next to each segment wav.
            self.f0_extractor.process(seg["wav_fn"], f0_path=seg["wav_fn"].replace(".wav", "_f0.npy"))
            words, durs = self.lyric_transcriber.process(
                seg["wav_fn"], language
            )
            seg["words"] = words
            seg["word_durs"] = durs
            seg["language"] = language
            # NOTE(review): seg is passed both positionally and as
            # segment_info — confirm against NoteTranscriber.process.
            metadata.append(
                self.note_transcriber.process(seg, segment_info=seg)
            )

        merged = merge_short_segments(
            vocal,
            sample_rate,
            metadata,
            output_dir / "long_cut_wavs",
            max_duration_ms=max_merge_duration,
        )

        final_metadata = []
        for item in merged:
            self.f0_extractor.process(item.wav_fn, f0_path=item.wav_fn.replace(".wav", "_f0.npy"))
            final_metadata.append(convert_metadata(item))

        metadata_path = output_dir / "metadata.json"
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(final_metadata, f, ensure_ascii=False, indent=2)
        # BUG FIX: the original chained str.replace calls only handled
        # .wav/.mp3/.flac; for any other extension the target stayed equal
        # to audio_path, so the JSON would overwrite the input audio file.
        # Path.with_suffix handles every extension uniformly.
        shutil.copy(metadata_path, Path(audio_path).with_suffix(".json"))
def main(args):
    """CLI entry point: build a pipeline from parsed arguments and run it."""
    runner = PreprocessPipeline(
        device=args.device,
        language=args.language,
        save_dir=args.save_dir,
        vocal_sep=args.vocal_sep,
        max_merge_duration=args.max_merge_duration,
    )
    runner.run(audio_path=args.audio_path, language=args.language)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--audio_path", type=str, required=True, help="Path to the input audio file")
parser.add_argument("--save_dir", type=str, required=True, help="Directory to save the output files")
parser.add_argument("--language", type=str, default="Mandarin", help="Language of the audio")
parser.add_argument("--device", type=str, default="cuda:0", help="Device to run the models on")
parser.add_argument("--vocal_sep", type=bool, default=True, help="Whether to perform vocal separation")
parser.add_argument("--max_merge_duration", type=int, default=60000, help="Maximum merged segment duration in milliseconds")
args = parser.parse_args()
main(args)