"""Split dataset audio into short chunks and write them as WAV files.

Every chunk is written twice: once at the requested sampling rate and once
resampled to 16 kHz.
"""

import operator
import os
from concurrent.futures import ProcessPoolExecutor
from typing import *

import librosa
import numpy as np
import scipy.signal as signal
from scipy.io import wavfile
from tqdm import tqdm

from lib.rvc.utils import load_audio

from .slicer import Slicer


def _write_wav_pair(
    audio: np.ndarray,
    filename: str,
    speaker_id: int,
    outdir: str,
    outdir_16k: str,
    sampling_rate: int,
) -> None:
    """Write *audio* at its native rate and a 16 kHz resampled copy.

    Both files are named *filename* and are placed in the zero-padded
    ``{speaker_id:05}`` sub-directory of *outdir* and *outdir_16k*.
    The directories must already exist.
    """
    speaker_dir = f"{speaker_id:05}"
    wavfile.write(
        os.path.join(outdir, speaker_dir, filename),
        sampling_rate,
        audio.astype(np.float32),
    )

    # Resample from the un-cast signal, then cast only for writing.
    audio_16k = librosa.resample(
        audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
    )
    wavfile.write(
        os.path.join(outdir_16k, speaker_dir, filename),
        16000,
        audio_16k.astype(np.float32),
    )


def norm_write(
    tmp_audio: np.ndarray,
    idx0: int,
    idx1: int,
    speaker_id: int,
    outdir: str,
    outdir_16k: str,
    sampling_rate: int,
    max: float,
    alpha: float,
    is_normalize: bool,
):
    """Level-adjust one chunk and write it as ``{idx0}_{idx1}.wav``.

    Args:
        tmp_audio: Samples of the chunk.
        idx0: First half of the output file name.
        idx1: Second half of the output file name.
        speaker_id: Selects the ``{speaker_id:05}`` output sub-directory.
        outdir: Root directory for files at ``sampling_rate``.
        outdir_16k: Root directory for the 16 kHz copies.
        sampling_rate: Sampling rate of ``tmp_audio``.
        max: Target peak level.
        alpha: Blend factor used when ``is_normalize`` is true: the result
            is ``alpha`` parts peak-normalized signal plus ``1 - alpha``
            parts original signal.
        is_normalize: If true, blend in the peak-normalized signal;
            otherwise only scale the chunk down when it exceeds ``max``.
    """
    if tmp_audio.size == 0:
        # Nothing to write; the reductions below would raise on an empty array.
        return

    if is_normalize:
        peak = np.abs(tmp_audio).max()
        # A chunk of pure digital silence has peak 0; dividing by it would
        # fill the output with NaN, so such a chunk is written unchanged.
        if peak > 0:
            tmp_audio = (tmp_audio / peak * (max * alpha)) + (
                1 - alpha
            ) * tmp_audio
    else:
        # Scale the level down to max (decoded floating point audio can
        # overshoot the nominal range).
        audio_min = np.min(tmp_audio)
        if audio_min < -max:
            tmp_audio = tmp_audio / -audio_min * max
        audio_max = np.max(tmp_audio)
        if audio_max > max:
            tmp_audio = tmp_audio / audio_max * max

    _write_wav_pair(
        tmp_audio,
        f"{idx0}_{idx1}.wav",
        speaker_id,
        outdir,
        outdir_16k,
        sampling_rate,
    )


def write_mute(
    mute_wave_filename: str,
    speaker_id: int,
    outdir: str,
    outdir_16k: str,
    sampling_rate: int,
):
    """Write ``mute.wav`` for one speaker into both output trees.

    The audio is obtained from ``load_audio(mute_wave_filename,
    sampling_rate)`` and written without any level adjustment.
    """
    mute_audio = load_audio(mute_wave_filename, sampling_rate)
    _write_wav_pair(
        mute_audio, "mute.wav", speaker_id, outdir, outdir_16k, sampling_rate
    )


def pipeline(
    slicer: Slicer,
    datasets: List[Tuple[int, Tuple[str, int]]],  # List[(index, (path, speaker_id))]
    outdir: str,
    outdir_16k: str,
    sampling_rate: int,
    is_normalize: bool,
    process_id: int = 0,
):
    """Filter, slice and chunk every file in *datasets* and write the chunks.

    Args:
        slicer: Object whose ``slice(audio)`` yields the segments to chunk.
        datasets: ``(index, (path, speaker_id))`` entries. ``index`` becomes
            the first half of each output file name.
        outdir: Root directory for chunks at ``sampling_rate``.
        outdir_16k: Root directory for the 16 kHz copies.
        sampling_rate: Rate passed to ``load_audio`` and used for chunking.
        is_normalize: Forwarded to ``norm_write``.
        process_id: Offsets the position of this worker's progress bar.
    """
    per = 3.7  # chunk length in seconds
    overlap = 0.3  # seconds shared by consecutive chunks
    hop = per - overlap
    tail = per + overlap  # remainders up to this many seconds stay in one chunk
    peak_level = 0.95
    alpha = 0.8

    # 5th-order Butterworth high-pass with a 48 Hz cutoff.
    bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sampling_rate)

    for index, (wave_filename, speaker_id) in tqdm(
        datasets, position=1 + process_id
    ):
        audio = load_audio(wave_filename, sampling_rate)
        audio = signal.lfilter(bh, ah, audio)

        def write_chunk(chunk, chunk_index):
            norm_write(
                chunk,
                index,
                chunk_index,
                speaker_id,
                outdir,
                outdir_16k,
                sampling_rate,
                peak_level,
                alpha,
                is_normalize,
            )

        idx1 = 0  # running chunk number within this source file
        for segment in slicer.slice(audio):
            i = 0
            while True:
                start = int(sampling_rate * hop * i)
                i += 1
                if len(segment[start:]) > tail * sampling_rate:
                    write_chunk(segment[start : start + int(per * sampling_rate)], idx1)
                    idx1 += 1
                else:
                    # Whatever is left becomes the final chunk of the segment.
                    write_chunk(segment[start:], idx1)
                    idx1 += 1
                    break


def preprocess_audio(
    datasets: List[Tuple[str, int]],  # List[(path, speaker_id)]
    sampling_rate: int,
    num_processes: int,
    training_dir: str,
    is_normalize: bool,
    mute_wav_path: str,
):
    """Chunk all dataset audio in parallel into ``training_dir``.

    Output goes to ``0_gt_wavs`` (at ``sampling_rate``) and ``1_16k_wavs``
    (16 kHz) under ``training_dir``, one sub-directory per speaker. If both
    directories already exist the function returns without doing anything.

    Args:
        datasets: ``(path, speaker_id)`` pairs.
        sampling_rate: Target sampling rate of the ``0_gt_wavs`` files.
        num_processes: Number of worker processes; must be at least 1.
        training_dir: Directory that receives the two output trees.
        is_normalize: Forwarded to ``norm_write`` for every chunk.
        mute_wav_path: Audio file written as ``mute.wav`` for every speaker.

    Raises:
        ValueError: If ``num_processes`` is smaller than 1.
        Exception: Any exception raised inside a worker process is re-raised
            here after all workers have finished.
    """
    waves_dir = os.path.join(training_dir, "0_gt_wavs")
    waves16k_dir = os.path.join(training_dir, "1_16k_wavs")
    if os.path.exists(waves_dir) and os.path.exists(waves16k_dir):
        return

    # Validate before creating directories: once they exist, the early
    # return above would skip every later call.
    if num_processes < 1:
        raise ValueError(f"num_processes must be >= 1, got {num_processes}")

    speaker_ids = {spk for _, spk in datasets}
    for speaker_id in speaker_ids:
        os.makedirs(os.path.join(waves_dir, f"{speaker_id:05}"), exist_ok=True)
        os.makedirs(os.path.join(waves16k_dir, f"{speaker_id:05}"), exist_ok=True)

    # Sort by path and give every file a stable index for its output names.
    indexed = list(enumerate(sorted(datasets, key=operator.itemgetter(0))))

    # n of datasets per process; the first `remainder` shares get one extra.
    base, remainder = divmod(len(indexed), num_processes)
    share_sizes = [base + 1 if i < remainder else base for i in range(num_processes)]
    assert len(indexed) == sum(share_sizes), (
        f"len(all): {len(indexed)}, sum(process_all_nums): {sum(share_sizes)}"
    )

    futures = []
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        offset = 0
        for process_id, share_size in enumerate(share_sizes):
            share = indexed[offset : offset + share_size]
            offset += share_size
            slicer = Slicer(
                sr=sampling_rate,
                threshold=-42,
                min_length=1500,
                min_interval=400,
                hop_size=15,
                max_sil_kept=500,
            )
            futures.append(
                executor.submit(
                    pipeline,
                    slicer,
                    share,
                    waves_dir,
                    waves16k_dir,
                    sampling_rate,
                    is_normalize,
                    process_id=process_id,
                )
            )

    # Leaving the `with` block waited for every worker. A discarded future
    # hides its exception, so ask each one for its result to surface failures.
    for future in futures:
        future.result()

    for speaker_id in speaker_ids:
        write_mute(mute_wav_path, speaker_id, waves_dir, waves16k_dir, sampling_rate)