Spaces:
Sleeping
Sleeping
| import operator | |
| import os | |
| from concurrent.futures import ProcessPoolExecutor | |
| from typing import * | |
| import librosa | |
| import numpy as np | |
| import scipy.signal as signal | |
| from scipy.io import wavfile | |
| from tqdm import tqdm | |
| from lib.rvc.utils import load_audio | |
| from .slicer import Slicer | |
def norm_write(
    tmp_audio: np.ndarray,
    idx0: int,
    idx1: int,
    speaker_id: int,
    outdir: str,
    outdir_16k: str,
    sampling_rate: int,
    max: float,
    alpha: float,
    is_normalize: bool,
):
    """Level-adjust one audio chunk and write it at two sampling rates.

    The chunk is written as float32 WAV to
    ``<outdir>/<speaker_id:05>/<idx0>_<idx1>.wav`` at ``sampling_rate`` and,
    after resampling, to ``<outdir_16k>/<speaker_id:05>/<idx0>_<idx1>.wav``
    at 16 kHz. The speaker sub-directories must already exist.

    Args:
        tmp_audio: Samples of the chunk to write.
        idx0: First component of the output file name.
        idx1: Second component of the output file name.
        speaker_id: Selects the zero-padded, five-digit speaker sub-directory.
        outdir: Root directory for the ``sampling_rate`` output.
        outdir_16k: Root directory for the 16 kHz output.
        sampling_rate: Sampling rate of ``tmp_audio`` in Hz.
        max: Target peak amplitude. The name shadows the builtin but is kept
            because it is part of the signature.
        alpha: Blend factor; ``alpha`` of the peak-normalized signal is mixed
            with ``1 - alpha`` of the unmodified signal.
        is_normalize: If true, apply the blended peak normalization. If
            false, only scale the chunk down when it exceeds ``+/- max``.
    """
    if is_normalize:
        peak = np.abs(tmp_audio).max()
        # A chunk of pure digital silence has a peak of 0; dividing by it
        # would turn every sample into NaN. Leave such chunks unchanged.
        if peak > 0:
            tmp_audio = (tmp_audio / peak * (max * alpha)) + (
                1 - alpha
            ) * tmp_audio
    else:
        # Scale the level down to max (float decoding can overshoot slightly).
        audio_min = np.min(tmp_audio)
        if audio_min < -max:
            tmp_audio = tmp_audio / -audio_min * max
        # Re-measured after the step above, since that may have rescaled.
        audio_max = np.max(tmp_audio)
        if audio_max > max:
            tmp_audio = tmp_audio / audio_max * max

    speaker_dir = f"{speaker_id:05}"
    filename = f"{idx0}_{idx1}.wav"

    wavfile.write(
        os.path.join(outdir, speaker_dir, filename),
        sampling_rate,
        tmp_audio.astype(np.float32),
    )

    tmp_audio = librosa.resample(
        tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq"
    )
    wavfile.write(
        os.path.join(outdir_16k, speaker_dir, filename),
        16000,
        tmp_audio.astype(np.float32),
    )
def write_mute(
    mute_wave_filename: str,
    speaker_id: int,
    outdir: str,
    outdir_16k: str,
    sampling_rate: int,
):
    """Write the mute reference clip into a speaker's two output folders.

    The clip is read with ``load_audio(mute_wave_filename, sampling_rate)``
    and stored as float32 ``mute.wav`` under ``<outdir>/<speaker_id:05>/``
    at ``sampling_rate``; a copy resampled to 16 kHz is stored under
    ``<outdir_16k>/<speaker_id:05>/``. Both directories must already exist.

    Args:
        mute_wave_filename: Path of the source mute clip.
        speaker_id: Selects the zero-padded, five-digit speaker sub-directory.
        outdir: Root directory for the ``sampling_rate`` output.
        outdir_16k: Root directory for the 16 kHz output.
        sampling_rate: Rate passed to ``load_audio`` and used for the first
            output file.
    """
    speaker_dir = f"{speaker_id:05}"

    # Copy at the training sampling rate.
    native_wave = load_audio(mute_wave_filename, sampling_rate)
    native_path = os.path.join(outdir, speaker_dir, "mute.wav")
    wavfile.write(native_path, sampling_rate, native_wave.astype(np.float32))

    # Copy resampled to 16 kHz.
    wave_16k = librosa.resample(
        native_wave,
        orig_sr=sampling_rate,
        target_sr=16000,
        res_type="soxr_vhq",
    )
    path_16k = os.path.join(outdir_16k, speaker_dir, "mute.wav")
    wavfile.write(path_16k, 16000, wave_16k.astype(np.float32))
def pipeline(
    slicer: Slicer,
    datasets: List[Tuple[int, Tuple[str, int]]],  # List[(index, (path, speaker_id))]
    outdir: str,
    outdir_16k: str,
    sampling_rate: int,
    is_normalize: bool,
    process_id: int = 0,
):
    """Filter, slice and chunk every file in ``datasets``, writing each chunk.

    Each file is loaded with ``load_audio``, passed through a 5th-order
    Butterworth high-pass filter with a 48 Hz cutoff, and split by
    ``slicer.slice``. Every resulting segment is cut into 3.7 s windows
    that advance by 3.4 s (0.3 s overlap). Once 4.0 s or less remains, the
    remainder is written as the final chunk of that segment. All chunks go
    through ``norm_write`` with a target peak of 0.95 and ``alpha`` of 0.8.

    Args:
        slicer: Object whose ``slice(audio)`` yields the segments to chunk.
        datasets: Work items as ``(index, (path, speaker_id))``. ``index``
            becomes the first component of each output file name.
        outdir: Root directory for the ``sampling_rate`` output.
        outdir_16k: Root directory for the 16 kHz output.
        sampling_rate: Rate used for loading, filtering and chunk sizing.
        is_normalize: Forwarded to ``norm_write``.
        process_id: Offsets the tqdm bar position so that bars from parallel
            workers do not overwrite each other.
    """
    per = 3.7  # window length in seconds
    overlap = 0.3  # overlap between consecutive windows in seconds
    tail = per + overlap  # remainders up to this length are kept whole
    peak = 0.95
    alpha = 0.8

    # Loop-invariant window arithmetic, in samples.
    hop = sampling_rate * (per - overlap)
    window = int(per * sampling_rate)
    min_remaining = tail * sampling_rate

    bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sampling_rate)

    for index, (wave_filename, speaker_id) in tqdm(datasets, position=1 + process_id):
        audio = load_audio(wave_filename, sampling_rate)
        audio = signal.lfilter(bh, ah, audio)

        # Chunk counter; runs across all segments of one file.
        idx1 = 0
        for segment in slicer.slice(audio):
            i = 0
            while True:
                start = int(hop * i)
                i += 1
                remaining = segment[start:]
                if len(remaining) <= min_remaining:
                    break
                norm_write(
                    segment[start : start + window],
                    index,
                    idx1,
                    speaker_id,
                    outdir,
                    outdir_16k,
                    sampling_rate,
                    peak,
                    alpha,
                    is_normalize,
                )
                idx1 += 1

            # Whatever is left of the segment becomes its last chunk.
            norm_write(
                remaining,
                index,
                idx1,
                speaker_id,
                outdir,
                outdir_16k,
                sampling_rate,
                peak,
                alpha,
                is_normalize,
            )
            idx1 += 1
def preprocess_audio(
    datasets: List[Tuple[str, int]],  # List[(path, speaker_id)]
    sampling_rate: int,
    num_processes: int,
    training_dir: str,
    is_normalize: bool,
    mute_wav_path: str,
):
    """Preprocess all dataset files in parallel into ``training_dir``.

    Output goes to ``<training_dir>/0_gt_wavs`` (at ``sampling_rate``) and
    ``<training_dir>/1_16k_wavs`` (at 16 kHz), each with one sub-directory
    per speaker. If both directories already exist the function returns
    immediately without doing any work.

    The files are sorted by path, numbered, and split into
    ``num_processes`` contiguous shares that are each handled by
    ``pipeline`` in a worker process. A ``mute.wav`` is then written for
    every speaker.

    Args:
        datasets: ``(path, speaker_id)`` pairs to preprocess.
        sampling_rate: Target sampling rate of the ``0_gt_wavs`` output.
        num_processes: Number of worker processes; must be at least 1.
        training_dir: Directory that receives the two output trees.
        is_normalize: Forwarded to ``pipeline``.
        mute_wav_path: Source clip passed to ``write_mute``.

    Raises:
        ValueError: If ``num_processes`` is less than 1.
        Exception: Any exception raised inside a worker is re-raised here
            once all workers have finished.
    """
    waves_dir = os.path.join(training_dir, "0_gt_wavs")
    waves16k_dir = os.path.join(training_dir, "1_16k_wavs")
    if os.path.exists(waves_dir) and os.path.exists(waves16k_dir):
        return

    # Validate before creating directories: a failure after they exist would
    # make the next call take the early return above and skip preprocessing.
    if num_processes < 1:
        raise ValueError(f"num_processes must be >= 1, got {num_processes}")

    speaker_ids = {spk for _, spk in datasets}
    for speaker_id in speaker_ids:
        os.makedirs(os.path.join(waves_dir, f"{speaker_id:05}"), exist_ok=True)
        os.makedirs(os.path.join(waves16k_dir, f"{speaker_id:05}"), exist_ok=True)

    # (index, (path, speaker_id)); the index becomes part of the file names.
    indexed = list(enumerate(sorted(datasets, key=operator.itemgetter(0))))

    # Even split; the first `extra` workers take one additional item.
    base, extra = divmod(len(indexed), num_processes)
    counts = [base + 1 if i < extra else base for i in range(num_processes)]

    futures = []
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        offset = 0
        for process_id, count in enumerate(counts):
            data = indexed[offset : offset + count]
            slicer = Slicer(
                sr=sampling_rate,
                threshold=-42,
                min_length=1500,
                min_interval=400,
                hop_size=15,
                max_sil_kept=500,
            )
            futures.append(
                executor.submit(
                    pipeline,
                    slicer,
                    data,
                    waves_dir,
                    waves16k_dir,
                    sampling_rate,
                    is_normalize,
                    process_id=process_id,
                )
            )
            offset += count

    # The pool has finished here. Asking each future for its result re-raises
    # any worker exception instead of silently leaving a partial dataset.
    for future in futures:
        future.result()

    for speaker_id in speaker_ids:
        write_mute(mute_wav_path, speaker_id, waves_dir, waves16k_dir, sampling_rate)