| | from scipy.ndimage.morphology import binary_dilation |
| | from encoder.params_data import * |
| | from pathlib import Path |
| | from typing import Optional, Union |
| | from warnings import warn |
| | import numpy as np |
| | import librosa |
| | import struct |
| |
|
| | try: |
| | import webrtcvad |
| | except: |
| | warn("Unable to import 'webrtcvad'. This package enables noise removal and is recommended.") |
| | webrtcvad=None |
| |
|
| | int16_max = (2 ** 15) - 1 |
| |
|
| |
|
| | def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray], |
| | source_sr: Optional[int] = None, |
| | normalize: Optional[bool] = True, |
| | trim_silence: Optional[bool] = True): |
| | """ |
| | Applies the preprocessing operations used in training the Speaker Encoder to a waveform |
| | either on disk or in memory. The waveform will be resampled to match the data hyperparameters. |
| | |
| | :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not |
| | just .wav), either the waveform as a numpy array of floats. |
| | :param source_sr: if passing an audio waveform, the sampling rate of the waveform before |
| | preprocessing. After preprocessing, the waveform's sampling rate will match the data |
| | hyperparameters. If passing a filepath, the sampling rate will be automatically detected and |
| | this argument will be ignored. |
| | """ |
| | |
| | if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): |
| | wav, source_sr = librosa.load(str(fpath_or_wav), sr=None) |
| | else: |
| | wav = fpath_or_wav |
| | |
| | |
| | if source_sr is not None and source_sr != sampling_rate: |
| | wav = librosa.resample(wav, source_sr, sampling_rate) |
| | |
| | |
| | if normalize: |
| | wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True) |
| | if webrtcvad and trim_silence: |
| | wav = trim_long_silences(wav) |
| | |
| | return wav |
| |
|
| |
|
| | def wav_to_mel_spectrogram(wav): |
| | """ |
| | Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform. |
| | Note: this not a log-mel spectrogram. |
| | """ |
| | frames = librosa.feature.melspectrogram( |
| | y=wav, |
| | sr=sampling_rate, |
| | n_fft=int(sampling_rate * mel_window_length / 1000), |
| | hop_length=int(sampling_rate * mel_window_step / 1000), |
| | n_mels=mel_n_channels |
| | ) |
| | return frames.astype(np.float32).T |
| |
|
| |
|
| | def trim_long_silences(wav): |
| | """ |
| | Ensures that segments without voice in the waveform remain no longer than a |
| | threshold determined by the VAD parameters in params.py. |
| | |
| | :param wav: the raw waveform as a numpy array of floats |
| | :return: the same waveform with silences trimmed away (length <= original wav length) |
| | """ |
| | |
| | samples_per_window = (vad_window_length * sampling_rate) // 1000 |
| | |
| | |
| | wav = wav[:len(wav) - (len(wav) % samples_per_window)] |
| | |
| | |
| | pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16)) |
| | |
| | |
| | voice_flags = [] |
| | vad = webrtcvad.Vad(mode=3) |
| | for window_start in range(0, len(wav), samples_per_window): |
| | window_end = window_start + samples_per_window |
| | voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2], |
| | sample_rate=sampling_rate)) |
| | voice_flags = np.array(voice_flags) |
| | |
| | |
| | def moving_average(array, width): |
| | array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2))) |
| | ret = np.cumsum(array_padded, dtype=float) |
| | ret[width:] = ret[width:] - ret[:-width] |
| | return ret[width - 1:] / width |
| | |
| | audio_mask = moving_average(voice_flags, vad_moving_average_width) |
| | audio_mask = np.round(audio_mask).astype(bool) |
| | |
| | |
| | audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1)) |
| | audio_mask = np.repeat(audio_mask, samples_per_window) |
| | |
| | return wav[audio_mask == True] |
| |
|
| |
|
| | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): |
| | if increase_only and decrease_only: |
| | raise ValueError("Both increase only and decrease only are set") |
| | dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2)) |
| | if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only): |
| | return wav |
| | return wav * (10 ** (dBFS_change / 20)) |
| |
|