| # raccoonML audio tools. | |
| # MIT License | |
| # Copyright (c) 2021 raccoonML (https://patreon.com/raccoonML) | |
| # | |
| # Permission is hereby granted, free of charge, to any person obtaining a copy | |
| # of this software and associated documentation files (the "Software") to deal | |
| # in the Software without restriction, including without limitation the rights | |
| # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| # copies of the Software, and to permit persons to whom the Software is | |
| # furnished to do so, subject to the following conditions: | |
| # | |
| # The above copyright notice and this permission notice shall be included in | |
| # all copies or substantial portions of the Software. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR ANY OTHER | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
| # THE SOFTWARE. | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
| import torch | |
| from scipy import signal | |
| _mel_basis = None | |
| def load_wav(path, sr): | |
| # Loads an audio file and returns the waveform data. | |
| wav, _ = librosa.load(str(path), sr=sr) | |
| return wav | |
| def save_wav(wav, path, sr): | |
| # Saves waveform data to audio file. | |
| sf.write(path, wav, sr) | |
| def melspectrogram(wav, hparams): | |
| # Converts a waveform to a mel-scale spectrogram. | |
| # Output shape = (num_mels, frames) | |
| # Apply preemphasis | |
| if hparams.preemphasize: | |
| wav = preemphasis(wav, hparams.preemphasis) | |
| # Short-time Fourier Transform (STFT) | |
| D = librosa.stft( | |
| y=wav, | |
| n_fft=hparams.n_fft, | |
| hop_length=hparams.hop_size, | |
| win_length=hparams.win_size, | |
| ) | |
| # Convert complex-valued output of STFT to absolute value (real) | |
| S = np.abs(D) | |
| # Build and cache mel basis | |
| # This improves speed when calculating thousands of mel spectrograms. | |
| global _mel_basis | |
| if _mel_basis is None: | |
| _mel_basis = _build_mel_basis(hparams) | |
| # Transform to mel scale | |
| S = np.dot(_mel_basis, S) | |
| # Dynamic range compression | |
| S = np.log(np.clip(S, a_min=1e-5, a_max=None)) | |
| return S.astype(np.float32) | |
| def inv_mel_spectrogram(S, hparams): | |
| # Converts a mel spectrogram to waveform using Griffin-Lim | |
| # Input shape = (num_mels, frames) | |
| # Denormalize | |
| S = np.exp(S) | |
| # Build and cache mel basis | |
| # This improves speed when calculating thousands of mel spectrograms. | |
| global _mel_basis | |
| if _mel_basis is None: | |
| _mel_basis = _build_mel_basis(hparams) | |
| # Inverse mel basis | |
| p = np.matmul(_mel_basis, _mel_basis.T) | |
| d = [1.0 / x if np.abs(x) > 1.0e-8 else x for x in np.sum(p, axis=0)] | |
| _inv_mel_basis = np.matmul(_mel_basis.T, np.diag(d)) | |
| # Invert mel basis to recover linear spectrogram | |
| S = np.dot(_inv_mel_basis, S) | |
| # Use Griffin-Lim to recover waveform | |
| wav = _griffin_lim(S ** hparams.power, hparams) | |
| # Invert preemphasis | |
| if hparams.preemphasize: | |
| wav = inv_preemphasis(wav, hparams.preemphasis) | |
| return wav | |
| def preemphasis(wav, k, preemphasize=True): | |
| # Amplifies high frequency content in a waveform. | |
| if preemphasize: | |
| wav = signal.lfilter([1, -k], [1], wav) | |
| return wav | |
| def inv_preemphasis(wav, k, inv_preemphasize=True): | |
| # Inverts the preemphasis filter. | |
| if inv_preemphasize: | |
| wav = signal.lfilter([1], [1, -k], wav) | |
| return wav | |
| def _build_mel_basis(hparams): | |
| return librosa.filters.mel( | |
| sr=hparams.sample_rate, | |
| n_fft=hparams.n_fft, | |
| n_mels=hparams.num_mels, | |
| fmin=hparams.fmin, | |
| fmax=hparams.fmax, | |
| ) | |
| def _griffin_lim(S, hparams): | |
| angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) | |
| S = np.abs(S).astype(np.complex) | |
| wav = librosa.istft( | |
| S * angles, hop_length=hparams.hop_size, win_length=hparams.win_size | |
| ) | |
| for i in range(hparams.griffin_lim_iters): | |
| angles = np.exp( | |
| 1j | |
| * np.angle( | |
| librosa.stft( | |
| wav, | |
| n_fft=hparams.n_fft, | |
| hop_length=hparams.hop_size, | |
| win_length=hparams.win_size, | |
| ) | |
| ) | |
| ) | |
| wav = librosa.istft( | |
| S * angles, hop_length=hparams.hop_size, win_length=hparams.win_size | |
| ) | |
| return wav | |