|
|
from functools import lru_cache |
|
|
|
|
|
from scipy import signal |
|
|
import numpy as np |
|
|
import librosa |
|
|
|
|
|
|
|
|
@lru_cache()
def mel_basis(hp):
    """Return the mel filterbank built by ``librosa.filters.mel`` for ``hp``.

    The result is memoized with ``lru_cache``, so ``hp`` must be hashable and
    repeated calls with an equal ``hp`` reuse the previously built filterbank.

    Reads ``hp.sample_rate``, ``hp.n_fft``, ``hp.num_mels``, ``hp.fmin`` and
    ``hp.fmax``. Asserts that ``hp.fmax`` does not exceed half the sample rate
    (integer division).
    """
    assert hp.fmax <= hp.sample_rate // 2

    # Bind each setting to a local first, in the same order the filterbank
    # call consumes them.
    sample_rate = hp.sample_rate
    fft_size = hp.n_fft
    band_count = hp.num_mels
    lowest_freq = hp.fmin
    highest_freq = hp.fmax

    return librosa.filters.mel(
        sr=sample_rate,
        n_fft=fft_size,
        n_mels=band_count,
        fmin=lowest_freq,
        fmax=highest_freq,
    )
|
|
|
|
|
|
|
|
def preemphasis(wav, hp):
    """Apply a first-order pre-emphasis filter to ``wav`` and clip to [-1, 1].

    The filter has numerator ``[1, -hp.preemphasis]`` and denominator ``[1]``,
    i.e. each output sample is ``x[n] - hp.preemphasis * x[n - 1]``.

    Asserts that ``hp.preemphasis`` is non-zero. Returns the filtered signal
    with every sample limited to the range [-1, 1].
    """
    coeff = hp.preemphasis
    assert coeff != 0

    numerator = [1, -coeff]
    denominator = [1]
    emphasized = signal.lfilter(numerator, denominator, wav)

    # Filtering can push samples outside the unit range; clamp them back.
    return np.clip(emphasized, -1, 1)
|
|
|
|
|
|
|
|
def melspectrogram(wav, hp, pad=True):
    """Compute a mel spectrogram of ``wav`` according to the settings in ``hp``.

    Steps, in order:
      1. If ``hp.preemphasis > 0``, run ``preemphasis`` and assert the peak
         absolute value does not exceed 1 (within 1e-07).
      2. Take the STFT via ``_stft`` (``pad`` is forwarded) and its magnitude.
      3. If ``hp.mel_power != 1.0``, raise the magnitudes to that power.
      4. Project onto the mel filterbank from ``mel_basis(hp)``.
      5. If ``hp.mel_type == "db"``, convert with ``_amp_to_db``.
      6. If ``hp.normalized_mels`` is truthy, rescale with ``_normalize`` and
         cast to float32.

    When ``pad`` is truthy, asserts that the second axis of the result has
    ``1 + len(wav) // hp.hop_size`` entries. Returns the resulting array.
    """
    if hp.preemphasis > 0:
        wav = preemphasis(wav, hp)
        peak = np.abs(wav).max()
        assert peak - 1 < 1e-07

    magnitudes = np.abs(_stft(wav, hp, pad=pad))

    # In-place power keeps the magnitude array's dtype unchanged.
    if hp.mel_power != 1.0:
        magnitudes **= hp.mel_power

    filterbank = mel_basis(hp)
    mel = np.dot(filterbank, magnitudes)

    if hp.mel_type == "db":
        mel = _amp_to_db(mel, hp)

    if hp.normalized_mels:
        mel = _normalize(mel, hp).astype(np.float32)

    # Sanity check on the frame count; only meaningful for the padded STFT.
    if pad:
        assert mel.shape[1] == 1 + len(wav) // hp.hop_size
    return mel
|
|
|
|
|
|
|
|
def _stft(y, hp, pad=True):
    """Return ``librosa.stft`` of ``y`` using the frame settings in ``hp``.

    ``hp.n_fft``, ``hp.hop_size`` and ``hp.win_size`` are passed as
    ``n_fft``, ``hop_length`` and ``win_length``. ``pad`` is forwarded as
    librosa's ``center`` flag, and the padding mode is always ``"reflect"``.
    """
    stft_options = {
        "n_fft": hp.n_fft,
        "hop_length": hp.hop_size,
        "win_length": hp.win_size,
        "center": pad,
        "pad_mode": "reflect",
    }
    return librosa.stft(y, **stft_options)
|
|
|
|
|
|
|
|
def _amp_to_db(x, hp):
    """Convert amplitudes ``x`` to decibels as ``20 * log10(x)``.

    Values below ``hp.stft_magnitude_min`` are raised to that floor first, so
    the logarithm never sees anything smaller than the floor.
    """
    floored = np.maximum(hp.stft_magnitude_min, x)
    return 20 * np.log10(floored)
|
|
|
|
|
|
|
|
def _db_to_amp(x):
    """Convert decibel values ``x`` to amplitudes as ``10 ** (x * 0.05)``."""
    exponent = x * 0.05
    return np.power(10.0, exponent)
|
|
|
|
|
|
|
|
def _normalize(s, hp, headroom_db=15):
    """Linearly rescale decibel values ``s``.

    The floor is ``20 * log10(hp.stft_magnitude_min)``. A value at the floor
    maps to 0 and a value of ``headroom_db`` maps to 1; nothing is clipped, so
    inputs outside that interval map outside [0, 1].
    """
    floor_db = 20 * np.log10(hp.stft_magnitude_min)
    span_db = headroom_db - floor_db
    return (s - floor_db) / span_db
|
|
|