| import librosa |
| import numpy as np |
| import pyloudnorm as pyln |
|
|
| from text_to_speech.utils.audio.vad import trim_long_silences |
|
|
|
|
def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    """Return the ``(left, right)`` padding that extends ``x`` past the next hop boundary.

    The total padding is ``(len // fshift + 1) * fshift - len``, so a signal
    whose length is already a multiple of ``fshift`` still receives one full
    extra hop.

    Args:
        x: Array-like exposing ``shape``; only ``x.shape[0]`` is read.
        fsize: Frame size. Accepted for interface compatibility; not used
            in the computation.
        fshift: Hop size in samples.
        pad_sides: ``1`` puts all padding on the right; ``2`` splits it
            across both sides, with the odd sample going to the right.

    Returns:
        Tuple ``(left_pad, right_pad)`` of sample counts.
    """
    assert pad_sides in (1, 2)

    n_samples = x.shape[0]
    total = (n_samples // fshift + 1) * fshift - n_samples
    if pad_sides != 1:
        # Split evenly; any leftover sample is added to the right side.
        left, leftover = divmod(total, 2)
        return left, left + leftover
    return 0, total
|
|
|
|
def amp_to_db(x):
    """Convert linear amplitude to decibels (``20 * log10``).

    Values below ``1e-5`` are raised to ``1e-5`` before taking the log, so
    the result never drops below -100 dB.
    """
    floor = 1e-5
    clamped = np.maximum(floor, x)
    return 20 * np.log10(clamped)
|
|
|
|
def db_to_amp(x):
    """Convert decibels to linear amplitude, i.e. ``10 ** (x / 20)``."""
    exponent = x * 0.05
    return 10.0 ** exponent
|
|
|
|
def normalize(S, min_level_db):
    """Linearly rescale ``S`` so ``min_level_db`` maps to 0 and 0 dB maps to 1.

    Args:
        S: Value(s) to rescale.
        min_level_db: Lower reference level; the formula divides by its
            negation.

    Returns:
        ``(S - min_level_db) / -min_level_db``.
    """
    span = -min_level_db
    shifted = S - min_level_db
    return shifted / span
|
|
|
|
def denormalize(D, min_level_db):
    """Map ``D`` back to the dB scale: 0 becomes ``min_level_db``, 1 becomes 0 dB.

    Args:
        D: Rescaled value(s).
        min_level_db: The lower reference level used when rescaling.

    Returns:
        ``D * -min_level_db + min_level_db``.
    """
    span = -min_level_db
    scaled = D * span
    return scaled + min_level_db
|
|
|
|
def librosa_wav2spec(wav_path,
                     fft_size=1024,
                     hop_size=256,
                     win_length=1024,
                     window="hann",
                     num_mels=80,
                     fmin=80,
                     fmax=-1,
                     eps=1e-6,
                     sample_rate=22050,
                     loud_norm=False,
                     trim_long_sil=False):
    """Compute log-mel and log-linear magnitude spectrograms for a waveform.

    Args:
        wav_path: Either a ``str`` path to an audio file, or an already
            loaded waveform, which is used as-is (no resampling).
        fft_size: FFT size passed to the STFT and the mel filterbank.
        hop_size: STFT hop length in samples.
        win_length: STFT window length in samples.
        window: Window specification forwarded to ``librosa.stft``.
        num_mels: Number of mel bands.
        fmin: Lowest filterbank frequency; ``-1`` means 0.
        fmax: Highest filterbank frequency; ``-1`` means ``sample_rate / 2``.
        eps: Floor applied to magnitudes before ``log10``.
        sample_rate: Sampling rate used for loading, loudness metering and
            the mel filterbank.
        loud_norm: If true, loudness-normalise to -22.0 with pyloudnorm and
            rescale when the peak then exceeds 1.
        trim_long_sil: If true (and ``wav_path`` is a ``str``), load through
            ``trim_long_silences`` instead of ``librosa.load``.

    Returns:
        dict with keys:
            ``'wav'``: waveform zero-padded on the right and then cut to
                ``n_frames * hop_size`` samples;
            ``'mel'``: transposed ``log10`` mel spectrogram
                (presumably ``(n_frames, num_mels)`` — follows from
                librosa's documented output layout);
            ``'linear'``: transposed ``log10`` linear magnitude spectrogram;
            ``'mel_basis'``: the mel filterbank matrix.
    """
    if isinstance(wav_path, str):
        if trim_long_sil:
            # Only the first of the three returned values is used here.
            wav, _, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if loud_norm:
        meter = pyln.Meter(sample_rate)
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        # Loudness normalisation can push the peak above 1; rescale if so.
        peak = np.abs(wav).max()
        if peak > 1:
            wav = wav / peak

    # Magnitude STFT.
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window,
                          pad_mode="constant")
    linear_spc = np.abs(x_stft)

    # Mel filterbank; -1 is the sentinel for "use the default band edge".
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments are required by librosa >= 0.10 (positional use
    # raises TypeError there) and are accepted by older releases as well.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size,
                                    n_mels=num_mels, fmin=fmin, fmax=fmax)

    # Project onto mel bands and take the floored log.
    mel = mel_basis @ linear_spc
    mel = np.log10(np.maximum(eps, mel))

    # Make the waveform exactly hop_size samples per spectrogram frame.
    l_pad, r_pad = librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    linear_spc = np.log10(np.maximum(eps, linear_spc))
    return {'wav': wav, 'mel': mel.T, 'linear': linear_spc.T, 'mel_basis': mel_basis}
|
|