| | |
| | |
| | |
| | |
| |
|
| | import torch |
| | import numpy as np |
| | from numpy import linalg as LA |
| | import librosa |
| | import soundfile as sf |
| | import librosa.filters |
| |
|
| |
|
def load_audio_torch(wave_file, fs):
    """Load audio data into a mono float torch tensor scaled into [-1, 1].

    Args:
        wave_file (str): path to wave file
        fs (int or None): requested sample rate. The value is forwarded to
            ``librosa.load`` as ``sr``; with ``None`` the rate reported by
            ``librosa.load`` is used and returned.

    Returns:
        audio (tensor): audio data in a float tensor. An empty list is
            returned instead when the samples contain NaN or Inf.
        fs (int): sample rate of the returned audio

    Raises:
        AssertionError: if the file contains two samples or fewer.
    """
    audio, sample_rate = librosa.load(wave_file, sr=fs, mono=True)

    # Explicit raise (rather than a bare ``assert``) so the check survives
    # ``python -O`` while keeping the exception type callers may rely on.
    if len(audio) <= 2:
        raise AssertionError(
            f"audio too short: {len(audio)} samples in {wave_file!r}"
        )

    # Pick a normalisation constant from the observed magnitude range:
    # integer dtypes use their full-scale value, floats are bucketed into
    # "already in [-1, 1]", "16-bit range" or "32-bit range".
    if np.issubdtype(audio.dtype, np.integer):
        max_mag = -np.iinfo(audio.dtype).min
    else:
        max_mag = max(np.amax(audio), -np.amin(audio))
        if max_mag > (2**15):
            max_mag = (2**31) + 1
        elif max_mag > 1.01:
            max_mag = (2**15) + 1
        else:
            max_mag = 1.0

    audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag

    # Corrupt input: signal it with an empty list and a best-effort rate.
    if (torch.isnan(audio) | torch.isinf(audio)).any():
        return [], sample_rate or fs or 48000

    # Defensive resample in case the loader did not honour the request.
    # NOTE(review): librosa.load is documented to resample to ``sr`` itself,
    # so this branch is presumably never taken - confirm before removing.
    if fs is not None and fs != sample_rate:
        audio = torch.from_numpy(
            librosa.core.resample(audio.numpy(), orig_sr=sample_rate, target_sr=fs)
        )
        sample_rate = fs

    # Return the effective rate, not the raw argument: with ``fs=None`` the
    # argument is ``None`` while ``sample_rate`` holds the loader's rate.
    return audio, sample_rate
| |
|
| |
|
def _stft(y, cfg):
    """Run ``librosa.stft`` on ``y`` using the window settings held by ``cfg``.

    Args:
        y: audio signal passed straight through to ``librosa.stft``.
        cfg: object exposing ``n_fft``, ``hop_size`` and ``win_size``.

    Returns:
        Whatever ``librosa.stft`` returns for these arguments.
    """
    stft_kwargs = {
        "n_fft": cfg.n_fft,
        "hop_length": cfg.hop_size,
        "win_length": cfg.win_size,
    }
    return librosa.stft(y=y, **stft_kwargs)
| |
|
| |
|
def energy(wav, cfg):
    """Return the L2 norm of the STFT magnitudes of ``wav``.

    The magnitude spectrogram is transposed and the norm is taken along
    axis 1 of the transposed array.
    NOTE(review): presumably this yields one value per STFT frame (librosa
    puts frames on the last axis for 1-D input) - confirm against callers.

    Args:
        wav: audio signal forwarded to ``_stft``.
        cfg: STFT configuration forwarded to ``_stft``.

    Returns:
        numpy array of norms.
    """
    spectrum = _stft(wav, cfg)
    transposed_mag = np.transpose(np.abs(spectrum))
    return LA.norm(transposed_mag, axis=1)
| |
|
| |
|
def get_energy_from_tacotron(audio, _stft):
    """Compute mel and energy features for ``audio`` with a supplied STFT object.

    Args:
        audio: sequence of samples; converted to a float tensor, given a
            leading dimension of size 1 and clipped to [-1, 1].
        _stft: object exposing ``mel_spectrogram(tensor)`` that returns a
            ``(mel, energy)`` pair. Note that this parameter shadows any
            module-level ``_stft`` function inside this body.

    Returns:
        mel: the first value returned by ``_stft.mel_spectrogram``, unchanged.
        energy (np.ndarray): float32 array with dimension 0 squeezed out.
    """
    # Plain tensors do not track gradients by default, so the deprecated
    # ``torch.autograd.Variable(..., requires_grad=False)`` wrapper is not needed.
    audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
    mel, energy = _stft.mel_spectrogram(audio)
    # detach()/cpu() keep the numpy conversion from raising when the STFT
    # object returns a tensor attached to the autograd graph or on another device.
    energy = torch.squeeze(energy, 0).detach().cpu().numpy().astype(np.float32)
    return mel, energy
| |
|