| |
| from typing import Union |
| from numpy import ndarray |
| from torch import Tensor |
| |
| import os |
| import torch |
| import numpy as np |
| import librosa.display |
| from librosa.filters import mel as librosa_mel_fn |
| try: |
| import matplotlib.pyplot as plt |
| except: |
| print('matplotlib is uninstalled') |
| |
| from TorchJaekwon.Util.UtilAudioSTFT import UtilAudioSTFT |
| from TorchJaekwon.Util.UtilTorch import UtilTorch |
|
|
class UtilAudioMelSpec(UtilAudioSTFT):
    """Mel-spectrogram helpers layered on top of ``UtilAudioSTFT``.

    The librosa mel filterbank is built once in ``__init__`` and stored both
    as a NumPy array and as a float32 torch tensor, so magnitude spectrograms
    of either kind can be projected onto the mel scale without rebuilding it.
    """

    def __init__(self,
                 nfft: int,
                 hop_size: int,
                 sample_rate: int,
                 mel_size: int,
                 frequency_min: float,
                 frequency_max: Union[float, None] = None):
        """Build the mel filterbank for the given analysis settings.

        Args:
            nfft: FFT size, forwarded to ``UtilAudioSTFT.__init__``.
            hop_size: Hop size, forwarded to ``UtilAudioSTFT.__init__``.
            sample_rate: Sampling rate of the audio in Hz.
            mel_size: Number of mel bands.
            frequency_min: Lowest frequency (Hz) covered by the filterbank.
            frequency_max: Highest frequency (Hz) covered by the filterbank;
                ``None`` falls back to ``sample_rate // 2``.
        """
        super().__init__(nfft, hop_size)

        self.sample_rate: int = sample_rate
        self.mel_size: int = mel_size
        self.frequency_min: float = frequency_min
        self.frequency_max: float = (
            frequency_max if frequency_max is not None else sample_rate // 2
        )

        # self.nfft is presumably set by UtilAudioSTFT.__init__ -- TODO confirm.
        self.mel_basis_np: ndarray = librosa_mel_fn(sr=self.sample_rate,
                                                    n_fft=self.nfft,
                                                    n_mels=self.mel_size,
                                                    fmin=self.frequency_min,
                                                    fmax=self.frequency_max)
        self.mel_basis_tensor: Tensor = torch.from_numpy(self.mel_basis_np).float()

        # The attribute name keeps its historical spelling ("frequncies")
        # because external code may already read it.
        # NOTE(review): this asks librosa for `mel_size` points between fmin
        # and fmax, whereas librosa's filterbank is documented to use
        # `n_mels + 2` band edges, so these values may not coincide with the
        # filter centre frequencies -- confirm this is intended.
        self.mel_frequncies = librosa.mel_frequencies(n_mels=self.mel_size,
                                                      fmin=self.frequency_min,
                                                      fmax=self.frequency_max)

    @staticmethod
    def get_default_mel_spec_config(sample_rate: int = 16000) -> dict:
        """Return default constructor keyword arguments for ``sample_rate``.

        Sample rates up to 24 kHz get a 1024-point FFT with 80 mel bands;
        higher rates get a 2048-point FFT with 128 mel bands. The hop size is
        a quarter of the FFT size and the band spans 0 Hz to
        ``sample_rate // 2``.
        """
        is_low_rate = sample_rate <= 24000
        nfft: int = 1024 if is_low_rate else 2048
        mel_size: int = 80 if is_low_rate else 128
        return {
            'nfft': nfft,
            'hop_size': nfft // 4,
            'sample_rate': sample_rate,
            'mel_size': mel_size,
            'frequency_max': sample_rate // 2,
            'frequency_min': 0,
        }

    def spec_to_mel_spec(self, stft_mag: Union[ndarray, Tensor]) -> Union[ndarray, Tensor]:
        """Project a magnitude spectrogram onto the mel filterbank.

        Args:
            stft_mag: Magnitude spectrogram. It is right-multiplied onto the
                filterbank with ``matmul``, so its second-to-last axis must
                match the filterbank's frequency axis.

        Returns:
            The mel spectrogram, of the same array kind as ``stft_mag``.

        Raises:
            TypeError: If ``stft_mag`` is neither an ndarray nor a Tensor.
        """
        if isinstance(stft_mag, np.ndarray):
            return np.matmul(self.mel_basis_np, stft_mag)
        if isinstance(stft_mag, Tensor):
            # Keep the moved copy so later calls on the same device do not
            # pay for the transfer again.
            self.mel_basis_tensor = self.mel_basis_tensor.to(stft_mag.device)
            return torch.matmul(self.mel_basis_tensor, stft_mag)
        # Raise instead of print + exit(): an unsupported input must not
        # terminate the whole process.
        raise TypeError(
            "spec_to_mel_spec type error: expected numpy.ndarray or "
            f"torch.Tensor, got {type(stft_mag).__name__}"
        )

    def dynamic_range_compression(self, x, C=1, clip_val=1e-5):
        """Return ``log(max(x, clip_val) * C)`` element-wise.

        Args:
            x: ndarray or Tensor of values to compress.
            C: Multiplier applied after clipping and before the logarithm.
            clip_val: Lower bound applied to ``x`` before the logarithm.

        Returns:
            The compressed values, of the same array kind as ``x``.

        Raises:
            TypeError: If ``x`` is neither an ndarray nor a Tensor.
        """
        if isinstance(x, np.ndarray):
            return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
        if isinstance(x, Tensor):
            return torch.log(torch.clamp(x, min=clip_val) * C)
        raise TypeError(
            "dynamic_range_compression type error: expected numpy.ndarray or "
            f"torch.Tensor, got {type(x).__name__}"
        )

    def get_hifigan_mel_spec(self,
                             audio: Union[ndarray, Tensor],
                             return_type: str = 'Tensor'
                             ) -> Union[ndarray, Tensor]:
        """Compute a natural-log mel spectrogram of ``audio``.

        Args:
            audio: Waveform. An ndarray is converted to a float tensor, and
                leading axes are added until the input has at least two
                dimensions.
            return_type: ``'ndarray'`` to get a NumPy array back; any other
                value returns the torch tensor.

        Returns:
            The log-compressed mel spectrogram.
        """
        if isinstance(audio, ndarray):
            audio = torch.FloatTensor(audio)
        while audio.dim() < 2:
            audio = audio.unsqueeze(0)

        # Samples outside [-1, 1] are only reported, not clipped.
        min_value = torch.min(audio)
        if min_value < -1.:
            print('min value is ', min_value)
        max_value = torch.max(audio)
        if max_value > 1.:
            print('max value is ', max_value)

        # stft_torch is not defined in this class; presumably inherited from
        # UtilAudioSTFT and returning a dict with a "mag" entry -- TODO confirm.
        spectrogram = self.stft_torch(audio)["mag"]
        mel_spec = self.spec_to_mel_spec(spectrogram)
        log_scale_mel = self.dynamic_range_compression(mel_spec)

        if return_type == 'ndarray':
            return log_scale_mel.cpu().detach().numpy()
        return log_scale_mel

    def mel_spec_plot(self,
                      save_path: str,
                      mel_spec: Union[ndarray, Tensor],
                      fig_size: tuple = (8, 4),
                      dpi: int = 500) -> None:
        """Render ``mel_spec`` as an image and save it as a PNG file.

        Args:
            save_path: Destination path; its extension must be ``.png``.
            mel_spec: Mel spectrogram to draw. Tensors are converted with
                ``UtilTorch.to_np`` first.
            fig_size: Figure size passed to ``plt.figure``.
            dpi: Resolution passed to ``plt.savefig``.

        Raises:
            ValueError: If ``save_path`` does not end in ``.png``.
            ImportError: If matplotlib is not installed.
        """
        # Explicit raise rather than assert, which is stripped under -O.
        if os.path.splitext(save_path)[1] != ".png":
            raise ValueError("file extension should be '.png'")

        # Imported here so a missing matplotlib surfaces as a clear
        # ImportError instead of a NameError on the module-level `plt`.
        import matplotlib.pyplot as plt

        if isinstance(mel_spec, Tensor):
            mel_spec = UtilTorch.to_np(mel_spec)

        figure = plt.figure(figsize=fig_size)
        try:
            plt.imshow(mel_spec, origin='lower', aspect='auto', cmap='viridis')
            plt.savefig(save_path, dpi=dpi)
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(figure)

    def f0_to_melbin(self, f0: Tensor) -> Tensor:
        """Map each f0 value to the index of the first mel frequency >= f0.

        Values above every stored mel frequency map to the last index.

        Args:
            f0: Tensor of fundamental frequencies, in the same unit as
                ``self.mel_frequncies``.

        Returns:
            Integer tensor of indices with the same shape as ``f0``.
        """
        mel_frequencies = torch.as_tensor(self.mel_frequncies,
                                          dtype=torch.float32,
                                          device=f0.device)
        # Broadcast to (*f0.shape, n_mels): True where the mel frequency
        # lies below the corresponding f0 value.
        below_f0 = (mel_frequencies - f0.unsqueeze(-1)) < 0
        # Exclude the too-low frequencies from the arg-min search.
        candidates = mel_frequencies.expand(below_f0.shape).masked_fill(
            below_f0, float('inf'))
        melbin = torch.argmin(candidates, dim=-1)
        # f0 above every mel frequency: fall back to the last index.
        above_all = torch.all(below_f0, dim=-1)
        return melbin.masked_fill(above_all, mel_frequencies.shape[0] - 1)
|
|