File size: 4,769 Bytes
dfd1909
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#type
from typing import Union
from numpy import ndarray
from torch import Tensor
#package
import os
import torch
import numpy as np
import librosa.display
from librosa.filters import mel as librosa_mel_fn
try:
    import matplotlib.pyplot as plt
except ImportError:
    # Optional dependency: the module must still import without matplotlib.
    # Only a missing/broken package is tolerated; KeyboardInterrupt, SystemExit
    # and unrelated errors are no longer swallowed.
    print('matplotlib is uninstalled')
#torchjaekwon
from TorchJaekwon.Util.UtilAudioSTFT import UtilAudioSTFT
from TorchJaekwon.Util.UtilTorch import UtilTorch

class UtilAudioMelSpec(UtilAudioSTFT):
    """Mel-spectrogram helpers built on top of ``UtilAudioSTFT``.

    A librosa mel filter bank is created once in ``__init__`` and stored both
    as a NumPy array and as a float32 torch tensor, so magnitude spectrograms
    of either kind can be projected onto the mel scale.
    """

    def __init__(self,
                 nfft: int,
                 hop_size: int,
                 sample_rate: int,
                 mel_size: int,
                 frequency_min: float,
                 frequency_max: Union[float, None]):
        """Store the configuration and precompute the mel filter bank.

        Args:
            nfft: FFT size, forwarded to ``UtilAudioSTFT.__init__``.
            hop_size: Hop size, forwarded to ``UtilAudioSTFT.__init__``.
            sample_rate: Sampling rate passed to librosa as ``sr``.
            mel_size: Number of mel bands.
            frequency_min: Lowest frequency of the filter bank.
            frequency_max: Highest frequency of the filter bank; ``None``
                falls back to ``sample_rate // 2``.
        """
        super().__init__(nfft, hop_size)

        self.sample_rate: int = sample_rate
        self.mel_size: int = mel_size
        self.frequency_min: float = frequency_min
        self.frequency_max: float = frequency_max if frequency_max is not None else sample_rate // 2

        # NOTE(review): self.nfft is not assigned in this class; presumably it is
        # set by UtilAudioSTFT.__init__ -- confirm.
        # [self.mel_size, self.nfft//2 + 1]
        self.mel_basis_np: ndarray = librosa_mel_fn(sr=self.sample_rate,
                                                    n_fft=self.nfft,
                                                    n_mels=self.mel_size,
                                                    fmin=self.frequency_min,
                                                    fmax=self.frequency_max)
        self.mel_basis_tensor: Tensor = torch.from_numpy(self.mel_basis_np).float()
        # The attribute name is misspelled ("frequncies") but it is public, so it
        # is kept as-is for backward compatibility.
        self.mel_frequncies = librosa.mel_frequencies(n_mels=self.mel_size,
                                                      fmin=self.frequency_min,
                                                      fmax=self.frequency_max)

    @staticmethod
    def get_default_mel_spec_config(sample_rate: int = 16000) -> dict:
        """Return default constructor keyword arguments for ``sample_rate``.

        Sample rates up to 24000 use nfft=1024 and 80 mel bands; higher rates
        use nfft=2048 and 128 mel bands. The hop size is a quarter of nfft and
        the frequency range is 0 .. sample_rate // 2.
        """
        low_rate = sample_rate <= 24000
        nfft: int = 1024 if low_rate else 2048
        mel_size: int = 80 if low_rate else 128
        return {
            'nfft': nfft,
            'hop_size': nfft // 4,
            'sample_rate': sample_rate,
            'mel_size': mel_size,
            'frequency_max': sample_rate // 2,
            'frequency_min': 0,
        }

    def spec_to_mel_spec(self, stft_mag: Union[ndarray, Tensor]) -> Union[ndarray, Tensor]:
        """Project a magnitude spectrogram onto the mel filter bank.

        Args:
            stft_mag: NumPy array or torch tensor that is matrix-multiplied
                from the left by the mel basis.

        Returns:
            The product of the mel basis and ``stft_mag``, of the same kind
            (array or tensor) as the input.

        Raises:
            TypeError: If ``stft_mag`` is neither an ndarray nor a Tensor.
        """
        if isinstance(stft_mag, np.ndarray):
            return np.matmul(self.mel_basis_np, stft_mag)
        if isinstance(stft_mag, torch.Tensor):
            # Keep the cached basis on the input's device so repeated calls do
            # not transfer it again.
            self.mel_basis_tensor = self.mel_basis_tensor.to(stft_mag.device)
            return torch.matmul(self.mel_basis_tensor, stft_mag)
        # Raise instead of print + exit(): library code must not terminate the
        # host process.
        raise TypeError(
            f"spec_to_mel_spec type error: expected ndarray or Tensor, got {type(stft_mag).__name__}"
        )

    def dynamic_range_compression(self, x, C=1, clip_val=1e-5):
        """Return ``log(max(x, clip_val) * C)`` element-wise.

        Args:
            x: NumPy array or torch tensor.
            C: Multiplier applied after clipping, before the logarithm.
            clip_val: Lower bound applied to ``x`` so the logarithm stays finite.

        Raises:
            TypeError: If ``x`` is neither an ndarray nor a Tensor.
        """
        if isinstance(x, np.ndarray):
            return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
        if isinstance(x, torch.Tensor):
            return torch.log(torch.clamp(x, min=clip_val) * C)
        # Raise instead of print + exit(): library code must not terminate the
        # host process.
        raise TypeError(
            f"dynamic_range_compression type error: expected ndarray or Tensor, got {type(x).__name__}"
        )

    def get_hifigan_mel_spec(self,
                             audio: Union[ndarray, Tensor],  # [Batch,Time]
                             return_type: str = 'Tensor'  # 'ndarray' or 'Tensor'
                             ) -> Union[ndarray, Tensor]:
        """Compute a log-compressed mel spectrogram of ``audio``.

        The magnitude STFT comes from the inherited ``stft_torch`` (its
        ``"mag"`` entry), is projected with ``spec_to_mel_spec`` and compressed
        with ``dynamic_range_compression``.

        Args:
            audio: Waveform(s); an ndarray is converted to a float tensor and
                inputs with fewer than two dimensions get leading axes added.
            return_type: ``'ndarray'`` returns a NumPy array; any other value
                returns the torch tensor.

        Returns:
            The log-mel spectrogram as an ndarray or a Tensor.
        """
        if isinstance(audio, ndarray):
            audio = torch.FloatTensor(audio)
        while audio.dim() < 2:
            audio = audio.unsqueeze(0)

        # Only report samples outside [-1, 1]; the audio is still processed.
        lowest = torch.min(audio)
        if lowest < -1.:
            print('min value is ', lowest)
        highest = torch.max(audio)
        if highest > 1.:
            print('max value is ', highest)

        spectrogram = self.stft_torch(audio)["mag"]
        mel_spec = self.spec_to_mel_spec(spectrogram)
        log_scale_mel = self.dynamic_range_compression(mel_spec)

        if return_type == 'ndarray':
            return log_scale_mel.cpu().detach().numpy()
        return log_scale_mel

    def mel_spec_plot(self,
                      save_path: str,  # '*.png'
                      mel_spec: ndarray,  # [mel_size, time]
                      fig_size: tuple = (8, 4),
                      dpi: int = 500) -> None:
        """Render ``mel_spec`` as an image and save it to ``save_path``.

        Args:
            save_path: Output path; its extension must be exactly ``.png``.
            mel_spec: 2-D data to plot; a Tensor is converted with
                ``UtilTorch.to_np`` first.
            fig_size: Figure size passed to ``plt.figure``.
            dpi: Resolution passed to ``savefig``.

        Raises:
            ValueError: If ``save_path`` does not end with ``.png``.
            ImportError: If matplotlib is not installed.
        """
        # Explicit raise instead of assert, which is stripped under ``python -O``.
        if os.path.splitext(save_path)[1] != ".png":
            raise ValueError("file extension should be '.png'")
        # The module-level import is optional; importing here gives a clear
        # ImportError instead of a NameError on ``plt`` when it is missing.
        import matplotlib.pyplot as plt

        if isinstance(mel_spec, Tensor):
            mel_spec = UtilTorch.to_np(mel_spec)
        fig = plt.figure(figsize=fig_size)
        try:
            plt.imshow(mel_spec, origin='lower', aspect='auto', cmap='viridis')
            plt.savefig(save_path, dpi=dpi)
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(fig)

    def f0_to_melbin(self,
                     f0: Tensor  # 1d f0 tensor
                     ) -> Tensor:
        """Map each f0 value to the index of a mel frequency bin.

        For every element the result is the index of the smallest entry of
        ``self.mel_frequncies`` that is not below it. Values above every mel
        frequency map to the last bin.

        Args:
            f0: 1-D tensor of frequencies; may be empty.

        Returns:
            Integer tensor of bin indices with one entry per element of ``f0``.
        """
        mel = torch.as_tensor(self.mel_frequncies, dtype=torch.float32, device=f0.device)
        # [len(f0), mel_size]; expand + clone (rather than repeat + reshape(n, -1))
        # also works when f0 is empty.
        candidates = mel.unsqueeze(0).expand(f0.shape[0], -1).clone()
        # Bins below the target frequency can never be selected.
        candidates[(candidates - f0.unsqueeze(-1)) < 0] = float('inf')
        # Rows where every bin was discarded: force argmin onto the last bin.
        above_all_bins = torch.all(torch.isinf(candidates), dim=1)
        candidates[above_all_bins, -1] = 0
        return torch.argmin(candidates, dim=1)