import librosa
import numpy as np
import pyworld as pw
import pysptk.sptk as pysptk
import torch

from HParams import HParams


class UtilWorldVocoder:
    """Utility wrapper around the WORLD vocoder (pyworld) and SPTK mel-cepstral compression."""
    def __init__(self, h_params: HParams):
        self.h_params = h_params
        self.sample_rate = self.h_params.preprocess.sample_rate
        self.n_fft = self.h_params.preprocess.nfft
        self.hop_length = self.h_params.preprocess.hopsize
        self.window_size = self.n_fft
        # WORLD frame period in milliseconds, matched to the STFT hop size.
        self.world_frame_period = (self.hop_length / self.sample_rate) * 1000

    def mag_phase_stft(self, audio):
        # Split the complex STFT into a magnitude spectrogram and a unit-magnitude phase factor.
        stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length)
        mag = np.abs(stft)
        phase = np.exp(1.j * np.angle(stft))
        return {"mag": mag, "phase": phase}
    
    
    def dynamic_range_compression(self, x, C=1, clip_val=1e-5):
        return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
    
    def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5):
        return torch.log(torch.clamp(x, min=clip_val) * C)
    
    def normalize(self, x, min_db=-80.0, max_db=20.0, clip_val=0.8):
        # Map dB values in [min_db, max_db] to [-1, 1], scale by clip_val, and clamp to [-clip_val, clip_val].
        x = 2.0*(x - min_db)/(max_db - min_db) - 1.0
        x = torch.clamp(clip_val*x, -clip_val, clip_val)
        return x

    def denormalize(self, x, min_db=-80.0, max_db=20.0, clip_val=0.8):
        # Inverse of normalize(): map [-clip_val, clip_val] back to dB values in [min_db, max_db].
        x = x/clip_val
        x = (max_db - min_db)*(x + 1.0)/2.0 + min_db
        return x
        
    def get_pred_accom_by_subtract_pred_vocal(self, pred_vocal, is_pred_vocal_audio, mix_audio):
        # Estimate the accompaniment by subtracting the predicted vocal magnitude from the
        # mixture magnitude and resynthesizing with the mixture phase.
        pred_vocal_mag = pred_vocal
        if is_pred_vocal_audio:
            pred_vocal_mag = self.mag_phase_stft(pred_vocal)["mag"]
        mix_stft = self.mag_phase_stft(mix_audio)
        mix_mag = mix_stft["mag"]
        mix_phase = mix_stft["phase"]
        pred_accom_mag = mix_mag - pred_vocal_mag
        pred_accom_mag[pred_accom_mag < 0] = 0  # clip negative magnitudes introduced by the subtraction
        pred_accom = librosa.istft(pred_accom_mag*mix_phase, hop_length=self.hop_length, length=len(mix_audio))
        return pred_accom
    
    def get_compressed_world_parameters_from_audio(self, audio_mono):
        print("start: compressed_world_parameters_from_audio")
        # WORLD analysis: per-frame f0 (Hz), spectral envelope, and aperiodicity.
        world_parameters = pw.wav2world(audio_mono.astype("double"), self.sample_rate, frame_period=self.world_frame_period)

        f0 = world_parameters[0]
        f0_midi = self.pitch_to_midi(f0)
        interpolated_f0_midi, not_pitch = self.interpolate_f0_midi_nan_value(f0_midi)

        # Convert the spectral envelope and aperiodicity to dB before compression.
        spectral_envelope = world_parameters[1]
        spectral_envelope = 10*np.log10(spectral_envelope)

        aperiodic = world_parameters[2]
        aperiodic = 10.*np.log10(aperiodic**2)

        if self.h_params.preprocess.compress_method_world_parameter == 'mfsc':
            print("start: spectral sp_to_mfsc")
            compressed_spectral = self.sp_to_mfsc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients, 0.45)
            print("start: aperiodic sp_to_mfsc")
            compressed_aperiodic = self.sp_to_mfsc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients, 0.45)
        elif self.h_params.preprocess.compress_method_world_parameter == 'mgc':
            print("start: spectral sp_to_mgc")
            compressed_spectral = self.sp_to_mgc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients, 0.45)
            print("start: aperiodic sp_to_mgc")
            compressed_aperiodic = self.sp_to_mgc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients, 0.45)
        else:
            raise ValueError(f"Unknown compress_method_world_parameter: {self.h_params.preprocess.compress_method_world_parameter}")

        return {"f0": np.transpose(interpolated_f0_midi), "not_pitch": np.transpose(not_pitch.astype(int)), "spectral": np.transpose(compressed_spectral), "aperiodic": np.transpose(compressed_aperiodic)}

    def pitch_to_midi(self, frequency):
        # 0 Hz (unvoiced frames) maps to -inf, which is later filled in by interpolate_f0_midi_nan_value().
        midi = 69 + 12 * np.log2(frequency/440)
        return midi

    def midi_to_pitch(self, midi):
        frequency = 440 * pow(2, (midi - 69) / 12)
        return frequency
    
    def interpolate_f0_midi_nan_value(self, f0_midi):
        # Unvoiced frames carry -inf after pitch_to_midi(); fill them by linear interpolation
        # over the voiced frames and return a boolean mask marking the unvoiced frames.
        infinite_conditional_index = np.isinf(f0_midi)
        not_infinite_conditional_index = ~infinite_conditional_index
        infinite_int_index = infinite_conditional_index.nonzero()[0]
        not_infinite_int_index = not_infinite_conditional_index.nonzero()[0]

        interpolated_f0_midi = f0_midi.copy()
        interpolated_f0_midi[infinite_conditional_index] = np.interp(infinite_int_index, not_infinite_int_index, f0_midi[not_infinite_conditional_index])

        not_pitch = infinite_conditional_index

        return (interpolated_f0_midi, not_pitch)
    
    def sp_to_mfsc(self, sp, ndim, fw, noise_floor_db=-120.0):
        # helper function, sp->mgc->mfsc in a single step
        mgc = self.sp_to_mgc(sp, ndim, fw, noise_floor_db)
        mfsc = self.mgc_to_mfsc(mgc)
        return mfsc

    def sp_to_mgc(self, sp, ndim, fw, noise_floor_db=-120.0):
        # HTS uses -80, but we shift WORLD/STRAIGHT by -20 dB (so would be -100); use a little more headroom (SPTK uses doubles internally, so eps 1e-12 should still be OK)
        dtype = sp.dtype
        sp = sp.astype(np.float64)  # required for pysptk
        mgc = np.apply_along_axis(pysptk.mcep, 1, np.atleast_2d(sp), order=ndim-1, alpha=fw, maxiter=0, etype=1, eps=10**(noise_floor_db/10), min_det=0.0, itype=1)
        if sp.ndim == 1:
            mgc = mgc.flatten()
        mgc = mgc.astype(dtype)
        return mgc

    def mgc_to_mfsc(self, mgc):
        is_1d = mgc.ndim == 1
        mgc = np.atleast_2d(mgc)
        ndim = mgc.shape[1]

        # mirror cepstrum
        mgc1 = np.concatenate([mgc[:, :], mgc[:, -2:0:-1]], axis=-1)

        # re-scale 'dc' and 'nyquist' cepstral bins (see mcep())
        mgc1[:, 0] *= 2
        mgc1[:, ndim-1] *= 2
        
        # fft, truncate, to decibels
        mfsc = np.real(np.fft.fft(mgc1))
        mfsc = mfsc[:, :ndim]
        mfsc = 10*mfsc/np.log(10)

        if is_1d:
            mfsc = mfsc.flatten()

        return mfsc
    
    def mfsc_to_mgc(self, mfsc):
        # mfsc -> mgc -> sp is a much slower alternative to mfsc_to_sp()
        is_1d = mfsc.ndim == 1
        mfsc = np.atleast_2d(mfsc)
        ndim = mfsc.shape[1]

        mfsc = mfsc/10*np.log(10)
        mfsc1 = np.concatenate([mfsc[:, :], mfsc[:, -2:0:-1]], axis=-1)
        mgc = np.real(np.fft.ifft(mfsc1))
        mgc[:, 0] /= 2
        mgc[:, ndim-1] /= 2
        mgc = mgc[:, :ndim]

        if is_1d:
            mgc = mgc.flatten()
        
        return mgc
    
    def mgc_to_sp(self, mgc, spec_size, fw):
        dtype = mgc.dtype
        mgc = mgc.astype(np.float64)  # required for pysptk
        fftlen = 2*(spec_size - 1)
        sp = np.apply_along_axis(pysptk.mgc2sp, 1, np.atleast_2d(mgc), alpha=fw, gamma=0.0, fftlen=fftlen)
        sp = 20*np.real(sp)/np.log(10)
        if mgc.ndim == 1:
            sp = sp.flatten()
        sp = sp.astype(dtype)
        return sp
    
    def get_audio_from_compressed_world_parameters(self, f0, not_pitch, spectral_compressed, aperiodic_compressed):
        print("start: audio_from_compressed_world_parameters")

        # Zero out f0 at unvoiced frames so WORLD treats them as unvoiced during synthesis.
        is_pitch = (1 - np.transpose(not_pitch))
        interpolated_f0 = self.midi_to_pitch(np.transpose(f0))
        f0_hz = (interpolated_f0 * is_pitch).astype('double')

        spectral = np.transpose(spectral_compressed)
        aperiodic = np.transpose(aperiodic_compressed)

        if self.h_params.preprocess.compress_method_world_parameter == 'mfsc':
            print("start: spectral mfsc_to_mgc")
            spectral = self.mfsc_to_mgc(spectral)
            print("start: aperiodic mfsc_to_mgc")
            aperiodic = self.mfsc_to_mgc(aperiodic)

        print("start: spectral mgc_to_sp")
        spectral = self.mgc_to_sp(spectral, self.h_params.preprocess.world_parameter_dimension, 0.45)
        print("start: aperiodic mgc_to_sp")
        aperiodic = self.mgc_to_sp(aperiodic, self.h_params.preprocess.world_parameter_dimension, 0.45)

        # Undo the dB scaling applied in get_compressed_world_parameters_from_audio():
        # the spectral envelope was stored as 10*log10(sp), the aperiodicity as 20*log10(ap).
        spectral = (10**(spectral/10)).astype('double')
        aperiodic = (10**(aperiodic/20)).astype('double')

        print("start: synthesize audio")
        audio = pw.synthesize(f0_hz, spectral, aperiodic, self.sample_rate, self.world_frame_period)

        return audio

    
    def torch_A_weighting(self, frequencies, min_db=-45.0):
        """
        Compute A-weighting weights on the decibel scale (adapted from librosa) and
        transform them into linear-scale weights.

        Arguments:
            frequencies : tensor of frequencies for which to return amplitude weights
            min_db : minimum decibel weight. An appropriate min_db value is important,
                since the exp/log calculation can run into numerical issues in float32.

        Returns:
            weights : tensor of attenuation weights corresponding to the frequencies tensor.
        """

        # Calculate A-weighting on the decibel scale.
        frequencies_squared = frequencies ** 2
        const = torch.tensor([12200, 20.6, 107.7, 737.9]) ** 2.0
        weights_in_db = 2.0 + 20.0 * (torch.log10(const[0]) + 4 * torch.log10(frequencies)
                               - torch.log10(frequencies_squared + const[0])
                               - torch.log10(frequencies_squared + const[1])
                               - 0.5 * torch.log10(frequencies_squared + const[2])
                               - 0.5 * torch.log10(frequencies_squared + const[3]))

        # Clamp to the minimum decibel weight.
        if min_db is not None:
            weights_in_db = torch.max(weights_in_db, torch.tensor([min_db], dtype=torch.float32))

        # Transform the decibel-scale weights to linear-scale weights (10 ** (dB / 10)).
        weights = torch.exp(torch.log(torch.tensor([10.], dtype=torch.float32)) * weights_in_db / 10)

        return weights
    
    
        
if __name__ == '__main__':
    pa = HParams()
    wo = UtilWorldVocoder(pa)
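
    # Minimal usage sketch, assuming the HParams fields used by UtilWorldVocoder
    # (sample_rate, nfft, hopsize, compress_method_world_parameter, num_spectral_coefficients,
    # num_aperiodic_coefficients, world_parameter_dimension) are populated;
    # "example.wav" is a placeholder path, not a file shipped with the repo.
    audio, _ = librosa.load("example.wav", sr=wo.sample_rate, mono=True)

    # Analyse into compressed WORLD parameters, then resynthesize to check the round trip.
    params = wo.get_compressed_world_parameters_from_audio(audio)
    reconstructed = wo.get_audio_from_compressed_world_parameters(
        params["f0"], params["not_pitch"], params["spectral"], params["aperiodic"])
    print("reconstructed length (samples):", len(reconstructed))

    # A-weighting for the STFT bin centre frequencies (the DC bin is skipped to avoid log10(0)).
    bin_freqs = torch.linspace(0, wo.sample_rate / 2, wo.n_fft // 2 + 1)[1:]
    a_weights = wo.torch_A_weighting(bin_freqs)
    print("A-weighting range:", float(a_weights.min()), "-", float(a_weights.max()))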