| import librosa |
| import numpy as np |
| import pyworld as pw |
| import pysptk.sptk as pysptk |
| import torch |
|
|
| from HParams import HParams |
class UtilWorldVocoder:
    """Utility wrapper around the WORLD vocoder: STFT helpers, dB (de)normalization,
    and MGC/MFSC compression of WORLD spectral parameters."""

    def __init__(self, h_params: HParams):
        """Cache the frequently-used preprocessing settings from *h_params*."""
        preprocess = h_params.preprocess
        self.h_params = h_params
        self.sample_rate = preprocess.sample_rate
        self.n_fft = preprocess.nfft
        self.hop_length = preprocess.hopsize
        self.window_size = self.n_fft
        # WORLD expects the frame period in milliseconds.
        self.world_frame_period = (self.hop_length / self.sample_rate) * 1000
|
|
| def mag_phase_stft(self,audio): |
| stft = librosa.stft(audio,n_fft=self.h_params.preprocess.nfft, hop_length=self.h_params.preprocess.hopsize) |
| mag = abs(stft) |
| phase = np.exp(1.j * np.angle(stft)) |
| return {"mag":mag,"phase":phase} |
| |
| |
| def dynamic_range_compression(self, x, C=1, clip_val=1e-5): |
| return np.log(np.clip(x, a_min=clip_val, a_max=None) * C) |
| |
| def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5): |
| return torch.log(torch.clamp(x, min=clip_val) * C) |
| |
| def normalize(self,x, min_db = -80.0, max_db = 20.0, clip_val = 0.8): |
| x = 2.0*(x - min_db)/(max_db - min_db) - 1.0 |
| x = torch.clamp(clip_val*x, -clip_val, clip_val) |
| return x |
|
|
| def denormalize(self, x, min_db = -80.0, max_db = 20.0, clip_val = 0.8): |
| x = x/clip_val |
| x = (max_db - min_db)*(x + 1.0)/2.0 + min_db |
| return x |
| |
| def get_pred_accom_by_subtract_pred_vocal(self,pred_vocal,is_pred_vocal_audio,mix_audio): |
| pred_vocal_mag = pred_vocal |
| if is_pred_vocal_audio: |
| pred_vocal_mag = self.mag_phase_stft(pred_vocal)["mag"] |
| mix_stft = self.mag_phase_stft(mix_audio) |
| mix_mag = mix_stft["mag"] |
| mix_phase = mix_stft["phase"] |
| pred_accom_mag = mix_mag - pred_vocal_mag |
| pred_accom_mag[pred_accom_mag < 0] = 0 |
| pred_accom = librosa.istft(pred_accom_mag*mix_phase,hop_length=self.h_params.preprocess.hopsize,length=len(mix_audio)) |
| return pred_accom |
| |
| def get_compressed_world_parameters_from_audio(self,audio_mono): |
| print("start: compressed_world_parameters_from_audio") |
| world_parameters = pw.wav2world(audio_mono.astype("double"), self.sample_rate, frame_period=self.world_frame_period) |
| |
| f0 = world_parameters[0] |
| f0_midi = self.pitch_to_midi(f0) |
| interpolated_f0_midi,not_pitch = self.interpolate_f0_midi_nan_value(f0_midi) |
|
|
| spectral_envelope = world_parameters[1] |
| spectral_envelope = 10*np.log10(spectral_envelope) |
|
|
| aperiodic = world_parameters[2] |
| aperiodic = 10.*np.log10(aperiodic**2) |
|
|
| if self.h_params.preprocess.compress_method_world_parameter == 'mfsc': |
| print("start: spectral sp_to_mfsc") |
| compressed_spectral = self.sp_to_mfsc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients,0.45) |
| print("start: aperiodic sp_to_mfsc") |
| compressed_aperiodic = self.sp_to_mfsc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients,0.45) |
| elif self.h_params.preprocess.compress_method_world_parameter == 'mgc': |
| print("start: spectral sp_to_mgc") |
| compressed_spectral = self.sp_to_mgc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients,0.45) |
| print("start: aperiodic sp_to_mgc") |
| compressed_aperiodic = self.sp_to_mgc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients,0.45) |
|
|
| return { "f0": np.transpose(interpolated_f0_midi),"not_pitch":np.transpose(not_pitch.astype(int)), "spectral": np.transpose(compressed_spectral), "aperiodic": np.transpose(compressed_aperiodic) } |
|
|
| def pitch_to_midi(self,frequency): |
| midi = 69 + 12 * np.log2(frequency/440) |
| return midi |
| |
| def midi_to_pitch(self,midi): |
| frequency = 440 * pow(2, (midi - 69) / 12) |
| return frequency |
| |
| def interpolate_f0_midi_nan_value(self,f0_midi): |
| infinite_conditional_index = np.isinf(f0_midi) |
| not_infinite_conditional_index = ~infinite_conditional_index |
| infinite_int_index = infinite_conditional_index.nonzero()[0] |
| not_infinite_int_index = (not_infinite_conditional_index).nonzero()[0] |
| |
| interpolated_f0_midi = f0_midi.copy() |
| interpolated_f0_midi[infinite_conditional_index] = np.interp(infinite_int_index,not_infinite_int_index,f0_midi[not_infinite_conditional_index]) |
| |
| interpolated_f0_midi = interpolated_f0_midi |
| not_pitch = infinite_conditional_index |
| |
| return (interpolated_f0_midi,not_pitch) |
| |
| def sp_to_mfsc(self,sp, ndim, fw, noise_floor_db=-120.0): |
| |
| mgc = self.sp_to_mgc(sp, ndim, fw, noise_floor_db) |
| mfsc = self.mgc_to_mfsc(mgc) |
| return mfsc |
| |
| def sp_to_mgc(self,sp, ndim, fw, noise_floor_db=-120.0): |
| |
| dtype = sp.dtype |
| sp = sp.astype(np.float64) |
| mgc = np.apply_along_axis(pysptk.mcep, 1, np.atleast_2d(sp), order=ndim-1, alpha=fw, maxiter=0, etype=1, eps=10**(noise_floor_db/10), min_det=0.0, itype=1) |
| if sp.ndim == 1: |
| mgc = mgc.flatten() |
| mgc = mgc.astype(dtype) |
| return mgc |
|
|
| def mgc_to_mfsc(self,mgc): |
| is_1d = mgc.ndim == 1 |
| mgc = np.atleast_2d(mgc) |
| ndim = mgc.shape[1] |
|
|
| |
| mgc1 = np.concatenate([mgc[:, :], mgc[:, -2:0:-1]], axis=-1) |
|
|
| |
| mgc1[:, 0] *= 2 |
| mgc1[:, ndim-1] *= 2 |
| |
| |
| mfsc = np.real(np.fft.fft(mgc1)) |
| mfsc = mfsc[:, :ndim] |
| mfsc = 10*mfsc/np.log(10) |
|
|
| if is_1d: |
| mfsc = mfsc.flatten() |
|
|
| return mfsc |
| |
| def mfsc_to_mgc(self,mfsc): |
| |
| is_1d = mfsc.ndim == 1 |
| mfsc = np.atleast_2d(mfsc) |
| ndim = mfsc.shape[1] |
|
|
| mfsc = mfsc/10*np.log(10) |
| mfsc1 = np.concatenate([mfsc[:, :], mfsc[:, -2:0:-1]], axis=-1) |
| mgc = np.real(np.fft.ifft(mfsc1)) |
| mgc[:, 0] /= 2 |
| mgc[:, ndim-1] /= 2 |
| mgc = mgc[:, :ndim] |
|
|
| if is_1d: |
| mgc = mgc.flatten() |
| |
| return mgc |
| |
| def mgc_to_sp(self,mgc, spec_size, fw): |
| dtype = mgc.dtype |
| mgc = mgc.astype(np.float64) |
| fftlen = 2*(spec_size - 1) |
| sp = np.apply_along_axis(pysptk.mgc2sp, 1, np.atleast_2d(mgc), alpha=fw, gamma=0.0, fftlen=fftlen) |
| sp = 20*np.real(sp)/np.log(10) |
| if mgc.ndim == 1: |
| sp = sp.flatten() |
| sp = sp.astype(dtype) |
| return sp |
| |
| def get_audio_from_compressed_world_parameters(self, f0, not_pitch, spectral_compressed, aperiodic_compressed): |
| print("start: audio_from_compressed_world_parameters") |
| |
| is_pitch = (1-np.transpose(not_pitch)) |
| interpolated_f0 = self.midi_to_pitch(np.transpose(f0)) |
| f0_hz = (interpolated_f0 * is_pitch).astype('double') |
|
|
| spectral = np.transpose(spectral_compressed) |
| aperiodic = np.transpose(aperiodic_compressed) |
|
|
| if self.h_params.preprocess.compress_method_world_parameter == 'mfsc': |
| print("start: spectral mfsc_to_mgc") |
| spectral = self.mfsc_to_mgc(spectral) |
| print("start: aperiodic mfsc_to_mgc") |
| aperiodic = self.mfsc_to_mgc(aperiodic) |
| |
| print("start: spectral mgc_to_sp") |
| spectral = self.mgc_to_sp(spectral, self.h_params.preprocess.world_parameter_dimension, 0.45) |
| print("start: aperiodic mgc_to_sp") |
| aperiodic = self.mgc_to_sp(aperiodic, self.h_params.preprocess.world_parameter_dimension, 0.45) |
| |
| spectral = (10**(spectral/10)).astype('double') |
| aperiodic = (10**(aperiodic/20)).astype('double') |
|
|
| print("start: synthesize audio") |
| audio = pw.synthesize(f0_hz,spectral,aperiodic,self.sample_rate,self.world_frame_period) |
| |
| return audio |
|
|
| |
| def torch_A_weighting(self, frequencies, min_db = -45.0): |
| """ |
| Compute A-weighting weights in Decibel scale (codes from librosa) and |
| transform into amplitude domain (with DB-SPL equation). |
| |
| Argument: |
| frequencies : tensor of frequencies to return amplitude weight |
| min_db : mininum decibel weight. appropriate min_db value is important, as |
| exp/log calculation might raise numeric error with float32 type. |
| |
| Returns: |
| weights : tensor of amplitude attenuation weights corresponding to the frequencies tensor. |
| """ |
| |
| |
| frequencies_squared = frequencies ** 2 |
| const = torch.tensor([12200, 20.6, 107.7, 737.9]) ** 2.0 |
| weights_in_db = 2.0 + 20.0 * (torch.log10(const[0]) + 4 * torch.log10(frequencies) |
| - torch.log10(frequencies_squared + const[0]) |
| - torch.log10(frequencies_squared + const[1]) |
| - 0.5 * torch.log10(frequencies_squared + const[2]) |
| - 0.5 * torch.log10(frequencies_squared + const[3])) |
| |
| |
| if min_db is not None: |
| weights_in_db = torch.max(weights_in_db, torch.tensor([min_db], dtype = torch.float32)) |
| |
| |
| weights = torch.exp(torch.log(torch.tensor([10.], dtype = torch.float32)) * weights_in_db / 10) |
| |
| return weights |
| |
| |
| |
if __name__ == '__main__':
    # Smoke test: build the vocoder util from default hyper-parameters.
    params = HParams()
    vocoder_util = UtilWorldVocoder(params)
| |
|
|
|
|