import librosa
import numpy as np
import pyworld as pw
import pysptk.sptk as pysptk
import torch
from HParams import HParams
class UtilWorldVocoder:
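    """Utility wrapper around the WORLD vocoder (pyworld): STFT helpers plus
    MGC/MFSC compression of the spectral envelope and aperiodicity via pysptk."""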
def __init__(self,h_params:HParams):
self.h_params = h_params
self.sample_rate = self.h_params.preprocess.sample_rate
self.n_fft = self.h_params.preprocess.nfft
self.hop_length = self.h_params.preprocess.hopsize
self.window_size = self.n_fft
        self.world_frame_period = (self.hop_length / self.sample_rate) * 1000  # hop size in ms, as WORLD expects
def mag_phase_stft(self,audio):
        stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length)
        mag = np.abs(stft)
phase = np.exp(1.j * np.angle(stft))
return {"mag":mag,"phase":phase}
def dynamic_range_compression(self, x, C=1, clip_val=1e-5):
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5):
return torch.log(torch.clamp(x, min=clip_val) * C)
def normalize(self,x, min_db = -80.0, max_db = 20.0, clip_val = 0.8):
x = 2.0*(x - min_db)/(max_db - min_db) - 1.0
x = torch.clamp(clip_val*x, -clip_val, clip_val)
return x
def denormalize(self, x, min_db = -80.0, max_db = 20.0, clip_val = 0.8):
x = x/clip_val
x = (max_db - min_db)*(x + 1.0)/2.0 + min_db
return x
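    # Example (sketch): normalize() maps dB values in [min_db, max_db] onto
    # [-clip_val, clip_val]; denormalize() inverts it for unclipped values.
    # With a hypothetical tensor x_db:
    #   x = vocoder.normalize(x_db)        # e.g. -80 dB -> -0.8, +20 dB -> 0.8
    #   x_db_rec = vocoder.denormalize(x)  # recovers x_db up to clipping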
def get_pred_accom_by_subtract_pred_vocal(self,pred_vocal,is_pred_vocal_audio,mix_audio):
pred_vocal_mag = pred_vocal
if is_pred_vocal_audio:
pred_vocal_mag = self.mag_phase_stft(pred_vocal)["mag"]
mix_stft = self.mag_phase_stft(mix_audio)
mix_mag = mix_stft["mag"]
mix_phase = mix_stft["phase"]
        # Clamp negative magnitude bins to zero, then resynthesize the
        # accompaniment estimate with the mixture phase.
        pred_accom_mag = mix_mag - pred_vocal_mag
        pred_accom_mag[pred_accom_mag < 0] = 0
        pred_accom = librosa.istft(pred_accom_mag * mix_phase, hop_length=self.hop_length, length=len(mix_audio))
return pred_accom
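    # Example (sketch, hypothetical variables): estimate the accompaniment
    # from a predicted vocal waveform and the mixture waveform:
    #   accom = vocoder.get_pred_accom_by_subtract_pred_vocal(
    #       pred_vocal_audio, is_pred_vocal_audio=True, mix_audio=mix_audio)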
def get_compressed_world_parameters_from_audio(self,audio_mono):
print("start: compressed_world_parameters_from_audio")
world_parameters = pw.wav2world(audio_mono.astype("double"), self.sample_rate, frame_period=self.world_frame_period)
f0 = world_parameters[0]
f0_midi = self.pitch_to_midi(f0)
interpolated_f0_midi,not_pitch = self.interpolate_f0_midi_nan_value(f0_midi)
        spectral_envelope = world_parameters[1]
        spectral_envelope = 10 * np.log10(spectral_envelope)  # power spectrum -> dB
        aperiodic = world_parameters[2]
        aperiodic = 10. * np.log10(aperiodic ** 2)  # amplitude ratio -> dB (i.e. 20*log10)
if self.h_params.preprocess.compress_method_world_parameter == 'mfsc':
print("start: spectral sp_to_mfsc")
compressed_spectral = self.sp_to_mfsc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients,0.45)
print("start: aperiodic sp_to_mfsc")
compressed_aperiodic = self.sp_to_mfsc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients,0.45)
        elif self.h_params.preprocess.compress_method_world_parameter == 'mgc':
            print("start: spectral sp_to_mgc")
            compressed_spectral = self.sp_to_mgc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients, 0.45)
            print("start: aperiodic sp_to_mgc")
            compressed_aperiodic = self.sp_to_mgc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients, 0.45)
        else:
            # Guard against silently returning undefined values for an unknown method.
            raise ValueError(f"unknown compress_method_world_parameter: {self.h_params.preprocess.compress_method_world_parameter}")
        return {"f0": np.transpose(interpolated_f0_midi),
                "not_pitch": np.transpose(not_pitch.astype(int)),
                "spectral": np.transpose(compressed_spectral),
                "aperiodic": np.transpose(compressed_aperiodic)}
def pitch_to_midi(self,frequency):
midi = 69 + 12 * np.log2(frequency/440)
return midi
def midi_to_pitch(self,midi):
frequency = 440 * pow(2, (midi - 69) / 12)
return frequency
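    # Worked example: A4 = 440 Hz -> MIDI 69 -> 440 Hz; doubling the frequency
    # adds 12 MIDI steps (880 Hz -> 81). Unvoiced frames (f0 == 0) map to -inf
    # and are filled in by interpolate_f0_midi_nan_value() below.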
    def interpolate_f0_midi_nan_value(self,f0_midi):
        # Unvoiced frames arrive as -inf (log2 of f0 == 0); fill them by linear
        # interpolation over the voiced frames and return an unvoiced mask.
        infinite_conditional_index = np.isinf(f0_midi)
        not_infinite_conditional_index = ~infinite_conditional_index
        infinite_int_index = infinite_conditional_index.nonzero()[0]
        not_infinite_int_index = not_infinite_conditional_index.nonzero()[0]
        interpolated_f0_midi = f0_midi.copy()
        interpolated_f0_midi[infinite_conditional_index] = np.interp(infinite_int_index, not_infinite_int_index, f0_midi[not_infinite_conditional_index])
        not_pitch = infinite_conditional_index
        return (interpolated_f0_midi, not_pitch)
def sp_to_mfsc(self,sp, ndim, fw, noise_floor_db=-120.0):
# helper function, sp->mgc->mfsc in a single step
mgc = self.sp_to_mgc(sp, ndim, fw, noise_floor_db)
mfsc = self.mgc_to_mfsc(mgc)
return mfsc
def sp_to_mgc(self,sp, ndim, fw, noise_floor_db=-120.0):
# HTS uses -80, but we shift WORLD/STRAIGHT by -20 dB (so would be -100); use a little more headroom (SPTK uses doubles internally, so eps 1e-12 should still be OK)
dtype = sp.dtype
sp = sp.astype(np.float64) # required for pysptk
        mgc = np.apply_along_axis(
            pysptk.mcep, 1, np.atleast_2d(sp),
            order=ndim-1, alpha=fw, maxiter=0, etype=1,
            eps=10**(noise_floor_db/10), min_det=0.0, itype=1)
if sp.ndim == 1:
mgc = mgc.flatten()
mgc = mgc.astype(dtype)
return mgc
def mgc_to_mfsc(self,mgc):
is_1d = mgc.ndim == 1
mgc = np.atleast_2d(mgc)
ndim = mgc.shape[1]
# mirror cepstrum
mgc1 = np.concatenate([mgc[:, :], mgc[:, -2:0:-1]], axis=-1)
# re-scale 'dc' and 'nyquist' cepstral bins (see mcep())
mgc1[:, 0] *= 2
mgc1[:, ndim-1] *= 2
# fft, truncate, to decibels
mfsc = np.real(np.fft.fft(mgc1))
mfsc = mfsc[:, :ndim]
mfsc = 10*mfsc/np.log(10)
if is_1d:
mfsc = mfsc.flatten()
return mfsc
def mfsc_to_mgc(self,mfsc):
# mfsc -> mgc -> sp is a much slower alternative to mfsc_to_sp()
is_1d = mfsc.ndim == 1
mfsc = np.atleast_2d(mfsc)
ndim = mfsc.shape[1]
mfsc = mfsc/10*np.log(10)
mfsc1 = np.concatenate([mfsc[:, :], mfsc[:, -2:0:-1]], axis=-1)
mgc = np.real(np.fft.ifft(mfsc1))
mgc[:, 0] /= 2
mgc[:, ndim-1] /= 2
mgc = mgc[:, :ndim]
if is_1d:
mgc = mgc.flatten()
return mgc
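    # Example (sketch): mgc_to_mfsc() and mfsc_to_mgc() invert each other up
    # to floating-point error, because the mirrored cepstrum is symmetric and
    # its first ndim FFT bins carry all of the information:
    #   mfsc = vocoder.mgc_to_mfsc(mgc)      # hypothetical mgc array
    #   mgc_rec = vocoder.mfsc_to_mgc(mfsc)  # ~= mgc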
def mgc_to_sp(self,mgc, spec_size, fw):
dtype = mgc.dtype
mgc = mgc.astype(np.float64) # required for pysptk
fftlen = 2*(spec_size - 1)
sp = np.apply_along_axis(pysptk.mgc2sp, 1, np.atleast_2d(mgc), alpha=fw, gamma=0.0, fftlen=fftlen)
sp = 20*np.real(sp)/np.log(10)
if mgc.ndim == 1:
sp = sp.flatten()
sp = sp.astype(dtype)
return sp
def get_audio_from_compressed_world_parameters(self, f0, not_pitch, spectral_compressed, aperiodic_compressed):
print("start: audio_from_compressed_world_parameters")
        # Zero out f0 in unvoiced frames before WORLD synthesis.
        is_pitch = (1 - np.transpose(not_pitch))
        interpolated_f0 = self.midi_to_pitch(np.transpose(f0))
        f0_hz = (interpolated_f0 * is_pitch).astype('double')
spectral = np.transpose(spectral_compressed)
aperiodic = np.transpose(aperiodic_compressed)
if self.h_params.preprocess.compress_method_world_parameter == 'mfsc':
print("start: spectral mfsc_to_mgc")
spectral = self.mfsc_to_mgc(spectral)
print("start: aperiodic mfsc_to_mgc")
aperiodic = self.mfsc_to_mgc(aperiodic)
print("start: spectral mgc_to_sp")
spectral = self.mgc_to_sp(spectral, self.h_params.preprocess.world_parameter_dimension, 0.45)
print("start: aperiodic mgc_to_sp")
aperiodic = self.mgc_to_sp(aperiodic, self.h_params.preprocess.world_parameter_dimension, 0.45)
        spectral = (10**(spectral/10)).astype('double')    # dB -> power
        aperiodic = (10**(aperiodic/20)).astype('double')  # dB -> amplitude
print("start: synthesize audio")
audio = pw.synthesize(f0_hz,spectral,aperiodic,self.sample_rate,self.world_frame_period)
return audio
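    # Example (sketch): this inverts get_compressed_world_parameters_from_audio(),
    # so analysis followed by synthesis yields an approximate reconstruction;
    # see the sketch under __main__ at the bottom of this file.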
def torch_A_weighting(self, frequencies, min_db = -45.0):
"""
Compute A-weighting weights in Decibel scale (codes from librosa) and
transform into amplitude domain (with DB-SPL equation).
Argument:
frequencies : tensor of frequencies to return amplitude weight
min_db : mininum decibel weight. appropriate min_db value is important, as
exp/log calculation might raise numeric error with float32 type.
Returns:
weights : tensor of amplitude attenuation weights corresponding to the frequencies tensor.
"""
# Calculate A-weighting in Decibel scale.
frequencies_squared = frequencies ** 2
const = torch.tensor([12200, 20.6, 107.7, 737.9]) ** 2.0
weights_in_db = 2.0 + 20.0 * (torch.log10(const[0]) + 4 * torch.log10(frequencies)
- torch.log10(frequencies_squared + const[0])
- torch.log10(frequencies_squared + const[1])
- 0.5 * torch.log10(frequencies_squared + const[2])
- 0.5 * torch.log10(frequencies_squared + const[3]))
# Set minimum Decibel weight.
if min_db is not None:
weights_in_db = torch.max(weights_in_db, torch.tensor([min_db], dtype = torch.float32))
        # Transform decibel weights to linear scale: weights = 10 ** (dB / 10).
        weights = torch.exp(torch.log(torch.tensor([10.], dtype = torch.float32)) * weights_in_db / 10)
return weights
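    # Example (sketch, hypothetical tensors): weight an STFT magnitude by
    # A-weighting; bin frequencies follow from the model's STFT settings:
    #   freqs = torch.linspace(1.0, sr / 2, n_fft // 2 + 1)
    #   weighted = mag * vocoder.torch_A_weighting(freqs).unsqueeze(-1)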
if __name__ == '__main__':
    h_params = HParams()
    vocoder = UtilWorldVocoder(h_params)
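    # Round-trip sketch (assumptions: 'example.wav' is a placeholder path and
    # HParams supplies the preprocess fields used above; uncomment to run):
    # audio, _ = librosa.load("example.wav", sr=vocoder.sample_rate, mono=True)
    # params = vocoder.get_compressed_world_parameters_from_audio(audio)
    # resynth = vocoder.get_audio_from_compressed_world_parameters(
    #     params["f0"], params["not_pitch"], params["spectral"], params["aperiodic"])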