import librosa
import numpy as np
import pyworld as pw
import pysptk.sptk as pysptk
import torch
from HParams import HParams
class UtilWorldVocoder:
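    """Utility wrapper around the WORLD vocoder (pyworld): STFT helpers plus
    MGC/MFSC compression of the spectral envelope and aperiodicity via pysptk."""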
def __init__(self,h_params:HParams):
self.h_params = h_params
self.sample_rate = self.h_params.preprocess.sample_rate
self.n_fft = self.h_params.preprocess.nfft
self.hop_length = self.h_params.preprocess.hopsize
self.window_size = self.n_fft
        self.world_frame_period = (self.hop_length / self.sample_rate) * 1000  # hop size in ms, as WORLD expects
def mag_phase_stft(self,audio):
        stft = librosa.stft(audio, n_fft=self.n_fft, hop_length=self.hop_length)
        mag = np.abs(stft)
phase = np.exp(1.j * np.angle(stft))
return {"mag":mag,"phase":phase}
def dynamic_range_compression(self, x, C=1, clip_val=1e-5):
return np.log(np.clip(x, a_min=clip_val, a_max=None) * C)
def dynamic_range_compression_torch(self, x, C=1, clip_val=1e-5):
return torch.log(torch.clamp(x, min=clip_val) * C)
def normalize(self,x, min_db = -80.0, max_db = 20.0, clip_val = 0.8):
x = 2.0*(x - min_db)/(max_db - min_db) - 1.0
x = torch.clamp(clip_val*x, -clip_val, clip_val)
return x
def denormalize(self, x, min_db = -80.0, max_db = 20.0, clip_val = 0.8):
x = x/clip_val
x = (max_db - min_db)*(x + 1.0)/2.0 + min_db
return x
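    # Example (sketch): normalize() maps dB values in [min_db, max_db] onto
    # [-clip_val, clip_val]; denormalize() inverts it for unclipped values.
    # With a hypothetical tensor x_db:
    #   x = vocoder.normalize(x_db)        # e.g. -80 dB -> -0.8, +20 dB -> 0.8
    #   x_db_rec = vocoder.denormalize(x)  # recovers x_db up to clipping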
def get_pred_accom_by_subtract_pred_vocal(self,pred_vocal,is_pred_vocal_audio,mix_audio):
pred_vocal_mag = pred_vocal
if is_pred_vocal_audio:
pred_vocal_mag = self.mag_phase_stft(pred_vocal)["mag"]
mix_stft = self.mag_phase_stft(mix_audio)
mix_mag = mix_stft["mag"]
mix_phase = mix_stft["phase"]
        # Clamp negative magnitude bins to zero, then resynthesize the
        # accompaniment estimate with the mixture phase.
        pred_accom_mag = mix_mag - pred_vocal_mag
        pred_accom_mag[pred_accom_mag < 0] = 0
        pred_accom = librosa.istft(pred_accom_mag * mix_phase, hop_length=self.hop_length, length=len(mix_audio))
return pred_accom
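    # Example (sketch, hypothetical variables): estimate the accompaniment
    # from a predicted vocal waveform and the mixture waveform:
    #   accom = vocoder.get_pred_accom_by_subtract_pred_vocal(
    #       pred_vocal_audio, is_pred_vocal_audio=True, mix_audio=mix_audio)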
def get_compressed_world_parameters_from_audio(self,audio_mono):
print("start: compressed_world_parameters_from_audio")
world_parameters = pw.wav2world(audio_mono.astype("double"), self.sample_rate, frame_period=self.world_frame_period)
f0 = world_parameters[0]
f0_midi = self.pitch_to_midi(f0)
interpolated_f0_midi,not_pitch = self.interpolate_f0_midi_nan_value(f0_midi)
        spectral_envelope = world_parameters[1]
        spectral_envelope = 10 * np.log10(spectral_envelope)  # power spectrum -> dB
        aperiodic = world_parameters[2]
        aperiodic = 10. * np.log10(aperiodic ** 2)  # amplitude ratio -> dB (i.e. 20*log10)
if self.h_params.preprocess.compress_method_world_parameter == 'mfsc':
print("start: spectral sp_to_mfsc")
compressed_spectral = self.sp_to_mfsc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients,0.45)
print("start: aperiodic sp_to_mfsc")
compressed_aperiodic = self.sp_to_mfsc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients,0.45)
        elif self.h_params.preprocess.compress_method_world_parameter == 'mgc':
            print("start: spectral sp_to_mgc")
            compressed_spectral = self.sp_to_mgc(spectral_envelope, self.h_params.preprocess.num_spectral_coefficients, 0.45)
            print("start: aperiodic sp_to_mgc")
            compressed_aperiodic = self.sp_to_mgc(aperiodic, self.h_params.preprocess.num_aperiodic_coefficients, 0.45)
        else:
            # Guard against silently returning undefined values for an unknown method.
            raise ValueError(f"unknown compress_method_world_parameter: {self.h_params.preprocess.compress_method_world_parameter}")
        return {"f0": np.transpose(interpolated_f0_midi),
                "not_pitch": np.transpose(not_pitch.astype(int)),
                "spectral": np.transpose(compressed_spectral),
                "aperiodic": np.transpose(compressed_aperiodic)}
def pitch_to_midi(self,frequency):
midi = 69 + 12 * np.log2(frequency/440)
return midi
def midi_to_pitch(self,midi):
frequency = 440 * pow(2, (midi - 69) / 12)
return frequency
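    # Worked example: A4 = 440 Hz -> MIDI 69 -> 440 Hz; doubling the frequency
    # adds 12 MIDI steps (880 Hz -> 81). Unvoiced frames (f0 == 0) map to -inf
    # and are filled in by interpolate_f0_midi_nan_value() below.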
    def interpolate_f0_midi_nan_value(self,f0_midi):
        # Unvoiced frames arrive as -inf (log2 of f0 == 0); fill them by linear
        # interpolation over the voiced frames and return an unvoiced mask.
        infinite_conditional_index = np.isinf(f0_midi)
        not_infinite_conditional_index = ~infinite_conditional_index
        infinite_int_index = infinite_conditional_index.nonzero()[0]
        not_infinite_int_index = not_infinite_conditional_index.nonzero()[0]
        interpolated_f0_midi = f0_midi.copy()
        interpolated_f0_midi[infinite_conditional_index] = np.interp(infinite_int_index, not_infinite_int_index, f0_midi[not_infinite_conditional_index])
        not_pitch = infinite_conditional_index
        return (interpolated_f0_midi, not_pitch)
def sp_to_mfsc(self,sp, ndim, fw, noise_floor_db=-120.0):
# helper function, sp->mgc->mfsc in a single step
mgc = self.sp_to_mgc(sp, ndim, fw, noise_floor_db)
mfsc = self.mgc_to_mfsc(mgc)
return mfsc
def sp_to_mgc(self,sp, ndim, fw, noise_floor_db=-120.0):
# HTS uses -80, but we shift WORLD/STRAIGHT by -20 dB (so would be -100); use a little more headroom (SPTK uses doubles internally, so eps 1e-12 should still be OK)
dtype = sp.dtype
sp = sp.astype(np.float64) # required for pysptk
        mgc = np.apply_along_axis(
            pysptk.mcep, 1, np.atleast_2d(sp),
            order=ndim-1, alpha=fw, maxiter=0, etype=1,
            eps=10**(noise_floor_db/10), min_det=0.0, itype=1)
if sp.ndim == 1:
mgc = mgc.flatten()
mgc = mgc.astype(dtype)
return mgc
def mgc_to_mfsc(self,mgc):
is_1d = mgc.ndim == 1
mgc = np.atleast_2d(mgc)
ndim = mgc.shape[1]
# mirror cepstrum
mgc1 = np.concatenate([mgc[:, :], mgc[:, -2:0:-1]], axis=-1)
# re-scale 'dc' and 'nyquist' cepstral bins (see mcep())
mgc1[:, 0] *= 2
mgc1[:, ndim-1] *= 2
# fft, truncate, to decibels
mfsc = np.real(np.fft.fft(mgc1))
mfsc = mfsc[:, :ndim]
mfsc = 10*mfsc/np.log(10)
if is_1d:
mfsc = mfsc.flatten()
return mfsc
def mfsc_to_mgc(self,mfsc):
# mfsc -> mgc -> sp is a much slower alternative to mfsc_to_sp()
is_1d = mfsc.ndim == 1
mfsc = np.atleast_2d(mfsc)
ndim = mfsc.shape[1]
mfsc = mfsc/10*np.log(10)
mfsc1 = np.concatenate([mfsc[:, :], mfsc[:, -2:0:-1]], axis=-1)
mgc = np.real(np.fft.ifft(mfsc1))
mgc[:, 0] /= 2
mgc[:, ndim-1] /= 2
mgc = mgc[:, :ndim]
if is_1d:
mgc = mgc.flatten()
return mgc
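    # Example (sketch): mgc_to_mfsc() and mfsc_to_mgc() invert each other up
    # to floating-point error, because the mirrored cepstrum is symmetric and
    # its first ndim FFT bins carry all of the information:
    #   mfsc = vocoder.mgc_to_mfsc(mgc)      # hypothetical mgc array
    #   mgc_rec = vocoder.mfsc_to_mgc(mfsc)  # ~= mgc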
def mgc_to_sp(self,mgc, spec_size, fw):
dtype = mgc.dtype
mgc = mgc.astype(np.float64) # required for pysptk
fftlen = 2*(spec_size - 1)
sp = np.apply_along_axis(pysptk.mgc2sp, 1, np.atleast_2d(mgc), alpha=fw, gamma=0.0, fftlen=fftlen)
sp = 20*np.real(sp)/np.log(10)
if mgc.ndim == 1:
sp = sp.flatten()
sp = sp.astype(dtype)
return sp
def get_audio_from_compressed_world_parameters(self, f0, not_pitch, spectral_compressed, aperiodic_compressed):
print("start: audio_from_compressed_world_parameters")
        # Zero out f0 in unvoiced frames before WORLD synthesis.
        is_pitch = (1 - np.transpose(not_pitch))
        interpolated_f0 = self.midi_to_pitch(np.transpose(f0))
        f0_hz = (interpolated_f0 * is_pitch).astype('double')
spectral = np.transpose(spectral_compressed)
aperiodic = np.transpose(aperiodic_compressed)
if self.h_params.preprocess.compress_method_world_parameter == 'mfsc':
print("start: spectral mfsc_to_mgc")
spectral = self.mfsc_to_mgc(spectral)
print("start: aperiodic mfsc_to_mgc")
aperiodic = self.mfsc_to_mgc(aperiodic)
print("start: spectral mgc_to_sp")
spectral = self.mgc_to_sp(spectral, self.h_params.preprocess.world_parameter_dimension, 0.45)
print("start: aperiodic mgc_to_sp")
aperiodic = self.mgc_to_sp(aperiodic, self.h_params.preprocess.world_parameter_dimension, 0.45)
        spectral = (10**(spectral/10)).astype('double')    # dB -> power
        aperiodic = (10**(aperiodic/20)).astype('double')  # dB -> amplitude
print("start: synthesize audio")
audio = pw.synthesize(f0_hz,spectral,aperiodic,self.sample_rate,self.world_frame_period)
return audio
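    # Example (sketch): this inverts get_compressed_world_parameters_from_audio(),
    # so analysis followed by synthesis yields an approximate reconstruction;
    # see the sketch under __main__ at the bottom of this file.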
def torch_A_weighting(self, frequencies, min_db = -45.0):
"""
Compute A-weighting weights in Decibel scale (codes from librosa) and
transform into amplitude domain (with DB-SPL equation).
Argument:
frequencies : tensor of frequencies to return amplitude weight
min_db : mininum decibel weight. appropriate min_db value is important, as
exp/log calculation might raise numeric error with float32 type.
Returns:
weights : tensor of amplitude attenuation weights corresponding to the frequencies tensor.
"""
# Calculate A-weighting in Decibel scale.
frequencies_squared = frequencies ** 2
const = torch.tensor([12200, 20.6, 107.7, 737.9]) ** 2.0
weights_in_db = 2.0 + 20.0 * (torch.log10(const[0]) + 4 * torch.log10(frequencies)
- torch.log10(frequencies_squared + const[0])
- torch.log10(frequencies_squared + const[1])
- 0.5 * torch.log10(frequencies_squared + const[2])
- 0.5 * torch.log10(frequencies_squared + const[3]))
# Set minimum Decibel weight.
if min_db is not None:
weights_in_db = torch.max(weights_in_db, torch.tensor([min_db], dtype = torch.float32))
        # Transform decibel weights to linear scale: weights = 10 ** (dB / 10).
        weights = torch.exp(torch.log(torch.tensor([10.], dtype = torch.float32)) * weights_in_db / 10)
return weights
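    # Example (sketch, hypothetical tensors): weight an STFT magnitude by
    # A-weighting; bin frequencies follow from the model's STFT settings:
    #   freqs = torch.linspace(1.0, sr / 2, n_fft // 2 + 1)
    #   weighted = mag * vocoder.torch_A_weighting(freqs).unsqueeze(-1)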
if __name__ == '__main__':
    h_params = HParams()
    vocoder = UtilWorldVocoder(h_params)
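    # Round-trip sketch (assumptions: 'example.wav' is a placeholder path and
    # HParams supplies the preprocess fields used above; uncomment to run):
    # audio, _ = librosa.load("example.wav", sr=vocoder.sample_rate, mono=True)
    # params = vocoder.get_compressed_world_parameters_from_audio(audio)
    # resynth = vocoder.get_audio_from_compressed_world_parameters(
    #     params["f0"], params["not_pitch"], params["spectral"], params["aperiodic"])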