Upload 187 files

da855ff almost 3 years ago

23.6 kB

	import numpy as np
	import scipy.signal as sps

	from .logs import get_logger_from_arg
	from .signal_manipulation import preemphasis


	def extract_mel_spectrogram_for_tts(wav_signal, fs, n_fft, step_size, n_mels, mel_fmin, mel_fmax, min_amplitude,
	                                    pre_emphasis=True, pre_emph_coeff=0.97, dynamic_range=None, real_amplitude=True,
	                                    centered=True, normalize_mel_bins=True, normalize_range=True, logger=None):
	    """ Compute the dB mel-spectrogram of an audio signal for TTS training

	    Processing chain: optional pre-emphasis, linear amplitude spectrogram, projection on the mel scale,
	    then conversion to dB with optional clipping and range normalization.

	    :param wav_signal: Numpy array of audio samples -- shape = (T, )
	    :param fs: sampling frequency of the audio signal
	    :param n_fft: filter length (in samples) of the FFT
	    :param step_size: length (in samples) between successive analysis windows
	    :param n_mels: number of mel components in the mel-spectrogram
	    :param mel_fmin: minimum frequency used when converting to mel
	    :param mel_fmax: maximum frequency used when converting to mel
	    :param min_amplitude: mel-spectrogram minimal permitted amplitude value (limits the dynamic range)
	    :param pre_emphasis: perform pre-emphasis on input audio
	    :param pre_emph_coeff: pre-emphasis coefficient
	    :param dynamic_range: mel-spectrogram maximal dynamic range in dB (ignored if min_amplitude is specified)
	    :param real_amplitude: if True, the spectrogram bins are divided by n_fft to get bin magnitudes that
	                           reflect the temporal signal amplitude
	    :param centered: if True, the spectrogram extraction window is centered on the time step
	                     (the time sequence gets padded)
	    :param normalize_mel_bins: normalize energy per bins in the mel-spectrogram
	    :param normalize_range: if True, map the dB dynamic range to [0, 1]
	    :param logger: arg to create logger object

	    :return: tuple (mel_spectrogram, wav_signal) -- the dB mel-spectrogram and the signal it was extracted
	             from (i.e. the pre-emphasized signal when pre_emphasis is True)
	    """
	    # the signal returned to the caller is the pre-emphasized one when the option is enabled
	    if pre_emphasis:
	        wav_signal = preemphasis(wav_signal, preemph=pre_emph_coeff)

	    # linear amplitude spectrogram -- the phase is not needed here
	    linear_amp, _ = extract_spectrogram(x=wav_signal, n_fft=n_fft, step_size=step_size,
	                                        real_amplitude=real_amplitude, centered=centered)

	    # projection on the mel frequency scale
	    mel_amp = linear_to_mel(linear_spectrogram=linear_amp, fs=fs, n_mels=n_mels,
	                            mel_fmin=mel_fmin, mel_fmax=mel_fmax,
	                            normalize_mel_bins=normalize_mel_bins, logger=logger)

	    # resolve the clipping floor only when a floor or a dynamic range was requested
	    clip_floor = min_amplitude
	    if min_amplitude or dynamic_range:
	        clip_floor = get_spectrogram_min_amplitude(real_amplitude=real_amplitude, min_amplitude=min_amplitude,
	                                                   dynamic_range=dynamic_range, n_fft=n_fft, logger=logger)

	    # dB conversion, with clipping and optional normalization of the range
	    mel_db = amplitude_to_db(spectrogram=mel_amp, min_amplitude=clip_floor,
	                             normalize_range=normalize_range, logger=logger)

	    return mel_db, wav_signal


	def get_spectrogram_min_amplitude(real_amplitude, min_amplitude=None, dynamic_range=None, n_fft=None, logger=None):
	    """ Compute the minimum amplitude value a spectrogram bin can reach

	    :param real_amplitude: If True, assume that the values of the spectrogram bins were divided by n_fft to get
	                           bin magnitude that reflect the temporal signal amplitude
	    :param min_amplitude: The spectrogram minimal permitted amplitude value (limits the dynamic range)
	                          It is divided by n_fft when real_amplitude is set to True
	    :param dynamic_range: The spectrogram maximal dynamic range in dB (ignored if min_amplitude is specified)
	                          It is increased by 20 * log10(n_fft) when real_amplitude is set to True
	    :param n_fft: Number of samples of the FFT window used to extract spectrogram
	                  Only used when real_amplitude is set to True
	    :param logger: arg to create logger object

	    :return: the minimum amplitude of spectrogram bins
	    :raises AssertionError: if neither min_amplitude nor dynamic_range is set, or if real_amplitude is True
	                            while n_fft is None
	    """
	    # create logger object
	    logger = get_logger_from_arg(logger)

	    # NOTE: validation raises explicitly instead of using "assert" so that it is not stripped when
	    # Python runs with -O and so that the exception carries the message; AssertionError is kept as
	    # the exception type for backward compatibility
	    if min_amplitude and dynamic_range:
	        logger.warning(f'Both "min_amplitude" and "dynamic_range" are specified, '
	                       f'only "min_amplitude" ({min_amplitude}) will be considered')
	    elif not (min_amplitude or dynamic_range):
	        message = 'Neither "min_amplitude" nor "dynamic_range" are set'
	        logger.error(message)
	        raise AssertionError(message)

	    if not real_amplitude:
	        n_fft = 1  # equivalent to using a FFT window of 1
	    elif n_fft is None:
	        message = '"real_amplitude" is set to True but "n_fft" has no value'
	        logger.error(message)
	        raise AssertionError(message)

	    if min_amplitude:
	        # compute real min amplitude per bin
	        return min_amplitude / n_fft

	    # only dynamic_range is set at this point: compute real dynamic range per bin ...
	    bin_dynamic_range = dynamic_range + 20 * np.log10(n_fft)
	    # ... and convert it to the real min amplitude per bin
	    return 10 ** (-bin_dynamic_range / 20)


	def amplitude_to_db(spectrogram, min_amplitude=None, normalize_range=False, logger=None):
	    """ Transform amplitude to dB with optional clipping and dynamic range normalization

	    :param spectrogram: Numpy array containing all amplitudes of a spectrogram
	    :param min_amplitude: Clip the spectrogram to the minimal permitted amplitude value
	                          Without clipping, zero-valued bins are converted to -inf
	    :param normalize_range: If True, map the dB interval [20 * log10(min_amplitude), 0] to [0, 1]
	                            Requires min_amplitude to be set
	    :param logger: arg to create logger object

	    :return: spectrogram in dB (or in normalized units when normalize_range is True)
	    :raises AssertionError: if normalize_range is True while min_amplitude has no value
	    """
	    # create logger object
	    logger = get_logger_from_arg(logger)

	    # min_amplitude must be given to normalize the dB dynamic range
	    # NOTE: raised explicitly instead of using "assert" so that the check is not stripped when Python
	    # runs with -O and so that the exception carries the message; AssertionError is kept as the
	    # exception type for backward compatibility
	    if normalize_range and not min_amplitude:
	        message = 'Asked for dynamic range normalization, but "min_amplitude" has no value'
	        logger.error(message)
	        raise AssertionError(message)

	    # make sure amplitude bins are positive
	    spectrogram = np.abs(spectrogram)

	    if min_amplitude:
	        # apply clipping
	        spectrogram = np.clip(spectrogram, a_min=min_amplitude, a_max=None)

	    # transform to dB
	    spectrogram = 20 * np.log10(spectrogram)

	    # normalize range if necessary
	    if normalize_range:
	        # compute dB dynamic range and map it to [0, 1]
	        dynamic_range = -20 * np.log10(min_amplitude)
	        spectrogram = (spectrogram + dynamic_range) / dynamic_range

	    return spectrogram


	def denormalize_range(spectrogram, min_amplitude_used):
	    """ Map a dB spectrogram normalized to [0, 1] back to its original dB dynamic range

	    :param spectrogram: Numpy array containing a normalized dB spectrogram (values between 0 and 1)
	    :param min_amplitude_used: Minimal amplitude value that was used to normalize the dB spectrogram dynamic range

	    :return: spectrogram in dB with its range de-normalized
	    """
	    # width (in dB) of the range that was mapped to [0, 1]
	    db_span = -20 * np.log10(min_amplitude_used)

	    # 0 is sent back to -db_span and 1 is sent back to 0 dB
	    return spectrogram * db_span - db_span


	def db_to_amplitude(spectrogram):
	    """ Transform dB spectrogram to amplitude spectrogram

	    :param spectrogram: Numpy array containing the values of a spectrogram in dB

	    :return: spectrogram in amplitude value
	    """
	    # inverse of "20 * log10(amplitude)"
	    exponent = spectrogram / 20
	    return 10 ** exponent


	def linear_to_mel(linear_spectrogram, fs=None, n_mels=80, mel_fmin=0, mel_fmax=None,
	                  normalize_mel_bins=True, logger=None):
	    """ Project a linear-frequency spectrogram on the mel scale

	    :param linear_spectrogram: Numpy array containing all amplitudes of a spectrogram -- shape = (n_fft // 2 + 1, T)
	    :param fs: Sampling frequency expected by the algorithm
	    :param n_mels: Number of bins in the mel-spectrogram
	    :param mel_fmin: Lowest frequency in the mel-spectrum (Hz)
	    :param mel_fmax: Highest frequency in the mel-spectrum (Hz)
	    :param normalize_mel_bins: normalize energy per bins in the mel-spectrogram
	    :param logger: arg to create logger object

	    :return: Numpy array containing the spectrogram in mel frequency space -- shape = (n_mels, T)
	    """
	    # the FFT window length is recovered from the number of frequency bins (n_fft // 2 + 1)
	    n_bins = linear_spectrogram.shape[0]

	    # mel filter bank -- shape = (n_mels, n_bins)
	    filterbank = _get_mel_filterbank_matrix(n_fft=(n_bins - 1) * 2, fs=fs, n_mels=n_mels,
	                                            mel_fmin=mel_fmin, mel_fmax=mel_fmax,
	                                            normalize_mel_bins=normalize_mel_bins, logger=logger)

	    # every mel bin is a weighted sum of linear frequency bins
	    return np.dot(filterbank, linear_spectrogram)


	def mel_to_linear(mel_spectrogram, fs, n_fft, mel_fmin=0, mel_fmax=None, normalize_mel_bins=False, logger=None):
	    """ Approximate a linear spectrogram from a mel-spectrogram

	    The mel filter bank is not inverted exactly: each filter is scaled so that its weights sum to 1 and
	    the transposed filter bank is applied to the mel-spectrogram.

	    :param mel_spectrogram: Numpy array of the input mel spectrogram -- shape = (n_mels, T)
	    :param fs: sampling frequency
	    :param n_fft: number of samples used in the original FFT
	    :param mel_fmin: minimum frequency used when converting to mel
	    :param mel_fmax: maximum frequency used when converting to mel
	    :param normalize_mel_bins: normalize energy per bins in the mel-spectrogram
	    :param logger: arg to create logger object

	    :return: Numpy array containing the spectrogram in linear frequency space -- shape = (n_fft // 2 + 1, T)
	    """
	    # the number of mel components is given by the input itself
	    n_mels = mel_spectrogram.shape[0]

	    # mel filter bank -- shape = (n_mels, n_fft // 2 + 1)
	    filterbank = _get_mel_filterbank_matrix(n_fft=n_fft, fs=fs, n_mels=n_mels, mel_fmin=mel_fmin,
	                                            mel_fmax=mel_fmax, normalize_mel_bins=normalize_mel_bins,
	                                            logger=logger)

	    # scale each filter (row) so that its weights sum to 1
	    # NOTE(review): a filter whose weights are all zero leads to a division by zero here
	    row_weights = filterbank.sum(axis=1).reshape(n_mels, 1)
	    filterbank = filterbank / row_weights

	    # (T, n_mels) x (n_mels, n_fft // 2 + 1), transposed back to (n_fft // 2 + 1, T)
	    return np.dot(mel_spectrogram.T, filterbank).T


	def extract_spectrogram(x, n_fft, step_size, real_amplitude=True, centered=True):
	    """ Extract the FFT spectrogram from a series of samples

	    :param x: Numpy array of input samples -- shape = (T, )
	                Zero-padded on the right when shorter than n_fft
	    :param n_fft: number of point in the FFT window
	    :param step_size: number of samples skipped at each extraction
	    :param real_amplitude: if True the value of the bins will be divided by n_fft to get bin magnitude that
	                           reflect the temporal signal amplitude
	    :param centered: if True, the extraction window will be centered on the time step
	                     (the signal is reflect-padded on both sides)

	    :return: Numpy arrays of amplitude and phase of the spectrogram -- shapes = (n_fft // 2 + 1, L)
	    """
	    # create the sampling window
	    # NOTE: the "scipy.signal.hann" alias was deprecated and then removed from SciPy, the window
	    # functions live in "scipy.signal.windows"
	    window = sps.windows.hann(n_fft)

	    # make sure the input signal has a length superior or equal to n_fft
	    if len(x) < n_fft:
	        x = np.pad(x, (0, n_fft - len(x)), 'constant', constant_values=(0, 0))

	    # pad before and after to center the window on the extracted values
	    if centered:
	        padding_left, padding_right = _get_padding_for_centered_spectrogram(n_fft=n_fft)
	        x = np.pad(x, (padding_left, padding_right), mode='reflect')

	    # count the number of frames: one frame less is extracted when the (padded) signal length
	    # is a multiple of step_size
	    time_axis = int(np.floor((len(x) - n_fft) / step_size))
	    if len(x) % step_size != 0:
	        time_axis += 1

	    # create containers for the spectrogram
	    n_bins = n_fft // 2 + 1
	    amp = np.zeros((n_bins, time_axis))
	    phase = np.zeros((n_bins, time_axis))

	    for i in range(time_axis):
	        # FFT of the windowed slice of data
	        start = i * step_size
	        freq = np.fft.rfft(np.multiply(x[start: start + n_fft], window))

	        # save magnitude and phase individually
	        amp[:, i] = np.absolute(freq)
	        phase[:, i] = np.angle(freq)

	    # scale amplitude bins if necessary
	    if real_amplitude:
	        amp = amp / n_fft

	    return amp, phase


	def get_nb_spectrogram_samples(wav_length, n_fft, step_size, centered=True):
	    """ Return the number of spectrogram time frames given a WAV segment

	    The count is obtained by extracting the spectrogram of a random signal of the same length.

	    :param wav_length: number of samples in the WAV segment
	    :param n_fft: filter length (in samples) of the FFT
	    :param step_size: length (in samples) between successive analysis windows
	    :param centered: if True, assume that the FFT extraction window is centered on the time step

	    :return: the number of spectrogram time frames
	    """
	    # dummy signal: only its length matters
	    dummy_signal = np.random.rand(wav_length)

	    # amplitude spectrogram -- shape = (n_fft // 2 + 1, T)
	    amplitudes, _ = extract_spectrogram(x=dummy_signal, n_fft=n_fft, step_size=step_size, centered=centered)

	    # T is the size of the time axis
	    _, nb_frames = amplitudes.shape
	    return nb_frames


	def get_nb_wav_samples(spectrogram_length, n_fft, step_size, centered=True):
	    """ Return the number of WAV samples given a spectrogram segment

	    :param spectrogram_length: number of time frames in the spectrogram segment
	    :param n_fft: filter length (in samples) of the FFT
	    :param step_size: length (in samples) between successive analysis windows
	    :param centered: if True, assume that the FFT extraction window is centered on the time step

	    :return: the number of WAV samples
	    """
	    # no padding to discount unless the extraction was centered
	    padding_left, padding_right = 0, 0
	    if centered:
	        # audio segment was padded on the left and right to center the window on the extracted values
	        padding_left, padding_right = _get_padding_for_centered_spectrogram(n_fft=n_fft)

	    # samples spanned by the frames, minus the samples that were added by the padding
	    return (spectrogram_length - 1) * step_size + n_fft - padding_left - padding_right


	def reconstruct_signal_griffin_lim(magnitude_spectrogram, step_size, iterations=30, logger=None):
	    """ Reconstruct an audio signal from a magnitude spectrogram

	    Given a magnitude spectrogram as input, reconstruct the audio signal and return it using
	    the Griffin-Lim algorithm
	    From the paper: "Signal estimation from modified short-time fourier transform" by Griffin and Lim, in IEEE
	    transactions on Acoustics, Speech, and Signal Processing. Vol ASSP-32, No. 2, April 1984.

	    :param magnitude_spectrogram: Numpy array magnitude spectrogram -- shape = (n_fft // 2 + 1, T)
	                                  The rows correspond to frequency bins and the columns correspond to time slices
	    :param step_size: length (in samples) between successive analysis windows
	    :param iterations: Number of iterations for the Griffin-Lim algorithm (must be strictly positive)
	                       Typically a few hundred is sufficient
	    :param logger: arg to create logger object

	    :return: the reconstructed time domain signal as a 1-dim Numpy array and the complex spectrogram that
	             was used to produce the signal -- shape = (T, n_fft // 2 + 1)
	    :raises ValueError: if iterations is not strictly positive
	    """
	    # create logger object
	    logger = get_logger_from_arg(logger)

	    # without at least one iteration there is no spectrogram to return
	    if iterations <= 0:
	        raise ValueError(f'"iterations" must be strictly positive, got {iterations}')

	    # shape = (T, n_fft // 2 + 1)
	    magnitude_spectrogram = np.transpose(magnitude_spectrogram)

	    # find the number of samples used in the FFT window and extract the time steps
	    n_fft = (magnitude_spectrogram.shape[1] - 1) * 2
	    time_slices = magnitude_spectrogram.shape[0]

	    # compute the number of samples needed
	    len_samples = int(time_slices * step_size + n_fft)

	    # first sample of each analysis frame (one frame per time slice)
	    frame_starts = range(0, len_samples - n_fft, step_size)

	    # initialize the reconstructed signal to noise
	    x_reconstruct = np.random.randn(len_samples)
	    window = np.hanning(n_fft)
	    remaining = iterations  # number of iterations left to run

	    while remaining > 0:
	        # decrement and compute FFT of the current estimate
	        remaining -= 1
	        reconstruction_spectrogram = np.array([np.fft.rfft(window * x_reconstruct[start: start + n_fft])
	                                               for start in frame_starts])

	        # Discard magnitude part of the reconstruction and use the supplied magnitude spectrogram instead
	        proposal_spectrogram = magnitude_spectrogram * np.exp(1.0j * np.angle(reconstruction_spectrogram))

	        # store previous reconstructed signal and create a new one by iFFT and overlap-add
	        prev_x = x_reconstruct
	        x_reconstruct = np.zeros(len_samples)

	        for frame, start in zip(proposal_spectrogram, frame_starts):
	            x_reconstruct[start: start + n_fft] += window * np.fft.irfft(frame)

	        # normalise signal due to overlap add
	        x_reconstruct = x_reconstruct / (n_fft / step_size / 2)

	        # compute diff between two signals and report progress
	        diff = np.sqrt(np.mean((x_reconstruct - prev_x) ** 2))
	        logger.debug(f'Reconstruction iteration: {iterations - remaining}/{iterations} -- RMSE: {diff * 1e6:.3f}e-6')

	    return x_reconstruct, proposal_spectrogram


	def _get_padding_for_centered_spectrogram(n_fft):
	""" Return padding that must be added to the left and right sides of a series of samples to extract a centered FFT

	:param n_fft: filter length (in samples) of the FFT

	:return: padding values for left and right sides
	"""
	# add same padding on left and right sides
	padding_left, padding_right = int(n_fft // 2), int(n_fft // 2)

	return padding_left, padding_right


	def _get_mel_filterbank_matrix(n_fft=None, fs=None, n_mels=80, mel_fmin=0, mel_fmax=None,
	                               normalize_mel_bins=False, logger=None):
	    """ Create a Filterbank matrix to combine FFT bins into Mel-frequency bins

	    :param n_fft: number of FFT components
	    :param fs: sampling rate of the incoming signal
	    :param n_mels: number of Mel bands to generate
	    :param mel_fmin: lowest frequency (in Hz)
	    :param mel_fmax: highest frequency (in Hz). If None, mel_fmax = fs / 2.0
	    :param normalize_mel_bins: normalize energy per bins
	    :param logger: arg to create logger object

	    :return: np.ndarray [shape=(n_mels, 1 + n_fft // 2)] -- Mel transform matrix
	    """
	    # create logger object
	    logger = get_logger_from_arg(logger)

	    # set mel_fmax
	    if mel_fmax is None:
	        mel_fmax = float(fs) / 2

	    # Get the center frequencies of each FFT bin
	    fft_freqs = np.linspace(0, float(fs) / 2, int(1 + n_fft // 2), endpoint=True)

	    # 'Center freqs' of mel bands - uniformly spaced between limits
	    # (n_mels + 2 points: every triangular filter needs a lower edge, a center and an upper edge)
	    min_mel = _hz_to_mel(mel_fmin)
	    max_mel = _hz_to_mel(mel_fmax)

	    mels = np.linspace(min_mel, max_mel, n_mels + 2)
	    mel_f = _mel_to_hz(mels)

	    fdiff = np.diff(mel_f)
	    ramps = np.subtract.outer(mel_f, fft_freqs)

	    # lower and upper slopes of all filters at once: filter i rises between mel_f[i] and mel_f[i + 1]
	    # and falls between mel_f[i + 1] and mel_f[i + 2]
	    lower = -ramps[:-2] / fdiff[:-1, np.newaxis]
	    upper = ramps[2:] / fdiff[1:, np.newaxis]

	    # then intersect them with each other and zero
	    weights = np.maximum(0, np.minimum(lower, upper))

	    if normalize_mel_bins:  # Normalize energy per bins
	        # Slaney-style mel is scaled to be approx constant energy per channel
	        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
	        weights *= enorm[:, np.newaxis]

	    # Only check weights if f_mel[0] is positive
	    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):  # This means we have an empty channel somewhere
	        logger.warning('Empty filters detected in mel frequency basis. Some channels will produce empty responses. '
	                       'Try increasing your sampling rate (and fmax) or reducing n_mels.')

	    return weights


	def _hz_to_mel(frequencies):
	""" Convert Hz to Mels

	:param frequencies: number or np.ndarray [shape=(n,)] -- scalar or array of frequencies

	:return: number or np.ndarray [shape=(n,)] -- input frequencies in Mels
	"""
	# create frequencies array
	frequencies = np.asanyarray(frequencies)

	# Fill in the linear part
	f_min = 0.0
	f_sp = 200.0 / 3
	mels = (frequencies - f_min) / f_sp

	# Fill in the log-scale part
	min_log_hz = 1000.0 # beginning of log region (Hz)
	min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
	log_step = np.log(6.4) / 27.0 # step size for log region

	if frequencies.ndim: # If we have array data, vectorize
	log_t = (frequencies >= min_log_hz)
	mels[log_t] = min_log_mel + np.log(frequencies[log_t] / min_log_hz) / log_step

	elif frequencies >= min_log_hz: # If we have scalar data, heck directly
	mels = min_log_mel + np.log(frequencies / min_log_hz) / log_step

	return mels


	def _mel_to_hz(mels):
	""" Convert mel bin numbers to frequencies

	:param mels: number or np.ndarray [shape=(n,)] -- scalar or array of mel bins to convert

	:return: number or np.ndarray [shape=(n,)] -- input mels in Hz
	"""
	# create mels array
	mels = np.asanyarray(mels)

	# Fill in the linear scale
	f_min = 0.0
	f_sp = 200.0 / 3
	freqs = f_min + f_sp * mels

	# And now the nonlinear scale
	min_log_hz = 1000.0 # beginning of log region (Hz)
	min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
	log_step = np.log(6.4) / 27.0 # step size for log region

	if mels.ndim: # If we have vector data, vectorize
	log_t = (mels >= min_log_mel)
	freqs[log_t] = min_log_hz * np.exp(log_step * (mels[log_t] - min_log_mel))

	elif mels >= min_log_mel: # If we have scalar data, check directly
	freqs = min_log_hz * np.exp(log_step * (mels - min_log_mel))

	return freqs


	def pre_emphasis_on_mel(mel_spec, preemph, fs, n_mels, mel_fmin=0, mel_fmax=None, min_amplitude=None,
	                        normalized_range=True, logger=''):
	    """ Apply the frequency response of a pre-emphasis filter directly on a dB mel-spectrogram

	    The gain (in dB) of the filter y[n] = x[n] - preemph * x[n - 1] is evaluated at the center frequency of
	    every mel bin and added to all the time frames of the mel-spectrogram.

	    :param mel_spec: Numpy array of the dB mel-spectrogram -- presumably shape = (n_mels, T), to be confirmed
	    :param preemph: pre-emphasis coefficient
	    :param fs: sampling frequency
	    :param n_mels: number of mel components in the mel-spectrogram
	    :param mel_fmin: minimum frequency used when converting to mel
	    :param mel_fmax: maximum frequency used when converting to mel. If None, mel_fmax = fs / 2
	    :param min_amplitude: minimal amplitude value that was used to normalize the dB dynamic range
	                          Must be set when normalized_range is True
	    :param normalized_range: if True, assume that the dB range of mel_spec was normalized and scale the
	                             filter gain by the same dB dynamic range
	    :param logger: arg to create logger object

	    :return: the mel-spectrogram with the filter gain added
	    """
	    logger = get_logger_from_arg(logger)

	    # set mel_fmax
	    if mel_fmax is None:
	        mel_fmax = float(fs) / 2

	    #### get the center frequency of all bins in the mel spectrum ####
	    # 'Center freqs' of mel bands - uniformly spaced between limits
	    min_mel = _hz_to_mel(mel_fmin)
	    max_mel = _hz_to_mel(mel_fmax)

	    mels = np.linspace(min_mel, max_mel, n_mels + 2)
	    bin_freqs = _mel_to_hz(mels)

	    #### get the frequency response of the filter at the n_mels center frequencies ####
	    # (the first and last points are the outer edges of the filter bank, not centers)
	    a = [1]
	    b = [1, -preemph]
	    w, h = sps.freqz(b=b, a=a, worN=bin_freqs[1:-1], fs=fs)

	    #### apply filter to bins ####
	    h = 20 * np.log10(np.abs(h))  # get the filter response in dB
	    h = np.tile(np.expand_dims(h, axis=1), (1, mel_spec.shape[1]))  # same gain for every time frame

	    # if range was normalized
	    if normalized_range:
	        dbr = -20 * np.log10(min_amplitude)
	        # normalize filter
	        h = h / dbr

	    # Crazy empirical correction hack with magic numbers
	    # NOTE(review): the original indentation of this branch was lost, it is assumed to sit at function
	    # level (i.e. not nested in the "normalized_range" branch) -- confirm against the reference implementation
	    if min_amplitude == 1e-5 and preemph == 0.97:
	        correction_matrix = np.log(w) / 30 - 0.277
	        correction_matrix = np.tile(np.expand_dims(correction_matrix, axis=1), (1, h.shape[1]))
	        h = h - correction_matrix
	    else:
	        # "Logger.warn" is deprecated (and removed from recent Python releases), use "warning" like
	        # the rest of the module
	        logger.warning("You should probably compute a correction matrix for this config to compensate for the cliping.")

	    return np.add(mel_spec, h)