# youngseng's picture
# Upload 187 files
# da855ff
import numpy as np
import scipy.signal as sps
from .logs import get_logger_from_arg
from .signal_manipulation import preemphasis
def extract_mel_spectrogram_for_tts(wav_signal, fs, n_fft, step_size, n_mels, mel_fmin, mel_fmax, min_amplitude,
                                    pre_emphasis=True, pre_emph_coeff=0.97, dynamic_range=None, real_amplitude=True,
                                    centered=True, normalize_mel_bins=True, normalize_range=True, logger=None):
    """ Compute the dB mel-spectrogram of an audio signal for TTS training

    The pipeline is: optional pre-emphasis -> linear amplitude spectrogram -> mel filter bank
    -> clipping floor computation -> conversion to dB (optionally mapped to [0, 1]).

    :param wav_signal:          Numpy array of audio samples -- shape = (T, )
    :param fs:                  sampling frequency of the audio signal
    :param n_fft:               filter length (in samples) of the FFT
    :param step_size:           length (in samples) between successive analysis windows
    :param n_mels:              number of mel components in the mel-spectrogram
    :param mel_fmin:            minimum frequency used when converting to mel
    :param mel_fmax:            maximum frequency used when converting to mel
    :param min_amplitude:       mel-spectrogram minimal permitted amplitude value (limits the dynamic range)
    :param pre_emphasis:        perform pre-emphasis on input audio
    :param pre_emph_coeff:      pre-emphasis coefficient
    :param dynamic_range:       mel-spectrogram maximal dynamic range in dB (ignored if min_amplitude is specified)
    :param real_amplitude:      if True, the value of the spectrogram bins will be divided by n_fft to get
                                bin magnitude that reflect the temporal signal amplitude
    :param centered:            if True, the spectrogram extraction window will be centered on the time step.
                                The time sequence has to be padded.
    :param normalize_mel_bins:  normalize energy per bins in the mel-spectrogram
    :param normalize_range:     If True, map the db_dynamic_range to [0,1]
    :param logger:              arg to create logger object

    :return: tuple (mel-spectrogram in dB, audio signal that was analysed) -- the returned audio is the
             pre-emphasized signal when pre_emphasis is True, the untouched input otherwise
    """
    # the analysed audio is either the filtered signal or the raw input
    audio = preemphasis(wav_signal, preemph=pre_emph_coeff) if pre_emphasis else wav_signal

    # linear amplitude spectrogram (the phase is not needed here)
    linear_amp, _ = extract_spectrogram(x=audio, n_fft=n_fft, step_size=step_size,
                                        real_amplitude=real_amplitude, centered=centered)

    # project the linear bins onto the mel scale
    mel_amp = linear_to_mel(linear_spectrogram=linear_amp, fs=fs, n_mels=n_mels, mel_fmin=mel_fmin,
                            mel_fmax=mel_fmax, normalize_mel_bins=normalize_mel_bins, logger=logger)

    # derive the clipping floor only when one of the two range settings is provided
    clip_floor = min_amplitude
    if min_amplitude or dynamic_range:
        clip_floor = get_spectrogram_min_amplitude(real_amplitude=real_amplitude, min_amplitude=min_amplitude,
                                                   dynamic_range=dynamic_range, n_fft=n_fft, logger=logger)

    # amplitude -> dB, with optional mapping of the dynamic range to [0, 1]
    mel_db = amplitude_to_db(spectrogram=mel_amp, min_amplitude=clip_floor,
                             normalize_range=normalize_range, logger=logger)
    return mel_db, audio
def get_spectrogram_min_amplitude(real_amplitude, min_amplitude=None, dynamic_range=None, n_fft=None, logger=None):
    """ Compute the minimum amplitude value a spectrogram bin can reach

    :param real_amplitude:      If True, assume that the values of the spectrogram bins were divided by n_fft to get
                                bin magnitude that reflect the temporal signal amplitude
    :param min_amplitude:       The spectrogram minimal permitted amplitude value (limits the dynamic range)
                                This value is divided by n_fft when real_amplitude is set to True
    :param dynamic_range:       The spectrogram maximal dynamic range in dB (ignored if min_amplitude is specified)
                                This value is increased by 20 * log10(n_fft) when real_amplitude is set to True
    :param n_fft:               Number of samples of the FFT window used to extract spectrogram
                                Only used when real_amplitude is set to True
    :param logger:              arg to create logger object

    :return: the minimum amplitude of spectrogram bins
    :raises AssertionError: if neither "min_amplitude" nor "dynamic_range" is set, or if "real_amplitude"
                            is True while "n_fft" is None
    """
    # create logger object
    logger = get_logger_from_arg(logger)

    # validate the range settings
    # explicit raises are used instead of assert statements so the checks survive "python -O"
    # and the exception carries the message that is logged
    if min_amplitude and dynamic_range:
        logger.warning(f'Both "min_amplitude" and "dynamic_range" are specified, '
                       f'only "min_amplitude" ({min_amplitude}) will be considered')
    elif not (min_amplitude or dynamic_range):
        message = 'Neither "min_amplitude" nor "dynamic_range" are set'
        logger.error(message)
        raise AssertionError(message)

    if real_amplitude:
        if n_fft is None:
            message = '"real_amplitude" is set to True but "n_fft" has no value'
            logger.error(message)
            raise AssertionError(message)
    else:
        n_fft = 1  # equivalent to using a FFT window of 1

    if min_amplitude:
        # min_amplitude has priority: scale it to the per-bin amplitude
        return min_amplitude / n_fft

    # otherwise derive the per-bin floor from the per-bin dynamic range
    dynamic_range = dynamic_range + 20 * np.log10(n_fft)
    return 10 ** (-dynamic_range / 20)
def amplitude_to_db(spectrogram, min_amplitude=None, normalize_range=False, logger=None):
    """ Transform amplitude to dB with optional clipping and dynamic range normalization

    :param spectrogram:         Numpy array containing all amplitudes of a spectrogram
    :param min_amplitude:       Clip the spectrogram to the minimal permitted amplitude value
    :param normalize_range:     If True, map the db_dynamic_range to [0,1]
    :param logger:              arg to create logger object

    :return: spectrogram in dB
    :raises AssertionError: if normalize_range is True while min_amplitude has no value
    """
    # create logger object
    logger = get_logger_from_arg(logger)

    # min_amplitude must be given to normalize the dB dynamic range
    # checked up-front with an explicit raise so the validation survives "python -O"
    # and the exception carries the message that is logged
    if normalize_range and not min_amplitude:
        message = 'Asked for dynamic range normalization, but "min_amplitude" has no value'
        logger.error(message)
        raise AssertionError(message)

    # make sure amplitude bins are positive
    spectrogram = np.abs(spectrogram)
    if min_amplitude:
        # apply clipping (lower bound only)
        spectrogram = np.clip(spectrogram, a_min=min_amplitude, a_max=None)

    # transform to dB
    spectrogram = 20 * np.log10(spectrogram)

    if normalize_range:
        # compute dB dynamic range and map [-dynamic_range, 0] dB to [0, 1]
        dynamic_range = -20 * np.log10(min_amplitude)
        spectrogram = (spectrogram + dynamic_range) / dynamic_range
    return spectrogram
def denormalize_range(spectrogram, min_amplitude_used):
    """ Undo the [0, 1] mapping of a dB spectrogram and restore its original dB dynamic range

    :param spectrogram:         Numpy array containing all amplitudes of a spectrogram in dB (values between 0 and 1)
    :param min_amplitude_used:  Minimal amplitude value that was used to normalize the dB spectrogram dynamic range

    :return: spectrogram in dB with its range de-normalized
    """
    # width (in dB) of the range that was mapped to [0, 1]
    db_span = -20 * np.log10(min_amplitude_used)
    # 0 maps back to -db_span and 1 maps back to 0 dB
    return spectrogram * db_span - db_span
def db_to_amplitude(spectrogram):
    """ Convert a spectrogram expressed in dB back to linear amplitude

    :param spectrogram:     Numpy array containing the values of a spectrogram in dB

    :return: spectrogram in amplitude value
    """
    # inverse of 20 * log10(amplitude)
    exponent = spectrogram / 20
    return 10 ** exponent
def linear_to_mel(linear_spectrogram, fs=None, n_mels=80, mel_fmin=0, mel_fmax=None,
                  normalize_mel_bins=True, logger=None):
    """ Project a linear-frequency spectrogram onto the mel scale

    :param linear_spectrogram:  Numpy array containing all amplitudes of a spectrogram -- shape = (n_fft // 2 + 1, T)
    :param fs:                  Sampling frequency expected by the algorithm
    :param n_mels:              Number of bins in the mel-spectrogram
    :param mel_fmin:            Lowest frequency in the mel-spectrum (Hz)
    :param mel_fmax:            Highest frequency in the mel-spectrum (Hz)
    :param normalize_mel_bins:  normalize energy per bins in the mel-spectrogram
    :param logger:              arg to create logger object

    :return: Numpy array containing the spectrogram in mel frequency space -- shape = (n_mels, T)
    """
    # the FFT size is recovered from the number of frequency rows (n_fft // 2 + 1)
    nb_freq_rows = linear_spectrogram.shape[0]
    filterbank = _get_mel_filterbank_matrix(n_fft=(nb_freq_rows - 1) * 2, fs=fs, n_mels=n_mels,
                                            mel_fmin=mel_fmin, mel_fmax=mel_fmax,
                                            normalize_mel_bins=normalize_mel_bins, logger=logger)
    # (n_mels, n_fft // 2 + 1) x (n_fft // 2 + 1, T)
    return np.dot(filterbank, linear_spectrogram)
def mel_to_linear(mel_spectrogram, fs, n_fft, mel_fmin=0, mel_fmax=None, normalize_mel_bins=False, logger=None):
    """ Convert a mel-spectrogram to a linear spectrogram

    Each row of the mel filter bank is normalized to sum to 1, then the transposed bank is used to
    spread every mel bin back onto the FFT bins.

    :param mel_spectrogram:     Numpy array of the input mel spectrogram -- shape = (n_mels, T)
    :param fs:                  sampling frequency
    :param n_fft:               number of samples used in the original FFT
    :param mel_fmin:            minimum frequency used when converting to mel
    :param mel_fmax:            maximum frequency used when converting to mel
    :param normalize_mel_bins:  normalize energy per bins in the mel-spectrogram
    :param logger:              arg to create logger object

    :return: Numpy array containing the spectrogram in linear frequency space -- shape = (n_fft // 2 + 1, T)
    """
    # find the number of mel components
    n_mels = mel_spectrogram.shape[0]

    # get filter parameters
    mel_basis = _get_mel_filterbank_matrix(n_fft=n_fft, fs=fs, n_mels=n_mels, mel_fmin=mel_fmin, mel_fmax=mel_fmax,
                                           normalize_mel_bins=normalize_mel_bins, logger=logger)

    # normalise the rows of the mel_basis
    # an empty filter has a zero sum: its row is kept at zero instead of dividing 0 by 0,
    # which would otherwise inject NaN into every bin of the reconstructed spectrogram
    weight_value = mel_basis.sum(axis=1).reshape(n_mels, 1)
    mel_basis = np.divide(mel_basis, weight_value, out=np.zeros_like(mel_basis), where=weight_value != 0)

    # apply the transposed (row-normalized) mel filter bank
    linear_spectrogram = np.dot(np.transpose(mel_spectrogram), mel_basis)
    return np.transpose(linear_spectrogram)
def extract_spectrogram(x, n_fft, step_size, real_amplitude=True, centered=True):
    """ Extract the FFT spectrogram from a series of samples

    :param x:               Numpy array of input samples -- shape = (T, )
    :param n_fft:           number of point in the FFT window
    :param step_size:       number of samples skipped at each extraction
    :param real_amplitude:  if True the value of the bins will be divided by n_fft to get bin magnitude that
                            reflect the temporal signal amplitude
    :param centered:        if True, the extraction window will be centered on the time step.
                            The time sequence is reflect-padded on both sides.

    :return: Numpy arrays of amplitude and phase of the spectrogram -- shapes = (n_fft // 2 + 1, L)
    """
    # create the sampling window
    # scipy.signal.hann was removed from recent SciPy releases, the window lives in scipy.signal.windows
    window = sps.windows.hann(n_fft)

    # zero-pad on the right when the signal is shorter than one analysis window
    if len(x) < n_fft:
        x = np.pad(x, (0, n_fft - len(x)), 'constant', constant_values=(0, 0))

    # pad before and after to center the window on the extracted values
    if centered:
        padding_left, padding_right = _get_padding_for_centered_spectrogram(n_fft=n_fft)
        x = np.pad(x, (padding_left, padding_right), mode='reflect')

    # count the number of frames
    # one frame less is extracted when the (padded) length is a multiple of step_size
    time_axis = (len(x) - n_fft) // step_size
    if len(x) % step_size != 0:
        time_axis += 1

    # create containers for spectrogram
    amp = np.zeros((n_fft // 2 + 1, time_axis))
    phase = np.zeros((n_fft // 2 + 1, time_axis))

    for i in range(time_axis):
        # windowed slice of data starting at this time step
        start = i * step_size
        win_data = np.multiply(x[start: start + n_fft], window)
        # one-sided FFT of the real frame
        freq = np.fft.rfft(win_data)
        # save magnitude and phase individually
        amp[:, i] = np.absolute(freq)
        phase[:, i] = np.angle(freq)

    # scale amplitude bins if necessary
    if real_amplitude:
        amp = amp / n_fft
    return amp, phase
def get_nb_spectrogram_samples(wav_length, n_fft, step_size, centered=True):
    """ Return the number of spectrogram time frames given a WAV segment

    The count is computed in closed form with the same rule extract_spectrogram applies to its input
    (zero-padding up to n_fft, optional centering padding, frame counting), so no signal has to be
    generated and the global NumPy random state is left untouched.

    :param wav_length:      number of samples in the WAV segment
    :param n_fft:           filter length (in samples) of the FFT
    :param step_size:       length (in samples) between successive analysis windows
    :param centered:        if True, assume that the FFT extraction window is centered on the time step

    :return: the number of spectrogram time frames
    """
    # a segment shorter than one window is zero-padded up to n_fft samples
    padded_length = max(wav_length, n_fft)

    # centering pads the segment on the left and right sides
    if centered:
        padding_left, padding_right = _get_padding_for_centered_spectrogram(n_fft=n_fft)
        padded_length += padding_left + padding_right

    # one frame less is extracted when the padded length is a multiple of step_size
    nb_frames = (padded_length - n_fft) // step_size
    if padded_length % step_size != 0:
        nb_frames += 1
    return nb_frames
def get_nb_wav_samples(spectrogram_length, n_fft, step_size, centered=True):
    """ Return the number of WAV samples given a spectrogram segment

    :param spectrogram_length:  number of time frames in the spectrogram segment
    :param n_fft:               filter length (in samples) of the FFT
    :param step_size:           length (in samples) between successive analysis windows
    :param centered:            if True, assume that the FFT extraction window is centered on the time step

    :return: the number of WAV samples
    """
    # a centered extraction padded the audio on both sides, that padding is not part of the WAV segment
    padding_left, padding_right = (
        _get_padding_for_centered_spectrogram(n_fft=n_fft) if centered else (0, 0)
    )
    # samples spanned by the analysis windows: (frames - 1) hops plus one full window
    covered_samples = (spectrogram_length - 1) * step_size + n_fft
    return covered_samples - padding_left - padding_right
def reconstruct_signal_griffin_lim(magnitude_spectrogram, step_size, iterations=30, logger=None):
    """ Reconstruct an audio signal from a magnitude spectrogram

    Given a magnitude spectrogram as input, reconstruct the audio signal and return it using
    the Griffin-Lim algorithm
    From the paper: "Signal estimation from modified short-time fourier transform" by Griffin and Lim, in IEEE
    transactions on Acoustics, Speech, and Signal Processing. Vol ASSP-32, No. 2, April 1984.

    :param magnitude_spectrogram:   Numpy array magnitude spectrogram -- shape = (n_fft // 2 + 1, T)
                                    The rows correspond to frequency bins and the columns correspond to time slices
    :param step_size:               length (in samples) between successive analysis windows
    :param iterations:              Number (integer) of iterations for the Griffin-Lim algorithm
                                    Typically a few hundred is sufficient
    :param logger:                  arg to create logger object

    :return: the reconstructed time domain signal as a 1-dim Numpy array and the complex spectrogram that was
             used to produce the signal -- shape = (T, n_fft // 2 + 1)
             When iterations <= 0, the signal is the initial noise and the spectrogram is the input magnitude
             with zero phase
    """
    # create logger object
    logger = get_logger_from_arg(logger)

    # shape = (T, n_fft // 2 + 1)
    magnitude_spectrogram = np.transpose(magnitude_spectrogram)

    # find the number of samples used in the FFT window and extract the time steps
    n_fft = (magnitude_spectrogram.shape[1] - 1) * 2
    time_slices = magnitude_spectrogram.shape[0]

    # compute the number of samples needed
    len_samples = int(time_slices * step_size + n_fft)

    # initialize the reconstructed signal to noise
    x_reconstruct = np.random.randn(len_samples)
    window = np.hanning(n_fft)

    # quantities that do not change across iterations
    frame_starts = range(0, len_samples - n_fft, step_size)
    overlap_gain = n_fft / step_size / 2

    # defined before the loop so that the return value exists even when no iteration is run
    proposal_spectrogram = magnitude_spectrogram.astype(complex)

    for iteration in range(1, iterations + 1):
        # analyse the current estimate
        reconstruction_spectrogram = np.array([np.fft.rfft(window * x_reconstruct[start: start + n_fft])
                                               for start in frame_starts])

        # Discard magnitude part of the reconstruction and use the supplied magnitude spectrogram instead
        proposal_spectrogram = magnitude_spectrogram * np.exp(1.0j * np.angle(reconstruction_spectrogram))

        # store previous reconstructed signal and create a new one by iFFT + overlap-add
        prev_x = x_reconstruct
        x_reconstruct = np.zeros(len_samples)
        for frame, start in zip(proposal_spectrogram, frame_starts):
            x_reconstruct[start: start + n_fft] += window * np.real(np.fft.irfft(frame))

        # normalise signal due to overlap add
        x_reconstruct = x_reconstruct / overlap_gain

        # compute diff between two signals and report progress
        diff = np.sqrt(np.mean((x_reconstruct - prev_x) ** 2))
        logger.debug(f'Reconstruction iteration: {iteration}/{iterations} -- RMSE: {diff * 1e6:.3f}e-6')

    return x_reconstruct, proposal_spectrogram
def _get_padding_for_centered_spectrogram(n_fft):
    """ Compute the left/right padding required to extract a centered FFT from a series of samples

    :param n_fft:   filter length (in samples) of the FFT

    :return: padding values for left and right sides
    """
    # half a window on each side, the two values are always identical
    half_window = int(n_fft // 2)
    return half_window, half_window
def _get_mel_filterbank_matrix(n_fft=None, fs=None, n_mels=80, mel_fmin=0, mel_fmax=None,
                               normalize_mel_bins=False, logger=None):
    """ Create a Filterbank matrix to combine FFT bins into Mel-frequency bins

    :param n_fft:               number of FFT components
    :param fs:                  sampling rate of the incoming signal
    :param n_mels:              number of Mel bands to generate
    :param mel_fmin:            lowest frequency (in Hz)
    :param mel_fmax:            highest frequency (in Hz). If None, mel_fmax = fs / 2.0
    :param normalize_mel_bins:  normalize energy per bins
    :param logger:              arg to create logger object

    :return: np.ndarray [shape=(n_mels, 1 + n_fft // 2)] -- Mel transform matrix
    """
    # create logger object (built once and reused for the empty-filter warning below)
    logger = get_logger_from_arg(logger)

    # set mel_fmax
    if mel_fmax is None:
        mel_fmax = float(fs) / 2

    # Initialize the weights
    nb_fft_bins = int(1 + n_fft // 2)
    weights = np.zeros((int(n_mels), nb_fft_bins))

    # Get the center frequencies of each FFT bin
    fft_freqs = np.linspace(0, float(fs) / 2, nb_fft_bins, endpoint=True)

    # 'Center freqs' of mel bands - uniformly spaced between limits
    # n_mels + 2 points: each triangular filter uses its two neighbours as lower and upper edges
    min_mel = _hz_to_mel(mel_fmin)
    max_mel = _hz_to_mel(mel_fmax)
    mels = np.linspace(min_mel, max_mel, n_mels + 2)
    mel_f = _mel_to_hz(mels)

    fdiff = np.diff(mel_f)
    ramps = np.subtract.outer(mel_f, fft_freqs)

    for i in range(n_mels):
        # lower and upper slopes for all bins
        lower = -ramps[i] / fdiff[i]
        upper = ramps[i + 2] / fdiff[i + 1]
        # then intersect them with each other and zero
        weights[i] = np.maximum(0, np.minimum(lower, upper))

    if normalize_mel_bins:  # Normalize energy per bins
        # Slaney-style mel is scaled to be approx constant energy per channel
        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
        weights *= enorm[:, np.newaxis]

    # A filter whose lower edge is not 0 Hz and whose weights are all zero is an empty channel
    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
        logger.warning('Empty filters detected in mel frequency basis. Some channels will produce empty responses. '
                       'Try increasing your sampling rate (and fmax) or reducing n_mels.')

    return weights
def _hz_to_mel(frequencies):
    """ Map frequencies in Hz onto the mel scale (linear below 1000 Hz, logarithmic above)

    :param frequencies:     number or np.ndarray [shape=(n,)] -- scalar or array of frequencies

    :return: number or np.ndarray [shape=(n,)] -- input frequencies in Mels
    """
    frequencies = np.asanyarray(frequencies)

    # constants of the scale
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    log_step = np.log(6.4) / 27.0  # step size for log region

    # the linear mapping is computed for every value first
    mels = (frequencies - f_min) / f_sp

    if not frequencies.ndim:
        # scalar data: check directly whether the value sits in the log region
        if frequencies >= min_log_hz:
            return min_log_mel + np.log(frequencies / min_log_hz) / log_step
        return mels

    # array data: overwrite only the entries that belong to the log region
    in_log_region = (frequencies >= min_log_hz)
    mels[in_log_region] = min_log_mel + np.log(frequencies[in_log_region] / min_log_hz) / log_step
    return mels
def _mel_to_hz(mels):
    """ Map mel bin numbers back to frequencies in Hz (linear below 1000 Hz, exponential above)

    :param mels:    number or np.ndarray [shape=(n,)] -- scalar or array of mel bins to convert

    :return: number or np.ndarray [shape=(n,)] -- input mels in Hz
    """
    mels = np.asanyarray(mels)

    # constants of the scale
    f_min = 0.0
    f_sp = 200.0 / 3
    min_log_hz = 1000.0  # beginning of log region (Hz)
    min_log_mel = (min_log_hz - f_min) / f_sp  # same (Mels)
    log_step = np.log(6.4) / 27.0  # step size for log region

    # the linear mapping is computed for every value first
    freqs = f_min + f_sp * mels

    if not mels.ndim:
        # scalar data: check directly whether the value sits in the nonlinear region
        if mels >= min_log_mel:
            return min_log_hz * np.exp(log_step * (mels - min_log_mel))
        return freqs

    # vector data: overwrite only the entries that belong to the nonlinear region
    in_log_region = (mels >= min_log_mel)
    freqs[in_log_region] = min_log_hz * np.exp(log_step * (mels[in_log_region] - min_log_mel))
    return freqs
def pre_emphasis_on_mel(mel_spec, preemph, fs, n_mels, mel_fmin=0, mel_fmax=None, min_amplitude=None,
                        normalized_range=True, logger=''):
    """ Add the dB magnitude response of a pre-emphasis filter to a mel-spectrogram

    The response of the FIR filter y[n] = x[n] - preemph * x[n - 1] is evaluated at the center frequency
    of each mel band and added to every time frame of the mel-spectrogram.

    :param mel_spec:            Numpy array of the mel-spectrogram in dB -- expected shape = (n_mels, T)
    :param preemph:             pre-emphasis coefficient
    :param fs:                  sampling frequency
    :param n_mels:              number of mel bands
    :param mel_fmin:            minimum frequency used when converting to mel
    :param mel_fmax:            maximum frequency used when converting to mel. If None, mel_fmax = fs / 2
    :param min_amplitude:       minimal amplitude value that was used to normalize the dB dynamic range
                                Required when normalized_range is True
    :param normalized_range:    If True, the filter response is divided by the dB dynamic range before being added
    :param logger:              arg to create logger object

    :return: Numpy array of the mel-spectrogram with the filter response added -- same shape as mel_spec
    """
    logger = get_logger_from_arg(logger)

    # set mel_fmax
    if mel_fmax is None:
        mel_fmax = float(fs) / 2

    #### get the center frequency of all bins in the mel spectrum ####
    # 'Center freqs' of mel bands - uniformly spaced between limits
    min_mel = _hz_to_mel(mel_fmin)
    max_mel = _hz_to_mel(mel_fmax)
    mels = np.linspace(min_mel, max_mel, n_mels + 2)
    bin_freqs = _mel_to_hz(mels)

    #### get the frequency response of the filter ####
    # evaluated at the n_mels band centers only (the two outer edges are dropped)
    a = [1]
    b = [1, -preemph]
    w, h = sps.freqz(b=b, a=a, worN=bin_freqs[1:-1], fs=fs)

    #### apply filter to bins ####
    h = 20 * np.log10(np.abs(h))  # get the filter response in dB
    h = np.tile(np.expand_dims(h, axis=1), (1, mel_spec.shape[1]))  # repeat the response for every time frame

    # if range was normalized
    # NOTE(review): the magic-number correction is applied inside this branch because its values are on the
    # scale of the normalized response -- confirm against the original layout
    if normalized_range:
        dbr = -20 * np.log10(min_amplitude)
        # normalize filter
        h = h / dbr

        # Crazy empirical correction hack with magic numbers
        if min_amplitude == 1e-5 and preemph == 0.97:
            correction_matrix = np.log(w) / 30 - 0.277
            correction_matrix = np.tile(np.expand_dims(correction_matrix, axis=1), (1, h.shape[1]))
            h = h - correction_matrix
        else:
            # Logger.warn is a deprecated alias that no longer exists in recent Python versions
            logger.warning("You should probably compute a correction matrix for this config to compensate for the cliping.")

    return np.add(mel_spec, h)