Spaces:
Running
Running
| """ | |
| | Description: libf0 salience-based F0 estimation implementation | |
| | Author: Sebastian Rosenzweig, Simon Schwär, Meinard Müller | |
| | License: The MIT license, https://opensource.org/licenses/MIT | |
| | This file is part of libf0. | |
| """ | |
| import numpy as np | |
| from librosa import stft | |
| from scipy import ndimage, linalg | |
| from numba import njit | |
def salience(x, Fs=22050, N=2048, H=256, F_min=55.0, F_max=1760.0, R=10.0, num_harm=10, freq_smooth_len=11,
             alpha=0.9, gamma=0.0, constraint_region=None, tol=5, score_low=0.01, score_high=1.0):
    """
    Implementation of a salience-based F0-estimation algorithm using pitch contours, inspired by Melodia.

    .. [#] Justin Salamon and Emilia Gómez,
       "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics."
       IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012.

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    N : int
        Window size
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : float or int
        Frequency resolution given in cents (Default value = 10.0)
    num_harm : int
        Number of harmonics (Default value = 10)
    freq_smooth_len : int
        Filter length for vertical smoothing (Default value = 11)
    alpha : float
        Weighting parameter for harmonics (Default value = 0.9)
    gamma : float
        Value forwarded unchanged as ``gamma`` to ``compute_salience_rep`` (Default value = 0.0)
    constraint_region : None or ndarray
        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
        (Default value = None)
    tol : int
        Tolerance parameter for transition matrix (Default value = 5)
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    f0 : ndarray
        Estimated F0-trajectory; frames whose trajectory index is -1 are set to 0
    T_coef : ndarray
        Time axis, computed as frame index * H / Fs
    sal : ndarray
        Salience value of estimated F0, normalized per frame by the frame's maximum salience;
        0 for frames without an F0 estimate and for frames whose salience column is all zeros

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # compute salience representation via instantaneous frequency and harmonic summation
    Z, F_coef_hertz = compute_salience_rep(x, Fs, N=N, H=H, F_min=F_min, F_max=F_max, R=R,
                                           num_harm=num_harm, freq_smooth_len=freq_smooth_len,
                                           alpha=alpha, gamma=gamma)

    # compute trajectory via dynamic programming
    T_coef = (np.arange(Z.shape[1]) * H) / Fs
    index_CR = compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region,
                                     tol=tol, score_low=score_low, score_high=score_high)

    # index -1 would silently pick the last frequency bin, so those frames are overwritten with 0
    traj = F_coef_hertz[index_CR]
    traj[index_CR == -1] = 0

    # compute salience value, normalized by the per-frame maximum
    Z_max = np.max(Z, axis=0)
    # frames with an all-zero salience column would produce 0/0 = NaN; keep them at 0 instead
    Z_norm = np.divide(Z, Z_max, out=np.zeros(Z.shape, dtype=float), where=Z_max != 0)
    sal = Z_norm[index_CR, np.arange(Z.shape[1])]
    sal[traj == 0] = 0

    return traj, T_coef, sal
def compute_salience_rep(x, Fs, N, H, F_min, F_max, R, num_harm, freq_smooth_len, alpha, gamma):
    """
    Compute salience representation [FMP, Eq. (8.56)]

    The signal is transformed with an STFT, binned onto a log-frequency axis,
    smoothed along the frequency axis with a Hann kernel and finally passed to
    the harmonic summation.

    Parameters
    ----------
    x : ndarray
        Audio signal
    Fs : int
        Sampling rate
    N : int
        Window size (used as both FFT size and window length)
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : int
        Frequency resolution given in cents
    num_harm : int
        Number of harmonics
    freq_smooth_len : int
        Filter length for vertical smoothing
    alpha : float
        Weighting parameter for harmonics
    gamma : float
        Passed as ``gamma`` to ``compute_salience_from_logfreq_spec``

    Returns
    -------
    Z : ndarray
        Salience representation
    F_coef_hertz : ndarray
        Frequency axis in Hz

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # complex spectrogram
    spectrogram = stft(x, n_fft=N, hop_length=H, win_length=N, pad_mode='constant')

    # log-frequency binning based on instantaneous frequency
    logfreq_spec, F_coef_hertz = compute_y_lf_if_bin_eff(spectrogram, Fs, N, H, F_min, F_max, R)

    # smoothing along the frequency axis (axis 0)
    smoothing_kernel = np.hanning(freq_smooth_len)
    smoothed_spec = ndimage.convolve1d(logfreq_spec, smoothing_kernel, axis=0, mode='constant')

    # harmonic summation; beta is fixed to 1 here
    Z = compute_salience_from_logfreq_spec(smoothed_spec, R, n_harmonics=num_harm,
                                           alpha=alpha, beta=1, gamma=gamma)
    return Z, F_coef_hertz
def compute_y_lf_if_bin_eff(X, Fs, N, H, F_min, F_max, R):
    """
    Binned Log-frequency Spectrogram with variable frequency resolution based on instantaneous frequency,
    more efficient implementation than FMP

    Parameters
    ----------
    X : ndarray
        Complex spectrogram
    Fs : int
        Sampling rate in Hz
    N : int
        Window size
    H : int
        Hop size
    F_min : float or int
        Minimal frequency
    F_max : float or int
        Maximal frequency
    R : int
        Frequency resolution given in cents

    Returns
    -------
    Y_LF_IF_bin : ndarray
        Binned log-frequency spectrogram using instantaneous frequency (shape: [freq, time])
    F_coef_hertz : ndarray
        Frequency axis in Hz
    """
    two_pi = 2 * np.pi

    # number of bins on the log-frequency axis and their center frequencies
    B = frequency_to_bin_index(F_max, R, F_min) + 1
    F_coef_hertz = F_min * np.power(2, (np.arange(0, B) * R / 1200))

    # center frequency of every STFT bin in rad (column vector, one row per bin)
    bin_idx = np.arange(X.shape[0])[:, np.newaxis]
    omega = two_pi * bin_idx / N

    # heterodyned phase increment between consecutive frames
    phase = np.angle(X)
    hpi = (phase[:, 1:] - phase[:, :-1]) - omega * H

    # wrap into the -pi:pi range; rounding is used instead of the (slower) modulo formulation
    hpi = hpi - two_pi * (np.around((hpi / two_pi) + 1) - 1)

    # instantaneous frequencies in Hz
    inst_f = (omega + hpi / H) * Fs / two_pi

    # duplicate the first frame so that inst_f has as many frames as X
    inst_f = np.hstack((inst_f[:, :1], inst_f))

    # zero out everything outside [F_min, F_max), then turn the zeros into nan
    # so they are recognisable after the bin assignment
    in_range = (inst_f >= F_min) & (inst_f < F_max)
    inst_f = inst_f * in_range
    inst_f[inst_f == 0] = np.nan

    # target bin for every coefficient; discarded values go to an extra bin B
    # that is cut off before returning
    bin_assignment = frequency_to_bin_index(inst_f, R, F_min)
    bin_assignment[np.isnan(inst_f)] = B

    # accumulate the power spectrogram into the bins, frame by frame
    power = np.abs(X) ** 2
    binned = np.zeros((B + 1, power.shape[1]))
    # iterating over the transposes yields column views, so np.add.at writes into `binned`
    for out_col, bins_col, power_col in zip(binned.T, bin_assignment.T, power.T):
        np.add.at(out_col, bins_col, power_col)

    return binned[:B, :], F_coef_hertz
def compute_salience_from_logfreq_spec(lf_spec, R, n_harmonics, alpha, beta, gamma, harmonic_win_len=11):
    """
    Compute salience representation using harmonic summation following [1]

    [1] J. Salamon and E. Gomez,
    "Melody Extraction From Polyphonic Music Signals Using Pitch Contour Characteristics."
    IEEE Transactions on Audio, Speech, and Language Processing, vol. 20, no. 6, pp. 1759–1770, Aug. 2012.

    Parameters
    ----------
    lf_spec : ndarray
        (F, T) log-spectrogram
    R : int
        Frequency resolution given in cents
    n_harmonics : int
        Number of harmonics
    alpha : float
        Weighting parameter for harmonics; harmonic number ``idx`` (0-based) is weighted by ``alpha**idx``
    beta : float
        Compression parameter for spectrogram magnitudes (applied as an exponent)
    gamma : float
        Magnitude threshold; an entry ``v`` is kept where ``20*log10(v/max + eps) < gamma``
    harmonic_win_len : int
        Length of a frequency weighting window in bins. The window is symmetric with
        ``2 * (harmonic_win_len // 2) + 1`` samples, i.e. even values use the next odd length.

    Returns
    -------
    Z : ndarray
        (F, T) salience representation of the input spectrogram
    """
    # magnitude thresholding and compression
    # NOTE(review): with this comparison direction the global maximum itself fails the
    # test for gamma = 0 (log10(1 + eps) > 0) and is zeroed - confirm this is intended.
    eps = np.finfo(np.float32).eps
    threshold_mask = (20 * np.log10(lf_spec / np.max(lf_spec) + eps)) < gamma
    lf_spec = lf_spec ** beta * threshold_mask

    # compute window; its true length is always odd, so use that length everywhere below
    # (slicing with an even harmonic_win_len would not match the window and fail to broadcast)
    max_diff_bins = harmonic_win_len // 2
    win_len = 2 * max_diff_bins + 1
    window = np.cos(np.linspace(-1, 1, win_len) * np.pi / 2) ** 2  # cosine^2 window

    # compute indices of harmonics (offset in bins of each harmonic above the fundamental)
    harmonics = np.round(np.log2(np.arange(1, n_harmonics + 1)) * 1200 / R).astype(int)

    # compute weights
    weighting_vec = np.zeros(lf_spec.shape[0] + max_diff_bins)
    for idx, h in enumerate(harmonics):
        if h + win_len > len(weighting_vec):
            break  # we reached the maximum length available
        weighting_vec[h:h + win_len] += window * alpha ** idx

    # correlate lf_spec with the weighting vector on the frequency axis
    Z = ndimage.correlate1d(lf_spec, weighting_vec,
                            axis=0, mode='constant', cval=0,
                            origin=(-len(weighting_vec)) // 2 + max_diff_bins)

    # magnitude thresholding and compression (same rule as above, applied to the salience)
    threshold_mask = (20 * np.log10(Z / np.max(Z) + eps)) < gamma
    Z = Z ** beta * threshold_mask

    return Z
def define_transition_matrix(B, tol=0, score_low=0.01, score_high=1.0):
    """
    Generate transition matrix for dynamic programming

    The matrix is a symmetric Toeplitz matrix: entries whose row and column index
    differ by at most ``tol`` get ``score_high``, all others ``score_low``.

    Parameters
    ----------
    B : int
        Number of bins
    tol : int
        Tolerance parameter for transition matrix (Default value = 0).
        Values of ``tol >= B`` are allowed and yield a matrix filled with ``score_high``.
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    T : ndarray
        (B, B) Transition matrix

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    col = np.full(B, score_low, dtype=float)
    # clamp the high-score band to the matrix size so that small matrices (B <= tol) do not fail
    n_high = min(max(tol + 1, 0), B)
    col[:n_high] = score_high
    return linalg.toeplitz(col)
def compute_trajectory_dp(Z, T):
    """
    Trajectory tracking using dynamic programming

    Works in the log domain: the score of a path is the sum of log-salience values
    along the path plus the log-transition scores between consecutive frames.

    Parameters
    ----------
    Z : ndarray
        (B, N) Salience representation
    T : ndarray
        (B, B) Transition matrix

    Returns
    -------
    eta_DP : ndarray
        Trajectory indices (int64, length N); empty if Z has no frames

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    B, N = Z.shape
    if N == 0:
        # nothing to track; avoids indexing the last frame of an empty matrix
        return np.zeros(0, dtype=np.int64)

    # small offset keeps the logarithm finite for zero entries
    eps_machine = np.finfo(np.float32).eps
    Z_log = np.log(Z + eps_machine)
    T_log = np.log(T + eps_machine)

    D = np.zeros((B, N))                      # accumulated log-scores
    E = np.zeros((B, N - 1), dtype=np.int64)  # backpointers: best predecessor bin per (bin, frame)
    D[:, 0] = Z_log[:, 0]

    for n in range(1, N):
        # scores[b, j] = T_log[b, j] + D[j, n-1], evaluated for all target bins b at once
        scores = T_log + D[:, n - 1]
        E[:, n - 1] = np.argmax(scores, axis=1)
        D[:, n] = np.max(scores, axis=1) + Z_log[:, n]

    # backtracking from the best final bin
    eta_DP = np.zeros(N, dtype=np.int64)
    eta_DP[N - 1] = np.argmax(D[:, N - 1])
    for n in range(N - 2, -1, -1):
        eta_DP[n] = E[eta_DP[n + 1], n]

    return eta_DP
def compute_trajectory_cr(Z, T_coef, F_coef_hertz, constraint_region=None,
                          tol=5, score_low=0.01, score_high=1.0):
    """
    Trajectory tracking with constraint regions

    Without constraint regions the whole salience representation is tracked.
    Otherwise each region is tracked separately inside its time/frequency
    rectangle and frames not covered by any region stay at -1.

    Notebook: C8/C8S2_FundFreqTracking.ipynb

    Parameters
    ----------
    Z : ndarray
        Salience representation
    T_coef : ndarray
        Time axis
    F_coef_hertz : ndarray
        Frequency axis in Hz
    constraint_region : ndarray or None
        Constraint regions, row-format: (t_start_sec, t_end_sec, f_start_hz, f_end_hz)
        (Default value = None)
    tol : int
        Tolerance parameter for transition matrix (Default value = 5)
    score_low : float
        Score (low) for transition matrix (Default value = 0.01)
    score_high : float
        Score (high) for transition matrix (Default value = 1.0)

    Returns
    -------
    eta : ndarray
        Trajectory indices, unvoiced frames are indicated with -1

    See also
    --------
    [FMP] Notebook: C8/C8S2_FundFreqTracking.ipynb
    """
    # unconstrained case: track over the full representation
    if constraint_region is None:
        T = define_transition_matrix(Z.shape[0], tol=tol, score_low=score_low, score_high=score_high)
        return compute_trajectory_dp(Z, T)

    def nearest_index(axis, value):
        # index of the axis entry closest to value
        return np.argmin(np.abs(axis - value))

    # contour initialised as unvoiced (-1) everywhere
    eta = np.full(len(T_coef), -1)

    for row_idx in range(constraint_region.shape[0]):
        # region bounds: times in sec, frequencies in Hz
        t_start, t_end, f_start, f_end = (constraint_region[row_idx, c] for c in range(4))

        # convert bounds to indices on the time and frequency axes
        t_lo = nearest_index(T_coef, t_start)
        t_hi = nearest_index(T_coef, t_end)
        f_lo = nearest_index(F_coef_hertz, f_start)
        f_hi = nearest_index(F_coef_hertz, f_end)

        # track inside the (inclusive) rectangle of the salience representation
        region_Z = Z[f_lo:f_hi + 1, t_lo:t_hi + 1]
        T = define_transition_matrix(region_Z.shape[0], tol=tol,
                                     score_low=score_low, score_high=score_high)
        region_eta = compute_trajectory_dp(region_Z, T)

        # region-local bin indices are shifted back to global bin indices
        eta[t_lo:t_hi + 1] = f_lo + region_eta

    return eta
def frequency_to_bin_index(F, R=10.0, F_ref=55.0):
    """
    Binning function with variable frequency resolution
    Note: Indexing starts with 0 (opposed to [FMP, Eq. (8.49)])

    Parameters
    ----------
    F : float or array_like
        Frequency in Hz
    R : float
        Frequency resolution in cents (Default value = 10.0)
    F_ref : float
        Reference frequency in Hz (Default value = 55.0)

    Returns
    -------
    bin_index : int64 or ndarray of int64
        Index for bin (starting with index 0); ``F_ref`` itself maps to bin 0

    See also
    --------
    [FMP] Notebook: C8/C8S2_SalienceRepresentation.ipynb
    """
    # np.asarray lets plain sequences be used as well; scalars still yield a scalar result
    ratio = np.asarray(F) / F_ref
    # distance to F_ref in units of R cents, rounded to the nearest bin via floor(x + 0.5)
    bin_index = np.floor((1200 / R) * np.log2(ratio) + 0.5).astype(np.int64)
    return bin_index