Spaces:
Configuration error
Configuration error
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| """Constant-Q transforms""" | |
| import warnings | |
| import numpy as np | |
| from numba import jit | |
| from . import audio | |
| from .intervals import interval_frequencies | |
| from .fft import get_fftlib | |
| from .convert import cqt_frequencies, note_to_hz | |
| from .spectrum import stft, istft | |
| from .pitch import estimate_tuning | |
| from .._cache import cache | |
| from .. import filters | |
| from .. import util | |
| from ..util.exceptions import ParameterError | |
| from numpy.typing import DTypeLike | |
| from typing import Optional, Union, Collection, List | |
| from .._typing import _WindowSpec, _PadMode, _FloatLike_co, _ensure_not_reachable | |
| __all__ = ["cqt", "hybrid_cqt", "pseudo_cqt", "icqt", "griffinlim_cqt", "vqt"] | |
| # TODO: ivqt, griffinlim_vqt | |
def cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: Optional[str] = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the constant-Q transform of an audio signal.

    This implementation is based on the recursive sub-sampling method
    described by [#]_.  It is a thin wrapper around `vqt` with
    ``intervals="equal"`` and ``gamma=0``.

    .. [#] Schoerkhuber, Christian, and Anssi Klapuri.
        "Constant-Q transform toolbox for music processing."
        7th Sound and Music Computing Conference, Barcelona, Spain. 2010.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave
    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.
    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter.  This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.
    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.
    res_type : string
        The resampling mode for recursive downsampling.
    dtype : np.dtype
        The (complex) data type of the output array.  By default, this is
        inferred to match the numerical precision of the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t)]
        Constant-Q value for each frequency at each time.

    See Also
    --------
    vqt
    librosa.resample
    librosa.util.normalize

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    Generate and plot a constant-Q power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = np.abs(librosa.cqt(y, sr=sr))
    >>> fig, ax = plt.subplots()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax)
    >>> ax.set_title('Constant-Q power spectrum')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")

    Limit the frequency range

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                 n_bins=60))
    >>> C
    array([[6.830e-04, 6.361e-04, ..., 7.362e-09, 9.102e-09],
           [5.366e-04, 4.818e-04, ..., 8.953e-09, 1.067e-08],
           ...,
           [4.288e-02, 4.580e-01, ..., 1.529e-05, 5.572e-06],
           [2.965e-03, 1.508e-01, ..., 8.965e-06, 1.455e-05]])

    Using a higher frequency resolution

    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
    ...                 n_bins=60 * 2, bins_per_octave=12 * 2))
    >>> C
    array([[5.468e-04, 5.382e-04, ..., 5.911e-09, 6.105e-09],
           [4.118e-04, 4.014e-04, ..., 7.788e-09, 8.160e-09],
           ...,
           [2.780e-03, 1.424e-01, ..., 4.225e-06, 2.388e-05],
           [5.147e-02, 6.959e-02, ..., 1.694e-05, 5.811e-06]])
    """
    # The two settings that specialise the VQT into a CQT: equal-tempered
    # bin spacing and a zero bandwidth offset.
    cqt_specialisation = dict(intervals="equal", gamma=0)

    # Everything else is forwarded to vqt untouched.
    forwarded = dict(
        sr=sr,
        hop_length=hop_length,
        fmin=fmin,
        n_bins=n_bins,
        bins_per_octave=bins_per_octave,
        tuning=tuning,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        window=window,
        scale=scale,
        pad_mode=pad_mode,
        res_type=res_type,
        dtype=dtype,
    )

    return vqt(y=y, **cqt_specialisation, **forwarded)
def hybrid_cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the hybrid constant-Q transform of an audio signal.

    Here, the hybrid CQT uses the pseudo CQT for higher frequencies where
    the hop_length is longer than half the filter length and the full CQT
    for lower frequencies.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave
    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.
    filter_scale : float > 0
        Filter filter_scale factor. Larger values use longer windows.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter.  This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.
    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.
    res_type : string
        Resampling mode.  See `librosa.cqt` for details.
    dtype : np.dtype, optional
        The complex dtype to use for computing the CQT.
        By default, this is inferred to match the precision of
        the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
        Constant-Q energy for each frequency at each time.

    See Also
    --------
    cqt
    pseudo_cqt

    Notes
    -----
    This function caches at level 20.
    """
    # Default lowest bin is C1
    if fmin is None:
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    # Fold the tuning offset into fmin once; the sub-transforms below are
    # called with their default tuning of 0 so it is not applied twice.
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # Center frequencies and filter lengths for every requested bin
    freqs = cqt_frequencies(n_bins, fmin=fmin, bins_per_octave=bins_per_octave)
    alpha = __bpo_to_alpha(bins_per_octave)
    lengths, _ = filters.wavelet_lengths(
        freqs=freqs, sr=sr, filter_scale=filter_scale, window=window, alpha=alpha
    )

    # A filter is handled by the pseudo CQT when its length, rounded up to a
    # power of two, is still shorter than two hops.
    is_pseudo = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length
    n_bins_pseudo = int(np.sum(is_pseudo))
    n_bins_full = n_bins - n_bins_pseudo

    # Options common to both the pseudo and the full transform
    shared = dict(
        sr=sr,
        hop_length=hop_length,
        bins_per_octave=bins_per_octave,
        filter_scale=filter_scale,
        norm=norm,
        sparsity=sparsity,
        window=window,
        scale=scale,
        pad_mode=pad_mode,
        dtype=dtype,
    )

    # Components are collected from the highest band downwards, which is the
    # order __trim_stack consumes them in.
    components = []

    if n_bins_pseudo > 0:
        lowest_pseudo_freq = np.min(freqs[is_pseudo])
        components.append(
            pseudo_cqt(y, fmin=lowest_pseudo_freq, n_bins=n_bins_pseudo, **shared)
        )

    if n_bins_full > 0:
        full_response = cqt(
            y, fmin=fmin, n_bins=n_bins_full, res_type=res_type, **shared
        )
        # Only magnitudes are kept, to match the pseudo CQT output
        components.append(np.abs(full_response))

    # The output dtype follows the last component appended
    out_dtype = components[-1].dtype
    return __trim_stack(components, n_bins, out_dtype)
def pseudo_cqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the pseudo constant-Q transform of an audio signal.

    This uses a single fft size that is the smallest power of 2 that is greater
    than or equal to the max of:

        1. The longest CQT filter
        2. 2x the hop_length

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    hop_length : int > 0 [scalar]
        number of samples between successive CQT columns.
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave
    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.
    filter_scale : float > 0
        Filter filter_scale factor. Larger values use longer windows.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If ``True``, scale the CQT response by square-root the length of
        each channel's filter.  This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to
        ``norm=None`` in FFT.
    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.
    dtype : np.dtype, optional
        The complex data type for CQT calculations.
        By default, this is inferred to match the precision of the input signal.

    Returns
    -------
    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
        Pseudo Constant-Q energy for each frequency at each time.

    Notes
    -----
    This function caches at level 20.
    """
    # Fill in the defaults that depend on other arguments
    if fmin is None:
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Shift the lowest frequency by the tuning offset
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    center_freqs = cqt_frequencies(
        fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave
    )
    alpha = __bpo_to_alpha(bins_per_octave)

    filter_lengths, _ = filters.wavelet_lengths(
        freqs=center_freqs,
        sr=sr,
        window=window,
        filter_scale=filter_scale,
        alpha=alpha,
    )

    # A single frequency-domain basis is used for all bins; passing
    # hop_length lets the helper enlarge n_fft relative to the hop.
    basis, n_fft, _ = __vqt_filter_fft(
        sr,
        center_freqs,
        filter_scale,
        norm,
        sparsity,
        hop_length=hop_length,
        window=window,
        dtype=dtype,
        alpha=alpha,
    )

    # Phase is discarded on both the basis and the STFT (phase=False)
    basis_magnitude = np.abs(basis)

    # NOTE: the STFT analysis window is fixed to "hann" here; the ``window``
    # argument only shapes the basis filters above.
    C: np.ndarray = __cqt_response(
        y,
        n_fft,
        hop_length,
        basis_magnitude,
        pad_mode,
        window="hann",
        dtype=dtype,
        phase=False,
    )

    if scale:
        C /= np.sqrt(n_fft)
        return C

    # Unscaled output: weight each bin by sqrt(filter length / n_fft), after
    # aligning the per-bin lengths with the frequency axis of C.
    filter_lengths = util.expand_to(filter_lengths, ndim=C.ndim, axes=-2)
    C *= np.sqrt(filter_lengths / n_fft)
    return C
def icqt(
    C: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    bins_per_octave: int = 12,
    tuning: float = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    length: Optional[int] = None,
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the inverse constant-Q transform.

    Given a constant-Q transform representation ``C`` of an audio signal ``y``,
    this function produces an approximation ``y_hat``.

    Parameters
    ----------
    C : np.ndarray, [shape=(..., n_bins, n_frames)]
        Constant-Q representation as produced by `cqt`
    sr : number > 0 [scalar]
        sampling rate of the signal
    hop_length : int > 0 [scalar]
        number of samples between successive frames
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave
    tuning : float [scalar]
        Tuning offset in fractions of a bin.

        The minimum frequency of the CQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.
    filter_scale : float > 0 [scalar]
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If ``True``, scale the CQT response by square-root the length
        of each channel's filter. This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the CQT. This is analogous to ``norm=None``
        in FFT.
    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.
    res_type : string
        Resampling mode.
        See `librosa.resample` for supported modes.
    dtype : numeric type
        Real numeric type for ``y``.  Default is inferred to match the numerical
        precision of the input CQT.

    Returns
    -------
    y : np.ndarray, [shape=(..., n_samples), dtype=np.float]
        Audio time-series reconstructed from the CQT representation.

    See Also
    --------
    cqt
    librosa.resample

    Notes
    -----
    This function caches at level 40.

    Examples
    --------
    Using default parameters

    >>> y, sr = librosa.load(librosa.ex('trumpet'))
    >>> C = librosa.cqt(y=y, sr=sr)
    >>> y_hat = librosa.icqt(C=C, sr=sr)

    Or with a different hop length and frequency resolution:

    >>> hop_length = 256
    >>> bins_per_octave = 12 * 3
    >>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave,
    ...                 bins_per_octave=bins_per_octave)
    >>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length,
    ...                 bins_per_octave=bins_per_octave)
    """
    if fmin is None:
        fmin = note_to_hz("C1")

    # Shift the lowest frequency by the tuning offset
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # The bin count comes from the input; octaves are rounded up so that a
    # partial octave still gets its own pass.
    n_bins = C.shape[-2]
    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))

    freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)
    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, _ = filters.wavelet_lengths(
        freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
    )

    # When a target length is given, drop frames beyond what is needed to
    # cover ``length`` samples plus the longest filter.
    if length is not None:
        n_frames = int(np.ceil((length + max(lengths)) / hop_length))
        C = C[..., :n_frames]

    # Per-bin factor used when scale=True
    C_scale = np.sqrt(lengths)

    # Build the per-octave (rate, hop) schedule starting from the full-rate
    # top octave and working downwards; the rate and hop are halved for each
    # lower octave for as long as the hop stays even.
    rates = [sr]
    strides = [hop_length]
    for _ in range(n_octaves - 1):
        next_sr, next_hop = rates[-1], strides[-1]
        if next_hop % 2 == 0:
            next_sr = next_sr * 0.5
            next_hop = next_hop // 2
        rates.append(next_sr)
        strides.append(next_hop)

    # Octaves are processed lowest-first, so flip the schedule
    rates.reverse()
    strides.reverse()

    y: Optional[np.ndarray] = None

    for octave, (my_sr, my_hop) in enumerate(zip(rates, strides)):
        # Bins belonging to this octave; the last octave may be partial
        first_bin = bins_per_octave * octave
        n_filters = min(bins_per_octave, n_bins - first_bin)
        sl = slice(first_bin, first_bin + n_filters)

        fft_basis, n_fft, _ = __vqt_filter_fft(
            my_sr,
            freqs[sl],
            filter_scale,
            norm,
            sparsity,
            window=window,
            dtype=dtype,
            alpha=alpha,
        )

        # Conjugate-transpose of the basis, as a dense matrix
        inv_basis = fft_basis.H.todense()

        # Reciprocal of each filter's summed squared magnitude ...
        freq_power = 1 / np.sum(util.abs2(np.asarray(inv_basis)), axis=0)

        # ... compensated for the length normalization of the forward transform
        freq_power *= n_fft / lengths[sl]

        C_oct = C[..., sl, :]

        # Project the CQT coefficients of this octave back onto STFT bins
        if not scale:
            D_oct = np.einsum(
                "fc,c,...ct->...ft", inv_basis, freq_power, C_oct, optimize=True
            )
        else:
            # scale=True ==> additionally re-scale by sqrt(lengths)
            D_oct = np.einsum(
                "fc,c,c,...ct->...ft",
                inv_basis,
                C_scale[sl],
                freq_power,
                C_oct,
                optimize=True,
            )

        y_oct = istft(D_oct, window="ones", hop_length=my_hop, dtype=dtype)

        # Bring the octave back to the output rate; only the ratio between
        # orig_sr and target_sr is supplied here.
        y_oct = audio.resample(
            y_oct,
            orig_sr=1,
            target_sr=sr // my_sr,
            res_type=res_type,
            scale=False,
            fix=False,
        )

        # Accumulate octaves; the first one defines the output buffer
        if y is None:
            y = y_oct
        else:
            y[..., : y_oct.shape[-1]] += y_oct

    # make mypy happy
    assert y is not None

    if length:
        y = util.fix_length(y, size=length)

    return y
def vqt(
    y: np.ndarray,
    *,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    n_bins: int = 84,
    intervals: Union[str, Collection[float]] = "equal",
    gamma: Optional[float] = None,
    bins_per_octave: int = 12,
    tuning: Optional[float] = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: Optional[str] = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
) -> np.ndarray:
    """Compute the variable-Q transform of an audio signal.

    This implementation is based on the recursive sub-sampling method
    described by [#]_.

    .. [#] Schörkhuber, Christian, Anssi Klapuri, Nicki Holighaus, and Monika Dörfler.
        "A Matlab toolbox for efficient perfect reconstruction time-frequency
        transforms with log-frequency resolution."
        In Audio Engineering Society Conference: 53rd International Conference: Semantic Audio.
        Audio Engineering Society, 2014.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)]
        audio time series. Multi-channel is supported.
    sr : number > 0 [scalar]
        sampling rate of ``y``
    hop_length : int > 0 [scalar]
        number of samples between successive VQT columns.
    fmin : float > 0 [scalar]
        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`
    n_bins : int > 0 [scalar]
        Number of frequency bins, starting at ``fmin``
    intervals : str or array of floats in [1, 2)
        Either a string specification for an interval set, e.g.,
        `'equal'`, `'pythagorean'`, `'ji3'`, etc. or an array of
        intervals expressed as numbers between 1 and 2.
        If an array is given, its length overrides ``bins_per_octave``.

        .. seealso:: librosa.interval_frequencies
    gamma : number >= 0 [scalar] or None
        Bandwidth offset for determining filter lengths.

        If ``gamma=0``, produces the constant-Q transform.

        If ``gamma=None``, gamma will be calculated such that filter bandwidths are equal to a
        constant fraction of the equivalent rectangular bandwidths (ERB). This is accomplished
        by solving for the gamma which gives::

            B_k = alpha * f_k + gamma = C * ERB(f_k),

        where ``B_k`` is the bandwidth of filter ``k`` with center frequency ``f_k``, alpha
        is the inverse of what would be the constant Q-factor, and ``C = alpha / 0.108`` is the
        constant fraction across all filters.

        Here we use ``ERB(f_k) = 24.7 + 0.108 * f_k``, the best-fit curve derived
        from experimental data in [#]_.

        .. [#] Glasberg, Brian R., and Brian CJ Moore.
            "Derivation of auditory filter shapes from notched-noise data."
            Hearing research 47.1-2 (1990): 103-138.
    bins_per_octave : int > 0 [scalar]
        Number of bins per octave
    tuning : None or float
        Tuning offset in fractions of a bin.

        If ``None``, tuning will be automatically estimated from the signal.

        The minimum frequency of the resulting VQT will be modified to
        ``fmin * 2**(tuning / bins_per_octave)``.
    filter_scale : float > 0
        Filter scale factor. Small values (<1) use shorter windows
        for improved time resolution.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the VQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.

        Set ``sparsity=0`` to disable sparsification.
    window : str, tuple, number, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If ``True``, scale the VQT response by square-root the length of
        each channel's filter.  This is analogous to ``norm='ortho'`` in FFT.

        If ``False``, do not scale the VQT. This is analogous to
        ``norm=None`` in FFT.
    pad_mode : string
        Padding mode for centered frame analysis.

        See also: `librosa.stft` and `numpy.pad`.
    res_type : string
        The resampling mode for recursive downsampling.
        Passing ``None`` is deprecated; it emits a ``FutureWarning`` and
        falls back to ``"soxr_hq"``.
    dtype : np.dtype
        The dtype of the output array.  By default, this is inferred to match the
        numerical precision of the input signal.

    Returns
    -------
    VQT : np.ndarray [shape=(..., n_bins, t), dtype=np.complex]
        Variable-Q value for each frequency at each time.

    Raises
    ------
    ParameterError
        If the filter cutoff of the wavelet basis exceeds the Nyquist
        frequency ``sr / 2``.

    See Also
    --------
    cqt

    Notes
    -----
    This function caches at level 20.

    Examples
    --------
    Generate and plot a variable-Q power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
    >>> C = np.abs(librosa.cqt(y, sr=sr))
    >>> V = np.abs(librosa.vqt(y, sr=sr))
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
    ...                          sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[0])
    >>> ax[0].set(title='Constant-Q power spectrum', xlabel=None)
    >>> ax[0].label_outer()
    >>> img = librosa.display.specshow(librosa.amplitude_to_db(V, ref=np.max),
    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[1])
    >>> ax[1].set_title('Variable-Q power spectrum')
    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
    """
    # If intervals are provided as an array, override BPO
    if not isinstance(intervals, str):
        bins_per_octave = len(intervals)

    # How many octaves are we dealing with?
    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
    n_filters = min(bins_per_octave, n_bins)

    if fmin is None:
        # C1 by default
        fmin = note_to_hz("C1")

    if tuning is None:
        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

    if dtype is None:
        dtype = util.dtype_r2c(y.dtype)

    # Apply tuning correction
    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

    # Center frequencies for all bins, sorted ascending
    freqs = interval_frequencies(
        n_bins=n_bins,
        fmin=fmin,
        intervals=intervals,
        bins_per_octave=bins_per_octave,
        sort=True,
    )

    # Highest center frequency; only used for the error message below
    freqs_top = freqs[-bins_per_octave:]
    fmax_t: float = np.max(freqs_top)

    alpha = __bpo_to_alpha(bins_per_octave)

    lengths, filter_cutoff = filters.wavelet_lengths(
        freqs=freqs,
        sr=sr,
        window=window,
        filter_scale=filter_scale,
        gamma=gamma,
        alpha=alpha,
    )

    # The basis must fit below Nyquist at the input sampling rate
    nyquist = sr / 2.0

    if filter_cutoff > nyquist:
        raise ParameterError(
            f"Wavelet basis with max frequency={fmax_t} would exceed the Nyquist frequency={nyquist}. "
            "Try reducing the number of frequency bins."
        )

    if res_type is None:
        warnings.warn(
            "Support for VQT with res_type=None is deprecated in librosa 0.10\n"
            "and will be removed in version 1.0.",
            category=FutureWarning,
            stacklevel=2,
        )
        res_type = "soxr_hq"

    # Possibly reduce the sampling rate up front; from here on ``y``, ``sr``
    # and ``hop_length`` refer to the (possibly) downsampled signal.
    y, sr, hop_length = __early_downsample(
        y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
    )

    vqt_resp = []

    # Iterate down the octaves
    my_y, my_sr, my_hop = y, sr, hop_length

    last_octave = n_octaves - 1

    for i in range(n_octaves):
        # Slice out the current octave of filters, counting from the top
        if i == 0:
            sl = slice(-n_filters, None)
        else:
            sl = slice(-n_filters * (i + 1), -n_filters * i)

        # This may be incorrect with early downsampling
        freqs_oct = freqs[sl]

        fft_basis, n_fft, _ = __vqt_filter_fft(
            my_sr,
            freqs_oct,
            filter_scale,
            norm,
            sparsity,
            window=window,
            gamma=gamma,
            dtype=dtype,
            alpha=alpha,
        )

        # Re-scale the filters to compensate for downsampling
        fft_basis[:] *= np.sqrt(sr / my_sr)

        # Compute the vqt filter response and append to the stack
        vqt_resp.append(
            __cqt_response(my_y, n_fft, my_hop, fft_basis, pad_mode, dtype=dtype)
        )

        # Halve the rate for the next octave while the hop stays integral.
        # After the final octave nothing consumes the downsampled signal, so
        # the (comparatively expensive) resample is skipped there.
        if i < last_octave and my_hop % 2 == 0:
            my_hop //= 2
            my_sr /= 2.0
            my_y = audio.resample(
                my_y, orig_sr=2, target_sr=1, res_type=res_type, scale=True
            )

    V = __trim_stack(vqt_resp, n_bins, dtype)

    if scale:
        # Recompute lengths here because early downsampling may have changed
        # our sampling rate
        lengths, _ = filters.wavelet_lengths(
            freqs=freqs,
            sr=sr,
            window=window,
            filter_scale=filter_scale,
            gamma=gamma,
            alpha=alpha,
        )

        # reshape lengths to match V shape
        lengths = util.expand_to(lengths, ndim=V.ndim, axes=-2)
        V /= np.sqrt(lengths)

    return V
def __vqt_filter_fft(
    sr,
    freqs,
    filter_scale,
    norm,
    sparsity,
    hop_length=None,
    window="hann",
    gamma=0.0,
    dtype=np.complex64,
    alpha=None,
):
    """Generate the frequency domain variable-Q filter basis.

    Returns
    -------
    fft_basis
        Sparsified FFT of the wavelet basis, restricted to the non-negative
        frequency bins (``n_fft // 2 + 1`` columns).
    n_fft : int
        FFT size used for the basis.
    lengths
        Filter lengths as returned by `filters.wavelet`.
    """
    wavelets, lengths = filters.wavelet(
        freqs=freqs,
        sr=sr,
        filter_scale=filter_scale,
        norm=norm,
        pad_fft=True,
        window=window,
        gamma=gamma,
        alpha=alpha,
    )

    # With pad_fft=True the time-domain filters already share a common
    # padded width; that width is the default FFT size.
    n_fft = wavelets.shape[1]

    # If a hop length is given, make sure the FFT size is at least twice the
    # hop rounded up to a power of two.
    if hop_length is not None:
        min_n_fft = 2.0 ** (1 + np.ceil(np.log2(hop_length)))
        if n_fft < min_n_fft:
            n_fft = int(min_n_fft)

    # Re-normalize each filter relative to the FFT window length
    wavelets *= lengths[:, np.newaxis] / float(n_fft)

    # Transform along the time axis and keep the non-negative frequencies
    fftlib = get_fftlib()
    spectrum = fftlib.fft(wavelets, n=n_fft, axis=1)
    half_spectrum = spectrum[:, : (n_fft // 2) + 1]

    # Drop low-energy coefficients row by row
    fft_basis = util.sparsify_rows(half_spectrum, quantile=sparsity, dtype=dtype)

    return fft_basis, n_fft, lengths
def __trim_stack(
    cqt_resp: List[np.ndarray], n_bins: int, dtype: DTypeLike
) -> np.ndarray:
    """Trim and stack a collection of CQT responses into one array.

    Parameters
    ----------
    cqt_resp : list of np.ndarray [shape=(..., n_oct_i, t_i)]
        Per-octave responses, ordered from the highest band to the lowest.
        All entries must share the same leading dimensions.
    n_bins : int
        Number of frequency bins in the output.
    dtype : np.dtype
        Data type of the output array.

    Returns
    -------
    cqt_out : np.ndarray [shape=(..., n_bins, min_i(t_i))]
        Fortran-ordered array in which the first response occupies the
        highest bins, the next response the bins below it, and so on.
        Frames are trimmed to the shortest response.  If the lowest
        response that is used has more bins than remain, only its highest
        bins are kept.  Responses beyond the point where the output is full
        are ignored.
    """
    # Every response is trimmed to the shortest frame count
    max_col = min(c_i.shape[-1] for c_i in cqt_resp)

    # Grab any leading dimensions
    shape = list(cqt_resp[0].shape)
    shape[-2] = n_bins
    shape[-1] = max_col

    cqt_out = np.empty(shape, dtype=dtype, order="F")

    # ``end`` is the exclusive upper row of the region still to be filled
    end = n_bins
    for c_i in cqt_resp:
        # Stop once every output row has been written.  Without this guard a
        # surplus response would be written through ``:end`` / ``-end:``
        # slices with ``end <= 0``, which either raises a broadcasting error
        # (end == 0) or silently overwrites valid rows (end < 0).
        if end <= 0:
            break

        # By default, take the whole octave
        n_oct = c_i.shape[-2]

        if end < n_oct:
            # The whole octave is more than we can fit:
            # take the highest bins from c_i
            cqt_out[..., :end, :] = c_i[..., -end:, :max_col]
        else:
            cqt_out[..., end - n_oct : end, :] = c_i[..., :max_col]

        end -= n_oct

    return cqt_out
def __cqt_response(
    y, n_fft, hop_length, fft_basis, mode, window="ones", phase=True, dtype=None
):
    """Compute the filter response with a target STFT hop.

    The STFT of ``y`` is computed, optionally reduced to its magnitude
    (``phase=False``), and then projected onto ``fft_basis`` one
    channel at a time.  The result keeps the STFT's leading dimensions,
    with the frequency axis replaced by one row per filter in
    ``fft_basis``.
    """
    # Compute the STFT matrix
    spec = stft(
        y, n_fft=n_fft, hop_length=hop_length, window=window, pad_mode=mode, dtype=dtype
    )

    if not phase:
        spec = np.abs(spec)

    n_filters = fft_basis.shape[0]
    n_freq, n_frames = spec.shape[-2:]

    # Collapse all leading dimensions into a single channel axis
    channels = spec.reshape((-1, n_freq, n_frames))
    projected = np.empty((channels.shape[0], n_filters, n_frames), dtype=spec.dtype)

    # Project each channel onto the filter basis
    for idx, channel in enumerate(channels):
        projected[idx] = fft_basis.dot(channel)

    # Restore the leading dimensions of the STFT
    return projected.reshape(spec.shape[:-2] + (n_filters, n_frames))
def __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves):
    """Compute the number of early downsampling operations.

    The count is the smaller of two limits, clamped to be non-negative:
    one derived from the ratio ``nyquist / filter_cutoff``, and one
    derived from how many factors of two ``hop_length`` contains
    relative to ``n_octaves``.
    """
    # Limit derived from the ratio of Nyquist to the filter cutoff
    by_bandwidth = int(np.ceil(np.log2(nyquist / filter_cutoff)) - 1) - 1

    # Limit derived from the factors of two available in the hop length
    by_hop = __num_two_factors(hop_length) - n_octaves + 1

    # Clamping after taking the minimum is equivalent to clamping each limit
    return max(0, min(by_bandwidth, by_hop))
def __early_downsample(
    y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
):
    """Perform early downsampling on an audio signal, if it applies.

    Parameters
    ----------
    y : np.ndarray
        Audio signal; the last axis is treated as the sample axis.
    sr : number
        Sampling rate of ``y``.
    hop_length : int
        Hop length, in samples at rate ``sr``.
    res_type : str
        Resampling mode, forwarded to `audio.resample`.
    n_octaves : int
        Number of octaves to be analyzed.
    nyquist, filter_cutoff : number
        Forwarded to `__early_downsample_count` to decide how many
        halvings can be applied.
    scale : bool
        If ``False``, the downsampled signal is multiplied by the square
        root of the downsampling factor.

    Returns
    -------
    y, sr, hop_length
        The (possibly) downsampled signal, with the sampling rate and hop
        length adjusted by the same power-of-two factor.  The inputs are
        returned unchanged when no downsampling applies.

    Raises
    ------
    ParameterError
        If the signal has fewer samples than the downsampling factor.
    """
    downsample_count = __early_downsample_count(
        nyquist, filter_cutoff, hop_length, n_octaves
    )

    if downsample_count > 0:
        downsample_factor = 2 ** (downsample_count)

        hop_length //= downsample_factor

        # Measure the sample axis, not the first axis: for multi-channel
        # input, len(y) would report the number of channels instead.
        n_samples = y.shape[-1]
        if n_samples < downsample_factor:
            raise ParameterError(
                f"Input signal length={n_samples:d} is too short for "
                f"{n_octaves:d}-octave CQT"
            )

        new_sr = sr / float(downsample_factor)

        # Only the ratio of the two rates is given here: the signal is
        # resampled from `downsample_factor` down to 1
        y = audio.resample(
            y, orig_sr=downsample_factor, target_sr=1, res_type=res_type, scale=True
        )

        # If we're not going to length-scale after CQT, we
        # need to compensate for the downsampling factor here
        if not scale:
            y *= np.sqrt(downsample_factor)

        sr = new_sr

    return y, sr, hop_length
def __num_two_factors(x):
    """Return how many times integer x can be evenly divided by 2.

    Returns 0 for non-positive integers.
    """
    count = 0
    if x > 0:
        # Strip one factor of two per pass until the remainder is odd
        while not x % 2:
            x //= 2
            count += 1
    return count
def griffinlim_cqt(
    C: np.ndarray,
    *,
    n_iter: int = 32,
    sr: float = 22050,
    hop_length: int = 512,
    fmin: Optional[_FloatLike_co] = None,
    bins_per_octave: int = 12,
    tuning: float = 0.0,
    filter_scale: float = 1,
    norm: Optional[float] = 1,
    sparsity: float = 0.01,
    window: _WindowSpec = "hann",
    scale: bool = True,
    pad_mode: _PadMode = "constant",
    res_type: str = "soxr_hq",
    dtype: Optional[DTypeLike] = None,
    length: Optional[int] = None,
    momentum: float = 0.99,
    init: Optional[str] = "random",
    random_state: Optional[
        Union[int, np.random.RandomState, np.random.Generator]
    ] = None,
) -> np.ndarray:
    """Approximate constant-Q magnitude spectrogram inversion using the "fast" Griffin-Lim
    algorithm.

    Given the magnitude of a constant-Q spectrogram (``C``), the algorithm randomly initializes
    phase estimates, and then alternates forward- and inverse-CQT operations. [#]_

    This implementation is based on the (fast) Griffin-Lim method for Short-time Fourier Transforms, [#]_
    but adapted for use with constant-Q spectrograms.

    .. [#] D. W. Griffin and J. S. Lim,
        "Signal estimation from modified short-time Fourier transform,"
        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

    .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
        "A fast Griffin-Lim algorithm,"
        IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
        Oct. 2013.

    Parameters
    ----------
    C : np.ndarray [shape=(..., n_bins, n_frames)]
        The constant-Q magnitude spectrogram
    n_iter : int > 0
        The number of iterations to run
    sr : number > 0
        Audio sampling rate
    hop_length : int > 0
        The hop length of the CQT
    fmin : number > 0
        Minimum frequency for the CQT.
        If not provided, it defaults to `C1`.
    bins_per_octave : int > 0
        Number of bins per octave
    tuning : float
        Tuning deviation from A440, in fractions of a bin
    filter_scale : float > 0
        Filter scale factor.  Small values (<1) use shorter windows
        for improved time resolution.
    norm : {inf, -inf, 0, float > 0}
        Type of norm to use for basis function normalization.
        See `librosa.util.normalize`.
    sparsity : float in [0, 1)
        Sparsify the CQT basis by discarding up to ``sparsity``
        fraction of the energy in each basis.
        Set ``sparsity=0`` to disable sparsification.
    window : str, tuple, or function
        Window specification for the basis filters.
        See `filters.get_window` for details.
    scale : bool
        If ``True``, scale the CQT response by square-root the length
        of each channel's filter.  This is analogous to ``norm='ortho'``
        in FFT.
        If ``False``, do not scale the CQT. This is analogous to ``norm=None``
        in FFT.
    pad_mode : string
        Padding mode for centered frame analysis.
        See also: `librosa.stft` and `numpy.pad`.
    res_type : string
        The resampling mode for recursive downsampling.
        See ``librosa.resample`` for a list of available options.
    dtype : numeric type
        Real numeric type for ``y``.  Default is inferred to match the precision
        of the input CQT.
    length : int > 0, optional
        If provided, the output ``y`` is zero-padded or clipped to exactly
        ``length`` samples.
    momentum : float > 0
        The momentum parameter for fast Griffin-Lim.
        Setting this to 0 recovers the original Griffin-Lim method.
        Values near 1 can lead to faster convergence, but above 1 may not converge.
    init : None or 'random' [default]
        If 'random' (the default), then phase values are initialized randomly
        according to ``random_state``.  This is recommended when the input ``C`` is
        a magnitude spectrogram with no initial phase estimates.
        If ``None``, then the phase is initialized from ``C``.  This is useful when
        an initial guess for phase can be provided, or when you want to resume
        Griffin-Lim from a previous output.
    random_state : None, int, np.random.RandomState, or np.random.Generator
        If int, random_state is the seed used by the random number generator
        for phase initialization.
        If `np.random.RandomState` or `np.random.Generator` instance, the random number generator itself.
        If ``None``, defaults to the `np.random.default_rng()` object.

    Returns
    -------
    y : np.ndarray [shape=(..., n)]
        time-domain signal reconstructed from ``C``

    Raises
    ------
    ParameterError
        If ``random_state`` is of an unsupported type, if ``momentum < 0``,
        or if ``init`` is neither ``None`` nor ``'random'``.

    See Also
    --------
    cqt
    icqt
    griffinlim
    filters.get_window
    resample

    Examples
    --------
    A basic CQT inverse example

    >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), sr=None)
    >>> # Get the CQT magnitude, 7 octaves at 36 bins per octave
    >>> C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=36, n_bins=7*36))
    >>> # Invert using Griffin-Lim
    >>> y_inv = librosa.griffinlim_cqt(C, sr=sr, bins_per_octave=36)
    >>> # And invert without estimating phase
    >>> y_icqt = librosa.icqt(C, sr=sr, bins_per_octave=36)

    Wave-plot the results

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
    >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
    >>> ax[0].set(title='Original', xlabel=None)
    >>> ax[0].label_outer()
    >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
    >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
    >>> ax[1].label_outer()
    >>> librosa.display.waveshow(y_icqt, sr=sr, color='r', ax=ax[2])
    >>> ax[2].set(title='Magnitude-only icqt reconstruction')
    """
    if fmin is None:
        fmin = note_to_hz("C1")

    # Resolve the random number generator used for phase initialization
    if random_state is None:
        rng = np.random.default_rng()
    elif isinstance(random_state, int):
        rng = np.random.RandomState(seed=random_state)  # type: ignore
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state  # type: ignore
    else:
        _ensure_not_reachable(random_state)
        raise ParameterError(f"Unsupported random_state={random_state!r}")

    # Validate the momentum: negative values are rejected, values above 1 only warn
    if momentum < 0:
        raise ParameterError(f"griffinlim_cqt() called with momentum={momentum} < 0")
    if momentum > 1:
        warnings.warn(
            f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
            "Proceed with caution!",
            stacklevel=2,
        )

    def _invert(spectrum):
        # Inverse CQT using the caller's analysis parameters
        return icqt(
            spectrum,
            sr=sr,
            hop_length=hop_length,
            bins_per_octave=bins_per_octave,
            fmin=fmin,
            tuning=tuning,
            filter_scale=filter_scale,
            window=window,
            length=length,
            res_type=res_type,
            norm=norm,
            scale=scale,
            sparsity=sparsity,
            dtype=dtype,
        )

    # using complex64 will keep the result to minimal necessary precision
    angles = np.empty(C.shape, dtype=np.complex64)
    eps = util.tiny(angles)

    if init == "random":
        # Draw uniformly random phases
        angles[:] = util.phasor(2 * np.pi * rng.random(size=C.shape))
    elif init is None:
        # Unit multipliers: any phase already present in C is kept
        angles[:] = 1.0
    else:
        raise ParameterError(f"init={init} must either None or 'random'")

    # Weight applied to the previous projection in the phase update
    accel = momentum / (1 + momentum)

    # The previous iterate starts out at 0
    rebuilt: np.ndarray = np.array(0.0)

    for _ in range(n_iter):
        previous = rebuilt

        # Invert with the current phase estimates ...
        inverse = _invert(C * angles)

        # ... and rebuild the spectrogram from that signal
        rebuilt = cqt(
            inverse,
            sr=sr,
            hop_length=hop_length,
            fmin=fmin,
            n_bins=C.shape[-2],
            bins_per_octave=bins_per_octave,
            tuning=tuning,
            filter_scale=filter_scale,
            norm=norm,
            sparsity=sparsity,
            window=window,
            scale=scale,
            pad_mode=pad_mode,
            res_type=res_type,
        )

        # Update the phase estimates and renormalize them to unit magnitude
        angles[:] = rebuilt - accel * previous
        angles[:] /= np.abs(angles) + eps

    # Synthesize the output from the final phase estimates
    return _invert(C * angles)
def __bpo_to_alpha(bins_per_octave: int) -> float:
    """Compute the alpha coefficient for a given number of bins per octave

    With ``r = 2 ** (1 / bins_per_octave)``, the coefficient is
    ``(r**2 - 1) / (r**2 + 1)``.

    Parameters
    ----------
    bins_per_octave : int

    Returns
    -------
    alpha : number > 0
    """
    # Frequency ratio between adjacent bins
    ratio = 2 ** (1 / bins_per_octave)
    ratio_sq = ratio**2
    return (ratio_sq - 1) / (ratio_sq + 1)