Spaces:

undetectable
/

voice-clone

Configuration error

App Files Files Community

voice-clone / packages /constantq.py

renator

allow permission

a0f6c29 almost 2 years ago

raw

history blame contribute delete

44.7 kB




	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	"""Constant-Q transforms"""
	import warnings
	import numpy as np
	from numba import jit

	from . import audio
	from .intervals import interval_frequencies
	from .fft import get_fftlib
	from .convert import cqt_frequencies, note_to_hz
	from .spectrum import stft, istft
	from .pitch import estimate_tuning
	from .._cache import cache
	from .. import filters
	from .. import util
	from ..util.exceptions import ParameterError
	from numpy.typing import DTypeLike
	from typing import Optional, Union, Collection, List
	from .._typing import _WindowSpec, _PadMode, _FloatLike_co, _ensure_not_reachable

	__all__ = ["cqt", "hybrid_cqt", "pseudo_cqt", "icqt", "griffinlim_cqt", "vqt"]

	# TODO: ivqt, griffinlim_vqt


	@cache(level=20)
	def cqt(
	    y: np.ndarray,
	    *,
	    sr: float = 22050,
	    hop_length: int = 512,
	    fmin: Optional[_FloatLike_co] = None,
	    n_bins: int = 84,
	    bins_per_octave: int = 12,
	    tuning: Optional[float] = 0.0,
	    filter_scale: float = 1,
	    norm: Optional[float] = 1,
	    sparsity: float = 0.01,
	    window: _WindowSpec = "hann",
	    scale: bool = True,
	    pad_mode: _PadMode = "constant",
	    res_type: Optional[str] = "soxr_hq",
	    dtype: Optional[DTypeLike] = None,
	) -> np.ndarray:
	    """Compute the constant-Q transform (CQT) of an audio signal.

	    The computation follows the recursive sub-sampling method of [#]_.
	    This function is a thin wrapper: it forwards every argument to `vqt`
	    with ``intervals="equal"`` and ``gamma=0``.

	    .. [#] Schoerkhuber, Christian, and Anssi Klapuri.
	        "Constant-Q transform toolbox for music processing."
	        7th Sound and Music Computing Conference, Barcelona, Spain. 2010.

	    Parameters
	    ----------
	    y : np.ndarray [shape=(..., n)]
	        Audio time series. Multi-channel input is supported.
	    sr : number > 0 [scalar]
	        Sampling rate of ``y``.
	    hop_length : int > 0 [scalar]
	        Number of samples between successive CQT columns.
	    fmin : float > 0 [scalar]
	        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`.
	    n_bins : int > 0 [scalar]
	        Number of frequency bins, starting at ``fmin``.
	    bins_per_octave : int > 0 [scalar]
	        Number of bins per octave.
	    tuning : None or float
	        Tuning offset in fractions of a bin.

	        If ``None``, the tuning is estimated automatically from the signal.

	        The minimum frequency of the resulting CQT is shifted to
	        ``fmin * 2**(tuning / bins_per_octave)``.
	    filter_scale : float > 0
	        Filter scale factor. Small values (<1) use shorter windows
	        for improved time resolution.
	    norm : {inf, -inf, 0, float > 0}
	        Type of norm used to normalize the basis functions.
	        See `librosa.util.normalize`.
	    sparsity : float in [0, 1)
	        Sparsify the CQT basis by discarding up to ``sparsity``
	        fraction of the energy in each basis.

	        Use ``sparsity=0`` to disable sparsification.
	    window : str, tuple, number, or function
	        Window specification for the basis filters.
	        See `filters.get_window` for details.
	    scale : bool
	        If ``True``, scale the CQT response by the square root of the
	        length of each channel's filter (analogous to ``norm='ortho'``
	        in FFT).

	        If ``False``, leave the CQT unscaled (analogous to ``norm=None``
	        in FFT).
	    pad_mode : string
	        Padding mode for centered frame analysis.

	        See also: `librosa.stft` and `numpy.pad`.
	    res_type : string
	        The resampling mode for recursive downsampling.
	    dtype : np.dtype
	        The (complex) data type of the output array. By default, it is
	        inferred to match the numerical precision of the input signal.

	    Returns
	    -------
	    CQT : np.ndarray [shape=(..., n_bins, t)]
	        Constant-Q value for each frequency at each time.

	    See Also
	    --------
	    vqt
	    librosa.resample
	    librosa.util.normalize

	    Notes
	    -----
	    This function caches at level 20.

	    Examples
	    --------
	    Generate and plot a constant-Q power spectrum

	    >>> import matplotlib.pyplot as plt
	    >>> y, sr = librosa.load(librosa.ex('trumpet'))
	    >>> C = np.abs(librosa.cqt(y, sr=sr))
	    >>> fig, ax = plt.subplots()
	    >>> img = librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
	    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax)
	    >>> ax.set_title('Constant-Q power spectrum')
	    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")

	    Limit the frequency range

	    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
	    ...                 n_bins=60))
	    >>> C
	    array([[6.830e-04, 6.361e-04, ..., 7.362e-09, 9.102e-09],
	           [5.366e-04, 4.818e-04, ..., 8.953e-09, 1.067e-08],
	           ...,
	           [4.288e-02, 4.580e-01, ..., 1.529e-05, 5.572e-06],
	           [2.965e-03, 1.508e-01, ..., 8.965e-06, 1.455e-05]])

	    Using a higher frequency resolution

	    >>> C = np.abs(librosa.cqt(y, sr=sr, fmin=librosa.note_to_hz('C2'),
	    ...                 n_bins=60 * 2, bins_per_octave=12 * 2))
	    >>> C
	    array([[5.468e-04, 5.382e-04, ..., 5.911e-09, 6.105e-09],
	           [4.118e-04, 4.014e-04, ..., 7.788e-09, 8.160e-09],
	           ...,
	           [2.780e-03, 1.424e-01, ..., 4.225e-06, 2.388e-05],
	           [5.147e-02, 6.959e-02, ..., 1.694e-05, 5.811e-06]])
	    """
	    # The two settings that turn the general VQT into a CQT.
	    cqt_settings = {"intervals": "equal", "gamma": 0}

	    # Everything else is passed through unchanged.
	    passthrough = {
	        "sr": sr,
	        "hop_length": hop_length,
	        "fmin": fmin,
	        "n_bins": n_bins,
	        "bins_per_octave": bins_per_octave,
	        "tuning": tuning,
	        "filter_scale": filter_scale,
	        "norm": norm,
	        "sparsity": sparsity,
	        "window": window,
	        "scale": scale,
	        "pad_mode": pad_mode,
	        "res_type": res_type,
	        "dtype": dtype,
	    }

	    return vqt(y=y, **passthrough, **cqt_settings)


	@cache(level=20)
	def hybrid_cqt(
	    y: np.ndarray,
	    *,
	    sr: float = 22050,
	    hop_length: int = 512,
	    fmin: Optional[_FloatLike_co] = None,
	    n_bins: int = 84,
	    bins_per_octave: int = 12,
	    tuning: Optional[float] = 0.0,
	    filter_scale: float = 1,
	    norm: Optional[float] = 1,
	    sparsity: float = 0.01,
	    window: _WindowSpec = "hann",
	    scale: bool = True,
	    pad_mode: _PadMode = "constant",
	    res_type: str = "soxr_hq",
	    dtype: Optional[DTypeLike] = None,
	) -> np.ndarray:
	    """Compute the hybrid constant-Q transform of an audio signal.

	    The hybrid CQT combines two transforms: `pseudo_cqt` is used for the
	    bins whose filters are short relative to the hop (filter length,
	    rounded up to a power of two, below ``2 * hop_length``), and the
	    magnitude of the full `cqt` is used for the remaining lower-frequency
	    bins.

	    Parameters
	    ----------
	    y : np.ndarray [shape=(..., n)]
	        Audio time series. Multi-channel input is supported.
	    sr : number > 0 [scalar]
	        Sampling rate of ``y``.
	    hop_length : int > 0 [scalar]
	        Number of samples between successive CQT columns.
	    fmin : float > 0 [scalar]
	        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`.
	    n_bins : int > 0 [scalar]
	        Number of frequency bins, starting at ``fmin``.
	    bins_per_octave : int > 0 [scalar]
	        Number of bins per octave.
	    tuning : None or float
	        Tuning offset in fractions of a bin.

	        If ``None``, the tuning is estimated automatically from the signal.

	        The minimum frequency of the resulting CQT is shifted to
	        ``fmin * 2**(tuning / bins_per_octave)``.
	    filter_scale : float > 0
	        Filter scale factor. Larger values use longer windows.
	    norm : {inf, -inf, 0, float > 0}
	        Type of norm used to normalize the basis functions.
	        See `librosa.util.normalize`.
	    sparsity : float in [0, 1)
	        Sparsify the CQT basis by discarding up to ``sparsity``
	        fraction of the energy in each basis.

	        Use ``sparsity=0`` to disable sparsification.
	    window : str, tuple, number, or function
	        Window specification for the basis filters.
	        See `filters.get_window` for details.
	    scale : bool
	        If ``True``, scale the CQT response by the square root of the
	        length of each channel's filter (analogous to ``norm='ortho'``
	        in FFT).

	        If ``False``, leave the CQT unscaled (analogous to ``norm=None``
	        in FFT).
	    pad_mode : string
	        Padding mode for centered frame analysis.

	        See also: `librosa.stft` and `numpy.pad`.
	    res_type : string
	        Resampling mode. See `librosa.cqt` for details.
	    dtype : np.dtype, optional
	        The complex dtype to use for computing the CQT.
	        By default, it is inferred to match the precision of the
	        input signal.

	    Returns
	    -------
	    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
	        Constant-Q energy for each frequency at each time.

	    See Also
	    --------
	    cqt
	    pseudo_cqt

	    Notes
	    -----
	    This function caches at level 20.
	    """
	    if fmin is None:
	        # Default lowest bin: C1
	        fmin = note_to_hz("C1")

	    if tuning is None:
	        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

	    # Fold the tuning offset into fmin once; the sub-transforms below are
	    # then called with their default tuning of 0.
	    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

	    freqs = cqt_frequencies(n_bins, fmin=fmin, bins_per_octave=bins_per_octave)
	    alpha = __bpo_to_alpha(bins_per_octave)

	    # Length of every constant-Q basis function
	    lengths, _ = filters.wavelet_lengths(
	        freqs=freqs, sr=sr, filter_scale=filter_scale, window=window, alpha=alpha
	    )

	    # A bin goes to the pseudo CQT when its filter, padded up to a power
	    # of two, fits within two hop lengths.
	    use_pseudo = 2.0 ** np.ceil(np.log2(lengths)) < 2 * hop_length
	    n_pseudo = int(np.count_nonzero(use_pseudo))
	    n_full = n_bins - n_pseudo

	    # Keyword arguments common to both sub-transforms
	    shared = dict(
	        sr=sr,
	        hop_length=hop_length,
	        bins_per_octave=bins_per_octave,
	        filter_scale=filter_scale,
	        norm=norm,
	        sparsity=sparsity,
	        window=window,
	        scale=scale,
	        pad_mode=pad_mode,
	        dtype=dtype,
	    )

	    # Responses are collected highest-frequency block first, which is the
	    # order __trim_stack fills the output from the top down.
	    parts = []

	    if n_pseudo > 0:
	        lowest_pseudo = np.min(freqs[use_pseudo])
	        parts.append(pseudo_cqt(y, fmin=lowest_pseudo, n_bins=n_pseudo, **shared))

	    if n_full > 0:
	        full = cqt(y, fmin=fmin, n_bins=n_full, res_type=res_type, **shared)
	        parts.append(np.abs(full))

	    # The output dtype is taken from the last component
	    return __trim_stack(parts, n_bins, parts[-1].dtype)


	@cache(level=20)
	def pseudo_cqt(
	    y: np.ndarray,
	    *,
	    sr: float = 22050,
	    hop_length: int = 512,
	    fmin: Optional[_FloatLike_co] = None,
	    n_bins: int = 84,
	    bins_per_octave: int = 12,
	    tuning: Optional[float] = 0.0,
	    filter_scale: float = 1,
	    norm: Optional[float] = 1,
	    sparsity: float = 0.01,
	    window: _WindowSpec = "hann",
	    scale: bool = True,
	    pad_mode: _PadMode = "constant",
	    dtype: Optional[DTypeLike] = None,
	) -> np.ndarray:
	    """Compute the pseudo constant-Q transform of an audio signal.

	    A single FFT size is used for all bins: the smallest power of 2 that
	    is greater than or equal to the max of:

	        1. The longest CQT filter
	        2. 2x the hop_length

	    Only magnitudes are used: the STFT magnitude is projected onto the
	    magnitude of the frequency-domain filter basis.

	    Parameters
	    ----------
	    y : np.ndarray [shape=(..., n)]
	        Audio time series. Multi-channel input is supported.
	    sr : number > 0 [scalar]
	        Sampling rate of ``y``.
	    hop_length : int > 0 [scalar]
	        Number of samples between successive CQT columns.
	    fmin : float > 0 [scalar]
	        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`.
	    n_bins : int > 0 [scalar]
	        Number of frequency bins, starting at ``fmin``.
	    bins_per_octave : int > 0 [scalar]
	        Number of bins per octave.
	    tuning : None or float
	        Tuning offset in fractions of a bin.

	        If ``None``, the tuning is estimated automatically from the signal.

	        The minimum frequency of the resulting CQT is shifted to
	        ``fmin * 2**(tuning / bins_per_octave)``.
	    filter_scale : float > 0
	        Filter scale factor. Larger values use longer windows.
	    norm : {inf, -inf, 0, float > 0}
	        Type of norm used to normalize the basis functions.
	        See `librosa.util.normalize`.
	    sparsity : float in [0, 1)
	        Sparsify the CQT basis by discarding up to ``sparsity``
	        fraction of the energy in each basis.

	        Use ``sparsity=0`` to disable sparsification.
	    window : str, tuple, number, or function
	        Window specification for the basis filters.
	        See `filters.get_window` for details.
	    scale : bool
	        If ``True``, scale the CQT response by the square root of the
	        length of each channel's filter (analogous to ``norm='ortho'``
	        in FFT).

	        If ``False``, leave the CQT unscaled (analogous to ``norm=None``
	        in FFT).
	    pad_mode : string
	        Padding mode for centered frame analysis.

	        See also: `librosa.stft` and `numpy.pad`.
	    dtype : np.dtype, optional
	        The complex data type for CQT calculations.
	        By default, it is inferred to match the precision of the input
	        signal.

	    Returns
	    -------
	    CQT : np.ndarray [shape=(..., n_bins, t), dtype=np.float]
	        Pseudo constant-Q energy for each frequency at each time.

	    Notes
	    -----
	    This function caches at level 20.
	    """
	    if fmin is None:
	        # Default lowest bin: C1
	        fmin = note_to_hz("C1")

	    if tuning is None:
	        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

	    if dtype is None:
	        dtype = util.dtype_r2c(y.dtype)

	    # Shift fmin by the tuning offset
	    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

	    freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)
	    alpha = __bpo_to_alpha(bins_per_octave)

	    lengths, _ = filters.wavelet_lengths(
	        freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
	    )

	    # Passing hop_length makes the basis builder enforce its minimum FFT size
	    basis, n_fft, _ = __vqt_filter_fft(
	        sr,
	        freqs,
	        filter_scale,
	        norm,
	        sparsity,
	        hop_length=hop_length,
	        window=window,
	        dtype=dtype,
	        alpha=alpha,
	    )

	    # Discard the phase of the basis
	    basis_mag = np.abs(basis)

	    # Magnitude-only response (phase=False); the STFT itself is always
	    # taken with a Hann window here.
	    C: np.ndarray = __cqt_response(
	        y,
	        n_fft,
	        hop_length,
	        basis_mag,
	        pad_mode,
	        window="hann",
	        dtype=dtype,
	        phase=False,
	    )

	    # Both branches update C in place so that its dtype is preserved.
	    if not scale:
	        # Broadcast the per-bin lengths along the frequency axis of C
	        lengths = util.expand_to(lengths, ndim=C.ndim, axes=-2)
	        C *= np.sqrt(lengths / n_fft)
	    else:
	        C /= np.sqrt(n_fft)

	    return C


	@cache(level=40)
	def icqt(
	    C: np.ndarray,
	    *,
	    sr: float = 22050,
	    hop_length: int = 512,
	    fmin: Optional[_FloatLike_co] = None,
	    bins_per_octave: int = 12,
	    tuning: float = 0.0,
	    filter_scale: float = 1,
	    norm: Optional[float] = 1,
	    sparsity: float = 0.01,
	    window: _WindowSpec = "hann",
	    scale: bool = True,
	    length: Optional[int] = None,
	    res_type: str = "soxr_hq",
	    dtype: Optional[DTypeLike] = None,
	) -> np.ndarray:
	    """Compute the inverse constant-Q transform.

	    Given a constant-Q representation ``C`` of an audio signal ``y``,
	    produce an approximation ``y_hat`` of that signal.

	    The reconstruction works one octave at a time: each octave of ``C``
	    is projected back onto STFT bins, inverted with `istft`, resampled to
	    the full rate, and accumulated into the output.

	    Parameters
	    ----------
	    C : np.ndarray, [shape=(..., n_bins, n_frames)]
	        Constant-Q representation as produced by `cqt`.
	    sr : number > 0 [scalar]
	        Sampling rate of the signal.
	    hop_length : int > 0 [scalar]
	        Number of samples between successive frames.
	    fmin : float > 0 [scalar]
	        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`.
	    bins_per_octave : int > 0 [scalar]
	        Number of bins per octave.
	    tuning : float [scalar]
	        Tuning offset in fractions of a bin.

	        The minimum frequency of the CQT is shifted to
	        ``fmin * 2**(tuning / bins_per_octave)``.
	    filter_scale : float > 0 [scalar]
	        Filter scale factor. Small values (<1) use shorter windows
	        for improved time resolution.
	    norm : {inf, -inf, 0, float > 0}
	        Type of norm used to normalize the basis functions.
	        See `librosa.util.normalize`.
	    sparsity : float in [0, 1)
	        Sparsify the CQT basis by discarding up to ``sparsity``
	        fraction of the energy in each basis.

	        Use ``sparsity=0`` to disable sparsification.
	    window : str, tuple, number, or function
	        Window specification for the basis filters.
	        See `filters.get_window` for details.
	    scale : bool
	        If ``True``, scale the CQT response by the square root of the
	        length of each channel's filter (analogous to ``norm='ortho'``
	        in FFT).

	        If ``False``, leave the CQT unscaled (analogous to ``norm=None``
	        in FFT).
	    length : int > 0, optional
	        If provided, the output ``y`` is zero-padded or clipped to exactly
	        ``length`` samples.
	    res_type : string
	        Resampling mode.
	        See `librosa.resample` for supported modes.
	    dtype : numeric type
	        Real numeric type for ``y``. Default is inferred to match the
	        numerical precision of the input CQT.

	    Returns
	    -------
	    y : np.ndarray, [shape=(..., n_samples), dtype=np.float]
	        Audio time-series reconstructed from the CQT representation.

	    See Also
	    --------
	    cqt
	    librosa.resample

	    Notes
	    -----
	    This function caches at level 40.

	    Examples
	    --------
	    Using default parameters

	    >>> y, sr = librosa.load(librosa.ex('trumpet'))
	    >>> C = librosa.cqt(y=y, sr=sr)
	    >>> y_hat = librosa.icqt(C=C, sr=sr)

	    Or with a different hop length and frequency resolution:

	    >>> hop_length = 256
	    >>> bins_per_octave = 12 * 3
	    >>> C = librosa.cqt(y=y, sr=sr, hop_length=256, n_bins=7*bins_per_octave,
	    ...                 bins_per_octave=bins_per_octave)
	    >>> y_hat = librosa.icqt(C=C, sr=sr, hop_length=hop_length,
	    ...                 bins_per_octave=bins_per_octave)
	    """
	    if fmin is None:
	        fmin = note_to_hz("C1")

	    # Shift fmin by the tuning offset
	    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

	    # The bin count is read from the input; octaves are rounded up so a
	    # partial final octave is still processed.
	    n_bins = C.shape[-2]
	    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))

	    freqs = cqt_frequencies(fmin=fmin, n_bins=n_bins, bins_per_octave=bins_per_octave)
	    alpha = __bpo_to_alpha(bins_per_octave)

	    lengths, f_cutoff = filters.wavelet_lengths(
	        freqs=freqs, sr=sr, window=window, filter_scale=filter_scale, alpha=alpha
	    )

	    # When a target length is given, drop frames beyond what is needed
	    if length is not None:
	        n_frames = int(np.ceil((length + max(lengths)) / hop_length))
	        C = C[..., :n_frames]

	    C_scale = np.sqrt(lengths)

	    # Per-octave sampling rates and hop lengths, ordered from the lowest
	    # octave to the highest. The last entry (top octave) runs at the full
	    # rate; each step down halves both values while the hop stays even.
	    rates = [sr]
	    hop_sizes = [hop_length]
	    for _ in range(n_octaves - 1):
	        next_sr, next_hop = rates[0], hop_sizes[0]
	        if next_hop % 2 == 0:
	            next_sr, next_hop = next_sr * 0.5, next_hop // 2
	        # else: an odd hop cannot be halved, so the values carry forward
	        rates.insert(0, next_sr)
	        hop_sizes.insert(0, next_hop)

	    # Accumulator for the reconstructed signal
	    y: Optional[np.ndarray] = None

	    for i, (my_sr, my_hop) in enumerate(zip(rates, hop_sizes)):
	        # Bin range of this octave (the last octave may be partial)
	        first_bin = bins_per_octave * i
	        n_filters = min(bins_per_octave, n_bins - first_bin)
	        sl = slice(first_bin, first_bin + n_filters)

	        fft_basis, n_fft, _ = __vqt_filter_fft(
	            my_sr,
	            freqs[sl],
	            filter_scale,
	            norm,
	            sparsity,
	            window=window,
	            dtype=dtype,
	            alpha=alpha,
	        )

	        # Conjugate-transposed basis as a dense matrix
	        inv_basis = fft_basis.H.todense()

	        # Reciprocal of each filter's frequency-domain power ...
	        freq_power = 1 / np.sum(util.abs2(np.asarray(inv_basis)), axis=0)

	        # ... compensated for the length normalization of the forward
	        # transform (in place, so the dtype of freq_power is kept)
	        freq_power *= n_fft / lengths[sl]

	        # Inverse-project this octave; scale=True adds a sqrt(lengths) factor
	        if scale:
	            subscripts = "fc,c,c,...ct->...ft"
	            operands = (inv_basis, C_scale[sl], freq_power, C[..., sl, :])
	        else:
	            subscripts = "fc,c,...ct->...ft"
	            operands = (inv_basis, freq_power, C[..., sl, :])
	        D_oct = np.einsum(subscripts, *operands, optimize=True)

	        y_oct = istft(D_oct, window="ones", hop_length=my_hop, dtype=dtype)

	        # Bring the octave back up to the full rate
	        y_oct = audio.resample(
	            y_oct,
	            orig_sr=1,
	            target_sr=sr // my_sr,
	            res_type=res_type,
	            scale=False,
	            fix=False,
	        )

	        if y is not None:
	            y[..., : y_oct.shape[-1]] += y_oct
	        else:
	            y = y_oct

	    # make mypy happy
	    assert y is not None

	    if length:
	        y = util.fix_length(y, size=length)

	    return y


	@cache(level=20)
	def vqt(
	    y: np.ndarray,
	    *,
	    sr: float = 22050,
	    hop_length: int = 512,
	    fmin: Optional[_FloatLike_co] = None,
	    n_bins: int = 84,
	    intervals: Union[str, Collection[float]] = "equal",
	    gamma: Optional[float] = None,
	    bins_per_octave: int = 12,
	    tuning: Optional[float] = 0.0,
	    filter_scale: float = 1,
	    norm: Optional[float] = 1,
	    sparsity: float = 0.01,
	    window: _WindowSpec = "hann",
	    scale: bool = True,
	    pad_mode: _PadMode = "constant",
	    res_type: Optional[str] = "soxr_hq",
	    dtype: Optional[DTypeLike] = None,
	) -> np.ndarray:
	    """Compute the variable-Q transform (VQT) of an audio signal.

	    The computation follows the recursive sub-sampling method of [#]_:
	    the top octave is analyzed first, then the signal is repeatedly
	    downsampled by two (while the hop length stays even) and the next
	    octave down is analyzed.

	    .. [#] Schörkhuber, Christian, Anssi Klapuri, Nicki Holighaus, and Monika Dörfler.
	        "A Matlab toolbox for efficient perfect reconstruction time-frequency
	        transforms with log-frequency resolution."
	        In Audio Engineering Society Conference: 53rd International Conference: Semantic Audio.
	        Audio Engineering Society, 2014.

	    Parameters
	    ----------
	    y : np.ndarray [shape=(..., n)]
	        Audio time series. Multi-channel input is supported.
	    sr : number > 0 [scalar]
	        Sampling rate of ``y``.
	    hop_length : int > 0 [scalar]
	        Number of samples between successive VQT columns.
	    fmin : float > 0 [scalar]
	        Minimum frequency. Defaults to `C1 ~= 32.70 Hz`.
	    n_bins : int > 0 [scalar]
	        Number of frequency bins, starting at ``fmin``.
	    intervals : str or array of floats in [1, 2)
	        Either a string specification for an interval set, e.g.,
	        `'equal'`, `'pythagorean'`, `'ji3'`, etc. or an array of
	        intervals expressed as numbers between 1 and 2.
	        When an array is given, its length overrides ``bins_per_octave``.

	        .. seealso:: librosa.interval_frequencies
	    gamma : number > 0 [scalar]
	        Bandwidth offset for determining filter lengths.

	        If ``gamma=0``, produces the constant-Q transform.

	        If ``gamma=None``, gamma is calculated such that filter bandwidths
	        are equal to a constant fraction of the equivalent rectangular
	        bandwidths (ERB). This is accomplished by solving for the gamma
	        which gives::

	            B_k = alpha * f_k + gamma = C * ERB(f_k),

	        where ``B_k`` is the bandwidth of filter ``k`` with center
	        frequency ``f_k``, alpha is the inverse of what would be the
	        constant Q-factor, and ``C = alpha / 0.108`` is the constant
	        fraction across all filters.

	        Here we use ``ERB(f_k) = 24.7 + 0.108 * f_k``, the best-fit curve
	        derived from experimental data in [#]_.

	        .. [#] Glasberg, Brian R., and Brian CJ Moore.
	            "Derivation of auditory filter shapes from notched-noise data."
	            Hearing research 47.1-2 (1990): 103-138.
	    bins_per_octave : int > 0 [scalar]
	        Number of bins per octave.
	    tuning : None or float
	        Tuning offset in fractions of a bin.

	        If ``None``, the tuning is estimated automatically from the signal.

	        The minimum frequency of the resulting VQT is shifted to
	        ``fmin * 2**(tuning / bins_per_octave)``.
	    filter_scale : float > 0
	        Filter scale factor. Small values (<1) use shorter windows
	        for improved time resolution.
	    norm : {inf, -inf, 0, float > 0}
	        Type of norm used to normalize the basis functions.
	        See `librosa.util.normalize`.
	    sparsity : float in [0, 1)
	        Sparsify the VQT basis by discarding up to ``sparsity``
	        fraction of the energy in each basis.

	        Use ``sparsity=0`` to disable sparsification.
	    window : str, tuple, number, or function
	        Window specification for the basis filters.
	        See `filters.get_window` for details.
	    scale : bool
	        If ``True``, scale the VQT response by the square root of the
	        length of each channel's filter (analogous to ``norm='ortho'``
	        in FFT).

	        If ``False``, leave the VQT unscaled (analogous to ``norm=None``
	        in FFT).
	    pad_mode : string
	        Padding mode for centered frame analysis.

	        See also: `librosa.stft` and `numpy.pad`.
	    res_type : string
	        The resampling mode for recursive downsampling.
	        Passing ``None`` is deprecated: it emits a ``FutureWarning`` and
	        falls back to ``"soxr_hq"``.
	    dtype : np.dtype
	        The dtype of the output array. By default, it is inferred to
	        match the numerical precision of the input signal.

	    Returns
	    -------
	    VQT : np.ndarray [shape=(..., n_bins, t), dtype=np.complex]
	        Variable-Q value for each frequency at each time.

	    Raises
	    ------
	    ParameterError
	        If the filter cutoff frequency of the wavelet basis exceeds the
	        Nyquist frequency ``sr / 2``.

	    See Also
	    --------
	    cqt

	    Notes
	    -----
	    This function caches at level 20.

	    Examples
	    --------
	    Generate and plot a variable-Q power spectrum

	    >>> import matplotlib.pyplot as plt
	    >>> y, sr = librosa.load(librosa.ex('choice'), duration=5)
	    >>> C = np.abs(librosa.cqt(y, sr=sr))
	    >>> V = np.abs(librosa.vqt(y, sr=sr))
	    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
	    >>> librosa.display.specshow(librosa.amplitude_to_db(C, ref=np.max),
	    ...                          sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[0])
	    >>> ax[0].set(title='Constant-Q power spectrum', xlabel=None)
	    >>> ax[0].label_outer()
	    >>> img = librosa.display.specshow(librosa.amplitude_to_db(V, ref=np.max),
	    ...                                sr=sr, x_axis='time', y_axis='cqt_note', ax=ax[1])
	    >>> ax[1].set_title('Variable-Q power spectrum')
	    >>> fig.colorbar(img, ax=ax, format="%+2.0f dB")
	    """
	    # An explicit interval collection determines the bins per octave
	    if not isinstance(intervals, str):
	        bins_per_octave = len(intervals)

	    # Octaves are rounded up so a partial lowest octave is still covered
	    n_octaves = int(np.ceil(float(n_bins) / bins_per_octave))
	    n_filters = min(bins_per_octave, n_bins)

	    if fmin is None:
	        # Default lowest bin: C1
	        fmin = note_to_hz("C1")

	    if tuning is None:
	        tuning = estimate_tuning(y=y, sr=sr, bins_per_octave=bins_per_octave)

	    if dtype is None:
	        dtype = util.dtype_r2c(y.dtype)

	    # Shift fmin by the tuning offset
	    fmin = fmin * 2.0 ** (tuning / bins_per_octave)

	    # Center frequencies of all bins, in ascending order
	    freqs = interval_frequencies(
	        n_bins=n_bins,
	        fmin=fmin,
	        intervals=intervals,
	        bins_per_octave=bins_per_octave,
	        sort=True,
	    )

	    # Highest center frequency; only reported in the error message below
	    fmax_t: float = np.max(freqs[-bins_per_octave:])
	    alpha = __bpo_to_alpha(bins_per_octave)

	    # Arguments of wavelet_lengths that do not depend on the sampling rate
	    length_args = dict(
	        freqs=freqs,
	        window=window,
	        filter_scale=filter_scale,
	        gamma=gamma,
	        alpha=alpha,
	    )
	    lengths, filter_cutoff = filters.wavelet_lengths(sr=sr, **length_args)

	    # Nyquist frequency of the *input* rate (sr may be reduced below)
	    nyquist = sr / 2.0

	    if filter_cutoff > nyquist:
	        raise ParameterError(
	            f"Wavelet basis with max frequency={fmax_t} would exceed the Nyquist frequency={nyquist}. "
	            "Try reducing the number of frequency bins."
	        )

	    if res_type is None:
	        warnings.warn(
	            "Support for VQT with res_type=None is deprecated in librosa 0.10\n"
	            "and will be removed in version 1.0.",
	            category=FutureWarning,
	            stacklevel=2,
	        )
	        res_type = "soxr_hq"

	    # From here on y, sr and hop_length refer to the (possibly)
	    # early-downsampled signal.
	    y, sr, hop_length = __early_downsample(
	        y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
	    )

	    octave_responses = []

	    # Walk down the octaves, starting from the highest
	    oct_y, oct_sr, oct_hop = y, sr, hop_length

	    for i in range(n_octaves):
	        # Filters of the current octave, counted back from the top of freqs
	        upper = -n_filters * i if i > 0 else None
	        band = slice(-n_filters * (i + 1), upper)

	        # This may be incorrect with early downsampling
	        freqs_oct = freqs[band]

	        fft_basis, n_fft, _ = __vqt_filter_fft(
	            oct_sr,
	            freqs_oct,
	            filter_scale,
	            norm,
	            sparsity,
	            window=window,
	            gamma=gamma,
	            dtype=dtype,
	            alpha=alpha,
	        )

	        # Re-scale the filters to compensate for downsampling
	        fft_basis[:] *= np.sqrt(sr / oct_sr)

	        # Filter response of this octave
	        octave_responses.append(
	            __cqt_response(oct_y, n_fft, oct_hop, fft_basis, pad_mode, dtype=dtype)
	        )

	        # Halve the rate for the next octave while the hop remains even
	        if oct_hop % 2 == 0:
	            oct_hop //= 2
	            oct_sr /= 2.0
	            oct_y = audio.resample(
	                oct_y, orig_sr=2, target_sr=1, res_type=res_type, scale=True
	            )

	    V = __trim_stack(octave_responses, n_bins, dtype)

	    if scale:
	        # Lengths are recomputed because early downsampling may have
	        # changed the sampling rate
	        lengths, _ = filters.wavelet_lengths(sr=sr, **length_args)

	        # Broadcast the per-bin lengths along the frequency axis of V
	        lengths = util.expand_to(lengths, ndim=V.ndim, axes=-2)
	        V /= np.sqrt(lengths)

	    return V


	@cache(level=10)
	def __vqt_filter_fft(
	    sr,
	    freqs,
	    filter_scale,
	    norm,
	    sparsity,
	    hop_length=None,
	    window="hann",
	    gamma=0.0,
	    dtype=np.complex64,
	    alpha=None,
	):
	    """Generate the frequency domain variable-Q filter basis.

	    Parameters
	    ----------
	    sr, freqs, filter_scale, norm, window, gamma, alpha
	        Forwarded unchanged to `filters.wavelet`.
	    sparsity
	        Quantile passed to `util.sparsify_rows`.
	    hop_length : int or None
	        If given, the FFT size is raised to at least
	        ``2 ** (1 + ceil(log2(hop_length)))``.
	    dtype
	        Data type passed to `util.sparsify_rows`.

	    Returns
	    -------
	    fft_basis
	        Sparsified non-negative-frequency FFT of the filter basis.
	    n_fft : int
	        FFT size that was used.
	    lengths
	        Filter lengths as returned by `filters.wavelet`.
	    """
	    basis, lengths = filters.wavelet(
	        freqs=freqs,
	        sr=sr,
	        filter_scale=filter_scale,
	        norm=norm,
	        pad_fft=True,
	        window=window,
	        gamma=gamma,
	        alpha=alpha,
	    )

	    # Filters are padded up to the nearest integral power of 2
	    n_fft = basis.shape[1]

	    if hop_length is not None:
	        # Smallest FFT size allowed for this hop length
	        hop_n_fft = 2.0 ** (1 + np.ceil(np.log2(hop_length)))
	        if n_fft < hop_n_fft:
	            n_fft = int(hop_n_fft)

	    # Re-normalize the bases with respect to the FFT window length
	    basis *= lengths[:, np.newaxis] / float(n_fft)

	    # Transform along the time axis, keeping only non-negative frequencies
	    n_keep = (n_fft // 2) + 1
	    spectrum = get_fftlib().fft(basis, n=n_fft, axis=1)[:, :n_keep]

	    # Sparsify the basis
	    fft_basis = util.sparsify_rows(spectrum, quantile=sparsity, dtype=dtype)

	    return fft_basis, n_fft, lengths


	def __trim_stack(
	    cqt_resp: List[np.ndarray], n_bins: int, dtype: DTypeLike
	) -> np.ndarray:
	    """Trim a list of CQT responses to a common length and stack them.

	    Parameters
	    ----------
	    cqt_resp : list of np.ndarray [shape=(..., n_i, t_i)]
	        Response blocks, given from the highest-frequency block to the
	        lowest. Leading dimensions are taken from the first block.
	    n_bins : int
	        Number of rows (frequency bins) in the output.
	    dtype
	        Data type of the output array.

	    Returns
	    -------
	    np.ndarray [shape=(..., n_bins, min_i t_i)]
	        Fortran-ordered array. The first block fills the top rows and
	        each following block is placed directly below the previous one.
	        A block that does not fit entirely contributes only its highest
	        rows.
	    """
	    # Every block is cut to the shortest frame count
	    n_frames = min(resp.shape[-1] for resp in cqt_resp)

	    # Keep any leading dimensions of the first block
	    out_shape = list(cqt_resp[0].shape)
	    out_shape[-2], out_shape[-1] = n_bins, n_frames
	    stacked = np.empty(out_shape, dtype=dtype, order="F")

	    # `top` is the exclusive upper row index still to be filled
	    top = n_bins
	    for resp in cqt_resp:
	        height = resp.shape[-2]
	        if top >= height:
	            # The whole block fits
	            stacked[..., top - height : top, :] = resp[..., :n_frames]
	        else:
	            # Only the highest `top` rows of this block fit
	            stacked[..., :top, :] = resp[..., -top:, :n_frames]

	        top -= height

	    return stacked


	def __cqt_response(
	    y, n_fft, hop_length, fft_basis, mode, window="ones", phase=True, dtype=None
	):
	    """Compute the filter response with a target STFT hop.

	    The STFT of ``y`` is computed with the given ``n_fft``, ``hop_length``,
	    ``window`` and padding ``mode``, and then projected through
	    ``fft_basis`` one channel at a time.

	    Parameters
	    ----------
	    y
	        Input signal, passed to `stft`.
	    n_fft, hop_length, window, dtype
	        Passed to `stft`.
	    fft_basis
	        Filter basis; ``fft_basis.dot`` is applied to each channel's STFT
	        and ``fft_basis.shape[0]`` sets the number of output rows.
	    mode
	        Passed to `stft` as ``pad_mode``.
	    phase : bool
	        If ``False``, only the magnitude of the STFT is projected.

	    Returns
	    -------
	    np.ndarray
	        Array shaped like the STFT, except that the second-to-last axis
	        has ``fft_basis.shape[0]`` entries.
	    """
	    D = stft(
	        y, n_fft=n_fft, hop_length=hop_length, window=window, pad_mode=mode, dtype=dtype
	    )

	    if not phase:
	        D = np.abs(D)

	    n_rows = fft_basis.shape[0]

	    # Collapse all leading (channel) dimensions into a single axis
	    channels = D.reshape((-1, D.shape[-2], D.shape[-1]))
	    projected = np.empty(
	        (channels.shape[0], n_rows, channels.shape[-1]), dtype=D.dtype
	    )

	    # Project every channel through the basis
	    for idx, frames in enumerate(channels):
	        projected[idx] = fft_basis.dot(frames)

	    # Restore the leading dimensions of D
	    out_shape = list(D.shape)
	    out_shape[-2] = n_rows
	    return projected.reshape(out_shape)


	def __early_downsample_count(nyquist, filter_cutoff, hop_length, n_octaves):
	    """Compute the number of early downsampling operations.

	    The result is the smaller of two non-negative limits: one derived
	    from the ratio ``nyquist / filter_cutoff``, and one derived from how
	    many factors of two ``hop_length`` has relative to ``n_octaves``.
	    """
	    # Limit 1: based on log2 of the gap between Nyquist and the cutoff
	    log_ratio = int(np.ceil(np.log2(nyquist / filter_cutoff)) - 1)
	    limit_by_cutoff = max(0, log_ratio - 1)

	    # Limit 2: factors of two in the hop beyond what n_octaves accounts for
	    spare_twos = __num_two_factors(hop_length) - n_octaves + 1
	    limit_by_hop = max(0, spare_twos)

	    return min(limit_by_cutoff, limit_by_hop)


	def __early_downsample(
	    y, sr, hop_length, res_type, n_octaves, nyquist, filter_cutoff, scale
	):
	    """Perform early downsampling on an audio signal, if it applies.

	    The number of halvings comes from `__early_downsample_count`. When it
	    is positive, the signal is resampled by ``2 ** count`` and the
	    sampling rate and hop length are divided by the same factor.

	    Parameters
	    ----------
	    y : np.ndarray [shape=(..., n)]
	        Input signal; the last axis is time.
	    sr : number
	        Sampling rate of ``y``.
	    hop_length : int
	        Hop length in samples.
	    res_type : str
	        Resampling mode passed to `audio.resample`.
	    n_octaves : int
	        Number of octaves; also reported in the error message.
	    nyquist, filter_cutoff : number
	        Passed to `__early_downsample_count`.
	    scale : bool
	        If ``False``, the downsampled signal is multiplied by
	        ``sqrt(downsample_factor)`` to compensate here, since no
	        length-scaling follows the CQT.

	    Returns
	    -------
	    y, sr, hop_length
	        The (possibly downsampled) signal with its sampling rate and hop
	        length. The inputs are returned unchanged when no downsampling
	        applies.

	    Raises
	    ------
	    ParameterError
	        If the signal has fewer samples along its last axis than the
	        downsampling factor.
	    """
	    downsample_count = __early_downsample_count(
	        nyquist, filter_cutoff, hop_length, n_octaves
	    )

	    if downsample_count <= 0:
	        # Nothing to do
	        return y, sr, hop_length

	    downsample_factor = 2 ** (downsample_count)

	    hop_length //= downsample_factor

	    if y.shape[-1] < downsample_factor:
	        # Report the time-axis length. len(y) would be the size of the
	        # first axis, which is not the signal length for multi-channel
	        # input.
	        raise ParameterError(
	            f"Input signal length={y.shape[-1]:d} is too short for "
	            f"{n_octaves:d}-octave CQT"
	        )

	    new_sr = sr / float(downsample_factor)
	    y = audio.resample(
	        y, orig_sr=downsample_factor, target_sr=1, res_type=res_type, scale=True
	    )

	    # If we're not going to length-scale after CQT, we
	    # need to compensate for the downsampling factor here
	    if not scale:
	        y *= np.sqrt(downsample_factor)

	    return y, new_sr, hop_length


	@jit(nopython=True, cache=False)
	def __num_two_factors(x):
	    """Count how many times the integer ``x`` can be evenly divided by 2.

	    Non-positive values of ``x`` yield 0.
	    """
	    count = 0
	    if x > 0:
	        # Strip one factor of two per iteration until x becomes odd
	        while x % 2 == 0:
	            x //= 2
	            count += 1

	    return count


	def griffinlim_cqt(
	    C: np.ndarray,
	    *,
	    n_iter: int = 32,
	    sr: float = 22050,
	    hop_length: int = 512,
	    fmin: Optional[_FloatLike_co] = None,
	    bins_per_octave: int = 12,
	    tuning: float = 0.0,
	    filter_scale: float = 1,
	    norm: Optional[float] = 1,
	    sparsity: float = 0.01,
	    window: _WindowSpec = "hann",
	    scale: bool = True,
	    pad_mode: _PadMode = "constant",
	    res_type: str = "soxr_hq",
	    dtype: Optional[DTypeLike] = None,
	    length: Optional[int] = None,
	    momentum: float = 0.99,
	    init: Optional[str] = "random",
	    random_state: Optional[
	        Union[int, np.random.RandomState, np.random.Generator]
	    ] = None,
	) -> np.ndarray:
	    """Estimate a time-domain signal from a constant-Q magnitude spectrogram
	    using the "fast" Griffin-Lim algorithm.

	    Starting from an initial phase estimate (random by default), the algorithm
	    repeatedly applies an inverse CQT followed by a forward CQT, and refines
	    the phase estimate after each round trip. [#]_

	    The procedure follows the (fast) Griffin-Lim method for Short-time Fourier
	    Transforms, [#]_ adapted here to constant-Q spectrograms.

	    .. [#] D. W. Griffin and J. S. Lim,
	        "Signal estimation from modified short-time Fourier transform,"
	        IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984.

	    .. [#] Perraudin, N., Balazs, P., & Søndergaard, P. L.
	        "A fast Griffin-Lim algorithm,"
	        IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (pp. 1-4),
	        Oct. 2013.

	    Parameters
	    ----------
	    C : np.ndarray [shape=(..., n_bins, n_frames)]
	        The constant-Q magnitude spectrogram

	    n_iter : int > 0
	        The number of inverse/forward CQT iterations to run

	    sr : number > 0
	        Audio sampling rate

	    hop_length : int > 0
	        The hop length of the CQT

	    fmin : number > 0
	        Minimum frequency for the CQT.

	        If not provided, it defaults to `C1`.

	    bins_per_octave : int > 0
	        Number of bins per octave

	    tuning : float
	        Tuning deviation from A440, in fractions of a bin

	    filter_scale : float > 0
	        Filter scale factor. Small values (<1) use shorter windows
	        for improved time resolution.

	    norm : {inf, -inf, 0, float > 0}
	        Type of norm to use for basis function normalization.
	        See `librosa.util.normalize`.

	    sparsity : float in [0, 1)
	        Sparsify the CQT basis by discarding up to ``sparsity``
	        fraction of the energy in each basis.

	        Set ``sparsity=0`` to disable sparsification.

	    window : str, tuple, or function
	        Window specification for the basis filters.
	        See `filters.get_window` for details.

	    scale : bool
	        If ``True``, scale the CQT response by square-root the length
	        of each channel's filter. This is analogous to ``norm='ortho'``
	        in FFT.

	        If ``False``, do not scale the CQT. This is analogous to ``norm=None``
	        in FFT.

	    pad_mode : string
	        Padding mode for centered frame analysis.
	        This is only used by the forward CQT step.

	        See also: `librosa.stft` and `numpy.pad`.

	    res_type : string
	        The resampling mode for recursive downsampling.

	        See ``librosa.resample`` for a list of available options.

	    dtype : numeric type
	        Real numeric type for ``y``. Default is inferred to match the precision
	        of the input CQT.

	    length : int > 0, optional
	        If provided, the output ``y`` is zero-padded or clipped to exactly
	        ``length`` samples.

	    momentum : float >= 0
	        The momentum parameter for fast Griffin-Lim.
	        Setting this to 0 recovers the original Griffin-Lim method.
	        Values near 1 can lead to faster convergence, but above 1 may not converge.

	    init : None or 'random' [default]
	        If 'random' (the default), then phase values are initialized randomly
	        according to ``random_state``. This is recommended when the input ``C`` is
	        a magnitude spectrogram with no initial phase estimates.

	        If ``None``, then the phase is initialized from ``C``. This is useful when
	        an initial guess for phase can be provided, or when you want to resume
	        Griffin-Lim from a previous output.

	    random_state : None, int, np.random.RandomState, or np.random.Generator
	        If int, random_state is the seed used by the random number generator
	        for phase initialization.

	        If `np.random.RandomState` or `np.random.Generator` instance, the random number generator itself.

	        If ``None``, defaults to the `np.random.default_rng()` object.

	    Returns
	    -------
	    y : np.ndarray [shape=(..., n)]
	        time-domain signal reconstructed from ``C``

	    Raises
	    ------
	    ParameterError
	        If ``random_state`` is not one of the supported types,
	        if ``momentum < 0``, or if ``init`` is neither ``None`` nor 'random'.

	    Warns
	    -----
	    UserWarning
	        If ``momentum > 1``, since the iteration may then be unstable.

	    See Also
	    --------
	    cqt
	    icqt
	    griffinlim
	    filters.get_window
	    resample

	    Examples
	    --------
	    A basic CQT inverse example

	    >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), sr=None)
	    >>> # Get the CQT magnitude, 7 octaves at 36 bins per octave
	    >>> C = np.abs(librosa.cqt(y=y, sr=sr, bins_per_octave=36, n_bins=7*36))
	    >>> # Invert using Griffin-Lim
	    >>> y_inv = librosa.griffinlim_cqt(C, sr=sr, bins_per_octave=36)
	    >>> # And invert without estimating phase
	    >>> y_icqt = librosa.icqt(C, sr=sr, bins_per_octave=36)

	    Wave-plot the results

	    >>> import matplotlib.pyplot as plt
	    >>> fig, ax = plt.subplots(nrows=3, sharex=True, sharey=True)
	    >>> librosa.display.waveshow(y, sr=sr, color='b', ax=ax[0])
	    >>> ax[0].set(title='Original', xlabel=None)
	    >>> ax[0].label_outer()
	    >>> librosa.display.waveshow(y_inv, sr=sr, color='g', ax=ax[1])
	    >>> ax[1].set(title='Griffin-Lim reconstruction', xlabel=None)
	    >>> ax[1].label_outer()
	    >>> librosa.display.waveshow(y_icqt, sr=sr, color='r', ax=ax[2])
	    >>> ax[2].set(title='Magnitude-only icqt reconstruction')
	    """
	    # Fall back to the default lowest analysis frequency
	    if fmin is None:
	        fmin = note_to_hz("C1")

	    # Resolve the source of randomness used for phase initialization
	    if random_state is None:
	        rng = np.random.default_rng()
	    elif isinstance(random_state, int):
	        rng = np.random.RandomState(seed=random_state)  # type: ignore
	    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
	        rng = random_state  # type: ignore
	    else:
	        _ensure_not_reachable(random_state)
	        raise ParameterError(f"Unsupported random_state={random_state!r}")

	    # Negative momentum is rejected outright; momentum above 1 only warns
	    if momentum < 0:
	        raise ParameterError(f"griffinlim_cqt() called with momentum={momentum} < 0")
	    if momentum > 1:
	        # The warning is issued from this frame so that stacklevel=2
	        # attributes it to the caller of griffinlim_cqt
	        warnings.warn(
	            f"Griffin-Lim with momentum={momentum} > 1 can be unstable. "
	            "Proceed with caution!",
	            stacklevel=2,
	        )

	    # complex64 keeps the phase estimates at the minimal necessary precision
	    angles = np.empty(C.shape, dtype=np.complex64)
	    eps = util.tiny(angles)

	    if init is None:
	        # Unit phasors everywhere: C * angles then carries C's own phase
	        angles[:] = 1.0
	    elif init == "random":
	        # Draw the initial phase uniformly at random
	        angles[:] = util.phasor(2 * np.pi * rng.random(size=C.shape))
	    else:
	        raise ParameterError(f"init={init} must either None or 'random'")

	    # Keyword arguments common to both the forward and inverse transforms
	    shared_kwargs = dict(
	        sr=sr,
	        hop_length=hop_length,
	        bins_per_octave=bins_per_octave,
	        fmin=fmin,
	        tuning=tuning,
	        filter_scale=filter_scale,
	        window=window,
	        res_type=res_type,
	        norm=norm,
	        scale=scale,
	        sparsity=sparsity,
	    )
	    # The inverse transform additionally controls output length and dtype
	    inverse_kwargs = dict(shared_kwargs, length=length, dtype=dtype)

	    # Weight applied to the previous iterate in the momentum update
	    previous_weight = momentum / (1 + momentum)

	    # The previous iterate starts out as zero
	    rebuilt: np.ndarray = np.array(0.0)

	    for _ in range(n_iter):
	        # Remember the spectrogram rebuilt on the previous pass
	        previous = rebuilt

	        # Invert with the current phase estimates ...
	        inverse = icqt(C * angles, **inverse_kwargs)

	        # ... and re-analyze the result with the same number of bins as C
	        rebuilt = cqt(
	            inverse,
	            n_bins=C.shape[-2],
	            pad_mode=pad_mode,
	            **shared_kwargs,
	        )

	        # Momentum update, followed by renormalization to unit modulus
	        # (eps guards against division by zero)
	        angles[:] = rebuilt - previous_weight * previous
	        angles[:] /= np.abs(angles) + eps

	    # Invert one last time using the final phase estimates
	    return icqt(C * angles, **inverse_kwargs)


	def __bpo_to_alpha(bins_per_octave: int) -> float:
	    """Compute the alpha coefficient for a given number of bins per octave

	    With ``r = 2 ** (1 / bins_per_octave)`` denoting the frequency ratio
	    between adjacent bins, the coefficient is
	    ``alpha = (r**2 - 1) / (r**2 + 1)``.

	    Parameters
	    ----------
	    bins_per_octave : int
	        Number of bins per octave.  Must be non-zero.

	    Returns
	    -------
	    alpha : number > 0
	        The alpha coefficient; it lies in (0, 1) for positive
	        ``bins_per_octave``.
	    """
	    # Frequency ratio between two adjacent bins
	    r = 2 ** (1 / bins_per_octave)

	    # NOTE: the squares must be written out as r**2; the previous text
	    # referenced an undefined name ``r2`` and raised NameError on every call.
	    return (r**2 - 1) / (r**2 + 1)