|
|
|
|
|
|
|
|
""" |
|
|
Effects |
|
|
======= |
|
|
|
|
|
Harmonic-percussive source separation |
|
|
------------------------------------- |
|
|
.. autosummary:: |
|
|
:toctree: generated/ |
|
|
|
|
|
hpss |
|
|
harmonic |
|
|
percussive |
|
|
|
|
|
Time and frequency |
|
|
------------------ |
|
|
.. autosummary:: |
|
|
:toctree: generated/ |
|
|
|
|
|
time_stretch |
|
|
pitch_shift |
|
|
|
|
|
Miscellaneous |
|
|
------------- |
|
|
.. autosummary:: |
|
|
:toctree: generated/ |
|
|
|
|
|
remix |
|
|
trim |
|
|
split |
|
|
preemphasis |
|
|
deemphasis |
|
|
""" |
|
|
|
|
|
import numpy as np |
|
|
import scipy.signal |
|
|
|
|
|
from . import core |
|
|
from . import decompose |
|
|
from . import feature |
|
|
from . import util |
|
|
from .util.exceptions import ParameterError |
|
|
from .util.decorators import deprecate_positional_args |
|
|
|
|
|
__all__ = [ |
|
|
"hpss", |
|
|
"harmonic", |
|
|
"percussive", |
|
|
"time_stretch", |
|
|
"pitch_shift", |
|
|
"remix", |
|
|
"trim", |
|
|
"split", |
|
|
] |
|
|
|
|
|
|
|
|
def hpss(y, **kwargs): |
|
|
"""Decompose an audio time series into harmonic and percussive components. |
|
|
|
|
|
    This function automates the STFT -> HPSS -> ISTFT pipeline, and ensures
    that the output waveforms have the same length as the input waveform ``y``.
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., n)] |
|
|
audio time series. Multi-channel is supported. |
|
|
**kwargs : additional keyword arguments. |
|
|
See `librosa.decompose.hpss` for details. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_harmonic : np.ndarray [shape=(..., n)] |
|
|
audio time series of the harmonic elements |
|
|
y_percussive : np.ndarray [shape=(..., n)] |
|
|
audio time series of the percussive elements |
|
|
|
|
|
See Also |
|
|
-------- |
|
|
harmonic : Extract only the harmonic component |
|
|
percussive : Extract only the percussive component |
|
|
librosa.decompose.hpss : HPSS on spectrograms |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
>>> # Extract harmonic and percussive components |
|
|
>>> y, sr = librosa.load(librosa.ex('choice')) |
|
|
>>> y_harmonic, y_percussive = librosa.effects.hpss(y) |
|
|
|
|
|
>>> # Get a more isolated percussive component by widening its margin |
|
|
>>> y_harmonic, y_percussive = librosa.effects.hpss(y, margin=(1.0,5.0)) |
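
    With the default (soft-mask) settings, the two components sum back to
    the input up to small numerical error; a quick sketch of that check:

    >>> y_h, y_p = librosa.effects.hpss(y)
    >>> residual = y - (y_h + y_p)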
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
    # Compute the short-time Fourier transform of the input
    stft = core.stft(y)
|
|
|
|
|
|
|
|
    # Decompose into harmonic and percussive spectrograms
    stft_harm, stft_perc = decompose.hpss(stft, **kwargs)
|
|
|
|
|
|
|
|
    # Invert the STFTs, matching the length of the input signal
    y_harm = core.istft(stft_harm, dtype=y.dtype, length=y.shape[-1])
|
|
y_perc = core.istft(stft_perc, dtype=y.dtype, length=y.shape[-1]) |
|
|
|
|
|
return y_harm, y_perc |
|
|
|
|
|
|
|
|
def harmonic(y, **kwargs): |
|
|
"""Extract harmonic elements from an audio time-series. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., n)] |
|
|
audio time series. Multi-channel is supported. |
|
|
**kwargs : additional keyword arguments. |
|
|
See `librosa.decompose.hpss` for details. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_harmonic : np.ndarray [shape=(..., n)] |
|
|
audio time series of just the harmonic portion |
|
|
|
|
|
See Also |
|
|
-------- |
|
|
hpss : Separate harmonic and percussive components |
|
|
percussive : Extract only the percussive component |
|
|
librosa.decompose.hpss : HPSS for spectrograms |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
>>> # Extract harmonic component |
|
|
>>> y, sr = librosa.load(librosa.ex('choice')) |
|
|
>>> y_harmonic = librosa.effects.harmonic(y) |
|
|
|
|
|
>>> # Use a margin > 1.0 for greater harmonic separation |
|
|
>>> y_harmonic = librosa.effects.harmonic(y, margin=3.0) |
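
    Keyword arguments are forwarded to `librosa.decompose.hpss`; as a
    sketch, a larger median-filter kernel gives a smoother separation:

    >>> y_harmonic = librosa.effects.harmonic(y, kernel_size=31)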
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
stft = core.stft(y) |
|
|
|
|
|
|
|
|
    # Keep only the harmonic component of the decomposition
    stft_harm = decompose.hpss(stft, **kwargs)[0]
|
|
|
|
|
|
|
|
y_harm = core.istft(stft_harm, dtype=y.dtype, length=y.shape[-1]) |
|
|
|
|
|
return y_harm |
|
|
|
|
|
|
|
|
def percussive(y, **kwargs): |
|
|
"""Extract percussive elements from an audio time-series. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., n)] |
|
|
audio time series. Multi-channel is supported. |
|
|
**kwargs : additional keyword arguments. |
|
|
See `librosa.decompose.hpss` for details. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_percussive : np.ndarray [shape=(..., n)] |
|
|
audio time series of just the percussive portion |
|
|
|
|
|
See Also |
|
|
-------- |
|
|
hpss : Separate harmonic and percussive components |
|
|
harmonic : Extract only the harmonic component |
|
|
librosa.decompose.hpss : HPSS for spectrograms |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
>>> # Extract percussive component |
|
|
>>> y, sr = librosa.load(librosa.ex('choice')) |
|
|
>>> y_percussive = librosa.effects.percussive(y) |
|
|
|
|
|
>>> # Use a margin > 1.0 for greater percussive separation |
|
|
>>> y_percussive = librosa.effects.percussive(y, margin=3.0) |
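
    As a sketch, separate median-filter kernel sizes for the harmonic and
    percussive axes may also be forwarded to `librosa.decompose.hpss`:

    >>> y_percussive = librosa.effects.percussive(y, kernel_size=(17, 31))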
|
|
|
|
|
""" |
|
|
|
|
|
|
|
|
stft = core.stft(y) |
|
|
|
|
|
|
|
|
    # Keep only the percussive component of the decomposition
    stft_perc = decompose.hpss(stft, **kwargs)[1]
|
|
|
|
|
|
|
|
y_perc = core.istft(stft_perc, dtype=y.dtype, length=y.shape[-1]) |
|
|
|
|
|
return y_perc |
|
|
|
|
|
|
|
|
@deprecate_positional_args |
|
|
def time_stretch(y, *, rate, **kwargs): |
|
|
"""Time-stretch an audio series by a fixed rate. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., n)] |
|
|
audio time series. Multi-channel is supported. |
|
|
rate : float > 0 [scalar] |
|
|
Stretch factor. If ``rate > 1``, then the signal is sped up. |
|
|
If ``rate < 1``, then the signal is slowed down. |
|
|
**kwargs : additional keyword arguments. |
|
|
        See `librosa.stft` for details.
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_stretch : np.ndarray [shape=(..., round(n/rate))] |
|
|
audio time series stretched by the specified rate |
|
|
|
|
|
See Also |
|
|
-------- |
|
|
pitch_shift : |
|
|
pitch shifting |
|
|
librosa.phase_vocoder : |
|
|
spectrogram phase vocoder |
|
|
pyrubberband.pyrb.time_stretch : |
|
|
high-quality time stretching using RubberBand |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
Compress to be twice as fast |
|
|
|
|
|
>>> y, sr = librosa.load(librosa.ex('choice')) |
|
|
>>> y_fast = librosa.effects.time_stretch(y, rate=2.0) |
|
|
|
|
|
Or half the original speed |
|
|
|
|
|
>>> y_slow = librosa.effects.time_stretch(y, rate=0.5) |
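
    STFT parameters pass through ``**kwargs``; for example, a sketch with
    a longer analysis window:

    >>> y_slow_smooth = librosa.effects.time_stretch(y, rate=0.5,
    ...                                              n_fft=4096, hop_length=1024)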
|
|
|
|
|
""" |
|
|
|
|
|
if rate <= 0: |
|
|
raise ParameterError("rate must be a positive number") |
|
|
|
|
|
|
|
|
    # Construct the STFT, forwarding any STFT keyword arguments
    stft = core.stft(y, **kwargs)
|
|
|
|
|
|
|
|
    # Stretch by phase vocoding
    stft_stretch = core.phase_vocoder(
|
|
stft, |
|
|
rate=rate, |
|
|
hop_length=kwargs.get("hop_length", None), |
|
|
n_fft=kwargs.get("n_fft", None), |
|
|
) |
|
|
|
|
|
|
|
|
    # Predict the length of the stretched output
    len_stretch = int(round(y.shape[-1] / rate))
|
|
|
|
|
|
|
|
    # Invert the stretched STFT at the predicted length
    y_stretch = core.istft(stft_stretch, dtype=y.dtype, length=len_stretch, **kwargs)
|
|
|
|
|
return y_stretch |
|
|
|
|
|
|
|
|
@deprecate_positional_args |
|
|
def pitch_shift( |
|
|
y, *, sr, n_steps, bins_per_octave=12, res_type="kaiser_best", **kwargs |
|
|
): |
|
|
"""Shift the pitch of a waveform by ``n_steps`` steps. |
|
|
|
|
|
A step is equal to a semitone if ``bins_per_octave`` is set to 12. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., n)] |
|
|
audio time series. Multi-channel is supported. |
|
|
|
|
|
sr : number > 0 [scalar] |
|
|
audio sampling rate of ``y`` |
|
|
|
|
|
n_steps : float [scalar] |
|
|
how many (fractional) steps to shift ``y`` |
|
|
|
|
|
    bins_per_octave : int > 0 [scalar]
|
|
how many steps per octave |
|
|
|
|
|
res_type : string |
|
|
Resample type. By default, 'kaiser_best' is used. |
|
|
|
|
|
See `librosa.resample` for more information. |
|
|
|
|
|
**kwargs : additional keyword arguments. |
|
|
        See `librosa.stft` for details.
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_shift : np.ndarray [shape=(..., n)] |
|
|
The pitch-shifted audio time-series |
|
|
|
|
|
See Also |
|
|
-------- |
|
|
time_stretch : |
|
|
time stretching |
|
|
librosa.phase_vocoder : |
|
|
spectrogram phase vocoder |
|
|
pyrubberband.pyrb.pitch_shift : |
|
|
high-quality pitch shifting using RubberBand |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
Shift up by a major third (four steps if ``bins_per_octave`` is 12) |
|
|
|
|
|
>>> y, sr = librosa.load(librosa.ex('choice')) |
|
|
>>> y_third = librosa.effects.pitch_shift(y, sr=sr, n_steps=4) |
|
|
|
|
|
Shift down by a tritone (six steps if ``bins_per_octave`` is 12) |
|
|
|
|
|
>>> y_tritone = librosa.effects.pitch_shift(y, sr=sr, n_steps=-6) |
|
|
|
|
|
Shift up by 3 quarter-tones |
|
|
|
|
|
>>> y_three_qt = librosa.effects.pitch_shift(y, sr=sr, n_steps=3, |
|
|
... bins_per_octave=24) |
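
    The resampling backend can also be changed; as a sketch, using the
    FFT-based resampler:

    >>> y_third_fft = librosa.effects.pitch_shift(y, sr=sr, n_steps=4,
    ...                                           res_type='fft')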
|
|
""" |
|
|
|
|
|
if bins_per_octave < 1 or not np.issubdtype(type(bins_per_octave), np.integer): |
|
|
raise ParameterError("bins_per_octave must be a positive integer.") |
|
|
|
|
|
    # A shift of n_steps corresponds to stretching time by this rate
    rate = 2.0 ** (-float(n_steps) / bins_per_octave)
|
|
|
|
|
|
|
|
    # Stretch in time, then resample back to the original sample rate
    y_shift = core.resample(
|
|
time_stretch(y, rate=rate, **kwargs), |
|
|
orig_sr=float(sr) / rate, |
|
|
target_sr=sr, |
|
|
res_type=res_type, |
|
|
) |
|
|
|
|
|
|
|
|
    # Crop or pad to the same length as the input
    return util.fix_length(y_shift, size=y.shape[-1])
|
|
|
|
|
|
|
|
@deprecate_positional_args |
|
|
def remix(y, intervals, *, align_zeros=True): |
|
|
"""Remix an audio signal by re-ordering time intervals. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., t)] |
|
|
Audio time series. Multi-channel is supported. |
|
|
intervals : iterable of tuples (start, end) |
|
|
An iterable (list-like or generator) where the ``i``th item |
|
|
``intervals[i]`` indicates the start and end (in samples) |
|
|
of a slice of ``y``. |
|
|
align_zeros : boolean |
|
|
If ``True``, interval boundaries are mapped to the closest |
|
|
zero-crossing in ``y``. If ``y`` is stereo, zero-crossings |
|
|
are computed after converting to mono. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_remix : np.ndarray [shape=(..., d)] |
|
|
``y`` remixed in the order specified by ``intervals`` |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
Load in the example track and reverse the beats |
|
|
|
|
|
>>> y, sr = librosa.load(librosa.ex('choice')) |
|
|
|
|
|
Compute beats |
|
|
|
|
|
>>> _, beat_frames = librosa.beat.beat_track(y=y, sr=sr, |
|
|
... hop_length=512) |
|
|
|
|
|
Convert from frames to sample indices |
|
|
|
|
|
>>> beat_samples = librosa.frames_to_samples(beat_frames) |
|
|
|
|
|
Generate intervals from consecutive events |
|
|
|
|
|
>>> intervals = librosa.util.frame(beat_samples, frame_length=2, |
|
|
... hop_length=1).T |
|
|
|
|
|
Reverse the beat intervals |
|
|
|
|
|
>>> y_out = librosa.effects.remix(y, intervals[::-1]) |
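
    Intervals may also repeat; as a sketch, loop the first beat four times:

    >>> y_loop = librosa.effects.remix(y, [intervals[0]] * 4)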
|
|
""" |
|
|
|
|
|
y_out = [] |
|
|
|
|
|
    if align_zeros:
        # Locate zero-crossings on a mono mixdown of the signal
        y_mono = core.to_mono(y)
        zeros = np.nonzero(core.zero_crossings(y_mono))[-1]
        # Include the end of the signal as a valid boundary
        zeros = np.append(zeros, [len(y_mono)])
|
|
|
|
|
for interval in intervals: |
|
|
|
|
|
if align_zeros: |
|
|
            # Snap the interval boundaries to the nearest zero-crossings
            interval = zeros[util.match_events(interval, zeros)]
|
|
|
|
|
y_out.append(y[..., interval[0] : interval[1]]) |
|
|
|
|
|
return np.concatenate(y_out, axis=-1) |
|
|
|
|
|
|
|
|
def _signal_to_frame_nonsilent( |
|
|
y, frame_length=2048, hop_length=512, top_db=60, ref=np.max, aggregate=np.max |
|
|
): |
|
|
"""Frame-wise non-silent indicator for audio input. |
|
|
|
|
|
This is a helper function for `trim` and `split`. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray |
|
|
        Audio signal, mono or multi-channel
|
|
|
|
|
frame_length : int > 0 |
|
|
The number of samples per frame |
|
|
|
|
|
hop_length : int > 0 |
|
|
The number of samples between frames |
|
|
|
|
|
top_db : number > 0 |
|
|
The threshold (in decibels) below reference to consider as |
|
|
silence |
|
|
|
|
|
ref : callable or float |
|
|
The reference amplitude |
|
|
|
|
|
aggregate : callable [default: np.max] |
|
|
Function to aggregate dB measurements across channels (if y.ndim > 1) |
|
|
|
|
|
Note: for multiple leading axes, this is performed using ``np.apply_over_axes``. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
non_silent : np.ndarray, shape=(m,), dtype=bool |
|
|
Indicator of non-silent frames |
|
|
""" |
|
|
|
|
|
|
|
|
    # Compute the frame-wise RMS energy of the signal
    rms = feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)
|
|
|
|
|
|
|
|
    # Convert to decibels, slicing out the singleton frequency axis
    db = core.amplitude_to_db(rms[..., 0, :], ref=ref, top_db=None)
|
|
|
|
|
|
|
|
    # Aggregate dB measurements over any leading (channel) axes
    if db.ndim > 1:
        db = np.apply_over_axes(aggregate, db, range(db.ndim - 1))
        # Squeeze out the leading singleton dimensions, keeping the
        # trailing (time) axis intact
        db = np.squeeze(db, axis=tuple(range(db.ndim - 1)))

    # A frame is non-silent if it lies within top_db of the reference
    return db > -top_db
|
|
|
|
|
|
|
|
@deprecate_positional_args |
|
|
def trim( |
|
|
y, *, top_db=60, ref=np.max, frame_length=2048, hop_length=512, aggregate=np.max |
|
|
): |
|
|
"""Trim leading and trailing silence from an audio signal. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray, shape=(..., n) |
|
|
Audio signal. Multi-channel is supported. |
|
|
top_db : number > 0 |
|
|
The threshold (in decibels) below reference to consider as |
|
|
silence |
|
|
ref : number or callable |
|
|
The reference amplitude. By default, it uses `np.max` and compares |
|
|
to the peak amplitude in the signal. |
|
|
frame_length : int > 0 |
|
|
The number of samples per analysis frame |
|
|
hop_length : int > 0 |
|
|
The number of samples between analysis frames |
|
|
aggregate : callable [default: np.max] |
|
|
Function to aggregate across channels (if y.ndim > 1) |
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_trimmed : np.ndarray, shape=(..., m) |
|
|
The trimmed signal |
|
|
index : np.ndarray, shape=(2,) |
|
|
the interval of ``y`` corresponding to the non-silent region: |
|
|
``y_trimmed = y[index[0]:index[1]]`` (for mono) or |
|
|
``y_trimmed = y[:, index[0]:index[1]]`` (for stereo). |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
>>> # Load some audio |
|
|
>>> y, sr = librosa.load(librosa.ex('choice')) |
|
|
>>> # Trim the beginning and ending silence |
|
|
>>> yt, index = librosa.effects.trim(y) |
|
|
>>> # Print the durations |
|
|
    >>> print(librosa.get_duration(y=y), librosa.get_duration(y=yt))
|
|
25.025986394557822 25.007891156462584 |
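
    A smaller ``top_db`` treats more of the signal as silence and trims
    more aggressively; a quick sketch:

    >>> yt_strict, index_strict = librosa.effects.trim(y, top_db=20)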
|
|
""" |
|
|
|
|
|
non_silent = _signal_to_frame_nonsilent( |
|
|
y, |
|
|
frame_length=frame_length, |
|
|
hop_length=hop_length, |
|
|
ref=ref, |
|
|
top_db=top_db, |
|
|
aggregate=aggregate, |
|
|
) |
|
|
|
|
|
nonzero = np.flatnonzero(non_silent) |
|
|
|
|
|
if nonzero.size > 0: |
|
|
|
|
|
|
|
|
        # Map the first and last non-silent frames to sample positions
        start = int(core.frames_to_samples(nonzero[0], hop_length=hop_length))
|
|
end = min( |
|
|
y.shape[-1], |
|
|
int(core.frames_to_samples(nonzero[-1] + 1, hop_length=hop_length)), |
|
|
) |
|
|
else: |
|
|
|
|
|
        # The signal is entirely silent: return an empty interval
        start, end = 0, 0
|
|
|
|
|
|
|
|
    # Slice only along the trailing (time) axis
    full_index = [slice(None)] * y.ndim
|
|
full_index[-1] = slice(start, end) |
|
|
|
|
|
return y[tuple(full_index)], np.asarray([start, end]) |
|
|
|
|
|
|
|
|
@deprecate_positional_args |
|
|
def split( |
|
|
y, *, top_db=60, ref=np.max, frame_length=2048, hop_length=512, aggregate=np.max |
|
|
): |
|
|
"""Split an audio signal into non-silent intervals. |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray, shape=(..., n) |
|
|
An audio signal. Multi-channel is supported. |
|
|
top_db : number > 0 |
|
|
The threshold (in decibels) below reference to consider as |
|
|
silence |
|
|
ref : number or callable |
|
|
The reference amplitude. By default, it uses `np.max` and compares |
|
|
to the peak amplitude in the signal. |
|
|
frame_length : int > 0 |
|
|
The number of samples per analysis frame |
|
|
hop_length : int > 0 |
|
|
The number of samples between analysis frames |
|
|
aggregate : callable [default: np.max] |
|
|
Function to aggregate across channels (if y.ndim > 1) |
|
|
|
|
|
Returns |
|
|
------- |
|
|
intervals : np.ndarray, shape=(m, 2) |
|
|
``intervals[i] == (start_i, end_i)`` are the start and end time |
|
|
(in samples) of non-silent interval ``i``. |
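
    Examples
    --------
    As a rough sketch, split an example clip on silence and stitch the
    non-silent intervals back together (dropping the gaps) via `remix`:

    >>> y, sr = librosa.load(librosa.ex('choice'))
    >>> intervals = librosa.effects.split(y, top_db=30)
    >>> y_nonsilent = librosa.effects.remix(y, intervals, align_zeros=False)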
|
|
|
|
|
""" |
|
|
|
|
|
non_silent = _signal_to_frame_nonsilent( |
|
|
y, |
|
|
frame_length=frame_length, |
|
|
hop_length=hop_length, |
|
|
ref=ref, |
|
|
top_db=top_db, |
|
|
aggregate=aggregate, |
|
|
) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
    # Find points where the non-silent indicator changes value
    edges = np.flatnonzero(np.diff(non_silent.astype(int)))
|
|
|
|
|
|
|
|
    # Shift by one to compensate for the sample lost to np.diff
    edges = [edges + 1]
|
|
|
|
|
|
|
|
    # If the first frame is non-silent, the first interval starts at 0
    if non_silent[0]:
|
|
edges.insert(0, [0]) |
|
|
|
|
|
|
|
|
    # Likewise, if the last frame is non-silent, run to the end
    if non_silent[-1]:
|
|
edges.append([len(non_silent)]) |
|
|
|
|
|
|
|
|
    # Convert from frame indices to sample indices
    edges = core.frames_to_samples(np.concatenate(edges), hop_length=hop_length)
|
|
|
|
|
|
|
|
    # Clip to the length of the signal
    edges = np.minimum(edges, y.shape[-1])
|
|
|
|
|
|
|
|
    # Pair up the boundaries as (start, end) intervals
    return edges.reshape((-1, 2))
|
|
|
|
|
|
|
|
@deprecate_positional_args |
|
|
def preemphasis(y, *, coef=0.97, zi=None, return_zf=False): |
|
|
"""Pre-emphasize an audio signal with a first-order differencing filter: |
|
|
|
|
|
y[n] -> y[n] - coef * y[n-1] |
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., n)] |
|
|
Audio signal. Multi-channel is supported. |
|
|
|
|
|
coef : positive number |
|
|
Pre-emphasis coefficient. Typical values of ``coef`` are between 0 and 1. |
|
|
|
|
|
At the limit ``coef=0``, the signal is unchanged. |
|
|
|
|
|
At ``coef=1``, the result is the first-order difference of the signal. |
|
|
|
|
|
The default (0.97) matches the pre-emphasis filter used in the HTK |
|
|
implementation of MFCCs [#]_. |
|
|
|
|
|
.. [#] http://htk.eng.cam.ac.uk/ |
|
|
|
|
|
zi : number |
|
|
Initial filter state. When making successive calls to non-overlapping |
|
|
frames, this can be set to the ``zf`` returned from the previous call. |
|
|
(See example below.) |
|
|
|
|
|
By default ``zi`` is initialized as ``2*y[0] - y[1]``. |
|
|
|
|
|
return_zf : boolean |
|
|
If ``True``, return the final filter state. |
|
|
If ``False``, only return the pre-emphasized signal. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_out : np.ndarray |
|
|
pre-emphasized signal |
|
|
zf : number |
|
|
if ``return_zf=True``, the final filter state is also returned |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
Apply a standard pre-emphasis filter |
|
|
|
|
|
>>> import matplotlib.pyplot as plt |
|
|
>>> y, sr = librosa.load(librosa.ex('trumpet')) |
|
|
>>> y_filt = librosa.effects.preemphasis(y) |
|
|
>>> # and plot the results for comparison |
|
|
>>> S_orig = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max, top_db=None) |
|
|
>>> S_preemph = librosa.amplitude_to_db(np.abs(librosa.stft(y_filt)), ref=np.max, top_db=None) |
|
|
>>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True) |
|
|
>>> librosa.display.specshow(S_orig, y_axis='log', x_axis='time', ax=ax[0]) |
|
|
>>> ax[0].set(title='Original signal') |
|
|
>>> ax[0].label_outer() |
|
|
>>> img = librosa.display.specshow(S_preemph, y_axis='log', x_axis='time', ax=ax[1]) |
|
|
>>> ax[1].set(title='Pre-emphasized signal') |
|
|
>>> fig.colorbar(img, ax=ax, format="%+2.f dB") |
|
|
|
|
|
Apply pre-emphasis in pieces for block streaming. Note that the second block |
|
|
initializes ``zi`` with the final state ``zf`` returned by the first call. |
|
|
|
|
|
>>> y_filt_1, zf = librosa.effects.preemphasis(y[:1000], return_zf=True) |
|
|
>>> y_filt_2, zf = librosa.effects.preemphasis(y[1000:], zi=zf, return_zf=True) |
|
|
>>> np.allclose(y_filt, np.concatenate([y_filt_1, y_filt_2])) |
|
|
True |
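
    A coefficient closer to 1 emphasizes high frequencies more strongly;
    a quick sketch:

    >>> y_filt_strong = librosa.effects.preemphasis(y, coef=0.99)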
|
|
|
|
|
See Also |
|
|
-------- |
|
|
deemphasis |
|
|
""" |
|
|
b = np.asarray([1.0, -coef], dtype=y.dtype) |
|
|
a = np.asarray([1.0], dtype=y.dtype) |
|
|
|
|
|
if zi is None: |
|
|
|
|
|
        # By default, initialize the state by linear extrapolation of the signal
        zi = 2 * y[..., 0:1] - y[..., 1:2]
|
|
|
|
|
zi = np.atleast_1d(zi) |
|
|
|
|
|
y_out, z_f = scipy.signal.lfilter(b, a, y, zi=np.asarray(zi, dtype=y.dtype)) |
|
|
|
|
|
if return_zf: |
|
|
return y_out, z_f |
|
|
|
|
|
return y_out |
|
|
|
|
|
|
|
|
@deprecate_positional_args |
|
|
def deemphasis(y, *, coef=0.97, zi=None, return_zf=False): |
|
|
"""De-emphasize an audio signal with the inverse operation of preemphasis(): |
|
|
|
|
|
    If ``y = preemphasis(x, coef=coef, zi=zi)``, then
    ``x = deemphasis(y, coef=coef, zi=zi)`` recovers ``x`` via the
    recurrence::

        x[i] = y[i] + coef * x[i-1]
|
|
|
|
|
Parameters |
|
|
---------- |
|
|
y : np.ndarray [shape=(..., n)] |
|
|
Audio signal. Multi-channel is supported. |
|
|
|
|
|
coef : positive number |
|
|
        De-emphasis coefficient, matching the ``coef`` of the pre-emphasis
        filter being inverted.  Typical values of ``coef`` are between 0 and 1.
|
|
|
|
|
At the limit ``coef=0``, the signal is unchanged. |
|
|
|
|
|
        At ``coef=1``, the filter accumulates (integrates) the signal,
        inverting a first-order difference.
|
|
|
|
|
The default (0.97) matches the pre-emphasis filter used in the HTK |
|
|
implementation of MFCCs [#]_. |
|
|
|
|
|
.. [#] http://htk.eng.cam.ac.uk/ |
|
|
|
|
|
zi : number |
|
|
Initial filter state. If inverting a previous preemphasis(), the same value should be used. |
|
|
|
|
|
By default ``zi`` is initialized as |
|
|
``((2 - coef) * y[0] - y[1]) / (3 - coef)``. This |
|
|
value corresponds to the transformation of the default initialization of ``zi`` in ``preemphasis()``, |
|
|
``2*x[0] - x[1]``. |
|
|
|
|
|
return_zf : boolean |
|
|
If ``True``, return the final filter state. |
|
|
If ``False``, only return the pre-emphasized signal. |
|
|
|
|
|
Returns |
|
|
------- |
|
|
y_out : np.ndarray |
|
|
de-emphasized signal |
|
|
zf : number |
|
|
if ``return_zf=True``, the final filter state is also returned |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
Apply a standard pre-emphasis filter and invert it with de-emphasis |
|
|
|
|
|
>>> y, sr = librosa.load(librosa.ex('trumpet')) |
|
|
>>> y_filt = librosa.effects.preemphasis(y) |
|
|
>>> y_deemph = librosa.effects.deemphasis(y_filt) |
|
|
>>> np.allclose(y, y_deemph) |
|
|
True |
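
    A sketch of the same round trip with an explicit (zero) initial state,
    which must then match between the two calls:

    >>> y_filt0 = librosa.effects.preemphasis(y, zi=0.0)
    >>> y_deemph0 = librosa.effects.deemphasis(y_filt0, zi=0.0)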
|
|
|
|
|
See Also |
|
|
-------- |
|
|
preemphasis |
|
|
""" |
|
|
|
|
|
b = np.array([1.0, -coef], dtype=y.dtype) |
|
|
a = np.array([1.0], dtype=y.dtype) |
|
|
|
|
|
if zi is None: |
|
|
|
|
|
        # Start from zero state, then correct for the default
        # initialization used by preemphasis()
        zi = np.zeros(list(y.shape[:-1]) + [1], dtype=y.dtype)
|
|
y_out, zf = scipy.signal.lfilter(a, b, y, zi=zi) |
|
|
|
|
|
|
|
|
        # Subtract the response to preemphasis()'s default initial condition
        y_out -= (
|
|
((2 - coef) * y[..., 0:1] - y[..., 1:2]) |
|
|
/ (3 - coef) |
|
|
* (coef ** np.arange(y.shape[-1])) |
|
|
) |
|
|
|
|
|
else: |
|
|
zi = np.atleast_1d(zi) |
|
|
y_out, zf = scipy.signal.lfilter(a, b, y, zi=zi.astype(y.dtype)) |
|
|
|
|
|
if return_zf: |
|
|
return y_out, zf |
|
|
else: |
|
|
return y_out |
|
|
|