|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks |
|
|
and remove unnecessary dependencies. |
|
|
""" |
|
|
|
|
|
import base64 |
|
|
import importlib |
|
|
import io |
|
|
import os |
|
|
import warnings |
|
|
from collections.abc import Sequence |
|
|
from io import BytesIO |
|
|
from typing import TYPE_CHECKING, Any, Optional, Union |
|
|
|
|
|
|
|
|
if TYPE_CHECKING: |
|
|
import torch |
|
|
import numpy as np |
|
|
import requests |
|
|
from packaging import version |
|
|
|
|
|
from .utils import ( |
|
|
is_librosa_available, |
|
|
is_numpy_array, |
|
|
is_soundfile_available, |
|
|
is_torch_tensor, |
|
|
is_torchcodec_available, |
|
|
requires_backends, |
|
|
) |
|
|
|
|
|
|
|
|
if is_soundfile_available(): |
|
|
import soundfile as sf |
|
|
|
|
|
if is_librosa_available(): |
|
|
import librosa |
|
|
|
|
|
|
|
|
import soxr |
|
|
|
|
|
if is_torchcodec_available(): |
|
|
TORCHCODEC_VERSION = version.parse(importlib.metadata.version("torchcodec")) |
|
|
|
|
|
AudioInput = Union[np.ndarray, "torch.Tensor", Sequence[np.ndarray], Sequence["torch.Tensor"]] |
|
|
|
|
|
|
|
|
def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray: |
|
|
""" |
|
|
Loads `audio` to an np.ndarray object. |
|
|
|
|
|
Args: |
|
|
audio (`str` or `np.ndarray`): |
|
|
The audio to be loaded to the numpy array format. |
|
|
sampling_rate (`int`, *optional*, defaults to 16000): |
|
|
The sampling rate to be used when loading the audio. It should be same as the |
|
|
sampling rate the model you will be using further was trained with. |
|
|
timeout (`float`, *optional*): |
|
|
The timeout value in seconds for the URL request. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray`: A numpy array representing the audio. |
|
|
""" |
|
|
if isinstance(audio, str): |
|
|
|
|
|
|
|
|
|
|
|
if is_torchcodec_available() and TORCHCODEC_VERSION >= version.parse("0.3.0"): |
|
|
audio = load_audio_torchcodec(audio, sampling_rate=sampling_rate) |
|
|
else: |
|
|
audio = load_audio_librosa(audio, sampling_rate=sampling_rate, timeout=timeout) |
|
|
elif not isinstance(audio, np.ndarray): |
|
|
raise TypeError( |
|
|
"Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array." |
|
|
) |
|
|
return audio |
|
|
|
|
|
|
|
|
def load_audio_torchcodec(audio: Union[str, np.ndarray], sampling_rate=16000) -> np.ndarray: |
|
|
""" |
|
|
Loads `audio` to an np.ndarray object using `torchcodec`. |
|
|
|
|
|
Args: |
|
|
audio (`str` or `np.ndarray`): |
|
|
The audio to be loaded to the numpy array format. |
|
|
sampling_rate (`int`, *optional*, defaults to 16000): |
|
|
The sampling rate to be used when loading the audio. It should be same as the |
|
|
sampling rate the model you will be using further was trained with. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray`: A numpy array representing the audio. |
|
|
""" |
|
|
|
|
|
requires_backends(load_audio_torchcodec, ["torchcodec"]) |
|
|
from torchcodec.decoders import AudioDecoder |
|
|
|
|
|
|
|
|
decoder = AudioDecoder(audio, sample_rate=sampling_rate, num_channels=1) |
|
|
audio = decoder.get_all_samples().data[0].numpy() |
|
|
return audio |
|
|
|
|
|
|
|
|
def load_audio_librosa(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray: |
|
|
""" |
|
|
Loads `audio` to an np.ndarray object using `librosa`. |
|
|
|
|
|
Args: |
|
|
audio (`str` or `np.ndarray`): |
|
|
The audio to be loaded to the numpy array format. |
|
|
sampling_rate (`int`, *optional*, defaults to 16000): |
|
|
The sampling rate to be used when loading the audio. It should be same as the |
|
|
sampling rate the model you will be using further was trained with. |
|
|
timeout (`float`, *optional*): |
|
|
The timeout value in seconds for the URL request. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray`: A numpy array representing the audio. |
|
|
""" |
|
|
requires_backends(load_audio_librosa, ["librosa"]) |
|
|
|
|
|
|
|
|
if audio.startswith("http://") or audio.startswith("https://"): |
|
|
audio = librosa.load(BytesIO(requests.get(audio, timeout=timeout).content), sr=sampling_rate)[0] |
|
|
elif os.path.isfile(audio): |
|
|
audio = librosa.load(audio, sr=sampling_rate)[0] |
|
|
return audio |
|
|
|
|
|
|
|
|
def load_audio_as( |
|
|
audio: str, |
|
|
return_format: str, |
|
|
timeout: Optional[int] = None, |
|
|
force_mono: bool = False, |
|
|
sampling_rate: Optional[int] = None, |
|
|
) -> Union[str, dict[str, Any], io.BytesIO, None]: |
|
|
""" |
|
|
Load audio from either a local file path or URL and return in specified format. |
|
|
|
|
|
Args: |
|
|
audio (`str`): Either a local file path or a URL to an audio file |
|
|
return_format (`str`): Format to return the audio in: |
|
|
- "base64": Base64 encoded string |
|
|
- "dict": Dictionary with data and format |
|
|
- "buffer": BytesIO object |
|
|
timeout (`int`, *optional*): Timeout for URL requests in seconds |
|
|
force_mono (`bool`): Whether to convert stereo audio to mono |
|
|
sampling_rate (`int`, *optional*): If provided, the audio will be resampled to the specified sampling rate. |
|
|
|
|
|
Returns: |
|
|
`Union[str, Dict[str, Any], io.BytesIO, None]`: |
|
|
- `str`: Base64 encoded audio data (if return_format="base64") |
|
|
- `dict`: Dictionary with 'data' (base64 encoded audio data) and 'format' keys (if return_format="dict") |
|
|
- `io.BytesIO`: BytesIO object containing audio data (if return_format="buffer") |
|
|
""" |
|
|
|
|
|
requires_backends(load_audio_as, ["librosa"]) |
|
|
|
|
|
if return_format not in ["base64", "dict", "buffer"]: |
|
|
raise ValueError(f"Invalid return_format: {return_format}. Must be 'base64', 'dict', or 'buffer'") |
|
|
|
|
|
try: |
|
|
|
|
|
audio_bytes = None |
|
|
if audio.startswith(("http://", "https://")): |
|
|
response = requests.get(audio, timeout=timeout) |
|
|
response.raise_for_status() |
|
|
audio_bytes = response.content |
|
|
elif os.path.isfile(audio): |
|
|
with open(audio, "rb") as audio_file: |
|
|
audio_bytes = audio_file.read() |
|
|
else: |
|
|
raise ValueError(f"File not found: {audio}") |
|
|
|
|
|
|
|
|
with io.BytesIO(audio_bytes) as audio_file: |
|
|
with sf.SoundFile(audio_file) as f: |
|
|
audio_array = f.read(dtype="float32") |
|
|
original_sr = f.samplerate |
|
|
audio_format = f.format |
|
|
if sampling_rate is not None and sampling_rate != original_sr: |
|
|
|
|
|
audio_array = soxr.resample(audio_array, original_sr, sampling_rate, quality="HQ") |
|
|
else: |
|
|
sampling_rate = original_sr |
|
|
|
|
|
|
|
|
if force_mono and audio_array.ndim != 1: |
|
|
audio_array = audio_array.mean(axis=1) |
|
|
|
|
|
buffer = io.BytesIO() |
|
|
sf.write(buffer, audio_array, sampling_rate, format=audio_format.upper()) |
|
|
buffer.seek(0) |
|
|
|
|
|
if return_format == "buffer": |
|
|
return buffer |
|
|
elif return_format == "base64": |
|
|
return base64.b64encode(buffer.read()).decode("utf-8") |
|
|
elif return_format == "dict": |
|
|
return { |
|
|
"data": base64.b64encode(buffer.read()).decode("utf-8"), |
|
|
"format": audio_format.lower(), |
|
|
} |
|
|
|
|
|
except Exception as e: |
|
|
raise ValueError(f"Error loading audio: {e}") |
|
|
|
|
|
|
|
|
def is_valid_audio(audio): |
|
|
return is_numpy_array(audio) or is_torch_tensor(audio) |
|
|
|
|
|
|
|
|
def is_valid_list_of_audio(audio): |
|
|
return audio and all(is_valid_audio(audio_i) for audio_i in audio) |
|
|
|
|
|
|
|
|
def make_list_of_audio( |
|
|
audio: Union[list[AudioInput], AudioInput], |
|
|
) -> AudioInput: |
|
|
""" |
|
|
Ensure that the output is a list of audio. |
|
|
Args: |
|
|
audio (`Union[list[AudioInput], AudioInput]`): |
|
|
The input audio. |
|
|
Returns: |
|
|
list: A list of audio. |
|
|
""" |
|
|
|
|
|
if isinstance(audio, (list, tuple)) and is_valid_list_of_audio(audio): |
|
|
return audio |
|
|
|
|
|
|
|
|
if is_valid_audio(audio): |
|
|
return [audio] |
|
|
|
|
|
raise ValueError("Invalid input type. Must be a single audio or a list of audio") |
|
|
|
|
|
|
|
|
def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: |
|
|
""" |
|
|
Convert frequency from hertz to mels. |
|
|
|
|
|
Args: |
|
|
freq (`float` or `np.ndarray`): |
|
|
The frequency, or multiple frequencies, in hertz (Hz). |
|
|
mel_scale (`str`, *optional*, defaults to `"htk"`): |
|
|
The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. |
|
|
|
|
|
Returns: |
|
|
`float` or `np.ndarray`: The frequencies on the mel scale. |
|
|
""" |
|
|
|
|
|
if mel_scale not in ["slaney", "htk", "kaldi"]: |
|
|
raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".') |
|
|
|
|
|
if mel_scale == "htk": |
|
|
return 2595.0 * np.log10(1.0 + (freq / 700.0)) |
|
|
elif mel_scale == "kaldi": |
|
|
return 1127.0 * np.log(1.0 + (freq / 700.0)) |
|
|
|
|
|
min_log_hertz = 1000.0 |
|
|
min_log_mel = 15.0 |
|
|
logstep = 27.0 / np.log(6.4) |
|
|
mels = 3.0 * freq / 200.0 |
|
|
|
|
|
if isinstance(freq, np.ndarray): |
|
|
log_region = freq >= min_log_hertz |
|
|
mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep |
|
|
elif freq >= min_log_hertz: |
|
|
mels = min_log_mel + np.log(freq / min_log_hertz) * logstep |
|
|
|
|
|
return mels |
|
|
|
|
|
|
|
|
def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: |
|
|
""" |
|
|
Convert frequency from mels to hertz. |
|
|
|
|
|
Args: |
|
|
mels (`float` or `np.ndarray`): |
|
|
The frequency, or multiple frequencies, in mels. |
|
|
mel_scale (`str`, *optional*, `"htk"`): |
|
|
The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. |
|
|
|
|
|
Returns: |
|
|
`float` or `np.ndarray`: The frequencies in hertz. |
|
|
""" |
|
|
|
|
|
if mel_scale not in ["slaney", "htk", "kaldi"]: |
|
|
raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".') |
|
|
|
|
|
if mel_scale == "htk": |
|
|
return 700.0 * (np.power(10, mels / 2595.0) - 1.0) |
|
|
elif mel_scale == "kaldi": |
|
|
return 700.0 * (np.exp(mels / 1127.0) - 1.0) |
|
|
|
|
|
min_log_hertz = 1000.0 |
|
|
min_log_mel = 15.0 |
|
|
logstep = np.log(6.4) / 27.0 |
|
|
freq = 200.0 * mels / 3.0 |
|
|
|
|
|
if isinstance(mels, np.ndarray): |
|
|
log_region = mels >= min_log_mel |
|
|
freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel)) |
|
|
elif mels >= min_log_mel: |
|
|
freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel)) |
|
|
|
|
|
return freq |
|
|
|
|
|
|
|
|
def hertz_to_octave(freq: Union[float, np.ndarray], tuning: float = 0.0, bins_per_octave: int = 12): |
|
|
""" |
|
|
Convert frequency from hertz to fractional octave numbers. |
|
|
Adapted from *librosa*. |
|
|
|
|
|
Args: |
|
|
freq (`float` or `np.ndarray`): |
|
|
The frequency, or multiple frequencies, in hertz (Hz). |
|
|
tuning (`float`, defaults to `0.`): |
|
|
Tuning deviation from the Stuttgart pitch (A440) in (fractional) bins per octave. |
|
|
bins_per_octave (`int`, defaults to `12`): |
|
|
Number of bins per octave. |
|
|
|
|
|
Returns: |
|
|
`float` or `np.ndarray`: The frequencies on the octave scale. |
|
|
""" |
|
|
stuttgart_pitch = 440.0 * 2.0 ** (tuning / bins_per_octave) |
|
|
octave = np.log2(freq / (float(stuttgart_pitch) / 16)) |
|
|
return octave |
|
|
|
|
|
|
|
|
def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray: |
|
|
""" |
|
|
Creates a triangular filter bank. |
|
|
|
|
|
Adapted from *torchaudio* and *librosa*. |
|
|
|
|
|
Args: |
|
|
fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`): |
|
|
Discrete frequencies of the FFT bins in Hz. |
|
|
filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`): |
|
|
Center frequencies of the triangular filters to create, in Hz. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray` of shape `(num_frequency_bins, num_mel_filters)` |
|
|
""" |
|
|
filter_diff = np.diff(filter_freqs) |
|
|
slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) |
|
|
down_slopes = -slopes[:, :-2] / filter_diff[:-1] |
|
|
up_slopes = slopes[:, 2:] / filter_diff[1:] |
|
|
return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes)) |
|
|
|
|
|
|
|
|
def chroma_filter_bank( |
|
|
num_frequency_bins: int, |
|
|
num_chroma: int, |
|
|
sampling_rate: int, |
|
|
tuning: float = 0.0, |
|
|
power: Optional[float] = 2.0, |
|
|
weighting_parameters: Optional[tuple[float, float]] = (5.0, 2.0), |
|
|
start_at_c_chroma: bool = True, |
|
|
): |
|
|
""" |
|
|
Creates a chroma filter bank, i.e a linear transformation to project spectrogram bins onto chroma bins. |
|
|
|
|
|
Adapted from *librosa*. |
|
|
|
|
|
Args: |
|
|
num_frequency_bins (`int`): |
|
|
Number of frequencies used to compute the spectrogram (should be the same as in `stft`). |
|
|
num_chroma (`int`): |
|
|
Number of chroma bins (i.e pitch classes). |
|
|
sampling_rate (`float`): |
|
|
Sample rate of the audio waveform. |
|
|
tuning (`float`): |
|
|
Tuning deviation from A440 in fractions of a chroma bin. |
|
|
power (`float`, *optional*, defaults to 2.0): |
|
|
If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm. |
|
|
weighting_parameters (`tuple[float, float]`, *optional*, defaults to `(5., 2.)`): |
|
|
If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and |
|
|
the second element being the Gaussian half-width. |
|
|
start_at_c_chroma (`bool`, *optional*, defaults to `True`): |
|
|
If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'. |
|
|
Returns: |
|
|
`np.ndarray` of shape `(num_frequency_bins, num_chroma)` |
|
|
""" |
|
|
|
|
|
frequencies = np.linspace(0, sampling_rate, num_frequency_bins, endpoint=False)[1:] |
|
|
|
|
|
freq_bins = num_chroma * hertz_to_octave(frequencies, tuning=tuning, bins_per_octave=num_chroma) |
|
|
|
|
|
|
|
|
|
|
|
freq_bins = np.concatenate(([freq_bins[0] - 1.5 * num_chroma], freq_bins)) |
|
|
|
|
|
bins_width = np.concatenate((np.maximum(freq_bins[1:] - freq_bins[:-1], 1.0), [1])) |
|
|
|
|
|
chroma_filters = np.subtract.outer(freq_bins, np.arange(0, num_chroma, dtype="d")).T |
|
|
|
|
|
num_chroma2 = np.round(float(num_chroma) / 2) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
chroma_filters = np.remainder(chroma_filters + num_chroma2 + 10 * num_chroma, num_chroma) - num_chroma2 |
|
|
|
|
|
|
|
|
chroma_filters = np.exp(-0.5 * (2 * chroma_filters / np.tile(bins_width, (num_chroma, 1))) ** 2) |
|
|
|
|
|
|
|
|
if power is not None: |
|
|
chroma_filters = chroma_filters / np.sum(chroma_filters**power, axis=0, keepdims=True) ** (1.0 / power) |
|
|
|
|
|
|
|
|
if weighting_parameters is not None: |
|
|
center, half_width = weighting_parameters |
|
|
chroma_filters *= np.tile( |
|
|
np.exp(-0.5 * (((freq_bins / num_chroma - center) / half_width) ** 2)), |
|
|
(num_chroma, 1), |
|
|
) |
|
|
|
|
|
if start_at_c_chroma: |
|
|
chroma_filters = np.roll(chroma_filters, -3 * (num_chroma // 12), axis=0) |
|
|
|
|
|
|
|
|
return np.ascontiguousarray(chroma_filters[:, : int(1 + num_frequency_bins / 2)]) |
|
|
|
|
|
|
|
|
def mel_filter_bank( |
|
|
num_frequency_bins: int, |
|
|
num_mel_filters: int, |
|
|
min_frequency: float, |
|
|
max_frequency: float, |
|
|
sampling_rate: int, |
|
|
norm: Optional[str] = None, |
|
|
mel_scale: str = "htk", |
|
|
triangularize_in_mel_space: bool = False, |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and |
|
|
various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters |
|
|
are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these |
|
|
features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. |
|
|
|
|
|
Different banks of mel filters were introduced in the literature. The following variations are supported: |
|
|
|
|
|
- MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech |
|
|
bandwidth of `[0, 4600]` Hz. |
|
|
- MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech |
|
|
bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz. |
|
|
- MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and |
|
|
speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization. |
|
|
- HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of |
|
|
12.5 kHz and speech bandwidth of `[0, 6250]` Hz. |
|
|
|
|
|
This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's |
|
|
`melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation. |
|
|
|
|
|
Args: |
|
|
num_frequency_bins (`int`): |
|
|
Number of frequency bins (should be the same as `n_fft // 2 + 1` where `n_fft` is the size of the Fourier Transform used to compute the spectrogram). |
|
|
num_mel_filters (`int`): |
|
|
Number of mel filters to generate. |
|
|
min_frequency (`float`): |
|
|
Lowest frequency of interest in Hz. |
|
|
max_frequency (`float`): |
|
|
Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`. |
|
|
sampling_rate (`int`): |
|
|
Sample rate of the audio waveform. |
|
|
norm (`str`, *optional*): |
|
|
If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization). |
|
|
mel_scale (`str`, *optional*, defaults to `"htk"`): |
|
|
The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. |
|
|
triangularize_in_mel_space (`bool`, *optional*, defaults to `False`): |
|
|
If this option is enabled, the triangular filter is applied in mel space rather than frequency space. This |
|
|
should be set to `true` in order to get the same results as `torchaudio` when computing mel filters. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a |
|
|
projection matrix to go from a spectrogram to a mel spectrogram. |
|
|
""" |
|
|
if norm is not None and norm != "slaney": |
|
|
raise ValueError('norm must be one of None or "slaney"') |
|
|
|
|
|
if num_frequency_bins < 2: |
|
|
raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2") |
|
|
|
|
|
if min_frequency > max_frequency: |
|
|
raise ValueError(f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}") |
|
|
|
|
|
|
|
|
mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale) |
|
|
mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale) |
|
|
mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2) |
|
|
filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale) |
|
|
|
|
|
if triangularize_in_mel_space: |
|
|
|
|
|
fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2) |
|
|
fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale) |
|
|
filter_freqs = mel_freqs |
|
|
else: |
|
|
|
|
|
fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins) |
|
|
|
|
|
mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs) |
|
|
|
|
|
if norm is not None and norm == "slaney": |
|
|
|
|
|
enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]) |
|
|
mel_filters *= np.expand_dims(enorm, 0) |
|
|
|
|
|
if (mel_filters.max(axis=0) == 0.0).any(): |
|
|
warnings.warn( |
|
|
"At least one mel filter has all zero values. " |
|
|
f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. " |
|
|
f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low." |
|
|
) |
|
|
|
|
|
return mel_filters |
|
|
|
|
|
|
|
|
def optimal_fft_length(window_length: int) -> int: |
|
|
""" |
|
|
Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not |
|
|
already a power of two, rounds it up to the next power or two. |
|
|
|
|
|
The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size |
|
|
of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples |
|
|
is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies, |
|
|
it simply gives a higher frequency resolution (i.e. the frequency bins are smaller). |
|
|
""" |
|
|
return 2 ** int(np.ceil(np.log2(window_length))) |
|
|
|
|
|
|
|
|
def window_function( |
|
|
window_length: int, |
|
|
name: str = "hann", |
|
|
periodic: bool = True, |
|
|
frame_length: Optional[int] = None, |
|
|
center: bool = True, |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Returns an array containing the specified window. This window is intended to be used with `stft`. |
|
|
|
|
|
The following window types are supported: |
|
|
|
|
|
- `"boxcar"`: a rectangular window |
|
|
- `"hamming"`: the Hamming window |
|
|
- `"hann"`: the Hann window |
|
|
- `"povey"`: the Povey window |
|
|
|
|
|
Args: |
|
|
window_length (`int`): |
|
|
The length of the window in samples. |
|
|
name (`str`, *optional*, defaults to `"hann"`): |
|
|
The name of the window function. |
|
|
periodic (`bool`, *optional*, defaults to `True`): |
|
|
Whether the window is periodic or symmetric. |
|
|
frame_length (`int`, *optional*): |
|
|
The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller |
|
|
than the frame length, so that it will be zero-padded. |
|
|
center (`bool`, *optional*, defaults to `True`): |
|
|
Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window. |
|
|
""" |
|
|
length = window_length + 1 if periodic else window_length |
|
|
|
|
|
if name == "boxcar": |
|
|
window = np.ones(length) |
|
|
elif name in ["hamming", "hamming_window"]: |
|
|
window = np.hamming(length) |
|
|
elif name in ["hann", "hann_window"]: |
|
|
window = np.hanning(length) |
|
|
elif name == "povey": |
|
|
window = np.power(np.hanning(length), 0.85) |
|
|
else: |
|
|
raise ValueError(f"Unknown window function '{name}'") |
|
|
|
|
|
if periodic: |
|
|
window = window[:-1] |
|
|
|
|
|
if frame_length is None: |
|
|
return window |
|
|
|
|
|
if window_length > frame_length: |
|
|
raise ValueError( |
|
|
f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})" |
|
|
) |
|
|
|
|
|
padded_window = np.zeros(frame_length) |
|
|
offset = (frame_length - window_length) // 2 if center else 0 |
|
|
padded_window[offset : offset + window_length] = window |
|
|
return padded_window |
|
|
|
|
|
|
|
|
|
|
|
def spectrogram( |
|
|
waveform: np.ndarray, |
|
|
window: np.ndarray, |
|
|
frame_length: int, |
|
|
hop_length: int, |
|
|
fft_length: Optional[int] = None, |
|
|
power: Optional[float] = 1.0, |
|
|
center: bool = True, |
|
|
pad_mode: str = "reflect", |
|
|
onesided: bool = True, |
|
|
dither: float = 0.0, |
|
|
preemphasis: Optional[float] = None, |
|
|
mel_filters: Optional[np.ndarray] = None, |
|
|
mel_floor: float = 1e-10, |
|
|
log_mel: Optional[str] = None, |
|
|
reference: float = 1.0, |
|
|
min_value: float = 1e-10, |
|
|
db_range: Optional[float] = None, |
|
|
remove_dc_offset: bool = False, |
|
|
dtype: np.dtype = np.float32, |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. |
|
|
|
|
|
This function can create the following kinds of spectrograms: |
|
|
|
|
|
- amplitude spectrogram (`power = 1.0`) |
|
|
- power spectrogram (`power = 2.0`) |
|
|
- complex-valued spectrogram (`power = None`) |
|
|
- log spectrogram (use `log_mel` argument) |
|
|
- mel spectrogram (provide `mel_filters`) |
|
|
- log-mel spectrogram (provide `mel_filters` and `log_mel`) |
|
|
|
|
|
How this works: |
|
|
|
|
|
1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length |
|
|
- hop_length` samples. |
|
|
2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. |
|
|
3. The DFT is taken of each windowed frame. |
|
|
4. The results are stacked into a spectrogram. |
|
|
|
|
|
We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: |
|
|
|
|
|
- The analysis frame. This is the size of the time slices that the input waveform is split into. |
|
|
- The window. Each analysis frame is multiplied by the window to avoid spectral leakage. |
|
|
- The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. |
|
|
|
|
|
In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A |
|
|
padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, |
|
|
typically the next power of two. |
|
|
|
|
|
Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and |
|
|
`torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms |
|
|
can be constructed. |
|
|
|
|
|
Args: |
|
|
waveform (`np.ndarray` of shape `(length,)`): |
|
|
The input waveform. This must be a single real-valued, mono waveform. |
|
|
window (`np.ndarray` of shape `(frame_length,)`): |
|
|
The windowing function to apply, including zero-padding if necessary. The actual window length may be |
|
|
shorter than `frame_length`, but we're assuming the array has already been zero-padded. |
|
|
frame_length (`int`): |
|
|
The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also |
|
|
allow smaller sizes. |
|
|
hop_length (`int`): |
|
|
The stride between successive analysis frames in samples. |
|
|
fft_length (`int`, *optional*): |
|
|
The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have. |
|
|
For optimal speed, this should be a power of two. If `None`, uses `frame_length`. |
|
|
power (`float`, *optional*, defaults to 1.0): |
|
|
If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns |
|
|
complex numbers. |
|
|
center (`bool`, *optional*, defaults to `True`): |
|
|
Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame |
|
|
`t` will start at time `t * hop_length`. |
|
|
pad_mode (`str`, *optional*, defaults to `"reflect"`): |
|
|
Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"` |
|
|
(pad with edge values), `"reflect"` (pads with mirrored values). |
|
|
onesided (`bool`, *optional*, defaults to `True`): |
|
|
If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1` |
|
|
frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins. |
|
|
dither (`float`, *optional*, defaults to 0.0): |
|
|
Adds dithering. In other words, adds a small Gaussian noise to each frame. |
|
|
E.g. use 4.0 to add dithering with a normal distribution centered |
|
|
around 0.0 with standard deviation 4.0, 0.0 means no dithering. |
|
|
Dithering has similar effect as `mel_floor`. It reduces the high log_mel_fbank |
|
|
values for signals with hard-zero sections, when VAD cutoff is present in the signal. |
|
|
preemphasis (`float`, *optional*) |
|
|
Coefficient for a low-pass filter that applies pre-emphasis before the DFT. |
|
|
mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*): |
|
|
The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram. |
|
|
mel_floor (`float`, *optional*, defaults to 1e-10): |
|
|
Minimum value of mel frequency banks. |
|
|
log_mel (`str`, *optional*): |
|
|
How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take |
|
|
the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be |
|
|
used when `power` is not `None`. |
|
|
reference (`float`, *optional*, defaults to 1.0): |
|
|
Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
|
|
the loudest part to 0 dB. Must be greater than zero. |
|
|
min_value (`float`, *optional*, defaults to `1e-10`): |
|
|
The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
|
|
`log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an |
|
|
amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero. |
|
|
db_range (`float`, *optional*): |
|
|
Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
|
|
peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
|
|
remove_dc_offset (`bool`, *optional*): |
|
|
Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in |
|
|
order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters. |
|
|
dtype (`np.dtype`, *optional*, defaults to `np.float32`): |
|
|
Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be |
|
|
`np.complex64`. |
|
|
|
|
|
Returns: |
|
|
`nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape |
|
|
`(num_mel_filters, length)` for a mel spectrogram. |
|
|
""" |
|
|
window_length = len(window) |
|
|
|
|
|
if fft_length is None: |
|
|
fft_length = frame_length |
|
|
|
|
|
if frame_length > fft_length: |
|
|
raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") |
|
|
|
|
|
if window_length != frame_length: |
|
|
raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") |
|
|
|
|
|
if hop_length <= 0: |
|
|
raise ValueError("hop_length must be greater than zero") |
|
|
|
|
|
if waveform.ndim != 1: |
|
|
raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") |
|
|
|
|
|
if np.iscomplexobj(waveform): |
|
|
raise ValueError("Complex-valued input waveforms are not currently supported") |
|
|
|
|
|
if power is None and mel_filters is not None: |
|
|
raise ValueError( |
|
|
"You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram." |
|
|
"Specify `power` to fix this issue." |
|
|
) |
|
|
|
|
|
|
|
|
if center: |
|
|
padding = [(int(frame_length // 2), int(frame_length // 2))] |
|
|
waveform = np.pad(waveform, padding, mode=pad_mode) |
|
|
|
|
|
|
|
|
waveform = waveform.astype(np.float64) |
|
|
window = window.astype(np.float64) |
|
|
|
|
|
|
|
|
num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length)) |
|
|
|
|
|
num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length |
|
|
spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64) |
|
|
|
|
|
|
|
|
fft_func = np.fft.rfft if onesided else np.fft.fft |
|
|
buffer = np.zeros(fft_length) |
|
|
|
|
|
timestep = 0 |
|
|
for frame_idx in range(num_frames): |
|
|
buffer[:frame_length] = waveform[timestep : timestep + frame_length] |
|
|
|
|
|
if dither != 0.0: |
|
|
buffer[:frame_length] += dither * np.random.randn(frame_length) |
|
|
|
|
|
if remove_dc_offset: |
|
|
buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean() |
|
|
|
|
|
if preemphasis is not None: |
|
|
buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1] |
|
|
buffer[0] *= 1 - preemphasis |
|
|
|
|
|
buffer[:frame_length] *= window |
|
|
|
|
|
spectrogram[frame_idx] = fft_func(buffer) |
|
|
timestep += hop_length |
|
|
|
|
|
|
|
|
if power is not None: |
|
|
spectrogram = np.abs(spectrogram, dtype=np.float64) ** power |
|
|
|
|
|
spectrogram = spectrogram.T |
|
|
|
|
|
if mel_filters is not None: |
|
|
spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram)) |
|
|
|
|
|
if power is not None and log_mel is not None: |
|
|
if log_mel == "log": |
|
|
spectrogram = np.log(spectrogram) |
|
|
elif log_mel == "log10": |
|
|
spectrogram = np.log10(spectrogram) |
|
|
elif log_mel == "dB": |
|
|
if power == 1.0: |
|
|
spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range) |
|
|
elif power == 2.0: |
|
|
spectrogram = power_to_db(spectrogram, reference, min_value, db_range) |
|
|
else: |
|
|
raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") |
|
|
else: |
|
|
raise ValueError(f"Unknown log_mel option: {log_mel}") |
|
|
|
|
|
spectrogram = np.asarray(spectrogram, dtype) |
|
|
|
|
|
return spectrogram |
|
|
|
|
|
|
|
|
def spectrogram_batch( |
|
|
waveform_list: list[np.ndarray], |
|
|
window: np.ndarray, |
|
|
frame_length: int, |
|
|
hop_length: int, |
|
|
fft_length: Optional[int] = None, |
|
|
power: Optional[float] = 1.0, |
|
|
center: bool = True, |
|
|
pad_mode: str = "reflect", |
|
|
onesided: bool = True, |
|
|
dither: float = 0.0, |
|
|
preemphasis: Optional[float] = None, |
|
|
mel_filters: Optional[np.ndarray] = None, |
|
|
mel_floor: float = 1e-10, |
|
|
log_mel: Optional[str] = None, |
|
|
reference: float = 1.0, |
|
|
min_value: float = 1e-10, |
|
|
db_range: Optional[float] = None, |
|
|
remove_dc_offset: bool = False, |
|
|
dtype: np.dtype = np.float32, |
|
|
) -> list[np.ndarray]: |
|
|
""" |
|
|
Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing. |
|
|
This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting. |
|
|
|
|
|
It supports generating various types of spectrograms: |
|
|
|
|
|
- amplitude spectrogram (`power = 1.0`) |
|
|
- power spectrogram (`power = 2.0`) |
|
|
- complex-valued spectrogram (`power = None`) |
|
|
- log spectrogram (use `log_mel` argument) |
|
|
- mel spectrogram (provide `mel_filters`) |
|
|
- log-mel spectrogram (provide `mel_filters` and `log_mel`) |
|
|
|
|
|
How this works: |
|
|
|
|
|
1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length |
|
|
- hop_length` samples. |
|
|
2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. |
|
|
3. The DFT is taken of each windowed frame. |
|
|
4. The results are stacked into a spectrogram. |
|
|
|
|
|
We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: |
|
|
|
|
|
- The analysis frame. This is the size of the time slices that the input waveform is split into. |
|
|
- The window. Each analysis frame is multiplied by the window to avoid spectral leakage. |
|
|
- The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. |
|
|
|
|
|
In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A |
|
|
padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, |
|
|
typically the next power of two. |
|
|
|
|
|
Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`. |
|
|
|
|
|
Args: |
|
|
waveform_list (`list[np.ndarray]` with arrays of shape `(length,)`): |
|
|
The list of input waveforms, each a single-channel (mono) signal. |
|
|
window (`np.ndarray` of shape `(frame_length,)`): |
|
|
The windowing function to apply, including zero-padding if necessary. |
|
|
frame_length (`int`): |
|
|
The length of each frame for analysis. |
|
|
hop_length (`int`): |
|
|
The step size between successive frames. |
|
|
fft_length (`int`, *optional*): |
|
|
The size of the FFT buffer, defining frequency bin resolution. |
|
|
power (`float`, *optional*, defaults to 1.0): |
|
|
Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex. |
|
|
center (`bool`, *optional*, defaults to `True`): |
|
|
Whether to center-pad the waveform frames. |
|
|
pad_mode (`str`, *optional*, defaults to `"reflect"`): |
|
|
The padding strategy when `center` is `True`. |
|
|
onesided (`bool`, *optional*, defaults to `True`): |
|
|
If True, returns a one-sided spectrogram for real input signals. |
|
|
dither (`float`, *optional*, defaults to 0.0): |
|
|
Adds dithering. In other words, adds a small Gaussian noise to each frame. |
|
|
E.g. use 4.0 to add dithering with a normal distribution centered |
|
|
around 0.0 with standard deviation 4.0, 0.0 means no dithering. |
|
|
preemphasis (`float`, *optional*): |
|
|
Applies a pre-emphasis filter to each frame. |
|
|
mel_filters (`np.ndarray`, *optional*): |
|
|
Mel filter bank for converting to mel spectrogram. |
|
|
mel_floor (`float`, *optional*, defaults to 1e-10): |
|
|
Floor value for mel spectrogram to avoid log(0). |
|
|
log_mel (`str`, *optional*): |
|
|
Specifies log scaling strategy; options are None, "log", "log10", "dB". |
|
|
reference (`float`, *optional*, defaults to 1.0): |
|
|
Reference value for dB conversion in log_mel. |
|
|
min_value (`float`, *optional*, defaults to 1e-10): |
|
|
Minimum floor value for log scale conversions. |
|
|
db_range (`float`, *optional*): |
|
|
Dynamic range for dB scale spectrograms. |
|
|
remove_dc_offset (`bool`, *optional*): |
|
|
Whether to remove the DC offset from each frame. |
|
|
dtype (`np.dtype`, *optional*, defaults to `np.float32`): |
|
|
Data type of the output spectrogram. |
|
|
|
|
|
Returns: |
|
|
list[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform. |
|
|
""" |
|
|
window_length = len(window) |
|
|
|
|
|
if fft_length is None: |
|
|
fft_length = frame_length |
|
|
|
|
|
if frame_length > fft_length: |
|
|
raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") |
|
|
|
|
|
if window_length != frame_length: |
|
|
raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") |
|
|
|
|
|
if hop_length <= 0: |
|
|
raise ValueError("hop_length must be greater than zero") |
|
|
|
|
|
|
|
|
for waveform in waveform_list: |
|
|
if waveform.ndim != 1: |
|
|
raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") |
|
|
if np.iscomplexobj(waveform): |
|
|
raise ValueError("Complex-valued input waveforms are not currently supported") |
|
|
|
|
|
if center: |
|
|
padding = [(int(frame_length // 2), int(frame_length // 2))] |
|
|
waveform_list = [ |
|
|
np.pad( |
|
|
waveform, |
|
|
padding, |
|
|
mode=pad_mode, |
|
|
) |
|
|
for waveform in waveform_list |
|
|
] |
|
|
original_waveform_lengths = [ |
|
|
len(waveform) for waveform in waveform_list |
|
|
] |
|
|
|
|
|
|
|
|
max_length = max(original_waveform_lengths) |
|
|
padded_waveform_batch = np.array( |
|
|
[ |
|
|
np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0) |
|
|
for waveform in waveform_list |
|
|
], |
|
|
dtype=dtype, |
|
|
) |
|
|
|
|
|
|
|
|
padded_waveform_batch = padded_waveform_batch.astype(np.float64) |
|
|
window = window.astype(np.float64) |
|
|
|
|
|
|
|
|
num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length)) |
|
|
|
|
|
true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths] |
|
|
num_batches = padded_waveform_batch.shape[0] |
|
|
|
|
|
num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length |
|
|
spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64) |
|
|
|
|
|
|
|
|
fft_func = np.fft.rfft if onesided else np.fft.fft |
|
|
buffer = np.zeros((num_batches, fft_length)) |
|
|
|
|
|
for frame_idx in range(num_frames): |
|
|
timestep = frame_idx * hop_length |
|
|
buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length] |
|
|
|
|
|
if dither != 0.0: |
|
|
buffer[:, :frame_length] += dither * np.random.randn(*buffer[:, :frame_length].shape) |
|
|
|
|
|
if remove_dc_offset: |
|
|
buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True) |
|
|
|
|
|
if preemphasis is not None: |
|
|
buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1] |
|
|
buffer[:, 0] *= 1 - preemphasis |
|
|
|
|
|
buffer[:, :frame_length] *= window |
|
|
|
|
|
spectrogram[:, frame_idx] = fft_func(buffer) |
|
|
|
|
|
|
|
|
if power is not None: |
|
|
spectrogram = np.abs(spectrogram, dtype=np.float64) ** power |
|
|
|
|
|
|
|
|
if mel_filters is not None: |
|
|
result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1])) |
|
|
spectrogram = np.maximum(mel_floor, result) |
|
|
|
|
|
|
|
|
if power is not None and log_mel is not None: |
|
|
if log_mel == "log": |
|
|
spectrogram = np.log(spectrogram) |
|
|
elif log_mel == "log10": |
|
|
spectrogram = np.log10(spectrogram) |
|
|
elif log_mel == "dB": |
|
|
if power == 1.0: |
|
|
spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range) |
|
|
elif power == 2.0: |
|
|
spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range) |
|
|
else: |
|
|
raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") |
|
|
else: |
|
|
raise ValueError(f"Unknown log_mel option: {log_mel}") |
|
|
|
|
|
spectrogram = np.asarray(spectrogram, dtype) |
|
|
|
|
|
spectrogram_list = [spectrogram[i, : true_num_frames[i], :].T for i in range(len(true_num_frames))] |
|
|
|
|
|
return spectrogram_list |
|
|
|
|
|
|
|
|
def power_to_db( |
|
|
spectrogram: np.ndarray, |
|
|
reference: float = 1.0, |
|
|
min_value: float = 1e-10, |
|
|
db_range: Optional[float] = None, |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic |
|
|
logarithm properties for numerical stability. |
|
|
|
|
|
The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a |
|
|
linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. |
|
|
This means that large variations in energy may not sound all that different if the sound is loud to begin with. |
|
|
This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. |
|
|
|
|
|
Based on the implementation of `librosa.power_to_db`. |
|
|
|
|
|
Args: |
|
|
spectrogram (`np.ndarray`): |
|
|
The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared! |
|
|
reference (`float`, *optional*, defaults to 1.0): |
|
|
Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
|
|
the loudest part to 0 dB. Must be greater than zero. |
|
|
min_value (`float`, *optional*, defaults to `1e-10`): |
|
|
The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
|
|
`log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. |
|
|
db_range (`float`, *optional*): |
|
|
Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
|
|
peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray`: the spectrogram in decibels |
|
|
""" |
|
|
if reference <= 0.0: |
|
|
raise ValueError("reference must be greater than zero") |
|
|
if min_value <= 0.0: |
|
|
raise ValueError("min_value must be greater than zero") |
|
|
|
|
|
reference = max(min_value, reference) |
|
|
|
|
|
spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
|
|
spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
|
|
|
if db_range is not None: |
|
|
if db_range <= 0.0: |
|
|
raise ValueError("db_range must be greater than zero") |
|
|
spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) |
|
|
|
|
|
return spectrogram |
|
|
|
|
|
|
|
|
def power_to_db_batch( |
|
|
spectrogram: np.ndarray, |
|
|
reference: float = 1.0, |
|
|
min_value: float = 1e-10, |
|
|
db_range: Optional[float] = None, |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`, |
|
|
using basic logarithm properties for numerical stability. |
|
|
|
|
|
This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram. |
|
|
|
|
|
Args: |
|
|
spectrogram (`np.ndarray`): |
|
|
The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). |
|
|
Note that a power spectrogram has the amplitudes squared! |
|
|
reference (`float`, *optional*, defaults to 1.0): |
|
|
Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
|
|
the loudest part to 0 dB. Must be greater than zero. |
|
|
min_value (`float`, *optional*, defaults to `1e-10`): |
|
|
The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
|
|
`log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. |
|
|
db_range (`float`, *optional*): |
|
|
Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
|
|
peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray`: the batch of spectrograms in decibels |
|
|
""" |
|
|
if reference <= 0.0: |
|
|
raise ValueError("reference must be greater than zero") |
|
|
if min_value <= 0.0: |
|
|
raise ValueError("min_value must be greater than zero") |
|
|
|
|
|
reference = max(min_value, reference) |
|
|
|
|
|
spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
|
|
spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
|
|
|
if db_range is not None: |
|
|
if db_range <= 0.0: |
|
|
raise ValueError("db_range must be greater than zero") |
|
|
|
|
|
max_values = spectrogram.max(axis=(1, 2), keepdims=True) |
|
|
spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) |
|
|
|
|
|
return spectrogram |
|
|
|
|
|
|
|
|
def amplitude_to_db( |
|
|
spectrogram: np.ndarray, |
|
|
reference: float = 1.0, |
|
|
min_value: float = 1e-5, |
|
|
db_range: Optional[float] = None, |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using |
|
|
basic logarithm properties for numerical stability. |
|
|
|
|
|
The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a |
|
|
linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. |
|
|
This means that large variations in energy may not sound all that different if the sound is loud to begin with. |
|
|
This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. |
|
|
|
|
|
Args: |
|
|
spectrogram (`np.ndarray`): |
|
|
The input amplitude (mel) spectrogram. |
|
|
reference (`float`, *optional*, defaults to 1.0): |
|
|
Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
|
|
the loudest part to 0 dB. Must be greater than zero. |
|
|
min_value (`float`, *optional*, defaults to `1e-5`): |
|
|
The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
|
|
`log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. |
|
|
db_range (`float`, *optional*): |
|
|
Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
|
|
peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray`: the spectrogram in decibels |
|
|
""" |
|
|
if reference <= 0.0: |
|
|
raise ValueError("reference must be greater than zero") |
|
|
if min_value <= 0.0: |
|
|
raise ValueError("min_value must be greater than zero") |
|
|
|
|
|
reference = max(min_value, reference) |
|
|
|
|
|
spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
|
|
spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
|
|
|
if db_range is not None: |
|
|
if db_range <= 0.0: |
|
|
raise ValueError("db_range must be greater than zero") |
|
|
spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) |
|
|
|
|
|
return spectrogram |
|
|
|
|
|
|
|
|
def amplitude_to_db_batch( |
|
|
spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: Optional[float] = None |
|
|
) -> np.ndarray: |
|
|
""" |
|
|
Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`, |
|
|
using basic logarithm properties for numerical stability. |
|
|
|
|
|
The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram. |
|
|
|
|
|
Args: |
|
|
spectrogram (`np.ndarray`): |
|
|
The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). |
|
|
reference (`float`, *optional*, defaults to 1.0): |
|
|
Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set |
|
|
the loudest part to 0 dB. Must be greater than zero. |
|
|
min_value (`float`, *optional*, defaults to `1e-5`): |
|
|
The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking |
|
|
`log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. |
|
|
db_range (`float`, *optional*): |
|
|
Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the |
|
|
peak value and the smallest value will never be more than 80 dB. Must be greater than zero. |
|
|
|
|
|
Returns: |
|
|
`np.ndarray`: the batch of spectrograms in decibels |
|
|
""" |
|
|
if reference <= 0.0: |
|
|
raise ValueError("reference must be greater than zero") |
|
|
if min_value <= 0.0: |
|
|
raise ValueError("min_value must be greater than zero") |
|
|
|
|
|
reference = max(min_value, reference) |
|
|
|
|
|
spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) |
|
|
spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) |
|
|
|
|
|
if db_range is not None: |
|
|
if db_range <= 0.0: |
|
|
raise ValueError("db_range must be greater than zero") |
|
|
|
|
|
max_values = spectrogram.max(axis=(1, 2), keepdims=True) |
|
|
spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) |
|
|
|
|
|
return spectrogram |
|
|
|