prasb committed on
Commit
610facb
·
verified ·
1 Parent(s): b5147e7

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc +0 -0
  2. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc +0 -0
  3. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc +0 -0
  4. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc +0 -0
  5. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc +0 -0
  6. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc +0 -0
  7. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc +0 -0
  8. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc +0 -0
  9. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc +0 -0
  10. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc +0 -0
  11. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc +0 -0
  12. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc +0 -0
  13. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc +0 -0
  14. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc +0 -0
  15. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc +0 -0
  16. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc +0 -0
  17. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc +0 -0
  18. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc +0 -0
  19. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc +0 -0
  20. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc +0 -0
  21. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc +0 -0
  22. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc +0 -0
  23. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc +0 -0
  24. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc +0 -0
  25. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc +0 -0
  26. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc +0 -0
  27. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc +0 -0
  28. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc +0 -0
  29. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc +0 -0
  30. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc +0 -0
  31. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc +0 -0
  32. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc +0 -0
  33. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc +0 -0
  34. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz +3 -0
  35. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz +3 -0
  36. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc +0 -0
  37. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc +0 -0
  38. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc +0 -0
  39. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc +0 -0
  40. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py +5 -0
  41. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py +815 -0
  42. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py +34 -0
  43. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py +148 -0
  44. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py +183 -0
  45. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py +71 -0
  46. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py +106 -0
  47. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py +1108 -0
  48. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py +91 -0
  49. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py +85 -0
  50. my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py +135 -0
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATN.cpython-38.pyc ADDED
Binary file (3.13 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfig.cpython-38.pyc ADDED
Binary file (4.14 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNConfigSet.cpython-38.pyc ADDED
Binary file (6.22 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializationOptions.cpython-38.pyc ADDED
Binary file (1 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNDeserializer.cpython-38.pyc ADDED
Binary file (15.8 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNSimulator.cpython-38.pyc ADDED
Binary file (1.15 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNState.cpython-38.pyc ADDED
Binary file (6.68 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ATNType.cpython-38.pyc ADDED
Binary file (575 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerATNSimulator.cpython-38.pyc ADDED
Binary file (11.7 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerAction.cpython-38.pyc ADDED
Binary file (8.59 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/LexerActionExecutor.cpython-38.pyc ADDED
Binary file (2.53 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/ParserATNSimulator.cpython-38.pyc ADDED
Binary file (24.5 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/PredictionMode.cpython-38.pyc ADDED
Binary file (5.07 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/SemanticContext.cpython-38.pyc ADDED
Binary file (7.36 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/Transition.cpython-38.pyc ADDED
Binary file (9.62 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/atn/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (189 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFA.cpython-38.pyc ADDED
Binary file (3.13 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFASerializer.cpython-38.pyc ADDED
Binary file (2.48 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/DFAState.cpython-38.pyc ADDED
Binary file (2.33 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/dfa/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (189 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/DiagnosticErrorListener.cpython-38.pyc ADDED
Binary file (2.9 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorListener.cpython-38.pyc ADDED
Binary file (2.78 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/ErrorStrategy.cpython-38.pyc ADDED
Binary file (9.89 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/Errors.cpython-38.pyc ADDED
Binary file (5.03 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/error/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (191 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreeMatch.cpython-38.pyc ADDED
Binary file (1.8 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePattern.cpython-38.pyc ADDED
Binary file (1.45 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/ParseTreePatternMatcher.cpython-38.pyc ADDED
Binary file (7.75 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/TokenTagToken.cpython-38.pyc ADDED
Binary file (1.05 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/Trees.cpython-38.pyc ADDED
Binary file (3.41 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/tree/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (163 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/XPath.cpython-38.pyc ADDED
Binary file (10.8 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/antlr4/xpath/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (191 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/goog.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:400917cf30e6b664f7b0da93d7c745860d3aa9008da8b7f160d2dd12e6a318b1
3
+ size 22845
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/matplotlib/mpl-data/sample_data/topobathy.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0244e03291702df45024dcb5cacbc4f3d4cb30d72dfa7fd371c4ac61c42b4fbf
3
+ size 45224
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (761 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/_extension.cpython-38.pyc ADDED
Binary file (3.43 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/kaldi_io.cpython-38.pyc ADDED
Binary file (4.47 kB). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/__pycache__/version.cpython-38.pyc ADDED
Binary file (250 Bytes). View file
 
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from . import kaldi
2
+
3
+ __all__ = [
4
+ "kaldi",
5
+ ]
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/compliance/kaldi.py ADDED
@@ -0,0 +1,815 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Tuple
3
+
4
+ import torch
5
+ import torchaudio
6
+ from torch import Tensor
7
+
8
+ __all__ = [
9
+ "get_mel_banks",
10
+ "inverse_mel_scale",
11
+ "inverse_mel_scale_scalar",
12
+ "mel_scale",
13
+ "mel_scale_scalar",
14
+ "spectrogram",
15
+ "fbank",
16
+ "mfcc",
17
+ "vtln_warp_freq",
18
+ "vtln_warp_mel_freq",
19
+ ]
20
+
21
+ # numeric_limits<float>::epsilon() 1.1920928955078125e-07
22
+ EPSILON = torch.tensor(torch.finfo(torch.float).eps)
23
+ # 1 milliseconds = 0.001 seconds
24
+ MILLISECONDS_TO_SECONDS = 0.001
25
+
26
+ # window types
27
+ HAMMING = "hamming"
28
+ HANNING = "hanning"
29
+ POVEY = "povey"
30
+ RECTANGULAR = "rectangular"
31
+ BLACKMAN = "blackman"
32
+ WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN]
33
+
34
+
35
+ def _get_epsilon(device, dtype):
36
+ return EPSILON.to(device=device, dtype=dtype)
37
+
38
+
39
+ def _next_power_of_2(x: int) -> int:
40
+ r"""Returns the smallest power of 2 that is greater than x"""
41
+ return 1 if x == 0 else 2 ** (x - 1).bit_length()
42
+
43
+
44
+ def _get_strided(waveform: Tensor, window_size: int, window_shift: int, snip_edges: bool) -> Tensor:
45
+ r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``)
46
+ representing how the window is shifted along the waveform. Each row is a frame.
47
+
48
+ Args:
49
+ waveform (Tensor): Tensor of size ``num_samples``
50
+ window_size (int): Frame length
51
+ window_shift (int): Frame shift
52
+ snip_edges (bool): If True, end effects will be handled by outputting only frames that completely fit
53
+ in the file, and the number of frames depends on the frame_length. If False, the number of frames
54
+ depends only on the frame_shift, and we reflect the data at the ends.
55
+
56
+ Returns:
57
+ Tensor: 2D tensor of size (m, ``window_size``) where each row is a frame
58
+ """
59
+ assert waveform.dim() == 1
60
+ num_samples = waveform.size(0)
61
+ strides = (window_shift * waveform.stride(0), waveform.stride(0))
62
+
63
+ if snip_edges:
64
+ if num_samples < window_size:
65
+ return torch.empty((0, 0), dtype=waveform.dtype, device=waveform.device)
66
+ else:
67
+ m = 1 + (num_samples - window_size) // window_shift
68
+ else:
69
+ reversed_waveform = torch.flip(waveform, [0])
70
+ m = (num_samples + (window_shift // 2)) // window_shift
71
+ pad = window_size // 2 - window_shift // 2
72
+ pad_right = reversed_waveform
73
+ if pad > 0:
74
+ # torch.nn.functional.pad returns [2,1,0,1,2] for 'reflect'
75
+ # but we want [2, 1, 0, 0, 1, 2]
76
+ pad_left = reversed_waveform[-pad:]
77
+ waveform = torch.cat((pad_left, waveform, pad_right), dim=0)
78
+ else:
79
+ # pad is negative so we want to trim the waveform at the front
80
+ waveform = torch.cat((waveform[-pad:], pad_right), dim=0)
81
+
82
+ sizes = (m, window_size)
83
+ return waveform.as_strided(sizes, strides)
84
+
85
+
86
+ def _feature_window_function(
87
+ window_type: str,
88
+ window_size: int,
89
+ blackman_coeff: float,
90
+ device: torch.device,
91
+ dtype: int,
92
+ ) -> Tensor:
93
+ r"""Returns a window function with the given type and size"""
94
+ if window_type == HANNING:
95
+ return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype)
96
+ elif window_type == HAMMING:
97
+ return torch.hamming_window(window_size, periodic=False, alpha=0.54, beta=0.46, device=device, dtype=dtype)
98
+ elif window_type == POVEY:
99
+ # like hanning but goes to zero at edges
100
+ return torch.hann_window(window_size, periodic=False, device=device, dtype=dtype).pow(0.85)
101
+ elif window_type == RECTANGULAR:
102
+ return torch.ones(window_size, device=device, dtype=dtype)
103
+ elif window_type == BLACKMAN:
104
+ a = 2 * math.pi / (window_size - 1)
105
+ window_function = torch.arange(window_size, device=device, dtype=dtype)
106
+ # can't use torch.blackman_window as they use different coefficients
107
+ return (
108
+ blackman_coeff
109
+ - 0.5 * torch.cos(a * window_function)
110
+ + (0.5 - blackman_coeff) * torch.cos(2 * a * window_function)
111
+ ).to(device=device, dtype=dtype)
112
+ else:
113
+ raise Exception("Invalid window type " + window_type)
114
+
115
+
116
+ def _get_log_energy(strided_input: Tensor, epsilon: Tensor, energy_floor: float) -> Tensor:
117
+ r"""Returns the log energy of size (m) for a strided_input (m,*)"""
118
+ device, dtype = strided_input.device, strided_input.dtype
119
+ log_energy = torch.max(strided_input.pow(2).sum(1), epsilon).log() # size (m)
120
+ if energy_floor == 0.0:
121
+ return log_energy
122
+ return torch.max(log_energy, torch.tensor(math.log(energy_floor), device=device, dtype=dtype))
123
+
124
+
125
+ def _get_waveform_and_window_properties(
126
+ waveform: Tensor,
127
+ channel: int,
128
+ sample_frequency: float,
129
+ frame_shift: float,
130
+ frame_length: float,
131
+ round_to_power_of_two: bool,
132
+ preemphasis_coefficient: float,
133
+ ) -> Tuple[Tensor, int, int, int]:
134
+ r"""Gets the waveform and window properties"""
135
+ channel = max(channel, 0)
136
+ assert channel < waveform.size(0), "Invalid channel {} for size {}".format(channel, waveform.size(0))
137
+ waveform = waveform[channel, :] # size (n)
138
+ window_shift = int(sample_frequency * frame_shift * MILLISECONDS_TO_SECONDS)
139
+ window_size = int(sample_frequency * frame_length * MILLISECONDS_TO_SECONDS)
140
+ padded_window_size = _next_power_of_2(window_size) if round_to_power_of_two else window_size
141
+
142
+ assert 2 <= window_size <= len(waveform), "choose a window size {} that is [2, {}]".format(
143
+ window_size, len(waveform)
144
+ )
145
+ assert 0 < window_shift, "`window_shift` must be greater than 0"
146
+ assert padded_window_size % 2 == 0, (
147
+ "the padded `window_size` must be divisible by two." " use `round_to_power_of_two` or change `frame_length`"
148
+ )
149
+ assert 0.0 <= preemphasis_coefficient <= 1.0, "`preemphasis_coefficient` must be between [0,1]"
150
+ assert sample_frequency > 0, "`sample_frequency` must be greater than zero"
151
+ return waveform, window_shift, window_size, padded_window_size
152
+
153
+
154
+ def _get_window(
155
+ waveform: Tensor,
156
+ padded_window_size: int,
157
+ window_size: int,
158
+ window_shift: int,
159
+ window_type: str,
160
+ blackman_coeff: float,
161
+ snip_edges: bool,
162
+ raw_energy: bool,
163
+ energy_floor: float,
164
+ dither: float,
165
+ remove_dc_offset: bool,
166
+ preemphasis_coefficient: float,
167
+ ) -> Tuple[Tensor, Tensor]:
168
+ r"""Gets a window and its log energy
169
+
170
+ Returns:
171
+ (Tensor, Tensor): strided_input of size (m, ``padded_window_size``) and signal_log_energy of size (m)
172
+ """
173
+ device, dtype = waveform.device, waveform.dtype
174
+ epsilon = _get_epsilon(device, dtype)
175
+
176
+ # size (m, window_size)
177
+ strided_input = _get_strided(waveform, window_size, window_shift, snip_edges)
178
+
179
+ if dither != 0.0:
180
+ # Returns a random number strictly between 0 and 1
181
+ x = torch.max(epsilon, torch.rand(strided_input.shape, device=device, dtype=dtype))
182
+ rand_gauss = torch.sqrt(-2 * x.log()) * torch.cos(2 * math.pi * x)
183
+ strided_input = strided_input + rand_gauss * dither
184
+
185
+ if remove_dc_offset:
186
+ # Subtract each row/frame by its mean
187
+ row_means = torch.mean(strided_input, dim=1).unsqueeze(1) # size (m, 1)
188
+ strided_input = strided_input - row_means
189
+
190
+ if raw_energy:
191
+ # Compute the log energy of each row/frame before applying preemphasis and
192
+ # window function
193
+ signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
194
+
195
+ if preemphasis_coefficient != 0.0:
196
+ # strided_input[i,j] -= preemphasis_coefficient * strided_input[i, max(0, j-1)] for all i,j
197
+ offset_strided_input = torch.nn.functional.pad(strided_input.unsqueeze(0), (1, 0), mode="replicate").squeeze(
198
+ 0
199
+ ) # size (m, window_size + 1)
200
+ strided_input = strided_input - preemphasis_coefficient * offset_strided_input[:, :-1]
201
+
202
+ # Apply window_function to each row/frame
203
+ window_function = _feature_window_function(window_type, window_size, blackman_coeff, device, dtype).unsqueeze(
204
+ 0
205
+ ) # size (1, window_size)
206
+ strided_input = strided_input * window_function # size (m, window_size)
207
+
208
+ # Pad columns with zero until we reach size (m, padded_window_size)
209
+ if padded_window_size != window_size:
210
+ padding_right = padded_window_size - window_size
211
+ strided_input = torch.nn.functional.pad(
212
+ strided_input.unsqueeze(0), (0, padding_right), mode="constant", value=0
213
+ ).squeeze(0)
214
+
215
+ # Compute energy after window function (not the raw one)
216
+ if not raw_energy:
217
+ signal_log_energy = _get_log_energy(strided_input, epsilon, energy_floor) # size (m)
218
+
219
+ return strided_input, signal_log_energy
220
+
221
+
222
+ def _subtract_column_mean(tensor: Tensor, subtract_mean: bool) -> Tensor:
223
+ # subtracts the column mean of the tensor size (m, n) if subtract_mean=True
224
+ # it returns size (m, n)
225
+ if subtract_mean:
226
+ col_means = torch.mean(tensor, dim=0).unsqueeze(0)
227
+ tensor = tensor - col_means
228
+ return tensor
229
+
230
+
231
+ def spectrogram(
232
+ waveform: Tensor,
233
+ blackman_coeff: float = 0.42,
234
+ channel: int = -1,
235
+ dither: float = 0.0,
236
+ energy_floor: float = 1.0,
237
+ frame_length: float = 25.0,
238
+ frame_shift: float = 10.0,
239
+ min_duration: float = 0.0,
240
+ preemphasis_coefficient: float = 0.97,
241
+ raw_energy: bool = True,
242
+ remove_dc_offset: bool = True,
243
+ round_to_power_of_two: bool = True,
244
+ sample_frequency: float = 16000.0,
245
+ snip_edges: bool = True,
246
+ subtract_mean: bool = False,
247
+ window_type: str = POVEY,
248
+ ) -> Tensor:
249
+ r"""Create a spectrogram from a raw audio signal. This matches the input/output of Kaldi's
250
+ compute-spectrogram-feats.
251
+
252
+ Args:
253
+ waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
254
+ blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
255
+ channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
256
+ dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
257
+ the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
258
+ energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
259
+ this floor is applied to the zeroth component, representing the total signal energy. The floor on the
260
+ individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
261
+ frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
262
+ frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
263
+ min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
264
+ preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
265
+ raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
266
+ remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
267
+ round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
268
+ to FFT. (Default: ``True``)
269
+ sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
270
+ specified there) (Default: ``16000.0``)
271
+ snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
272
+ in the file, and the number of frames depends on the frame_length. If False, the number of frames
273
+ depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
274
+ subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
275
+ it this way. (Default: ``False``)
276
+ window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
277
+ (Default: ``'povey'``)
278
+
279
+ Returns:
280
+ Tensor: A spectrogram identical to what Kaldi would output. The shape is
281
+ (m, ``padded_window_size // 2 + 1``) where m is calculated in _get_strided
282
+ """
283
+ device, dtype = waveform.device, waveform.dtype
284
+ epsilon = _get_epsilon(device, dtype)
285
+
286
+ waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
287
+ waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
288
+ )
289
+
290
+ if len(waveform) < min_duration * sample_frequency:
291
+ # signal is too short
292
+ return torch.empty(0)
293
+
294
+ strided_input, signal_log_energy = _get_window(
295
+ waveform,
296
+ padded_window_size,
297
+ window_size,
298
+ window_shift,
299
+ window_type,
300
+ blackman_coeff,
301
+ snip_edges,
302
+ raw_energy,
303
+ energy_floor,
304
+ dither,
305
+ remove_dc_offset,
306
+ preemphasis_coefficient,
307
+ )
308
+
309
+ # size (m, padded_window_size // 2 + 1, 2)
310
+ fft = torch.fft.rfft(strided_input)
311
+
312
+ # Convert the FFT into a power spectrum
313
+ power_spectrum = torch.max(fft.abs().pow(2.0), epsilon).log() # size (m, padded_window_size // 2 + 1)
314
+ power_spectrum[:, 0] = signal_log_energy
315
+
316
+ power_spectrum = _subtract_column_mean(power_spectrum, subtract_mean)
317
+ return power_spectrum
318
+
319
+
320
+ def inverse_mel_scale_scalar(mel_freq: float) -> float:
321
+ return 700.0 * (math.exp(mel_freq / 1127.0) - 1.0)
322
+
323
+
324
+ def inverse_mel_scale(mel_freq: Tensor) -> Tensor:
325
+ return 700.0 * ((mel_freq / 1127.0).exp() - 1.0)
326
+
327
+
328
+ def mel_scale_scalar(freq: float) -> float:
329
+ return 1127.0 * math.log(1.0 + freq / 700.0)
330
+
331
+
332
+ def mel_scale(freq: Tensor) -> Tensor:
333
+ return 1127.0 * (1.0 + freq / 700.0).log()
334
+
335
+
336
+ def vtln_warp_freq(
337
+ vtln_low_cutoff: float,
338
+ vtln_high_cutoff: float,
339
+ low_freq: float,
340
+ high_freq: float,
341
+ vtln_warp_factor: float,
342
+ freq: Tensor,
343
+ ) -> Tensor:
344
+ r"""This computes a VTLN warping function that is not the same as HTK's one,
345
+ but has similar inputs (this function has the advantage of never producing
346
+ empty bins).
347
+
348
+ This function computes a warp function F(freq), defined between low_freq
349
+ and high_freq inclusive, with the following properties:
350
+ F(low_freq) == low_freq
351
+ F(high_freq) == high_freq
352
+ The function is continuous and piecewise linear with two inflection
353
+ points.
354
+ The lower inflection point (measured in terms of the unwarped
355
+ frequency) is at frequency l, determined as described below.
356
+ The higher inflection point is at a frequency h, determined as
357
+ described below.
358
+ If l <= f <= h, then F(f) = f/vtln_warp_factor.
359
+ If the higher inflection point (measured in terms of the unwarped
360
+ frequency) is at h, then max(h, F(h)) == vtln_high_cutoff.
361
+ Since (by the last point) F(h) == h/vtln_warp_factor, then
362
+ max(h, h/vtln_warp_factor) == vtln_high_cutoff, so
363
+ h = vtln_high_cutoff / max(1, 1/vtln_warp_factor).
364
+ = vtln_high_cutoff * min(1, vtln_warp_factor).
365
+ If the lower inflection point (measured in terms of the unwarped
366
+ frequency) is at l, then min(l, F(l)) == vtln_low_cutoff
367
+ This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor)
368
+ = vtln_low_cutoff * max(1, vtln_warp_factor)
369
+ Args:
370
+ vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
371
+ vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
372
+ low_freq (float): Lower frequency cutoffs in mel computation
373
+ high_freq (float): Upper frequency cutoffs in mel computation
374
+ vtln_warp_factor (float): Vtln warp factor
375
+ freq (Tensor): given frequency in Hz
376
+
377
+ Returns:
378
+ Tensor: Freq after vtln warp
379
+ """
380
+ assert vtln_low_cutoff > low_freq, "be sure to set the vtln_low option higher than low_freq"
381
+ assert vtln_high_cutoff < high_freq, "be sure to set the vtln_high option lower than high_freq [or negative]"
382
+ l = vtln_low_cutoff * max(1.0, vtln_warp_factor)
383
+ h = vtln_high_cutoff * min(1.0, vtln_warp_factor)
384
+ scale = 1.0 / vtln_warp_factor
385
+ Fl = scale * l # F(l)
386
+ Fh = scale * h # F(h)
387
+ assert l > low_freq and h < high_freq
388
+ # slope of left part of the 3-piece linear function
389
+ scale_left = (Fl - low_freq) / (l - low_freq)
390
+ # [slope of center part is just "scale"]
391
+
392
+ # slope of right part of the 3-piece linear function
393
+ scale_right = (high_freq - Fh) / (high_freq - h)
394
+
395
+ res = torch.empty_like(freq)
396
+
397
+ outside_low_high_freq = torch.lt(freq, low_freq) | torch.gt(freq, high_freq) # freq < low_freq || freq > high_freq
398
+ before_l = torch.lt(freq, l) # freq < l
399
+ before_h = torch.lt(freq, h) # freq < h
400
+ after_h = torch.ge(freq, h) # freq >= h
401
+
402
+ # order of operations matter here (since there is overlapping frequency regions)
403
+ res[after_h] = high_freq + scale_right * (freq[after_h] - high_freq)
404
+ res[before_h] = scale * freq[before_h]
405
+ res[before_l] = low_freq + scale_left * (freq[before_l] - low_freq)
406
+ res[outside_low_high_freq] = freq[outside_low_high_freq]
407
+
408
+ return res
409
+
410
+
411
def vtln_warp_mel_freq(
    vtln_low_cutoff: float,
    vtln_high_cutoff: float,
    low_freq: float,
    high_freq: float,
    vtln_warp_factor: float,
    mel_freq: Tensor,
) -> Tensor:
    r"""Apply VTLN warping to frequencies given on the mel scale.

    The mel frequencies are converted back to Hz, warped with the
    piecewise-linear VTLN function (``vtln_warp_freq``), and then mapped
    to the mel scale again.

    Args:
        vtln_low_cutoff (float): Lower frequency cutoffs for VTLN
        vtln_high_cutoff (float): Upper frequency cutoffs for VTLN
        low_freq (float): Lower frequency cutoffs in mel computation
        high_freq (float): Upper frequency cutoffs in mel computation
        vtln_warp_factor (float): Vtln warp factor
        mel_freq (Tensor): Given frequency in Mel

    Returns:
        Tensor: ``mel_freq`` after vtln warp
    """
    # ``low_freq`` is annotated as float here for consistency with the other
    # VTLN helpers; it was the only untyped parameter in this signature.
    warped_hz = vtln_warp_freq(
        vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, vtln_warp_factor, inverse_mel_scale(mel_freq)
    )
    return mel_scale(warped_hz)
436
+
437
+
438
def get_mel_banks(
    num_bins: int,
    window_length_padded: int,
    sample_freq: float,
    low_freq: float,
    high_freq: float,
    vtln_low: float,
    vtln_high: float,
    vtln_warp_factor: float,
) -> Tuple[Tensor, Tensor]:
    """Compute a Kaldi-compatible bank of triangular mel filters, optionally VTLN-warped.

    Args:
        num_bins (int): Number of triangular mel-frequency bins
        window_length_padded (int): Padded window size (must be even)
        sample_freq (float): Waveform sample frequency in Hz
        low_freq (float): Lower frequency cutoff in mel computation
        high_freq (float): Upper frequency cutoff (if <= 0, offset from Nyquist)
        vtln_low (float): Low inflection point of the VTLN warp function
        vtln_high (float): High inflection point (if negative, offset from Nyquist)
        vtln_warp_factor (float): VTLN warp factor (``1.0`` means no warping)

    Returns:
        (Tensor, Tensor): The tuple consists of ``bins`` (which is
        melbank of size (``num_bins``, ``num_fft_bins``)) and ``center_freqs`` (which is
        center frequencies of bins of size (``num_bins``)).
    """
    # NOTE(review): the condition requires num_bins >= 4 while the message says
    # "at least 3" — kept as-is to preserve behavior; confirm intended bound.
    assert num_bins > 3, "Must have at least 3 mel bins"
    assert window_length_padded % 2 == 0
    # Integer division: num_fft_bins is a bin count; "/" produced a float that
    # torch.arange tolerated but that is semantically an int.
    num_fft_bins = window_length_padded // 2
    nyquist = 0.5 * sample_freq

    if high_freq <= 0.0:
        high_freq += nyquist

    assert (
        (0.0 <= low_freq < nyquist) and (0.0 < high_freq <= nyquist) and (low_freq < high_freq)
    ), "Bad values in options: low-freq {} and high-freq {} vs. nyquist {}".format(low_freq, high_freq, nyquist)

    # fft-bin width [think of it as Nyquist-freq / half-window-length]
    fft_bin_width = sample_freq / window_length_padded
    mel_low_freq = mel_scale_scalar(low_freq)
    mel_high_freq = mel_scale_scalar(high_freq)

    # divide by num_bins+1 in next line because of end-effects where the bins
    # spread out to the sides.
    mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1)

    if vtln_high < 0.0:
        vtln_high += nyquist

    assert vtln_warp_factor == 1.0 or (
        (low_freq < vtln_low < high_freq) and (0.0 < vtln_high < high_freq) and (vtln_low < vtln_high)
    ), "Bad values in options: vtln-low {} and vtln-high {}, versus " "low-freq {} and high-freq {}".format(
        vtln_low, vtln_high, low_freq, high_freq
    )

    # ``bin_idx`` (renamed from ``bin`` to avoid shadowing the builtin)
    bin_idx = torch.arange(num_bins).unsqueeze(1)
    left_mel = mel_low_freq + bin_idx * mel_freq_delta  # size(num_bins, 1)
    center_mel = mel_low_freq + (bin_idx + 1.0) * mel_freq_delta  # size(num_bins, 1)
    right_mel = mel_low_freq + (bin_idx + 2.0) * mel_freq_delta  # size(num_bins, 1)

    if vtln_warp_factor != 1.0:
        left_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, left_mel)
        center_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, center_mel)
        right_mel = vtln_warp_mel_freq(vtln_low, vtln_high, low_freq, high_freq, vtln_warp_factor, right_mel)

    center_freqs = inverse_mel_scale(center_mel)  # size (num_bins)
    # size(1, num_fft_bins)
    mel = mel_scale(fft_bin_width * torch.arange(num_fft_bins)).unsqueeze(0)

    # size (num_bins, num_fft_bins)
    up_slope = (mel - left_mel) / (center_mel - left_mel)
    down_slope = (right_mel - mel) / (right_mel - center_mel)

    if vtln_warp_factor == 1.0:
        # left_mel < center_mel < right_mel so we can min the two slopes and clamp negative values
        bins = torch.max(torch.zeros(1), torch.min(up_slope, down_slope))
    else:
        # warping can move the order of left_mel, center_mel, right_mel anywhere
        bins = torch.zeros_like(up_slope)
        up_idx = torch.gt(mel, left_mel) & torch.le(mel, center_mel)  # left_mel < mel <= center_mel
        down_idx = torch.gt(mel, center_mel) & torch.lt(mel, right_mel)  # center_mel < mel < right_mel
        bins[up_idx] = up_slope[up_idx]
        bins[down_idx] = down_slope[down_idx]

    return bins, center_freqs
514
+
515
+
516
def fbank(
    waveform: Tensor,
    blackman_coeff: float = 0.42,
    channel: int = -1,
    dither: float = 0.0,
    energy_floor: float = 1.0,
    frame_length: float = 25.0,
    frame_shift: float = 10.0,
    high_freq: float = 0.0,
    htk_compat: bool = False,
    low_freq: float = 20.0,
    min_duration: float = 0.0,
    num_mel_bins: int = 23,
    preemphasis_coefficient: float = 0.97,
    raw_energy: bool = True,
    remove_dc_offset: bool = True,
    round_to_power_of_two: bool = True,
    sample_frequency: float = 16000.0,
    snip_edges: bool = True,
    subtract_mean: bool = False,
    use_energy: bool = False,
    use_log_fbank: bool = True,
    use_power: bool = True,
    vtln_high: float = -500.0,
    vtln_low: float = 100.0,
    vtln_warp: float = 1.0,
    window_type: str = POVEY,
) -> Tensor:
    r"""Create a fbank from a raw audio signal. This matches the input/output of Kaldi's
    compute-fbank-feats.

    Args:
        waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
        blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
        channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
        dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
            the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
        energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
            this floor is applied to the zeroth component, representing the total signal energy. The floor on the
            individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
        frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
        frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
        high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
            (Default: ``0.0``)
        htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible features
            (need to change other parameters). (Default: ``False``)
        low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
        min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
        num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
        preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
        raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
        remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. (Default: ``True``)
        sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
            specified there) (Default: ``16000.0``)
        snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
            in the file, and the number of frames depends on the frame_length. If False, the number of frames
            depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
        subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
            it this way. (Default: ``False``)
        use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
        use_log_fbank (bool, optional):If true, produce log-filterbank, else produce linear. (Default: ``True``)
        use_power (bool, optional): If true, use power, else use magnitude. (Default: ``True``)
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
            negative, offset from high-mel-freq (Default: ``-500.0``)
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
        vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
        window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
            (Default: ``'povey'``)

    Returns:
        Tensor: A fbank identical to what Kaldi would output. The shape is (m, ``num_mel_bins + use_energy``)
        where m is calculated in _get_strided
    """
    # Remember the caller's device/dtype so intermediate tensors (mel banks,
    # epsilon) can be created to match.
    device, dtype = waveform.device, waveform.dtype

    waveform, window_shift, window_size, padded_window_size = _get_waveform_and_window_properties(
        waveform, channel, sample_frequency, frame_shift, frame_length, round_to_power_of_two, preemphasis_coefficient
    )

    if len(waveform) < min_duration * sample_frequency:
        # signal is too short
        return torch.empty(0, device=device, dtype=dtype)

    # strided_input, size (m, padded_window_size) and signal_log_energy, size (m)
    strided_input, signal_log_energy = _get_window(
        waveform,
        padded_window_size,
        window_size,
        window_shift,
        window_type,
        blackman_coeff,
        snip_edges,
        raw_energy,
        energy_floor,
        dither,
        remove_dc_offset,
        preemphasis_coefficient,
    )

    # size (m, padded_window_size // 2 + 1)
    spectrum = torch.fft.rfft(strided_input).abs()
    if use_power:
        spectrum = spectrum.pow(2.0)

    # size (num_mel_bins, padded_window_size // 2)
    mel_energies, _ = get_mel_banks(
        num_mel_bins, padded_window_size, sample_frequency, low_freq, high_freq, vtln_low, vtln_high, vtln_warp
    )
    mel_energies = mel_energies.to(device=device, dtype=dtype)

    # pad right column with zeros and add dimension, size (num_mel_bins, padded_window_size // 2 + 1)
    # (the extra column aligns the banks with the Nyquist bin of the rfft output)
    mel_energies = torch.nn.functional.pad(mel_energies, (0, 1), mode="constant", value=0)

    # sum with mel fiterbanks over the power spectrum, size (m, num_mel_bins)
    mel_energies = torch.mm(spectrum, mel_energies.T)
    if use_log_fbank:
        # avoid log of zero (which should be prevented anyway by dithering)
        mel_energies = torch.max(mel_energies, _get_epsilon(device, dtype)).log()

    # if use_energy then add it as the last column for htk_compat == true else first column
    if use_energy:
        signal_log_energy = signal_log_energy.unsqueeze(1)  # size (m, 1)
        # returns size (m, num_mel_bins + 1)
        if htk_compat:
            mel_energies = torch.cat((mel_energies, signal_log_energy), dim=1)
        else:
            mel_energies = torch.cat((signal_log_energy, mel_energies), dim=1)

    mel_energies = _subtract_column_mean(mel_energies, subtract_mean)
    return mel_energies
648
+
649
+
650
def _get_dct_matrix(num_ceps: int, num_mel_bins: int) -> Tensor:
    """Build the DCT matrix Kaldi uses to turn mel energies into cepstra.

    Returns a matrix of size (num_mel_bins, num_ceps) suitable for right
    multiplication of a (m, num_mel_bins) feature matrix.
    """
    # Start from a full square orthonormal DCT, size (num_mel_bins, num_mel_bins).
    dct = torchaudio.functional.create_dct(num_mel_bins, num_mel_bins, "ortho")
    # kaldi expects the first cepstral to be weighted sum of factor sqrt(1/num_mel_bins)
    # this would be the first column in the dct_matrix for torchaudio as it expects a
    # right multiply (which would be the first column of the kaldi's dct_matrix as kaldi
    # expects a left multiply e.g. dct_matrix * vector).
    dct[:, 0] = math.sqrt(1 / float(num_mel_bins))
    # Keep only the first num_ceps cepstral coefficients.
    return dct[:, :num_ceps]
661
+
662
+
663
+ def _get_lifter_coeffs(num_ceps: int, cepstral_lifter: float) -> Tensor:
664
+ # returns size (num_ceps)
665
+ # Compute liftering coefficients (scaling on cepstral coeffs)
666
+ # coeffs are numbered slightly differently from HTK: the zeroth index is C0, which is not affected.
667
+ i = torch.arange(num_ceps)
668
+ return 1.0 + 0.5 * cepstral_lifter * torch.sin(math.pi * i / cepstral_lifter)
669
+
670
+
671
def mfcc(
    waveform: Tensor,
    blackman_coeff: float = 0.42,
    cepstral_lifter: float = 22.0,
    channel: int = -1,
    dither: float = 0.0,
    energy_floor: float = 1.0,
    frame_length: float = 25.0,
    frame_shift: float = 10.0,
    high_freq: float = 0.0,
    htk_compat: bool = False,
    low_freq: float = 20.0,
    num_ceps: int = 13,
    min_duration: float = 0.0,
    num_mel_bins: int = 23,
    preemphasis_coefficient: float = 0.97,
    raw_energy: bool = True,
    remove_dc_offset: bool = True,
    round_to_power_of_two: bool = True,
    sample_frequency: float = 16000.0,
    snip_edges: bool = True,
    subtract_mean: bool = False,
    use_energy: bool = False,
    vtln_high: float = -500.0,
    vtln_low: float = 100.0,
    vtln_warp: float = 1.0,
    window_type: str = POVEY,
) -> Tensor:
    r"""Create a mfcc from a raw audio signal. This matches the input/output of Kaldi's
    compute-mfcc-feats.

    Args:
        waveform (Tensor): Tensor of audio of size (c, n) where c is in the range [0,2)
        blackman_coeff (float, optional): Constant coefficient for generalized Blackman window. (Default: ``0.42``)
        cepstral_lifter (float, optional): Constant that controls scaling of MFCCs (Default: ``22.0``)
        channel (int, optional): Channel to extract (-1 -> expect mono, 0 -> left, 1 -> right) (Default: ``-1``)
        dither (float, optional): Dithering constant (0.0 means no dither). If you turn this off, you should set
            the energy_floor option, e.g. to 1.0 or 0.1 (Default: ``0.0``)
        energy_floor (float, optional): Floor on energy (absolute, not relative) in Spectrogram computation. Caution:
            this floor is applied to the zeroth component, representing the total signal energy. The floor on the
            individual spectrogram elements is fixed at std::numeric_limits<float>::epsilon(). (Default: ``1.0``)
        frame_length (float, optional): Frame length in milliseconds (Default: ``25.0``)
        frame_shift (float, optional): Frame shift in milliseconds (Default: ``10.0``)
        high_freq (float, optional): High cutoff frequency for mel bins (if <= 0, offset from Nyquist)
            (Default: ``0.0``)
        htk_compat (bool, optional): If true, put energy last. Warning: not sufficient to get HTK compatible
            features (need to change other parameters). (Default: ``False``)
        low_freq (float, optional): Low cutoff frequency for mel bins (Default: ``20.0``)
        num_ceps (int, optional): Number of cepstra in MFCC computation (including C0) (Default: ``13``)
        min_duration (float, optional): Minimum duration of segments to process (in seconds). (Default: ``0.0``)
        num_mel_bins (int, optional): Number of triangular mel-frequency bins (Default: ``23``)
        preemphasis_coefficient (float, optional): Coefficient for use in signal preemphasis (Default: ``0.97``)
        raw_energy (bool, optional): If True, compute energy before preemphasis and windowing (Default: ``True``)
        remove_dc_offset (bool, optional): Subtract mean from waveform on each frame (Default: ``True``)
        round_to_power_of_two (bool, optional): If True, round window size to power of two by zero-padding input
            to FFT. (Default: ``True``)
        sample_frequency (float, optional): Waveform data sample frequency (must match the waveform file, if
            specified there) (Default: ``16000.0``)
        snip_edges (bool, optional): If True, end effects will be handled by outputting only frames that completely fit
            in the file, and the number of frames depends on the frame_length. If False, the number of frames
            depends only on the frame_shift, and we reflect the data at the ends. (Default: ``True``)
        subtract_mean (bool, optional): Subtract mean of each feature file [CMS]; not recommended to do
            it this way. (Default: ``False``)
        use_energy (bool, optional): Add an extra dimension with energy to the FBANK output. (Default: ``False``)
        vtln_high (float, optional): High inflection point in piecewise linear VTLN warping function (if
            negative, offset from high-mel-freq (Default: ``-500.0``)
        vtln_low (float, optional): Low inflection point in piecewise linear VTLN warping function (Default: ``100.0``)
        vtln_warp (float, optional): Vtln warp factor (only applicable if vtln_map not specified) (Default: ``1.0``)
        window_type (str, optional): Type of window ('hamming'|'hanning'|'povey'|'rectangular'|'blackman')
            (Default: ``"povey"``)

    Returns:
        Tensor: A mfcc identical to what Kaldi would output. The shape is (m, ``num_ceps``)
        where m is calculated in _get_strided
    """
    assert num_ceps <= num_mel_bins, "num_ceps cannot be larger than num_mel_bins: %d vs %d" % (num_ceps, num_mel_bins)

    device, dtype = waveform.device, waveform.dtype

    # The mel_energies should not be squared (use_power=True), not have mean subtracted
    # (subtract_mean=False), and use log (use_log_fbank=True).
    # size (m, num_mel_bins + use_energy)
    feature = fbank(
        waveform=waveform,
        blackman_coeff=blackman_coeff,
        channel=channel,
        dither=dither,
        energy_floor=energy_floor,
        frame_length=frame_length,
        frame_shift=frame_shift,
        high_freq=high_freq,
        htk_compat=htk_compat,
        low_freq=low_freq,
        min_duration=min_duration,
        num_mel_bins=num_mel_bins,
        preemphasis_coefficient=preemphasis_coefficient,
        raw_energy=raw_energy,
        remove_dc_offset=remove_dc_offset,
        round_to_power_of_two=round_to_power_of_two,
        sample_frequency=sample_frequency,
        snip_edges=snip_edges,
        subtract_mean=False,
        use_energy=use_energy,
        use_log_fbank=True,
        use_power=True,
        vtln_high=vtln_high,
        vtln_low=vtln_low,
        vtln_warp=vtln_warp,
        window_type=window_type,
    )

    if use_energy:
        # Pull the energy column out before the DCT; its position depends on
        # htk_compat (last column when True, first when False).
        # size (m)
        signal_log_energy = feature[:, num_mel_bins if htk_compat else 0]
        # offset is 0 if htk_compat==True else 1
        mel_offset = int(not htk_compat)
        feature = feature[:, mel_offset : (num_mel_bins + mel_offset)]

    # size (num_mel_bins, num_ceps)
    dct_matrix = _get_dct_matrix(num_ceps, num_mel_bins).to(dtype=dtype, device=device)

    # size (m, num_ceps)
    feature = feature.matmul(dct_matrix)

    if cepstral_lifter != 0.0:
        # size (1, num_ceps)
        lifter_coeffs = _get_lifter_coeffs(num_ceps, cepstral_lifter).unsqueeze(0)
        feature *= lifter_coeffs.to(device=device, dtype=dtype)

    # if use_energy then replace the last column for htk_compat == true else first column
    if use_energy:
        feature[:, 0] = signal_log_energy

    if htk_compat:
        # Move C0 (or the energy that replaced it) to the last column.
        energy = feature[:, 0].unsqueeze(1)  # size (m, 1)
        feature = feature[:, 1:]  # size (m, num_ceps - 1)
        if not use_energy:
            # scale on C0 (actually removing a scale we previously added that's
            # part of one common definition of the cosine transform.)
            energy *= math.sqrt(2)

        feature = torch.cat((feature, energy), dim=1)

    feature = _subtract_column_mean(feature, subtract_mean)
    return feature
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/__init__.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Re-export every dataset class at package level so callers can write
# e.g. ``torchaudio.datasets.LIBRISPEECH`` without importing submodules.
from .cmuarctic import CMUARCTIC
from .cmudict import CMUDict
from .commonvoice import COMMONVOICE
from .dr_vctk import DR_VCTK
from .gtzan import GTZAN
from .librilight_limited import LibriLightLimited
from .librimix import LibriMix
from .librispeech import LIBRISPEECH
from .libritts import LIBRITTS
from .ljspeech import LJSPEECH
from .quesst14 import QUESST14
from .speechcommands import SPEECHCOMMANDS
from .tedlium import TEDLIUM
from .vctk import VCTK_092
from .yesno import YESNO


# Explicit public API; keep this list in sync with the imports above.
__all__ = [
    "COMMONVOICE",
    "LIBRISPEECH",
    "LibriLightLimited",
    "SPEECHCOMMANDS",
    "VCTK_092",
    "DR_VCTK",
    "YESNO",
    "LJSPEECH",
    "GTZAN",
    "CMUARCTIC",
    "CMUDict",
    "LibriMix",
    "LIBRITTS",
    "TEDLIUM",
    "QUESST14",
]
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmuarctic.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Tuple, Union
5
+
6
+ import torchaudio
7
+ from torch import Tensor
8
+ from torch.hub import download_url_to_file
9
+ from torch.utils.data import Dataset
10
+ from torchaudio.datasets.utils import extract_archive
11
+
12
+ URL = "aew"
13
+ FOLDER_IN_ARCHIVE = "ARCTIC"
14
+ _CHECKSUMS = {
15
+ "http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2": "645cb33c0f0b2ce41384fdd8d3db2c3f5fc15c1e688baeb74d2e08cab18ab406", # noqa: E501
16
+ "http://festvox.org/cmu_arctic/packed/cmu_us_ahw_arctic.tar.bz2": "024664adeb892809d646a3efd043625b46b5bfa3e6189b3500b2d0d59dfab06c", # noqa: E501
17
+ "http://festvox.org/cmu_arctic/packed/cmu_us_aup_arctic.tar.bz2": "2c55bc3050caa996758869126ad10cf42e1441212111db034b3a45189c18b6fc", # noqa: E501
18
+ "http://festvox.org/cmu_arctic/packed/cmu_us_awb_arctic.tar.bz2": "d74a950c9739a65f7bfc4dfa6187f2730fa03de5b8eb3f2da97a51b74df64d3c", # noqa: E501
19
+ "http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2": "dd65c3d2907d1ee52f86e44f578319159e60f4bf722a9142be01161d84e330ff", # noqa: E501
20
+ "http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2": "26b91aaf48b2799b2956792b4632c2f926cd0542f402b5452d5adecb60942904", # noqa: E501
21
+ "http://festvox.org/cmu_arctic/packed/cmu_us_clb_arctic.tar.bz2": "3f16dc3f3b97955ea22623efb33b444341013fc660677b2e170efdcc959fa7c6", # noqa: E501
22
+ "http://festvox.org/cmu_arctic/packed/cmu_us_eey_arctic.tar.bz2": "8a0ee4e5acbd4b2f61a4fb947c1730ab3adcc9dc50b195981d99391d29928e8a", # noqa: E501
23
+ "http://festvox.org/cmu_arctic/packed/cmu_us_fem_arctic.tar.bz2": "3fcff629412b57233589cdb058f730594a62c4f3a75c20de14afe06621ef45e2", # noqa: E501
24
+ "http://festvox.org/cmu_arctic/packed/cmu_us_gka_arctic.tar.bz2": "dc82e7967cbd5eddbed33074b0699128dbd4482b41711916d58103707e38c67f", # noqa: E501
25
+ "http://festvox.org/cmu_arctic/packed/cmu_us_jmk_arctic.tar.bz2": "3a37c0e1dfc91e734fdbc88b562d9e2ebca621772402cdc693bbc9b09b211d73", # noqa: E501
26
+ "http://festvox.org/cmu_arctic/packed/cmu_us_ksp_arctic.tar.bz2": "8029cafce8296f9bed3022c44ef1e7953332b6bf6943c14b929f468122532717", # noqa: E501
27
+ "http://festvox.org/cmu_arctic/packed/cmu_us_ljm_arctic.tar.bz2": "b23993765cbf2b9e7bbc3c85b6c56eaf292ac81ee4bb887b638a24d104f921a0", # noqa: E501
28
+ "http://festvox.org/cmu_arctic/packed/cmu_us_lnh_arctic.tar.bz2": "4faf34d71aa7112813252fb20c5433e2fdd9a9de55a00701ffcbf05f24a5991a", # noqa: E501
29
+ "http://festvox.org/cmu_arctic/packed/cmu_us_rms_arctic.tar.bz2": "c6dc11235629c58441c071a7ba8a2d067903dfefbaabc4056d87da35b72ecda4", # noqa: E501
30
+ "http://festvox.org/cmu_arctic/packed/cmu_us_rxr_arctic.tar.bz2": "1fa4271c393e5998d200e56c102ff46fcfea169aaa2148ad9e9469616fbfdd9b", # noqa: E501
31
+ "http://festvox.org/cmu_arctic/packed/cmu_us_slp_arctic.tar.bz2": "54345ed55e45c23d419e9a823eef427f1cc93c83a710735ec667d068c916abf1", # noqa: E501
32
+ "http://festvox.org/cmu_arctic/packed/cmu_us_slt_arctic.tar.bz2": "7c173297916acf3cc7fcab2713be4c60b27312316765a90934651d367226b4ea", # noqa: E501
33
+ }
34
+
35
+
36
def load_cmuarctic_item(line: str, path: str, folder_audio: str, ext_audio: str) -> Tuple[Tensor, int, str, str]:
    """Load one CMU ARCTIC example described by a line of ``txt.done.data``.

    Returns ``(waveform, sample_rate, transcript, utterance_id)``.
    """
    # NOTE(review): despite the ``str`` annotation, ``line`` is indexed with
    # ``line[0]`` below — it is presumably a one-element csv row (list) coming
    # from the ``csv.reader`` walker in CMUARCTIC; confirm and fix annotation.
    #
    # A txt.done.data line presumably looks like:
    #   ( arctic_a0001 "some transcript" )
    # so split(" ", 2)[1:] yields [utterance_id, '"transcript" )'].
    utterance_id, transcript = line[0].strip().split(" ", 2)[1:]

    # Remove space, double quote, and single parenthesis from transcript
    # (drops the leading '"' and the trailing '" )').
    transcript = transcript[1:-3]

    file_audio = os.path.join(path, folder_audio, utterance_id + ext_audio)

    # Load audio
    waveform, sample_rate = torchaudio.load(file_audio)

    # utterance_id like "arctic_a0001" -> keep only the "a0001" part
    return (waveform, sample_rate, transcript, utterance_id.split("_")[1])
49
+
50
+
51
class CMUARCTIC(Dataset):
    """Create a Dataset for *CMU ARCTIC* [:footcite:`Kominek03cmuarctic`].

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional):
            The URL to download the dataset from or the type of the dataset to download.
            (default: ``"aew"``)
            Allowed type values are ``"aew"``, ``"ahw"``, ``"aup"``, ``"awb"``, ``"axb"``, ``"bdl"``,
            ``"clb"``, ``"eey"``, ``"fem"``, ``"gka"``, ``"jmk"``, ``"ksp"``, ``"ljm"``, ``"lnh"``,
            ``"rms"``, ``"rxr"``, ``"slp"`` or ``"slt"``.
        folder_in_archive (str, optional):
            The top-level directory of the dataset. (default: ``"ARCTIC"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
    """

    # Layout of an extracted archive: transcripts under etc/txt.done.data,
    # audio files under wav/*.wav.
    _file_text = "txt.done.data"
    _folder_text = "etc"
    _ext_audio = ".wav"
    _folder_audio = "wav"

    def __init__(
        self, root: Union[str, Path], url: str = URL, folder_in_archive: str = FOLDER_IN_ARCHIVE, download: bool = False
    ) -> None:

        if url in [
            "aew",
            "ahw",
            "aup",
            "awb",
            "axb",
            "bdl",
            "clb",
            "eey",
            "fem",
            "gka",
            "jmk",
            "ksp",
            "ljm",
            "lnh",
            "rms",
            "rxr",
            "slp",
            "slt",
        ]:

            url = "cmu_us_" + url + "_arctic"
            ext_archive = ".tar.bz2"
            # Fix: no "www." prefix here so the URL matches the keys of
            # _CHECKSUMS exactly; with "http://www.festvox.org/..." the
            # checksum lookup below always returned None and downloads were
            # never hash-verified.
            base_url = "http://festvox.org/cmu_arctic/packed/"

            # Fix: build the URL by concatenation; os.path.join would insert
            # backslashes on Windows.
            url = base_url + url + ext_archive

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        basename = os.path.basename(url)
        root = os.path.join(root, folder_in_archive)
        if not os.path.isdir(root):
            os.mkdir(root)
        archive = os.path.join(root, basename)

        # "cmu_us_aew_arctic.tar.bz2" -> "cmu_us_aew_arctic"
        basename = basename.split(".")[0]

        self._path = os.path.join(root, basename)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url_to_file(url, archive, hash_prefix=checksum)
                extract_archive(archive)
        else:
            if not os.path.exists(self._path):
                raise RuntimeError(
                    f"The path {self._path} doesn't exist. "
                    "Please check the ``root`` path or set `download=True` to download it"
                )
        self._text = os.path.join(self._path, self._folder_text, self._file_text)

        # One csv row per utterance; parsing into (waveform, ...) happens
        # lazily in __getitem__ via load_cmuarctic_item.
        with open(self._text, "r") as text:
            walker = csv.reader(text, delimiter="\n")
            self._walker = list(walker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, str, str): ``(waveform, sample_rate, transcript, utterance_id)``
        """
        line = self._walker[n]
        return load_cmuarctic_item(line, self._path, self._folder_audio, self._ext_audio)

    def __len__(self) -> int:
        return len(self._walker)
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/cmudict.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Tuple, Union
5
+
6
+ from torch.hub import download_url_to_file
7
+ from torch.utils.data import Dataset
8
+
9
+ _CHECKSUMS = {
10
+ "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b": "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4", # noqa: E501
11
+ "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols": "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027", # noqa: E501
12
+ }
13
+ _PUNCTUATIONS = set(
14
+ [
15
+ "!EXCLAMATION-POINT",
16
+ '"CLOSE-QUOTE',
17
+ '"DOUBLE-QUOTE',
18
+ '"END-OF-QUOTE',
19
+ '"END-QUOTE',
20
+ '"IN-QUOTES',
21
+ '"QUOTE',
22
+ '"UNQUOTE',
23
+ "#HASH-MARK",
24
+ "#POUND-SIGN",
25
+ "#SHARP-SIGN",
26
+ "%PERCENT",
27
+ "&AMPERSAND",
28
+ "'END-INNER-QUOTE",
29
+ "'END-QUOTE",
30
+ "'INNER-QUOTE",
31
+ "'QUOTE",
32
+ "'SINGLE-QUOTE",
33
+ "(BEGIN-PARENS",
34
+ "(IN-PARENTHESES",
35
+ "(LEFT-PAREN",
36
+ "(OPEN-PARENTHESES",
37
+ "(PAREN",
38
+ "(PARENS",
39
+ "(PARENTHESES",
40
+ ")CLOSE-PAREN",
41
+ ")CLOSE-PARENTHESES",
42
+ ")END-PAREN",
43
+ ")END-PARENS",
44
+ ")END-PARENTHESES",
45
+ ")END-THE-PAREN",
46
+ ")PAREN",
47
+ ")PARENS",
48
+ ")RIGHT-PAREN",
49
+ ")UN-PARENTHESES",
50
+ "+PLUS",
51
+ ",COMMA",
52
+ "--DASH",
53
+ "-DASH",
54
+ "-HYPHEN",
55
+ "...ELLIPSIS",
56
+ ".DECIMAL",
57
+ ".DOT",
58
+ ".FULL-STOP",
59
+ ".PERIOD",
60
+ ".POINT",
61
+ "/SLASH",
62
+ ":COLON",
63
+ ";SEMI-COLON",
64
+ ";SEMI-COLON(1)",
65
+ "?QUESTION-MARK",
66
+ "{BRACE",
67
+ "{LEFT-BRACE",
68
+ "{OPEN-BRACE",
69
+ "}CLOSE-BRACE",
70
+ "}RIGHT-BRACE",
71
+ ]
72
+ )
73
+
74
+
75
def _parse_dictionary(lines: Iterable[str], exclude_punctuations: bool) -> List[Tuple[str, List[str]]]:
    """Parse raw CMUDict lines into ``(word, [phonemes])`` pairs.

    Args:
        lines (Iterable[str]): Raw lines of the ``cmudict-0.7b`` file.
        exclude_punctuations (bool): If True, drop punctuation entries such
            as ``!EXCLAMATION-POINT``.

    Returns:
        List[Tuple[str, List[str]]]: One ``(word, phonemes)`` entry per
        dictionary line. (Fix: the annotation previously said ``List[str]``,
        which did not match the returned value.)
    """
    _alt_re = re.compile(r"\([0-9]+\)")
    cmudict: List[Tuple[str, List[str]]] = list()
    for line in lines:
        if not line or line.startswith(";;;"):  # ignore comments
            continue

        # Fix: CMUDict separates the word from its pronunciation with TWO
        # spaces (the phonemes themselves are single-space separated), so the
        # first split must be on "  "; splitting on a single space raises
        # ValueError for every multi-phoneme entry.
        word, phones = line.strip().split("  ")
        if word in _PUNCTUATIONS:
            if exclude_punctuations:
                continue
            # !EXCLAMATION-POINT -> !
            # --DASH -> --
            # ...ELLIPSIS -> ...
            if word.startswith("..."):
                word = "..."
            elif word.startswith("--"):
                word = "--"
            else:
                word = word[0]

        # if a word have multiple pronunciations, there will be (number) appended to it
        # for example, DATAPOINTS and DATAPOINTS(1),
        # the regular expression `_alt_re` removes the '(1)' and change the word DATAPOINTS(1) to DATAPOINTS
        word = re.sub(_alt_re, "", word)
        phones = phones.split(" ")
        cmudict.append((word, phones))

    return cmudict
104
+
105
+
106
class CMUDict(Dataset):
    """Create a Dataset for *CMU Pronouncing Dictionary* [:footcite:`cmudict`] (CMUDict).

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        exclude_punctuations (bool, optional):
            When enabled, exclude the pronounciation of punctuations, such as
            `!EXCLAMATION-POINT` and `#HASH-MARK`.
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        url (str, optional):
            The URL to download the dictionary from.
            (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b"``)
        url_symbols (str, optional):
            The URL to download the list of symbols from.
            (default: ``"http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols"``)
    """

    def __init__(
        self,
        root: Union[str, Path],
        exclude_punctuations: bool = True,
        *,
        download: bool = False,
        url: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b",
        url_symbols: str = "http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-0.7b.symbols",
    ) -> None:

        self.exclude_punctuations = exclude_punctuations

        # Unlike other datasets in this package, the root directory must
        # already exist; it is never created here.
        self._root_path = Path(root)
        if not os.path.isdir(self._root_path):
            raise RuntimeError(f"The root directory does not exist; {root}")

        dict_file = self._root_path / os.path.basename(url)
        symbol_file = self._root_path / os.path.basename(url_symbols)
        if not os.path.exists(dict_file):
            if not download:
                raise RuntimeError(
                    "The dictionary file is not found in the following location. "
                    f"Set `download=True` to download it. {dict_file}"
                )
            checksum = _CHECKSUMS.get(url, None)
            # third positional argument of download_url_to_file is ``hash_prefix``
            download_url_to_file(url, dict_file, checksum)
        if not os.path.exists(symbol_file):
            if not download:
                raise RuntimeError(
                    "The symbol file is not found in the following location. "
                    f"Set `download=True` to download it. {symbol_file}"
                )
            checksum = _CHECKSUMS.get(url_symbols, None)
            download_url_to_file(url_symbols, symbol_file, checksum)

        with open(symbol_file, "r") as text:
            self._symbols = [line.strip() for line in text.readlines()]

        # NOTE: the dictionary file is opened as latin-1 — presumably because
        # cmudict-0.7b contains non-UTF-8 bytes; confirm before changing.
        with open(dict_file, "r", encoding="latin-1") as text:
            self._dictionary = _parse_dictionary(text.readlines(), exclude_punctuations=self.exclude_punctuations)

    def __getitem__(self, n: int) -> Tuple[str, List[str]]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded.

        Returns:
            (str, List[str]): The corresponding word and phonemes ``(word, [phonemes])``.

        """
        return self._dictionary[n]

    def __len__(self) -> int:
        return len(self._dictionary)

    @property
    def symbols(self) -> List[str]:
        """list[str]: A list of phonemes symbols, such as `AA`, `AE`, `AH`."""
        # Return a copy so callers cannot mutate the dataset's internal list.
        return self._symbols.copy()
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/commonvoice.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import os
3
+ from pathlib import Path
4
+ from typing import Dict, List, Tuple, Union
5
+
6
+ import torchaudio
7
+ from torch import Tensor
8
+ from torch.utils.data import Dataset
9
+
10
+
11
def load_commonvoice_item(
    line: List[str], header: List[str], path: str, folder_audio: str, ext_audio: str
) -> Tuple[Tensor, int, Dict[str, str]]:
    """Load one CommonVoice sample: the referenced audio plus its TSV metadata.

    Each TSV row carries the fields:
    client_id, path, sentence, up_votes, down_votes, age, gender, accent
    """
    assert header[1] == "path"

    # Column 1 holds the audio file name; append the extension if absent.
    audio_name = line[1]
    audio_path = os.path.join(path, folder_audio, audio_name)
    if not audio_path.endswith(ext_audio):
        audio_path = audio_path + ext_audio

    waveform, sample_rate = torchaudio.load(audio_path)

    # Pair every header column with its value for this row.
    metadata = dict(zip(header, line))

    return waveform, sample_rate, metadata
27
+
28
+
29
class COMMONVOICE(Dataset):
    """Create a Dataset for *CommonVoice* [:footcite:`ardila2020common`].

    Args:
        root (str or Path): Path to the directory where the dataset is located.
            (Where the ``tsv`` file is present.)
        tsv (str, optional):
            The name of the tsv file used to construct the metadata, such as
            ``"train.tsv"``, ``"test.tsv"``, ``"dev.tsv"``, ``"invalidated.tsv"``,
            ``"validated.tsv"`` and ``"other.tsv"``. (default: ``"train.tsv"``)
    """

    _ext_txt = ".txt"
    _ext_audio = ".mp3"
    _folder_audio = "clips"

    def __init__(self, root: Union[str, Path], tsv: str = "train.tsv") -> None:
        # Get string representation of 'root' in case Path object is passed
        self._path = os.fspath(root)
        self._tsv = os.path.join(self._path, tsv)

        # CommonVoice TSV files are UTF-8 encoded; pass the encoding explicitly
        # so loading does not depend on the platform's locale default (which is
        # not UTF-8 on e.g. Windows). ``newline=""`` is what the csv module
        # documents for file objects handed to csv.reader.
        with open(self._tsv, "r", encoding="utf-8", newline="") as tsv_:
            walker = csv.reader(tsv_, delimiter="\t")
            self._header = next(walker)  # first row holds the column names
            self._walker = list(walker)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, Dict[str, str]]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, Dict[str, str]): ``(waveform, sample_rate, dictionary)``, where dictionary
            is built from the TSV file with the following keys: ``client_id``, ``path``, ``sentence``,
            ``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``.
        """
        line = self._walker[n]
        return load_commonvoice_item(line, self._header, self._path, self._folder_audio, self._ext_audio)

    def __len__(self) -> int:
        """Return the number of rows read from the TSV file."""
        return len(self._walker)
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/dr_vctk.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Dict, Tuple, Union
3
+
4
+ import torchaudio
5
+ from torch import Tensor
6
+ from torch.hub import download_url_to_file
7
+ from torch.utils.data import Dataset
8
+ from torchaudio.datasets.utils import extract_archive
9
+
10
+
11
_URL = "https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"
_CHECKSUM = "781f12f4406ed36ed27ae3bce55da47ba176e2d8bae67319e389e07b2c9bd769"
_SUPPORTED_SUBSETS = {"train", "test"}


class DR_VCTK(Dataset):
    """Create a dataset for *Device Recorded VCTK (Small subset version)* [:footcite:`Sarfjoo2018DeviceRV`].

    Args:
        root (str or Path): Root directory where the dataset's top level directory is found.
        subset (str): The subset to use. Can be one of ``"train"`` and ``"test"``. (default: ``"train"``).
        download (bool):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        url (str): The URL to download the dataset from.
            (default: ``"https://datashare.ed.ac.uk/bitstream/handle/10283/3038/DR-VCTK.zip"``)
    """

    def __init__(
        self,
        root: Union[str, Path],
        subset: str = "train",
        *,
        download: bool = False,
        url: str = _URL,
    ) -> None:
        if subset not in _SUPPORTED_SUBSETS:
            raise RuntimeError(
                f"The subset '{subset}' does not match any of the supported subsets: {_SUPPORTED_SUBSETS}"
            )

        dataset_root = Path(root).expanduser()
        archive = dataset_root / "DR-VCTK.zip"

        self._subset = subset
        self._path = dataset_root / "DR-VCTK" / "DR-VCTK"
        self._clean_audio_dir = self._path / f"clean_{self._subset}set_wav_16k"
        self._noisy_audio_dir = self._path / f"device-recorded_{self._subset}set_wav_16k"
        self._config_filepath = self._path / "configurations" / f"{self._subset}_ch_log.txt"

        # Fetch and unpack the archive only when the extracted tree is absent.
        if not self._path.is_dir():
            if not archive.is_file():
                if not download:
                    raise RuntimeError("Dataset not found. Please use `download=True` to download it.")
                download_url_to_file(url, archive, hash_prefix=_CHECKSUM)
            extract_archive(archive, dataset_root)

        self._config = self._load_config(self._config_filepath)
        self._filename_list = sorted(self._config)

    def _load_config(self, filepath: str) -> Dict[str, Tuple[str, int]]:
        """Parse the channel-log file into ``{filename: (source, channel_id)}``."""
        # The train log has a two-line header, the test log a one-line header.
        header_rows = 2 if self._subset == "train" else 1

        entries: Dict[str, Tuple[str, int]] = {}
        with open(filepath) as f:
            for row_index, row in enumerate(f):
                if row_index < header_rows or not row:
                    continue
                fname, source, channel = row.strip().split("\t")
                entries[fname] = (source, int(channel))
        return entries

    def _load_dr_vctk_item(self, filename: str) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]:
        """Load the clean/noisy waveform pair plus metadata for one utterance."""
        # Filenames look like ``<speaker_id>_<utterance_id>.wav``.
        speaker_id, utterance_id = filename.split(".")[0].split("_")
        source, channel_id = self._config[filename]
        waveform_clean, sample_rate_clean = torchaudio.load(self._clean_audio_dir / filename)
        waveform_noisy, sample_rate_noisy = torchaudio.load(self._noisy_audio_dir / filename)
        return (
            waveform_clean,
            sample_rate_clean,
            waveform_noisy,
            sample_rate_noisy,
            speaker_id,
            utterance_id,
            source,
            channel_id,
        )

    def __getitem__(self, n: int) -> Tuple[Tensor, int, Tensor, int, str, str, str, int]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, Tensor, int, str, str, str, int):
            ``(waveform_clean, sample_rate_clean, waveform_noisy, sample_rate_noisy, speaker_id,\
            utterance_id, source, channel_id)``
        """
        return self._load_dr_vctk_item(self._filename_list[n])

    def __len__(self) -> int:
        """Return the number of utterances listed in the channel log."""
        return len(self._filename_list)
+ return len(self._filename_list)
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/gtzan.py ADDED
@@ -0,0 +1,1108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Optional, Tuple, Union
4
+
5
+ import torchaudio
6
+ from torch import Tensor
7
+ from torch.hub import download_url_to_file
8
+ from torch.utils.data import Dataset
9
+ from torchaudio.datasets.utils import extract_archive
10
+
11
+ # The following lists prefixed with `filtered_` provide a filtered split
12
+ # that:
13
+ #
14
+ # a. Mitigate a known issue with GTZAN (duplication)
15
+ #
16
+ # b. Provide a standard split for testing it against other
17
+ # methods (e.g. the one in jordipons/sklearn-audio-transfer-learning).
18
+ #
19
+ # Those are used when GTZAN is initialised with the `filtered` keyword.
20
+ # The split was taken from (github) jordipons/sklearn-audio-transfer-learning.
21
+
22
+ gtzan_genres = [
23
+ "blues",
24
+ "classical",
25
+ "country",
26
+ "disco",
27
+ "hiphop",
28
+ "jazz",
29
+ "metal",
30
+ "pop",
31
+ "reggae",
32
+ "rock",
33
+ ]
34
+
35
+ filtered_test = [
36
+ "blues.00012",
37
+ "blues.00013",
38
+ "blues.00014",
39
+ "blues.00015",
40
+ "blues.00016",
41
+ "blues.00017",
42
+ "blues.00018",
43
+ "blues.00019",
44
+ "blues.00020",
45
+ "blues.00021",
46
+ "blues.00022",
47
+ "blues.00023",
48
+ "blues.00024",
49
+ "blues.00025",
50
+ "blues.00026",
51
+ "blues.00027",
52
+ "blues.00028",
53
+ "blues.00061",
54
+ "blues.00062",
55
+ "blues.00063",
56
+ "blues.00064",
57
+ "blues.00065",
58
+ "blues.00066",
59
+ "blues.00067",
60
+ "blues.00068",
61
+ "blues.00069",
62
+ "blues.00070",
63
+ "blues.00071",
64
+ "blues.00072",
65
+ "blues.00098",
66
+ "blues.00099",
67
+ "classical.00011",
68
+ "classical.00012",
69
+ "classical.00013",
70
+ "classical.00014",
71
+ "classical.00015",
72
+ "classical.00016",
73
+ "classical.00017",
74
+ "classical.00018",
75
+ "classical.00019",
76
+ "classical.00020",
77
+ "classical.00021",
78
+ "classical.00022",
79
+ "classical.00023",
80
+ "classical.00024",
81
+ "classical.00025",
82
+ "classical.00026",
83
+ "classical.00027",
84
+ "classical.00028",
85
+ "classical.00029",
86
+ "classical.00034",
87
+ "classical.00035",
88
+ "classical.00036",
89
+ "classical.00037",
90
+ "classical.00038",
91
+ "classical.00039",
92
+ "classical.00040",
93
+ "classical.00041",
94
+ "classical.00049",
95
+ "classical.00077",
96
+ "classical.00078",
97
+ "classical.00079",
98
+ "country.00030",
99
+ "country.00031",
100
+ "country.00032",
101
+ "country.00033",
102
+ "country.00034",
103
+ "country.00035",
104
+ "country.00036",
105
+ "country.00037",
106
+ "country.00038",
107
+ "country.00039",
108
+ "country.00040",
109
+ "country.00043",
110
+ "country.00044",
111
+ "country.00046",
112
+ "country.00047",
113
+ "country.00048",
114
+ "country.00050",
115
+ "country.00051",
116
+ "country.00053",
117
+ "country.00054",
118
+ "country.00055",
119
+ "country.00056",
120
+ "country.00057",
121
+ "country.00058",
122
+ "country.00059",
123
+ "country.00060",
124
+ "country.00061",
125
+ "country.00062",
126
+ "country.00063",
127
+ "country.00064",
128
+ "disco.00001",
129
+ "disco.00021",
130
+ "disco.00058",
131
+ "disco.00062",
132
+ "disco.00063",
133
+ "disco.00064",
134
+ "disco.00065",
135
+ "disco.00066",
136
+ "disco.00069",
137
+ "disco.00076",
138
+ "disco.00077",
139
+ "disco.00078",
140
+ "disco.00079",
141
+ "disco.00080",
142
+ "disco.00081",
143
+ "disco.00082",
144
+ "disco.00083",
145
+ "disco.00084",
146
+ "disco.00085",
147
+ "disco.00086",
148
+ "disco.00087",
149
+ "disco.00088",
150
+ "disco.00091",
151
+ "disco.00092",
152
+ "disco.00093",
153
+ "disco.00094",
154
+ "disco.00096",
155
+ "disco.00097",
156
+ "disco.00099",
157
+ "hiphop.00000",
158
+ "hiphop.00026",
159
+ "hiphop.00027",
160
+ "hiphop.00030",
161
+ "hiphop.00040",
162
+ "hiphop.00043",
163
+ "hiphop.00044",
164
+ "hiphop.00045",
165
+ "hiphop.00051",
166
+ "hiphop.00052",
167
+ "hiphop.00053",
168
+ "hiphop.00054",
169
+ "hiphop.00062",
170
+ "hiphop.00063",
171
+ "hiphop.00064",
172
+ "hiphop.00065",
173
+ "hiphop.00066",
174
+ "hiphop.00067",
175
+ "hiphop.00068",
176
+ "hiphop.00069",
177
+ "hiphop.00070",
178
+ "hiphop.00071",
179
+ "hiphop.00072",
180
+ "hiphop.00073",
181
+ "hiphop.00074",
182
+ "hiphop.00075",
183
+ "hiphop.00099",
184
+ "jazz.00073",
185
+ "jazz.00074",
186
+ "jazz.00075",
187
+ "jazz.00076",
188
+ "jazz.00077",
189
+ "jazz.00078",
190
+ "jazz.00079",
191
+ "jazz.00080",
192
+ "jazz.00081",
193
+ "jazz.00082",
194
+ "jazz.00083",
195
+ "jazz.00084",
196
+ "jazz.00085",
197
+ "jazz.00086",
198
+ "jazz.00087",
199
+ "jazz.00088",
200
+ "jazz.00089",
201
+ "jazz.00090",
202
+ "jazz.00091",
203
+ "jazz.00092",
204
+ "jazz.00093",
205
+ "jazz.00094",
206
+ "jazz.00095",
207
+ "jazz.00096",
208
+ "jazz.00097",
209
+ "jazz.00098",
210
+ "jazz.00099",
211
+ "metal.00012",
212
+ "metal.00013",
213
+ "metal.00014",
214
+ "metal.00015",
215
+ "metal.00022",
216
+ "metal.00023",
217
+ "metal.00025",
218
+ "metal.00026",
219
+ "metal.00027",
220
+ "metal.00028",
221
+ "metal.00029",
222
+ "metal.00030",
223
+ "metal.00031",
224
+ "metal.00032",
225
+ "metal.00033",
226
+ "metal.00038",
227
+ "metal.00039",
228
+ "metal.00067",
229
+ "metal.00070",
230
+ "metal.00073",
231
+ "metal.00074",
232
+ "metal.00075",
233
+ "metal.00078",
234
+ "metal.00083",
235
+ "metal.00085",
236
+ "metal.00087",
237
+ "metal.00088",
238
+ "pop.00000",
239
+ "pop.00001",
240
+ "pop.00013",
241
+ "pop.00014",
242
+ "pop.00043",
243
+ "pop.00063",
244
+ "pop.00064",
245
+ "pop.00065",
246
+ "pop.00066",
247
+ "pop.00069",
248
+ "pop.00070",
249
+ "pop.00071",
250
+ "pop.00072",
251
+ "pop.00073",
252
+ "pop.00074",
253
+ "pop.00075",
254
+ "pop.00076",
255
+ "pop.00077",
256
+ "pop.00078",
257
+ "pop.00079",
258
+ "pop.00082",
259
+ "pop.00088",
260
+ "pop.00089",
261
+ "pop.00090",
262
+ "pop.00091",
263
+ "pop.00092",
264
+ "pop.00093",
265
+ "pop.00094",
266
+ "pop.00095",
267
+ "pop.00096",
268
+ "reggae.00034",
269
+ "reggae.00035",
270
+ "reggae.00036",
271
+ "reggae.00037",
272
+ "reggae.00038",
273
+ "reggae.00039",
274
+ "reggae.00040",
275
+ "reggae.00046",
276
+ "reggae.00047",
277
+ "reggae.00048",
278
+ "reggae.00052",
279
+ "reggae.00053",
280
+ "reggae.00064",
281
+ "reggae.00065",
282
+ "reggae.00066",
283
+ "reggae.00067",
284
+ "reggae.00068",
285
+ "reggae.00071",
286
+ "reggae.00079",
287
+ "reggae.00082",
288
+ "reggae.00083",
289
+ "reggae.00084",
290
+ "reggae.00087",
291
+ "reggae.00088",
292
+ "reggae.00089",
293
+ "reggae.00090",
294
+ "rock.00010",
295
+ "rock.00011",
296
+ "rock.00012",
297
+ "rock.00013",
298
+ "rock.00014",
299
+ "rock.00015",
300
+ "rock.00027",
301
+ "rock.00028",
302
+ "rock.00029",
303
+ "rock.00030",
304
+ "rock.00031",
305
+ "rock.00032",
306
+ "rock.00033",
307
+ "rock.00034",
308
+ "rock.00035",
309
+ "rock.00036",
310
+ "rock.00037",
311
+ "rock.00039",
312
+ "rock.00040",
313
+ "rock.00041",
314
+ "rock.00042",
315
+ "rock.00043",
316
+ "rock.00044",
317
+ "rock.00045",
318
+ "rock.00046",
319
+ "rock.00047",
320
+ "rock.00048",
321
+ "rock.00086",
322
+ "rock.00087",
323
+ "rock.00088",
324
+ "rock.00089",
325
+ "rock.00090",
326
+ ]
327
+
328
+ filtered_train = [
329
+ "blues.00029",
330
+ "blues.00030",
331
+ "blues.00031",
332
+ "blues.00032",
333
+ "blues.00033",
334
+ "blues.00034",
335
+ "blues.00035",
336
+ "blues.00036",
337
+ "blues.00037",
338
+ "blues.00038",
339
+ "blues.00039",
340
+ "blues.00040",
341
+ "blues.00041",
342
+ "blues.00042",
343
+ "blues.00043",
344
+ "blues.00044",
345
+ "blues.00045",
346
+ "blues.00046",
347
+ "blues.00047",
348
+ "blues.00048",
349
+ "blues.00049",
350
+ "blues.00073",
351
+ "blues.00074",
352
+ "blues.00075",
353
+ "blues.00076",
354
+ "blues.00077",
355
+ "blues.00078",
356
+ "blues.00079",
357
+ "blues.00080",
358
+ "blues.00081",
359
+ "blues.00082",
360
+ "blues.00083",
361
+ "blues.00084",
362
+ "blues.00085",
363
+ "blues.00086",
364
+ "blues.00087",
365
+ "blues.00088",
366
+ "blues.00089",
367
+ "blues.00090",
368
+ "blues.00091",
369
+ "blues.00092",
370
+ "blues.00093",
371
+ "blues.00094",
372
+ "blues.00095",
373
+ "blues.00096",
374
+ "blues.00097",
375
+ "classical.00030",
376
+ "classical.00031",
377
+ "classical.00032",
378
+ "classical.00033",
379
+ "classical.00043",
380
+ "classical.00044",
381
+ "classical.00045",
382
+ "classical.00046",
383
+ "classical.00047",
384
+ "classical.00048",
385
+ "classical.00050",
386
+ "classical.00051",
387
+ "classical.00052",
388
+ "classical.00053",
389
+ "classical.00054",
390
+ "classical.00055",
391
+ "classical.00056",
392
+ "classical.00057",
393
+ "classical.00058",
394
+ "classical.00059",
395
+ "classical.00060",
396
+ "classical.00061",
397
+ "classical.00062",
398
+ "classical.00063",
399
+ "classical.00064",
400
+ "classical.00065",
401
+ "classical.00066",
402
+ "classical.00067",
403
+ "classical.00080",
404
+ "classical.00081",
405
+ "classical.00082",
406
+ "classical.00083",
407
+ "classical.00084",
408
+ "classical.00085",
409
+ "classical.00086",
410
+ "classical.00087",
411
+ "classical.00088",
412
+ "classical.00089",
413
+ "classical.00090",
414
+ "classical.00091",
415
+ "classical.00092",
416
+ "classical.00093",
417
+ "classical.00094",
418
+ "classical.00095",
419
+ "classical.00096",
420
+ "classical.00097",
421
+ "classical.00098",
422
+ "classical.00099",
423
+ "country.00019",
424
+ "country.00020",
425
+ "country.00021",
426
+ "country.00022",
427
+ "country.00023",
428
+ "country.00024",
429
+ "country.00025",
430
+ "country.00026",
431
+ "country.00028",
432
+ "country.00029",
433
+ "country.00065",
434
+ "country.00066",
435
+ "country.00067",
436
+ "country.00068",
437
+ "country.00069",
438
+ "country.00070",
439
+ "country.00071",
440
+ "country.00072",
441
+ "country.00073",
442
+ "country.00074",
443
+ "country.00075",
444
+ "country.00076",
445
+ "country.00077",
446
+ "country.00078",
447
+ "country.00079",
448
+ "country.00080",
449
+ "country.00081",
450
+ "country.00082",
451
+ "country.00083",
452
+ "country.00084",
453
+ "country.00085",
454
+ "country.00086",
455
+ "country.00087",
456
+ "country.00088",
457
+ "country.00089",
458
+ "country.00090",
459
+ "country.00091",
460
+ "country.00092",
461
+ "country.00093",
462
+ "country.00094",
463
+ "country.00095",
464
+ "country.00096",
465
+ "country.00097",
466
+ "country.00098",
467
+ "country.00099",
468
+ "disco.00005",
469
+ "disco.00015",
470
+ "disco.00016",
471
+ "disco.00017",
472
+ "disco.00018",
473
+ "disco.00019",
474
+ "disco.00020",
475
+ "disco.00022",
476
+ "disco.00023",
477
+ "disco.00024",
478
+ "disco.00025",
479
+ "disco.00026",
480
+ "disco.00027",
481
+ "disco.00028",
482
+ "disco.00029",
483
+ "disco.00030",
484
+ "disco.00031",
485
+ "disco.00032",
486
+ "disco.00033",
487
+ "disco.00034",
488
+ "disco.00035",
489
+ "disco.00036",
490
+ "disco.00037",
491
+ "disco.00039",
492
+ "disco.00040",
493
+ "disco.00041",
494
+ "disco.00042",
495
+ "disco.00043",
496
+ "disco.00044",
497
+ "disco.00045",
498
+ "disco.00047",
499
+ "disco.00049",
500
+ "disco.00053",
501
+ "disco.00054",
502
+ "disco.00056",
503
+ "disco.00057",
504
+ "disco.00059",
505
+ "disco.00061",
506
+ "disco.00070",
507
+ "disco.00073",
508
+ "disco.00074",
509
+ "disco.00089",
510
+ "hiphop.00002",
511
+ "hiphop.00003",
512
+ "hiphop.00004",
513
+ "hiphop.00005",
514
+ "hiphop.00006",
515
+ "hiphop.00007",
516
+ "hiphop.00008",
517
+ "hiphop.00009",
518
+ "hiphop.00010",
519
+ "hiphop.00011",
520
+ "hiphop.00012",
521
+ "hiphop.00013",
522
+ "hiphop.00014",
523
+ "hiphop.00015",
524
+ "hiphop.00016",
525
+ "hiphop.00017",
526
+ "hiphop.00018",
527
+ "hiphop.00019",
528
+ "hiphop.00020",
529
+ "hiphop.00021",
530
+ "hiphop.00022",
531
+ "hiphop.00023",
532
+ "hiphop.00024",
533
+ "hiphop.00025",
534
+ "hiphop.00028",
535
+ "hiphop.00029",
536
+ "hiphop.00031",
537
+ "hiphop.00032",
538
+ "hiphop.00033",
539
+ "hiphop.00034",
540
+ "hiphop.00035",
541
+ "hiphop.00036",
542
+ "hiphop.00037",
543
+ "hiphop.00038",
544
+ "hiphop.00041",
545
+ "hiphop.00042",
546
+ "hiphop.00055",
547
+ "hiphop.00056",
548
+ "hiphop.00057",
549
+ "hiphop.00058",
550
+ "hiphop.00059",
551
+ "hiphop.00060",
552
+ "hiphop.00061",
553
+ "hiphop.00077",
554
+ "hiphop.00078",
555
+ "hiphop.00079",
556
+ "hiphop.00080",
557
+ "jazz.00000",
558
+ "jazz.00001",
559
+ "jazz.00011",
560
+ "jazz.00012",
561
+ "jazz.00013",
562
+ "jazz.00014",
563
+ "jazz.00015",
564
+ "jazz.00016",
565
+ "jazz.00017",
566
+ "jazz.00018",
567
+ "jazz.00019",
568
+ "jazz.00020",
569
+ "jazz.00021",
570
+ "jazz.00022",
571
+ "jazz.00023",
572
+ "jazz.00024",
573
+ "jazz.00041",
574
+ "jazz.00047",
575
+ "jazz.00048",
576
+ "jazz.00049",
577
+ "jazz.00050",
578
+ "jazz.00051",
579
+ "jazz.00052",
580
+ "jazz.00053",
581
+ "jazz.00054",
582
+ "jazz.00055",
583
+ "jazz.00056",
584
+ "jazz.00057",
585
+ "jazz.00058",
586
+ "jazz.00059",
587
+ "jazz.00060",
588
+ "jazz.00061",
589
+ "jazz.00062",
590
+ "jazz.00063",
591
+ "jazz.00064",
592
+ "jazz.00065",
593
+ "jazz.00066",
594
+ "jazz.00067",
595
+ "jazz.00068",
596
+ "jazz.00069",
597
+ "jazz.00070",
598
+ "jazz.00071",
599
+ "jazz.00072",
600
+ "metal.00002",
601
+ "metal.00003",
602
+ "metal.00005",
603
+ "metal.00021",
604
+ "metal.00024",
605
+ "metal.00035",
606
+ "metal.00046",
607
+ "metal.00047",
608
+ "metal.00048",
609
+ "metal.00049",
610
+ "metal.00050",
611
+ "metal.00051",
612
+ "metal.00052",
613
+ "metal.00053",
614
+ "metal.00054",
615
+ "metal.00055",
616
+ "metal.00056",
617
+ "metal.00057",
618
+ "metal.00059",
619
+ "metal.00060",
620
+ "metal.00061",
621
+ "metal.00062",
622
+ "metal.00063",
623
+ "metal.00064",
624
+ "metal.00065",
625
+ "metal.00066",
626
+ "metal.00069",
627
+ "metal.00071",
628
+ "metal.00072",
629
+ "metal.00079",
630
+ "metal.00080",
631
+ "metal.00084",
632
+ "metal.00086",
633
+ "metal.00089",
634
+ "metal.00090",
635
+ "metal.00091",
636
+ "metal.00092",
637
+ "metal.00093",
638
+ "metal.00094",
639
+ "metal.00095",
640
+ "metal.00096",
641
+ "metal.00097",
642
+ "metal.00098",
643
+ "metal.00099",
644
+ "pop.00002",
645
+ "pop.00003",
646
+ "pop.00004",
647
+ "pop.00005",
648
+ "pop.00006",
649
+ "pop.00007",
650
+ "pop.00008",
651
+ "pop.00009",
652
+ "pop.00011",
653
+ "pop.00012",
654
+ "pop.00016",
655
+ "pop.00017",
656
+ "pop.00018",
657
+ "pop.00019",
658
+ "pop.00020",
659
+ "pop.00023",
660
+ "pop.00024",
661
+ "pop.00025",
662
+ "pop.00026",
663
+ "pop.00027",
664
+ "pop.00028",
665
+ "pop.00029",
666
+ "pop.00031",
667
+ "pop.00032",
668
+ "pop.00033",
669
+ "pop.00034",
670
+ "pop.00035",
671
+ "pop.00036",
672
+ "pop.00038",
673
+ "pop.00039",
674
+ "pop.00040",
675
+ "pop.00041",
676
+ "pop.00042",
677
+ "pop.00044",
678
+ "pop.00046",
679
+ "pop.00049",
680
+ "pop.00050",
681
+ "pop.00080",
682
+ "pop.00097",
683
+ "pop.00098",
684
+ "pop.00099",
685
+ "reggae.00000",
686
+ "reggae.00001",
687
+ "reggae.00002",
688
+ "reggae.00004",
689
+ "reggae.00006",
690
+ "reggae.00009",
691
+ "reggae.00011",
692
+ "reggae.00012",
693
+ "reggae.00014",
694
+ "reggae.00015",
695
+ "reggae.00016",
696
+ "reggae.00017",
697
+ "reggae.00018",
698
+ "reggae.00019",
699
+ "reggae.00020",
700
+ "reggae.00021",
701
+ "reggae.00022",
702
+ "reggae.00023",
703
+ "reggae.00024",
704
+ "reggae.00025",
705
+ "reggae.00026",
706
+ "reggae.00027",
707
+ "reggae.00028",
708
+ "reggae.00029",
709
+ "reggae.00030",
710
+ "reggae.00031",
711
+ "reggae.00032",
712
+ "reggae.00042",
713
+ "reggae.00043",
714
+ "reggae.00044",
715
+ "reggae.00045",
716
+ "reggae.00049",
717
+ "reggae.00050",
718
+ "reggae.00051",
719
+ "reggae.00054",
720
+ "reggae.00055",
721
+ "reggae.00056",
722
+ "reggae.00057",
723
+ "reggae.00058",
724
+ "reggae.00059",
725
+ "reggae.00060",
726
+ "reggae.00063",
727
+ "reggae.00069",
728
+ "rock.00000",
729
+ "rock.00001",
730
+ "rock.00002",
731
+ "rock.00003",
732
+ "rock.00004",
733
+ "rock.00005",
734
+ "rock.00006",
735
+ "rock.00007",
736
+ "rock.00008",
737
+ "rock.00009",
738
+ "rock.00016",
739
+ "rock.00017",
740
+ "rock.00018",
741
+ "rock.00019",
742
+ "rock.00020",
743
+ "rock.00021",
744
+ "rock.00022",
745
+ "rock.00023",
746
+ "rock.00024",
747
+ "rock.00025",
748
+ "rock.00026",
749
+ "rock.00057",
750
+ "rock.00058",
751
+ "rock.00059",
752
+ "rock.00060",
753
+ "rock.00061",
754
+ "rock.00062",
755
+ "rock.00063",
756
+ "rock.00064",
757
+ "rock.00065",
758
+ "rock.00066",
759
+ "rock.00067",
760
+ "rock.00068",
761
+ "rock.00069",
762
+ "rock.00070",
763
+ "rock.00091",
764
+ "rock.00092",
765
+ "rock.00093",
766
+ "rock.00094",
767
+ "rock.00095",
768
+ "rock.00096",
769
+ "rock.00097",
770
+ "rock.00098",
771
+ "rock.00099",
772
+ ]
773
+
774
+ filtered_valid = [
775
+ "blues.00000",
776
+ "blues.00001",
777
+ "blues.00002",
778
+ "blues.00003",
779
+ "blues.00004",
780
+ "blues.00005",
781
+ "blues.00006",
782
+ "blues.00007",
783
+ "blues.00008",
784
+ "blues.00009",
785
+ "blues.00010",
786
+ "blues.00011",
787
+ "blues.00050",
788
+ "blues.00051",
789
+ "blues.00052",
790
+ "blues.00053",
791
+ "blues.00054",
792
+ "blues.00055",
793
+ "blues.00056",
794
+ "blues.00057",
795
+ "blues.00058",
796
+ "blues.00059",
797
+ "blues.00060",
798
+ "classical.00000",
799
+ "classical.00001",
800
+ "classical.00002",
801
+ "classical.00003",
802
+ "classical.00004",
803
+ "classical.00005",
804
+ "classical.00006",
805
+ "classical.00007",
806
+ "classical.00008",
807
+ "classical.00009",
808
+ "classical.00010",
809
+ "classical.00068",
810
+ "classical.00069",
811
+ "classical.00070",
812
+ "classical.00071",
813
+ "classical.00072",
814
+ "classical.00073",
815
+ "classical.00074",
816
+ "classical.00075",
817
+ "classical.00076",
818
+ "country.00000",
819
+ "country.00001",
820
+ "country.00002",
821
+ "country.00003",
822
+ "country.00004",
823
+ "country.00005",
824
+ "country.00006",
825
+ "country.00007",
826
+ "country.00009",
827
+ "country.00010",
828
+ "country.00011",
829
+ "country.00012",
830
+ "country.00013",
831
+ "country.00014",
832
+ "country.00015",
833
+ "country.00016",
834
+ "country.00017",
835
+ "country.00018",
836
+ "country.00027",
837
+ "country.00041",
838
+ "country.00042",
839
+ "country.00045",
840
+ "country.00049",
841
+ "disco.00000",
842
+ "disco.00002",
843
+ "disco.00003",
844
+ "disco.00004",
845
+ "disco.00006",
846
+ "disco.00007",
847
+ "disco.00008",
848
+ "disco.00009",
849
+ "disco.00010",
850
+ "disco.00011",
851
+ "disco.00012",
852
+ "disco.00013",
853
+ "disco.00014",
854
+ "disco.00046",
855
+ "disco.00048",
856
+ "disco.00052",
857
+ "disco.00067",
858
+ "disco.00068",
859
+ "disco.00072",
860
+ "disco.00075",
861
+ "disco.00090",
862
+ "disco.00095",
863
+ "hiphop.00081",
864
+ "hiphop.00082",
865
+ "hiphop.00083",
866
+ "hiphop.00084",
867
+ "hiphop.00085",
868
+ "hiphop.00086",
869
+ "hiphop.00087",
870
+ "hiphop.00088",
871
+ "hiphop.00089",
872
+ "hiphop.00090",
873
+ "hiphop.00091",
874
+ "hiphop.00092",
875
+ "hiphop.00093",
876
+ "hiphop.00094",
877
+ "hiphop.00095",
878
+ "hiphop.00096",
879
+ "hiphop.00097",
880
+ "hiphop.00098",
881
+ "jazz.00002",
882
+ "jazz.00003",
883
+ "jazz.00004",
884
+ "jazz.00005",
885
+ "jazz.00006",
886
+ "jazz.00007",
887
+ "jazz.00008",
888
+ "jazz.00009",
889
+ "jazz.00010",
890
+ "jazz.00025",
891
+ "jazz.00026",
892
+ "jazz.00027",
893
+ "jazz.00028",
894
+ "jazz.00029",
895
+ "jazz.00030",
896
+ "jazz.00031",
897
+ "jazz.00032",
898
+ "metal.00000",
899
+ "metal.00001",
900
+ "metal.00006",
901
+ "metal.00007",
902
+ "metal.00008",
903
+ "metal.00009",
904
+ "metal.00010",
905
+ "metal.00011",
906
+ "metal.00016",
907
+ "metal.00017",
908
+ "metal.00018",
909
+ "metal.00019",
910
+ "metal.00020",
911
+ "metal.00036",
912
+ "metal.00037",
913
+ "metal.00068",
914
+ "metal.00076",
915
+ "metal.00077",
916
+ "metal.00081",
917
+ "metal.00082",
918
+ "pop.00010",
919
+ "pop.00053",
920
+ "pop.00055",
921
+ "pop.00058",
922
+ "pop.00059",
923
+ "pop.00060",
924
+ "pop.00061",
925
+ "pop.00062",
926
+ "pop.00081",
927
+ "pop.00083",
928
+ "pop.00084",
929
+ "pop.00085",
930
+ "pop.00086",
931
+ "reggae.00061",
932
+ "reggae.00062",
933
+ "reggae.00070",
934
+ "reggae.00072",
935
+ "reggae.00074",
936
+ "reggae.00076",
937
+ "reggae.00077",
938
+ "reggae.00078",
939
+ "reggae.00085",
940
+ "reggae.00092",
941
+ "reggae.00093",
942
+ "reggae.00094",
943
+ "reggae.00095",
944
+ "reggae.00096",
945
+ "reggae.00097",
946
+ "reggae.00098",
947
+ "reggae.00099",
948
+ "rock.00038",
949
+ "rock.00049",
950
+ "rock.00050",
951
+ "rock.00051",
952
+ "rock.00052",
953
+ "rock.00053",
954
+ "rock.00054",
955
+ "rock.00055",
956
+ "rock.00056",
957
+ "rock.00071",
958
+ "rock.00072",
959
+ "rock.00073",
960
+ "rock.00074",
961
+ "rock.00075",
962
+ "rock.00076",
963
+ "rock.00077",
964
+ "rock.00078",
965
+ "rock.00079",
966
+ "rock.00080",
967
+ "rock.00081",
968
+ "rock.00082",
969
+ "rock.00083",
970
+ "rock.00084",
971
+ "rock.00085",
972
+ ]
973
+
974
+
975
URL = "http://opihi.cs.uvic.ca/sound/genres.tar.gz"
FOLDER_IN_ARCHIVE = "genres"
_CHECKSUMS = {
    "http://opihi.cs.uvic.ca/sound/genres.tar.gz": "24347e0223d2ba798e0a558c4c172d9d4a19c00bb7963fe055d183dadb4ef2c6"
}


def load_gtzan_item(fileid: str, path: str, ext_audio: str) -> Tuple[Tensor, int, str]:
    """Load one GTZAN clip.

    Args:
        fileid (str): File identifier of the form ``label.id``, e.g. ``blues.00078``.
        path (str): Root directory of the extracted dataset.
        ext_audio (str): Audio file extension (e.g. ``".wav"``).

    Returns:
        (Tensor, int, str): ``(waveform, sample_rate, label)``.
    """
    # Filenames are of the form label.id, e.g. blues.00078; ``partition``
    # keeps everything before the first dot as the genre label.
    label, _, _ = fileid.partition(".")

    # Audio files live in a per-genre subdirectory named after the label.
    file_audio = os.path.join(path, label, fileid + ext_audio)
    waveform, sample_rate = torchaudio.load(file_audio)

    return waveform, sample_rate, label


class GTZAN(Dataset):
    """Create a Dataset for *GTZAN* [:footcite:`tzanetakis_essl_cook_2001`].

    Note:
        Please see http://marsyas.info/downloads/datasets.html if you are planning to use
        this dataset to publish results.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional): The URL to download the dataset from.
            (default: ``"http://opihi.cs.uvic.ca/sound/genres.tar.gz"``)
        folder_in_archive (str, optional): The top-level directory of the dataset.
            (default: ``"genres"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
        subset (str or None, optional): Which subset of the dataset to use.
            One of ``"training"``, ``"validation"``, ``"testing"`` or ``None``.
            If ``None``, the entire dataset is used. (default: ``None``).
    """

    _ext_audio = ".wav"

    def __init__(
        self,
        root: Union[str, Path],
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
        subset: Optional[str] = None,
    ) -> None:

        # Get string representation of 'root' in case Path object is passed
        root = os.fspath(root)

        self.root = root
        self.url = url
        self.folder_in_archive = folder_in_archive
        self.download = download
        self.subset = subset

        assert subset is None or subset in ["training", "validation", "testing"], (
            "When `subset` not None, it must take a value from " + "{'training', 'validation', 'testing'}."
        )

        archive = os.path.basename(url)
        archive = os.path.join(root, archive)
        self._path = os.path.join(root, folder_in_archive)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _CHECKSUMS.get(url, None)
                    download_url_to_file(url, archive, hash_prefix=checksum)
                extract_archive(archive)

        if not os.path.isdir(self._path):
            raise RuntimeError("Dataset not found. Please use `download=True` to download it.")

        if self.subset is None:
            # Check every subdirectory under dataset root that has the same
            # name as a GTZAN genre (e.g. `root'/blues/, `root'/rock, etc.).
            # This lets users remove or move around song files; only names of
            # the canonical form `genre`.`5 digit number`.wav are accepted.
            self._walker = []

            root = os.path.expanduser(self._path)

            for directory in gtzan_genres:
                fulldir = os.path.join(root, directory)

                if not os.path.exists(fulldir):
                    continue

                songs_in_genre = os.listdir(fulldir)
                songs_in_genre.sort()
                for fname in songs_in_genre:
                    name, ext = os.path.splitext(fname)
                    if ext.lower() == ".wav" and "." in name:
                        # Original code unpacked ``name.split(".")`` into two
                        # names, which raised ValueError for filenames with
                        # more than one dot; skip such nonconforming files
                        # instead of crashing.
                        parts = name.split(".")
                        if len(parts) != 2:
                            continue
                        genre, num = parts
                        if genre in gtzan_genres and len(num) == 5 and num.isdigit():
                            self._walker.append(name)
        else:
            # The filtered_* lists give a deduplicated, standard partition.
            if self.subset == "training":
                self._walker = filtered_train
            elif self.subset == "validation":
                self._walker = filtered_valid
            elif self.subset == "testing":
                self._walker = filtered_test

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, str): ``(waveform, sample_rate, label)``
        """
        fileid = self._walker[n]
        waveform, sample_rate, label = load_gtzan_item(fileid, self._path, self._ext_audio)
        return waveform, sample_rate, label

    def __len__(self) -> int:
        """Return the number of clips in the selected subset."""
        return len(self._walker)
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librilight_limited.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import List, Tuple, Union
4
+
5
+ from torch import Tensor
6
+ from torch.hub import download_url_to_file
7
+ from torch.utils.data import Dataset
8
+ from torchaudio.datasets.librispeech import load_librispeech_item
9
+ from torchaudio.datasets.utils import extract_archive
10
+
11
+
12
# Name of the extracted top-level directory (and basename of the archive).
_ARCHIVE_NAME = "librispeech_finetuning"
# Official download location of the LibriLight fine-tuning (supervised) subset.
_URL = "https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz"
# SHA256 prefix used to verify the downloaded archive.
_CHECKSUM = "5d1efdc777b548194d7e09ba89126e2188026df9fd57aa57eb14408d2b2342af"
15
+
16
+
17
+ def _get_fileids_paths(path, subset, _ext_audio) -> List[Tuple[str, str]]:
18
+ """Get the file names and the corresponding file paths without `speaker_id`
19
+ and `chapter_id` directories.
20
+ The format of path is like:
21
+ {root}/{_ARCHIVE_NAME}/1h/[0-5]/[clean, other] or
22
+ {root}/{_ARCHIVE_NAME}/9h/[clean, other]
23
+ """
24
+ if subset == "10min":
25
+ files_paths = [
26
+ (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem))
27
+ for p in Path(path).glob("1h/0/*/*/*/*" + _ext_audio)
28
+ ]
29
+ elif subset in ["1h", "10h"]:
30
+ files_paths = [
31
+ (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem))
32
+ for p in Path(path).glob("1h/*/*/*/*/*" + _ext_audio)
33
+ ]
34
+ if subset == "10h":
35
+ files_paths += [
36
+ (os.path.join(os.path.dirname(p), "..", ".."), str(p.stem))
37
+ for p in Path(path).glob("9h/*/*/*/*" + _ext_audio)
38
+ ]
39
+ else:
40
+ raise ValueError(f"Unsupported subset value. Found {subset}.")
41
+ files_paths = sorted(files_paths, key=lambda x: x[0] + x[1])
42
+ return files_paths
43
+
44
+
45
class LibriLightLimited(Dataset):
    """Create a Dataset for LibriLightLimited, which is the supervised subset of
    LibriLight dataset.

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        subset (str, optional): The subset to use. Options: [``10min``, ``1h``, ``10h``]
            (Default: ``10min``).
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).

    Raises:
        ValueError: if ``subset`` is not one of the supported values.
        RuntimeError: if the dataset is not found and ``download`` is ``False``.
    """

    _ext_txt = ".trans.txt"
    _ext_audio = ".flac"

    def __init__(
        self,
        root: Union[str, Path],
        subset: str = "10min",
        download: bool = False,
    ) -> None:
        # Validate with an explicit ValueError rather than `assert`: asserts are
        # stripped under `python -O`, and the sibling dataset classes in this
        # package raise ValueError for invalid subset/url arguments.
        if subset not in ["10min", "1h", "10h"]:
            raise ValueError(f"`subset` must be one of ['10min', '1h', '10h']. Found {subset}.")

        root = os.fspath(root)
        self._path = os.path.join(root, _ARCHIVE_NAME)
        archive = os.path.join(root, f"{_ARCHIVE_NAME}.tgz")
        if not os.path.isdir(self._path):
            if not download:
                raise RuntimeError("Dataset not found. Please use `download=True` to download")
            if not os.path.isfile(archive):
                # Verify the download against the known SHA256 prefix.
                download_url_to_file(_URL, archive, hash_prefix=_CHECKSUM)
            extract_archive(archive)
        self._fileids_paths = _get_fileids_paths(self._path, subset, self._ext_audio)

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
        """Load the n-th sample from the dataset.
        Args:
            n (int): The index of the sample to be loaded
        Returns:
            (Tensor, int, str, int, int, int):
                ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
        """
        file_path, fileid = self._fileids_paths[n]
        return load_librispeech_item(fileid, file_path, self._ext_audio, self._ext_txt)

    def __len__(self) -> int:
        return len(self._fileids_paths)
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librimix.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import List, Tuple, Union
3
+
4
+ import torch
5
+ import torchaudio
6
+ from torch.utils.data import Dataset
7
+
8
# One sample: (sample_rate, mixture waveform, per-speaker source waveforms).
SampleType = Tuple[int, torch.Tensor, List[torch.Tensor]]


class LibriMix(Dataset):
    r"""Create the *LibriMix* [:footcite:`cosentino2020librimix`] dataset.

    Args:
        root (str or Path): The path to the directory where the directory ``Libri2Mix`` or
            ``Libri3Mix`` is stored.
        subset (str, optional): The subset to use. Options: [``train-360``, ``train-100``,
            ``dev``, and ``test``] (Default: ``train-360``).
        num_speakers (int, optional): The number of speakers, which determines the directories
            to traverse. The Dataset will traverse ``s1`` to ``sN`` directories to collect
            N source audios. (Default: 2)
        sample_rate (int, optional): sample rate of audio files. The ``sample_rate`` determines
            which subdirectory the audio are fetched. If any of the audio has a different sample
            rate, raises ``ValueError``. Options: [8000, 16000] (Default: 8000)
        task (str, optional): the task of LibriMix.
            Options: [``enh_single``, ``enh_both``, ``sep_clean``, ``sep_noisy``]
            (Default: ``sep_clean``)

    Note:
        The LibriMix dataset needs to be manually generated. Please check https://github.com/JorisCos/LibriMix
    """

    def __init__(
        self,
        root: Union[str, Path],
        subset: str = "train-360",
        num_speakers: int = 2,
        sample_rate: int = 8000,
        task: str = "sep_clean",
    ):
        self.root = Path(root) / f"Libri{num_speakers}Mix"
        # Sample rate picks the pre-generated subdirectory to read from.
        if sample_rate == 8000:
            subdir = "wav8k/min"
        elif sample_rate == 16000:
            subdir = "wav16k/min"
        else:
            raise ValueError(f"Unsupported sample rate. Found {sample_rate}.")
        self.root = self.root / subdir / subset
        self.sample_rate = sample_rate
        self.task = task
        # e.g. task "sep_clean" selects the "mix_clean" mixture directory.
        self.mix_dir = (self.root / f"mix_{task.split('_')[1]}").resolve()
        self.src_dirs = [(self.root / f"s{i+1}").resolve() for i in range(num_speakers)]
        self.files = sorted(p.name for p in self.mix_dir.glob("*wav"))

    def _load_audio(self, path) -> torch.Tensor:
        # Load one waveform and reject files whose rate differs from the
        # rate this dataset instance was constructed for.
        waveform, sample_rate = torchaudio.load(path)
        if sample_rate != self.sample_rate:
            raise ValueError(
                f"The dataset contains audio file of sample rate {sample_rate}, "
                f"but the requested sample rate is {self.sample_rate}."
            )
        return waveform

    def _load_sample(self, filename) -> SampleType:
        mixed = self._load_audio(str(self.mix_dir / filename))
        srcs = []
        # Each source waveform must match the mixture's shape exactly.
        for idx, src_dir in enumerate(self.src_dirs):
            src = self._load_audio(str(src_dir / filename))
            if mixed.shape != src.shape:
                raise ValueError(f"Different waveform shapes. mixed: {mixed.shape}, src[{idx}]: {src.shape}")
            srcs.append(src)
        return self.sample_rate, mixed, srcs

    def __len__(self) -> int:
        return len(self.files)

    def __getitem__(self, key: int) -> SampleType:
        """Load the n-th sample from the dataset.
        Args:
            key (int): The index of the sample to be loaded
        Returns:
            (int, Tensor, List[Tensor]): ``(sample_rate, mix_waveform, list_of_source_waveforms)``
        """
        return self._load_sample(self.files[key])
my_container_sandbox/workspace/anaconda3/lib/python3.8/site-packages/torchaudio/datasets/librispeech.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Tuple, Union
4
+
5
+ import torchaudio
6
+ from torch import Tensor
7
+ from torch.hub import download_url_to_file
8
+ from torch.utils.data import Dataset
9
+ from torchaudio.datasets.utils import extract_archive
10
+
11
# Default subset name (doubles as the archive basename on openslr.org).
URL = "train-clean-100"
# Default top-level directory produced by extracting any LibriSpeech archive.
FOLDER_IN_ARCHIVE = "LibriSpeech"
# All subset names accepted by LIBRISPEECH's ``url`` argument.
_DATA_SUBSETS = [
    "dev-clean",
    "dev-other",
    "test-clean",
    "test-other",
    "train-clean-100",
    "train-clean-360",
    "train-other-500",
]
# SHA256 prefixes keyed by full download URL, used to verify archives.
_CHECKSUMS = {
    "http://www.openslr.org/resources/12/dev-clean.tar.gz": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3",  # noqa: E501
    "http://www.openslr.org/resources/12/dev-other.tar.gz": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365",  # noqa: E501
    "http://www.openslr.org/resources/12/test-clean.tar.gz": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23",  # noqa: E501
    "http://www.openslr.org/resources/12/test-other.tar.gz": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29",  # noqa: E501
    "http://www.openslr.org/resources/12/train-clean-100.tar.gz": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2",  # noqa: E501
    "http://www.openslr.org/resources/12/train-clean-360.tar.gz": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf",  # noqa: E501
    "http://www.openslr.org/resources/12/train-other-500.tar.gz": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2",  # noqa: E501
}
31
+
32
+
33
def download_librispeech(root, url):
    """Download a LibriSpeech subset archive into ``root`` and extract it.

    Args:
        root (str): directory in which the archive is stored/extracted.
        url (str): subset name, e.g. ``"train-clean-100"``.
    """
    base_url = "http://www.openslr.org/resources/12/"
    ext_archive = ".tar.gz"

    filename = url + ext_archive
    archive = os.path.join(root, filename)
    # Build the URL by string concatenation, NOT os.path.join: path joining is
    # platform-dependent (it would insert backslashes on Windows) and must not
    # be used for URLs. ``base_url`` already ends with "/".
    download_url = base_url + filename
    if not os.path.isfile(archive):
        # Skip the download if a previously fetched archive is present.
        checksum = _CHECKSUMS.get(download_url, None)
        download_url_to_file(download_url, archive, hash_prefix=checksum)
    extract_archive(archive)
44
+
45
+
46
def load_librispeech_item(
    fileid: str, path: str, ext_audio: str, ext_txt: str
) -> Tuple[Tensor, int, str, int, int, int]:
    """Load one LibriSpeech utterance: the waveform plus its transcript line.

    ``fileid`` has the form ``{speaker}-{chapter}-{utterance}``; audio lives at
    ``{path}/{speaker}/{chapter}/`` and the chapter transcript file holds one
    ``"<fileid> <text>"`` line per utterance.
    """
    speaker_id, chapter_id, utterance_id = fileid.split("-")
    chapter_dir = os.path.join(path, speaker_id, chapter_id)

    # Load audio
    fileid_audio = f"{speaker_id}-{chapter_id}-{utterance_id}"
    waveform, sample_rate = torchaudio.load(os.path.join(chapter_dir, fileid_audio + ext_audio))

    # Scan the chapter transcript file for this utterance's line.
    transcript = None
    file_text = os.path.join(chapter_dir, f"{speaker_id}-{chapter_id}{ext_txt}")
    with open(file_text) as ft:
        for line in ft:
            fileid_text, candidate = line.strip().split(" ", 1)
            if fileid_text == fileid_audio:
                transcript = candidate
                break
    if transcript is None:
        # Translation not found
        raise FileNotFoundError(f"Translation not found for {fileid_audio}")

    return (
        waveform,
        sample_rate,
        transcript,
        int(speaker_id),
        int(chapter_id),
        int(utterance_id),
    )
77
+
78
+
79
class LIBRISPEECH(Dataset):
    """Create a Dataset for *LibriSpeech* [:footcite:`7178964`].

    Args:
        root (str or Path): Path to the directory where the dataset is found or downloaded.
        url (str, optional): The URL to download the dataset from,
            or the type of the dataset to dowload.
            Allowed type values are ``"dev-clean"``, ``"dev-other"``, ``"test-clean"``,
            ``"test-other"``, ``"train-clean-100"``, ``"train-clean-360"`` and
            ``"train-other-500"``. (default: ``"train-clean-100"``)
        folder_in_archive (str, optional):
            The top-level directory of the dataset. (default: ``"LibriSpeech"``)
        download (bool, optional):
            Whether to download the dataset if it is not found at root path. (default: ``False``).
    """

    _ext_txt = ".trans.txt"
    _ext_audio = ".flac"

    def __init__(
        self,
        root: Union[str, Path],
        url: str = URL,
        folder_in_archive: str = FOLDER_IN_ARCHIVE,
        download: bool = False,
    ) -> None:
        # Reject unknown subset names up front.
        if url not in _DATA_SUBSETS:
            raise ValueError(f"Invalid url '{url}' given; please provide one of {_DATA_SUBSETS}.")

        root = os.fspath(root)
        self._path = os.path.join(root, folder_in_archive, url)

        if not os.path.isdir(self._path):
            if not download:
                raise RuntimeError(
                    f"Dataset not found at {self._path}. Please set `download=True` to download the dataset."
                )
            download_librispeech(root, url)

        # Index every audio file as its stem: "{speaker}-{chapter}-{utterance}".
        self._walker = sorted(str(p.stem) for p in Path(self._path).glob("*/*/*" + self._ext_audio))

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, int, int, int]:
        """Load the n-th sample from the dataset.

        Args:
            n (int): The index of the sample to be loaded

        Returns:
            (Tensor, int, str, int, int, int):
                ``(waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id)``
        """
        return load_librispeech_item(self._walker[n], self._path, self._ext_audio, self._ext_txt)

    def __len__(self) -> int:
        return len(self._walker)