diff --git a/.gitattributes b/.gitattributes index 98ff606500992d94c2f3bc5d0001f221f00df3a0..d004eddbe7e056f34c44cea3f7ddae3736ab8049 100644 --- a/.gitattributes +++ b/.gitattributes @@ -295,3 +295,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/lib/ .venv/bin/py-spy filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text .venv/lib/python3.11/site-packages/jsonschema/tests/__pycache__/test_validators.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text +.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d89adc41cd9f6d07eb634aafa30a46e320cd4e4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2149c4becc6f404e8c4da54fe283de4ce388046e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/const_vs_enum.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..968c85585f6284fa3177ec171f9ec2e9718495a4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/contains.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7698b2933a7155de1d2fd8224071a89506e9c520 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/issue232.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3cdaceefc2eb3936cf1af562d263f87c9c59dd0b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/json_schema_test_suite.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd0218e16d6b82d9496618f9b0ce1c59c0ab2f07 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/nested_schemas.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..204eeacc0577b0d7a27b1540e048f22f68861498 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/subcomponents.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8140a856205afe10b951ad9b560e622827764035 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/unused_registry.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ac47c100dd7006c6797455f9a5bf5582c7b89be Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_applicator_schemas.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a759251276176df00c06bc575ba6b4a95fec259 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/useless_keywords.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..70fadd4046cd1f5cec7ab21c6f680f37dd746943 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/__pycache__/validator_creation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py new file mode 100644 index 0000000000000000000000000000000000000000..c6fecd10f6d8b845c675be9c19e9b504b08d30b9 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/const_vs_enum.py @@ -0,0 +1,30 @@ +""" +A benchmark for comparing equivalent validation of `const` and `enum`. 
+""" + +from pyperf import Runner + +from jsonschema import Draft202012Validator + +value = [37] * 100 +const_schema = {"const": list(value)} +enum_schema = {"enum": [list(value)]} + +valid = list(value) +invalid = [*valid, 73] + +const = Draft202012Validator(const_schema) +enum = Draft202012Validator(enum_schema) + +assert const.is_valid(valid) +assert enum.is_valid(valid) +assert not const.is_valid(invalid) +assert not enum.is_valid(invalid) + + +if __name__ == "__main__": + runner = Runner() + runner.bench_func("const valid", lambda: const.is_valid(valid)) + runner.bench_func("const invalid", lambda: const.is_valid(invalid)) + runner.bench_func("enum valid", lambda: enum.is_valid(valid)) + runner.bench_func("enum invalid", lambda: enum.is_valid(invalid)) diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py new file mode 100644 index 0000000000000000000000000000000000000000..739cd044cceb807b4029dca9447e954214a24809 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/contains.py @@ -0,0 +1,28 @@ +""" +A benchmark for validation of the `contains` keyword. +""" + +from pyperf import Runner + +from jsonschema import Draft202012Validator + +schema = { + "type": "array", + "contains": {"const": 37}, +} +validator = Draft202012Validator(schema) + +size = 1000 +beginning = [37] + [0] * (size - 1) +middle = [0] * (size // 2) + [37] + [0] * (size // 2) +end = [0] * (size - 1) + [37] +invalid = [0] * size + + +if __name__ == "__main__": + runner = Runner() + runner.bench_func("baseline", lambda: validator.is_valid([])) + runner.bench_func("beginning", lambda: validator.is_valid(beginning)) + runner.bench_func("middle", lambda: validator.is_valid(middle)) + runner.bench_func("end", lambda: validator.is_valid(end)) + runner.bench_func("invalid", lambda: validator.is_valid(invalid)) diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py new file mode 100644 index 0000000000000000000000000000000000000000..905fb6a3b88faf56e3288f7eb5053172f97abe8b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/json_schema_test_suite.py @@ -0,0 +1,12 @@ +""" +A performance benchmark using the official test suite. + +This benchmarks jsonschema using every valid example in the +JSON-Schema-Test-Suite. It will take some time to complete. 
+""" +from pyperf import Runner + +from jsonschema.tests._suite import Suite + +if __name__ == "__main__": + Suite().benchmark(runner=Runner()) diff --git a/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py new file mode 100644 index 0000000000000000000000000000000000000000..4baeb3a31641a027496732a6f10e200346551209 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/jsonschema/benchmarks/validator_creation.py @@ -0,0 +1,14 @@ +from pyperf import Runner + +from jsonschema import Draft202012Validator + +schema = { + "type": "array", + "minLength": 1, + "maxLength": 1, + "items": {"type": "integer"}, +} + + +if __name__ == "__main__": + Runner().bench_func("validator creation", Draft202012Validator, schema) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/__init__.py b/.venv/lib/python3.11/site-packages/torchaudio/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90b411eae1f27ff5dc8adf4917c1ea8725cd2c9a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/__init__.py @@ -0,0 +1,53 @@ +# Initialize extension and backend first +from . import _extension # noqa # usort: skip +from ._backend import ( # noqa # usort: skip + AudioMetaData, + get_audio_backend, + info, + list_audio_backends, + load, + save, + set_audio_backend, +) + +from . import ( # noqa: F401 + compliance, + datasets, + functional, + io, + kaldi_io, + models, + pipelines, + sox_effects, + transforms, + utils, +) + +# For BC +from . import backend # noqa # usort: skip + +try: + from .version import __version__, git_version # noqa: F401 +except ImportError: + pass + + +__all__ = [ + "AudioMetaData", + "load", + "info", + "save", + "io", + "compliance", + "datasets", + "functional", + "models", + "pipelines", + "kaldi_io", + "utils", + "sox_effects", + "transforms", + "list_audio_backends", + "get_audio_backend", + "set_audio_backend", +] diff --git a/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f8d0511f85e8cdbd3ef955e9d038916c559608a2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c90b5b872a02eff71552363c8392f5a46e20dff Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/kaldi_io.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8a0a9cad78fe8634d1ae59988b5ea268af5b47d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/__pycache__/version.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..27337013ff12edbb9a6b18608a555e5c33031499 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__init__.py @@ -0,0 +1,61 @@ +from typing import List, Optional + +from torchaudio._internal.module_utils import deprecated + +from . import utils +from .common import AudioMetaData + +__all__ = [ + "AudioMetaData", + "load", + "info", + "save", + "list_audio_backends", + "get_audio_backend", + "set_audio_backend", +] + + +info = utils.get_info_func() +load = utils.get_load_func() +save = utils.get_save_func() + + +def list_audio_backends() -> List[str]: + """List available backends + + Returns: + list of str: The list of available backends. + + The possible values are; ``"ffmpeg"``, ``"sox"`` and ``"soundfile"``. + """ + + return list(utils.get_available_backends().keys()) + + +# Temporary until global backend is removed +@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.") +def get_audio_backend() -> Optional[str]: + """Get the name of the current global backend + + Returns: + str or None: + If dispatcher mode is enabled, returns ``None`` otherwise, + the name of current backend or ``None`` (no backend is set). + """ + return None + + +# Temporary until global backend is removed +@deprecated("With dispatcher enabled, this function is no-op. You can remove the function call.") +def set_audio_backend(backend: Optional[str]): # noqa + """Set the global backend. + + This is a no-op when dispatcher mode is enabled. + + Args: + backend (str or None): Name of the backend. + One of ``"sox_io"`` or ``"soundfile"`` based on availability + of the system. If ``None`` is provided the current backend is unassigned. + """ + pass diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7495341e51e05574c8de6e3cfa4b6f0c4807452a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d035c9a9900a38948b83d9f0d4ab6b6592845092 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/backend.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2f1be477a83c09fb96987fa3986446d767b6dcc4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/common.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..41e56993c72fe45d86a1612cac61cf95af4a3f3f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/ffmpeg.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..98591841949d441fd4e25bbcf7375a086fd4f21e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba65c4c313f4b5dc6bf12315fcc0ef43d2057d0a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/soundfile_backend.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..894f3b9033ebedfd016b03fbab0f0befce01cdc9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/sox.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..95ce0d6a7228e5a176cd5bd0152852f1bec8d8c0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/_backend/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py b/.venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py new file mode 100644 index 0000000000000000000000000000000000000000..579340962c42a2210d5d7a5a41a1886b5fb62045 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/_backend/backend.py @@ -0,0 +1,53 @@ +import os +from abc import ABC, abstractmethod +from typing import BinaryIO, Optional, Tuple, Union + +from torch import Tensor +from torchaudio.io import CodecConfig + +from .common import AudioMetaData + + +class Backend(ABC): + @staticmethod + @abstractmethod + def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: + raise NotImplementedError + + @staticmethod + @abstractmethod + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + ) -> Tuple[Tensor, int]: + raise NotImplementedError + + @staticmethod + @abstractmethod + def save( + uri: Union[BinaryIO, str, os.PathLike], + src: Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + compression: Optional[Union[CodecConfig, float, int]] = None, + ) -> None: + raise NotImplementedError + + @staticmethod + @abstractmethod + def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: + raise NotImplementedError + + @staticmethod + @abstractmethod + def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: + raise NotImplementedError diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/common.py 
b/.venv/lib/python3.11/site-packages/torchaudio/_backend/common.py new file mode 100644 index 0000000000000000000000000000000000000000..804b18d461b93d4a371e02d9cde902b59aba3111 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/_backend/common.py @@ -0,0 +1,52 @@ +class AudioMetaData: + """AudioMetaData() + + Return type of ``torchaudio.info`` function. + + :ivar int sample_rate: Sample rate + :ivar int num_frames: The number of frames + :ivar int num_channels: The number of channels + :ivar int bits_per_sample: The number of bits per sample. This is 0 for lossy formats, + or when it cannot be accurately inferred. + :ivar str encoding: Audio encoding + The values encoding can take are one of the following: + + * ``PCM_S``: Signed integer linear PCM + * ``PCM_U``: Unsigned integer linear PCM + * ``PCM_F``: Floating point linear PCM + * ``FLAC``: Flac, Free Lossless Audio Codec + * ``ULAW``: Mu-law + * ``ALAW``: A-law + * ``MP3`` : MP3, MPEG-1 Audio Layer III + * ``VORBIS``: OGG Vorbis + * ``AMR_WB``: Adaptive Multi-Rate Wideband + * ``AMR_NB``: Adaptive Multi-Rate Narrowband + * ``OPUS``: Opus + * ``HTK``: Single channel 16-bit PCM + * ``UNKNOWN`` : None of above + """ + + def __init__( + self, + sample_rate: int, + num_frames: int, + num_channels: int, + bits_per_sample: int, + encoding: str, + ): + self.sample_rate = sample_rate + self.num_frames = num_frames + self.num_channels = num_channels + self.bits_per_sample = bits_per_sample + self.encoding = encoding + + def __str__(self): + return ( + f"AudioMetaData(" + f"sample_rate={self.sample_rate}, " + f"num_frames={self.num_frames}, " + f"num_channels={self.num_channels}, " + f"bits_per_sample={self.bits_per_sample}, " + f"encoding={self.encoding}" + f")" + ) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py b/.venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8374ea07de9c9c06615f773d62b1ca910efb95 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/_backend/ffmpeg.py @@ -0,0 +1,334 @@ +import os +import re +import sys +from typing import BinaryIO, Optional, Tuple, Union + +import torch +import torchaudio + +from .backend import Backend +from .common import AudioMetaData + +InputType = Union[BinaryIO, str, os.PathLike] + + +def info_audio( + src: InputType, + format: Optional[str], + buffer_size: int = 4096, +) -> AudioMetaData: + s = torchaudio.io.StreamReader(src, format, None, buffer_size) + sinfo = s.get_src_stream_info(s.default_audio_stream) + if sinfo.num_frames == 0: + waveform = _load_audio(s) + num_frames = waveform.size(1) + else: + num_frames = sinfo.num_frames + return AudioMetaData( + int(sinfo.sample_rate), + num_frames, + sinfo.num_channels, + sinfo.bits_per_sample, + sinfo.codec.upper(), + ) + + +def _get_load_filter( + frame_offset: int = 0, + num_frames: int = -1, + convert: bool = True, +) -> Optional[str]: + if frame_offset < 0: + raise RuntimeError("Invalid argument: frame_offset must be non-negative. Found: {}".format(frame_offset)) + if num_frames == 0 or num_frames < -1: + raise RuntimeError("Invalid argument: num_frames must be -1 or greater than 0. 
Found: {}".format(num_frames)) + + # All default values -> no filter + if frame_offset == 0 and num_frames == -1 and not convert: + return None + # Only convert + aformat = "aformat=sample_fmts=fltp" + if frame_offset == 0 and num_frames == -1 and convert: + return aformat + # At least one of frame_offset or num_frames has non-default value + if num_frames > 0: + atrim = "atrim=start_sample={}:end_sample={}".format(frame_offset, frame_offset + num_frames) + else: + atrim = "atrim=start_sample={}".format(frame_offset) + if not convert: + return atrim + return "{},{}".format(atrim, aformat) + + +def _load_audio( + s: "torchaudio.io.StreamReader", + filter: Optional[str] = None, + channels_first: bool = True, +) -> torch.Tensor: + s.add_audio_stream(-1, -1, filter_desc=filter) + s.process_all_packets() + chunk = s.pop_chunks()[0] + if chunk is None: + raise RuntimeError("Failed to decode audio.") + waveform = chunk._elem + return waveform.T if channels_first else waveform + + +def load_audio( + src: InputType, + frame_offset: int = 0, + num_frames: int = -1, + convert: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, +) -> Tuple[torch.Tensor, int]: + if hasattr(src, "read") and format == "vorbis": + format = "ogg" + s = torchaudio.io.StreamReader(src, format, None, buffer_size) + sample_rate = int(s.get_src_stream_info(s.default_audio_stream).sample_rate) + filter = _get_load_filter(frame_offset, num_frames, convert) + waveform = _load_audio(s, filter, channels_first) + return waveform, sample_rate + + +def _get_sample_format(dtype: torch.dtype) -> str: + dtype_to_format = { + torch.uint8: "u8", + torch.int16: "s16", + torch.int32: "s32", + torch.int64: "s64", + torch.float32: "flt", + torch.float64: "dbl", + } + format = dtype_to_format.get(dtype) + if format is None: + raise ValueError(f"No format found for dtype {dtype}; dtype must be one of {list(dtype_to_format.keys())}.") + return format + + +def _native_endianness() -> str: + if sys.byteorder == "little": + return "le" + else: + return "be" + + +def _get_encoder_for_wav(encoding: str, bits_per_sample: int) -> str: + if bits_per_sample not in {None, 8, 16, 24, 32, 64}: + raise ValueError(f"Invalid bits_per_sample {bits_per_sample} for WAV encoding.") + endianness = _native_endianness() + if not encoding: + if not bits_per_sample: + # default to PCM S16 + return f"pcm_s16{endianness}" + if bits_per_sample == 8: + return "pcm_u8" + return f"pcm_s{bits_per_sample}{endianness}" + if encoding == "PCM_S": + if not bits_per_sample: + bits_per_sample = 16 + if bits_per_sample == 8: + raise ValueError("For WAV signed PCM, 8-bit encoding is not supported.") + return f"pcm_s{bits_per_sample}{endianness}" + if encoding == "PCM_U": + if bits_per_sample in (None, 8): + return "pcm_u8" + raise ValueError("For WAV unsigned PCM, only 8-bit encoding is supported.") + if encoding == "PCM_F": + if not bits_per_sample: + bits_per_sample = 32 + if bits_per_sample in (32, 64): + return f"pcm_f{bits_per_sample}{endianness}" + raise ValueError("For WAV float PCM, only 32- and 64-bit encodings are supported.") + if encoding == "ULAW": + if bits_per_sample in (None, 8): + return "pcm_mulaw" + raise ValueError("For WAV PCM mu-law, only 8-bit encoding is supported.") + if encoding == "ALAW": + if bits_per_sample in (None, 8): + return "pcm_alaw" + raise ValueError("For WAV PCM A-law, only 8-bit encoding is supported.") + raise ValueError(f"WAV encoding {encoding} is not supported.") + + +def 
_get_flac_sample_fmt(bps): + if bps is None or bps == 16: + return "s16" + if bps == 24: + return "s32" + raise ValueError(f"FLAC only supports bits_per_sample values of 16 and 24 ({bps} specified).") + + +def _parse_save_args( + ext: Optional[str], + format: Optional[str], + encoding: Optional[str], + bps: Optional[int], +): + # torchaudio's save function accepts the followings, which do not 1to1 map + # to FFmpeg. + # + # - format: audio format + # - bits_per_sample: encoder sample format + # - encoding: such as PCM_U8. + # + # In FFmpeg, format is specified with the following three (and more) + # + # - muxer: could be audio format or container format. + # the one we passed to the constructor of StreamWriter + # - encoder: the audio encoder used to encode audio + # - encoder sample format: the format used by encoder to encode audio. + # + # If encoder sample format is different from source sample format, StreamWriter + # will insert a filter automatically. + # + def _type(spec): + # either format is exactly the specified one + # or extension matches to the spec AND there is no format override. + return format == spec or (format is None and ext == spec) + + if _type("wav") or _type("amb"): + # wav is special because it supports different encoding through encoders + # each encoder only supports one encoder format + # + # amb format is a special case originated from libsox. + # It is basically a WAV format, with slight modification. + # https://github.com/chirlu/sox/commit/4a4ea33edbca5972a1ed8933cc3512c7302fa67a#diff-39171191a858add9df87f5f210a34a776ac2c026842ae6db6ce97f5e68836795 + # It is a format so that decoders will recognize it as ambisonic. + # https://www.ambisonia.com/Members/mleese/file-format-for-b-format/ + # FFmpeg does not recognize amb because it is basically a WAV format. + muxer = "wav" + encoder = _get_encoder_for_wav(encoding, bps) + sample_fmt = None + elif _type("vorbis"): + # FFpmeg does not recognize vorbis extension, while libsox used to do. 
+ # For the sake of bakward compatibility, (and the simplicity), + # we support the case where users want to do save("foo.vorbis") + muxer = "ogg" + encoder = "vorbis" + sample_fmt = None + else: + muxer = format + encoder = None + sample_fmt = None + if _type("flac"): + sample_fmt = _get_flac_sample_fmt(bps) + if _type("ogg"): + sample_fmt = _get_flac_sample_fmt(bps) + return muxer, encoder, sample_fmt + + +def save_audio( + uri: InputType, + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + compression: Optional[torchaudio.io.CodecConfig] = None, +) -> None: + ext = None + if hasattr(uri, "write"): + if format is None: + raise RuntimeError("'format' is required when saving to file object.") + else: + uri = os.path.normpath(uri) + if tokens := str(uri).split(".")[1:]: + ext = tokens[-1].lower() + + muxer, encoder, enc_fmt = _parse_save_args(ext, format, encoding, bits_per_sample) + + if channels_first: + src = src.T + + s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size) + s.add_audio_stream( + sample_rate, + num_channels=src.size(-1), + format=_get_sample_format(src.dtype), + encoder=encoder, + encoder_format=enc_fmt, + codec_config=compression, + ) + with s.open(): + s.write_audio_chunk(0, src) + + +def _map_encoding(encoding: str) -> str: + for dst in ["PCM_S", "PCM_U", "PCM_F"]: + if dst in encoding: + return dst + if encoding == "PCM_MULAW": + return "ULAW" + elif encoding == "PCM_ALAW": + return "ALAW" + return encoding + + +def _get_bits_per_sample(encoding: str, bits_per_sample: int) -> str: + if m := re.search(r"PCM_\w(\d+)\w*", encoding): + return int(m.group(1)) + elif encoding in ["PCM_ALAW", "PCM_MULAW"]: + return 8 + return bits_per_sample + + +class FFmpegBackend(Backend): + @staticmethod + def info(uri: InputType, format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: + metadata = info_audio(uri, format, buffer_size) + metadata.bits_per_sample = _get_bits_per_sample(metadata.encoding, metadata.bits_per_sample) + metadata.encoding = _map_encoding(metadata.encoding) + return metadata + + @staticmethod + def load( + uri: InputType, + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + ) -> Tuple[torch.Tensor, int]: + return load_audio(uri, frame_offset, num_frames, normalize, channels_first, format) + + @staticmethod + def save( + uri: InputType, + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None, + ) -> None: + if not isinstance(compression, (torchaudio.io.CodecConfig, type(None))): + raise ValueError( + "FFmpeg backend expects non-`None` value for argument `compression` to be of ", + f"type `torchaudio.io.CodecConfig`, but received value of type {type(compression)}", + ) + save_audio( + uri, + src, + sample_rate, + channels_first, + format, + encoding, + bits_per_sample, + buffer_size, + compression, + ) + + @staticmethod + def can_decode(uri: InputType, format: Optional[str]) -> bool: + return True + + @staticmethod + def can_encode(uri: InputType, format: Optional[str]) -> bool: + return True diff --git 
a/.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py b/.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py new file mode 100644 index 0000000000000000000000000000000000000000..f4be1f70999db86e4ff70b6b703d5784a891a84c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile.py @@ -0,0 +1,54 @@ +import os +from typing import BinaryIO, Optional, Tuple, Union + +import torch +from torchaudio.io import CodecConfig + +from . import soundfile_backend +from .backend import Backend +from .common import AudioMetaData + + +class SoundfileBackend(Backend): + @staticmethod + def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: + return soundfile_backend.info(uri, format) + + @staticmethod + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + ) -> Tuple[torch.Tensor, int]: + return soundfile_backend.load(uri, frame_offset, num_frames, normalize, channels_first, format) + + @staticmethod + def save( + uri: Union[BinaryIO, str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + compression: Optional[Union[CodecConfig, float, int]] = None, + ) -> None: + if compression: + raise ValueError("soundfile backend does not support argument `compression`.") + + soundfile_backend.save( + uri, src, sample_rate, channels_first, format=format, encoding=encoding, bits_per_sample=bits_per_sample + ) + + @staticmethod + def can_decode(uri, format) -> bool: + return True + + @staticmethod + def can_encode(uri, format) -> bool: + return True diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py b/.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..9e7b0b13cd9adf2106ef4d6885f89341822add13 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/_backend/soundfile_backend.py @@ -0,0 +1,457 @@ +"""The new soundfile backend which will become default in 0.8.0 onward""" +import warnings +from typing import Optional, Tuple + +import torch +from torchaudio._internal import module_utils as _mod_utils + +from .common import AudioMetaData + + +_IS_SOUNDFILE_AVAILABLE = False + +# TODO: import soundfile only when it is used. +if _mod_utils.is_module_available("soundfile"): + try: + import soundfile + + _requires_soundfile = _mod_utils.no_op + _IS_SOUNDFILE_AVAILABLE = True + except Exception: + _requires_soundfile = _mod_utils.fail_with_message( + "requires soundfile, but we failed to import it. Please check the installation of soundfile." + ) +else: + _requires_soundfile = _mod_utils.fail_with_message( + "requires soundfile, but it is not installed. Please install soundfile." + ) + + +# Mapping from soundfile subtype to number of bits per sample. +# This is mostly heuristical and the value is set to 0 when it is irrelevant +# (lossy formats) or when it can't be inferred. 
+# For ADPCM (and G72X) subtypes, it's hard to infer the bit depth because it's not part of the standard: +# According to https://en.wikipedia.org/wiki/Adaptive_differential_pulse-code_modulation#In_telephony, +# the default seems to be 8 bits but it can be compressed further to 4 bits. +# The dict is inspired from +# https://github.com/bastibe/python-soundfile/blob/744efb4b01abc72498a96b09115b42a4cabd85e4/soundfile.py#L66-L94 +_SUBTYPE_TO_BITS_PER_SAMPLE = { + "PCM_S8": 8, # Signed 8 bit data + "PCM_16": 16, # Signed 16 bit data + "PCM_24": 24, # Signed 24 bit data + "PCM_32": 32, # Signed 32 bit data + "PCM_U8": 8, # Unsigned 8 bit data (WAV and RAW only) + "FLOAT": 32, # 32 bit float data + "DOUBLE": 64, # 64 bit float data + "ULAW": 8, # U-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types + "ALAW": 8, # A-Law encoded. See https://en.wikipedia.org/wiki/G.711#Types + "IMA_ADPCM": 0, # IMA ADPCM. + "MS_ADPCM": 0, # Microsoft ADPCM. + "GSM610": 0, # GSM 6.10 encoding. (Wikipedia says 1.625 bit depth?? https://en.wikipedia.org/wiki/Full_Rate) + "VOX_ADPCM": 0, # OKI / Dialogix ADPCM + "G721_32": 0, # 32kbs G721 ADPCM encoding. + "G723_24": 0, # 24kbs G723 ADPCM encoding. + "G723_40": 0, # 40kbs G723 ADPCM encoding. + "DWVW_12": 12, # 12 bit Delta Width Variable Word encoding. + "DWVW_16": 16, # 16 bit Delta Width Variable Word encoding. + "DWVW_24": 24, # 24 bit Delta Width Variable Word encoding. + "DWVW_N": 0, # N bit Delta Width Variable Word encoding. + "DPCM_8": 8, # 8 bit differential PCM (XI only) + "DPCM_16": 16, # 16 bit differential PCM (XI only) + "VORBIS": 0, # Xiph Vorbis encoding. (lossy) + "ALAC_16": 16, # Apple Lossless Audio Codec (16 bit). + "ALAC_20": 20, # Apple Lossless Audio Codec (20 bit). + "ALAC_24": 24, # Apple Lossless Audio Codec (24 bit). + "ALAC_32": 32, # Apple Lossless Audio Codec (32 bit). +} + + +def _get_bit_depth(subtype): + if subtype not in _SUBTYPE_TO_BITS_PER_SAMPLE: + warnings.warn( + f"The {subtype} subtype is unknown to TorchAudio. As a result, the bits_per_sample " + "attribute will be set to 0. If you are seeing this warning, please " + "report by opening an issue on github (after checking for existing/closed ones). " + "You may otherwise ignore this warning." + ) + return _SUBTYPE_TO_BITS_PER_SAMPLE.get(subtype, 0) + + +_SUBTYPE_TO_ENCODING = { + "PCM_S8": "PCM_S", + "PCM_16": "PCM_S", + "PCM_24": "PCM_S", + "PCM_32": "PCM_S", + "PCM_U8": "PCM_U", + "FLOAT": "PCM_F", + "DOUBLE": "PCM_F", + "ULAW": "ULAW", + "ALAW": "ALAW", + "VORBIS": "VORBIS", +} + + +def _get_encoding(format: str, subtype: str): + if format == "FLAC": + return "FLAC" + return _SUBTYPE_TO_ENCODING.get(subtype, "UNKNOWN") + + +@_requires_soundfile +def info(filepath: str, format: Optional[str] = None) -> AudioMetaData: + """Get signal information of an audio file. + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, + which has a restriction on type annotation due to TorchScript compiler compatiblity. + + Args: + filepath (path-like object or file-like object): + Source of audio data. + format (str or None, optional): + Not used. PySoundFile does not accept format hint. + + Returns: + AudioMetaData: meta data of the given audio. 
+ + """ + sinfo = soundfile.info(filepath) + return AudioMetaData( + sinfo.samplerate, + sinfo.frames, + sinfo.channels, + bits_per_sample=_get_bit_depth(sinfo.subtype), + encoding=_get_encoding(sinfo.format, sinfo.subtype), + ) + + +_SUBTYPE2DTYPE = { + "PCM_S8": "int8", + "PCM_U8": "uint8", + "PCM_16": "int16", + "PCM_32": "int32", + "FLOAT": "float32", + "DOUBLE": "float64", +} + + +@_requires_soundfile +def load( + filepath: str, + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, +) -> Tuple[torch.Tensor, int]: + """Load audio data from file. + + Note: + The formats this function can handle depend on the soundfile installation. + This function is tested on the following formats; + + * WAV + + * 32-bit floating-point + * 32-bit signed integer + * 16-bit signed integer + * 8-bit unsigned integer + + * FLAC + * OGG/VORBIS + * SPHERE + + By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with + ``float32`` dtype, and the shape of `[channel, time]`. + + .. warning:: + + ``normalize`` argument does not perform volume normalization. + It only converts the sample type to `torch.float32` from the native sample + type. + + When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit + signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, + this function can return integer Tensor, where the samples are expressed within the whole range + of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, + ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not + support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. + + ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as + ``flac`` and ``mp3``. + + For these formats, this function always returns ``float32`` Tensor with values. + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, + which has a restriction on type annotation due to TorchScript compiler compatiblity. + + Args: + filepath (path-like object or file-like object): + Source of audio data. + frame_offset (int, optional): + Number of frames to skip before start reading data. + num_frames (int, optional): + Maximum number of frames to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + This function may return the less number of frames if there is not enough + frames in the given file. + normalize (bool, optional): + When ``True``, this function converts the native sample type to ``float32``. + Default: ``True``. + + If input file is integer WAV, giving ``False`` will change the resulting Tensor type to + integer type. + This argument has no effect for formats other than integer WAV type. + + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. + format (str or None, optional): + Not used. PySoundFile does not accept format hint. + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + If the input file has integer wav format and normalization is off, then it has + integer type, else ``float32`` type. If ``channels_first=True``, it has + `[channel, time]` else `[time, channel]`. 
+ """ + with soundfile.SoundFile(filepath, "r") as file_: + if file_.format != "WAV" or normalize: + dtype = "float32" + elif file_.subtype not in _SUBTYPE2DTYPE: + raise ValueError(f"Unsupported subtype: {file_.subtype}") + else: + dtype = _SUBTYPE2DTYPE[file_.subtype] + + frames = file_._prepare_read(frame_offset, None, num_frames) + waveform = file_.read(frames, dtype, always_2d=True) + sample_rate = file_.samplerate + + waveform = torch.from_numpy(waveform) + if channels_first: + waveform = waveform.t() + return waveform, sample_rate + + +def _get_subtype_for_wav(dtype: torch.dtype, encoding: str, bits_per_sample: int): + if not encoding: + if not bits_per_sample: + subtype = { + torch.uint8: "PCM_U8", + torch.int16: "PCM_16", + torch.int32: "PCM_32", + torch.float32: "FLOAT", + torch.float64: "DOUBLE", + }.get(dtype) + if not subtype: + raise ValueError(f"Unsupported dtype for wav: {dtype}") + return subtype + if bits_per_sample == 8: + return "PCM_U8" + return f"PCM_{bits_per_sample}" + if encoding == "PCM_S": + if not bits_per_sample: + return "PCM_32" + if bits_per_sample == 8: + raise ValueError("wav does not support 8-bit signed PCM encoding.") + return f"PCM_{bits_per_sample}" + if encoding == "PCM_U": + if bits_per_sample in (None, 8): + return "PCM_U8" + raise ValueError("wav only supports 8-bit unsigned PCM encoding.") + if encoding == "PCM_F": + if bits_per_sample in (None, 32): + return "FLOAT" + if bits_per_sample == 64: + return "DOUBLE" + raise ValueError("wav only supports 32/64-bit float PCM encoding.") + if encoding == "ULAW": + if bits_per_sample in (None, 8): + return "ULAW" + raise ValueError("wav only supports 8-bit mu-law encoding.") + if encoding == "ALAW": + if bits_per_sample in (None, 8): + return "ALAW" + raise ValueError("wav only supports 8-bit a-law encoding.") + raise ValueError(f"wav does not support {encoding}.") + + +def _get_subtype_for_sphere(encoding: str, bits_per_sample: int): + if encoding in (None, "PCM_S"): + return f"PCM_{bits_per_sample}" if bits_per_sample else "PCM_32" + if encoding in ("PCM_U", "PCM_F"): + raise ValueError(f"sph does not support {encoding} encoding.") + if encoding == "ULAW": + if bits_per_sample in (None, 8): + return "ULAW" + raise ValueError("sph only supports 8-bit for mu-law encoding.") + if encoding == "ALAW": + return "ALAW" + raise ValueError(f"sph does not support {encoding}.") + + +def _get_subtype(dtype: torch.dtype, format: str, encoding: str, bits_per_sample: int): + if format == "wav": + return _get_subtype_for_wav(dtype, encoding, bits_per_sample) + if format == "flac": + if encoding: + raise ValueError("flac does not support encoding.") + if not bits_per_sample: + return "PCM_16" + if bits_per_sample > 24: + raise ValueError("flac does not support bits_per_sample > 24.") + return "PCM_S8" if bits_per_sample == 8 else f"PCM_{bits_per_sample}" + if format in ("ogg", "vorbis"): + if bits_per_sample: + raise ValueError("ogg/vorbis does not support bits_per_sample.") + if encoding is None or encoding == "vorbis": + return "VORBIS" + if encoding == "opus": + return "OPUS" + raise ValueError(f"Unexpected encoding: {encoding}") + if format == "mp3": + return "MPEG_LAYER_III" + if format == "sph": + return _get_subtype_for_sphere(encoding, bits_per_sample) + if format in ("nis", "nist"): + return "PCM_16" + raise ValueError(f"Unsupported format: {format}") + + +@_requires_soundfile +def save( + filepath: str, + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + compression: Optional[float] = 
None, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, +): + """Save audio data to file. + + Note: + The formats this function can handle depend on the soundfile installation. + This function is tested on the following formats; + + * WAV + + * 32-bit floating-point + * 32-bit signed integer + * 16-bit signed integer + * 8-bit unsigned integer + + * FLAC + * OGG/VORBIS + * SPHERE + + Note: + ``filepath`` argument is intentionally annotated as ``str`` only, even though it accepts + ``pathlib.Path`` object as well. This is for the consistency with ``"sox_io"`` backend, + which has a restriction on type annotation due to TorchScript compiler compatiblity. + + Args: + filepath (str or pathlib.Path): Path to audio file. + src (torch.Tensor): Audio data to save. must be 2D tensor. + sample_rate (int): sampling rate + channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, + otherwise `[time, channel]`. + compression (float of None, optional): Not used. + It is here only for interface compatibility reson with "sox_io" backend. + format (str or None, optional): Override the audio format. + When ``filepath`` argument is path-like object, audio format is + inferred from file extension. If the file extension is missing or + different, you can specify the correct format with this argument. + + When ``filepath`` argument is file-like object, + this argument is required. + + Valid values are ``"wav"``, ``"ogg"``, ``"vorbis"``, + ``"flac"`` and ``"sph"``. + encoding (str or None, optional): Changes the encoding for supported formats. + This argument is effective only for supported formats, sush as + ``"wav"``, ``""flac"`` and ``"sph"``. Valid values are; + + - ``"PCM_S"`` (signed integer Linear PCM) + - ``"PCM_U"`` (unsigned integer Linear PCM) + - ``"PCM_F"`` (floating point PCM) + - ``"ULAW"`` (mu-law) + - ``"ALAW"`` (a-law) + + bits_per_sample (int or None, optional): Changes the bit depth for the + supported formats. + When ``format`` is one of ``"wav"``, ``"flac"`` or ``"sph"``, + you can change the bit depth. + Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. + + Supported formats/encodings/bit depth/compression are: + + ``"wav"`` + - 32-bit floating-point PCM + - 32-bit signed integer PCM + - 24-bit signed integer PCM + - 16-bit signed integer PCM + - 8-bit unsigned integer PCM + - 8-bit mu-law + - 8-bit a-law + + Note: + Default encoding/bit depth is determined by the dtype of + the input Tensor. + + ``"flac"`` + - 8-bit + - 16-bit (default) + - 24-bit + + ``"ogg"``, ``"vorbis"`` + - Doesn't accept changing configuration. + + ``"sph"`` + - 8-bit signed integer PCM + - 16-bit signed integer PCM + - 24-bit signed integer PCM + - 32-bit signed integer PCM (default) + - 8-bit mu-law + - 8-bit a-law + - 16-bit a-law + - 24-bit a-law + - 32-bit a-law + + """ + if src.ndim != 2: + raise ValueError(f"Expected 2D Tensor, got {src.ndim}D.") + if compression is not None: + warnings.warn( + '`save` function of "soundfile" backend does not support "compression" parameter. ' + "The argument is silently ignored." 
+ ) + if hasattr(filepath, "write"): + if format is None: + raise RuntimeError("`format` is required when saving to file object.") + ext = format.lower() + else: + ext = str(filepath).split(".")[-1].lower() + + if bits_per_sample not in (None, 8, 16, 24, 32, 64): + raise ValueError("Invalid bits_per_sample.") + if bits_per_sample == 24: + warnings.warn( + "Saving audio with 24 bits per sample might warp samples near -1. " + "Using 16 bits per sample might be able to avoid this." + ) + subtype = _get_subtype(src.dtype, ext, encoding, bits_per_sample) + + # sph is a extension used in TED-LIUM but soundfile does not recognize it as NIST format, + # so we extend the extensions manually here + if ext in ["nis", "nist", "sph"] and format is None: + format = "NIST" + + if channels_first: + src = src.t() + + soundfile.write(file=filepath, data=src, samplerate=sample_rate, subtype=subtype, format=format) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py b/.venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py new file mode 100644 index 0000000000000000000000000000000000000000..bfcd8a4f8beadcd80787a354d47219f9edfb98e8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/_backend/sox.py @@ -0,0 +1,91 @@ +import os +from typing import BinaryIO, Optional, Tuple, Union + +import torch +import torchaudio + +from .backend import Backend +from .common import AudioMetaData + +sox_ext = torchaudio._extension.lazy_import_sox_ext() + + +class SoXBackend(Backend): + @staticmethod + def info(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], buffer_size: int = 4096) -> AudioMetaData: + if hasattr(uri, "read"): + raise ValueError( + "SoX backend does not support reading from file-like objects. ", + "Please use an alternative backend that does support reading from file-like objects, e.g. FFmpeg.", + ) + else: + sinfo = sox_ext.get_info(uri, format) + if sinfo: + return AudioMetaData(*sinfo) + else: + raise RuntimeError(f"Failed to fetch metadata for {uri}.") + + @staticmethod + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + ) -> Tuple[torch.Tensor, int]: + if hasattr(uri, "read"): + raise ValueError( + "SoX backend does not support loading from file-like objects. ", + "Please use an alternative backend that does support loading from file-like objects, e.g. FFmpeg.", + ) + else: + ret = sox_ext.load_audio_file(uri, frame_offset, num_frames, normalize, channels_first, format) + if not ret: + raise RuntimeError(f"Failed to load audio from {uri}.") + return ret + + @staticmethod + def save( + uri: Union[BinaryIO, str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + compression: Optional[Union[torchaudio.io.CodecConfig, float, int]] = None, + ) -> None: + if not isinstance(compression, (float, int, type(None))): + raise ValueError( + "SoX backend expects non-`None` value for argument `compression` to be of ", + f"type `float` or `int`, but received value of type {type(compression)}", + ) + if hasattr(uri, "write"): + raise ValueError( + "SoX backend does not support writing to file-like objects. ", + "Please use an alternative backend that does support writing to file-like objects, e.g. 
FFmpeg.", + ) + else: + sox_ext.save_audio_file( + uri, + src, + sample_rate, + channels_first, + compression, + format, + encoding, + bits_per_sample, + ) + + @staticmethod + def can_decode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: + # i.e. not a file-like object. + return not hasattr(uri, "read") + + @staticmethod + def can_encode(uri: Union[BinaryIO, str, os.PathLike], format: Optional[str]) -> bool: + # i.e. not a file-like object. + return not hasattr(uri, "write") diff --git a/.venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py b/.venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0cde6b1927d2b35e34d4ab0e99124fde146e724d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/_backend/utils.py @@ -0,0 +1,317 @@ +import os +from functools import lru_cache +from typing import BinaryIO, Dict, Optional, Tuple, Type, Union + +import torch + +from torchaudio._extension import lazy_import_sox_ext +from torchaudio.io import CodecConfig +from torio._extension import lazy_import_ffmpeg_ext + +from . import soundfile_backend + +from .backend import Backend +from .common import AudioMetaData +from .ffmpeg import FFmpegBackend +from .soundfile import SoundfileBackend +from .sox import SoXBackend + + +@lru_cache(None) +def get_available_backends() -> Dict[str, Type[Backend]]: + backend_specs: Dict[str, Type[Backend]] = {} + if lazy_import_ffmpeg_ext().is_available(): + backend_specs["ffmpeg"] = FFmpegBackend + if lazy_import_sox_ext().is_available(): + backend_specs["sox"] = SoXBackend + if soundfile_backend._IS_SOUNDFILE_AVAILABLE: + backend_specs["soundfile"] = SoundfileBackend + return backend_specs + + +def get_backend(backend_name, backends) -> Backend: + if backend := backends.get(backend_name): + return backend + else: + raise ValueError( + f"Unsupported backend '{backend_name}' specified; ", + f"please select one of {list(backends.keys())} instead.", + ) + + +def get_info_func(): + backends = get_available_backends() + + def dispatcher( + uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] + ) -> Backend: + if backend_name is not None: + return get_backend(backend_name, backends) + + for backend in backends.values(): + if backend.can_decode(uri, format): + return backend + raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") + + def info( + uri: Union[BinaryIO, str, os.PathLike], + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> AudioMetaData: + """Get signal information of an audio file. + + Note: + When the input type is file-like object, this function cannot + get the correct length (``num_samples``) for certain formats, + such as ``vorbis``. + In this case, the value of ``num_samples`` is ``0``. + + Args: + uri (path-like object or file-like object): + Source of audio data. The following types are accepted: + + * ``path-like``: File path or URL. + * ``file-like``: Object with ``read(size: int) -> bytes`` method, + which returns byte string of at most ``size`` length. + + format (str or None, optional): + If not ``None``, interpreted as hint that may allow backend to override the detected format. + (Default: ``None``) + + buffer_size (int, optional): + Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) + + backend (str or None, optional): + I/O backend to use. 
+ If ``None``, function selects backend given input and available backends. + Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], + with the corresponding backend available. + (Default: ``None``) + + .. seealso:: + :ref:`backend` + + Returns: + AudioMetaData + """ + backend = dispatcher(uri, format, backend) + return backend.info(uri, format, buffer_size) + + return info + + +def get_load_func(): + backends = get_available_backends() + + def dispatcher( + uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] + ) -> Backend: + if backend_name is not None: + return get_backend(backend_name, backends) + + for backend in backends.values(): + if backend.can_decode(uri, format): + return backend + raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") + + def load( + uri: Union[BinaryIO, str, os.PathLike], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True, + format: Optional[str] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + ) -> Tuple[torch.Tensor, int]: + """Load audio data from source. + + By default (``normalize=True``, ``channels_first=True``), this function returns Tensor with + ``float32`` dtype, and the shape of `[channel, time]`. + + Note: + The formats this function can handle depend on the availability of backends. + Please use the following functions to fetch the supported formats. + + - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_decoders` + - Sox: :py:func:`torchaudio.utils.sox_utils.list_read_formats` + - SoundFile: Refer to `the official document `__. + + .. warning:: + + ``normalize`` argument does not perform volume normalization. + It only converts the sample type to `torch.float32` from the native sample + type. + + When the input format is WAV with integer type, such as 32-bit signed integer, 16-bit + signed integer, 24-bit signed integer, and 8-bit unsigned integer, by providing ``normalize=False``, + this function can return integer Tensor, where the samples are expressed within the whole range + of the corresponding dtype, that is, ``int32`` tensor for 32-bit signed PCM, + ``int16`` for 16-bit signed PCM and ``uint8`` for 8-bit unsigned PCM. Since torch does not + support ``int24`` dtype, 24-bit signed PCM are converted to ``int32`` tensors. + + ``normalize`` argument has no effect on 32-bit floating-point WAV and other formats, such as + ``flac`` and ``mp3``. + + For these formats, this function always returns ``float32`` Tensor with values. + + + Args: + uri (path-like object or file-like object): + Source of audio data. + frame_offset (int, optional): + Number of frames to skip before start reading data. + num_frames (int, optional): + Maximum number of frames to read. ``-1`` reads all the remaining samples, + starting from ``frame_offset``. + This function may return the less number of frames if there is not enough + frames in the given file. + normalize (bool, optional): + When ``True``, this function converts the native sample type to ``float32``. + Default: ``True``. + + If input file is integer WAV, giving ``False`` will change the resulting Tensor type to + integer type. + This argument has no effect for formats other than integer WAV type. + + channels_first (bool, optional): + When True, the returned Tensor has dimension `[channel, time]`. + Otherwise, the returned Tensor's dimension is `[time, channel]`. 
+ + format (str or None, optional): + If not ``None``, interpreted as hint that may allow backend to override the detected format. + (Default: ``None``) + + buffer_size (int, optional): + Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) + + backend (str or None, optional): + I/O backend to use. + If ``None``, function selects backend given input and available backends. + Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], + with the corresponding backend being available. (Default: ``None``) + + .. seealso:: + :ref:`backend` + + Returns: + (torch.Tensor, int): Resulting Tensor and sample rate. + If the input file has integer wav format and normalization is off, then it has + integer type, else ``float32`` type. If ``channels_first=True``, it has + `[channel, time]` else `[time, channel]`. + """ + backend = dispatcher(uri, format, backend) + return backend.load(uri, frame_offset, num_frames, normalize, channels_first, format, buffer_size) + + return load + + +def get_save_func(): + backends = get_available_backends() + + def dispatcher( + uri: Union[BinaryIO, str, os.PathLike], format: Optional[str], backend_name: Optional[str] + ) -> Backend: + if backend_name is not None: + return get_backend(backend_name, backends) + + for backend in backends.values(): + if backend.can_encode(uri, format): + return backend + raise RuntimeError(f"Couldn't find appropriate backend to handle uri {uri} and format {format}.") + + def save( + uri: Union[BinaryIO, str, os.PathLike], + src: torch.Tensor, + sample_rate: int, + channels_first: bool = True, + format: Optional[str] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, + buffer_size: int = 4096, + backend: Optional[str] = None, + compression: Optional[Union[CodecConfig, float, int]] = None, + ): + """Save audio data to file. + + Note: + The formats this function can handle depend on the availability of backends. + Please use the following functions to fetch the supported formats. + + - FFmpeg: :py:func:`torchaudio.utils.ffmpeg_utils.get_audio_encoders` + - Sox: :py:func:`torchaudio.utils.sox_utils.list_write_formats` + - SoundFile: Refer to `the official document `__. + + Args: + uri (str or pathlib.Path): Path to audio file. + src (torch.Tensor): Audio data to save. must be 2D tensor. + sample_rate (int): sampling rate + channels_first (bool, optional): If ``True``, the given tensor is interpreted as `[channel, time]`, + otherwise `[time, channel]`. + format (str or None, optional): Override the audio format. + When ``uri`` argument is path-like object, audio format is + inferred from file extension. If the file extension is missing or + different, you can specify the correct format with this argument. + + When ``uri`` argument is file-like object, + this argument is required. + + Valid values are ``"wav"``, ``"ogg"``, and ``"flac"``. + encoding (str or None, optional): Changes the encoding for supported formats. + This argument is effective only for supported formats, i.e. + ``"wav"`` and ``""flac"```. Valid values are + + - ``"PCM_S"`` (signed integer Linear PCM) + - ``"PCM_U"`` (unsigned integer Linear PCM) + - ``"PCM_F"`` (floating point PCM) + - ``"ULAW"`` (mu-law) + - ``"ALAW"`` (a-law) + + bits_per_sample (int or None, optional): Changes the bit depth for the + supported formats. + When ``format`` is one of ``"wav"`` and ``"flac"``, + you can change the bit depth. + Valid values are ``8``, ``16``, ``24``, ``32`` and ``64``. 
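[Editor's illustration] For the `save` path, a minimal sketch under the assumption that at least one write-capable backend (sox or soundfile) is installed; the output path, encoding, and bit depth are illustrative choices, not values taken from this patch.

```python
import torch
import torchaudio

# A synthetic stereo signal: 2 channels x 1 second at 16 kHz, in [-1, 1].
sample_rate = 16000
waveform = 0.1 * torch.randn(2, sample_rate)

# Save as 16-bit signed PCM WAV; "output.wav" is a hypothetical destination path.
torchaudio.save(
    "output.wav",
    waveform,
    sample_rate,
    encoding="PCM_S",
    bits_per_sample=16,
)
```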
+ + buffer_size (int, optional): + Size of buffer to use when processing file-like objects, in bytes. (Default: ``4096``) + + backend (str or None, optional): + I/O backend to use. + If ``None``, function selects backend given input and available backends. + Otherwise, must be one of [``"ffmpeg"``, ``"sox"``, ``"soundfile"``], + with the corresponding backend being available. + (Default: ``None``) + + .. seealso:: + :ref:`backend` + + compression (CodecConfig, float, int, or None, optional): + Compression configuration to apply. + + If the selected backend is FFmpeg, an instance of :py:class:`CodecConfig` must be provided. + + Otherwise, if the selected backend is SoX, a float or int value corresponding to option ``-C`` of the + ``sox`` command line interface must be provided. For instance: + + ``"mp3"`` + Either bitrate (in ``kbps``) with quality factor, such as ``128.2``, or + VBR encoding with quality factor such as ``-4.2``. Default: ``-4.5``. + + ``"flac"`` + Whole number from ``0`` to ``8``. ``8`` is default and highest compression. + + ``"ogg"``, ``"vorbis"`` + Number from ``-1`` to ``10``; ``-1`` is the highest compression + and lowest quality. Default: ``3``. + + Refer to http://sox.sourceforge.net/soxformat.html for more details. + + """ + backend = dispatcher(uri, format, backend) + return backend.save( + uri, src, sample_rate, channels_first, format, encoding, bits_per_sample, buffer_size, compression + ) + + return save diff --git a/.venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py b/.venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..84df7e7d697616076d549dc4163b55cd34335a25 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/backend/__init__.py @@ -0,0 +1,8 @@ +# NOTE: +# The entire `torchaudio.backend` module is deprecated. +# New things should be added to `torchaudio._backend`. +# Only things related to backward compatibility should be placed here. + +from . 
import common, no_backend, soundfile_backend, sox_io_backend # noqa + +__all__ = [] diff --git a/.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7628c492841d3e11873ace25115a6d579fa7b991 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/_sox_io_backend.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..682db31829a6e4b4cb6e06deac44394960d24753 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/backend/__pycache__/soundfile_backend.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py b/.venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbb2ad84aefcf33b181b686ee1105e532a8661d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/backend/_no_backend.py @@ -0,0 +1,25 @@ +from pathlib import Path +from typing import Callable, Optional, Tuple, Union + +from torch import Tensor +from torchaudio import AudioMetaData + + +def load( + filepath: Union[str, Path], + out: Optional[Tensor] = None, + normalization: Union[bool, float, Callable] = True, + channels_first: bool = True, + num_frames: int = 0, + offset: int = 0, + filetype: Optional[str] = None, +) -> Tuple[Tensor, int]: + raise RuntimeError("No audio I/O backend is available.") + + +def save(filepath: str, src: Tensor, sample_rate: int, precision: int = 16, channels_first: bool = True) -> None: + raise RuntimeError("No audio I/O backend is available.") + + +def info(filepath: str) -> AudioMetaData: + raise RuntimeError("No audio I/O backend is available.") diff --git a/.venv/lib/python3.11/site-packages/torchaudio/backend/common.py b/.venv/lib/python3.11/site-packages/torchaudio/backend/common.py new file mode 100644 index 0000000000000000000000000000000000000000..3f736bf4017c952c850dcbb3cc5fe1fe14f2715f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/backend/common.py @@ -0,0 +1,13 @@ +def __getattr__(name: str): + if name == "AudioMetaData": + import warnings + + warnings.warn( + "`torchaudio.backend.common.AudioMetaData` has been moved to " + "`torchaudio.AudioMetaData`. Please update the import path.", + stacklevel=2, + ) + from torchaudio import AudioMetaData + + return AudioMetaData + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/.venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py b/.venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..5e81db372a12800a869f4a48291a77739c4f07e6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/backend/soundfile_backend.py @@ -0,0 +1,14 @@ +def __getattr__(name: str): + import warnings + + warnings.warn( + "Torchaudio's I/O functions now support par-call bakcend dispatch. " + "Importing backend implementation directly is no longer guaranteed to work. 
" + "Please use `backend` keyword with load/save/info function, instead of " + "calling the udnerlying implementation directly.", + stacklevel=2, + ) + + from torchaudio._backend import soundfile_backend + + return getattr(soundfile_backend, name) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py b/.venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py new file mode 100644 index 0000000000000000000000000000000000000000..a361ab87a5dba694e247f7f2205c0f2a25b19686 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/backend/sox_io_backend.py @@ -0,0 +1,14 @@ +def __getattr__(name: str): + import warnings + + warnings.warn( + "Torchaudio's I/O functions now support par-call bakcend dispatch. " + "Importing backend implementation directly is no longer guaranteed to work. " + "Please use `backend` keyword with load/save/info function, instead of " + "calling the udnerlying implementation directly.", + stacklevel=2, + ) + + from . import _sox_io_backend + + return getattr(_sox_io_backend, name) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py b/.venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b866977c67c9cbcb6098c39ea5c26ed7b19d5979 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/functional/__init__.py @@ -0,0 +1,127 @@ +from ._alignment import forced_align, merge_tokens, TokenSpan +from .filtering import ( + allpass_biquad, + band_biquad, + bandpass_biquad, + bandreject_biquad, + bass_biquad, + biquad, + contrast, + dcshift, + deemph_biquad, + dither, + equalizer_biquad, + filtfilt, + flanger, + gain, + highpass_biquad, + lfilter, + lowpass_biquad, + overdrive, + phaser, + riaa_biquad, + treble_biquad, + vad, +) +from .functional import ( + add_noise, + amplitude_to_DB, + apply_beamforming, + apply_codec, + compute_deltas, + convolve, + create_dct, + DB_to_amplitude, + deemphasis, + detect_pitch_frequency, + edit_distance, + fftconvolve, + frechet_distance, + griffinlim, + inverse_spectrogram, + linear_fbanks, + loudness, + mask_along_axis, + mask_along_axis_iid, + melscale_fbanks, + mu_law_decoding, + mu_law_encoding, + mvdr_weights_rtf, + mvdr_weights_souden, + phase_vocoder, + pitch_shift, + preemphasis, + psd, + resample, + rnnt_loss, + rtf_evd, + rtf_power, + sliding_window_cmn, + spectral_centroid, + spectrogram, + speed, +) + +__all__ = [ + "amplitude_to_DB", + "compute_deltas", + "create_dct", + "melscale_fbanks", + "linear_fbanks", + "DB_to_amplitude", + "loudness", + "detect_pitch_frequency", + "griffinlim", + "mask_along_axis", + "mask_along_axis_iid", + "mu_law_encoding", + "mu_law_decoding", + "phase_vocoder", + "sliding_window_cmn", + "spectrogram", + "inverse_spectrogram", + "spectral_centroid", + "allpass_biquad", + "band_biquad", + "bandpass_biquad", + "bandreject_biquad", + "bass_biquad", + "biquad", + "contrast", + "dither", + "dcshift", + "deemph_biquad", + "equalizer_biquad", + "filtfilt", + "flanger", + "forced_align", + "merge_tokens", + "TokenSpan", + "gain", + "highpass_biquad", + "lfilter", + "lowpass_biquad", + "overdrive", + "phaser", + "riaa_biquad", + "treble_biquad", + "vad", + "apply_codec", + "resample", + "edit_distance", + "pitch_shift", + "rnnt_loss", + "psd", + "mvdr_weights_souden", + "mvdr_weights_rtf", + "rtf_evd", + "rtf_power", + "apply_beamforming", + "fftconvolve", + "convolve", + "add_noise", + "speed", + "preemphasis", + "deemphasis", + 
"frechet_distance", +] diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..34141f910dfcf3862e376398d8af7f60dd789c3b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5eb6e606901eb0d6ae3ca6ca935b6715cf14fc61 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/_alignment.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3ff6ad915a8918eab299f98e9a45a63b2beb454 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/filtering.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..875bb4734f86c4db1a534e24ce77f28b9da0ee1c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/functional/__pycache__/functional.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71e5719c3daaa09433b5ece2431df353ef399f7678bc6bee1f1ebff9b16f9c13 +size 115834 diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py b/.venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py new file mode 100644 index 0000000000000000000000000000000000000000..70d1e995e41b8e6817b2dcc25f6a2d0ec3d83752 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/functional/_alignment.py @@ -0,0 +1,128 @@ +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import torch +from torch import Tensor +from torchaudio._extension import fail_if_no_align + +__all__ = [] + + +@fail_if_no_align +def forced_align( + log_probs: Tensor, + targets: Tensor, + input_lengths: Optional[Tensor] = None, + target_lengths: Optional[Tensor] = None, + blank: int = 0, +) -> Tuple[Tensor, Tensor]: + r"""Align a CTC label sequence to an emission. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + log_probs (Tensor): log probability of CTC emission output. + Tensor of shape `(B, T, C)`. where `B` is the batch size, `T` is the input length, + `C` is the number of characters in alphabet including blank. + targets (Tensor): Target sequence. Tensor of shape `(B, L)`, + where `L` is the target length. + input_lengths (Tensor or None, optional): + Lengths of the inputs (max value must each be <= `T`). 1-D Tensor of shape `(B,)`. + target_lengths (Tensor or None, optional): + Lengths of the targets. 1-D Tensor of shape `(B,)`. + blank_id (int, optional): The index of blank symbol in CTC emission. 
(Default: 0) + + Returns: + Tuple(Tensor, Tensor): + Tensor: Label for each time step in the alignment path computed using forced alignment. + + Tensor: Log probability scores of the labels for each time step. + + Note: + The sequence length of `log_probs` must satisfy: + + + .. math:: + L_{\text{log\_probs}} \ge L_{\text{label}} + N_{\text{repeat}} + + where :math:`N_{\text{repeat}}` is the number of consecutively repeated tokens. + For example, in str `"aabbc"`, the number of repeats are `2`. + + Note: + The current version only supports ``batch_size==1``. + """ + if blank in targets: + raise ValueError(f"targets Tensor shouldn't contain blank index. Found {targets}.") + if torch.max(targets) >= log_probs.shape[-1]: + raise ValueError("targets values must be less than the CTC dimension") + + if input_lengths is None: + batch_size, length = log_probs.size(0), log_probs.size(1) + input_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=log_probs.device) + if target_lengths is None: + batch_size, length = targets.size(0), targets.size(1) + target_lengths = torch.full((batch_size,), length, dtype=torch.int64, device=targets.device) + + # For TorchScript compatibility + assert input_lengths is not None + assert target_lengths is not None + + paths, scores = torch.ops.torchaudio.forced_align(log_probs, targets, input_lengths, target_lengths, blank) + return paths, scores + + +@dataclass +class TokenSpan: + """TokenSpan() + Token with time stamps and score. Returned by :py:func:`merge_tokens`. + """ + + token: int + """The token""" + start: int + """The start time (inclusive) in emission time axis.""" + end: int + """The end time (exclusive) in emission time axis.""" + score: float + """The score of the this token.""" + + def __len__(self) -> int: + """Returns the time span""" + return self.end - self.start + + +def merge_tokens(tokens: Tensor, scores: Tensor, blank: int = 0) -> List[TokenSpan]: + """Removes repeated tokens and blank tokens from the given CTC token sequence. + + Args: + tokens (Tensor): Alignment tokens (unbatched) returned from :py:func:`forced_align`. + Shape: `(time, )`. + scores (Tensor): Alignment scores (unbatched) returned from :py:func:`forced_align`. + Shape: `(time, )`. When computing the token-size score, the given score is averaged + across the corresponding time span. 
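[Editor's illustration] A toy sketch of driving `forced_align` and `merge_tokens` together, assuming the alignment extension is available in this build of torchaudio; the emission is random noise, so the resulting spans are meaningless and only show the calling convention.

```python
import torch
from torchaudio.functional import forced_align, merge_tokens

# Toy emission: batch of 1, 50 frames, 5 classes (index 0 is the CTC blank).
log_probs = torch.randn(1, 50, 5).log_softmax(dim=-1)
targets = torch.tensor([[1, 2, 3, 2]])  # must not contain the blank index

# Requires the alignment extension; raises otherwise.
alignments, scores = forced_align(log_probs, targets, blank=0)
spans = merge_tokens(alignments[0], scores[0])
for span in spans:
    print(span.token, span.start, span.end, round(span.score, 3))
```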
+ + Returns: + list of TokenSpan + + Example: + >>> aligned_tokens, scores = forced_align(emission, targets, input_lengths, target_lengths) + >>> token_spans = merge_tokens(aligned_tokens[0], scores[0]) + """ + if tokens.ndim != 1 or scores.ndim != 1: + raise ValueError("`tokens` and `scores` must be 1D Tensor.") + if len(tokens) != len(scores): + raise ValueError("`tokens` and `scores` must be the same length.") + + diff = torch.diff( + tokens, prepend=torch.tensor([-1], device=tokens.device), append=torch.tensor([-1], device=tokens.device) + ) + changes_wo_blank = torch.nonzero((diff != 0)).squeeze().tolist() + tokens = tokens.tolist() + spans = [ + TokenSpan(token=token, start=start, end=end, score=scores[start:end].mean().item()) + for start, end in zip(changes_wo_blank[:-1], changes_wo_blank[1:]) + if (token := tokens[start]) != blank + ] + return spans diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py b/.venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py new file mode 100644 index 0000000000000000000000000000000000000000..b4140447efaf13edac5dce72a71d52a52686d286 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/functional/filtering.py @@ -0,0 +1,1669 @@ +import math +import warnings +from typing import Optional + +import torch +from torch import Tensor + +from torchaudio._extension import _IS_TORCHAUDIO_EXT_AVAILABLE + + +def _dB2Linear(x: float) -> float: + return math.exp(x * math.log(10) / 20.0) + + +def _generate_wave_table( + wave_type: str, + data_type: str, + table_size: int, + min: float, + max: float, + phase: float, + device: torch.device, +) -> Tensor: + r"""A helper function for phaser. Generates a table with given parameters. + + Args: + wave_type (str): SINE or TRIANGULAR + data_type (str): desired data_type ( `INT` or `FLOAT` ) + table_size (int): desired table size + min (float): desired min value + max (float): desired max value + phase (float): desired phase + device (torch.device): Torch device on which table must be generated + Returns: + Tensor: A 1D tensor with wave table values + """ + + phase_offset = int(phase / math.pi / 2 * table_size + 0.5) + + t = torch.arange(table_size, device=device, dtype=torch.int32) + + point = (t + phase_offset) % table_size + + d = torch.zeros_like(point, device=device, dtype=torch.float64) + + if wave_type == "SINE": + d = (torch.sin(point.to(torch.float64) / table_size * 2 * math.pi) + 1) / 2 + elif wave_type == "TRIANGLE": + d = point.to(torch.float64) * 2 / table_size + value = torch.div(4 * point, table_size, rounding_mode="floor") + d[value == 0] = d[value == 0] + 0.5 + d[value == 1] = 1.5 - d[value == 1] + d[value == 2] = 1.5 - d[value == 2] + d[value == 3] = d[value == 3] - 1.5 + + d = d * (max - min) + min + + if data_type == "INT": + mask = d < 0 + d[mask] = d[mask] - 0.5 + d[~mask] = d[~mask] + 0.5 + d = d.to(torch.int32) + elif data_type == "FLOAT": + d = d.to(torch.float32) + + return d + + +def allpass_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: + r"""Design two-pole all-pass filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform(torch.Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + + alpha = torch.sin(w0) / 2 / Q + + b0 = 1 - alpha + b1 = -2 * torch.cos(w0) + b2 = 1 + alpha + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def band_biquad( + waveform: Tensor, + sample_rate: int, + central_freq: float, + Q: float = 0.707, + noise: bool = False, +) -> Tensor: + r"""Design two-pole band filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). + noise (bool, optional) : If ``True``, uses the alternate mode for un-pitched audio (e.g. percussion). + If ``False``, uses mode oriented to pitched audio, i.e. voice, singing, + or instrumental music (Default: ``False``). + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + bw_Hz = central_freq / Q + + a0 = 1.0 + a2 = torch.exp(-2 * math.pi * bw_Hz / sample_rate) + a1 = -4 * a2 / (1 + a2) * torch.cos(w0) + + b0 = torch.sqrt(1 - a1 * a1 / (4 * a2)) * (1 - a2) + + if noise: + mult = torch.sqrt(((1 + a2) * (1 + a2) - a1 * a1) * (1 - a2) / (1 + a2)) / b0 + b0 = mult * b0 + + b1 = 0.0 + b2 = 0.0 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bandpass_biquad( + waveform: Tensor, + sample_rate: int, + central_freq: float, + Q: float = 0.707, + const_skirt_gain: bool = False, +) -> Tensor: + r"""Design two-pole band-pass filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + const_skirt_gain (bool, optional) : If ``True``, uses a constant skirt gain (peak gain = Q). + If ``False``, uses a constant 0dB peak gain. 
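[Editor's illustration] A minimal sketch of the biquad family's calling convention, using `bandpass_biquad` on white noise; the sample rate, center frequency, and Q value are arbitrary example values.

```python
import torch
from torchaudio.functional import bandpass_biquad

# White-noise test signal: 1 channel x 1 second at 16 kHz.
sample_rate = 16000
waveform = 0.1 * torch.randn(1, sample_rate)

# Keep a band around 1 kHz; Q controls the bandwidth (roughly central_freq / Q).
filtered = bandpass_biquad(waveform, sample_rate, central_freq=1000.0, Q=0.707)
print(filtered.shape)  # same shape as the input: (1, 16000)
```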
(Default: ``False``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + temp = torch.sin(w0) / 2 if const_skirt_gain else alpha + b0 = temp + b1 = 0.0 + b2 = -temp + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bandreject_biquad(waveform: Tensor, sample_rate: int, central_freq: float, Q: float = 0.707) -> Tensor: + r"""Design two-pole band-reject filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + central_freq (float or torch.Tensor): central frequency (in Hz) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + b0 = 1.0 + b1 = -2 * torch.cos(w0) + b2 = 1.0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def bass_biquad( + waveform: Tensor, + sample_rate: int, + gain: float, + central_freq: float = 100, + Q: float = 0.707, +) -> Tensor: + r"""Design a bass tone-control effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB. + central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``100``) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). 
+ + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + A = torch.exp(gain / 40 * math.log(10)) + + temp1 = 2 * torch.sqrt(A) * alpha + temp2 = (A - 1) * torch.cos(w0) + temp3 = (A + 1) * torch.cos(w0) + + b0 = A * ((A + 1) - temp2 + temp1) + b1 = 2 * A * ((A - 1) - temp3) + b2 = A * ((A + 1) - temp2 - temp1) + a0 = (A + 1) + temp2 + temp1 + a1 = -2 * ((A - 1) + temp3) + a2 = (A + 1) + temp2 - temp1 + + return biquad(waveform, b0 / a0, b1 / a0, b2 / a0, a0 / a0, a1 / a0, a2 / a0) + + +def biquad(waveform: Tensor, b0: float, b1: float, b2: float, a0: float, a1: float, a2: float) -> Tensor: + r"""Perform a biquad filter of input tensor. Initial conditions set to 0. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + b0 (float or torch.Tensor): numerator coefficient of current input, x[n] + b1 (float or torch.Tensor): numerator coefficient of input one time step ago x[n-1] + b2 (float or torch.Tensor): numerator coefficient of input two time steps ago x[n-2] + a0 (float or torch.Tensor): denominator coefficient of current output y[n], typically 1 + a1 (float or torch.Tensor): denominator coefficient of current output y[n-1] + a2 (float or torch.Tensor): denominator coefficient of current output y[n-2] + + Returns: + Tensor: Waveform with dimension of `(..., time)` + + Reference: + - https://en.wikipedia.org/wiki/Digital_biquad_filter + """ + + device = waveform.device + dtype = waveform.dtype + + b0 = torch.as_tensor(b0, dtype=dtype, device=device).view(1) + b1 = torch.as_tensor(b1, dtype=dtype, device=device).view(1) + b2 = torch.as_tensor(b2, dtype=dtype, device=device).view(1) + a0 = torch.as_tensor(a0, dtype=dtype, device=device).view(1) + a1 = torch.as_tensor(a1, dtype=dtype, device=device).view(1) + a2 = torch.as_tensor(a2, dtype=dtype, device=device).view(1) + + output_waveform = lfilter( + waveform, + torch.cat([a0, a1, a2]), + torch.cat([b0, b1, b2]), + ) + return output_waveform + + +def contrast(waveform: Tensor, enhancement_amount: float = 75.0) -> Tensor: + r"""Apply contrast effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Comparable with compression, this effect modifies an audio signal to make it sound louder + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + enhancement_amount (float, optional): controls the amount of the enhancement + Allowed range of values for enhancement_amount : 0-100 + Note that enhancement_amount = 0 still gives a significant contrast enhancement + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + + if not 0 <= enhancement_amount <= 100: + raise ValueError("Allowed range of values for enhancement_amount : 0-100") + + contrast = enhancement_amount / 750.0 + + temp1 = waveform * (math.pi / 2) + temp2 = contrast * torch.sin(temp1 * 4) + output_waveform = torch.sin(temp1 + temp2) + + return output_waveform + + +def dcshift(waveform: Tensor, shift: float, limiter_gain: Optional[float] = None) -> Tensor: + r"""Apply a DC shift to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + This can be useful to remove a DC offset + (caused perhaps by a hardware problem in the recording chain) from the audio + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + shift (float): indicates the amount to shift the audio + Allowed range of values for shift : -2.0 to +2.0 + limiter_gain (float of None, optional): It is used only on peaks to prevent clipping + It should have a value much less than 1 (e.g. 0.05 or 0.02) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + output_waveform = waveform + limiter_threshold = 0.0 + + if limiter_gain is not None: + limiter_threshold = 1.0 - (abs(shift) - limiter_gain) + + # Note: + # the following index-based update breaks auto-grad support + if limiter_gain is not None and shift > 0: + mask = waveform > limiter_threshold + temp = (waveform[mask] - limiter_threshold) * limiter_gain / (1 - limiter_threshold) + output_waveform[mask] = (temp + limiter_threshold + shift).clamp(max=limiter_threshold) + output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1) + elif limiter_gain is not None and shift < 0: + mask = waveform < -limiter_threshold + temp = (waveform[mask] + limiter_threshold) * limiter_gain / (1 - limiter_threshold) + output_waveform[mask] = (temp - limiter_threshold + shift).clamp(min=-limiter_threshold) + output_waveform[~mask] = (waveform[~mask] + shift).clamp(min=-1, max=1) + else: + output_waveform = (waveform + shift).clamp(min=-1, max=1) + + return output_waveform + + +def deemph_biquad(waveform: Tensor, sample_rate: int) -> Tensor: + r"""Apply ISO 908 CD de-emphasis (shelving) IIR filter. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, Allowed sample rate ``44100`` or ``48000`` + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + + if sample_rate == 44100: + central_freq = 5283 + width_slope = 0.4845 + gain = -9.477 + elif sample_rate == 48000: + central_freq = 5356 + width_slope = 0.479 + gain = -9.62 + else: + raise ValueError("Sample rate must be 44100 (audio-CD) or 48000 (DAT)") + + w0 = 2 * math.pi * central_freq / sample_rate + A = math.exp(gain / 40.0 * math.log(10)) + alpha = math.sin(w0) / 2 * math.sqrt((A + 1 / A) * (1 / width_slope - 1) + 2) + + temp1 = 2 * math.sqrt(A) * alpha + temp2 = (A - 1) * math.cos(w0) + temp3 = (A + 1) * math.cos(w0) + + b0 = A * ((A + 1) + temp2 + temp1) + b1 = -2 * A * ((A - 1) + temp3) + b2 = A * ((A + 1) + temp2 - temp1) + a0 = (A + 1) - temp2 + temp1 + a1 = 2 * ((A - 1) - temp3) + a2 = (A + 1) - temp2 - temp1 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _add_noise_shaping(dithered_waveform: Tensor, waveform: Tensor) -> Tensor: + r"""Noise shaping is calculated by error: + error[n] = dithered[n] - original[n] + noise_shaped_waveform[n] = dithered[n] + error[n-1] + """ + wf_shape = waveform.size() + waveform = waveform.reshape(-1, wf_shape[-1]) + + dithered_shape = dithered_waveform.size() + dithered_waveform = dithered_waveform.reshape(-1, dithered_shape[-1]) + + error = dithered_waveform - waveform + + # add error[n-1] to dithered_waveform[n], so offset the error by 1 index + zeros = torch.zeros(1, dtype=error.dtype, device=error.device) + for index in range(error.size()[0]): + err = error[index] + error_offset = torch.cat((zeros, err)) + error[index] = error_offset[: waveform.size()[1]] + + noise_shaped = dithered_waveform + error + return noise_shaped.reshape(dithered_shape[:-1] + noise_shaped.shape[-1:]) + + +def _apply_probability_distribution(waveform: Tensor, density_function: str = "TPDF") -> Tensor: + r"""Apply a probability distribution function on a waveform. + + Triangular probability density function (TPDF) dither noise has a + triangular distribution; values in the center of the range have a higher + probability of occurring. + + Rectangular probability density function (RPDF) dither noise has a + uniform distribution; any value in the specified range has the same + probability of occurring. + + Gaussian probability density function (GPDF) has a normal distribution. + The relationship of probabilities of results follows a bell-shaped, + or Gaussian curve, typical of dither generated by analog sources. 
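[Editor's illustration] The helper above is normally reached through the public `dither` function; a minimal sketch with a quiet sine tone, where the 440 Hz frequency and the choice of TPDF with noise shaping are illustrative assumptions.

```python
import torch
from torchaudio.functional import dither

# A quiet sine tone that will be re-quantized internally at 16-bit resolution.
sample_rate = 16000
t = torch.arange(sample_rate) / sample_rate
waveform = 0.01 * torch.sin(2 * torch.pi * 440 * t).unsqueeze(0)

# TPDF dither with noise shaping pushes quantization error away from the signal band.
dithered = dither(waveform, density_function="TPDF", noise_shaping=True)
print(dithered.shape)  # (1, 16000)
```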
+ Args: + waveform (Tensor): Tensor of audio of dimension (..., time) + density_function (str, optional): The density function of a + continuous random variable (Default: ``"TPDF"``) + Options: Triangular Probability Density Function - `TPDF` + Rectangular Probability Density Function - `RPDF` + Gaussian Probability Density Function - `GPDF` + Returns: + Tensor: waveform dithered with TPDF + """ + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + channel_size = waveform.size()[0] - 1 + time_size = waveform.size()[-1] - 1 + + random_channel = ( + int( + torch.randint( + channel_size, + [ + 1, + ], + ).item() + ) + if channel_size > 0 + else 0 + ) + random_time = ( + int( + torch.randint( + time_size, + [ + 1, + ], + ).item() + ) + if time_size > 0 + else 0 + ) + + number_of_bits = 16 + up_scaling = 2 ** (number_of_bits - 1) - 2 + signal_scaled = waveform * up_scaling + down_scaling = 2 ** (number_of_bits - 1) + + signal_scaled_dis = waveform + if density_function == "RPDF": + RPDF = waveform[random_channel][random_time] - 0.5 + + signal_scaled_dis = signal_scaled + RPDF + elif density_function == "GPDF": + # TODO Replace by distribution code once + # https://github.com/pytorch/pytorch/issues/29843 is resolved + # gaussian = torch.distributions.normal.Normal(torch.mean(waveform, -1), 1).sample() + + num_rand_variables = 6 + + gaussian = waveform[random_channel][random_time] + for ws in num_rand_variables * [time_size]: + rand_chan = int( + torch.randint( + channel_size, + [ + 1, + ], + ).item() + ) + gaussian += waveform[rand_chan][ + int( + torch.randint( + ws, + [ + 1, + ], + ).item() + ) + ] + + signal_scaled_dis = signal_scaled + gaussian + else: + # dtype needed for https://github.com/pytorch/pytorch/issues/32358 + TPDF = torch.bartlett_window(time_size + 1, dtype=signal_scaled.dtype, device=signal_scaled.device) + TPDF = TPDF.repeat((channel_size + 1), 1) + signal_scaled_dis = signal_scaled + TPDF + + quantised_signal_scaled = torch.round(signal_scaled_dis) + quantised_signal = quantised_signal_scaled / down_scaling + + # unpack batch + return quantised_signal.reshape(shape[:-1] + quantised_signal.shape[-1:]) + + +def dither(waveform: Tensor, density_function: str = "TPDF", noise_shaping: bool = False) -> Tensor: + r"""Apply dither + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Dither increases the perceived dynamic range of audio stored at a + particular bit-depth by eliminating nonlinear truncation distortion + (i.e. adding minimally perceived noise to mask distortion caused by quantization). + + Args: + waveform (Tensor): Tensor of audio of dimension (..., time) + density_function (str, optional): + The density function of a continuous random variable. One of + ``"TPDF"`` (Triangular Probability Density Function), + ``"RPDF"`` (Rectangular Probability Density Function) or + ``"GPDF"`` (Gaussian Probability Density Function) (Default: ``"TPDF"``). + noise_shaping (bool, optional): a filtering process that shapes the spectral + energy of quantisation error (Default: ``False``) + + Returns: + Tensor: waveform dithered + """ + dithered = _apply_probability_distribution(waveform, density_function=density_function) + + if noise_shaping: + return _add_noise_shaping(dithered, waveform) + else: + return dithered + + +def equalizer_biquad( + waveform: Tensor, + sample_rate: int, + center_freq: float, + gain: float, + Q: float = 0.707, +) -> Tensor: + r"""Design biquad peaking equalizer filter and perform filtering. 
Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + center_freq (float): filter's central frequency + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + center_freq = torch.as_tensor(center_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * center_freq / sample_rate + A = torch.exp(gain / 40.0 * math.log(10)) + alpha = torch.sin(w0) / 2 / Q + + b0 = 1 + alpha * A + b1 = -2 * torch.cos(w0) + b2 = 1 - alpha * A + a0 = 1 + alpha / A + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha / A + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def filtfilt( + waveform: Tensor, + a_coeffs: Tensor, + b_coeffs: Tensor, + clamp: bool = True, +) -> Tensor: + r"""Apply an IIR filter forward and backward to a waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Inspired by https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.filtfilt.html + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1. + a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delay coefficients are first, e.g. ``[a0, a1, a2, ...]``. + Must be same size as b_coeffs (pad with 0's as necessary). + b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delay coefficients are first, e.g. ``[b0, b1, b2, ...]``. + Must be same size as a_coeffs (pad with 0's as necessary). + clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) + + Returns: + Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs`` + are 2D Tensors, or `(..., time)` otherwise. + """ + forward_filtered = lfilter(waveform, a_coeffs, b_coeffs, clamp=False, batching=True) + backward_filtered = lfilter( + forward_filtered.flip(-1), + a_coeffs, + b_coeffs, + clamp=clamp, + batching=True, + ).flip(-1) + return backward_filtered + + +def flanger( + waveform: Tensor, + sample_rate: int, + delay: float = 0.0, + depth: float = 2.0, + regen: float = 0.0, + width: float = 71.0, + speed: float = 0.5, + phase: float = 25.0, + modulation: str = "sinusoidal", + interpolation: str = "linear", +) -> Tensor: + r"""Apply a flanger effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., channel, time)` . + Max 4 channels allowed + sample_rate (int): sampling rate of the waveform, e.g. 
44100 (Hz) + delay (float, optional): desired delay in milliseconds(ms) + Allowed range of values are 0 to 30 + depth (float, optional): desired delay depth in milliseconds(ms) + Allowed range of values are 0 to 10 + regen (float, optional): desired regen(feedback gain) in dB + Allowed range of values are -95 to 95 + width (float, optional): desired width(delay gain) in dB + Allowed range of values are 0 to 100 + speed (float, optional): modulation speed in Hz + Allowed range of values are 0.1 to 10 + phase (float, optional): percentage phase-shift for multi-channel + Allowed range of values are 0 to 100 + modulation (str, optional): Use either "sinusoidal" or "triangular" modulation. (Default: ``sinusoidal``) + interpolation (str, optional): Use either "linear" or "quadratic" for delay-line interpolation. + (Default: ``linear``) + + Returns: + Tensor: Waveform of dimension of `(..., channel, time)` + + Reference: + - http://sox.sourceforge.net/sox.html + + - Scott Lehman, `Effects Explained`_, + + .. _Effects Explained: + https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html + """ + + if modulation not in ("sinusoidal", "triangular"): + raise ValueError('Only "sinusoidal" or "triangular" modulation allowed') + + if interpolation not in ("linear", "quadratic"): + raise ValueError('Only "linear" or "quadratic" interpolation allowed') + + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + if actual_shape[-2] > 4: + raise ValueError("Max 4 channels allowed") + + # convert to 3D (batch, channels, time) + waveform = waveform.view(-1, actual_shape[-2], actual_shape[-1]) + + # Scaling + feedback_gain = regen / 100 + delay_gain = width / 100 + channel_phase = phase / 100 + delay_min = delay / 1000 + delay_depth = depth / 1000 + + n_channels = waveform.shape[-2] + + if modulation == "sinusoidal": + wave_type = "SINE" + else: + wave_type = "TRIANGLE" + + # Balance output: + in_gain = 1.0 / (1 + delay_gain) + delay_gain = delay_gain / (1 + delay_gain) + + # Balance feedback loop: + delay_gain = delay_gain * (1 - abs(feedback_gain)) + + delay_buf_length = int((delay_min + delay_depth) * sample_rate + 0.5) + delay_buf_length = delay_buf_length + 2 + + delay_bufs = torch.zeros(waveform.shape[0], n_channels, delay_buf_length, dtype=dtype, device=device) + delay_last = torch.zeros(waveform.shape[0], n_channels, dtype=dtype, device=device) + + lfo_length = int(sample_rate / speed) + + table_min = math.floor(delay_min * sample_rate + 0.5) + table_max = delay_buf_length - 2.0 + + lfo = _generate_wave_table( + wave_type=wave_type, + data_type="FLOAT", + table_size=lfo_length, + min=float(table_min), + max=float(table_max), + phase=3 * math.pi / 2, + device=device, + ) + + output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device) + + delay_buf_pos = 0 + lfo_pos = 0 + channel_idxs = torch.arange(0, n_channels, device=device) + + for i in range(waveform.shape[-1]): + + delay_buf_pos = (delay_buf_pos + delay_buf_length - 1) % delay_buf_length + + cur_channel_phase = (channel_idxs * lfo_length * channel_phase + 0.5).to(torch.int64) + delay_tensor = lfo[(lfo_pos + cur_channel_phase) % lfo_length] + frac_delay = torch.frac(delay_tensor) + delay_tensor = torch.floor(delay_tensor) + + int_delay = delay_tensor.to(torch.int64) + + temp = waveform[:, :, i] + + delay_bufs[:, :, delay_buf_pos] = temp + delay_last * feedback_gain + + delayed_0 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + 
int_delay = int_delay + 1 + + delayed_1 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + int_delay = int_delay + 1 + + if interpolation == "linear": + delayed = delayed_0 + (delayed_1 - delayed_0) * frac_delay + else: + delayed_2 = delay_bufs[:, channel_idxs, (delay_buf_pos + int_delay) % delay_buf_length] + + int_delay = int_delay + 1 + + delayed_2 = delayed_2 - delayed_0 + delayed_1 = delayed_1 - delayed_0 + a = delayed_2 * 0.5 - delayed_1 + b = delayed_1 * 2 - delayed_2 * 0.5 + + delayed = delayed_0 + (a * frac_delay + b) * frac_delay + + delay_last = delayed + output_waveform[:, :, i] = waveform[:, :, i] * in_gain + delayed * delay_gain + + lfo_pos = (lfo_pos + 1) % lfo_length + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def gain(waveform: Tensor, gain_db: float = 1.0) -> Tensor: + r"""Apply amplification or attenuation to the whole waveform. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension (..., time). + gain_db (float, optional) Gain adjustment in decibels (dB) (Default: ``1.0``). + + Returns: + Tensor: the whole waveform amplified by gain_db. + """ + if gain_db == 0: + return waveform + + ratio = 10 ** (gain_db / 20) + + return waveform * ratio + + +def highpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: + r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + cutoff_freq (float or torch.Tensor): filter cutoff frequency + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * cutoff_freq / sample_rate + alpha = torch.sin(w0) / 2.0 / Q + + b0 = (1 + torch.cos(w0)) / 2 + b1 = -1 - torch.cos(w0) + b2 = b0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor): + n_order = a_coeffs_flipped.size(1) + a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2) + for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)): + windowed_output_signal = padded_output_waveform[:, :, i_sample : i_sample + n_order] + o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t() + padded_output_waveform[:, :, i_sample + n_order - 1] = o0 + + +if _IS_TORCHAUDIO_EXT_AVAILABLE: + _lfilter_core_cpu_loop = torch.ops.torchaudio._lfilter_core_loop +else: + _lfilter_core_cpu_loop = _lfilter_core_generic_loop + + +def _lfilter_core( + waveform: Tensor, + a_coeffs: Tensor, + b_coeffs: Tensor, +) -> Tensor: + + if a_coeffs.size() != b_coeffs.size(): + raise ValueError( + "Expected coeffs to be the same size." + f"Found a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}" + ) + if waveform.ndim != 3: + raise ValueError(f"Expected waveform to be 3 dimensional. 
Found: {waveform.ndim}") + if not (waveform.device == a_coeffs.device == b_coeffs.device): + raise ValueError( + "Expected waveform and coeffs to be on the same device." + f"Found: waveform device:{waveform.device}, a_coeffs device: {a_coeffs.device}, " + f"b_coeffs device: {b_coeffs.device}" + ) + + n_batch, n_channel, n_sample = waveform.size() + n_order = a_coeffs.size(1) + if n_order <= 0: + raise ValueError(f"Expected n_order to be positive. Found: {n_order}") + + # Pad the input and create output + + padded_waveform = torch.nn.functional.pad(waveform, [n_order - 1, 0]) + padded_output_waveform = torch.zeros_like(padded_waveform) + + # Set up the coefficients matrix + # Flip coefficients' order + a_coeffs_flipped = a_coeffs.flip(1) + b_coeffs_flipped = b_coeffs.flip(1) + + # calculate windowed_input_signal in parallel using convolution + input_signal_windows = torch.nn.functional.conv1d(padded_waveform, b_coeffs_flipped.unsqueeze(1), groups=n_channel) + + input_signal_windows.div_(a_coeffs[:, :1]) + a_coeffs_flipped.div_(a_coeffs[:, :1]) + + if ( + input_signal_windows.device == torch.device("cpu") + and a_coeffs_flipped.device == torch.device("cpu") + and padded_output_waveform.device == torch.device("cpu") + ): + _lfilter_core_cpu_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform) + else: + _lfilter_core_generic_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform) + + output = padded_output_waveform[:, :, n_order - 1 :] + return output + + +if _IS_TORCHAUDIO_EXT_AVAILABLE: + _lfilter = torch.ops.torchaudio._lfilter +else: + _lfilter = _lfilter_core + + +def lfilter(waveform: Tensor, a_coeffs: Tensor, b_coeffs: Tensor, clamp: bool = True, batching: bool = True) -> Tensor: + r"""Perform an IIR filter by evaluating difference equation, using differentiable implementation + developed independently by *Yu et al.* :cite:`ismir_YuF23` and *Forgione et al.* :cite:`forgione2021dynonet`. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + To avoid numerical problems, small filter order is preferred. + Using double precision could also minimize numerical precision errors. + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)`. Must be normalized to -1 to 1. + a_coeffs (Tensor): denominator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. + Must be same size as b_coeffs (pad with 0's as necessary). + b_coeffs (Tensor): numerator coefficients of difference equation of dimension of either + 1D with shape `(num_order + 1)` or 2D with shape `(num_filters, num_order + 1)`. + Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. + Must be same size as a_coeffs (pad with 0's as necessary). + clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) + batching (bool, optional): Effective only when coefficients are 2D. If ``True``, then waveform should be at + least 2D, and the size of second axis from last should equals to ``num_filters``. + The output can be expressed as ``output[..., i, :] = lfilter(waveform[..., i, :], + a_coeffs[i], b_coeffs[i], clamp=clamp, batching=False)``. (Default: ``True``) + + Returns: + Tensor: Waveform with dimension of either `(..., num_filters, time)` if ``a_coeffs`` and ``b_coeffs`` + are 2D Tensors, or `(..., time)` otherwise. 
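[Editor's illustration] A minimal sketch of `lfilter` (and its zero-phase counterpart `filtfilt`) with a hand-written one-pole low-pass; the coefficient values are arbitrary examples, padded so `a_coeffs` and `b_coeffs` have the same length as required.

```python
import torch
from torchaudio.functional import filtfilt, lfilter

waveform = 0.1 * torch.randn(1, 16000)

# A simple one-pole low-pass: y[n] = 0.1 * x[n] + 0.9 * y[n-1]
a_coeffs = torch.tensor([1.0, -0.9])
b_coeffs = torch.tensor([0.1, 0.0])

causal = lfilter(waveform, a_coeffs, b_coeffs)        # forward pass only
zero_phase = filtfilt(waveform, a_coeffs, b_coeffs)   # forward + backward, no phase shift
print(causal.shape, zero_phase.shape)
```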
+ """ + if a_coeffs.size() != b_coeffs.size(): + raise ValueError( + "Expected coeffs to be the same size." + f"Found: a_coeffs size: {a_coeffs.size()}, b_coeffs size: {b_coeffs.size()}" + ) + if a_coeffs.ndim > 2: + raise ValueError(f"Expected coeffs to have greater than 1 dimension. Found: {a_coeffs.ndim}") + + if a_coeffs.ndim > 1: + if batching: + if waveform.ndim <= 0: + raise ValueError("Expected waveform to have a positive number of dimensions." f"Found: {waveform.ndim}") + if waveform.shape[-2] != a_coeffs.shape[0]: + raise ValueError( + "Expected number of batches in waveform and coeffs to be the same." + f"Found: coeffs batches: {a_coeffs.shape[0]}, waveform batches: {waveform.shape[-2]}" + ) + else: + waveform = torch.stack([waveform] * a_coeffs.shape[0], -2) + else: + a_coeffs = a_coeffs.unsqueeze(0) + b_coeffs = b_coeffs.unsqueeze(0) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, a_coeffs.shape[0], shape[-1]) + output = _lfilter(waveform, a_coeffs, b_coeffs) + + if clamp: + output = torch.clamp(output, min=-1.0, max=1.0) + + # unpack batch + output = output.reshape(shape[:-1] + output.shape[-1:]) + + return output + + +def lowpass_biquad(waveform: Tensor, sample_rate: int, cutoff_freq: float, Q: float = 0.707) -> Tensor: + r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (torch.Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + cutoff_freq (float or torch.Tensor): filter cutoff frequency + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + """ + dtype = waveform.dtype + device = waveform.device + cutoff_freq = torch.as_tensor(cutoff_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + + w0 = 2 * math.pi * cutoff_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + + b0 = (1 - torch.cos(w0)) / 2 + b1 = 1 - torch.cos(w0) + b2 = b0 + a0 = 1 + alpha + a1 = -2 * torch.cos(w0) + a2 = 1 - alpha + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _overdrive_core_loop_generic( + waveform: Tensor, temp: Tensor, last_in: Tensor, last_out: Tensor, output_waveform: Tensor +): + for i in range(waveform.shape[-1]): + last_out = temp[:, i] - last_in + 0.995 * last_out + last_in = temp[:, i] + output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75 + + +if _IS_TORCHAUDIO_EXT_AVAILABLE: + _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop +else: + _overdrive_core_loop_cpu = _overdrive_core_loop_generic + + +def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor: + r"""Apply a overdrive effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + This effect applies a non linear distortion to the audio signal. 
+ + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + gain (float, optional): desired gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 100 + colour (float, optional): controls the amount of even harmonic content in the over-driven output + Allowed range of values are 0 to 100 + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + """ + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + # convert to 2D (..,time) + waveform = waveform.view(-1, actual_shape[-1]) + + gain = _dB2Linear(gain) + colour = colour / 200 + last_in = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device) + last_out = torch.zeros(waveform.shape[:-1], dtype=dtype, device=device) + + temp = waveform * gain + colour + + mask1 = temp < -1 + temp[mask1] = torch.tensor(-2.0 / 3.0, dtype=dtype, device=device) + # Wrapping the constant with Tensor is required for Torchscript + + mask2 = temp > 1 + temp[mask2] = torch.tensor(2.0 / 3.0, dtype=dtype, device=device) + + mask3 = ~mask1 & ~mask2 + temp[mask3] = temp[mask3] - (temp[mask3] ** 3) * (1.0 / 3) + + output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device) + + # Uses CPU optimized loop function if available for CPU device + if device == torch.device("cpu"): + _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform) + else: + _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform) + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def phaser( + waveform: Tensor, + sample_rate: int, + gain_in: float = 0.4, + gain_out: float = 0.74, + delay_ms: float = 3.0, + decay: float = 0.4, + mod_speed: float = 0.5, + sinusoidal: bool = True, +) -> Tensor: + r"""Apply a phasing effect to the audio. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain_in (float, optional): desired input gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 1 + gain_out (float, optional): desired output gain at the boost (or attenuation) in dB + Allowed range of values are 0 to 1e9 + delay_ms (float, optional): desired delay in milliseconds + Allowed range of values are 0 to 5.0 + decay (float, optional): desired decay relative to gain-in + Allowed range of values are 0 to 0.99 + mod_speed (float, optional): modulation speed in Hz + Allowed range of values are 0.1 to 2 + sinusoidal (bool, optional): If ``True``, uses sinusoidal modulation (preferable for multiple instruments) + If ``False``, uses triangular modulation (gives single instruments a sharper phasing effect) + (Default: ``True``) + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - Scott Lehman, `Effects Explained`_. + + .. 
_Effects Explained: + https://web.archive.org/web/20051125072557/http://www.harmony-central.com/Effects/effects-explained.html + """ + actual_shape = waveform.shape + device, dtype = waveform.device, waveform.dtype + + # convert to 2D (channels,time) + waveform = waveform.view(-1, actual_shape[-1]) + + delay_buf_len = int((delay_ms * 0.001 * sample_rate) + 0.5) + delay_buf = torch.zeros(waveform.shape[0], delay_buf_len, dtype=dtype, device=device) + + mod_buf_len = int(sample_rate / mod_speed + 0.5) + + if sinusoidal: + wave_type = "SINE" + else: + wave_type = "TRIANGLE" + + mod_buf = _generate_wave_table( + wave_type=wave_type, + data_type="INT", + table_size=mod_buf_len, + min=1.0, + max=float(delay_buf_len), + phase=math.pi / 2, + device=device, + ) + + delay_pos = 0 + mod_pos = 0 + + output_waveform_pre_gain_list = [] + waveform = waveform * gain_in + delay_buf = delay_buf * decay + waveform_list = [waveform[:, i] for i in range(waveform.size(1))] + delay_buf_list = [delay_buf[:, i] for i in range(delay_buf.size(1))] + mod_buf_list = [mod_buf[i] for i in range(mod_buf.size(0))] + + for i in range(waveform.shape[-1]): + idx = int((delay_pos + mod_buf_list[mod_pos]) % delay_buf_len) + mod_pos = (mod_pos + 1) % mod_buf_len + delay_pos = (delay_pos + 1) % delay_buf_len + temp = (waveform_list[i]) + (delay_buf_list[idx]) + delay_buf_list[delay_pos] = temp * decay + output_waveform_pre_gain_list.append(temp) + + output_waveform = torch.stack(output_waveform_pre_gain_list, dim=1).to(dtype=dtype, device=device) + output_waveform.mul_(gain_out) + + return output_waveform.clamp(min=-1, max=1).view(actual_shape) + + +def riaa_biquad(waveform: Tensor, sample_rate: int) -> Tensor: + r"""Apply RIAA vinyl playback equalization. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz). 
+ Allowed sample rates in Hz : ``44100``,``48000``,``88200``,``96000`` + + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + + if sample_rate == 44100: + zeros = [-0.2014898, 0.9233820] + poles = [0.7083149, 0.9924091] + + elif sample_rate == 48000: + zeros = [-0.1766069, 0.9321590] + poles = [0.7396325, 0.9931330] + + elif sample_rate == 88200: + zeros = [-0.1168735, 0.9648312] + poles = [0.8590646, 0.9964002] + + elif sample_rate == 96000: + zeros = [-0.1141486, 0.9676817] + poles = [0.8699137, 0.9966946] + + else: + raise ValueError("Sample rate must be 44.1k, 48k, 88.2k, or 96k") + + # polynomial coefficients with roots zeros[0] and zeros[1] + b0 = 1.0 + b1 = -(zeros[0] + zeros[1]) + b2 = zeros[0] * zeros[1] + + # polynomial coefficients with roots poles[0] and poles[1] + a0 = 1.0 + a1 = -(poles[0] + poles[1]) + a2 = poles[0] * poles[1] + + # Normalize to 0dB at 1kHz + y = 2 * math.pi * 1000 / sample_rate + b_re = b0 + b1 * math.cos(-y) + b2 * math.cos(-2 * y) + a_re = a0 + a1 * math.cos(-y) + a2 * math.cos(-2 * y) + b_im = b1 * math.sin(-y) + b2 * math.sin(-2 * y) + a_im = a1 * math.sin(-y) + a2 * math.sin(-2 * y) + g = 1 / math.sqrt((b_re**2 + b_im**2) / (a_re**2 + a_im**2)) + + b0 *= g + b1 *= g + b2 *= g + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def treble_biquad( + waveform: Tensor, + sample_rate: int, + gain: float, + central_freq: float = 3000, + Q: float = 0.707, +) -> Tensor: + r"""Design a treble tone-control effect. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): audio waveform of dimension of `(..., time)` + sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) + gain (float or torch.Tensor): desired gain at the boost (or attenuation) in dB. + central_freq (float or torch.Tensor, optional): central frequency (in Hz). (Default: ``3000``) + Q (float or torch.Tensor, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``). 
+ + Returns: + Tensor: Waveform of dimension of `(..., time)` + + Reference: + - http://sox.sourceforge.net/sox.html + - https://www.w3.org/2011/audio/audio-eq-cookbook.html#APF + """ + dtype = waveform.dtype + device = waveform.device + central_freq = torch.as_tensor(central_freq, dtype=dtype, device=device) + Q = torch.as_tensor(Q, dtype=dtype, device=device) + gain = torch.as_tensor(gain, dtype=dtype, device=device) + + w0 = 2 * math.pi * central_freq / sample_rate + alpha = torch.sin(w0) / 2 / Q + A = torch.exp(gain / 40 * math.log(10)) + + temp1 = 2 * torch.sqrt(A) * alpha + temp2 = (A - 1) * torch.cos(w0) + temp3 = (A + 1) * torch.cos(w0) + + b0 = A * ((A + 1) + temp2 + temp1) + b1 = -2 * A * ((A - 1) + temp3) + b2 = A * ((A + 1) + temp2 - temp1) + a0 = (A + 1) - temp2 + temp1 + a1 = 2 * ((A - 1) - temp3) + a2 = (A + 1) - temp2 - temp1 + + return biquad(waveform, b0, b1, b2, a0, a1, a2) + + +def _measure( + measure_len_ws: int, + samples: Tensor, + spectrum: Tensor, + noise_spectrum: Tensor, + spectrum_window: Tensor, + spectrum_start: int, + spectrum_end: int, + cepstrum_window: Tensor, + cepstrum_start: int, + cepstrum_end: int, + noise_reduction_amount: float, + measure_smooth_time_mult: float, + noise_up_time_mult: Tensor, + noise_down_time_mult: Tensor, + boot_count: int, +) -> float: + device = samples.device + + if spectrum.size(-1) != noise_spectrum.size(-1): + raise ValueError( + "Expected spectrum size to match noise spectrum size in final dimension." + f"Found: spectrum size: {spectrum.size()}, noise_spectrum size: {noise_spectrum.size()}" + ) + + dft_len_ws = spectrum.size()[-1] + + dftBuf = torch.zeros(dft_len_ws, device=device) + + dftBuf[:measure_len_ws] = samples * spectrum_window[:measure_len_ws] + + # lsx_safe_rdft((int)p->dft_len_ws, 1, c->dftBuf); + _dftBuf = torch.fft.rfft(dftBuf) + + mult: float = boot_count / (1.0 + boot_count) if boot_count >= 0 else measure_smooth_time_mult + + _d = _dftBuf[spectrum_start:spectrum_end].abs() + spectrum[spectrum_start:spectrum_end].mul_(mult).add_(_d * (1 - mult)) + _d = spectrum[spectrum_start:spectrum_end] ** 2 + + _zeros = torch.zeros(spectrum_end - spectrum_start, device=device) + _mult = ( + _zeros + if boot_count >= 0 + else torch.where( + _d > noise_spectrum[spectrum_start:spectrum_end], + noise_up_time_mult, # if + noise_down_time_mult, # else, + ) + ) + + noise_spectrum[spectrum_start:spectrum_end].mul_(_mult).add_(_d * (1 - _mult)) + _d = torch.sqrt( + torch.max( + _zeros, + _d - noise_reduction_amount * noise_spectrum[spectrum_start:spectrum_end], + ), + ) + + _cepstrum_Buf: Tensor = torch.zeros(dft_len_ws >> 1, device=device) + _cepstrum_Buf[spectrum_start:spectrum_end] = _d * cepstrum_window + _cepstrum_Buf[spectrum_end : dft_len_ws >> 1].zero_() + + # lsx_safe_rdft((int)p->dft_len_ws >> 1, 1, c->dftBuf); + _cepstrum_Buf = torch.fft.rfft(_cepstrum_Buf) + + result: float = float(torch.sum(_cepstrum_Buf[cepstrum_start:cepstrum_end].abs().pow(2))) + result = math.log(result / (cepstrum_end - cepstrum_start)) if result > 0 else -math.inf + return max(0, 21 + result) + + +def vad( + waveform: Tensor, + sample_rate: int, + trigger_level: float = 7.0, + trigger_time: float = 0.25, + search_time: float = 1.0, + allowed_gap: float = 0.25, + pre_trigger_time: float = 0.0, + # Fine-tuning parameters + boot_time: float = 0.35, + noise_up_time: float = 0.1, + noise_down_time: float = 0.01, + noise_reduction_amount: float = 1.35, + measure_freq: float = 20.0, + measure_duration: Optional[float] = None, + measure_smooth_time: 
float = 0.4, + hp_filter_freq: float = 50.0, + lp_filter_freq: float = 6000.0, + hp_lifter_freq: float = 150.0, + lp_lifter_freq: float = 2000.0, +) -> Tensor: + r"""Voice Activity Detector. Similar to SoX implementation. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Attempts to trim silence and quiet background sounds from the ends of recordings of speech. + The algorithm currently uses a simple cepstral power measurement to detect voice, + so may be fooled by other things, especially music. + + The effect can trim only from the front of the audio, + so in order to trim from the back, the reverse effect must also be used. + + Args: + waveform (Tensor): Tensor of audio of dimension `(channels, time)` or `(time)` + Tensor of shape `(channels, time)` is treated as a multi-channel recording + of the same event and the resulting output will be trimmed to the earliest + voice activity in any channel. + sample_rate (int): Sample rate of audio signal. + trigger_level (float, optional): The measurement level used to trigger activity detection. + This may need to be changed depending on the noise level, signal level, + and other characteristics of the input audio. (Default: 7.0) + trigger_time (float, optional): The time constant (in seconds) + used to help ignore short bursts of sound. (Default: 0.25) + search_time (float, optional): The amount of audio (in seconds) + to search for quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 1.0) + allowed_gap (float, optional): The allowed gap (in seconds) between + quieter/shorter bursts of audio to include prior + to the detected trigger point. (Default: 0.25) + pre_trigger_time (float, optional): The amount of audio (in seconds) to preserve + before the trigger point and any found quieter/shorter bursts. (Default: 0.0) + boot_time (float, optional): The algorithm (internally) uses adaptive noise + estimation/reduction in order to detect the start of the wanted audio. + This option sets the time for the initial noise estimate. (Default: 0.35) + noise_up_time (float, optional): Time constant used by the adaptive noise estimator + for when the noise level is increasing. (Default: 0.1) + noise_down_time (float, optional): Time constant used by the adaptive noise estimator + for when the noise level is decreasing. (Default: 0.01) + noise_reduction_amount (float, optional): Amount of noise reduction to use in + the detection algorithm (e.g. 0, 0.5, ...). (Default: 1.35) + measure_freq (float, optional): Frequency of the algorithm's + processing/measurements. (Default: 20.0) + measure_duration (float or None, optional): Measurement duration. + (Default: Twice the measurement period; i.e. with overlap.) + measure_smooth_time (float, optional): Time constant used to smooth + spectral measurements. (Default: 0.4) + hp_filter_freq (float, optional): "Brick-wall" frequency of high-pass filter applied + at the input to the detector algorithm. (Default: 50.0) + lp_filter_freq (float, optional): "Brick-wall" frequency of low-pass filter applied + at the input to the detector algorithm. (Default: 6000.0) + hp_lifter_freq (float, optional): "Brick-wall" frequency of high-pass lifter used + in the detector algorithm. (Default: 150.0) + lp_lifter_freq (float, optional): "Brick-wall" frequency of low-pass lifter used + in the detector algorithm. (Default: 2000.0) + + Returns: + Tensor: Tensor of audio of dimension `(..., time)`.
+ + Reference: + - http://sox.sourceforge.net/sox.html + """ + device = waveform.device + + if waveform.ndim > 2: + warnings.warn( + "Expected input tensor dimension of 1 for single channel" + f" or 2 for multi-channel. Got {waveform.ndim} instead. " + "Batch semantics is not supported. " + "Please refer to https://github.com/pytorch/audio/issues/1348" + " and https://github.com/pytorch/audio/issues/1468." + ) + + measure_duration: float = 2.0 / measure_freq if measure_duration is None else measure_duration + + measure_len_ws = int(sample_rate * measure_duration + 0.5) + measure_len_ns = measure_len_ws + # for (dft_len_ws = 16; dft_len_ws < measure_len_ws; dft_len_ws <<= 1); + dft_len_ws = 16 + while dft_len_ws < measure_len_ws: + dft_len_ws *= 2 + + measure_period_ns = int(sample_rate / measure_freq + 0.5) + measures_len = math.ceil(search_time * measure_freq) + search_pre_trigger_len_ns = measures_len * measure_period_ns + gap_len = int(allowed_gap * measure_freq + 0.5) + + fixed_pre_trigger_len_ns = int(pre_trigger_time * sample_rate + 0.5) + samplesLen_ns = fixed_pre_trigger_len_ns + search_pre_trigger_len_ns + measure_len_ns + + spectrum_window = torch.zeros(measure_len_ws, device=device) + for i in range(measure_len_ws): + # sox.h:741 define SOX_SAMPLE_MIN (sox_sample_t)SOX_INT_MIN(32) + spectrum_window[i] = 2.0 / math.sqrt(float(measure_len_ws)) + # lsx_apply_hann(spectrum_window, (int)measure_len_ws); + spectrum_window *= torch.hann_window(measure_len_ws, device=device, dtype=torch.float) + + spectrum_start: int = int(hp_filter_freq / sample_rate * dft_len_ws + 0.5) + spectrum_start: int = max(spectrum_start, 1) + spectrum_end: int = int(lp_filter_freq / sample_rate * dft_len_ws + 0.5) + spectrum_end: int = min(spectrum_end, dft_len_ws // 2) + + cepstrum_window = torch.zeros(spectrum_end - spectrum_start, device=device) + for i in range(spectrum_end - spectrum_start): + cepstrum_window[i] = 2.0 / math.sqrt(float(spectrum_end) - spectrum_start) + # lsx_apply_hann(cepstrum_window,(int)(spectrum_end - spectrum_start)); + cepstrum_window *= torch.hann_window(spectrum_end - spectrum_start, device=device, dtype=torch.float) + + cepstrum_start = math.ceil(sample_rate * 0.5 / lp_lifter_freq) + cepstrum_end = math.floor(sample_rate * 0.5 / hp_lifter_freq) + cepstrum_end = min(cepstrum_end, dft_len_ws // 4) + + if cepstrum_end <= cepstrum_start: + raise ValueError( + "Expected cepstrum_start to be smaller than cepstrum_end." + f"Found: cepstrum_start: {cepstrum_start}, cepstrum_end: {cepstrum_end}." 
+ ) + + noise_up_time_mult = torch.tensor(math.exp(-1.0 / (noise_up_time * measure_freq)), device=device) + noise_down_time_mult = torch.tensor(math.exp(-1.0 / (noise_down_time * measure_freq)), device=device) + measure_smooth_time_mult = math.exp(-1.0 / (measure_smooth_time * measure_freq)) + trigger_meas_time_mult = math.exp(-1.0 / (trigger_time * measure_freq)) + + boot_count_max = int(boot_time * measure_freq - 0.5) + boot_count = measures_index = flushedLen_ns = 0 + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + n_channels, ilen = waveform.size() + + mean_meas = torch.zeros(n_channels, device=device) + spectrum = torch.zeros(n_channels, dft_len_ws, device=device) + noise_spectrum = torch.zeros(n_channels, dft_len_ws, device=device) + measures = torch.zeros(n_channels, measures_len, device=device) + + has_triggered: bool = False + num_measures_to_flush: int = 0 + + pos = 0 + for pos in range(measure_len_ns, ilen, measure_period_ns): + for i in range(n_channels): + meas: float = _measure( + measure_len_ws=measure_len_ws, + samples=waveform[i, pos - measure_len_ws : pos], + spectrum=spectrum[i], + noise_spectrum=noise_spectrum[i], + spectrum_window=spectrum_window, + spectrum_start=spectrum_start, + spectrum_end=spectrum_end, + cepstrum_window=cepstrum_window, + cepstrum_start=cepstrum_start, + cepstrum_end=cepstrum_end, + noise_reduction_amount=noise_reduction_amount, + measure_smooth_time_mult=measure_smooth_time_mult, + noise_up_time_mult=noise_up_time_mult, + noise_down_time_mult=noise_down_time_mult, + boot_count=boot_count, + ) + measures[i, measures_index] = meas + mean_meas[i] = mean_meas[i] * trigger_meas_time_mult + meas * (1.0 - trigger_meas_time_mult) + + has_triggered = has_triggered or (mean_meas[i] >= trigger_level) + if has_triggered: + n: int = measures_len + k: int = measures_index + jTrigger: int = n + jZero: int = n + j: int = 0 + + for j in range(n): + if (measures[i, k] >= trigger_level) and (j <= jTrigger + gap_len): + jZero = jTrigger = j + elif (measures[i, k] == 0) and (jTrigger >= jZero): + jZero = j + k = (k + n - 1) % n + j = min(j, jZero) + # num_measures_to_flush = range_limit(j, num_measures_to_flush, n); + num_measures_to_flush = min(max(num_measures_to_flush, j), n) + # end if has_triggered + # end for channel + measures_index += 1 + measures_index = measures_index % measures_len + if boot_count >= 0: + boot_count = -1 if boot_count == boot_count_max else boot_count + 1 + + if has_triggered: + flushedLen_ns = (measures_len - num_measures_to_flush) * measure_period_ns + break + # end for window + if not has_triggered: + return waveform[..., :0].view(shape[:-1] + torch.Size([0])) + + res = waveform[:, pos - samplesLen_ns + flushedLen_ns :] + # unpack batch + return res.view(shape[:-1] + res.shape[-1:]) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/functional/functional.py b/.venv/lib/python3.11/site-packages/torchaudio/functional/functional.py new file mode 100644 index 0000000000000000000000000000000000000000..af34e707e552d6eae403278743b406988e16e9db --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/functional/functional.py @@ -0,0 +1,2535 @@ +# -*- coding: utf-8 -*- + +import math +import tempfile +import warnings +from collections.abc import Sequence +from typing import List, Optional, Tuple, Union + +import torch +import torchaudio +from torch import Tensor +from torchaudio._internal.module_utils import deprecated + +from .filtering import highpass_biquad, treble_biquad + +__all__ = [ 
+ "spectrogram", + "inverse_spectrogram", + "griffinlim", + "amplitude_to_DB", + "DB_to_amplitude", + "compute_deltas", + "melscale_fbanks", + "linear_fbanks", + "create_dct", + "compute_deltas", + "detect_pitch_frequency", + "DB_to_amplitude", + "mu_law_encoding", + "mu_law_decoding", + "phase_vocoder", + "mask_along_axis", + "mask_along_axis_iid", + "sliding_window_cmn", + "spectral_centroid", + "apply_codec", + "resample", + "edit_distance", + "loudness", + "pitch_shift", + "rnnt_loss", + "psd", + "mvdr_weights_souden", + "mvdr_weights_rtf", + "rtf_evd", + "rtf_power", + "apply_beamforming", + "fftconvolve", + "convolve", + "add_noise", + "speed", + "preemphasis", + "deemphasis", +] + + +def spectrogram( + waveform: Tensor, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: Optional[float], + normalized: Union[bool, str], + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + return_complex: Optional[bool] = None, +) -> Tensor: + r"""Create a spectrogram or a batch of spectrograms from a raw audio signal. + The spectrogram can be either magnitude-only or complex. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + power (float or None): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. + If None, then the complex spectrum is returned instead. + normalized (bool or str): Whether to normalize by magnitude after stft. If input is str, choices are + ``"window"`` and ``"frame_length"``, if specific normalization type is desirable. ``True`` maps to + ``"window"``. When normalized on ``"window"``, waveform is normalized upon the window's L2 energy. If + normalized on ``"frame_length"``, waveform is normalized by dividing by + :math:`(\text{frame\_length})^{0.5}`. + center (bool, optional): whether to pad :attr:`waveform` on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. Default: ``"reflect"`` + onesided (bool, optional): controls whether to return half of results to + avoid redundancy. Default: ``True`` + return_complex (bool, optional): + Deprecated and not used. + + Returns: + Tensor: Dimension `(..., freq, time)`, freq is + ``n_fft // 2 + 1`` and ``n_fft`` is the number of + Fourier bins, and time is the number of window hops (n_frame). + """ + if return_complex is not None: + warnings.warn( + "`return_complex` argument is now deprecated and is not effective." + "`torchaudio.functional.spectrogram(power=None)` always returns a tensor with " + "complex dtype. Please remove the argument in the function call." 
+ ) + + if pad > 0: + # TODO add "with torch.no_grad():" back when JIT supports it + waveform = torch.nn.functional.pad(waveform, (pad, pad), "constant") + + frame_length_norm, window_norm = _get_spec_norms(normalized) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + pad_mode=pad_mode, + normalized=frame_length_norm, + onesided=onesided, + return_complex=True, + ) + + # unpack batch + spec_f = spec_f.reshape(shape[:-1] + spec_f.shape[-2:]) + + if window_norm: + spec_f /= window.pow(2.0).sum().sqrt() + if power is not None: + if power == 1.0: + return spec_f.abs() + return spec_f.abs().pow(power) + return spec_f + + +def inverse_spectrogram( + spectrogram: Tensor, + length: Optional[int], + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + normalized: Union[bool, str], + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, +) -> Tensor: + r"""Create an inverse spectrogram or a batch of inverse spectrograms from the provided + complex-valued spectrogram. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + spectrogram (Tensor): Complex tensor of audio of dimension (..., freq, time). + length (int or None): The output length of the waveform. + pad (int): Two sided padding of signal. It is only effective when ``length`` is provided. + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + normalized (bool or str): Whether the stft output was normalized by magnitude. If input is str, choices are + ``"window"`` and ``"frame_length"``, dependent on normalization mode. ``True`` maps to + ``"window"``. + center (bool, optional): whether the waveform was padded on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}`. + Default: ``True`` + pad_mode (string, optional): controls the padding method used when + :attr:`center` is ``True``. This parameter is provided for compatibility with the + spectrogram function and is not used. Default: ``"reflect"`` + onesided (bool, optional): controls whether spectrogram was done in onesided mode. + Default: ``True`` + + Returns: + Tensor: Dimension `(..., time)`. Least squares estimation of the original signal. 
+ """ + + frame_length_norm, window_norm = _get_spec_norms(normalized) + + if not spectrogram.is_complex(): + raise ValueError("Expected `spectrogram` to be complex dtype.") + + if window_norm: + spectrogram = spectrogram * window.pow(2.0).sum().sqrt() + + # pack batch + shape = spectrogram.size() + spectrogram = spectrogram.reshape(-1, shape[-2], shape[-1]) + + # default values are consistent with librosa.core.spectrum._spectrogram + waveform = torch.istft( + input=spectrogram, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=center, + normalized=frame_length_norm, + onesided=onesided, + length=length + 2 * pad if length is not None else None, + return_complex=False, + ) + + if length is not None and pad > 0: + # remove padding from front and back + waveform = waveform[:, pad:-pad] + + # unpack batch + waveform = waveform.reshape(shape[:-2] + waveform.shape[-1:]) + + return waveform + + +def _get_spec_norms(normalized: Union[str, bool]): + frame_length_norm, window_norm = False, False + if torch.jit.isinstance(normalized, str): + if normalized not in ["frame_length", "window"]: + raise ValueError("Invalid normalized parameter: {}".format(normalized)) + if normalized == "frame_length": + frame_length_norm = True + elif normalized == "window": + window_norm = True + elif torch.jit.isinstance(normalized, bool): + if normalized: + window_norm = True + else: + raise TypeError("Input type not supported") + return frame_length_norm, window_norm + + +def _get_complex_dtype(real_dtype: torch.dtype): + if real_dtype == torch.double: + return torch.cdouble + if real_dtype == torch.float: + return torch.cfloat + if real_dtype == torch.half: + return torch.complex32 + raise ValueError(f"Unexpected dtype {real_dtype}") + + +def griffinlim( + specgram: Tensor, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, + power: float, + n_iter: int, + momentum: float, + length: Optional[int], + rand_init: bool, +) -> Tensor: + r"""Compute waveform from a linear scale magnitude spectrogram using the Griffin-Lim transformation. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Implementation ported from + *librosa* :cite:`brian_mcfee-proc-scipy-2015`, *A fast Griffin-Lim algorithm* :cite:`6701851` + and *Signal estimation from modified short-time Fourier transform* :cite:`1172092`. + + Args: + specgram (Tensor): A magnitude-only STFT spectrogram of dimension `(..., freq, frames)` + where freq is ``n_fft // 2 + 1``. + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT, creates ``n_fft // 2 + 1`` bins + hop_length (int): Length of hop between STFT windows. ( + Default: ``win_length // 2``) + win_length (int): Window size. (Default: ``n_fft``) + power (float): Exponent for the magnitude spectrogram, + (must be > 0) e.g., 1 for magnitude, 2 for power, etc. + n_iter (int): Number of iteration for phase recovery process. + momentum (float): The momentum parameter for fast Griffin-Lim. + Setting this to 0 recovers the original Griffin-Lim method. + Values near 1 can lead to faster convergence, but above 1 may not converge. + length (int or None): Array length of the expected output. + rand_init (bool): Initializes phase randomly if True, to zero otherwise. + + Returns: + Tensor: waveform of `(..., time)`, where time equals the ``length`` parameter if given. + """ + if not 0 <= momentum < 1: + raise ValueError("momentum must be in range [0, 1). 
Found: {}".format(momentum)) + + momentum = momentum / (1 + momentum) + + # pack batch + shape = specgram.size() + specgram = specgram.reshape([-1] + list(shape[-2:])) + + specgram = specgram.pow(1 / power) + + # initialize the phase + if rand_init: + angles = torch.rand(specgram.size(), dtype=_get_complex_dtype(specgram.dtype), device=specgram.device) + else: + angles = torch.full(specgram.size(), 1, dtype=_get_complex_dtype(specgram.dtype), device=specgram.device) + + # And initialize the previous iterate to 0 + tprev = torch.tensor(0.0, dtype=specgram.dtype, device=specgram.device) + for _ in range(n_iter): + # Invert with our current estimate of the phases + inverse = torch.istft( + specgram * angles, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=length + ) + + # Rebuild the spectrogram + rebuilt = torch.stft( + input=inverse, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=True, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + + # Update our phase estimates + angles = rebuilt + if momentum: + angles = angles - tprev.mul_(momentum) + angles = angles.div(angles.abs().add(1e-16)) + + # Store the previous iterate + tprev = rebuilt + + # Return the final phase estimates + waveform = torch.istft( + specgram * angles, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=length + ) + + # unpack batch + waveform = waveform.reshape(shape[:-2] + waveform.shape[-1:]) + + return waveform + + +def amplitude_to_DB( + x: Tensor, multiplier: float, amin: float, db_multiplier: float, top_db: Optional[float] = None +) -> Tensor: + r"""Turn a spectrogram from the power/amplitude scale to the decibel scale. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The output of each tensor in a batch depends on the maximum value of that tensor, + and so may return different values for an audio clip split into snippets vs. a full clip. + + Args: + + x (Tensor): Input spectrogram(s) before being converted to decibel scale. + The expected shapes are ``(freq, time)``, ``(channel, freq, time)`` or + ``(..., batch, channel, freq, time)``. + + .. note:: + + When ``top_db`` is specified, cut-off values are computed for each audio + in the batch. Therefore if the input shape is 4D (or larger), different + cut-off values are used for audio data in the batch. + If the input shape is 2D or 3D, a single cutoff value is used. + + multiplier (float): Use 10. for power and 20. for amplitude + amin (float): Number to clamp ``x`` + db_multiplier (float): Log10(max(reference value and amin)) + top_db (float or None, optional): Minimum negative cut-off in decibels. A reasonable number + is 80. (Default: ``None``) + + Returns: + Tensor: Output tensor in decibel scale + """ + x_db = multiplier * torch.log10(torch.clamp(x, min=amin)) + x_db -= multiplier * db_multiplier + + if top_db is not None: + # Expand batch + shape = x_db.size() + packed_channels = shape[-3] if x_db.dim() > 2 else 1 + x_db = x_db.reshape(-1, packed_channels, shape[-2], shape[-1]) + + x_db = torch.max(x_db, (x_db.amax(dim=(-3, -2, -1)) - top_db).view(-1, 1, 1, 1)) + + # Repack batch + x_db = x_db.reshape(shape) + + return x_db + + +def DB_to_amplitude(x: Tensor, ref: float, power: float) -> Tensor: + r"""Turn a tensor from the decibel scale to the power/amplitude scale. + + .. devices:: CPU CUDA + + .. 
properties:: TorchScript + + Args: + x (Tensor): Input tensor before being converted to power/amplitude scale. + ref (float): Reference which the output will be scaled by. + power (float): If power equals 1, will compute DB to power. If 0.5, will compute DB to amplitude. + + Returns: + Tensor: Output tensor in power/amplitude scale. + """ + return ref * torch.pow(torch.pow(10.0, 0.1 * x), power) + + +def _hz_to_mel(freq: float, mel_scale: str = "htk") -> float: + r"""Convert Hz to Mels. + + Args: + freqs (float): Frequencies in Hz + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + mels (float): Frequency in Mels + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 2595.0 * math.log10(1.0 + (freq / 700.0)) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz) / logstep + + return mels + + +def _mel_to_hz(mels: Tensor, mel_scale: str = "htk") -> Tensor: + """Convert mel bin numbers to frequencies. + + Args: + mels (Tensor): Mel frequencies + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + freqs (Tensor): Mels converted in Hz + """ + + if mel_scale not in ["slaney", "htk"]: + raise ValueError('mel_scale should be one of "htk" or "slaney".') + + if mel_scale == "htk": + return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) + + # Fill in the linear scale + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mels + + # And now the nonlinear scale + min_log_hz = 1000.0 + min_log_mel = (min_log_hz - f_min) / f_sp + logstep = math.log(6.4) / 27.0 + + log_t = mels >= min_log_mel + freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel)) + + return freqs + + +def _create_triangular_filterbank( + all_freqs: Tensor, + f_pts: Tensor, +) -> Tensor: + """Create a triangular filter bank. + + Args: + all_freqs (Tensor): STFT freq points of size (`n_freqs`). + f_pts (Tensor): Filter mid points of size (`n_filter`). + + Returns: + fb (Tensor): The filter bank of size (`n_freqs`, `n_filter`). + """ + # Adopted from Librosa + # calculate the difference between each filter mid point and each stft freq point in hertz + f_diff = f_pts[1:] - f_pts[:-1] # (n_filter + 1) + slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_filter + 2) + # create overlapping triangles + zero = torch.zeros(1) + down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_filter) + up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_filter) + fb = torch.max(zero, torch.min(down_slopes, up_slopes)) + + return fb + + +def melscale_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_mels: int, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> Tensor: + r"""Create a frequency bin conversion matrix. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. 
image:: https://download.pytorch.org/torchaudio/doc-assets/mel_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_mels (int): Number of mel filterbanks + sample_rate (int): Sample rate of the audio waveform + norm (str or None, optional): If "slaney", divide the triangular mel weights by the width of the mel band + (area normalization). (Default: ``None``) + mel_scale (str, optional): Scale to use: ``htk`` or ``slaney``. (Default: ``htk``) + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A @ melscale_fbanks(A.size(-1), ...)``. + + """ + + if norm is not None and norm != "slaney": + raise ValueError('norm must be one of None or "slaney"') + + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # calculate mel freq bins + m_min = _hz_to_mel(f_min, mel_scale=mel_scale) + m_max = _hz_to_mel(f_max, mel_scale=mel_scale) + + m_pts = torch.linspace(m_min, m_max, n_mels + 2) + f_pts = _mel_to_hz(m_pts, mel_scale=mel_scale) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (f_pts[2 : n_mels + 2] - f_pts[:n_mels]) + fb *= enorm.unsqueeze(0) + + if (fb.max(dim=0).values == 0.0).any(): + warnings.warn( + "At least one mel filterbank has all zero values. " + f"The value for `n_mels` ({n_mels}) may be set too high. " + f"Or, the value for `n_freqs` ({n_freqs}) may be set too low." + ) + + return fb + + +def linear_fbanks( + n_freqs: int, + f_min: float, + f_max: float, + n_filter: int, + sample_rate: int, +) -> Tensor: + r"""Creates a linear triangular filterbank. + + .. devices:: CPU + + .. properties:: TorchScript + + Note: + For the sake of the numerical compatibility with librosa, not all the coefficients + in the resulting filter bank has magnitude of 1. + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/lin_fbanks.png + :alt: Visualization of generated filter bank + + Args: + n_freqs (int): Number of frequencies to highlight/apply + f_min (float): Minimum frequency (Hz) + f_max (float): Maximum frequency (Hz) + n_filter (int): Number of (linear) triangular filter + sample_rate (int): Sample rate of the audio waveform + + Returns: + Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_filter``) + meaning number of frequencies to highlight/apply to x the number of filterbanks. + Each column is a filterbank so that assuming there is a matrix A of + size (..., ``n_freqs``), the applied result would be + ``A * linear_fbanks(A.size(-1), ...)``. + """ + # freq bins + all_freqs = torch.linspace(0, sample_rate // 2, n_freqs) + + # filter mid-points + f_pts = torch.linspace(f_min, f_max, n_filter + 2) + + # create filterbank + fb = _create_triangular_filterbank(all_freqs, f_pts) + + return fb + + +def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]) -> Tensor: + r"""Create a DCT transformation matrix with shape (``n_mels``, ``n_mfcc``), + normalized depending on norm. + + .. devices:: CPU + + .. 
properties:: TorchScript + + Args: + n_mfcc (int): Number of mfc coefficients to retain + n_mels (int): Number of mel filterbanks + norm (str or None): Norm to use (either "ortho" or None) + + Returns: + Tensor: The transformation matrix, to be right-multiplied to + row-wise data of size (``n_mels``, ``n_mfcc``). + """ + + if norm is not None and norm != "ortho": + raise ValueError('norm must be either "ortho" or None') + + # http://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II + n = torch.arange(float(n_mels)) + k = torch.arange(float(n_mfcc)).unsqueeze(1) + dct = torch.cos(math.pi / float(n_mels) * (n + 0.5) * k) # size (n_mfcc, n_mels) + + if norm is None: + dct *= 2.0 + else: + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.t() + + +def mu_law_encoding(x: Tensor, quantization_channels: int) -> Tensor: + r"""Encode signal based on mu-law companding. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This algorithm expects the signal has been scaled to between -1 and 1 and + returns a signal encoded with values from 0 to quantization_channels - 1. + + Args: + x (Tensor): Input tensor + quantization_channels (int): Number of channels + + Returns: + Tensor: Input after mu-law encoding + """ + mu = quantization_channels - 1.0 + if not x.is_floating_point(): + warnings.warn( + "The input Tensor must be of floating type. \ + This will be an error in the v0.12 release." + ) + x = x.to(torch.float) + mu = torch.tensor(mu, dtype=x.dtype) + x_mu = torch.sign(x) * torch.log1p(mu * torch.abs(x)) / torch.log1p(mu) + x_mu = ((x_mu + 1) / 2 * mu + 0.5).to(torch.int64) + return x_mu + + +def mu_law_decoding(x_mu: Tensor, quantization_channels: int) -> Tensor: + r"""Decode mu-law encoded signal. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + For more info see the + `Wikipedia Entry `_ + + This expects an input with values between 0 and quantization_channels - 1 + and returns a signal scaled between -1 and 1. + + Args: + x_mu (Tensor): Input tensor + quantization_channels (int): Number of channels + + Returns: + Tensor: Input after mu-law decoding + """ + mu = quantization_channels - 1.0 + if not x_mu.is_floating_point(): + x_mu = x_mu.to(torch.float) + mu = torch.tensor(mu, dtype=x_mu.dtype) + x = ((x_mu) / mu) * 2 - 1.0 + x = torch.sign(x) * (torch.exp(torch.abs(x) * torch.log1p(mu)) - 1.0) / mu + return x + + +def phase_vocoder(complex_specgrams: Tensor, rate: float, phase_advance: Tensor) -> Tensor: + r"""Given a STFT tensor, speed up in time without modifying pitch by a factor of ``rate``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + complex_specgrams (Tensor): + A tensor of dimension `(..., freq, num_frame)` with complex dtype. + rate (float): Speed-up factor + phase_advance (Tensor): Expected phase advance in each bin. Dimension of `(freq, 1)` + + Returns: + Tensor: + Stretched spectrogram. The resulting tensor is of the same dtype as the input + spectrogram, but the number of frames is changed to ``ceil(num_frame / rate)``. 
+ + Example + >>> freq, hop_length = 1025, 512 + >>> # (channel, freq, time) + >>> complex_specgrams = torch.randn(2, freq, 300, dtype=torch.cfloat) + >>> rate = 1.3 # Speed up by 30% + >>> phase_advance = torch.linspace( + >>> 0, math.pi * hop_length, freq)[..., None] + >>> x = phase_vocoder(complex_specgrams, rate, phase_advance) + >>> x.shape # with 231 == ceil(300 / 1.3) + torch.Size([2, 1025, 231]) + """ + if rate == 1.0: + return complex_specgrams + + # pack batch + shape = complex_specgrams.size() + complex_specgrams = complex_specgrams.reshape([-1] + list(shape[-2:])) + + # Figures out the corresponding real dtype, i.e. complex128 -> float64, complex64 -> float32 + # Note torch.real is a view so it does not incur any memory copy. + real_dtype = torch.real(complex_specgrams).dtype + time_steps = torch.arange(0, complex_specgrams.size(-1), rate, device=complex_specgrams.device, dtype=real_dtype) + + alphas = time_steps % 1.0 + phase_0 = complex_specgrams[..., :1].angle() + + # Time Padding + complex_specgrams = torch.nn.functional.pad(complex_specgrams, [0, 2]) + + # (new_bins, freq, 2) + complex_specgrams_0 = complex_specgrams.index_select(-1, time_steps.long()) + complex_specgrams_1 = complex_specgrams.index_select(-1, (time_steps + 1).long()) + + angle_0 = complex_specgrams_0.angle() + angle_1 = complex_specgrams_1.angle() + + norm_0 = complex_specgrams_0.abs() + norm_1 = complex_specgrams_1.abs() + + phase = angle_1 - angle_0 - phase_advance + phase = phase - 2 * math.pi * torch.round(phase / (2 * math.pi)) + + # Compute Phase Accum + phase = phase + phase_advance + phase = torch.cat([phase_0, phase[..., :-1]], dim=-1) + phase_acc = torch.cumsum(phase, -1) + + mag = alphas * norm_1 + (1 - alphas) * norm_0 + + complex_specgrams_stretch = torch.polar(mag, phase_acc) + + # unpack batch + complex_specgrams_stretch = complex_specgrams_stretch.reshape(shape[:-2] + complex_specgrams_stretch.shape[1:]) + return complex_specgrams_stretch + + +def _get_mask_param(mask_param: int, p: float, axis_length: int) -> int: + if p == 1.0: + return mask_param + else: + return min(mask_param, int(axis_length * p)) + + +def mask_along_axis_iid( + specgrams: Tensor, + mask_param: int, + mask_value: float, + axis: int, + p: float = 1.0, +) -> Tensor: + r"""Apply a mask along ``axis``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Mask will be applied from indices ``[v_0, v_0 + v)``, + where ``v`` is sampled from ``uniform(0, max_v)`` and + ``v_0`` from ``uniform(0, specgrams.size(axis) - v)``, + with ``max_v = mask_param`` when ``p = 1.0`` and + ``max_v = min(mask_param, floor(specgrams.size(axis) * p))`` otherwise. + + Args: + specgrams (Tensor): Real spectrograms `(..., freq, time)`, with at least 3 dimensions. + mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param] + mask_value (float): Value to assign to the masked columns + axis (int): Axis to apply masking on, which should be the one of the last two dimensions. + p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0) + + Returns: + Tensor: Masked spectrograms with the same dimensions as input specgrams Tensor` + """ + + dim = specgrams.dim() + + if dim < 3: + raise ValueError(f"Spectrogram must have at least three dimensions ({dim} given).") + + if axis not in [dim - 2, dim - 1]: + raise ValueError( + f"Only Frequency and Time masking are supported (axis {dim-2} and axis {dim-1} supported; {axis} given)." 
+ ) + + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + + mask_param = _get_mask_param(mask_param, p, specgrams.shape[axis]) + if mask_param < 1: + return specgrams + + device = specgrams.device + dtype = specgrams.dtype + + value = torch.rand(specgrams.shape[: (dim - 2)], device=device, dtype=dtype) * mask_param + min_value = torch.rand(specgrams.shape[: (dim - 2)], device=device, dtype=dtype) * (specgrams.size(axis) - value) + + # Create broadcastable mask + mask_start = min_value.long()[..., None, None] + mask_end = (min_value.long() + value.long())[..., None, None] + mask = torch.arange(0, specgrams.size(axis), device=device, dtype=dtype) + + # Per batch example masking + specgrams = specgrams.transpose(axis, -1) + specgrams = specgrams.masked_fill((mask >= mask_start) & (mask < mask_end), mask_value) + specgrams = specgrams.transpose(axis, -1) + + return specgrams + + +def mask_along_axis( + specgram: Tensor, + mask_param: int, + mask_value: float, + axis: int, + p: float = 1.0, +) -> Tensor: + r"""Apply a mask along ``axis``. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Mask will be applied from indices ``[v_0, v_0 + v)``, + where ``v`` is sampled from ``uniform(0, max_v)`` and + ``v_0`` from ``uniform(0, specgram.size(axis) - v)``, with + ``max_v = mask_param`` when ``p = 1.0`` and + ``max_v = min(mask_param, floor(specgram.size(axis) * p))`` + otherwise. + All examples will have the same mask interval. + + Args: + specgram (Tensor): Real spectrograms `(..., freq, time)`, with at least 2 dimensions. + mask_param (int): Number of columns to be masked will be uniformly sampled from [0, mask_param] + mask_value (float): Value to assign to the masked columns + axis (int): Axis to apply masking on, which should be the one of the last two dimensions. + p (float, optional): maximum proportion of columns that can be masked. (Default: 1.0) + + Returns: + Tensor: Masked spectrograms with the same dimensions as input specgram Tensor + """ + dim = specgram.dim() + + if dim < 2: + raise ValueError(f"Spectrogram must have at least two dimensions (time and frequency) ({dim} given).") + + if axis not in [dim - 2, dim - 1]: + raise ValueError( + f"Only Frequency and Time masking are supported (axis {dim-2} and axis {dim-1} supported; {axis} given)." + ) + + if not 0.0 <= p <= 1.0: + raise ValueError(f"The value of p must be between 0.0 and 1.0 ({p} given).") + + mask_param = _get_mask_param(mask_param, p, specgram.shape[axis]) + if mask_param < 1: + return specgram + + # pack batch + shape = specgram.size() + specgram = specgram.reshape([-1] + list(shape[-2:])) + # After packing, specgram is a 3D tensor, and the axis corresponding to the to-be-masked dimension + # is now (axis - dim + 3), e.g. a tensor of shape (10, 2, 50, 10, 2) becomes a tensor of shape (1000, 10, 2). 
+ value = torch.rand(1) * mask_param + min_value = torch.rand(1) * (specgram.size(axis - dim + 3) - value) + + mask_start = (min_value.long()).squeeze() + mask_end = (min_value.long() + value.long()).squeeze() + mask = torch.arange(0, specgram.shape[axis - dim + 3], device=specgram.device, dtype=specgram.dtype) + mask = (mask >= mask_start) & (mask < mask_end) + # unsqueeze the mask if the axis is frequency + if axis == dim - 2: + mask = mask.unsqueeze(-1) + + if mask_end - mask_start >= mask_param: + raise ValueError("Number of columns to be masked should be less than mask_param") + + specgram = specgram.masked_fill(mask, mask_value) + + # unpack batch + specgram = specgram.reshape(shape[:-2] + specgram.shape[-2:]) + + return specgram + + +def compute_deltas(specgram: Tensor, win_length: int = 5, mode: str = "replicate") -> Tensor: + r"""Compute delta coefficients of a tensor, usually a spectrogram: + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + .. math:: + d_t = \frac{\sum_{n=1}^{\text{N}} n (c_{t+n} - c_{t-n})}{2 \sum_{n=1}^{\text{N}} n^2} + + where :math:`d_t` is the deltas at time :math:`t`, + :math:`c_t` is the spectrogram coefficients at time :math:`t`, + :math:`N` is ``(win_length-1)//2``. + + Args: + specgram (Tensor): Tensor of audio of dimension `(..., freq, time)` + win_length (int, optional): The window length used for computing delta (Default: ``5``) + mode (str, optional): Mode parameter passed to padding (Default: ``"replicate"``) + + Returns: + Tensor: Tensor of deltas of dimension `(..., freq, time)` + + Example + >>> specgram = torch.randn(1, 40, 1000) + >>> delta = compute_deltas(specgram) + >>> delta2 = compute_deltas(delta) + """ + device = specgram.device + dtype = specgram.dtype + + # pack batch + shape = specgram.size() + specgram = specgram.reshape(1, -1, shape[-1]) + + if win_length < 3: + raise ValueError(f"Window length should be greater than or equal to 3. Found win_length {win_length}") + + n = (win_length - 1) // 2 + + # twice sum of integer squared + denom = n * (n + 1) * (2 * n + 1) / 3 + + specgram = torch.nn.functional.pad(specgram, (n, n), mode=mode) + + kernel = torch.arange(-n, n + 1, 1, device=device, dtype=dtype).repeat(specgram.shape[1], 1, 1) + + output = torch.nn.functional.conv1d(specgram, kernel, groups=specgram.shape[1]) / denom + + # unpack batch + output = output.reshape(shape) + + return output + + +def _compute_nccf(waveform: Tensor, sample_rate: int, frame_time: float, freq_low: int) -> Tensor: + r""" + Compute Normalized Cross-Correlation Function (NCCF). + + .. math:: + \phi_i(m) = \frac{\sum_{n=b_i}^{b_i + N-1} w(n) w(m+n)}{\sqrt{E(b_i) E(m+b_i)}}, + + where + :math:`\phi_i(m)` is the NCCF at frame :math:`i` with lag :math:`m`, + :math:`w` is the waveform, + :math:`N` is the length of a frame, + :math:`b_i` is the beginning of frame :math:`i`, + :math:`E(j)` is the energy :math:`\sum_{n=j}^{j+N-1} w^2(n)`.
+ """ + + EPSILON = 10 ** (-9) + + # Number of lags to check + lags = int(math.ceil(sample_rate / freq_low)) + + frame_size = int(math.ceil(sample_rate * frame_time)) + + waveform_length = waveform.size()[-1] + num_of_frames = int(math.ceil(waveform_length / frame_size)) + + p = lags + num_of_frames * frame_size - waveform_length + waveform = torch.nn.functional.pad(waveform, (0, p)) + + # Compute lags + output_lag = [] + for lag in range(1, lags + 1): + s1 = waveform[..., :-lag].unfold(-1, frame_size, frame_size)[..., :num_of_frames, :] + s2 = waveform[..., lag:].unfold(-1, frame_size, frame_size)[..., :num_of_frames, :] + + output_frames = ( + (s1 * s2).sum(-1) + / (EPSILON + torch.linalg.vector_norm(s1, ord=2, dim=-1)).pow(2) + / (EPSILON + torch.linalg.vector_norm(s2, ord=2, dim=-1)).pow(2) + ) + + output_lag.append(output_frames.unsqueeze(-1)) + + nccf = torch.cat(output_lag, -1) + + return nccf + + +def _combine_max(a: Tuple[Tensor, Tensor], b: Tuple[Tensor, Tensor], thresh: float = 0.99) -> Tuple[Tensor, Tensor]: + """ + Take value from first if bigger than a multiplicative factor of the second, elementwise. + """ + mask = a[0] > thresh * b[0] + values = mask * a[0] + ~mask * b[0] + indices = mask * a[1] + ~mask * b[1] + return values, indices + + +def _find_max_per_frame(nccf: Tensor, sample_rate: int, freq_high: int) -> Tensor: + r""" + For each frame, take the highest value of NCCF, + apply centered median smoothing, and convert to frequency. + + Note: If the max among all the lags is very close + to the first half of lags, then the latter is taken. + """ + + lag_min = int(math.ceil(sample_rate / freq_high)) + + # Find near enough max that is smallest + + best = torch.max(nccf[..., lag_min:], -1) + + half_size = nccf.shape[-1] // 2 + half = torch.max(nccf[..., lag_min:half_size], -1) + + best = _combine_max(half, best) + indices = best[1] + + # Add back minimal lag + indices += lag_min + # Add 1 empirical calibration offset + indices += 1 + + return indices + + +def _median_smoothing(indices: Tensor, win_length: int) -> Tensor: + r""" + Apply median smoothing to the 1D tensor over the given window. + """ + + # Centered windowed + pad_length = (win_length - 1) // 2 + + # "replicate" padding in any dimension + indices = torch.nn.functional.pad(indices, (pad_length, 0), mode="constant", value=0.0) + + indices[..., :pad_length] = torch.cat(pad_length * [indices[..., pad_length].unsqueeze(-1)], dim=-1) + roll = indices.unfold(-1, win_length, 1) + + values, _ = torch.median(roll, -1) + return values + + +def detect_pitch_frequency( + waveform: Tensor, + sample_rate: int, + frame_time: float = 10 ** (-2), + win_length: int = 30, + freq_low: int = 85, + freq_high: int = 3400, +) -> Tensor: + r"""Detect pitch frequency. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + It is implemented using normalized cross-correlation function and median smoothing. + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., freq, time)` + sample_rate (int): The sample rate of the waveform (Hz) + frame_time (float, optional): Duration of a frame (Default: ``10 ** (-2)``). + win_length (int, optional): The window length for median smoothing (in number of frames) (Default: ``30``). + freq_low (int, optional): Lowest frequency that can be detected (Hz) (Default: ``85``). + freq_high (int, optional): Highest frequency that can be detected (Hz) (Default: ``3400``). 
+ + Returns: + Tensor: Tensor of freq of dimension `(..., frame)` + """ + # pack batch + shape = list(waveform.size()) + waveform = waveform.reshape([-1] + shape[-1:]) + + nccf = _compute_nccf(waveform, sample_rate, frame_time, freq_low) + indices = _find_max_per_frame(nccf, sample_rate, freq_high) + indices = _median_smoothing(indices, win_length) + + # Convert indices to frequency + EPSILON = 10 ** (-9) + freq = sample_rate / (EPSILON + indices.to(torch.float)) + + # unpack batch + freq = freq.reshape(shape[:-1] + list(freq.shape[-1:])) + + return freq + + +def sliding_window_cmn( + specgram: Tensor, + cmn_window: int = 600, + min_cmn_window: int = 100, + center: bool = False, + norm_vars: bool = False, +) -> Tensor: + r""" + Apply sliding-window cepstral mean (and optionally variance) normalization per utterance. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + specgram (Tensor): Tensor of spectrogram of dimension `(..., time, freq)` + cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600) + min_cmn_window (int, optional): Minimum CMN window used at start of decoding (adds latency only at start). + Only applicable if center == false, ignored if center==true (int, default = 100) + center (bool, optional): If true, use a window centered on the current frame + (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false) + norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false) + + Returns: + Tensor: Tensor matching input shape `(..., freq, time)` + """ + input_shape = specgram.shape + num_frames, num_feats = input_shape[-2:] + specgram = specgram.view(-1, num_frames, num_feats) + num_channels = specgram.shape[0] + + dtype = specgram.dtype + device = specgram.device + last_window_start = last_window_end = -1 + cur_sum = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + cur_sumsq = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + cmn_specgram = torch.zeros(num_channels, num_frames, num_feats, dtype=dtype, device=device) + for t in range(num_frames): + window_start = 0 + window_end = 0 + if center: + window_start = t - cmn_window // 2 + window_end = window_start + cmn_window + else: + window_start = t - cmn_window + window_end = t + 1 + if window_start < 0: + window_end -= window_start + window_start = 0 + if not center: + if window_end > t: + window_end = max(t + 1, min_cmn_window) + if window_end > num_frames: + window_start -= window_end - num_frames + window_end = num_frames + if window_start < 0: + window_start = 0 + if last_window_start == -1: + input_part = specgram[:, window_start : window_end - window_start, :] + cur_sum += torch.sum(input_part, 1) + if norm_vars: + cur_sumsq += torch.cumsum(input_part**2, 1)[:, -1, :] + else: + if window_start > last_window_start: + frame_to_remove = specgram[:, last_window_start, :] + cur_sum -= frame_to_remove + if norm_vars: + cur_sumsq -= frame_to_remove**2 + if window_end > last_window_end: + frame_to_add = specgram[:, last_window_end, :] + cur_sum += frame_to_add + if norm_vars: + cur_sumsq += frame_to_add**2 + window_frames = window_end - window_start + last_window_start = window_start + last_window_end = window_end + cmn_specgram[:, t, :] = specgram[:, t, :] - cur_sum / window_frames + if norm_vars: + if window_frames == 1: + cmn_specgram[:, t, :] = torch.zeros(num_channels, num_feats, dtype=dtype, device=device) + else: + variance = cur_sumsq + variance = variance / 
window_frames + variance -= (cur_sum**2) / (window_frames**2) + variance = torch.pow(variance, -0.5) + cmn_specgram[:, t, :] *= variance + + cmn_specgram = cmn_specgram.view(input_shape[:-2] + (num_frames, num_feats)) + if len(input_shape) == 2: + cmn_specgram = cmn_specgram.squeeze(0) + return cmn_specgram + + +def spectral_centroid( + waveform: Tensor, + sample_rate: int, + pad: int, + window: Tensor, + n_fft: int, + hop_length: int, + win_length: int, +) -> Tensor: + r"""Compute the spectral centroid for each channel along the time axis. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The spectral centroid is defined as the weighted average of the + frequency values, weighted by their magnitude. + + Args: + waveform (Tensor): Tensor of audio of dimension `(..., time)` + sample_rate (int): Sample rate of the audio waveform + pad (int): Two sided padding of signal + window (Tensor): Window tensor that is applied/multiplied to each frame/window + n_fft (int): Size of FFT + hop_length (int): Length of hop between STFT windows + win_length (int): Window size + + Returns: + Tensor: Dimension `(..., time)` + """ + specgram = spectrogram( + waveform, + pad=pad, + window=window, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + power=1.0, + normalized=False, + ) + freqs = torch.linspace(0, sample_rate // 2, steps=1 + n_fft // 2, device=specgram.device).reshape((-1, 1)) + freq_dim = -2 + return (freqs * specgram).sum(dim=freq_dim) / specgram.sum(dim=freq_dim) + + +@deprecated("Please migrate to :py:class:`torchaudio.io.AudioEffector`.", remove=False) +def apply_codec( + waveform: Tensor, + sample_rate: int, + format: str, + channels_first: bool = True, + compression: Optional[float] = None, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = None, +) -> Tensor: + r""" + Apply codecs as a form of augmentation. + + .. devices:: CPU + + Args: + waveform (Tensor): Audio data. Must be 2 dimensional. See also ```channels_first```. + sample_rate (int): Sample rate of the audio waveform. + format (str): File format. + channels_first (bool, optional): + When True, both the input and output Tensor have dimension `(channel, time)`. + Otherwise, they have dimension `(time, channel)`. + compression (float or None, optional): Used for formats other than WAV. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + encoding (str or None, optional): Changes the encoding for the supported formats. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + bits_per_sample (int or None, optional): Changes the bit depth for the supported formats. + For more details see :py:func:`torchaudio.backend.sox_io_backend.save`. + + Returns: + Tensor: Resulting Tensor. + If ``channels_first=True``, it has `(channel, time)` else `(time, channel)`. 
+ """ + from torchaudio.backend import _sox_io_backend + + with tempfile.NamedTemporaryFile() as f: + torchaudio.backend._sox_io_backend.save( + f.name, waveform, sample_rate, channels_first, compression, format, encoding, bits_per_sample + ) + augmented, sr = _sox_io_backend.load(f.name, channels_first=channels_first, format=format) + if sr != sample_rate: + augmented = resample(augmented, sr, sample_rate) + return augmented + + +_CPU = torch.device("cpu") + + +def _get_sinc_resample_kernel( + orig_freq: int, + new_freq: int, + gcd: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, + device: torch.device = _CPU, + dtype: Optional[torch.dtype] = None, +): + if not (int(orig_freq) == orig_freq and int(new_freq) == new_freq): + raise Exception( + "Frequencies must be of integer type to ensure quality resampling computation. " + "To work around this, manually convert both frequencies to integer values " + "that maintain their resampling rate ratio before passing them into the function. " + "Example: To downsample a 44100 hz waveform by a factor of 8, use " + "`orig_freq=8` and `new_freq=1` instead of `orig_freq=44100` and `new_freq=5512.5`. " + "For more information, please refer to https://github.com/pytorch/audio/issues/1487." + ) + + if resampling_method in ["sinc_interpolation", "kaiser_window"]: + method_map = { + "sinc_interpolation": "sinc_interp_hann", + "kaiser_window": "sinc_interp_kaiser", + } + warnings.warn( + f'"{resampling_method}" resampling method name is being deprecated and replaced by ' + f'"{method_map[resampling_method]}" in the next release. ' + "The default behavior remains unchanged.", + stacklevel=3, + ) + elif resampling_method not in ["sinc_interp_hann", "sinc_interp_kaiser"]: + raise ValueError("Invalid resampling method: {}".format(resampling_method)) + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + if lowpass_filter_width <= 0: + raise ValueError("Low pass filter width should be positive.") + base_freq = min(orig_freq, new_freq) + # This will perform antialiasing filtering by removing the highest frequencies. + # At first I thought I only needed this when downsampling, but when upsampling + # you will get edge artifacts without this, as the edge is equivalent to zero padding, + # which will add high freq artifacts. + base_freq *= rolloff + + # The key idea of the algorithm is that x(t) can be exactly reconstructed from x[i] (tensor) + # using the sinc interpolation formula: + # x(t) = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - t)) + # We can then sample the function x(t) with a different sample rate: + # y[j] = x(j / new_freq) + # or, + # y[j] = sum_i x[i] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + + # We see here that y[j] is the convolution of x[i] with a specific filter, for which + # we take an FIR approximation, stopping when we see at least `lowpass_filter_width` zeros crossing. + # But y[j+1] is going to have a different set of weights and so on, until y[j + new_freq]. + # Indeed: + # y[j + new_freq] = sum_i x[i] sinc(pi * orig_freq * ((i / orig_freq - (j + new_freq) / new_freq)) + # = sum_i x[i] sinc(pi * orig_freq * ((i - orig_freq) / orig_freq - j / new_freq)) + # = sum_i x[i + orig_freq] sinc(pi * orig_freq * (i / orig_freq - j / new_freq)) + # so y[j+new_freq] uses the same filter as y[j], but on a shifted version of x by `orig_freq`. + # This will explain the F.conv1d after, with a stride of orig_freq. 
+ width = math.ceil(lowpass_filter_width * orig_freq / base_freq) + # If orig_freq is still big after GCD reduction, most filters will be very unbalanced, i.e., + # they will have a lot of almost zero values to the left or to the right... + # There is probably a way to evaluate those filters more efficiently, but this is kept for + # future work. + idx_dtype = dtype if dtype is not None else torch.float64 + + idx = torch.arange(-width, width + orig_freq, dtype=idx_dtype, device=device)[None, None] / orig_freq + + t = torch.arange(0, -new_freq, -1, dtype=dtype, device=device)[:, None, None] / new_freq + idx + t *= base_freq + t = t.clamp_(-lowpass_filter_width, lowpass_filter_width) + + # we do not use built in torch windows here as we need to evaluate the window + # at specific positions, not over a regular grid. + if resampling_method == "sinc_interp_hann": + window = torch.cos(t * math.pi / lowpass_filter_width / 2) ** 2 + else: + # sinc_interp_kaiser + if beta is None: + beta = 14.769656459379492 + beta_tensor = torch.tensor(float(beta)) + window = torch.i0(beta_tensor * torch.sqrt(1 - (t / lowpass_filter_width) ** 2)) / torch.i0(beta_tensor) + + t *= math.pi + + scale = base_freq / orig_freq + kernels = torch.where(t == 0, torch.tensor(1.0).to(t), t.sin() / t) + kernels *= window * scale + + if dtype is None: + kernels = kernels.to(dtype=torch.float32) + + return kernels, width + + +def _apply_sinc_resample_kernel( + waveform: Tensor, + orig_freq: int, + new_freq: int, + gcd: int, + kernel: Tensor, + width: int, +): + if not waveform.is_floating_point(): + raise TypeError(f"Expected floating point type for waveform tensor, but received {waveform.dtype}.") + + orig_freq = int(orig_freq) // gcd + new_freq = int(new_freq) // gcd + + # pack batch + shape = waveform.size() + waveform = waveform.view(-1, shape[-1]) + + num_wavs, length = waveform.shape + waveform = torch.nn.functional.pad(waveform, (width, width + orig_freq)) + resampled = torch.nn.functional.conv1d(waveform[:, None], kernel, stride=orig_freq) + resampled = resampled.transpose(1, 2).reshape(num_wavs, -1) + target_length = torch.ceil(torch.as_tensor(new_freq * length / orig_freq)).long() + resampled = resampled[..., :target_length] + + # unpack batch + resampled = resampled.view(shape[:-1] + resampled.shape[-1:]) + return resampled + + +def resample( + waveform: Tensor, + orig_freq: int, + new_freq: int, + lowpass_filter_width: int = 6, + rolloff: float = 0.99, + resampling_method: str = "sinc_interp_hann", + beta: Optional[float] = None, +) -> Tensor: + r"""Resamples the waveform at the new frequency using bandlimited interpolation. :cite:`RESAMPLE`. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Note: + ``transforms.Resample`` precomputes and reuses the resampling kernel, so using it will result in + more efficient computation if resampling multiple waveforms with the same resampling parameters. + + Args: + waveform (Tensor): The input signal of dimension `(..., time)` + orig_freq (int): The original frequency of the signal + new_freq (int): The desired frequency + lowpass_filter_width (int, optional): Controls the sharpness of the filter, more == sharper + but less efficient. (Default: ``6``) + rolloff (float, optional): The roll-off frequency of the filter, as a fraction of the Nyquist. + Lower values reduce anti-aliasing, but also reduce some of the highest frequencies. (Default: ``0.99``) + resampling_method (str, optional): The resampling method to use. 
+ Options: [``"sinc_interp_hann"``, ``"sinc_interp_kaiser"``] (Default: ``"sinc_interp_hann"``) + beta (float or None, optional): The shape parameter used for kaiser window. + + Returns: + Tensor: The waveform at the new frequency of dimension `(..., time).` + """ + + if orig_freq <= 0.0 or new_freq <= 0.0: + raise ValueError("Original frequency and desired frequecy should be positive") + + if orig_freq == new_freq: + return waveform + + gcd = math.gcd(int(orig_freq), int(new_freq)) + + kernel, width = _get_sinc_resample_kernel( + orig_freq, + new_freq, + gcd, + lowpass_filter_width, + rolloff, + resampling_method, + beta, + waveform.device, + waveform.dtype, + ) + resampled = _apply_sinc_resample_kernel(waveform, orig_freq, new_freq, gcd, kernel, width) + return resampled + + +@torch.jit.unused +def edit_distance(seq1: Sequence, seq2: Sequence) -> int: + """ + Calculate the word level edit (Levenshtein) distance between two sequences. + + .. devices:: CPU + + The function computes an edit distance allowing deletion, insertion and + substitution. The result is an integer. + + For most applications, the two input sequences should be the same type. If + two strings are given, the output is the edit distance between the two + strings (character edit distance). If two lists of strings are given, the + output is the edit distance between sentences (word edit distance). Users + may want to normalize the output by the length of the reference sequence. + + Args: + seq1 (Sequence): the first sequence to compare. + seq2 (Sequence): the second sequence to compare. + Returns: + int: The distance between the first and second sequences. + """ + len_sent2 = len(seq2) + dold = list(range(len_sent2 + 1)) + dnew = [0 for _ in range(len_sent2 + 1)] + + for i in range(1, len(seq1) + 1): + dnew[0] = i + for j in range(1, len_sent2 + 1): + if seq1[i - 1] == seq2[j - 1]: + dnew[j] = dold[j - 1] + else: + substitution = dold[j - 1] + 1 + insertion = dnew[j - 1] + 1 + deletion = dold[j] + 1 + dnew[j] = min(substitution, insertion, deletion) + + dnew, dold = dold, dnew + + return int(dold[-1]) + + +def loudness(waveform: Tensor, sample_rate: int): + r"""Measure audio loudness according to the ITU-R BS.1770-4 recommendation. + + .. devices:: CPU CUDA + + .. 
properties:: TorchScript + + Args: + waveform(torch.Tensor): audio waveform of dimension `(..., channels, time)` + sample_rate (int): sampling rate of the waveform + + Returns: + Tensor: loudness estimates (LKFS) + + Reference: + - https://www.itu.int/rec/R-REC-BS.1770-4-201510-I/en + """ + + if waveform.size(-2) > 5: + raise ValueError("Only up to 5 channels are supported.") + + gate_duration = 0.4 + overlap = 0.75 + gamma_abs = -70.0 + kweight_bias = -0.691 + gate_samples = int(round(gate_duration * sample_rate)) + step = int(round(gate_samples * (1 - overlap))) + + # Apply K-weighting + waveform = treble_biquad(waveform, sample_rate, 4.0, 1500.0, 1 / math.sqrt(2)) + waveform = highpass_biquad(waveform, sample_rate, 38.0, 0.5) + + # Compute the energy for each block + energy = torch.square(waveform).unfold(-1, gate_samples, step) + energy = torch.mean(energy, dim=-1) + + # Compute channel-weighted summation + g = torch.tensor([1.0, 1.0, 1.0, 1.41, 1.41], dtype=waveform.dtype, device=waveform.device) + g = g[: energy.size(-2)] + + energy_weighted = torch.sum(g.unsqueeze(-1) * energy, dim=-2) + loudness = -0.691 + 10 * torch.log10(energy_weighted) + + # Apply absolute gating of the blocks + gated_blocks = loudness > gamma_abs + gated_blocks = gated_blocks.unsqueeze(-2) + + energy_filtered = torch.sum(gated_blocks * energy, dim=-1) / torch.count_nonzero(gated_blocks, dim=-1) + energy_weighted = torch.sum(g * energy_filtered, dim=-1) + gamma_rel = kweight_bias + 10 * torch.log10(energy_weighted) - 10 + + # Apply relative gating of the blocks + gated_blocks = torch.logical_and(gated_blocks.squeeze(-2), loudness > gamma_rel.unsqueeze(-1)) + gated_blocks = gated_blocks.unsqueeze(-2) + + energy_filtered = torch.sum(gated_blocks * energy, dim=-1) / torch.count_nonzero(gated_blocks, dim=-1) + energy_weighted = torch.sum(g * energy_filtered, dim=-1) + LKFS = kweight_bias + 10 * torch.log10(energy_weighted) + return LKFS + + +def pitch_shift( + waveform: Tensor, + sample_rate: int, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window: Optional[Tensor] = None, +) -> Tensor: + """ + Shift the pitch of a waveform by ``n_steps`` steps. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + waveform (Tensor): The input waveform of shape `(..., time)`. + sample_rate (int): Sample rate of `waveform`. + n_steps (int): The (fractional) steps to shift `waveform`. + bins_per_octave (int, optional): The number of steps per octave (Default: ``12``). + n_fft (int, optional): Size of FFT, creates ``n_fft // 2 + 1`` bins (Default: ``512``). + win_length (int or None, optional): Window size. If None, then ``n_fft`` is used. (Default: ``None``). + hop_length (int or None, optional): Length of hop between STFT windows. If None, then + ``win_length // 4`` is used (Default: ``None``). + window (Tensor or None, optional): Window tensor that is applied/multiplied to each frame/window. + If None, then ``torch.hann_window(win_length)`` is used (Default: ``None``). + + + Returns: + Tensor: The pitch-shifted audio waveform of shape `(..., time)`. 
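+
+    Example - Shift a waveform up by two semitones
+        (an illustrative sketch using random data)
+        >>> import torch, torchaudio
+        >>> waveform = torch.randn(1, 16000)  # 1 second of noise at 16 kHz
+        >>> shifted = torchaudio.functional.pitch_shift(waveform, sample_rate=16000, n_steps=2)
+        >>> shifted.shape  # the output keeps the input length
+        torch.Size([1, 16000])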
+ """ + waveform_stretch = _stretch_waveform( + waveform, + n_steps, + bins_per_octave, + n_fft, + win_length, + hop_length, + window, + ) + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + waveform_shift = resample(waveform_stretch, int(sample_rate / rate), sample_rate) + + return _fix_waveform_shape(waveform_shift, waveform.size()) + + +def _stretch_waveform( + waveform: Tensor, + n_steps: int, + bins_per_octave: int = 12, + n_fft: int = 512, + win_length: Optional[int] = None, + hop_length: Optional[int] = None, + window: Optional[Tensor] = None, +) -> Tensor: + """ + Pitch shift helper function to preprocess and stretch waveform before resampling step. + + Args: + See pitch_shift arg descriptions. + + Returns: + Tensor: The preprocessed waveform stretched prior to resampling. + """ + if hop_length is None: + hop_length = n_fft // 4 + if win_length is None: + win_length = n_fft + if window is None: + window = torch.hann_window(window_length=win_length, device=waveform.device) + + # pack batch + shape = waveform.size() + waveform = waveform.reshape(-1, shape[-1]) + + ori_len = shape[-1] + rate = 2.0 ** (-float(n_steps) / bins_per_octave) + spec_f = torch.stft( + input=waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + center=True, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True, + ) + phase_advance = torch.linspace(0, math.pi * hop_length, spec_f.shape[-2], device=spec_f.device)[..., None] + spec_stretch = phase_vocoder(spec_f, rate, phase_advance) + len_stretch = int(round(ori_len / rate)) + waveform_stretch = torch.istft( + spec_stretch, n_fft=n_fft, hop_length=hop_length, win_length=win_length, window=window, length=len_stretch + ) + return waveform_stretch + + +def _fix_waveform_shape( + waveform_shift: Tensor, + shape: List[int], +) -> Tensor: + """ + PitchShift helper function to process after resampling step to fix the shape back. + + Args: + waveform_shift(Tensor): The waveform after stretch and resample + shape (List[int]): The shape of initial waveform + + Returns: + Tensor: The pitch-shifted audio waveform of shape `(..., time)`. + """ + ori_len = shape[-1] + shift_len = waveform_shift.size()[-1] + if shift_len > ori_len: + waveform_shift = waveform_shift[..., :ori_len] + else: + waveform_shift = torch.nn.functional.pad(waveform_shift, [0, ori_len - shift_len]) + + # unpack batch + waveform_shift = waveform_shift.view(shape[:-1] + waveform_shift.shape[-1:]) + return waveform_shift + + +def rnnt_loss( + logits: Tensor, + targets: Tensor, + logit_lengths: Tensor, + target_lengths: Tensor, + blank: int = -1, + clamp: float = -1, + reduction: str = "mean", + fused_log_softmax: bool = True, +): + """Compute the RNN Transducer loss from *Sequence Transduction with Recurrent Neural Networks* + :cite:`graves2012sequence`. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + The RNN Transducer loss extends the CTC loss by defining a distribution over output + sequences of all lengths, and by jointly modelling both input-output and output-output + dependencies. 
+ + Args: + logits (Tensor): Tensor of dimension `(batch, max seq length, max target length + 1, class)` + containing output from joiner + targets (Tensor): Tensor of dimension `(batch, max target length)` containing targets with zero padded + logit_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of each sequence from encoder + target_lengths (Tensor): Tensor of dimension `(batch)` containing lengths of targets for each sequence + blank (int, optional): blank label (Default: ``-1``) + clamp (float, optional): clamp for gradients (Default: ``-1``) + reduction (string, optional): Specifies the reduction to apply to the output: + ``"none"`` | ``"mean"`` | ``"sum"``. (Default: ``"mean"``) + fused_log_softmax (bool): set to False if calling log_softmax outside of loss (Default: ``True``) + Returns: + Tensor: Loss with the reduction option applied. If ``reduction`` is ``"none"``, then size `(batch)`, + otherwise scalar. + """ + if reduction not in ["none", "mean", "sum"]: + raise ValueError('reduction should be one of "none", "mean", or "sum"') + + if blank < 0: # reinterpret blank index if blank < 0. + blank = logits.shape[-1] + blank + + costs, _ = torch.ops.torchaudio.rnnt_loss( + logits=logits, + targets=targets, + logit_lengths=logit_lengths, + target_lengths=target_lengths, + blank=blank, + clamp=clamp, + fused_log_softmax=fused_log_softmax, + ) + + if reduction == "mean": + return costs.mean() + elif reduction == "sum": + return costs.sum() + + return costs + + +def psd( + specgram: Tensor, + mask: Optional[Tensor] = None, + normalize: bool = True, + eps: float = 1e-10, +) -> Tensor: + """Compute cross-channel power spectral density (PSD) matrix. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + specgram (torch.Tensor): Multi-channel complex-valued spectrum. + Tensor with dimensions `(..., channel, freq, time)`. + mask (torch.Tensor or None, optional): Time-Frequency mask for normalization. + Tensor with dimensions `(..., freq, time)`. (Default: ``None``) + normalize (bool, optional): If ``True``, normalize the mask along the time dimension. (Default: ``True``) + eps (float, optional): Value to add to the denominator in mask normalization. (Default: ``1e-15``) + + Returns: + torch.Tensor: The complex-valued PSD matrix of the input spectrum. + Tensor with dimensions `(..., freq, channel, channel)` + """ + specgram = specgram.transpose(-3, -2) # shape (freq, channel, time) + # outer product: + # (..., ch_1, time) x (..., ch_2, time) -> (..., time, ch_1, ch_2) + psd = torch.einsum("...ct,...et->...tce", [specgram, specgram.conj()]) + + if mask is not None: + if mask.shape[:-1] != specgram.shape[:-2] or mask.shape[-1] != specgram.shape[-1]: + raise ValueError( + "The dimensions of mask except the channel dimension should be the same as specgram." + f"Found {mask.shape} for mask and {specgram.shape} for specgram." + ) + # Normalized mask along time dimension: + if normalize: + mask = mask / (mask.sum(dim=-1, keepdim=True) + eps) + + psd = psd * mask[..., None, None] + + psd = psd.sum(dim=-3) + return psd + + +def _compute_mat_trace(input: torch.Tensor, dim1: int = -1, dim2: int = -2) -> torch.Tensor: + r"""Compute the trace of a Tensor along ``dim1`` and ``dim2`` dimensions. + + Args: + input (torch.Tensor): Tensor with dimensions `(..., channel, channel)`. + dim1 (int, optional): The first dimension of the diagonal matrix. + (Default: ``-1``) + dim2 (int, optional): The second dimension of the diagonal matrix. 
+ (Default: ``-2``) + + Returns: + Tensor: The trace of the input Tensor. + """ + if input.ndim < 2: + raise ValueError("The dimension of the tensor must be at least 2.") + if input.shape[dim1] != input.shape[dim2]: + raise ValueError("The size of ``dim1`` and ``dim2`` must be the same.") + input = torch.diagonal(input, 0, dim1=dim1, dim2=dim2) + return input.sum(dim=-1) + + +def _tik_reg(mat: torch.Tensor, reg: float = 1e-7, eps: float = 1e-8) -> torch.Tensor: + """Perform Tikhonov regularization (only modifying real part). + + Args: + mat (torch.Tensor): Input matrix with dimensions `(..., channel, channel)`. + reg (float, optional): Regularization factor. (Default: 1e-8) + eps (float, optional): Value to avoid the correlation matrix is all-zero. (Default: ``1e-8``) + + Returns: + Tensor: Regularized matrix with dimensions `(..., channel, channel)`. + """ + # Add eps + C = mat.size(-1) + eye = torch.eye(C, dtype=mat.dtype, device=mat.device) + epsilon = _compute_mat_trace(mat).real[..., None, None] * reg + # in case that correlation_matrix is all-zero + epsilon = epsilon + eps + mat = mat + epsilon * eye[..., :, :] + return mat + + +def _assert_psd_matrices(psd_s: torch.Tensor, psd_n: torch.Tensor) -> None: + """Assertion checks of the PSD matrices of target speech and noise. + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + """ + if psd_s.ndim < 3 or psd_n.ndim < 3: + raise ValueError( + "Expected at least 3D Tensor (..., freq, channel, channel) for psd_s and psd_n. " + f"Found {psd_s.shape} for psd_s and {psd_n.shape} for psd_n." + ) + if not (psd_s.is_complex() and psd_n.is_complex()): + raise TypeError( + "The type of psd_s and psd_n must be ``torch.cfloat`` or ``torch.cdouble``. " + f"Found {psd_s.dtype} for psd_s and {psd_n.dtype} for psd_n." + ) + if psd_s.shape != psd_n.shape: + raise ValueError( + f"The dimensions of psd_s and psd_n should be the same. Found {psd_s.shape} and {psd_n.shape}." + ) + if psd_s.shape[-1] != psd_s.shape[-2]: + raise ValueError(f"The last two dimensions of psd_s should be the same. Found {psd_s.shape}.") + + +def mvdr_weights_souden( + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> Tensor: + r"""Compute the Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) beamforming weights + by the method proposed by *Souden et, al.* :cite:`souden2009optimal`. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the power spectral density (PSD) matrix of target speech :math:`\bf{\Phi}_{\textbf{SS}}`, + the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and a one-hot vector that represents the + reference channel :math:`\bf{u}`, the method computes the MVDR beamforming weight martrix + :math:`\textbf{w}_{\text{MVDR}}`. The formula is defined as: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bf{\Phi}_{\textbf{SS}}}}(f)} + {\text{Trace}({{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f) \bf{\Phi}_{\textbf{SS}}}(f))}}\bm{u} + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. 
+ psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The complex-valued MVDR beamforming weight matrix with dimensions `(..., freq, channel)`. + """ + _assert_psd_matrices(psd_s, psd_n) + + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + numerator = torch.linalg.solve(psd_n, psd_s) # psd_n.inv() @ psd_s + # ws: (..., C, C) / (...,) -> (..., C, C) + ws = numerator / (_compute_mat_trace(numerator)[..., None, None] + eps) + if torch.jit.isinstance(reference_channel, int): + beamform_weights = ws[..., :, reference_channel] + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + # h: (..., F, C_1, C_2) x (..., C_2) -> (..., F, C_1) + beamform_weights = torch.einsum("...c,...c->...", [ws, reference_channel[..., None, None, :]]) + else: + raise TypeError(f'Expected "int" or "Tensor" for reference_channel. Found: {type(reference_channel)}.') + + return beamform_weights + + +def mvdr_weights_rtf( + rtf: Tensor, + psd_n: Tensor, + reference_channel: Optional[Union[int, Tensor]] = None, + diagonal_loading: bool = True, + diag_eps: float = 1e-7, + eps: float = 1e-8, +) -> Tensor: + r"""Compute the Minimum Variance Distortionless Response (*MVDR* :cite:`capon1969high`) beamforming weights + based on the relative transfer function (RTF) and power spectral density (PSD) matrix of noise. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Given the relative transfer function (RTF) matrix or the steering vector of target speech :math:`\bm{v}`, + the PSD matrix of noise :math:`\bf{\Phi}_{\textbf{NN}}`, and a one-hot vector that represents the + reference channel :math:`\bf{u}`, the method computes the MVDR beamforming weight martrix + :math:`\textbf{w}_{\text{MVDR}}`. The formula is defined as: + + .. math:: + \textbf{w}_{\text{MVDR}}(f) = + \frac{{{\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)}} + {{\bm{v}^{\mathsf{H}}}(f){\bf{\Phi}_{\textbf{NN}}^{-1}}(f){\bm{v}}(f)} + + where :math:`(.)^{\mathsf{H}}` denotes the Hermitian Conjugate operation. + + Args: + rtf (torch.Tensor): The complex-valued RTF vector of target speech. + Tensor with dimensions `(..., freq, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. 
+ (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + eps (float, optional): Value to add to the denominator in the beamforming weight formula. + (Default: ``1e-8``) + + Returns: + torch.Tensor: The complex-valued MVDR beamforming weight matrix with dimensions `(..., freq, channel)`. + """ + if rtf.ndim < 2: + raise ValueError(f"Expected at least 2D Tensor (..., freq, channel) for rtf. Found {rtf.shape}.") + if psd_n.ndim < 3: + raise ValueError(f"Expected at least 3D Tensor (..., freq, channel, channel) for psd_n. Found {psd_n.shape}.") + if not (rtf.is_complex() and psd_n.is_complex()): + raise TypeError( + "The type of rtf and psd_n must be ``torch.cfloat`` or ``torch.cdouble``. " + f"Found {rtf.dtype} for rtf and {psd_n.dtype} for psd_n." + ) + if rtf.shape != psd_n.shape[:-1]: + raise ValueError( + "The dimensions of rtf and the dimensions withou the last dimension of psd_n should be the same. " + f"Found {rtf.shape} for rtf and {psd_n.shape} for psd_n." + ) + if psd_n.shape[-1] != psd_n.shape[-2]: + raise ValueError(f"The last two dimensions of psd_n should be the same. Found {psd_n.shape}.") + + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + # numerator = psd_n.inv() @ stv + numerator = torch.linalg.solve(psd_n, rtf.unsqueeze(-1)).squeeze(-1) # (..., freq, channel) + # denominator = stv^H @ psd_n.inv() @ stv + denominator = torch.einsum("...d,...d->...", [rtf.conj(), numerator]) + beamform_weights = numerator / (denominator.real.unsqueeze(-1) + eps) + # normalize the numerator + if reference_channel is not None: + if torch.jit.isinstance(reference_channel, int): + scale = rtf[..., reference_channel].conj() + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + scale = torch.einsum("...c,...c->...", [rtf.conj(), reference_channel[..., None, :]]) + else: + raise TypeError(f'Expected "int" or "Tensor" for reference_channel. Found: {type(reference_channel)}.') + + beamform_weights = beamform_weights * scale[..., None] + + return beamform_weights + + +def rtf_evd(psd_s: Tensor) -> Tensor: + r"""Estimate the relative transfer function (RTF) or the steering vector by eigenvalue decomposition. + + .. devices:: CPU CUDA + + .. properties:: TorchScript + + Args: + psd_s (Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor of dimension `(..., freq, channel, channel)` + + Returns: + Tensor: The estimated complex-valued RTF of target speech. + Tensor of dimension `(..., freq, channel)` + """ + if not psd_s.is_complex(): + raise TypeError(f"The type of psd_s must be ``torch.cfloat`` or ``torch.cdouble``. Found {psd_s.dtype}.") + if psd_s.shape[-1] != psd_s.shape[-2]: + raise ValueError(f"The last two dimensions of psd_s should be the same. Found {psd_s.shape}.") + _, v = torch.linalg.eigh(psd_s) # v is sorted along with eigenvalues in ascending order + rtf = v[..., -1] # choose the eigenvector with max eigenvalue + return rtf + + +def rtf_power( + psd_s: Tensor, + psd_n: Tensor, + reference_channel: Union[int, Tensor], + n_iter: int = 3, + diagonal_loading: bool = True, + diag_eps: float = 1e-7, +) -> Tensor: + r"""Estimate the relative transfer function (RTF) or the steering vector by the power method. + + .. devices:: CPU CUDA + + .. 
properties:: Autograd TorchScript + + Args: + psd_s (torch.Tensor): The complex-valued power spectral density (PSD) matrix of target speech. + Tensor with dimensions `(..., freq, channel, channel)`. + psd_n (torch.Tensor): The complex-valued power spectral density (PSD) matrix of noise. + Tensor with dimensions `(..., freq, channel, channel)`. + reference_channel (int or torch.Tensor): Specifies the reference channel. + If the dtype is ``int``, it represents the reference channel index. + If the dtype is ``torch.Tensor``, its shape is `(..., channel)`, where the ``channel`` dimension + is one-hot. + diagonal_loading (bool, optional): If ``True``, enables applying diagonal loading to ``psd_n``. + (Default: ``True``) + diag_eps (float, optional): The coefficient multiplied to the identity matrix for diagonal loading. + It is only effective when ``diagonal_loading`` is set to ``True``. (Default: ``1e-7``) + + Returns: + torch.Tensor: The estimated complex-valued RTF of target speech. + Tensor of dimension `(..., freq, channel)`. + """ + _assert_psd_matrices(psd_s, psd_n) + if n_iter <= 0: + raise ValueError("The number of iteration must be greater than 0.") + + # Apply diagonal loading to psd_n to improve robustness. + if diagonal_loading: + psd_n = _tik_reg(psd_n, reg=diag_eps) + # phi is regarded as the first iteration + phi = torch.linalg.solve(psd_n, psd_s) # psd_n.inv() @ psd_s + if torch.jit.isinstance(reference_channel, int): + rtf = phi[..., reference_channel] + elif torch.jit.isinstance(reference_channel, Tensor): + reference_channel = reference_channel.to(psd_n.dtype) + rtf = torch.einsum("...c,...c->...", [phi, reference_channel[..., None, None, :]]) + else: + raise TypeError(f'Expected "int" or "Tensor" for reference_channel. Found: {type(reference_channel)}.') + rtf = rtf.unsqueeze(-1) # (..., freq, channel, 1) + if n_iter >= 2: + # The number of iterations in the for loop is `n_iter - 2` + # because the `phi` above and `torch.matmul(psd_s, rtf)` are regarded as + # two iterations. + for _ in range(n_iter - 2): + rtf = torch.matmul(phi, rtf) + rtf = torch.matmul(psd_s, rtf) + else: + # if there is only one iteration, the rtf is the psd_s[..., referenc_channel] + # which is psd_n @ phi @ ref_channel + rtf = torch.matmul(psd_n, rtf) + return rtf.squeeze(-1) + + +def apply_beamforming(beamform_weights: Tensor, specgram: Tensor) -> Tensor: + r"""Apply the beamforming weight to the multi-channel noisy spectrum to obtain the single-channel enhanced spectrum. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + .. math:: + \hat{\textbf{S}}(f) = \textbf{w}_{\text{bf}}(f)^{\mathsf{H}} \textbf{Y}(f) + + where :math:`\textbf{w}_{\text{bf}}(f)` is the beamforming weight for the :math:`f`-th frequency bin, + :math:`\textbf{Y}` is the multi-channel spectrum for the :math:`f`-th frequency bin. + + Args: + beamform_weights (Tensor): The complex-valued beamforming weight matrix. + Tensor of dimension `(..., freq, channel)` + specgram (Tensor): The multi-channel complex-valued noisy spectrum. + Tensor of dimension `(..., channel, freq, time)` + + Returns: + Tensor: The single-channel complex-valued enhanced spectrum. + Tensor of dimension `(..., freq, time)` + """ + if beamform_weights.shape[:-2] != specgram.shape[:-3]: + raise ValueError( + "The dimensions except the last two dimensions of beamform_weights should be the same " + "as the dimensions except the last three dimensions of specgram. 
" + f"Found {beamform_weights.shape} for beamform_weights and {specgram.shape} for specgram." + ) + + if not (beamform_weights.is_complex() and specgram.is_complex()): + raise TypeError( + "The type of beamform_weights and specgram must be ``torch.cfloat`` or ``torch.cdouble``. " + f"Found {beamform_weights.dtype} for beamform_weights and {specgram.dtype} for specgram." + ) + + # (..., freq, channel) x (..., channel, freq, time) -> (..., freq, time) + specgram_enhanced = torch.einsum("...fc,...cft->...ft", [beamform_weights.conj(), specgram]) + return specgram_enhanced + + +def _check_shape_compatible(x: torch.Tensor, y: torch.Tensor) -> None: + if x.ndim != y.ndim: + raise ValueError(f"The operands must be the same dimension (got {x.ndim} and {y.ndim}).") + + for i in range(x.ndim - 1): + xi = x.size(i) + yi = y.size(i) + if xi == yi or xi == 1 or yi == 1: + continue + raise ValueError(f"Leading dimensions of x and y are not broadcastable (got {x.shape} and {y.shape}).") + + +def _check_convolve_mode(mode: str) -> None: + valid_convolve_modes = ["full", "valid", "same"] + if mode not in valid_convolve_modes: + raise ValueError(f"Unrecognized mode value '{mode}'. Please specify one of {valid_convolve_modes}.") + + +def _apply_convolve_mode(conv_result: torch.Tensor, x_length: int, y_length: int, mode: str) -> torch.Tensor: + valid_convolve_modes = ["full", "valid", "same"] + if mode == "full": + return conv_result + elif mode == "valid": + target_length = max(x_length, y_length) - min(x_length, y_length) + 1 + start_idx = (conv_result.size(-1) - target_length) // 2 + return conv_result[..., start_idx : start_idx + target_length] + elif mode == "same": + start_idx = (conv_result.size(-1) - x_length) // 2 + return conv_result[..., start_idx : start_idx + x_length] + else: + raise ValueError(f"Unrecognized mode value '{mode}'. Please specify one of {valid_convolve_modes}.") + + +def fftconvolve(x: torch.Tensor, y: torch.Tensor, mode: str = "full") -> torch.Tensor: + r""" + Convolves inputs along their last dimension using FFT. For inputs with large last dimensions, this function + is generally much faster than :meth:`convolve`. + Note that, in contrast to :meth:`torch.nn.functional.conv1d`, which actually applies the valid cross-correlation + operator, this function applies the true `convolution`_ operator. + Also note that this function can only output float tensors (int tensor inputs will be cast to float). + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + x (torch.Tensor): First convolution operand, with shape `(..., N)`. + y (torch.Tensor): Second convolution operand, with shape `(..., M)` + (leading dimensions must be broadcast-able with those of ``x``). + mode (str, optional): Must be one of ("full", "valid", "same"). + + * "full": Returns the full convolution result, with shape `(..., N + M - 1)`. (Default) + * "valid": Returns the segment of the full convolution result corresponding to where + the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`. + * "same": Returns the center segment of the full convolution result, with shape `(..., N)`. + + Returns: + torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., L)`, where + the leading dimensions match those of ``x`` and `L` is dictated by ``mode``. + + .. 
_convolution: + https://en.wikipedia.org/wiki/Convolution + """ + _check_shape_compatible(x, y) + _check_convolve_mode(mode) + + n = x.size(-1) + y.size(-1) - 1 + fresult = torch.fft.rfft(x, n=n) * torch.fft.rfft(y, n=n) + result = torch.fft.irfft(fresult, n=n) + return _apply_convolve_mode(result, x.size(-1), y.size(-1), mode) + + +def convolve(x: torch.Tensor, y: torch.Tensor, mode: str = "full") -> torch.Tensor: + r""" + Convolves inputs along their last dimension using the direct method. + Note that, in contrast to :meth:`torch.nn.functional.conv1d`, which actually applies the valid cross-correlation + operator, this function applies the true `convolution`_ operator. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + x (torch.Tensor): First convolution operand, with shape `(..., N)`. + y (torch.Tensor): Second convolution operand, with shape `(..., M)` + (leading dimensions must be broadcast-able with those of ``x``). + mode (str, optional): Must be one of ("full", "valid", "same"). + + * "full": Returns the full convolution result, with shape `(..., N + M - 1)`. (Default) + * "valid": Returns the segment of the full convolution result corresponding to where + the two inputs overlap completely, with shape `(..., max(N, M) - min(N, M) + 1)`. + * "same": Returns the center segment of the full convolution result, with shape `(..., N)`. + + Returns: + torch.Tensor: Result of convolving ``x`` and ``y``, with shape `(..., L)`, where + the leading dimensions match those of ``x`` and `L` is dictated by ``mode``. + + .. _convolution: + https://en.wikipedia.org/wiki/Convolution + """ + _check_shape_compatible(x, y) + _check_convolve_mode(mode) + + x_size, y_size = x.size(-1), y.size(-1) + + if x.size(-1) < y.size(-1): + x, y = y, x + + if x.shape[:-1] != y.shape[:-1]: + new_shape = [max(i, j) for i, j in zip(x.shape[:-1], y.shape[:-1])] + x = x.broadcast_to(new_shape + [x.shape[-1]]) + y = y.broadcast_to(new_shape + [y.shape[-1]]) + + num_signals = torch.tensor(x.shape[:-1]).prod() + reshaped_x = x.reshape((int(num_signals), x.size(-1))) + reshaped_y = y.reshape((int(num_signals), y.size(-1))) + output = torch.nn.functional.conv1d( + input=reshaped_x, + weight=reshaped_y.flip(-1).unsqueeze(1), + stride=1, + groups=reshaped_x.size(0), + padding=reshaped_y.size(-1) - 1, + ) + output_shape = x.shape[:-1] + (-1,) + result = output.reshape(output_shape) + return _apply_convolve_mode(result, x_size, y_size, mode) + + +def add_noise( + waveform: torch.Tensor, noise: torch.Tensor, snr: torch.Tensor, lengths: Optional[torch.Tensor] = None +) -> torch.Tensor: + r"""Scales and adds noise to waveform per signal-to-noise ratio. + + Specifically, for each pair of waveform vector :math:`x \in \mathbb{R}^L` and noise vector + :math:`n \in \mathbb{R}^L`, the function computes output :math:`y` as + + .. math:: + y = x + a n \, \text{,} + + where + + .. math:: + a = \sqrt{ \frac{ ||x||_{2}^{2} }{ ||n||_{2}^{2} } \cdot 10^{-\frac{\text{SNR}}{10}} } \, \text{,} + + with :math:`\text{SNR}` being the desired signal-to-noise ratio between :math:`x` and :math:`n`, in dB. + + Note that this function broadcasts singleton leading dimensions in its inputs in a manner that is + consistent with the above formulae and PyTorch's broadcasting semantics. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (torch.Tensor): Input waveform, with shape `(..., L)`. + noise (torch.Tensor): Noise, with shape `(..., L)` (same shape as ``waveform``). 
+ snr (torch.Tensor): Signal-to-noise ratios in dB, with shape `(...,)`. + lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform`` and ``noise``, with shape + `(...,)` (leading dimensions must match those of ``waveform``). If ``None``, all elements in ``waveform`` + and ``noise`` are treated as valid. (Default: ``None``) + + Returns: + torch.Tensor: Result of scaling and adding ``noise`` to ``waveform``, with shape `(..., L)` + (same shape as ``waveform``). + """ + + if not (waveform.ndim - 1 == noise.ndim - 1 == snr.ndim and (lengths is None or lengths.ndim == snr.ndim)): + raise ValueError("Input leading dimensions don't match.") + + L = waveform.size(-1) + + if L != noise.size(-1): + raise ValueError(f"Length dimensions of waveform and noise don't match (got {L} and {noise.size(-1)}).") + + # compute scale + if lengths is not None: + mask = torch.arange(0, L, device=lengths.device).expand(waveform.shape) < lengths.unsqueeze( + -1 + ) # (*, L) < (*, 1) = (*, L) + masked_waveform = waveform * mask + masked_noise = noise * mask + else: + masked_waveform = waveform + masked_noise = noise + + energy_signal = torch.linalg.vector_norm(masked_waveform, ord=2, dim=-1) ** 2 # (*,) + energy_noise = torch.linalg.vector_norm(masked_noise, ord=2, dim=-1) ** 2 # (*,) + original_snr_db = 10 * (torch.log10(energy_signal) - torch.log10(energy_noise)) + scale = 10 ** ((original_snr_db - snr) / 20.0) # (*,) + + # scale noise + scaled_noise = scale.unsqueeze(-1) * noise # (*, 1) * (*, L) = (*, L) + + return waveform + scaled_noise # (*, L) + + +def speed( + waveform: torch.Tensor, orig_freq: int, factor: float, lengths: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + r"""Adjusts waveform speed. + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (torch.Tensor): Input signals, with shape `(..., time)`. + orig_freq (int): Original frequency of the signals in ``waveform``. + factor (float): Factor by which to adjust speed of input. Values greater than 1.0 + compress ``waveform`` in time, whereas values less than 1.0 stretch ``waveform`` in time. + lengths (torch.Tensor or None, optional): Valid lengths of signals in ``waveform``, with shape `(...)`. + If ``None``, all elements in ``waveform`` are treated as valid. (Default: ``None``) + + Returns: + (torch.Tensor, torch.Tensor or None): + torch.Tensor + Speed-adjusted waveform, with shape `(..., new_time).` + torch.Tensor or None + If ``lengths`` is not ``None``, valid lengths of signals in speed-adjusted waveform, + with shape `(...)`; otherwise, ``None``. + """ + + source_sample_rate = int(factor * orig_freq) + target_sample_rate = int(orig_freq) + + gcd = math.gcd(source_sample_rate, target_sample_rate) + source_sample_rate = source_sample_rate // gcd + target_sample_rate = target_sample_rate // gcd + + if lengths is None: + out_lengths = None + else: + out_lengths = torch.ceil(lengths * target_sample_rate / source_sample_rate).to(lengths.dtype) + + return resample(waveform, source_sample_rate, target_sample_rate), out_lengths + + +def preemphasis(waveform, coeff: float = 0.97) -> torch.Tensor: + r"""Pre-emphasizes a waveform along its last dimension, i.e. + for each signal :math:`x` in ``waveform``, computes + output :math:`y` as + + .. math:: + y[i] = x[i] - \text{coeff} \cdot x[i - 1] + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (torch.Tensor): Waveform, with shape `(..., N)`. 
+ coeff (float, optional): Pre-emphasis coefficient. Typically between 0.0 and 1.0. + (Default: 0.97) + + Returns: + torch.Tensor: Pre-emphasized waveform, with shape `(..., N)`. + """ + waveform = waveform.clone() + waveform[..., 1:] -= coeff * waveform[..., :-1] + return waveform + + +def deemphasis(waveform, coeff: float = 0.97) -> torch.Tensor: + r"""De-emphasizes a waveform along its last dimension. + Inverse of :meth:`preemphasis`. Concretely, for each signal + :math:`x` in ``waveform``, computes output :math:`y` as + + .. math:: + y[i] = x[i] + \text{coeff} \cdot y[i - 1] + + .. devices:: CPU CUDA + + .. properties:: Autograd TorchScript + + Args: + waveform (torch.Tensor): Waveform, with shape `(..., N)`. + coeff (float, optional): De-emphasis coefficient. Typically between 0.0 and 1.0. + (Default: 0.97) + + Returns: + torch.Tensor: De-emphasized waveform, with shape `(..., N)`. + """ + a_coeffs = torch.tensor([1.0, -coeff], dtype=waveform.dtype, device=waveform.device) + b_coeffs = torch.tensor([1.0, 0.0], dtype=waveform.dtype, device=waveform.device) + return torchaudio.functional.lfilter(waveform, a_coeffs=a_coeffs, b_coeffs=b_coeffs) + + +def frechet_distance(mu_x, sigma_x, mu_y, sigma_y): + r"""Computes the Fréchet distance between two multivariate normal distributions :cite:`dowson1982frechet`. + + Concretely, for multivariate Gaussians :math:`X(\mu_X, \Sigma_X)` + and :math:`Y(\mu_Y, \Sigma_Y)`, the function computes and returns :math:`F` as + + .. math:: + F(X, Y) = || \mu_X - \mu_Y ||_2^2 + + \text{Tr}\left( \Sigma_X + \Sigma_Y - 2 \sqrt{\Sigma_X \Sigma_Y} \right) + + Args: + mu_x (torch.Tensor): mean :math:`\mu_X` of multivariate Gaussian :math:`X`, with shape `(N,)`. + sigma_x (torch.Tensor): covariance matrix :math:`\Sigma_X` of :math:`X`, with shape `(N, N)`. + mu_y (torch.Tensor): mean :math:`\mu_Y` of multivariate Gaussian :math:`Y`, with shape `(N,)`. + sigma_y (torch.Tensor): covariance matrix :math:`\Sigma_Y` of :math:`Y`, with shape `(N, N)`. + + Returns: + torch.Tensor: the Fréchet distance between :math:`X` and :math:`Y`. + """ + if len(mu_x.size()) != 1: + raise ValueError(f"Input mu_x must be one-dimensional; got dimension {len(mu_x.size())}.") + if len(sigma_x.size()) != 2: + raise ValueError(f"Input sigma_x must be two-dimensional; got dimension {len(sigma_x.size())}.") + if sigma_x.size(0) != sigma_x.size(1) != mu_x.size(0): + raise ValueError("Each of sigma_x's dimensions must match mu_x's size.") + if mu_x.size() != mu_y.size(): + raise ValueError(f"Inputs mu_x and mu_y must have the same shape; got {mu_x.size()} and {mu_y.size()}.") + if sigma_x.size() != sigma_y.size(): + raise ValueError( + f"Inputs sigma_x and sigma_y must have the same shape; got {sigma_x.size()} and {sigma_y.size()}." + ) + + a = (mu_x - mu_y).square().sum() + b = sigma_x.trace() + sigma_y.trace() + c = torch.linalg.eigvals(sigma_x @ sigma_y).sqrt().real.sum() + return a + b - 2 * c diff --git a/.venv/lib/python3.11/site-packages/torchaudio/kaldi_io.py b/.venv/lib/python3.11/site-packages/torchaudio/kaldi_io.py new file mode 100644 index 0000000000000000000000000000000000000000..4d372429dcfd23fdcbe8cd0f38abef1086d8a5eb --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/kaldi_io.py @@ -0,0 +1,144 @@ +# To use this file, the dependency (https://github.com/vesis84/kaldi-io-for-python) +# needs to be installed. This is a light wrapper around kaldi_io that returns +# torch.Tensors. 
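+#
+# Example (an illustrative sketch; ``feats.ark`` is a hypothetical Kaldi archive):
+#
+#   >>> from torchaudio.kaldi_io import read_mat_ark
+#   >>> feats = {key: mat for key, mat in read_mat_ark("feats.ark")}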
+from typing import Any, Callable, Iterable, Tuple + +import torch +from torch import Tensor +from torchaudio._internal import module_utils as _mod_utils + +if _mod_utils.is_module_available("numpy"): + import numpy as np + + +__all__ = [ + "read_vec_int_ark", + "read_vec_flt_scp", + "read_vec_flt_ark", + "read_mat_scp", + "read_mat_ark", +] + + +def _convert_method_output_to_tensor( + file_or_fd: Any, fn: Callable, convert_contiguous: bool = False +) -> Iterable[Tuple[str, Tensor]]: + r"""Takes a method invokes it. The output is converted to a tensor. + + Args: + file_or_fd (str/FileDescriptor): File name or file descriptor + fn (Callable): Function that has the signature (file name/descriptor) and converts it to + Iterable[Tuple[str, Tensor]]. + convert_contiguous (bool, optional): Determines whether the array should be converted into a + contiguous layout. (Default: ``False``) + + Returns: + Iterable[Tuple[str, Tensor]]: The string is the key and the tensor is vec/mat + """ + for key, np_arr in fn(file_or_fd): + if convert_contiguous: + np_arr = np.ascontiguousarray(np_arr) + yield key, torch.from_numpy(np_arr) + + +@_mod_utils.requires_module("kaldi_io", "numpy") +def read_vec_int_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]: + r"""Create generator of (key,vector) tuples, which reads from the ark file/stream. + + Args: + file_or_fd (str/FileDescriptor): ark, gzipped ark, pipe or opened file descriptor + + Returns: + Iterable[Tuple[str, Tensor]]: The string is the key and the tensor is the vector read from file + + Example + >>> # read ark to a 'dictionary' + >>> d = { u:d for u,d in torchaudio.kaldi_io.read_vec_int_ark(file) } + """ + + import kaldi_io + + # Requires convert_contiguous to be True because elements from int32 vector are + # sorted in tuples: (sizeof(int32), value) so strides are (5,) instead of (4,) which will throw an error + # in from_numpy as it expects strides to be a multiple of 4 (int32). + return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_int_ark, convert_contiguous=True) + + +@_mod_utils.requires_module("kaldi_io", "numpy") +def read_vec_flt_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]: + r"""Create generator of (key,vector) tuples, read according to Kaldi scp. + + Args: + file_or_fd (str/FileDescriptor): scp, gzipped scp, pipe or opened file descriptor + + Returns: + Iterable[Tuple[str, Tensor]]: The string is the key and the tensor is the vector read from file + + Example + >>> # read scp to a 'dictionary' + >>> # d = { u:d for u,d in torchaudio.kaldi_io.read_vec_flt_scp(file) } + """ + + import kaldi_io + + return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_flt_scp) + + +@_mod_utils.requires_module("kaldi_io", "numpy") +def read_vec_flt_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]: + r"""Create generator of (key,vector) tuples, which reads from the ark file/stream. + + Args: + file_or_fd (str/FileDescriptor): ark, gzipped ark, pipe or opened file descriptor + + Returns: + Iterable[Tuple[str, Tensor]]: The string is the key and the tensor is the vector read from file + + Example + >>> # read ark to a 'dictionary' + >>> d = { u:d for u,d in torchaudio.kaldi_io.read_vec_flt_ark(file) } + """ + + import kaldi_io + + return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_vec_flt_ark) + + +@_mod_utils.requires_module("kaldi_io", "numpy") +def read_mat_scp(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]: + r"""Create generator of (key,matrix) tuples, read according to Kaldi scp. 
+ + Args: + file_or_fd (str/FileDescriptor): scp, gzipped scp, pipe or opened file descriptor + + Returns: + Iterable[Tuple[str, Tensor]]: The string is the key and the tensor is the matrix read from file + + Example + >>> # read scp to a 'dictionary' + >>> d = { u:d for u,d in torchaudio.kaldi_io.read_mat_scp(file) } + """ + + import kaldi_io + + return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_mat_scp) + + +@_mod_utils.requires_module("kaldi_io", "numpy") +def read_mat_ark(file_or_fd: Any) -> Iterable[Tuple[str, Tensor]]: + r"""Create generator of (key,matrix) tuples, which reads from the ark file/stream. + + Args: + file_or_fd (str/FileDescriptor): ark, gzipped ark, pipe or opened file descriptor + + Returns: + Iterable[Tuple[str, Tensor]]: The string is the key and the tensor is the matrix read from file + + Example + >>> # read ark to a 'dictionary' + >>> d = { u:d for u,d in torchaudio.kaldi_io.read_mat_ark(file) } + """ + + import kaldi_io + + return _convert_method_output_to_tensor(file_or_fd, kaldi_io.read_mat_ark) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__init__.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..efec1f3521e760803e095efb71f164ed268896f1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__init__.py @@ -0,0 +1,102 @@ +from ._source_separation_pipeline import ( + CONVTASNET_BASE_LIBRI2MIX, + HDEMUCS_HIGH_MUSDB, + HDEMUCS_HIGH_MUSDB_PLUS, + SourceSeparationBundle, +) +from ._squim_pipeline import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE, SquimObjectiveBundle, SquimSubjectiveBundle +from ._tts import ( + TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, + TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, + TACOTRON2_WAVERNN_CHAR_LJSPEECH, + TACOTRON2_WAVERNN_PHONE_LJSPEECH, + Tacotron2TTSBundle, +) +from ._wav2vec2.impl import ( + HUBERT_ASR_LARGE, + HUBERT_ASR_XLARGE, + HUBERT_BASE, + HUBERT_LARGE, + HUBERT_XLARGE, + MMS_FA, + VOXPOPULI_ASR_BASE_10K_DE, + VOXPOPULI_ASR_BASE_10K_EN, + VOXPOPULI_ASR_BASE_10K_ES, + VOXPOPULI_ASR_BASE_10K_FR, + VOXPOPULI_ASR_BASE_10K_IT, + WAV2VEC2_ASR_BASE_100H, + WAV2VEC2_ASR_BASE_10M, + WAV2VEC2_ASR_BASE_960H, + WAV2VEC2_ASR_LARGE_100H, + WAV2VEC2_ASR_LARGE_10M, + WAV2VEC2_ASR_LARGE_960H, + WAV2VEC2_ASR_LARGE_LV60K_100H, + WAV2VEC2_ASR_LARGE_LV60K_10M, + WAV2VEC2_ASR_LARGE_LV60K_960H, + WAV2VEC2_BASE, + WAV2VEC2_LARGE, + WAV2VEC2_LARGE_LV60K, + WAV2VEC2_XLSR53, + WAV2VEC2_XLSR_1B, + WAV2VEC2_XLSR_2B, + WAV2VEC2_XLSR_300M, + Wav2Vec2ASRBundle, + Wav2Vec2Bundle, + Wav2Vec2FABundle, + WAVLM_BASE, + WAVLM_BASE_PLUS, + WAVLM_LARGE, +) +from .rnnt_pipeline import EMFORMER_RNNT_BASE_LIBRISPEECH, RNNTBundle + + +__all__ = [ + "Wav2Vec2Bundle", + "Wav2Vec2ASRBundle", + "Wav2Vec2FABundle", + "WAV2VEC2_BASE", + "WAV2VEC2_LARGE", + "WAV2VEC2_LARGE_LV60K", + "WAV2VEC2_ASR_BASE_10M", + "WAV2VEC2_ASR_BASE_100H", + "WAV2VEC2_ASR_BASE_960H", + "WAV2VEC2_ASR_LARGE_10M", + "WAV2VEC2_ASR_LARGE_100H", + "WAV2VEC2_ASR_LARGE_960H", + "WAV2VEC2_ASR_LARGE_LV60K_10M", + "WAV2VEC2_ASR_LARGE_LV60K_100H", + "WAV2VEC2_ASR_LARGE_LV60K_960H", + "WAV2VEC2_XLSR53", + "WAV2VEC2_XLSR_300M", + "WAV2VEC2_XLSR_1B", + "WAV2VEC2_XLSR_2B", + "VOXPOPULI_ASR_BASE_10K_EN", + "VOXPOPULI_ASR_BASE_10K_ES", + "VOXPOPULI_ASR_BASE_10K_DE", + "VOXPOPULI_ASR_BASE_10K_FR", + "VOXPOPULI_ASR_BASE_10K_IT", + "HUBERT_BASE", + "HUBERT_LARGE", + "HUBERT_XLARGE", + "HUBERT_ASR_LARGE", + "HUBERT_ASR_XLARGE", + "MMS_FA", + "WAVLM_BASE", + "WAVLM_BASE_PLUS", + 
"WAVLM_LARGE", + "Tacotron2TTSBundle", + "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH", + "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH", + "TACOTRON2_WAVERNN_CHAR_LJSPEECH", + "TACOTRON2_WAVERNN_PHONE_LJSPEECH", + "RNNTBundle", + "EMFORMER_RNNT_BASE_LIBRISPEECH", + "SourceSeparationBundle", + "CONVTASNET_BASE_LIBRI2MIX", + "HDEMUCS_HIGH_MUSDB_PLUS", + "HDEMUCS_HIGH_MUSDB", + "SQUIM_OBJECTIVE", + "SQUIM_SUBJECTIVE", + "SquimObjectiveBundle", + "SquimSubjectiveBundle", +] diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ab43fa5f988fb96d82cec03edae9e6bd789bdce Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d5a2c6b245dbd2bfe446d3e77ae9ce597235dc94 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/_source_separation_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f5db23f266ed2ed6055eb35830f7c89ec2908eb4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/_squim_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d6727d90e450636686d5df9788d1ebce90c6b25d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/__pycache__/rnnt_pipeline.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_source_separation_pipeline.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_source_separation_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..ae92e21831307f91450b32a73563c0011e455753 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_source_separation_pipeline.py @@ -0,0 +1,109 @@ +from dataclasses import dataclass +from functools import partial +from typing import Callable + +import torch +import torchaudio + +from torchaudio.models import conv_tasnet_base, hdemucs_high + + +@dataclass +class SourceSeparationBundle: + """Dataclass that bundles components for performing source separation. + + Example + >>> import torchaudio + >>> from torchaudio.pipelines import CONVTASNET_BASE_LIBRI2MIX + >>> import torch + >>> + >>> # Build the separation model. + >>> model = CONVTASNET_BASE_LIBRI2MIX.get_model() + >>> 100%|███████████████████████████████|19.1M/19.1M [00:04<00:00, 4.93MB/s] + >>> + >>> # Instantiate the test set of Libri2Mix dataset. + >>> dataset = torchaudio.datasets.LibriMix("/home/datasets/", subset="test") + >>> + >>> # Apply source separation on mixture audio. 
+        >>> for i, data in enumerate(dataset):
+        >>>     sample_rate, mixture, clean_sources = data
+        >>>     # Make sure the shape of input suits the model requirement.
+        >>>     mixture = mixture.reshape(1, 1, -1)
+        >>>     estimated_sources = model(mixture)
+        >>>     score = si_snr_pit(estimated_sources, clean_sources)  # for demonstration
+        >>>     print(f"Si-SNR score is : {score}.")
+        >>>     break
+        >>> Si-SNR score is : 16.24.
+        >>>
+    """
+
+    _model_path: str
+    _model_factory_func: Callable[[], torch.nn.Module]
+    _sample_rate: int
+
+    @property
+    def sample_rate(self) -> int:
+        """Sample rate of the audio that the model is trained on.
+
+        :type: int
+        """
+        return self._sample_rate
+
+    def get_model(self) -> torch.nn.Module:
+        """Construct the model and load the pretrained weight."""
+        model = self._model_factory_func()
+        path = torchaudio.utils.download_asset(self._model_path)
+        state_dict = torch.load(path)
+        model.load_state_dict(state_dict)
+        model.eval()
+        return model
+
+
+CONVTASNET_BASE_LIBRI2MIX = SourceSeparationBundle(
+    _model_path="models/conv_tasnet_base_libri2mix.pt",
+    _model_factory_func=partial(conv_tasnet_base, num_sources=2),
+    _sample_rate=8000,
+)
+CONVTASNET_BASE_LIBRI2MIX.__doc__ = """Pre-trained Source Separation pipeline with *ConvTasNet*
+:cite:`Luo_2019` trained on *Libri2Mix dataset* :cite:`cosentino2020librimix`.
+
+The source separation model is constructed by :func:`~torchaudio.models.conv_tasnet_base`
+and is trained using the training script ``lightning_train.py``
+`here `__
+with default arguments.
+
+Please refer to :class:`SourceSeparationBundle` for usage instructions.
+"""
+
+
+HDEMUCS_HIGH_MUSDB_PLUS = SourceSeparationBundle(
+    _model_path="models/hdemucs_high_trained.pt",
+    _model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
+    _sample_rate=44100,
+)
+HDEMUCS_HIGH_MUSDB_PLUS.__doc__ = """Pre-trained music source separation pipeline with
+*Hybrid Demucs* :cite:`defossez2021hybrid` trained on both training and test sets of
+MUSDB-HQ :cite:`MUSDB18HQ` and an additional 150 extra songs from an internal database
+that was specifically produced for Meta.
+
+The model is constructed by :func:`~torchaudio.models.hdemucs_high`.
+
+Training was performed in the original HDemucs repository `here `__.
+
+Please refer to :class:`SourceSeparationBundle` for usage instructions.
+"""
+
+
+HDEMUCS_HIGH_MUSDB = SourceSeparationBundle(
+    _model_path="models/hdemucs_high_musdbhq_only.pt",
+    _model_factory_func=partial(hdemucs_high, sources=["drums", "bass", "other", "vocals"]),
+    _sample_rate=44100,
+)
+HDEMUCS_HIGH_MUSDB.__doc__ = """Pre-trained music source separation pipeline with
+*Hybrid Demucs* :cite:`defossez2021hybrid` trained on the training set of MUSDB-HQ :cite:`MUSDB18HQ`.
+
+The model is constructed by :func:`~torchaudio.models.hdemucs_high`.
+Training was performed in the original HDemucs repository `here `__.
+
+Please refer to :class:`SourceSeparationBundle` for usage instructions.
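+
+A rough usage sketch (illustrative only, not part of the upstream documentation;
+``"song.wav"`` is a hypothetical stereo music file):
+
+    >>> import torchaudio
+    >>> bundle = torchaudio.pipelines.HDEMUCS_HIGH_MUSDB
+    >>> model = bundle.get_model()
+    >>> waveform, sr = torchaudio.load("song.wav")  # (channels, time)
+    >>> waveform = torchaudio.functional.resample(waveform, sr, bundle.sample_rate)
+    >>> sources = model(waveform.unsqueeze(0))  # (batch, sources, channels, time)
+    >>> drums, bass, other, vocals = sources[0]  # order follows the ``sources`` list above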
+""" diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_squim_pipeline.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_squim_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..0c70db4aef70397d33dcb3d3b28131221cef52c8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_squim_pipeline.py @@ -0,0 +1,156 @@ +from dataclasses import dataclass + +import torch +import torchaudio + +from torchaudio.models import squim_objective_base, squim_subjective_base, SquimObjective, SquimSubjective + + +@dataclass +class SquimObjectiveBundle: + """Data class that bundles associated information to use pretrained + :py:class:`~torchaudio.models.SquimObjective` model. + + This class provides interfaces for instantiating the pretrained model along with + the information necessary to retrieve pretrained weights and additional data + to be used with the model. + + Torchaudio library instantiates objects of this class, each of which represents + a different pretrained model. Client code should access pretrained models via these + instances. + + This bundle can estimate objective metric scores for speech enhancement, such as STOI, PESQ, Si-SDR. + A typical use case would be a flow like `waveform -> list of scores`. Please see below for the code example. + + Example: Estimate the objective metric scores for the input waveform. + >>> import torch + >>> import torchaudio + >>> from torchaudio.pipelines import SQUIM_OBJECTIVE as bundle + >>> + >>> # Load the SquimObjective bundle + >>> model = bundle.get_model() + Downloading: "https://download.pytorch.org/torchaudio/models/squim_objective_dns2020.pth" + 100%|████████████| 28.2M/28.2M [00:03<00:00, 9.24MB/s] + >>> + >>> # Resample audio to the expected sampling rate + >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate) + >>> + >>> # Estimate objective metric scores + >>> scores = model(waveform) + >>> print(f"STOI: {scores[0].item()}, PESQ: {scores[1].item()}, SI-SDR: {scores[2].item()}.") + """ # noqa: E501 + + _path: str + _sample_rate: float + + def get_model(self) -> SquimObjective: + """Construct the SquimObjective model, and load the pretrained weight. + + Returns: + Variation of :py:class:`~torchaudio.models.SquimObjective`. + """ + model = squim_objective_base() + path = torchaudio.utils.download_asset(f"models/{self._path}") + state_dict = torch.load(path, weights_only=True) + model.load_state_dict(state_dict) + model.eval() + return model + + @property + def sample_rate(self): + """Sample rate of the audio that the model is trained on. + + :type: float + """ + return self._sample_rate + + +SQUIM_OBJECTIVE = SquimObjectiveBundle( + "squim_objective_dns2020.pth", + _sample_rate=16000, +) +SQUIM_OBJECTIVE.__doc__ = """SquimObjective pipeline trained using approach described in + :cite:`kumar2023torchaudio` on the *DNS 2020 Dataset* :cite:`reddy2020interspeech`. + + The underlying model is constructed by :py:func:`torchaudio.models.squim_objective_base`. + The weights are under `Creative Commons Attribution 4.0 International License + `__. + + Please refer to :py:class:`SquimObjectiveBundle` for usage instructions. + """ + + +@dataclass +class SquimSubjectiveBundle: + """Data class that bundles associated information to use pretrained + :py:class:`~torchaudio.models.SquimSubjective` model. 
+
+    This class provides interfaces for instantiating the pretrained model along with
+    the information necessary to retrieve pretrained weights and additional data
+    to be used with the model.
+
+    Torchaudio library instantiates objects of this class, each of which represents
+    a different pretrained model. Client code should access pretrained models via these
+    instances.
+
+    This bundle can estimate subjective metric scores for speech enhancement, such as MOS.
+    A typical use case would be a flow like `waveform -> score`. Please see below for the code example.
+
+    Example: Estimate the subjective metric scores for the input waveform.
+        >>> import torch
+        >>> import torchaudio
+        >>> from torchaudio.pipelines import SQUIM_SUBJECTIVE as bundle
+        >>>
+        >>> # Load the SquimSubjective bundle
+        >>> model = bundle.get_model()
+        Downloading: "https://download.pytorch.org/torchaudio/models/squim_subjective_bvcc_daps.pth"
+        100%|████████████| 360M/360M [00:09<00:00, 41.1MB/s]
+        >>>
+        >>> # Resample audio to the expected sampling rate
+        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
+        >>> # Use a clean reference (doesn't need to be the reference for the waveform) as the second input
+        >>> reference = torchaudio.functional.resample(reference, sample_rate, bundle.sample_rate)
+        >>>
+        >>> # Estimate subjective metric scores
+        >>> score = model(waveform, reference)
+        >>> print(f"MOS: {score}.")
+    """  # noqa: E501
+
+    _path: str
+    _sample_rate: float
+
+    def get_model(self) -> SquimSubjective:
+        """Construct the SquimSubjective model, and load the pretrained weight.
+
+        Returns:
+            Variation of :py:class:`~torchaudio.models.SquimSubjective`.
+        """
+        model = squim_subjective_base()
+        path = torchaudio.utils.download_asset(f"models/{self._path}")
+        state_dict = torch.load(path, weights_only=True)
+        model.load_state_dict(state_dict)
+        model.eval()
+        return model
+
+    @property
+    def sample_rate(self):
+        """Sample rate of the audio that the model is trained on.
+
+        :type: float
+        """
+        return self._sample_rate
+
+
+SQUIM_SUBJECTIVE = SquimSubjectiveBundle(
+    "squim_subjective_bvcc_daps.pth",
+    _sample_rate=16000,
+)
+SQUIM_SUBJECTIVE.__doc__ = """SquimSubjective pipeline trained
+    as described in :cite:`manocha2022speech` and :cite:`kumar2023torchaudio`
+    on the *BVCC* :cite:`cooper2021voices` and *DAPS* :cite:`mysore2014can` datasets.
+
+    The underlying model is constructed by :py:func:`torchaudio.models.squim_subjective_base`.
+    The weights are under `Creative Commons Attribution Non Commercial 4.0 International
+    `__.
+
+    Please refer to :py:class:`SquimSubjectiveBundle` for usage instructions.
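+
+    An illustrative sketch only (the file names are hypothetical): the subjective and the
+    objective bundles can be combined to report MOS alongside STOI, PESQ and Si-SDR for
+    the same recording.
+
+        >>> import torchaudio
+        >>> from torchaudio.pipelines import SQUIM_OBJECTIVE, SQUIM_SUBJECTIVE
+        >>> wav, sr = torchaudio.load("enhanced.wav")
+        >>> ref, ref_sr = torchaudio.load("clean_reference.wav")
+        >>> wav = torchaudio.functional.resample(wav, sr, SQUIM_OBJECTIVE.sample_rate)
+        >>> ref = torchaudio.functional.resample(ref, ref_sr, SQUIM_SUBJECTIVE.sample_rate)
+        >>> stoi, pesq, si_sdr = SQUIM_OBJECTIVE.get_model()(wav)
+        >>> mos = SQUIM_SUBJECTIVE.get_model()(wav, ref)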
+ """ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__init__.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..02851f596ceb281acc75c4d6a1aaf17eeee4a809 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__init__.py @@ -0,0 +1,16 @@ +from .impl import ( + TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH, + TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH, + TACOTRON2_WAVERNN_CHAR_LJSPEECH, + TACOTRON2_WAVERNN_PHONE_LJSPEECH, +) +from .interface import Tacotron2TTSBundle + + +__all__ = [ + "Tacotron2TTSBundle", + "TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH", + "TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH", + "TACOTRON2_WAVERNN_CHAR_LJSPEECH", + "TACOTRON2_WAVERNN_PHONE_LJSPEECH", +] diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd3e99e61e6aced84dcbfbce4a71eda7dfcdd9b5 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/impl.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/impl.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..74ef3b5ed86d8ccb640a30a1ecbe2ff6448163b3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/impl.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/interface.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/interface.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..db50caa54f407e4938b17ab899db597859527133 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/interface.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..96af402fe30646e1415151bf37d208e5bae5bec7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/impl.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/impl.py new file mode 100644 index 0000000000000000000000000000000000000000..b8542286242dcbb2036fff49c1d0e11fbbf9258b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/impl.py @@ -0,0 +1,385 @@ +import re +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import Tensor +from torchaudio._internal import load_state_dict_from_url +from torchaudio.functional import mu_law_decoding +from torchaudio.models import Tacotron2, WaveRNN +from torchaudio.transforms import GriffinLim, InverseMelScale + +from . 
import utils +from .interface import Tacotron2TTSBundle + +__all__ = [] + +_BASE_URL = "https://download.pytorch.org/torchaudio/models" + + +################################################################################ +# Pipeline implementation - Text Processor +################################################################################ + + +class _EnglishCharProcessor(Tacotron2TTSBundle.TextProcessor): + def __init__(self): + super().__init__() + self._tokens = utils._get_chars() + self._mapping = {s: i for i, s in enumerate(self._tokens)} + + @property + def tokens(self): + return self._tokens + + def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]: + if isinstance(texts, str): + texts = [texts] + indices = [[self._mapping[c] for c in t.lower() if c in self._mapping] for t in texts] + return utils._to_tensor(indices) + + +class _EnglishPhoneProcessor(Tacotron2TTSBundle.TextProcessor): + def __init__(self, *, dl_kwargs=None): + super().__init__() + self._tokens = utils._get_phones() + self._mapping = {p: i for i, p in enumerate(self._tokens)} + self._phonemizer = utils._load_phonemizer("en_us_cmudict_forward.pt", dl_kwargs=dl_kwargs) + self._pattern = r"(\[[A-Z]+?\]|[_!'(),.:;? -])" + + @property + def tokens(self): + return self._tokens + + def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]: + if isinstance(texts, str): + texts = [texts] + + indices = [] + for phones in self._phonemizer(texts, lang="en_us"): + # '[F][UW][B][AA][R]!' -> ['F', 'UW', 'B', 'AA', 'R', '!'] + ret = [re.sub(r"[\[\]]", "", r) for r in re.findall(self._pattern, phones)] + indices.append([self._mapping[p] for p in ret]) + return utils._to_tensor(indices) + + +################################################################################ +# Pipeline implementation - Vocoder +################################################################################ + + +class _WaveRNNVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder): + def __init__(self, model: WaveRNN, min_level_db: Optional[float] = -100): + super().__init__() + self._sample_rate = 22050 + self._model = model + self._min_level_db = min_level_db + + @property + def sample_rate(self): + return self._sample_rate + + def forward(self, mel_spec, lengths=None): + mel_spec = torch.exp(mel_spec) + mel_spec = 20 * torch.log10(torch.clamp(mel_spec, min=1e-5)) + if self._min_level_db is not None: + mel_spec = (self._min_level_db - mel_spec) / self._min_level_db + mel_spec = torch.clamp(mel_spec, min=0, max=1) + waveform, lengths = self._model.infer(mel_spec, lengths) + waveform = utils._unnormalize_waveform(waveform, self._model.n_bits) + waveform = mu_law_decoding(waveform, self._model.n_classes) + waveform = waveform.squeeze(1) + return waveform, lengths + + +class _GriffinLimVocoder(torch.nn.Module, Tacotron2TTSBundle.Vocoder): + def __init__(self): + super().__init__() + self._sample_rate = 22050 + self._inv_mel = InverseMelScale( + n_stft=(1024 // 2 + 1), + n_mels=80, + sample_rate=self.sample_rate, + f_min=0.0, + f_max=8000.0, + mel_scale="slaney", + norm="slaney", + ) + self._griffin_lim = GriffinLim( + n_fft=1024, + power=1, + hop_length=256, + win_length=1024, + ) + + @property + def sample_rate(self): + return self._sample_rate + + def forward(self, mel_spec, lengths=None): + mel_spec = torch.exp(mel_spec) + mel_spec = mel_spec.clone().detach().requires_grad_(True) + spec = self._inv_mel(mel_spec) + spec = spec.detach().requires_grad_(False) + waveforms = self._griffin_lim(spec) + return waveforms, 
lengths + + +################################################################################ +# Bundle classes mixins +################################################################################ + + +class _CharMixin: + def get_text_processor(self) -> Tacotron2TTSBundle.TextProcessor: + return _EnglishCharProcessor() + + +class _PhoneMixin: + def get_text_processor(self, *, dl_kwargs=None) -> Tacotron2TTSBundle.TextProcessor: + return _EnglishPhoneProcessor(dl_kwargs=dl_kwargs) + + +@dataclass +class _Tacotron2Mixin: + _tacotron2_path: str + _tacotron2_params: Dict[str, Any] + + def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2: + model = Tacotron2(**self._tacotron2_params) + url = f"{_BASE_URL}/{self._tacotron2_path}" + dl_kwargs = {} if dl_kwargs is None else dl_kwargs + state_dict = load_state_dict_from_url(url, **dl_kwargs) + model.load_state_dict(state_dict) + model.eval() + return model + + +@dataclass +class _WaveRNNMixin: + _wavernn_path: Optional[str] + _wavernn_params: Optional[Dict[str, Any]] + + def get_vocoder(self, *, dl_kwargs=None): + wavernn = self._get_wavernn(dl_kwargs=dl_kwargs) + return _WaveRNNVocoder(wavernn) + + def _get_wavernn(self, *, dl_kwargs=None): + model = WaveRNN(**self._wavernn_params) + url = f"{_BASE_URL}/{self._wavernn_path}" + dl_kwargs = {} if dl_kwargs is None else dl_kwargs + state_dict = load_state_dict_from_url(url, **dl_kwargs) + model.load_state_dict(state_dict) + model.eval() + return model + + +class _GriffinLimMixin: + def get_vocoder(self, **_): + return _GriffinLimVocoder() + + +################################################################################ +# Bundle classes +################################################################################ + + +@dataclass +class _Tacotron2WaveRNNCharBundle(_WaveRNNMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle): + pass + + +@dataclass +class _Tacotron2WaveRNNPhoneBundle(_WaveRNNMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle): + pass + + +@dataclass +class _Tacotron2GriffinLimCharBundle(_GriffinLimMixin, _Tacotron2Mixin, _CharMixin, Tacotron2TTSBundle): + pass + + +@dataclass +class _Tacotron2GriffinLimPhoneBundle(_GriffinLimMixin, _Tacotron2Mixin, _PhoneMixin, Tacotron2TTSBundle): + pass + + +################################################################################ +# Instantiate bundle objects +################################################################################ + + +TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH = _Tacotron2GriffinLimCharBundle( + _tacotron2_path="tacotron2_english_characters_1500_epochs_ljspeech.pth", + _tacotron2_params=utils._get_taco_params(n_symbols=38), +) +TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and +:py:class:`~torchaudio.transforms.GriffinLim` as vocoder. + +The text processor encodes the input texts character-by-character. + +You can find the training script `here `__. +The default parameters were used. + +Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage. + +Example - "Hello world! T T S stands for Text to Speech!" + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png + :alt: Spectrogram generated by Tacotron2 + + .. raw:: html + + + +Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired," + + .. 
image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png + :alt: Spectrogram generated by Tacotron2 + + .. raw:: html + + +""" # noqa: E501 + +TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH = _Tacotron2GriffinLimPhoneBundle( + _tacotron2_path="tacotron2_english_phonemes_1500_epochs_ljspeech.pth", + _tacotron2_params=utils._get_taco_params(n_symbols=96), +) +TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and +:py:class:`~torchaudio.transforms.GriffinLim` as vocoder. + +The text processor encodes the input texts based on phoneme. +It uses `DeepPhonemizer `__ to convert +graphemes to phonemes. +The model (*en_us_cmudict_forward*) was trained on +`CMUDict `__. + +You can find the training script `here `__. +The text processor is set to the *"english_phonemes"*. + +Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage. + +Example - "Hello world! T T S stands for Text to Speech!" + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png + :alt: Spectrogram generated by Tacotron2 + + .. raw:: html + + + +Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired," + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png + :alt: Spectrogram generated by Tacotron2 + + .. raw:: html + + + +""" # noqa: E501 + +TACOTRON2_WAVERNN_CHAR_LJSPEECH = _Tacotron2WaveRNNCharBundle( + _tacotron2_path="tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pth", + _tacotron2_params=utils._get_taco_params(n_symbols=38), + _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth", + _wavernn_params=utils._get_wrnn_params(), +) +TACOTRON2_WAVERNN_CHAR_LJSPEECH.__doc__ = """Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and :py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs. + +The text processor encodes the input texts character-by-character. + +You can find the training script `here `__. +The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``, +``mel_fmin=40``, and ``mel_fmax=11025``. + +You can find the training script `here `__. + +Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage. + +Example - "Hello world! T T S stands for Text to Speech!" + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png + :alt: Spectrogram generated by Tacotron2 + + .. raw:: html + + + +Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired," + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png + :alt: Spectrogram generated by Tacotron2 + + .. 
raw:: html + + +""" # noqa: E501 + +TACOTRON2_WAVERNN_PHONE_LJSPEECH = _Tacotron2WaveRNNPhoneBundle( + _tacotron2_path="tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth", + _tacotron2_params=utils._get_taco_params(n_symbols=96), + _wavernn_path="wavernn_10k_epochs_8bits_ljspeech.pth", + _wavernn_params=utils._get_wrnn_params(), +) +TACOTRON2_WAVERNN_PHONE_LJSPEECH.__doc__ = """Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and +:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs. + +The text processor encodes the input texts based on phoneme. +It uses `DeepPhonemizer `__ to convert +graphemes to phonemes. +The model (*en_us_cmudict_forward*) was trained on +`CMUDict `__. + +You can find the training script for Tacotron2 `here `__. +The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``, +``mel_fmin=40``, and ``mel_fmax=11025``. + +You can find the training script for WaveRNN `here `__. + +Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage. + +Example - "Hello world! T T S stands for Text to Speech!" + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png + :alt: Spectrogram generated by Tacotron2 + + .. raw:: html + + + + +Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired," + + .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png + :alt: Spectrogram generated by Tacotron2 + + .. raw:: html + + +""" # noqa: E501 diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/interface.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/interface.py new file mode 100644 index 0000000000000000000000000000000000000000..564f236bc7c239d17dc82db04c350a9ccc618841 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/interface.py @@ -0,0 +1,255 @@ +from abc import ABC, abstractmethod +from typing import List, Optional, Tuple, Union + +from torch import Tensor +from torchaudio.models import Tacotron2 + + +class _TextProcessor(ABC): + @property + @abstractmethod + def tokens(self): + """The tokens that the each value in the processed tensor represent. + + :type: List[str] + """ + + @abstractmethod + def __call__(self, texts: Union[str, List[str]]) -> Tuple[Tensor, Tensor]: + """Encode the given (batch of) texts into numerical tensors + + Args: + text (str or list of str): The input texts. + + Returns: + (Tensor, Tensor): + Tensor: + The encoded texts. Shape: `(batch, max length)` + Tensor: + The valid length of each sample in the batch. Shape: `(batch, )`. + """ + + +class _Vocoder(ABC): + @property + @abstractmethod + def sample_rate(self): + """The sample rate of the resulting waveform + + :type: float + """ + + @abstractmethod + def __call__(self, specgrams: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: + """Generate waveform from the given input, such as spectrogram + + Args: + specgrams (Tensor): + The input spectrogram. Shape: `(batch, frequency bins, time)`. + The expected shape depends on the implementation. + lengths (Tensor, or None, optional): + The valid length of each sample in the batch. Shape: `(batch, )`. 
+ (Default: `None`) + + Returns: + (Tensor, Optional[Tensor]): + Tensor: + The generated waveform. Shape: `(batch, max length)` + Tensor or None: + The valid length of each sample in the batch. Shape: `(batch, )`. + """ + + +class Tacotron2TTSBundle(ABC): + """Data class that bundles associated information to use pretrained Tacotron2 and vocoder. + + This class provides interfaces for instantiating the pretrained model along with + the information necessary to retrieve pretrained weights and additional data + to be used with the model. + + Torchaudio library instantiates objects of this class, each of which represents + a different pretrained model. Client code should access pretrained models via these + instances. + + Please see below for the usage and the available values. + + Example - Character-based TTS pipeline with Tacotron2 and WaveRNN + >>> import torchaudio + >>> + >>> text = "Hello, T T S !" + >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH + >>> + >>> # Build processor, Tacotron2 and WaveRNN model + >>> processor = bundle.get_text_processor() + >>> tacotron2 = bundle.get_tacotron2() + Downloading: + 100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s] + >>> vocoder = bundle.get_vocoder() + Downloading: + 100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s] + >>> + >>> # Encode text + >>> input, lengths = processor(text) + >>> + >>> # Generate (mel-scale) spectrogram + >>> specgram, lengths, _ = tacotron2.infer(input, lengths) + >>> + >>> # Convert spectrogram to waveform + >>> waveforms, lengths = vocoder(specgram, lengths) + >>> + >>> torchaudio.save('hello-tts.wav', waveforms, vocoder.sample_rate) + + Example - Phoneme-based TTS pipeline with Tacotron2 and WaveRNN + >>> + >>> # Note: + >>> # This bundle uses pre-trained DeepPhonemizer as + >>> # the text pre-processor. + >>> # Please install deep-phonemizer. + >>> # See https://github.com/as-ideas/DeepPhonemizer + >>> # The pretrained weight is automatically downloaded. + >>> + >>> import torchaudio + >>> + >>> text = "Hello, TTS!" + >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH + >>> + >>> # Build processor, Tacotron2 and WaveRNN model + >>> processor = bundle.get_text_processor() + Downloading: + 100%|███████████████████████████████| 63.6M/63.6M [00:04<00:00, 15.3MB/s] + >>> tacotron2 = bundle.get_tacotron2() + Downloading: + 100%|███████████████████████████████| 107M/107M [00:01<00:00, 87.9MB/s] + >>> vocoder = bundle.get_vocoder() + Downloading: + 100%|███████████████████████████████| 16.7M/16.7M [00:00<00:00, 78.1MB/s] + >>> + >>> # Encode text + >>> input, lengths = processor(text) + >>> + >>> # Generate (mel-scale) spectrogram + >>> specgram, lengths, _ = tacotron2.infer(input, lengths) + >>> + >>> # Convert spectrogram to waveform + >>> waveforms, lengths = vocoder(specgram, lengths) + >>> + >>> torchaudio.save('hello-tts.wav', waveforms, vocoder.sample_rate) + """ + + # Using the inner class so that these interfaces are not directly exposed on + # `torchaudio.pipelines`, but still listed in documentation. + # The thing is, text processing and vocoder are generic and we do not know what kind of + # new text processing and vocoder will be added in the future, so we want to make these + # interfaces specific to this Tacotron2TTS pipeline. + + class TextProcessor(_TextProcessor): + """Interface of the text processing part of Tacotron2TTS pipeline + + See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_text_processor` for the usage. 
+ """ + + class Vocoder(_Vocoder): + """Interface of the vocoder part of Tacotron2TTS pipeline + + See :func:`torchaudio.pipelines.Tacotron2TTSBundle.get_vocoder` for the usage. + """ + + @abstractmethod + def get_text_processor(self, *, dl_kwargs=None) -> TextProcessor: + """Create a text processor + + For character-based pipeline, this processor splits the input text by character. + For phoneme-based pipeline, this processor converts the input text (grapheme) to + phonemes. + + If a pre-trained weight file is necessary, + :func:`torch.hub.download_url_to_file` is used to downloaded it. + + Args: + dl_kwargs (dictionary of keyword arguments,): + Passed to :func:`torch.hub.download_url_to_file`. + + Returns: + TextProcessor: + A callable which takes a string or a list of strings as input and + returns Tensor of encoded texts and Tensor of valid lengths. + The object also has ``tokens`` property, which allows to recover the + tokenized form. + + Example - Character-based + >>> text = [ + >>> "Hello World!", + >>> "Text-to-speech!", + >>> ] + >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH + >>> processor = bundle.get_text_processor() + >>> input, lengths = processor(text) + >>> + >>> print(input) + tensor([[19, 16, 23, 23, 26, 11, 34, 26, 29, 23, 15, 2, 0, 0, 0], + [31, 16, 35, 31, 1, 31, 26, 1, 30, 27, 16, 16, 14, 19, 2]], + dtype=torch.int32) + >>> + >>> print(lengths) + tensor([12, 15], dtype=torch.int32) + >>> + >>> print([processor.tokens[i] for i in input[0, :lengths[0]]]) + ['h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd', '!'] + >>> print([processor.tokens[i] for i in input[1, :lengths[1]]]) + ['t', 'e', 'x', 't', '-', 't', 'o', '-', 's', 'p', 'e', 'e', 'c', 'h', '!'] + + Example - Phoneme-based + >>> text = [ + >>> "Hello, T T S !", + >>> "Text-to-speech!", + >>> ] + >>> bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH + >>> processor = bundle.get_text_processor() + Downloading: + 100%|███████████████████████████████| 63.6M/63.6M [00:04<00:00, 15.3MB/s] + >>> input, lengths = processor(text) + >>> + >>> print(input) + tensor([[54, 20, 65, 69, 11, 92, 44, 65, 38, 2, 0, 0, 0, 0], + [81, 40, 64, 79, 81, 1, 81, 20, 1, 79, 77, 59, 37, 2]], + dtype=torch.int32) + >>> + >>> print(lengths) + tensor([10, 14], dtype=torch.int32) + >>> + >>> print([processor.tokens[i] for i in input[0]]) + ['HH', 'AH', 'L', 'OW', ' ', 'W', 'ER', 'L', 'D', '!', '_', '_', '_', '_'] + >>> print([processor.tokens[i] for i in input[1]]) + ['T', 'EH', 'K', 'S', 'T', '-', 'T', 'AH', '-', 'S', 'P', 'IY', 'CH', '!'] + """ + + @abstractmethod + def get_vocoder(self, *, dl_kwargs=None) -> Vocoder: + """Create a vocoder module, based off of either WaveRNN or GriffinLim. + + If a pre-trained weight file is necessary, + :func:`torch.hub.load_state_dict_from_url` is used to downloaded it. + + Args: + dl_kwargs (dictionary of keyword arguments): + Passed to :func:`torch.hub.load_state_dict_from_url`. + + Returns: + Vocoder: + A vocoder module, which takes spectrogram Tensor and an optional + length Tensor, then returns resulting waveform Tensor and an optional + length Tensor. + """ + + @abstractmethod + def get_tacotron2(self, *, dl_kwargs=None) -> Tacotron2: + """Create a Tacotron2 model with pre-trained weight. + + Args: + dl_kwargs (dictionary of keyword arguments): + Passed to :func:`torch.hub.load_state_dict_from_url`. + + Returns: + Tacotron2: + The resulting model. 
+ """ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/utils.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ef3ecb31335ae0cf9950d040fa11c21fb3403b25 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_tts/utils.py @@ -0,0 +1,228 @@ +import logging +import os + +import torch +from torchaudio._internal import download_url_to_file, module_utils as _mod_utils + + +def _get_chars(): + return ( + "_", + "-", + "!", + "'", + "(", + ")", + ",", + ".", + ":", + ";", + "?", + " ", + "a", + "b", + "c", + "d", + "e", + "f", + "g", + "h", + "i", + "j", + "k", + "l", + "m", + "n", + "o", + "p", + "q", + "r", + "s", + "t", + "u", + "v", + "w", + "x", + "y", + "z", + ) + + +def _get_phones(): + return ( + "_", + "-", + "!", + "'", + "(", + ")", + ",", + ".", + ":", + ";", + "?", + " ", + "AA", + "AA0", + "AA1", + "AA2", + "AE", + "AE0", + "AE1", + "AE2", + "AH", + "AH0", + "AH1", + "AH2", + "AO", + "AO0", + "AO1", + "AO2", + "AW", + "AW0", + "AW1", + "AW2", + "AY", + "AY0", + "AY1", + "AY2", + "B", + "CH", + "D", + "DH", + "EH", + "EH0", + "EH1", + "EH2", + "ER", + "ER0", + "ER1", + "ER2", + "EY", + "EY0", + "EY1", + "EY2", + "F", + "G", + "HH", + "IH", + "IH0", + "IH1", + "IH2", + "IY", + "IY0", + "IY1", + "IY2", + "JH", + "K", + "L", + "M", + "N", + "NG", + "OW", + "OW0", + "OW1", + "OW2", + "OY", + "OY0", + "OY1", + "OY2", + "P", + "R", + "S", + "SH", + "T", + "TH", + "UH", + "UH0", + "UH1", + "UH2", + "UW", + "UW0", + "UW1", + "UW2", + "V", + "W", + "Y", + "Z", + "ZH", + ) + + +def _to_tensor(indices): + lengths = torch.tensor([len(i) for i in indices], dtype=torch.int32) + values = [torch.tensor(i) for i in indices] + values = torch.nn.utils.rnn.pad_sequence(values, batch_first=True) + return values, lengths + + +def _load_phonemizer(file, dl_kwargs): + if not _mod_utils.is_module_available("dp"): + raise RuntimeError("DeepPhonemizer is not installed. Please install it.") + + from dp.phonemizer import Phonemizer + + # By default, dp issues DEBUG level log. 
+ logger = logging.getLogger("dp") + orig_level = logger.level + logger.setLevel(logging.INFO) + try: + url = f"https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/{file}" + directory = os.path.join(torch.hub.get_dir(), "checkpoints") + os.makedirs(directory, exist_ok=True) + path = os.path.join(directory, file) + if not os.path.exists(path): + dl_kwargs = {} if dl_kwargs is None else dl_kwargs + download_url_to_file(url, path, **dl_kwargs) + return Phonemizer.from_checkpoint(path) + finally: + logger.setLevel(orig_level) + + +def _unnormalize_waveform(waveform: torch.Tensor, bits: int) -> torch.Tensor: + r"""Transform waveform [-1, 1] to label [0, 2 ** bits - 1]""" + waveform = torch.clamp(waveform, -1, 1) + waveform = (waveform + 1.0) * (2**bits - 1) / 2 + return torch.clamp(waveform, 0, 2**bits - 1).int() + + +def _get_taco_params(n_symbols): + return { + "mask_padding": False, + "n_mels": 80, + "n_frames_per_step": 1, + "symbol_embedding_dim": 512, + "encoder_embedding_dim": 512, + "encoder_n_convolution": 3, + "encoder_kernel_size": 5, + "decoder_rnn_dim": 1024, + "decoder_max_step": 2000, + "decoder_dropout": 0.1, + "decoder_early_stopping": True, + "attention_rnn_dim": 1024, + "attention_hidden_dim": 128, + "attention_location_n_filter": 32, + "attention_location_kernel_size": 31, + "attention_dropout": 0.1, + "prenet_dim": 256, + "postnet_n_convolution": 5, + "postnet_kernel_size": 5, + "postnet_embedding_dim": 512, + "gate_threshold": 0.5, + "n_symbol": n_symbols, + } + + +def _get_wrnn_params(): + return { + "upsample_scales": [5, 5, 11], + "n_classes": 2**8, # n_bits = 8 + "hop_length": 275, + "n_res_block": 10, + "n_rnn": 512, + "n_fc": 512, + "kernel_size": 5, + "n_freq": 80, + "n_hidden": 128, + "n_output": 128, + } diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__init__.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebe5b3f5477de1c5296ddc954e06316304dbb894 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8e9505e42214ad07b12549538a651119a1c252a9 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/aligner.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0242ac5dba9a1905c6467198fbed62b0e271ac7e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/impl.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8535b0ede89659dbf991dfd970e219864f2ba806 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/__pycache__/utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/aligner.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/aligner.py new file mode 100644 index 0000000000000000000000000000000000000000..3655d5bae88181796d6d889013b4438d0ea014b3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/aligner.py @@ -0,0 +1,87 @@ +from abc import ABC, abstractmethod +from typing import Dict, List + +import torch +import torchaudio.functional as F +from torch import Tensor +from torchaudio.functional import TokenSpan + + +class ITokenizer(ABC): + @abstractmethod + def __call__(self, transcript: List[str]) -> List[List[str]]: + """Tokenize the given transcript (list of word) + + .. note:: + + The toranscript must be normalized. + + Args: + transcript (list of str): Transcript (list of word). + + Returns: + (list of int): List of token sequences + """ + + +class Tokenizer(ITokenizer): + def __init__(self, dictionary: Dict[str, int]): + self.dictionary = dictionary + + def __call__(self, transcript: List[str]) -> List[List[int]]: + return [[self.dictionary[c] for c in word] for word in transcript] + + +def _align_emission_and_tokens(emission: Tensor, tokens: List[int], blank: int = 0): + device = emission.device + emission = emission.unsqueeze(0) + targets = torch.tensor([tokens], dtype=torch.int32, device=device) + + aligned_tokens, scores = F.forced_align(emission, targets, blank=blank) + + scores = scores.exp() # convert back to probability + aligned_tokens, scores = aligned_tokens[0], scores[0] # remove batch dimension + return aligned_tokens, scores + + +class IAligner(ABC): + @abstractmethod + def __call__(self, emission: Tensor, tokens: List[List[int]]) -> List[List[TokenSpan]]: + """Generate list of time-stamped token sequences + + Args: + emission (Tensor): Sequence of token probability distributions in log-domain. + Shape: `(time, tokens)`. + tokens (list of integer sequence): Tokenized transcript. + Output from :py:class:`torchaudio.pipelines.Wav2Vec2FABundle.Tokenizer`. + + Returns: + (list of TokenSpan sequence): Tokens with time stamps and scores. + """ + + +def _unflatten(list_, lengths): + assert len(list_) == sum(lengths) + i = 0 + ret = [] + for l in lengths: + ret.append(list_[i : i + l]) + i += l + return ret + + +def _flatten(nested_list): + return [item for list_ in nested_list for item in list_] + + +class Aligner(IAligner): + def __init__(self, blank): + self.blank = blank + + def __call__(self, emission: Tensor, tokens: List[List[int]]) -> List[List[TokenSpan]]: + if emission.ndim != 2: + raise ValueError(f"The input emission must be 2D. 
Found: {emission.shape}") + + aligned_tokens, scores = _align_emission_and_tokens(emission, _flatten(tokens), self.blank) + spans = F.merge_tokens(aligned_tokens, scores) + return _unflatten(spans, [len(ts) for ts in tokens]) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/impl.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/impl.py new file mode 100644 index 0000000000000000000000000000000000000000..d60fa8adb94e92e1a479fe94e09f521d0fe50056 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/impl.py @@ -0,0 +1,1699 @@ +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple + +from torch.nn import Module + +from . import aligner, utils + + +__all__ = [] # type: ignore + + +@dataclass +class Wav2Vec2Bundle: + """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model`. + + This class provides interfaces for instantiating the pretrained model along with + the information necessary to retrieve pretrained weights and additional data + to be used with the model. + + Torchaudio library instantiates objects of this class, each of which represents + a different pretrained model. Client code should access pretrained models via these + instances. + + Please see below for the usage and the available values. + + Example - Feature Extraction + >>> import torchaudio + >>> + >>> bundle = torchaudio.pipelines.HUBERT_BASE + >>> + >>> # Build the model and load pretrained weight. + >>> model = bundle.get_model() + Downloading: + 100%|███████████████████████████████| 360M/360M [00:06<00:00, 60.6MB/s] + >>> + >>> # Resample audio to the expected sampling rate + >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate) + >>> + >>> # Extract acoustic features + >>> features, _ = model.extract_features(waveform) + """ # noqa: E501 + + _path: str + _params: Dict[str, Any] + _sample_rate: float + _normalize_waveform: bool + _model_type: str + + @property + def sample_rate(self) -> float: + """Sample rate of the audio that the model is trained on. + + :type: float + """ + return self._sample_rate + + def _get_state_dict(self, dl_kwargs): + # Note: This method is overridden in ASR bundle + return utils._get_state_dict(self._path, dl_kwargs) + + def get_model(self, *, dl_kwargs=None) -> Module: + """Construct the model and load the pretrained weight. + + The weight file is downloaded from the internet and cached with + :func:`torch.hub.load_state_dict_from_url` + + Args: + dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`. + + Returns: + Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`. + + For the models listed below, an additional layer normalization is performed on the input. + + For all other models, a :py:class:`~torchaudio.models.Wav2Vec2Model` instance is returned. 
+ + - WAV2VEC2_LARGE_LV60K + - WAV2VEC2_ASR_LARGE_LV60K_10M + - WAV2VEC2_ASR_LARGE_LV60K_100H + - WAV2VEC2_ASR_LARGE_LV60K_960H + - WAV2VEC2_XLSR53 + - WAV2VEC2_XLSR_300M + - WAV2VEC2_XLSR_1B + - WAV2VEC2_XLSR_2B + - HUBERT_LARGE + - HUBERT_XLARGE + - HUBERT_ASR_LARGE + - HUBERT_ASR_XLARGE + - WAVLM_LARGE + """ + model = utils._get_model(self._model_type, self._params) + state_dict = self._get_state_dict(dl_kwargs) + model.load_state_dict(state_dict) + if self._normalize_waveform: + model = utils._extend_model(model, normalize_waveform=True) + model.eval() + return model + + +@dataclass +class Wav2Vec2ASRBundle(Wav2Vec2Bundle): + """Data class that bundles associated information to use pretrained + :py:class:`~torchaudio.models.Wav2Vec2Model`. + + This class provides interfaces for instantiating the pretrained model along with + the information necessary to retrieve pretrained weights and additional data + to be used with the model. + + Torchaudio library instantiates objects of this class, each of which represents + a different pretrained model. Client code should access pretrained models via these + instances. + + Please see below for the usage and the available values. + + Example - ASR + >>> import torchaudio + >>> + >>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE + >>> + >>> # Build the model and load pretrained weight. + >>> model = bundle.get_model() + Downloading: + 100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s] + >>> + >>> # Check the corresponding labels of the output. + >>> labels = bundle.get_labels() + >>> print(labels) + ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z') + >>> + >>> # Resample audio to the expected sampling rate + >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate) + >>> + >>> # Infer the label probability distribution + >>> emissions, _ = model(waveform) + >>> + >>> # Pass emission to decoder + >>> # `ctc_decode` is for illustration purpose only + >>> transcripts = ctc_decode(emissions, labels) + """ # noqa: E501 + + _labels: Tuple[str, ...] + _remove_aux_axis: Tuple[int, ...] = (1, 2, 3) + + def get_labels( + self, + *, + blank: str = "-", + ) -> Tuple[str, ...]: + """The output class labels. + + The first is blank token, and it is customizable. + + Args: + blank (str, optional): Blank token. (default: ``'-'``) + + Returns: + Tuple[str, ...]: + For models fine-tuned on ASR, returns the tuple of strings representing + the output class labels. 
+ + Example + >>> from torchaudio.pipelines import HUBERT_ASR_LARGE as bundle + >>> bundle.get_labels() + ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z') + """ # noqa: E501 + return (blank, *self._labels) + + def _get_state_dict(self, dl_kwargs): + return utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis) + + +WAV2VEC2_BASE = Wav2Vec2Bundle( + _path="wav2vec2_fairseq_base_ls960.pth", + _params={ + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.05, + "aux_num_out": None, + }, + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +WAV2VEC2_BASE.__doc__ = """Wav2vec 2.0 model ("base" architecture), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned. + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. +""" # noqa: E501 + +WAV2VEC2_ASR_BASE_10M = Wav2Vec2ASRBundle( + _path="wav2vec2_fairseq_base_ls960_asr_ll10m.pth", + _params={ + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.05, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_BASE_10M.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and +fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset +:cite:`librilight` ("train-10min" subset). + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
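+
+A minimal greedy CTC decoding sketch, continuing the usage example shown in
+:py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` (illustrative only; ``emissions`` and
+``bundle`` come from that example, and a proper beam-search decoder will generally
+produce better transcripts):
+
+    >>> import torch
+    >>> def greedy_ctc_decode(emission, labels, blank=0):
+    >>>     indices = torch.argmax(emission, dim=-1)     # best class per frame, (time,)
+    >>>     indices = torch.unique_consecutive(indices)  # collapse repeated frames
+    >>>     chars = [labels[i] for i in indices if i != blank]
+    >>>     return "".join(chars).replace("|", " ")      # '|' marks word boundaries
+    >>> transcript = greedy_ctc_decode(emissions[0], bundle.get_labels())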
+""" # noqa: E501 + +WAV2VEC2_ASR_BASE_100H = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_base_ls960_asr_ls100.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.05, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) + +WAV2VEC2_ASR_BASE_100H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and +fine-tuned for ASR on 100 hours of transcribed audio from "train-clean-100" subset. + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. +""" # noqa: E501 + +WAV2VEC2_ASR_BASE_960H = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_base_ls960_asr_ls960.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.05, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_BASE_960H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and +fine-tuned for ASR on the same audio with the corresponding transcripts. + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + +WAV2VEC2_LARGE = Wav2Vec2Bundle( + "wav2vec2_fairseq_large_ls960.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.2, + "aux_num_out": None, + }, + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +WAV2VEC2_LARGE.__doc__ = """Wav2vec 2.0 model ("large" architecture), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned. + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. +""" # noqa: E501 + +WAV2VEC2_ASR_LARGE_10M = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_large_ls960_asr_ll10m.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.2, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_LARGE_10M.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and +fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset +:cite:`librilight` ("train-10min" subset). + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + +WAV2VEC2_ASR_LARGE_100H = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_large_ls960_asr_ls100.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.2, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_LARGE_100H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and +fine-tuned for ASR on 100 hours of transcribed audio from +the same dataset ("train-clean-100" subset). + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. +""" # noqa: E501 + +WAV2VEC2_ASR_LARGE_960H = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_large_ls960_asr_ls960.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.2, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_LARGE_960H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and +fine-tuned for ASR on the same audio with the corresponding transcripts. + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + +WAV2VEC2_LARGE_LV60K = Wav2Vec2Bundle( + "wav2vec2_fairseq_large_lv60k.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": None, + }, + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +WAV2VEC2_LARGE_LV60K.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture), +pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, +not fine-tuned. + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. +""" # noqa: E501 + +WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_large_lv60k_asr_ll10m.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module), +pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and +fine-tuned for ASR on 10 minutes of transcribed audio from the same dataset ("train-10min" subset). + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + +WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_large_lv60k_asr_ls100.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module), +pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and +fine-tuned for ASR on 100 hours of transcribed audio from +*LibriSpeech* dataset :cite:`7178964` ("train-clean-100" subset). + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. +""" # noqa: E501 + +WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2ASRBundle( + "wav2vec2_fairseq_large_lv60k_asr_ls960.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module), +pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* :cite:`librilight` dataset, and +fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"). + +Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + +WAV2VEC2_XLSR53 = Wav2Vec2Bundle( + "wav2vec2_fairseq_large_xlsr53.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": None, + }, + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +WAV2VEC2_XLSR53.__doc__ = """Wav2vec 2.0 model ("base" architecture), +pre-trained on 56,000 hours of unlabeled audio from multiple datasets ( +*Multilingual LibriSpeech* :cite:`Pratap_2020`, +*CommonVoice* :cite:`ardila2020common` and +*BABEL* :cite:`Gales2014SpeechRA`), +not fine-tuned. + +Originally published by the authors of +*Unsupervised Cross-lingual Representation Learning for Speech Recognition* +:cite:`conneau2020unsupervised` under MIT License and redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. +""" # noqa: E501 + +HUBERT_BASE = Wav2Vec2Bundle( + "hubert_fairseq_base_ls960.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.05, + "aux_num_out": None, + }, + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +HUBERT_BASE.__doc__ = """HuBERT model ("base" architecture), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned. + +Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. 
+""" # noqa: E501 + +HUBERT_LARGE = Wav2Vec2Bundle( + "hubert_fairseq_large_ll60k.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": None, + }, + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +HUBERT_LARGE.__doc__ = """HuBERT model ("large" architecture), +pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, +not fine-tuned. + +Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. +""" # noqa: E501 + +HUBERT_XLARGE = Wav2Vec2Bundle( + "hubert_fairseq_xlarge_ll60k.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1280, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 48, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 5120, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": None, + }, + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +HUBERT_XLARGE.__doc__ = """HuBERT model ("extra large" architecture), +pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, +not fine-tuned. + +Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. 
+""" # noqa: E501 + +HUBERT_ASR_LARGE = Wav2Vec2ASRBundle( + "hubert_fairseq_large_ll60k_asr_ls960.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.1, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +HUBERT_ASR_LARGE.__doc__ = """HuBERT model ("large" architecture), +pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and +fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"). + +Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. +""" # noqa: E501 + +HUBERT_ASR_XLARGE = Wav2Vec2ASRBundle( + "hubert_fairseq_xlarge_ll60k_asr_ls960.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1280, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 48, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 5120, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.1, + "aux_num_out": 29, + }, + _labels=utils._get_en_labels(), + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +HUBERT_ASR_XLARGE.__doc__ = """HuBERT model ("extra large" architecture), +pre-trained on 60,000 hours of unlabeled audio from +*Libri-Light* dataset :cite:`librilight`, and +fine-tuned for ASR on 960 hours of transcribed audio from +*LibriSpeech* dataset :cite:`7178964` +(the combination of "train-clean-100", "train-clean-360", and "train-other-500"). + +Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + + +VOXPOPULI_ASR_BASE_10K_DE = Wav2Vec2ASRBundle( + "wav2vec2_voxpopuli_base_10k_asr_de.pt", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.1, + "aux_num_out": 32, + }, + _labels=utils._get_de_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _remove_aux_axis=(1, 2, 3, 35), + _model_type="Wav2Vec2", +) +VOXPOPULI_ASR_BASE_10K_DE.__doc__ = """wav2vec 2.0 model ("base" architecture), +pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli` +("10k" subset, consisting of 23 languages), and +fine-tuned for ASR on 282 hours of transcribed audio from "de" subset. + +Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. +""" # noqa: E501 + + +VOXPOPULI_ASR_BASE_10K_EN = Wav2Vec2ASRBundle( + "wav2vec2_voxpopuli_base_10k_asr_en.pt", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.1, + "aux_num_out": 28, + }, + _labels=utils._get_vp_en_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _remove_aux_axis=(1, 2, 3, 31), + _model_type="Wav2Vec2", +) +VOXPOPULI_ASR_BASE_10K_EN.__doc__ = """wav2vec 2.0 model ("base" architecture), +pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli` +("10k" subset, consisting of 23 languages), and +fine-tuned for ASR on 543 hours of transcribed audio from "en" subset. + +Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + + +VOXPOPULI_ASR_BASE_10K_ES = Wav2Vec2ASRBundle( + "wav2vec2_voxpopuli_base_10k_asr_es.pt", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.1, + "aux_num_out": 35, + }, + _labels=utils._get_es_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _remove_aux_axis=(1, 2, 3, 35), + _model_type="Wav2Vec2", +) +VOXPOPULI_ASR_BASE_10K_ES.__doc__ = """wav2vec 2.0 model ("base" architecture), +pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli` +("10k" subset, consisting of 23 languages), and +fine-tuned for ASR on 166 hours of transcribed audio from "es" subset. + +Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. +""" # noqa: E501 + +VOXPOPULI_ASR_BASE_10K_FR = Wav2Vec2ASRBundle( + "wav2vec2_voxpopuli_base_10k_asr_fr.pt", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.1, + "aux_num_out": 43, + }, + _labels=utils._get_fr_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _model_type="Wav2Vec2", +) +VOXPOPULI_ASR_BASE_10K_FR.__doc__ = """wav2vec 2.0 model ("base" architecture), +pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli` +("10k" subset, consisting of 23 languages), and +fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset. + +Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. 
+""" # noqa: E501 + + +VOXPOPULI_ASR_BASE_10K_IT = Wav2Vec2ASRBundle( + "wav2vec2_voxpopuli_base_10k_asr_it.pt", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.1, + "aux_num_out": 37, + }, + _labels=utils._get_it_labels(), + _sample_rate=16000, + _normalize_waveform=False, + _remove_aux_axis=(1, 2, 3), + _model_type="Wav2Vec2", +) +VOXPOPULI_ASR_BASE_10K_IT.__doc__ = """wav2vec 2.0 model ("base" architecture), +pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli` +("10k" subset, consisting of 23 languages), and +fine-tuned for ASR on 91 hours of transcribed audio from "it" subset. + +Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage. +""" # noqa: E501 + + +WAVLM_BASE = Wav2Vec2Bundle( + "wavlm_base.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_max_distance": 800, + "encoder_num_buckets": 320, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.05, + "aux_num_out": None, + }, + _model_type="WavLM", + _sample_rate=16000, + _normalize_waveform=False, +) +WAVLM_BASE.__doc__ = """WavLM Base model ("base" architecture), +pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`, not fine-tuned. + +Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. 
+""" # noqa: E501 + + +WAVLM_BASE_PLUS = Wav2Vec2Bundle( + "wavlm_base_plus.pth", + { + "extractor_mode": "group_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 768, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 12, + "encoder_num_heads": 12, + "encoder_max_distance": 800, + "encoder_num_buckets": 320, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 3072, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": False, + "encoder_layer_drop": 0.05, + "aux_num_out": None, + }, + _model_type="WavLM", + _sample_rate=16000, + _normalize_waveform=False, +) +WAVLM_BASE_PLUS.__doc__ = """WavLM Base+ model ("base" architecture), +pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`, +and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned. + +Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. +""" # noqa: E501 + + +WAVLM_LARGE = Wav2Vec2Bundle( + "wavlm_large.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": False, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_max_distance": 800, + "encoder_num_buckets": 320, + "encoder_attention_dropout": 0.1, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.1, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.05, + "aux_num_out": None, + }, + _model_type="WavLM", + _sample_rate=16000, + _normalize_waveform=True, +) +WAVLM_LARGE.__doc__ = """WavLM Large model ("large" architecture), +pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`, +and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned. + +Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage. 
+""" # noqa: E501 + + +WAV2VEC2_XLSR_300M = Wav2Vec2Bundle( + "wav2vec2_xlsr_300m.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": None, + }, + _model_type="Wav2Vec2", + _sample_rate=16000, + _normalize_waveform=True, +) +WAV2VEC2_XLSR_300M.__doc__ = """XLS-R model with 300 million parameters, +pre-trained on 436,000 hours of unlabeled audio from multiple datasets ( +*Multilingual LibriSpeech* :cite:`Pratap_2020`, +*CommonVoice* :cite:`ardila2020common`, +*VoxLingua107* :cite:`valk2021voxlingua107`, +*BABEL* :cite:`Gales2014SpeechRA`, and +*VoxPopuli* :cite:`voxpopuli`) in 128 languages, +not fine-tuned. + +Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details. +""" # noqa: E501 + + +WAV2VEC2_XLSR_1B = Wav2Vec2Bundle( + "wav2vec2_xlsr_1b.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1280, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 48, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 5120, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": None, + }, + _model_type="Wav2Vec2", + _sample_rate=16000, + _normalize_waveform=True, +) +WAV2VEC2_XLSR_1B.__doc__ = """XLS-R model with 1 billion parameters, +pre-trained on 436,000 hours of unlabeled audio from multiple datasets ( +*Multilingual LibriSpeech* :cite:`Pratap_2020`, +*CommonVoice* :cite:`ardila2020common`, +*VoxLingua107* :cite:`valk2021voxlingua107`, +*BABEL* :cite:`Gales2014SpeechRA`, and +*VoxPopuli* :cite:`voxpopuli`) in 128 languages, +not fine-tuned. + +Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details. 
+""" # noqa: E501 + +WAV2VEC2_XLSR_2B = Wav2Vec2Bundle( + "wav2vec2_xlsr_2b.pth", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1920, + "encoder_projection_dropout": 0.1, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 48, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 7680, + "encoder_ff_interm_dropout": 0.0, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.0, + "aux_num_out": None, + }, + _model_type="Wav2Vec2", + _sample_rate=16000, + _normalize_waveform=True, +) +WAV2VEC2_XLSR_2B.__doc__ = """XLS-R model with 2 billion parameters, +pre-trained on 436,000 hours of unlabeled audio from multiple datasets ( +*Multilingual LibriSpeech* :cite:`Pratap_2020`, +*CommonVoice* :cite:`ardila2020common`, +*VoxLingua107* :cite:`valk2021voxlingua107`, +*BABEL* :cite:`Gales2014SpeechRA`, and +*VoxPopuli* :cite:`voxpopuli`) in 128 languages, +not fine-tuned. + +Originally published by the authors of *XLS-R* :cite:`babu2021xls` under MIT License and +redistributed with the same license. +[`License `__, +`Source `__] + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for usage details. +""" # noqa: E501 + + +@dataclass +class Wav2Vec2FABundle(Wav2Vec2ASRBundle): + """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model` for forced alignment. + + This class provides interfaces for instantiating the pretrained model along with + the information necessary to retrieve pretrained weights and additional data + to be used with the model. + + Torchaudio library instantiates objects of this class, each of which represents + a different pretrained model. Client code should access pretrained models via these + instances. + + Please see below for the usage and the available values. + + Example - Feature Extraction + >>> import torchaudio + >>> + >>> bundle = torchaudio.pipelines.MMS_FA + >>> + >>> # Build the model and load pretrained weight. + >>> model = bundle.get_model() + Downloading: + 100%|███████████████████████████████| 1.18G/1.18G [00:05<00:00, 216MB/s] + >>> + >>> # Resample audio to the expected sampling rate + >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate) + >>> + >>> # Estimate the probability of token distribution + >>> emission, _ = model(waveform) + >>> + >>> # Generate frame-wise alignment + >>> alignment, scores = torchaudio.functional.forced_align( + >>> emission, targets, input_lengths, target_lengths, blank=0) + >>> + """ # noqa: E501 + + class Tokenizer(aligner.ITokenizer): + """Interface of the tokenizer""" + + class Aligner(aligner.IAligner): + """Interface of the aligner""" + + def get_labels(self, star: Optional[str] = "*", blank: str = "-") -> Tuple[str, ...]: + """Get the labels corresponding to the feature dimension of emission. + + The first is blank token, and it is customizable. + + Args: + star (str or None, optional): Change or disable star token. (default: ``"*"``) + blank (str, optional): Change the blank token. (default: ``'-'``) + + Returns: + Tuple[str, ...]: + For models fine-tuned on ASR, returns the tuple of strings representing + the output class labels. 
+ + Example + >>> from torchaudio.pipelines import MMS_FA as bundle + >>> bundle.get_labels() + ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x', '*') + >>> bundle.get_labels(star=None) + ('-', 'a', 'i', 'e', 'n', 'o', 'u', 't', 's', 'r', 'm', 'k', 'l', 'd', 'g', 'h', 'y', 'b', 'p', 'w', 'c', 'v', 'j', 'z', 'f', "'", 'q', 'x') + """ # noqa: E501 + labels = super().get_labels(blank=blank) + return labels if star is None else (*labels, star) + + def get_model(self, with_star: bool = True, *, dl_kwargs=None) -> Module: + """Construct the model and load the pretrained weight. + + The weight file is downloaded from the internet and cached with + :func:`torch.hub.load_state_dict_from_url` + + Args: + with_star (bool, optional): If enabled, the last dimension of the output layer is + extended by one, which corresponds to the `star` token. + dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`. + + Returns: + Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`. + + .. note:: + + The model created with this method returns probabilities in the log domain + (i.e. :py:func:`torch.nn.functional.log_softmax` is applied), whereas + the other Wav2Vec2 models return logits. + """ + model = utils._get_model(self._model_type, self._params) + state_dict = utils._get_state_dict(self._path, dl_kwargs, self._remove_aux_axis) + model.load_state_dict(state_dict) + model = utils._extend_model( + model, normalize_waveform=self._normalize_waveform, apply_log_softmax=True, append_star=with_star + ) + model.eval() + return model + + def get_dict(self, star: Optional[str] = "*", blank: str = "-") -> Dict[str, int]: + """Get the mapping from token to index (in emission feature dim) + + Args: + star (str or None, optional): Change or disable star token. (default: ``"*"``) + blank (str, optional): Change the blank token. (default: ``'-'``) + + Returns: + Dict[str, int]: + A mapping from each token to its index in the emission feature dimension. + + Example + >>> from torchaudio.pipelines import MMS_FA as bundle + >>> bundle.get_dict() + {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27, '*': 28} + >>> bundle.get_dict(star=None) + {'-': 0, 'a': 1, 'i': 2, 'e': 3, 'n': 4, 'o': 5, 'u': 6, 't': 7, 's': 8, 'r': 9, 'm': 10, 'k': 11, 'l': 12, 'd': 13, 'g': 14, 'h': 15, 'y': 16, 'b': 17, 'p': 18, 'w': 19, 'c': 20, 'v': 21, 'j': 22, 'z': 23, 'f': 24, "'": 25, 'q': 26, 'x': 27} + """ # noqa: E501 + return {k: i for i, k in enumerate(self.get_labels(star=star, blank=blank))} + + def get_tokenizer(self) -> Tokenizer: + """Instantiate a Tokenizer. + + Returns: + Tokenizer + """ + return aligner.Tokenizer(self.get_dict()) + + def get_aligner(self) -> Aligner: + """Instantiate an Aligner.
+ + Returns: + Aligner + """ + return aligner.Aligner(blank=0) + + +MMS_FA = Wav2Vec2FABundle( + "https://dl.fbaipublicfiles.com/mms/torchaudio/ctc_alignment_mling_uroman/model.pt", + { + "extractor_mode": "layer_norm", + "extractor_conv_layer_config": [ + (512, 10, 5), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 3, 2), + (512, 2, 2), + (512, 2, 2), + ], + "extractor_conv_bias": True, + "encoder_embed_dim": 1024, + "encoder_projection_dropout": 0.0, + "encoder_pos_conv_kernel": 128, + "encoder_pos_conv_groups": 16, + "encoder_num_layers": 24, + "encoder_num_heads": 16, + "encoder_attention_dropout": 0.0, + "encoder_ff_interm_features": 4096, + "encoder_ff_interm_dropout": 0.1, + "encoder_dropout": 0.0, + "encoder_layer_norm_first": True, + "encoder_layer_drop": 0.1, + "aux_num_out": 28, + }, + _labels=utils._get_mms_labels(), + _sample_rate=16000, + _normalize_waveform=True, + _model_type="Wav2Vec2", +) +MMS_FA.__doc__ = """ +Trained on 31K hours of data in 1,130 languages from *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling`. + +Published by the authors of *Scaling Speech Technology to 1,000+ Languages* :cite:`pratap2023scaling` under [`CC-BY-NC 4.0 License `__]. + +Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2FABundle` for usage details. + +.. note:: + + Unlike other Wav2Vec2 bundles, this model does not have a token for word boundary (like `|`). This makes the post-processing of alignments slightly different. +""" # noqa: E501 diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/utils.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e690e8103c7a47a01d719e746e6c98a9c7f6c8db --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/_wav2vec2/utils.py @@ -0,0 +1,346 @@ +from typing import List, Optional, Tuple + +import torch +from torch import nn, Tensor + +from torchaudio._internal import load_state_dict_from_url +from torchaudio.models import wav2vec2_model, Wav2Vec2Model, wavlm_model + + +def _get_model(type_, params): + factories = { + "Wav2Vec2": wav2vec2_model, + "WavLM": wavlm_model, + } + if type_ not in factories: + raise ValueError(f"Supported model types are {tuple(factories.keys())}. Found: {type_}") + factory = factories[type_] + return factory(**params) + + +class _Wav2Vec2Model(nn.Module): + """Wrapper class for :py:class:`~torchaudio.models.Wav2Vec2Model`. 
+ + This is used for layer normalization at the input, and optionally for applying log-softmax and appending a star dimension to the output + """ + + def __init__(self, model: Wav2Vec2Model, normalize_waveform: bool, apply_log_softmax: bool, append_star: bool): + super().__init__() + self.model = model + self.normalize_waveform = normalize_waveform + self.apply_log_softmax = apply_log_softmax + self.append_star = append_star + + def forward(self, waveforms: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]: + if self.normalize_waveform: + waveforms = nn.functional.layer_norm(waveforms, waveforms.shape) + output, output_lengths = self.model(waveforms, lengths) + if self.apply_log_softmax: + output = torch.nn.functional.log_softmax(output, dim=-1) + if self.append_star: + star_dim = torch.zeros((1, output.size(1), 1), dtype=output.dtype, device=output.device) + output = torch.cat((output, star_dim), dim=-1) + return output, output_lengths + + @torch.jit.export + def extract_features( + self, + waveforms: Tensor, + lengths: Optional[Tensor] = None, + num_layers: Optional[int] = None, + ) -> Tuple[List[Tensor], Optional[Tensor]]: + if self.normalize_waveform: + waveforms = nn.functional.layer_norm(waveforms, waveforms.shape) + return self.model.extract_features(waveforms, lengths, num_layers) + + +def _extend_model(module, normalize_waveform, apply_log_softmax=False, append_star=False): + """Add extra transformations to the model""" + return _Wav2Vec2Model(module, normalize_waveform, apply_log_softmax, append_star) + + +def _remove_aux_axes(state_dict, axes): + # Remove the seemingly unnecessary axes + # For the ASR task, the pretrained weights originating from fairseq have unrelated dimensions at indices 1, 2, 3. + # They originate from the Dictionary implementation of fairseq, which was intended for NLP tasks, + # but not used during the ASR training. + # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/data/dictionary.py#L21-L37 + # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/criterions/ctc.py#L126-L129 + # + # Also, some pretrained weights originating from VoxPopuli have extra dimensions that are almost never used and + # appear to be a mistake.
+ # The label `1` shows up in the training dataset of German (1 out of 16M), + # English (1 / 28M), Spanish (1 / 9.4M), Romanian (1 / 4.7M) and Polish (6 / 5.8M) + for key in ["aux.weight", "aux.bias"]: + mat = state_dict[key] + state_dict[key] = torch.stack([mat[i] for i in range(mat.size(0)) if i not in axes]) + + +def _get_state_dict(url, dl_kwargs, remove_axes=None): + if not url.startswith("https"): + url = f"https://download.pytorch.org/torchaudio/models/{url}" + dl_kwargs = {} if dl_kwargs is None else dl_kwargs + state_dict = load_state_dict_from_url(url, **dl_kwargs) + if remove_axes: + _remove_aux_axes(state_dict, remove_axes) + return state_dict + + +def _get_en_labels(): + return ( + "|", + "E", + "T", + "A", + "O", + "N", + "I", + "H", + "S", + "R", + "D", + "L", + "U", + "M", + "W", + "C", + "F", + "G", + "Y", + "P", + "B", + "V", + "K", + "'", + "X", + "J", + "Q", + "Z", + ) + + +def _get_de_labels(): + return ( + "|", + "e", + "n", + "i", + "r", + "s", + "t", + "a", + "d", + "h", + "u", + "l", + "g", + "c", + "m", + "o", + "b", + "w", + "f", + "k", + "z", + "p", + "v", + "ü", + "ä", + "ö", + "j", + "ß", + "y", + "x", + "q", + ) + + +def _get_vp_en_labels(): + return ( + "|", + "e", + "t", + "o", + "i", + "a", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "u", + "m", + "p", + "f", + "g", + "w", + "y", + "b", + "v", + "k", + "x", + "j", + "q", + "z", + ) + + +def _get_es_labels(): + return ( + "|", + "e", + "a", + "o", + "s", + "n", + "r", + "i", + "l", + "d", + "c", + "t", + "u", + "p", + "m", + "b", + "q", + "y", + "g", + "v", + "h", + "ó", + "f", + "í", + "á", + "j", + "z", + "ñ", + "é", + "x", + "ú", + "k", + "w", + "ü", + ) + + +def _get_fr_labels(): + return ( + "|", + "e", + "s", + "n", + "i", + "t", + "r", + "a", + "o", + "u", + "l", + "d", + "c", + "p", + "m", + "é", + "v", + "q", + "f", + "g", + "b", + "h", + "x", + "à", + "j", + "è", + "y", + "ê", + "z", + "ô", + "k", + "ç", + "œ", + "û", + "ù", + "î", + "â", + "w", + "ï", + "ë", + "ü", + "æ", + ) + + +def _get_it_labels(): + return ( + "|", + "e", + "i", + "a", + "o", + "n", + "t", + "r", + "l", + "s", + "c", + "d", + "u", + "p", + "m", + "g", + "v", + "h", + "z", + "f", + "b", + "q", + "à", + "è", + "ù", + "é", + "ò", + "ì", + "k", + "y", + "x", + "w", + "j", + "ó", + "í", + "ï", + ) + + +def _get_mms_labels(): + return ( + "a", + "i", + "e", + "n", + "o", + "u", + "t", + "s", + "r", + "m", + "k", + "l", + "d", + "g", + "h", + "y", + "b", + "p", + "w", + "c", + "v", + "j", + "z", + "f", + "'", + "q", + "x", + ) diff --git a/.venv/lib/python3.11/site-packages/torchaudio/pipelines/rnnt_pipeline.py b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/rnnt_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..11b5a479f3785241a00313a85ead1405b2f673cd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/pipelines/rnnt_pipeline.py @@ -0,0 +1,380 @@ +import json +import math +from abc import ABC, abstractmethod +from dataclasses import dataclass +from functools import partial +from typing import Callable, List, Tuple + +import torch +import torchaudio +from torchaudio._internal import module_utils +from torchaudio.models import emformer_rnnt_base, RNNT, RNNTBeamSearch + + +__all__ = [] + +_decibel = 2 * 20 * math.log10(torch.iinfo(torch.int16).max) +_gain = pow(10, 0.05 * _decibel) + + +def _piecewise_linear_log(x): + x[x > math.e] = torch.log(x[x > math.e]) + x[x <= math.e] = x[x <= math.e] / math.e + return x + + +class _FunctionalModule(torch.nn.Module): + def 
__init__(self, functional): + super().__init__() + self.functional = functional + + def forward(self, input): + return self.functional(input) + + +class _GlobalStatsNormalization(torch.nn.Module): + def __init__(self, global_stats_path): + super().__init__() + + with open(global_stats_path) as f: + blob = json.loads(f.read()) + + self.register_buffer("mean", torch.tensor(blob["mean"])) + self.register_buffer("invstddev", torch.tensor(blob["invstddev"])) + + def forward(self, input): + return (input - self.mean) * self.invstddev + + +class _FeatureExtractor(ABC): + @abstractmethod + def __call__(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Generates features and length output from the given input tensor. + + Args: + input (torch.Tensor): input tensor. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor: + Features, with shape `(length, *)`. + torch.Tensor: + Length, with shape `(1,)`. + """ + + +class _TokenProcessor(ABC): + @abstractmethod + def __call__(self, tokens: List[int], **kwargs) -> str: + """Decodes given list of tokens to text sequence. + + Args: + tokens (List[int]): list of tokens to decode. + + Returns: + str: + Decoded text sequence. + """ + + +class _ModuleFeatureExtractor(torch.nn.Module, _FeatureExtractor): + """``torch.nn.Module``-based feature extraction pipeline. + + Args: + pipeline (torch.nn.Module): module that implements feature extraction logic. + """ + + def __init__(self, pipeline: torch.nn.Module) -> None: + super().__init__() + self.pipeline = pipeline + + def forward(self, input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Generates features and length output from the given input tensor. + + Args: + input (torch.Tensor): input tensor. + + Returns: + (torch.Tensor, torch.Tensor): + torch.Tensor: + Features, with shape `(length, *)`. + torch.Tensor: + Length, with shape `(1,)`. + """ + features = self.pipeline(input) + length = torch.tensor([features.shape[0]]) + return features, length + + +class _SentencePieceTokenProcessor(_TokenProcessor): + """SentencePiece-model-based token processor. + + Args: + sp_model_path (str): path to SentencePiece model. + """ + + def __init__(self, sp_model_path: str) -> None: + if not module_utils.is_module_available("sentencepiece"): + raise RuntimeError("SentencePiece is not available. Please install it.") + + import sentencepiece as spm + + self.sp_model = spm.SentencePieceProcessor(model_file=sp_model_path) + self.post_process_remove_list = { + self.sp_model.unk_id(), + self.sp_model.eos_id(), + self.sp_model.pad_id(), + } + + def __call__(self, tokens: List[int], lstrip: bool = True) -> str: + """Decodes given list of tokens to text sequence. + + Args: + tokens (List[int]): list of tokens to decode. + lstrip (bool, optional): if ``True``, returns text sequence with leading whitespace + removed. (Default: ``True``). + + Returns: + str: + Decoded text sequence. + """ + filtered_hypo_tokens = [ + token_index for token_index in tokens[1:] if token_index not in self.post_process_remove_list + ] + output_string = "".join(self.sp_model.id_to_piece(filtered_hypo_tokens)).replace("\u2581", " ") + + if lstrip: + return output_string.lstrip() + else: + return output_string + + +@dataclass +class RNNTBundle: + """Dataclass that bundles components for performing automatic speech recognition (ASR, speech-to-text) + inference with an RNN-T model. 
+ + More specifically, the class provides methods that produce the featurization pipeline, + decoder wrapping the specified RNN-T model, and output token post-processor that together + constitute a complete end-to-end ASR inference pipeline that produces a text sequence + given a raw waveform. + + It can support non-streaming (full-context) inference as well as streaming inference. + + Users should not directly instantiate objects of this class; rather, users should use the + instances (representing pre-trained models) that exist within the module, + e.g. :data:`torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH`. + + Example + >>> import torchaudio + >>> from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH + >>> import torch + >>> + >>> # Non-streaming inference. + >>> # Build feature extractor, decoder with RNN-T model, and token processor. + >>> feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_feature_extractor() + 100%|███████████████████████████████| 3.81k/3.81k [00:00<00:00, 4.22MB/s] + >>> decoder = EMFORMER_RNNT_BASE_LIBRISPEECH.get_decoder() + Downloading: "https://download.pytorch.org/torchaudio/models/emformer_rnnt_base_librispeech.pt" + 100%|███████████████████████████████| 293M/293M [00:07<00:00, 42.1MB/s] + >>> token_processor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_token_processor() + 100%|███████████████████████████████| 295k/295k [00:00<00:00, 25.4MB/s] + >>> + >>> # Instantiate LibriSpeech dataset; retrieve waveform for first sample. + >>> dataset = torchaudio.datasets.LIBRISPEECH("/home/librispeech", url="test-clean") + >>> waveform = next(iter(dataset))[0].squeeze() + >>> + >>> with torch.no_grad(): + >>> # Produce mel-scale spectrogram features. + >>> features, length = feature_extractor(waveform) + >>> + >>> # Generate top-10 hypotheses. + >>> hypotheses = decoder(features, length, 10) + >>> + >>> # For top hypothesis, convert predicted tokens to text. + >>> text = token_processor(hypotheses[0][0]) + >>> print(text) + he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to [...] + >>> + >>> + >>> # Streaming inference. + >>> hop_length = EMFORMER_RNNT_BASE_LIBRISPEECH.hop_length + >>> num_samples_segment = EMFORMER_RNNT_BASE_LIBRISPEECH.segment_length * hop_length + >>> num_samples_segment_right_context = ( + >>> num_samples_segment + EMFORMER_RNNT_BASE_LIBRISPEECH.right_context_length * hop_length + >>> ) + >>> + >>> # Build streaming inference feature extractor. + >>> streaming_feature_extractor = EMFORMER_RNNT_BASE_LIBRISPEECH.get_streaming_feature_extractor() + >>> + >>> # Process same waveform as before, this time sequentially across overlapping segments + >>> # to simulate streaming inference. Note the usage of ``streaming_feature_extractor`` and ``decoder.infer``. + >>> state, hypothesis = None, None + >>> for idx in range(0, len(waveform), num_samples_segment): + >>> segment = waveform[idx: idx + num_samples_segment_right_context] + >>> segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment))) + >>> with torch.no_grad(): + >>> features, length = streaming_feature_extractor(segment) + >>> hypotheses, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis) + >>> hypothesis = hypotheses[0] + >>> transcript = token_processor(hypothesis[0]) + >>> if transcript: + >>> print(transcript, end=" ", flush=True) + he hoped there would be stew for dinner turn ips and car rots and bru 'd oes and fat mut ton pieces to [...] 
+ """ + + class FeatureExtractor(_FeatureExtractor): + """Interface of the feature extraction part of RNN-T pipeline""" + + class TokenProcessor(_TokenProcessor): + """Interface of the token processor part of RNN-T pipeline""" + + _rnnt_path: str + _rnnt_factory_func: Callable[[], RNNT] + _global_stats_path: str + _sp_model_path: str + _right_padding: int + _blank: int + _sample_rate: int + _n_fft: int + _n_mels: int + _hop_length: int + _segment_length: int + _right_context_length: int + + def _get_model(self) -> RNNT: + model = self._rnnt_factory_func() + path = torchaudio.utils.download_asset(self._rnnt_path) + state_dict = torch.load(path) + model.load_state_dict(state_dict) + model.eval() + return model + + @property + def sample_rate(self) -> int: + """Sample rate (in cycles per second) of input waveforms. + + :type: int + """ + return self._sample_rate + + @property + def n_fft(self) -> int: + """Size of FFT window to use. + + :type: int + """ + return self._n_fft + + @property + def n_mels(self) -> int: + """Number of mel spectrogram features to extract from input waveforms. + + :type: int + """ + return self._n_mels + + @property + def hop_length(self) -> int: + """Number of samples between successive frames in input expected by model. + + :type: int + """ + return self._hop_length + + @property + def segment_length(self) -> int: + """Number of frames in segment in input expected by model. + + :type: int + """ + return self._segment_length + + @property + def right_context_length(self) -> int: + """Number of frames in right contextual block in input expected by model. + + :type: int + """ + return self._right_context_length + + def get_decoder(self) -> RNNTBeamSearch: + """Constructs RNN-T decoder. + + Returns: + RNNTBeamSearch + """ + model = self._get_model() + return RNNTBeamSearch(model, self._blank) + + def get_feature_extractor(self) -> FeatureExtractor: + """Constructs feature extractor for non-streaming (full-context) ASR. + + Returns: + FeatureExtractor + """ + local_path = torchaudio.utils.download_asset(self._global_stats_path) + return _ModuleFeatureExtractor( + torch.nn.Sequential( + torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length + ), + _FunctionalModule(lambda x: x.transpose(1, 0)), + _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)), + _GlobalStatsNormalization(local_path), + _FunctionalModule(lambda x: torch.nn.functional.pad(x, (0, 0, 0, self._right_padding))), + ) + ) + + def get_streaming_feature_extractor(self) -> FeatureExtractor: + """Constructs feature extractor for streaming (simultaneous) ASR. + + Returns: + FeatureExtractor + """ + local_path = torchaudio.utils.download_asset(self._global_stats_path) + return _ModuleFeatureExtractor( + torch.nn.Sequential( + torchaudio.transforms.MelSpectrogram( + sample_rate=self.sample_rate, n_fft=self.n_fft, n_mels=self.n_mels, hop_length=self.hop_length + ), + _FunctionalModule(lambda x: x.transpose(1, 0)), + _FunctionalModule(lambda x: _piecewise_linear_log(x * _gain)), + _GlobalStatsNormalization(local_path), + ) + ) + + def get_token_processor(self) -> TokenProcessor: + """Constructs token processor. 
+ + Returns: + TokenProcessor + """ + local_path = torchaudio.utils.download_asset(self._sp_model_path) + return _SentencePieceTokenProcessor(local_path) + + +EMFORMER_RNNT_BASE_LIBRISPEECH = RNNTBundle( + _rnnt_path="models/emformer_rnnt_base_librispeech.pt", + _rnnt_factory_func=partial(emformer_rnnt_base, num_symbols=4097), + _global_stats_path="pipeline-assets/global_stats_rnnt_librispeech.json", + _sp_model_path="pipeline-assets/spm_bpe_4096_librispeech.model", + _right_padding=4, + _blank=4096, + _sample_rate=16000, + _n_fft=400, + _n_mels=80, + _hop_length=160, + _segment_length=16, + _right_context_length=4, +) +EMFORMER_RNNT_BASE_LIBRISPEECH.__doc__ = """ASR pipeline based on Emformer-RNNT, +pretrained on *LibriSpeech* dataset :cite:`7178964`, +capable of performing both streaming and non-streaming inference. + +The underlying model is constructed by :py:func:`torchaudio.models.emformer_rnnt_base` +and utilizes weights trained on LibriSpeech using training script ``train.py`` +`here `__ with default arguments. + +Please refer to :py:class:`RNNTBundle` for usage instructions. +""" diff --git a/.venv/lib/python3.11/site-packages/torchaudio/version.py b/.venv/lib/python3.11/site-packages/torchaudio/version.py new file mode 100644 index 0000000000000000000000000000000000000000..8dd05cf81ab3c1b8e8b20f9b9ccdd335acebf71a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/torchaudio/version.py @@ -0,0 +1,2 @@ +__version__ = '2.5.1+cu124' +git_version = '1661daf10599ca8889f092ec37814fabbe202bb0'
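
Usage sketches for the pipeline bundles defined in the torchaudio files above. These are illustrative, editor-added examples rather than part of the vendored sources. The Wav2Vec2Bundle instances (WAV2VEC2_LARGE, WAV2VEC2_LARGE_LV60K, WAV2VEC2_XLSR53, the HuBERT, WavLM and XLS-R variants) are meant for feature extraction through get_model() and extract_features(); a minimal sketch, assuming torchaudio is installed and that "speech.wav" is a placeholder for the reader's own audio file:

import torch
import torchaudio

bundle = torchaudio.pipelines.HUBERT_BASE

# Download/cache the pretrained weights and build the model in eval mode.
model = bundle.get_model()

# "speech.wav" is a hypothetical input file; resample to the bundle's rate (16 kHz here).
waveform, sample_rate = torchaudio.load("speech.wav")
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

with torch.inference_mode():
    # One tensor per transformer layer, each shaped (batch, num_frames, feature_dim).
    features, _ = model.extract_features(waveform)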
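For the Wav2Vec2ASRBundle instances (the *_ASR_* and VOXPOPULI_ASR_* bundles), the docstrings defer to the Wav2Vec2ASRBundle class for usage. A sketch of transcription with a simple greedy (best-path) CTC decoder follows; the greedy decoder is an editor's illustration under the usual CTC conventions, not the library's official decoder, and "speech.wav" is again a placeholder:

import torch
import torchaudio

bundle = torchaudio.pipelines.WAV2VEC2_ASR_LARGE_960H
model = bundle.get_model()
labels = bundle.get_labels()  # index 0 is the CTC blank token "-"; "|" marks word boundaries

waveform, sample_rate = torchaudio.load("speech.wav")  # hypothetical input file
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

with torch.inference_mode():
    emission, _ = model(waveform)  # (batch, num_frames, num_labels) logits

# Greedy CTC decoding: best label per frame, collapse repeats, drop blanks.
indices = torch.argmax(emission[0], dim=-1)
indices = torch.unique_consecutive(indices)
transcript = "".join(labels[i] for i in indices if i != 0).replace("|", " ")
print(transcript)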
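Wav2Vec2FABundle (instantiated above as MMS_FA) adds get_tokenizer() and get_aligner() on top of get_model(). A sketch of a forced-alignment call, assuming a 16 kHz recording and a transcript already lower-cased/romanized to match the bundle's label set; "speech.wav" and the example sentence are placeholders:

import torch
import torchaudio

bundle = torchaudio.pipelines.MMS_FA
model = bundle.get_model()          # emits log-probabilities, with a star dimension appended
tokenizer = bundle.get_tokenizer()  # maps each word to a sequence of label indices
aligner = bundle.get_aligner()      # CTC forced alignment with blank=0

waveform, sample_rate = torchaudio.load("speech.wav")  # hypothetical input file
waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)

transcript = "i had that curiosity beside me at this moment".split()  # placeholder text

with torch.inference_mode():
    emission, _ = model(waveform)
    token_spans = aligner(emission[0], tokenizer(transcript))

# token_spans has one entry per word; each entry is a list of TokenSpan objects carrying
# the token index, start/end emission frames, and a score. One emission frame corresponds
# to a 320-sample stride of the convolutional feature extractor, i.e. 20 ms at 16 kHz.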
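One detail in rnnt_pipeline.py that is easy to misread is the `_decibel`/`_gain` pair used by the feature extractors: `2 * 20 * log10(32767)` is roughly 180.6, and `10 ** (0.05 * 180.6)` works out to `32767 ** 2`, so the mel spectrogram is effectively rescaled by the squared int16 full-scale value before `_piecewise_linear_log` is applied. A small sketch verifying that arithmetic, using an out-of-place rewrite of the piecewise transform rather than the in-place helper above:

import math
import torch

int16_max = torch.iinfo(torch.int16).max      # 32767
_decibel = 2 * 20 * math.log10(int16_max)     # ~180.6
_gain = pow(10, 0.05 * _decibel)              # equals int16_max ** 2 up to float rounding
print(_gain, int16_max ** 2)

# Out-of-place equivalent of _piecewise_linear_log: linear below e, logarithmic above,
# so the two pieces meet at x = e.
def piecewise_linear_log(x: torch.Tensor) -> torch.Tensor:
    return torch.where(x > math.e, torch.log(x), x / math.e)

mel = torch.rand(100, 80)                     # fake (num_frames, n_mels) features
out = piecewise_linear_log(mel * _gain)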